├── .gitignore ├── .scalafmt.conf ├── CHANGELOG.md ├── LICENSE ├── README.md ├── azure-pipelines.yml ├── build.sbt ├── project ├── Dependencies.scala ├── build.properties └── plugins.sbt └── src ├── main └── scala │ └── com │ └── coxautodata │ ├── OptionsParsing.scala │ ├── SparkDistCP.scala │ ├── SparkDistCPOptions.scala │ ├── objects │ ├── Accumulators.scala │ ├── ConfigSerDeser.scala │ ├── CopyPartitioner.scala │ ├── CopyResult.scala │ ├── DeleteResult.scala │ ├── DistCPResult.scala │ ├── ExceptionCountAccumulator.scala │ ├── FileSystemObjectCacher.scala │ ├── Logging.scala │ ├── SerializableFileStatus.scala │ └── SingleCopyDefinition.scala │ └── utils │ ├── CopyUtils.scala │ ├── FileListUtils.scala │ ├── FileUtils.scala │ └── PathUtils.scala └── test ├── resources └── com │ └── coxautodata │ └── test.filters └── scala └── com └── coxautodata ├── TestOptionsParsing.scala ├── TestSparkDistCP.scala ├── TestSparkDistCPOptions.scala ├── TestSpec.scala ├── objects ├── TestAccumulators.scala ├── TestCopyPartitioner.scala ├── TestExceptionCountAccumulator.scala └── TestFileSystemObjectCacher.scala └── utils ├── TestCopyUtils.scala ├── TestFileListUtils.scala ├── TestFileUtils.scala └── TestPathUtils.scala /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/maven,scala,java,eclipse,intellij,netbeans 3 | 4 | ### Maven ### 5 | target/ 6 | pom.xml.tag 7 | pom.xml.releaseBackup 8 | pom.xml.versionsBackup 9 | pom.xml.next 10 | release.properties 11 | dependency-reduced-pom.xml 12 | buildNumber.properties 13 | .mvn/timing.properties 14 | 15 | 16 | ### Scala ### 17 | *.class 18 | *.log 19 | 20 | # sbt specific 21 | .cache 22 | .history 23 | .lib/ 24 | dist/* 25 | target/ 26 | lib_managed/ 27 | src_managed/ 28 | project/boot/ 29 | project/plugins/project/ 30 | 31 | # Scala-IDE specific 32 | .scala_dependencies 33 | .worksheet 34 | 35 | 36 | ### Eclipse ### 37 | 38 | .metadata 39 | bin/ 40 | tmp/ 41 | *.tmp 42 | *.bak 43 | *.swp 44 | *~.nib 45 | local.properties 46 | .settings/ 47 | .loadpath 48 | .recommenders 49 | 50 | # Eclipse Core 51 | .project 52 | 53 | # External tool builders 54 | .externalToolBuilders/ 55 | 56 | # Locally stored "Eclipse launch configurations" 57 | *.launch 58 | 59 | # PyDev specific (Python IDE for Eclipse) 60 | *.pydevproject 61 | 62 | # CDT-specific (C/C++ Development Tooling) 63 | .cproject 64 | 65 | # JDT-specific (Eclipse Java Development Tools) 66 | .classpath 67 | 68 | # Java annotation processor (APT) 69 | .factorypath 70 | 71 | # PDT-specific (PHP Development Tools) 72 | .buildpath 73 | 74 | # sbteclipse plugin 75 | .target 76 | 77 | # Tern plugin 78 | .tern-project 79 | 80 | # TeXlipse plugin 81 | .texlipse 82 | 83 | # STS (Spring Tool Suite) 84 | .springBeans 85 | 86 | # Code Recommenders 87 | .recommenders/ 88 | 89 | 90 | ### Intellij ### 91 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 92 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 93 | 94 | # User-specific stuff: 95 | .idea/workspace.xml 96 | .idea/tasks.xml 97 | .idea/dictionaries 98 | .idea/vcs.xml 99 | .idea/jsLibraryMappings.xml 100 | .idea 101 | 102 | # Sensitive or high-churn files: 103 | .idea/dataSources.ids 104 | .idea/dataSources.xml 105 | .idea/dataSources.local.xml 106 | .idea/sqlDataSources.xml 107 | .idea/dynamic.xml 108 | .idea/uiDesigner.xml 109 | 110 | # Gradle: 111 | .idea/gradle.xml 112 | .idea/libraries 
113 | 114 | # Mongo Explorer plugin: 115 | .idea/mongoSettings.xml 116 | 117 | ## File-based project format: 118 | *.iws 119 | 120 | ## Plugin-specific files: 121 | 122 | # IntelliJ 123 | /out/ 124 | 125 | # mpeltonen/sbt-idea plugin 126 | .idea_modules/ 127 | 128 | # JIRA plugin 129 | atlassian-ide-plugin.xml 130 | 131 | # Crashlytics plugin (for Android Studio and IntelliJ) 132 | com_crashlytics_export_strings.xml 133 | crashlytics.properties 134 | crashlytics-build.properties 135 | fabric.properties 136 | 137 | ### Intellij Patch ### 138 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 139 | 140 | *.iml 141 | # modules.xml 142 | # .idea/misc.xml 143 | # *.ipr 144 | 145 | 146 | ### NetBeans ### 147 | nbproject/private/ 148 | build/ 149 | nbbuild/ 150 | dist/ 151 | nbdist/ 152 | nbactions.xml 153 | .nb-gradle/ 154 | 155 | 156 | ### Java ### 157 | *.class 158 | 159 | # Mobile Tools for Java (J2ME) 160 | .mtj.tmp/ 161 | 162 | # Package Files # 163 | *.jar 164 | *.war 165 | *.ear 166 | 167 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 168 | hs_err_pid* 169 | 170 | #Hive metastore 171 | metastore_db 172 | spark-warehouse 173 | *.attach_pid* 174 | 175 | ## metals/bloop 176 | project/.bloop 177 | project/project 178 | project/metals.sbt 179 | .bloop 180 | .metals 181 | .vscode 182 | .bsp/ 183 | null/ -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | version = 3.3.1 2 | runner.dialect = scala213source3 3 | preset = IntelliJ -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## v0.2.3 - 2022-01-12 4 | - Move build to Azure DevOps and update dependencies 5 | - Add cross build for Scala 2.13 6 | - Add Spark 3 (3.1 and 3.2) support 7 | 8 | ## v0.2.2 - 2019-06-21 9 | 10 | ### Fixed 11 | - Human-readable byte representation sometimes trimming too many digits 12 | 13 | ## v0.2.1 - 2019-06-21 14 | 15 | ### Fixed 16 | - Files within a partition are now batched in a consistent way using sorting 17 | - Jobs no longer fail if batching keys are incorrect 18 | 19 | ## v0.2 - 2019-06-18 20 | 21 | ### Added 22 | - Statistics of the copy/delete operations are now collected during the application and logged on completion 23 | 24 | ### Fixed 25 | - Various command-line parsing issues and increased coverage of tests to include these cases 26 | 27 | ## v0.1 - 2019-06-12 28 | 29 | ### Added 30 | - Initial release 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity.
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 
135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 
194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SparkDistCP 2 | [![Build Status](https://dev.azure.com/coxautodata/Open%20Source/_apis/build/status/CoxAutomotiveDataSolutions.spark-distcp?branchName=master)](https://dev.azure.com/coxautodata/Open%20Source/_build/latest?definitionId=6&branchName=master) 3 | [![Maven Central](https://img.shields.io/maven-central/v/com.coxautodata/spark-distcp_2.11.svg)](https://search.maven.org/search?q=g:com.coxautodata%20AND%20a:spark-distcp*) [![Coverage Status](https://img.shields.io/codecov/c/github/CoxAutomotiveDataSolutions/spark-distcp/master.svg)](https://codecov.io/gh/CoxAutomotiveDataSolutions/spark-distcp/branch/master) 4 | 5 | ## What is SparkDistCP? 6 | 7 | SparkDistCP is an attempt at reimplementing [Hadoop DistCP](https://hadoop.apache.org/docs/current/hadoop-distcp/DistCp.html) in Apache Spark. 8 | 9 | There are several reasons you might want to do this: 10 | * Using DistCP in a Spark-only/non-YARN environment 11 | * Reducing DistCP copy times by generating many smaller tasks, thereby limiting long-running tasks/map tasks 12 | * To use DistCP programmatically through Spark 13 | 14 | **Note:** Not all features of Hadoop DistCP have been reimplemented yet. See [What is currently missing from SparkDistCP?](#what-is-currently-missing-from-sparkdistcp) for an overview of what has not yet been implemented. 15 | 16 | **Further note:** SparkDistCP is in early development, therefore you should use this library with caution! We provide absolutely no guarantee that this tool will not cause accidental data loss. 17 | 18 | ## How do I run SparkDistCP? 19 | 20 | You can run SparkDistCP from the command-line using: 21 | ```shell 22 | bin/spark-submit --packages com.coxautodata:spark-distcp_2.11:{{version}} --class com.coxautodata.SparkDistCP "" --help 23 | ``` 24 | 25 | The empty string is needed here because `spark-submit` requires an application JAR to be specified; however, the main class is in the dependency specified in `packages`. 26 | 27 | The usage of the command-line arguments resembles that of Hadoop DistCP: 28 | 29 | ```shell 30 | Usage: [options] [source_path...] <target_path> 31 | ``` 32 | 33 | Like Hadoop DistCP, SparkDistCP takes several options, one or more source paths and a target path.
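For illustration, a complete invocation might look like the following sketch (the HDFS URIs are placeholders reused from the programmatic example further down, and the flags are described in the options table below):

```shell
bin/spark-submit \
  --packages com.coxautodata:spark-distcp_2.11:{{version}} \
  --class com.coxautodata.SparkDistCP "" \
  --update --delete \
  hdfs://nn1:8020/foo/bar hdfs://nn2:8020/bar/foo
```

Here `--update` overwrites destination files that differ in size or checksum, and `--delete` (valid only alongside `--update` or `--overwrite`) removes destination files that no longer exist under the source paths.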
34 | 35 | SparkDistCP can also be invoked programmatically from a Spark shell in two ways: 36 | 37 | * By calling `main` directly and passing an array of command-line arguments: 38 | ```scala 39 | import com.coxautodata.SparkDistCP 40 | SparkDistCP.main(Array("--help")) 41 | ``` 42 | 43 | * Or using the typed API: 44 | ```scala 45 | def run(sparkSession: SparkSession, sourcePaths: Seq[Path], destinationPath: Path, options: SparkDistCPOptions): Unit 46 | ``` 47 | 48 | For example: 49 | ```scala 50 | import org.apache.hadoop.fs.Path 51 | import com.coxautodata.{SparkDistCP, SparkDistCPOptions} 52 | SparkDistCP.run(spark, Seq(new Path("hdfs://nn1:8020/foo/bar")), new Path("hdfs://nn2:8020/bar/foo"), SparkDistCPOptions(dryRun = true)) 53 | ``` 54 | 55 | ### Options: 56 | 57 | | SparkDistCP Flag | Equivalent Hadoop DistCP Flag | Description | Notes | 58 | |---|---|---|---| 59 | | `--i` | `-i` | Ignore failures | | 60 | | `--log <URI>` | `-log` | Write logs to a URI | Logs can be written to any URI with a supported scheme on the classpath. | 61 | | `--dryrun` | N/A | Perform a trial run with no changes made | | 62 | | `--verbose` | `-v` | Run in verbose mode | Does not affect logfile output | 63 | | `--overwrite` | `-overwrite` | Overwrite destination | Changes how destination paths are generated, identically to how Hadoop DistCP does. | 64 | | `--update` | `-update` | Overwrite if source and destination differ in size, or checksum | Does not currently compare blocksize, unlike Hadoop DistCP. Changes how destination paths are generated, identically to how Hadoop DistCP does. | 65 | | `--filters <file>` | `-filters` | The path to a file containing a list of pattern strings, one string per line, such that paths matching the pattern will be excluded from the copy. | File can be stored on any URI with a supported scheme on the classpath. | 66 | | `--delete` | `-delete` | Delete the files existing in the dst but not in src | | 67 | | `--numListstatusThreads <int>` | `-numListstatusThreads` | Number of threads to use for building file listing | | 68 | | `--consistentPathBehaviour` | N/A | Revert the path behaviour when using overwrite or update to the path behaviour of non-overwrite/non-update | | 69 | | `--maxFilesPerTask <int>` | N/A | Maximum number of files to copy in a single Spark task | | 70 | | `--maxBytesPerTask <bytes>` | N/A | Maximum number of bytes to copy in a single Spark task | | 71 | 72 | ### Path Behaviour 73 | 74 | SparkDistCP aims to have the same _interesting_ path behaviour as that of Hadoop DistCP (specifically around update and overwrite). 75 | 76 | ## What is currently missing from SparkDistCP?
77 | 78 | SparkDistCP is not a complete like-for-like reimplementation of Hadoop DistCP and there are differences in behaviour and features: 79 | 80 | * No use of blocks, including during the copy and for comparison when using the `update` flag 81 | * No use of snapshots 82 | * No atomic commit option 83 | * No preserve flag 84 | * No append flag 85 | * No file list flag 86 | * No option to limit bandwidth 87 | * No option to skip CRC check 88 | * When using the delete option files are **not** moved into trash 89 | * The log file in no way resembles that created by Hadoop DistCP 90 | 91 | ## How can I contribute to SparkDistCP? 92 | 93 | We welcome all users to contribute to the development of SparkDistCP by raising pull-requests. We kindly ask that you include suitable unit tests along with proposed changes. 94 | 95 | As you can see above, there is a wealth of work that can be done on SparkDistCP to reach feature parity with hadoop DistCP. 96 | 97 | ## What is SparkDistCP licensed under? 98 | 99 | Licensed under the Apache License, Version 2.0 (the "License"); 100 | you may not use this file except in compliance with the License. 101 | You may obtain a copy of the License at 102 | 103 | http://www.apache.org/licenses/LICENSE-2.0 104 | 105 | Unless required by applicable law or agreed to in writing, software 106 | distributed under the License is distributed on an "AS IS" BASIS, 107 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 108 | See the License for the specific language governing permissions and 109 | limitations under the License. 110 | 111 | Copyright 2019 Cox Automotive UK Limited 112 | -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | trigger: 2 | batch: true 3 | branches: 4 | include: 5 | - "*" 6 | tags: 7 | include: 8 | - refs/tags/* 9 | 10 | pr: 11 | autoCancel: true 12 | branches: 13 | include: 14 | - '*' 15 | 16 | pool: 17 | vmImage: 'ubuntu-latest' 18 | 19 | variables: 20 | - name: COURSIER_CACHE 21 | value: $(Pipeline.Workspace)/.coursier 22 | - name: IVY_CACHE_FOLDER 23 | value: $(Pipeline.Workspace)/.ivy2 24 | - name: SBT_OPTS 25 | value: -Dsbt.boot.directory=$(Pipeline.Workspace)/.sbt/boot -Dsbt.coursier.home=$(Pipeline.Workspace)/.coursier 26 | - group: Sonatype 27 | 28 | stages: 29 | - stage: testcrossspark 30 | displayName: "Test against multiple spark versions" 31 | jobs: 32 | - job: tests 33 | displayName: Test multiple spark versions 34 | strategy: 35 | matrix: 36 | Spark3012: 37 | SPARKVERSION: '3.0.2' 38 | SCALAVERSION: '2.12.15' 39 | Spark3112: 40 | SPARKVERSION: '3.1.2' 41 | SCALAVERSION: '2.12.15' 42 | Spark24711: 43 | SPARKVERSION: '2.4.7' 44 | SCALAVERSION: '2.11.12' 45 | Spark24511: 46 | SPARKVERSION: '2.4.5' 47 | SCALAVERSION: '2.11.12' 48 | Spark32012: 49 | SPARKVERSION: '3.2.0' 50 | SCALAVERSION: '2.12.15' 51 | Spark32013: 52 | SPARKVERSION: '3.2.0' 53 | SCALAVERSION: '2.13.8' 54 | maxParallel: 10 55 | steps: 56 | - task: JavaToolInstaller@0 57 | inputs: 58 | versionSpec: '8' 59 | jdkArchitectureOption: 'x64' 60 | jdkSourceOption: 'PreInstalled' 61 | - script: | 62 | echo Scala Version: $SCALA_VERSION 63 | echo Spark Version: $SPARK_VERSION 64 | sbt '; compile ; test' 65 | env: 66 | SPARK_VERSION: $(SPARKVERSION) 67 | SCALA_VERSION: $(SCALAVERSION) 68 | - stage: sbtcrossbuild 69 | dependsOn: [testcrossspark] 70 | displayName: "Run sbt cross build" 71 | jobs: 72 | - job: build 73 | 
displayName: Run sbt cross build 74 | steps: 75 | - task: Cache@2 76 | inputs: 77 | key: 'sbt | ivy2 | **/build.sbt | project/Dependencies.scala' 78 | restoreKeys: | 79 | sbt | ivy2 | **/build.sbt 80 | sbt | ivy2 81 | path: $(Pipeline.Workspace)/.ivy2 82 | - task: Cache@2 83 | inputs: 84 | key: 'sbt | coursier | **/build.sbt | project/Dependencies.scala' 85 | restoreKeys: | 86 | sbt | coursier | **/build.sbt 87 | sbt | coursier 88 | path: $(Pipeline.Workspace)/.coursier 89 | - task: Cache@2 90 | inputs: 91 | key: 'sbt | boot | **/build.sbt | project/Dependencies.scala' 92 | restoreKeys: | 93 | sbt | boot | **/build.sbt 94 | sbt | boot 95 | path: $(Pipeline.Workspace)/.sbt/boot 96 | - task: JavaToolInstaller@0 97 | inputs: 98 | versionSpec: '8' 99 | jdkArchitectureOption: 'x64' 100 | jdkSourceOption: 'PreInstalled' 101 | - script: | 102 | mkdir -p $(Pipeline.Workspace)/.ivy2 103 | sbt ci 104 | displayName: "Run sbt tests" 105 | - stage: deployToSonatype 106 | dependsOn: [sbtcrossbuild] 107 | condition: and(succeeded(), startsWith(variables['Build.SourceBranch'], 'refs/tags/v')) 108 | displayName: Push new version 109 | jobs: 110 | - job: Deploy 111 | displayName: Push 112 | steps: 113 | - task: Cache@2 114 | inputs: 115 | key: 'sbt | ivy2 | **/build.sbt | project/Dependencies.scala' 116 | restoreKeys: | 117 | sbt | ivy2 | **/build.sbt 118 | sbt | ivy2 119 | path: $(Pipeline.Workspace)/.ivy2 120 | - task: Cache@2 121 | inputs: 122 | key: 'sbt | coursier | **/build.sbt | project/Dependencies.scala' 123 | restoreKeys: | 124 | sbt | coursier | **/build.sbt 125 | sbt | coursier 126 | path: $(Pipeline.Workspace)/.coursier 127 | - task: Cache@2 128 | inputs: 129 | key: 'sbt | boot | **/build.sbt | project/Dependencies.scala' 130 | restoreKeys: | 131 | sbt | boot | **/build.sbt 132 | sbt | boot 133 | path: $(Pipeline.Workspace)/.sbt/boot 134 | - task: JavaToolInstaller@0 135 | inputs: 136 | versionSpec: '8' 137 | jdkArchitectureOption: 'x64' 138 | jdkSourceOption: 'PreInstalled' 139 | - script: | 140 | mkdir -p $(Pipeline.Workspace)/.ivy2 141 | sbt ci-release 142 | displayName: "Run sbt ci-release" 143 | env: 144 | PGP_PASSPHRASE: $(PGP-PASSPHRASE) 145 | PGP_SECRET: $(PGP-SECRET-B64) 146 | SONATYPE_PASSWORD: $(SONATYPE-PASSWORD) 147 | SONATYPE_USERNAME: $(SONATYPE-USERNAME) -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | import sbt.Keys.{developers, fork, homepage, scalaVersion, scmInfo} 2 | import sbt.url 3 | import xerial.sbt.Sonatype._ 4 | import Dependencies.{scopt, spark, test} 5 | 6 | lazy val scala212 = Dependencies.scala212 7 | lazy val scala211 = Dependencies.scala211 8 | lazy val scala213 = Dependencies.scala213 9 | lazy val supportedScalaVersions = List(scala213, scala212, scala211) 10 | 11 | ThisBuild / scalaVersion := Dependencies.scalaVers 12 | ThisBuild / organization := "com.coxautodata" 13 | 14 | lazy val compilerOptions = Seq( 15 | "-unchecked", 16 | "-feature", 17 | "-language:existentials", 18 | "-language:higherKinds", 19 | "-language:implicitConversions", 20 | "-language:postfixOps", 21 | "-deprecation", 22 | "-target:jvm-1.8", 23 | "-encoding", 24 | "utf8", 25 | "-Yrangepos" 26 | ) 27 | 28 | addCommandAlias("ci", ";+compile ;+test") 29 | 30 | lazy val sparkdistcp = (project in file(".")) 31 | .settings( 32 | name := "spark-distcp", 33 | Test / fork := true, 34 | scalacOptions ++= compilerOptions, 35 | crossScalaVersions := supportedScalaVersions, 
36 | libraryDependencies += test, 37 | libraryDependencies += scopt, 38 | libraryDependencies ++= spark(scalaVersion.value), 39 | libraryDependencies += "org.scala-lang.modules" %% "scala-collection-compat" % Dependencies.collectionCompat % Provided, 40 | assemblyPackageScala / assembleArtifact := false, 41 | assembly / assemblyOption ~= { 42 | _.withIncludeScala(false) 43 | }, 44 | assembly / Keys.test := {}, 45 | assembly / artifact := { 46 | val art = (assembly / artifact).value 47 | art.withClassifier(Some("assembly")) 48 | }, 49 | ThisBuild / assemblyShadeRules := Seq( 50 | ShadeRule.rename("scopt.**" -> "internal.spark.distcp.scopt.@1").inAll 51 | ), 52 | licenses := Seq( 53 | "APL2" -> url("http://www.apache.org/licenses/LICENSE-2.0.txt") 54 | ), 55 | description := "A re-implementation of Hadoop DistCP in Apache Spark", 56 | homepage := Some( 57 | url("https://github.com/CoxAutomotiveDataSolutions/spark-distcp") 58 | ), 59 | scmInfo := Some( 60 | ScmInfo( 61 | url("https://github.com/CoxAutomotiveDataSolutions/spark-distcp"), 62 | "scm:git@github.com:CoxAutomotiveDataSolutions/spark-distcp.git" 63 | ) 64 | ), 65 | developers := List( 66 | Developer( 67 | id = "alexjbush", 68 | name = "Alex Bush", 69 | email = "alex.bush@coxauto.co.uk", 70 | url = url("https://alexbu.sh") 71 | ), 72 | Developer( 73 | id = "vavison", 74 | name = "Vicky Avison", 75 | email = "vicky.avison@coxauto.co.uk", 76 | url = url("https://coxautodata.com") 77 | ), 78 | Developer( 79 | id = "jamesfielder", 80 | name = "James Fielder", 81 | email = "james@fielder.dev", 82 | url = url("https://james.fielder.dev") 83 | ) 84 | ), 85 | sonatypeProjectHosting := Some( 86 | GitHubHosting( 87 | "CoxAutomotiveDataSolutions", 88 | "spark-distcp", 89 | "james@fielder.dev" 90 | ) 91 | ), 92 | Test / publishArtifact := true, 93 | publishConfiguration := publishConfiguration.value 94 | .withOverwrite(isSnapshot.value), 95 | publishLocalConfiguration := publishLocalConfiguration.value 96 | .withOverwrite(isSnapshot.value), 97 | addArtifact(assembly / artifact, assembly) 98 | ) 99 | -------------------------------------------------------------------------------- /project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | 3 | object Dependencies { 4 | 5 | val sparkVersion24 = "2.4.7" 6 | val sparkVersion3 = "3.2.0" 7 | val sparkVersion31 = "3.1.2" 8 | // wait on https://github.com/scopt/scopt/issues/312 9 | val scoptVersion = "4.0.1" 10 | val scalatestVersion = "3.2.10" 11 | val scala212 = "2.12.15" 12 | val scala211 = "2.11.12" 13 | val scala213 = "2.13.8" 14 | val collectionCompat = "2.6.0" 15 | 16 | val defaultSparkVersion = sparkVersion3 17 | val defaultScalaVersion = scala213 18 | 19 | lazy val scalaVers = sys.env.getOrElse("SCALA_VERSION", defaultScalaVersion) 20 | lazy val sparkVers = sys.env.getOrElse("SPARK_VERSION", defaultSparkVersion) 21 | 22 | val test = "org.scalatest" %% "scalatest" % scalatestVersion % Test 23 | val scopt = "com.github.scopt" %% "scopt" % scoptVersion % Compile 24 | 25 | def spark(scalaVersion: String) = { 26 | 27 | val deps = (version: String) => { 28 | Seq( 29 | "org.apache.spark" %% "spark-sql" % version % Provided, 30 | "org.apache.spark" %% "spark-core" % version % Provided 31 | ) 32 | } 33 | 34 | val sparkVersEnv = sys.env.get("SPARK_VERSION") 35 | 36 | sparkVersEnv match { 37 | case Some(version) => deps(version) 38 | case None => 39 | val sparkVers = scalaVersion match { 40 | case `scala211` => `sparkVersion24` 41 | 
case `scala212` => `sparkVersion31` 42 | case `scala213` => `sparkVersion3` 43 | case _ => `sparkVersion3` 44 | } 45 | deps(sparkVers) 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 1.6.1 -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.github.sbt" % "sbt-ci-release" % "1.5.10") 2 | addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.1") 3 | addSbtPlugin("com.timushev.sbt" % "sbt-updates" % "0.6.1") 4 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "1.1.0") 5 | addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.6") 6 | addCompilerPlugin("org.scalameta" % "semanticdb-scalac" % "4.4.32" cross CrossVersion.full) 7 | -------------------------------------------------------------------------------- /src/main/scala/com/coxautodata/OptionsParsing.scala: -------------------------------------------------------------------------------- 1 | package com.coxautodata 2 | 3 | import java.net.URI 4 | 5 | import org.apache.hadoop.conf.Configuration 6 | import org.apache.hadoop.fs.Path 7 | 8 | object OptionsParsing { 9 | 10 | /** Parse a set of command-line arguments into a [[Config]] object 11 | */ 12 | def parse(args: Array[String]): Config = { 13 | 14 | val parser = new scopt.OptionParser[Config]("") { 15 | opt[Unit]("i") 16 | .action((_, c) => c.copyOptions(_.copy(ignoreErrors = true))) 17 | .text("Ignore failures") 18 | 19 | opt[String]("log") 20 | .action((log, c) => c.copyOptions(_.copy(log = Some(new URI(log))))) 21 | .text("Write logs to a URI") 22 | 23 | opt[Unit]("dryrun") 24 | .action((_, c) => c.copyOptions(_.copy(dryRun = true))) 25 | .text("Perform a trial run with no changes made") 26 | 27 | opt[Unit]("verbose") 28 | .action((_, c) => c.copyOptions(_.copy(verbose = true))) 29 | .text("Run in verbose mode") 30 | 31 | opt[Unit]("overwrite") 32 | .action((_, c) => c.copyOptions(_.copy(overwrite = true))) 33 | .text("Overwrite destination") 34 | 35 | opt[Unit]("update") 36 | .action((_, c) => c.copyOptions(_.copy(update = true))) 37 | .text("Overwrite if source and destination differ in size, or checksum") 38 | 39 | opt[String]("filters") 40 | .action((f, c) => c.copyOptions(_.copy(filters = Some(new URI(f))))) 41 | .text( 42 | "The path to a file containing a list of pattern strings, one string per line, such that paths matching the pattern will be excluded from the copy." 
43 | ) 44 | 45 | opt[Unit]("delete") 46 | .action((_, c) => c.copyOptions(_.copy(delete = true))) 47 | .text("Delete the files existing in the dst but not in src") 48 | 49 | opt[Int]("numListstatusThreads") 50 | .action((i, c) => c.copyOptions(_.copy(numListstatusThreads = i))) 51 | .text("Number of threads to use for building file listing") 52 | 53 | opt[Unit]("consistentPathBehaviour") 54 | .action((_, c) => c.copyOptions(_.copy(consistentPathBehaviour = true))) 55 | .text( 56 | "Revert the path behaviour when using overwrite or update to the path behaviour of non-overwrite/non-update" 57 | ) 58 | 59 | opt[Int]("maxFilesPerTask") 60 | .action((i, c) => c.copyOptions(_.copy(maxFilesPerTask = i))) 61 | .text("Maximum number of files to copy in a single Spark task") 62 | 63 | opt[Long]("maxBytesPerTask") 64 | .action((i, c) => c.copyOptions(_.copy(maxBytesPerTask = i))) 65 | .text("Maximum number of bytes to copy in a single Spark task") 66 | 67 | help("help").text("prints this usage text") 68 | 69 | arg[String]("[source_path...] ") 70 | .unbounded() 71 | .action((u, c) => c.copy(URIs = c.URIs :+ new URI(u))) 72 | 73 | } 74 | 75 | parser.parse(args, Config()) match { 76 | case Some(config) => 77 | config.validateUris() 78 | config.options.validateOptions() 79 | config 80 | case _ => 81 | throw new RuntimeException("Failed to parse arguments") 82 | } 83 | } 84 | } 85 | 86 | case class Config( 87 | options: SparkDistCPOptions = SparkDistCPOptions(), 88 | URIs: Seq[URI] = Seq.empty 89 | ) { 90 | 91 | def copyOptions(f: SparkDistCPOptions => SparkDistCPOptions): Config = { 92 | this.copy(options = f(options)) 93 | } 94 | 95 | def validateUris(): Unit = { 96 | require( 97 | URIs.length >= 2, 98 | "you must supply two or more paths, representing the source paths and a destination" 99 | ) 100 | } 101 | 102 | def sourceAndDestPaths: (Seq[Path], Path) = { 103 | URIs.reverse match { 104 | case d :: s :: ts => 105 | ((s :: ts).reverse.map(u => new Path(u)), new Path(d)) 106 | case _ => throw new RuntimeException("Incorrect number of URIs") 107 | } 108 | } 109 | 110 | } 111 | -------------------------------------------------------------------------------- /src/main/scala/com/coxautodata/SparkDistCP.scala: -------------------------------------------------------------------------------- 1 | package com.coxautodata 2 | 3 | import java.net.URI 4 | 5 | import com.coxautodata.objects._ 6 | import com.coxautodata.utils.{CopyUtils, FileListUtils, PathUtils} 7 | import org.apache.hadoop.fs._ 8 | import org.apache.log4j.Level 9 | import org.apache.spark.rdd.RDD 10 | import org.apache.spark.sql.{SaveMode, SparkSession} 11 | import org.apache.spark.{HashPartitioner, TaskContext} 12 | 13 | /** Spark-based DistCp application. [[SparkDistCP.main]] is the command-line 14 | * entry to the application and [[SparkDistCP.run]] is the programmatic API 15 | * entry to the application 16 | */ 17 | object SparkDistCP extends Logging { 18 | 19 | type KeyedCopyDefinition = (URI, CopyDefinitionWithDependencies) 20 | 21 | /** Main entry point for command-line. Arguments are currently: Usage: 22 | * SparkDistCP [options] [source_path...] 
23 | * 24 | * --i Ignore failures 25 | * --log Write logs to a URI 26 | * --dryrun Perform a trial run with no changes made 27 | * --verbose Run in verbose mode 28 | * --overwrite Overwrite destination 29 | * --update Overwrite if source and destination differ in size, or checksum 30 | * --filters The path to a file containing a list of pattern strings, 31 | * one string per line, such that paths matching the pattern will be excluded 32 | * from the copy. 33 | * --delete Delete the files existing in the dst but not in src 34 | * --numListstatusThreads Number of threads to use for building file 35 | * listing 36 | * --consistentPathBehaviour Revert the path behaviour when using overwrite 37 | * or update to the path behaviour of non-overwrite/non-update 38 | * --maxFilesPerTask Maximum number of files to copy in a single 39 | * Spark task 40 | * --maxBytesPerTask Maximum number of bytes to copy in a single 41 | * Spark task 42 | * --help prints this usage text [source_path...] 43 | */ 44 | def main(args: Array[String]): Unit = { 45 | 46 | val config = OptionsParsing.parse(args) 47 | val sparkSession = SparkSession.builder().getOrCreate() 48 | val options = config.options.withFiltersFromFile( 49 | sparkSession.sparkContext.hadoopConfiguration 50 | ) 51 | val (src, dest) = config.sourceAndDestPaths 52 | run(sparkSession, src, dest, options) 53 | 54 | } 55 | 56 | /** Main entry point for programmatic access to the application. 57 | * 58 | * @param sparkSession 59 | * Active Spark Session 60 | * @param sourcePaths 61 | * Source paths to copy from 62 | * @param destinationPath 63 | * Destination path to copy to 64 | * @param options 65 | * Options to use in the application 66 | */ 67 | def run( 68 | sparkSession: SparkSession, 69 | sourcePaths: Seq[Path], 70 | destinationPath: Path, 71 | options: SparkDistCPOptions 72 | ): Unit = { 73 | import sparkSession.implicits._ 74 | 75 | assert(sourcePaths.nonEmpty, "At least one source path must be given") 76 | options.validateOptions() 77 | 78 | if (options.verbose) { 79 | sparkSession.sparkContext.setLogLevel("DEBUG") 80 | setLogLevel(Level.DEBUG) 81 | } 82 | 83 | val qualifiedSourcePaths = sourcePaths.map( 84 | PathUtils 85 | .pathToQualifiedPath(sparkSession.sparkContext.hadoopConfiguration, _) 86 | ) 87 | val qualifiedDestinationPath = PathUtils.pathToQualifiedPath( 88 | sparkSession.sparkContext.hadoopConfiguration, 89 | destinationPath 90 | ) 91 | 92 | val sourceRDD = FileListUtils.getSourceFiles( 93 | sparkSession.sparkContext, 94 | qualifiedSourcePaths.map(_.toUri), 95 | qualifiedDestinationPath.toUri, 96 | options.updateOverwritePathBehaviour, 97 | options.numListstatusThreads, 98 | options.filterNot 99 | ) 100 | val destinationRDD = FileListUtils.getDestinationFiles( 101 | sparkSession.sparkContext, 102 | qualifiedDestinationPath, 103 | options 104 | ) 105 | 106 | val joined = sourceRDD.fullOuterJoin(destinationRDD) 107 | 108 | val toCopy = joined.collect { case (_, (Some(s), _)) => s } 109 | 110 | val accumulators = new Accumulators(sparkSession) 111 | 112 | val copyResult: RDD[DistCPResult] = doCopy(toCopy, accumulators, options) 113 | 114 | val deleteResult: RDD[DistCPResult] = { 115 | if (options.delete) { 116 | val toDelete = joined.collect { case (d, (None, _)) => d } 117 | doDelete(toDelete, accumulators, options) 118 | } else { 119 | sparkSession.sparkContext.emptyRDD[DistCPResult] 120 | } 121 | } 122 | 123 | val allResults = copyResult union deleteResult 124 | 125 | options.log match { 126 | case None => allResults.foreach(_ => ()) 
127 | case Some(f) => 128 | allResults 129 | .repartition(1) 130 | .map(_.getMessage) 131 | .toDS() 132 | .write 133 | .mode(SaveMode.Append) 134 | .csv(f.toString) 135 | } 136 | 137 | logInfo("SparkDistCP Run Statistics\n" + accumulators.getOutputText) 138 | 139 | } 140 | 141 | /** Perform the copy portion of the DistCP 142 | */ 143 | private[coxautodata] def doCopy( 144 | sourceRDD: RDD[CopyDefinitionWithDependencies], 145 | accumulators: Accumulators, 146 | options: SparkDistCPOptions 147 | ): RDD[DistCPResult] = { 148 | 149 | val serConfig = new ConfigSerDeser( 150 | sourceRDD.sparkContext.hadoopConfiguration 151 | ) 152 | batchAndPartitionFiles( 153 | sourceRDD, 154 | options.maxFilesPerTask, 155 | options.maxBytesPerTask 156 | ) 157 | .mapPartitions { iterator => 158 | val hadoopConfiguration = serConfig.get() 159 | val attemptID = TaskContext.get().taskAttemptId() 160 | val fsCache = new FileSystemObjectCacher(hadoopConfiguration) 161 | 162 | iterator 163 | .flatMap(_._2.getAllCopyDefinitions) 164 | .collectMapWithEmptyCollection( 165 | (d, z) => z.contains(d), 166 | d => { 167 | val r = CopyUtils.handleCopy( 168 | fsCache.getOrCreate(d.source.uri), 169 | fsCache.getOrCreate(d.destination), 170 | d, 171 | options, 172 | attemptID 173 | ) 174 | accumulators.handleResult(r) 175 | r 176 | } 177 | ) 178 | } 179 | } 180 | 181 | /** Perform the delete from destination portion of the DistCP 182 | */ 183 | private[coxautodata] def doDelete( 184 | destRDD: RDD[URI], 185 | accumulators: Accumulators, 186 | options: SparkDistCPOptions 187 | ): RDD[DistCPResult] = { 188 | val serConfig = new ConfigSerDeser(destRDD.sparkContext.hadoopConfiguration) 189 | val count = destRDD.count() 190 | destRDD 191 | .repartition((count / options.maxFilesPerTask).toInt.max(1)) 192 | .mapPartitions { iterator => 193 | val hadoopConfiguration = serConfig.get() 194 | val fsCache = new FileSystemObjectCacher(hadoopConfiguration) 195 | iterator 196 | .collectMapWithEmptyCollection( 197 | (d, z) => z.exists(p => PathUtils.uriIsChild(p, d)), 198 | d => { 199 | val r = CopyUtils.handleDelete(fsCache.getOrCreate(d), d, options) 200 | accumulators.handleResult(r) 201 | r 202 | } 203 | ) 204 | } 205 | } 206 | 207 | /** DistCP helper implicits on iterators 208 | */ 209 | private[coxautodata] implicit class DistCPIteratorImplicit[B]( 210 | iterator: Iterator[B] 211 | ) { 212 | 213 | /** Scan over an iterator, mapping as we go with `action`, but making a 214 | * decision on which objects to actually keep using a set of what objects 215 | * have been seen and the `skip` function. Similar to a combining `collect` 216 | * and `foldLeft`. 
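 * A minimal usage sketch (hypothetical values; `toResult` stands in for any
 * function producing a [[DistCPResult]]):
 * {{{
 *   Iterator("a", "b", "a").collectMapWithEmptyCollection(
 *     skip = (d, seen) => seen.contains(d),
 *     action = d => toResult(d)
 *   )
 *   // applies `action` to "a" and "b" only; the repeated "a" is skipped
 *   // because it is already in the set of seen elements
 * }}}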
217 | * 218 | * @param skip 219 | * Should a mapped version of this element not be included in the output 220 | * @param action 221 | * Function to map the element 222 | * @return 223 | * An iterator 224 | */ 225 | def collectMapWithEmptyCollection( 226 | skip: (B, Set[B]) => Boolean, 227 | action: B => DistCPResult 228 | ): Iterator[DistCPResult] = { 229 | 230 | iterator 231 | .scanLeft((Set.empty[B], None: Option[DistCPResult])) { 232 | case ((z, _), d) if skip(d, z) => (z, None) 233 | case ((z, _), d) => 234 | (z + d, Some(action(d))) 235 | } 236 | .collect { case (_, Some(r)) => r } 237 | 238 | } 239 | 240 | } 241 | 242 | /** Batch the given RDD into groups of files depending on 243 | * [[SparkDistCPOptions.maxFilesPerTask]] and 244 | * [[SparkDistCPOptions.maxBytesPerTask]] and repartition the RDD so files in 245 | * the same batches are in the same partitions 246 | */ 247 | private[coxautodata] def batchAndPartitionFiles( 248 | rdd: RDD[CopyDefinitionWithDependencies], 249 | maxFilesPerTask: Int, 250 | maxBytesPerTask: Long 251 | ): RDD[((Int, Int), CopyDefinitionWithDependencies)] = { 252 | val partitioner = 253 | rdd.partitioner.getOrElse(new HashPartitioner(rdd.partitions.length)) 254 | val sorted = rdd 255 | .map(v => (v.source.uri.toString, v)) 256 | .repartitionAndSortWithinPartitions(partitioner) 257 | .map(_._2) 258 | val batched = sorted.mapPartitionsWithIndex( 259 | generateBatchedFileKeys(maxFilesPerTask, maxBytesPerTask) 260 | ) // sorted 261 | 262 | batched.partitionBy(CopyPartitioner(batched)) 263 | } 264 | 265 | /** Key the RDD within partitions based on batches of files based on 266 | * [[SparkDistCPOptions.maxFilesPerTask]] and 267 | * [[SparkDistCPOptions.maxBytesPerTask]] thresholds 268 | */ 269 | private[coxautodata] def generateBatchedFileKeys( 270 | maxFilesPerTask: Int, 271 | maxBytesPerTask: Long 272 | ): (Int, Iterator[CopyDefinitionWithDependencies]) => Iterator[ 273 | ((Int, Int), CopyDefinitionWithDependencies) 274 | ] = { (partition, iterator) => 275 | iterator 276 | .scanLeft[(Int, Int, Long, CopyDefinitionWithDependencies)]( 277 | 0, 278 | 0, 279 | 0, 280 | null 281 | ) { case ((index, count, bytes, _), definition) => 282 | val newCount = count + 1 283 | val newBytes = bytes + definition.source.getLen 284 | if (newCount > maxFilesPerTask || newBytes > maxBytesPerTask) { 285 | (index + 1, 1, definition.source.getLen, definition) 286 | } else { 287 | (index, newCount, newBytes, definition) 288 | } 289 | } 290 | .drop(1) 291 | .map { case (index, _, _, file) => ((partition, index), file) } 292 | } 293 | 294 | } 295 | -------------------------------------------------------------------------------- /src/main/scala/com/coxautodata/SparkDistCPOptions.scala: -------------------------------------------------------------------------------- 1 | package com.coxautodata 2 | 3 | import org.apache.hadoop.conf.Configuration 4 | import org.apache.hadoop.fs.Path 5 | 6 | import java.io.IOException 7 | import java.net.URI 8 | import scala.util.matching.Regex 9 | 10 | /** Options for the DistCP application See [[OptionsParsing.parse]] for the 11 | * explanation of each option 12 | */ 13 | case class SparkDistCPOptions( 14 | update: Boolean = SparkDistCPOptions.Defaults.update, 15 | overwrite: Boolean = SparkDistCPOptions.Defaults.overwrite, 16 | delete: Boolean = SparkDistCPOptions.Defaults.delete, 17 | log: Option[URI] = SparkDistCPOptions.Defaults.log, 18 | ignoreErrors: Boolean = SparkDistCPOptions.Defaults.ignoreErrors, 19 | dryRun: Boolean = 
SparkDistCPOptions.Defaults.dryRun, 20 | consistentPathBehaviour: Boolean = 21 | SparkDistCPOptions.Defaults.consistentPathBehaviour, 22 | maxFilesPerTask: Int = SparkDistCPOptions.Defaults.maxFilesPerTask, 23 | maxBytesPerTask: Long = SparkDistCPOptions.Defaults.maxBytesPerTask, 24 | filters: Option[URI] = SparkDistCPOptions.Defaults.filters, 25 | filterNot: List[Regex] = SparkDistCPOptions.Defaults.filterNot, 26 | numListstatusThreads: Int = SparkDistCPOptions.Defaults.numListstatusThreads, 27 | verbose: Boolean = SparkDistCPOptions.Defaults.verbose 28 | ) { 29 | 30 | val updateOverwritePathBehaviour: Boolean = 31 | !consistentPathBehaviour && (update || overwrite) 32 | 33 | def validateOptions(): Unit = { 34 | assert(maxFilesPerTask > 0, "maxFilesPerTask must be positive") 35 | 36 | assert(maxBytesPerTask > 0, "maxBytesPerTask must be positive") 37 | 38 | assert(numListstatusThreads > 0, "numListstatusThreads must be positive") 39 | 40 | assert( 41 | !(update && overwrite), 42 | "Both update and overwrite cannot be specified" 43 | ) 44 | 45 | assert( 46 | !(delete && !overwrite && !update), 47 | "Delete must be specified with either overwrite or update" 48 | ) 49 | } 50 | 51 | def withFiltersFromFile( 52 | hadoopConfiguration: Configuration 53 | ): SparkDistCPOptions = { 54 | 55 | val fn = filters 56 | .map(f => { 57 | try { 58 | val path = new Path(f) 59 | val fs = path.getFileSystem(hadoopConfiguration) 60 | 61 | val in = fs.open(path) 62 | 63 | val r = scala.io.Source.fromInputStream(in).getLines().map(_.r).toList 64 | 65 | in.close() 66 | r 67 | } catch { 68 | case e: IOException => 69 | throw new RuntimeException("Invalid filter file " + f, e) 70 | } 71 | }) 72 | .getOrElse(List.empty) 73 | 74 | this.copy(filterNot = fn) 75 | 76 | } 77 | 78 | } 79 | 80 | object SparkDistCPOptions { 81 | 82 | object Defaults { 83 | val update: Boolean = false 84 | val overwrite: Boolean = false 85 | val delete: Boolean = false 86 | val log: Option[URI] = None 87 | val ignoreErrors: Boolean = false 88 | val dryRun: Boolean = false 89 | val consistentPathBehaviour: Boolean = false 90 | val maxFilesPerTask: Int = 1000 91 | val maxBytesPerTask: Long = 1073741824L 92 | val filters: Option[URI] = None 93 | val filterNot: List[Regex] = List.empty 94 | val numListstatusThreads: Int = 10 95 | val verbose: Boolean = false 96 | } 97 | 98 | } 99 | -------------------------------------------------------------------------------- /src/main/scala/com/coxautodata/objects/Accumulators.scala: -------------------------------------------------------------------------------- 1 | package com.coxautodata.objects 2 | 3 | import com.coxautodata.utils.FileUtils 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.util.LongAccumulator 6 | 7 | import scala.jdk.CollectionConverters._ 8 | 9 | class Accumulators(sparkSession: SparkSession) extends Serializable { 10 | 11 | def handleResult(result: DistCPResult): Unit = result match { 12 | case DeleteResult( 13 | _, 14 | DeleteActionResult.SkippedDoesNotExists | 15 | DeleteActionResult.SkippedDryRun 16 | ) => 17 | deleteOperationsSkipped.add(1) 18 | case DeleteResult(_, DeleteActionResult.Deleted) => 19 | deleteOperationsSuccessful.add(1) 20 | case DeleteResult(_, DeleteActionResult.Failed(e)) => 21 | deleteOperationsSkipped.add(1) 22 | deleteOperationsFailed.add(1) 23 | exceptionCount.add(e) 24 | case DirectoryCopyResult( 25 | _, 26 | _, 27 | CopyActionResult.SkippedAlreadyExists | CopyActionResult.SkippedDryRun 28 | ) => 29 | foldersSkipped.add(1) 30 | case 
DirectoryCopyResult(_, _, CopyActionResult.Created) => 31 | foldersCreated.add(1) 32 | case DirectoryCopyResult(_, _, CopyActionResult.Failed(e)) => 33 | foldersFailed.add(1) 34 | foldersSkipped.add(1) 35 | exceptionCount.add(e) 36 | case FileCopyResult( 37 | _, 38 | _, 39 | l, 40 | CopyActionResult.SkippedAlreadyExists | 41 | CopyActionResult.SkippedIdenticalFileAlreadyExists | 42 | CopyActionResult.SkippedDryRun 43 | ) => 44 | filesSkipped.add(1) 45 | bytesSkipped.add(l) 46 | case FileCopyResult(_, _, l, CopyActionResult.Copied) => 47 | filesCopied.add(1) 48 | bytesCopied.add(l) 49 | case FileCopyResult(_, _, l, CopyActionResult.OverwrittenOrUpdated) => 50 | filesCopied.add(1) 51 | bytesCopied.add(l) 52 | filesUpdatedOrOverwritten.add(1) 53 | case FileCopyResult(_, _, l, CopyActionResult.Failed(e)) => 54 | filesFailed.add(1) 55 | exceptionCount.add(e) 56 | filesSkipped.add(1) 57 | bytesSkipped.add(l) 58 | } 59 | 60 | def getOutputText: String = { 61 | val intFormatter = java.text.NumberFormat.getIntegerInstance 62 | s"""--Raw data-- 63 | |Data copied: ${FileUtils.byteCountToDisplaySize(bytesCopied.value)} 64 | |Data skipped (already existing files, dry-run and failures): ${FileUtils 65 | .byteCountToDisplaySize(bytesSkipped.value)} 66 | |--Files-- 67 | |Files copied (new files and overwritten/updated files): ${intFormatter 68 | .format(filesCopied.value)} 69 | |Files overwritten/updated: ${intFormatter.format( 70 | filesUpdatedOrOverwritten.value 71 | )} 72 | |Skipped files for copying (already existing files, dry-run and failures): ${intFormatter 73 | .format(filesSkipped.value)} 74 | |Failed files during copy: ${intFormatter.format(filesFailed.value)} 75 | |--Folders-- 76 | |Folders created: ${intFormatter.format(foldersCreated.value)} 77 | |Skipped folder creates (already existing folders, dry-run and failures): ${intFormatter 78 | .format(foldersSkipped.value)} 79 | |Failed folder creates: ${intFormatter.format(foldersFailed.value)} 80 | |--Deletes-- 81 | |Successful delete operations: ${intFormatter.format( 82 | deleteOperationsSuccessful.value 83 | )} 84 | |Skipped delete operations (files/folders already missing, dry-run and failures): ${intFormatter 85 | .format(deleteOperationsSkipped.value)} 86 | |Failed delete operations: ${intFormatter.format( 87 | deleteOperationsFailed.value 88 | )} 89 | |--Exception counts-- 90 | |""".stripMargin ++ 91 | exceptionCount.value.asScala.toSeq 92 | .sortWith { case ((_, v1), (_, v2)) => v1 > v2 } 93 | .map { case (k, v) => s"$k: ${intFormatter.format(v)}" } 94 | .mkString("\n") 95 | } 96 | 97 | val bytesCopied: LongAccumulator = 98 | sparkSession.sparkContext.longAccumulator("BytesCopied") 99 | val bytesSkipped: LongAccumulator = sparkSession.sparkContext.longAccumulator( 100 | "BytesSkipped" 101 | ) // Already exists, dryrun and failure 102 | 103 | val foldersCreated: LongAccumulator = 104 | sparkSession.sparkContext.longAccumulator("FoldersCreated") 105 | val foldersSkipped: LongAccumulator = 106 | sparkSession.sparkContext.longAccumulator("FoldersSkipped") 107 | val foldersFailed: LongAccumulator = 108 | sparkSession.sparkContext.longAccumulator("FoldersFailed") 109 | 110 | val filesCopied: LongAccumulator = 111 | sparkSession.sparkContext.longAccumulator("FilesCopied") 112 | val filesSkipped: LongAccumulator = sparkSession.sparkContext.longAccumulator( 113 | "FilesSkipped" 114 | ) // Already exists, dryrun and failure 115 | val filesFailed: LongAccumulator = 116 | sparkSession.sparkContext.longAccumulator("FilesFailed") 117 | val 
filesUpdatedOrOverwritten: LongAccumulator = 118 | sparkSession.sparkContext.longAccumulator("FilesUpdatedOrOverwritten") 119 | 120 | val deleteOperationsSuccessful: LongAccumulator = 121 | sparkSession.sparkContext.longAccumulator("DeleteOperationsSuccessful") 122 | val deleteOperationsSkipped: LongAccumulator = 123 | sparkSession.sparkContext.longAccumulator( 124 | "DeleteOperationsSkipped" 125 | ) // Already exists, dryrun and failure 126 | val deleteOperationsFailed: LongAccumulator = 127 | sparkSession.sparkContext.longAccumulator("DeleteOperationsFailed") 128 | 129 | val exceptionCount: ExceptionCountAccumulator = new ExceptionCountAccumulator 130 | sparkSession.sparkContext.register(exceptionCount, "ExceptionCount") 131 | } 132 | -------------------------------------------------------------------------------- /src/main/scala/com/coxautodata/objects/ConfigSerDeser.scala: -------------------------------------------------------------------------------- 1 | package com.coxautodata.objects 2 | 3 | /* 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import org.apache.hadoop.conf.Configuration 21 | 22 | /** Class to make Hadoop configurations serializable; uses the `Writeable` 23 | * operations to do this. Note: this only serializes the explicitly set values, 24 | * not any set in site/default or other XML resources. 25 | */ 26 | class ConfigSerDeser(var conf: Configuration) extends Serializable { 27 | 28 | def this() = { 29 | this(new Configuration()) 30 | } 31 | 32 | def get(): Configuration = conf 33 | 34 | private def writeObject(out: java.io.ObjectOutputStream): Unit = { 35 | conf.write(out) 36 | } 37 | 38 | private def readObject(in: java.io.ObjectInputStream): Unit = { 39 | conf = new Configuration() 40 | conf.readFields(in) 41 | } 42 | 43 | private def readObjectNoData(): Unit = { 44 | conf = new Configuration() 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/com/coxautodata/objects/CopyPartitioner.scala: -------------------------------------------------------------------------------- 1 | package com.coxautodata.objects 2 | 3 | import org.apache.spark.Partitioner 4 | import org.apache.spark.rdd.RDD 5 | 6 | /** Custom partitioner based on the indexes array containing (partitionid, 7 | * number of batches within partition) Will handle missing partitions. 
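 * A worked sketch (hypothetical values) of how the fields defined below behave:
 * {{{
 *   val p = CopyPartitioner(Array((0, 2), (1, 1)))
 *   // numPartitions    = (2 + 1) + 2 = 5
 *   // partitionOffsets = Map(0 -> 0, 1 -> 3)
 *   p.getPartition((0, 2)) // == 2
 *   p.getPartition((1, 1)) // == 4
 * }}}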
8 | */ 9 | case class CopyPartitioner(indexes: Array[(Int, Int)]) extends Partitioner { 10 | 11 | val indexesAsMap: Map[Int, Int] = indexes.toMap 12 | 13 | override val numPartitions: Int = indexes.map(_._2).sum + indexes.length 14 | 15 | val partitionOffsets: Map[Int, Int] = { 16 | indexes 17 | .scanRight((-1, numPartitions)) { 18 | case ((partition, maxKey), (_, previousOffset)) => 19 | (partition, previousOffset - maxKey - 1) 20 | } 21 | .dropRight(1) 22 | .toMap 23 | } 24 | 25 | override def getPartition(key: Any): Int = key match { 26 | case (p: Int, i: Int) => 27 | if (!indexesAsMap.keySet.contains(p)) 28 | throw new RuntimeException( 29 | s"Key partition $p of key [($p, $i)] was not found in the indexes [${indexesAsMap.keySet.mkString(", ")}]." 30 | ) 31 | // Modulo the batch id to prevent exceptions if the batch id is out of range 32 | partitionOffsets(p) + (i % (indexesAsMap(p) + 1)) 33 | case u => 34 | throw new RuntimeException( 35 | s"Partitioner does not support key [$u]. Must be (Int, Int)." 36 | ) 37 | } 38 | 39 | } 40 | 41 | object CopyPartitioner { 42 | def apply( 43 | rdd: RDD[((Int, Int), CopyDefinitionWithDependencies)] 44 | ): CopyPartitioner = new CopyPartitioner( 45 | rdd.map(_._1).reduceByKey(_ max _).collect() 46 | ) 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/com/coxautodata/objects/CopyResult.scala: -------------------------------------------------------------------------------- 1 | package com.coxautodata.objects 2 | 3 | import java.net.URI 4 | 5 | /** Result of the DistCP copy used for both logging to a logger and a file. 6 | */ 7 | trait CopyResult extends DistCPResult 8 | 9 | case class FileCopyResult( 10 | source: URI, 11 | destination: URI, 12 | len: Long, 13 | copyAction: FileCopyActionResult 14 | ) extends CopyResult { 15 | def getMessage: String = 16 | s"Source: [$source], Destination: [$destination], Type: [FileCopy: $len bytes], Result: [${copyAction.message}]" 17 | } 18 | 19 | case class DirectoryCopyResult( 20 | source: URI, 21 | destination: URI, 22 | copyAction: DirectoryCreateActionResult 23 | ) extends CopyResult { 24 | def getMessage: String = 25 | s"Source: [$source], Destination: [$destination], Type: [DirectoryCreate], Result: [${copyAction.message}]" 26 | } 27 | 28 | sealed trait CopyActionResult extends Serializable { 29 | def message: String = this.getClass.getSimpleName.stripSuffix("$") 30 | } 31 | 32 | sealed trait FileCopyActionResult extends CopyActionResult 33 | 34 | sealed trait DirectoryCreateActionResult extends CopyActionResult 35 | 36 | object CopyActionResult { 37 | 38 | object SkippedAlreadyExists 39 | extends FileCopyActionResult 40 | with DirectoryCreateActionResult 41 | 42 | object SkippedIdenticalFileAlreadyExists extends FileCopyActionResult 43 | 44 | object SkippedDryRun 45 | extends FileCopyActionResult 46 | with DirectoryCreateActionResult 47 | 48 | object Created extends DirectoryCreateActionResult 49 | 50 | object Copied extends FileCopyActionResult 51 | 52 | object OverwrittenOrUpdated extends FileCopyActionResult 53 | 54 | case class Failed(e: Throwable) 55 | extends FileCopyActionResult 56 | with DirectoryCreateActionResult { 57 | override def message: String = s"${super.message}: ${e.getMessage}" 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/com/coxautodata/objects/DeleteResult.scala: -------------------------------------------------------------------------------- 1 |
package com.coxautodata.objects 2 | 3 | import java.net.URI 4 | 5 | /** Result of the DistCP delete used for both logging to a logger and a file. 6 | */ 7 | case class DeleteResult(path: URI, actionResult: DeleteActionResult) 8 | extends DistCPResult { 9 | def getMessage: String = 10 | s"Path: [$path], Type: [Delete], Result: [${actionResult.message}]" 11 | } 12 | 13 | sealed trait DeleteActionResult extends Serializable { 14 | def message: String = this.getClass.getSimpleName 15 | } 16 | 17 | object DeleteActionResult { 18 | 19 | object SkippedDoesNotExists extends DeleteActionResult 20 | 21 | object SkippedDryRun extends DeleteActionResult 22 | 23 | object Deleted extends DeleteActionResult 24 | 25 | case class Failed(e: Throwable) extends DeleteActionResult { 26 | override def message: String = s"${super.message}: ${e.getMessage}" 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/com/coxautodata/objects/DistCPResult.scala: -------------------------------------------------------------------------------- 1 | package com.coxautodata.objects 2 | 3 | /** Result of the DistCP action (copy/delete) used for both logging to a logger 4 | * and a file. 5 | */ 6 | trait DistCPResult extends Serializable { 7 | 8 | def getMessage: String 9 | 10 | } 11 | -------------------------------------------------------------------------------- /src/main/scala/com/coxautodata/objects/ExceptionCountAccumulator.scala: -------------------------------------------------------------------------------- 1 | package com.coxautodata.objects 2 | 3 | import java.util 4 | import java.util.Collections 5 | import java.util.function.{BiConsumer, BiFunction} 6 | 7 | import org.apache.spark.util.AccumulatorV2 8 | 9 | class ExceptionCountAccumulator 10 | extends AccumulatorV2[String, java.util.Map[String, Long]] { 11 | 12 | private val _map: java.util.Map[String, Long] = 13 | Collections.synchronizedMap(new util.HashMap[String, Long]()) 14 | 15 | override def isZero: Boolean = _map.isEmpty 16 | 17 | override def copyAndReset(): ExceptionCountAccumulator = 18 | new ExceptionCountAccumulator 19 | 20 | override def copy(): ExceptionCountAccumulator = { 21 | val newAcc = new ExceptionCountAccumulator 22 | _map.synchronized { 23 | newAcc._map.putAll(_map) 24 | } 25 | newAcc 26 | } 27 | 28 | override def reset(): Unit = _map.clear() 29 | 30 | def add(e: Throwable): Unit = add(e.getClass.getName.stripSuffix("$")) 31 | 32 | override def add(k: String): Unit = { 33 | add(k, 1) 34 | } 35 | 36 | private def add(k: String, v: Long): Unit = { 37 | _map.merge(k, v, CombineCounts) 38 | } 39 | 40 | override def merge( 41 | other: AccumulatorV2[String, util.Map[String, Long]] 42 | ): Unit = { 43 | other match { 44 | case e: ExceptionCountAccumulator => 45 | e._map.forEach { 46 | new BiConsumer[String, Long] { 47 | override def accept(k: String, v: Long): Unit = add(k, v) 48 | } 49 | } 50 | case _ => 51 | throw new UnsupportedOperationException( 52 | s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}" 53 | ) 54 | } 55 | } 56 | 57 | override def value: util.Map[String, Long] = _map 58 | } 59 | 60 | object CombineCounts extends BiFunction[Long, Long, Long] { 61 | override def apply(t: Long, u: Long): Long = t + u 62 | } 63 | -------------------------------------------------------------------------------- /src/main/scala/com/coxautodata/objects/FileSystemObjectCacher.scala: -------------------------------------------------------------------------------- 1 | 
package com.coxautodata.objects 2 | 3 | import java.net.URI 4 | 5 | import com.coxautodata.utils.PathUtils 6 | import org.apache.hadoop.conf.Configuration 7 | import org.apache.hadoop.fs.FileSystem 8 | 9 | import scala.collection.mutable 10 | 11 | /** FileSystem caching class. Aims to prevent many of the same FileSystem 12 | * objects being created between copy and delete actions in the same partition. 13 | */ 14 | class FileSystemObjectCacher(hadoopConfiguration: Configuration) { 15 | 16 | private val cache: mutable.Map[URI, FileSystem] = mutable.Map.empty 17 | 18 | /** Get a FileSystem object based on the given URI if it already exists. If it 19 | * doesn't exist, create one and store it. 20 | */ 21 | def getOrCreate(uri: URI): FileSystem = get(uri) match { 22 | case Some(fs) => fs 23 | case None => 24 | val fs = FileSystem.get(uri, hadoopConfiguration) 25 | cache.update(fs.getUri, fs) 26 | fs 27 | } 28 | 29 | /** Get a FileSystem object based on the given URI if it already exists. 30 | */ 31 | def get(uri: URI): Option[FileSystem] = cache.collectFirst { 32 | case (u, f) if PathUtils.uriIsChild(u.resolve("/"), uri) => f 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/com/coxautodata/objects/Logging.scala: -------------------------------------------------------------------------------- 1 | package com.coxautodata.objects 2 | 3 | import org.apache.log4j.{Level, LogManager, Logger} 4 | 5 | trait Logging { 6 | 7 | // Method to get the logger name for this object 8 | protected def logName: String = { 9 | // Ignore trailing $'s in the class names for Scala objects 10 | this.getClass.getName.stripSuffix("$") 11 | } 12 | 13 | private val log: Logger = LogManager.getLogger(logName) 14 | 15 | // Set logger level 16 | protected def setLogLevel(level: Level): Unit = log.setLevel(level) 17 | 18 | // Log methods that take only a String 19 | protected def logInfo(msg: => String): Unit = { 20 | if (log.isInfoEnabled) log.info(msg) 21 | } 22 | 23 | protected def logDebug(msg: => String): Unit = { 24 | if (log.isDebugEnabled) log.debug(msg) 25 | } 26 | 27 | protected def logTrace(msg: => String): Unit = { 28 | if (log.isTraceEnabled) log.trace(msg) 29 | } 30 | 31 | protected def logWarning(msg: => String): Unit = { 32 | log.warn(msg) 33 | } 34 | 35 | protected def logError(msg: => String): Unit = { 36 | log.error(msg) 37 | } 38 | 39 | // Log methods that take Throwables (Exceptions/Errors) too 40 | protected def logInfo(msg: => String, throwable: Throwable): Unit = { 41 | if (log.isInfoEnabled) log.info(msg, throwable) 42 | } 43 | 44 | protected def logDebug(msg: => String, throwable: Throwable): Unit = { 45 | if (log.isDebugEnabled) log.debug(msg, throwable) 46 | } 47 | 48 | protected def logTrace(msg: => String, throwable: Throwable): Unit = { 49 | if (log.isTraceEnabled) log.trace(msg, throwable) 50 | } 51 | 52 | protected def logWarning(msg: => String, throwable: Throwable): Unit = { 53 | log.warn(msg, throwable) 54 | } 55 | 56 | protected def logError(msg: => String, throwable: Throwable): Unit = { 57 | log.error(msg, throwable) 58 | } 59 | 60 | protected def isTraceEnabled: Boolean = { 61 | log.isTraceEnabled 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /src/main/scala/com/coxautodata/objects/SerializableFileStatus.scala: -------------------------------------------------------------------------------- 1 | package com.coxautodata.objects 2 | 3 | import java.net.URI 4 | 
5 | import org.apache.hadoop.fs.{FileStatus, Path} 6 | 7 | /** Case class to represent a simple status of a File. Exists because 8 | * [[FileStatus]] is not serializable 9 | */ 10 | case class SerializableFileStatus(uri: URI, len: Long, fileType: FileType) 11 | extends Serializable { 12 | def getPath: Path = new Path(uri) 13 | 14 | def getLen: Long = len 15 | 16 | def isDirectory: Boolean = fileType == Directory 17 | 18 | def isFile: Boolean = fileType == File 19 | } 20 | 21 | object SerializableFileStatus { 22 | 23 | /** Create a [[SerializableFileStatus]] from a [[FileStatus]] object 24 | */ 25 | def apply(fileStatus: FileStatus): SerializableFileStatus = { 26 | 27 | val fileType = 28 | if (fileStatus.isDirectory) Directory 29 | else if (fileStatus.isFile) File 30 | else 31 | throw new RuntimeException( 32 | s"File [$fileStatus] is neither a directory or file" 33 | ) 34 | 35 | new SerializableFileStatus( 36 | fileStatus.getPath.toUri, 37 | fileStatus.getLen, 38 | fileType 39 | ) 40 | } 41 | } 42 | 43 | sealed trait FileType extends Serializable 44 | 45 | case object File extends FileType 46 | 47 | case object Directory extends FileType 48 | -------------------------------------------------------------------------------- /src/main/scala/com/coxautodata/objects/SingleCopyDefinition.scala: -------------------------------------------------------------------------------- 1 | package com.coxautodata.objects 2 | 3 | import java.net.URI 4 | 5 | import com.coxautodata.SparkDistCP.KeyedCopyDefinition 6 | 7 | /** Definition of a single copy 8 | * 9 | * @param source 10 | * Source file/folder to copy 11 | * @param destination 12 | * Destination to copy to 13 | */ 14 | case class SingleCopyDefinition( 15 | source: SerializableFileStatus, 16 | destination: URI 17 | ) 18 | 19 | /** Definition of a copy that includes any copying of parent folders this 20 | * file/folder depends on 21 | * 22 | * @param source 23 | * Source file/folder to copy 24 | * @param destination 25 | * Destination to copy to 26 | * @param dependentFolders 27 | * Any dependent folder copies this file/folder depends on 28 | */ 29 | case class CopyDefinitionWithDependencies( 30 | source: SerializableFileStatus, 31 | destination: URI, 32 | dependentFolders: Seq[SingleCopyDefinition] 33 | ) { 34 | 35 | def toKeyedDefinition: KeyedCopyDefinition = (destination, this) 36 | 37 | def getAllCopyDefinitions: Seq[SingleCopyDefinition] = 38 | dependentFolders :+ SingleCopyDefinition(source, destination) 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/com/coxautodata/utils/CopyUtils.scala: -------------------------------------------------------------------------------- 1 | package com.coxautodata.utils 2 | 3 | import java.io.FileNotFoundException 4 | import java.net.URI 5 | 6 | import com.coxautodata.SparkDistCPOptions 7 | import com.coxautodata.objects._ 8 | import org.apache.hadoop.fs._ 9 | import org.apache.hadoop.io.IOUtils 10 | import org.apache.log4j.Level 11 | 12 | import scala.util.{Failure, Success, Try} 13 | 14 | object CopyUtils extends Logging { 15 | 16 | /** Handle the copy of a file/folder 17 | * 18 | * @param sourceFS 19 | * Source FileSystem object 20 | * @param destFS 21 | * Destination FileSystem object 22 | * @param definition 23 | * Definition of the copy 24 | * @param options 25 | * SparkDistCP options 26 | * @param taskAttemptID 27 | * Spark task attempt ID (used to create a unique temporary file) 28 | */ 29 | def handleCopy( 30 | sourceFS: FileSystem, 31 | 
destFS: FileSystem, 32 | definition: SingleCopyDefinition, 33 | options: SparkDistCPOptions, 34 | taskAttemptID: Long 35 | ): DistCPResult = { 36 | 37 | if (options.verbose) setLogLevel(Level.DEBUG) 38 | 39 | val r = { 40 | if (definition.source.isDirectory) { 41 | CopyUtils.createDirectory(destFS, definition, options) 42 | } else if (definition.source.isFile) { 43 | CopyUtils.copyFile(sourceFS, destFS, definition, options, taskAttemptID) 44 | } else 45 | throw new UnsupportedOperationException( 46 | s"Given file is neither file nor directory. Copy unsupported: ${definition.source.getPath}" 47 | ) 48 | } 49 | 50 | logInfo(r.getMessage) 51 | r 52 | } 53 | 54 | /** Handle the delete of a file/folder 55 | * 56 | * @param fs 57 | * FileSystem to delete the file from 58 | * @param uri 59 | * URI of file/path 60 | * @param options 61 | * DistCP options 62 | */ 63 | def handleDelete( 64 | fs: FileSystem, 65 | uri: URI, 66 | options: SparkDistCPOptions 67 | ): DeleteResult = { 68 | 69 | if (options.verbose) setLogLevel(Level.DEBUG) 70 | 71 | val path = new Path(uri) 72 | 73 | val r = deleteFile(fs, path, options) 74 | logInfo(r.getMessage) 75 | r 76 | 77 | } 78 | 79 | /** Internal delete function 80 | */ 81 | private[utils] def deleteFile( 82 | fs: FileSystem, 83 | path: Path, 84 | options: SparkDistCPOptions 85 | ): DeleteResult = { 86 | if (!fs.exists(path)) { 87 | DeleteResult(path.toUri, DeleteActionResult.SkippedDoesNotExists) 88 | } else if (options.dryRun) { 89 | DeleteResult(path.toUri, DeleteActionResult.SkippedDryRun) 90 | } else { 91 | Try(fs.delete(path, true)) match { 92 | case Success(true) => 93 | DeleteResult(path.toUri, DeleteActionResult.Deleted) 94 | case Success(false) if !fs.exists(path) => 95 | DeleteResult(path.toUri, DeleteActionResult.SkippedDoesNotExists) 96 | case Success(false) if options.ignoreErrors => 97 | DeleteResult( 98 | path.toUri, 99 | DeleteActionResult.Failed( 100 | new RuntimeException(s"Failed to delete directory [$path].") 101 | ) 102 | ) 103 | case Success(false) => 104 | throw new RuntimeException(s"Failed to delete directory [$path].") 105 | case Failure(e) if options.ignoreErrors => 106 | DeleteResult(path.toUri, DeleteActionResult.Failed(e)) 107 | case Failure(e) => throw e 108 | } 109 | } 110 | } 111 | 112 | /** Internal create directory function 113 | */ 114 | private[utils] def createDirectory( 115 | destFS: FileSystem, 116 | definition: SingleCopyDefinition, 117 | options: SparkDistCPOptions 118 | ): DirectoryCopyResult = { 119 | val destPath = new Path(definition.destination) 120 | if (destFS.exists(destPath)) { 121 | DirectoryCopyResult( 122 | definition.source.getPath.toUri, 123 | definition.destination, 124 | CopyActionResult.SkippedAlreadyExists 125 | ) 126 | } else if (options.dryRun) { 127 | DirectoryCopyResult( 128 | definition.source.getPath.toUri, 129 | definition.destination, 130 | CopyActionResult.SkippedDryRun 131 | ) 132 | } else { 133 | val result = Try { 134 | if (destFS.exists(destPath.getParent)) { 135 | destFS.mkdirs(destPath) 136 | DirectoryCopyResult( 137 | definition.source.getPath.toUri, 138 | definition.destination, 139 | CopyActionResult.Created 140 | ) 141 | } else 142 | throw new FileNotFoundException( 143 | s"Parent folder [${destPath.getParent}] does not exist." 
144 | ) 145 | } 146 | .recover { case _: FileAlreadyExistsException => 147 | DirectoryCopyResult( 148 | definition.source.getPath.toUri, 149 | definition.destination, 150 | CopyActionResult.SkippedAlreadyExists 151 | ) 152 | } 153 | result match { 154 | case Success(v) => v 155 | case Failure(e) if options.ignoreErrors => 156 | logError( 157 | s"Exception whilst creating directory [${definition.destination}]", 158 | e 159 | ) 160 | DirectoryCopyResult( 161 | definition.source.getPath.toUri, 162 | definition.destination, 163 | CopyActionResult.Failed(e) 164 | ) 165 | case Failure(e) => 166 | throw e 167 | } 168 | } 169 | } 170 | 171 | /** Internal copy file function 172 | */ 173 | private[utils] def copyFile( 174 | sourceFS: FileSystem, 175 | destFS: FileSystem, 176 | definition: SingleCopyDefinition, 177 | options: SparkDistCPOptions, 178 | taskAttemptID: Long 179 | ): FileCopyResult = { 180 | val destPath = new Path(definition.destination) 181 | Try(destFS.getFileStatus(destPath)) match { 182 | case Failure(_: FileNotFoundException) if options.dryRun => 183 | FileCopyResult( 184 | definition.source.getPath.toUri, 185 | definition.destination, 186 | definition.source.len, 187 | CopyActionResult.SkippedDryRun 188 | ) 189 | case Failure(_: FileNotFoundException) => 190 | performCopy( 191 | sourceFS, 192 | definition.source, 193 | destFS, 194 | definition.destination, 195 | removeExisting = false, 196 | ignoreErrors = options.ignoreErrors, 197 | taskAttemptID 198 | ) 199 | case Failure(e) if options.ignoreErrors => 200 | logError( 201 | s"Exception whilst getting destination file information [${definition.destination}]", 202 | e 203 | ) 204 | FileCopyResult( 205 | definition.source.getPath.toUri, 206 | definition.destination, 207 | definition.source.len, 208 | CopyActionResult.Failed(e) 209 | ) 210 | case Failure(e) => 211 | throw e 212 | case Success(_) if options.overwrite && options.dryRun => 213 | FileCopyResult( 214 | definition.source.getPath.toUri, 215 | definition.destination, 216 | definition.source.len, 217 | CopyActionResult.SkippedDryRun 218 | ) 219 | case Success(_) if options.overwrite => 220 | performCopy( 221 | sourceFS, 222 | definition.source, 223 | destFS, 224 | definition.destination, 225 | removeExisting = true, 226 | ignoreErrors = options.ignoreErrors, 227 | taskAttemptID 228 | ) 229 | case Success(d) if options.update => 230 | Try { 231 | filesAreIdentical( 232 | definition.source, 233 | Option(sourceFS.getFileChecksum(definition.source.getPath)), 234 | SerializableFileStatus(d), 235 | Option(destFS.getFileChecksum(destPath)) 236 | ) 237 | } match { 238 | case Failure(e) if options.ignoreErrors => 239 | logError( 240 | s"Exception whilst getting source and destination checksum: source [${definition.source.getPath}] destination [${definition.destination}", 241 | e 242 | ) 243 | FileCopyResult( 244 | definition.source.getPath.toUri, 245 | definition.destination, 246 | definition.source.len, 247 | CopyActionResult.Failed(e) 248 | ) 249 | case Failure(e) => 250 | throw e 251 | case Success(true) => 252 | FileCopyResult( 253 | definition.source.getPath.toUri, 254 | definition.destination, 255 | definition.source.len, 256 | CopyActionResult.SkippedIdenticalFileAlreadyExists 257 | ) 258 | case Success(false) if options.dryRun => 259 | FileCopyResult( 260 | definition.source.getPath.toUri, 261 | definition.destination, 262 | definition.source.len, 263 | CopyActionResult.SkippedDryRun 264 | ) 265 | case Success(false) => 266 | performCopy( 267 | sourceFS, 268 | 
definition.source, 269 | destFS, 270 | definition.destination, 271 | removeExisting = true, 272 | ignoreErrors = options.ignoreErrors, 273 | taskAttemptID 274 | ) 275 | } 276 | case Success(_) => 277 | FileCopyResult( 278 | definition.source.getPath.toUri, 279 | definition.destination, 280 | definition.source.len, 281 | CopyActionResult.SkippedAlreadyExists 282 | ) 283 | } 284 | } 285 | 286 | /** Check whether two files match, based on length and checksum. If either of 287 | * the checksums are None, then checksums are not used for comparison. 288 | */ 289 | private[utils] def filesAreIdentical( 290 | f1: SerializableFileStatus, 291 | mc1: => Option[FileChecksum], 292 | f2: SerializableFileStatus, 293 | mc2: => Option[FileChecksum] 294 | ): Boolean = { 295 | if (f1.getLen != f2.getLen) { 296 | logDebug( 297 | s"Length [${f1.getLen}] of file [${f1.uri}] was not the same as length [${f2.getLen}] of file [${f2.uri}]. Files are not identical." 298 | ) 299 | false 300 | } else { 301 | val c1 = mc1 302 | val c2 = mc2 303 | val same = mc1.flatMap(c1 => mc2.map(c1 ==)).getOrElse(true) 304 | if (same) { 305 | logDebug( 306 | s"CRC [$c1] of file [${f1.uri}] was the same as CRC [$c2] of file [${f2.uri}]. Files are identical." 307 | ) 308 | true 309 | } else { 310 | logDebug( 311 | s"CRC [$c1] of file [${f1.uri}] was not the same as CRC [$c2] of file [${f2.uri}]. Files are not identical." 312 | ) 313 | false 314 | } 315 | 316 | } 317 | 318 | } 319 | 320 | /** Internal copy function Only pass in true for removeExisting if the file 321 | * actually exists 322 | */ 323 | def performCopy( 324 | sourceFS: FileSystem, 325 | sourceFile: SerializableFileStatus, 326 | destFS: FileSystem, 327 | dest: URI, 328 | removeExisting: Boolean, 329 | ignoreErrors: Boolean, 330 | taskAttemptID: Long 331 | ): FileCopyResult = { 332 | 333 | val destPath = new Path(dest) 334 | 335 | val tempPath = new Path( 336 | destPath.getParent, 337 | s".sparkdistcp.$taskAttemptID.${destPath.getName}" 338 | ) 339 | 340 | Try { 341 | var in: Option[FSDataInputStream] = None 342 | var out: Option[FSDataOutputStream] = None 343 | try { 344 | in = Some(sourceFS.open(sourceFile.getPath)) 345 | if (!destFS.exists(tempPath.getParent)) 346 | throw new RuntimeException( 347 | s"Destination folder [${tempPath.getParent}] does not exist" 348 | ) 349 | out = Some(destFS.create(tempPath, false)) 350 | IOUtils.copyBytes( 351 | in.get, 352 | out.get, 353 | sourceFS.getConf.getInt("io.file.buffer.size", 4096) 354 | ) 355 | 356 | } catch { 357 | case e: Throwable => throw e 358 | } finally { 359 | in.foreach(_.close()) 360 | out.foreach(_.close()) 361 | } 362 | }.map { _ => 363 | val tempFile = destFS.getFileStatus(tempPath) 364 | if (sourceFile.getLen != tempFile.getLen) 365 | throw new RuntimeException( 366 | s"Written file [${tempFile.getPath}] length [${tempFile.getLen}] did not match source file [${sourceFile.getPath}] length [${sourceFile.getLen}]" 367 | ) 368 | 369 | if (removeExisting) { 370 | val res = destFS.delete(destPath, false) 371 | if (!res) 372 | throw new RuntimeException( 373 | s"Failed to clean up existing file [$destPath]" 374 | ) 375 | } 376 | if (destFS.exists(destPath)) 377 | throw new RuntimeException( 378 | s"Cannot create file [$destPath] as it already exists" 379 | ) 380 | val res = destFS.rename(tempPath, destPath) 381 | if (!res) 382 | throw new RuntimeException( 383 | s"Failed to rename temporary file [$tempPath] to [$destPath]" 384 | ) 385 | } match { 386 | case Success(_) if removeExisting => 387 | FileCopyResult( 388 | 
sourceFile.getPath.toUri, 389 | dest, 390 | sourceFile.len, 391 | CopyActionResult.OverwrittenOrUpdated 392 | ) 393 | case Success(_) => 394 | FileCopyResult( 395 | sourceFile.getPath.toUri, 396 | dest, 397 | sourceFile.len, 398 | CopyActionResult.Copied 399 | ) 400 | case Failure(e) if ignoreErrors => 401 | logError( 402 | s"Failed to copy file [${sourceFile.getPath}] to [$destPath]", 403 | e 404 | ) 405 | FileCopyResult( 406 | sourceFile.getPath.toUri, 407 | dest, 408 | sourceFile.len, 409 | CopyActionResult.Failed(e) 410 | ) 411 | case Failure(e) => 412 | throw e 413 | } 414 | 415 | } 416 | 417 | } 418 | -------------------------------------------------------------------------------- /src/main/scala/com/coxautodata/utils/FileListUtils.scala: -------------------------------------------------------------------------------- 1 | package com.coxautodata.utils 2 | 3 | import java.net.URI 4 | import java.util.UUID 5 | import java.util.concurrent.{ConcurrentHashMap, Executors, TimeUnit} 6 | 7 | import com.coxautodata.SparkDistCP.KeyedCopyDefinition 8 | import com.coxautodata.SparkDistCPOptions 9 | import com.coxautodata.objects.{ 10 | CopyDefinitionWithDependencies, 11 | Logging, 12 | SerializableFileStatus, 13 | SingleCopyDefinition 14 | } 15 | import org.apache.hadoop.fs.{FileSystem, Path, RemoteIterator} 16 | import org.apache.spark.SparkContext 17 | import org.apache.spark.rdd.RDD 18 | 19 | import scala.jdk.CollectionConverters._ 20 | import scala.concurrent.duration.Duration 21 | import scala.concurrent.{Await, Future} 22 | import scala.util.Try 23 | import scala.util.matching.Regex 24 | 25 | object FileListUtils extends Logging { 26 | 27 | /** Turn a [[RemoteIterator]] into a Scala [[Iterator]] 28 | */ 29 | private implicit class ScalaRemoteIterator[T](underlying: RemoteIterator[T]) 30 | extends Iterator[T] { 31 | override def hasNext: Boolean = underlying.hasNext 32 | 33 | override def next(): T = underlying.next() 34 | } 35 | 36 | /** Recursively list files in a given directory on a given FileSystem. This 37 | * will be done in parallel depending on the value of `threads`. An optional 38 | * list of regex filters to filter out files can be given. 
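 * For example (illustrative path and filter only):
 * {{{
 * listFiles(fs, new Path("/data"), threads = 10, includePathRootInDependents = false,
 *   filterNot = List(".*/_temporary($|/.*)".r))
 * }}}
 * skips Spark `_temporary` directories, mirroring the patterns shipped in
 * src/test/resources/com/coxautodata/test.filters.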
39 | * 40 | * @param fs 41 | * FileSystem to search 42 | * @param path 43 | * Root path to search from 44 | * @param threads 45 | * Number of threads to search in parallel 46 | * @param includePathRootInDependents 47 | * Whether to include the root path `path` in the search output 48 | * @param filterNot 49 | * A list of regex filters that will filter out any results that match one 50 | * or more of the filters 51 | */ 52 | def listFiles( 53 | fs: FileSystem, 54 | path: Path, 55 | threads: Int, 56 | includePathRootInDependents: Boolean, 57 | filterNot: List[Regex] 58 | ): Seq[(SerializableFileStatus, Seq[SerializableFileStatus])] = { 59 | 60 | assert(threads > 0, "Number of threads must be positive") 61 | 62 | val maybePathRoot = 63 | if (includePathRootInDependents) 64 | Some(SerializableFileStatus(fs.getFileStatus(path))) 65 | else None 66 | 67 | val processed = new java.util.concurrent.LinkedBlockingQueue[ 68 | (SerializableFileStatus, Seq[SerializableFileStatus]) 69 | ](maybePathRoot.map((_, Seq.empty)).toSeq.asJava) 70 | val toProcess = new java.util.concurrent.LinkedBlockingDeque[ 71 | (Path, Seq[SerializableFileStatus]) 72 | ](List((path, maybePathRoot.toSeq)).asJava) 73 | val exceptions = new java.util.concurrent.ConcurrentLinkedQueue[Exception]() 74 | val threadsWorking = new ConcurrentHashMap[UUID, Boolean]() 75 | 76 | class FileLister extends Runnable { 77 | 78 | private val localFS = FileSystem.get(fs.getUri, fs.getConf) 79 | 80 | private val uuid = UUID.randomUUID() 81 | threadsWorking.put(uuid, true) 82 | 83 | override def run(): Unit = { 84 | while (threadsWorking.containsValue(true)) { 85 | Try( 86 | Option(toProcess.pollFirst(50, TimeUnit.MILLISECONDS)) 87 | ).toOption.flatten match { 88 | case None => 89 | threadsWorking.put(uuid, false) 90 | case Some(p) => 91 | logDebug( 92 | s"Thread [$uuid] searching [${p._1}], waiting to process depth [${toProcess.size()}]" 93 | ) 94 | threadsWorking.put(uuid, true) 95 | try { 96 | localFS 97 | .listLocatedStatus(p._1) 98 | .foreach { 99 | case l if l.isSymlink => 100 | throw new RuntimeException(s"Link [$l] is not supported") 101 | case d if d.isDirectory => 102 | if ( 103 | !filterNot.exists( 104 | _.findFirstIn(d.getPath.toString).isDefined 105 | ) 106 | ) { 107 | val s = SerializableFileStatus(d) 108 | toProcess.addFirst((d.getPath, p._2 :+ s)) 109 | processed.add((s, p._2)) 110 | } 111 | case f => 112 | if ( 113 | !filterNot.exists( 114 | _.findFirstIn(f.getPath.toString).isDefined 115 | ) 116 | ) processed.add((SerializableFileStatus(f), p._2)) 117 | } 118 | } catch { 119 | case e: Exception => exceptions.add(e) 120 | } 121 | } 122 | } 123 | } 124 | } 125 | 126 | val pool = Executors.newFixedThreadPool(threads) 127 | 128 | logInfo(s"Beginning recursive list of [$path]") 129 | val tasks: Seq[Future[Unit]] = List 130 | .fill(threads)(new FileLister) 131 | .map(pool.submit) 132 | .map(j => 133 | Future { 134 | j.get() 135 | () 136 | }(scala.concurrent.ExecutionContext.global) 137 | ) 138 | 139 | import scala.concurrent.ExecutionContext.Implicits.global 140 | Await.result(Future.sequence(tasks), Duration.Inf) 141 | pool.shutdown() 142 | 143 | if (!toProcess.isEmpty) 144 | throw new RuntimeException( 145 | "Exception listing files, toProcess queue was not empty" 146 | ) 147 | 148 | if (!exceptions.isEmpty) { 149 | val collectedExceptions = exceptions.iterator().asScala.toList 150 | collectedExceptions 151 | .foreach { e => 152 | logError("Exception during file listing", e) 153 | } 154 | throw collectedExceptions.head 155 | } 156 | 
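    // At this point all lister threads are idle, the work queue is empty and no exceptions were
    // recorded; each accumulated element pairs a listed file/directory with the chain of parent
    // directories above it, which getSourceFiles later uses to build dependent folder copies.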
157 | logInfo(s"Finished recursive list of [$path]") 158 | 159 | processed.iterator().asScala.toSeq // Lazy streamify? 160 | 161 | } 162 | 163 | /** List all files in the given source URIs. This function will throw an 164 | * exception if any source files collide on identical destination locations 165 | * and any collisions on any cases where a source files is the same as the 166 | * destination file (copying between the same FileSystem) 167 | */ 168 | def getSourceFiles( 169 | sparkContext: SparkContext, 170 | sourceURIs: Seq[URI], 171 | destinationURI: URI, 172 | updateOverwritePathBehaviour: Boolean, 173 | numListstatusThreads: Int, 174 | filterNot: List[Regex] 175 | ): RDD[KeyedCopyDefinition] = { 176 | val sourceRDD = sourceURIs 177 | .map { sourceURI => 178 | val sourceFS = 179 | new Path(sourceURI).getFileSystem(sparkContext.hadoopConfiguration) 180 | sparkContext 181 | .parallelize( 182 | FileListUtils.listFiles( 183 | sourceFS, 184 | new Path(sourceURI), 185 | numListstatusThreads, 186 | !updateOverwritePathBehaviour, 187 | filterNot 188 | ) 189 | ) 190 | .map { case (f, d) => 191 | val dependentFolders = d.map { dl => 192 | val udl = PathUtils.sourceURIToDestinationURI( 193 | dl.uri, 194 | sourceURI, 195 | destinationURI, 196 | updateOverwritePathBehaviour 197 | ) 198 | SingleCopyDefinition(dl, udl) 199 | } 200 | val fu = PathUtils.sourceURIToDestinationURI( 201 | f.uri, 202 | sourceURI, 203 | destinationURI, 204 | updateOverwritePathBehaviour 205 | ) 206 | CopyDefinitionWithDependencies(f, fu, dependentFolders) 207 | } 208 | } 209 | .reduce(_ union _) 210 | .map(_.toKeyedDefinition) 211 | 212 | handleSourceCollisions(sourceRDD) 213 | 214 | handleDestCollisions(sourceRDD) 215 | 216 | sourceRDD 217 | } 218 | 219 | /** List all files at the destination path 220 | */ 221 | def getDestinationFiles( 222 | sparkContext: SparkContext, 223 | destinationPath: Path, 224 | options: SparkDistCPOptions 225 | ): RDD[(URI, SerializableFileStatus)] = { 226 | val destinationFS = 227 | destinationPath.getFileSystem(sparkContext.hadoopConfiguration) 228 | sparkContext 229 | .parallelize( 230 | FileListUtils.listFiles( 231 | destinationFS, 232 | destinationPath, 233 | options.numListstatusThreads, 234 | false, 235 | List.empty 236 | ) 237 | ) 238 | .map { case (f, _) => (f.getPath.toUri, f) } 239 | } 240 | 241 | /** Throw an exception if any source files collide on identical destination 242 | * locations 243 | */ 244 | def handleSourceCollisions(source: RDD[KeyedCopyDefinition]): Unit = { 245 | val collisions = source 246 | .groupByKey() 247 | .filter(_._2.size > 1) 248 | 249 | collisions 250 | .foreach { case (f, l) => 251 | logError( 252 | s"The following files will collide on destination file [$f]: ${l.map(_.source.getPath).mkString(", ")}" 253 | ) 254 | } 255 | 256 | if (!collisions.isEmpty()) 257 | throw new RuntimeException( 258 | "Collisions found where multiple source files lead to the same destination location; check executor logs for specific collision detail." 
259 | ) 260 | } 261 | 262 | /** Throw an exception for any collisions on any cases where a source files is 263 | * the same as the destination file (copying between the same FileSystem) 264 | */ 265 | def handleDestCollisions(source: RDD[KeyedCopyDefinition]): Unit = { 266 | 267 | val collisions = source 268 | .collect { 269 | case (_, CopyDefinitionWithDependencies(s, d, _)) if s.uri == d => d 270 | } 271 | 272 | collisions 273 | .foreach { d => 274 | logError( 275 | s"The following file has the same source and destination location: [$d]" 276 | ) 277 | } 278 | 279 | if (!collisions.isEmpty()) 280 | throw new RuntimeException( 281 | "Collisions found where a file has the same source and destination location; check executor logs for specific collision detail." 282 | ) 283 | } 284 | 285 | } 286 | -------------------------------------------------------------------------------- /src/main/scala/com/coxautodata/utils/FileUtils.scala: -------------------------------------------------------------------------------- 1 | package com.coxautodata.utils 2 | 3 | import java.math.{BigDecimal, BigInteger} 4 | 5 | // Adapted from: https://jira.apache.org/jira/secure/attachment/12542305/roundedByteCountToDisplaySize.patch 6 | object FileUtils { 7 | 8 | val ONE_KB = 1024 9 | val ONE_KB_BI: BigInteger = BigInteger.valueOf(ONE_KB) 10 | val ONE_MB: Long = ONE_KB * ONE_KB 11 | val ONE_MB_BI: BigInteger = ONE_KB_BI.multiply(ONE_KB_BI) 12 | val ONE_GB: Long = ONE_KB * ONE_MB 13 | val ONE_GB_BI: BigInteger = ONE_KB_BI.multiply(ONE_MB_BI) 14 | val ONE_TB: Long = ONE_KB * ONE_GB 15 | val ONE_TB_BI: BigInteger = ONE_KB_BI.multiply(ONE_GB_BI) 16 | val ONE_PB: Long = ONE_KB * ONE_TB 17 | val ONE_PB_BI: BigInteger = ONE_KB_BI.multiply(ONE_TB_BI) 18 | val ONE_EB: Long = ONE_KB * ONE_PB 19 | val ONE_EB_BI: BigInteger = ONE_KB_BI.multiply(ONE_PB_BI) 20 | val ONE_ZB: BigInteger = 21 | BigInteger.valueOf(ONE_KB).multiply(BigInteger.valueOf(ONE_EB)) 22 | val ONE_YB: BigInteger = ONE_KB_BI.multiply(ONE_ZB) 23 | 24 | def byteCountToDisplaySize(size: BigInteger): String = { 25 | val sizeBD = new BigDecimal(size) 26 | if (size.divide(ONE_YB).compareTo(BigInteger.ZERO) > 0) 27 | getThreeSigFigs(sizeBD.divide(new BigDecimal(ONE_YB))) + s" YB (${String.valueOf(size)} bytes)" 28 | else if (size.divide(ONE_ZB).compareTo(BigInteger.ZERO) > 0) 29 | getThreeSigFigs(sizeBD.divide(new BigDecimal(ONE_ZB))) + s" ZB (${String.valueOf(size)} bytes)" 30 | else if (size.divide(ONE_EB_BI).compareTo(BigInteger.ZERO) > 0) 31 | getThreeSigFigs( 32 | sizeBD.divide(new BigDecimal(ONE_EB_BI)) 33 | ) + s" EB (${String.valueOf(size)} bytes)" 34 | else if (size.divide(ONE_PB_BI).compareTo(BigInteger.ZERO) > 0) 35 | getThreeSigFigs( 36 | sizeBD.divide(new BigDecimal(ONE_PB_BI)) 37 | ) + s" PB (${String.valueOf(size)} bytes)" 38 | else if (size.divide(ONE_TB_BI).compareTo(BigInteger.ZERO) > 0) 39 | getThreeSigFigs( 40 | sizeBD.divide(new BigDecimal(ONE_TB_BI)) 41 | ) + s" TB (${String.valueOf(size)} bytes)" 42 | else if (size.divide(ONE_GB_BI).compareTo(BigInteger.ZERO) > 0) 43 | getThreeSigFigs( 44 | sizeBD.divide(new BigDecimal(ONE_GB_BI)) 45 | ) + s" GB (${String.valueOf(size)} bytes)" 46 | else if (size.divide(ONE_MB_BI).compareTo(BigInteger.ZERO) > 0) 47 | getThreeSigFigs( 48 | sizeBD.divide(new BigDecimal(ONE_MB_BI)) 49 | ) + s" MB (${String.valueOf(size)} bytes)" 50 | else if (size.divide(ONE_KB_BI).compareTo(BigInteger.ZERO) > 0) 51 | getThreeSigFigs( 52 | sizeBD.divide(new BigDecimal(ONE_KB_BI)) 53 | ) + s" KB (${String.valueOf(size)} bytes)" 54 | 
else String.valueOf(size) + " bytes" 55 | } 56 | 57 | def byteCountToDisplaySize(size: Long): String = byteCountToDisplaySize( 58 | BigInteger.valueOf(size) 59 | ) 60 | 61 | private def getThreeSigFigs(size: BigDecimal): String = { 62 | val (isDecimal, _, sizeS) = size.toString.foldLeft((false, 0, "")) { 63 | case ((decimal, count, agg), c) => 64 | if (c == '.' && !decimal) (true, count, agg + c) 65 | else if (count < 3 || !decimal) (decimal, count + 1, agg + c) 66 | else (decimal, count + 1, agg) 67 | } 68 | 69 | if (isDecimal) 70 | sizeS.reverse.dropWhile(c => c == '0').reverse.stripSuffix(".") 71 | else sizeS 72 | 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /src/main/scala/com/coxautodata/utils/PathUtils.scala: -------------------------------------------------------------------------------- 1 | package com.coxautodata.utils 2 | 3 | import java.net.URI 4 | 5 | import org.apache.hadoop.conf.Configuration 6 | import org.apache.hadoop.fs.{FileSystem, Path} 7 | 8 | object PathUtils { 9 | 10 | /** Qualify a path, making it both absolute and qualified with a scheme. 11 | * If the input path is not absolute, the default working directory is used. 12 | * If the input path does not have a scheme, the default URI used in the 13 | * Hadoop Configuration is used. 14 | */ 15 | def pathToQualifiedPath( 16 | hadoopConfiguration: Configuration, 17 | path: Path 18 | ): Path = { 19 | val fs = FileSystem.get(hadoopConfiguration) 20 | path.makeQualified(fs.getUri, fs.getWorkingDirectory) 21 | } 22 | 23 | /** Transform a source input path URI into a destination path URI. This 24 | * function determines how a source file path is mapped to the destination. 25 | * The behaviour is different depending on whether update or overwrite is used. 26 | * This follows the behaviour of Hadoop DistCP. See the Hadoop DistCP 27 | * documentation for more explanation of this behaviour. 28 | * 29 | * @param file 30 | * URI of source file 31 | * @param sourceURI 32 | * URI of root copy folder on source FileSystem 33 | * @param destinationURI 34 | * URI of root copy folder on destination FileSystem 35 | * @param updateOverwritePathBehaviour 36 | * Whether to use the overwrite/update path behaviour 37 | * @return 38 | * Source file path URI mapped to the destination FileSystem 39 | */ 40 | def sourceURIToDestinationURI( 41 | file: URI, 42 | sourceURI: URI, 43 | destinationURI: URI, 44 | updateOverwritePathBehaviour: Boolean 45 | ): URI = { 46 | val sourceFolderURI: URI = { 47 | if (updateOverwritePathBehaviour) sourceURI 48 | else 49 | Option(new Path(sourceURI).getParent).map(_.toUri).getOrElse(sourceURI) 50 | } 51 | val relativeFile = sourceFolderURI.relativize(file).getPath 52 | new Path(new Path(destinationURI), relativeFile).toUri 53 | } 54 | 55 | /** Check whether one URI is the parent of another URI 56 | */ 57 | def uriIsChild(parent: URI, child: URI): Boolean = { 58 | if (!parent.isAbsolute || !child.isAbsolute) 59 | throw new RuntimeException( 60 | s"URIs [$parent] and [$child] must have a scheme component." 61 | ) 62 | else if (!parent.getPath.startsWith("/") || !child.getPath.startsWith("/")) 63 | throw new RuntimeException( 64 | s"URIs [$parent] and [$child] must have an absolute path component."
65 | ) 66 | else parent.relativize(child) != child 67 | } 68 | 69 | } 70 | -------------------------------------------------------------------------------- /src/test/resources/com/coxautodata/test.filters: -------------------------------------------------------------------------------- 1 | .*/_temporary($|/.*) 2 | .*/_committed.* 3 | .*/_started.* 4 | .*/_SUCCESS.* -------------------------------------------------------------------------------- /src/test/scala/com/coxautodata/TestOptionsParsing.scala: -------------------------------------------------------------------------------- 1 | package com.coxautodata 2 | 3 | import java.net.URI 4 | 5 | import org.apache.hadoop.conf.Configuration 6 | import org.apache.hadoop.fs.Path 7 | import org.scalatest.funspec.AnyFunSpec 8 | import org.scalatest.matchers.should.Matchers 9 | 10 | class TestOptionsParsing extends AnyFunSpec with Matchers { 11 | 12 | describe("Successful parsing") { 13 | 14 | it("default options one source") { 15 | 16 | val conf = OptionsParsing.parse(Array("src", "dest")) 17 | conf.sourceAndDestPaths should be(Seq(new Path("src")), new Path("dest")) 18 | conf.options should be(SparkDistCPOptions()) 19 | 20 | } 21 | 22 | it("default options two sources") { 23 | 24 | val conf = 25 | OptionsParsing.parse(Array("src1", "src2", "dest")) 26 | conf.sourceAndDestPaths should be( 27 | Seq(new Path("src1"), new Path("src2")), 28 | new Path("dest") 29 | ) 30 | val options = conf.options.withFiltersFromFile(new Configuration()) 31 | options should be(SparkDistCPOptions()) 32 | 33 | } 34 | 35 | it("ignore failures flag") { 36 | 37 | val conf = 38 | OptionsParsing.parse(Array("--i", "src", "dest")) 39 | conf.sourceAndDestPaths should be(Seq(new Path("src")), new Path("dest")) 40 | val options = conf.options.withFiltersFromFile(new Configuration()) 41 | options should be(SparkDistCPOptions(ignoreErrors = true)) 42 | 43 | } 44 | 45 | it("log option") { 46 | 47 | val conf = OptionsParsing.parse( 48 | Array("--log", "log", "src", "dest") 49 | ) 50 | conf.sourceAndDestPaths should be(Seq(new Path("src")), new Path("dest")) 51 | conf.options should be(SparkDistCPOptions(log = Some(new URI("log")))) 52 | 53 | } 54 | 55 | it("dry-run flag") { 56 | 57 | val conf = OptionsParsing.parse( 58 | Array("--dryrun", "src", "dest") 59 | ) 60 | conf.sourceAndDestPaths should be(Seq(new Path("src")), new Path("dest")) 61 | conf.options should be(SparkDistCPOptions(dryRun = true)) 62 | 63 | } 64 | 65 | it("verbose flag") { 66 | 67 | val conf = OptionsParsing.parse( 68 | Array("--verbose", "src", "dest") 69 | ) 70 | conf.sourceAndDestPaths should be(Seq(new Path("src")), new Path("dest")) 71 | conf.options should be(SparkDistCPOptions(verbose = true)) 72 | 73 | } 74 | 75 | it("overwrite flag") { 76 | 77 | val conf = OptionsParsing.parse( 78 | Array("--overwrite", "src", "dest") 79 | ) 80 | conf.sourceAndDestPaths should be(Seq(new Path("src")), new Path("dest")) 81 | conf.options should be(SparkDistCPOptions(overwrite = true)) 82 | 83 | } 84 | 85 | it("update flag") { 86 | 87 | val conf = OptionsParsing.parse( 88 | Array("--update", "src", "dest") 89 | ) 90 | conf.sourceAndDestPaths should be(Seq(new Path("src")), new Path("dest")) 91 | conf.options should be(SparkDistCPOptions(update = true)) 92 | 93 | } 94 | 95 | it("filters flag") { 96 | 97 | val filtersFile = this.getClass.getResource("test.filters").getPath 98 | val conf = OptionsParsing.parse( 99 | Array("--filters", filtersFile, "src", "dest") 100 | ) 101 | conf.sourceAndDestPaths should be(Seq(new 
Path("src")), new Path("dest")) 102 | val options = conf.options.withFiltersFromFile(new Configuration()) 103 | options.filterNot.map(_.toString()) should be( 104 | List( 105 | ".*/_temporary($|/.*)", 106 | ".*/_committed.*", 107 | ".*/_started.*", 108 | ".*/_SUCCESS.*" 109 | ) 110 | ) 111 | val resetOptions = options.copy(filters = None).withFiltersFromFile(new Configuration()) 112 | resetOptions should be(SparkDistCPOptions()) 113 | 114 | } 115 | 116 | it("delete flag") { 117 | 118 | val conf = OptionsParsing.parse( 119 | Array("--delete", "--update", "src", "dest") 120 | ) 121 | conf.sourceAndDestPaths should be(Seq(new Path("src")), new Path("dest")) 122 | conf.options should be(SparkDistCPOptions(delete = true, update = true)) 123 | 124 | } 125 | 126 | it("numListstatusThreads option") { 127 | 128 | val conf = OptionsParsing.parse( 129 | Array("--numListstatusThreads", "3", "src", "dest") 130 | ) 131 | conf.sourceAndDestPaths should be(Seq(new Path("src")), new Path("dest")) 132 | conf.options should be(SparkDistCPOptions(numListstatusThreads = 3)) 133 | 134 | } 135 | 136 | it("consistentPathBehaviour option") { 137 | 138 | val conf = OptionsParsing.parse( 139 | Array("--consistentPathBehaviour", "src", "dest") 140 | ) 141 | conf.sourceAndDestPaths should be(Seq(new Path("src")), new Path("dest")) 142 | conf.options should be(SparkDistCPOptions(consistentPathBehaviour = true)) 143 | 144 | } 145 | 146 | it("maxFilesPerTask option") { 147 | 148 | val conf = OptionsParsing.parse( 149 | Array("--maxFilesPerTask", "3", "src", "dest") 150 | ) 151 | conf.sourceAndDestPaths should be(Seq(new Path("src")), new Path("dest")) 152 | conf.options should be(SparkDistCPOptions(maxFilesPerTask = 3)) 153 | 154 | } 155 | 156 | it("maxBytesPerTask option") { 157 | 158 | val conf = OptionsParsing.parse( 159 | Array("--maxBytesPerTask", "30000000", "src", "dest") 160 | ) 161 | conf.sourceAndDestPaths should be(Seq(new Path("src")), new Path("dest")) 162 | conf.options should be(SparkDistCPOptions(maxBytesPerTask = 30000000)) 163 | 164 | } 165 | 166 | } 167 | 168 | describe("Failure parsing") { 169 | 170 | it("single path") { 171 | 172 | intercept[IllegalArgumentException] { 173 | OptionsParsing.parse(Array("path")) 174 | }.getMessage should be( 175 | "requirement failed: you must supply two or more paths, representing the source paths and a destination" 176 | ) 177 | 178 | } 179 | 180 | it("missing filters file") { 181 | 182 | intercept[RuntimeException] { 183 | OptionsParsing.parse( 184 | Array("--filters", "none", "src", "dest") 185 | ).options.withFiltersFromFile(new Configuration()) 186 | }.getMessage should be("Invalid filter file none") 187 | 188 | } 189 | 190 | it("negative max files") { 191 | 192 | intercept[java.lang.AssertionError] { 193 | OptionsParsing.parse( 194 | Array("--maxFilesPerTask", "-2", "src", "dest") 195 | ) 196 | }.getMessage should be( 197 | "assertion failed: maxFilesPerTask must be positive" 198 | ) 199 | 200 | } 201 | 202 | it("negative max bytes") { 203 | 204 | intercept[java.lang.AssertionError] { 205 | OptionsParsing.parse( 206 | Array("--maxBytesPerTask", "-2", "src", "dest") 207 | ) 208 | }.getMessage should be( 209 | "assertion failed: maxBytesPerTask must be positive" 210 | ) 211 | 212 | } 213 | 214 | it("negative num list status threads") { 215 | 216 | intercept[java.lang.AssertionError] { 217 | OptionsParsing.parse( 218 | Array("--numListstatusThreads", "-2", "src", "dest") 219 | ) 220 | }.getMessage should be( 221 | "assertion failed: numListstatusThreads must be 
positive" 222 | ) 223 | 224 | } 225 | 226 | it("both update and overwrite specified") { 227 | 228 | intercept[java.lang.AssertionError] { 229 | OptionsParsing.parse( 230 | Array("--update", "--overwrite", "src", "dest") 231 | ) 232 | }.getMessage should be( 233 | "assertion failed: Both update and overwrite cannot be specified" 234 | ) 235 | 236 | } 237 | 238 | it("delete specified without update or overwrite") { 239 | 240 | intercept[java.lang.AssertionError] { 241 | OptionsParsing.parse( 242 | Array("--delete", "src", "dest") 243 | ) 244 | }.getMessage should be( 245 | "assertion failed: Delete must be specified with either overwrite or update" 246 | ) 247 | 248 | } 249 | 250 | } 251 | 252 | } 253 | -------------------------------------------------------------------------------- /src/test/scala/com/coxautodata/TestSparkDistCP.scala: -------------------------------------------------------------------------------- 1 | package com.coxautodata 2 | 3 | import com.coxautodata.SparkDistCP._ 4 | import com.coxautodata.objects.{ 5 | CopyDefinitionWithDependencies, 6 | CopyPartitioner, 7 | Directory, 8 | File, 9 | SerializableFileStatus 10 | } 11 | import com.coxautodata.utils.FileListUtils.listFiles 12 | import com.coxautodata.utils.FileListing 13 | import org.apache.hadoop.fs.Path 14 | import org.apache.spark.sql.SparkSession 15 | import org.apache.spark.{SparkConf, SparkContext} 16 | 17 | class TestSparkDistCP extends TestSpec { 18 | 19 | describe("generateBatchedFileKeys") { 20 | 21 | it("batch files correctly") { 22 | 23 | val in = List( 24 | CopyDefinitionWithDependencies( 25 | SerializableFileStatus(new Path("/one").toUri, 0, Directory), 26 | new Path("/dest/one").toUri, 27 | Seq.empty 28 | ), 29 | CopyDefinitionWithDependencies( 30 | SerializableFileStatus(new Path("/two").toUri, 0, Directory), 31 | new Path("/dest/two").toUri, 32 | Seq.empty 33 | ), 34 | CopyDefinitionWithDependencies( 35 | SerializableFileStatus(new Path("/three").toUri, 0, Directory), 36 | new Path("/dest/three").toUri, 37 | Seq.empty 38 | ), 39 | CopyDefinitionWithDependencies( 40 | SerializableFileStatus(new Path("/file1").toUri, 1500, File), 41 | new Path("/dest/file1").toUri, 42 | Seq.empty 43 | ), 44 | CopyDefinitionWithDependencies( 45 | SerializableFileStatus(new Path("/file2").toUri, 20, File), 46 | new Path("/dest/file2").toUri, 47 | Seq.empty 48 | ), 49 | CopyDefinitionWithDependencies( 50 | SerializableFileStatus(new Path("/file3").toUri, 500, File), 51 | new Path("/dest/file3").toUri, 52 | Seq.empty 53 | ) 54 | ) 55 | 56 | generateBatchedFileKeys(3, 2000)(1, in.iterator).toSeq 57 | .map { case (k, c) => 58 | (k, c.source.getPath.toString) 59 | } should contain theSameElementsInOrderAs Seq( 60 | ((1, 0), "/one"), 61 | ((1, 0), "/two"), 62 | ((1, 0), "/three"), 63 | ((1, 1), "/file1"), 64 | ((1, 1), "/file2"), 65 | ((1, 2), "/file3") 66 | ) 67 | 68 | } 69 | 70 | } 71 | 72 | describe("batchAndPartitionFiles") { 73 | 74 | it("correctly partition files with 2 input partitions") { 75 | val spark = new SparkContext( 76 | new SparkConf().setAppName("test").setMaster("local[1]") 77 | ) 78 | 79 | val in = List( 80 | CopyDefinitionWithDependencies( 81 | SerializableFileStatus(new Path("/one").toUri, 0, Directory), 82 | new Path("/dest/one").toUri, 83 | Seq.empty 84 | ), 85 | CopyDefinitionWithDependencies( 86 | SerializableFileStatus(new Path("/two").toUri, 0, Directory), 87 | new Path("/dest/two").toUri, 88 | Seq.empty 89 | ), 90 | CopyDefinitionWithDependencies( 91 | SerializableFileStatus(new Path("/three").toUri, 
0, Directory), 92 | new Path("/dest/three").toUri, 93 | Seq.empty 94 | ), 95 | CopyDefinitionWithDependencies( 96 | SerializableFileStatus(new Path("/file1").toUri, 1500, File), 97 | new Path("/dest/file1").toUri, 98 | Seq.empty 99 | ), 100 | CopyDefinitionWithDependencies( 101 | SerializableFileStatus(new Path("/file2").toUri, 20, File), 102 | new Path("/dest/file2").toUri, 103 | Seq.empty 104 | ), 105 | CopyDefinitionWithDependencies( 106 | SerializableFileStatus(new Path("/file3").toUri, 500, File), 107 | new Path("/dest/file3").toUri, 108 | Seq.empty 109 | ) 110 | ) 111 | 112 | val inRDD = spark 113 | .parallelize(in) 114 | .repartition(2) 115 | 116 | batchAndPartitionFiles(inRDD, 2, 2000) 117 | .mapPartitionsWithIndex { case (p, v) => 118 | List((p, v.toList)).iterator 119 | } 120 | .collect() 121 | .toSeq 122 | .flatMap { case (p, l) => 123 | l.map { case ((pp, i), d) => 124 | ((p, pp, i), d.source.getPath.toString) 125 | } 126 | } should contain theSameElementsAs Seq( 127 | ((0, 0, 0), "/file1"), 128 | ((0, 0, 0), "/file3"), 129 | ((1, 1, 0), "/file2"), 130 | ((1, 1, 0), "/one"), 131 | ((2, 1, 1), "/three"), 132 | ((2, 1, 1), "/two") 133 | ) 134 | 135 | spark.stop() 136 | } 137 | 138 | it("correctly partition files with 1 input partition") { 139 | val spark = new SparkContext( 140 | new SparkConf().setAppName("test").setMaster("local[1]") 141 | ) 142 | 143 | val in = List( 144 | CopyDefinitionWithDependencies( 145 | SerializableFileStatus(new Path("/one").toUri, 0, Directory), 146 | new Path("/dest/one").toUri, 147 | Seq.empty 148 | ), 149 | CopyDefinitionWithDependencies( 150 | SerializableFileStatus(new Path("/two").toUri, 0, Directory), 151 | new Path("/dest/two").toUri, 152 | Seq.empty 153 | ), 154 | CopyDefinitionWithDependencies( 155 | SerializableFileStatus(new Path("/three").toUri, 0, Directory), 156 | new Path("/dest/three").toUri, 157 | Seq.empty 158 | ), 159 | CopyDefinitionWithDependencies( 160 | SerializableFileStatus(new Path("/file1").toUri, 1500, File), 161 | new Path("/dest/file1").toUri, 162 | Seq.empty 163 | ), 164 | CopyDefinitionWithDependencies( 165 | SerializableFileStatus(new Path("/file2").toUri, 20, File), 166 | new Path("/dest/file2").toUri, 167 | Seq.empty 168 | ), 169 | CopyDefinitionWithDependencies( 170 | SerializableFileStatus(new Path("/file3").toUri, 1990, File), 171 | new Path("/dest/file3").toUri, 172 | Seq.empty 173 | ) 174 | ) 175 | 176 | val inRDD = spark 177 | .parallelize(in) 178 | .repartition(1) 179 | 180 | batchAndPartitionFiles(inRDD, 2, 2000) 181 | .mapPartitionsWithIndex { case (p, v) => 182 | List((p, v.toList)).iterator 183 | } 184 | .collect() 185 | .toSeq 186 | .flatMap { case (p, l) => 187 | l.map { case ((pp, i), d) => 188 | ((p, pp, i), d.source.getPath.toString) 189 | } 190 | } should contain theSameElementsAs Seq( 191 | ((0, 0, 0), "/file1"), 192 | ((0, 0, 0), "/file2"), 193 | ((1, 0, 1), "/file3"), 194 | ((1, 0, 1), "/one"), 195 | ((2, 0, 2), "/three"), 196 | ((2, 0, 2), "/two") 197 | ) 198 | 199 | spark.stop() 200 | } 201 | 202 | it("produce predictable batching") { 203 | val spark = new SparkContext( 204 | new SparkConf().setAppName("test").setMaster("local[1]") 205 | ) 206 | 207 | val in = List( 208 | CopyDefinitionWithDependencies( 209 | SerializableFileStatus(new Path("/1").toUri, 1, File), 210 | new Path("/dest/file1").toUri, 211 | Seq.empty 212 | ), 213 | CopyDefinitionWithDependencies( 214 | SerializableFileStatus(new Path("/3").toUri, 3000, File), 215 | new Path("/dest/file3").toUri, 216 | Seq.empty 217 | ), 218 | 
CopyDefinitionWithDependencies( 219 | SerializableFileStatus(new Path("/2").toUri, 1, File), 220 | new Path("/dest/file2").toUri, 221 | Seq.empty 222 | ) 223 | ) 224 | 225 | val inRDD = spark 226 | .parallelize(in) 227 | .repartition(1) 228 | 229 | val unsorted = batchAndPartitionFiles(inRDD, 3, 2000).partitioner.get 230 | .asInstanceOf[CopyPartitioner] 231 | 232 | val sorted = batchAndPartitionFiles( 233 | inRDD.sortBy(_.source.uri.toString), 234 | 3, 235 | 2000 236 | ).partitioner.get.asInstanceOf[CopyPartitioner] 237 | 238 | unsorted.indexesAsMap should be(sorted.indexesAsMap) 239 | 240 | spark.stop() 241 | } 242 | 243 | } 244 | 245 | describe("run") { 246 | 247 | it("perform distcp with non-update/non-overwrite") { 248 | 249 | val spark = SparkSession.builder().master("local[*]").getOrCreate() 250 | 251 | val input = List( 252 | "src/1.file", 253 | "src/2.file", 254 | "src/3.file", 255 | "src/sub1/1.file", 256 | "src/sub1/2.file", 257 | "src/sub1/3.file", 258 | "src/sub2/1.file", 259 | "src/sub2/2.file", 260 | "src/sub2/3.file", 261 | "src/sub2/subsub1/1.file" 262 | ) 263 | 264 | val sourceOnlyResult = Seq( 265 | FileListing(new Path(testingBaseDirPath, "src").toString, None), 266 | FileListing(new Path(testingBaseDirPath, "dest").toString, None), 267 | FileListing( 268 | new Path(testingBaseDirPath, "src/1.file").toString, 269 | Some(10) 270 | ), 271 | FileListing( 272 | new Path(testingBaseDirPath, "src/2.file").toString, 273 | Some(10) 274 | ), 275 | FileListing( 276 | new Path(testingBaseDirPath, "src/3.file").toString, 277 | Some(10) 278 | ), 279 | FileListing(new Path(testingBaseDirPath, "src/sub1").toString, None), 280 | FileListing(new Path(testingBaseDirPath, "src/sub2").toString, None), 281 | FileListing( 282 | new Path(testingBaseDirPath, "src/sub2/subsub1").toString, 283 | None 284 | ), 285 | FileListing( 286 | new Path(testingBaseDirPath, "src/sub1/1.file").toString, 287 | Some(15) 288 | ), 289 | FileListing( 290 | new Path(testingBaseDirPath, "src/sub1/2.file").toString, 291 | Some(15) 292 | ), 293 | FileListing( 294 | new Path(testingBaseDirPath, "src/sub1/3.file").toString, 295 | Some(15) 296 | ), 297 | FileListing( 298 | new Path(testingBaseDirPath, "src/sub2/1.file").toString, 299 | Some(15) 300 | ), 301 | FileListing( 302 | new Path(testingBaseDirPath, "src/sub2/2.file").toString, 303 | Some(15) 304 | ), 305 | FileListing( 306 | new Path(testingBaseDirPath, "src/sub2/3.file").toString, 307 | Some(15) 308 | ), 309 | FileListing( 310 | new Path(testingBaseDirPath, "src/sub2/subsub1/1.file").toString, 311 | Some(23) 312 | ) 313 | ) 314 | 315 | val destOnlyResult = Seq( 316 | FileListing(new Path(testingBaseDirPath, "dest/src").toString, None), 317 | FileListing( 318 | new Path(testingBaseDirPath, "dest/src/1.file").toString, 319 | Some(10) 320 | ), 321 | FileListing( 322 | new Path(testingBaseDirPath, "dest/src/2.file").toString, 323 | Some(10) 324 | ), 325 | FileListing( 326 | new Path(testingBaseDirPath, "dest/src/3.file").toString, 327 | Some(10) 328 | ), 329 | FileListing( 330 | new Path(testingBaseDirPath, "dest/src/sub1").toString, 331 | None 332 | ), 333 | FileListing( 334 | new Path(testingBaseDirPath, "dest/src/sub2").toString, 335 | None 336 | ), 337 | FileListing( 338 | new Path(testingBaseDirPath, "dest/src/sub2/subsub1").toString, 339 | None 340 | ), 341 | FileListing( 342 | new Path(testingBaseDirPath, "dest/src/sub1/1.file").toString, 343 | Some(15) 344 | ), 345 | FileListing( 346 | new Path(testingBaseDirPath, "dest/src/sub1/2.file").toString, 347 | 
Some(15) 348 | ), 349 | FileListing( 350 | new Path(testingBaseDirPath, "dest/src/sub1/3.file").toString, 351 | Some(15) 352 | ), 353 | FileListing( 354 | new Path(testingBaseDirPath, "dest/src/sub2/1.file").toString, 355 | Some(15) 356 | ), 357 | FileListing( 358 | new Path(testingBaseDirPath, "dest/src/sub2/2.file").toString, 359 | Some(15) 360 | ), 361 | FileListing( 362 | new Path(testingBaseDirPath, "dest/src/sub2/3.file").toString, 363 | Some(15) 364 | ), 365 | FileListing( 366 | new Path(testingBaseDirPath, "dest/src/sub2/subsub1/1.file").toString, 367 | Some(23) 368 | ) 369 | ) 370 | 371 | input.foreach(f => createFile(new Path(f), f.getBytes)) 372 | localFileSystem.mkdirs(new Path(testingBaseDirPath, "dest")) 373 | 374 | listFiles(localFileSystem, testingBaseDirPath, 10, false, List.empty) 375 | .map(f => 376 | fileStatusToResult(f._1) 377 | ) should contain theSameElementsAs sourceOnlyResult 378 | 379 | SparkDistCP.run( 380 | spark, 381 | Seq(new Path(testingBaseDirPath, "src")), 382 | new Path(testingBaseDirPath, "dest"), 383 | SparkDistCPOptions(dryRun = true) 384 | ) 385 | 386 | listFiles(localFileSystem, testingBaseDirPath, 10, false, List.empty) 387 | .map(f => 388 | fileStatusToResult(f._1) 389 | ) should contain theSameElementsAs sourceOnlyResult 390 | 391 | SparkDistCP.run( 392 | spark, 393 | Seq(new Path(testingBaseDirPath, "src")), 394 | new Path(testingBaseDirPath, "dest"), 395 | SparkDistCPOptions() 396 | ) 397 | 398 | listFiles(localFileSystem, testingBaseDirPath, 10, false, List.empty) 399 | .map(f => 400 | fileStatusToResult(f._1) 401 | ) should contain theSameElementsAs (sourceOnlyResult ++ destOnlyResult) 402 | 403 | spark.stop() 404 | 405 | } 406 | 407 | it("perform distcp with filter") { 408 | 409 | val spark = SparkSession.builder().master("local[*]").getOrCreate() 410 | 411 | val input = List( 412 | "src/1.file", 413 | "src/2.file", 414 | "src/3.file", 415 | "src/sub1/1.file", 416 | "src/sub1/2.file", 417 | "src/sub1/3.file", 418 | "src/sub2/1.file", 419 | "src/sub2/2.file", 420 | "src/sub2/3.file", 421 | "src/sub2/subsub1/1.file" 422 | ) 423 | 424 | val sourceOnlyResult = Seq( 425 | FileListing(new Path(testingBaseDirPath, "src").toString, None), 426 | FileListing(new Path(testingBaseDirPath, "dest").toString, None), 427 | FileListing( 428 | new Path(testingBaseDirPath, "src/1.file").toString, 429 | Some(10) 430 | ), 431 | FileListing( 432 | new Path(testingBaseDirPath, "src/2.file").toString, 433 | Some(10) 434 | ), 435 | FileListing( 436 | new Path(testingBaseDirPath, "src/3.file").toString, 437 | Some(10) 438 | ), 439 | FileListing(new Path(testingBaseDirPath, "src/sub1").toString, None), 440 | FileListing(new Path(testingBaseDirPath, "src/sub2").toString, None), 441 | FileListing( 442 | new Path(testingBaseDirPath, "src/sub2/subsub1").toString, 443 | None 444 | ), 445 | FileListing( 446 | new Path(testingBaseDirPath, "src/sub1/1.file").toString, 447 | Some(15) 448 | ), 449 | FileListing( 450 | new Path(testingBaseDirPath, "src/sub1/2.file").toString, 451 | Some(15) 452 | ), 453 | FileListing( 454 | new Path(testingBaseDirPath, "src/sub1/3.file").toString, 455 | Some(15) 456 | ), 457 | FileListing( 458 | new Path(testingBaseDirPath, "src/sub2/1.file").toString, 459 | Some(15) 460 | ), 461 | FileListing( 462 | new Path(testingBaseDirPath, "src/sub2/2.file").toString, 463 | Some(15) 464 | ), 465 | FileListing( 466 | new Path(testingBaseDirPath, "src/sub2/3.file").toString, 467 | Some(15) 468 | ), 469 | FileListing( 470 | new Path(testingBaseDirPath, 
"src/sub2/subsub1/1.file").toString, 471 | Some(23) 472 | ) 473 | ) 474 | 475 | val destOnlyResult = Seq( 476 | FileListing(new Path(testingBaseDirPath, "dest/src").toString, None), 477 | FileListing( 478 | new Path(testingBaseDirPath, "dest/src/2.file").toString, 479 | Some(10) 480 | ), 481 | FileListing( 482 | new Path(testingBaseDirPath, "dest/src/3.file").toString, 483 | Some(10) 484 | ), 485 | FileListing( 486 | new Path(testingBaseDirPath, "dest/src/sub1").toString, 487 | None 488 | ), 489 | FileListing( 490 | new Path(testingBaseDirPath, "dest/src/sub2").toString, 491 | None 492 | ), 493 | FileListing( 494 | new Path(testingBaseDirPath, "dest/src/sub2/subsub1").toString, 495 | None 496 | ), 497 | FileListing( 498 | new Path(testingBaseDirPath, "dest/src/sub1/2.file").toString, 499 | Some(15) 500 | ), 501 | FileListing( 502 | new Path(testingBaseDirPath, "dest/src/sub1/3.file").toString, 503 | Some(15) 504 | ), 505 | FileListing( 506 | new Path(testingBaseDirPath, "dest/src/sub2/2.file").toString, 507 | Some(15) 508 | ), 509 | FileListing( 510 | new Path(testingBaseDirPath, "dest/src/sub2/3.file").toString, 511 | Some(15) 512 | ) 513 | ) 514 | 515 | input.foreach(f => createFile(new Path(f), f.getBytes)) 516 | localFileSystem.mkdirs(new Path(testingBaseDirPath, "dest")) 517 | 518 | listFiles(localFileSystem, testingBaseDirPath, 10, false, List.empty) 519 | .map(f => 520 | fileStatusToResult(f._1) 521 | ) should contain theSameElementsAs sourceOnlyResult 522 | 523 | SparkDistCP.run( 524 | spark, 525 | Seq(new Path(testingBaseDirPath, "src")), 526 | new Path(testingBaseDirPath, "dest"), 527 | SparkDistCPOptions(dryRun = true, filterNot = List(""".*/1\.file$""".r)) 528 | ) 529 | 530 | listFiles(localFileSystem, testingBaseDirPath, 10, false, List.empty) 531 | .map(f => 532 | fileStatusToResult(f._1) 533 | ) should contain theSameElementsAs sourceOnlyResult 534 | 535 | SparkDistCP.run( 536 | spark, 537 | Seq(new Path(testingBaseDirPath, "src")), 538 | new Path(testingBaseDirPath, "dest"), 539 | SparkDistCPOptions(filterNot = List(""".*/1\.file$""".r)) 540 | ) 541 | 542 | listFiles(localFileSystem, testingBaseDirPath, 10, false, List.empty) 543 | .map(f => 544 | fileStatusToResult(f._1) 545 | ) should contain theSameElementsAs (sourceOnlyResult ++ destOnlyResult) 546 | 547 | spark.stop() 548 | 549 | } 550 | 551 | it("perform distcp with update") { 552 | 553 | val spark = SparkSession.builder().master("local[*]").getOrCreate() 554 | 555 | val input = List( 556 | "src/1.file", 557 | "src/2.file", 558 | "src/3.file", 559 | "src/sub1/1.file", 560 | "src/sub1/2.file", 561 | "src/sub1/3.file", 562 | "src/sub2/1.file", 563 | "src/sub2/2.file", 564 | "src/sub2/3.file", 565 | "src/sub2/subsub1/1.file" 566 | ) 567 | 568 | val sourceOnlyResult = Seq( 569 | FileListing(new Path(testingBaseDirPath, "src").toString, None), 570 | FileListing(new Path(testingBaseDirPath, "dest").toString, None), 571 | FileListing( 572 | new Path(testingBaseDirPath, "src/1.file").toString, 573 | Some(10) 574 | ), 575 | FileListing( 576 | new Path(testingBaseDirPath, "src/2.file").toString, 577 | Some(10) 578 | ), 579 | FileListing( 580 | new Path(testingBaseDirPath, "src/3.file").toString, 581 | Some(10) 582 | ), 583 | FileListing(new Path(testingBaseDirPath, "src/sub1").toString, None), 584 | FileListing(new Path(testingBaseDirPath, "src/sub2").toString, None), 585 | FileListing( 586 | new Path(testingBaseDirPath, "src/sub2/subsub1").toString, 587 | None 588 | ), 589 | FileListing( 590 | new Path(testingBaseDirPath, 
"src/sub1/1.file").toString, 591 | Some(15) 592 | ), 593 | FileListing( 594 | new Path(testingBaseDirPath, "src/sub1/2.file").toString, 595 | Some(15) 596 | ), 597 | FileListing( 598 | new Path(testingBaseDirPath, "src/sub1/3.file").toString, 599 | Some(15) 600 | ), 601 | FileListing( 602 | new Path(testingBaseDirPath, "src/sub2/1.file").toString, 603 | Some(15) 604 | ), 605 | FileListing( 606 | new Path(testingBaseDirPath, "src/sub2/2.file").toString, 607 | Some(15) 608 | ), 609 | FileListing( 610 | new Path(testingBaseDirPath, "src/sub2/3.file").toString, 611 | Some(15) 612 | ), 613 | FileListing( 614 | new Path(testingBaseDirPath, "src/sub2/subsub1/1.file").toString, 615 | Some(23) 616 | ) 617 | ) 618 | 619 | val destOnlyResult = Seq( 620 | FileListing( 621 | new Path(testingBaseDirPath, "dest/1.file").toString, 622 | Some(10) 623 | ), 624 | FileListing( 625 | new Path(testingBaseDirPath, "dest/2.file").toString, 626 | Some(10) 627 | ), 628 | FileListing( 629 | new Path(testingBaseDirPath, "dest/3.file").toString, 630 | Some(10) 631 | ), 632 | FileListing(new Path(testingBaseDirPath, "dest/sub1").toString, None), 633 | FileListing(new Path(testingBaseDirPath, "dest/sub2").toString, None), 634 | FileListing( 635 | new Path(testingBaseDirPath, "dest/sub2/subsub1").toString, 636 | None 637 | ), 638 | FileListing( 639 | new Path(testingBaseDirPath, "dest/sub1/1.file").toString, 640 | Some(15) 641 | ), 642 | FileListing( 643 | new Path(testingBaseDirPath, "dest/sub1/2.file").toString, 644 | Some(15) 645 | ), 646 | FileListing( 647 | new Path(testingBaseDirPath, "dest/sub1/3.file").toString, 648 | Some(15) 649 | ), 650 | FileListing( 651 | new Path(testingBaseDirPath, "dest/sub2/1.file").toString, 652 | Some(15) 653 | ), 654 | FileListing( 655 | new Path(testingBaseDirPath, "dest/sub2/2.file").toString, 656 | Some(15) 657 | ), 658 | FileListing( 659 | new Path(testingBaseDirPath, "dest/sub2/3.file").toString, 660 | Some(15) 661 | ), 662 | FileListing( 663 | new Path(testingBaseDirPath, "dest/sub2/subsub1/1.file").toString, 664 | Some(23) 665 | ) 666 | ) 667 | 668 | input.foreach(f => createFile(new Path(f), f.getBytes)) 669 | localFileSystem.mkdirs(new Path(testingBaseDirPath, "dest")) 670 | 671 | listFiles(localFileSystem, testingBaseDirPath, 10, false, List.empty) 672 | .map(f => 673 | fileStatusToResult(f._1) 674 | ) should contain theSameElementsAs sourceOnlyResult 675 | 676 | SparkDistCP.run( 677 | spark, 678 | Seq(new Path(testingBaseDirPath, "src")), 679 | new Path(testingBaseDirPath, "dest"), 680 | SparkDistCPOptions(dryRun = true, update = true) 681 | ) 682 | 683 | listFiles(localFileSystem, testingBaseDirPath, 10, false, List.empty) 684 | .map(f => 685 | fileStatusToResult(f._1) 686 | ) should contain theSameElementsAs sourceOnlyResult 687 | 688 | SparkDistCP.run( 689 | spark, 690 | Seq(new Path(testingBaseDirPath, "src")), 691 | new Path(testingBaseDirPath, "dest"), 692 | SparkDistCPOptions(update = true) 693 | ) 694 | 695 | listFiles(localFileSystem, testingBaseDirPath, 10, false, List.empty) 696 | .map(f => 697 | fileStatusToResult(f._1) 698 | ) should contain theSameElementsAs (sourceOnlyResult ++ destOnlyResult) 699 | 700 | spark.stop() 701 | 702 | } 703 | 704 | it( 705 | "perform distcp with update/overwrite with non-update paths and delete" 706 | ) { 707 | 708 | val spark = SparkSession.builder().master("local[*]").getOrCreate() 709 | 710 | val input = List( 711 | "src/1.file", 712 | "src/2.file", 713 | "src/3.file", 714 | "src/sub1/1.file", 715 | "src/sub1/2.file", 716 | 
"src/sub1/3.file", 717 | "src/sub2/1.file", 718 | "src/sub2/2.file", 719 | "src/sub2/3.file", 720 | "src/sub2/subsub1/1.file", 721 | "dest/a.file", 722 | "dest/suba/b.file", 723 | "dest/suba/c.file", 724 | "dest/subb/c.file" 725 | ) 726 | 727 | val sourceOnlyResult = Seq( 728 | FileListing(new Path(testingBaseDirPath, "src").toString, None), 729 | FileListing(new Path(testingBaseDirPath, "dest").toString, None), 730 | FileListing( 731 | new Path(testingBaseDirPath, "src/1.file").toString, 732 | Some(10) 733 | ), 734 | FileListing( 735 | new Path(testingBaseDirPath, "src/2.file").toString, 736 | Some(10) 737 | ), 738 | FileListing( 739 | new Path(testingBaseDirPath, "src/3.file").toString, 740 | Some(10) 741 | ), 742 | FileListing(new Path(testingBaseDirPath, "src/sub1").toString, None), 743 | FileListing(new Path(testingBaseDirPath, "src/sub2").toString, None), 744 | FileListing( 745 | new Path(testingBaseDirPath, "src/sub2/subsub1").toString, 746 | None 747 | ), 748 | FileListing( 749 | new Path(testingBaseDirPath, "src/sub1/1.file").toString, 750 | Some(15) 751 | ), 752 | FileListing( 753 | new Path(testingBaseDirPath, "src/sub1/2.file").toString, 754 | Some(15) 755 | ), 756 | FileListing( 757 | new Path(testingBaseDirPath, "src/sub1/3.file").toString, 758 | Some(15) 759 | ), 760 | FileListing( 761 | new Path(testingBaseDirPath, "src/sub2/1.file").toString, 762 | Some(15) 763 | ), 764 | FileListing( 765 | new Path(testingBaseDirPath, "src/sub2/2.file").toString, 766 | Some(15) 767 | ), 768 | FileListing( 769 | new Path(testingBaseDirPath, "src/sub2/3.file").toString, 770 | Some(15) 771 | ), 772 | FileListing( 773 | new Path(testingBaseDirPath, "src/sub2/subsub1/1.file").toString, 774 | Some(23) 775 | ) 776 | ) 777 | 778 | val destExisting = Seq( 779 | FileListing( 780 | new Path(testingBaseDirPath, "dest/a.file").toString, 781 | Some(11) 782 | ), 783 | FileListing(new Path(testingBaseDirPath, "dest/suba").toString, None), 784 | FileListing(new Path(testingBaseDirPath, "dest/subb").toString, None), 785 | FileListing( 786 | new Path(testingBaseDirPath, "dest/suba/b.file").toString, 787 | Some(16) 788 | ), 789 | FileListing( 790 | new Path(testingBaseDirPath, "dest/suba/c.file").toString, 791 | Some(16) 792 | ), 793 | FileListing( 794 | new Path(testingBaseDirPath, "dest/subb/c.file").toString, 795 | Some(16) 796 | ) 797 | ) 798 | 799 | val destOnlyResult = Seq( 800 | FileListing(new Path(testingBaseDirPath, "dest/src").toString, None), 801 | FileListing( 802 | new Path(testingBaseDirPath, "dest/src/1.file").toString, 803 | Some(10) 804 | ), 805 | FileListing( 806 | new Path(testingBaseDirPath, "dest/src/2.file").toString, 807 | Some(10) 808 | ), 809 | FileListing( 810 | new Path(testingBaseDirPath, "dest/src/3.file").toString, 811 | Some(10) 812 | ), 813 | FileListing( 814 | new Path(testingBaseDirPath, "dest/src/sub1").toString, 815 | None 816 | ), 817 | FileListing( 818 | new Path(testingBaseDirPath, "dest/src/sub2").toString, 819 | None 820 | ), 821 | FileListing( 822 | new Path(testingBaseDirPath, "dest/src/sub2/subsub1").toString, 823 | None 824 | ), 825 | FileListing( 826 | new Path(testingBaseDirPath, "dest/src/sub1/1.file").toString, 827 | Some(15) 828 | ), 829 | FileListing( 830 | new Path(testingBaseDirPath, "dest/src/sub1/2.file").toString, 831 | Some(15) 832 | ), 833 | FileListing( 834 | new Path(testingBaseDirPath, "dest/src/sub1/3.file").toString, 835 | Some(15) 836 | ), 837 | FileListing( 838 | new Path(testingBaseDirPath, "dest/src/sub2/1.file").toString, 839 | Some(15) 840 
| ), 841 | FileListing( 842 | new Path(testingBaseDirPath, "dest/src/sub2/2.file").toString, 843 | Some(15) 844 | ), 845 | FileListing( 846 | new Path(testingBaseDirPath, "dest/src/sub2/3.file").toString, 847 | Some(15) 848 | ), 849 | FileListing( 850 | new Path(testingBaseDirPath, "dest/src/sub2/subsub1/1.file").toString, 851 | Some(23) 852 | ) 853 | ) 854 | 855 | input.foreach(f => createFile(new Path(f), f.getBytes)) 856 | 857 | listFiles(localFileSystem, testingBaseDirPath, 10, false, List.empty) 858 | .map(f => 859 | fileStatusToResult(f._1) 860 | ) should contain theSameElementsAs (sourceOnlyResult ++ destExisting) 861 | 862 | SparkDistCP.run( 863 | spark, 864 | Seq(new Path(testingBaseDirPath, "src")), 865 | new Path(testingBaseDirPath, "dest"), 866 | SparkDistCPOptions( 867 | dryRun = true, 868 | consistentPathBehaviour = true, 869 | overwrite = true, 870 | delete = true 871 | ) 872 | ) 873 | 874 | listFiles(localFileSystem, testingBaseDirPath, 10, false, List.empty) 875 | .map(f => 876 | fileStatusToResult(f._1) 877 | ) should contain theSameElementsAs (sourceOnlyResult ++ destExisting) 878 | 879 | SparkDistCP.run( 880 | spark, 881 | Seq(new Path(testingBaseDirPath, "src")), 882 | new Path(testingBaseDirPath, "dest"), 883 | SparkDistCPOptions( 884 | consistentPathBehaviour = true, 885 | overwrite = true, 886 | delete = true 887 | ) 888 | ) 889 | 890 | listFiles(localFileSystem, testingBaseDirPath, 10, false, List.empty) 891 | .map(f => 892 | fileStatusToResult(f._1) 893 | ) should contain theSameElementsAs (sourceOnlyResult ++ destOnlyResult) 894 | 895 | spark.stop() 896 | 897 | } 898 | 899 | it("perform distcp with update/overwrite with delete") { 900 | 901 | val spark = SparkSession.builder().master("local[*]").getOrCreate() 902 | 903 | val input = List( 904 | "src/1.file", 905 | "src/2.file", 906 | "src/3.file", 907 | "src/sub1/1.file", 908 | "src/sub1/2.file", 909 | "src/sub1/3.file", 910 | "src/sub2/1.file", 911 | "src/sub2/2.file", 912 | "src/sub2/3.file", 913 | "src/sub2/subsub1/1.file", 914 | "dest/a.file", 915 | "dest/suba/b.file", 916 | "dest/suba/c.file", 917 | "dest/subb/c.file" 918 | ) 919 | 920 | val sourceOnlyResult = Seq( 921 | FileListing(new Path(testingBaseDirPath, "src").toString, None), 922 | FileListing(new Path(testingBaseDirPath, "dest").toString, None), 923 | FileListing( 924 | new Path(testingBaseDirPath, "src/1.file").toString, 925 | Some(10) 926 | ), 927 | FileListing( 928 | new Path(testingBaseDirPath, "src/2.file").toString, 929 | Some(10) 930 | ), 931 | FileListing( 932 | new Path(testingBaseDirPath, "src/3.file").toString, 933 | Some(10) 934 | ), 935 | FileListing(new Path(testingBaseDirPath, "src/sub1").toString, None), 936 | FileListing(new Path(testingBaseDirPath, "src/sub2").toString, None), 937 | FileListing( 938 | new Path(testingBaseDirPath, "src/sub2/subsub1").toString, 939 | None 940 | ), 941 | FileListing( 942 | new Path(testingBaseDirPath, "src/sub1/1.file").toString, 943 | Some(15) 944 | ), 945 | FileListing( 946 | new Path(testingBaseDirPath, "src/sub1/2.file").toString, 947 | Some(15) 948 | ), 949 | FileListing( 950 | new Path(testingBaseDirPath, "src/sub1/3.file").toString, 951 | Some(15) 952 | ), 953 | FileListing( 954 | new Path(testingBaseDirPath, "src/sub2/1.file").toString, 955 | Some(15) 956 | ), 957 | FileListing( 958 | new Path(testingBaseDirPath, "src/sub2/2.file").toString, 959 | Some(15) 960 | ), 961 | FileListing( 962 | new Path(testingBaseDirPath, "src/sub2/3.file").toString, 963 | Some(15) 964 | ), 965 | FileListing( 966 
| new Path(testingBaseDirPath, "src/sub2/subsub1/1.file").toString, 967 | Some(23) 968 | ) 969 | ) 970 | 971 | val destExisting = Seq( 972 | FileListing( 973 | new Path(testingBaseDirPath, "dest/a.file").toString, 974 | Some(11) 975 | ), 976 | FileListing(new Path(testingBaseDirPath, "dest/suba").toString, None), 977 | FileListing(new Path(testingBaseDirPath, "dest/subb").toString, None), 978 | FileListing( 979 | new Path(testingBaseDirPath, "dest/suba/b.file").toString, 980 | Some(16) 981 | ), 982 | FileListing( 983 | new Path(testingBaseDirPath, "dest/suba/c.file").toString, 984 | Some(16) 985 | ), 986 | FileListing( 987 | new Path(testingBaseDirPath, "dest/subb/c.file").toString, 988 | Some(16) 989 | ) 990 | ) 991 | 992 | val destOnlyResult = Seq( 993 | FileListing( 994 | new Path(testingBaseDirPath, "dest/1.file").toString, 995 | Some(10) 996 | ), 997 | FileListing( 998 | new Path(testingBaseDirPath, "dest/2.file").toString, 999 | Some(10) 1000 | ), 1001 | FileListing( 1002 | new Path(testingBaseDirPath, "dest/3.file").toString, 1003 | Some(10) 1004 | ), 1005 | FileListing(new Path(testingBaseDirPath, "dest/sub1").toString, None), 1006 | FileListing(new Path(testingBaseDirPath, "dest/sub2").toString, None), 1007 | FileListing( 1008 | new Path(testingBaseDirPath, "dest/sub2/subsub1").toString, 1009 | None 1010 | ), 1011 | FileListing( 1012 | new Path(testingBaseDirPath, "dest/sub1/1.file").toString, 1013 | Some(15) 1014 | ), 1015 | FileListing( 1016 | new Path(testingBaseDirPath, "dest/sub1/2.file").toString, 1017 | Some(15) 1018 | ), 1019 | FileListing( 1020 | new Path(testingBaseDirPath, "dest/sub1/3.file").toString, 1021 | Some(15) 1022 | ), 1023 | FileListing( 1024 | new Path(testingBaseDirPath, "dest/sub2/1.file").toString, 1025 | Some(15) 1026 | ), 1027 | FileListing( 1028 | new Path(testingBaseDirPath, "dest/sub2/2.file").toString, 1029 | Some(15) 1030 | ), 1031 | FileListing( 1032 | new Path(testingBaseDirPath, "dest/sub2/3.file").toString, 1033 | Some(15) 1034 | ), 1035 | FileListing( 1036 | new Path(testingBaseDirPath, "dest/sub2/subsub1/1.file").toString, 1037 | Some(23) 1038 | ) 1039 | ) 1040 | 1041 | input.foreach(f => createFile(new Path(f), f.getBytes)) 1042 | 1043 | listFiles(localFileSystem, testingBaseDirPath, 10, false, List.empty) 1044 | .map(f => 1045 | fileStatusToResult(f._1) 1046 | ) should contain theSameElementsAs (sourceOnlyResult ++ destExisting) 1047 | 1048 | SparkDistCP.run( 1049 | spark, 1050 | Seq(new Path(testingBaseDirPath, "src")), 1051 | new Path(testingBaseDirPath, "dest"), 1052 | SparkDistCPOptions(dryRun = true, overwrite = true, delete = true) 1053 | ) 1054 | 1055 | listFiles(localFileSystem, testingBaseDirPath, 10, false, List.empty) 1056 | .map(f => 1057 | fileStatusToResult(f._1) 1058 | ) should contain theSameElementsAs (sourceOnlyResult ++ destExisting) 1059 | 1060 | SparkDistCP.run( 1061 | spark, 1062 | Seq(new Path(testingBaseDirPath, "src")), 1063 | new Path(testingBaseDirPath, "dest"), 1064 | SparkDistCPOptions(overwrite = true, delete = true) 1065 | ) 1066 | 1067 | listFiles(localFileSystem, testingBaseDirPath, 10, false, List.empty) 1068 | .map(f => 1069 | fileStatusToResult(f._1) 1070 | ) should contain theSameElementsAs (sourceOnlyResult ++ destOnlyResult) 1071 | 1072 | spark.stop() 1073 | 1074 | } 1075 | 1076 | it( 1077 | "perform distcp with update/overwrite with non-update paths and delete and filter" 1078 | ) { 1079 | 1080 | val spark = SparkSession.builder().master("local[*]").getOrCreate() 1081 | 1082 | val input = List( 1083 | 
"src/1.file", 1084 | "src/2.file", 1085 | "src/3.file", 1086 | "src/sub1/1.file", 1087 | "src/sub1/2.file", 1088 | "src/sub1/3.file", 1089 | "src/sub2/1.file", 1090 | "src/sub2/2.file", 1091 | "src/sub2/3.file", 1092 | "src/sub2/subsub1/1.file", 1093 | "dest/a.file", 1094 | "dest/c.file", 1095 | "dest/suba/b.file", 1096 | "dest/suba/c.file", 1097 | "dest/subb/c.file" 1098 | ) 1099 | 1100 | val sourceOnlyResult = Seq( 1101 | FileListing(new Path(testingBaseDirPath, "src").toString, None), 1102 | FileListing(new Path(testingBaseDirPath, "dest").toString, None), 1103 | FileListing( 1104 | new Path(testingBaseDirPath, "src/1.file").toString, 1105 | Some(10) 1106 | ), 1107 | FileListing( 1108 | new Path(testingBaseDirPath, "src/2.file").toString, 1109 | Some(10) 1110 | ), 1111 | FileListing( 1112 | new Path(testingBaseDirPath, "src/3.file").toString, 1113 | Some(10) 1114 | ), 1115 | FileListing(new Path(testingBaseDirPath, "src/sub1").toString, None), 1116 | FileListing(new Path(testingBaseDirPath, "src/sub2").toString, None), 1117 | FileListing( 1118 | new Path(testingBaseDirPath, "src/sub2/subsub1").toString, 1119 | None 1120 | ), 1121 | FileListing( 1122 | new Path(testingBaseDirPath, "src/sub1/1.file").toString, 1123 | Some(15) 1124 | ), 1125 | FileListing( 1126 | new Path(testingBaseDirPath, "src/sub1/2.file").toString, 1127 | Some(15) 1128 | ), 1129 | FileListing( 1130 | new Path(testingBaseDirPath, "src/sub1/3.file").toString, 1131 | Some(15) 1132 | ), 1133 | FileListing( 1134 | new Path(testingBaseDirPath, "src/sub2/1.file").toString, 1135 | Some(15) 1136 | ), 1137 | FileListing( 1138 | new Path(testingBaseDirPath, "src/sub2/2.file").toString, 1139 | Some(15) 1140 | ), 1141 | FileListing( 1142 | new Path(testingBaseDirPath, "src/sub2/3.file").toString, 1143 | Some(15) 1144 | ), 1145 | FileListing( 1146 | new Path(testingBaseDirPath, "src/sub2/subsub1/1.file").toString, 1147 | Some(23) 1148 | ) 1149 | ) 1150 | 1151 | val destExisting = Seq( 1152 | FileListing( 1153 | new Path(testingBaseDirPath, "dest/a.file").toString, 1154 | Some(11) 1155 | ), 1156 | FileListing( 1157 | new Path(testingBaseDirPath, "dest/c.file").toString, 1158 | Some(11) 1159 | ), 1160 | FileListing(new Path(testingBaseDirPath, "dest/suba").toString, None), 1161 | FileListing(new Path(testingBaseDirPath, "dest/subb").toString, None), 1162 | FileListing( 1163 | new Path(testingBaseDirPath, "dest/suba/b.file").toString, 1164 | Some(16) 1165 | ), 1166 | FileListing( 1167 | new Path(testingBaseDirPath, "dest/suba/c.file").toString, 1168 | Some(16) 1169 | ), 1170 | FileListing( 1171 | new Path(testingBaseDirPath, "dest/subb/c.file").toString, 1172 | Some(16) 1173 | ) 1174 | ) 1175 | 1176 | val destOnlyResult = Seq( 1177 | FileListing(new Path(testingBaseDirPath, "dest/src").toString, None), 1178 | FileListing( 1179 | new Path(testingBaseDirPath, "dest/src/2.file").toString, 1180 | Some(10) 1181 | ), 1182 | FileListing( 1183 | new Path(testingBaseDirPath, "dest/src/3.file").toString, 1184 | Some(10) 1185 | ), 1186 | FileListing( 1187 | new Path(testingBaseDirPath, "dest/src/sub1").toString, 1188 | None 1189 | ), 1190 | FileListing( 1191 | new Path(testingBaseDirPath, "dest/src/sub2").toString, 1192 | None 1193 | ), 1194 | FileListing( 1195 | new Path(testingBaseDirPath, "dest/src/sub2/subsub1").toString, 1196 | None 1197 | ), 1198 | FileListing( 1199 | new Path(testingBaseDirPath, "dest/src/sub1/2.file").toString, 1200 | Some(15) 1201 | ), 1202 | FileListing( 1203 | new Path(testingBaseDirPath, 
"dest/src/sub1/3.file").toString, 1204 | Some(15) 1205 | ), 1206 | FileListing( 1207 | new Path(testingBaseDirPath, "dest/src/sub2/2.file").toString, 1208 | Some(15) 1209 | ), 1210 | FileListing( 1211 | new Path(testingBaseDirPath, "dest/src/sub2/3.file").toString, 1212 | Some(15) 1213 | ) 1214 | ) 1215 | 1216 | input.foreach(f => createFile(new Path(f), f.getBytes)) 1217 | 1218 | listFiles(localFileSystem, testingBaseDirPath, 10, false, List.empty) 1219 | .map(f => 1220 | fileStatusToResult(f._1) 1221 | ) should contain theSameElementsAs (sourceOnlyResult ++ destExisting) 1222 | 1223 | SparkDistCP.run( 1224 | spark, 1225 | Seq(new Path(testingBaseDirPath, "src")), 1226 | new Path(testingBaseDirPath, "dest"), 1227 | SparkDistCPOptions( 1228 | dryRun = true, 1229 | consistentPathBehaviour = true, 1230 | overwrite = true, 1231 | delete = true, 1232 | filterNot = List(""".*/1\.file$""".r, """.*/c\.file$""".r) 1233 | ) 1234 | ) 1235 | 1236 | listFiles(localFileSystem, testingBaseDirPath, 10, false, List.empty) 1237 | .map(f => 1238 | fileStatusToResult(f._1) 1239 | ) should contain theSameElementsAs (sourceOnlyResult ++ destExisting) 1240 | 1241 | SparkDistCP.run( 1242 | spark, 1243 | Seq(new Path(testingBaseDirPath, "src")), 1244 | new Path(testingBaseDirPath, "dest"), 1245 | SparkDistCPOptions( 1246 | consistentPathBehaviour = true, 1247 | overwrite = true, 1248 | delete = true, 1249 | filterNot = List(""".*/1\.file$""".r, """.*/c\.file$""".r) 1250 | ) 1251 | ) 1252 | 1253 | listFiles(localFileSystem, testingBaseDirPath, 10, false, List.empty) 1254 | .map(f => 1255 | fileStatusToResult(f._1) 1256 | ) should contain theSameElementsAs (sourceOnlyResult ++ destOnlyResult) 1257 | 1258 | spark.stop() 1259 | 1260 | } 1261 | 1262 | it("perform distcp with update/overwrite with and delete and filter") { 1263 | 1264 | val spark = SparkSession.builder().master("local[*]").getOrCreate() 1265 | 1266 | val input = List( 1267 | "src/1.file", 1268 | "src/2.file", 1269 | "src/3.file", 1270 | "src/sub1/1.file", 1271 | "src/sub1/2.file", 1272 | "src/sub1/3.file", 1273 | "src/sub2/1.file", 1274 | "src/sub2/2.file", 1275 | "src/sub2/3.file", 1276 | "src/sub2/subsub1/1.file", 1277 | "dest/a.file", 1278 | "dest/c.file", 1279 | "dest/suba/b.file", 1280 | "dest/suba/c.file", 1281 | "dest/subb/c.file" 1282 | ) 1283 | 1284 | val sourceOnlyResult = Seq( 1285 | FileListing(new Path(testingBaseDirPath, "src").toString, None), 1286 | FileListing(new Path(testingBaseDirPath, "dest").toString, None), 1287 | FileListing( 1288 | new Path(testingBaseDirPath, "src/1.file").toString, 1289 | Some(10) 1290 | ), 1291 | FileListing( 1292 | new Path(testingBaseDirPath, "src/2.file").toString, 1293 | Some(10) 1294 | ), 1295 | FileListing( 1296 | new Path(testingBaseDirPath, "src/3.file").toString, 1297 | Some(10) 1298 | ), 1299 | FileListing(new Path(testingBaseDirPath, "src/sub1").toString, None), 1300 | FileListing(new Path(testingBaseDirPath, "src/sub2").toString, None), 1301 | FileListing( 1302 | new Path(testingBaseDirPath, "src/sub2/subsub1").toString, 1303 | None 1304 | ), 1305 | FileListing( 1306 | new Path(testingBaseDirPath, "src/sub1/1.file").toString, 1307 | Some(15) 1308 | ), 1309 | FileListing( 1310 | new Path(testingBaseDirPath, "src/sub1/2.file").toString, 1311 | Some(15) 1312 | ), 1313 | FileListing( 1314 | new Path(testingBaseDirPath, "src/sub1/3.file").toString, 1315 | Some(15) 1316 | ), 1317 | FileListing( 1318 | new Path(testingBaseDirPath, "src/sub2/1.file").toString, 1319 | Some(15) 1320 | ), 1321 | 
FileListing( 1322 | new Path(testingBaseDirPath, "src/sub2/2.file").toString, 1323 | Some(15) 1324 | ), 1325 | FileListing( 1326 | new Path(testingBaseDirPath, "src/sub2/3.file").toString, 1327 | Some(15) 1328 | ), 1329 | FileListing( 1330 | new Path(testingBaseDirPath, "src/sub2/subsub1/1.file").toString, 1331 | Some(23) 1332 | ) 1333 | ) 1334 | 1335 | val destExisting = Seq( 1336 | FileListing( 1337 | new Path(testingBaseDirPath, "dest/a.file").toString, 1338 | Some(11) 1339 | ), 1340 | FileListing( 1341 | new Path(testingBaseDirPath, "dest/c.file").toString, 1342 | Some(11) 1343 | ), 1344 | FileListing(new Path(testingBaseDirPath, "dest/suba").toString, None), 1345 | FileListing(new Path(testingBaseDirPath, "dest/subb").toString, None), 1346 | FileListing( 1347 | new Path(testingBaseDirPath, "dest/suba/b.file").toString, 1348 | Some(16) 1349 | ), 1350 | FileListing( 1351 | new Path(testingBaseDirPath, "dest/suba/c.file").toString, 1352 | Some(16) 1353 | ), 1354 | FileListing( 1355 | new Path(testingBaseDirPath, "dest/subb/c.file").toString, 1356 | Some(16) 1357 | ) 1358 | ) 1359 | 1360 | val destOnlyResult = Seq( 1361 | FileListing( 1362 | new Path(testingBaseDirPath, "dest/2.file").toString, 1363 | Some(10) 1364 | ), 1365 | FileListing( 1366 | new Path(testingBaseDirPath, "dest/3.file").toString, 1367 | Some(10) 1368 | ), 1369 | FileListing(new Path(testingBaseDirPath, "dest/sub1").toString, None), 1370 | FileListing(new Path(testingBaseDirPath, "dest/sub2").toString, None), 1371 | FileListing( 1372 | new Path(testingBaseDirPath, "dest/sub2/subsub1").toString, 1373 | None 1374 | ), 1375 | FileListing( 1376 | new Path(testingBaseDirPath, "dest/sub1/2.file").toString, 1377 | Some(15) 1378 | ), 1379 | FileListing( 1380 | new Path(testingBaseDirPath, "dest/sub1/3.file").toString, 1381 | Some(15) 1382 | ), 1383 | FileListing( 1384 | new Path(testingBaseDirPath, "dest/sub2/2.file").toString, 1385 | Some(15) 1386 | ), 1387 | FileListing( 1388 | new Path(testingBaseDirPath, "dest/sub2/3.file").toString, 1389 | Some(15) 1390 | ) 1391 | ) 1392 | 1393 | input.foreach(f => createFile(new Path(f), f.getBytes)) 1394 | 1395 | listFiles(localFileSystem, testingBaseDirPath, 10, false, List.empty) 1396 | .map(f => 1397 | fileStatusToResult(f._1) 1398 | ) should contain theSameElementsAs (sourceOnlyResult ++ destExisting) 1399 | 1400 | SparkDistCP.run( 1401 | spark, 1402 | Seq(new Path(testingBaseDirPath, "src")), 1403 | new Path(testingBaseDirPath, "dest"), 1404 | SparkDistCPOptions( 1405 | dryRun = true, 1406 | overwrite = true, 1407 | delete = true, 1408 | filterNot = List(""".*/1\.file$""".r, """.*/c\.file$""".r) 1409 | ) 1410 | ) 1411 | 1412 | listFiles(localFileSystem, testingBaseDirPath, 10, false, List.empty) 1413 | .map(f => 1414 | fileStatusToResult(f._1) 1415 | ) should contain theSameElementsAs (sourceOnlyResult ++ destExisting) 1416 | 1417 | SparkDistCP.run( 1418 | spark, 1419 | Seq(new Path(testingBaseDirPath, "src")), 1420 | new Path(testingBaseDirPath, "dest"), 1421 | SparkDistCPOptions( 1422 | overwrite = true, 1423 | delete = true, 1424 | filterNot = List(""".*/1\.file$""".r, """.*/c\.file$""".r) 1425 | ) 1426 | ) 1427 | 1428 | listFiles(localFileSystem, testingBaseDirPath, 10, false, List.empty) 1429 | .map(f => 1430 | fileStatusToResult(f._1) 1431 | ) should contain theSameElementsAs (sourceOnlyResult ++ destOnlyResult) 1432 | 1433 | spark.stop() 1434 | 1435 | } 1436 | 1437 | it("perform distcp with non-update/non-overwrite and logging") { 1438 | 1439 | val spark = 
SparkSession.builder().master("local[*]").getOrCreate() 1440 | 1441 | val input = List( 1442 | "src/1.file", 1443 | "src/2.file", 1444 | "src/3.file", 1445 | "src/sub1/1.file", 1446 | "src/sub1/2.file", 1447 | "src/sub1/3.file", 1448 | "src/sub2/1.file", 1449 | "src/sub2/2.file", 1450 | "src/sub2/3.file", 1451 | "src/sub2/subsub1/1.file" 1452 | ) 1453 | 1454 | input.foreach(f => createFile(new Path(f), f.getBytes)) 1455 | localFileSystem.mkdirs(new Path(testingBaseDirPath, "dest")) 1456 | 1457 | localFileSystem.exists( 1458 | new Path(testingBaseDirPath, "dryrun.log") 1459 | ) should be(false) 1460 | 1461 | SparkDistCP.run( 1462 | spark, 1463 | Seq(new Path(testingBaseDirPath, "src")), 1464 | new Path(testingBaseDirPath, "dest"), 1465 | SparkDistCPOptions( 1466 | dryRun = true, 1467 | log = Some(new Path(testingBaseDirPath, "dryrun.log").toUri) 1468 | ) 1469 | ) 1470 | 1471 | localFileSystem.exists( 1472 | new Path(testingBaseDirPath, "dryrun.log") 1473 | ) should be(true) 1474 | 1475 | localFileSystem.exists(new Path(testingBaseDirPath, "run.log")) should be( 1476 | false 1477 | ) 1478 | 1479 | SparkDistCP.run( 1480 | spark, 1481 | Seq(new Path(testingBaseDirPath, "src")), 1482 | new Path(testingBaseDirPath, "dest"), 1483 | SparkDistCPOptions(log = 1484 | Some(new Path(testingBaseDirPath, "run.log").toUri) 1485 | ) 1486 | ) 1487 | 1488 | localFileSystem.exists(new Path(testingBaseDirPath, "run.log")) should be( 1489 | true 1490 | ) 1491 | 1492 | spark.stop() 1493 | 1494 | } 1495 | 1496 | it("provide an empty source path list") { 1497 | val spark = SparkSession.builder().master("local[*]").getOrCreate() 1498 | 1499 | intercept[java.lang.AssertionError] { 1500 | SparkDistCP.run( 1501 | spark, 1502 | Seq.empty, 1503 | new Path("test"), 1504 | SparkDistCPOptions() 1505 | ) 1506 | }.getMessage should be( 1507 | "assertion failed: At least one source path must be given" 1508 | ) 1509 | 1510 | spark.stop() 1511 | } 1512 | 1513 | it("provide an incorrect options configuration") { 1514 | val spark = SparkSession.builder().master("local[*]").getOrCreate() 1515 | 1516 | intercept[java.lang.AssertionError] { 1517 | SparkDistCP.run( 1518 | spark, 1519 | Seq(new Path("src")), 1520 | new Path("dest"), 1521 | SparkDistCPOptions(update = true, overwrite = true) 1522 | ) 1523 | }.getMessage should be( 1524 | "assertion failed: Both update and overwrite cannot be specified" 1525 | ) 1526 | 1527 | spark.stop() 1528 | } 1529 | 1530 | } 1531 | 1532 | } 1533 | -------------------------------------------------------------------------------- /src/test/scala/com/coxautodata/TestSparkDistCPOptions.scala: -------------------------------------------------------------------------------- 1 | package com.coxautodata 2 | 3 | import org.scalatest.funspec.AnyFunSpec 4 | import org.scalatest.matchers.should.Matchers 5 | 6 | class TestSparkDistCPOptions extends AnyFunSpec with Matchers { 7 | 8 | it("updateOverwritePathBehaviour") { 9 | 10 | SparkDistCPOptions().updateOverwritePathBehaviour should be(false) 11 | SparkDistCPOptions(consistentPathBehaviour = 12 | true 13 | ).updateOverwritePathBehaviour should be(false) 14 | SparkDistCPOptions( 15 | update = true, 16 | consistentPathBehaviour = true 17 | ).updateOverwritePathBehaviour should be(false) 18 | SparkDistCPOptions( 19 | overwrite = true, 20 | consistentPathBehaviour = true 21 | ).updateOverwritePathBehaviour should be(false) 22 | SparkDistCPOptions(update = true).updateOverwritePathBehaviour should be( 23 | true 24 | ) 25 | SparkDistCPOptions(overwrite = 
true).updateOverwritePathBehaviour should be( 26 | true 27 | ) 28 | 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /src/test/scala/com/coxautodata/TestSpec.scala: -------------------------------------------------------------------------------- 1 | package com.coxautodata 2 | 3 | import java.io.ByteArrayInputStream 4 | import java.nio.file.Files 5 | 6 | import com.coxautodata.objects.SerializableFileStatus 7 | import com.coxautodata.utils.FileListing 8 | import org.apache.commons.io.{FileUtils, IOUtils} 9 | import org.apache.hadoop.conf.Configuration 10 | import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path} 11 | import org.scalatest.BeforeAndAfterEach 12 | import org.scalatest.funspec.AnyFunSpec 13 | import org.scalatest.matchers.should.Matchers 14 | 15 | trait TestSpec extends AnyFunSpec with Matchers with BeforeAndAfterEach { 16 | 17 | var testingBaseDir: java.nio.file.Path = _ 18 | var testingBaseDirName: String = _ 19 | var testingBaseDirPath: Path = _ 20 | var localFileSystem: LocalFileSystem = _ 21 | 22 | override def beforeEach(): Unit = { 23 | super.beforeEach() 24 | testingBaseDir = Files.createTempDirectory("test_output") 25 | testingBaseDirName = testingBaseDir.toString 26 | localFileSystem = FileSystem.getLocal(new Configuration()) 27 | testingBaseDirPath = 28 | localFileSystem.makeQualified(new Path(testingBaseDirName)) 29 | } 30 | 31 | override def afterEach(): Unit = { 32 | super.afterEach() 33 | FileUtils.deleteDirectory(testingBaseDir.toFile) 34 | } 35 | 36 | def createFile( 37 | relativePath: Path, 38 | content: Array[Byte] 39 | ): SerializableFileStatus = { 40 | val path = new Path(testingBaseDirPath, relativePath) 41 | localFileSystem.mkdirs(path.getParent) 42 | val in = new ByteArrayInputStream(content) 43 | val out = localFileSystem.create(path) 44 | IOUtils.copy(in, out) 45 | in.close() 46 | out.close() 47 | SerializableFileStatus(localFileSystem.getFileStatus(path)) 48 | } 49 | 50 | def fileStatusToResult(f: SerializableFileStatus): FileListing = { 51 | FileListing(f.getPath.toString, if (f.isFile) Some(f.getLen) else None) 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/test/scala/com/coxautodata/objects/TestAccumulators.scala: -------------------------------------------------------------------------------- 1 | package com.coxautodata.objects 2 | 3 | import java.io.FileNotFoundException 4 | 5 | import org.apache.spark.sql.SparkSession 6 | import org.scalatest.funspec.AnyFunSpec 7 | import org.scalatest.matchers.should.Matchers 8 | 9 | class TestAccumulators extends AnyFunSpec with Matchers { 10 | 11 | it("test all accumulator conditions") { 12 | 13 | val spark = SparkSession.builder().master("local[*]").getOrCreate() 14 | 15 | val testCases: Seq[DistCPResult] = List( 16 | DeleteResult(null, DeleteActionResult.SkippedDoesNotExists), 17 | DeleteResult(null, DeleteActionResult.SkippedDryRun), 18 | DeleteResult(null, DeleteActionResult.Deleted), 19 | DeleteResult( 20 | null, 21 | DeleteActionResult.Failed(new RuntimeException("test")) 22 | ), 23 | DirectoryCopyResult(null, null, CopyActionResult.SkippedAlreadyExists), 24 | DirectoryCopyResult(null, null, CopyActionResult.SkippedDryRun), 25 | DirectoryCopyResult(null, null, CopyActionResult.Created), 26 | DirectoryCopyResult( 27 | null, 28 | null, 29 | CopyActionResult.Failed(new RuntimeException("test")) 30 | ), 31 | FileCopyResult(null, null, 1, CopyActionResult.SkippedAlreadyExists), 32 | 
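// Worked arithmetic behind the summary text asserted at the end of this test (all figures come from the
// result list itself): data copied = 1000000 (Copied) + 1000 (OverwrittenOrUpdated) = 1001000 bytes;
// data skipped = 1 + 1000 + 1000000 + 50000 = 1051001 bytes; files copied = 2 (Copied plus
// OverwrittenOrUpdated); skipped file copies = 4 (already-exists, identical, dry-run, failed);
// exceptions = 2 x RuntimeException and 1 x FileNotFoundException.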
FileCopyResult( 33 | null, 34 | null, 35 | 1000, 36 | CopyActionResult.SkippedIdenticalFileAlreadyExists 37 | ), 38 | FileCopyResult(null, null, 1000000, CopyActionResult.SkippedDryRun), 39 | FileCopyResult(null, null, 1000000, CopyActionResult.Copied), 40 | FileCopyResult(null, null, 1000, CopyActionResult.OverwrittenOrUpdated), 41 | FileCopyResult( 42 | null, 43 | null, 44 | 50000, 45 | CopyActionResult.Failed(new FileNotFoundException("test")) 46 | ) 47 | ) 48 | 49 | val acc = new Accumulators(spark) 50 | 51 | testCases.foreach(acc.handleResult) 52 | 53 | acc.getOutputText should be("""--Raw data-- 54 | |Data copied: 977 KB (1001000 bytes) 55 | |Data skipped (already existing files, dry-run and failures): 1 MB (1051001 bytes) 56 | |--Files-- 57 | |Files copied (new files and overwritten/updated files): 2 58 | |Files overwritten/updated: 1 59 | |Skipped files for copying (already existing files, dry-run and failures): 4 60 | |Failed files during copy: 1 61 | |--Folders-- 62 | |Folders created: 1 63 | |Skipped folder creates (already existing folders, dry-run and failures): 3 64 | |Failed folder creates: 1 65 | |--Deletes-- 66 | |Successful delete operations: 1 67 | |Skipped delete operations (files/folders already missing, dry-run and failures): 3 68 | |Failed delete operations: 1 69 | |--Exception counts-- 70 | |java.lang.RuntimeException: 2 71 | |java.io.FileNotFoundException: 1""".stripMargin) 72 | 73 | spark.stop() 74 | 75 | } 76 | 77 | } 78 | -------------------------------------------------------------------------------- /src/test/scala/com/coxautodata/objects/TestCopyPartitioner.scala: -------------------------------------------------------------------------------- 1 | package com.coxautodata.objects 2 | 3 | import org.scalatest.funspec.AnyFunSpec 4 | import org.scalatest.matchers.should.Matchers 5 | 6 | class TestCopyPartitioner extends AnyFunSpec with Matchers { 7 | 8 | describe("getCopyPartitioner") { 9 | 10 | it("produce a valid partitioner") { 11 | 12 | val input = Array(0 -> 2, 1 -> 1, 2 -> 0, 3 -> 1) 13 | val partitioner = CopyPartitioner(input) 14 | 15 | partitioner.partitionOffsets should contain theSameElementsAs Map( 16 | 0 -> 0, 17 | 1 -> 3, 18 | 2 -> 5, 19 | 3 -> 6 20 | ) 21 | 22 | partitioner.numPartitions should be(8) 23 | 24 | partitioner.getPartition((3, 1)) should be(7) 25 | 26 | intercept[RuntimeException] { 27 | partitioner.getPartition(1) 28 | }.getMessage should be( 29 | "Partitioned does not support key [1]. Must be (Int, Int)." 30 | ) 31 | 32 | intercept[RuntimeException] { 33 | partitioner.getPartition((4, 1)) 34 | }.getMessage should be( 35 | "Key partition 4 of key [(4, 1)] was not found in the indexes [0, 1, 2, 3]." 
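// How these offsets fall out of the input, read off the assertions rather than CopyPartitioner itself:
// in Array(0 -> 2, 1 -> 1, 2 -> 0, 3 -> 1) each value is the highest batch index seen for that key, so
// the per-key partition counts are 3, 2, 1 and 2 and partitionOffsets is their running sum
// (0 -> 0, 1 -> 3, 2 -> 5, 3 -> 6), giving numPartitions = 8. getPartition((key, batch)) is then
// presumably offsets(key) + batch, capped at that key's own maximum, which is why both (2, 0) and
// (2, 1) resolve to partition 5.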
36 | ) 37 | 38 | partitioner.getPartition((2, 0)) should be(5) 39 | 40 | partitioner.getPartition((2, 1)) should be(5) 41 | 42 | } 43 | 44 | it("partitioner with missing partitions") { 45 | 46 | val input = Array(0 -> 2, 1 -> 1, 3 -> 1) 47 | val partitioner = CopyPartitioner(input) 48 | 49 | partitioner.partitionOffsets should contain theSameElementsAs Map( 50 | 0 -> 0, 51 | 1 -> 3, 52 | 3 -> 5 53 | ) 54 | 55 | partitioner.numPartitions should be(7) 56 | 57 | partitioner.getPartition((0, 0)) should be(0) 58 | partitioner.getPartition((0, 1)) should be(1) 59 | partitioner.getPartition((0, 2)) should be(2) 60 | partitioner.getPartition((1, 0)) should be(3) 61 | partitioner.getPartition((1, 1)) should be(4) 62 | intercept[RuntimeException] { 63 | partitioner.getPartition((2, 0)) 64 | }.getMessage should be( 65 | "Key partition 2 of key [(2, 0)] was not found in the indexes [0, 1, 3]." 66 | ) 67 | partitioner.getPartition((3, 0)) should be(5) 68 | partitioner.getPartition((3, 1)) should be(6) 69 | 70 | } 71 | 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /src/test/scala/com/coxautodata/objects/TestExceptionCountAccumulator.scala: -------------------------------------------------------------------------------- 1 | package com.coxautodata.objects 2 | 3 | import java.util 4 | 5 | import org.apache.spark.util.AccumulatorV2 6 | import org.scalatest.funspec.AnyFunSpec 7 | import org.scalatest.matchers.should.Matchers 8 | 9 | import scala.jdk.CollectionConverters._ 10 | 11 | class TestExceptionCountAccumulator extends AnyFunSpec with Matchers { 12 | 13 | it("test implementation") { 14 | 15 | val acc = new ExceptionCountAccumulator 16 | 17 | acc.add(new RuntimeException) 18 | acc.add("TestException") 19 | 20 | acc.value.asScala should contain theSameElementsAs Map( 21 | "TestException" -> 1, 22 | "java.lang.RuntimeException" -> 1 23 | ) 24 | 25 | val accCopy = acc.copy() 26 | acc.value.asScala should contain theSameElementsAs Map( 27 | "TestException" -> 1, 28 | "java.lang.RuntimeException" -> 1 29 | ) 30 | acc.value.asScala should contain theSameElementsAs accCopy.value.asScala 31 | 32 | acc.merge(accCopy) 33 | acc.value.asScala should contain theSameElementsAs Map( 34 | "TestException" -> 2, 35 | "java.lang.RuntimeException" -> 2 36 | ) 37 | 38 | acc.reset() 39 | acc.isZero should be(true) 40 | acc.value.isEmpty should be(true) 41 | 42 | intercept[UnsupportedOperationException] { 43 | acc.merge(new TestAcc) 44 | }.getMessage should be( 45 | "Cannot merge com.coxautodata.objects.ExceptionCountAccumulator with com.coxautodata.objects.TestAcc" 46 | ) 47 | 48 | } 49 | 50 | } 51 | 52 | class TestAcc extends AccumulatorV2[String, java.util.Map[String, Long]] { 53 | override def isZero: Boolean = ??? 54 | 55 | override def copy(): AccumulatorV2[String, util.Map[String, Long]] = ??? 56 | 57 | override def reset(): Unit = ??? 58 | 59 | override def add(v: String): Unit = ??? 60 | 61 | override def merge( 62 | other: AccumulatorV2[String, util.Map[String, Long]] 63 | ): Unit = ??? 64 | 65 | override def value: util.Map[String, Long] = ??? 
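  // TestAcc exists only so the merge() type check in ExceptionCountAccumulator has a foreign
  // AccumulatorV2 implementation to reject (see the UnsupportedOperationException asserted above);
  // the ??? bodies are never invoked by this spec. Any real custom accumulator would override these
  // same six members (isZero, copy, reset, add, merge and value) with working logic.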
66 | } 67 | -------------------------------------------------------------------------------- /src/test/scala/com/coxautodata/objects/TestFileSystemObjectCacher.scala: -------------------------------------------------------------------------------- 1 | package com.coxautodata.objects 2 | 3 | import java.net.URI 4 | 5 | import org.apache.hadoop.conf.Configuration 6 | import org.scalatest.funspec.AnyFunSpec 7 | import org.scalatest.matchers.should.Matchers 8 | 9 | class TestFileSystemObjectCacher extends AnyFunSpec with Matchers { 10 | 11 | it("should create and cache a filesystem") { 12 | 13 | val conf = new Configuration() 14 | 15 | val cache = new FileSystemObjectCacher(conf) 16 | 17 | cache.get(new URI("file:///test/file")) should be(None) 18 | 19 | cache.getOrCreate(new URI("file:///test/file")).getUri.toString should be( 20 | "file:///" 21 | ) 22 | 23 | cache.get(new URI("file:///test2/file2")).map(_.getUri.toString) should be( 24 | Some("file:///") 25 | ) 26 | 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/test/scala/com/coxautodata/utils/TestCopyUtils.scala: -------------------------------------------------------------------------------- 1 | package com.coxautodata.utils 2 | 3 | import com.coxautodata.objects.DeleteActionResult.{ 4 | Deleted, 5 | SkippedDoesNotExists, 6 | SkippedDryRun 7 | } 8 | import com.coxautodata.objects._ 9 | import com.coxautodata.utils.CopyUtils._ 10 | import com.coxautodata.{SparkDistCPOptions, TestSpec} 11 | import org.apache.hadoop.fs.Path 12 | 13 | class TestCopyUtils extends TestSpec { 14 | 15 | describe("filesAreIdentical") { 16 | 17 | it("differing lengths") { 18 | val one = createFile(new Path("1.file"), "a".getBytes) 19 | val two = createFile(new Path("2.file"), "aa".getBytes) 20 | filesAreIdentical( 21 | one, 22 | Option(localFileSystem.getFileChecksum(one.getPath)), 23 | two, 24 | Option(localFileSystem.getFileChecksum(two.getPath)) 25 | ) should be(false) 26 | } 27 | 28 | it("same file") { 29 | val one = createFile(new Path("1.file"), "a".getBytes) 30 | val two = createFile(new Path("2.file"), "a".getBytes) 31 | filesAreIdentical( 32 | one, 33 | Option(localFileSystem.getFileChecksum(one.getPath)), 34 | two, 35 | Option(localFileSystem.getFileChecksum(two.getPath)) 36 | ) should be(true) 37 | } 38 | 39 | } 40 | 41 | describe("performCopy") { 42 | 43 | it("successful copy") { 44 | 45 | val source = createFile(new Path("1.file"), "a".getBytes) 46 | val destPath = new Path(testingBaseDirPath, "2.file") 47 | performCopy( 48 | localFileSystem, 49 | source, 50 | localFileSystem, 51 | destPath.toUri, 52 | removeExisting = false, 53 | ignoreErrors = false, 54 | taskAttemptID = 1 55 | ) 56 | 57 | filesAreIdentical( 58 | source, 59 | Option(localFileSystem.getFileChecksum(source.getPath)), 60 | SerializableFileStatus(localFileSystem.getFileStatus(destPath)), 61 | Option(localFileSystem.getFileChecksum(destPath)) 62 | ) should be(true) 63 | 64 | } 65 | 66 | it("successful copy with overwrite") { 67 | 68 | val source = createFile(new Path("1.file"), "a".getBytes) 69 | val destPath = new Path(testingBaseDirPath, "2.file") 70 | 71 | createFile(new Path(destPath.getName), "aa".getBytes) 72 | 73 | filesAreIdentical( 74 | source, 75 | Option(localFileSystem.getFileChecksum(source.getPath)), 76 | SerializableFileStatus(localFileSystem.getFileStatus(destPath)), 77 | Option(localFileSystem.getFileChecksum(destPath)) 78 | ) should be(false) 79 | 80 | performCopy( 81 | localFileSystem, 82 | source, 83 | 
localFileSystem, 84 | destPath.toUri, 85 | removeExisting = true, 86 | ignoreErrors = false, 87 | taskAttemptID = 1 88 | ) 89 | 90 | filesAreIdentical( 91 | source, 92 | Option(localFileSystem.getFileChecksum(source.getPath)), 93 | SerializableFileStatus(localFileSystem.getFileStatus(destPath)), 94 | Option(localFileSystem.getFileChecksum(destPath)) 95 | ) should be(true) 96 | 97 | } 98 | 99 | it("failed rename") { 100 | val source = createFile(new Path("1.file"), "a".getBytes) 101 | val destPath = new Path(testingBaseDirPath, "2.file") 102 | 103 | createFile(new Path(destPath.getName), "aa".getBytes) 104 | 105 | intercept[RuntimeException] { 106 | performCopy( 107 | localFileSystem, 108 | source, 109 | localFileSystem, 110 | destPath.toUri, 111 | removeExisting = false, 112 | ignoreErrors = false, 113 | taskAttemptID = 1 114 | ) 115 | } 116 | 117 | performCopy( 118 | localFileSystem, 119 | source, 120 | localFileSystem, 121 | destPath.toUri, 122 | removeExisting = false, 123 | ignoreErrors = true, 124 | taskAttemptID = 2 125 | ).getMessage should be( 126 | s"Source: [${source.getPath.toUri}], Destination: [$destPath], " + 127 | s"Type: [FileCopy: 1 bytes], Result: [Failed: Cannot create file [$destPath] as it already exists]" 128 | ) 129 | 130 | } 131 | 132 | } 133 | 134 | describe("createDirectory") { 135 | 136 | it("create a directory") { 137 | val sourcePath = new Path(testingBaseDirPath, "sub") 138 | localFileSystem.mkdirs(sourcePath) 139 | val destPath = new Path(testingBaseDirPath, "dest") 140 | val copyDefinition = SingleCopyDefinition( 141 | SerializableFileStatus(localFileSystem.getFileStatus(sourcePath)), 142 | destPath.toUri 143 | ) 144 | 145 | // Dry run 146 | localFileSystem.exists(destPath) should be(false) 147 | createDirectory( 148 | localFileSystem, 149 | copyDefinition, 150 | SparkDistCPOptions(dryRun = true) 151 | ) 152 | localFileSystem.exists(destPath) should be(false) 153 | 154 | // Create 155 | localFileSystem.exists(destPath) should be(false) 156 | createDirectory( 157 | localFileSystem, 158 | copyDefinition, 159 | SparkDistCPOptions() 160 | ).copyAction should be(CopyActionResult.Created) 161 | localFileSystem.exists(destPath) should be(true) 162 | 163 | // Already exists 164 | createDirectory( 165 | localFileSystem, 166 | copyDefinition, 167 | SparkDistCPOptions() 168 | ).copyAction should be(CopyActionResult.SkippedAlreadyExists) 169 | } 170 | 171 | } 172 | 173 | describe("copyFile") { 174 | 175 | it("successful copy") { 176 | 177 | val source = createFile(new Path("1.file"), "a".getBytes) 178 | val destPath = new Path(testingBaseDirPath, "2.file") 179 | val copyDefinition = SingleCopyDefinition(source, destPath.toUri) 180 | 181 | // Dry run 182 | copyFile( 183 | localFileSystem, 184 | localFileSystem, 185 | copyDefinition, 186 | SparkDistCPOptions(dryRun = true), 187 | 1 188 | ).copyAction should be(CopyActionResult.SkippedDryRun) 189 | localFileSystem.exists(destPath) should be(false) 190 | 191 | // Missing at destination 192 | copyFile( 193 | localFileSystem, 194 | localFileSystem, 195 | copyDefinition, 196 | SparkDistCPOptions(), 197 | 1 198 | ).copyAction should be(CopyActionResult.Copied) 199 | localFileSystem.exists(destPath) should be(true) 200 | 201 | filesAreIdentical( 202 | source, 203 | Option(localFileSystem.getFileChecksum(source.getPath)), 204 | SerializableFileStatus(localFileSystem.getFileStatus(destPath)), 205 | Option(localFileSystem.getFileChecksum(destPath)) 206 | ) should be(true) 207 | 208 | // Present at destination, skipped 209 | 
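// (Taken together, the copyFile scenarios in this spec imply the following decision table, stated here
// as a reading of the assertions rather than of the implementation: with no flags an existing
// destination file is left untouched even when its contents differ; overwrite always rewrites it; and
// update rewrites only when the checksums differ, otherwise reporting
// SkippedIdenticalFileAlreadyExists.)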
localFileSystem.delete(destPath, false) 210 | createFile(new Path("2.file"), "aa".getBytes).getPath 211 | filesAreIdentical( 212 | source, 213 | Option(localFileSystem.getFileChecksum(source.getPath)), 214 | SerializableFileStatus(localFileSystem.getFileStatus(destPath)), 215 | Option(localFileSystem.getFileChecksum(destPath)) 216 | ) should be(false) 217 | 218 | copyFile( 219 | localFileSystem, 220 | localFileSystem, 221 | copyDefinition, 222 | SparkDistCPOptions(), 223 | 1 224 | ).copyAction should be(CopyActionResult.SkippedAlreadyExists) 225 | 226 | filesAreIdentical( 227 | source, 228 | Option(localFileSystem.getFileChecksum(source.getPath)), 229 | SerializableFileStatus(localFileSystem.getFileStatus(destPath)), 230 | Option(localFileSystem.getFileChecksum(destPath)) 231 | ) should be(false) 232 | 233 | } 234 | 235 | it("successful copy with overwrite") { 236 | val source = createFile(new Path("1.file"), "a".getBytes) 237 | val destPath = createFile(new Path("2.file"), "aa".getBytes).getPath 238 | val copyDefinition = SingleCopyDefinition(source, destPath.toUri) 239 | 240 | filesAreIdentical( 241 | source, 242 | Option(localFileSystem.getFileChecksum(source.getPath)), 243 | SerializableFileStatus(localFileSystem.getFileStatus(destPath)), 244 | Option(localFileSystem.getFileChecksum(destPath)) 245 | ) should be(false) 246 | 247 | // Present at destination, overwrite dry-run 248 | copyFile( 249 | localFileSystem, 250 | localFileSystem, 251 | copyDefinition, 252 | SparkDistCPOptions(overwrite = true, dryRun = true), 253 | 1 254 | ).copyAction should be(CopyActionResult.SkippedDryRun) 255 | 256 | filesAreIdentical( 257 | source, 258 | Option(localFileSystem.getFileChecksum(source.getPath)), 259 | SerializableFileStatus(localFileSystem.getFileStatus(destPath)), 260 | Option(localFileSystem.getFileChecksum(destPath)) 261 | ) should be(false) 262 | 263 | // Present at destination, overwrite 264 | copyFile( 265 | localFileSystem, 266 | localFileSystem, 267 | copyDefinition, 268 | SparkDistCPOptions(overwrite = true), 269 | 1 270 | ).copyAction should be(CopyActionResult.OverwrittenOrUpdated) 271 | 272 | filesAreIdentical( 273 | source, 274 | Option(localFileSystem.getFileChecksum(source.getPath)), 275 | SerializableFileStatus(localFileSystem.getFileStatus(destPath)), 276 | Option(localFileSystem.getFileChecksum(destPath)) 277 | ) should be(true) 278 | } 279 | 280 | it("successful copy with update") { 281 | val source = createFile(new Path("1.file"), "a".getBytes) 282 | val destPath = createFile(new Path("2.file"), "aa".getBytes).getPath 283 | val copyDefinition = SingleCopyDefinition(source, destPath.toUri) 284 | 285 | filesAreIdentical( 286 | source, 287 | Option(localFileSystem.getFileChecksum(source.getPath)), 288 | SerializableFileStatus(localFileSystem.getFileStatus(destPath)), 289 | Option(localFileSystem.getFileChecksum(destPath)) 290 | ) should be(false) 291 | 292 | // Present at destination, update dry-run 293 | copyFile( 294 | localFileSystem, 295 | localFileSystem, 296 | copyDefinition, 297 | SparkDistCPOptions(update = true, dryRun = true), 298 | 1 299 | ).copyAction should be(CopyActionResult.SkippedDryRun) 300 | 301 | filesAreIdentical( 302 | source, 303 | Option(localFileSystem.getFileChecksum(source.getPath)), 304 | SerializableFileStatus(localFileSystem.getFileStatus(destPath)), 305 | Option(localFileSystem.getFileChecksum(destPath)) 306 | ) should be(false) 307 | 308 | // Present at destination, differing file, update 309 | copyFile( 310 | localFileSystem, 311 | 
localFileSystem, 312 | copyDefinition, 313 | SparkDistCPOptions(update = true), 314 | 1 315 | ).copyAction should be(CopyActionResult.OverwrittenOrUpdated) 316 | 317 | filesAreIdentical( 318 | source, 319 | Option(localFileSystem.getFileChecksum(source.getPath)), 320 | SerializableFileStatus(localFileSystem.getFileStatus(destPath)), 321 | Option(localFileSystem.getFileChecksum(destPath)) 322 | ) should be(true) 323 | 324 | copyFile( 325 | localFileSystem, 326 | localFileSystem, 327 | copyDefinition, 328 | SparkDistCPOptions(update = true), 329 | 1 330 | ).copyAction should be(CopyActionResult.SkippedIdenticalFileAlreadyExists) 331 | } 332 | 333 | it("failed copy with sub directory") { 334 | 335 | val source = createFile(new Path("1.file"), "a".getBytes) 336 | val destPath = new Path(testingBaseDirPath, new Path("sub", "1.file")) 337 | val copyDefinition = SingleCopyDefinition(source, destPath.toUri) 338 | 339 | intercept[RuntimeException] { 340 | copyFile( 341 | localFileSystem, 342 | localFileSystem, 343 | copyDefinition, 344 | SparkDistCPOptions(), 345 | 1 346 | ) 347 | } 348 | 349 | copyFile( 350 | localFileSystem, 351 | localFileSystem, 352 | copyDefinition, 353 | SparkDistCPOptions(ignoreErrors = true), 354 | 1 355 | ).getMessage should be( 356 | s"Source: [${source.getPath.toUri}], " + 357 | s"Destination: [$destPath], Type: [FileCopy: 1 bytes], " + 358 | s"Result: [Failed: Destination folder [${destPath.getParent}] does not exist]" 359 | ) 360 | 361 | } 362 | } 363 | 364 | describe("handleCopy") { 365 | 366 | it("successful copy with sub directory") { 367 | 368 | val source = createFile(new Path("sub", "1.file"), "a".getBytes) 369 | val destPath = new Path(testingBaseDirPath, new Path("sub1", "1.file")) 370 | val parentCopyDefinition = SingleCopyDefinition( 371 | SerializableFileStatus( 372 | localFileSystem.getFileStatus(source.getPath.getParent) 373 | ), 374 | new Path(testingBaseDirPath, "sub1").toUri 375 | ) 376 | val fileCopyDefinition = CopyDefinitionWithDependencies( 377 | source, 378 | destPath.toUri, 379 | Seq(parentCopyDefinition) 380 | ) 381 | val options = SparkDistCPOptions() 382 | 383 | fileCopyDefinition.getAllCopyDefinitions.foreach( 384 | handleCopy(localFileSystem, localFileSystem, _, options, 1) 385 | ) 386 | 387 | filesAreIdentical( 388 | source, 389 | Option(localFileSystem.getFileChecksum(source.getPath)), 390 | SerializableFileStatus(localFileSystem.getFileStatus(destPath)), 391 | Option(localFileSystem.getFileChecksum(destPath)) 392 | ) should be(true) 393 | 394 | } 395 | 396 | } 397 | 398 | describe("deleteFile") { 399 | 400 | it("delete directory that does not exist") { 401 | 402 | val path = new Path(testingBaseDirPath, "sub") 403 | deleteFile(localFileSystem, path, SparkDistCPOptions()) should be( 404 | DeleteResult(path.toUri, SkippedDoesNotExists) 405 | ) 406 | 407 | } 408 | 409 | it("delete a directory containing a file") { 410 | 411 | val filePath = createFile(new Path("sub/file"), "a".getBytes).getPath 412 | val dirPath = filePath.getParent 413 | 414 | localFileSystem.exists(filePath) should be(true) 415 | localFileSystem.exists(dirPath) should be(true) 416 | deleteFile( 417 | localFileSystem, 418 | dirPath, 419 | SparkDistCPOptions(dryRun = true) 420 | ) should be(DeleteResult(dirPath.toUri, SkippedDryRun)) 421 | localFileSystem.exists(filePath) should be(true) 422 | localFileSystem.exists(dirPath) should be(true) 423 | deleteFile(localFileSystem, dirPath, SparkDistCPOptions()) should be( 424 | DeleteResult(dirPath.toUri, Deleted) 425 | ) 426 | 
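// The exists() checks that follow show the delete to be recursive: removing the parent directory also
// removes the file created inside it, while the dry-run and missing-path cases above merely report
// SkippedDryRun / SkippedDoesNotExists without touching the filesystem.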
localFileSystem.exists(filePath) should be(false) 427 | localFileSystem.exists(dirPath) should be(false) 428 | } 429 | 430 | } 431 | 432 | } 433 | -------------------------------------------------------------------------------- /src/test/scala/com/coxautodata/utils/TestFileListUtils.scala: -------------------------------------------------------------------------------- 1 | package com.coxautodata.utils 2 | 3 | import com.coxautodata.TestSpec 4 | import com.coxautodata.utils.FileListUtils._ 5 | import org.apache.hadoop.fs.Path 6 | import org.apache.spark.{SparkConf, SparkContext} 7 | 8 | class TestFileListUtils extends TestSpec { 9 | 10 | describe("listFiles") { 11 | 12 | it("Non-existing folder") { 13 | intercept[java.io.FileNotFoundException] { 14 | listFiles( 15 | localFileSystem, 16 | new Path(testingBaseDirPath, "src"), 17 | 10, 18 | false, 19 | List.empty 20 | ) 21 | } 22 | } 23 | 24 | it("Empty folder") { 25 | localFileSystem.mkdirs(new Path(testingBaseDirPath, "src")) 26 | listFiles( 27 | localFileSystem, 28 | new Path(testingBaseDirPath, "src"), 29 | 10, 30 | false, 31 | List.empty 32 | ) should contain theSameElementsAs Seq.empty 33 | } 34 | 35 | it("Empty folder with include root") { 36 | localFileSystem.mkdirs(new Path(testingBaseDirPath, "src")) 37 | listFiles( 38 | localFileSystem, 39 | new Path(testingBaseDirPath, "src"), 40 | 10, 41 | true, 42 | List.empty 43 | ) 44 | .map(f => 45 | (fileStatusToResult(f._1), f._2.map(fileStatusToResult)) 46 | ) should contain theSameElementsAs Seq( 47 | ( 48 | FileListing(new Path(testingBaseDirPath, "src").toString, None), 49 | List() 50 | ) 51 | ) 52 | } 53 | 54 | it("Single file with include root") { 55 | localFileSystem.mkdirs(new Path(testingBaseDirPath, "src")) 56 | createFile(new Path("src/file"), "a".getBytes) 57 | listFiles( 58 | localFileSystem, 59 | new Path(testingBaseDirPath, "src"), 60 | 10, 61 | true, 62 | List.empty 63 | ) 64 | .map(f => 65 | (fileStatusToResult(f._1), f._2.map(fileStatusToResult)) 66 | ) should contain theSameElementsAs Seq( 67 | ( 68 | FileListing(new Path(testingBaseDirPath, "src").toString, None), 69 | List() 70 | ), 71 | ( 72 | FileListing( 73 | new Path(testingBaseDirPath, "src/file").toString, 74 | Some(1) 75 | ), 76 | List(FileListing(new Path(testingBaseDirPath, "src").toString, None)) 77 | ) 78 | ) 79 | } 80 | 81 | it("Should list all files in folder") { 82 | 83 | val input = List( 84 | "src/1.file", 85 | "src/2.file", 86 | "src/3.file", 87 | "src/sub1/1.file", 88 | "src/sub1/2.file", 89 | "src/sub1/3.file", 90 | "src/sub2/1.file", 91 | "src/sub2/2.file", 92 | "src/sub2/3.file", 93 | "src/sub2/subsub1/1.file" 94 | ) 95 | 96 | input.foreach(f => createFile(new Path(f), f.getBytes)) 97 | 98 | listFiles( 99 | localFileSystem, 100 | new Path(testingBaseDirPath, "src"), 101 | 10, 102 | false, 103 | List.empty 104 | ) 105 | .map(f => 106 | (fileStatusToResult(f._1), f._2.map(fileStatusToResult)) 107 | ) should contain theSameElementsAs Seq( 108 | ( 109 | FileListing( 110 | new Path(testingBaseDirPath, "src/1.file").toString, 111 | Some(10) 112 | ), 113 | List() 114 | ), 115 | ( 116 | FileListing( 117 | new Path(testingBaseDirPath, "src/2.file").toString, 118 | Some(10) 119 | ), 120 | List() 121 | ), 122 | ( 123 | FileListing( 124 | new Path(testingBaseDirPath, "src/3.file").toString, 125 | Some(10) 126 | ), 127 | List() 128 | ), 129 | ( 130 | FileListing(new Path(testingBaseDirPath, "src/sub1").toString, None), 131 | List() 132 | ), 133 | ( 134 | FileListing(new Path(testingBaseDirPath, 
"src/sub2").toString, None), 135 | List() 136 | ), 137 | ( 138 | FileListing( 139 | new Path(testingBaseDirPath, "src/sub2/subsub1").toString, 140 | None 141 | ), 142 | List( 143 | FileListing(new Path(testingBaseDirPath, "src/sub2").toString, None) 144 | ) 145 | ), 146 | ( 147 | FileListing( 148 | new Path(testingBaseDirPath, "src/sub1/1.file").toString, 149 | Some(15) 150 | ), 151 | List( 152 | FileListing(new Path(testingBaseDirPath, "src/sub1").toString, None) 153 | ) 154 | ), 155 | ( 156 | FileListing( 157 | new Path(testingBaseDirPath, "src/sub1/2.file").toString, 158 | Some(15) 159 | ), 160 | List( 161 | FileListing(new Path(testingBaseDirPath, "src/sub1").toString, None) 162 | ) 163 | ), 164 | ( 165 | FileListing( 166 | new Path(testingBaseDirPath, "src/sub1/3.file").toString, 167 | Some(15) 168 | ), 169 | List( 170 | FileListing(new Path(testingBaseDirPath, "src/sub1").toString, None) 171 | ) 172 | ), 173 | ( 174 | FileListing( 175 | new Path(testingBaseDirPath, "src/sub2/1.file").toString, 176 | Some(15) 177 | ), 178 | List( 179 | FileListing(new Path(testingBaseDirPath, "src/sub2").toString, None) 180 | ) 181 | ), 182 | ( 183 | FileListing( 184 | new Path(testingBaseDirPath, "src/sub2/2.file").toString, 185 | Some(15) 186 | ), 187 | List( 188 | FileListing(new Path(testingBaseDirPath, "src/sub2").toString, None) 189 | ) 190 | ), 191 | ( 192 | FileListing( 193 | new Path(testingBaseDirPath, "src/sub2/3.file").toString, 194 | Some(15) 195 | ), 196 | List( 197 | FileListing(new Path(testingBaseDirPath, "src/sub2").toString, None) 198 | ) 199 | ), 200 | ( 201 | FileListing( 202 | new Path(testingBaseDirPath, "src/sub2/subsub1/1.file").toString, 203 | Some(23) 204 | ), 205 | List( 206 | FileListing( 207 | new Path(testingBaseDirPath, "src/sub2").toString, 208 | None 209 | ), 210 | FileListing( 211 | new Path(testingBaseDirPath, "src/sub2/subsub1").toString, 212 | None 213 | ) 214 | ) 215 | ) 216 | ) 217 | 218 | } 219 | 220 | it("Should list all files in folder with a filter") { 221 | 222 | val input = List( 223 | "src/1.file", 224 | "src/2.file", 225 | "src/3.file", 226 | "src/sub1/1.file", 227 | "src/sub1/2.file", 228 | "src/sub1/3.file", 229 | "src/sub2/1.file", 230 | "src/sub2/2.file", 231 | "src/sub2/3.file", 232 | "src/sub2/subsub1/1.file" 233 | ) 234 | 235 | input.foreach(f => createFile(new Path(f), f.getBytes)) 236 | 237 | listFiles( 238 | localFileSystem, 239 | new Path(testingBaseDirPath, "src"), 240 | 10, 241 | false, 242 | List(""".*/1\.file$""".r, """.*/3\.file$""".r) 243 | ) 244 | .map(f => 245 | (fileStatusToResult(f._1), f._2.map(fileStatusToResult)) 246 | ) should contain theSameElementsAs Seq( 247 | ( 248 | FileListing( 249 | new Path(testingBaseDirPath, "src/2.file").toString, 250 | Some(10) 251 | ), 252 | List() 253 | ), 254 | ( 255 | FileListing(new Path(testingBaseDirPath, "src/sub1").toString, None), 256 | List() 257 | ), 258 | ( 259 | FileListing(new Path(testingBaseDirPath, "src/sub2").toString, None), 260 | List() 261 | ), 262 | ( 263 | FileListing( 264 | new Path(testingBaseDirPath, "src/sub2/subsub1").toString, 265 | None 266 | ), 267 | List( 268 | FileListing(new Path(testingBaseDirPath, "src/sub2").toString, None) 269 | ) 270 | ), 271 | ( 272 | FileListing( 273 | new Path(testingBaseDirPath, "src/sub1/2.file").toString, 274 | Some(15) 275 | ), 276 | List( 277 | FileListing(new Path(testingBaseDirPath, "src/sub1").toString, None) 278 | ) 279 | ), 280 | ( 281 | FileListing( 282 | new Path(testingBaseDirPath, "src/sub2/2.file").toString, 283 | Some(15) 284 
| ), 285 | List( 286 | FileListing(new Path(testingBaseDirPath, "src/sub2").toString, None) 287 | ) 288 | ) 289 | ) 290 | } 291 | 292 | it("Should list all files in folder with a folder filter") { 293 | 294 | val input = List( 295 | "src/1.file", 296 | "src/2.file", 297 | "src/3.file", 298 | "src/sub1/1.file", 299 | "src/sub1/2.file", 300 | "src/sub1/3.file", 301 | "src/sub2/1.file", 302 | "src/sub2/2.file", 303 | "src/sub2/3.file", 304 | "src/sub2/subsub1/1.file" 305 | ) 306 | 307 | input.foreach(f => createFile(new Path(f), f.getBytes)) 308 | 309 | listFiles( 310 | localFileSystem, 311 | new Path(testingBaseDirPath, "src"), 312 | 10, 313 | false, 314 | List(""".*/subsub1($|/.*)""".r) 315 | ) 316 | .map(f => 317 | (fileStatusToResult(f._1), f._2.map(fileStatusToResult)) 318 | ) should contain theSameElementsAs Seq( 319 | ( 320 | FileListing( 321 | new Path(testingBaseDirPath, "src/1.file").toString, 322 | Some(10) 323 | ), 324 | List() 325 | ), 326 | ( 327 | FileListing( 328 | new Path(testingBaseDirPath, "src/2.file").toString, 329 | Some(10) 330 | ), 331 | List() 332 | ), 333 | ( 334 | FileListing( 335 | new Path(testingBaseDirPath, "src/3.file").toString, 336 | Some(10) 337 | ), 338 | List() 339 | ), 340 | ( 341 | FileListing(new Path(testingBaseDirPath, "src/sub1").toString, None), 342 | List() 343 | ), 344 | ( 345 | FileListing(new Path(testingBaseDirPath, "src/sub2").toString, None), 346 | List() 347 | ), 348 | ( 349 | FileListing( 350 | new Path(testingBaseDirPath, "src/sub1/1.file").toString, 351 | Some(15) 352 | ), 353 | List( 354 | FileListing(new Path(testingBaseDirPath, "src/sub1").toString, None) 355 | ) 356 | ), 357 | ( 358 | FileListing( 359 | new Path(testingBaseDirPath, "src/sub1/2.file").toString, 360 | Some(15) 361 | ), 362 | List( 363 | FileListing(new Path(testingBaseDirPath, "src/sub1").toString, None) 364 | ) 365 | ), 366 | ( 367 | FileListing( 368 | new Path(testingBaseDirPath, "src/sub1/3.file").toString, 369 | Some(15) 370 | ), 371 | List( 372 | FileListing(new Path(testingBaseDirPath, "src/sub1").toString, None) 373 | ) 374 | ), 375 | ( 376 | FileListing( 377 | new Path(testingBaseDirPath, "src/sub2/1.file").toString, 378 | Some(15) 379 | ), 380 | List( 381 | FileListing(new Path(testingBaseDirPath, "src/sub2").toString, None) 382 | ) 383 | ), 384 | ( 385 | FileListing( 386 | new Path(testingBaseDirPath, "src/sub2/2.file").toString, 387 | Some(15) 388 | ), 389 | List( 390 | FileListing(new Path(testingBaseDirPath, "src/sub2").toString, None) 391 | ) 392 | ), 393 | ( 394 | FileListing( 395 | new Path(testingBaseDirPath, "src/sub2/3.file").toString, 396 | Some(15) 397 | ), 398 | List( 399 | FileListing(new Path(testingBaseDirPath, "src/sub2").toString, None) 400 | ) 401 | ) 402 | ) 403 | } 404 | 405 | } 406 | 407 | describe("getSourceFiles") { 408 | 409 | it("throw an exception on source collisions") { 410 | 411 | val spark = new SparkContext( 412 | new SparkConf().setAppName("test").setMaster("local[*]") 413 | ) 414 | 415 | val input = List("source/first/file", "source/second/file") 416 | 417 | input.foreach(f => createFile(new Path(f), f.getBytes)) 418 | 419 | intercept[RuntimeException] { 420 | getSourceFiles( 421 | spark, 422 | Seq( 423 | new Path(testingBaseDirPath, "source/first").toUri, 424 | new Path(testingBaseDirPath, "source/second").toUri 425 | ), 426 | new Path(testingBaseDirPath, "target").toUri, 427 | true, 428 | 2, 429 | List.empty 430 | ) 431 | }.getMessage should be( 432 | "Collisions found where multiple source files lead to the same destination 
location; check executor logs for specific collision detail." 433 | ) 434 | 435 | spark.stop() 436 | 437 | } 438 | 439 | it("throw an exception if source collides with dest") { 440 | 441 | val spark = new SparkContext( 442 | new SparkConf().setAppName("test").setMaster("local[*]") 443 | ) 444 | 445 | val input = List("source/file") 446 | 447 | input.foreach(f => createFile(new Path(f), f.getBytes)) 448 | 449 | intercept[RuntimeException] { 450 | getSourceFiles( 451 | spark, 452 | Seq(new Path(testingBaseDirPath, "source").toUri), 453 | new Path(testingBaseDirPath, "source").toUri, 454 | true, 455 | 2, 456 | List.empty 457 | ) 458 | }.getMessage should be( 459 | "Collisions found where a file has the same source and destination location; check executor logs for specific collision detail." 460 | ) 461 | 462 | spark.stop() 463 | 464 | } 465 | 466 | } 467 | 468 | } 469 | 470 | case class FileListing(name: String, length: Option[Long]) 471 | -------------------------------------------------------------------------------- /src/test/scala/com/coxautodata/utils/TestFileUtils.scala: -------------------------------------------------------------------------------- 1 | package com.coxautodata.utils 2 | 3 | import org.scalatest.funspec.AnyFunSpec 4 | import org.scalatest.matchers.should.Matchers 5 | 6 | class TestFileUtils extends AnyFunSpec with Matchers { 7 | 8 | it("byteCountToDisplaySize") { 9 | 10 | import java.math.BigInteger 11 | val b1023 = BigInteger.valueOf(1023) 12 | val b1025 = BigInteger.valueOf(1025) 13 | val KB1 = BigInteger.valueOf(1024) 14 | val MB1 = KB1.multiply(KB1) 15 | val GB1 = MB1.multiply(KB1) 16 | val GB2 = GB1.add(GB1) 17 | val TB1 = GB1.multiply(KB1) 18 | val PB1 = TB1.multiply(KB1) 19 | val EB1 = PB1.multiply(KB1) 20 | FileUtils.byteCountToDisplaySize(BigInteger.ZERO) should be("0 bytes") 21 | FileUtils.byteCountToDisplaySize(BigInteger.ONE) should be("1 bytes") 22 | FileUtils.byteCountToDisplaySize(b1023) should be("1023 bytes") 23 | FileUtils.byteCountToDisplaySize(KB1) should be("1 KB (1024 bytes)") 24 | FileUtils.byteCountToDisplaySize(b1025) should be("1 KB (1025 bytes)") 25 | FileUtils.byteCountToDisplaySize(MB1.subtract(BigInteger.ONE)) should be( 26 | "1023 KB (1048575 bytes)" 27 | ) 28 | FileUtils.byteCountToDisplaySize(MB1) should be("1 MB (1048576 bytes)") 29 | FileUtils.byteCountToDisplaySize(MB1.add(BigInteger.ONE)) should be( 30 | "1 MB (1048577 bytes)" 31 | ) 32 | FileUtils.byteCountToDisplaySize(GB1.subtract(BigInteger.ONE)) should be( 33 | "1023 MB (1073741823 bytes)" 34 | ) 35 | FileUtils.byteCountToDisplaySize(GB1) should be("1 GB (1073741824 bytes)") 36 | FileUtils.byteCountToDisplaySize(GB1.add(BigInteger.ONE)) should be( 37 | "1 GB (1073741825 bytes)" 38 | ) 39 | FileUtils.byteCountToDisplaySize(GB2) should be("2 GB (2147483648 bytes)") 40 | FileUtils.byteCountToDisplaySize(GB2.subtract(BigInteger.ONE)) should be( 41 | "1.99 GB (2147483647 bytes)" 42 | ) 43 | FileUtils.byteCountToDisplaySize(TB1) should be( 44 | "1 TB (1099511627776 bytes)" 45 | ) 46 | FileUtils.byteCountToDisplaySize(PB1) should be( 47 | "1 PB (1125899906842624 bytes)" 48 | ) 49 | FileUtils.byteCountToDisplaySize(EB1) should be( 50 | "1 EB (1152921504606846976 bytes)" 51 | ) 52 | FileUtils.byteCountToDisplaySize(java.lang.Long.MAX_VALUE) should be( 53 | "7.99 EB (9223372036854775807 bytes)" 54 | ) 55 | // Other MAX_VALUEs 56 | FileUtils.byteCountToDisplaySize( 57 | BigInteger.valueOf(Character.MAX_VALUE) 58 | ) should be("63.9 KB (65535 bytes)") 59 | FileUtils.byteCountToDisplaySize( 
60 | BigInteger.valueOf(java.lang.Short.MAX_VALUE) 61 | ) should be("31.9 KB (32767 bytes)") 62 | FileUtils.byteCountToDisplaySize( 63 | BigInteger.valueOf(Integer.MAX_VALUE) 64 | ) should be("1.99 GB (2147483647 bytes)") 65 | // Other Values 66 | FileUtils.byteCountToDisplaySize(105013122725L) should be( 67 | "97.8 GB (105013122725 bytes)" 68 | ) 69 | FileUtils.byteCountToDisplaySize(644353293312L) should be( 70 | "600 GB (644353293312 bytes)" 71 | ) 72 | 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /src/test/scala/com/coxautodata/utils/TestPathUtils.scala: -------------------------------------------------------------------------------- 1 | package com.coxautodata.utils 2 | 3 | import java.net.URI 4 | 5 | import PathUtils._ 6 | import org.apache.hadoop.conf.Configuration 7 | import org.apache.hadoop.fs.Path 8 | import org.scalatest.funspec.AnyFunSpec 9 | import org.scalatest.matchers.should.Matchers 10 | 11 | class TestPathUtils extends AnyFunSpec with Matchers { 12 | 13 | describe("pathToQualifiedPath") { 14 | 15 | it("qualify a path with the default fs") { 16 | val path = new Path("/test") 17 | 18 | path.toString should be("/test") 19 | pathToQualifiedPath(new Configuration(), path).toString should be( 20 | "file:/test" 21 | ) 22 | } 23 | 24 | it("not change an existing uri") { 25 | val path = new Path("hdfs:/test") 26 | 27 | path.toString should be("hdfs:/test") 28 | pathToQualifiedPath(new Configuration(), path).toString should be( 29 | "hdfs:/test" 30 | ) 31 | } 32 | 33 | } 34 | 35 | describe("sourcePathToDestinationPath") { 36 | 37 | it("non-update/non-overwrite behaviour") { 38 | 39 | // Source is subdirectory 40 | sourceURIToDestinationURI( 41 | new Path("hdfs://nn1:8020/source/first/1").toUri, 42 | new Path("hdfs://nn1:8020/source/first").toUri, 43 | new Path("hdfs://nn2:8020/target").toUri, 44 | updateOverwritePathBehaviour = false 45 | ) should be(new Path("hdfs://nn2:8020/target/first/1").toUri) 46 | 47 | // Source is root 48 | sourceURIToDestinationURI( 49 | new Path("hdfs://nn1:8020/first/1").toUri, 50 | new Path("hdfs://nn1:8020/").toUri, 51 | new Path("hdfs://nn2:8020/target").toUri, 52 | updateOverwritePathBehaviour = false 53 | ) should be(new Path("hdfs://nn2:8020/target/first/1").toUri) 54 | 55 | } 56 | 57 | it("update/overwrite behaviour") { 58 | 59 | // Source is subdirectory 60 | sourceURIToDestinationURI( 61 | new Path("hdfs://nn1:8020/source/first/1").toUri, 62 | new Path("hdfs://nn1:8020/source/first").toUri, 63 | new Path("hdfs://nn2:8020/target").toUri, 64 | updateOverwritePathBehaviour = true 65 | ) should be(new Path("hdfs://nn2:8020/target/1").toUri) 66 | 67 | // Source is root 68 | sourceURIToDestinationURI( 69 | new Path("hdfs://nn1:8020/first/1").toUri, 70 | new Path("hdfs://nn1:8020/").toUri, 71 | new Path("hdfs://nn2:8020/target").toUri, 72 | updateOverwritePathBehaviour = true 73 | ) should be(new Path("hdfs://nn2:8020/target/first/1").toUri) 74 | 75 | } 76 | 77 | } 78 | 79 | describe("uriIsChild") { 80 | 81 | it("check whether path is a child") { 82 | 83 | uriIsChild( 84 | new URI("file:/test/folder"), 85 | new URI("file:/test/folder/child") 86 | ) should be(true) 87 | uriIsChild( 88 | new URI("file:/test/folder"), 89 | new URI("file:/test/folder") 90 | ) should be(true) 91 | uriIsChild(new URI("file:/test/folder"), new URI("file:/test")) should be( 92 | false 93 | ) 94 | uriIsChild( 95 | new URI("file:/test/folder"), 96 | new URI("hdfs:/test/folder/child") 97 | ) should be(false) 98 | 
uriIsChild( 99 | new URI("file:/test/folder"), 100 | new URI("hdfs:/test/folder.file") 101 | ) should be(false) 102 | 103 | } 104 | 105 | it("fail on non-absolute URIs") { 106 | 107 | intercept[RuntimeException] { 108 | uriIsChild(new URI("file:/test/folder"), new URI("/test/folder/child")) 109 | } 110 | 111 | intercept[RuntimeException] { 112 | uriIsChild(new URI("/test/folder"), new URI("file:/test/folder/child")) 113 | } 114 | 115 | intercept[RuntimeException] { 116 | uriIsChild( 117 | new URI("file:/test/folder"), 118 | new URI("file:test/folder/child") 119 | ) 120 | } 121 | 122 | intercept[RuntimeException] { 123 | uriIsChild( 124 | new URI("file:test/folder"), 125 | new URI("file:/test/folder/child") 126 | ) 127 | } 128 | 129 | } 130 | 131 | } 132 | 133 | } 134 | --------------------------------------------------------------------------------