├── .circleci └── config.yml ├── .gitignore ├── DCO ├── LICENSE ├── README.md ├── config ├── java.header ├── scalastyle-output.xml ├── scalastyle-output_dg-common.xml ├── scalastyle_config.xml └── sun_checkstyle.xml ├── examples └── TransformationTesting │ ├── pom.xml │ └── src │ └── test │ ├── java │ └── org │ │ └── finra │ │ └── msd │ │ └── examples │ │ ├── FileToPgTest.java │ │ ├── H2ToPgTest.java │ │ └── db │ │ ├── H2Database.java │ │ └── PostgresDatabase.java │ └── resources │ ├── appliance_source.txt │ ├── h2_db.sql │ └── pg_db.sql ├── mega-spark-diff ├── pom.xml └── src │ ├── main │ ├── java │ │ └── org │ │ │ └── finra │ │ │ └── msd │ │ │ ├── containers │ │ │ ├── CmdLine.java │ │ │ └── SourceVars.java │ │ │ ├── enums │ │ │ ├── SourceType.java │ │ │ └── VisualResultType.java │ │ │ ├── launcher │ │ │ └── Launcher.java │ │ │ └── util │ │ │ └── FileUtil.java │ ├── resources │ │ ├── data_sources │ │ │ ├── hive_test.txt │ │ │ ├── jdbc_test.txt │ │ │ ├── person_tables.txt │ │ │ └── s3_test.txt │ │ ├── htmltemplates │ │ │ └── horizontalTableTemplate.html │ │ ├── run_configs │ │ │ └── spark_configs.txt │ │ └── shell │ │ │ └── msd.sh │ └── scala │ │ ├── com │ │ └── audienceproject │ │ │ └── spark │ │ │ └── dynamodb │ │ │ └── msd │ │ │ └── datasource │ │ │ ├── DefaultSource.scala │ │ │ ├── DynamoDbPartitionReader.scala │ │ │ ├── DynamoDbPartitionReaderFactory.scala │ │ │ ├── DynamoDbScan.scala │ │ │ ├── DynamoDbScanBuilder.scala │ │ │ ├── DynamoDbTable.scala │ │ │ └── JavaCollections.scala │ │ └── org │ │ ├── apache │ │ └── spark │ │ │ └── sql │ │ │ └── execution │ │ │ └── datasources │ │ │ └── msd │ │ │ └── DefaultSource.scala │ │ └── finra │ │ └── msd │ │ ├── containers │ │ ├── AppleTable.scala │ │ ├── CountResult.scala │ │ └── DiffResult.scala │ │ ├── controllers │ │ └── TemplateController.scala │ │ ├── customExceptions │ │ ├── ColumnNullException.scala │ │ ├── DataFrameNullException.scala │ │ ├── InValidKeyException.scala │ │ ├── JoinKeysNullException.scala │ │ └── SparkSessionNullException.scala │ │ ├── implicits │ │ └── DataFrameImplicits.scala │ │ ├── outputwriters │ │ └── OutputWriter.scala │ │ ├── sparkcompare │ │ └── SparkCompare.scala │ │ ├── sparkfactory │ │ └── SparkFactory.scala │ │ └── visualization │ │ └── Visualizer.scala │ └── test │ ├── java │ └── org │ │ └── finra │ │ └── msd │ │ ├── helpers │ │ ├── FileHelper.java │ │ ├── JsonDeserializerStringMap.java │ │ └── JsonHelper.java │ │ └── memorydb │ │ ├── MemoryDbDynamo.java │ │ └── MemoryDbHsql.java │ ├── resources │ ├── compare │ │ ├── JsonTestMapList.json │ │ ├── JsonTestMapList.sql │ │ ├── JsonTestMapList.txt │ │ └── JsonTestMapListDiffValue.json │ ├── csv │ │ ├── TestCSV.txt │ │ ├── TestCSV_2.txt │ │ ├── TestCSV_commas.txt │ │ └── TestCSV_pipes.txt │ ├── dynamodb │ │ ├── DynamoDbTestSet.json │ │ ├── DynamoDbTestSet.txt │ │ ├── DynamoDbTestSetBrackets.json │ │ ├── DynamoDbTestSetBrackets.txt │ │ ├── DynamoDbTestSetDiffElementOrder.json │ │ ├── DynamoDbTestSetDiffElementOrder.txt │ │ ├── DynamoDbTestSetMixed.json │ │ └── DynamoDbTestSetMixed.txt │ ├── jdbc │ │ ├── EnhancedFruit1.sql │ │ ├── EnhancedFruit2.sql │ │ ├── Fruit1.sql │ │ ├── Fruit2.sql │ │ ├── Fruit3.sql │ │ ├── Fruit4.sql │ │ ├── Fruit5.sql │ │ ├── JdbcTestSimpleToJson.sql │ │ └── Persons1.sql │ ├── json │ │ ├── JsonTestList.json │ │ ├── JsonTestList.txt │ │ ├── JsonTestListBrackets.json │ │ ├── JsonTestListBrackets.txt │ │ ├── JsonTestListMixedType.json │ │ ├── JsonTestListMixedType.txt │ │ ├── JsonTestMap.json │ │ ├── JsonTestMap.txt │ │ ├── 
JsonTestMapMixedType.json │ │ ├── JsonTestMapMixedType.txt │ │ ├── JsonTestSimple.json │ │ ├── JsonTestSimple.txt │ │ ├── JsonTestSimpleExtraNull.json │ │ ├── JsonTestSimpleExtraNull.txt │ │ ├── JsonTestSimpleMissingElement.json │ │ ├── JsonTestSimpleMissingElement.txt │ │ ├── JsonTestSimpleMixedType.json │ │ └── JsonTestSimpleMixedType.txt │ └── txt │ │ ├── Fruit1.txt │ │ ├── Fruit2.txt │ │ ├── Fruit3.txt │ │ ├── Fruit4.txt │ │ ├── Fruit5.txt │ │ ├── Fruit6.txt │ │ ├── TC1DiffsAndDups1.txt │ │ ├── TC5NullsAndEmptyData1.txt │ │ └── TC5NullsAndEmptyData2.txt │ └── scala │ └── org │ └── finra │ └── msd │ ├── basetestclasses │ ├── JsonFormatTests.scala │ ├── JsonFormatToFileTests.scala │ ├── JsonFormatToJdbcTests.scala │ ├── SparkFunSuite.scala │ ├── SparkFunSuiteDynamoDb.scala │ └── TestHelpers.scala │ ├── controllers │ └── TemplateControllerSuite.scala │ ├── sparkcompare │ ├── CsvToCsvSuite.scala │ ├── DynamoDbToDynamoDbSuite.scala │ ├── DynamoDbToFileSuite.scala │ ├── DynamoDbToJdbcSuite.scala │ ├── DynamoDbToJsonSuite.scala │ ├── FileToFileSuite.scala │ ├── JdbcToFileSuite.scala │ ├── JdbcToJdbcSuite.scala │ ├── JsonToFileSuite.scala │ ├── JsonToJdbcSuite.scala │ ├── JsonToJsonSuite.scala │ └── SparkCompareSuite.scala │ ├── sparkfactory │ ├── DynamoDbSuite.scala │ ├── JdbcSuite.scala │ └── JsonSuite.scala │ ├── stats │ └── CountsSuite.scala │ └── visualization │ └── VisualizerSuite.scala └── pom.xml /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Use the latest 2.1 version of CircleCI pipeline process engine. 2 | # See: https://circleci.com/docs/2.0/configuration-reference 3 | version: 2.1 4 | 5 | commands: 6 | early_return_for_forked_pull_requests: 7 | description: >- 8 | If this build is from a fork, stop executing the current job and return success. 9 | steps: 10 | - run: 11 | name: Early return if this build is from a forked PR 12 | command: | 13 | if [ -n "$CIRCLE_PR_NUMBER" ]; then 14 | echo "Nothing else to do for forked PRs, so marking this step successful" 15 | circleci-agent step halt 16 | fi 17 | 18 | # Define a job to be invoked later in a workflow. 19 | # See: https://circleci.com/docs/2.0/configuration-reference/#jobs 20 | jobs: 21 | # Below is the definition of your job to build and test your app, you can rename and customize it as you want. 22 | build-and-test: 23 | # These next lines define a Docker executor: https://circleci.com/docs/2.0/executor-types/ 24 | # You can specify an image from Dockerhub or use one of our Convenience Images from CircleCI's Developer Hub. 25 | # Be sure to update the Docker image tag below to openjdk version of your application. 26 | # A list of available CircleCI Docker Convenience Images are available here: https://circleci.com/developer/images/image/cimg/openjdk 27 | docker: 28 | - image: cimg/openjdk:8.0 29 | # Add steps to the job 30 | # See: https://circleci.com/docs/2.0/configuration-reference/#steps 31 | steps: 32 | # Checkout the code as the first step. 33 | - checkout 34 | # Use mvn clean and package as the standard maven build phase 35 | - run: 36 | name: Build 37 | command: mvn -B -DskipTests clean package 38 | # Then run your tests! 
39 | - run: 40 | name: Test 41 | command: mvn test 42 | - run: 43 | name: Generate Javadocs 44 | command: | 45 | cd mega-spark-diff 46 | mvn javadoc:javadoc 47 | - run: 48 | name: Generate Scaladocs 49 | command: | 50 | cd mega-spark-diff 51 | mvn scala:doc 52 | - early_return_for_forked_pull_requests 53 | - persist_to_workspace: 54 | root: mega-spark-diff/target/site 55 | paths: 56 | - apidocs 57 | - scaladocs 58 | - run: 59 | name: Scan 60 | command: bash <(curl -s https://copilot.blackducksoftware.com/ci/circle2/scripts/upload) 61 | docs-deploy: 62 | docker: 63 | - image: cimg/openjdk:8.0 64 | steps: 65 | - run: 66 | name: add known_hosts 67 | command: | 68 | mkdir ~/.ssh 69 | echo 'github.com ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQCj7ndNxQowgcQnjshcLrqPEiiphnt+VTTvDP6mHBL9j1aNUkY4Ue1gvwnGLVlOhGeYrnZaMgRK6+PKCUXaDbC7qtbW8gIkhL7aGCsOr/C56SJMy/BCZfxd1nWzAOxSDPgVsmerOBYfNqltV9/hWCqBywINIR+5dIg6JTJ72pcEpEjcYgXkE2YEFXV1JHnsKgbLWNlhScqb2UmyRkQyytRLtL+38TGxkxCflmO+5Z8CSSNY7GidjMIZ7Q4zMjA2n1nGrlTDkzwDCsw+wqFPGQA179cnfGWOWRVruj16z6XyvxvjJwbz0wQZ75XK5tKSb7FNyeIEs4TT4jk+S4dhPeAUC5y+bDYirYgM4GC7uEnztnZyaVWQ7B381AK4Qdrwt51ZqExKbQpTUNn+EjqoTwvqNj4kqx5QUCI0ThS/YkOxJCXmPUWZbhjpCg56i+2aB6CmK2JGhn57K5mj0MNdBXA4/WnwH6XoPWJzK5Nyu2zB3nAZp+S5hpQs+p1vN1/wsjk= 70 | ' >> ~/.ssh/known_hosts 71 | - run: 72 | name: Checkout main 73 | command: | 74 | git clone $CIRCLE_REPOSITORY_URL --depth 1 -b main main 75 | cd main 76 | git config --global user.email "$(git log --format=%ae -n 1)" 77 | git config --global user.name "$(git log --format=%an -n 1)" 78 | echo "export msg="\"$(git log --format=%B -n 1)\" >> $BASH_ENV 79 | - run: 80 | name: Checkout gh-pages 81 | command: | 82 | git clone $CIRCLE_REPOSITORY_URL --depth 1 -b gh-pages gh-pages 83 | rm -rf gh-pages/apidocs 84 | rm -rf gh-pages/scaladocs 85 | - attach_workspace: 86 | at: gh-pages 87 | - run: 88 | name: Copy CircleCI config 89 | command: | 90 | mkdir -p gh-pages/.circleci 91 | cp main/.circleci/config.yml gh-pages/.circleci/config.yml 92 | - add_ssh_keys: 93 | fingerprints: 94 | - "08:56:e3:f6:88:8d:30:f7:c3:df:dc:23:9c:91:eb:5f" 95 | - run: 96 | name: Deploy docs to gh-pages branch 97 | command: | 98 | cd gh-pages 99 | git add .circleci/config.yml 100 | git add -A -- apidocs/ 101 | git add -A -- scaladocs/ 102 | git commit -am "$msg" 103 | git push origin gh-pages 104 | 105 | # Invoke jobs via workflows 106 | # See: https://circleci.com/docs/2.0/configuration-reference/#workflows 107 | workflows: 108 | maven_test: # This is the name of the workflow, feel free to change it to better match your workflow. 109 | # Inside the workflow, you define the jobs you want to run. 
110 | jobs: 111 | - build-and-test: 112 | filters: 113 | branches: 114 | ignore: gh-pages 115 | - docs-deploy: 116 | requires: 117 | - build-and-test 118 | filters: 119 | branches: 120 | only: main 121 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Java template 3 | # Compiled class file 4 | *.class 5 | 6 | # Log file 7 | *.log 8 | 9 | # BlueJ files 10 | *.ctxt 11 | 12 | # Mobile Tools for Java (J2ME) 13 | .mtj.tmp/ 14 | 15 | # Package Files # 16 | *.jar 17 | *.war 18 | *.ear 19 | *.zip 20 | *.tar.gz 21 | *.rar 22 | 23 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 24 | hs_err_pid* 25 | 26 | 27 | .idea/vcs.xml 28 | mega-spark-diff.iml 29 | dependency-reduced-pom.xml 30 | .idea/ 31 | spark-warehouse/ 32 | target/ 33 | /nbproject/ 34 | sparkOutputDirectory 35 | sparkOutputDirectory/ 36 | /mega-spark-diff.iml 37 | 38 | # Eclipse project files 39 | /.classpath 40 | /.project 41 | /.cache-main 42 | /.cache-tests 43 | /.settings/ 44 | -------------------------------------------------------------------------------- /DCO: -------------------------------------------------------------------------------- 1 | Contributions require sign-off. The sign-off is required for all patch or pull requests, which certifies the following agreement given below. 2 | 3 | Contributor Agreement 4 | --------------------- 5 | 6 | By making a contribution to this project, I certify that: 7 | 8 | (a) The contribution was created in whole or in part by me and I have the right to submit it under the open source license indicated in the file; or 9 | 10 | (b) The contribution is based upon previous work that, to the best of my knowledge, is covered under an appropriate open source license and I have the right under that license to submit that work with modifications, whether created in whole or in part by me, under the same open source license (unless I am permitted to submit under a different license), as indicated in the file; or 11 | 12 | (c) The contribution was provided directly to me by some other person who certified (a), (b) or (c) and I have not modified it. 13 | 14 | (d) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved. 15 | 16 | (e) I also agree to the following terms and conditions: 17 | 18 | (1) Grant of Copyright License. Subject to the terms and conditions of this agreement, You hereby grant to the maintainer and to recipients of software distributed by the maintainer a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare derivative works of, publicly display, publicly perform, sublicense, and distribute your contributions and such derivative works. 19 | 20 | (2) Grant of Patent License. 
Subject to the terms and conditions of this agreement, You hereby grant to the maintainer and to recipients of software distributed by the maintainer a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the work, where such license applies only to those patent claims licensable by you that are necessarily infringed by your contribution(s) alone or by combination of your contribution(s) with the work to which such contribution(s) was submitted. If any entity institutes patent litigation against you or any other entity (including a cross-claim or counterclaim in a lawsuit) alleging that your contribution, or the work to which you have contributed, constitutes direct or contributory patent infringement, then any patent licenses granted to that entity under this agreement for that contribution or work shall terminate as of the date such litigation is filed.
21 | 
22 | Committing
23 | ----------
24 | 
25 | Add a line stating
26 | 
27 | Signed-off-by: Random J Developer <random@developer.example.org>
28 | 
29 | When committing using the command line you can sign off using the --signoff or -s flag. This adds a Signed-off-by line by the committer at the end of the commit log message.
30 | 
31 | git commit -s -m "Commit message"
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | [![Black Duck Security Risk](https://copilot.blackducksoftware.com/github/repos/FINRAOS/MegaSparkDiff/branches/main/badge-risk.svg)](https://copilot.blackducksoftware.com/github/repos/FINRAOS/MegaSparkDiff/branches/main)
 2 | 
 3 | MegaSparkDiff
 4 | 
 5 | MegaSparkDiff is an open source tool that helps you compare any pair
 6 | combination of data sets that are of the following types:
 7 | 
 8 | (HDFS, JDBC, S3, HBase, Text Files, Hive, JSON, DynamoDB).
 9 | 
10 | MegaSparkDiff can run on
11 | (a) Amazon EMR (Elastic Map Reduce),
12 | (b) Amazon EC2 instances and cloud environments with compatible Spark distributions, and
13 | (c) Databricks Interactive Notebooks with Visualizations via the displayHTML function.
14 | 
15 | How to Use from Within a Java or Scala Project
16 | ----------------------------------------------
17 | ```xml
18 | <dependency>
19 |     <groupId>org.finra.megasparkdiff</groupId>
20 |     <artifactId>mega-spark-diff</artifactId>
21 |     <version>0.4.0</version>
22 | </dependency>
23 | ```
24 | 
25 | SparkFactory
26 | -----------
27 | Parallelizes source/target data.
28 | 
29 | The data sources can be in the following forms:
30 | Text File
31 | HDFS File
32 | SQL query over a JDBC data source
33 | Hive Table
34 | JSON File
35 | DynamoDB Table
36 | 
37 | SparkCompare
38 | ------------
39 | Compares pair combinations of supported sources.
40 | Please note that when comparing a schema-based source to a non-schema-based source, the SparkCompare
41 | class will attempt to flatten the schema-based source to delimited values and then do the comparison. The delimiter
42 | can be specified while launching the compare job.
43 | 
44 | How to use via shell script in EMR
45 | ----------------------------------
46 | A shell script named msd.sh wraps around
47 | this Java/Scala project. The script accepts several parameters
48 | related to source definitions, output destination, and run
49 | configurations, as well as which two data sets to compare.
50 | 
51 | The parameters are as follows:
52 | -ds=<data sources folder>: The folder where the database
53 | connection parameters and data queries reside
54 | -od=<output directory>: The directory where MegaSparkDiff will write
55 | its output
56 | -rc=<run configurations file>: The file that will be used to load
57 | any special run and Spark configurations. This parameter is
58 | optional
59 | 
60 | To specify a data set to compare, pass in the name of one of the
61 | data queries found in a config file inside <data sources folder>,
62 | prepended by "--". The program will execute the queries assigned to
63 | the names passed into the command line, store them into tables, and
64 | perform the comparison.
65 | 
66 | Example call:
67 | ./msd.sh -ds=./data_sources/ -od=output --shraddha --carlos
68 | 
69 | Additionally, the user can add JDBC driver jar
70 | files by including them in the classpath, enabling
71 | extraction from whichever database they choose.
72 | 
73 | Run tests on Windows
74 | ------------
75 | 1. Download [Hadoop winutils](https://github.com/steveloughran/winutils)
76 | 1. Extract to some path, e.g. C:\Users\MegaSparkDiffFan\bin
77 | 1. Run tests while defining `hadoop.home.dir`, e.g. `mvn test -Dhadoop.home.dir=C:\Users\MegaSparkDiffFan`
78 | 
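Quick Example
-------------
A minimal end-to-end sketch in Spark local mode, using the same calls as the
tests under `examples/TransformationTesting`; the two input file paths are
hypothetical placeholders:

```java
import org.finra.msd.containers.AppleTable;
import org.finra.msd.containers.DiffResult;
import org.finra.msd.sparkcompare.SparkCompare;
import org.finra.msd.sparkfactory.SparkFactory;

public class QuickExample {
  public static void main(String[] args) {
    // Start Spark locally (same arguments the example tests use)
    SparkFactory.initializeSparkLocalMode("local[*]", "WARN", "1");

    // Each source is parallelized into an AppleTable backed by a temp view
    AppleTable left = SparkFactory.parallelizeTextSource("/data/left.txt", "left_table");
    AppleTable right = SparkFactory.parallelizeTextSource("/data/right.txt", "right_table");

    // The DiffResult holds the rows that appear on one side but not the other
    DiffResult result = SparkCompare.compareAppleTables(left, right);
    System.out.println("In left only: " + result.inLeftNotInRight().count());
    System.out.println("In right only: " + result.inRightNotInLeft().count());
  }
}
```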
--------------------------------------------------------------------------------
/config/java.header:
--------------------------------------------------------------------------------
 1 | (/\*|#|)$
 2 | ( \*|#|)$
 3 | ( \*|#|)$
 4 | ( \*|#|)$
 5 | ( \*|#|)$
 6 | ( \*|#|)$
 7 | ( \*|#|)$
 8 | ( \*|#|)$
 9 | ( \*|#|)$
10 | ( \*|#|)$
11 | ( \*|#|)$
12 | ( \*|#|)$
13 | ( \*|#|)$
14 | ( \*|#|)$
15 | ( \*/|#|)$
--------------------------------------------------------------------------------
/config/scalastyle-output.xml:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | 
 5 | 
--------------------------------------------------------------------------------
/config/scalastyle-output_dg-common.xml:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | 
 5 | 
--------------------------------------------------------------------------------
/examples/TransformationTesting/pom.xml:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 5 |   4.0.0
 6 | 
 7 |   org.finra.megasparkdiff
 8 |   examples.transformation-testing
 9 |   0.4.0
10 | 
11 | 
12 |   UTF-8
13 |   1.8
14 |   1.8
15 |   3.11.0
16 |   3.6.1
17 |   true
18 | 
19 | 
20 | 
21 | 
22 | 
23 |   org.apache.maven.plugins
24 |   maven-compiler-plugin
25 |   ${maven.compiler.plugin.version}
26 | 
27 |   ${maven.compiler.source}
28 |   ${maven.compiler.target}
29 | 
30 | 
31 | 
32 |   org.apache.maven.plugins
33 |   maven-dependency-plugin
34 |   ${maven.dependency.plugin.version}
35 | 
36 | 
37 | 
38 | 
39 | 
40 | 
41 |   org.finra.megasparkdiff
42 |   mega-spark-diff
43 |   0.4.0
44 | 
45 | 
46 | 
47 |   io.zonky.test
48 |   embedded-postgres
49 |   2.0.6
50 |   test
51 | 
52 | 
53 | 
54 |   org.postgresql
55 |   postgresql
56 |   42.7.1
57 |   test
58 | 
59 | 
60 | 
61 |   com.h2database
62 |   h2
63 |   2.2.224
64 |   test
65 | 
66 | 
67 | 
68 |   junit
69 |   junit
70 |   4.13.2
71 | 
72 | 
73 | 
74 | 
75 | 
--------------------------------------------------------------------------------
/examples/TransformationTesting/src/test/java/org/finra/msd/examples/FileToPgTest.java:
--------------------------------------------------------------------------------
 1 | package org.finra.msd.examples;
 2 | 
 3 | import org.apache.commons.lang.WordUtils;
 4 | import org.apache.spark.api.java.function.MapFunction;
 5 | import org.apache.spark.sql.Dataset;
 6 | import org.apache.spark.sql.Row;
 7 | import org.apache.spark.sql.RowFactory;
 8 | import org.apache.spark.sql.catalyst.encoders.RowEncoder;
 9 | import org.apache.spark.sql.functions;
10 | import org.apache.spark.sql.types.DataTypes;
11 | import org.apache.spark.sql.types.StructField;
12 | import org.apache.spark.sql.types.StructType;
13 | import org.finra.msd.containers.AppleTable;
14 | import org.finra.msd.containers.DiffResult;
15 | import org.finra.msd.examples.db.PostgresDatabase;
16 | import org.finra.msd.sparkcompare.SparkCompare;
17 | import org.finra.msd.sparkfactory.SparkFactory;
18 | import org.junit.*;
19 | 
20 | import java.io.IOException;
21 | import java.math.BigDecimal;
22 | import java.sql.SQLException;
23 | import java.util.ArrayList;
24 | import java.util.Arrays;
25 | import java.util.List;
26 | 
27 | public class FileToPgTest {
28 | 
29 |   @BeforeClass
30 |   public static void start() throws IOException, ClassNotFoundException {
31 |     PostgresDatabase.startPostgres();
32 |   }
33 | 
34 |   @AfterClass
35 |   public static void stop() throws IOException {
36 |     PostgresDatabase.stopPostgres();
37 |   }
38 | 
39 |   @Before
40 |   public void setUp()
throws IOException, SQLException { 41 | PostgresDatabase.setUp(); 42 | } 43 | 44 | @After 45 | public void tearDown() throws IOException, SQLException { 46 | PostgresDatabase.tearDown(); 47 | } 48 | 49 | @Test 50 | public void testTransformFileToPg() throws SQLException { 51 | SparkFactory.initializeSparkLocalMode("local[*]", "WARN", "1"); 52 | 53 | // See H2ToPgTest example to find other methods of reading from DBs 54 | 55 | // Parallelize the source text file 56 | AppleTable leftTable = SparkFactory 57 | .parallelizeTextSource(FileToPgTest.class.getResource( 58 | "/appliance_source.txt").getPath(), 59 | "appliance_left"); 60 | 61 | // Parallelize the target data 62 | AppleTable rightTable = SparkFactory 63 | .parallelizeJDBCSource("org.postgresql.Driver", 64 | PostgresDatabase.getUrl(), 65 | PostgresDatabase.getProperties().getProperty("user"), 66 | PostgresDatabase.getProperties().getProperty("password"), 67 | "(select * from appliance) a", "appliance_right"); 68 | 69 | // Parallelize the reference data 70 | AppleTable typeTable = SparkFactory 71 | .parallelizeJDBCSource("org.postgresql.Driver", 72 | PostgresDatabase.getUrl(), 73 | PostgresDatabase.getProperties().getProperty("user"), 74 | PostgresDatabase.getProperties().getProperty("password"), 75 | "(select * from appliance_type) a", "appliance_type"); 76 | 77 | 78 | 79 | 80 | // Handle the source's "NAME" column transformation/split to the target's "name" and "brand" 81 | // columns. 82 | // First register two new UDFs, giving the UDFs a name, a lambda, and a data type to return. 83 | // We use the WordUtils.capitalize method of the Apache Commons Lang library to capitalize 84 | // the first character of each word. 85 | // Both UDFs return a String type, since the column data type is the same among both tables. 86 | 87 | // For the target name, the lambda gets the data before the comma. 88 | SparkFactory.sparkSession().udf().register("split_name", 89 | (String x) -> WordUtils.capitalize(x.substring(0, x.indexOf(","))), 90 | DataTypes.StringType); 91 | 92 | // And for the target brand, the lambda gets the data after the comma. 93 | SparkFactory.sparkSession().udf().register("split_brand", 94 | (String x) -> WordUtils.capitalize(x.substring(x.indexOf(",") + 1)), 95 | DataTypes.StringType); 96 | 97 | 98 | // Handle the round-up integer division of "SALES_AMOUNT" and "PRICE" to determine units_sold. 99 | SparkFactory.sparkSession().udf().register("calculate_units_sold", 100 | (String x, String y) -> Integer.valueOf( 101 | new BigDecimal(x).divide(new BigDecimal(y), BigDecimal.ROUND_HALF_UP).setScale(0, BigDecimal.ROUND_HALF_UP).toString() 102 | ), 103 | DataTypes.IntegerType); 104 | 105 | // Handle the capitalization the first letter of each word of the source's "TYPE". 
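// (Note: WordUtils.capitalize from Apache Commons Lang upper-cases only the first
// letter of each word and leaves the remaining characters unchanged, which is
// sufficient here because the values in appliance_source.txt are already lower case.)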
106 | SparkFactory.sparkSession().udf().register("capitalize_type", 107 | (String x) -> WordUtils.capitalize(x), 108 | DataTypes.StringType); 109 | 110 | 111 | 112 | 113 | // Create a list of the source column names 114 | List fieldNamesLeft = Arrays 115 | .asList("name", "type", "sales_amount", "price", "date_added"); 116 | 117 | // Create the schema for each column with the String data type and nullable property of true 118 | List structFieldsLeft = new ArrayList<>(); 119 | for (String fieldNameLeft : fieldNamesLeft) { 120 | structFieldsLeft.add(DataTypes.createStructField(fieldNameLeft, DataTypes.StringType, true)); 121 | } 122 | 123 | StructType leftSchema = DataTypes.createStructType(structFieldsLeft); 124 | 125 | // Create a dataframe containing the schema defined above, with data populated by splitting the 126 | // text file by character ";" 127 | Dataset leftTableTransformDF = leftTable.getDataFrame().map((MapFunction) x -> { 128 | Object[] columns = x.getString(0).split(";"); 129 | return RowFactory.create(columns); 130 | }, RowEncoder.apply(leftSchema)); 131 | 132 | 133 | 134 | 135 | // Call withColumn operations, passing the source "NAME" to the UDFs "split_name" and 136 | // "split_brand" and storing the results in "name_temp" and "brand" respectively. 137 | // Then drop column "name" and rename "name_temp" to "name". 138 | leftTableTransformDF = leftTableTransformDF 139 | .withColumn("name_temp", 140 | functions.callUDF("split_name", functions.col("name"))) 141 | .withColumn("brand", 142 | functions.callUDF("split_brand", functions.col("name"))) 143 | .drop("name") 144 | .withColumnRenamed("name_temp", "name"); 145 | 146 | // Call the withColumn operation, passing both the source SALES_AMOUNT and PRICE columns to the 147 | // UDF "calculate_units_sold" and storing the result in column "units_sold". 148 | leftTableTransformDF = leftTableTransformDF 149 | .withColumn("units_sold", 150 | functions.callUDF("calculate_units_sold", functions.col("sales_amount"), 151 | functions.col("price"))); 152 | 153 | // Call the withColumn operation, passing the "TYPE" column to the UDF "capitalize_type" and 154 | // storing the result in column "type". 155 | leftTableTransformDF = leftTableTransformDF 156 | .withColumn("type", 157 | functions.callUDF("capitalize_type", functions.col("type"))); 158 | 159 | // Join with the reference table to get the "type_id" value, drop the original "type" column, 160 | // and rename "type_id" to "type". 161 | leftTableTransformDF = leftTableTransformDF.as("a").join(typeTable.getDataFrame().as("b"), 162 | leftTableTransformDF.col("type").equalTo(typeTable.getDataFrame().col("type_name")), 163 | "leftouter") 164 | .select("a.*", "b.type_id") 165 | .drop("type") 166 | .withColumnRenamed("type_id", "type"); 167 | 168 | 169 | 170 | 171 | // Select all columns in transformed left dataframe that exist in right dataframe, preserving 172 | // order of columns in right dataframe. 
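// (Aligning the column order matters because SparkCompare flattens schema-based
// sources to delimited strings, so values are effectively compared positionally
// rather than by column name.)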
173 | leftTableTransformDF = leftTableTransformDF.selectExpr(rightTable.getDataFrame().columns()); 174 | 175 | // Update the view of the transformed left dataframe 176 | leftTableTransformDF.createOrReplaceTempView(leftTable.getTempViewName()); 177 | 178 | // Flatten the transformed dataframe 179 | leftTableTransformDF = SparkFactory.flattenDataFrame(leftTableTransformDF, ","); 180 | 181 | // Create a new AppleTable with transformed dataframe 182 | AppleTable leftTableTransform = new AppleTable(leftTable.getSourceType(), leftTableTransformDF, 183 | leftTable.getDelimiter(), leftTable.getTempViewName()); 184 | 185 | 186 | 187 | 188 | // Comparison of transformed left dataframe and right dataframe 189 | DiffResult result = SparkCompare 190 | .compareAppleTables(leftTableTransform, rightTable); 191 | 192 | Assert.assertEquals(0, result.inLeftNotInRight().count()); 193 | Assert.assertEquals(0, result.inRightNotInLeft().count()); 194 | } 195 | } 196 | -------------------------------------------------------------------------------- /examples/TransformationTesting/src/test/java/org/finra/msd/examples/H2ToPgTest.java: -------------------------------------------------------------------------------- 1 | package org.finra.msd.examples; 2 | 3 | import org.apache.commons.lang.WordUtils; 4 | import org.apache.spark.sql.Dataset; 5 | import org.apache.spark.sql.Row; 6 | import org.apache.spark.sql.functions; 7 | import org.apache.spark.sql.types.DataTypes; 8 | import org.finra.msd.containers.AppleTable; 9 | import org.finra.msd.containers.DiffResult; 10 | import org.finra.msd.enums.SourceType; 11 | import org.finra.msd.examples.db.H2Database; 12 | import org.finra.msd.examples.db.PostgresDatabase; 13 | import org.finra.msd.sparkcompare.SparkCompare; 14 | import org.finra.msd.sparkfactory.SparkFactory; 15 | import org.junit.*; 16 | 17 | import java.io.IOException; 18 | import java.math.BigDecimal; 19 | import java.sql.SQLException; 20 | 21 | public class H2ToPgTest { 22 | 23 | @BeforeClass 24 | public static void start() throws IOException, ClassNotFoundException { 25 | PostgresDatabase.startPostgres(); 26 | H2Database.setH2Driver(); 27 | } 28 | 29 | @AfterClass 30 | public static void stop() throws IOException { 31 | PostgresDatabase.stopPostgres(); 32 | } 33 | 34 | @Before 35 | public void setUp() throws IOException, SQLException { 36 | PostgresDatabase.setUp(); 37 | H2Database.setUp(); 38 | } 39 | 40 | @After 41 | public void tearDown() throws IOException, SQLException { 42 | PostgresDatabase.tearDown(); 43 | H2Database.tearDown(); 44 | } 45 | 46 | @Test 47 | public void testTransformH2ToPg() throws SQLException { 48 | SparkFactory.initializeSparkLocalMode("local[*]", "WARN", "1"); 49 | 50 | // Shown below are the two methods of reading a JDBC database. 51 | // The first uses a pre-configured method that directly gives you an AppleTable. 52 | // The second builds a customized Spark RDD and converts it into an AppleTable. 53 | // The first is nice for quick setups, and basic applications. 54 | // The second is nearly required for any performance enhancements, and is highly recommended. 55 | // Parallelize the source data using pre-configured method. 56 | AppleTable leftTable = SparkFactory 57 | .parallelizeJDBCSource("org.h2.Driver", 58 | H2Database.getUrl(), 59 | H2Database.getProperties().getProperty("user"), 60 | H2Database.getProperties().getProperty("password"), 61 | "(select * from appliance) a", "appliance_left"); 62 | 63 | // Parallelize the target data using a customized Spark RDD. 
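// (Assuming Spark's standard JDBC partitioning behavior: with lowerBound 0,
// upperBound 500, and numPartitions 2 as configured below, the stride is
// (500 - 0) / 2 = 250, so one partition reads rows with price < 250, including
// NULLs, and the other reads rows with price >= 250.)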
64 | // See the below link to find out what these options do. 65 | // It's recommended to use the below settings as a bare minimum. 66 | // http://spark.apache.org/docs/latest/sql-programming-guide.html#jdbc-to-other-databases 67 | Dataset rightDataFrame = SparkFactory.sparkSession().sqlContext().read() 68 | .format("jdbc") 69 | .option("driver", "org.postgresql.Driver") 70 | .option("url", PostgresDatabase.getUrl()) 71 | .option("dbtable", "(select * from appliance) a") 72 | .option("user", PostgresDatabase.getProperties().getProperty("user")) 73 | .option("password", PostgresDatabase.getProperties().getProperty("password")) 74 | .option("partitionColumn", "price") // A numeric column 75 | .option("lowerBound", "0") // Typically you want this to be the minimum value 76 | .option("upperBound", "500") // Typically you want this to be the maximum value 77 | .option("numPartitions", "2") // Number of partitions to break the db into 78 | .option("fetchSize", "10") // Default is 10, increasing reduces network lag 79 | .load(); 80 | 81 | rightDataFrame.createOrReplaceTempView("appliance_right"); 82 | 83 | AppleTable rightTable = new AppleTable(SourceType.JDBC, rightDataFrame, ",", "appliance_right"); 84 | 85 | // Parallelize the reference data 86 | AppleTable typeTable = SparkFactory 87 | .parallelizeJDBCSource("org.postgresql.Driver", 88 | PostgresDatabase.getUrl(), 89 | PostgresDatabase.getProperties().getProperty("user"), 90 | PostgresDatabase.getProperties().getProperty("password"), 91 | "(select * from appliance_type) a", "appliance_type"); 92 | 93 | // Handle the source's "NAME" column transformation/split to the target's "name" and "brand" 94 | // columns. 95 | // First register two new UDFs, giving the UDFs a name, a lambda, and a data type to return. 96 | // We use the WordUtils.capitalize method of the Apache Commons Lang library to capitalize 97 | // the first character of each word. 98 | // Both UDFs return a String type, since the column data type is the same among both tables. 99 | 100 | // For the target name, the lambda gets the data before the comma. 101 | SparkFactory.sparkSession().udf().register("split_name", 102 | (String x) -> WordUtils.capitalize(x.substring(0, x.indexOf(","))), 103 | DataTypes.StringType); 104 | 105 | // And for the target brand, the lambda gets the data after the comma. 106 | SparkFactory.sparkSession().udf().register("split_brand", 107 | (String x) -> WordUtils.capitalize(x.substring(x.indexOf(",") + 1)), 108 | DataTypes.StringType); 109 | 110 | 111 | // Handle the round-up integer division of "SALES_AMOUNT" and "PRICE" to determine units_sold. 112 | SparkFactory.sparkSession().udf().register("calculate_units_sold", 113 | (BigDecimal x, BigDecimal y) -> Integer.valueOf( 114 | x.divide(y, BigDecimal.ROUND_HALF_UP).setScale(0, BigDecimal.ROUND_HALF_UP).toString() 115 | ), 116 | DataTypes.IntegerType); 117 | 118 | // Handle the capitalization the first letter of each word of the source's "TYPE". 119 | SparkFactory.sparkSession().udf().register("capitalize_type", 120 | (String x) -> WordUtils.capitalize(x), 121 | DataTypes.StringType); 122 | 123 | 124 | 125 | 126 | Dataset leftTableTransformDF = leftTable.getDataFrame(); 127 | 128 | // Lower case all the columns in the source dataframe. 
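// (H2 stores unquoted identifiers in upper case while PostgreSQL folds them to
// lower case, so the H2 column names are normalized here before being lined up
// with the Postgres side.)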
129 | for (String column : leftTableTransformDF.columns()) { 130 | leftTableTransformDF = leftTableTransformDF.withColumnRenamed(column, column.toLowerCase()); 131 | } 132 | 133 | // Call withColumn operations, passing the source "NAME" to the UDFs "split_name" and 134 | // "split_brand" and storing the results in "name_temp" and "brand" respectively. 135 | // Then drop column "name" and rename "name_temp" to "name". 136 | leftTableTransformDF = leftTableTransformDF 137 | .withColumn("name_temp", 138 | functions.callUDF("split_name", functions.col("name"))) 139 | .withColumn("brand", 140 | functions.callUDF("split_brand", functions.col("name"))) 141 | .drop("name") 142 | .withColumnRenamed("name_temp", "name"); 143 | 144 | // Call the withColumn operation, passing both the source SALES_AMOUNT and PRICE columns to the 145 | // UDF "calculate_units_sold" and storing the result in column "units_sold". 146 | leftTableTransformDF = leftTableTransformDF 147 | .withColumn("units_sold", 148 | functions.callUDF("calculate_units_sold", functions.col("sales_amount"), 149 | functions.col("price"))); 150 | 151 | // Call the withColumn operation, passing the "TYPE" column to the UDF "capitalize_type" and 152 | // storing the result in column "type". 153 | leftTableTransformDF = leftTableTransformDF 154 | .withColumn("type", 155 | functions.callUDF("capitalize_type", functions.col("type"))); 156 | 157 | // Join with the reference table to get the "type_id" value, drop the original "type" column, 158 | // and rename "type_id" to "type". 159 | leftTableTransformDF = leftTableTransformDF.as("a").join(typeTable.getDataFrame().as("b"), 160 | leftTableTransformDF.col("type").equalTo(typeTable.getDataFrame().col("type_name")), 161 | "leftouter") 162 | .select("a.*", "b.type_id") 163 | .drop("type") 164 | .withColumnRenamed("type_id", "type"); 165 | 166 | 167 | 168 | 169 | // Select all columns in transformed left dataframe that exist in right dataframe, preserving 170 | // order of columns in right dataframe. 
171 | leftTableTransformDF = leftTableTransformDF.selectExpr(rightTable.getDataFrame().columns()); 172 | 173 | // Update the view of the transformed left dataframe 174 | leftTableTransformDF.createOrReplaceTempView(leftTable.getTempViewName()); 175 | 176 | // Create a new AppleTable with transformed dataframe 177 | AppleTable leftTableTransform = new AppleTable(leftTable.getSourceType(), leftTableTransformDF, 178 | leftTable.getDelimiter(), leftTable.getTempViewName()); 179 | 180 | 181 | 182 | 183 | // Comparison of transformed left dataframe and right dataframe 184 | DiffResult result = SparkCompare 185 | .compareAppleTables(leftTableTransform, rightTable); 186 | 187 | Assert.assertEquals(0, result.inLeftNotInRight().count()); 188 | Assert.assertEquals(0, result.inRightNotInLeft().count()); 189 | } 190 | } 191 | -------------------------------------------------------------------------------- /examples/TransformationTesting/src/test/java/org/finra/msd/examples/db/H2Database.java: -------------------------------------------------------------------------------- 1 | package org.finra.msd.examples.db; 2 | 3 | import com.google.common.base.Charsets; 4 | import com.google.common.io.Resources; 5 | import java.io.IOException; 6 | import java.sql.Connection; 7 | import java.sql.DriverManager; 8 | import java.sql.SQLException; 9 | import java.sql.Statement; 10 | import java.util.Properties; 11 | 12 | public class H2Database { 13 | private static String url = "jdbc:h2:./sample"; 14 | private static Properties properties = new Properties(); 15 | static { 16 | properties.setProperty("user", "username"); 17 | properties.setProperty("password", "password"); 18 | } 19 | 20 | public static void setH2Driver() throws ClassNotFoundException { 21 | Class.forName("org.h2.Driver"); 22 | } 23 | 24 | public static String getUrl() { 25 | return url; 26 | } 27 | 28 | public static Properties getProperties() { 29 | return properties; 30 | } 31 | 32 | public static void setUp() throws IOException, SQLException { 33 | try (Connection conn = DriverManager.getConnection(url, properties)) { 34 | try (Statement stmt = conn.createStatement()) { 35 | stmt.executeUpdate(Resources.toString(Resources.getResource( 36 | "h2_db.sql"), Charsets.UTF_8)); 37 | } 38 | } 39 | } 40 | 41 | public static void tearDown() throws IOException, SQLException { 42 | try (Connection conn = DriverManager.getConnection(url, properties)) { 43 | try (Statement stmt = conn.createStatement()) { 44 | stmt.executeUpdate("DROP TABLE appliance"); 45 | } 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /examples/TransformationTesting/src/test/java/org/finra/msd/examples/db/PostgresDatabase.java: -------------------------------------------------------------------------------- 1 | package org.finra.msd.examples.db; 2 | 3 | import com.google.common.base.Charsets; 4 | import com.google.common.io.Resources; 5 | import io.zonky.test.db.postgres.embedded.EmbeddedPostgres; 6 | 7 | import java.io.IOException; 8 | import java.sql.Connection; 9 | import java.sql.SQLException; 10 | import java.sql.Statement; 11 | import java.util.Properties; 12 | 13 | public class PostgresDatabase { 14 | private static EmbeddedPostgres postgres; 15 | 16 | private static Properties properties = new Properties(); 17 | static { 18 | properties.setProperty("user", "postgres"); 19 | properties.setProperty("password", "postgres"); 20 | } 21 | 22 | public static String getUrl() { 23 | return 
postgres.getJdbcUrl(properties.getProperty("user"), "postgres"); 24 | } 25 | 26 | public static Properties getProperties() { 27 | return properties; 28 | } 29 | 30 | public static void startPostgres() throws IOException { 31 | postgres = EmbeddedPostgres.builder().start(); 32 | } 33 | 34 | public static void stopPostgres() throws IOException { 35 | postgres.close(); 36 | } 37 | 38 | public static void setUp() throws IOException, SQLException { 39 | try (Connection conn = postgres.getPostgresDatabase().getConnection()) { 40 | try (Statement stmt = conn.createStatement()) { 41 | stmt.executeUpdate(Resources.toString(Resources.getResource( 42 | "pg_db.sql"), Charsets.UTF_8)); 43 | } 44 | } 45 | } 46 | 47 | public static void tearDown() throws SQLException { 48 | try (Connection conn = postgres.getPostgresDatabase().getConnection()) { 49 | try (Statement stmt = conn.createStatement()) { 50 | stmt.executeUpdate("DROP TABLE appliance, appliance_type"); 51 | } 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /examples/TransformationTesting/src/test/resources/appliance_source.txt: -------------------------------------------------------------------------------- 1 | some refrigerator,some brand 1;refrigerator;1000.00;250.00;2017-06-01 2 | some washer,some brand 2;washer;5000.00;500.00;2017-01-17 3 | some dryer,some brand 3;dryer;2500.00;500.00;2017-04-23 -------------------------------------------------------------------------------- /examples/TransformationTesting/src/test/resources/h2_db.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS APPLIANCE; 2 | 3 | CREATE TABLE APPLIANCE 4 | ( 5 | NAME varchar(50), 6 | TYPE varchar(20), 7 | SALES_AMOUNT decimal(10,2), 8 | PRICE decimal(10,2), 9 | DATE_ADDED date 10 | ); 11 | 12 | INSERT INTO APPLIANCE VALUES 13 | ('some refrigerator,some brand 1', 'refrigerator', 1000.00, 250.00, '2017-06-01'), 14 | ('some washer,some brand 2', 'washer', 5000.00, 500.00, '2017-01-17'), 15 | ('some dryer,some brand 3', 'dryer', 2500.00, 500.00, '2017-04-23'); -------------------------------------------------------------------------------- /examples/TransformationTesting/src/test/resources/pg_db.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS appliance; 2 | 3 | CREATE TABLE appliance 4 | ( 5 | name varchar(25), 6 | brand varchar(25), 7 | type bigint, 8 | units_sold integer, 9 | price decimal(10,2), 10 | date_added date 11 | ); 12 | 13 | INSERT INTO appliance VALUES 14 | ('Some Refrigerator', 'Some Brand 1', 1, 4, 250.00, '2017-06-01'), 15 | ('Some Washer', 'Some Brand 2', 2, 10, 500.00, '2017-01-17'), 16 | ('Some Dryer', 'Some Brand 3', 3, 5, 500.00, '2017-04-23'); 17 | 18 | DROP TABLE IF EXISTS appliance_type; 19 | 20 | CREATE TABLE appliance_type 21 | ( 22 | type_id bigint, 23 | type_name varchar(20) 24 | ); 25 | 26 | INSERT INTO appliance_type VALUES 27 | (1, 'Refrigerator'), 28 | (2, 'Washer'), 29 | (3, 'Dryer'); -------------------------------------------------------------------------------- /mega-spark-diff/src/main/java/org/finra/msd/containers/SourceVars.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.finra.msd.containers; 18 | 19 | import org.finra.msd.util.FileUtil; 20 | 21 | import java.io.File; 22 | import java.util.HashMap; 23 | import java.util.List; 24 | import java.util.Map; 25 | 26 | /** 27 | * Extracts the database connection details 28 | */ 29 | public class SourceVars 30 | { 31 | /** 32 | * Specifies source type 33 | */ 34 | private String connection; 35 | 36 | /** 37 | * Connection variables 38 | */ 39 | private Map vars; 40 | 41 | /** 42 | * Queries to retrieve data 43 | */ 44 | private Map queries; 45 | 46 | /** 47 | * Initializes the constructor with a file consisting database connection details 48 | * @param fileName 49 | */ 50 | public SourceVars(String fileName) 51 | { 52 | this(new File(fileName)); 53 | } 54 | 55 | /** 56 | * Convert an input configuration file into a map 57 | * containing connection properties and query specifications 58 | * @param source this is the input file that contains the configuration per the spec in the usage documentation 59 | */ 60 | public SourceVars(File source) 61 | { 62 | vars = new HashMap(); 63 | queries = new HashMap(); 64 | 65 | List fileContents = FileUtil.fileToStringList(source); 66 | for (String line : fileContents) 67 | { 68 | line = line.trim().replaceAll(" +"," "); 69 | if (line.isEmpty()) 70 | continue; 71 | if (line.charAt(0) == '@') 72 | { 73 | String dataName = line.substring(1,line.indexOf(":")).trim(), 74 | query = line.substring(line.indexOf(":")+1).trim(); 75 | queries.put(dataName,query); 76 | } 77 | else if (line.contains("=")) 78 | { 79 | String key = line.substring(0, line.indexOf("=")).trim(); 80 | String val = line.substring(line.indexOf("=") + 1).trim(); 81 | if (key.equals("connection")) 82 | connection = val; 83 | else 84 | vars.put(key,val); 85 | } 86 | } 87 | } 88 | 89 | 90 | /** 91 | * Gets the type of database connection 92 | * @return This string represents the full JDBC connection that will be passed to the JDBC driver in order 93 | * to connect to the DB. The expectation is that it contains the authentication user and password as well. 
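* (Note: in the bundled data_sources configs this value is the source type
* keyword, i.e. "jdbc", "hive", or "file", which Launcher switches on; the
* driver, URL, user, and password are supplied as separate key=value vars.)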
94 | */ 95 | public String getConnection() { return connection; } 96 | 97 | /** 98 | * Gets the database connection details based on the key provided 99 | * @param key Variable used to define the connection details like driver, URL, username, password 100 | * @return The value of the passed variable for database connection 101 | */ 102 | public String getVar(String key) { return vars.get(key); } 103 | 104 | /** 105 | * Gets the data retrieval query specified in the source file based on the corresponding variable name 106 | * @param dataName variable to describe the source 107 | * @return query associated with the passed variable name 108 | */ 109 | public String getQuery(String dataName) { return queries.get(dataName); } 110 | } 111 | -------------------------------------------------------------------------------- /mega-spark-diff/src/main/java/org/finra/msd/enums/SourceType.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.finra.msd.enums; 18 | 19 | /** 20 | * List of different type of sources that MegaSparkDiff supports 21 | */ 22 | public enum SourceType { 23 | JDBC("JDBC"), 24 | HIVE("HIVE"), 25 | FILE("FILE"), 26 | DYNAMODB("DYNAMODB"), 27 | JSON("JSON"), 28 | CSV("CSV"); 29 | 30 | /** 31 | * Represents the source type 32 | */ 33 | private final String text; 34 | 35 | /** 36 | * Initializes the constructor with the source type provided 37 | * @param text 38 | */ 39 | private SourceType(final String text) { 40 | this.text = text; 41 | } 42 | 43 | @Override 44 | public String toString() { 45 | return text; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /mega-spark-diff/src/main/java/org/finra/msd/enums/VisualResultType.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.finra.msd.enums; 18 | 19 | public enum VisualResultType { 20 | LEFT("LEFT"), 21 | RIGHT("RIGHT"), 22 | BOTH("BOTH"); 23 | 24 | /** 25 | * Represents the visual result type 26 | */ 27 | private final String text; 28 | 29 | /** 30 | * Initializes the constructor with the visual result type provided 31 | * @param text 32 | */ 33 | private VisualResultType(final String text) { 34 | this.text = text; 35 | } 36 | 37 | @Override 38 | public String toString() { 39 | return text; 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /mega-spark-diff/src/main/java/org/finra/msd/launcher/Launcher.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.finra.msd.launcher; 18 | 19 | import org.finra.msd.containers.AppleTable; 20 | import org.finra.msd.containers.CmdLine; 21 | import org.finra.msd.containers.SourceVars; 22 | import org.finra.msd.sparkcompare.SparkCompare; 23 | import org.finra.msd.sparkfactory.SparkFactory; 24 | import scala.Option; 25 | 26 | import java.io.IOException; 27 | 28 | /** 29 | * Main class that parses the command line arguments 30 | * which includes source/output/configuration details, carries out comparison and provides the result of comparison 31 | */ 32 | public class Launcher { 33 | 34 | public static void main(String[] args) throws IOException 35 | { 36 | /** Parse **/ 37 | CmdLine values = new CmdLine(args); 38 | SourceVars sv1 = new SourceVars(values.getSource1()), 39 | sv2 = new SourceVars(values.getSource2()); 40 | 41 | /** Generate AppleTables from inputs **/ 42 | SparkFactory.initializeSparkContext(); 43 | AppleTable leftAppleTable = generateAppleTable(sv1, values.getData1(),"table1"); 44 | AppleTable rightAppleTable = generateAppleTable(sv2, values.getData2(),"table2"); 45 | 46 | 47 | /** Compare tables and save output to file **/ 48 | SparkCompare.compareAppleTablesSaveResults( 49 | leftAppleTable, 50 | rightAppleTable, 51 | values.getOutputDirectory(), 52 | true , values.getDelimiter()); 53 | SparkFactory.stopSparkContext(); 54 | } 55 | 56 | 57 | /** 58 | * Generate the table (in our internal format) containing the data specified by the input 59 | * @param sv Source Variables 60 | * @param dataSetName the parameter name representing the designated dataset 61 | * @param tempViewName a user specified name for this view 62 | * @return 63 | */ 64 | public static AppleTable generateAppleTable(SourceVars sv, String dataSetName, String tempViewName) 65 | { 66 | switch (sv.getConnection().toLowerCase()) 67 | { 68 | case "jdbc": return SparkFactory.parallelizeJDBCSource( 69 | sv.getVar("driver"), 70 | sv.getVar("url"), 71 | sv.getVar("user"), 72 | sv.getVar("password"), 73 | sv.getQuery(dataSetName), 74 | tempViewName, 75 | Option.apply(sv.getVar("delimiter"))); 76 | case "hive": return 
SparkFactory.parallelizeHiveSource( 77 | sv.getQuery(dataSetName), 78 | tempViewName); 79 | case "file": return SparkFactory.parallelizeTextSource( 80 | sv.getQuery(dataSetName), 81 | tempViewName); 82 | default: return null; 83 | } 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /mega-spark-diff/src/main/java/org/finra/msd/util/FileUtil.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.finra.msd.util; 18 | 19 | import org.apache.commons.io.FileUtils; 20 | 21 | import java.io.BufferedReader; 22 | import java.io.File; 23 | import java.io.FileReader; 24 | import java.io.IOException; 25 | import java.util.ArrayList; 26 | import java.util.Base64; 27 | import java.util.List; 28 | 29 | /** 30 | * Contains file handling operations 31 | */ 32 | public class FileUtil 33 | { 34 | /** 35 | * Returns a file as a list of its lines 36 | * @param fileName 37 | * @return file contents per line 38 | */ 39 | public static List fileToStringList(File fileName) 40 | { 41 | BufferedReader br; 42 | List fileContent = new ArrayList(); 43 | 44 | try 45 | { 46 | br = new BufferedReader(new FileReader(fileName)); 47 | String line; 48 | while ((line = br.readLine()) != null) 49 | fileContent.add(line.trim().replaceAll(" +"," ")); 50 | br.close(); 51 | } 52 | catch (IOException e) 53 | { 54 | e.printStackTrace(); 55 | } 56 | 57 | return fileContent; 58 | } 59 | 60 | /** 61 | * Creates a directory 62 | * @param path Location where the directory has to be created 63 | */ 64 | public static void createDirectory(String path) 65 | { 66 | File theDir = new File(path); 67 | // if the directory does not exist, create it 68 | if (!theDir.exists()) { 69 | theDir.mkdir(); 70 | } 71 | } 72 | 73 | /** 74 | * Return the decoded contents of a file encoded in base 64 75 | * @param absoluteFileLocation 76 | * @return 77 | */ 78 | public static String decodeBase64File(String absoluteFileLocation) 79 | { 80 | String decoded = ""; 81 | try { 82 | decoded = new String(Base64.getDecoder().decode(FileUtils.readFileToString(new File(absoluteFileLocation)).trim())).trim(); 83 | } catch (IOException e) { 84 | System.out.println(e); 85 | } 86 | return decoded; 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /mega-spark-diff/src/main/resources/data_sources/hive_test.txt: -------------------------------------------------------------------------------- 1 | connection = hive 2 | 3 | @v6 : select * from version_six 4 | @v9 : select * from version_nine -------------------------------------------------------------------------------- /mega-spark-diff/src/main/resources/data_sources/jdbc_test.txt: -------------------------------------------------------------------------------- 1 | connection = jdbc 2 | 3 | driver = org.hsqldb.jdbc.JDBCDriver 4 | 
url = jdbc:hsqldb:hsql://127.0.0.1:9001/testDb 5 | user = username 6 | password = password 7 | 8 | @test_table : select * from table_name 9 | @another_table : select * from another_table -------------------------------------------------------------------------------- /mega-spark-diff/src/main/resources/data_sources/person_tables.txt: -------------------------------------------------------------------------------- 1 | connection = jdbc 2 | 3 | driver = org.hsqldb.jdbc.JDBCDriver 4 | url = jdbc:hsqldb:hsql://127.0.0.1:9001/testDb 5 | user = SA 6 | password = 7 | 8 | @shraddha : (select * from Persons1 where personid=2) 9 | @carlos : (select * from Persons1 Where personid=1) -------------------------------------------------------------------------------- /mega-spark-diff/src/main/resources/data_sources/s3_test.txt: -------------------------------------------------------------------------------- 1 | connection = file 2 | 3 | cloudpass.blahblahbla = pffft 4 | more.cloudpass.stuff = nope 5 | 6 | @s3_loc_1 : s3://asdfadsfasdfas 7 | @s3_loc_2 : s3://zxcvzxcvzxcvvv -------------------------------------------------------------------------------- /mega-spark-diff/src/main/resources/htmltemplates/horizontalTableTemplate.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 43 | 44 | 45 | #tableContent 46 | 47 | -------------------------------------------------------------------------------- /mega-spark-diff/src/main/resources/run_configs/spark_configs.txt: -------------------------------------------------------------------------------- 1 | numExecutors = 2 | executorMemory = 3 | controllerRAM = 4 | numPartitions = 5 | 6 | partitionByColumn = 7 | lowerBound = 8 | upperBound = -------------------------------------------------------------------------------- /mega-spark-diff/src/main/resources/shell/msd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | filepath=`dirname $0` 4 | HERE=`echo $(cd $filepath; pwd)` 5 | 6 | spark-submit ${HERE}/mega-spark-diff-0.1.jar $@ 7 | #java -jar ${HERE}/mega-spark-diff-0.1.jar $@ -------------------------------------------------------------------------------- /mega-spark-diff/src/main/scala/com/audienceproject/spark/dynamodb/msd/datasource/DefaultSource.scala: -------------------------------------------------------------------------------- 1 | package com.audienceproject.spark.dynamodb.msd.datasource 2 | 3 | import org.apache.spark.sql.connector.catalog.{Table, TableProvider} 4 | import org.apache.spark.sql.connector.expressions.Transform 5 | import org.apache.spark.sql.types.StructType 6 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 7 | import org.finra.msd.sparkfactory.SparkFactory.sparkSession 8 | 9 | import java.util 10 | 11 | class DefaultSource extends TableProvider { 12 | 13 | override def inferSchema(options: CaseInsensitiveStringMap): StructType = 14 | throw new IllegalArgumentException( 15 | s"dynamodb does not support inferred schema. 
Please specify the schema.") 16 | 17 | override def getTable(schema: StructType, partitioning: Array[Transform], properties: util.Map[String, String]): Table = 18 | new DynamoDbTable(options = new CaseInsensitiveStringMap(properties), schema) 19 | 20 | override def supportsExternalMetadata(): Boolean = true 21 | } 22 | -------------------------------------------------------------------------------- /mega-spark-diff/src/main/scala/com/audienceproject/spark/dynamodb/msd/datasource/DynamoDbPartitionReader.scala: -------------------------------------------------------------------------------- 1 | package com.audienceproject.spark.dynamodb.msd.datasource 2 | 3 | import com.amazonaws.services.dynamodbv2.document.Item 4 | import com.amazonaws.util.json.Jackson 5 | import com.audienceproject.spark.dynamodb.connector.DynamoConnector 6 | import com.audienceproject.spark.dynamodb.datasource.ScanPartition 7 | import org.apache.spark.sql.catalyst.InternalRow 8 | import org.apache.spark.sql.connector.read.PartitionReader 9 | import org.apache.spark.sql.types.StructType 10 | import org.apache.spark.unsafe.types.UTF8String 11 | 12 | import scala.collection.JavaConverters.asScalaIteratorConverter 13 | 14 | class DynamoDbPartitionReader(connector: DynamoConnector, schema: StructType, partition: ScanPartition) extends PartitionReader[InternalRow] { 15 | 16 | /** 17 | * code based on com.audienceproject:spark.dynamodb 18 | * DynamoReaderFactory 19 | */ 20 | private val pageIterator = 21 | connector.scan(partition.partitionIndex, partition.requiredColumns, partition.filters).pages().iterator().asScala 22 | 23 | private var rowIterator = Iterator[Item]() 24 | private var result = new Item 25 | 26 | override def next(): Boolean = { 27 | if (rowIterator.hasNext) { 28 | result = rowIterator.next() 29 | true 30 | } 31 | else if (pageIterator.hasNext) { 32 | rowIterator = pageIterator.next().getLowLevelResult.getItems.iterator().asScala 33 | next() 34 | } 35 | else false 36 | } 37 | 38 | override def get(): InternalRow = { 39 | val resultRow = for (x <- schema) yield 40 | if (result.isNull(x.name) || result.get(x.name) == null) null 41 | else UTF8String.fromString( 42 | result.get(x.name) match { 43 | case value: MapAny => toJSON(value) 44 | case value: ListAny => toJSON(value) 45 | case value: SetAny => toJSON(value) 46 | case _ => toJSON(result.get(x.name)) 47 | }) 48 | val row = InternalRow(resultRow:_*) 49 | row 50 | } 51 | 52 | private def toJSON(value: Any): String = { 53 | Jackson.toJsonString(value) 54 | } 55 | 56 | override def close(): Unit = Unit 57 | } 58 | -------------------------------------------------------------------------------- /mega-spark-diff/src/main/scala/com/audienceproject/spark/dynamodb/msd/datasource/DynamoDbPartitionReaderFactory.scala: -------------------------------------------------------------------------------- 1 | package com.audienceproject.spark.dynamodb.msd.datasource 2 | 3 | import com.audienceproject.spark.dynamodb.connector.DynamoConnector 4 | import com.audienceproject.spark.dynamodb.datasource.ScanPartition 5 | import org.apache.spark.sql.catalyst.InternalRow 6 | import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory} 7 | import org.apache.spark.sql.types.StructType 8 | 9 | class DynamoDbPartitionReaderFactory(connector: DynamoConnector, schema: StructType) extends PartitionReaderFactory { 10 | override def createReader(partition: InputPartition): PartitionReader[InternalRow] = 11 | new DynamoDbPartitionReader(connector, schema, 
partition.asInstanceOf[ScanPartition]) 12 | } 13 | -------------------------------------------------------------------------------- /mega-spark-diff/src/main/scala/com/audienceproject/spark/dynamodb/msd/datasource/DynamoDbScan.scala: -------------------------------------------------------------------------------- 1 | package com.audienceproject.spark.dynamodb.msd.datasource 2 | 3 | import com.audienceproject.spark.dynamodb.connector.DynamoConnector 4 | import com.audienceproject.spark.dynamodb.datasource.ScanPartition 5 | import org.apache.spark.sql.connector.read.{Batch, InputPartition, PartitionReaderFactory, Scan} 6 | import org.apache.spark.sql.sources.Filter 7 | import org.apache.spark.sql.types.StructType 8 | 9 | class DynamoDbScan(connector: DynamoConnector, filters: Array[Filter], schema: StructType) 10 | extends Scan with Batch { 11 | 12 | override def readSchema(): StructType = schema 13 | 14 | override def toBatch: Batch = this 15 | 16 | /** 17 | * code based on com.audienceproject:spark.dynamodb 18 | * DynamoBatchReader 19 | * 20 | * @return array of input partitions 21 | */ 22 | override def planInputPartitions(): Array[InputPartition] = { 23 | Array.tabulate(connector.totalSegments)(new ScanPartition(_, schema.fieldNames, filters)) 24 | } 25 | 26 | override def createReaderFactory(): PartitionReaderFactory = 27 | new DynamoDbPartitionReaderFactory(connector, schema) 28 | } 29 | -------------------------------------------------------------------------------- /mega-spark-diff/src/main/scala/com/audienceproject/spark/dynamodb/msd/datasource/DynamoDbScanBuilder.scala: -------------------------------------------------------------------------------- 1 | package com.audienceproject.spark.dynamodb.msd.datasource 2 | 3 | import com.audienceproject.spark.dynamodb.connector.{DynamoConnector, FilterPushdown} 4 | import org.apache.spark.sql.connector.expressions.filter.Predicate 5 | import org.apache.spark.sql.connector.read.{Scan, ScanBuilder, SupportsPushDownFilters, SupportsPushDownRequiredColumns, SupportsPushDownV2Filters} 6 | import org.apache.spark.sql.sources.Filter 7 | import org.apache.spark.sql.types.StructType 8 | 9 | class DynamoDbScanBuilder(connector: DynamoConnector, schema: StructType) 10 | extends ScanBuilder 11 | with SupportsPushDownRequiredColumns 12 | with SupportsPushDownFilters { 13 | 14 | private var pushedFilter = Array.empty[Filter] 15 | private var finalSchema = schema 16 | 17 | /** 18 | * code from com.audienceproject:spark.dynamodb 19 | * DynamoScanBuilder 20 | * 21 | * @return DynamoDbScan instance 22 | */ 23 | override def build(): Scan = new DynamoDbScan(connector, pushedFilters(), finalSchema) 24 | 25 | /** 26 | * code from com.audienceproject:spark.dynamodb 27 | * DynamoScanBuilder 28 | */ 29 | override def pruneColumns(requiredSchema: StructType): Unit = { 30 | val keyColumns = Seq(Some(connector.keySchema.hashKeyName), connector.keySchema.rangeKeyName).flatten 31 | .flatMap(keyName => finalSchema.fields.find(_.name == keyName)) 32 | val requiredColumns = keyColumns ++ requiredSchema.fields 33 | val newFields = finalSchema.fields.filter(requiredColumns.contains) 34 | finalSchema = StructType(newFields) 35 | } 36 | 37 | /** 38 | * code from com.audienceproject:spark.dynamodb 39 | * DynamoScanBuilder 40 | * 41 | * @return array of filters 42 | */ 43 | override def pushFilters(filters: Array[Filter]): Array[Filter] = { 44 | if (connector.filterPushdownEnabled) { 45 | val (acceptedFilters, postScanFilters) = FilterPushdown.acceptFilters(filters) 46 | 
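// acceptedFilters can be evaluated by DynamoDB itself and are retained for the scan;
// postScanFilters are handed back to Spark to be re-applied after the scan.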
this.pushedFilter = acceptedFilters 47 | postScanFilters 48 | } else { 49 | filters 50 | } 51 | } 52 | 53 | /** 54 | * code from com.audienceproject:spark.dynamodb 55 | * DynamoScanBuilder 56 | * 57 | * @return array of filters 58 | */ 59 | override def pushedFilters(): Array[Filter] = pushedFilter 60 | } 61 |
-------------------------------------------------------------------------------- /mega-spark-diff/src/main/scala/com/audienceproject/spark/dynamodb/msd/datasource/DynamoDbTable.scala: -------------------------------------------------------------------------------- 1 | package com.audienceproject.spark.dynamodb.msd.datasource 2 | 3 | import com.audienceproject.spark.dynamodb.connector.{TableConnector, TableIndexConnector} 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.connector.catalog.{SupportsRead, Table, TableCapability} 6 | import org.apache.spark.sql.connector.read.ScanBuilder 7 | import org.apache.spark.sql.types.StructType 8 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 9 | 10 | import java.util 11 | import scala.collection.JavaConverters._ 12 | 13 | class DynamoDbTable(options: CaseInsensitiveStringMap, schema: StructType) extends Table with SupportsRead { 14 | /** 15 | * code from com.audienceproject:spark.dynamodb 16 | * DynamoTable 17 | */ 18 | private val dynamoConnector = { 19 | val indexName = Option(options.get("indexname")) 20 | val defaultParallelism = Option(options.get("defaultparallelism")).map(_.toInt).getOrElse(getDefaultParallelism) 21 | val optionsMap = Map(options.asScala.toSeq: _*) 22 | 23 | if (indexName.isDefined) new TableIndexConnector(name(), indexName.get, defaultParallelism, optionsMap) 24 | else new TableConnector(name(), defaultParallelism, optionsMap) 25 | } 26 | 27 | /** 28 | * code from com.audienceproject:spark.dynamodb 29 | * DynamoTable 30 | * 31 | * @return default parallelism 32 | */ 33 | private def getDefaultParallelism: Int = 34 | SparkSession.getActiveSession match { 35 | case Some(spark) => spark.sparkContext.defaultParallelism 36 | case None => throw new IllegalStateException("No active SparkSession; start Spark before reading from DynamoDB") // guard against a missing session instead of a MatchError 37 | } 38 | 39 | override def name(): String = options.get("table") 40 | 41 | override def schema(): StructType = schema 42 | 43 | override def capabilities(): util.Set[TableCapability] = 44 | Set(TableCapability.BATCH_READ).asJava 45 | 46 | override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = 47 | new DynamoDbScanBuilder(dynamoConnector, schema()) 48 | }
-------------------------------------------------------------------------------- /mega-spark-diff/src/main/scala/com/audienceproject/spark/dynamodb/msd/datasource/JavaCollections.scala: -------------------------------------------------------------------------------- 1 | package com.audienceproject.spark.dynamodb.msd.datasource 2 | 3 | case class MapAny(value: java.util.Map[Any, Any]) 4 | case class ListAny(value: java.util.List[Any]) 5 | case class SetAny(value: java.util.Set[Any])
-------------------------------------------------------------------------------- /mega-spark-diff/src/main/scala/org/apache/spark/sql/execution/datasources/msd/DefaultSource.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.sql.execution.datasources.msd 2 | 3 | import com.fasterxml.jackson.databind.JsonNode 4 | import com.fasterxml.jackson.databind.json.JsonMapper 5 | import com.fasterxml.jackson.databind.node.{JsonNodeFactory, JsonNodeType, ObjectNode} 6 | import org.apache.hadoop.conf.Configuration 7 | import org.apache.hadoop.fs.FileStatus 8 |
import org.apache.hadoop.mapreduce.Job 9 | import org.apache.spark.sql.SparkSession 10 | import org.apache.spark.sql.catalyst.InternalRow 11 | import org.apache.spark.sql.catalyst.expressions.GenericInternalRow 12 | import org.apache.spark.sql.execution.datasources.{CodecStreams, OutputWriterFactory, PartitionedFile, TextBasedFileFormat} 13 | import org.apache.spark.sql.sources.{DataSourceRegister, Filter} 14 | import org.apache.spark.sql.types.StructType 15 | import org.apache.spark.unsafe.types.UTF8String 16 | import org.apache.spark.util.SerializableConfiguration 17 | 18 | import java.util 19 | import java.util.Collections 20 | 21 | class DefaultSource extends TextBasedFileFormat with DataSourceRegister { 22 | override val shortName: String = "jsonmsd" 23 | 24 | override def inferSchema(sparkSession: SparkSession, options: Map[String, String], files: Seq[FileStatus]): Option[StructType] = None // schema inference is not supported; callers must supply a schema 25 | 26 | override def prepareWrite(sparkSession: SparkSession, job: Job, options: Map[String, String], dataSchema: StructType): OutputWriterFactory = throw new UnsupportedOperationException("jsonmsd is read-only; writing is not supported") 27 | 28 | /** 29 | * code based on org.apache.spark:spark-core 30 | * JsonFileFormat 31 | * 32 | * @param sparkSession sparkSession 33 | * @param dataSchema dataSchema 34 | * @param partitionSchema partitionSchema 35 | * @param requiredSchema requiredSchema 36 | * @param filters filters 37 | * @param options options 38 | * @param hadoopConf hadoopConf 39 | * @return PartitionedFile => Iterator[InternalRow] 40 | */ 41 | override protected def buildReader(sparkSession: SparkSession, 42 | dataSchema: StructType, 43 | partitionSchema: StructType, 44 | requiredSchema: StructType, 45 | filters: Seq[Filter], 46 | options: Map[String, String], 47 | hadoopConf: Configuration 48 | ): PartitionedFile => Iterator[InternalRow] = { 49 | val broadcastedHadoopConf = 50 | sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) 51 | (file: PartitionedFile) => { 52 | JsonFileFormatMsd.readFile( 53 | broadcastedHadoopConf.value.value, 54 | file, 55 | requiredSchema 56 | ) 57 | } 58 | } 59 | } 60 | 61 | class JsonNodeFactorySortedKeys extends JsonNodeFactory { 62 | override def objectNode(): ObjectNode = 63 | new ObjectNode(this, new util.TreeMap[String, JsonNode]()) 64 | } 65 | 66 | object JsonFileFormatMsd extends Serializable { 67 | def readFile( 68 | conf: Configuration, 69 | file: PartitionedFile, 70 | schema: StructType): Iterator[InternalRow] = { 71 | val inputStream = CodecStreams.createInputStreamWithCloseResource(conf, file.toPath) 72 | 73 | val objectMapper = JsonMapper.builder().nodeFactory(new JsonNodeFactorySortedKeys()).build() 74 | 75 | val json = objectMapper.readTree(inputStream) 76 | 77 | val jsonIterator = json.getNodeType match { 78 | case JsonNodeType.ARRAY => json.iterator() 79 | case _ => Collections.singletonList(json).iterator() 80 | } 81 | 82 | new Iterator[InternalRow] { 83 | override def hasNext: Boolean = jsonIterator.hasNext 84 | 85 | override def next(): InternalRow = { 86 | val row = jsonIterator.next().fields() 87 | val internalRow = new GenericInternalRow(schema.length) 88 | 89 | while (row.hasNext) { 90 | val item = row.next() 91 | 92 | val fieldIndex = schema.getFieldIndex(item.getKey) 93 | if (fieldIndex.getOrElse(-1) != -1) { 94 | if (item.getValue.isNull) 95 | internalRow.update(fieldIndex.get, null) 96 | else 97 | internalRow.update(fieldIndex.get, UTF8String.fromString(item.getValue.toString)) 98 | } 99 | } 100 | internalRow 101 | } 102 | } 103 | } 104 | } 105 |
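A minimal read sketch for the format above (the resource path and field names are illustrative, and format("jsonmsd") assumes the source is registered through META-INF/services; the all-string schema mirrors readFile, which materializes every value as a string):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{StringType, StructField, StructType}

val spark = SparkSession.builder().master("local[*]").getOrCreate()
// A schema is mandatory because inferSchema returns None for this format.
val schema = StructType(Seq("key1", "key2", "attribute1").map(StructField(_, StringType)))
val df = spark.read.format("jsonmsd").schema(schema)
  .load("src/test/resources/json/JsonTestSimple.json")
df.show(truncate = false)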
-------------------------------------------------------------------------------- /mega-spark-diff/src/main/scala/org/finra/msd/containers/AppleTable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.finra.msd.containers 18 | 19 | 20 | import org.apache.spark.sql.DataFrame 21 | import org.finra.msd.enums.SourceType 22 | 23 | import scala.beans.BeanProperty 24 | 25 | 26 | /** 27 | * Container class that stores the data retrieved from any supported source 28 | * 29 | * @param sourceType the type of the originating source (e.g. jdbc, hive, file) 30 | * @param dataFrame the source data parallelized as a Spark DataFrame 31 | * @param delimiter the field delimiter, for delimited text sources 32 | * @param tempViewName the temporary view name under which the DataFrame is registered 33 | */ 34 | case class AppleTable(@BeanProperty sourceType: SourceType, @BeanProperty dataFrame: DataFrame, @BeanProperty delimiter: String 35 | , @BeanProperty tempViewName: String) { 36 | 37 | } 38 | 39 |
-------------------------------------------------------------------------------- /mega-spark-diff/src/main/scala/org/finra/msd/containers/CountResult.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.finra.msd.containers 18 | 19 | import scala.beans.BeanProperty 20 | 21 | case class CountResult(@BeanProperty leftCount: Long, @BeanProperty rightCount: Long) { 22 | 23 | } 24 |
-------------------------------------------------------------------------------- /mega-spark-diff/src/main/scala/org/finra/msd/containers/DiffResult.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.finra.msd.containers 18 | 19 | import org.apache.spark.sql.{Column, DataFrame} 20 | import org.apache.spark.sql.functions._ 21 | 22 | import scala.beans.BeanProperty 23 | 24 | /*** 25 | * The container for the comparison result. 26 | * @param inLeftNotInRight the data set that contains the results in left but not in right 27 | * @param inRightNotInLeft the data set that contains the results in right but not in left 28 | */ 29 | case class DiffResult(@BeanProperty inLeftNotInRight: DataFrame, @BeanProperty inRightNotInLeft: DataFrame) { 30 | 31 | import org.finra.msd.sparkfactory.SparkFactory.sparkImplicits._ 32 | 33 | /** 34 | * Order the result by the provided columns. 35 | * 36 | * @param orderByCols the columns to order by 37 | * @param isAsc whether to sort in ascending (true) or descending (false) order. 38 | * @return the ordered data set 39 | */ 40 | def getOrderedResult(orderByCols: Array[String], isAsc: Boolean): DiffResult = { 41 | var cols: Array[Column] = orderByCols.map(str => col(str)) 42 | if (!isAsc) { 43 | cols = orderByCols.map(str => col(str).desc) 44 | } 45 | val left = inLeftNotInRight.sort(cols: _*) 46 | val right = inRightNotInLeft.sort(cols: _*) 47 | DiffResult(left, right) 48 | } 49 | 50 | /** 51 | * Exclude some columns from the data set so that they won't be saved. 52 | * 53 | * @param excludeCols the column array that contains the columns to exclude from the data set 54 | * @return the data set without the columns 55 | */ 56 | def removeCols(excludeCols: Array[String]) : DiffResult = { 57 | val left = inLeftNotInRight.drop(excludeCols:_*) 58 | val right = inRightNotInLeft.drop(excludeCols:_*) 59 | DiffResult(left, right) 60 | } 61 | 62 | /** 63 | * Indicates whether there is any difference in the comparison. 64 | * 65 | * @return true if there is no difference; false otherwise. 66 | */ 67 | def noDiff(): Boolean = { 68 | if (inLeftNotInRight.count() == 0 && inRightNotInLeft.count() == 0) { 69 | return true 70 | } 71 | false 72 | } 73 | 74 | /** 75 | * This method does a full outer join between the resulting left and right DataFrames from the method 76 | * SparkCompare.compareSchemaDataFrames. It will return a single DataFrame having the left columns prefixed with l_ 77 | * and the right columns prefixed with r_. The key columns will not be prefixed. The resulting DataFrame will have 78 | * all l_ columns on the left, then the key columns in the middle, then the r_ columns on the right. 79 | * 80 | * @param compositeKeyStrs a Sequence of Strings having the primary keys applicable for both DataFrames 81 | * @return a DataFrame having the resulting full outer join operation. 
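* Example (a sketch; the "FRUIT" key and the diff value are illustrative):
* {{{
*   val joined = diff.fullOuterJoinDataFrames(Seq("FRUIT"))
*   joined.show() // l_ columns, then FRUIT, then r_ columns
* }}}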
82 | */ 83 | def fullOuterJoinDataFrames(compositeKeyStrs: Seq[String]): DataFrame = { 84 | 85 | val compositeKeysUpperCaseSeq = compositeKeyStrs.map(k => k.toUpperCase) 86 | val compositeKeysUpperCase = compositeKeyStrs.map(k => k.toUpperCase).toSeq 87 | var tempLeft: DataFrame = inLeftNotInRight.select(inLeftNotInRight.columns.map(c => col(c).as(c.toUpperCase)): _*) 88 | var tempRight: DataFrame = inRightNotInLeft.select(inRightNotInLeft.columns.map(c => col(c).as(c.toUpperCase)): _*) 89 | 90 | for (col <- inLeftNotInRight.columns) 91 | { 92 | if (!compositeKeysUpperCaseSeq.contains(col.toUpperCase)) 93 | { 94 | tempLeft = tempLeft.withColumnRenamed(col ,"l_" + col.toUpperCase) 95 | } 96 | } 97 | 98 | for (col <- inRightNotInLeft.columns) 99 | { 100 | if (!compositeKeysUpperCaseSeq.contains(col.toUpperCase)) 101 | { 102 | tempRight = tempRight.withColumnRenamed(col ,"r_" + col.toUpperCase) 103 | } 104 | } 105 | val leftCols: Seq[String] = tempLeft.columns.filter(c => !compositeKeysUpperCase.contains(c.toUpperCase)).toSeq 106 | val rightCols: Seq[String] = tempRight.columns.filter(c => !compositeKeysUpperCase.contains(c.toUpperCase)).toSeq 107 | val joinedDf: DataFrame = tempLeft.as("l_") 108 | .join(tempRight.as("r_"), compositeKeysUpperCaseSeq, "full_outer") 109 | val allColsWithKeysInTheMiddle = leftCols.toSeq ++ compositeKeysUpperCase ++ rightCols.toSeq 110 | joinedDf.select( allColsWithKeysInTheMiddle.map(c => col(c)) :_*) 111 | } 112 | 113 | /** 114 | * This method compares all "l_" with their corresponding "r_" columns from the joined table returned in 115 | * fullOuterJoinDataFrames() and returns a DataFrame that maps column names with the amount of discrepant entries 116 | * between those l_ and r_ columns. 117 | * 118 | * @param compositeKeyStrs a Sequence of Strings having the primary keys applicable for both DataFrames 119 | * @return a DataFrame that maps between column names and the amount of discrepant entries for those "l_/r_" rows in 120 | * the full outer joined table. 
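* Example (a sketch; the key name and the diff value are illustrative):
* {{{
*   diff.discrepancyStats(Seq("FRUIT")).show()
*   // yields rows of (COLUMN_NAME, DISCREPANCIES), most discrepant columns first
* }}}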
121 | */ 122 | def discrepancyStats(compositeKeyStrs: Seq[String]): DataFrame = { 123 | 124 | val joinedDf: DataFrame = fullOuterJoinDataFrames(compositeKeyStrs) 125 | 126 | val compositeKeysUpperCase: Seq[String] = compositeKeyStrs.map(k => k.toUpperCase) 127 | val nonKeyCols: Seq[String] = inLeftNotInRight.columns.filter(c => !compositeKeysUpperCase.contains(c.toUpperCase)).toSeq.map(k => k.toUpperCase) 128 | val zColumns: Seq[String] = nonKeyCols.map( c => "z_" + c) 129 | 130 | //create new table with z_ columns that contain 0 if the corresponding l_ and r_ columns were equal, 1 otherwise 131 | var withEqFlags = joinedDf 132 | for ( c <- nonKeyCols ) 133 | withEqFlags = withEqFlags.withColumn("z_" + c, when(withEqFlags("l_" + c) === withEqFlags("r_" + c), "0").otherwise("1")) 134 | 135 | //for each column, sum the corresponding z_ column to count how many discrepancies there were 136 | var counts:Map[String,Int] = Map() 137 | for ( c <- zColumns ) 138 | counts += ( c.substring(2) -> withEqFlags.agg(sum(c)).first().getDouble(0).toInt ) 139 | 140 | //sort the columns in descending order of how many discrepancies they had 141 | val problems:Seq[String] = nonKeyCols.sortWith(counts(_) > counts(_)) 142 | 143 | //return a DataFrame containing the column discrepancy count information from above 144 | var sortedCounts:Seq[(String,Int)] = Seq() 145 | for ( c <- problems ) 146 | if ( !c.equals("RECORDREPEATCOUNT") ) // If table has keys, it shouldn't have duplicates, so this column is ignored 147 | sortedCounts = sortedCounts :+ (c,counts(c)) 148 | sortedCounts.toDF("COLUMN_NAME","DISCREPANCIES") 149 | } 150 | } 151 |
-------------------------------------------------------------------------------- /mega-spark-diff/src/main/scala/org/finra/msd/controllers/TemplateController.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.finra.msd.controllers 18 | 19 | /** 20 | * This object holds the locations and actual values of HTML templates that will be used for visualizations, 21 | * whether as reports saved to disk or as templates for use inside Databricks or Jupyter 22 | */ 23 | object TemplateController { 24 | 25 | /** 26 | * Actual HTML value as String for the horizontal table HTML template. 
The marker is #tableContent which should 27 | * be replaced by the actual HTML table 28 | */ 29 | lazy val horizontalTableTemplate: String = { 30 | s""" 31 | |[lines 31-74: HTML/CSS markup stripped during extraction; the string mirrors horizontalTableTemplate.html] 75 | |#tableContent 76 | | 77 | | 78 | """.stripMargin 79 | } 80 | } 81 |
-------------------------------------------------------------------------------- /mega-spark-diff/src/main/scala/org/finra/msd/customExceptions/ColumnNullException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.finra.msd.customExceptions 18 | 19 | final case class ColumnNullException(private val message: String = "", 20 | private val cause: Throwable = None.orNull) extends Exception(message, cause) { 21 | 22 | }
-------------------------------------------------------------------------------- /mega-spark-diff/src/main/scala/org/finra/msd/customExceptions/DataFrameNullException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.finra.msd.customExceptions 18 | 19 | final case class DataFrameNullException(private val message: String = "", 20 | private val cause: Throwable = None.orNull) extends Exception(message, cause) { 21 | 22 | }
-------------------------------------------------------------------------------- /mega-spark-diff/src/main/scala/org/finra/msd/customExceptions/InValidKeyException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.finra.msd.customExceptions 18 | 19 | final case class InValidKeyException(private val message: String = "", 20 | private val cause: Throwable = None.orNull) extends Exception(message, cause) { 21 | 22 | } 23 | -------------------------------------------------------------------------------- /mega-spark-diff/src/main/scala/org/finra/msd/customExceptions/JoinKeysNullException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.finra.msd.customExceptions 18 | 19 | final case class JoinKeysNullException(private val message: String = "", 20 | private val cause: Throwable = None.orNull) extends Exception(message, cause) { 21 | 22 | } 23 | -------------------------------------------------------------------------------- /mega-spark-diff/src/main/scala/org/finra/msd/customExceptions/SparkSessionNullException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.finra.msd.customExceptions 18 | 19 | final case class SparkSessionNullException(private val message: String = "", 20 | private val cause: Throwable = None.orNull) extends Exception(message, cause) { 21 | 22 | } -------------------------------------------------------------------------------- /mega-spark-diff/src/main/scala/org/finra/msd/implicits/DataFrameImplicits.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.finra.msd.implicits 18 | 19 | import org.apache.spark.sql.{Column, DataFrame} 20 | 21 | object DataFrameImplicits { 22 | 23 | implicit class DataFrameImprovements(df: DataFrame) { 24 | def getColumnsSeq(): Seq[Column] = { 25 | df.columns.map(c => df(c)).toSeq 26 | } 27 | } 28 | } 29 |
-------------------------------------------------------------------------------- /mega-spark-diff/src/main/scala/org/finra/msd/outputwriters/OutputWriter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.finra.msd.outputwriters 18 | 19 | import org.apache.spark.sql.DataFrame 20 | 21 | object OutputWriter { 22 | 23 | /** 24 | * Stores comparison results locally 25 | * 26 | * @param leftResult a DataFrame containing the values in the left data set and not in the right 27 | * @param rightResult a DataFrame containing the values in the right data set and not in the left 28 | * @param outputDirectory location where the comparison results are to be stored 29 | * @param singleFile whether each result should be coalesced into a single output file 30 | * @param delimiter the field delimiter used when writing the CSV output 31 | */ 32 | def saveResultsToDisk(leftResult: DataFrame, rightResult: DataFrame, 33 | outputDirectory: String, singleFile: Boolean, delimiter: String): Unit = { 34 | var left: DataFrame = leftResult 35 | var right: DataFrame = rightResult 36 | 37 | 38 | if (singleFile) { 39 | left = leftResult.coalesce(1) 40 | right = rightResult.coalesce(1) 41 | } 42 | 43 | // Write each side of the symmetric difference to its own output directory 44 | val header: Boolean = true 45 | left.write.format("com.databricks.spark.csv").option("header", header + "").option("delimiter", delimiter).mode("overwrite").save(outputDirectory + "/inLeftNotInRight") 46 | right.write.format("com.databricks.spark.csv").option("header", header + "").option("delimiter", delimiter).mode("overwrite").save(outputDirectory + "/inRightNotInLeft") 47 | 48 | } 49 | 50 | 51 | } 52 |
-------------------------------------------------------------------------------- /mega-spark-diff/src/test/java/org/finra/msd/helpers/FileHelper.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.finra.msd.helpers; 18 | 19 | import java.io.IOException; 20 | import java.net.URISyntaxException; 21 | import java.nio.charset.StandardCharsets; 22 | import java.nio.file.Files; 23 | import java.nio.file.Path; 24 | import java.nio.file.Paths; 25 | import java.util.Objects; 26 | import java.util.Set; 27 | import java.util.stream.Collectors; 28 | import java.util.stream.Stream; 29 | import org.apache.commons.io.IOUtils; 30 | 31 | public class FileHelper { 32 | public static Set<String> getFilenames(String directory, String prefix, String postfix) throws URISyntaxException { 33 | Path path = Paths.get(Objects.requireNonNull(FileHelper.class.getResource("/" + directory)).toURI()); 34 | try (Stream<Path> stream = Files.list(path)) { 35 | return stream.filter(x -> !Files.isDirectory(x) && 36 | x.getFileName().toString().startsWith(prefix) && 37 | x.toString().endsWith(postfix)) 38 | .map(Path::getFileName) 39 | .map(Path::toString) 40 | .map(x -> directory + "/" + x) 41 | .collect(Collectors.toSet()); 42 | } catch (IOException e) { 43 | throw new RuntimeException(e); 44 | } 45 | } 46 | 47 | public static String getStringFromResource(String resource) { 48 | try { 49 | return IOUtils.toString( 50 | Objects.requireNonNull(FileHelper.class.getResourceAsStream(resource)), 51 | StandardCharsets.UTF_8); 52 | } catch (IOException e) { 53 | throw new RuntimeException(e); 54 | } 55 | } 56 | } 57 |
-------------------------------------------------------------------------------- /mega-spark-diff/src/test/java/org/finra/msd/helpers/JsonDeserializerStringMap.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.finra.msd.helpers; 18 | 19 | import com.google.gson.JsonDeserializationContext; 20 | import com.google.gson.JsonDeserializer; 21 | import com.google.gson.JsonElement; 22 | import com.google.gson.JsonParseException; 23 | import java.lang.reflect.Type; 24 | import java.util.ArrayList; 25 | import java.util.List; 26 | import java.util.Map; 27 | import java.util.Map.Entry; 28 | import java.util.stream.Collectors; 29 | 30 | public class JsonDeserializerStringMap implements JsonDeserializer<List<Map<String, String>>> { 31 | 32 | @Override 33 | public List<Map<String, String>> deserialize(JsonElement json, Type typeOfT, 34 | JsonDeserializationContext context) throws JsonParseException { 35 | List<Map<String, String>> list = new ArrayList<>(); 36 | for (JsonElement element : json.getAsJsonArray()) { 37 | list.add(element.getAsJsonObject() 38 | .entrySet() 39 | .stream() 40 | .filter(x -> !x.getValue().isJsonNull()) 41 | .collect( 42 | Collectors.toMap( 43 | Entry::getKey, 44 | x -> x.getValue().toString() 45 | ))); 46 | } 47 | 48 | return list; 49 | } 50 | } 51 |
-------------------------------------------------------------------------------- /mega-spark-diff/src/test/java/org/finra/msd/helpers/JsonHelper.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.finra.msd.helpers; 18 | 19 | import com.google.gson.Gson; 20 | import com.google.gson.GsonBuilder; 21 | import com.google.gson.reflect.TypeToken; 22 | import java.lang.reflect.Type; 23 | import java.util.List; 24 | import java.util.Map; 25 | 26 | public class JsonHelper { 27 | 28 | public static Type type = new TypeToken<List<Map<String, String>>>() {}.getType(); 29 | 30 | public static Gson gson = new GsonBuilder().registerTypeAdapter( 31 | type, 32 | new JsonDeserializerStringMap() 33 | ).create(); 34 | 35 | public static List<Map<String, String>> jsonToMapList(String jsonString) { 36 | return gson.fromJson(jsonString, type); 37 | } 38 | } 39 |
-------------------------------------------------------------------------------- /mega-spark-diff/src/test/java/org/finra/msd/memorydb/MemoryDbDynamo.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.finra.msd.memorydb; 18 | 19 | import com.amazonaws.auth.AWSCredentials; 20 | import com.amazonaws.auth.AWSCredentialsProvider; 21 | import com.amazonaws.auth.BasicAWSCredentials; 22 | import com.amazonaws.client.builder.AwsClientBuilder.EndpointConfiguration; 23 | import com.amazonaws.services.dynamodbv2.AmazonDynamoDB; 24 | import com.amazonaws.services.dynamodbv2.AmazonDynamoDBClientBuilder; 25 | import com.amazonaws.services.dynamodbv2.document.DynamoDB; 26 | import com.amazonaws.services.dynamodbv2.document.Item; 27 | import com.amazonaws.services.dynamodbv2.document.Table; 28 | import com.amazonaws.services.dynamodbv2.local.main.ServerRunner; 29 | import com.amazonaws.services.dynamodbv2.local.server.DynamoDBProxyServer; 30 | import com.amazonaws.services.dynamodbv2.model.AttributeDefinition; 31 | import com.amazonaws.services.dynamodbv2.model.KeySchemaElement; 32 | import com.amazonaws.services.dynamodbv2.model.KeyType; 33 | import com.amazonaws.services.dynamodbv2.model.ProvisionedThroughput; 34 | import com.amazonaws.services.dynamodbv2.model.ScalarAttributeType; 35 | import com.google.gson.Gson; 36 | import com.google.gson.JsonArray; 37 | import com.google.gson.JsonElement; 38 | import com.google.gson.JsonPrimitive; 39 | import com.google.gson.reflect.TypeToken; 40 | import java.net.URISyntaxException; 41 | import java.util.Arrays; 42 | import java.util.Collection; 43 | import java.util.HashMap; 44 | import java.util.HashSet; 45 | import java.util.List; 46 | import java.util.Map; 47 | import java.util.Set; 48 | import org.finra.msd.helpers.FileHelper; 49 | 50 | public class MemoryDbDynamo { 51 | 52 | private static MemoryDbDynamo instance = null; 53 | private DynamoDBProxyServer server = null; 54 | 55 | private static final AmazonDynamoDB client = AmazonDynamoDBClientBuilder.standard() 56 | .withCredentials(new AWSCredentialsProvider() { 57 | @Override 58 | public AWSCredentials getCredentials() { 59 | return new BasicAWSCredentials("test", "test"); 60 | } 61 | 62 | @Override 63 | public void refresh() { 64 | 65 | } 66 | }) 67 | .withEndpointConfiguration(new EndpointConfiguration("http://localhost:8000", null)) 68 | .build(); 69 | 70 | protected MemoryDbDynamo() { 71 | // Exists to prevent external instantiation. 
72 | } 73 | 74 | public static MemoryDbDynamo getInstance() { 75 | if (instance == null) { 76 | instance = new MemoryDbDynamo(); 77 | } 78 | return instance; 79 | } 80 | 81 | public synchronized void initializeMemoryDb() throws Exception { 82 | if (server == null) { 83 | System.setProperty("sqlite4java.library.path", "target/testDependencies"); 84 | 85 | String[] localArgs = {"-inMemory", "-port", "8000"}; 86 | server = ServerRunner.createServerFromCommandLineArgs(localArgs); 87 | server.start(); 88 | 89 | stageTablesAndTestData(); 90 | } 91 | } 92 | 93 | private void stageTablesAndTestData() throws URISyntaxException { 94 | DynamoDB dynamoDB = new DynamoDB(client); 95 | 96 | Map<String, Table> tables = new HashMap<>(); 97 | 98 | Set<String> filenames = FileHelper.getFilenames("dynamodb", "DynamoDbTest", ".json"); 99 | filenames.addAll(FileHelper.getFilenames("json", "JsonTest", ".json")); 100 | filenames.addAll(FileHelper.getFilenames("compare", "JsonTest", ".json")); 101 | 102 | for (String filename : filenames) { 103 | String tableName = filename 104 | .substring(0, filename.length() - 5); 105 | tables.put(filename, 106 | dynamoDB.createTable(tableName.replace("/", "_"), 107 | Arrays.asList(new KeySchemaElement("key1", KeyType.HASH), 108 | new KeySchemaElement("key2", KeyType.RANGE)), 109 | Arrays.asList(new AttributeDefinition("key1", ScalarAttributeType.S), 110 | new AttributeDefinition("key2", ScalarAttributeType.N)), 111 | new ProvisionedThroughput(10L, 10L))); 112 | } 113 | 114 | for (Map.Entry<String, Table> entry : tables.entrySet()) { 115 | Table table = entry.getValue(); 116 | String json = FileHelper.getStringFromResource("/" + entry.getKey()); 117 | 118 | List<JsonElement> jsonList = new Gson().fromJson(json, 119 | new TypeToken<List<JsonElement>>() { 120 | }.getType()); 121 | 122 | for (JsonElement jsonRow : jsonList) { 123 | Item jsonItem = Item.fromJSON(jsonRow.toString()); 124 | jsonItem = jsonItem.withPrimaryKey("key1", jsonItem.get("key1"), "key2", 125 | Integer.valueOf(String.valueOf(jsonItem.get("key2")))); 126 | 127 | if (table.getTableName().startsWith("dynamodb_DynamoDbTestSet")) { 128 | JsonElement attribute2 = jsonRow.getAsJsonObject().get("attribute2"); 129 | if (attribute2 instanceof JsonArray) { 130 | JsonArray array = (JsonArray) attribute2; 131 | Set<Object> set = new HashSet<>(); 132 | for (int i = 0; i < array.size(); i++) { 133 | JsonPrimitive element = array.get(i).getAsJsonPrimitive(); 134 | set.add(element.isNumber() ? element.getAsNumber() : element.getAsString()); 135 | } 136 | jsonItem = jsonItem.with("attribute2", set); 137 | } 138 | } 139 | 140 | table.putItem(jsonItem); 141 | } 142 | } 143 | } 144 | 145 | public synchronized void shutdownMemoryDb() throws Exception { 146 | if (server != null) { 147 | server.stop(); 148 | server = null; 149 | } 150 | } 151 | } 152 |
-------------------------------------------------------------------------------- /mega-spark-diff/src/test/java/org/finra/msd/memorydb/MemoryDbHsql.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.finra.msd.memorydb; 18 | 19 | import com.google.gson.Gson; 20 | import com.google.gson.JsonElement; 21 | import com.google.gson.JsonObject; 22 | import com.google.gson.JsonPrimitive; 23 | import com.google.gson.reflect.TypeToken; 24 | import java.net.URISyntaxException; 25 | import java.sql.PreparedStatement; 26 | import java.util.Collection; 27 | import java.util.List; 28 | import java.util.Set; 29 | import java.util.stream.Collectors; 30 | import org.finra.msd.helpers.FileHelper; 31 | import org.hsqldb.server.Server; 32 | 33 | import java.sql.Connection; 34 | import java.sql.DriverManager; 35 | import java.sql.SQLException; 36 | import java.sql.Statement; 37 | import java.util.concurrent.TimeUnit; 38 | 39 | public class MemoryDbHsql { 40 | 41 | 42 | private static MemoryDbHsql instance = null; 43 | private Server hsqlDbServer = null; 44 | 45 | public static final String hsqlDriverName = "org.hsqldb.jdbc.JDBCDriver"; 46 | public static final String hsqlUrl = "jdbc:hsqldb:hsql://127.0.0.1:9001/testDb"; 47 | 48 | 49 | protected MemoryDbHsql() { 50 | // Exists only to defeat instantiation. 51 | } 52 | 53 | public static MemoryDbHsql getInstance() { 54 | if(instance == null) { 55 | instance = new MemoryDbHsql(); 56 | } 57 | return instance; 58 | } 59 | 60 | public synchronized void initializeMemoryDB() throws URISyntaxException, SQLException 61 | { 62 | if (hsqlDbServer == null) 63 | { 64 | hsqlDbServer = new Server(); 65 | hsqlDbServer.setDatabaseName(0, "testDb"); 66 | hsqlDbServer.setDatabasePath(0, "mem:testDb"); 67 | hsqlDbServer.setPort(9001); // this is the default port 68 | hsqlDbServer.setSilent(true); 69 | hsqlDbServer.start(); 70 | stageTablesAndTestData(); 71 | } 72 | } 73 | 74 | public int getState() 75 | { 76 | if (hsqlDbServer == null) 77 | { 78 | return 0; // meaning the server is not created yet 79 | } 80 | return hsqlDbServer.getState(); 81 | } 82 | 83 | private String getDataType(JsonElement element) { 84 | if (element.isJsonPrimitive()) { 85 | JsonPrimitive primitive = (JsonPrimitive) element; 86 | if (primitive.isBoolean()) 87 | return "boolean"; 88 | else if (primitive.isNumber()) 89 | return "int"; 90 | else 91 | return "varchar(255)"; 92 | } else 93 | return "varchar(255)"; 94 | } 95 | 96 | private String formatStringFromJson(JsonElement jsonElement) { 97 | if (jsonElement == null) return null; 98 | return jsonElement.toString().replaceAll("\\A\"(.*)\"\\z", "$1"); 99 | } 100 | 101 | private void stageTablesAndTestData() throws URISyntaxException 102 | { 103 | try { 104 | Class.forName(hsqlDriverName); 105 | } catch (ClassNotFoundException e) { 106 | e.printStackTrace(); 107 | } 108 | try (Connection conn = DriverManager.getConnection(hsqlUrl, "SA", "")) { 109 | try (Statement stmt = conn.createStatement()) { 110 | List<String> filenamesNew = FileHelper.getFilenames("jdbc", "", ".sql").stream() 111 | .sorted().collect( 112 | Collectors.toList()); 113 | filenamesNew.addAll(FileHelper.getFilenames("compare", "JsonTest", ".sql")); 114 | 115 | for (String filename : filenamesNew) { 116 | String[] statements = FileHelper.getStringFromResource("/" + filename) 117 | .split(";"); 118 | for (String statement : statements) { 119 | stmt.execute(statement); 120 | } 121 | } 122 | 123 | Set<String> filenames = FileHelper.getFilenames("dynamodb", "DynamoDbTest", 124 | ".json"); 125 | filenames.addAll(FileHelper.getFilenames("json", "JsonTest", ".json")); 126 |
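// Mirror each JSON test resource into an HSQL table whose name is derived
// from the resource path (e.g. "json/JsonTestSimple.json" -> "json_JsonTestSimple");
// key1/key2 form the key and all attributes are stored as varchar.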
127 | for (String filename : filenames) { 128 | String tableName = filename 129 | .substring(0, filename.length() - 5).replace("/", "_"); 130 | stmt.execute("DROP TABLE IF EXISTS " + tableName + ";"); 131 | 132 | String json = FileHelper.getStringFromResource("/" + filename); 133 | 134 | List<JsonElement> jsonList = new Gson().fromJson(json, 135 | new TypeToken<List<JsonElement>>() { 136 | }.getType()); 137 | 138 | stmt.execute("CREATE TABLE " + tableName + "(\n" + 139 | " key1 varchar(255),\n" + 140 | " key2 int,\n" + 141 | " attribute1 varchar(255),\n" + 142 | " attribute2 varchar(255),\n" + 143 | " attribute3 varchar(255));"); 144 | 145 | for (JsonElement jsonRow : jsonList) { 146 | JsonObject jsonObject = jsonRow.getAsJsonObject(); 147 | try (PreparedStatement preparedStatement = conn.prepareStatement( 148 | "INSERT INTO " + tableName + " values(?, ?, ?, ?, ?)")) { 151 | preparedStatement.setString(1, jsonObject.get("key1").getAsString()); 152 | preparedStatement.setInt(2, jsonObject.get("key2").getAsInt()); 153 | preparedStatement.setString(3, 154 | formatStringFromJson(jsonObject.get("attribute1"))); 155 | preparedStatement.setObject(4, 156 | formatStringFromJson(jsonObject.get("attribute2"))); 157 | preparedStatement.setString(5, 158 | formatStringFromJson(jsonObject.get("attribute3"))); 159 | preparedStatement.executeUpdate(); 160 | } 161 | } 162 | } 163 | } 164 | } catch (SQLException e) { 165 | e.printStackTrace(); 166 | } 167 | } 168 | 169 | public synchronized void shutdownMemoryDb() 170 | { 171 | hsqlDbServer.shutdown(); 172 | while (hsqlDbServer.getState() != 16) // 16 is the HSQLDB SHUTDOWN server state 173 | { 174 | try { 175 | TimeUnit.MILLISECONDS.sleep(5); 176 | } catch (InterruptedException e) { 177 | e.printStackTrace(); 178 | } 179 | } 180 | hsqlDbServer = null; 181 | } 182 | 183 | } 184 |
-------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/compare/JsonTestMapList.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "key1": "TEST1", 4 | "key2": 1, 5 | "attribute1": "test number 1", 6 | "attribute2": { 7 | "element1": "1" 8 | }, 9 | "attribute3": [ 10 | "1" 11 | ] 12 | }, 13 | { 14 | "key1": "TEST2", 15 | "key2": 1, 16 | "attribute1": "test number 2", 17 | "attribute2": { 18 | "element1": "2" 19 | }, 20 | "attribute3": [ 21 | "2" 22 | ] 23 | }, 24 | { 25 | "key1": "TEST3", 26 | "key2": 3, 27 | "attribute1": "test number 3", 28 | "attribute2": { 29 | "element1": "3", 30 | "element2": "4" 31 | }, 32 | "attribute3": [ 33 | "3", 34 | "4" 35 | ] 36 | } 37 | ]
-------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/compare/JsonTestMapList.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS compare_JsonTestMapList; 2 | 3 | CREATE TABLE compare_JsonTestMapList ( 4 | key1 VARCHAR(255), 5 | key2 INT, 6 | attribute1 VARCHAR(255), 7 | attribute2 VARCHAR(255), 8 | attribute3 VARCHAR(255) 9 | ); 10 | 11 | INSERT INTO compare_JsonTestMapList VALUES('TEST1', 1, 'test number 1', '{"element1":"1"}', '["1"]'); 12 | INSERT INTO compare_JsonTestMapList VALUES('TEST2', 1, 'test number 2', '{"element1":"2"}', '["2"]'); 13 | INSERT INTO compare_JsonTestMapList VALUES('TEST3', 3, 'test number 3', '{"element1":"3","element2":"4"}', '["3","4"]');
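JsonTestMapList.sql stages the same logical rows as JsonTestMapList.json and JsonTestMapList.txt, so JDBC- and file-backed sources can be diffed against one another. A minimal comparison sketch (parallelizeTextSource appears in Launcher above; SparkCompare.compareAppleTables and the resource paths are assumptions made for illustration):

import org.finra.msd.sparkcompare.SparkCompare
import org.finra.msd.sparkfactory.SparkFactory

// Parallelize the same delimited file under two view names and diff the sides;
// identical inputs should yield an empty diff.
val left = SparkFactory.parallelizeTextSource("src/test/resources/compare/JsonTestMapList.txt", "left_view")
val right = SparkFactory.parallelizeTextSource("src/test/resources/compare/JsonTestMapList.txt", "right_view")
val result = SparkCompare.compareAppleTables(left, right) // entry point assumed
assert(result.noDiff())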
-------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/compare/JsonTestMapList.txt: -------------------------------------------------------------------------------- 1 | "test number 1";{"element1":"1"};["1"];"TEST1";1 2 | "test number 2";{"element1":"2"};["2"];"TEST2";1 3 | "test number 3";{"element1":"3","element2":"4"};["3","4"];"TEST3";3 -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/compare/JsonTestMapListDiffValue.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "key1": "TEST1", 4 | "key2": 1, 5 | "attribute1": "test number 1", 6 | "attribute2": { 7 | "element1": "1" 8 | }, 9 | "attribute3": [ 10 | "1" 11 | ] 12 | }, 13 | { 14 | "key1": "TEST2", 15 | "key2": 1, 16 | "attribute1": "test number 2", 17 | "attribute2": { 18 | "element1": "2" 19 | }, 20 | "attribute3": [ 21 | "2" 22 | ] 23 | }, 24 | { 25 | "key1": "TEST3", 26 | "key2": 3, 27 | "attribute1": "test number 3", 28 | "attribute2": { 29 | "element1": "3", 30 | "element2": "5" 31 | }, 32 | "attribute3": [ 33 | "3", 34 | "4" 35 | ] 36 | } 37 | ] -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/csv/TestCSV.txt: -------------------------------------------------------------------------------- 1 | "A,p\"p\" 2 | le"|5|10|Red 3 | "Ba\"n\" 4 | ana"|4|8|Yellow 5 | Orange|2|9|Blue 6 | Kiwi|8|7|Fuzzy-Green 7 | Watermelon|3|11|Green 8 | Mango|6|12|Yellow 9 | Papaya|190534|4|I forget 10 | Strawberry|5|10|Acne 11 | Plum|8261|6|Purple 12 | Tomato|0|0|Red 13 | "Text,with comma 14 | and return char"|2|4|Blue -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/csv/TestCSV_2.txt: -------------------------------------------------------------------------------- 1 | "A,p\"p\" 2 | le","5","10","Red" 3 | "Ba\"n\" 4 | ana","4","8","Yellow" 5 | "Orange","2","9","Blue" 6 | "Kiwi","8","7","Fuzzy-Green" 7 | "Watermelon","3","11","Green" 8 | "Mango","6","12","Yellow" 9 | "Papaya","190534","4","I forget" 10 | "Strawberry","5","10","Acne" 11 | "Plum","8261","6","Purple" 12 | "Tomato","0","0","Red" 13 | "Text,with comma 14 | and return char","2","4","Blue" -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/csv/TestCSV_commas.txt: -------------------------------------------------------------------------------- 1 | "A,p""p"" 2 | le",5,10,Red 3 | "A,p\"p\" 4 | le",5,10,Red 5 | "Ba""n"" 6 | ana",4,8,Yellow 7 | Orange,2,9,Blue 8 | Kiwi,8,7,Fuzzy-Green 9 | Watermelon,3,11,Green 10 | Mango,6,12,Yellow 11 | Papaya,190534,4,I forget 12 | Strawberry,5,10,Acne 13 | Plum,8261,6,Purple 14 | Tomato,0,0,Red 15 | "Text,with comma 16 | and return char",2,4,Blue -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/csv/TestCSV_pipes.txt: -------------------------------------------------------------------------------- 1 | "A|p""p"" 2 | le"|5|10|Red 3 | "A|p\"p\" 4 | le"|5|10|Red 5 | "Ba""n"" 6 | ana"|4|8|Yellow 7 | Orange|2|9|Blue 8 | Kiwi|8|7|Fuzzy-Green 9 | Watermelon|3|11|Green 10 | Mango|6|12|Yellow 11 | Papaya|190534|4|I forget 12 | Strawberry|5|10|Acne 13 | Plum|8261|6|Purple 14 | Tomato|0|0|Red 15 | "Text,with comma 16 | and return char"|2|4|Blue -------------------------------------------------------------------------------- 
/mega-spark-diff/src/test/resources/dynamodb/DynamoDbTestSet.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "key1": "TEST1", 4 | "key2": 1, 5 | "attribute1": "test number 1", 6 | "attribute2": [ 7 | "1" 8 | ], 9 | "attribute3": "1" 10 | }, 11 | { 12 | "key1": "TEST2", 13 | "key2": 1, 14 | "attribute1": "test number 2", 15 | "attribute2": [ 16 | "2" 17 | ], 18 | "attribute3": "2" 19 | }, 20 | { 21 | "key1": "TEST3", 22 | "key2": 3, 23 | "attribute1": "test number 3", 24 | "attribute2": [ 25 | "3", 26 | "4" 27 | ], 28 | "attribute3": "3" 29 | } 30 | ] -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/dynamodb/DynamoDbTestSet.txt: -------------------------------------------------------------------------------- 1 | "test number 1";["1"];"1";"TEST1";1 2 | "test number 2";["2"];"2";"TEST2";1 3 | "test number 3";["3","4"];"3";"TEST3";3 -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/dynamodb/DynamoDbTestSetBrackets.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "key1": "TEST1", 4 | "key2": 1, 5 | "attribute1": "test number 1", 6 | "attribute2": [ 7 | "1" 8 | ], 9 | "attribute3": "1" 10 | }, 11 | { 12 | "key1": "TEST2", 13 | "key2": 1, 14 | "attribute1": "test number 2", 15 | "attribute2": [ 16 | "[2]" 17 | ], 18 | "attribute3": "2" 19 | }, 20 | { 21 | "key1": "TEST3", 22 | "key2": 3, 23 | "attribute1": "test number 3", 24 | "attribute2": [ 25 | "3", 26 | "4" 27 | ], 28 | "attribute3": "3" 29 | } 30 | ] -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/dynamodb/DynamoDbTestSetBrackets.txt: -------------------------------------------------------------------------------- 1 | "test number 1";["1"];"1";"TEST1";1 2 | "test number 2";["[2]"];"2";"TEST2";1 3 | "test number 3";["3","4"];"3";"TEST3";3 -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/dynamodb/DynamoDbTestSetDiffElementOrder.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "key1": "TEST1", 4 | "key2": 1, 5 | "attribute1": "test number 1", 6 | "attribute2": [ 7 | "1" 8 | ], 9 | "attribute3": "1" 10 | }, 11 | { 12 | "key1": "TEST2", 13 | "key2": 1, 14 | "attribute1": "test number 2", 15 | "attribute2": [ 16 | "2" 17 | ], 18 | "attribute3": "2" 19 | }, 20 | { 21 | "key1": "TEST3", 22 | "key2": 3, 23 | "attribute1": "test number 3", 24 | "attribute2": [ 25 | "4", 26 | "3" 27 | ], 28 | "attribute3": "3" 29 | } 30 | ] -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/dynamodb/DynamoDbTestSetDiffElementOrder.txt: -------------------------------------------------------------------------------- 1 | "test number 1";["1"];"1";"TEST1";1 2 | "test number 2";["2"];"2";"TEST2";1 3 | "test number 3";["3","4"];"3";"TEST3";3 -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/dynamodb/DynamoDbTestSetMixed.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "key1": "TEST1", 4 | "key2": 1, 5 | "attribute1": "test number 1", 6 | "attribute2": 1, 7 | "attribute3": "1" 8 | }, 9 | { 10 | "key1": "TEST2", 11 | "key2": 1, 12 | "attribute1": "test number 2", 
13 | "attribute2": [ 14 | "2" 15 | ], 16 | "attribute3": "2" 17 | }, 18 | { 19 | "key1": "TEST3", 20 | "key2": 3, 21 | "attribute1": "test number 3", 22 | "attribute2": [ 23 | "3", 24 | "4" 25 | ], 26 | "attribute3": "3" 27 | } 28 | ] -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/dynamodb/DynamoDbTestSetMixed.txt: -------------------------------------------------------------------------------- 1 | "test number 1";1;"1";"TEST1";1 2 | "test number 2";["2"];"2";"TEST2";1 3 | "test number 3";["3","4"];"3";"TEST3";3 -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/jdbc/EnhancedFruit1.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS EnhancedFruit1; 2 | 3 | CREATE TABLE EnhancedFruit1 ( 4 | Fruit VARCHAR(255), 5 | Price DOUBLE, 6 | Ripeness INT, 7 | Color VARCHAR(255), 8 | ImportDate DATE, 9 | ImportTimeStamp TIMESTAMP, 10 | Status BOOLEAN, 11 | BValues BLOB, 12 | CValues CLOB 13 | ); 14 | 15 | INSERT INTO EnhancedFruit1 VALUES('Mango', 6.45, 12, 'Yellow', '2017-05-20', '2017-05-20 10:22:10', FALSE, X'01FF', 'clob'); 16 | INSERT INTO EnhancedFruit1 VALUES('Papaya', 190534.12, 4, 'I forget', '2017-05-20', '2017-05-20 10:22:10', TRUE, X'01FF', 'clob'); 17 | INSERT INTO EnhancedFruit1 VALUES('Kiwi', 8.83, 7, 'Fuzzy-Green', '2017-05-20', '2017-05-20 10:22:10', TRUE, X'01FF', 'clob'); 18 | INSERT INTO EnhancedFruit1 VALUES('Watermelon', null, 11, null, '2017-05-20', '2017-05-20 10:22:10', TRUE, X'01FF', 'clob'); -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/jdbc/EnhancedFruit2.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS EnhancedFruit2; 2 | 3 | CREATE TABLE EnhancedFruit2 ( 4 | Fruit VARCHAR(255), 5 | Price DOUBLE, 6 | Ripeness INT, 7 | Color VARCHAR(255), 8 | ImportDate DATE, 9 | ImportTimeStamp TIMESTAMP, 10 | Status BOOLEAN, 11 | BValues BLOB, 12 | CValues CLOB 13 | ); 14 | 15 | INSERT INTO EnhancedFruit2 VALUES('Mango', 6.11, 12, '', '2017-05-20', '2017-05-20 10:22:10', TRUE, X'01FF', 'clob'); 16 | INSERT INTO EnhancedFruit2 VALUES('Papaya', 190534.12, 4, 'I forget', '2017-05-20', '2017-05-20 10:22:10', TRUE, X'01FF', 'clob'); 17 | INSERT INTO EnhancedFruit2 VALUES('Strawberry', 5.89, 10, 'Acne', '2017-05-20', '2017-05-20 10:22:10', TRUE, X'01FF', 'clob'); 18 | INSERT INTO EnhancedFruit2 VALUES('Plum', 8261.05, 6, 'Purple', '2017-05-20', '2017-05-20 10:22:10', FALSE, X'01FF', 'clob'); 19 | INSERT INTO EnhancedFruit2 VALUES('Tomato', 0.9, 0, 'Red', '2017-05-20', '2017-05-20 10:22:10', TRUE, X'01FF', 'clob'); 20 | INSERT INTO EnhancedFruit2 VALUES('Watermelon', null, 11, null, '2017-05-21', '2017-05-21 8:10:18', FALSE, X'01FF', 'clob'); -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/jdbc/Fruit1.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS Fruit1; 2 | 3 | CREATE TABLE Fruit1 ( 4 | Fruit VARCHAR(255), 5 | Price INT, 6 | Ripeness INT, 7 | Color VARCHAR(255) 8 | ); 9 | 10 | INSERT INTO Fruit1 VALUES('Apple', 5, 10, 'Red'); 11 | INSERT INTO Fruit1 VALUES('Banana', 4, 8, 'Yellow'); 12 | INSERT INTO Fruit1 VALUES('Orange', 2, 9, 'Blue'); 13 | INSERT INTO Fruit1 VALUES('Kiwi', 8, 7, 'Fuzzy-Green'); 14 | INSERT INTO Fruit1 VALUES('Watermelon', 
3, 11, 'Green'); 15 | INSERT INTO Fruit1 VALUES('Mango', 6, 12, 'Yellow'); 16 | INSERT INTO Fruit1 VALUES('Papaya', 190534, 4, 'I forget'); 17 | INSERT INTO Fruit1 VALUES('Strawberry', 5, 10, 'Acne'); 18 | INSERT INTO Fruit1 VALUES('Plum', 8261, 6, 'Purple'); 19 | INSERT INTO Fruit1 VALUES('Tomato', 0, 0, 'Red'); -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/jdbc/Fruit2.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS Fruit2; 2 | 3 | CREATE TABLE Fruit2 ( 4 | Fruit VARCHAR(255), 5 | Price INT, 6 | Ripeness INT, 7 | Color VARCHAR(255) 8 | ); 9 | 10 | INSERT INTO Fruit2 VALUES('Apple', 5, 10, 'Red'); 11 | INSERT INTO Fruit2 VALUES('Banana', 4, 8, 'Yellow'); 12 | INSERT INTO Fruit2 VALUES('Orange', 2, 9, 'Blue'); 13 | INSERT INTO Fruit2 VALUES('Kiwi', 8, 7, 'Fuzzy-Green'); 14 | INSERT INTO Fruit2 VALUES('Watermelon', 3, 11, 'Green'); 15 | INSERT INTO Fruit2 VALUES('Mango', 6, 12, 'Yellow'); 16 | INSERT INTO Fruit2 VALUES('Papaya', 190534, 4, 'I forget'); 17 | INSERT INTO Fruit2 VALUES('Strawberry', 5, 10, 'Acne'); 18 | INSERT INTO Fruit2 VALUES('Plum', 8261, 6, 'Purple'); 19 | INSERT INTO Fruit2 VALUES('Tomato', 0, 0, 'Red'); -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/jdbc/Fruit3.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS Fruit3; 2 | 3 | CREATE TABLE Fruit3 ( 4 | Fruit VARCHAR(255), 5 | Price INT, 6 | Ripeness INT, 7 | Color VARCHAR(255) 8 | ); 9 | 10 | INSERT INTO Fruit3 VALUES('Apple', 5, 10, 'Red'); 11 | INSERT INTO Fruit3 VALUES('Banana', 4, 8, 'Yellow'); 12 | INSERT INTO Fruit3 VALUES('Orange', 2, -9, 'Blue'); --diff 13 | INSERT INTO Fruit3 VALUES('Kiwi', 8, 7, 'Fuzzy-Green'); 14 | INSERT INTO Fruit3 VALUES('Watermelon', 3, 11, 'Green'); 15 | INSERT INTO Fruit3 VALUES('Mango', 6, 12, 'Yellow'); 16 | INSERT INTO Fruit3 VALUES('Papaya', 190534, 4, 'I remember now'); --diff 17 | INSERT INTO Fruit3 VALUES('Strawberry', 5, 10, 'Acne'); 18 | INSERT INTO Fruit3 VALUES('Plum', 8261, 6, 'Purple'); 19 | INSERT INTO Fruit3 VALUES('Tomato', 0, 0, 'Red'); -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/jdbc/Fruit4.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS Fruit4; 2 | 3 | CREATE TABLE Fruit4 ( 4 | Fruit VARCHAR(255), 5 | Price INT, 6 | Ripeness INT, 7 | Color VARCHAR(255) 8 | ); 9 | 10 | INSERT INTO Fruit4 VALUES('Apple', 5, 10, 'Red'); 11 | INSERT INTO Fruit4 VALUES('Banana', 4, 8, 'Yellow'); 12 | INSERT INTO Fruit4 VALUES('Orange', 2, 9, 'Blue'); 13 | INSERT INTO Fruit4 VALUES('Kiwi', 8, 7, 'Fuzzy-Green'); 14 | INSERT INTO Fruit4 VALUES('Watermelon', 3, 11, 'Green'); -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/jdbc/Fruit5.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS Fruit5; 2 | 3 | CREATE TABLE Fruit5 ( 4 | Fruit VARCHAR(255), 5 | Price INT, 6 | Ripeness INT, 7 | Color VARCHAR(255) 8 | ); 9 | 10 | INSERT INTO Fruit5 VALUES('Mango', 6, 12, 'Yellow'); 11 | INSERT INTO Fruit5 VALUES('Papaya', 190534, 4, 'I forget'); 12 | INSERT INTO Fruit5 VALUES('Strawberry', 5, 10, 'Acne'); 13 | INSERT INTO Fruit5 VALUES('Plum', 8261, 6, 'Purple'); 14 | INSERT INTO Fruit5 
VALUES('Tomato', 0, 0, 'Red'); -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/jdbc/JdbcTestSimpleToJson.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS JdbcTestSimpleToJson; 2 | 3 | CREATE TABLE JdbcTestSimpleToJson ( 4 | Fruit VARCHAR(255), 5 | Price DOUBLE, 6 | Ripeness INT, 7 | Color VARCHAR(255), 8 | ImportDate DATE, 9 | ImportTimeStamp TIMESTAMP, 10 | Status BOOLEAN, 11 | BValues BLOB, 12 | CValues CLOB 13 | ); 14 | 15 | INSERT INTO JdbcTestSimpleToJson VALUES('Mango', 6.45, 12, 'Yellow', '2017-05-20', '2017-05-20 10:22:10', FALSE, X'01FF', 'clob'); 16 | INSERT INTO JdbcTestSimpleToJson VALUES('Papaya', 190534.12, 4, '["I forget","I do not remember"]', '2017-05-20', '2017-05-20 10:22:10', TRUE, X'01FF', 'clob'); 17 | INSERT INTO JdbcTestSimpleToJson VALUES('Kiwi', 8.83, 7, '{"inside":"Fuzzy-Green","outside":"Brown"}', '2017-05-20', '2017-05-20 10:22:10', TRUE, X'01FF', 'clob'); 18 | INSERT INTO JdbcTestSimpleToJson VALUES('Watermelon', null, 11, null, '2017-05-20', '2017-05-20 10:22:10', TRUE, X'01FF', 'clob'); -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/jdbc/Persons1.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS Persons1; 2 | 3 | CREATE TABLE Persons1 ( 4 | PersonID INT, 5 | LastName VARCHAR(255), 6 | FirstName VARCHAR(255), 7 | Address VARCHAR(255), 8 | City VARCHAR(255) 9 | ); 10 | 11 | INSERT INTO Persons1 VALUES(1,'Garcia', 'Carlos', 'lives somewhere', 'Rockville'); 12 | INSERT INTO Persons1 VALUES(2,'Patel', 'Shraddha', 'lives somewhere', 'Maryland'); -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/json/JsonTestList.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "key1": "TEST1", 4 | "key2": 1, 5 | "attribute1": "test number 1", 6 | "attribute2": [ 7 | "1" 8 | ], 9 | "attribute3": "1" 10 | }, 11 | { 12 | "key1": "TEST2", 13 | "key2": 1, 14 | "attribute1": "test number 2", 15 | "attribute2": [ 16 | "2" 17 | ], 18 | "attribute3": "2" 19 | }, 20 | { 21 | "key1": "TEST3", 22 | "key2": 3, 23 | "attribute1": "test number 3", 24 | "attribute2": [ 25 | "3", 26 | "4" 27 | ], 28 | "attribute3": "3" 29 | } 30 | ] -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/json/JsonTestList.txt: -------------------------------------------------------------------------------- 1 | "test number 1";["1"];"1";"TEST1";1 2 | "test number 2";["2"];"2";"TEST2";1 3 | "test number 3";["3","4"];"3";"TEST3";3 -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/json/JsonTestListBrackets.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "key1": "TEST1", 4 | "key2": 1, 5 | "attribute1": "test number 1", 6 | "attribute2": [ 7 | "1" 8 | ], 9 | "attribute3": "1" 10 | }, 11 | { 12 | "key1": "TEST2", 13 | "key2": 1, 14 | "attribute1": "test number 2", 15 | "attribute2": [ 16 | "[2]" 17 | ], 18 | "attribute3": "2" 19 | }, 20 | { 21 | "key1": "TEST3", 22 | "key2": 3, 23 | "attribute1": "test number 3", 24 | "attribute2": [ 25 | "3", 26 | "4" 27 | ], 28 | "attribute3": "3" 29 | } 30 | ] 
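The JSON fixtures in this directory are fed through SparkFactory.parallelizeJSONSource, which takes a file path, a temp-view name, and the field names to project. A minimal sketch, assuming the field names are discovered by letting Spark infer the schema first (the same trick the DynamoDB suites below use); the view name is illustrative:

import org.finra.msd.sparkfactory.SparkFactory
import org.finra.msd.sparkfactory.SparkFactory.sparkSession

val path = getClass.getClassLoader.getResource("json/JsonTestList.json").getPath
// Infer the schema once to obtain the field names, reading primitives as strings.
val fieldNames = sparkSession.sqlContext.read
  .option("multiLine", "true")
  .option("primitivesAsString", "true")
  .json(path).schema.fieldNames
val jsonTable = SparkFactory.parallelizeJSONSource(path, "json_left", fieldNames)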
-------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/json/JsonTestListBrackets.txt: -------------------------------------------------------------------------------- 1 | "test number 1";["1"];"1";"TEST1";1 2 | "test number 2";["[2]"];"2";"TEST2";1 3 | "test number 3";["3","4"];"3";"TEST3";3 -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/json/JsonTestListMixedType.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "key1": "TEST1", 4 | "key2": 1, 5 | "attribute1": "test number 1", 6 | "attribute2": 1, 7 | "attribute3": "1" 8 | }, 9 | { 10 | "key1": "TEST2", 11 | "key2": 1, 12 | "attribute1": "test number 2", 13 | "attribute2": [ 14 | "2" 15 | ], 16 | "attribute3": "2" 17 | }, 18 | { 19 | "key1": "TEST3", 20 | "key2": 3, 21 | "attribute1": "test number 3", 22 | "attribute2": [ 23 | "3", 24 | "4" 25 | ], 26 | "attribute3": "3" 27 | } 28 | ] -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/json/JsonTestListMixedType.txt: -------------------------------------------------------------------------------- 1 | "test number 1";1;"1";"TEST1";1 2 | "test number 2";["2"];"2";"TEST2";1 3 | "test number 3";["3","4"];"3";"TEST3";3 -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/json/JsonTestMap.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "key1": "TEST1", 4 | "key2": 1, 5 | "attribute1": "test number 1", 6 | "attribute2": { 7 | "element1": "1" 8 | }, 9 | "attribute3": "1" 10 | }, 11 | { 12 | "key1": "TEST2", 13 | "key2": 1, 14 | "attribute1": "test number 2", 15 | "attribute2": { 16 | "element1": "2" 17 | }, 18 | "attribute3": "2" 19 | }, 20 | { 21 | "key1": "TEST3", 22 | "key2": 3, 23 | "attribute1": "test number 3", 24 | "attribute2": { 25 | "element1": "3", 26 | "element2": "4" 27 | }, 28 | "attribute3": "3" 29 | } 30 | ] -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/json/JsonTestMap.txt: -------------------------------------------------------------------------------- 1 | "test number 1";{"element1":"1"};"1";"TEST1";1 2 | "test number 2";{"element1":"2"};"2";"TEST2";1 3 | "test number 3";{"element1":"3","element2":"4"};"3";"TEST3";3 -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/json/JsonTestMapMixedType.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "key1": "TEST1", 4 | "key2": 1, 5 | "attribute1": "test number 1", 6 | "attribute2": 1, 7 | "attribute3": "1" 8 | }, 9 | { 10 | "key1": "TEST2", 11 | "key2": 1, 12 | "attribute1": "test number 2", 13 | "attribute2": { 14 | "element1": "2" 15 | }, 16 | "attribute3": "2" 17 | }, 18 | { 19 | "key1": "TEST3", 20 | "key2": 3, 21 | "attribute1": "test number 3", 22 | "attribute2": { 23 | "element1": "3", 24 | "element2": "4" 25 | }, 26 | "attribute3": "3" 27 | } 28 | ] -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/json/JsonTestMapMixedType.txt: -------------------------------------------------------------------------------- 1 | "test number 1";1;"1";"TEST1";1 2 | "test number 2";{"element1":"2"};"2";"TEST2";1 3 | "test number 
3";{"element1":"3","element2":"4"};"3";"TEST3";3 -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/json/JsonTestSimple.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "key1": "TEST1", 4 | "key2": 1, 5 | "attribute1": "test number 1", 6 | "attribute2": "1", 7 | "attribute3": "1" 8 | }, 9 | { 10 | "key1": "TEST2", 11 | "key2": 1, 12 | "attribute1": "test number 2", 13 | "attribute2": "2", 14 | "attribute3": "2" 15 | }, 16 | { 17 | "key1": "TEST3", 18 | "key2": 3, 19 | "attribute1": "test number 3", 20 | "attribute2": "true", 21 | "attribute3": "3" 22 | } 23 | ] -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/json/JsonTestSimple.txt: -------------------------------------------------------------------------------- 1 | "test number 1";"1";"1";"TEST1";1 2 | "test number 2";"2";"2";"TEST2";1 3 | "test number 3";"true";"3";"TEST3";3 -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/json/JsonTestSimpleExtraNull.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "key1": "TEST1", 4 | "key2": 1, 5 | "attribute1": "test number 1", 6 | "attribute2": "1", 7 | "attribute3": "1" 8 | }, 9 | { 10 | "key1": "TEST2", 11 | "key2": 1, 12 | "attribute1": "test number 2", 13 | "attribute2": "2", 14 | "attribute3": "2", 15 | "attribute4": null 16 | }, 17 | { 18 | "key1": "TEST3", 19 | "key2": 3, 20 | "attribute1": "test number 3", 21 | "attribute2": "true", 22 | "attribute3": "3" 23 | } 24 | ] -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/json/JsonTestSimpleExtraNull.txt: -------------------------------------------------------------------------------- 1 | "test number 1";"1";"1";;"TEST1";1 2 | "test number 2";"2";"2";;"TEST2";1 3 | "test number 3";"true";"3";;"TEST3";3 -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/json/JsonTestSimpleMissingElement.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "key1": "TEST1", 4 | "key2": 1, 5 | "attribute1": "test number 1", 6 | "attribute2": "1", 7 | "attribute3": "1" 8 | }, 9 | { 10 | "key1": "TEST2", 11 | "key2": 1, 12 | "attribute1": "test number 2", 13 | "attribute3": "2" 14 | }, 15 | { 16 | "key1": "TEST3", 17 | "key2": 3, 18 | "attribute1": "test number 3", 19 | "attribute2": "true", 20 | "attribute3": "3" 21 | } 22 | ] -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/json/JsonTestSimpleMissingElement.txt: -------------------------------------------------------------------------------- 1 | "test number 1";"1";"1";"TEST1";1 2 | "test number 2";;"2";"TEST2";1 3 | "test number 3";"true";"3";"TEST3";3 -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/json/JsonTestSimpleMixedType.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "key1": "TEST1", 4 | "key2": 1, 5 | "attribute1": "test number 1", 6 | "attribute2": "1", 7 | "attribute3": "1" 8 | }, 9 | { 10 | "key1": "TEST2", 11 | "key2": 1, 12 | "attribute1": "test number 2", 13 | "attribute2": 2, 14 | "attribute3": "2" 15 | }, 16 | { 17 | "key1": 
"TEST3", 18 | "key2": 3, 19 | "attribute1": "test number 3", 20 | "attribute2": true, 21 | "attribute3": "3" 22 | } 23 | ] -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/json/JsonTestSimpleMixedType.txt: -------------------------------------------------------------------------------- 1 | "test number 1";"1";"1";"TEST1";1 2 | "test number 2";2;"2";"TEST2";1 3 | "test number 3";true;"3";"TEST3";3 -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/txt/Fruit1.txt: -------------------------------------------------------------------------------- 1 | Apple,5,10,Red 2 | Banana,4,8,Yellow 3 | Orange,2,9,Blue 4 | Kiwi,8,7,Fuzzy-Green 5 | Watermelon,3,11,Green 6 | Mango,6,12,Yellow 7 | Papaya,190534,4,I forget 8 | Strawberry,5,10,Acne 9 | Plum,8261,6,Purple 10 | Tomato,0,0,Red -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/txt/Fruit2.txt: -------------------------------------------------------------------------------- 1 | Apple,5,10,Red 2 | Banana,4,8,Yellow 3 | Orange,2,9,Blue 4 | Kiwi,8,7,Fuzzy-Green 5 | Watermelon,3,11,Green 6 | Mango,6,12,Yellow 7 | Papaya,190534,4,I forget 8 | Strawberry,5,10,Acne 9 | Plum,8261,6,Purple 10 | Tomato,0,0,Red -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/txt/Fruit3.txt: -------------------------------------------------------------------------------- 1 | Apple,5,10,Red 2 | Banana,4,8,Yellow 3 | Orange,2,-9,Blue 4 | Kiwi,8,7,Fuzzy-Green 5 | Watermelon,3,11,Green 6 | Mango,6,12,Yellow 7 | Papaya,190534,4,I remember now 8 | Strawberry,5,10,Acne 9 | Plum,8261,6,Purple 10 | Tomato,0,0,Red -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/txt/Fruit4.txt: -------------------------------------------------------------------------------- 1 | Apple,5,10,Red 2 | Banana,4,8,Yellow 3 | Orange,2,9,Blue 4 | Kiwi,8,7,Fuzzy-Green 5 | Watermelon,3,11,Green 6 | Mango,6,12,Yellow -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/txt/Fruit5.txt: -------------------------------------------------------------------------------- 1 | Mango,6,12,Yellow 2 | Papaya,190534,4,I forget 3 | Strawberry,5,10,Acne 4 | Plum,8261,6,Purple 5 | Tomato,0,0,Red -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/txt/Fruit6.txt: -------------------------------------------------------------------------------- 1 | Apple,5,10,Green 2 | Banana,4,8,Yellow 3 | Orange,2,9,Blue 4 | Kiwi,8,7,Fuzzy-Green 5 | Watermelon,3,11,Green -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/txt/TC1DiffsAndDups1.txt: -------------------------------------------------------------------------------- 1 | USA 2 | Australia 3 | France 4 | Germany 5 | Germany 6 | NoCountry -------------------------------------------------------------------------------- /mega-spark-diff/src/test/resources/txt/TC5NullsAndEmptyData1.txt: -------------------------------------------------------------------------------- 1 | Australia, 2 | USA, 3 | France|11 4 | Germany,22 5 | NoCountry,12 -------------------------------------------------------------------------------- 
/mega-spark-diff/src/test/resources/txt/TC5NullsAndEmptyData2.txt: -------------------------------------------------------------------------------- 1 | Australia, 2 | USA,NULL 3 | France|11 4 | Germany,22 5 | NoCountry,12 -------------------------------------------------------------------------------- /mega-spark-diff/src/test/scala/org/finra/msd/basetestclasses/JsonFormatTests.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.finra.msd.basetestclasses 18 | 19 | import org.finra.msd.containers.DiffResult 20 | 21 | trait JsonFormatTests { 22 | this: SparkFunSuite => 23 | def returnDiff(tableLeft: String, tableRight: String, sameSchema: Boolean = false): DiffResult 24 | 25 | def testSameDataTypesComplexJsonFormat(tableLeft: String, tableRight: String): Unit = { 26 | test("testSameDataTypesComplex") { 27 | val expectedDiffs = 0 28 | val diffResult = returnDiff(tableLeft, tableRight) 29 | helpers.reportDiffs(diffResult, expectedDiffs) 30 | } 31 | } 32 | 33 | def testSameDataTypesComplexDiffValueJsonFormat(tableLeft: String, tableRight: String): Unit = { 34 | test("testSameDataTypesComplexDiffValue") { 35 | val expectedDiffs = 1 36 | val diffResult = returnDiff(tableLeft, tableRight) 37 | helpers.reportDiffs(diffResult, expectedDiffs) 38 | } 39 | } 40 | 41 | def testMixedDataTypesSimpleJsonFormat(tableLeft: String, tableRight: String): Unit = { 42 | test("testMixedDataTypesSimple") { 43 | val expectedDiffs = 0 44 | val diffResult = returnDiff(tableLeft, tableRight) 45 | helpers.reportDiffs(diffResult, expectedDiffs) 46 | } 47 | } 48 | 49 | def testMixedDataTypesSimpleDiffJsonFormat(tableLeft: String, tableRight: String): Unit = { 50 | test("testMixedDataTypesSimpleDiff") { 51 | val expectedDiffs = 2 52 | val diffResult = returnDiff(tableLeft, tableRight) 53 | helpers.reportDiffs(diffResult, expectedDiffs) 54 | } 55 | } 56 | 57 | def testSameDataTypesSimpleDiffMissingElementJsonFormat(tableLeft: String, tableRight: String): Unit = { 58 | test("testSameDataTypesSimpleDiffMissingElement") { 59 | val expectedDiffs = 1 60 | val diffResult = returnDiff(tableLeft, tableRight) 61 | helpers.reportDiffs(diffResult, expectedDiffs) 62 | } 63 | } 64 | 65 | def testSameDataTypesSimpleDiffExtraNullElementJsonFormat(tableLeft: String, tableRight: String): Unit = { 66 | test("testSameDataTypesSimpleDiffExtraNullElement") { 67 | val expectedDiffs = 0 68 | val diffResult = returnDiff(tableLeft, tableRight, sameSchema = true) 69 | helpers.reportDiffs(diffResult, expectedDiffs) 70 | 71 | val reason = "Expected \"Column Names Did Not Match\" exception." 
72 | try { 73 | returnDiff(tableLeft, tableRight) 74 | fail(reason) 75 | } 76 | catch { 77 | case e: Exception => 78 | if (!e.getMessage.equals("Column Names Did Not Match")) fail(reason) 79 | case _: Throwable => 80 | fail(reason) 81 | } 82 | } 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /mega-spark-diff/src/test/scala/org/finra/msd/basetestclasses/JsonFormatToFileTests.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.finra.msd.basetestclasses 18 | 19 | trait JsonFormatToFileTests extends JsonFormatTests { 20 | this: SparkFunSuite => 21 | 22 | override def testSameDataTypesSimpleDiffExtraNullElementJsonFormat(tableLeft: String, tableRight: String): Unit = { 23 | test("testSameDataTypesSimpleExtraNullElement") { 24 | val expectedDiffs = 3 25 | val diffResult = returnDiff(tableLeft, tableRight) 26 | helpers.reportDiffs(diffResult, expectedDiffs) 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /mega-spark-diff/src/test/scala/org/finra/msd/basetestclasses/JsonFormatToJdbcTests.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.finra.msd.basetestclasses 18 | 19 | trait JsonFormatToJdbcTests extends JsonFormatTests { 20 | this: SparkFunSuite => 21 | 22 | override def testMixedDataTypesSimpleJsonFormat(tableLeft: String, tableRight: String): Unit = { 23 | test("testMixedDataTypesSimple") { 24 | val expectedDiffs = 2 25 | val diffResult = returnDiff(tableLeft, tableRight) 26 | helpers.reportDiffs(diffResult, expectedDiffs) 27 | } 28 | } 29 | 30 | override def testSameDataTypesSimpleDiffExtraNullElementJsonFormat(tableLeft: String, tableRight: String): Unit = { 31 | test("testSameDataTypesSimpleExtraNullElement") { 32 | val reason = "Expected \"Column Names Did Not Match\" exception." 
33 | try { 34 | returnDiff(tableLeft, tableRight) 35 | fail(reason) 36 | } 37 | catch { 38 | case e: Exception => 39 | if (!e.getMessage.equals("Column Names Did Not Match")) fail(reason) 40 | case _: Throwable => 41 | fail(reason) 42 | } 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /mega-spark-diff/src/test/scala/org/finra/msd/basetestclasses/SparkFunSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.finra.msd.basetestclasses 19 | 20 | // scalastyle:off 21 | import org.apache.spark.internal.Logging 22 | import org.apache.spark.sql.types.StructType 23 | import org.apache.spark.sql.{DataFrame, Row, SQLContext, SQLImplicits} 24 | import org.finra.msd.memorydb.MemoryDbHsql 25 | import org.finra.msd.sparkfactory.SparkFactory 26 | import org.scalatest._ 27 | import org.scalatest.funsuite.AnyFunSuite 28 | import org.scalatest.matchers.should.Matchers 29 | 30 | import scala.reflect.io.Path 31 | import scala.util.Try 32 | 33 | class SparkFunSuite 34 | extends AnyFunSuite 35 | with BeforeAndAfterAll 36 | with Logging 37 | with Matchers { 38 | 39 | 40 | protected val outputDirectory: String = System.getProperty("user.dir") + "/sparkOutputDirectory" 41 | 42 | private lazy val sparkSession = SparkFactory.sparkSession 43 | 44 | protected object testImplicits extends SQLImplicits { 45 | protected override def _sqlContext: SQLContext = SparkFactory.sparkSession.sqlContext 46 | } 47 | 48 | val helpers = new TestHelpers() 49 | 50 | implicit class SequenceImprovements(seq: Seq[Row]) { 51 | def toDf(schema: StructType): DataFrame = { 52 | val rowRdd = sparkSession.sparkContext.parallelize(seq) 53 | sparkSession.createDataFrame(rowRdd, schema) 54 | } 55 | } 56 | 57 | override def beforeAll(): Unit = synchronized { 58 | 59 | SparkFactory.initializeSparkLocalMode("local[*]", "WARN", "1") 60 | 61 | if (MemoryDbHsql.getInstance.getState != 1) { 62 | MemoryDbHsql.getInstance.initializeMemoryDB() 63 | } 64 | super.beforeAll() 65 | } 66 | 67 | 68 | override def afterAll(): Unit = { 69 | MemoryDbHsql.getInstance().shutdownMemoryDb() 70 | 71 | super.afterAll() 72 | } 73 | 74 | // helper function 75 | protected final def getTestResourceFile(file: String): java.io.File = { 76 | new java.io.File(getClass.getClassLoader.getResource(file).getFile) 77 | } 78 | 79 | protected final def getTestResourcePath(file: String): String = { 80 | getTestResourceFile(file).getCanonicalPath 81 | } 82 | 83 | /** 84 | * Log the suite name and the test name before and after each test. 85 | * 86 | * Subclasses should never override this method. 
If they wish to run 87 | * custom code before and after each test, they should mix in the 88 | * {{org.scalatest.BeforeAndAfter}} trait instead. 89 | */ 90 | final protected override def withFixture(test: NoArgTest): Outcome = { 91 | val testName = test.text 92 | val suiteName = this.getClass.getName 93 | val shortSuiteName = suiteName.replaceAll("org.apache.spark", "o.a.s") 94 | try { 95 | logInfo(s"\n\n===== TEST OUTPUT FOR $shortSuiteName: '$testName' =====\n") 96 | test() 97 | } finally { 98 | logInfo(s"\n\n===== FINISHED $shortSuiteName: '$testName' =====\n") 99 | } 100 | } 101 | 102 | def deleteSavedFiles(folder : String): Unit = { 103 | val path: Path = Path(outputDirectory + "/" + folder + "/") 104 | Try(path.deleteRecursively()) 105 | } 106 | 107 | def readSavedFile(folder : String): String = { 108 | val file = new java.io.File(outputDirectory + "/" + folder + "/") 109 | .listFiles 110 | .filter(_.isFile) 111 | .filter(_.getName.endsWith(".csv"))(0) 112 | 113 | scala.reflect.io.File(file).slurp() 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /mega-spark-diff/src/test/scala/org/finra/msd/basetestclasses/SparkFunSuiteDynamoDb.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.finra.msd.basetestclasses 19 | 20 | // scalastyle:off 21 | 22 | import org.finra.msd.memorydb.MemoryDbDynamo 23 | 24 | class SparkFunSuiteDynamoDb 25 | extends SparkFunSuite { 26 | 27 | 28 | protected final val dynamoDbEndpoint = "http://localhost:8000" 29 | protected final val dynamoDbCustomAWSCredentialsProvider = "com.amazonaws.auth.SystemPropertiesCredentialsProvider" 30 | 31 | override def beforeAll(): Unit = synchronized { 32 | super.beforeAll() 33 | 34 | MemoryDbDynamo.getInstance().initializeMemoryDb() 35 | 36 | System.setProperty("aws.dynamodb.endpoint", dynamoDbEndpoint) 37 | 38 | System.setProperty("aws.accessKeyId", "test") 39 | System.setProperty("aws.secretKey", "test") 40 | } 41 | 42 | 43 | override def afterAll(): Unit = { 44 | super.afterAll() 45 | 46 | MemoryDbDynamo.getInstance().shutdownMemoryDb() 47 | } 48 | } -------------------------------------------------------------------------------- /mega-spark-diff/src/test/scala/org/finra/msd/basetestclasses/TestHelpers.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.finra.msd.basetestclasses 18 | 19 | import org.apache.commons.io.FileUtils 20 | import org.apache.spark.sql.{Column, DataFrame} 21 | import org.apache.spark.sql.types.StringType 22 | import org.finra.msd.containers.{AppleTable, DiffResult} 23 | import org.finra.msd.helpers.JsonHelper 24 | import org.scalatest.Assertions.fail 25 | 26 | import java.io.File 27 | import java.nio.charset.StandardCharsets 28 | 29 | class TestHelpers { 30 | 31 | def reportDiffs(diffResult: DiffResult, 32 | expectedDiffs: Int): Unit = { 33 | reportDiffs(diffResult, expectedDiffs, expectedDiffs) 34 | } 35 | 36 | def reportDiffs(diffResult: DiffResult, 37 | expectedDiffsLeftNotInRight: Int, expectedDiffsRightNotInLeft: Int): Unit = { 38 | if (diffResult.inLeftNotInRight.count != expectedDiffsLeftNotInRight) 39 | fail("Expected " + expectedDiffsLeftNotInRight + 40 | " differences coming from left dataframe." + 41 | " Instead, found " + diffResult.inLeftNotInRight.count + ".") 42 | if (diffResult.inRightNotInLeft.count != expectedDiffsRightNotInLeft) 43 | fail("Expected " + expectedDiffsRightNotInLeft + 44 | " differences coming from right dataframe." + 45 | " Instead, found " + diffResult.inRightNotInLeft.count + ".") 46 | } 47 | 48 | private def seqStringsToColumns(seq: Seq[String]): Seq[Column] = { 49 | seq.map(x => new Column(x)) 50 | } 51 | 52 | def compareJdbcDataFrameSimpleToJsonFormat(appleTable: AppleTable, jsonFormatDf: DataFrame, keyColumns: Seq[String]): Unit = { 53 | val df = appleTable.getDataFrame 54 | 55 | val orderedColumns = df.schema.fieldNames.sorted 56 | 57 | val actualCount = jsonFormatDf.count() 58 | val expectedCount = df.count() 59 | 60 | assert(df.schema.fieldNames.length == jsonFormatDf.schema.fieldNames.length) 61 | assert(actualCount == expectedCount) 62 | val keyCols = seqStringsToColumns(keyColumns) 63 | val actualRows = jsonFormatDf.orderBy(keyCols:_*).take(actualCount.toInt) 64 | val expectedRows = df.orderBy(keyCols:_*).take(expectedCount.toInt) 65 | 66 | for (i <- 0 until expectedCount.toInt) { 67 | val actualRow = actualRows(i) 68 | val expectedRow = expectedRows(i) 69 | 70 | for (j <- 0 until expectedRow.length) { 71 | val columnName = orderedColumns(j) 72 | val actualFieldIndex = jsonFormatDf.schema.fieldIndex(columnName) 73 | val actual = actualRow.get(actualFieldIndex) 74 | val expectedFieldIndex = df.schema.fieldIndex(columnName) 75 | val expectedOriginal = expectedRow.get(expectedFieldIndex) 76 | val expected = 77 | if (jsonFormatDf.schema.fields(expectedFieldIndex).dataType == StringType && expectedOriginal != null) { 78 | if ((expectedOriginal.toString.startsWith("[") && expectedOriginal.toString.endsWith("]")) 79 | || (expectedOriginal.toString.startsWith("{") && expectedOriginal.toString.endsWith("}"))) expectedOriginal 80 | else "\"" + expectedOriginal + "\"" 81 | } else expectedRow.get(expectedFieldIndex) 82 | 83 | if (actual == null) { 84 | assert(expected == null) 85 | } else if (actual.isInstanceOf[Array[Byte]] || expected.isInstanceOf[Array[Byte]]) { 86 | 
assert(actual.asInstanceOf[Array[Byte]].deep == expected.asInstanceOf[Array[Byte]].deep) 87 | } else { 88 | assert(actual.equals(expected)) 89 | } 90 | } 91 | } 92 | } 93 | 94 | def compareJsonDataFrameActualToExpected(appleTable: AppleTable, jsonPath: String, txtPath: String, keyColumns: Seq[String], delimiter: String): Unit = { 95 | val df = appleTable.getDataFrame 96 | 97 | val jsonString = FileUtils.readFileToString(new File(jsonPath), StandardCharsets.UTF_8) 98 | val txtString = FileUtils.readFileToString(new File(txtPath), StandardCharsets.UTF_8) 99 | 100 | val expectedRowsJson = JsonHelper.jsonToMapList(jsonString) 101 | val expectedRowsTxt = txtString.split("\n") 102 | 103 | val orderedColumns = df.schema.fieldNames.sorted 104 | val expectedCount = expectedRowsJson.size() 105 | 106 | assert(expectedCount == expectedRowsTxt.length) 107 | assert(df.count() == expectedCount) 108 | val keyCols = seqStringsToColumns(keyColumns) 109 | val actualRows = df.orderBy(keyCols:_*).take(expectedCount) 110 | 111 | for (i <- 0 until expectedCount) { 112 | val actualRow = actualRows(i) 113 | val expectedRowJson = expectedRowsJson.get(i) 114 | val expectedRowTxt = expectedRowsTxt(i).split(delimiter) 115 | 116 | for (j <- 0 until expectedRowJson.size()) { 117 | val columnName = orderedColumns(j) 118 | val fieldIndex = df.schema.fieldIndex(columnName) 119 | val actual = actualRow.get(fieldIndex) 120 | val expectValJson = expectedRowJson.get(columnName) 121 | val expectValTxt = expectedRowTxt(j) 122 | 123 | if (actual == null) { 124 | assert(expectValJson == null) 125 | assert(expectValTxt == "") 126 | } else { 127 | assert(actual.equals(expectValJson)) 128 | assert(actual.equals(expectValTxt)) 129 | } 130 | } 131 | } 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /mega-spark-diff/src/test/scala/org/finra/msd/controllers/TemplateControllerSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.finra.msd.controllers 18 | 19 | import org.finra.msd.basetestclasses.SparkFunSuite 20 | 21 | class TemplateControllerSuite extends SparkFunSuite { 22 | 23 | test("html template loading") { 24 | val html = TemplateController.horizontalTableTemplate 25 | assert(html.contains("")) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /mega-spark-diff/src/test/scala/org/finra/msd/sparkcompare/CsvToCsvSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.finra.msd.sparkcompare 18 | 19 | import org.apache.spark.sql.types.{StringType, StructType} 20 | import org.finra.msd.basetestclasses.SparkFunSuite 21 | import org.finra.msd.containers.DiffResult 22 | import org.finra.msd.sparkfactory.SparkFactory 23 | 24 | class CsvToCsvSuite extends SparkFunSuite { 25 | 26 | private def returnDiff(file1: String, file2: String, delimiter1: String, delimiter2: String): DiffResult = { 27 | val expectedSchema = new StructType() 28 | .add("fruit", StringType, nullable = true) 29 | .add("price", StringType, nullable = true) 30 | .add("ripeness", StringType, nullable = true) 31 | .add("color", StringType, nullable = true) 32 | 33 | val file1Path = this.getClass.getClassLoader.getResource(file1).getPath 34 | val leftAppleTable = SparkFactory.parallelizeCSVSource(file1Path, "left_table", Option(expectedSchema), Option(delimiter1)) 35 | val file2Path = this.getClass.getClassLoader.getResource(file2).getPath 36 | val rightAppleTable = SparkFactory.parallelizeCSVSource(file2Path, "right_table", Option(expectedSchema), Option(delimiter2)) 37 | SparkCompare.compareAppleTables(leftAppleTable, rightAppleTable) 38 | } 39 | 40 | test("test CSV partial quote to full quote") { 41 | val expectedDiffs = 0 42 | val diffResult = returnDiff("csv/TestCSV.txt", "csv/TestCSV_2.txt", "|", ",") 43 | //comparison complains about column name diffs between the tables (from the schemaStruct/inferSchema) 44 | helpers.reportDiffs(diffResult, expectedDiffs) 45 | } 46 | 47 | test("test CSV escaped quotes") { 48 | val expectedDiffs = 2 49 | val diffResult = returnDiff("csv/TestCSV_commas.txt", "csv/TestCSV_pipes.txt", ",", "|") 50 | //comparison complains about column name diffs between the tables (from the schemaStruct/inferSchema) 51 | helpers.reportDiffs(diffResult, expectedDiffs) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /mega-spark-diff/src/test/scala/org/finra/msd/sparkcompare/DynamoDbToDynamoDbSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.finra.msd.sparkcompare 18 | 19 | import org.finra.msd.basetestclasses.{JsonFormatTests, SparkFunSuiteDynamoDb} 20 | import org.finra.msd.containers.DiffResult 21 | import org.finra.msd.sparkfactory.SparkFactory 22 | import org.finra.msd.sparkfactory.SparkFactory.sparkSession 23 | 24 | class DynamoDbToDynamoDbSuite extends SparkFunSuiteDynamoDb 25 | with JsonFormatTests { 26 | 27 | override def returnDiff(table1: String, table2: String, sameSchema: Boolean): DiffResult = { 28 | val schema1 = sparkSession.sqlContext.read 29 | .option("multiLine", "true") 30 | .option("primitivesAsString", "true") 31 | .json(this.getClass.getClassLoader.getResource(table1 + ".json").getPath).schema 32 | 33 | val schema2 = sparkSession.sqlContext.read 34 | .option("multiLine", "true") 35 | .option("primitivesAsString", "true") 36 | .json(this.getClass.getClassLoader.getResource(table2 + ".json").getPath).schema 37 | 38 | parallelizeTablesAndCompare(table1.replace("/", "_"), 39 | table2.replace("/", "_"), 40 | schema1.fieldNames, 41 | if (sameSchema) schema1.fieldNames else schema2.fieldNames) 42 | } 43 | 44 | def parallelizeTablesAndCompare(table1: String, table2: String, 45 | fieldNames1: Array[String], fieldNames2: Array[String]): DiffResult = { 46 | val leftAppleTable = SparkFactory.parallelizeDynamoDBSource(table1, table1 + "_left", fieldNames1) 47 | val rightAppleTable = SparkFactory.parallelizeDynamoDBSource(table2, table2 + "_right", fieldNames2) 48 | 49 | SparkCompare.compareAppleTables(leftAppleTable, rightAppleTable) 50 | } 51 | 52 | testSameDataTypesComplexJsonFormat("compare/JsonTestMapList", "compare/JsonTestMapList") 53 | 54 | testSameDataTypesComplexDiffValueJsonFormat("compare/JsonTestMapListDiffValue", "compare/JsonTestMapList") 55 | 56 | testMixedDataTypesSimpleJsonFormat("json/JsonTestSimpleMixedType", "json/JsonTestSimpleMixedType") 57 | 58 | testMixedDataTypesSimpleDiffJsonFormat("json/JsonTestSimpleMixedType", "json/JsonTestSimple") 59 | 60 | testSameDataTypesSimpleDiffMissingElementJsonFormat("json/JsonTestSimpleMissingElement", "json/JsonTestSimple") 61 | 62 | testSameDataTypesSimpleDiffExtraNullElementJsonFormat("json/JsonTestSimpleExtraNull", "json/JsonTestSimple") 63 | } 64 | -------------------------------------------------------------------------------- /mega-spark-diff/src/test/scala/org/finra/msd/sparkcompare/DynamoDbToFileSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.finra.msd.sparkcompare 18 | 19 | import org.finra.msd.basetestclasses.{JsonFormatToFileTests, SparkFunSuiteDynamoDb} 20 | import org.finra.msd.containers.DiffResult 21 | import org.finra.msd.sparkfactory.SparkFactory 22 | import org.finra.msd.sparkfactory.SparkFactory.sparkSession 23 | 24 | class DynamoDbToFileSuite extends SparkFunSuiteDynamoDb 25 | with JsonFormatToFileTests { 26 | override def returnDiff(table: String, textFile: String, sameSchema: Boolean): DiffResult = { 27 | val schema = sparkSession.sqlContext.read 28 | .option("multiLine", "true") 29 | .option("primitivesAsString", "true") 30 | .json(this.getClass.getClassLoader.getResource(table + ".json").getPath).schema 31 | 32 | parallelizeTablesAndCompare(table.replace("/", "_"), textFile, schema.fieldNames) 33 | } 34 | 35 | def parallelizeTablesAndCompare(table: String, textFile: String, 36 | fieldNames: Array[String]): DiffResult = { 37 | val leftAppleTable = SparkFactory.parallelizeDynamoDBSource(table, table + "_left", fieldNames, Option.apply(";")) 38 | val filePath = this.getClass.getClassLoader.getResource(textFile).getPath 39 | val rightAppleTable = SparkFactory.parallelizeTextSource(filePath, "text_right") 40 | 41 | SparkCompare.compareAppleTables(leftAppleTable, rightAppleTable) 42 | } 43 | 44 | testSameDataTypesComplexJsonFormat("compare/JsonTestMapList", "compare/JsonTestMapList.txt") 45 | 46 | testSameDataTypesComplexDiffValueJsonFormat("compare/JsonTestMapListDiffValue", "compare/JsonTestMapList.txt") 47 | 48 | testMixedDataTypesSimpleJsonFormat("json/JsonTestSimpleMixedType", "json/JsonTestSimpleMixedType.txt") 49 | 50 | testMixedDataTypesSimpleDiffJsonFormat("json/JsonTestSimpleMixedType", "json/JsonTestSimple.txt") 51 | 52 | testSameDataTypesSimpleDiffMissingElementJsonFormat("json/JsonTestSimpleMissingElement", "json/JsonTestSimple.txt") 53 | 54 | testSameDataTypesSimpleDiffExtraNullElementJsonFormat("json/JsonTestSimpleExtraNull", "json/JsonTestSimple.txt") 55 | } 56 | -------------------------------------------------------------------------------- /mega-spark-diff/src/test/scala/org/finra/msd/sparkcompare/DynamoDbToJdbcSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.finra.msd.sparkcompare 18 | 19 | import org.finra.msd.basetestclasses.{JsonFormatToJdbcTests, SparkFunSuiteDynamoDb} 20 | import org.finra.msd.containers.{AppleTable, DiffResult} 21 | import org.finra.msd.memorydb.MemoryDbHsql 22 | import org.finra.msd.sparkfactory.SparkFactory 23 | import org.finra.msd.sparkfactory.SparkFactory.sparkSession 24 | 25 | class DynamoDbToJdbcSuite extends SparkFunSuiteDynamoDb 26 | with JsonFormatToJdbcTests { 27 | override def returnDiff(table1: String, table2: String, sameSchema: Boolean): DiffResult = { 28 | val schema = sparkSession.sqlContext.read 29 | .option("multiLine", "true") 30 | .option("primitivesAsString", "true") 31 | .json(this.getClass.getClassLoader.getResource(table1 + ".json").getPath).schema 32 | 33 | parallelizeTablesAndCompare(table1.replace("/", "_"), table2.replace("/", "_"), schema.fieldNames) 34 | } 35 | 36 | def parallelizeTablesAndCompare(table1: String, table2: String, 37 | fieldNames: Array[String]): DiffResult = { 38 | val leftAppleTable = SparkFactory.parallelizeDynamoDBSource(table1, table1 + "_left", fieldNames) 39 | val rightAppleTable = SparkFactory.parallelizeJDBCSource(MemoryDbHsql.hsqlDriverName, MemoryDbHsql.hsqlUrl, "SA", "", 40 | "(select * from " + table2 + ")", table2 + "_right") 41 | 42 | var rightDf = rightAppleTable.dataFrame 43 | 44 | for (colName <- rightDf.columns) { 45 | rightDf = rightDf.withColumnRenamed(colName, colName.toLowerCase) 46 | } 47 | 48 | rightDf = rightDf.selectExpr(leftAppleTable.dataFrame.columns 49 | .filter(x => rightDf.columns.contains(x)): _*) 50 | 51 | val rightAppleTableLowerCase = AppleTable(rightAppleTable.sourceType, rightDf, rightAppleTable.delimiter, rightAppleTable.tempViewName) 52 | 53 | SparkCompare.compareAppleTables(leftAppleTable, rightAppleTableLowerCase) 54 | } 55 | 56 | testSameDataTypesComplexJsonFormat("compare/JsonTestMapList", "compare/JsonTestMapList") 57 | 58 | testSameDataTypesComplexDiffValueJsonFormat("compare/JsonTestMapListDiffValue", "compare/JsonTestMapList") 59 | 60 | testMixedDataTypesSimpleJsonFormat("json/JsonTestSimpleMixedType", "json/JsonTestSimpleMixedType") 61 | 62 | testMixedDataTypesSimpleDiffJsonFormat("json/JsonTestSimpleMixedType", "json/JsonTestSimple") 63 | 64 | testSameDataTypesSimpleDiffMissingElementJsonFormat("json/JsonTestSimpleMissingElement", "json/JsonTestSimple") 65 | 66 | testSameDataTypesSimpleDiffExtraNullElementJsonFormat("json/JsonTestSimpleExtraNull", "json/JsonTestSimple") 67 | } 68 | -------------------------------------------------------------------------------- /mega-spark-diff/src/test/scala/org/finra/msd/sparkcompare/DynamoDbToJsonSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.finra.msd.sparkcompare 18 | 19 | import org.finra.msd.basetestclasses.{JsonFormatTests, SparkFunSuiteDynamoDb} 20 | import org.finra.msd.containers.DiffResult 21 | import org.finra.msd.sparkfactory.SparkFactory 22 | import org.finra.msd.sparkfactory.SparkFactory.sparkSession 23 | 24 | class DynamoDbToJsonSuite extends SparkFunSuiteDynamoDb 25 | with JsonFormatTests { 26 | 27 | override def returnDiff(table: String, jsonFile: String, sameSchema: Boolean): DiffResult = { 28 | val schema1 = sparkSession.sqlContext.read 29 | .option("multiLine", "true") 30 | .option("primitivesAsString", "true") 31 | .json(this.getClass.getClassLoader.getResource(table + ".json").getPath).schema 32 | 33 | val schema2 = sparkSession.sqlContext.read 34 | .option("multiLine", "true") 35 | .option("primitivesAsString", "true") 36 | .json(this.getClass.getClassLoader.getResource(jsonFile).getPath).schema 37 | 38 | parallelizeTablesAndCompare(table.replace("/", "_"), jsonFile, 39 | schema1.fieldNames, 40 | if (sameSchema) schema1.fieldNames else schema2.fieldNames) 41 | } 42 | 43 | def parallelizeTablesAndCompare(table: String, jsonFile: String, 44 | fieldNames1: Array[String], fieldNames2: Array[String]): DiffResult = { 45 | val leftAppleTable = SparkFactory.parallelizeDynamoDBSource(table, table + "_left", fieldNames1) 46 | val filePath = this.getClass.getClassLoader.getResource(jsonFile).getPath 47 | val rightAppleTable = SparkFactory.parallelizeJSONSource(filePath, "json_right", fieldNames2) 48 | 49 | SparkCompare.compareAppleTables(leftAppleTable, rightAppleTable) 50 | } 51 | 52 | testSameDataTypesComplexJsonFormat("compare/JsonTestMapList", "compare/JsonTestMapList.json") 53 | 54 | testSameDataTypesComplexDiffValueJsonFormat("compare/JsonTestMapListDiffValue", "compare/JsonTestMapList.json") 55 | 56 | testMixedDataTypesSimpleJsonFormat("json/JsonTestSimpleMixedType", "json/JsonTestSimpleMixedType.json") 57 | 58 | testMixedDataTypesSimpleDiffJsonFormat("json/JsonTestSimpleMixedType", "json/JsonTestSimple.json") 59 | 60 | testSameDataTypesSimpleDiffMissingElementJsonFormat("json/JsonTestSimpleMissingElement", "json/JsonTestSimple.json") 61 | 62 | testSameDataTypesSimpleDiffExtraNullElementJsonFormat("json/JsonTestSimpleExtraNull", "json/JsonTestSimple.json") 63 | } 64 | -------------------------------------------------------------------------------- /mega-spark-diff/src/test/scala/org/finra/msd/sparkcompare/FileToFileSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.finra.msd.sparkcompare 18 | 19 | import org.finra.msd.basetestclasses.SparkFunSuite 20 | import org.finra.msd.containers.DiffResult 21 | import org.finra.msd.sparkfactory.SparkFactory 22 | 23 | class FileToFileSuite() extends SparkFunSuite { 24 | private def returnDiff(fileName1: String, fileName2: String): DiffResult = { 25 | val file1Path = this.getClass.getClassLoader.getResource(fileName1 + ".txt").getPath 26 | val leftAppleTable = SparkFactory.parallelizeTextSource(file1Path, "table1") 27 | val file2Path = this.getClass.getClassLoader.getResource(fileName2 + ".txt").getPath 28 | val rightAppleTable = SparkFactory.parallelizeTextSource(file2Path, "table2") 29 | SparkCompare.compareAppleTables(leftAppleTable, rightAppleTable) 30 | } 31 | 32 | test("testCompareEqualFiles") { 33 | val expectedDiffs = 0 34 | val diffResult = returnDiff("txt/Fruit1", "txt/Fruit2") 35 | //the expectation is that both tables are equal 36 | helpers.reportDiffs(diffResult, expectedDiffs) 37 | } 38 | 39 | test("testCompareCompletelyDifferentFiles") { 40 | val expectedDiffsLeftNotInRight = 5 41 | val expectedDiffsRightNotInLeft = 4 42 | val diffResult = returnDiff("txt/Fruit4", "txt/Fruit5") 43 | //the expectation is that both tables are completely different 44 | helpers.reportDiffs(diffResult, expectedDiffsLeftNotInRight, expectedDiffsRightNotInLeft) 45 | } 46 | 47 | test("testCompareAFewDifferences") { 48 | val expectedDiffs = 2 49 | val diffResult = returnDiff("txt/Fruit1", "txt/Fruit3") 50 | //the expectation is that there are only a few differences 51 | helpers.reportDiffs(diffResult, expectedDiffs) 52 | } 53 | 54 | test("testCompareTable1IsSubset") { 55 | val expectedDiffsLeftNotInRight = 0 56 | val expectedDiffsRightNotInLeft = 4 57 | val diffResult = returnDiff("txt/Fruit4", "txt/Fruit1") 58 | //the expectation is that table1 is a complete subset of table2 59 | helpers.reportDiffs(diffResult, expectedDiffsLeftNotInRight, expectedDiffsRightNotInLeft) 60 | } 61 | 62 | test("testCompareTable2IsSubset") { 63 | val expectedDiffsLeftNotInRight = 5 64 | val expectedDiffsRightNotInLeft = 0 65 | val diffResult = returnDiff("txt/Fruit1", "txt/Fruit5") 66 | //the expectation is that table2 is a complete subset of table1 67 | helpers.reportDiffs(diffResult, expectedDiffsLeftNotInRight, expectedDiffsRightNotInLeft) 68 | } 69 | } -------------------------------------------------------------------------------- /mega-spark-diff/src/test/scala/org/finra/msd/sparkcompare/JdbcToFileSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.finra.msd.sparkcompare 18 | 19 | import org.finra.msd.basetestclasses.SparkFunSuite 20 | import org.finra.msd.containers.DiffResult 21 | import org.finra.msd.memorydb.MemoryDbHsql 22 | import org.finra.msd.sparkfactory.SparkFactory 23 | 24 | class JdbcToFileSuite() extends SparkFunSuite { 25 | 26 | private def returnDiff(table1: String, table2: String): DiffResult = { 27 | val leftAppleTable = SparkFactory.parallelizeJDBCSource(MemoryDbHsql.hsqlDriverName, MemoryDbHsql.hsqlUrl, "SA", "", 28 | "(select * from " + table1 + ")", "table1") 29 | val file2Path = this.getClass.getClassLoader.getResource(table2 + ".txt").getPath 30 | val rightAppleTable = SparkFactory.parallelizeTextSource(file2Path, "table2") 31 | SparkCompare.compareAppleTables(leftAppleTable, rightAppleTable) 32 | } 33 | 34 | test("testCompareEqualTables") { 35 | val expectedDiffs = 0 36 | val diffResult = returnDiff("Fruit1", "txt/Fruit1") 37 | //the expectation is that both tables are equal 38 | helpers.reportDiffs(diffResult, expectedDiffs) 39 | } 40 | 41 | test("testCompareJDBCTableToTextFile") { 42 | val expectedDiffsLeftNotInRight = 0 43 | val expectedDiffsRightNotInLeft = 1 44 | val diffResult = returnDiff("Fruit4", "txt/Fruit4") 45 | //the expectation is that the text file has one extra row that is not in the JDBC table 46 | helpers.reportDiffs(diffResult, expectedDiffsLeftNotInRight, expectedDiffsRightNotInLeft) 47 | } 48 | 49 | test("testCompareAFewDifferences") { 50 | val expectedDiffs = 2 51 | val diffResult = returnDiff("Fruit1", "txt/Fruit3") 52 | //the expectation is that there are only a few differences 53 | helpers.reportDiffs(diffResult, expectedDiffs) 54 | } 55 | 56 | test("testCompareTable1IsSubset") { 57 | val expectedDiffsLeftNotInRight = 0 58 | val expectedDiffsRightNotInLeft = 5 59 | val diffResult = returnDiff("Fruit4", "txt/Fruit1") 60 | //the expectation is that table1 is a complete subset of table2 61 | helpers.reportDiffs(diffResult, expectedDiffsLeftNotInRight, expectedDiffsRightNotInLeft) 62 | } 63 | 64 | test("testCompareTable2IsSubset") { 65 | val expectedDiffsLeftNotInRight = 5 66 | val expectedDiffsRightNotInLeft = 0 67 | val diffResult = returnDiff("Fruit1", "txt/Fruit5") 68 | //the expectation is that table2 is a complete subset of table1 69 | helpers.reportDiffs(diffResult, expectedDiffsLeftNotInRight, expectedDiffsRightNotInLeft) 70 | } 71 | } -------------------------------------------------------------------------------- /mega-spark-diff/src/test/scala/org/finra/msd/sparkcompare/JdbcToJdbcSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | package org.finra.msd.sparkcompare 18 | 19 | import org.finra.msd.basetestclasses.SparkFunSuite 20 | import org.finra.msd.containers.DiffResult 21 | import org.finra.msd.memorydb.MemoryDbHsql 22 | import org.finra.msd.sparkfactory.SparkFactory 23 | 24 | class JdbcToJdbcSuite() extends SparkFunSuite { 25 | private def returnDiff(table1: String, table2: String): DiffResult = { 26 | val leftAppleTable = SparkFactory.parallelizeJDBCSource(MemoryDbHsql.hsqlDriverName, MemoryDbHsql.hsqlUrl, "SA", "", 27 | "(select * from " + table1 + ")", "table1") 28 | val rightAppleTable = SparkFactory.parallelizeJDBCSource(MemoryDbHsql.hsqlDriverName, MemoryDbHsql.hsqlUrl, "SA", "", 29 | "(select * from " + table2 + ")", "table2") 30 | SparkCompare.compareAppleTables(leftAppleTable, rightAppleTable) 31 | } 32 | 33 | private def returnDiffWithSavingResult(table1: String, table2: String, testName: String): Boolean = { 34 | val leftAppleTable = SparkFactory.parallelizeJDBCSource(MemoryDbHsql.hsqlDriverName, MemoryDbHsql.hsqlUrl, "SA", "", 35 | "(select * from " + table1 + ")", "table1") 36 | val rightAppleTable = SparkFactory.parallelizeJDBCSource(MemoryDbHsql.hsqlDriverName, MemoryDbHsql.hsqlUrl, "SA", "", 37 | "(select * from " + table2 + ")", "table2") 38 | val outputPath = outputDirectory + "/" + testName + "/" 39 | 40 | // The saved results will exclude the "Ripeness" column and order rows by "PRICE" descending 41 | SparkCompare.compareAppleTablesSaveResultsWithManipulation( 42 | leftAppleTable, rightAppleTable, outputPath, singleFileOutput = true, ",", 43 | Option.apply(Array("Ripeness")), Option.apply(Array("PRICE")), ascOrder = false) 44 | } 45 | 46 | test("testCompareDifferentSchemas") { 47 | val reason = "Expected \"Column Names Did Not Match\" exception."
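// Persons1 and Fruit1 have different column names, so the comparison below should throw "Column Names Did Not Match"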
48 | try { 49 | returnDiff("Persons1", "Fruit1") 50 | fail(reason) 51 | } 52 | catch { 53 | case e: Exception => 54 | if (!e.getMessage.equals("Column Names Did Not Match")) fail(reason) 55 | case _: Throwable => 56 | fail(reason) 57 | } 58 | } 59 | 60 | test("testCompareEqualTables") { 61 | val expectedDiffs = 0 62 | val diffResult = returnDiff("Fruit1", "Fruit2") 63 | //the expectation is that both tables are equal 64 | helpers.reportDiffs(diffResult, expectedDiffs) 65 | } 66 | 67 | test("testCompareCompletelyDifferent") { 68 | val expectedDiffs = 5 69 | val diffResult = returnDiff("Fruit4", "Fruit5") 70 | //the expectation is that both tables are completely different 71 | helpers.reportDiffs(diffResult, expectedDiffs) 72 | } 73 | 74 | test("testCompareAFewDifferences") { 75 | val expectedDiffs = 2 76 | val diffResult = returnDiff("Fruit1", "Fruit3") 77 | //the expectation is that there are only a few differences 78 | helpers.reportDiffs(diffResult, expectedDiffs) 79 | } 80 | 81 | test("testCompareTable1IsSubset") { 82 | val expectedDiffsLeftNotInRight = 0 83 | val expectedDiffsRightNotInLeft = 5 84 | val diffResult = returnDiff("Fruit4", "Fruit1") 85 | //the expectation is that table1 is a complete subset of table2 86 | helpers.reportDiffs(diffResult, expectedDiffsLeftNotInRight, expectedDiffsRightNotInLeft) 87 | } 88 | 89 | test("testCompareTable2IsSubset") { 90 | val expectedDiffsLeftNotInRight = 5 91 | val expectedDiffsRightNotInLeft = 0 92 | val diffResult = returnDiff("Fruit1", "Fruit5") 93 | //the expectation is that table2 is a complete subset of table1 94 | helpers.reportDiffs(diffResult, expectedDiffsLeftNotInRight, expectedDiffsRightNotInLeft) 95 | } 96 | 97 | test("testCompareAndSaveFile") { 98 | deleteSavedFiles("testCompareAndSaveFile") 99 | val noDiff = returnDiffWithSavingResult("Fruit1", "Fruit3", "testCompareAndSaveFile") 100 | if (noDiff) 101 | fail("Expected differences. Instead found no difference!") 102 | 103 | val leftDiff = readSavedFile("testCompareAndSaveFile/inLeftNotInRight").split("\n") 104 | val rightDiff = readSavedFile("testCompareAndSaveFile/inRightNotInLeft").split("\n") 105 | if (leftDiff.length != 3) // Includes header line 106 | fail("Expected 3 rows (1 header and 2 data) coming from left table." + " Instead, found " + leftDiff.length + ".") 107 | if (rightDiff.length != 3) // Includes header line 108 | fail("Expected 3 rows (1 header and 2 data) coming from right table." + " Instead, found " + rightDiff.length + ".") 109 | 110 | val leftCols = leftDiff(0).split(",") 111 | val rightCols = rightDiff(0).split(",") 112 | if (leftCols.length != 4) 113 | fail("Expected 4 columns (3 data columns and 1 repeated row count) returned in left table differences." + " Instead, found " + leftCols.length + ".") 114 | if (rightCols.length != 4) 115 | fail("Expected 4 columns (3 data columns and 1 repeated row count) returned in right table differences." 
+ " Instead, found " + rightCols.length + ".") 116 | 117 | if (leftDiff(0).contains("RIPENESS")) 118 | fail("Expected ripeness not to be included in left table differences.") 119 | if (rightDiff(0).contains("Ripeness")) { 120 | fail("Expected ripeness not to be included in right table differences.") 121 | } 122 | if (leftDiff(1).split(",")(1).toInt < leftDiff(2).split(",")(1).toInt) 123 | fail("Expected results to be sorted descending by price in left table differences.") 124 | if (rightDiff(1).split(",")(1).toInt < rightDiff(2).split(",")(1).toInt) 125 | fail("Expected results to be sorted descending by price in right table differences.") 126 | } 127 | 128 | } 129 | -------------------------------------------------------------------------------- /mega-spark-diff/src/test/scala/org/finra/msd/sparkcompare/JsonToFileSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.finra.msd.sparkcompare 18 | 19 | import org.finra.msd.basetestclasses.{JsonFormatToFileTests, SparkFunSuiteDynamoDb} 20 | import org.finra.msd.containers.DiffResult 21 | import org.finra.msd.sparkfactory.SparkFactory 22 | import org.finra.msd.sparkfactory.SparkFactory.sparkSession 23 | 24 | class JsonToFileSuite extends SparkFunSuiteDynamoDb 25 | with JsonFormatToFileTests { 26 | override def returnDiff(jsonFile: String, textFile: String, sameSchema: Boolean): DiffResult = { 27 | val schema = sparkSession.sqlContext.read 28 | .option("multiLine", "true") 29 | .option("primitivesAsString", "true") 30 | .json(this.getClass.getClassLoader.getResource(jsonFile).getPath).schema 31 | 32 | parallelizeTablesAndCompare(jsonFile, textFile, schema.fieldNames) 33 | } 34 | 35 | def parallelizeTablesAndCompare(jsonFile: String, textFile: String, 36 | fieldNames: Array[String]): DiffResult = { 37 | val filePathJson = this.getClass.getClassLoader.getResource(jsonFile).getPath 38 | val leftAppleTable = SparkFactory.parallelizeJSONSource(filePathJson, "json_left", fieldNames, Option.apply(";")) 39 | val filePathText = this.getClass.getClassLoader.getResource(textFile).getPath 40 | val rightAppleTable = SparkFactory.parallelizeTextSource(filePathText, "text_right") 41 | 42 | SparkCompare.compareAppleTables(leftAppleTable, rightAppleTable) 43 | } 44 | 45 | testSameDataTypesComplexJsonFormat("compare/JsonTestMapList.json", "compare/JsonTestMapList.txt") 46 | 47 | testSameDataTypesComplexDiffValueJsonFormat("compare/JsonTestMapListDiffValue.json", "compare/JsonTestMapList.txt") 48 | 49 | testMixedDataTypesSimpleJsonFormat("json/JsonTestSimpleMixedType.json", "json/JsonTestSimpleMixedType.txt") 50 | 51 | testMixedDataTypesSimpleDiffJsonFormat("json/JsonTestSimpleMixedType.json", "json/JsonTestSimple.txt") 52 | 53 | testSameDataTypesSimpleDiffMissingElementJsonFormat("json/JsonTestSimpleMissingElement.json", "json/JsonTestSimple.txt") 54 | 55 | 
testSameDataTypesSimpleDiffExtraNullElementJsonFormat("json/JsonTestSimpleExtraNull.json", "json/JsonTestSimple.txt") 56 | } 57 | -------------------------------------------------------------------------------- /mega-spark-diff/src/test/scala/org/finra/msd/sparkcompare/JsonToJdbcSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.finra.msd.sparkcompare 18 | 19 | import org.finra.msd.basetestclasses.{JsonFormatToJdbcTests, SparkFunSuiteDynamoDb} 20 | import org.finra.msd.containers.{AppleTable, DiffResult} 21 | import org.finra.msd.memorydb.MemoryDbHsql 22 | import org.finra.msd.sparkfactory.SparkFactory 23 | import org.finra.msd.sparkfactory.SparkFactory.sparkSession 24 | 25 | class JsonToJdbcSuite extends SparkFunSuiteDynamoDb 26 | with JsonFormatToJdbcTests { 27 | override def returnDiff(jsonFile: String, table: String, sameSchema: Boolean): DiffResult = { 28 | val schema = sparkSession.sqlContext.read 29 | .option("multiLine", "true") 30 | .option("primitivesAsString", "true") 31 | .json(this.getClass.getClassLoader.getResource(jsonFile).getPath).schema 32 | 33 | parallelizeTablesAndCompare(jsonFile, table.replace("/", "_"), schema.fieldNames) 34 | } 35 | 36 | def parallelizeTablesAndCompare(jsonFile: String, table: String, 37 | fieldNames: Array[String]): DiffResult = { 38 | val filePath = this.getClass.getClassLoader.getResource(jsonFile).getPath 39 | val leftAppleTable = SparkFactory.parallelizeJSONSource(filePath, "json_left", fieldNames) 40 | val rightAppleTable = SparkFactory.parallelizeJDBCSource(MemoryDbHsql.hsqlDriverName, MemoryDbHsql.hsqlUrl, "SA", "", 41 | "(select * from " + table + ")", table + "_right") 42 | 43 | var rightDf = rightAppleTable.dataFrame 44 | 45 | for (colName <- rightDf.columns) { 46 | rightDf = rightDf.withColumnRenamed(colName, colName.toLowerCase) 47 | } 48 | 49 | rightDf = rightDf.selectExpr(leftAppleTable.dataFrame.columns 50 | .filter(x => rightDf.columns.contains(x)): _*) 51 | 52 | val rightAppleTableLowerCase = AppleTable(rightAppleTable.sourceType, rightDf, rightAppleTable.delimiter, rightAppleTable.tempViewName) 53 | 54 | SparkCompare.compareAppleTables(leftAppleTable, rightAppleTableLowerCase) 55 | } 56 | 57 | testSameDataTypesComplexJsonFormat("compare/JsonTestMapList.json", "compare/JsonTestMapList") 58 | 59 | testSameDataTypesComplexDiffValueJsonFormat("compare/JsonTestMapListDiffValue.json", "compare/JsonTestMapList") 60 | 61 | testMixedDataTypesSimpleJsonFormat("json/JsonTestSimpleMixedType.json", "json/JsonTestSimpleMixedType") 62 | 63 | testMixedDataTypesSimpleDiffJsonFormat("json/JsonTestSimpleMixedType.json", "json/JsonTestSimple") 64 | 65 | testSameDataTypesSimpleDiffMissingElementJsonFormat("json/JsonTestSimpleMissingElement.json", "json/JsonTestSimple") 66 | 67 | 
testSameDataTypesSimpleDiffExtraNullElementJsonFormat("json/JsonTestSimpleExtraNull.json", "json/JsonTestSimple") 68 | } 69 | -------------------------------------------------------------------------------- /mega-spark-diff/src/test/scala/org/finra/msd/sparkcompare/JsonToJsonSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.finra.msd.sparkcompare 18 | 19 | import org.finra.msd.basetestclasses.{JsonFormatTests, SparkFunSuite} 20 | import org.finra.msd.containers.DiffResult 21 | import org.finra.msd.sparkfactory.SparkFactory 22 | import org.finra.msd.sparkfactory.SparkFactory.sparkSession 23 | 24 | class JsonToJsonSuite extends SparkFunSuite 25 | with JsonFormatTests { 26 | override def returnDiff(jsonFile1: String, jsonFile2: String, sameSchema: Boolean): DiffResult = { 27 | val schema1 = sparkSession.sqlContext.read 28 | .option("multiLine", "true") 29 | .option("primitivesAsString", "true") 30 | .json(this.getClass.getClassLoader.getResource(jsonFile1).getPath).schema 31 | 32 | val schema2 = sparkSession.sqlContext.read 33 | .option("multiLine", "true") 34 | .option("primitivesAsString", "true") 35 | .json(this.getClass.getClassLoader.getResource(jsonFile2).getPath).schema 36 | 37 | parallelizeTablesAndCompare(jsonFile1, jsonFile2, schema1.fieldNames, if (sameSchema) schema1.fieldNames else schema2.fieldNames) 38 | } 39 | 40 | def parallelizeTablesAndCompare(jsonFile1: String, jsonFile2: String, 41 | fieldNames1: Array[String], fieldNames2: Array[String]): DiffResult = { 42 | val filePath1 = this.getClass.getClassLoader.getResource(jsonFile1).getPath 43 | val leftAppleTable = SparkFactory.parallelizeJSONSource(filePath1, "json_left", fieldNames1) 44 | val filePath2 = this.getClass.getClassLoader.getResource(jsonFile2).getPath 45 | val rightAppleTable = SparkFactory.parallelizeJSONSource(filePath2, "json_right", fieldNames2) 46 | 47 | SparkCompare.compareAppleTables(leftAppleTable, rightAppleTable) 48 | } 49 | 50 | testSameDataTypesComplexJsonFormat("compare/JsonTestMapList.json", "compare/JsonTestMapList.json") 51 | 52 | testSameDataTypesComplexDiffValueJsonFormat("compare/JsonTestMapListDiffValue.json", "compare/JsonTestMapList.json") 53 | 54 | testMixedDataTypesSimpleJsonFormat("json/JsonTestSimpleMixedType.json", "json/JsonTestSimpleMixedType.json") 55 | 56 | testMixedDataTypesSimpleDiffJsonFormat("json/JsonTestSimpleMixedType.json", "json/JsonTestSimple.json") 57 | 58 | testSameDataTypesSimpleDiffMissingElementJsonFormat("json/JsonTestSimpleMissingElement.json", "json/JsonTestSimple.json") 59 | 60 | testSameDataTypesSimpleDiffExtraNullElementJsonFormat("json/JsonTestSimpleExtraNull.json", "json/JsonTestSimple.json") 61 | } 62 | -------------------------------------------------------------------------------- 
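Every comparison suite above follows the same three-step recipe: parallelize each source into an AppleTable via a SparkFactory.parallelize* call, hand both tables to SparkCompare.compareAppleTables, and assert on the resulting DiffResult. Below is a minimal sketch of that recipe in isolation, reusing the txt/Fruit fixtures from the test resources; the suite name ComparePatternSketchSuite, the temp view names, and the expected counts are illustrative assumptions (FileToFileSuite above expects 2 differences for Fruit1 vs Fruit3), not part of the repo:

package org.finra.msd.sparkcompare

import org.finra.msd.basetestclasses.SparkFunSuite
import org.finra.msd.containers.DiffResult
import org.finra.msd.sparkfactory.SparkFactory

class ComparePatternSketchSuite extends SparkFunSuite {
  test("sketch: compare two text sources") {
    // Step 1: parallelize each source into an AppleTable (source type + DataFrame + temp view).
    val leftPath = this.getClass.getClassLoader.getResource("txt/Fruit1.txt").getPath
    val rightPath = this.getClass.getClassLoader.getResource("txt/Fruit3.txt").getPath
    val leftAppleTable = SparkFactory.parallelizeTextSource(leftPath, "sketch_left")
    val rightAppleTable = SparkFactory.parallelizeTextSource(rightPath, "sketch_right")

    // Step 2: compare; the DiffResult carries one DataFrame per direction of difference.
    val diffResult: DiffResult = SparkCompare.compareAppleTables(leftAppleTable, rightAppleTable)

    // Step 3: assert on the asymmetric differences.
    assert(diffResult.inLeftNotInRight.count() == 2)
    assert(diffResult.inRightNotInLeft.count() == 2)
  }
}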
/mega-spark-diff/src/test/scala/org/finra/msd/sparkfactory/DynamoDbSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.finra.msd.sparkfactory 18 | 19 | import org.finra.msd.basetestclasses.SparkFunSuiteDynamoDb 20 | import org.finra.msd.enums.SourceType 21 | import org.finra.msd.helpers.FileHelper 22 | import org.finra.msd.sparkfactory.SparkFactory.sparkSession 23 | 24 | import scala.collection.convert.ImplicitConversions.`collection AsScalaIterable` 25 | 26 | class DynamoDbSuite extends SparkFunSuiteDynamoDb { 27 | 28 | val directoryJson = "json" 29 | val directoryDynamoDb = "dynamodb" 30 | val expectedSourceType = SourceType.DYNAMODB 31 | 32 | (FileHelper.getFilenames(directoryJson, "JsonTest", ".json") 33 | ++ FileHelper.getFilenames(directoryDynamoDb, "DynamoDbTest", ".json")) 34 | .foreach(filename => { 35 | test("SparkFactory DynamoDb test - " + filename) { 36 | val baseFile = filename.substring(0, filename.length - 5) 37 | val tableName = baseFile.replaceAll("/", "_") 38 | val jsonFile = filename 39 | val txtFile = baseFile + ".txt" 40 | val jsonPath = this.getClass.getClassLoader.getResource(jsonFile).getPath 41 | val txtPath = this.getClass.getClassLoader.getResource(txtFile).getPath 42 | 43 | val schema = sparkSession.sqlContext.read 44 | .option("multiLine", "true") 45 | .option("primitivesAsString", "true") 46 | .json(jsonPath).schema 47 | 48 | val appleTable = SparkFactory.parallelizeDynamoDBSource(tableName, "dynamodb_test", schema.fieldNames) 49 | assert(appleTable.getSourceType.equals(expectedSourceType)) 50 | 51 | helpers.compareJsonDataFrameActualToExpected(appleTable, 52 | if (baseFile.endsWith("SetDiffElementOrder")) 53 | jsonPath.replace("SetDiffElementOrder.json", "Set.json") 54 | else jsonPath, 55 | txtPath, Seq("key1", "key2"), ";") 56 | } 57 | } 58 | ) 59 | 60 | test("Dynamodb column filter") { 61 | val jsonPath = this.getClass.getClassLoader.getResource("json/JsonTestSimple.json").getPath 62 | 63 | val schema = sparkSession.sqlContext.read 64 | .option("multiLine", "true") 65 | .option("primitivesAsString", "true") 66 | .json(jsonPath).schema 67 | 68 | val appleTable = SparkFactory.parallelizeDynamoDBSource("json_JsonTestSimple", "dynamodb_test", schema.fieldNames, 69 | Option.apply(","), selectColumns = Option.apply(Array("key1", "attribute1"))) 70 | assert(appleTable.getSourceType.equals(expectedSourceType)) 71 | 72 | val dataResult = appleTable.getDataFrame.orderBy("key1").collect() 73 | assert(dataResult.length == 3) 74 | 75 | val schemaResult = appleTable.getDataFrame.schema 76 | assert(schemaResult.fields.length == 2) 77 | 78 | assert(dataResult.take(3)(0)(schemaResult.fieldIndex("key1")).equals("\"TEST1\"")) 79 | assert(dataResult.take(3)(0)(schemaResult.fieldIndex("attribute1")).equals("\"test number 1\"")) 80 | 
assert(dataResult.take(3)(1)(schemaResult.fieldIndex("key1")).equals("\"TEST2\"")) 81 | assert(dataResult.take(3)(1)(schemaResult.fieldIndex("attribute1")).equals("\"test number 2\"")) 82 | assert(dataResult.take(3)(2)(schemaResult.fieldIndex("key1")).equals("\"TEST3\"")) 83 | assert(dataResult.take(3)(2)(schemaResult.fieldIndex("attribute1")).equals("\"test number 3\"")) 84 | } 85 | 86 | test("Dynamodb row filter") { 87 | val jsonPath = this.getClass.getClassLoader.getResource("json/JsonTestSimple.json").getPath 88 | 89 | val schema = sparkSession.sqlContext.read 90 | .option("multiLine", "true") 91 | .option("primitivesAsString", "true") 92 | .json(jsonPath).schema 93 | 94 | val appleTable = SparkFactory.parallelizeDynamoDBSource("json_JsonTestSimple", "dynamodb_test", schema.fieldNames, 95 | delimiter = Option.apply(","), filter = Option.apply("key1 = 'TEST3'")) 96 | assert(appleTable.getSourceType.equals(expectedSourceType)) 97 | 98 | val dataResult = appleTable.getDataFrame.collect() 99 | assert(dataResult.length == 1) 100 | 101 | val schemaResult = appleTable.getDataFrame.schema 102 | assert(schemaResult.fields.length == 5) 103 | 104 | assert(dataResult.take(1)(0)(schemaResult.fieldIndex("key1")).equals("\"TEST3\"")) 105 | assert(dataResult.take(1)(0)(schemaResult.fieldIndex("key2")).equals("3")) 106 | assert(dataResult.take(1)(0)(schemaResult.fieldIndex("attribute1")).equals("\"test number 3\"")) 107 | assert(dataResult.take(1)(0)(schemaResult.fieldIndex("attribute2")).equals("\"true\"")) 108 | assert(dataResult.take(1)(0)(schemaResult.fieldIndex("attribute3")).equals("\"3\"")) 109 | } 110 | 111 | test("Dynamodb column and row filter") { 112 | val jsonPath = this.getClass.getClassLoader.getResource("json/JsonTestSimple.json").getPath 113 | 114 | val schema = sparkSession.sqlContext.read 115 | .option("multiLine", "true") 116 | .option("primitivesAsString", "true") 117 | .json(jsonPath).schema 118 | 119 | val appleTable = SparkFactory.parallelizeDynamoDBSource("json_JsonTestSimple", "dynamodb_test", schema.fieldNames, 120 | delimiter = Option.apply(","), selectColumns = Option.apply(Array("key1", "attribute1")), filter = Option.apply("key1 = 'TEST2'")) 121 | assert(appleTable.getSourceType.equals(expectedSourceType)) 122 | 123 | val dataResult = appleTable.getDataFrame.collect() 124 | assert(dataResult.length == 1) 125 | 126 | val schemaResult = appleTable.getDataFrame.schema 127 | assert(schemaResult.fields.length == 2) 128 | 129 | assert(dataResult.take(1)(0)(schemaResult.fieldIndex("key1")).equals("\"TEST2\"")) 130 | assert(dataResult.take(1)(0)(schemaResult.fieldIndex("attribute1")).equals("\"test number 2\"")) 131 | } 132 | 133 | test("Dynamodb row filter post scan") { 134 | val jsonPath = this.getClass.getClassLoader.getResource("json/JsonTestSimple.json").getPath 135 | 136 | val schema = sparkSession.sqlContext.read 137 | .option("multiLine", "true") 138 | .option("primitivesAsString", "true") 139 | .json(jsonPath).schema 140 | 141 | val appleTable = SparkFactory.parallelizeDynamoDBSource("json_JsonTestSimple", "dynamodb_test", schema.fieldNames, 142 | delimiter = Option.apply(","), filter = Option.apply("key1 = '\"TEST3\"'"), 143 | filterPushdown = Option.apply("false") 144 | ) 145 | assert(appleTable.getSourceType.equals(expectedSourceType)) 146 | 147 | val dataResult = appleTable.getDataFrame.collect() 148 | assert(dataResult.length == 1) 149 | 150 | val schemaResult = appleTable.getDataFrame.schema 151 | assert(schemaResult.fields.length == 5) 152 | 153 | 
assert(dataResult.take(1)(0)(schemaResult.fieldIndex("key1")).equals("\"TEST3\"")) 154 | assert(dataResult.take(1)(0)(schemaResult.fieldIndex("key2")).equals("3")) 155 | assert(dataResult.take(1)(0)(schemaResult.fieldIndex("attribute1")).equals("\"test number 3\"")) 156 | assert(dataResult.take(1)(0)(schemaResult.fieldIndex("attribute2")).equals("\"true\"")) 157 | assert(dataResult.take(1)(0)(schemaResult.fieldIndex("attribute3")).equals("\"3\"")) 158 | } 159 | 160 | test("Dynamodb column and row filter with multiple options") { 161 | val jsonPath = this.getClass.getClassLoader.getResource("json/JsonTestSimple.json").getPath 162 | 163 | val schema = sparkSession.sqlContext.read 164 | .option("multiLine", "true") 165 | .option("primitivesAsString", "true") 166 | .json(jsonPath).schema 167 | 168 | val appleTable = SparkFactory.parallelizeDynamoDBSource("json_JsonTestSimple", "dynamodb_test", schema.fieldNames, 169 | delimiter = Option.apply(","), selectColumns = Option.apply(Array("key1", "attribute1")), filter = Option.apply("key1 = 'TEST2'"), 170 | region = Option.apply("us-east-1"), roleArn = Option.empty, 171 | readPartitions = Option.apply("2"), maxPartitionBytes = Option.apply("64"), 172 | defaultParallelism = Option.apply("2"), targetCapacity = Option.apply("0.6"), 173 | stronglyConsistentReads = Option.apply("true"), bytesPerRCU = Option.apply("40"), 174 | filterPushdown = Option.empty, throughput = Option.apply("10")) 175 | assert(appleTable.getSourceType.equals(expectedSourceType)) 176 | 177 | val dataResult = appleTable.getDataFrame.collect() 178 | assert(dataResult.length == 1) 179 | 180 | val schemaResult = appleTable.getDataFrame.schema 181 | assert(schemaResult.fields.length == 2) 182 | 183 | assert(dataResult.take(1)(0)(schemaResult.fieldIndex("key1")).equals("\"TEST2\"")) 184 | assert(dataResult.take(1)(0)(schemaResult.fieldIndex("attribute1")).equals("\"test number 2\"")) 185 | } 186 | } 187 | -------------------------------------------------------------------------------- /mega-spark-diff/src/test/scala/org/finra/msd/sparkfactory/JdbcSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.finra.msd.sparkfactory 18 | 19 | import org.finra.msd.basetestclasses.SparkFunSuite 20 | import org.finra.msd.enums.SourceType 21 | import org.finra.msd.helpers.FileHelper 22 | import org.finra.msd.memorydb.MemoryDbHsql 23 | 24 | import scala.collection.convert.ImplicitConversions.`collection AsScalaIterable` 25 | 26 | class JdbcSuite extends SparkFunSuite { 27 | 28 | val directory = "jdbc" 29 | val expectedSourceType = SourceType.JDBC 30 | 31 | FileHelper.getFilenames(directory, "JdbcTest", ".sql") 32 | .foreach(filename => { 33 | test("SparkFactory Jdbc test - " + filename) { 34 | val baseFile = filename.substring(0, filename.length - 4) 35 | 36 | val tableName = baseFile.replace(directory + "/", "").toUpperCase() 37 | 38 | val appleTable = SparkFactory.parallelizeJDBCSource(MemoryDbHsql.hsqlDriverName, MemoryDbHsql.hsqlUrl, "SA", "", 39 | "(select * from " + tableName + ")", tableName + "_jdbc_test") 40 | 41 | val jsonFormatDf = SparkFactory.simpleTableToSimpleJSONFormatTable(appleTable.dataFrame) 42 | 43 | assert(appleTable.getSourceType.equals(expectedSourceType)) 44 | 45 | helpers.compareJdbcDataFrameSimpleToJsonFormat(appleTable, jsonFormatDf, Seq("Fruit")) 46 | } 47 | }) 48 | 49 | test("parallelizeSqlQueryTest") { 50 | val appleTable = SparkFactory.parallelizeJDBCSource(MemoryDbHsql.hsqlDriverName, MemoryDbHsql.hsqlUrl, "SA", "", 51 | "(select * from Persons1)", "table1") 52 | if (appleTable.getDataFrame.count == 0) fail("dataset was empty") 53 | } 54 | 55 | test("parallelizeSqlQueryWithPartitioning") { 56 | val rightAppleTable = SparkFactory.parallelizeJDBCSource(MemoryDbHsql.hsqlDriverName, MemoryDbHsql.hsqlUrl, "SA", "", 57 | "(select * from Fruit1 )", "my_partition_test", Option.empty, "Price", "0", "200000", "2") 58 | if (rightAppleTable.getDataFrame.rdd.getNumPartitions != 2) fail("expected 2 partitions but received " + rightAppleTable.getDataFrame.rdd.getNumPartitions) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /mega-spark-diff/src/test/scala/org/finra/msd/sparkfactory/JsonSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | package org.finra.msd.sparkfactory 18 | 19 | import org.finra.msd.basetestclasses.SparkFunSuite 20 | import org.finra.msd.enums.SourceType 21 | import org.finra.msd.helpers.FileHelper 22 | import org.finra.msd.sparkfactory.SparkFactory.sparkSession 23 | 24 | import scala.collection.convert.ImplicitConversions.`collection AsScalaIterable` 25 | 26 | class JsonSuite extends SparkFunSuite { 27 | 28 | val directory = "json" 29 | val expectedSourceType = SourceType.JSON 30 | 31 | FileHelper.getFilenames(directory, "JsonTest", ".json") 32 | .foreach(filename => { 33 | test("SparkFactory Json test - " + filename) { 34 | val baseFile = filename.substring(0, filename.length - 5) 35 | val jsonFile = filename 36 | val txtFile = baseFile + ".txt" 37 | val jsonPath = this.getClass.getClassLoader.getResource(jsonFile).getPath 38 | val txtPath = this.getClass.getClassLoader.getResource(txtFile).getPath 39 | 40 | val schema = sparkSession.sqlContext.read 41 | .option("multiLine", "true") 42 | .option("primitivesAsString", "true") 43 | .json(jsonPath).schema 44 | 45 | val appleTable = SparkFactory.parallelizeJSONSource(jsonPath, "json_test", schema.fieldNames) 46 | assert(appleTable.getSourceType.equals(expectedSourceType)) 47 | 48 | helpers.compareJsonDataFrameActualToExpected(appleTable, jsonPath, txtPath, Seq("key1", "key2"), ";") 49 | } 50 | }) 51 | } 52 | -------------------------------------------------------------------------------- /mega-spark-diff/src/test/scala/org/finra/msd/stats/CountsSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.finra.msd.stats 18 | 19 | import org.apache.spark.sql.DataFrame 20 | import org.finra.msd.basetestclasses.SparkFunSuite 21 | import org.finra.msd.containers.DiffResult 22 | import org.finra.msd.sparkcompare.SparkCompare 23 | import org.scalatest.BeforeAndAfterAll 24 | 25 | class CountsSuite extends SparkFunSuite with BeforeAndAfterAll { 26 | 27 | import testImplicits._ 28 | 29 | test("Way too many discrepancies") { 30 | 31 | val key: Seq[String] = Seq("key1", "key2") 32 | 33 | val left = Seq( 34 | ("1","1","Adam","Andreson"), 35 | ("2","2","Bob","Branson"), 36 | ("4","4","Chad","Charly"), 37 | ("5","5","Joe","Smith"), 38 | ("5","5","Joe","Smith"), 39 | ("6","6","Edward","Eddy"), 40 | ("7","7","normal","normal") 41 | ).toDF("key1" , "key2" , "value1" , "value2") 42 | 43 | val right = Seq( 44 | ("3","3","Young","Yan"), 45 | ("5","5","Joe","Smith"), 46 | ("6","6","Edward","Eddy"), 47 | ("7","7","normal","normal"), 48 | (null,null,"null key","null key") 49 | ).toDF("key1" , "key2", "value1" , "value2") 50 | 51 | val comparisonResult: DiffResult = SparkCompare.compareSchemaDataFrames(left, right) 52 | val stats = comparisonResult.discrepancyStats(key) 53 | 54 | val result = stats.orderBy("COLUMN_NAME").collect() 55 | val schemaResult = stats.schema 56 | 57 | assert(result.length == 2) 58 | assert(result.take(2)(0)(schemaResult.fieldIndex("COLUMN_NAME")).equals("VALUE1")) 59 | assert(result.take(2)(0)(schemaResult.fieldIndex("DISCREPANCIES")) == 5) 60 | assert(result.take(2)(1)(schemaResult.fieldIndex("COLUMN_NAME")).equals("VALUE2")) 61 | assert(result.take(2)(1)(schemaResult.fieldIndex("DISCREPANCIES")) == 5) 62 | } 63 | 64 | test("A few discrepancies across two non-key columns") { 65 | 66 | val key: Seq[String] = Seq("a_column") 67 | 68 | val left = Seq( 69 | ("a1","b1","c1","d1"), 70 | ("a2","b2","c2","d2"), 71 | ("a3","b3","c3","d3"), 72 | ("a4","b4","c4","d4"), 73 | ("a5","b5","c5","d5"), 74 | ("a6","b6","c6","d6"), 75 | ("a7","b7","c7","d7") 76 | ).toDF("a_column","b_column","c_column","d_column") 77 | 78 | val right = Seq( 79 | ("a1","b1","c1","d1"), 80 | ("a2","b2","c2","x2"), 81 | ("a3","b3","c3","d3"), 82 | ("a4","x4","c4","d4"), 83 | ("a5","b5","c5","d5"), 84 | ("a6","b6","c6","d6"), 85 | ("a7","x7","c7","d7") 86 | ).toDF("a_column","b_column","c_column","d_column") 87 | 88 | val comparisonResult: DiffResult = SparkCompare.compareSchemaDataFrames(left, right) 89 | val stats = comparisonResult.discrepancyStats(key) 90 | 91 | val result = stats.orderBy("COLUMN_NAME").collect() 92 | val schemaResult = stats.schema 93 | 94 | assert(result.length == 3) 95 | assert(result.take(3)(0)(schemaResult.fieldIndex("COLUMN_NAME")).equals("B_COLUMN")) 96 | assert(result.take(3)(0)(schemaResult.fieldIndex("DISCREPANCIES")) == 2) 97 | assert(result.take(3)(1)(schemaResult.fieldIndex("COLUMN_NAME")).equals("C_COLUMN")) 98 | assert(result.take(3)(1)(schemaResult.fieldIndex("DISCREPANCIES")) == 0) 99 | assert(result.take(3)(2)(schemaResult.fieldIndex("COLUMN_NAME")).equals("D_COLUMN")) 100 | assert(result.take(3)(2)(schemaResult.fieldIndex("DISCREPANCIES")) == 1) 101 | } 102 | 103 | test("Discrepancies in the key columns") { 104 | 105 | val key: Seq[String] = Seq("a_column") 106 | 107 | val left = Seq( 108 | ("a1","b1","c1","d1"), 109 | ("a2","b2","c2","d2"), 110 | ("a3","b3","c3","d3"), 111 | ("a4","b4","c4","d4"), 112 | ("a5","b5","c5","d5"), 113 | ("a6","b6","c6","d6"), 114 | ("a7","b7","c7","d7") 115 | ).toDF("a_column","b_column","c_column","d_column") 116 | 117 | val 
right = Seq( 118 | ("a1","b1","c1","d1"), 119 | ("a2","b2","c2","d2"), 120 | ("a3","b3","c3","d3"), 121 | ("a8","b4","c4","d4"), 122 | ("a5","b5","c5","d5"), 123 | ("a6","b6","c6","d6"), 124 | ("a9","b7","c7","d7") 125 | ).toDF("a_column","b_column","c_column","d_column") 126 | 127 | val comparisonResult: DiffResult = SparkCompare.compareSchemaDataFrames(left, right) 128 | val stats = comparisonResult.discrepancyStats(key) 129 | 130 | val result = stats.orderBy("COLUMN_NAME").collect() 131 | val schemaResult = stats.schema 132 | 133 | assert(result.length == 3) 134 | assert(result.take(3)(0)(schemaResult.fieldIndex("COLUMN_NAME")).equals("B_COLUMN")) 135 | assert(result.take(3)(0)(schemaResult.fieldIndex("DISCREPANCIES")) == 4) 136 | assert(result.take(3)(1)(schemaResult.fieldIndex("COLUMN_NAME")).equals("C_COLUMN")) 137 | assert(result.take(3)(1)(schemaResult.fieldIndex("DISCREPANCIES")) == 4) 138 | assert(result.take(3)(2)(schemaResult.fieldIndex("COLUMN_NAME")).equals("D_COLUMN")) 139 | assert(result.take(3)(2)(schemaResult.fieldIndex("DISCREPANCIES")) == 4) 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /mega-spark-diff/src/test/scala/org/finra/msd/visualization/VisualizerSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 MegaSparkDiff Contributors 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package org.finra.msd.visualization 18 | 19 | import org.apache.spark.sql.{DataFrame, Dataset, Row} 20 | import org.finra.msd.basetestclasses.SparkFunSuite 21 | import org.finra.msd.containers.DiffResult 22 | import org.finra.msd.memorydb.MemoryDbHsql 23 | import org.finra.msd.sparkcompare.SparkCompare 24 | import org.finra.msd.sparkfactory.SparkFactory 25 | 26 | class VisualizerSuite extends SparkFunSuite { 27 | 28 | import testImplicits._ 29 | 30 | private def generateString(left: Dataset[Row], right: Dataset[Row], key: String, maxRecords: Int) = { 31 | //Primary Key as Java List 32 | val primaryKeySeq = Seq(key) 33 | val html = Visualizer.generateVisualizerTemplate(left, right, primaryKeySeq, maxRecords) 34 | html 35 | } 36 | 37 | private def getAppleTablediffResult(testName1: String, testName2: String) = { 38 | val leftAppleTable = SparkFactory.parallelizeJDBCSource(MemoryDbHsql.hsqlDriverName, MemoryDbHsql.hsqlUrl, "SA", "", 39 | "(select * from " + testName1 + ")", "table1") 40 | val rightAppleTable = SparkFactory.parallelizeJDBCSource(MemoryDbHsql.hsqlDriverName, MemoryDbHsql.hsqlUrl, "SA", "", 41 | "(select * from " + testName2 + ")", "table2") 42 | SparkCompare.compareAppleTables(leftAppleTable, rightAppleTable) 43 | } 44 | 45 | test("Visualize as Text") { 46 | val left = Seq( 47 | ("key11", "key12", "A", "A"), 48 | ("key21", "key22", "B", "B"), 49 | ("4", "4", "C", "C"), 50 | ("5", "5", "D", "D"), 51 | ("5", "5", "D", "D"), 52 | ("6", "6", "E", "E") 53 | ).toDF("key1", "key2", "value1", "value2") 54 | 55 | val right = Seq( 56 | ("key11", "key12", null, null), 57 | ("3", "3", "Y", "Y"), 58 | ("5", "5", "D", "D"), 59 | ("6", "6", "E", "E"), 60 | (null, null, "zz", "zz") 61 | ).toDF("key1", "key2", "value1", "value2") 62 | 63 | val comparisonResult: DiffResult = SparkCompare.compareSchemaDataFrames(left, right) 64 | 65 | val key: Seq[String] = Seq("key1", "key2") 66 | val joinedResults: DataFrame = comparisonResult.fullOuterJoinDataFrames(key) 67 | val html = Visualizer.renderHorizontalTable(joinedResults, 100) 68 | 69 | assert(html.contains("class='different'")) 70 | assert(html.contains("class='same'")) 71 | assert(html.contains("")) 72 | } 73 | 74 | test("basicVisualizerTest") { 75 | val diffResult = getAppleTablediffResult("EnhancedFruit1", "EnhancedFruit2") 76 | val html = generateString(diffResult.inLeftNotInRight, diffResult.inRightNotInLeft, "FRUIT", 100) 77 | if (html.isEmpty) fail("html was empty") 78 | } 79 | 80 | test("emptyLeftDfTest") { 81 | val diffResult = getAppleTablediffResult("Fruit4", "Fruit1") 82 | val html = generateString(diffResult.inLeftNotInRight, diffResult.inRightNotInLeft, "FRUIT", 100) 83 | if (html.isEmpty) fail("html was empty") 84 | } 85 | 86 | test("emptyRightDfTest") { 87 | val diffResult = getAppleTablediffResult("Fruit1", "Fruit4") 88 | val html = generateString(diffResult.inLeftNotInRight, diffResult.inRightNotInLeft, "FRUIT", 100) 89 | if (html.isEmpty) fail("html was empty") 90 | } 91 | 92 | test("nullLeftDfTest") { 93 | val diffResult = getAppleTablediffResult("Fruit1", "Fruit4") 94 | val html = generateString(null, diffResult.inRightNotInLeft, "FRUIT", 100) 95 | assert("
Error message: Left dataframe is null
" == html) 96 | } 97 | 98 | test("nullRightDfTest") { 99 | val diffResult = getAppleTablediffResult("Fruit1", "Fruit4") 100 | val html = generateString(diffResult.inLeftNotInRight, null, "FRUIT", 100) 101 | assert("
Error message: Right dataframe is null
" == html) 102 | } 103 | 104 | test("emptyKeyTest") { 105 | val diffResult = getAppleTablediffResult("Fruit1", "Fruit4") 106 | val html = generateString(diffResult.inLeftNotInRight, diffResult.inRightNotInLeft, "", 100) 107 | assert("
Error message: One or more keys is empty or null
" == html) 108 | } 109 | 110 | test("nullKeyTest") { 111 | val diffResult = getAppleTablediffResult("Fruit1", "Fruit4") 112 | val html = generateString(diffResult.inLeftNotInRight, diffResult.inRightNotInLeft, null, 100) 113 | assert("
Error message: One or more keys is empty or null
" == html) 114 | } 115 | 116 | test("keyCaseTest") { 117 | val diffResult = getAppleTablediffResult("Fruit1", "Fruit4") 118 | var flag = true 119 | var result1 = "" 120 | var result2 = "" 121 | try { 122 | result1 = generateString(diffResult.inLeftNotInRight, diffResult.inRightNotInLeft, "Fruit", 100) 123 | result2 = generateString(diffResult.inLeftNotInRight, diffResult.inRightNotInLeft, "FrUit", 100) 124 | } catch { 125 | case ex: Exception => 126 | flag = false 127 | } 128 | assert(flag) 129 | assert(result1 == result2) 130 | } 131 | 132 | test("invalidMaxRecordsTest") { 133 | val diffResult = getAppleTablediffResult("Fruit1", "Fruit4") 134 | var flag = true 135 | try generateString(diffResult.inLeftNotInRight, diffResult.inRightNotInLeft, "FRUIT", -100) 136 | catch { 137 | case ex: Exception => 138 | flag = false 139 | } 140 | assert(flag) 141 | } 142 | } 143 | --------------------------------------------------------------------------------