├── .gitallowed
├── .gitignore
├── cloud-examples
└── src
│ ├── test
│ ├── scala
│ │ ├── com
│ │ │ └── cloudera
│ │ │ │ └── spark
│ │ │ │ └── cloud
│ │ │ │ ├── s3
│ │ │ │ ├── S3ASeekReadNormalIOSuite.scala
│ │ │ │ ├── TestParquetBinding.scala
│ │ │ │ ├── S3ASeekReadRandomIOSuite.scala
│ │ │ │ ├── commit
│ │ │ │ │ ├── AbstractS3ACommitterSuite.scala
│ │ │ │ │ ├── S3ACommitterFactorySuite.scala
│ │ │ │ │ ├── Events.scala
│ │ │ │ │ └── S3ACommitDataframeSuite.scala
│ │ │ │ ├── S3ABasicIOSuite.scala
│ │ │ │ ├── S3ANumbersSuite.scala
│ │ │ │ ├── S3AStreamingSuite.scala
│ │ │ │ ├── S3ASeekReadSequentialIOSuite.scala
│ │ │ │ ├── S3ADataFrameSuite.scala
│ │ │ │ ├── S3ALineCountSuite.scala
│ │ │ │ ├── S3ANumbersSuiteV2APISuite.scala
│ │ │ │ ├── S3AFileGeneratorSuite.scala
│ │ │ │ ├── S3DependencyCheckSuite.scala
│ │ │ │ ├── S3ACSVReadSuite.scala
│ │ │ │ ├── S3AEncryptionSuite.scala
│ │ │ │ └── S3ALineCountWritebackSuite.scala
│ │ │ │ ├── gs
│ │ │ │ ├── GsDataFrameSuite.scala
│ │ │ │ ├── GsCSVReadSuite.scala
│ │ │ │ ├── GsBasicIOSuite.scala
│ │ │ │ ├── GsCommitDataframeSuite.scala
│ │ │ │ ├── AbstractGsCommitterSuite.scala
│ │ │ │ └── GSDependencyCheckSuite.scala
│ │ │ │ ├── abfs
│ │ │ │ ├── AbfsBasicIOSuite.scala
│ │ │ │ ├── AbfsDataFrameSuite.scala
│ │ │ │ ├── AbfsCSVReadSuite.scala
│ │ │ │ └── commit
│ │ │ │ │ ├── AbfsCommitDataframeSuite.scala
│ │ │ │ │ └── AbstractAbfsCommitterSuite.scala
│ │ │ │ ├── azure
│ │ │ │ ├── AzureBasicIOSuite.scala
│ │ │ │ ├── AzureStreamingSuite.scala
│ │ │ │ ├── AzureCSVReadSuite.scala
│ │ │ │ ├── AzureSeekReadSuite.scala
│ │ │ │ ├── AzureFileGeneratorSuite.scala
│ │ │ │ ├── AzureLineCountSuite.scala
│ │ │ │ └── AzureDataFrameSuite.scala
│ │ │ │ ├── csv
│ │ │ │ ├── LocalHugeCsvIOSuite.scala
│ │ │ │ └── AbfsHugeCsvIOSuite.scala
│ │ │ │ ├── common
│ │ │ │ ├── StreamingTests.scala
│ │ │ │ ├── CloudSuiteWithCSVDatasource.scala
│ │ │ │ ├── HadoopVersionSuite.scala
│ │ │ │ ├── ReadSample.scala
│ │ │ │ ├── DataFrameTests.scala
│ │ │ │ ├── FileGeneratorTests.scala
│ │ │ │ └── SeekReadTests.scala
│ │ │ │ ├── examples
│ │ │ │ └── S3DataFrameExampleSuite.scala
│ │ │ │ └── committers
│ │ │ │ └── AbstractCommitterSuite.scala
│ │ └── org
│ │ │ └── apache
│ │ │ └── spark
│ │ │ ├── sql
│ │ │ ├── hive
│ │ │ │ └── orc
│ │ │ │ │ ├── gs
│ │ │ │ │ ├── GsParquetPartitionSuite.scala
│ │ │ │ │ ├── GsOrcRelationSuite.scala
│ │ │ │ │ ├── GsParquetRelationSuite.scala
│ │ │ │ │ ├── GsOrcPartitionSuite.scala
│ │ │ │ │ └── GsParquetRelationScaleSuite.scala
│ │ │ │ │ ├── abfs
│ │ │ │ │ ├── AbfsParquetPartitionSuite.scala
│ │ │ │ │ ├── AbfsOrcRelationSuite.scala
│ │ │ │ │ ├── AbfsOrcPartitionSuite.scala
│ │ │ │ │ ├── AbfsParquetRelationSuite.scala
│ │ │ │ │ └── AbfsParquetRelationScaleSuite.scala
│ │ │ │ │ └── cloud
│ │ │ │ │ ├── S3AOrcRelationSuite.scala
│ │ │ │ │ ├── S3AOrcPartitionSuite.scala
│ │ │ │ │ ├── S3AParquetPartitionSuite.scala
│ │ │ │ │ ├── S3AParquetRelationSuite.scala
│ │ │ │ │ ├── S3AParquetRelationScaleSuite.scala
│ │ │ │ │ └── S3AOrcRelationScaleSuite.scala
│ │ │ └── sources
│ │ │ │ ├── MustDeclareDatasource.scala
│ │ │ │ ├── ParquetRelationTrait.scala
│ │ │ │ ├── AbtractOrcRelationSuite.scala
│ │ │ │ ├── CloudPartitionTest.scala
│ │ │ │ └── HiveTestTrait.scala
│ │ │ └── SparkScopeWorkarounds.scala
│ └── resources
│ │ ├── core-site.xml
│ │ └── log4j2.properties
│ └── main
│ ├── scala
│ ├── com
│ │ └── cloudera
│ │ │ └── spark
│ │ │ └── cloud
│ │ │ ├── s3
│ │ │ ├── NormalIOPolicy.scala
│ │ │ ├── SequentialIOPolicy.scala
│ │ │ ├── IOPolicy.scala
│ │ │ ├── RandomIOPolicy.scala
│ │ │ ├── S3AFileGenerator.scala
│ │ │ ├── S3ALineCount.scala
│ │ │ ├── S3AStreaming.scala
│ │ │ ├── S3ADataFrames.scala
│ │ │ ├── S3AExampleSetup.scala
│ │ │ └── S3ATestSetup.scala
│ │ │ ├── utils
│ │ │ ├── Demo.scala
│ │ │ ├── ForceRecentHadoopVersion.scala
│ │ │ ├── ExtraAssertions.scala
│ │ │ └── StatisticsTracker.scala
│ │ │ ├── adl
│ │ │ └── AdlTestSetup.scala
│ │ │ ├── azure
│ │ │ └── AzureTestSetup.scala
│ │ │ ├── gs
│ │ │ └── GsTestSetup.scala
│ │ │ ├── abfs
│ │ │ └── AbfsTestSetup.scala
│ │ │ ├── local
│ │ │ └── LocalTestSetup.scala
│ │ │ ├── common
│ │ │ ├── CsvDatasourceSupport.scala
│ │ │ ├── ContextFreeCloudSuite.scala
│ │ │ ├── StoreTestHelper.scala
│ │ │ ├── StoreTestOperations.scala
│ │ │ └── CloudSuite.scala
│ │ │ └── examples
│ │ │ └── AzureStreamingExample.scala
│ └── org
│ │ └── apache
│ │ └── hadoop
│ │ └── fs
│ │ └── FSHelper.scala
│ ├── site
│ └── using.md
│ └── resources
│ └── log4j.properties
├── spark-cloud-integration
└── src
│ └── main
│ ├── scala
│ ├── org
│ │ └── apache
│ │ │ └── spark
│ │ │ └── cloudera
│ │ │ ├── package.scala
│ │ │ └── statistics
│ │ │ ├── IOStatisticsAccumulator.scala
│ │ │ └── IOStatisticsCollectorExecutorPlugin.scala
│ └── com
│ │ └── cloudera
│ │ └── spark
│ │ └── cloud
│ │ ├── test
│ │ └── UnitTestSuite.scala
│ │ ├── s3
│ │ └── audit
│ │ │ ├── ServerLogEntry.scala
│ │ │ └── LogParser.scala
│ │ ├── utils
│ │ ├── IntegrationUtils.scala
│ │ ├── ConfigSerDeser.scala
│ │ └── HConf.scala
│ │ ├── CommitterInfo.scala
│ │ └── CommitterBinding.scala
│ └── site
│ └── markdown
│ └── integration.md
├── README.md
└── .travis.yml

/.gitallowed:
--------------------------------------------------------------------------------
# serialization
\-[0-9]+L
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
target/
pom.xml.tag
pom.xml.releaseBackup
pom.xml.versionsBackup
pom.xml.next
release.properties
dependency-reduced-pom.xml
buildNumber.properties
.mvn/timing.properties
cloud.xml
cloud-examples/metastore_db
cloud-examples/derby.log
cloud-examples/spark-warehouse
cloud-examples/src/scripts
spark-snapshot
*.iws
*.ipr

/cloud-examples/build.properties
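The regex in .gitallowed reads as an allowlist entry for a secret scanner (tools such as git-secrets consume a file of this name; the exact consumer is an assumption here): it marks negative long literals, which otherwise resemble leaked numeric tokens, as safe. The "# serialization" comment points at serialization IDs, for example:

// Hypothetical illustration: a serialization ID whose negative long
// literal is what the "\-[0-9]+L" allowlist pattern matches.
@SerialVersionUID(-3387691324804251L)
class VersionedRecord extends Serializable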
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/S3ASeekReadNormalIOSuite.scala:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.cloudera.spark.cloud.s3

class S3ASeekReadNormalIOSuite extends S3ASeekReadSequentialIOSuite {

  override def inputPolicy: String = NORMAL_IO
}
--------------------------------------------------------------------------------
/cloud-examples/src/main/scala/com/cloudera/spark/cloud/s3/NormalIOPolicy.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.s3

trait NormalIOPolicy extends IOPolicy {

  /**
   * Use the "normal" adaptive IO policy.
   *
   * @return the IO type
   */
  override def inputPolicy: String = NORMAL_IO

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/TestParquetBinding.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.s3

import com.cloudera.spark.cloud.test.UnitTestSuite

/**
 * Look at what the Parquet committer binding is up to.
 */
class TestParquetBinding extends UnitTestSuite {

}
--------------------------------------------------------------------------------
/cloud-examples/src/main/scala/com/cloudera/spark/cloud/s3/SequentialIOPolicy.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.s3

trait SequentialIOPolicy extends IOPolicy {

  /**
   * Use the original sequential IO.
   *
   * @return the IO type
   */
  override def inputPolicy: String = SEQUENTIAL_IO

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/S3ASeekReadRandomIOSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.s3

/**
 * Subclass of `S3ASeekReadSequentialIOSuite` with random IO turned on.
 */
class S3ASeekReadRandomIOSuite extends S3ASeekReadSequentialIOSuite {

  override def inputPolicy: String = RANDOM_IO
}
--------------------------------------------------------------------------------
/cloud-examples/src/main/scala/com/cloudera/spark/cloud/s3/IOPolicy.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.s3

/**
 * Declares which S3A input policy a suite or example requests.
 */
trait IOPolicy extends S3AConstants {

  /**
   * The input policy to request.
   *
   * @return the IO type
   */
  def inputPolicy: String

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/gs/GsParquetPartitionSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package org.apache.spark.sql.hive.orc.gs

import org.apache.spark.sql.sources.ParquetRelationTrait

/**
 * Partitioned queries with Parquet data against GS.
 */
class GsParquetPartitionSuite extends GsOrcPartitionSuite with
  ParquetRelationTrait {

}
--------------------------------------------------------------------------------
/cloud-examples/src/main/scala/com/cloudera/spark/cloud/s3/RandomIOPolicy.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.s3

/**
 * Switch to random S3A IO.
 */
trait RandomIOPolicy extends IOPolicy {

  /**
   * Use random IO for high-performance ORC reads.
   *
   * @return the IO type
   */
  override def inputPolicy: String = RANDOM_IO

}
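The IOPolicy traits above only name a policy; the NORMAL_IO, SEQUENTIAL_IO and RANDOM_IO constants live in S3AConstants, which is not part of this listing. As a minimal sketch, assuming those constants carry the standard S3A fadvise values ("normal", "sequential", "random"), a consumer could push the chosen policy into a Hadoop configuration like this:

import org.apache.hadoop.conf.Configuration

import com.cloudera.spark.cloud.s3.IOPolicy

object IOPolicyBindingSketch {

  /**
   * Apply a policy's input strategy to a Hadoop configuration.
   * `fs.s3a.experimental.input.fadvise` is the S3A option selecting the
   * read strategy of its input streams.
   */
  def bind(conf: Configuration, policy: IOPolicy): Configuration = {
    conf.set("fs.s3a.experimental.input.fadvise", policy.inputPolicy)
    conf
  }
}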
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/commit/AbstractS3ACommitterSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.s3.commit

import com.cloudera.spark.cloud.committers.AbstractCommitterSuite
import com.cloudera.spark.cloud.s3.S3ATestSetup

abstract class AbstractS3ACommitterSuite
  extends AbstractCommitterSuite with S3ATestSetup {

}
--------------------------------------------------------------------------------
/spark-cloud-integration/src/main/scala/org/apache/spark/cloudera/package.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package org.apache.spark

/**
 * Package for things which need to get at Spark private structures.
 *
 * These have to be viewed as unstable; if something breaks due to a Spark
 * change, that has to be accepted as inevitable.
 */
package object cloudera {

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/abfs/AbfsParquetPartitionSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package org.apache.spark.sql.hive.orc.abfs

import org.apache.spark.sql.sources.ParquetRelationTrait

/**
 * Partitioned queries with Parquet data against ABFS.
 */
class AbfsParquetPartitionSuite extends AbfsOrcPartitionSuite with
  ParquetRelationTrait {

}
--------------------------------------------------------------------------------
/cloud-examples/src/main/scala/com/cloudera/spark/cloud/s3/S3AFileGenerator.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.s3

import com.cloudera.spark.cloud.operations.CloudFileGenerator

/**
 * Generate a file containing some numbers in the remote store.
 */
object S3AFileGenerator extends CloudFileGenerator with S3AExampleSetup
  with SequentialIOPolicy {

}
--------------------------------------------------------------------------------
/cloud-examples/src/main/scala/com/cloudera/spark/cloud/utils/Demo.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.utils

import com.github.lalyos.jfiglet.FigletFont

object Demo {

  /**
   * Uses figlet to render to a string.
   * See: https://github.com/lalyos/jfiglet
   */
  def text(m: String): String = {
    "\n" + FigletFont.convertOneLine(m)
  }

}
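Demo.text is a thin wrapper around jfiglet's FigletFont.convertOneLine; a one-line usage sketch:

// Print "spark" as an ASCII-art banner, e.g. at the start of a test run.
println(Demo.text("spark"))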
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/gs/GsDataFrameSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.gs

import com.cloudera.spark.cloud.common.DataFrameTests

/**
 * Test GS and DataFrames.
 */
class GsDataFrameSuite extends DataFrameTests with GsTestSetup {

  init()

  def init(): Unit = {
    if (enabled) {
      initFS()
    }
  }

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/abfs/AbfsBasicIOSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.abfs

import com.cloudera.spark.cloud.common.BasicIOTests

/**
 * ABFS basic IO operations.
 */
class AbfsBasicIOSuite extends BasicIOTests with AbfsTestSetup {

  init()

  def init(): Unit = {
    if (enabled) {
      initFS()
    }
  }

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/azure/AzureBasicIOSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.azure

import com.cloudera.spark.cloud.common.BasicIOTests

/**
 * Azure's basic IO operations.
 */
class AzureBasicIOSuite extends BasicIOTests with AzureTestSetup {

  init()

  def init(): Unit = {
    if (enabled) {
      initFS()
    }
  }

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/abfs/AbfsDataFrameSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.abfs

import com.cloudera.spark.cloud.common.DataFrameTests

/**
 * Test ABFS and DataFrames.
 */
class AbfsDataFrameSuite extends DataFrameTests with AbfsTestSetup {

  init()

  def init(): Unit = {
    if (enabled) {
      initFS()
    }
  }

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/gs/GsCSVReadSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.gs

import com.cloudera.spark.cloud.common.CSVReadTests

class GsCSVReadSuite extends CSVReadTests with GsTestSetup {
  init()

  /**
   * set up FS if enabled.
   */
  def init(): Unit = {
    if (enabled) {
      initFS()
      initDatasources()
    }
  }
}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/azure/AzureStreamingSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.azure

import com.cloudera.spark.cloud.common.StreamingTests

/**
 * Test Streaming under Azure.
 */
class AzureStreamingSuite extends StreamingTests with AzureTestSetup {

  init()

  def init(): Unit = {
    if (enabled) {
      initFS()
    }
  }

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/abfs/AbfsCSVReadSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.abfs

import com.cloudera.spark.cloud.common.CSVReadTests

class AbfsCSVReadSuite extends CSVReadTests with AbfsTestSetup {
  init()

  /**
   * set up FS if enabled.
   */
  def init(): Unit = {
    if (enabled) {
      initFS()
      initDatasources()
    }
  }
}
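Each suite above repeats the same guard: run initFS() (and, for the read suites, initDatasources()) only when the store is enabled. The *TestSetup traits themselves are not in this listing; the following is a speculative reconstruction of the contract they appear to provide, with every member name inferred purely from the call sites:

// Hypothetical sketch only: the real traits (S3ATestSetup, AbfsTestSetup,
// GsTestSetup, AzureTestSetup, LocalTestSetup) are defined elsewhere.
trait StoreTestSetupSketch {

  /** True when configuration/credentials for this store were supplied. */
  def enabled: Boolean

  /** Create and cache the filesystem under test. */
  def initFS(): Unit

  /** Locate the CSV test data used by the read suites. */
  def initDatasources(): Unit
}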
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/azure/AzureCSVReadSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.azure

import com.cloudera.spark.cloud.common.CSVReadTests

class AzureCSVReadSuite extends CSVReadTests with AzureTestSetup {
  init()

  /**
   * set up FS if enabled.
   */
  def init(): Unit = {
    if (enabled) {
      initFS()
      initDatasources()
    }
  }
}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/azure/AzureSeekReadSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.azure

import com.cloudera.spark.cloud.common.SeekReadTests

class AzureSeekReadSuite extends SeekReadTests with AzureTestSetup {
  init()

  /**
   * set up FS if enabled.
   */
  def init(): Unit = {
    if (enabled) {
      initFS()
      initDatasources()
    }
  }
}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/S3ABasicIOSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.s3

import com.cloudera.spark.cloud.common.BasicIOTests

/**
 * Basic S3A IO Tests.
 */
class S3ABasicIOSuite extends BasicIOTests with S3ATestSetup {

  init()

  def init(): Unit = {
    // propagate S3 credentials
    if (enabled) {
      initFS()
    }
  }

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/gs/GsBasicIOSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.gs

import com.cloudera.spark.cloud.common.BasicIOTests

/**
 * GS's basic IO operations.
 */
class GsBasicIOSuite extends BasicIOTests with GsTestSetup {

  init()

  def init(): Unit = {
    if (enabled) {
      initFS()
    } else {
      log.info("suite is not enabled")
    }
  }

}
--------------------------------------------------------------------------------
/spark-cloud-integration/src/main/scala/com/cloudera/spark/cloud/test/UnitTestSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.test

import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.must.Matchers

import org.apache.spark.internal.Logging

/**
 * Base class for test suites.
 * Added because scalatest imports are too brittle to use.
 */
class UnitTestSuite extends AnyFunSuite with Logging with Matchers {

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/S3ANumbersSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.s3

import com.cloudera.spark.cloud.common.NumbersRddTests

class S3ANumbersSuite extends NumbersRddTests with S3ATestSetup {
  init()

  def init(): Unit = {
    // propagate S3 credentials
    if (enabled) {
      initFS()
    }
  }

  override protected def pathname = {
    "s3a_numbers_suite"
  }

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/gs/GsOrcRelationSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package org.apache.spark.sql.hive.orc.gs

import com.cloudera.spark.cloud.gs.GsTestSetup

import org.apache.spark.sql.sources.AbtractOrcRelationSuite

class GsOrcRelationSuite extends AbtractOrcRelationSuite with GsTestSetup {

  init()

  def init(): Unit = {
    // propagate credentials
    if (enabled) {
      initFS()
    }
  }

}
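UnitTestSuite, shown earlier in this listing, bundles AnyFunSuite, must-Matchers and Spark's Logging; a minimal hypothetical suite built on it:

package com.cloudera.spark.cloud.test

// Hypothetical example: everything except UnitTestSuite itself is invented.
class ExampleUnitSuite extends UnitTestSuite {

  test("string matchers are available") {
    val banner = "hello"
    banner must not be empty
    logInfo(s"checked banner of length ${banner.length}")
  }
}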
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/csv/LocalHugeCsvIOSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.csv

import com.cloudera.spark.cloud.local.LocalTestSetup

/**
 * Local CSV tests to act as a baseline for performance/correctness;
 * always runs.
 */
class LocalHugeCsvIOSuite extends AbstractHugeCsvIOSuite with LocalTestSetup {

  init()

  /**
   * set up FS.
   */
  def init(): Unit = {
    initFS()
  }

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/abfs/AbfsOrcRelationSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package org.apache.spark.sql.hive.orc.abfs

import com.cloudera.spark.cloud.abfs.AbfsTestSetup

import org.apache.spark.sql.sources.AbtractOrcRelationSuite

class AbfsOrcRelationSuite extends AbtractOrcRelationSuite with AbfsTestSetup {

  init()

  def init(): Unit = {
    // propagate credentials
    if (enabled) {
      initFS()
    }
  }

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/org/apache/spark/SparkScopeWorkarounds.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package org.apache.spark

import org.apache.spark.sql.hive.HiveUtils

/**
 * Here to get at useful stuff that Spark keeps private but which turns out
 * to be invaluable during testing.
 *
 * Needless to say: things may break here without warning or redress.
 */
object SparkScopeWorkarounds {
  def tempHiveConfig(): Map[String, String] = {
    HiveUtils.newTemporaryConfiguration(true)
  }
}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/gs/GsCommitDataframeSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.gs

import com.cloudera.spark.cloud.committers.AbstractCommitDataframeSuite

class GsCommitDataframeSuite
  extends AbstractCommitDataframeSuite with GsTestSetup {

  init()

  def init(): Unit = {
    if (enabled) {
      initFS()
    }
  }

  override def committers: Seq[String] = Seq("manifest")

  override def schema: String = "gs"

}
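SparkScopeWorkarounds.tempHiveConfig(), a few files up, surfaces HiveUtils.newTemporaryConfiguration; a sketch of one way to apply the resulting map when building a test session (the builder settings here are illustrative, not taken from this repository):

import org.apache.spark.SparkScopeWorkarounds
import org.apache.spark.sql.SparkSession

object TempHiveSessionSketch {

  /** Build a local session whose Hive metastore lives in a temp directory. */
  def session(): SparkSession = {
    val builder = SparkSession.builder()
      .master("local[2]")
      .appName("temp-hive-sketch")
      .enableHiveSupport()
    SparkScopeWorkarounds.tempHiveConfig().foreach {
      case (k, v) => builder.config(k, v)
    }
    builder.getOrCreate()
  }
}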
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/cloud/S3AOrcRelationSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package org.apache.spark.sql.hive.orc.cloud

import com.cloudera.spark.cloud.s3.S3ATestSetup

import org.apache.spark.sql.sources.AbtractOrcRelationSuite

class S3AOrcRelationSuite extends AbtractOrcRelationSuite with S3ATestSetup {

  init()

  def init(): Unit = {
    // propagate S3 credentials
    if (enabled) {
      initFS()
    }
  }

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/azure/AzureFileGeneratorSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.azure

import com.cloudera.spark.cloud.common.FileGeneratorTests

/**
 * Test the `FileGenerator` entry point under Azure.
 */
class AzureFileGeneratorSuite extends FileGeneratorTests with AzureTestSetup {

  init()

  def init(): Unit = {
    if (enabled) {
      initFS()
    }
  }

  after {
    cleanFilesystemInTeardown()
  }

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/cloud/S3AOrcPartitionSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package org.apache.spark.sql.hive.orc.cloud

import com.cloudera.spark.cloud.s3.S3ATestSetup

import org.apache.spark.sql.sources.CloudPartitionTest

class S3AOrcPartitionSuite extends CloudPartitionTest with S3ATestSetup {

  init()

  def init(): Unit = {
    // propagate S3 credentials
    if (enabled) {
      initFS()
    }
  }

  override val dataSourceName: String = "orc"

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/cloud/S3AParquetPartitionSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package org.apache.spark.sql.hive.orc.cloud

import com.cloudera.spark.cloud.s3.S3ATestSetup

import org.apache.spark.sql.sources.{CloudPartitionTest, ParquetRelationTrait}

class S3AParquetPartitionSuite extends CloudPartitionTest with S3ATestSetup
  with ParquetRelationTrait {

  init()

  def init(): Unit = {
    // propagate S3 credentials
    if (enabled) {
      initFS()
    }
  }

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/gs/GsParquetRelationSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package org.apache.spark.sql.hive.orc.gs

import com.cloudera.spark.cloud.gs.GsTestSetup

import org.apache.spark.sql.sources.{CloudRelationBasicSuite, ParquetRelationTrait}

class GsParquetRelationSuite extends CloudRelationBasicSuite
  with GsTestSetup
  with ParquetRelationTrait {

  init()

  def init(): Unit = {
    // propagate credentials
    if (enabled) {
      initFS()
    }
  }

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/cloud/S3AParquetRelationSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package org.apache.spark.sql.hive.orc.cloud

import com.cloudera.spark.cloud.s3.S3ATestSetup

import org.apache.spark.sql.sources.{CloudRelationBasicSuite, ParquetRelationTrait}

class S3AParquetRelationSuite extends CloudRelationBasicSuite
  with S3ATestSetup
  with ParquetRelationTrait {

  init()

  def init(): Unit = {
    // propagate S3 credentials
    if (enabled) {
      initFS()
    }
  }

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/S3AStreamingSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.s3

import com.cloudera.spark.cloud.common.StreamingTests
import com.cloudera.spark.cloud.operations.CloudStreaming

/**
 * Test Streaming against S3A.
 */
class S3AStreamingSuite extends StreamingTests with S3ATestSetup {

  init()

  def init(): Unit = {
    // propagate S3 credentials
    if (enabled) {
      initFS()
    }
  }

  override protected val instance: CloudStreaming = S3AStreaming
}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/gs/GsOrcPartitionSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package org.apache.spark.sql.hive.orc.gs

import com.cloudera.spark.cloud.gs.GsTestSetup

import org.apache.spark.sql.sources.CloudPartitionTest

/**
 * Partitioned queries with ORC data against GS.
 */
class GsOrcPartitionSuite extends CloudPartitionTest with GsTestSetup {

  init()

  def init(): Unit = {
    if (enabled) {
      initFS()
    }
  }

  override def dataSourceName(): String = {
    "orc"
  }
}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/S3ASeekReadSequentialIOSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.s3

import com.cloudera.spark.cloud.common.SeekReadTests

/**
 * Tests reading in the S3A CSV file using sequential and Random IO.
 */
class S3ASeekReadSequentialIOSuite extends SeekReadTests with S3ATestSetup
  with SequentialIOPolicy {

  init()

  /**
   * set up FS if enabled.
32 | */ 33 | def init(): Unit = { 34 | if (enabled) { 35 | initFS() 36 | initDatasources() 37 | } 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/org/apache/spark/sql/sources/MustDeclareDatasource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.sources 19 | 20 | import org.apache.spark.sql.types.DataType 21 | 22 | /** 23 | * Subclasses must declare their datasource. 24 | */ 25 | trait MustDeclareDatasource { 26 | /** 27 | * Name of the data source: this must be declared. 28 | */ 29 | def dataSourceName(): String 30 | 31 | 32 | /** 33 | * Datatype support predicate. 34 | * 35 | * @param dataType type 36 | * @return true if the type is supported 37 | */ 38 | def supportsDataType( 39 | dataType: DataType): Boolean 40 | } 41 | -------------------------------------------------------------------------------- /spark-cloud-integration/src/main/site/markdown/integration.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | # Integrating the Apache Hadoop S3A Committers with Apache Spark 16 | 17 | This document looks at how to integrate the Hadoop S3A Committers 18 | with Apache Spark; it is intended to apply to any custom `PathOutputCommitter` 19 | implementation. 20 | 21 | 22 | ## Background: Hadoop 23 | 24 | Hadoop has two MapReduce APIs, MRv1 and MRv2 (not to be confused with the v1/v2 commit 25 | algorithms). The MRv1 classes are found under the package `org.apache.hadoop.mapred`; 26 | the MRv2 classes under `org.apache.hadoop.mapreduce`. This is important, as 27 | they often share classnames. 28 | 29 | 30 | 31 | The "original" V1 API shipped in Hadoop 1; the newer V2 API came in Hadoop 2. 32 | Spark's `RDD.saveAsTextFile()` uses the MRv2 APIs to write data. 33 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/S3ADataFrameSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License.
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3 19 | 20 | import com.cloudera.spark.cloud.common.DataFrameTests 21 | import com.cloudera.spark.cloud.operations.CloudDataFrames 22 | 23 | /** 24 | * Test the [S3DataFrames] logic. 25 | */ 26 | class S3ADataFrameSuite extends DataFrameTests with S3ATestSetup { 27 | 28 | init() 29 | 30 | def init(): Unit = { 31 | // propagate S3 credentials 32 | if (enabled) { 33 | initFS() 34 | } 35 | } 36 | 37 | override protected val instance: CloudDataFrames = S3ADataFrames 38 | 39 | } 40 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/abfs/AbfsOrcPartitionSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hive.orc.abfs 19 | 20 | import com.cloudera.spark.cloud.abfs.AbfsTestSetup 21 | 22 | import org.apache.spark.sql.sources.CloudPartitionTest 23 | 24 | /** 25 | * Partitioned queries with ORC data against ABFS. 26 | */ 27 | class AbfsOrcPartitionSuite extends CloudPartitionTest with AbfsTestSetup { 28 | 29 | init() 30 | 31 | def init(): Unit = { 32 | if (enabled) { 33 | initFS() 34 | } 35 | } 36 | 37 | override def dataSourceName(): String = { 38 | "orc" 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/gs/GsParquetRelationScaleSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hive.orc.gs 19 | 20 | import com.cloudera.spark.cloud.gs.GsTestSetup 21 | 22 | import org.apache.spark.sql.sources.{CloudRelationScaleTest, ParquetRelationTrait} 23 | 24 | class GsParquetRelationScaleSuite extends CloudRelationScaleTest 25 | with GsTestSetup 26 | with ParquetRelationTrait { 27 | 28 | init() 29 | 30 | def init(): Unit = { 31 | if (enabled) { 32 | initFS() 33 | } 34 | } 35 | 36 | override def enabled: Boolean = super.enabled && isScaleTestEnabled 37 | 38 | } 39 | -------------------------------------------------------------------------------- /cloud-examples/src/test/resources/core-site.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0"?> 2 | 3 | 20 | 24 | <configuration> 25 | 26 | <property> 27 | <name>fs.s3a.committer.name</name> 28 | <value>directory</value> 29 | <description> 30 | Committer to create for output to S3A, one of: 31 | "file", "directory", "partitioned", "magic". 32 | </description> 33 | </property> 34 | 35 | </configuration> -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/abfs/AbfsParquetRelationSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hive.orc.abfs 19 | 20 | import com.cloudera.spark.cloud.abfs.AbfsTestSetup 21 | 22 | import org.apache.spark.sql.sources.{CloudRelationBasicSuite, ParquetRelationTrait} 23 | 24 | class AbfsParquetRelationSuite extends CloudRelationBasicSuite 25 | with AbfsTestSetup 26 | with ParquetRelationTrait { 27 | 28 | init() 29 | 30 | def init(): Unit = { 31 | // propagate credentials 32 | if (enabled) { 33 | initFS() 34 | } 35 | } 36 | override def dynamicPartitioning: Boolean = true 37 | } 38 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/cloud/S3AParquetRelationScaleSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hive.orc.cloud 19 | 20 | import com.cloudera.spark.cloud.s3.S3ATestSetup 21 | 22 | import org.apache.spark.sql.sources.{CloudRelationScaleTest, ParquetRelationTrait} 23 | 24 | class S3AParquetRelationScaleSuite extends CloudRelationScaleTest 25 | with S3ATestSetup 26 | with ParquetRelationTrait { 27 | 28 | init() 29 | 30 | def init(): Unit = { 31 | if (enabled) { 32 | initFS() 33 | } 34 | } 35 | 36 | override def enabled: Boolean = super.enabled && isScaleTestEnabled 37 | 38 | } 39 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/org/apache/spark/sql/sources/ParquetRelationTrait.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.sources 19 | 20 | import org.apache.spark.sql.types.{CalendarIntervalType, DataType, NullType} 21 | 22 | 23 | 24 | trait ParquetRelationTrait extends MustDeclareDatasource { 25 | // Parquet does not play well with NullType. 26 | override def supportsDataType( 27 | dataType: DataType): Boolean = dataType match { 28 | case _: NullType => false 29 | case _: CalendarIntervalType => false 30 | case _ => true 31 | } 32 | 33 | override def dataSourceName(): String = { 34 | "parquet" 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/abfs/commit/AbfsCommitDataframeSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.abfs.commit 19 | 20 | import com.cloudera.spark.cloud.abfs.AbfsTestSetup 21 | import com.cloudera.spark.cloud.committers.AbstractCommitDataframeSuite 22 | 23 | private class AbfsCommitDataframeSuite extends AbstractCommitDataframeSuite 24 | with AbfsTestSetup { 25 | 26 | init() 27 | 28 | def init(): Unit = { 29 | if (enabled) { 30 | initFS() 31 | } 32 | } 33 | 34 | 35 | override def committers: Seq[String] = Seq("manifest") 36 | 37 | 38 | override def schema: String = "abfs" 39 | 40 | } 41 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/abfs/AbfsParquetRelationScaleSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hive.orc.abfs 19 | 20 | import com.cloudera.spark.cloud.abfs.AbfsTestSetup 21 | 22 | import org.apache.spark.sql.sources.{CloudRelationScaleTest, ParquetRelationTrait} 23 | 24 | class AbfsParquetRelationScaleSuite extends CloudRelationScaleTest 25 | with AbfsTestSetup 26 | with ParquetRelationTrait { 27 | 28 | init() 29 | 30 | def init(): Unit = { 31 | if (enabled) { 32 | initFS() 33 | } 34 | } 35 | 36 | override def enabled: Boolean = super.enabled && isScaleTestEnabled 37 | 38 | 39 | 40 | } 41 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/org/apache/hadoop/fs/FSHelper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.hadoop.fs 19 | 20 | import java.io.IOException 21 | import java.net.URI 22 | 23 | import org.apache.hadoop.conf.Configuration 24 | 25 | /** 26 | * Help with testing by accessing package-private methods in FileSystem which 27 | * are designed for aiding testability. They are normally accessed via 28 | * `FileSystemTestHelper`, but as that is in hadoop-common-test JAR, a simple 29 | * object here avoids maven import conflict problems. 30 | */ 31 | object FSHelper { 32 | 33 | @throws[IOException] 34 | def addFileSystemForTesting(uri: URI, conf: Configuration, fs: FileSystem) { 35 | FileSystem.addFileSystemForTesting(uri, conf, fs) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/cloud/S3AOrcRelationScaleSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hive.orc.cloud 19 | 20 | import com.cloudera.spark.cloud.s3.S3ATestSetup 21 | 22 | import org.apache.spark.sql.hive.orc.OrcFileFormat 23 | import org.apache.spark.sql.sources.CloudRelationScaleTest 24 | 25 | class S3AOrcRelationScaleSuite extends CloudRelationScaleTest with S3ATestSetup { 26 | 27 | init() 28 | 29 | def init(): Unit = { 30 | // propagate S3 credentials 31 | if (enabled) { 32 | initFS() 33 | } 34 | } 35 | 36 | override def enabled: Boolean = super.enabled && isScaleTestEnabled 37 | 38 | override val dataSourceName: String = classOf[OrcFileFormat].getCanonicalName 39 | 40 | } 41 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/adl/AdlTestSetup.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.adl 19 | 20 | import java.net.URI 21 | 22 | import com.cloudera.spark.cloud.common.CopyCsvFileTrait 23 | import org.apache.hadoop.fs.FileSystem 24 | 25 | /** 26 | * Trait for ADL tests. 27 | * 28 | * This trait supports the CSV data source by copying over the data from S3A if 29 | * it isn't already at an ADL URL. 30 | */ 31 | trait AdlTestSetup extends CopyCsvFileTrait { 32 | 33 | override def enabled: Boolean = { 34 | getConf.getBoolean(ADL_TESTS_ENABLED, false) 35 | } 36 | 37 | def initFS(): FileSystem = { 38 | val uri = new URI(requiredOption(ADL_TEST_URI)) 39 | logDebug(s"Executing ADL tests against $uri") 40 | createFilesystem(uri) 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/azure/AzureTestSetup.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.azure 19 | 20 | import java.net.URI 21 | 22 | import com.cloudera.spark.cloud.common.CopyCsvFileTrait 23 | import org.apache.hadoop.fs.FileSystem 24 | 25 | /** 26 | * Trait for Azure WASB tests. 27 | * 28 | * This trait supports the CSV data source by copying over the data from S3A if 29 | * it isn't already at a WASB URL. 30 | */ 31 | trait AzureTestSetup extends CopyCsvFileTrait { 32 | 33 | override def enabled: Boolean = { 34 | getConf.getBoolean(AZURE_TESTS_ENABLED, false) 35 | } 36 | 37 | def initFS(): FileSystem = { 38 | val uri = new URI(requiredOption(AZURE_TEST_URI)) 39 | logDebug(s"Executing Azure tests against $uri") 40 | createFilesystem(uri) 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/common/StreamingTests.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.common 19 | 20 | import com.cloudera.spark.cloud.operations.CloudStreaming 21 | 22 | /** 23 | * Test Streaming. 24 | */ 25 | abstract class StreamingTests extends CloudSuite { 26 | 27 | after { 28 | cleanFilesystemInTeardown() 29 | } 30 | 31 | /** 32 | * Override point: the streaming operation to instantiate. 33 | */ 34 | protected val instance: CloudStreaming = new CloudStreaming() 35 | 36 | ctest("streaming", 37 | "Execute the Streaming example") { 38 | val conf = newSparkConf() 39 | conf.setAppName("Streaming") 40 | val destDir = testPath(filesystem, "streaming") 41 | val rowCount = 1000 42 | 43 | assert(0 === instance.action(conf, Seq(destDir, rowCount))) 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/gs/GsTestSetup.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.gs 19 | 20 | import java.net.URI 21 | 22 | import com.cloudera.spark.cloud.common.CopyCsvFileTrait 23 | import org.apache.hadoop.fs.FileSystem 24 | /** 25 | * Trait for GCS. 26 | * 27 | * This trait supports the CSV data source by copying over the data from S3A if 28 | * it isn't already at a GCS URL. 29 | */ 30 | trait GsTestSetup extends CopyCsvFileTrait { 31 | 32 | override def enabled: Boolean = { 33 | getConf.getBoolean(GS_TESTS_ENABLED, false) 34 | } 35 | 36 | def initFS(): FileSystem = { 37 | val uri = new URI(requiredOption(GS_TEST_URI)) 38 | logDebug(s"Executing GCS tests against $uri") 39 | createFilesystem(uri) 40 | } 41 | 42 | override def dynamicPartitioning: Boolean = true 43 | } 44 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/abfs/AbfsTestSetup.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements.
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.abfs 19 | 20 | import java.net.URI 21 | 22 | import com.cloudera.spark.cloud.common.CopyCsvFileTrait 23 | import org.apache.hadoop.fs.FileSystem 24 | 25 | /** 26 | * Trait for Azure ABFS tests. 27 | * 28 | * This trait supports the CSV data source by copying over the data from S3A if 29 | * it isn't already at an ABFS URL. 30 | */ 31 | trait AbfsTestSetup extends CopyCsvFileTrait { 32 | 33 | override def enabled: Boolean = { 34 | getConf.getBoolean(ABFS_TESTS_ENABLED, false) 35 | } 36 | 37 | override def dynamicPartitioning: Boolean = true 38 | 39 | def initFS(): FileSystem = { 40 | val uri = new URI(requiredOption(ABFS_TEST_URI)) 41 | logDebug(s"Executing Abfs tests against $uri") 42 | createFilesystem(uri) 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/S3ALineCountSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3 19 | 20 | import com.cloudera.spark.cloud.common.CloudSuiteWithCSVDatasource 21 | 22 | /** 23 | * Test the `S3ALineCount` entry point. 24 | */ 25 | class S3ALineCountSuite extends CloudSuiteWithCSVDatasource with S3ATestSetup { 26 | 27 | init() 28 | 29 | def init(): Unit = { 30 | if (enabled) { 31 | setupFilesystemConfiguration(getConf) 32 | } 33 | } 34 | 35 | override def enabled: Boolean = super.enabled && hasCSVTestFile 36 | 37 | ctest("S3ALineCountReadData", 38 | "Execute the S3ALineCount example with the default values (i.e.
no arguments)") { 39 | val sparkConf = newSparkConf(getTestCSVPath()) 40 | sparkConf.setAppName("S3ALineCountDefaults") 41 | assert(0 === S3ALineCount.action(sparkConf, Seq())) 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/S3ANumbersSuiteV2APISuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3 19 | 20 | import com.cloudera.spark.cloud.common.NumbersRddTests 21 | import org.apache.hadoop.fs.Path 22 | 23 | import org.apache.spark.rdd.RDD 24 | 25 | class S3ANumbersSuiteV2APISuite extends NumbersRddTests with S3ATestSetup { 26 | init() 27 | 28 | def init(): Unit = { 29 | // propagate S3 credentials 30 | if (enabled) { 31 | initFS() 32 | } 33 | } 34 | 35 | override protected def pathname = { 36 | "numbers_rdd_tests_v2api" 37 | } 38 | 39 | /** 40 | * Save the RDD. 41 | * 42 | * @param numbers RDD to save 43 | * @param dest destination path 44 | */ 45 | override protected def saveRDD( 46 | numbers: RDD[Int], 47 | dest: Path): Unit = { 48 | saveRDDviaMRv2(numbers, dest) 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/s3/S3ALineCount.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3 19 | 20 | import com.cloudera.spark.cloud.common.CloudTestKeys 21 | import com.cloudera.spark.cloud.operations.LineCount 22 | 23 | import org.apache.spark.SparkConf 24 | 25 | /** 26 | * A line count example which has a default reference of a public Amazon S3 27 | * CSV .gz file in the absence of anything on the command line. 
28 | */ 29 | object S3ALineCount extends LineCount with S3AExampleSetup with SequentialIOPolicy { 30 | 31 | override def defaultSource: Option[String] = { 32 | Some(CloudTestKeys.S3A_CSV_PATH_DEFAULT) 33 | } 34 | 35 | override def maybeEnableAnonymousAccess( 36 | sparkConf: SparkConf, 37 | dest: Option[String]): Unit = { 38 | if (dest.isEmpty) { 39 | hconf(sparkConf, AWS_CREDENTIALS_PROVIDER, ANONYMOUS_CREDENTIALS) 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/utils/ForceRecentHadoopVersion.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.utils 19 | 20 | import org.apache.hadoop.fs.azure.AzureException 21 | import org.apache.hadoop.fs.s3a.RenameFailedException 22 | 23 | /** 24 | * This class is used to ensure that a recent Hadoop version is on the classpath. 25 | * 26 | * If it does not compile, the version of Spark it is built against has out-of-date 27 | * dependencies. 28 | * 29 | * If its classes cannot be loaded, the version of Spark it is running against is out of date. 30 | * 31 | * Currently requires Hadoop 2.8+. 32 | */ 33 | class ForceRecentHadoopVersion { 34 | 35 | /** compile/link failure against Hadoop 2.6 */ 36 | val requireAzure = new AzureException("needs Hadoop 2.7+") 37 | 38 | /** compile failure against Hadoop 2.7 */ 39 | val requireRecentAWS = new RenameFailedException("/", "Needs something", "") 40 | } 41 | -------------------------------------------------------------------------------- README.md: -------------------------------------------------------------------------------- 1 | # Cloud Integration for Apache Spark 2 | 3 | The [cloud-integration](https://github.com/hortonworks-spark/cloud-integration) 4 | repository provides modules to improve Apache Spark's integration with cloud infrastructures. 5 | 6 | 7 | 8 | ## Module `spark-cloud-integration` 9 | 10 | Classes and tools to make Spark work better in-cloud. 11 | 12 | * Committer integration with the s3a committers. 13 | * Proof-of-concept cloud-first distcp replacement. 14 | * Serialization for Hadoop `Configuration`: class `ConfigSerDeser`. Use this 15 | to get a configuration into an RDD method. 16 | * Trait `HConf` to manipulate the Hadoop options in a Spark config. 17 | * Anything else which turns out to be useful.
18 | * Variant of `FileInputDStream` for cloud storage, `org.apache.spark.streaming.cloudera.CloudInputDStream`. 19 | 20 | See [Spark Cloud Integration](spark-cloud-integration/src/main/site/markdown/index.md) 21 | 22 | 23 | 24 | ## Module `cloud-examples` 25 | 26 | This module holds the packaging/integration tests for Spark and cloud storage against AWS, Azure and Google GCS. 27 | 28 | These are basic tests of core I/O and streaming functionality, verifying that 29 | the committers work. 30 | 31 | As well as running as unit tests, they have CLI entry points which can be used for scalable functional testing. 32 | 33 | 34 | ## Module `minimal-integration-test` 35 | 36 | This is a minimal JAR for integration tests. 37 | 38 | Usage: 39 | ```bash 40 | spark-submit --class com.cloudera.spark.cloud.integration.Generator \ 41 | --master yarn \ 42 | --num-executors 2 \ 43 | --driver-memory 512m \ 44 | --executor-memory 512m \ 45 | --executor-cores 1 \ 46 | minimal-integration-test-1.0-SNAPSHOT.jar \ 47 | adl://example.azuredatalakestore.net/output/dest/1 \ 48 | 2 2 15 49 | ``` 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /spark-cloud-integration/src/main/scala/com/cloudera/spark/cloud/s3/audit/ServerLogEntry.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3.audit 19 | 20 | case class ServerLogEntry( 21 | bucketowner: String, 22 | bucket_name: String, 23 | requestdatetime: String, 24 | remoteip: String, 25 | requester: String, 26 | requestid: String, 27 | operation: String, 28 | key: String, 29 | request_uri: String, 30 | httpstatus: String, 31 | errorcode: String, 32 | bytessent: Long, 33 | objectsize: Long, 34 | totaltime: String, 35 | turnaroundtime: String, 36 | referrer: String, 37 | useragent: String, 38 | versionid: String, 39 | hostid: String, 40 | sigv: String, 41 | ciphersuite: String, 42 | authtype: String, 43 | endpoint: String, 44 | tlsversion: String) { 45 | 46 | override def toString: String = 47 | s"$operation /$bucket_name/$key $httpstatus $errorcode $bytessent $requestdatetime" 48 | } 49 | -------------------------------------------------------------------------------- /spark-cloud-integration/src/main/scala/com/cloudera/spark/cloud/utils/IntegrationUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.utils 19 | 20 | import java.net.URL 21 | 22 | import org.apache.hadoop.util.ExitUtil 23 | 24 | /** 25 | * A class to instantiate for all the general utilities. 26 | */ 27 | class IntegrationUtils extends TimeOperations with HConf { 28 | private val E_NO_CLASS = 11 29 | 30 | def findClass(src: String, classname: String): (String, String, URL, Class[_]) = { 31 | try { 32 | val loader = this.getClass.getClassLoader 33 | val res = classname.replaceAll("\\.", "/") + ".class" 34 | val url = loader.getResource(res) 35 | val clazz = loader.loadClass(classname) 36 | (src, classname, url, clazz) 37 | } catch { 38 | case e: Exception => 39 | throw new ExitUtil.ExitException(E_NO_CLASS, 40 | s"Failed to load class $classname from $src").initCause(e) 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # Spark provides this Travis CI configuration file to help contributors 17 | # check Scala/Java style conformance and JDK7/8 compilation easily 18 | # while preparing pull requests. 19 | # - Scalastyle is executed during `maven install` implicitly. 20 | # - Java Checkstyle is executed by `lint-java`. 21 | # See the related discussion here. 22 | # https://github.com/apache/spark/pull/12980 23 | 24 | # 1. Choose OS (Ubuntu 14.04.3 LTS Server Edition 64bit, ~2 CORE, 7.5GB RAM) 25 | #%sudo: required 26 | #%dist: trusty 27 | 28 | # 2. Choose language and target JDKs for parallel builds. 29 | language: java 30 | jdk: 31 | - oraclejdk8 32 | 33 | # 3. Setup cache directory for SBT and Maven. 34 | cache: 35 | directories: 36 | - $HOME/.sbt 37 | - $HOME/.m2 38 | 39 | # 4. Turn off notifications. 40 | notifications: 41 | email: false 42 | 43 | # 5. Run maven install before running lint-java.
44 | install: 45 | - export MAVEN_SKIP_RC=1 46 | - mvn -T 1C install 47 | 48 | 49 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/local/LocalTestSetup.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.local 19 | 20 | import java.io.File 21 | 22 | import com.cloudera.spark.cloud.common.CloudSuiteTrait 23 | import org.apache.hadoop.fs.{FileSystem, Path} 24 | 25 | /** 26 | * Trait for the local FS; the goal is to support benchmarking/validating/writing 27 | * new tests. 28 | * 29 | */ 30 | trait LocalTestSetup extends CloudSuiteTrait { 31 | 32 | override def enabled: Boolean = { 33 | true 34 | } 35 | 36 | def initFS(): FileSystem = { 37 | val fs = getLocalFS 38 | setFilesystem(fs) 39 | fs 40 | } 41 | 42 | override def dynamicPartitioning: Boolean = true 43 | 44 | /** 45 | * The test path here is always something under the temp dir. 46 | */ 47 | override protected def testDir: Path = { 48 | val f = File.createTempFile(this.getClass.getSimpleName, "") 49 | f.delete() 50 | f.mkdir() 51 | new Path(f.toURI) 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/examples/S3DataFrameExampleSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.examples 19 | 20 | import com.cloudera.spark.cloud.common.CloudSuite 21 | import com.cloudera.spark.cloud.s3.S3ATestSetup 22 | 23 | /** 24 | * Test the `S3DataFrameExample` logic.
25 | */ 26 | class S3DataFrameExampleSuite extends CloudSuite with S3ATestSetup { 27 | 28 | init() 29 | 30 | def init(): Unit = { 31 | // propagate S3 credentials 32 | if (enabled) { 33 | initFS() 34 | } 35 | } 36 | 37 | /** 38 | * Override point: the data frame operation to execute 39 | */ 40 | ctest("DataFrames", 41 | "Dataframe IO") { 42 | val conf = newSparkConf() 43 | conf.setAppName("DataFrames") 44 | val destDir = testPath(filesystem, "dataframes") 45 | val instance = new S3DataFrameExample() 46 | val args = Seq(destDir) 47 | assert(0 === instance.action(conf, args), 48 | s" action($args) failed against $instance") 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/s3/S3AStreaming.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3 19 | 20 | import com.cloudera.spark.cloud.operations.CloudStreaming 21 | 22 | import org.apache.spark.SparkConf 23 | import org.apache.spark.streaming._ 24 | 25 | /** 26 | * An example/test for streaming with a source of S3. 27 | */ 28 | object S3AStreaming extends CloudStreaming with S3AExampleSetup 29 | with SequentialIOPolicy { 30 | 31 | /** 32 | * This is never executed; it's just here as the source of the example in the 33 | * documentation. 34 | */ 35 | def streamingExample(): Unit = { 36 | val sparkConf = new SparkConf() 37 | val ssc = new StreamingContext(sparkConf, Milliseconds(1000)) 38 | try { 39 | val lines = ssc.textFileStream("s3a://testbucket/incoming") 40 | val matches = lines.filter(_.endsWith("3")) 41 | matches.print() 42 | ssc.start() 43 | ssc.awaitTermination() 44 | } finally { 45 | ssc.stop(true) 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/csv/AbfsHugeCsvIOSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.csv 19 | 20 | import com.cloudera.spark.cloud.abfs.AbfsTestSetup 21 | import com.cloudera.spark.cloud.ObjectStoreConfigurations.ABFS_READAHEAD_HADOOP_OPTIONS 22 | import org.apache.hadoop.conf.Configuration 23 | 24 | /** 25 | * The real test of HADOOP-18521. 26 | */ 27 | class AbfsHugeCsvIOSuite extends AbstractHugeCsvIOSuite with AbfsTestSetup { 28 | 29 | init() 30 | 31 | /** 32 | * Set up the FS if enabled. 33 | */ 34 | def init(): Unit = { 35 | if (enabled) { 36 | initFS() 37 | } 38 | } 39 | 40 | /** 41 | * Patch in ABFS readahead options, to ensure they are 42 | * always set. 43 | * @return the configuration to create the fs with 44 | */ 45 | override def createConfiguration(): Configuration = { 46 | val conf = super.createConfiguration() 47 | for (kv <- ABFS_READAHEAD_HADOOP_OPTIONS) { 48 | conf.set(kv._1, kv._2) 49 | } 50 | conf 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/azure/AzureLineCountSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.azure 19 | 20 | import com.cloudera.spark.cloud.common.CloudSuiteWithCSVDatasource 21 | import com.cloudera.spark.cloud.operations.LineCount 22 | 23 | /** 24 | * Test the `LineCount` entry point against Azure. 25 | */ 26 | class AzureLineCountSuite extends CloudSuiteWithCSVDatasource with AzureTestSetup { 27 | 28 | init() 29 | 30 | /** 31 | * Set up the FS if enabled.
32 | */ 33 | def init(): Unit = { 34 | if (enabled) { 35 | initFS() 36 | initDatasources() 37 | } 38 | } 39 | 40 | override def enabled: Boolean = super.enabled && hasCSVTestFile 41 | 42 | after { 43 | cleanFilesystemInTeardown() 44 | } 45 | 46 | ctest("AzureLineCountSuite", 47 | "Execute the LineCount example") { 48 | val src = getTestCSVPath() 49 | val sparkConf = newSparkConf(src) 50 | sparkConf.setAppName("AzureLineCountSuite") 51 | assert(0 === new LineCount().action(sparkConf, 52 | Seq(src.toUri.toString))) 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/common/CloudSuiteWithCSVDatasource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.common 19 | 20 | import org.apache.hadoop.conf.Configuration 21 | import org.apache.hadoop.fs.{FileSystem, Path} 22 | 23 | /** 24 | * Any cloud suite which requires the datasource to be a (possibly copied over) 25 | * CSV file. 26 | */ 27 | class CloudSuiteWithCSVDatasource extends CloudSuite with CsvDatasourceSupport { 28 | 29 | /** 30 | * Call this to set up the datasource for tests. 31 | */ 32 | def initDatasources(): Unit = { 33 | if (hasCSVTestFile()) { 34 | prepareTestCSVFile() 35 | testCSVFilePath.get 36 | } 37 | } 38 | 39 | /** 40 | * Get the CSV source path and filesystem to read from it. 41 | * The filesystem uses the endpoint defined for the CSV file. 42 | * 43 | * @return Path and FS of a CSV source file. 44 | */ 45 | def getCSVSourceAndFileSystem(): (Path, FileSystem) = { 46 | val source = getTestCSVPath() 47 | (source, FileSystem.newInstance(source.toUri, new Configuration(getConf))) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /cloud-examples/src/main/site/using.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | # Using the extra features in these examples 16 | 17 | ### Example: Spark Streaming and Cloud Storage 18 | 19 | Spark Streaming can monitor files added to object stores by 20 | creating a `FileInputDStream` DStream monitoring a path under a bucket.
21 | 22 | ```scala 23 | import org.apache.spark.SparkConf 24 | import org.apache.spark.sql.SparkSession 25 | import org.apache.spark.streaming._ 26 | 27 | val sparkConf = new SparkConf() 28 | val ssc = new StreamingContext(sparkConf, Milliseconds(5000)) 29 | try { 30 | val lines = ssc.textFileStream("s3a://bucket/incoming") 31 | val matches = lines.filter(_.endsWith("3")) 32 | matches.print() 33 | ssc.start() 34 | ssc.awaitTermination() 35 | } finally { 36 | ssc.stop(true) 37 | } 38 | ``` 39 | 40 | 41 | 1. The time to scan for new files is proportional to the number of files 42 | under the path, not the number of *new* files, so it can become a slow operation. 43 | The size of the window needs to be set to handle this. 44 | 45 | 1. Files only appear in an object store once they are completely written; there 46 | is no need for a workflow of write-then-rename to ensure that files aren't picked up 47 | while they are still being written. Applications can write straight to the monitored directory. 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/s3/S3ADataFrames.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3 19 | 20 | import com.cloudera.spark.cloud.common.CloudTestKeys 21 | import com.cloudera.spark.cloud.operations.CloudDataFrames 22 | import org.apache.hadoop.conf.Configuration 23 | import org.apache.hadoop.fs.{FileSystem, Path} 24 | 25 | import org.apache.spark.sql.SparkSession 26 | 27 | /** 28 | * Test dataframe operations using S3 as the destination and source of operations. 29 | * This validates the various conversion jobs all work against the object store. 30 | * 31 | * It doesn't verify timings, though some information is printed.
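 * When the committer test flag (`S3A_COMMITTER_TEST_ENABLED`) is set, the
 * `extraValidation` hook below uses `CommitterOperations` to check that each
 * output path was indeed written by an S3A committer.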
32 | */ 33 | object S3ADataFrames extends CloudDataFrames with S3AExampleSetup { 34 | 35 | override def extraValidation( 36 | session: SparkSession, 37 | conf: Configuration, 38 | fs: FileSystem, 39 | results: Seq[(String, Path, Long, Long)]): Unit = { 40 | 41 | val operations = new CommitterOperations(fs) 42 | if (conf.getBoolean(CloudTestKeys.S3A_COMMITTER_TEST_ENABLED, false)) { 43 | results.foreach((tuple: (String, Path, Long, Long)) => { 44 | operations.verifyCommitter(tuple._2, None, None, "") 45 | }) 46 | } 47 | 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/S3AFileGeneratorSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3 19 | 20 | import com.cloudera.spark.cloud.common.FileGeneratorTests 21 | import org.apache.hadoop.fs.Path 22 | 23 | import org.apache.spark.SparkConf 24 | 25 | /** 26 | * Test the `S3AFileGenerator` entry point. 27 | */ 28 | class S3AFileGeneratorSuite extends FileGeneratorTests with S3ATestSetup { 29 | 30 | init() 31 | 32 | def init(): Unit = { 33 | // propagate S3 credentials 34 | if (enabled) { 35 | initFS() 36 | } 37 | } 38 | 39 | after { 40 | cleanFilesystemInTeardown() 41 | } 42 | 43 | ctest("FileGeneratorUsage", 44 | "Execute the S3AFileGenerator example with a bad argument; expect a failure") { 45 | val conf = newSparkConf() 46 | conf.setAppName("FileGenerator") 47 | assert(-2 === S3AFileGenerator.action(conf, Seq())) 48 | } 49 | 50 | override def generate( 51 | conf: SparkConf, 52 | destDir: Path, 53 | monthCount: Int, 54 | fileCount: Int, 55 | rowCount: Int): Int = { 56 | val result = S3AFileGenerator.action(conf, Seq(destDir, 57 | monthCount, 58 | fileCount, 59 | rowCount)) 60 | result 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/common/HadoopVersionSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License.
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.common 19 | 20 | import java.util 21 | import java.util.Collections 22 | 23 | import scala.collection.JavaConverters._ 24 | 25 | import com.cloudera.spark.cloud.common.CloudSuite._ 26 | import com.cloudera.spark.cloud.test.UnitTestSuite 27 | 28 | class HadoopVersionSuite extends UnitTestSuite { 29 | 30 | test("Sysprops") { 31 | val props = System.getProperties 32 | val list = new util.ArrayList[String](props.stringPropertyNames()) 33 | Collections.sort(list) 34 | val plist = list.asScala 35 | .filter(k => (!k.startsWith("java.") && !k.startsWith("sun."))) 36 | .map(key => s"$key = ${props.getProperty(key)}") 37 | .mkString("\n") 38 | logInfo(s"Properties:\n$plist") 39 | } 40 | 41 | test("PropagatedValues") { 42 | val mapped = StoreTestHelper.loadConfiguration().asScala 43 | .filter { entry => 44 | val k = entry.getKey 45 | k.startsWith("fs.s3a") && !k.contains("key") 46 | } 47 | .map(entry => s"${entry.getKey} = ${entry.getValue}").toList.sorted 48 | val list = mapped.mkString("\n") 49 | logInfo(s"S3A config options:\n${list}") 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/S3DependencyCheckSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3 19 | 20 | import com.cloudera.spark.cloud.test.UnitTestSuite 21 | 22 | /** 23 | * Force findClass in hadoop s3n/s3a classes and some dependencies. 24 | * Dependency problems should be picked up at compile time; runtime may 25 | * identify problems with any other transitive library. 26 | */ 27 | class S3DependencyCheckSuite extends UnitTestSuite { 28 | 29 | test("Create S3A FS Instance") { 30 | instantiate("org.apache.hadoop.fs.s3a.S3AFileSystem") 31 | } 32 | 33 | test("hive") { 34 | instantiate("org.apache.hadoop.hive.conf.HiveConf") 35 | } 36 | 37 | /** 38 | * Instantiate the class. 39 | * This is wrapped because scalatest gets confused by instantiation Errors raised 40 | * in a test case: they aren't raised in test methods. 41 | * @param classname class to instantiate.
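 * Example (matching the test cases above):
 * {{{
 *   instantiate("org.apache.hadoop.fs.s3a.S3AFileSystem")
 * }}}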
42 | */ 43 | def instantiate(classname: String) { 44 | try { 45 | val clazz = this.getClass.getClassLoader.loadClass(classname) 46 | clazz.newInstance() 47 | } catch { 48 | case e: Exception => throw e 49 | case e: Throwable => throw new Exception(s"Could not instantiate $classname", e) 50 | } 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /spark-cloud-integration/src/main/scala/com/cloudera/spark/cloud/CommitterInfo.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud 19 | 20 | import com.cloudera.spark.cloud.CommitterBinding.factoryForSchema 21 | import com.cloudera.spark.cloud.utils.HConf 22 | import org.apache.hadoop.conf.Configuration 23 | 24 | import org.apache.spark.SparkConf 25 | 26 | /** 27 | * Representation of a committer. 28 | * @param name committer name for the s3a manifestation 29 | * @param factory factory classname 30 | */ 31 | case class CommitterInfo(name: String, factory: String) 32 | extends HConf { 33 | 34 | def bind(sparkConf: SparkConf): Unit = { 35 | bindToSchema(sparkConf, "s3a") 36 | } 37 | 38 | def bind(conf: Configuration): Unit = { 39 | bindToSchema(conf, "s3a") 40 | } 41 | 42 | def bindToSchema(sparkConf: SparkConf, fsSchema: String): Unit = { 43 | hconf(sparkConf, factoryForSchema(fsSchema), factory) 44 | hconf(sparkConf, CommitterBinding.S3A_COMMITTER_NAME, 45 | name) 46 | } 47 | 48 | def bindToSchema(conf: Configuration, fsSchema: String): Unit = { 49 | conf.set(factoryForSchema(fsSchema), factory) 50 | conf.set(CommitterBinding.S3A_COMMITTER_NAME, name) 51 | } 52 | 53 | override def toString: String = s"Committer binding $factory($name)" 54 | } 55 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/org/apache/spark/sql/sources/AbtractOrcRelationSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License.
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.sources 19 | 20 | import org.apache.spark.sql.Row 21 | import org.apache.spark.sql.hive.orc.OrcFileFormat 22 | import org.apache.spark.sql.internal.SQLConf 23 | 24 | /** 25 | * cloud relation suite with some orc specific tests. 26 | */ 27 | abstract class AbtractOrcRelationSuite extends CloudRelationBasicSuite { 28 | 29 | import testImplicits._ 30 | 31 | override val dataSourceName: String = classOf[OrcFileFormat].getCanonicalName 32 | 33 | ctest("SPARK-12218", 34 | "'Not' is included in ORC filter pushdown", false) { 35 | 36 | withSQLConf(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> "true") { 37 | withTempPathDir("SPARK-12218") { dir => 38 | val path = s"${dir.toString}/table1" 39 | (1 to 5).map(i => (i, (i % 2).toString)).toDF("a", "b").write.orc(path) 40 | 41 | checkAnswer( 42 | spark.read.orc(path).where("not (a = 2) or not(b in ('1'))"), 43 | (1 to 5).map(i => Row(i, (i % 2).toString))) 44 | 45 | checkAnswer( 46 | spark.read.orc(path).where("not (a = 2 and b in ('1'))"), 47 | (1 to 5).map(i => Row(i, (i % 2).toString))) 48 | } 49 | } 50 | } 51 | 52 | 53 | } 54 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/utils/ExtraAssertions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.utils 19 | 20 | import org.apache.hadoop.conf.Configuration 21 | import org.scalatest.Assertions 22 | 23 | trait ExtraAssertions extends Assertions { 24 | 25 | /** 26 | * Expect a specific value; raise an assertion if it is not there 27 | * 28 | * @param v value 29 | * @param msg message 30 | * @tparam T type 31 | * @return the actual value 32 | */ 33 | def expectSome[T](v: Option[T], msg: => String): T = { 34 | v.getOrElse(throw new AssertionError(msg)) 35 | } 36 | 37 | /** 38 | * Expect a value to be non-null; return it. It will 39 | * implicitly be non-null in further use. 
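 * A minimal usage sketch (the configuration-loading call is illustrative,
 * borrowed from `HadoopVersionSuite`):
 * {{{
 *   val conf = expectNotNull(StoreTestHelper.loadConfiguration(), "no configuration loaded")
 * }}}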
40 | * 41 | * @param v value to check 42 | * @param msg message for any assertion 43 | * @tparam T type of value 44 | * @return the (non-null) value 45 | */ 46 | def expectNotNull[T](v: T, msg: => String): T = { 47 | if (v != null) v else throw new AssertionError(msg) 48 | } 49 | 50 | /** 51 | * Expect a configuration option to be set 52 | * 53 | * @param c config 54 | * @param key key to look for 55 | * @return the set value 56 | */ 57 | def expectOptionSet(c: Configuration, key: String): String = { 58 | expectNotNull(c.get(key), s"Unset property ${key}") 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/common/CsvDatasourceSupport.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.common 19 | 20 | import org.apache.hadoop.fs.Path 21 | 22 | trait CsvDatasourceSupport { 23 | 24 | /** 25 | * Predicate to define whether or not there's a CSV file to work with. 26 | * 27 | * @return true if the CSV test file is defined. 28 | */ 29 | def hasCSVTestFile(): Boolean = false 30 | 31 | /** 32 | * Path to the CSV file's original source 33 | * @return a path 34 | */ 35 | def sourceCSVFilePath: Option[Path] = None 36 | 37 | /** 38 | * Path to the CSV file used in the tests themselves; may differ from 39 | * the original source 40 | * 41 | * @return path to test data: valid after `prepareTestCSVFile`. 42 | */ 43 | def testCSVFilePath: Option[Path] = sourceCSVFilePath 44 | 45 | /** 46 | * Get the test CSV file or raise an exception. 47 | * @return the CSV path for tests 48 | */ 49 | def getTestCSVPath(): Path = testCSVFilePath.get 50 | 51 | /** 52 | * Any operation to prepare the CSV file. After completion, 53 | * `testCSVFilePath` returns the path to the test CSV file. 54 | */ 55 | def prepareTestCSVFile(): Unit = { 56 | require(hasCSVTestFile(), "No CSV file") 57 | require(sourceCSVFilePath.isDefined, "No source CSV file") 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /cloud-examples/src/test/resources/log4j2.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License.
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # lifted from spark core/src/test/resources. 19 | # from the log4j docs: 20 | # > An understanding of how loggers work in Log4j is critical before 21 | # > trying to configure them. 22 | # > Please reference the Log4j architecture if more information is required. 23 | # > Trying to configure Log4j without understanding those concepts will lead to frustration. 24 | 25 | # Set everything to be logged to the file target/unit-tests.log 26 | rootLogger.level = info 27 | rootLogger.appenderRef.file.ref = ${sys:test.appender:-File} 28 | 29 | appender.file.type = File 30 | appender.file.name = File 31 | appender.file.fileName = target/unit-tests.log 32 | appender.file.layout.type = PatternLayout 33 | appender.file.layout.pattern = %d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n%ex 34 | 35 | # Tests that launch java subprocesses can set the "test.appender" system property to 36 | # "console" to avoid having the child process's logs overwrite the unit test's 37 | # log file. 38 | appender.console.type = Console 39 | appender.console.name = console 40 | appender.console.target = SYSTEM_ERR 41 | appender.console.layout.type = PatternLayout 42 | appender.console.layout.pattern = %t: %m%n%ex 43 | 44 | # Ignore messages below warning level from Jetty, because it's a bit verbose 45 | logger.jetty.name = org.sparkproject.jetty 46 | logger.jetty.level = warn 47 | 48 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/gs/AbstractGsCommitterSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.gs 19 | 20 | import com.cloudera.spark.cloud.ObjectStoreConfigurations 21 | import com.cloudera.spark.cloud.common.CloudSuite 22 | 23 | 24 | 25 | import org.apache.spark.{SparkConf, SparkScopeWorkarounds} 26 | 27 | abstract class AbstractGsCommitterSuite extends CloudSuite with GsTestSetup { 28 | /** 29 | * Patch up hive for re-use. 
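 * Presumably this points Hive at a transient Derby metastore (see
 * `SparkScopeWorkarounds.tempHiveConfig()` below) so that suites can be
 * re-run without colliding on a shared metastore directory.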
30 | * 31 | * @param sparkConf configuration to patch 32 | */ 33 | def addTransientDerbySettings(sparkConf: SparkConf): Unit = { 34 | hconf(sparkConf, SparkScopeWorkarounds.tempHiveConfig()) 35 | } 36 | 37 | /** 38 | * Override point for suites: a method which is called 39 | * in all the `newSparkConf()` methods. 40 | * This can be used to alter values for the configuration. 41 | * It is called before the configuration read in from the command line 42 | * is applied, so that tests can override the values applied in-code. 43 | * 44 | * @param sparkConf spark configuration to alter 45 | */ 46 | override protected def addSuiteConfigurationOptions(sparkConf: SparkConf): Unit = { 47 | super.addSuiteConfigurationOptions(sparkConf) 48 | logDebug("Patching spark conf with committer bindings") 49 | sparkConf.setAll(ObjectStoreConfigurations.COMMITTER_OPTIONS) 50 | addTransientDerbySettings(sparkConf) 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/S3ACSVReadSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3 19 | 20 | import com.cloudera.spark.cloud.common.CSVReadTests 21 | 22 | /** 23 | * A suite of tests reading in the S3A CSV file. 
24 | */ 25 | class S3ACSVReadSuite extends CSVReadTests with S3ATestSetup with SequentialIOPolicy { 26 | 27 | init() 28 | 29 | def init(): Unit = { 30 | setupFilesystemConfiguration(getConf) 31 | if (enabled) { 32 | initDatasources() 33 | } 34 | } 35 | 36 | 37 | /* class RemoteOutputIterator[T](private val source: RemoteIterator[T]) extends Iterator[T] { 38 | def hasNext: Boolean = source.hasNext 39 | 40 | def next: T = source.next() 41 | }*/ 42 | 43 | /* 44 | * This doesn't do much, except that it is designed to be pasted straight into 45 | * Zeppelin and work 46 | */ 47 | /* ctest("DirOps", "simple directory ops in spark context process") { 48 | val source = CSV_TESTFILE.get 49 | sc = new SparkContext("local", "CSVgz", newSparkConf(source)) 50 | 51 | import org.apache.hadoop.fs._ 52 | val landsat = "s3a://landsat-pds/scene_list.gz" 53 | val landsatPath = new Path(landsat) 54 | val fs = landsatPath.getFileSystem(sc.hadoopConfiguration) 55 | val files = fs.listFiles(landsatPath.getParent, false) 56 | val listing = new RemoteOutputIterator(files) 57 | listing.foreach(print(_)) 58 | 59 | }*/ 60 | 61 | } 62 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/committers/AbstractCommitterSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.committers 19 | 20 | import com.cloudera.spark.cloud.ObjectStoreConfigurations 21 | import com.cloudera.spark.cloud.common.CloudSuite 22 | import com.cloudera.spark.cloud.s3.S3ATestSetup 23 | 24 | import org.apache.spark.{SparkConf, SparkScopeWorkarounds} 25 | 26 | abstract class AbstractCommitterSuite extends CloudSuite { 27 | /** 28 | * Patch up hive for re-use. 29 | * 30 | * @param sparkConf configuration to patch 31 | */ 32 | def addTransientDerbySettings(sparkConf: SparkConf): Unit = { 33 | hconf(sparkConf, SparkScopeWorkarounds.tempHiveConfig()) 34 | } 35 | 36 | /** 37 | * Override point for suites: a method which is called 38 | * in all the `newSparkConf()` methods. 39 | * This can be used to alter values for the configuration. 40 | * It is called before the configuration read in from the command line 41 | * is applied, so that tests can override the values applied in-code. 
42 | * 43 | * @param sparkConf spark configuration to alter 44 | */ 45 | override protected def addSuiteConfigurationOptions(sparkConf: SparkConf): Unit = { 46 | super.addSuiteConfigurationOptions(sparkConf) 47 | logDebug("Patching spark conf with s3a committer bindings") 48 | sparkConf.setAll(ObjectStoreConfigurations.COMMITTER_OPTIONS) 49 | addTransientDerbySettings(sparkConf) 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/azure/AzureDataFrameSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.azure 19 | 20 | import com.cloudera.spark.cloud.common.DataFrameTests 21 | 22 | import org.apache.spark.SparkConf 23 | import org.apache.spark.sql.SparkSession 24 | import org.apache.spark.sql.types.StringType 25 | 26 | /** 27 | * Test Azure and DataFrames 28 | */ 29 | class AzureDataFrameSuite extends DataFrameTests with AzureTestSetup { 30 | 31 | init() 32 | 33 | def init(): Unit = { 34 | if (enabled) { 35 | initFS() 36 | } 37 | } 38 | 39 | /** 40 | * This is the source for the example; it is here to ensure it compiles. 41 | */ 42 | def example(sparkConf: SparkConf): Unit = { 43 | val spark = SparkSession 44 | .builder 45 | .appName("DataFrames") 46 | .config(sparkConf) 47 | .getOrCreate() 48 | import spark.implicits._ 49 | val numRows = 1000 50 | val sourceData = spark.range(0, numRows).select($"id".as("l"), $"id".cast(StringType).as("s")) 51 | val dest = "wasb://yourcontainer@youraccount.blob.core.windows.net/dataframes" 52 | val orcFile = dest + "/data.orc" 53 | sourceData.write.format("orc").save(orcFile) 54 | // read it back 55 | val orcData = spark.read.format("orc").load(orcFile) 56 | // save it to parquet 57 | val parquetFile = dest + "/data.parquet" 58 | orcData.write.format("parquet").save(parquetFile) 59 | spark.stop() 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/abfs/commit/AbstractAbfsCommitterSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.abfs.commit 19 | 20 | import com.cloudera.spark.cloud.ObjectStoreConfigurations 21 | import com.cloudera.spark.cloud.abfs.AbfsTestSetup 22 | import com.cloudera.spark.cloud.common.CloudSuite 23 | 24 | import org.apache.spark.{SparkConf, SparkScopeWorkarounds} 25 | 26 | abstract class AbstractAbfsCommitterSuite extends CloudSuite with AbfsTestSetup { 27 | /** 28 | * Patch up hive for re-use. 29 | * 30 | * @param sparkConf configuration to patch 31 | */ 32 | def addTransientDerbySettings(sparkConf: SparkConf): Unit = { 33 | hconf(sparkConf, SparkScopeWorkarounds.tempHiveConfig()) 34 | } 35 | 36 | /** 37 | * Override point for suites: a method which is called 38 | * in all the `newSparkConf()` methods. 39 | * This can be used to alter values for the configuration. 40 | * It is called before the configuration read in from the command line 41 | * is applied, so that tests can override the values applied in-code. 42 | * 43 | * @param sparkConf spark configuration to alter 44 | */ 45 | override protected def addSuiteConfigurationOptions(sparkConf: SparkConf): Unit = { 46 | super.addSuiteConfigurationOptions(sparkConf) 47 | logDebug("Patching spark conf with committer bindings") 48 | sparkConf.setAll(ObjectStoreConfigurations.COMMITTER_OPTIONS) 49 | addTransientDerbySettings(sparkConf) 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/common/ReadSample.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.common 19 | 20 | //import org.apache.spark.mllib.linalg.Vectors 21 | 22 | /** 23 | * A sample of a read operation. 24 | * @param started start time in nS 25 | * @param duration duration nS 26 | * @param blockSize size of block worked with 27 | * @param bytesRequested how many bytes were requested 28 | * @param bytesRead how many bytes were actually returned 29 | * @param pos position in the object where the read was requested. 
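 * For example (illustrative numbers): a read which requested 8192 bytes but
 * returned 4096 of them in 1000ns has `delta = 4096` and, by integer division,
 * `perByte = 4`; note that `perByte` is bytes per nanosecond, not time per byte.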
30 | */ 31 | class ReadSample( 32 | val started: Long, 33 | val duration: Long, 34 | val blockSize: Int, 35 | val bytesRequested: Int, 36 | val bytesRead: Int, 37 | val pos: Long) extends Serializable { 38 | 39 | def perByte: Long = { if (duration > 0) bytesRead / duration else -1L } 40 | 41 | def delta: Int = { bytesRequested - bytesRead } 42 | 43 | override def toString: String = s"ReadSample(started=$started, duration=$duration," + 44 | s" blockSize=$blockSize, bytesRequested=$bytesRequested, bytesRead=$bytesRead)" + 45 | s" pos=$pos" 46 | 47 | /* def toVector = { 48 | val a = new Array[Double](8) 49 | a(0) = started.toDouble 50 | a(1) = duration.toDouble 51 | a(2) = blockSize.toDouble 52 | a(3) = bytesRequested.toDouble 53 | a(4) = bytesRead.toDouble 54 | a(5) = pos.toDouble 55 | a(6) = perByte.toDouble 56 | a(7) = delta.toDouble 57 | Vectors.dense(a) 58 | }*/ 59 | 60 | } 61 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/common/DataFrameTests.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.cloudera.spark.cloud.common 19 | 20 | import com.cloudera.spark.cloud.operations.CloudDataFrames 21 | 22 | /** 23 | * Test dataframe and object store integration 24 | */ 25 | abstract class DataFrameTests extends CloudSuite { 26 | 27 | after { 28 | cleanFilesystemInTeardown() 29 | } 30 | 31 | /** 32 | * Override point: the data frame operation to execute 33 | */ 34 | protected val instance: CloudDataFrames = new CloudDataFrames() 35 | 36 | ctest("DataFrames", 37 | "Execute the Data Frames example") { 38 | val conf = newSparkConf() 39 | conf.setAppName("DataFrames") 40 | val destDir = testPath(filesystem, "dataframes") 41 | val rowCount = 1000 42 | 43 | val args = Seq(destDir, rowCount) 44 | assert(0 === instance.action(conf, args), 45 | s" action($args) failed against $instance") 46 | 47 | // do a recursive listFiles 48 | val listing = logDuration("listFiles(recursive)") { 49 | listFiles(filesystem, destDir, true) 50 | } 51 | 52 | var recursivelyListedFilesDataset = 0L 53 | var recursivelyListedFiles = 0 54 | logDuration("scan result list") { 55 | listing.foreach{status => 56 | recursivelyListedFiles += 1 57 | recursivelyListedFilesDataset += status.getLen 58 | logInfo(s"${status.getPath}[${status.getLen}]") 59 | } 60 | } 61 | 62 | logInfo(s"FileSystem $filesystem") 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/common/ContextFreeCloudSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.common 19 | 20 | import com.cloudera.spark.cloud.s3.S3AConstants 21 | import org.scalatest.concurrent.Eventually 22 | import org.scalatest.BeforeAndAfter 23 | 24 | import org.apache.spark.SparkFunSuite 25 | import org.apache.spark.sql.SparkSession 26 | 27 | /** 28 | * A cloud suite which doesn't create a spark context. 29 | */ 30 | abstract class ContextFreeCloudSuite extends SparkFunSuite 31 | with BeforeAndAfter 32 | with Eventually with S3AConstants with CloudSuiteTrait { 33 | 34 | } 35 | 36 | /** 37 | * Cloud test suite with a spark session to clean up afterwards 38 | */ 39 | abstract class SparkSessionCloudSuite extends ContextFreeCloudSuite { 40 | 41 | var _sparkSession: SparkSession = null 42 | 43 | def sparkSession = _sparkSession 44 | 45 | def setSparkSession(s: SparkSession): Unit = { 46 | _sparkSession = s 47 | } 48 | 49 | /** 50 | * Close any spark session. 
51 | */ 52 | def closeSparkSession(): Unit = { 53 | if (_sparkSession != null) { 54 | _sparkSession.close() 55 | _sparkSession = null 56 | // To avoid RPC rebinding to the same port, since it doesn't unbind immediately on shutdown 57 | // (based on LocalSparkContext; no idea if still holds) 58 | System.clearProperty("spark.driver.port") 59 | } 60 | } 61 | 62 | 63 | override def afterEach(): Unit = { 64 | try { 65 | closeSparkSession() 66 | } finally { 67 | super.afterEach() 68 | } 69 | } 70 | 71 | } -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/gs/GSDependencyCheckSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.gs 19 | 20 | import com.cloudera.spark.cloud.test.UnitTestSuite 21 | import com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem 22 | import com.google.cloud.hadoop.fs.gcs.HadoopConfigurationProperty 23 | import org.apache.hadoop.fs.FileSystem 24 | 25 | /** 26 | * Force findClass in hadoop gcs classes and some dependencies. 27 | * Dependency problems should be picked up at compile time; runtime may 28 | * identify problems with any other transitive library. 29 | */ 30 | class GSDependencyCheckSuite extends UnitTestSuite { 31 | 32 | test("Create GCS FS Instance") { 33 | instantiate("com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem") 34 | } 35 | 36 | test("compile time check of filesystem") { 37 | val fs = new GoogleHadoopFileSystem() 38 | assert(fs.isInstanceOf[FileSystem]) 39 | } 40 | 41 | test("config") { 42 | new HadoopConfigurationProperty("key") 43 | } 44 | 45 | /** 46 | * Instantiate the class. 47 | * This is wrapped because scalatest gets confused by instantiation Errors raised 48 | * in a test case: they aren't raised in test methods. 49 | * @param classname class to instantiate. 50 | */ 51 | def instantiate(classname: String) { 52 | try { 53 | val clazz = this.getClass.getClassLoader.loadClass(classname) 54 | clazz.newInstance() 55 | } catch { 56 | case e: Exception => throw e 57 | case e: Throwable => throw new Exception(s"Could not instantiate $classname", e) 58 | } 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /spark-cloud-integration/src/main/scala/com/cloudera/spark/cloud/utils/ConfigSerDeser.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements.
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.utils 19 | 20 | import java.io.{ObjectInputStream, ObjectOutputStream} 21 | 22 | import org.apache.hadoop.conf.Configuration 23 | 24 | /** 25 | * Class to make Hadoop configurations serializable; uses the 26 | * `Writable` operations to do this. 27 | * Note: this only serializes the explicitly set values, not any set 28 | * in site/default or other XML resources. 29 | * @param conf configuration to serialize 30 | */ 31 | class ConfigSerDeser(var conf: Configuration) extends Serializable { 32 | 33 | private val serialVersionUID = 0xABBA0000 34 | 35 | /** 36 | * Empty constructor: binds to a `new Configuration()`. 37 | */ 38 | def this() { 39 | this(new Configuration()) 40 | } 41 | 42 | /** 43 | * Get the current configuration. 44 | * @return the configuration. 45 | */ 46 | def get(): Configuration = conf 47 | 48 | /** 49 | * Serializable writer. 50 | * @param out output stream 51 | */ 52 | private def writeObject(out: ObjectOutputStream): Unit = { 53 | conf.write(out) 54 | } 55 | 56 | /** 57 | * Serializable reader. 58 | * @param in input 59 | */ 60 | private def readObject(in: ObjectInputStream): Unit = { 61 | conf = new Configuration() 62 | conf.readFields(in) 63 | } 64 | 65 | /** 66 | * Handle a read without data; this should never be called, but it 67 | * is here as a safety mechanism. 68 | */ 69 | private def readObjectNoData(): Unit = { 70 | conf = new Configuration() 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/s3/S3AExampleSetup.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3 19 | 20 | import com.cloudera.spark.cloud.ObjectStoreConfigurations 21 | import com.cloudera.spark.cloud.common.StoreTestOperations 22 | import org.apache.hadoop.conf.Configuration 23 | 24 | import org.apache.spark.SparkConf 25 | 26 | /** 27 | * Base class for examples working with S3. 28 | */ 29 | trait S3AExampleSetup extends StoreTestOperations with S3AConstants { 30 | 31 | /** 32 | * Set the standard S3A Hadoop options to be used in test/examples. 33 | * If Random IO is expected, then the experimental fadvise option is 34 | * set to random. 35 | * 36 | * @param sparkConf spark configuration to patch 37 | * @param randomIO is the IO expected to be random access? 38 | */ 39 | override protected def applyObjectStoreConfigurationOptions( 40 | sparkConf: SparkConf, randomIO: Boolean): Unit = { 41 | super.applyObjectStoreConfigurationOptions(sparkConf, true) 42 | // smaller block size to divide up work 43 | hconf(sparkConf, BLOCK_SIZE, 1 * 1024 * 1024) 44 | hconf(sparkConf, MULTIPART_SIZE, MIN_PERMITTED_MULTIPART_SIZE) 45 | hconf(sparkConf, READAHEAD_RANGE, "128K") 46 | hconf(sparkConf, MIN_MULTIPART_THRESHOLD, MIN_PERMITTED_MULTIPART_SIZE) 47 | hconf(sparkConf, INPUT_FADVISE, if (randomIO) RANDOM_IO else NORMAL_IO) 48 | // disable file output in the path output committer as a safety check 49 | hconf(sparkConf, REJECT_FILE_OUTPUT, true) 50 | verifyConfigurationOptions(sparkConf, 51 | ObjectStoreConfigurations.COMMITTER_OPTIONS) 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/org/apache/spark/sql/sources/CloudPartitionTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.sources 19 | 20 | import org.apache.hadoop.fs.Path 21 | 22 | import org.apache.spark.sql._ 23 | import org.apache.spark.sql.types.{IntegerType, StructField, StructType} 24 | 25 | /** 26 | * Test of a single operation; isolated for debugging.
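 * The test writes a partition tree of the following shape (derived from
 * `rows = 3` and `part1size = 2` in the code below):
 * {{{
 *   <base>/p1=1/p2=foo
 *   <base>/p1=1/p2=bar
 *   <base>/p1=2/p2=foo
 *   <base>/p1=2/p2=bar
 * }}}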
27 | */ 28 | abstract class CloudPartitionTest extends AbstractCloudRelationTest { 29 | 30 | import testImplicits._ 31 | 32 | protected val rows = 3 33 | protected val part1size = 2 34 | 35 | ctest( 36 | "save-findClass-partitioned-part-columns-in-data", 37 | "Save sets of files in explicitly set up partition tree; read") { 38 | withTempPathDir("part-columns", None) { path => 39 | for (p1 <- 1 to part1size; p2 <- Seq("foo", "bar")) { 40 | val partitionDir = new Path(path, s"p1=$p1/p2=$p2") 41 | val df = sparkContext 42 | .parallelize(for (i <- 1 to rows) yield (i, s"val_$i", p1)) 43 | .toDF("a", "b", "p1") 44 | 45 | df.write 46 | .format(dataSourceName) 47 | .mode(SaveMode.ErrorIfExists) 48 | .save(partitionDir.toString) 49 | // each of these directories has its own success file; there is 50 | // none at the root 51 | resolveSuccessFile(partitionDir, true) 52 | } 53 | 54 | val dataSchemaWithPartition = 55 | StructType( 56 | dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) 57 | 58 | checkQueries( 59 | spark.read.options(Map( 60 | "path" -> path.toString, 61 | "dataSchema" -> dataSchemaWithPartition.json)).format(dataSourceName) 62 | .load()) 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /spark-cloud-integration/src/main/scala/org/apache/spark/cloudera/statistics/IOStatisticsAccumulator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.cloudera.statistics 19 | 20 | import org.apache.hadoop.fs.statistics.{IOStatistics, IOStatisticsSnapshot, IOStatisticsSource} 21 | 22 | import org.apache.spark.util.AccumulatorV2 23 | 24 | /** 25 | * An accumulator which collects and aggregates IOStatistics. 26 | */ 27 | class IOStatisticsAccumulator extends AccumulatorV2[IOStatistics, IOStatisticsSnapshot] 28 | with IOStatisticsSource { 29 | 30 | // the snapshot to accumulate. 31 | private var iostatistics: IOStatisticsSnapshot = new IOStatisticsSnapshot() 32 | 33 | /** 34 | * Empty if all the various maps are empty. 35 | * Not thread safe. 36 | * @return true if the accumulator is empty.
37 | */ 38 | override def isZero: Boolean = iostatistics.counters().isEmpty && 39 | iostatistics.gauges().isEmpty && 40 | iostatistics.maximums().isEmpty && 41 | iostatistics.minimums().isEmpty && 42 | iostatistics.meanStatistics().isEmpty 43 | 44 | override def copy(): AccumulatorV2[IOStatistics, IOStatisticsSnapshot] = { 45 | val newAcc = new IOStatisticsAccumulator() 46 | newAcc.add(this.iostatistics) 47 | newAcc 48 | } 49 | 50 | override def reset(): Unit = { 51 | iostatistics.clear() 52 | } 53 | 54 | override def add(v: IOStatistics): Unit = iostatistics.aggregate(v) 55 | 56 | override def merge(other: AccumulatorV2[IOStatistics, IOStatisticsSnapshot]): Unit = 57 | add(other.value) 58 | 59 | override def value: IOStatisticsSnapshot = iostatistics 60 | 61 | override def getIOStatistics: IOStatistics = iostatistics 62 | 63 | def register(name: String): Unit = { 64 | // a sketch: register with the active SparkContext under the given name; 65 | SparkContext.getOrCreate().register(this, name) 66 | } 67 | 68 | } 69 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/utils/StatisticsTracker.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.utils 19 | 20 | import scala.collection.JavaConverters._ 21 | 22 | import org.apache.hadoop.fs.{FileSystem, StorageStatistics} 23 | 24 | import org.apache.spark.internal.Logging 25 | 26 | class StatisticsTracker(fs: FileSystem) extends Logging { 27 | 28 | private val start: StorageStatistics = fs.getStorageStatistics 29 | 30 | import StatisticsTracker._ 31 | 32 | val original: Map[String, Long] = statsToMap(start) 33 | 34 | var updated: Map[String, Long] = Map() 35 | 36 | def update(): StatisticsTracker = { 37 | updated = statsToMap(fs.getStorageStatistics) 38 | this 39 | } 40 | 41 | /** 42 | * Build a diff from the original statistics snapshot to the updated one. 43 | * @return map of changed values only 44 | */ 45 | def diff(): Map[String, Long] = { 46 | updated.map { case (name: String, value: Long) => 47 | name -> (value - original.getOrElse(name, 0L)) 48 | }.filter { case (_, delta) => delta != 0 } 49 | } 50 | 51 | /** 52 | * Dump all changed values.
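 * A minimal sketch of use, assuming a workload has run against `fs`:
 * {{{
 *   val tracker = new StatisticsTracker(fs)
 *   // ... run the workload ...
 *   tracker.update()
 *   println(tracker.dump(" [", " = ", "]", "\n"))
 * }}}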
53 | * @param prefix prefix of each line 54 | * @param join separator between a statistic's name and its value 55 | * @param suffix suffix appended to each line 56 | * @param merge separator between lines 57 | * @return the formatted statistics 58 | */ 59 | def dump(prefix: String, join: String, suffix: String, merge: String): String = { 60 | diff().map { case (name, value) => 61 | prefix + name + join + value + suffix 62 | }.mkString(merge) 63 | 64 | } 65 | 66 | def dump(): String = { 67 | fs.getUri + "\n" + dump(" [", " = ", "]", "\n") 68 | } 69 | 70 | 71 | } 72 | 73 | object StatisticsTracker { 74 | 75 | def statsToMap(stats: StorageStatistics): Map[String, Long] = { 76 | 77 | stats.getLongStatistics.asScala.map { s => 78 | s.getName -> s.getValue 79 | }.toMap 80 | 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/commit/S3ACommitterFactorySuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3.commit 19 | 20 | import com.cloudera.spark.cloud.ObjectStoreConfigurations 21 | import org.apache.hadoop.fs.s3a.commit.S3ACommitterFactory 22 | import org.apache.hadoop.fs.s3a.commit.magic.MagicS3GuardCommitterFactory 23 | import org.apache.hadoop.fs.s3a.commit.staging.{DirectoryStagingCommitterFactory, PartitionedStagingCommitterFactory} 24 | 25 | import org.apache.spark.SparkConf 26 | 27 | /** 28 | * Explicitly create the S3A committers; forces compile-time 29 | * validation that the factory classes are on the classpath, 30 | * along with any direct dependencies. 31 | */ 32 | class S3ACommitterFactorySuite extends AbstractS3ACommitterSuite { 33 | 34 | init() 35 | 36 | def init(): Unit = { 37 | // propagate S3 credentials 38 | if (enabled) { 39 | initFS() 40 | } 41 | } 42 | 43 | /** 44 | * Override point for suites: a method which is called 45 | * in all the `newSparkConf()` methods. 46 | * This can be used to alter values for the configuration. 47 | * It is called before the configuration read in from the command line 48 | * is applied, so that tests can override the values applied in-code.
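 * For example (this is what the override below does):
 * {{{
 *   sparkConf.setAll(ObjectStoreConfigurations.COMMITTER_OPTIONS)
 * }}}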
49 | * 50 | * @param sparkConf spark configuration to alter 51 | */ 52 | override protected def addSuiteConfigurationOptions(sparkConf: SparkConf): Unit = { 53 | super.addSuiteConfigurationOptions(sparkConf) 54 | sparkConf.setAll(ObjectStoreConfigurations.COMMITTER_OPTIONS) 55 | } 56 | 57 | ctest("DirectoryStagingCommitterFactory on CP") { 58 | new DirectoryStagingCommitterFactory() 59 | } 60 | 61 | ctest("PartitionedStagingCommitterFactory on CP") { 62 | new PartitionedStagingCommitterFactory() 63 | } 64 | 65 | ctest("MagicS3GuardCommitterFactory on CP") { 66 | new MagicS3GuardCommitterFactory() 67 | } 68 | 69 | ctest("S3ACommitterFactory on CP") { 70 | new S3ACommitterFactory() 71 | } 72 | 73 | 74 | } 75 | -------------------------------------------------------------------------------- /cloud-examples/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | # log4j configuration used during build and unit tests 14 | 15 | log4j.rootLogger=INFO,stdout 16 | log4j.threshold=ALL 17 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 18 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 19 | log4j.appender.stdout.layout.ConversionPattern=%d{ISO8601} [%t] %-5p %c{2} (%F:%M(%L)) - %m%n 20 | 21 | # ALWAYS leave this at debug, it's used to explore what's up with logging 22 | log4j.logger.com.cloudera.spark.test.loglevels=DEBUG 23 | 24 | # Spark commit protocol 25 | #log4j.logger.org.apache.spark.internal.io=DEBUG 26 | #log4j.logger.com.hortonworks.spark=DEBUG 27 | 28 | #log4j.logger.org.apache.hadoop.fs.s3a=DEBUG 29 | log4j.logger.org.apache.hadoop.fs.s3a.S3ABlockOutputStream=INFO 30 | log4j.logger.org.apache.hadoop.fs.s3a.S3AStorageStatistics=INFO 31 | log4j.logger.org.apache.hadoop.fs.s3a.S3AUtils=INFO 32 | log4j.logger.org.apache.hadoop.mapreduce.lib.output.PathOutputCommitterFactory=DEBUG 33 | log4j.logger.org.apache.hadoop.fs.s3a.commit=DEBUG 34 | #log4j.logger.org.apache.hadoop.fs.s3a=DEBUG 35 | 36 | log4j.logger.org.apache.spark.ContextCleaner=WARN 37 | log4j.logger.org.apache.spark.storage.memory.MemoryStore=WARN 38 | log4j.logger.org.apache.spark.sql.execution.FileSourceScanExec=WARN 39 | log4j.logger.org.apache.spark.storage=WARN 40 | log4j.logger.org.apache.spark.sql.catalyst=WARN 41 | log4j.logger.org.apache.spark.SecurityManager=WARN 42 | log4j.logger.org.apache.spark.sql.internal=WARN 43 | log4j.logger.org.apache.spark.scheduler=WARN 44 | log4j.logger.org.apache.spark.SparkEnv=WARN 45 | log4j.logger.org.apache.spark.executor.Executor=WARN 46 | log4j.logger.org.apache.spark.sql.execution.streaming.state=WARN 47 | log4j.logger.org.apache.hadoop.hive.ql.io.orc.RecordReaderFactory=WARN 48 | 49 | 50 | #log4j.logger.org.apache.hadoop.mapreduce.lib.output.FileOutputCommitterFactory=DEBUG 51 | #log4j.logger.org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter=DEBUG 52 | 53 | 
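# Example of a further override (illustrative and commented out; enabling it
# produces very verbose output):
#log4j.logger.org.apache.hadoop.fs.s3a.S3AFileSystem=DEBUG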
log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR 54 | log4j.logger.org.mortbay.jetty=ERROR 55 | # disable deprecation noise 56 | log4j.logger.org.apache.hadoop.conf.Configuration.deprecation=ERROR 57 | 58 | # turn off other logs which are noisy during test runs 59 | log4j.logger.org.eclipse.jetty=ERROR 60 | log4j.logger.org.spark_project.jetty=ERROR 61 | log4j.logger.org.apache.hadoop.mapreduce.lib.output.committer.manifest=DEBUG 62 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/commit/Events.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3.commit 19 | 20 | import scala.collection.immutable 21 | 22 | /** 23 | * Case class for the test dataframes. 24 | */ 25 | case class Event( 26 | year: Int, month: Int, day: Int, ymd: Int, monthname: String, 27 | datestr: String, value: String) 28 | 29 | object Events { 30 | 31 | /** 32 | * Build up an event sequence across years; every day of every month in 33 | * every year has "rows" events generated. 34 | * @param year1 start year 35 | * @param year2 end year 36 | * @param startMonth start month 37 | * @param endMonth end month 38 | * @param rows rows per day 39 | * @return the event sequence. 40 | */ 41 | def events( 42 | year1: Int, 43 | year2: Int, 44 | startMonth: Int, 45 | endMonth: Int, 46 | rows: Int): immutable.IndexedSeq[Event] = { 47 | for (year <- year1 to year2; 48 | month <- startMonth to endMonth; 49 | day <- 1 to Months(month - 1)._2; 50 | r <- 1 to rows) 51 | yield event(year, 52 | month, 53 | day, 54 | "%d/%04f".format(r, Math.random() * 10000)) 55 | } 56 | 57 | def monthCount( 58 | year1: Int, 59 | year2: Int, 60 | startMonth: Int, 61 | endMonth: Int): Int = { 62 | var count = 0 63 | for (year <- year1 to year2; 64 | month <- startMonth to endMonth) 65 | count += 1 66 | count 67 | } 68 | 69 | /** 70 | * Create an event. 71 | * 72 | * @return the event.
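 * For example, {{{event(2017, 1, 15, "v")}}} yields an event with
 * ymd 20170115, monthname "Jan" and datestr "2017-01-15".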
73 | */ 74 | def event(year: Int, month: Int, day: Int, value: String): Event = { 75 | new Event(year, month, day, 76 | day + month * 100 + year * 10000, 77 | Months(month - 1)._1, 78 | "%04d-%02d-%02d".format(year, month, day), 79 | value 80 | ) 81 | } 82 | 83 | val Months = Array( 84 | ("Jan", 31), 85 | ("Feb", 28), 86 | ("Mar", 31), 87 | ("Apr", 30), 88 | ("May", 31), 89 | ("Jun", 30), 90 | ("Jul", 31), 91 | ("Aug", 31), 92 | ("Sep", 30), 93 | ("Oct", 31), 94 | ("Nov", 30), 95 | ("Dec", 31)) 96 | 97 | } 98 | -------------------------------------------------------------------------------- /spark-cloud-integration/src/main/scala/com/cloudera/spark/cloud/utils/HConf.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.utils 19 | 20 | import org.apache.spark.SparkConf 21 | 22 | /** 23 | * A minimal trait purely to set Hadoop configuration values in a Spark 24 | * Configuration. 25 | */ 26 | trait HConf { 27 | /** 28 | * Set a Hadoop option in a spark configuration. 29 | * 30 | * @param sparkConf configuration to update 31 | * @param key key 32 | * @param value new value 33 | */ 34 | def hconf(sparkConf: SparkConf, key: String, value: String): SparkConf = { 35 | sparkConf.set(hkey(key), value) 36 | sparkConf 37 | } 38 | 39 | /** 40 | * Set a boolean Hadoop option in a spark configuration. 41 | * 42 | * @param sparkConf configuration to update 43 | * @param key key 44 | * @param value new value 45 | */ 46 | 47 | def hconf(sparkConf: SparkConf, key: String, value: Boolean): SparkConf = { 48 | sparkConf.set(hkey(key), value.toString) 49 | sparkConf 50 | } 51 | 52 | /** 53 | * Take a Hadoop key, add the prefix to allow it to be added to 54 | * a Spark Config and then picked up properly later. 55 | * 56 | * @param key key 57 | * @return the new key 58 | */ 59 | def hkey(key: String): String = { 60 | "spark.hadoop." + key 61 | } 62 | 63 | /** 64 | * Set a long Hadoop option in a spark configuration. 65 | * 66 | * @param sparkConf configuration to update 67 | * @param key key 68 | * @param value new value 69 | */ 70 | def hconf(sparkConf: SparkConf, key: String, value: Long): SparkConf = { 71 | sparkConf.set(hkey(key), value.toString) 72 | sparkConf 73 | } 74 | 75 | /** 76 | * Set all supplied options on the spark configuration as hadoop options. 77 | * 78 | * @param sparkConf Spark configuration to update 79 | * @param settings map of settings.
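 * For example (the keys shown are standard S3A option names):
 * {{{
 *   hconf(sparkConf, Map(
 *     "fs.s3a.block.size" -> "1048576",
 *     "fs.s3a.readahead.range" -> "128K"))
 * }}}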
80 | */ 81 | def hconf(sparkConf: SparkConf, 82 | settings: Traversable[(String, Object)]): SparkConf = { 83 | settings.foreach { case (key, value) => hconf(sparkConf, key, value.toString) } 84 | sparkConf 85 | } 86 | 87 | } 88 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/org/apache/spark/sql/sources/HiveTestTrait.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.sources 19 | 20 | import java.io.File 21 | 22 | import com.cloudera.spark.cloud.ObjectStoreConfigurations 23 | import org.scalatest.BeforeAndAfterAll 24 | 25 | import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} 26 | import org.apache.spark.sql.SparkSession 27 | import org.apache.spark.sql.hive.test.TestHiveContext 28 | import org.apache.spark.util.Utils 29 | 30 | /** 31 | * A trait for tests which binds to a Hive context. 32 | * After all the tests, the Hive context is reset; then it and the Spark session 33 | * are closed.
34 | */ 35 | trait HiveTestTrait extends SparkFunSuite with BeforeAndAfterAll { 36 | // override protected val enableAutoThreadAudit = false 37 | protected var hiveContext: HiveInstanceForTests = _ 38 | protected var spark: SparkSession = _ 39 | 40 | 41 | protected override def beforeAll(): Unit = { 42 | super.beforeAll() 43 | // set up spark and hive context 44 | hiveContext = new HiveInstanceForTests() 45 | spark = hiveContext.sparkSession 46 | } 47 | 48 | protected override def afterAll(): Unit = { 49 | try { 50 | SparkSession.clearActiveSession() 51 | 52 | if (hiveContext != null) { 53 | hiveContext.reset() 54 | hiveContext = null 55 | } 56 | if (spark != null) { 57 | spark.close() 58 | spark = null 59 | } 60 | } finally { 61 | super.afterAll() 62 | } 63 | } 64 | 65 | } 66 | 67 | class HiveInstanceForTests 68 | extends TestHiveContext( 69 | new SparkContext( 70 | System.getProperty("spark.sql.test.master", "local[1]"), 71 | "TestSQLContext", 72 | new SparkConf() 73 | .setAll(ObjectStoreConfigurations.RW_TEST_OPTIONS) 74 | .set("spark.sql.warehouse.dir", 75 | TestSetup.makeWarehouseDir().toURI.getPath) 76 | ) 77 | ) { 78 | 79 | } 80 | 81 | 82 | 83 | 84 | object TestSetup { 85 | 86 | def makeWarehouseDir(): File = { 87 | val warehouseDir = Utils.createTempDir(namePrefix = "warehouse") 88 | warehouseDir.delete() 89 | warehouseDir 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /spark-cloud-integration/src/main/scala/org/apache/spark/cloudera/statistics/IOStatisticsCollectorExecutorPlugin.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.cloudera.statistics 19 | 20 | import java.util 21 | 22 | import org.apache.hadoop.fs.statistics.IOStatisticsContext 23 | 24 | import org.apache.spark.{SparkContext, TaskContext, TaskFailedReason} 25 | import org.apache.spark.api.plugin.{ExecutorPlugin, PluginContext} 26 | import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart} 27 | import org.apache.spark.util.TaskCompletionListener 28 | 29 | class IOStatisticsCollectorExecutorPlugin extends ExecutorPlugin { 30 | 31 | var context: PluginContext = _ 32 | 33 | override def init( 34 | ctx: PluginContext, 35 | extraConf: util.Map[String, String]): Unit = { 36 | 37 | context = ctx 38 | // TODO: obtain the active spark context so that the accumulator 39 | // can be registered against it 40 | SparkContext.getOrCreate() 41 | 42 | } 43 | override def shutdown(): Unit = super.shutdown() 44 | 45 | override def onTaskStart(): Unit = { 46 | val iostatsCtx: IOStatisticsContext = IOStatisticsContext.getCurrentIOStatisticsContext 47 | iostatsCtx.reset() 48 | val acc = new IOStatisticsAccumulator 49 | 50 | 51 | val taskContext = TaskContext.get() 52 | 53 | 54 | taskContext.registerAccumulator(acc) 55 | taskContext.addTaskCompletionListener(new TaskCompleted(acc, iostatsCtx)) 56 | 57 | } 58 | 59 | override def onTaskSucceeded(): Unit = super.onTaskSucceeded() 60 | 61 | override def onTaskFailed( 62 | failureReason: TaskFailedReason): Unit = super 63 | .onTaskFailed(failureReason) 64 | 65 | private class TaskCompleted( 66 | val acc: IOStatisticsAccumulator, 67 | val iostatsCtx: IOStatisticsContext) extends TaskCompletionListener { 68 | 69 | override def onTaskCompletion(context: TaskContext): Unit = { 70 | acc.add(iostatsCtx.getIOStatistics) 71 | } 72 | 73 | } 74 | 75 | private class SparkListenerImpl extends SparkListener { 76 | override def onJobStart( 77 | jobStart: SparkListenerJobStart): Unit = super 78 | .onJobStart(jobStart) 79 | } 80 | } 81 | 82 | 83 | object IOStatisticsCollectorExecutorPlugin { 84 | val ACCUMULATOR_NAME = "io_statistics" 85 | } -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/common/StoreTestHelper.scala: -------------------------------------------------------------------------------- 1 | package com.cloudera.spark.cloud.common 2 | 3 | /* 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import java.io.{File, FileNotFoundException} 21 | 22 | import com.cloudera.spark.cloud.s3.S3AConstants 23 | import com.cloudera.spark.cloud.CommitterBinding 24 | import org.apache.hadoop.conf.Configuration 25 | 26 | import org.apache.spark.internal.Logging 27 | 28 | /** 29 | * Singleton instantiation of the store test helper operations.
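 * Typical use: {{{val conf = StoreTestHelper.loadConfiguration()}}}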
30 | */ 31 | object StoreTestHelper extends StoreTestOperations 32 | with Logging 33 | with S3AConstants 34 | with CloudSuiteTrait { 35 | 36 | private var configLogged = false 37 | 38 | /** 39 | * Load the configuration file from the system property `SYSPROP_CLOUD_TEST_CONFIGURATION_FILE`. 40 | * Throws FileNotFoundException if a configuration is named but not present. 41 | * 42 | * @return the configuration 43 | */ 44 | def loadConfiguration(): Configuration = { 45 | val config = new Configuration(true) 46 | getKnownSysprop(SYSPROP_CLOUD_TEST_CONFIGURATION_FILE).foreach { filename => 47 | logDebug(s"Configuration property = `$filename`") 48 | val f = new File(filename) 49 | if (f.exists()) { 50 | // unsynced, but it's only a log statement 51 | if (!configLogged) { 52 | configLogged = true 53 | logInfo(s"Loading configuration from $f") 54 | } 55 | config.addResource(f.toURI.toURL) 56 | } else { 57 | throw new FileNotFoundException(s"No file '$filename'" + 58 | s" declared in property $SYSPROP_CLOUD_TEST_CONFIGURATION_FILE") 59 | } 60 | } 61 | overlayConfiguration( 62 | config, 63 | Seq( 64 | HIVE_TESTS_DISABLED, 65 | REQUIRED_HADOOP_VERSION, 66 | SCALE_TEST_ENABLED, 67 | SCALE_TEST_SIZE_FACTOR, 68 | S3A_COMMITTER_TEST_ENABLED, 69 | S3A_ENCRYPTION_KEY_1, 70 | S3A_ENCRYPTION_KEY_2 71 | ) 72 | ) 73 | 74 | // setup the committer from any property passed in 75 | getKnownSysprop(S3A_COMMITTER_NAME).foreach(committer => { 76 | val binding = CommitterBinding.COMMITTERS_BY_NAME(committer.toLowerCase()) 77 | binding.bind(config) 78 | logInfo(s"Using committer binding $binding") 79 | }) 80 | config 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/common/FileGeneratorTests.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.common 19 | 20 | import com.cloudera.spark.cloud.operations.CloudFileGenerator 21 | import org.apache.hadoop.fs.Path 22 | 23 | import org.apache.spark.SparkConf 24 | 25 | /** 26 | * Test the `FileGenerator` entry point. Use a small file number to keep the unit tests fast; some 27 | * cloud infras are very slow here. System tests can use the CLI instead.
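 * The generator is driven through its `action()` entry point; a sketch
 * (this is what the `generate()` helper below does):
 * {{{
 *   new CloudFileGenerator().action(conf,
 *     Seq(destDir, monthCount, fileCount, rowCount))
 * }}}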
28 | */ 29 | abstract class FileGeneratorTests extends CloudSuite { 30 | 31 | ctest("FileGenerator", "Execute the FileGenerator example") { 32 | val conf = newSparkConf() 33 | conf.setAppName("FileGenerator") 34 | val destDir = testPath(filesystem, "filegenerator") 35 | val months = 2 36 | val fileCount = 1 37 | val rowCount = 500 38 | 39 | assert(0 === generate(conf, destDir, months, fileCount, rowCount)) 40 | 41 | val status = filesystem.getFileStatus(destDir) 42 | assert(status.isDirectory, s"Not a directory: $status") 43 | 44 | val totalExpectedFiles = months * fileCount 45 | 46 | // do a recursive listFiles 47 | val listing = logDuration("listFiles(recursive)") { 48 | listFiles(filesystem, destDir, true) 49 | } 50 | var recursivelyListedFilesDataset = 0L 51 | var recursivelyListedFiles = 0 52 | logDuration("scan result list") { 53 | listing.foreach { status => 54 | recursivelyListedFiles += 1 55 | recursivelyListedFilesDataset += status.getLen 56 | logInfo(s"${status.getPath}[${status.getLen}]") 57 | } 58 | } 59 | 60 | logInfo(s"FileSystem $filesystem") 61 | assert(totalExpectedFiles === recursivelyListedFiles) 62 | } 63 | 64 | /** 65 | * Generate a set of files 66 | * @param conf configuration 67 | * @param destDir destination directory 68 | * @param monthCount number of months to generate 69 | * @param fileCount files per month 70 | * @param rowCount rows per file 71 | * @return the exit code of the operation 72 | */ 73 | def generate( 74 | conf: SparkConf, 75 | destDir: Path, 76 | monthCount: Int, 77 | fileCount: Int, 78 | rowCount: Int): Int = { 79 | val result = new CloudFileGenerator().action( 80 | conf, 81 | Seq(destDir, 82 | monthCount, 83 | fileCount, 84 | rowCount)) 85 | result 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /spark-cloud-integration/src/main/scala/com/cloudera/spark/cloud/s3/audit/LogParser.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3.audit 19 | 20 | import java.util.regex.Matcher 21 | 22 | import org.apache.hadoop.fs.s3a.audit.S3LogParser 23 | import org.apache.hadoop.fs.s3a.audit.S3LogParser._ 24 | 25 | 26 | /** 27 | * Log parsing using s3a audit classes. 
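 * A sketch of its use (`line` must be an S3 server-side log entry):
 * {{{
 *   LogParser.parse(line) match {
 *     case Some(entry) => println(entry.key)
 *     case None => // the line did not match the log pattern
 *   }
 * }}}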
28 | */ 29 | object LogParser { 30 | 31 | private val pattern = S3LogParser.LOG_ENTRY_PATTERN 32 | 33 | private def entry(matcher: Matcher, group: String): String = { 34 | val g = matcher.group(group) 35 | assert(g != null, s"Group $group is null") 36 | assert(g.nonEmpty, s"Group $group is empty") 37 | g 38 | } 39 | 40 | private def longEntry(m: Matcher, group: String): Long = { 41 | entry(m, group).toLong 42 | } 43 | 44 | /** 45 | * Parse a line. 46 | * @param line line 47 | * @return the entry or None if the regexp didn't match 48 | * @throws AssertionError if a group is null/empty 49 | */ 50 | def parse(line: String): Option[ServerLogEntry] = { 51 | val m = pattern.matcher(line) 52 | 53 | if (!m.matches()) { 54 | None 55 | } else { 56 | Some(ServerLogEntry( 57 | bucketowner = entry(m, OWNER_GROUP), 58 | bucket_name = entry(m, BUCKET_GROUP), 59 | requestdatetime = entry(m, TIMESTAMP_GROUP), 60 | remoteip = entry(m, REMOTEIP_GROUP), 61 | requester = entry(m, REQUESTER_GROUP), 62 | requestid = entry(m, REQUESTID_GROUP), 63 | operation = entry(m, VERB_GROUP), 64 | key = entry(m, KEY_GROUP), 65 | request_uri = entry(m, REQUESTURI_GROUP), 66 | httpstatus = entry(m, HTTP_GROUP), 67 | errorcode = entry(m, AWSERRORCODE_GROUP), 68 | bytessent = longEntry(m, BYTESSENT_GROUP), 69 | objectsize = longEntry(m, OBJECTSIZE_GROUP), 70 | totaltime = entry(m, TOTALTIME_GROUP), 71 | turnaroundtime = entry(m, TURNAROUNDTIME_GROUP), 72 | referrer = entry(m, REFERRER_GROUP), 73 | useragent = entry(m, USERAGENT_GROUP), 74 | versionid = entry(m, VERSION_GROUP), 75 | hostid = entry(m, HOSTID_GROUP), 76 | sigv = entry(m, SIGV_GROUP), 77 | ciphersuite = entry(m, CYPHER_GROUP), 78 | authtype = entry(m, AUTH_GROUP), 79 | endpoint = entry(m, ENDPOINT_GROUP), 80 | tlsversion = entry(m, TLS_GROUP))) 81 | } 82 | 83 | } 84 | 85 | } 86 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/common/StoreTestOperations.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.common 19 | 20 | import scala.concurrent.duration._ 21 | import scala.language.postfixOps 22 | 23 | import com.cloudera.spark.cloud.{GeneralCommitterConstants, ObjectStoreOperations} 24 | import org.apache.hadoop.conf.Configuration 25 | import org.apache.hadoop.fs.{FileStatus, FileSystem, LocatedFileStatus, Path} 26 | import org.scalatest.concurrent.Eventually 27 | import org.scalatest.time.Span 28 | 29 | import org.apache.spark.sql._ 30 | 31 | /** 32 | * Extends ObjectStoreOperations with some extra ones for testing.
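 * For example, {{{validateRowCount(spark, fs, dest, "orc", expectedRows)}}}
 * (with `expectedRows` a caller-side value) loads a dataset and verifies
 * its row count, returning the load time.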
33 | */ 34 | trait StoreTestOperations extends ObjectStoreOperations with Eventually { 35 | 36 | protected val retryTimeout: Span = 30 seconds 37 | 38 | protected val retryInterval: Span = 1000 milliseconds 39 | 40 | /** 41 | * Try to get the file status, _eventually_: the probe is retried 42 | * with the trait's timeout/interval until it succeeds. 43 | * @param fs filesystem 44 | * @param p path 45 | * @return the result of the successful probe 46 | */ 47 | def eventuallyGetFileStatus(fs: FileSystem, p: Path): FileStatus = { 48 | eventually(timeout(retryTimeout), interval(retryInterval))(fs.getFileStatus(p)) 49 | } 50 | 51 | /** 52 | * Load a DF and verify it has the expected number of rows. 53 | * 54 | * @param spark session 55 | * @param fs filesystem 56 | * @param source path 57 | * @param srcFormat format of source 58 | * @param rowCount expected row count 59 | * @return how long the load took 60 | */ 61 | def validateRowCount( 62 | spark: SparkSession, 63 | fs: FileSystem, 64 | source: Path, 65 | srcFormat: String, 66 | rowCount: Long): Long = { 67 | val success = new Path(source, GeneralCommitterConstants.SUCCESS_FILE_NAME) 68 | val status = fs.getFileStatus(success) 69 | assert(status.isDirectory || status.getBlockSize > 0, 70 | s"Block size 0 in $status") 71 | val files = listFiles(fs, source, true).filter { st => 72 | val name = st.getPath.getName 73 | st.isFile && !name.startsWith(".") && !name.startsWith("_") 74 | } 75 | assert(files.nonEmpty, s"No files in the directory $source") 76 | val (loadedCount, loadTime) = durationOf(loadDF(spark, source, srcFormat) 77 | .count()) 78 | logInfo(s"Loaded $source in $loadTime ns") 79 | require(rowCount == loadedCount, 80 | s"Expected $rowCount rows, but got $loadedCount from $source formatted as $srcFormat") 81 | loadTime 82 | } 83 | 84 | } 85 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/s3/S3ATestSetup.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3 19 | 20 | import java.net.URI 21 | 22 | import com.cloudera.spark.cloud.common.{CloudSuiteTrait, CsvDatasourceSupport} 23 | import org.apache.hadoop.conf.Configuration 24 | import org.apache.hadoop.fs.{FileSystem, Path} 25 | 26 | /** 27 | * Trait for S3A tests. 28 | */ 29 | trait S3ATestSetup extends CloudSuiteTrait with RandomIOPolicy with 30 | CsvDatasourceSupport { 31 | 32 | override def enabled: Boolean = { 33 | getConf.getBoolean(S3A_TESTS_ENABLED, false) && super.enabled 34 | 35 | } 36 | 37 | /** 38 | * This is *not* true, but is here to make sure the tests 39 | * fail the way they are meant to.
40 | * @return true if the store committer is expected to support dynamic partition overwrite 41 | */ 42 | override def dynamicPartitioning: Boolean = false 43 | 44 | def initFS(): FileSystem = { 45 | setupFilesystemConfiguration(getConf) 46 | createTestS3AFS 47 | } 48 | 49 | /** 50 | * Do the work of setting up the S3A test FS. 51 | * @return the filesystem 52 | */ 53 | protected def createTestS3AFS: FileSystem = { 54 | val s3aURI = new URI(requiredOption(S3A_TEST_URI)) 55 | logInfo(s"Executing S3 tests against $s3aURI with read policy $inputPolicy") 56 | createFilesystem(s3aURI) 57 | } 58 | 59 | /** 60 | * Override point: set up the configuration for the filesystem. 61 | * The base implementation sets up buffer directory, block size and IO policy. 62 | * @param config configuration to set up 63 | */ 64 | def setupFilesystemConfiguration(config: Configuration): Unit = { 65 | config.set(BUFFER_DIR, localTmpDir.getAbsolutePath) 66 | // a block size of 1MB 67 | config.set(BLOCK_SIZE, (1024 * 1024).toString) 68 | // the input policy 69 | config.set(INPUT_FADVISE, inputPolicy) 70 | } 71 | 72 | lazy val CSV_TESTFILE: Option[Path] = { 73 | val pathname = getConf.get(S3A_CSVFILE_PATH, S3A_CSV_PATH_DEFAULT) 74 | if (!pathname.isEmpty) Some(new Path(pathname)) else None 75 | } 76 | 77 | /** 78 | * Predicate to define whether or not there's a CSV file to work with. 79 | * @return true if the CSV test file is defined. 80 | */ 81 | override def hasCSVTestFile(): Boolean = CSV_TESTFILE.isDefined 82 | 83 | /** 84 | * Path to the CSV file's original source. 85 | * 86 | * @return a path 87 | */ 88 | override def sourceCSVFilePath: Option[Path] = CSV_TESTFILE 89 | } 90 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/commit/S3ACommitDataframeSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3.commit 19 | 20 | import com.cloudera.spark.cloud.CommitterBinding._ 21 | import com.cloudera.spark.cloud.committers.AbstractCommitDataframeSuite 22 | import com.cloudera.spark.cloud.s3.S3ATestSetup 23 | import com.cloudera.spark.cloud.CommitterInfo 24 | import org.apache.hadoop.fs.{FileSystem, Path} 25 | 26 | import org.apache.spark.sql.{Dataset, SparkSession} 27 | import org.apache.spark.SparkConf 28 | 29 | /** 30 | * Tests different data formats through the committers.
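 * The committers exercised are those named in the `committers` sequence
 * below: directory, partitioned and magic.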
31 | */ 32 | class S3ACommitDataframeSuite 33 | extends AbstractCommitDataframeSuite with S3ATestSetup { 34 | 35 | init() 36 | 37 | def init(): Unit = { 38 | // propagate S3 credentials 39 | if (enabled) { 40 | initFS() 41 | } 42 | } 43 | 44 | override def schema: String = "s3a" 45 | 46 | 47 | // there's an empty string at the end to aid with commenting out different 48 | // committers and not have to worry about any trailing commas 49 | override def committers: Seq[String] = Seq( 50 | DIRECTORY, 51 | PARTITIONED, 52 | MAGIC, 53 | "" 54 | ) 55 | 56 | 57 | override protected def setDynamicPartitioningOptions( 58 | sparkConf: SparkConf, 59 | committerInfo: CommitterInfo): Unit = { 60 | if (committerInfo.name == PARTITIONED) { 61 | hconf(sparkConf, S3A_CONFLICT_MODE, CONFLICT_MODE_REPLACE) 62 | } else { 63 | super 64 | .setDynamicPartitioningOptions(sparkConf, committerInfo) 65 | } 66 | } 67 | 68 | 69 | override protected def expectDynamicPartitioningToSucceed( 70 | committerInfo: CommitterInfo): Boolean = { 71 | committerInfo.name == PARTITIONED 72 | } 73 | 74 | override def anyOtherTests(spark: SparkSession, 75 | filesystem: FileSystem, 76 | subdir: Path, format: String, 77 | sourceData: Dataset[Event], 78 | eventData2: Dataset[Event], 79 | committerInfo: CommitterInfo): Unit = { 80 | if (committerInfo.name == PARTITIONED) { 81 | logInfo("Executing partitioned committer tests") 82 | // although the dynamic partition overwrite command doesn't work, 83 | // a normal query will trigger overwrite 84 | logDuration(s"overwrite dataset2 to $subdir in format $format") { 85 | eventData2 86 | .write 87 | .mode("overwrite") 88 | .partitionBy("year", "month") 89 | .format(format) 90 | .save(subdir.toString) 91 | } 92 | } 93 | 94 | 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /spark-cloud-integration/src/main/scala/com/cloudera/spark/cloud/CommitterBinding.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud 19 | 20 | import com.cloudera.spark.cloud.GeneralCommitterConstants.{ABFS_MANIFEST_COMMITTER_FACTORY, DEFAULT_COMMITTER_FACTORY, MANIFEST_COMMITTER_FACTORY, MANIFEST_COMMITTER_NAME} 21 | import org.apache.hadoop.fs.s3a.commit.CommitConstants 22 | 23 | /** 24 | * Constants related to the S3A committers. 25 | * Originally a copy & paste of the java values, it's now just a reference, 26 | * though retained to reserve the option of moving back to copied values.
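 * For example, {{{factoryForSchema("s3a")}}} builds the name of the
 * per-schema committer factory option for the "s3a" URI schema.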
27 | */ 28 | object CommitterBinding { 29 | 30 | def factoryForSchema(s: String): String = 31 | String.format( 32 | GeneralCommitterConstants.OUTPUTCOMMITTER_FACTORY_SCHEME_PATTERN, 33 | s) 34 | 35 | 36 | val S3A_SCHEME_COMMITTER_FACTORY: String = factoryForSchema("s3a") 37 | val STAGING_PACKAGE = "org.apache.hadoop.fs.s3a.commit.staging." 38 | val S3A_COMMITTER_FACTORY: String = CommitConstants.S3A_COMMITTER_FACTORY 39 | 40 | val S3A_COMMITTER_NAME: String = CommitConstants.FS_S3A_COMMITTER_NAME 41 | 42 | val MAGIC = "magic" 43 | val STAGING = "staging" 44 | val DIRECTORY = "directory" 45 | val PARTITIONED = "partitioned" 46 | val MANIFEST = "manifest" 47 | val MANIFEST_ABFS = "manifest_abfs" 48 | val FILE = "file" 49 | 50 | val S3A_CONFLICT_MODE: String = 51 | CommitConstants.FS_S3A_COMMITTER_STAGING_CONFLICT_MODE 52 | 53 | /** Conflict mode */ 54 | val CONFLICT_MODE_FAIL: String = "fail" 55 | 56 | val CONFLICT_MODE_APPEND: String = "append" 57 | 58 | val CONFLICT_MODE_REPLACE: String = "replace" 59 | 60 | /** 61 | * Maps each committer name to its CommitterInfo: the name recorded in _SUCCESS 62 | * and the factory classname. 63 | * If the name field is "", the committer doesn't put its name into 64 | * the success file (or the file isn't actually created). 65 | */ 66 | val COMMITTERS_BY_NAME: Map[String, CommitterInfo] = Map( 67 | MAGIC -> CommitterInfo(MAGIC, S3A_COMMITTER_FACTORY), 68 | STAGING -> CommitterInfo(STAGING, S3A_COMMITTER_FACTORY), 69 | DIRECTORY -> CommitterInfo(DIRECTORY, S3A_COMMITTER_FACTORY), 70 | PARTITIONED -> CommitterInfo(PARTITIONED, S3A_COMMITTER_FACTORY), 71 | MANIFEST -> CommitterInfo(MANIFEST_COMMITTER_NAME, 72 | MANIFEST_COMMITTER_FACTORY), 73 | MANIFEST_ABFS -> CommitterInfo(MANIFEST_COMMITTER_NAME, 74 | ABFS_MANIFEST_COMMITTER_FACTORY), 75 | FILE -> CommitterInfo("", DEFAULT_COMMITTER_FACTORY) 76 | ) 77 | 78 | } 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/common/CloudSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.common 19 | 20 | import java.io.{File, FileNotFoundException} 21 | 22 | import com.cloudera.spark.cloud.s3.S3AConstants 23 | import com.cloudera.spark.cloud.CommitterBinding 24 | import org.apache.hadoop.conf.Configuration 25 | import org.scalatest.concurrent.Eventually 26 | import org.scalatest.BeforeAndAfter 27 | 28 | import org.apache.spark.{LocalSparkContext, SparkFunSuite} 29 | import org.apache.spark.internal.Logging 30 | 31 | /** 32 | * A cloud suite.
33 | * Adds automatic loading of a Hadoop configuration file with login credentials and 34 | * options to enable/disable tests, and a mechanism to conditionally declare tests 35 | * based on these details. 36 | */ 37 | abstract class CloudSuite extends ContextFreeCloudSuite 38 | with LocalSparkContext { 39 | } 40 | 41 | object CloudSuite extends Logging with S3AConstants 42 | with CloudSuiteTrait { 43 | 44 | private var configLogged = false 45 | 46 | /** 47 | * Load the configuration file from the system property `SYSPROP_CLOUD_TEST_CONFIGURATION_FILE`. 48 | * Throws FileNotFoundException if a configuration is named but not present. 49 | * @return the configuration 50 | */ 51 | def loadConfiguration(): Configuration = { 52 | val config = new Configuration(true) 53 | getKnownSysprop(SYSPROP_CLOUD_TEST_CONFIGURATION_FILE).foreach { filename => 54 | logDebug(s"Configuration property = `$filename`") 55 | val f = new File(filename) 56 | if (f.exists()) { 57 | // unsynced, but it's only a log statement 58 | if (!configLogged) { 59 | configLogged = true 60 | logInfo(s"Loading configuration from $f") 61 | } 62 | config.addResource(f.toURI.toURL) 63 | } else { 64 | throw new FileNotFoundException(s"No file '$filename'" + 65 | s" declared in property $SYSPROP_CLOUD_TEST_CONFIGURATION_FILE") 66 | } 67 | } 68 | overlayConfiguration( 69 | config, 70 | Seq( 71 | HIVE_TESTS_DISABLED, 72 | REQUIRED_HADOOP_VERSION, 73 | SCALE_TEST_ENABLED, 74 | SCALE_TEST_SIZE_FACTOR, 75 | S3A_CLIENT_FACTORY_IMPL, 76 | S3A_COMMITTER_TEST_ENABLED, 77 | S3A_ENCRYPTION_KEY_1, 78 | S3A_ENCRYPTION_KEY_2 79 | ) 80 | ) 81 | 82 | // setup the committer from any property passed in 83 | getKnownSysprop(S3A_COMMITTER_NAME).foreach(committer => { 84 | val binding = CommitterBinding.COMMITTERS_BY_NAME(committer.toLowerCase()) 85 | binding.bind(config) 86 | logInfo(s"Using committer binding $binding") 87 | }) 88 | config 89 | } 90 | 91 | } 92 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/S3AEncryptionSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3 19 | 20 | import com.cloudera.spark.cloud.common.CloudSuite 21 | import org.apache.hadoop.conf.Configuration 22 | import org.apache.hadoop.fs._ 23 | 24 | /** 25 | * A suite of tests working with encryption. 26 | * Needs multiple encryption keys to work with.
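 * The suite only runs when both encryption keys are present in the test
 * configuration; see `enabled` below.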
27 | */ 28 | class S3AEncryptionSuite extends CloudSuite with S3ATestSetup { 29 | 30 | override def enabled: Boolean = { 31 | val conf = getConf 32 | super.enabled && hasConf(conf, S3A_ENCRYPTION_KEY_1) && 33 | hasConf(conf, S3A_ENCRYPTION_KEY_2) 34 | } 35 | 36 | init() 37 | 38 | def init(): Unit = { 39 | if (enabled) { 40 | initFS() 41 | } 42 | } 43 | 44 | override def setupFilesystemConfiguration(config: Configuration): Unit = { 45 | super.setupFilesystemConfiguration(config) 46 | config.set(SERVER_SIDE_ENCRYPTION_ALGORITHM, SSE_KMS) 47 | config.set(SERVER_SIDE_ENCRYPTION_KEY, config.getTrimmed(S3A_ENCRYPTION_KEY_1)) 48 | } 49 | 50 | /** 51 | * Create an FS with key2. 52 | */ 53 | def createKey2FS(): FileSystem = { 54 | val config = getConf 55 | config.set(SERVER_SIDE_ENCRYPTION_ALGORITHM, SSE_KMS) 56 | config.set(SERVER_SIDE_ENCRYPTION_KEY, config.getTrimmed(S3A_ENCRYPTION_KEY_2)) 57 | FileSystem.newInstance(filesystemURI, config) 58 | } 59 | 60 | /** 61 | * Create an FS with no encryption settings. 62 | */ 63 | def createUnencryptedFS(): FileSystem = { 64 | val config = getConf 65 | config.unset(SERVER_SIDE_ENCRYPTION_ALGORITHM) 66 | FileSystem.newInstance(filesystemURI, config) 67 | } 68 | 69 | ctest("TwoKeys", "read and write with two keys") { 70 | val key1 = filesystem.getConf.get(SERVER_SIDE_ENCRYPTION_KEY) 71 | logInfo(s"Test key 1 = $key1") 72 | 73 | val dir = path("TwoKeys") 74 | val key1File = new Path(dir, "key1") 75 | val hello: String = "hello" 76 | write(filesystem, key1File, hello) 77 | 78 | val fs2 = createKey2FS() 79 | val key2 = fs2.getConf.get(SERVER_SIDE_ENCRYPTION_KEY) 80 | logInfo(s"Test key 2 = $key2") 81 | assert(key1 != key2, "same key is used for both filesystems") 82 | 83 | val status = fs2.getFileStatus(key1File) 84 | assert(hello.length === status.getLen, s"wrong length in $status") 85 | 86 | fs2.listStatus(dir) 87 | val data = read(fs2, key1File, 128) 88 | assert(hello.length === data.length) 89 | assert(hello === data) 90 | 91 | val unencryptedFS = createUnencryptedFS() 92 | val dataUnencrypted = read(unencryptedFS, key1File, 128) 93 | assert(hello === dataUnencrypted) 94 | 95 | unencryptedFS.delete(key1File, false) 96 | fs2.delete(dir, true) 97 | 98 | } 99 | 100 | 101 | } 102 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/S3ALineCountWritebackSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3 19 | 20 | import scala.concurrent.duration._ 21 | import scala.language.postfixOps 22 | 23 | import com.cloudera.spark.cloud.common.CloudSuiteWithCSVDatasource 24 | import org.apache.hadoop.fs.{FileStatus, Path} 25 | 26 | /** 27 | * Test the `S3LineCount` entry point. 28 | */ 29 | class S3ALineCountWritebackSuite extends CloudSuiteWithCSVDatasource with S3ATestSetup { 30 | 31 | init() 32 | 33 | def init(): Unit = { 34 | // propagate S3 credentials 35 | if (enabled) { 36 | initFS() 37 | } 38 | } 39 | 40 | override def enabled: Boolean = super.enabled && hasCSVTestFile 41 | 42 | override def cleanFSInTeardownEnabled: Boolean = true 43 | 44 | after { 45 | cleanFilesystemInTeardown() 46 | } 47 | 48 | ctest("LineCountWriteback", 49 | "Execute the LineCount example with the results written back to the test filesystem.") { 50 | val sourceFile = getTestCSVPath() 51 | val sourceFS = sourceFile.getFileSystem(getConf) 52 | val sourceInfo = sourceFS.getFileStatus(sourceFile) 53 | val sparkConf = newSparkConf() 54 | sparkConf.setAppName("LineCount") 55 | val destDir = testPath(filesystem, "LineCountWriteback") 56 | assert(0 === S3ALineCount.action(sparkConf, 57 | Array(sourceFile.toString, destDir.toString))) 58 | 59 | 60 | val status = filesystem.getFileStatus(destDir) 61 | assert(status.isDirectory, s"Not a directory: $status") 62 | 63 | // only a small fraction of the source data is needed 64 | val expectedLen = sourceInfo.getLen / 1024 65 | 66 | def validateChildSize(qualifier: String, files: Seq[FileStatus]) = { 67 | val (filenames, size) = enumFileSize(destDir, files) 68 | logInfo(s"total size of $qualifier = $size bytes from ${files.length} files: $filenames") 69 | assert(size >= expectedLen, s"$qualifier size $size in files $filenames" + 70 | s" smaller than expected length $expectedLen") 71 | } 72 | 73 | val stdInterval = interval(100 milliseconds) 74 | eventually(timeout(20 seconds), stdInterval) { 75 | validateChildSize("descendants", 76 | listFiles(filesystem, destDir, true) 77 | .filter(f => f.getPath.getName != "_SUCCESS")) 78 | 79 | validateChildSize("children", 80 | filesystem.listStatus(destDir, 81 | pathFilter(p => p.getName != "_SUCCESS")).toSeq) 82 | } 83 | } 84 | 85 | private def enumFileSize(destDir: Path, files: Seq[FileStatus]): (String, Long) = { 86 | assert(files.nonEmpty, s"No files in destination directory $destDir") 87 | var size = 0L 88 | val filenames = new StringBuffer() 89 | files.foreach { f => 90 | size += f.getLen 91 | filenames.append(" ").append(f.getPath) 92 | } 93 | (filenames.toString, size) 94 | } 95 | 96 | } 97 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/examples/AzureStreamingExample.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License.
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.examples 19 | 20 | import com.cloudera.spark.cloud.ObjectStoreExample 21 | import org.apache.hadoop.fs.Path 22 | 23 | import org.apache.spark.SparkConf 24 | import org.apache.spark.streaming.{Seconds, StreamingContext} 25 | 26 | /** 27 | * Simple example of streaming on Azure. 28 | */ 29 | class AzureStreamingExample extends ObjectStoreExample { 30 | 31 | /** 32 | * List of the command args for the current example. 33 | * @return a string 34 | */ 35 | override protected def usageArgs(): String = { 36 | "<dest> <delay> <interval>" 37 | } 38 | 39 | /** 40 | * Action to execute. 41 | * 42 | * @param sparkConf configuration to use 43 | * @param args argument array 44 | * @return an exit code 45 | */ 46 | override def action( 47 | sparkConf: SparkConf, 48 | args: Array[String]): Int = { 49 | if (args.length != 3) { 50 | return usage() 51 | } 52 | sparkConf.setAppName("CloudStreaming") 53 | applyObjectStoreConfigurationOptions(sparkConf, false) 54 | val dest = args(0) 55 | val delay = Integer.valueOf(args(1)) 56 | val interval = Integer.valueOf(args(2)) 57 | 58 | // Create the context, with the batch interval taken from the command line 59 | val streaming = new StreamingContext(sparkConf, Seconds(interval.toLong)) 60 | 61 | try { 62 | // Create the FileInputDStream on the directory regexp and use the 63 | // stream to look for a new file renamed into it 64 | val destPath = new Path(dest) 65 | val sc = streaming.sparkContext 66 | val hc = sc.hadoopConfiguration 67 | 68 | val fs = destPath.getFileSystem(hc) 69 | rm(fs, destPath) 70 | fs.mkdirs(destPath) 71 | 72 | val sightings = sc.longAccumulator("sightings") 73 | 74 | println("===================================") 75 | println(s"Looking for text files under ${destPath}") 76 | println("===================================") 77 | 78 | val lines = streaming.textFileStream(dest) 79 | 80 | val matches = lines.map(line => { 81 | sightings.add(1) 82 | println(s"[${sightings.value}]: $line") 83 | line 84 | }) 85 | 86 | // materialize the operation 87 | matches.print() 88 | 89 | // start the streaming 90 | streaming.start() 91 | 92 | // sleep a bit to get streaming up and running 93 | Thread.sleep(delay * 1000) 94 | println("===================================") 95 | println(s"Seen ${sightings.value} lines") 96 | 0 97 | } finally { 98 | streaming.stop(true) 99 | } 100 | } 101 | 102 | } 103 | 104 | object AzureStreamingExample { 105 | 106 | def main(args: Array[String]) { 107 | new AzureStreamingExample().run(args) 108 | } 109 | } 110 | 111 | 112 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/common/SeekReadTests.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.common 19 | 20 | import org.apache.hadoop.fs.FileSystem 21 | 22 | /** 23 | * Tests reading in the CSV file using sequential and Random IO. 24 | */ 25 | class SeekReadTests extends CloudSuiteWithCSVDatasource { 26 | 27 | override def enabled: Boolean = super.enabled && hasCSVTestFile 28 | 29 | 30 | ctest("SeekReadFully", 31 | """Assess cost of seek and read operations. 32 | | When moving the cursor in an input stream, an HTTP connection may be closed and 33 | | then re-opened. This can be very expensive; tactics like streaming forwards instead 34 | | of seeking, and/or postponing movement until the following read ('lazy seek') try 35 | | to address this. Logging these operation times helps track performance. 36 | | This test also tries to catch out a regression, where a `close()` operation 37 | | is implemented through reading through the entire input stream. This is exhibited 38 | | in the time to `close()` while at offset 0 being `O(len(file))`. 39 | | 40 | | Note also the cost of `readFully()`; this method call is common inside libraries 41 | | like Orc and Parquet.""".stripMargin) { 42 | val (source, fs) = getCSVSourceAndFileSystem() 43 | FileSystem.clearStatistics() 44 | fs.getStorageStatistics.reset() 45 | val st = logDuration("stat") { 46 | fs.getFileStatus(source) 47 | } 48 | val in = logDuration("open") { 49 | fs.open(source) 50 | } 51 | def time[T](operation: String)(testFun: => T): T = { 52 | logInfo("") 53 | val r = logDuration(operation + s" [pos = ${in.getPos}]")(testFun) 54 | logInfo(s" ${in.getWrappedStream}") 55 | r 56 | } 57 | 58 | val eof = st.getLen 59 | 60 | time("read()") { 61 | assert(-1 !== in.read()) 62 | } 63 | time("seek(256)") { 64 | in.seek(256) 65 | } 66 | time("seek(256)") { 67 | in.seek(256) 68 | } 69 | time("seek(EOF-2)") { 70 | in.seek(eof - 2) 71 | } 72 | time("read()") { 73 | assert(-1 !== in.read()) 74 | } 75 | 76 | def readFully(offset: Long, len: Int): Unit = { 77 | time(s"readFully($offset, byte[$len])") { 78 | val bytes = new Array[Byte](len) 79 | in.readFully(offset, bytes) // reads the full range or throws EOFException 80 | } 81 | } 82 | readFully(1L, 1) 83 | readFully(1L, 256) 84 | readFully(eof - 350, 300) 85 | readFully(260L, 256) 86 | readFully(1024L, 256) 87 | readFully(1536L, 256) 88 | readFully(8192L, 1024) 89 | readFully(8192L + 1024 + 512, 1024) 90 | readFully(0L, 1024) 91 | readFully(eof - 1024, 1024) 92 | 93 | time("seek(getPos)") { 94 | in.seek(in.getPos()) 95 | } 96 | time("read()") { 97 | assert(-1 !== in.read()) 98 | } 99 | logDuration("close()") { 100 | in.close() 101 | } 102 | dumpFileSystemStatistics(fs.getStorageStatistics) 103 | 104 | } 105 | 106 | } 107 | --------------------------------------------------------------------------------