├── .gitallowed
├── .gitignore
├── cloud-examples
└── src
│ ├── test
│ ├── scala
│ │ ├── com
│ │ │ └── cloudera
│ │ │ │ └── spark
│ │ │ │ └── cloud
│ │ │ │ ├── s3
│ │ │ │ ├── S3ASeekReadNormalIOSuite.scala
│ │ │ │ ├── TestParquetBinding.scala
│ │ │ │ ├── S3ASeekReadRandomIOSuite.scala
│ │ │ │ ├── commit
│ │ │ │ │ ├── AbstractS3ACommitterSuite.scala
│ │ │ │ │ ├── S3ACommitterFactorySuite.scala
│ │ │ │ │ ├── Events.scala
│ │ │ │ │ └── S3ACommitDataframeSuite.scala
│ │ │ │ ├── S3ABasicIOSuite.scala
│ │ │ │ ├── S3ANumbersSuite.scala
│ │ │ │ ├── S3AStreamingSuite.scala
│ │ │ │ ├── S3ASeekReadSequentialIOSuite.scala
│ │ │ │ ├── S3ADataFrameSuite.scala
│ │ │ │ ├── S3ALineCountSuite.scala
│ │ │ │ ├── S3ANumbersSuiteV2APISuite.scala
│ │ │ │ ├── S3AFileGeneratorSuite.scala
│ │ │ │ ├── S3DependencyCheckSuite.scala
│ │ │ │ ├── S3ACSVReadSuite.scala
│ │ │ │ ├── S3AEncryptionSuite.scala
│ │ │ │ └── S3ALineCountWritebackSuite.scala
│ │ │ │ ├── gs
│ │ │ │ ├── GsDataFrameSuite.scala
│ │ │ │ ├── GsCSVReadSuite.scala
│ │ │ │ ├── GsBasicIOSuite.scala
│ │ │ │ ├── GsCommitDataframeSuite.scala
│ │ │ │ ├── AbstractGsCommitterSuite.scala
│ │ │ │ └── GSDependencyCheckSuite.scala
│ │ │ │ ├── abfs
│ │ │ │ ├── AbfsBasicIOSuite.scala
│ │ │ │ ├── AbfsDataFrameSuite.scala
│ │ │ │ ├── AbfsCSVReadSuite.scala
│ │ │ │ └── commit
│ │ │ │ │ ├── AbfsCommitDataframeSuite.scala
│ │ │ │ │ └── AbstractAbfsCommitterSuite.scala
│ │ │ │ ├── azure
│ │ │ │ ├── AzureBasicIOSuite.scala
│ │ │ │ ├── AzureStreamingSuite.scala
│ │ │ │ ├── AzureCSVReadSuite.scala
│ │ │ │ ├── AzureSeekReadSuite.scala
│ │ │ │ ├── AzureFileGeneratorSuite.scala
│ │ │ │ ├── AzureLineCountSuite.scala
│ │ │ │ └── AzureDataFrameSuite.scala
│ │ │ │ ├── csv
│ │ │ │ ├── LocalHugeCsvIOSuite.scala
│ │ │ │ └── AbfsHugeCsvIOSuite.scala
│ │ │ │ ├── common
│ │ │ │ ├── StreamingTests.scala
│ │ │ │ ├── CloudSuiteWithCSVDatasource.scala
│ │ │ │ ├── HadoopVersionSuite.scala
│ │ │ │ ├── ReadSample.scala
│ │ │ │ ├── DataFrameTests.scala
│ │ │ │ ├── FileGeneratorTests.scala
│ │ │ │ └── SeekReadTests.scala
│ │ │ │ ├── examples
│ │ │ │ └── S3DataFrameExampleSuite.scala
│ │ │ │ └── committers
│ │ │ │ └── AbstractCommitterSuite.scala
│ │ └── org
│ │ │ └── apache
│ │ │ └── spark
│ │ │ ├── sql
│ │ │ ├── hive
│ │ │ │ └── orc
│ │ │ │ │ ├── gs
│ │ │ │ │ ├── GsParquetPartitionSuite.scala
│ │ │ │ │ ├── GsOrcRelationSuite.scala
│ │ │ │ │ ├── GsParquetRelationSuite.scala
│ │ │ │ │ ├── GsOrcPartitionSuite.scala
│ │ │ │ │ └── GsParquetRelationScaleSuite.scala
│ │ │ │ │ ├── abfs
│ │ │ │ │ ├── AbfsParquetPartitionSuite.scala
│ │ │ │ │ ├── AbfsOrcRelationSuite.scala
│ │ │ │ │ ├── AbfsOrcPartitionSuite.scala
│ │ │ │ │ ├── AbfsParquetRelationSuite.scala
│ │ │ │ │ └── AbfsParquetRelationScaleSuite.scala
│ │ │ │ │ └── cloud
│ │ │ │ │ ├── S3AOrcRelationSuite.scala
│ │ │ │ │ ├── S3AOrcPartitionSuite.scala
│ │ │ │ │ ├── S3AParquetPartitionSuite.scala
│ │ │ │ │ ├── S3AParquetRelationSuite.scala
│ │ │ │ │ ├── S3AParquetRelationScaleSuite.scala
│ │ │ │ │ └── S3AOrcRelationScaleSuite.scala
│ │ │ └── sources
│ │ │ │ ├── MustDeclareDatasource.scala
│ │ │ │ ├── ParquetRelationTrait.scala
│ │ │ │ ├── AbtractOrcRelationSuite.scala
│ │ │ │ ├── CloudPartitionTest.scala
│ │ │ │ └── HiveTestTrait.scala
│ │ │ └── SparkScopeWorkarounds.scala
│ └── resources
│ │ ├── core-site.xml
│ │ └── log4j2.properties
│ └── main
│ ├── scala
│ ├── com
│ │ └── cloudera
│ │ │ └── spark
│ │ │ └── cloud
│ │ │ ├── s3
│ │ │ ├── NormalIOPolicy.scala
│ │ │ ├── SequentialIOPolicy.scala
│ │ │ ├── IOPolicy.scala
│ │ │ ├── RandomIOPolicy.scala
│ │ │ ├── S3AFileGenerator.scala
│ │ │ ├── S3ALineCount.scala
│ │ │ ├── S3AStreaming.scala
│ │ │ ├── S3ADataFrames.scala
│ │ │ ├── S3AExampleSetup.scala
│ │ │ └── S3ATestSetup.scala
│ │ │ ├── utils
│ │ │ ├── Demo.scala
│ │ │ ├── ForceRecentHadoopVersion.scala
│ │ │ ├── ExtraAssertions.scala
│ │ │ └── StatisticsTracker.scala
│ │ │ ├── adl
│ │ │ └── AdlTestSetup.scala
│ │ │ ├── azure
│ │ │ └── AzureTestSetup.scala
│ │ │ ├── gs
│ │ │ └── GsTestSetup.scala
│ │ │ ├── abfs
│ │ │ └── AbfsTestSetup.scala
│ │ │ ├── local
│ │ │ └── LocalTestSetup.scala
│ │ │ ├── common
│ │ │ ├── CsvDatasourceSupport.scala
│ │ │ ├── ContextFreeCloudSuite.scala
│ │ │ ├── StoreTestHelper.scala
│ │ │ ├── StoreTestOperations.scala
│ │ │ └── CloudSuite.scala
│ │ │ └── examples
│ │ │ └── AzureStreamingExample.scala
│ └── org
│ │ └── apache
│ │ └── hadoop
│ │ └── fs
│ │ └── FSHelper.scala
│ ├── site
│ └── using.md
│ └── resources
│ └── log4j.properties
├── spark-cloud-integration
└── src
│ └── main
│ ├── scala
│ ├── org
│ │ └── apache
│ │ │ └── spark
│ │ │ └── cloudera
│ │ │ ├── package.scala
│ │ │ └── statistics
│ │ │ ├── IOStatisticsAccumulator.scala
│ │ │ └── IOStatisticsCollectorExecutorPlugin.scala
│ └── com
│ │ └── cloudera
│ │ └── spark
│ │ └── cloud
│ │ ├── test
│ │ └── UnitTestSuite.scala
│ │ ├── s3
│ │ └── audit
│ │ │ ├── ServerLogEntry.scala
│ │ │ └── LogParser.scala
│ │ ├── utils
│ │ ├── IntegrationUtils.scala
│ │ ├── ConfigSerDeser.scala
│ │ └── HConf.scala
│ │ ├── CommitterInfo.scala
│ │ └── CommitterBinding.scala
│ └── site
│ └── markdown
│ └── integration.md
├── README.md
└── .travis.yml

/.gitallowed:
--------------------------------------------------------------------------------
# serialization
\-[0-9]+L
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
target/
pom.xml.tag
pom.xml.releaseBackup
pom.xml.versionsBackup
pom.xml.next
release.properties
dependency-reduced-pom.xml
buildNumber.properties
.mvn/timing.properties
cloud.xml
cloud-examples/metastore_db
cloud-examples/derby.log
cloud-examples/spark-warehouse
cloud-examples/src/scripts
spark-snapshot
*.iws
*.ipr

/cloud-examples/build.properties
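The regex in .gitallowed reads as an allowlist entry for a secret scanner (tools such as git-secrets consume a file of this name; the exact consumer is an assumption here): it marks negative long literals, which otherwise resemble leaked numeric tokens, as safe. The "# serialization" comment points at serialization IDs, for example:

// Hypothetical illustration: a serialization ID whose negative long
// literal is what the "\-[0-9]+L" allowlist pattern matches.
@SerialVersionUID(-3387691324804251L)
class VersionedRecord extends Serializable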
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/S3ASeekReadNormalIOSuite.scala:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.cloudera.spark.cloud.s3

class S3ASeekReadNormalIOSuite extends S3ASeekReadSequentialIOSuite {

  override def inputPolicy: String = NORMAL_IO
}
--------------------------------------------------------------------------------
/cloud-examples/src/main/scala/com/cloudera/spark/cloud/s3/NormalIOPolicy.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.s3

trait NormalIOPolicy extends IOPolicy {

  /**
   * Use the "normal" adaptive IO policy.
   *
   * @return the IO type
   */
  override def inputPolicy: String = NORMAL_IO

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/TestParquetBinding.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.s3

import com.cloudera.spark.cloud.test.UnitTestSuite

/**
 * Look at what the Parquet committer binding is up to.
 */
class TestParquetBinding extends UnitTestSuite {

}
--------------------------------------------------------------------------------
/cloud-examples/src/main/scala/com/cloudera/spark/cloud/s3/SequentialIOPolicy.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.s3

trait SequentialIOPolicy extends IOPolicy {

  /**
   * Use the original sequential IO.
   *
   * @return the IO type
   */
  override def inputPolicy: String = SEQUENTIAL_IO

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/S3ASeekReadRandomIOSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.s3

/**
 * Subclass of `S3ASeekReadSequentialIOSuite` with random IO turned on.
 */
class S3ASeekReadRandomIOSuite extends S3ASeekReadSequentialIOSuite {

  override def inputPolicy: String = RANDOM_IO
}
--------------------------------------------------------------------------------
/cloud-examples/src/main/scala/com/cloudera/spark/cloud/s3/IOPolicy.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.s3

/**
 * Declares which S3A input policy a suite or example requests.
 */
trait IOPolicy extends S3AConstants {

  /**
   * The input policy to request.
   *
   * @return the IO type
   */
  def inputPolicy: String

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/gs/GsParquetPartitionSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package org.apache.spark.sql.hive.orc.gs

import org.apache.spark.sql.sources.ParquetRelationTrait

/**
 * Partitioned queries with Parquet data against GS.
 */
class GsParquetPartitionSuite extends GsOrcPartitionSuite with
  ParquetRelationTrait {

}
--------------------------------------------------------------------------------
/cloud-examples/src/main/scala/com/cloudera/spark/cloud/s3/RandomIOPolicy.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.s3

/**
 * Switch to random S3A IO.
 */
trait RandomIOPolicy extends IOPolicy {

  /**
   * Use random IO for high-performance ORC reads.
   *
   * @return the IO type
   */
  override def inputPolicy: String = RANDOM_IO

}
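The IOPolicy traits above only name a policy; the NORMAL_IO, SEQUENTIAL_IO and RANDOM_IO constants live in S3AConstants, which is not part of this listing. As a minimal sketch, assuming those constants carry the standard S3A fadvise values ("normal", "sequential", "random"), a consumer could push the chosen policy into a Hadoop configuration like this:

import org.apache.hadoop.conf.Configuration

import com.cloudera.spark.cloud.s3.IOPolicy

object IOPolicyBindingSketch {

  /**
   * Apply a policy's input strategy to a Hadoop configuration.
   * `fs.s3a.experimental.input.fadvise` is the S3A option selecting the
   * read strategy of its input streams.
   */
  def bind(conf: Configuration, policy: IOPolicy): Configuration = {
    conf.set("fs.s3a.experimental.input.fadvise", policy.inputPolicy)
    conf
  }
}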
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/commit/AbstractS3ACommitterSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.s3.commit

import com.cloudera.spark.cloud.committers.AbstractCommitterSuite
import com.cloudera.spark.cloud.s3.S3ATestSetup

abstract class AbstractS3ACommitterSuite
  extends AbstractCommitterSuite with S3ATestSetup {

}
--------------------------------------------------------------------------------
/spark-cloud-integration/src/main/scala/org/apache/spark/cloudera/package.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package org.apache.spark

/**
 * Package for things which need to get at Spark private structures.
 *
 * These have to be viewed as unstable; if something breaks due to a Spark
 * change, that has to be accepted as inevitable.
 */
package object cloudera {

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/abfs/AbfsParquetPartitionSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package org.apache.spark.sql.hive.orc.abfs

import org.apache.spark.sql.sources.ParquetRelationTrait

/**
 * Partitioned queries with Parquet data against ABFS.
 */
class AbfsParquetPartitionSuite extends AbfsOrcPartitionSuite with
  ParquetRelationTrait {

}
--------------------------------------------------------------------------------
/cloud-examples/src/main/scala/com/cloudera/spark/cloud/s3/S3AFileGenerator.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.s3

import com.cloudera.spark.cloud.operations.CloudFileGenerator

/**
 * Generate a file containing some numbers in the remote store.
 */
object S3AFileGenerator extends CloudFileGenerator with S3AExampleSetup
  with SequentialIOPolicy {

}
--------------------------------------------------------------------------------
/cloud-examples/src/main/scala/com/cloudera/spark/cloud/utils/Demo.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.utils

import com.github.lalyos.jfiglet.FigletFont

object Demo {

  /**
   * Uses figlet to render to a string.
   * See: https://github.com/lalyos/jfiglet
   */
  def text(m: String): String = {
    "\n" + FigletFont.convertOneLine(m)
  }

}
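Demo.text is a thin wrapper around jfiglet's FigletFont.convertOneLine; a one-line usage sketch:

// Print "spark" as an ASCII-art banner, e.g. at the start of a test run.
println(Demo.text("spark"))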
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/gs/GsDataFrameSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.gs

import com.cloudera.spark.cloud.common.DataFrameTests

/**
 * Test GS and DataFrames.
 */
class GsDataFrameSuite extends DataFrameTests with GsTestSetup {

  init()

  def init(): Unit = {
    if (enabled) {
      initFS()
    }
  }

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/abfs/AbfsBasicIOSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.abfs

import com.cloudera.spark.cloud.common.BasicIOTests

/**
 * ABFS basic IO operations.
 */
class AbfsBasicIOSuite extends BasicIOTests with AbfsTestSetup {

  init()

  def init(): Unit = {
    if (enabled) {
      initFS()
    }
  }

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/azure/AzureBasicIOSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.azure

import com.cloudera.spark.cloud.common.BasicIOTests

/**
 * Azure's basic IO operations.
 */
class AzureBasicIOSuite extends BasicIOTests with AzureTestSetup {

  init()

  def init(): Unit = {
    if (enabled) {
      initFS()
    }
  }

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/abfs/AbfsDataFrameSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.abfs

import com.cloudera.spark.cloud.common.DataFrameTests

/**
 * Test ABFS and DataFrames.
 */
class AbfsDataFrameSuite extends DataFrameTests with AbfsTestSetup {

  init()

  def init(): Unit = {
    if (enabled) {
      initFS()
    }
  }

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/gs/GsCSVReadSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.gs

import com.cloudera.spark.cloud.common.CSVReadTests

class GsCSVReadSuite extends CSVReadTests with GsTestSetup {
  init()

  /**
   * set up FS if enabled.
   */
  def init(): Unit = {
    if (enabled) {
      initFS()
      initDatasources()
    }
  }
}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/azure/AzureStreamingSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.azure

import com.cloudera.spark.cloud.common.StreamingTests

/**
 * Test Streaming under Azure.
 */
class AzureStreamingSuite extends StreamingTests with AzureTestSetup {

  init()

  def init(): Unit = {
    if (enabled) {
      initFS()
    }
  }

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/abfs/AbfsCSVReadSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.abfs

import com.cloudera.spark.cloud.common.CSVReadTests

class AbfsCSVReadSuite extends CSVReadTests with AbfsTestSetup {
  init()

  /**
   * set up FS if enabled.
   */
  def init(): Unit = {
    if (enabled) {
      initFS()
      initDatasources()
    }
  }
}
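Each suite above repeats the same guard: run initFS() (and, for the read suites, initDatasources()) only when the store is enabled. The *TestSetup traits themselves are not in this listing; the following is a speculative reconstruction of the contract they appear to provide, with every member name inferred purely from the call sites:

// Hypothetical sketch only: the real traits (S3ATestSetup, AbfsTestSetup,
// GsTestSetup, AzureTestSetup, LocalTestSetup) are defined elsewhere.
trait StoreTestSetupSketch {

  /** True when configuration/credentials for this store were supplied. */
  def enabled: Boolean

  /** Create and cache the filesystem under test. */
  def initFS(): Unit

  /** Locate the CSV test data used by the read suites. */
  def initDatasources(): Unit
}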
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/azure/AzureCSVReadSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.azure

import com.cloudera.spark.cloud.common.CSVReadTests

class AzureCSVReadSuite extends CSVReadTests with AzureTestSetup {
  init()

  /**
   * set up FS if enabled.
   */
  def init(): Unit = {
    if (enabled) {
      initFS()
      initDatasources()
    }
  }
}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/azure/AzureSeekReadSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.azure

import com.cloudera.spark.cloud.common.SeekReadTests

class AzureSeekReadSuite extends SeekReadTests with AzureTestSetup {
  init()

  /**
   * set up FS if enabled.
   */
  def init(): Unit = {
    if (enabled) {
      initFS()
      initDatasources()
    }
  }
}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/S3ABasicIOSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.s3

import com.cloudera.spark.cloud.common.BasicIOTests

/**
 * Basic S3A IO Tests.
 */
class S3ABasicIOSuite extends BasicIOTests with S3ATestSetup {

  init()

  def init(): Unit = {
    // propagate S3 credentials
    if (enabled) {
      initFS()
    }
  }

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/gs/GsBasicIOSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.gs

import com.cloudera.spark.cloud.common.BasicIOTests

/**
 * GS's basic IO operations.
 */
class GsBasicIOSuite extends BasicIOTests with GsTestSetup {

  init()

  def init(): Unit = {
    if (enabled) {
      initFS()
    } else {
      log.info("suite is not enabled")
    }
  }

}
--------------------------------------------------------------------------------
/spark-cloud-integration/src/main/scala/com/cloudera/spark/cloud/test/UnitTestSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.test

import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.must.Matchers

import org.apache.spark.internal.Logging

/**
 * Base class for test suites.
 * Added because scalatest imports are too brittle to use.
 */
class UnitTestSuite extends AnyFunSuite with Logging with Matchers {

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/S3ANumbersSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.s3

import com.cloudera.spark.cloud.common.NumbersRddTests

class S3ANumbersSuite extends NumbersRddTests with S3ATestSetup {
  init()

  def init(): Unit = {
    // propagate S3 credentials
    if (enabled) {
      initFS()
    }
  }

  override protected def pathname = {
    "s3a_numbers_suite"
  }

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/gs/GsOrcRelationSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package org.apache.spark.sql.hive.orc.gs

import com.cloudera.spark.cloud.gs.GsTestSetup

import org.apache.spark.sql.sources.AbtractOrcRelationSuite

class GsOrcRelationSuite extends AbtractOrcRelationSuite with GsTestSetup {

  init()

  def init(): Unit = {
    // propagate credentials
    if (enabled) {
      initFS()
    }
  }

}
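UnitTestSuite, shown earlier in this listing, bundles AnyFunSuite, must-Matchers and Spark's Logging; a minimal hypothetical suite built on it:

package com.cloudera.spark.cloud.test

// Hypothetical example: everything except UnitTestSuite itself is invented.
class ExampleUnitSuite extends UnitTestSuite {

  test("string matchers are available") {
    val banner = "hello"
    banner must not be empty
    logInfo(s"checked banner of length ${banner.length}")
  }
}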
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/csv/LocalHugeCsvIOSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.csv

import com.cloudera.spark.cloud.local.LocalTestSetup

/**
 * Local CSV tests to act as a baseline for performance/correctness;
 * always runs.
 */
class LocalHugeCsvIOSuite extends AbstractHugeCsvIOSuite with LocalTestSetup {

  init()

  /**
   * set up FS.
   */
  def init(): Unit = {
    initFS()
  }

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/abfs/AbfsOrcRelationSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package org.apache.spark.sql.hive.orc.abfs

import com.cloudera.spark.cloud.abfs.AbfsTestSetup

import org.apache.spark.sql.sources.AbtractOrcRelationSuite

class AbfsOrcRelationSuite extends AbtractOrcRelationSuite with AbfsTestSetup {

  init()

  def init(): Unit = {
    // propagate credentials
    if (enabled) {
      initFS()
    }
  }

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/org/apache/spark/SparkScopeWorkarounds.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package org.apache.spark

import org.apache.spark.sql.hive.HiveUtils

/**
 * Here to get at useful stuff that Spark keeps private but which turns out
 * to be invaluable during testing.
 *
 * Needless to say: things may break here without warning or redress.
 */
object SparkScopeWorkarounds {
  def tempHiveConfig(): Map[String, String] = {
    HiveUtils.newTemporaryConfiguration(true)
  }
}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/gs/GsCommitDataframeSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.gs

import com.cloudera.spark.cloud.committers.AbstractCommitDataframeSuite

class GsCommitDataframeSuite
  extends AbstractCommitDataframeSuite with GsTestSetup {

  init()

  def init(): Unit = {
    if (enabled) {
      initFS()
    }
  }

  override def committers: Seq[String] = Seq("manifest")

  override def schema: String = "gs"

}
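SparkScopeWorkarounds.tempHiveConfig(), a few files up, surfaces HiveUtils.newTemporaryConfiguration; a sketch of one way to apply the resulting map when building a test session (the builder settings here are illustrative, not taken from this repository):

import org.apache.spark.SparkScopeWorkarounds
import org.apache.spark.sql.SparkSession

object TempHiveSessionSketch {

  /** Build a local session whose Hive metastore lives in a temp directory. */
  def session(): SparkSession = {
    val builder = SparkSession.builder()
      .master("local[2]")
      .appName("temp-hive-sketch")
      .enableHiveSupport()
    SparkScopeWorkarounds.tempHiveConfig().foreach {
      case (k, v) => builder.config(k, v)
    }
    builder.getOrCreate()
  }
}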
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/cloud/S3AOrcRelationSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package org.apache.spark.sql.hive.orc.cloud

import com.cloudera.spark.cloud.s3.S3ATestSetup

import org.apache.spark.sql.sources.AbtractOrcRelationSuite

class S3AOrcRelationSuite extends AbtractOrcRelationSuite with S3ATestSetup {

  init()

  def init(): Unit = {
    // propagate S3 credentials
    if (enabled) {
      initFS()
    }
  }

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/azure/AzureFileGeneratorSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.azure

import com.cloudera.spark.cloud.common.FileGeneratorTests

/**
 * Test the `FileGenerator` entry point under Azure.
 */
class AzureFileGeneratorSuite extends FileGeneratorTests with AzureTestSetup {

  init()

  def init(): Unit = {
    if (enabled) {
      initFS()
    }
  }

  after {
    cleanFilesystemInTeardown()
  }

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/cloud/S3AOrcPartitionSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package org.apache.spark.sql.hive.orc.cloud

import com.cloudera.spark.cloud.s3.S3ATestSetup

import org.apache.spark.sql.sources.CloudPartitionTest

class S3AOrcPartitionSuite extends CloudPartitionTest with S3ATestSetup {

  init()

  def init(): Unit = {
    // propagate S3 credentials
    if (enabled) {
      initFS()
    }
  }

  override val dataSourceName: String = "orc"

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/cloud/S3AParquetPartitionSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package org.apache.spark.sql.hive.orc.cloud

import com.cloudera.spark.cloud.s3.S3ATestSetup

import org.apache.spark.sql.sources.{CloudPartitionTest, ParquetRelationTrait}

class S3AParquetPartitionSuite extends CloudPartitionTest with S3ATestSetup
  with ParquetRelationTrait {

  init()

  def init(): Unit = {
    // propagate S3 credentials
    if (enabled) {
      initFS()
    }
  }

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/gs/GsParquetRelationSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package org.apache.spark.sql.hive.orc.gs

import com.cloudera.spark.cloud.gs.GsTestSetup

import org.apache.spark.sql.sources.{CloudRelationBasicSuite, ParquetRelationTrait}

class GsParquetRelationSuite extends CloudRelationBasicSuite
  with GsTestSetup
  with ParquetRelationTrait {

  init()

  def init(): Unit = {
    // propagate credentials
    if (enabled) {
      initFS()
    }
  }

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/cloud/S3AParquetRelationSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package org.apache.spark.sql.hive.orc.cloud

import com.cloudera.spark.cloud.s3.S3ATestSetup

import org.apache.spark.sql.sources.{CloudRelationBasicSuite, ParquetRelationTrait}

class S3AParquetRelationSuite extends CloudRelationBasicSuite
  with S3ATestSetup
  with ParquetRelationTrait {

  init()

  def init(): Unit = {
    // propagate S3 credentials
    if (enabled) {
      initFS()
    }
  }

}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/S3AStreamingSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.s3

import com.cloudera.spark.cloud.common.StreamingTests
import com.cloudera.spark.cloud.operations.CloudStreaming

/**
 * Test Streaming against S3A.
 */
class S3AStreamingSuite extends StreamingTests with S3ATestSetup {

  init()

  def init(): Unit = {
    // propagate S3 credentials
    if (enabled) {
      initFS()
    }
  }

  override protected val instance: CloudStreaming = S3AStreaming
}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/gs/GsOrcPartitionSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package org.apache.spark.sql.hive.orc.gs

import com.cloudera.spark.cloud.gs.GsTestSetup

import org.apache.spark.sql.sources.CloudPartitionTest

/**
 * Partitioned queries with ORC data against GS.
 */
class GsOrcPartitionSuite extends CloudPartitionTest with GsTestSetup {

  init()

  def init(): Unit = {
    if (enabled) {
      initFS()
    }
  }

  override def dataSourceName(): String = {
    "orc"
  }
}
--------------------------------------------------------------------------------
/cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/S3ASeekReadSequentialIOSuite.scala:
--------------------------------------------------------------------------------
/* Apache License, Version 2.0: ASF header as in the first file above. */

package com.cloudera.spark.cloud.s3

import com.cloudera.spark.cloud.common.SeekReadTests

/**
 * Tests reading in the S3A CSV file using sequential and Random IO.
 */
class S3ASeekReadSequentialIOSuite extends SeekReadTests with S3ATestSetup
  with SequentialIOPolicy {

  init()

  /**
   * set up FS if enabled.
32 | */ 33 | def init(): Unit = { 34 | if (enabled) { 35 | initFS() 36 | initDatasources() 37 | } 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/org/apache/spark/sql/sources/MustDeclareDatasource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.sources 19 | 20 | import org.apache.spark.sql.types.DataType 21 | 22 | /** 23 | * Subclasses must declare their datasource. 24 | */ 25 | trait MustDeclareDatasource { 26 | /** 27 | * Name of the data source: this must be declared. 28 | */ 29 | def dataSourceName(): String 30 | 31 | 32 | /** 33 | * Datatype support predicate. 34 | * 35 | * @param dataType type 36 | * @return true if the type is supported 37 | */ 38 | def supportsDataType( 39 | dataType: DataType): Boolean 40 | } 41 | -------------------------------------------------------------------------------- /spark-cloud-integration/src/main/site/markdown/integration.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | # Integrating the Apache Hadoop S3A Committers with Apache Spark 16 | 17 | This document looks at how to integrate the Hadoop S3A Committers 18 | with Apache Spark; it is intended to apply to any custom `PathOutputCommitter` 19 | implementation. 20 | 21 | 22 | ## Background: Hadoop 23 | 24 | Hadoop has two MapReduce APIs, MRv1 and MRv2 (not to be confused with the v1/v2 commit 25 | algorithms). The MRv1 classes are found under the package `org.apache.hadoop.mapred`; 26 | the MRv2 classes under `org.apache.hadoop.mapreduce`. This is important, as 27 | they often share classnames. 28 | 29 | 30 | 31 | The "original" V1 API shipped in Hadoop 1; the newer V2 API came in Hadoop 2. 32 | Spark's `RDD.saveAsTextFile()` uses the MRv2 APIs to write data. 33 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/S3ADataFrameSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License.
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3 19 | 20 | import com.cloudera.spark.cloud.common.DataFrameTests 21 | import com.cloudera.spark.cloud.operations.CloudDataFrames 22 | 23 | /** 24 | * Test the [S3DataFrames] logic. 25 | */ 26 | class S3ADataFrameSuite extends DataFrameTests with S3ATestSetup { 27 | 28 | init() 29 | 30 | def init(): Unit = { 31 | // propagate S3 credentials 32 | if (enabled) { 33 | initFS() 34 | } 35 | } 36 | 37 | override protected val instance: CloudDataFrames = S3ADataFrames 38 | 39 | } 40 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/abfs/AbfsOrcPartitionSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hive.orc.abfs 19 | 20 | import com.cloudera.spark.cloud.abfs.AbfsTestSetup 21 | 22 | import org.apache.spark.sql.sources.CloudPartitionTest 23 | 24 | /** 25 | * Partitioned queries with ORC data against ABFS. 26 | */ 27 | class AbfsOrcPartitionSuite extends CloudPartitionTest with AbfsTestSetup { 28 | 29 | init() 30 | 31 | def init(): Unit = { 32 | if (enabled) { 33 | initFS() 34 | } 35 | } 36 | 37 | override def dataSourceName(): String = { 38 | "orc" 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/gs/GsParquetRelationScaleSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hive.orc.gs 19 | 20 | import com.cloudera.spark.cloud.gs.GsTestSetup 21 | 22 | import org.apache.spark.sql.sources.{CloudRelationScaleTest, ParquetRelationTrait} 23 | 24 | class GsParquetRelationScaleSuite extends CloudRelationScaleTest 25 | with GsTestSetup 26 | with ParquetRelationTrait { 27 | 28 | init() 29 | 30 | def init(): Unit = { 31 | if (enabled) { 32 | initFS() 33 | } 34 | } 35 | 36 | override def enabled: Boolean = super.enabled && isScaleTestEnabled 37 | 38 | } 39 | -------------------------------------------------------------------------------- /cloud-examples/src/test/resources/core-site.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0"?> 2 | 3 | 20 | 24 | <configuration> 25 | 26 | <property> 27 | <name>fs.s3a.committer.name</name> 28 | <value>directory</value> 29 | <description> 30 | Committer to create for output to S3A, one of: 31 | "file", "directory", "partitioned", "magic". 32 | </description> 33 | </property> 34 | 35 | </configuration> -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/abfs/AbfsParquetRelationSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hive.orc.abfs 19 | 20 | import com.cloudera.spark.cloud.abfs.AbfsTestSetup 21 | 22 | import org.apache.spark.sql.sources.{CloudRelationBasicSuite, ParquetRelationTrait} 23 | 24 | class AbfsParquetRelationSuite extends CloudRelationBasicSuite 25 | with AbfsTestSetup 26 | with ParquetRelationTrait { 27 | 28 | init() 29 | 30 | def init(): Unit = { 31 | // propagate credentials 32 | if (enabled) { 33 | initFS() 34 | } 35 | } 36 | override def dynamicPartitioning: Boolean = true 37 | } 38 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/cloud/S3AParquetRelationScaleSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hive.orc.cloud 19 | 20 | import com.cloudera.spark.cloud.s3.S3ATestSetup 21 | 22 | import org.apache.spark.sql.sources.{CloudRelationScaleTest, ParquetRelationTrait} 23 | 24 | class S3AParquetRelationScaleSuite extends CloudRelationScaleTest 25 | with S3ATestSetup 26 | with ParquetRelationTrait { 27 | 28 | init() 29 | 30 | def init(): Unit = { 31 | if (enabled) { 32 | initFS() 33 | } 34 | } 35 | 36 | override def enabled: Boolean = super.enabled && isScaleTestEnabled 37 | 38 | } 39 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/org/apache/spark/sql/sources/ParquetRelationTrait.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.sources 19 | 20 | import org.apache.spark.sql.types.{CalendarIntervalType, DataType, NullType} 21 | 22 | 23 | 24 | trait ParquetRelationTrait extends MustDeclareDatasource { 25 | // Parquet does not play well with NullType. 26 | override def supportsDataType( 27 | dataType: DataType): Boolean = dataType match { 28 | case _: NullType => false 29 | case _: CalendarIntervalType => false 30 | case _ => true 31 | } 32 | 33 | override def dataSourceName(): String = { 34 | "parquet" 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/abfs/commit/AbfsCommitDataframeSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.abfs.commit 19 | 20 | import com.cloudera.spark.cloud.abfs.AbfsTestSetup 21 | import com.cloudera.spark.cloud.committers.AbstractCommitDataframeSuite 22 | 23 | private class AbfsCommitDataframeSuite extends AbstractCommitDataframeSuite 24 | with AbfsTestSetup { 25 | 26 | init() 27 | 28 | def init(): Unit = { 29 | if (enabled) { 30 | initFS() 31 | } 32 | } 33 | 34 | 35 | override def committers: Seq[String] = Seq("manifest") 36 | 37 | 38 | override def schema: String = "abfs" 39 | 40 | } 41 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/abfs/AbfsParquetRelationScaleSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hive.orc.abfs 19 | 20 | import com.cloudera.spark.cloud.abfs.AbfsTestSetup 21 | 22 | import org.apache.spark.sql.sources.{CloudRelationScaleTest, ParquetRelationTrait} 23 | 24 | class AbfsParquetRelationScaleSuite extends CloudRelationScaleTest 25 | with AbfsTestSetup 26 | with ParquetRelationTrait { 27 | 28 | init() 29 | 30 | def init(): Unit = { 31 | if (enabled) { 32 | initFS() 33 | } 34 | } 35 | 36 | override def enabled: Boolean = super.enabled && isScaleTestEnabled 37 | 38 | 39 | 40 | } 41 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/org/apache/hadoop/fs/FSHelper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.hadoop.fs 19 | 20 | import java.io.IOException 21 | import java.net.URI 22 | 23 | import org.apache.hadoop.conf.Configuration 24 | 25 | /** 26 | * Help with testing by accessing package-private methods in FileSystem which 27 | * are designed for aiding testability. They are normally accessed via 28 | * `FileSystemTestHelper`, but as that is in hadoop-common-test JAR, a simple 29 | * object here avoids maven import conflict problems. 30 | */ 31 | object FSHelper { 32 | 33 | @throws[IOException] 34 | def addFileSystemForTesting(uri: URI, conf: Configuration, fs: FileSystem) { 35 | FileSystem.addFileSystemForTesting(uri, conf, fs) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/org/apache/spark/sql/hive/orc/cloud/S3AOrcRelationScaleSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.hive.orc.cloud 19 | 20 | import com.cloudera.spark.cloud.s3.S3ATestSetup 21 | 22 | import org.apache.spark.sql.hive.orc.OrcFileFormat 23 | import org.apache.spark.sql.sources.CloudRelationScaleTest 24 | 25 | class S3AOrcRelationScaleSuite extends CloudRelationScaleTest with S3ATestSetup { 26 | 27 | init() 28 | 29 | def init(): Unit = { 30 | // propagate S3 credentials 31 | if (enabled) { 32 | initFS() 33 | } 34 | } 35 | 36 | override def enabled: Boolean = super.enabled && isScaleTestEnabled 37 | 38 | override val dataSourceName: String = classOf[OrcFileFormat].getCanonicalName 39 | 40 | } 41 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/adl/AdlTestSetup.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.adl 19 | 20 | import java.net.URI 21 | 22 | import com.cloudera.spark.cloud.common.CopyCsvFileTrait 23 | import org.apache.hadoop.fs.FileSystem 24 | 25 | /** 26 | * Trait for ADL tests. 27 | * 28 | * This trait supports the CSV data source by copying over the data from S3A if 29 | * it isn't already at an ADL URL. 30 | */ 31 | trait AdlTestSetup extends CopyCsvFileTrait { 32 | 33 | override def enabled: Boolean = { 34 | getConf.getBoolean(ADL_TESTS_ENABLED, false) 35 | } 36 | 37 | def initFS(): FileSystem = { 38 | val uri = new URI(requiredOption(ADL_TEST_URI)) 39 | logDebug(s"Executing ADL tests against $uri") 40 | createFilesystem(uri) 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/azure/AzureTestSetup.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.azure 19 | 20 | import java.net.URI 21 | 22 | import com.cloudera.spark.cloud.common.CopyCsvFileTrait 23 | import org.apache.hadoop.fs.FileSystem 24 | 25 | /** 26 | * Trait for Azure WASB tests. 27 | * 28 | * This trait supports the CSV data source by copying over the data from S3A if 29 | * it isn't already at a WASB URL. 30 | */ 31 | trait AzureTestSetup extends CopyCsvFileTrait { 32 | 33 | override def enabled: Boolean = { 34 | getConf.getBoolean(AZURE_TESTS_ENABLED, false) 35 | } 36 | 37 | def initFS(): FileSystem = { 38 | val uri = new URI(requiredOption(AZURE_TEST_URI)) 39 | logDebug(s"Executing Azure tests against $uri") 40 | createFilesystem(uri) 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/common/StreamingTests.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.common 19 | 20 | import com.cloudera.spark.cloud.operations.CloudStreaming 21 | 22 | /** 23 | * Test Streaming. 24 | */ 25 | abstract class StreamingTests extends CloudSuite { 26 | 27 | after { 28 | cleanFilesystemInTeardown() 29 | } 30 | 31 | /** 32 | * Override point: the streaming operation to instantiate. 33 | */ 34 | protected val instance: CloudStreaming = new CloudStreaming() 35 | 36 | ctest("streaming", 37 | "Execute the Streaming example") { 38 | val conf = newSparkConf() 39 | conf.setAppName("Streaming") 40 | val destDir = testPath(filesystem, "streaming") 41 | val rowCount = 1000 42 | 43 | assert(0 === instance.action(conf, Seq(destDir, rowCount))) 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/gs/GsTestSetup.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.gs 19 | 20 | import java.net.URI 21 | 22 | import com.cloudera.spark.cloud.common.CopyCsvFileTrait 23 | import org.apache.hadoop.fs.FileSystem 24 | /** 25 | * Trait for GCS. 26 | * 27 | * This trait supports the CSV data source by copying over the data from S3A if 28 | * it isn't already at a GCS URL. 29 | */ 30 | trait GsTestSetup extends CopyCsvFileTrait { 31 | 32 | override def enabled: Boolean = { 33 | getConf.getBoolean(GS_TESTS_ENABLED, false) 34 | } 35 | 36 | def initFS(): FileSystem = { 37 | val uri = new URI(requiredOption(GS_TEST_URI)) 38 | logDebug(s"Executing GCS tests against $uri") 39 | createFilesystem(uri) 40 | } 41 | 42 | override def dynamicPartitioning: Boolean = true 43 | } 44 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/abfs/AbfsTestSetup.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements.
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.abfs 19 | 20 | import java.net.URI 21 | 22 | import com.cloudera.spark.cloud.common.CopyCsvFileTrait 23 | import org.apache.hadoop.fs.FileSystem 24 | 25 | /** 26 | * Trait for Azure ABFS tests. 27 | * 28 | * This trait supports the CSV data source by copying over the data from S3A if 29 | * it isn't already at an ABFS URL. 30 | */ 31 | trait AbfsTestSetup extends CopyCsvFileTrait { 32 | 33 | override def enabled: Boolean = { 34 | getConf.getBoolean(ABFS_TESTS_ENABLED, false) 35 | } 36 | 37 | override def dynamicPartitioning: Boolean = true 38 | 39 | def initFS(): FileSystem = { 40 | val uri = new URI(requiredOption(ABFS_TEST_URI)) 41 | logDebug(s"Executing Abfs tests against $uri") 42 | createFilesystem(uri) 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/S3ALineCountSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3 19 | 20 | import com.cloudera.spark.cloud.common.CloudSuiteWithCSVDatasource 21 | 22 | /** 23 | * Test the `S3ALineCount` entry point. 24 | */ 25 | class S3ALineCountSuite extends CloudSuiteWithCSVDatasource with S3ATestSetup { 26 | 27 | init() 28 | 29 | def init(): Unit = { 30 | if (enabled) { 31 | setupFilesystemConfiguration(getConf) 32 | } 33 | } 34 | 35 | override def enabled: Boolean = super.enabled && hasCSVTestFile 36 | 37 | ctest("S3ALineCountReadData", 38 | "Execute the S3ALineCount example with the default values (i.e.
no arguments)") { 39 | val sparkConf = newSparkConf(getTestCSVPath()) 40 | sparkConf.setAppName("S3ALineCountDefaults") 41 | assert(0 === S3ALineCount.action(sparkConf, Seq())) 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/S3ANumbersSuiteV2APISuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3 19 | 20 | import com.cloudera.spark.cloud.common.NumbersRddTests 21 | import org.apache.hadoop.fs.Path 22 | 23 | import org.apache.spark.rdd.RDD 24 | 25 | class S3ANumbersSuiteV2APISuite extends NumbersRddTests with S3ATestSetup { 26 | init() 27 | 28 | def init(): Unit = { 29 | // propagate S3 credentials 30 | if (enabled) { 31 | initFS() 32 | } 33 | } 34 | 35 | override protected def pathname = { 36 | "numbers_rdd_tests_v2api" 37 | } 38 | 39 | /** 40 | * Save the RDD. 41 | * 42 | * @param numbers RDD to save 43 | * @param dest destination path 44 | */ 45 | override protected def saveRDD( 46 | numbers: RDD[Int], 47 | dest: Path): Unit = { 48 | saveRDDviaMRv2(numbers, dest) 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/s3/S3ALineCount.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3 19 | 20 | import com.cloudera.spark.cloud.common.CloudTestKeys 21 | import com.cloudera.spark.cloud.operations.LineCount 22 | 23 | import org.apache.spark.SparkConf 24 | 25 | /** 26 | * A line count example which has a default reference of a public Amazon S3 27 | * CSV .gz file in the absence of anything on the command line. 
28 | */ 29 | object S3ALineCount extends LineCount with S3AExampleSetup with SequentialIOPolicy { 30 | 31 | override def defaultSource: Option[String] = { 32 | Some(CloudTestKeys.S3A_CSV_PATH_DEFAULT) 33 | } 34 | 35 | override def maybeEnableAnonymousAccess( 36 | sparkConf: SparkConf, 37 | dest: Option[String]): Unit = { 38 | if (dest.isEmpty) { 39 | hconf(sparkConf, AWS_CREDENTIALS_PROVIDER, ANONYMOUS_CREDENTIALS) 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/utils/ForceRecentHadoopVersion.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.utils 19 | 20 | import org.apache.hadoop.fs.azure.AzureException 21 | import org.apache.hadoop.fs.s3a.RenameFailedException 22 | 23 | /** 24 | * This class is used to ensure that a recent Hadoop version is on the classpath. 25 | * 26 | * If it does not compile, the version of Spark it is built against has out-of-date 27 | * dependencies. 28 | * 29 | * If its classes cannot be loaded, the version of Spark it is running against is out of date. 30 | * 31 | * Currently requires Hadoop 2.8+. 32 | */ 33 | class ForceRecentHadoopVersion { 34 | 35 | /** compile/link failure against Hadoop 2.6 */ 36 | val requireAzure = new AzureException("needs Hadoop 2.7+") 37 | 38 | /** compile failure against Hadoop 2.7 */ 39 | val requireRecentAWS = new RenameFailedException("/", "Needs something", "") 40 | } 41 | -------------------------------------------------------------------------------- README.md: -------------------------------------------------------------------------------- 1 | # Cloud Integration for Apache Spark 2 | 3 | The [cloud-integration](https://github.com/hortonworks-spark/cloud-integration) 4 | repository provides modules to improve Apache Spark's integration with cloud infrastructures. 5 | 6 | 7 | 8 | ## Module `spark-cloud-integration` 9 | 10 | Classes and tools to make Spark work better in-cloud. 11 | 12 | * Committer integration with the s3a committers. 13 | * Proof-of-concept cloud-first distcp replacement. 14 | * Serialization for Hadoop `Configuration`: class `ConfigSerDeser`. Use this 15 | to get a configuration into an RDD method. 16 | * Trait `HConf` to manipulate the Hadoop options in a Spark config. 17 | * Anything else which turns out to be useful.
18 | * Variant of `FileInputDStream` for cloud storage, `org.apache.spark.streaming.cloudera.CloudInputDStream`. 19 | 20 | See [Spark Cloud Integration](spark-cloud-integration/src/main/site/markdown/index.md) 21 | 22 | 23 | 24 | ## Module `cloud-examples` 25 | 26 | This module holds the packaging/integration tests for Spark and cloud storage against AWS, Azure and Google GCS. 27 | 28 | These are basic tests of core I/O and streaming functionality, verifying that 29 | the committers work. 30 | 31 | As well as running as unit tests, they have CLI entry points which can be used for scalable functional testing. 32 | 33 | 34 | ## Module `minimal-integration-test` 35 | 36 | This is a minimal JAR for integration tests. 37 | 38 | Usage: 39 | ```bash 40 | spark-submit --class com.cloudera.spark.cloud.integration.Generator \ 41 | --master yarn \ 42 | --num-executors 2 \ 43 | --driver-memory 512m \ 44 | --executor-memory 512m \ 45 | --executor-cores 1 \ 46 | minimal-integration-test-1.0-SNAPSHOT.jar \ 47 | adl://example.azuredatalakestore.net/output/dest/1 \ 48 | 2 2 15 49 | ``` 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /spark-cloud-integration/src/main/scala/com/cloudera/spark/cloud/s3/audit/ServerLogEntry.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3.audit 19 | 20 | case class ServerLogEntry( 21 | bucketowner: String, 22 | bucket_name: String, 23 | requestdatetime: String, 24 | remoteip: String, 25 | requester: String, 26 | requestid: String, 27 | operation: String, 28 | key: String, 29 | request_uri: String, 30 | httpstatus: String, 31 | errorcode: String, 32 | bytessent: Long, 33 | objectsize: Long, 34 | totaltime: String, 35 | turnaroundtime: String, 36 | referrer: String, 37 | useragent: String, 38 | versionid: String, 39 | hostid: String, 40 | sigv: String, 41 | ciphersuite: String, 42 | authtype: String, 43 | endpoint: String, 44 | tlsversion: String) { 45 | 46 | override def toString: String = 47 | s"$operation /$bucket_name/$key $httpstatus $errorcode $bytessent $requestdatetime" 48 | } 49 | -------------------------------------------------------------------------------- /spark-cloud-integration/src/main/scala/com/cloudera/spark/cloud/utils/IntegrationUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.utils 19 | 20 | import java.net.URL 21 | 22 | import org.apache.hadoop.util.ExitUtil 23 | 24 | /** 25 | * A class to instantiate for all the general utilities. 26 | */ 27 | class IntegrationUtils extends TimeOperations with HConf { 28 | private val E_NO_CLASS = 11 29 | 30 | def findClass(src: String, classname: String): (String, String, URL, Class[_]) = { 31 | try { 32 | val loader = this.getClass.getClassLoader 33 | val res = classname.replaceAll("\\.", "/") + ".class" 34 | val url = loader.getResource(res) 35 | val clazz = loader.loadClass(classname) 36 | (src, classname, url, clazz) 37 | } catch { 38 | case e: Exception => 39 | throw new ExitUtil.ExitException(E_NO_CLASS, 40 | s"Failed to load class $classname from $src").initCause(e) 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # Spark provides this Travis CI configuration file to help contributors 17 | # check Scala/Java style conformance and JDK7/8 compilation easily 18 | # while preparing pull requests. 19 | # - Scalastyle is executed during `maven install` implicitly. 20 | # - Java Checkstyle is executed by `lint-java`. 21 | # See the related discussion here. 22 | # https://github.com/apache/spark/pull/12980 23 | 24 | # 1. Choose OS (Ubuntu 14.04.3 LTS Server Edition 64bit, ~2 CORE, 7.5GB RAM) 25 | #%sudo: required 26 | #%dist: trusty 27 | 28 | # 2. Choose language and target JDKs for parallel builds. 29 | language: java 30 | jdk: 31 | - oraclejdk8 32 | 33 | # 3. Setup cache directory for SBT and Maven. 34 | cache: 35 | directories: 36 | - $HOME/.sbt 37 | - $HOME/.m2 38 | 39 | # 4. Turn off notifications. 40 | notifications: 41 | email: false 42 | 43 | # 5. Run maven install before running lint-java.
44 | install: 45 | - export MAVEN_SKIP_RC=1 46 | - mvn -T 1C install 47 | 48 | 49 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/local/LocalTestSetup.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.local 19 | 20 | import java.io.File 21 | 22 | import com.cloudera.spark.cloud.common.CloudSuiteTrait 23 | import org.apache.hadoop.fs.{FileSystem, Path} 24 | 25 | /** 26 | * Trait for the local FS; the goal is to support benchmarking/validating/writing 27 | * new tests. 28 | * 29 | */ 30 | trait LocalTestSetup extends CloudSuiteTrait { 31 | 32 | override def enabled: Boolean = { 33 | true 34 | } 35 | 36 | def initFS(): FileSystem = { 37 | val fs = getLocalFS 38 | setFilesystem(fs) 39 | fs 40 | } 41 | 42 | override def dynamicPartitioning: Boolean = true 43 | 44 | /** 45 | * The test path here is always something under the temp dir. 46 | */ 47 | override protected def testDir: Path = { 48 | val f = File.createTempFile(this.getClass.getSimpleName, "") 49 | f.delete() 50 | f.mkdir() 51 | new Path(f.toURI) 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/examples/S3DataFrameExampleSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.examples 19 | 20 | import com.cloudera.spark.cloud.common.CloudSuite 21 | import com.cloudera.spark.cloud.s3.S3ATestSetup 22 | 23 | /** 24 | * Test the `S3DataFrameExample` logic.
25 | */ 26 | class S3DataFrameExampleSuite extends CloudSuite with S3ATestSetup { 27 | 28 | init() 29 | 30 | def init(): Unit = { 31 | // propagate S3 credentials 32 | if (enabled) { 33 | initFS() 34 | } 35 | } 36 | 37 | /** 38 | * Override point: the data frame operation to execute 39 | */ 40 | ctest("DataFrames", 41 | "Dataframe IO") { 42 | val conf = newSparkConf() 43 | conf.setAppName("DataFrames") 44 | val destDir = testPath(filesystem, "dataframes") 45 | val instance = new S3DataFrameExample() 46 | val args = Seq(destDir) 47 | assert(0 === instance.action(conf, args), 48 | s" action($args) failed against $instance") 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/s3/S3AStreaming.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3 19 | 20 | import com.cloudera.spark.cloud.operations.CloudStreaming 21 | 22 | import org.apache.spark.SparkConf 23 | import org.apache.spark.streaming._ 24 | 25 | /** 26 | * An example/test for streaming with a source of S3. 27 | */ 28 | object S3AStreaming extends CloudStreaming with S3AExampleSetup 29 | with SequentialIOPolicy { 30 | 31 | /** 32 | * This is never executed; it's just here as the source of the example in the 33 | * documentation. 34 | */ 35 | def streamingExample(): Unit = { 36 | val sparkConf = new SparkConf() 37 | val ssc = new StreamingContext(sparkConf, Milliseconds(1000)) 38 | try { 39 | val lines = ssc.textFileStream("s3a://testbucket/incoming") 40 | val matches = lines.filter(_.endsWith("3")) 41 | matches.print() 42 | ssc.start() 43 | ssc.awaitTermination() 44 | } finally { 45 | ssc.stop(true) 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/csv/AbfsHugeCsvIOSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.csv 19 | 20 | import com.cloudera.spark.cloud.abfs.AbfsTestSetup 21 | import com.cloudera.spark.cloud.ObjectStoreConfigurations.ABFS_READAHEAD_HADOOP_OPTIONS 22 | import org.apache.hadoop.conf.Configuration 23 | 24 | /** 25 | * The real test of HADOOP-18521. 26 | */ 27 | class AbfsHugeCsvIOSuite extends AbstractHugeCsvIOSuite with AbfsTestSetup { 28 | 29 | init() 30 | 31 | /** 32 | * Set up the FS if enabled. 33 | */ 34 | def init(): Unit = { 35 | if (enabled) { 36 | initFS() 37 | } 38 | } 39 | 40 | /** 41 | * Patch in ABFS readahead options, to ensure they are 42 | * always set. 43 | * @return the configuration to create the fs with 44 | */ 45 | override def createConfiguration(): Configuration = { 46 | val conf = super.createConfiguration() 47 | for (kv <- ABFS_READAHEAD_HADOOP_OPTIONS) { 48 | conf.set(kv._1, kv._2) 49 | } 50 | conf 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/azure/AzureLineCountSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.azure 19 | 20 | import com.cloudera.spark.cloud.common.CloudSuiteWithCSVDatasource 21 | import com.cloudera.spark.cloud.operations.LineCount 22 | 23 | /** 24 | * Test the `LineCount` entry point against Azure. 25 | */ 26 | class AzureLineCountSuite extends CloudSuiteWithCSVDatasource with AzureTestSetup { 27 | 28 | init() 29 | 30 | /** 31 | * Set up the FS if enabled.
32 | */ 33 | def init(): Unit = { 34 | if (enabled) { 35 | initFS() 36 | initDatasources() 37 | } 38 | } 39 | 40 | override def enabled: Boolean = super.enabled && hasCSVTestFile 41 | 42 | after { 43 | cleanFilesystemInTeardown() 44 | } 45 | 46 | ctest("AzureLineCountSuite", 47 | "Execute the LineCount example") { 48 | val src = getTestCSVPath() 49 | val sparkConf = newSparkConf(src) 50 | sparkConf.setAppName("AzureLineCountSuite") 51 | assert(0 === new LineCount().action(sparkConf, 52 | Seq(src.toUri.toString))) 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/common/CloudSuiteWithCSVDatasource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.common 19 | 20 | import org.apache.hadoop.conf.Configuration 21 | import org.apache.hadoop.fs.{FileSystem, Path} 22 | 23 | /** 24 | * Any cloud suite which requires the datasource to be a (possibly copied over) 25 | * CSV file. 26 | */ 27 | class CloudSuiteWithCSVDatasource extends CloudSuite with CsvDatasourceSupport { 28 | 29 | /** 30 | * Call this to set up the datasource for tests. 31 | */ 32 | def initDatasources(): Unit = { 33 | if (hasCSVTestFile()) { 34 | prepareTestCSVFile() 35 | testCSVFilePath.get 36 | } 37 | } 38 | 39 | /** 40 | * Get the CSV source path and filesystem to read from it. 41 | * The filesystem uses the endpoint defined for the CSV file. 42 | * 43 | * @return Path and FS of a CSV source file. 44 | */ 45 | def getCSVSourceAndFileSystem(): (Path, FileSystem) = { 46 | val source = getTestCSVPath() 47 | (source, FileSystem.newInstance(source.toUri, new Configuration(getConf))) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /cloud-examples/src/main/site/using.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | # Using the extra features in these examples 16 | 17 | ### Example: Spark Streaming and Cloud Storage 18 | 19 | Spark Streaming can monitor files added to object stores by 20 | creating a `FileInputDStream` DStream monitoring a path under a bucket.
21 | 22 | ```scala 23 | import org.apache.spark.SparkConf 24 | import org.apache.spark.sql.SparkSession 25 | import org.apache.spark.streaming._ 26 | 27 | val sparkConf = new SparkConf() 28 | val ssc = new StreamingContext(sparkConf, Milliseconds(5000)) 29 | try { 30 | val lines = ssc.textFileStream("s3a://bucket/incoming") 31 | val matches = lines.filter(_.endsWith("3")) 32 | matches.print() 33 | ssc.start() 34 | ssc.awaitTermination() 35 | } finally { 36 | ssc.stop(true) 37 | } 38 | ``` 39 | 40 | 41 | 1. The time to scan for new files is proportional to the number of files 42 | under the path, not the number of *new* files, so it can become a slow operation. 43 | The size of the window needs to be set to handle this. 44 | 45 | 1. Files only appear in an object store once they are completely written; there 46 | is no need for a workflow of write-then-rename to ensure that files aren't picked up 47 | while they are still being written. Applications can write straight to the monitored directory. 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/s3/S3ADataFrames.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3 19 | 20 | import com.cloudera.spark.cloud.common.CloudTestKeys 21 | import com.cloudera.spark.cloud.operations.CloudDataFrames 22 | import org.apache.hadoop.conf.Configuration 23 | import org.apache.hadoop.fs.{FileSystem, Path} 24 | 25 | import org.apache.spark.sql.SparkSession 26 | 27 | /** 28 | * Test dataframe operations using S3 as the destination and source of operations. 29 | * This validates the various conversion jobs all work against the object store. 30 | * 31 | * It doesn't verify timings, though some information is printed.
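 * When the committer test flag (`S3A_COMMITTER_TEST_ENABLED`) is set, the
 * `extraValidation` hook below uses `CommitterOperations` to check that each
 * output path was indeed written by an S3A committer.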
32 | */ 33 | object S3ADataFrames extends CloudDataFrames with S3AExampleSetup { 34 | 35 | override def extraValidation( 36 | session: SparkSession, 37 | conf: Configuration, 38 | fs: FileSystem, 39 | results: Seq[(String, Path, Long, Long)]): Unit = { 40 | 41 | val operations = new CommitterOperations(fs) 42 | if (conf.getBoolean(CloudTestKeys.S3A_COMMITTER_TEST_ENABLED, false)) { 43 | results.foreach((tuple: (String, Path, Long, Long)) => { 44 | operations.verifyCommitter(tuple._2, None, None, "") 45 | }) 46 | } 47 | 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/S3AFileGeneratorSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3 19 | 20 | import com.cloudera.spark.cloud.common.FileGeneratorTests 21 | import org.apache.hadoop.fs.Path 22 | 23 | import org.apache.spark.SparkConf 24 | 25 | /** 26 | * Test the `S3AFileGenerator` entry point. 27 | */ 28 | class S3AFileGeneratorSuite extends FileGeneratorTests with S3ATestSetup { 29 | 30 | init() 31 | 32 | def init(): Unit = { 33 | // propagate S3 credentials 34 | if (enabled) { 35 | initFS() 36 | } 37 | } 38 | 39 | after { 40 | cleanFilesystemInTeardown() 41 | } 42 | 43 | ctest("FileGeneratorUsage", 44 | "Execute the S3AFileGenerator example with a bad argument; expect a failure") { 45 | val conf = newSparkConf() 46 | conf.setAppName("FileGenerator") 47 | assert(-2 === S3AFileGenerator.action(conf, Seq())) 48 | } 49 | 50 | override def generate( 51 | conf: SparkConf, 52 | destDir: Path, 53 | monthCount: Int, 54 | fileCount: Int, 55 | rowCount: Int): Int = { 56 | val result = S3AFileGenerator.action(conf, Seq(destDir, 57 | monthCount, 58 | fileCount, 59 | rowCount)) 60 | result 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/common/HadoopVersionSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License.
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.common 19 | 20 | import java.util 21 | import java.util.Collections 22 | 23 | import scala.collection.JavaConverters._ 24 | 25 | import com.cloudera.spark.cloud.common.CloudSuite._ 26 | import com.cloudera.spark.cloud.test.UnitTestSuite 27 | 28 | class HadoopVersionSuite extends UnitTestSuite { 29 | 30 | test("Sysprops") { 31 | val props = System.getProperties 32 | val list = new util.ArrayList[String](props.stringPropertyNames()) 33 | Collections.sort(list) 34 | val plist = list.asScala 35 | .filter(k => (!k.startsWith("java.") && !k.startsWith("sun."))) 36 | .map(key => s"$key = ${props.getProperty(key)}") 37 | .mkString("\n") 38 | logInfo(s"Properties:\n$plist") 39 | } 40 | 41 | test("PropagatedValues") { 42 | val mapped = StoreTestHelper.loadConfiguration().asScala 43 | .filter { entry => 44 | val k = entry.getKey 45 | k.startsWith("fs.s3a") && !k.contains("key") 46 | } 47 | .map(entry => s"${entry.getKey} = ${entry.getValue}").toList.sorted 48 | val list = mapped.mkString("\n") 49 | logInfo(s"S3A config options:\n${list}") 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/S3DependencyCheckSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3 19 | 20 | import com.cloudera.spark.cloud.test.UnitTestSuite 21 | 22 | /** 23 | * Force findClass in hadoop s3n/s3a classes and some dependencies. 24 | * Dependency problems should be picked up at compile time; runtime may 25 | * identify problems with any other transitive library. 26 | */ 27 | class S3DependencyCheckSuite extends UnitTestSuite { 28 | 29 | test("Create S3A FS Instance") { 30 | instantiate("org.apache.hadoop.fs.s3a.S3AFileSystem") 31 | } 32 | 33 | test("hive") { 34 | instantiate("org.apache.hadoop.hive.conf.HiveConf") 35 | } 36 | 37 | /** 38 | * Instantiate the class. 39 | * This is wrapped because scalatest gets confused by instantiation Errors raised 40 | * in a test case: they aren't raised in test methods. 41 | * @param classname class to instantiate.
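 * Example (matching the test cases above):
 * {{{
 *   instantiate("org.apache.hadoop.fs.s3a.S3AFileSystem")
 * }}}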
42 | */ 43 | def instantiate(classname: String) { 44 | try { 45 | val clazz = this.getClass.getClassLoader.loadClass(classname) 46 | clazz.newInstance() 47 | } catch { 48 | case e: Exception => throw e 49 | case e: Throwable => throw new Exception(s"Could not instantiate $classname", e) 50 | } 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /spark-cloud-integration/src/main/scala/com/cloudera/spark/cloud/CommitterInfo.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud 19 | 20 | import com.cloudera.spark.cloud.CommitterBinding.factoryForSchema 21 | import com.cloudera.spark.cloud.utils.HConf 22 | import org.apache.hadoop.conf.Configuration 23 | 24 | import org.apache.spark.SparkConf 25 | 26 | /** 27 | * Representation of a committer. 28 | * @param name committer name for the s3a manifestation 29 | * @param factory factory classname 30 | */ 31 | case class CommitterInfo(name: String, factory: String) 32 | extends HConf { 33 | 34 | def bind(sparkConf: SparkConf): Unit = { 35 | bindToSchema(sparkConf, "s3a") 36 | } 37 | 38 | def bind(conf: Configuration): Unit = { 39 | bindToSchema(conf, "s3a") 40 | } 41 | 42 | def bindToSchema(sparkConf: SparkConf, fsSchema: String): Unit = { 43 | hconf(sparkConf, factoryForSchema(fsSchema), factory) 44 | hconf(sparkConf, CommitterBinding.S3A_COMMITTER_NAME, 45 | name) 46 | } 47 | 48 | def bindToSchema(conf: Configuration, fsSchema: String): Unit = { 49 | conf.set(factoryForSchema(fsSchema), factory) 50 | conf.set(CommitterBinding.S3A_COMMITTER_NAME, name) 51 | } 52 | 53 | override def toString: String = s"Committer binding $factory($name)" 54 | } 55 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/org/apache/spark/sql/sources/AbtractOrcRelationSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License.
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.sources 19 | 20 | import org.apache.spark.sql.Row 21 | import org.apache.spark.sql.hive.orc.OrcFileFormat 22 | import org.apache.spark.sql.internal.SQLConf 23 | 24 | /** 25 | * cloud relation suite with some orc specific tests. 26 | */ 27 | abstract class AbtractOrcRelationSuite extends CloudRelationBasicSuite { 28 | 29 | import testImplicits._ 30 | 31 | override val dataSourceName: String = classOf[OrcFileFormat].getCanonicalName 32 | 33 | ctest("SPARK-12218", 34 | "'Not' is included in ORC filter pushdown", false) { 35 | 36 | withSQLConf(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> "true") { 37 | withTempPathDir("SPARK-12218") { dir => 38 | val path = s"${dir.toString}/table1" 39 | (1 to 5).map(i => (i, (i % 2).toString)).toDF("a", "b").write.orc(path) 40 | 41 | checkAnswer( 42 | spark.read.orc(path).where("not (a = 2) or not(b in ('1'))"), 43 | (1 to 5).map(i => Row(i, (i % 2).toString))) 44 | 45 | checkAnswer( 46 | spark.read.orc(path).where("not (a = 2 and b in ('1'))"), 47 | (1 to 5).map(i => Row(i, (i % 2).toString))) 48 | } 49 | } 50 | } 51 | 52 | 53 | } 54 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/utils/ExtraAssertions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.utils 19 | 20 | import org.apache.hadoop.conf.Configuration 21 | import org.scalatest.Assertions 22 | 23 | trait ExtraAssertions extends Assertions { 24 | 25 | /** 26 | * Expect a specific value; raise an assertion if it is not there 27 | * 28 | * @param v value 29 | * @param msg message 30 | * @tparam T type 31 | * @return the actual value 32 | */ 33 | def expectSome[T](v: Option[T], msg: => String): T = { 34 | v.getOrElse(throw new AssertionError(msg)) 35 | } 36 | 37 | /** 38 | * Expect a value to be non-null; return it. It will 39 | * implicitly be non-null in further use. 
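 * A minimal usage sketch (the configuration-loading call is illustrative,
 * borrowed from `HadoopVersionSuite`):
 * {{{
 *   val conf = expectNotNull(StoreTestHelper.loadConfiguration(), "no configuration loaded")
 * }}}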
40 | * 41 | * @param v value to check 42 | * @param msg message for any assertion 43 | * @tparam T type of value 44 | * @return the (non-null) value 45 | */ 46 | def expectNotNull[T](v: T, msg: => String): T = { 47 | if (v != null) v else throw new AssertionError(msg) 48 | } 49 | 50 | /** 51 | * Expect a configuration option to be set 52 | * 53 | * @param c config 54 | * @param key key to look for 55 | * @return the set value 56 | */ 57 | def expectOptionSet(c: Configuration, key: String): String = { 58 | expectNotNull(c.get(key), s"Unset property ${key}") 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/common/CsvDatasourceSupport.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.common 19 | 20 | import org.apache.hadoop.fs.Path 21 | 22 | trait CsvDatasourceSupport { 23 | 24 | /** 25 | * Predicate to define whether or not there's a CSV file to work with. 26 | * 27 | * @return true if the CSV test file is defined. 28 | */ 29 | def hasCSVTestFile(): Boolean = false 30 | 31 | /** 32 | * Path to the CSV file's original source 33 | * @return a path 34 | */ 35 | def sourceCSVFilePath: Option[Path] = None 36 | 37 | /** 38 | * Path to the CSV file used in the tests themselves; may differ from 39 | * the original source 40 | * 41 | * @return path to test data: valid after `prepareTestCSVFile`. 42 | */ 43 | def testCSVFilePath: Option[Path] = sourceCSVFilePath 44 | 45 | /** 46 | * Get the test CSV file or raise an exception. 47 | * @return the CSV path for tests 48 | */ 49 | def getTestCSVPath(): Path = testCSVFilePath.get 50 | 51 | /** 52 | * Any operation to prepare the CSV file. After completion, 53 | * `testCSVFilePath` returns the path to the test CSV file. 54 | */ 55 | def prepareTestCSVFile(): Unit = { 56 | require(hasCSVTestFile(), "No CSV file") 57 | require(sourceCSVFilePath.isDefined, "No source CSV file") 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /cloud-examples/src/test/resources/log4j2.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License.
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # lifted from spark core/src/test/resources. 19 | # from the log4j docs: 20 | # > An understanding of how loggers work in Log4j is critical before 21 | # > trying to configure them. 22 | # > Please reference the Log4j architecture if more information is required. 23 | # > Trying to configure Log4j without understanding those concepts will lead to frustration. 24 | 25 | # Set everything to be logged to the file target/unit-tests.log 26 | rootLogger.level = info 27 | rootLogger.appenderRef.file.ref = ${sys:test.appender:-File} 28 | 29 | appender.file.type = File 30 | appender.file.name = File 31 | appender.file.fileName = target/unit-tests.log 32 | appender.file.layout.type = PatternLayout 33 | appender.file.layout.pattern = %d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n%ex 34 | 35 | # Tests that launch java subprocesses can set the "test.appender" system property to 36 | # "console" to avoid having the child process's logs overwrite the unit test's 37 | # log file. 38 | appender.console.type = Console 39 | appender.console.name = console 40 | appender.console.target = SYSTEM_ERR 41 | appender.console.layout.type = PatternLayout 42 | appender.console.layout.pattern = %t: %m%n%ex 43 | 44 | # Ignore messages below warning level from Jetty, because it's a bit verbose 45 | logger.jetty.name = org.sparkproject.jetty 46 | logger.jetty.level = warn 47 | 48 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/gs/AbstractGsCommitterSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.gs 19 | 20 | import com.cloudera.spark.cloud.ObjectStoreConfigurations 21 | import com.cloudera.spark.cloud.common.CloudSuite 22 | 23 | 24 | 25 | import org.apache.spark.{SparkConf, SparkScopeWorkarounds} 26 | 27 | abstract class AbstractGsCommitterSuite extends CloudSuite with GsTestSetup { 28 | /** 29 | * Patch up hive for re-use. 
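 * Presumably this points Hive at a transient Derby metastore (see
 * `SparkScopeWorkarounds.tempHiveConfig()` below) so that suites can be
 * re-run without colliding on a shared metastore directory.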
30 | * 31 | * @param sparkConf configuration to patch 32 | */ 33 | def addTransientDerbySettings(sparkConf: SparkConf): Unit = { 34 | hconf(sparkConf, SparkScopeWorkarounds.tempHiveConfig()) 35 | } 36 | 37 | /** 38 | * Override point for suites: a method which is called 39 | * in all the `newSparkConf()` methods. 40 | * This can be used to alter values for the configuration. 41 | * It is called before the configuration read in from the command line 42 | * is applied, so that tests can override the values applied in-code. 43 | * 44 | * @param sparkConf spark configuration to alter 45 | */ 46 | override protected def addSuiteConfigurationOptions(sparkConf: SparkConf): Unit = { 47 | super.addSuiteConfigurationOptions(sparkConf) 48 | logDebug("Patching spark conf with committer bindings") 49 | sparkConf.setAll(ObjectStoreConfigurations.COMMITTER_OPTIONS) 50 | addTransientDerbySettings(sparkConf) 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/S3ACSVReadSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3 19 | 20 | import com.cloudera.spark.cloud.common.CSVReadTests 21 | 22 | /** 23 | * A suite of tests reading in the S3A CSV file. 
24 | */ 25 | class S3ACSVReadSuite extends CSVReadTests with S3ATestSetup with SequentialIOPolicy { 26 | 27 | init() 28 | 29 | def init(): Unit = { 30 | setupFilesystemConfiguration(getConf) 31 | if (enabled) { 32 | initDatasources() 33 | } 34 | } 35 | 36 | 37 | /* class RemoteOutputIterator[T](private val source: RemoteIterator[T]) extends Iterator[T] { 38 | def hasNext: Boolean = source.hasNext 39 | 40 | def next: T = source.next() 41 | }*/ 42 | 43 | /* 44 | * This doesn't do much, except that it is designed to be pasted straight into 45 | * Zeppelin and work 46 | */ 47 | /* ctest("DirOps", "simple directory ops in spark context process") { 48 | val source = CSV_TESTFILE.get 49 | sc = new SparkContext("local", "CSVgz", newSparkConf(source)) 50 | 51 | import org.apache.hadoop.fs._ 52 | val landsat = "s3a://landsat-pds/scene_list.gz" 53 | val landsatPath = new Path(landsat) 54 | val fs = landsatPath.getFileSystem(sc.hadoopConfiguration) 55 | val files = fs.listFiles(landsatPath.getParent, false) 56 | val listing = new RemoteOutputIterator(files) 57 | listing.foreach(print(_)) 58 | 59 | }*/ 60 | 61 | } 62 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/committers/AbstractCommitterSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.committers 19 | 20 | import com.cloudera.spark.cloud.ObjectStoreConfigurations 21 | import com.cloudera.spark.cloud.common.CloudSuite 22 | import com.cloudera.spark.cloud.s3.S3ATestSetup 23 | 24 | import org.apache.spark.{SparkConf, SparkScopeWorkarounds} 25 | 26 | abstract class AbstractCommitterSuite extends CloudSuite { 27 | /** 28 | * Patch up hive for re-use. 29 | * 30 | * @param sparkConf configuration to patch 31 | */ 32 | def addTransientDerbySettings(sparkConf: SparkConf): Unit = { 33 | hconf(sparkConf, SparkScopeWorkarounds.tempHiveConfig()) 34 | } 35 | 36 | /** 37 | * Override point for suites: a method which is called 38 | * in all the `newSparkConf()` methods. 39 | * This can be used to alter values for the configuration. 40 | * It is called before the configuration read in from the command line 41 | * is applied, so that tests can override the values applied in-code. 
42 | * 43 | * @param sparkConf spark configuration to alter 44 | */ 45 | override protected def addSuiteConfigurationOptions(sparkConf: SparkConf): Unit = { 46 | super.addSuiteConfigurationOptions(sparkConf) 47 | logDebug("Patching spark conf with s3a committer bindings") 48 | sparkConf.setAll(ObjectStoreConfigurations.COMMITTER_OPTIONS) 49 | addTransientDerbySettings(sparkConf) 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/azure/AzureDataFrameSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.azure 19 | 20 | import com.cloudera.spark.cloud.common.DataFrameTests 21 | 22 | import org.apache.spark.SparkConf 23 | import org.apache.spark.sql.SparkSession 24 | import org.apache.spark.sql.types.StringType 25 | 26 | /** 27 | * Test Azure and DataFrames 28 | */ 29 | class AzureDataFrameSuite extends DataFrameTests with AzureTestSetup { 30 | 31 | init() 32 | 33 | def init(): Unit = { 34 | if (enabled) { 35 | initFS() 36 | } 37 | } 38 | 39 | /** 40 | * This is the source for the example; it is here to ensure it compiles. 41 | */ 42 | def example(sparkConf: SparkConf): Unit = { 43 | val spark = SparkSession 44 | .builder 45 | .appName("DataFrames") 46 | .config(sparkConf) 47 | .getOrCreate() 48 | import spark.implicits._ 49 | val numRows = 1000 50 | val sourceData = spark.range(0, numRows).select($"id".as("l"), $"id".cast(StringType).as("s")) 51 | val dest = "wasb://yourcontainer@youraccount.blob.core.windows.net/dataframes" 52 | val orcFile = dest + "/data.orc" 53 | sourceData.write.format("orc").save(orcFile) 54 | // read it back 55 | val orcData = spark.read.format("orc").load(orcFile) 56 | // save it to parquet 57 | val parquetFile = dest + "/data.parquet" 58 | orcData.write.format("parquet").save(parquetFile) 59 | spark.stop() 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/abfs/commit/AbstractAbfsCommitterSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.abfs.commit 19 | 20 | import com.cloudera.spark.cloud.ObjectStoreConfigurations 21 | import com.cloudera.spark.cloud.abfs.AbfsTestSetup 22 | import com.cloudera.spark.cloud.common.CloudSuite 23 | 24 | import org.apache.spark.{SparkConf, SparkScopeWorkarounds} 25 | 26 | abstract class AbstractAbfsCommitterSuite extends CloudSuite with AbfsTestSetup { 27 | /** 28 | * Patch up hive for re-use. 29 | * 30 | * @param sparkConf configuration to patch 31 | */ 32 | def addTransientDerbySettings(sparkConf: SparkConf): Unit = { 33 | hconf(sparkConf, SparkScopeWorkarounds.tempHiveConfig()) 34 | } 35 | 36 | /** 37 | * Override point for suites: a method which is called 38 | * in all the `newSparkConf()` methods. 39 | * This can be used to alter values for the configuration. 40 | * It is called before the configuration read in from the command line 41 | * is applied, so that tests can override the values applied in-code. 42 | * 43 | * @param sparkConf spark configuration to alter 44 | */ 45 | override protected def addSuiteConfigurationOptions(sparkConf: SparkConf): Unit = { 46 | super.addSuiteConfigurationOptions(sparkConf) 47 | logDebug("Patching spark conf with committer bindings") 48 | sparkConf.setAll(ObjectStoreConfigurations.COMMITTER_OPTIONS) 49 | addTransientDerbySettings(sparkConf) 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/common/ReadSample.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.common 19 | 20 | //import org.apache.spark.mllib.linalg.Vectors 21 | 22 | /** 23 | * A sample of a read operation. 24 | * @param started start time in nS 25 | * @param duration duration nS 26 | * @param blockSize size of block worked with 27 | * @param bytesRequested how many bytes were requested 28 | * @param bytesRead how many bytes were actually returned 29 | * @param pos position in the object where the read was requested. 
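 * For example (illustrative numbers): a read which requested 8192 bytes but
 * returned 4096 of them in 1000ns has `delta = 4096` and, by integer division,
 * `perByte = 4`; note that `perByte` is bytes per nanosecond, not time per byte.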
30 | */ 31 | class ReadSample( 32 | val started: Long, 33 | val duration: Long, 34 | val blockSize: Int, 35 | val bytesRequested: Int, 36 | val bytesRead: Int, 37 | val pos: Long) extends Serializable { 38 | 39 | def perByte: Long = { if (duration > 0) bytesRead / duration else -1L } 40 | 41 | def delta: Int = { bytesRequested - bytesRead } 42 | 43 | override def toString: String = s"ReadSample(started=$started, duration=$duration," + 44 | s" blockSize=$blockSize, bytesRequested=$bytesRequested, bytesRead=$bytesRead)" + 45 | s" pos=$pos" 46 | 47 | /* def toVector = { 48 | val a = new Array[Double](8) 49 | a(0) = started.toDouble 50 | a(1) = duration.toDouble 51 | a(2) = blockSize.toDouble 52 | a(3) = bytesRequested.toDouble 53 | a(4) = bytesRead.toDouble 54 | a(5) = pos.toDouble 55 | a(6) = perByte.toDouble 56 | a(7) = delta.toDouble 57 | Vectors.dense(a) 58 | }*/ 59 | 60 | } 61 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/common/DataFrameTests.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.cloudera.spark.cloud.common 19 | 20 | import com.cloudera.spark.cloud.operations.CloudDataFrames 21 | 22 | /** 23 | * Test dataframe and object store integration 24 | */ 25 | abstract class DataFrameTests extends CloudSuite { 26 | 27 | after { 28 | cleanFilesystemInTeardown() 29 | } 30 | 31 | /** 32 | * Override point: the data frame operation to execute 33 | */ 34 | protected val instance: CloudDataFrames = new CloudDataFrames() 35 | 36 | ctest("DataFrames", 37 | "Execute the Data Frames example") { 38 | val conf = newSparkConf() 39 | conf.setAppName("DataFrames") 40 | val destDir = testPath(filesystem, "dataframes") 41 | val rowCount = 1000 42 | 43 | val args = Seq(destDir, rowCount) 44 | assert(0 === instance.action(conf, args), 45 | s" action($args) failed against $instance") 46 | 47 | // do a recursive listFiles 48 | val listing = logDuration("listFiles(recursive)") { 49 | listFiles(filesystem, destDir, true) 50 | } 51 | 52 | var recursivelyListedFilesDataset = 0L 53 | var recursivelyListedFiles = 0 54 | logDuration("scan result list") { 55 | listing.foreach{status => 56 | recursivelyListedFiles += 1 57 | recursivelyListedFilesDataset += status.getLen 58 | logInfo(s"${status.getPath}[${status.getLen}]") 59 | } 60 | } 61 | 62 | logInfo(s"FileSystem $filesystem") 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/common/ContextFreeCloudSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.common 19 | 20 | import com.cloudera.spark.cloud.s3.S3AConstants 21 | import org.scalatest.concurrent.Eventually 22 | import org.scalatest.BeforeAndAfter 23 | 24 | import org.apache.spark.SparkFunSuite 25 | import org.apache.spark.sql.SparkSession 26 | 27 | /** 28 | * A cloud suite which doesn't create a spark context. 29 | */ 30 | abstract class ContextFreeCloudSuite extends SparkFunSuite 31 | with BeforeAndAfter 32 | with Eventually with S3AConstants with CloudSuiteTrait { 33 | 34 | } 35 | 36 | /** 37 | * Cloud test suite with a spark session to clean up afterwards 38 | */ 39 | abstract class SparkSessionCloudSuite extends ContextFreeCloudSuite { 40 | 41 | var _sparkSession: SparkSession = null 42 | 43 | def sparkSession = _sparkSession 44 | 45 | def setSparkSession(s: SparkSession): Unit = { 46 | _sparkSession = s 47 | } 48 | 49 | /** 50 | * Close any spark session. 
51 | */ 52 | def closeSparkSession(): Unit = { 53 | if (_sparkSession != null) { 54 | _sparkSession.close() 55 | _sparkSession = null 56 | // To avoid RPC rebinding to the same port, since it doesn't unbind immediately on shutdown 57 | // (based on LocalSparkContext; no idea if still holds) 58 | System.clearProperty("spark.driver.port") 59 | } 60 | } 61 | 62 | 63 | override def afterEach(): Unit = { 64 | try { 65 | closeSparkSession() 66 | } finally { 67 | super.afterEach() 68 | } 69 | } 70 | 71 | } -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/gs/GSDependencyCheckSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.gs 19 | 20 | import com.cloudera.spark.cloud.test.UnitTestSuite 21 | import com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem 22 | import com.google.cloud.hadoop.fs.gcs.HadoopConfigurationProperty 23 | import org.apache.hadoop.fs.FileSystem 24 | 25 | /** 26 | * Force findClass in hadoop gcs classes and some dependencies. 27 | * Dependency problems should be picked up at compile time; runtime may 28 | * identify problems with any other transitive library. 29 | */ 30 | class GSDependencyCheckSuite extends UnitTestSuite { 31 | 32 | test("Create GCS FS Instance") { 33 | instantiate("com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem") 34 | } 35 | 36 | test("compile time check of filesystem") { 37 | val fs = new GoogleHadoopFileSystem() 38 | assert(fs.isInstanceOf[FileSystem]) 39 | } 40 | 41 | test("config") { 42 | new HadoopConfigurationProperty("key") 43 | } 44 | 45 | /** 46 | * Instantiate the class. 47 | * This is wrapped because scalatest gets confused by instantiation Errors raised 48 | * in a test case: they aren't raised in test methods. 49 | * @param classname class to instantiate. 50 | */ 51 | def instantiate(classname: String) { 52 | try { 53 | val clazz = this.getClass.getClassLoader.loadClass(classname) 54 | clazz.newInstance() 55 | } catch { 56 | case e: Exception => throw e 57 | case e: Throwable => throw new Exception(s"Could not instantiate $classname", e) 58 | } 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /spark-cloud-integration/src/main/scala/com/cloudera/spark/cloud/utils/ConfigSerDeser.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements.
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.utils 19 | 20 | import java.io.{ObjectInputStream, ObjectOutputStream} 21 | 22 | import org.apache.hadoop.conf.Configuration 23 | 24 | /** 25 | * Class to make Hadoop configurations serializable; uses the 26 | * `Writable` operations to do this. 27 | * Note: this only serializes the explicitly set values, not any set 28 | * in site/default or other XML resources. 29 | * @param conf configuration to serialize 30 | */ 31 | class ConfigSerDeser(var conf: Configuration) extends Serializable { 32 | 33 | private val serialVersionUID = 0xABBA0000 34 | 35 | /** 36 | * Empty constructor: binds to a `new Configuration()`. 37 | */ 38 | def this() { 39 | this(new Configuration()) 40 | } 41 | 42 | /** 43 | * Get the current configuration. 44 | * @return the configuration. 45 | */ 46 | def get(): Configuration = conf 47 | 48 | /** 49 | * Serializable writer. 50 | * @param out output stream 51 | */ 52 | private def writeObject(out: ObjectOutputStream): Unit = { 53 | conf.write(out) 54 | } 55 | 56 | /** 57 | * Serializable reader. 58 | * @param in input 59 | */ 60 | private def readObject(in: ObjectInputStream): Unit = { 61 | conf = new Configuration() 62 | conf.readFields(in) 63 | } 64 | 65 | /** 66 | * Handle a read without data; this should never be called, but it 67 | * is here as a safety mechanism. 68 | */ 69 | private def readObjectNoData(): Unit = { 70 | conf = new Configuration() 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/s3/S3AExampleSetup.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3 19 | 20 | import com.cloudera.spark.cloud.ObjectStoreConfigurations 21 | import com.cloudera.spark.cloud.common.StoreTestOperations 22 | import org.apache.hadoop.conf.Configuration 23 | 24 | import org.apache.spark.SparkConf 25 | 26 | /** 27 | * Base class for examples working with S3. 28 | */ 29 | trait S3AExampleSetup extends StoreTestOperations with S3AConstants { 30 | 31 | /** 32 | * Set the standard S3A Hadoop options to be used in test/examples. 33 | * If Random IO is expected, then the experimental fadvise option is 34 | * set to random. 35 | * 36 | * @param sparkConf spark configuration to patch 37 | * @param randomIO is the IO expected to be random access? 38 | */ 39 | override protected def applyObjectStoreConfigurationOptions( 40 | sparkConf: SparkConf, randomIO: Boolean): Unit = { 41 | super.applyObjectStoreConfigurationOptions(sparkConf, true) 42 | // smaller block size to divide up work 43 | hconf(sparkConf, BLOCK_SIZE, 1 * 1024 * 1024) 44 | hconf(sparkConf, MULTIPART_SIZE, MIN_PERMITTED_MULTIPART_SIZE) 45 | hconf(sparkConf, READAHEAD_RANGE, "128K") 46 | hconf(sparkConf, MIN_MULTIPART_THRESHOLD, MIN_PERMITTED_MULTIPART_SIZE) 47 | hconf(sparkConf, INPUT_FADVISE, if (randomIO) RANDOM_IO else NORMAL_IO) 48 | // disable file output in the path output committer as a safety check 49 | hconf(sparkConf, REJECT_FILE_OUTPUT, true) 50 | verifyConfigurationOptions(sparkConf, 51 | ObjectStoreConfigurations.COMMITTER_OPTIONS) 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/org/apache/spark/sql/sources/CloudPartitionTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.sources 19 | 20 | import org.apache.hadoop.fs.Path 21 | 22 | import org.apache.spark.sql._ 23 | import org.apache.spark.sql.types.{IntegerType, StructField, StructType} 24 | 25 | /** 26 | * Test of a single operation; isolated for debugging.
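 * The test writes a partition tree of the following shape (derived from
 * `rows = 3` and `part1size = 2` in the code below):
 * {{{
 *   <base>/p1=1/p2=foo
 *   <base>/p1=1/p2=bar
 *   <base>/p1=2/p2=foo
 *   <base>/p1=2/p2=bar
 * }}}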
27 | */ 28 | abstract class CloudPartitionTest extends AbstractCloudRelationTest { 29 | 30 | import testImplicits._ 31 | 32 | protected val rows = 3 33 | protected val part1size = 2 34 | 35 | ctest( 36 | "save-findClass-partitioned-part-columns-in-data", 37 | "Save sets of files in explicitly set up partition tree; read") { 38 | withTempPathDir("part-columns", None) { path => 39 | for (p1 <- 1 to part1size; p2 <- Seq("foo", "bar")) { 40 | val partitionDir = new Path(path, s"p1=$p1/p2=$p2") 41 | val df = sparkContext 42 | .parallelize(for (i <- 1 to rows) yield (i, s"val_$i", p1)) 43 | .toDF("a", "b", "p1") 44 | 45 | df.write 46 | .format(dataSourceName) 47 | .mode(SaveMode.ErrorIfExists) 48 | .save(partitionDir.toString) 49 | // each of these directories has its own success file; there is 50 | // none at the root 51 | resolveSuccessFile(partitionDir, true) 52 | } 53 | 54 | val dataSchemaWithPartition = 55 | StructType( 56 | dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) 57 | 58 | checkQueries( 59 | spark.read.options(Map( 60 | "path" -> path.toString, 61 | "dataSchema" -> dataSchemaWithPartition.json)).format(dataSourceName) 62 | .load()) 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /spark-cloud-integration/src/main/scala/org/apache/spark/cloudera/statistics/IOStatisticsAccumulator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.cloudera.statistics 19 | 20 | import org.apache.hadoop.fs.statistics.{IOStatistics, IOStatisticsSnapshot, IOStatisticsSource} 21 | 22 | import org.apache.spark.util.AccumulatorV2 23 | 24 | /** 25 | * An accumulator which collects and aggregates IOStatistics. 26 | */ 27 | class IOStatisticsAccumulator extends AccumulatorV2[IOStatistics, IOStatisticsSnapshot] 28 | with IOStatisticsSource { 29 | 30 | // the snapshot to accumulate. 31 | private var iostatistics: IOStatisticsSnapshot = new IOStatisticsSnapshot() 32 | 33 | /** 34 | * Empty if all the various maps are empty. 35 | * Not thread safe. 36 | * @return true if the accumulator is empty.
37 | */ 38 | override def isZero: Boolean = iostatistics.counters().isEmpty && 39 | iostatistics.gauges().isEmpty && 40 | iostatistics.maximums().isEmpty && 41 | iostatistics.minimums().isEmpty && 42 | iostatistics.meanStatistics().isEmpty 43 | 44 | override def copy(): AccumulatorV2[IOStatistics, IOStatisticsSnapshot] = { 45 | val newAcc = new IOStatisticsAccumulator() 46 | newAcc.add(this.iostatistics) 47 | newAcc 48 | } 49 | 50 | override def reset(): Unit = { 51 | iostatistics.clear() 52 | } 53 | 54 | override def add(v: IOStatistics): Unit = iostatistics.aggregate(v) 55 | 56 | override def merge(other: AccumulatorV2[IOStatistics, IOStatisticsSnapshot]): Unit = 57 | add(other.value) 58 | 59 | override def value: IOStatisticsSnapshot = iostatistics 60 | 61 | override def getIOStatistics: IOStatistics = iostatistics 62 | 63 | def register(name: String): Unit = { 64 | // a sketch: register with the active SparkContext under the given name; 65 | SparkContext.getOrCreate().register(this, name) 66 | } 67 | 68 | } 69 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/utils/StatisticsTracker.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.utils 19 | 20 | import scala.collection.JavaConverters._ 21 | 22 | import org.apache.hadoop.fs.{FileSystem, StorageStatistics} 23 | 24 | import org.apache.spark.internal.Logging 25 | 26 | class StatisticsTracker(fs: FileSystem) extends Logging { 27 | 28 | private val start: StorageStatistics = fs.getStorageStatistics 29 | 30 | import StatisticsTracker._ 31 | 32 | val original: Map[String, Long] = statsToMap(start) 33 | 34 | var updated: Map[String, Long] = Map() 35 | 36 | def update(): StatisticsTracker = { 37 | updated = statsToMap(fs.getStorageStatistics) 38 | this 39 | } 40 | 41 | /** 42 | * Build a diff from the original statistics snapshot to the updated one. 43 | * @return map of changed values only 44 | */ 45 | def diff(): Map[String, Long] = { 46 | updated.map { case (name: String, value: Long) => 47 | name -> (value - original.getOrElse(name, 0L)) 48 | }.filter { case (_, delta) => delta != 0 } 49 | } 50 | 51 | /** 52 | * Dump all changed values.
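 * A minimal sketch of use, assuming a workload has run against `fs`:
 * {{{
 *   val tracker = new StatisticsTracker(fs)
 *   // ... run the workload ...
 *   tracker.update()
 *   println(tracker.dump(" [", " = ", "]", "\n"))
 * }}}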
53 | * @param prefix prefix of each line 54 | * @param join separator between a statistic's name and its value 55 | * @param suffix suffix appended to each line 56 | * @param merge separator between lines 57 | * @return the formatted statistics 58 | */ 59 | def dump(prefix: String, join: String, suffix: String, merge: String): String = { 60 | diff().map { case (name, value) => 61 | prefix + name + join + value + suffix 62 | }.mkString(merge) 63 | 64 | } 65 | 66 | def dump(): String = { 67 | fs.getUri + "\n" + dump(" [", " = ", "]", "\n") 68 | } 69 | 70 | 71 | } 72 | 73 | object StatisticsTracker { 74 | 75 | def statsToMap(stats: StorageStatistics): Map[String, Long] = { 76 | 77 | stats.getLongStatistics.asScala.map { s => 78 | s.getName -> s.getValue 79 | }.toMap 80 | 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/commit/S3ACommitterFactorySuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3.commit 19 | 20 | import com.cloudera.spark.cloud.ObjectStoreConfigurations 21 | import org.apache.hadoop.fs.s3a.commit.S3ACommitterFactory 22 | import org.apache.hadoop.fs.s3a.commit.magic.MagicS3GuardCommitterFactory 23 | import org.apache.hadoop.fs.s3a.commit.staging.{DirectoryStagingCommitterFactory, PartitionedStagingCommitterFactory} 24 | 25 | import org.apache.spark.SparkConf 26 | 27 | /** 28 | * Explicitly create the S3A committers; forces compile-time 29 | * validation that the factory classes are on the classpath, 30 | * along with any direct dependencies. 31 | */ 32 | class S3ACommitterFactorySuite extends AbstractS3ACommitterSuite { 33 | 34 | init() 35 | 36 | def init(): Unit = { 37 | // propagate S3 credentials 38 | if (enabled) { 39 | initFS() 40 | } 41 | } 42 | 43 | /** 44 | * Override point for suites: a method which is called 45 | * in all the `newSparkConf()` methods. 46 | * This can be used to alter values for the configuration. 47 | * It is called before the configuration read in from the command line 48 | * is applied, so that tests can override the values applied in-code.
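 * For example (this is what the override below does):
 * {{{
 *   sparkConf.setAll(ObjectStoreConfigurations.COMMITTER_OPTIONS)
 * }}}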
49 | * 50 | * @param sparkConf spark configuration to alter 51 | */ 52 | override protected def addSuiteConfigurationOptions(sparkConf: SparkConf): Unit = { 53 | super.addSuiteConfigurationOptions(sparkConf) 54 | sparkConf.setAll(ObjectStoreConfigurations.COMMITTER_OPTIONS) 55 | } 56 | 57 | ctest("DirectoryStagingCommitterFactory on CP") { 58 | new DirectoryStagingCommitterFactory() 59 | } 60 | 61 | ctest("PartitionedStagingCommitterFactory on CP") { 62 | new PartitionedStagingCommitterFactory() 63 | } 64 | 65 | ctest("MagicS3GuardCommitterFactory on CP") { 66 | new MagicS3GuardCommitterFactory() 67 | } 68 | 69 | ctest("S3ACommitterFactory on CP") { 70 | new S3ACommitterFactory() 71 | } 72 | 73 | 74 | } 75 | -------------------------------------------------------------------------------- /cloud-examples/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | # log4j configuration used during build and unit tests 14 | 15 | log4j.rootLogger=INFO,stdout 16 | log4j.threshold=ALL 17 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 18 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 19 | log4j.appender.stdout.layout.ConversionPattern=%d{ISO8601} [%t] %-5p %c{2} (%F:%M(%L)) - %m%n 20 | 21 | # ALWAYS leave this at debug, it's used to explore what's up with logging 22 | log4j.logger.com.cloudera.spark.test.loglevels=DEBUG 23 | 24 | # Spark commit protocol 25 | #log4j.logger.org.apache.spark.internal.io=DEBUG 26 | #log4j.logger.com.hortonworks.spark=DEBUG 27 | 28 | #log4j.logger.org.apache.hadoop.fs.s3a=DEBUG 29 | log4j.logger.org.apache.hadoop.fs.s3a.S3ABlockOutputStream=INFO 30 | log4j.logger.org.apache.hadoop.fs.s3a.S3AStorageStatistics=INFO 31 | log4j.logger.org.apache.hadoop.fs.s3a.S3AUtils=INFO 32 | log4j.logger.org.apache.hadoop.mapreduce.lib.output.PathOutputCommitterFactory=DEBUG 33 | log4j.logger.org.apache.hadoop.fs.s3a.commit=DEBUG 34 | #log4j.logger.org.apache.hadoop.fs.s3a=DEBUG 35 | 36 | log4j.logger.org.apache.spark.ContextCleaner=WARN 37 | log4j.logger.org.apache.spark.storage.memory.MemoryStore=WARN 38 | log4j.logger.org.apache.spark.sql.execution.FileSourceScanExec=WARN 39 | log4j.logger.org.apache.spark.storage=WARN 40 | log4j.logger.org.apache.spark.sql.catalyst=WARN 41 | log4j.logger.org.apache.spark.SecurityManager=WARN 42 | log4j.logger.org.apache.spark.sql.internal=WARN 43 | log4j.logger.org.apache.spark.scheduler=WARN 44 | log4j.logger.org.apache.spark.SparkEnv=WARN 45 | log4j.logger.org.apache.spark.executor.Executor=WARN 46 | log4j.logger.org.apache.spark.sql.execution.streaming.state=WARN 47 | log4j.logger.org.apache.hadoop.hive.ql.io.orc.RecordReaderFactory=WARN 48 | 49 | 50 | #log4j.logger.org.apache.hadoop.mapreduce.lib.output.FileOutputCommitterFactory=DEBUG 51 | #log4j.logger.org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter=DEBUG 52 | 53 | 
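# Example of a further override (illustrative and commented out; enabling it
# produces very verbose output):
#log4j.logger.org.apache.hadoop.fs.s3a.S3AFileSystem=DEBUG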
log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR 54 | log4j.logger.org.mortbay.jetty=ERROR 55 | # disable deprecation noise 56 | log4j.logger.org.apache.hadoop.conf.Configuration.deprecation=ERROR 57 | 58 | # turn off other logs which are noisy during test runs 59 | log4j.logger.org.eclipse.jetty=ERROR 60 | log4j.logger.org.spark_project.jetty=ERROR 61 | log4j.logger.org.apache.hadoop.mapreduce.lib.output.committer.manifest=DEBUG 62 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/commit/Events.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3.commit 19 | 20 | import scala.collection.immutable 21 | 22 | /** 23 | * Case class for the test dataframes. 24 | */ 25 | case class Event( 26 | year: Int, month: Int, day: Int, ymd: Int, monthname: String, 27 | datestr: String, value: String) 28 | 29 | object Events { 30 | 31 | /** 32 | * Build up an event sequence across years; every day of every month in 33 | * every year has "rows" events generated. 34 | * @param year1 start year 35 | * @param year2 end year 36 | * @param startMonth start month 37 | * @param endMonth end month 38 | * @param rows rows per day 39 | * @return the event sequence. 40 | */ 41 | def events( 42 | year1: Int, 43 | year2: Int, 44 | startMonth: Int, 45 | endMonth: Int, 46 | rows: Int): immutable.IndexedSeq[Event] = { 47 | for (year <- year1 to year2; 48 | month <- startMonth to endMonth; 49 | day <- 1 to Months(month - 1)._2; 50 | r <- 1 to rows) 51 | yield event(year, 52 | month, 53 | day, 54 | "%d/%04f".format(r, Math.random() * 10000)) 55 | } 56 | 57 | def monthCount( 58 | year1: Int, 59 | year2: Int, 60 | startMonth: Int, 61 | endMonth: Int): Int = { 62 | var count = 0 63 | for (year <- year1 to year2; 64 | month <- startMonth to endMonth) 65 | count += 1 66 | count 67 | } 68 | 69 | /** 70 | * Create an event. 71 | * 72 | * @return the event.
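 * For example, {{{event(2017, 1, 15, "v")}}} yields an event with
 * ymd 20170115, monthname "Jan" and datestr "2017-01-15".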
73 | */ 74 | def event(year: Int, month: Int, day: Int, value: String): Event = { 75 | new Event(year, month, day, 76 | day + month * 100 + year * 10000, 77 | Months(month - 1)._1, 78 | "%04d-%02d-%02d".format(year, month, day), 79 | value 80 | ) 81 | } 82 | 83 | val Months = Array( 84 | ("Jan", 31), 85 | ("Feb", 28), 86 | ("Mar", 31), 87 | ("Apr", 30), 88 | ("May", 31), 89 | ("Jun", 30), 90 | ("Jul", 31), 91 | ("Aug", 31), 92 | ("Sep", 30), 93 | ("Oct", 31), 94 | ("Nov", 30), 95 | ("Dec", 31)) 96 | 97 | } 98 | -------------------------------------------------------------------------------- /spark-cloud-integration/src/main/scala/com/cloudera/spark/cloud/utils/HConf.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.utils 19 | 20 | import org.apache.spark.SparkConf 21 | 22 | /** 23 | * A minimal trait purely to set Hadoop configuration values in a Spark 24 | * Configuration. 25 | */ 26 | trait HConf { 27 | /** 28 | * Set a Hadoop option in a spark configuration. 29 | * 30 | * @param sparkConf configuration to update 31 | * @param key key 32 | * @param value new value 33 | */ 34 | def hconf(sparkConf: SparkConf, key: String, value: String): SparkConf = { 35 | sparkConf.set(hkey(key), value) 36 | sparkConf 37 | } 38 | 39 | /** 40 | * Set a boolean Hadoop option in a spark configuration. 41 | * 42 | * @param sparkConf configuration to update 43 | * @param key key 44 | * @param value new value 45 | */ 46 | 47 | def hconf(sparkConf: SparkConf, key: String, value: Boolean): SparkConf = { 48 | sparkConf.set(hkey(key), value.toString) 49 | sparkConf 50 | } 51 | 52 | /** 53 | * Take a Hadoop key, add the prefix to allow it to be added to 54 | * a Spark Config and then picked up properly later. 55 | * 56 | * @param key key 57 | * @return the new key 58 | */ 59 | def hkey(key: String): String = { 60 | "spark.hadoop." + key 61 | } 62 | 63 | /** 64 | * Set a long Hadoop option in a spark configuration. 65 | * 66 | * @param sparkConf configuration to update 67 | * @param key key 68 | * @param value new value 69 | */ 70 | def hconf(sparkConf: SparkConf, key: String, value: Long): SparkConf = { 71 | sparkConf.set(hkey(key), value.toString) 72 | sparkConf 73 | } 74 | 75 | /** 76 | * Set all supplied options on the spark configuration as hadoop options. 77 | * 78 | * @param sparkConf Spark configuration to update 79 | * @param settings map of settings.
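 * For example (the keys shown are standard S3A option names):
 * {{{
 *   hconf(sparkConf, Map(
 *     "fs.s3a.block.size" -> "1048576",
 *     "fs.s3a.readahead.range" -> "128K"))
 * }}}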
80 | */ 81 | def hconf(sparkConf: SparkConf, 82 | settings: Traversable[(String, Object)]): SparkConf = { 83 | settings.foreach { case (key, value) => hconf(sparkConf, key, value.toString) } 84 | sparkConf 85 | } 86 | 87 | } 88 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/org/apache/spark/sql/sources/HiveTestTrait.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.sources 19 | 20 | import java.io.File 21 | 22 | import com.cloudera.spark.cloud.ObjectStoreConfigurations 23 | import org.scalatest.BeforeAndAfterAll 24 | 25 | import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} 26 | import org.apache.spark.sql.SparkSession 27 | import org.apache.spark.sql.hive.test.TestHiveContext 28 | import org.apache.spark.util.Utils 29 | 30 | /** 31 | * A trait for tests which binds to a Hive context. 32 | * After all the tests, the Hive context is reset; then it and the Spark session 33 | * are closed.
34 | */ 35 | trait HiveTestTrait extends SparkFunSuite with BeforeAndAfterAll { 36 | // override protected val enableAutoThreadAudit = false 37 | protected var hiveContext: HiveInstanceForTests = _ 38 | protected var spark: SparkSession = _ 39 | 40 | 41 | protected override def beforeAll(): Unit = { 42 | super.beforeAll() 43 | // set up spark and hive context 44 | hiveContext = new HiveInstanceForTests() 45 | spark = hiveContext.sparkSession 46 | } 47 | 48 | protected override def afterAll(): Unit = { 49 | try { 50 | SparkSession.clearActiveSession() 51 | 52 | if (hiveContext != null) { 53 | hiveContext.reset() 54 | hiveContext = null 55 | } 56 | if (spark != null) { 57 | spark.close() 58 | spark = null 59 | } 60 | } finally { 61 | super.afterAll() 62 | } 63 | } 64 | 65 | } 66 | 67 | class HiveInstanceForTests 68 | extends TestHiveContext( 69 | new SparkContext( 70 | System.getProperty("spark.sql.test.master", "local[1]"), 71 | "TestSQLContext", 72 | new SparkConf() 73 | .setAll(ObjectStoreConfigurations.RW_TEST_OPTIONS) 74 | .set("spark.sql.warehouse.dir", 75 | TestSetup.makeWarehouseDir().toURI.getPath) 76 | ) 77 | ) { 78 | 79 | } 80 | 81 | 82 | 83 | 84 | object TestSetup { 85 | 86 | def makeWarehouseDir(): File = { 87 | val warehouseDir = Utils.createTempDir(namePrefix = "warehouse") 88 | warehouseDir.delete() 89 | warehouseDir 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /spark-cloud-integration/src/main/scala/org/apache/spark/cloudera/statistics/IOStatisticsCollectorExecutorPlugin.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.cloudera.statistics 19 | 20 | import java.util 21 | 22 | import org.apache.hadoop.fs.statistics.IOStatisticsContext 23 | 24 | import org.apache.spark.{SparkContext, TaskContext, TaskFailedReason} 25 | import org.apache.spark.api.plugin.{ExecutorPlugin, PluginContext} 26 | import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart} 27 | import org.apache.spark.util.TaskCompletionListener 28 | 29 | class IOStatisticsCollectorExecutorPlugin extends ExecutorPlugin { 30 | 31 | var context: PluginContext = _ 32 | 33 | override def init( 34 | ctx: PluginContext, 35 | extraConf: util.Map[String, String]): Unit = { 36 | 37 | context = ctx 38 | // TODO: obtain the active spark context so that the accumulator 39 | // can be registered against it 40 | SparkContext.getOrCreate() 41 | 42 | } 43 | override def shutdown(): Unit = super.shutdown() 44 | 45 | override def onTaskStart(): Unit = { 46 | val iostatsCtx: IOStatisticsContext = IOStatisticsContext.getCurrentIOStatisticsContext 47 | iostatsCtx.reset() 48 | val acc = new IOStatisticsAccumulator 49 | 50 | 51 | val taskContext = TaskContext.get() 52 | 53 | 54 | taskContext.registerAccumulator(acc) 55 | taskContext.addTaskCompletionListener(new TaskCompleted(acc, iostatsCtx)) 56 | 57 | } 58 | 59 | override def onTaskSucceeded(): Unit = super.onTaskSucceeded() 60 | 61 | override def onTaskFailed( 62 | failureReason: TaskFailedReason): Unit = super 63 | .onTaskFailed(failureReason) 64 | 65 | private class TaskCompleted( 66 | val acc: IOStatisticsAccumulator, 67 | val iostatsCtx: IOStatisticsContext) extends TaskCompletionListener { 68 | 69 | override def onTaskCompletion(context: TaskContext): Unit = { 70 | acc.add(iostatsCtx.getIOStatistics) 71 | } 72 | 73 | } 74 | 75 | private class SparkListenerImpl extends SparkListener { 76 | override def onJobStart( 77 | jobStart: SparkListenerJobStart): Unit = super 78 | .onJobStart(jobStart) 79 | } 80 | } 81 | 82 | 83 | object IOStatisticsCollectorExecutorPlugin { 84 | val ACCUMULATOR_NAME = "io_statistics" 85 | } -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/common/StoreTestHelper.scala: -------------------------------------------------------------------------------- 1 | package com.cloudera.spark.cloud.common 2 | 3 | /* 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import java.io.{File, FileNotFoundException} 21 | 22 | import com.cloudera.spark.cloud.s3.S3AConstants 23 | import com.cloudera.spark.cloud.CommitterBinding 24 | import org.apache.hadoop.conf.Configuration 25 | 26 | import org.apache.spark.internal.Logging 27 | 28 | /** 29 | * Singleton instantiation of the store test helper operations.
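 * Typical use: {{{val conf = StoreTestHelper.loadConfiguration()}}}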
30 | */ 31 | object StoreTestHelper extends StoreTestOperations 32 | with Logging 33 | with S3AConstants 34 | with CloudSuiteTrait { 35 | 36 | private var configLogged = false 37 | 38 | /** 39 | * Load the configuration file from the system property `SYSPROP_CLOUD_TEST_CONFIGURATION_FILE`. 40 | * Throws FileNotFoundException if a configuration is named but not present. 41 | * 42 | * @return the configuration 43 | */ 44 | def loadConfiguration(): Configuration = { 45 | val config = new Configuration(true) 46 | getKnownSysprop(SYSPROP_CLOUD_TEST_CONFIGURATION_FILE).foreach { filename => 47 | logDebug(s"Configuration property = `$filename`") 48 | val f = new File(filename) 49 | if (f.exists()) { 50 | // unsynced, but it's only a log statement 51 | if (!configLogged) { 52 | configLogged = true 53 | logInfo(s"Loading configuration from $f") 54 | } 55 | config.addResource(f.toURI.toURL) 56 | } else { 57 | throw new FileNotFoundException(s"No file '$filename'" + 58 | s" declared in property $SYSPROP_CLOUD_TEST_CONFIGURATION_FILE") 59 | } 60 | } 61 | overlayConfiguration( 62 | config, 63 | Seq( 64 | HIVE_TESTS_DISABLED, 65 | REQUIRED_HADOOP_VERSION, 66 | SCALE_TEST_ENABLED, 67 | SCALE_TEST_SIZE_FACTOR, 68 | S3A_COMMITTER_TEST_ENABLED, 69 | S3A_ENCRYPTION_KEY_1, 70 | S3A_ENCRYPTION_KEY_2 71 | ) 72 | ) 73 | 74 | // setup the committer from any property passed in 75 | getKnownSysprop(S3A_COMMITTER_NAME).foreach(committer => { 76 | val binding = CommitterBinding.COMMITTERS_BY_NAME(committer.toLowerCase()) 77 | binding.bind(config) 78 | logInfo(s"Using committer binding $binding") 79 | }) 80 | config 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/common/FileGeneratorTests.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.common 19 | 20 | import com.cloudera.spark.cloud.operations.CloudFileGenerator 21 | import org.apache.hadoop.fs.Path 22 | 23 | import org.apache.spark.SparkConf 24 | 25 | /** 26 | * Test the `FileGenerator` entry point. Use a small file number to keep the unit tests fast; some 27 | * cloud infras are very slow here. System tests can use the CLI instead.
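 * The generator is driven through its `action()` entry point; a sketch
 * (this is what the `generate()` helper below does):
 * {{{
 *   new CloudFileGenerator().action(conf,
 *     Seq(destDir, monthCount, fileCount, rowCount))
 * }}}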
28 | */ 29 | abstract class FileGeneratorTests extends CloudSuite { 30 | 31 | ctest("FileGenerator", "Execute the FileGenerator example") { 32 | val conf = newSparkConf() 33 | conf.setAppName("FileGenerator") 34 | val destDir = testPath(filesystem, "filegenerator") 35 | val months = 2 36 | val fileCount = 1 37 | val rowCount = 500 38 | 39 | assert(0 === generate(conf, destDir, months, fileCount, rowCount)) 40 | 41 | val status = filesystem.getFileStatus(destDir) 42 | assert(status.isDirectory, s"Not a directory: $status") 43 | 44 | val totalExpectedFiles = months * fileCount 45 | 46 | // do a recursive listFiles 47 | val listing = logDuration("listFiles(recursive)") { 48 | listFiles(filesystem, destDir, true) 49 | } 50 | var recursivelyListedFilesDataset = 0L 51 | var recursivelyListedFiles = 0 52 | logDuration("scan result list") { 53 | listing.foreach { status => 54 | recursivelyListedFiles += 1 55 | recursivelyListedFilesDataset += status.getLen 56 | logInfo(s"${status.getPath}[${status.getLen}]") 57 | } 58 | } 59 | 60 | logInfo(s"FileSystem $filesystem") 61 | assert(totalExpectedFiles === recursivelyListedFiles) 62 | } 63 | 64 | /** 65 | * Generate a set of files 66 | * @param conf configuration 67 | * @param destDir destination directory 68 | * @param monthCount number of months to generate 69 | * @param fileCount files per month 70 | * @param rowCount rows per file 71 | * @return the exit code of the operation 72 | */ 73 | def generate( 74 | conf: SparkConf, 75 | destDir: Path, 76 | monthCount: Int, 77 | fileCount: Int, 78 | rowCount: Int): Int = { 79 | val result = new CloudFileGenerator().action( 80 | conf, 81 | Seq(destDir, 82 | monthCount, 83 | fileCount, 84 | rowCount)) 85 | result 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /spark-cloud-integration/src/main/scala/com/cloudera/spark/cloud/s3/audit/LogParser.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3.audit 19 | 20 | import java.util.regex.Matcher 21 | 22 | import org.apache.hadoop.fs.s3a.audit.S3LogParser 23 | import org.apache.hadoop.fs.s3a.audit.S3LogParser._ 24 | 25 | 26 | /** 27 | * Log parsing using s3a audit classes. 
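 * A sketch of its use (`line` must be an S3 server-side log entry):
 * {{{
 *   LogParser.parse(line) match {
 *     case Some(entry) => println(entry.key)
 *     case None => // the line did not match the log pattern
 *   }
 * }}}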
28 | */ 29 | object LogParser { 30 | 31 | private val pattern = S3LogParser.LOG_ENTRY_PATTERN 32 | 33 | private def entry(matcher: Matcher, group: String): String = { 34 | val g = matcher.group(group) 35 | assert(g != null, s"Group $group is null") 36 | assert(g.nonEmpty, s"Group $group is empty") 37 | g 38 | } 39 | 40 | private def longEntry(m: Matcher, group: String): Long = { 41 | entry(m, group).toLong 42 | } 43 | 44 | /** 45 | * Parse a line. 46 | * @param line line 47 | * @return the entry or None if the regexp didn't match 48 | * @throws AssertionError if a group is null/empty 49 | */ 50 | def parse(line: String): Option[ServerLogEntry] = { 51 | val m = pattern.matcher(line) 52 | 53 | if (!m.matches()) { 54 | None 55 | } else { 56 | Some(ServerLogEntry( 57 | bucketowner = entry(m, OWNER_GROUP), 58 | bucket_name = entry(m, BUCKET_GROUP), 59 | requestdatetime = entry(m, TIMESTAMP_GROUP), 60 | remoteip = entry(m, REMOTEIP_GROUP), 61 | requester = entry(m, REQUESTER_GROUP), 62 | requestid = entry(m, REQUESTID_GROUP), 63 | operation = entry(m, VERB_GROUP), 64 | key = entry(m, KEY_GROUP), 65 | request_uri = entry(m, REQUESTURI_GROUP), 66 | httpstatus = entry(m, HTTP_GROUP), 67 | errorcode = entry(m, AWSERRORCODE_GROUP), 68 | bytessent = longEntry(m, BYTESSENT_GROUP), 69 | objectsize = longEntry(m, OBJECTSIZE_GROUP), 70 | totaltime = entry(m, TOTALTIME_GROUP), 71 | turnaroundtime = entry(m, TURNAROUNDTIME_GROUP), 72 | referrer = entry(m, REFERRER_GROUP), 73 | useragent = entry(m, USERAGENT_GROUP), 74 | versionid = entry(m, VERSION_GROUP), 75 | hostid = entry(m, HOSTID_GROUP), 76 | sigv = entry(m, SIGV_GROUP), 77 | ciphersuite = entry(m, CYPHER_GROUP), 78 | authtype = entry(m, AUTH_GROUP), 79 | endpoint = entry(m, ENDPOINT_GROUP), 80 | tlsversion = entry(m, TLS_GROUP))) 81 | } 82 | 83 | } 84 | 85 | } 86 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/common/StoreTestOperations.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.common 19 | 20 | import scala.concurrent.duration._ 21 | import scala.language.postfixOps 22 | 23 | import com.cloudera.spark.cloud.{GeneralCommitterConstants, ObjectStoreOperations} 24 | import org.apache.hadoop.conf.Configuration 25 | import org.apache.hadoop.fs.{FileStatus, FileSystem, LocatedFileStatus, Path} 26 | import org.scalatest.concurrent.Eventually 27 | import org.scalatest.time.Span 28 | 29 | import org.apache.spark.sql._ 30 | 31 | /** 32 | * Extends ObjectStoreOperations with some extra ones for testing.
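 * For example, {{{validateRowCount(spark, fs, dest, "orc", expectedRows)}}}
 * (with `expectedRows` a caller-side value) loads a dataset and verifies
 * its row count, returning the load time.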
33 | */ 34 | trait StoreTestOperations extends ObjectStoreOperations with Eventually { 35 | 36 | protected val retryTimeout: Span = 30 seconds 37 | 38 | protected val retryInterval: Span = 1000 milliseconds 39 | 40 | /** 41 | * Try to get the file status, _eventually_: the probe is retried 42 | * with the trait's timeout/interval until it succeeds. 43 | * @param fs filesystem 44 | * @param p path 45 | * @return the result of the successful probe 46 | */ 47 | def eventuallyGetFileStatus(fs: FileSystem, p: Path): FileStatus = { 48 | eventually(timeout(retryTimeout), interval(retryInterval))(fs.getFileStatus(p)) 49 | } 50 | 51 | /** 52 | * Load a DF and verify it has the expected number of rows. 53 | * 54 | * @param spark session 55 | * @param fs filesystem 56 | * @param source path 57 | * @param srcFormat format of source 58 | * @param rowCount expected row count 59 | * @return how long the load took 60 | */ 61 | def validateRowCount( 62 | spark: SparkSession, 63 | fs: FileSystem, 64 | source: Path, 65 | srcFormat: String, 66 | rowCount: Long): Long = { 67 | val success = new Path(source, GeneralCommitterConstants.SUCCESS_FILE_NAME) 68 | val status = fs.getFileStatus(success) 69 | assert(status.isDirectory || status.getBlockSize > 0, 70 | s"Block size 0 in $status") 71 | val files = listFiles(fs, source, true).filter { st => 72 | val name = st.getPath.getName 73 | st.isFile && !name.startsWith(".") && !name.startsWith("_") 74 | } 75 | assert(files.nonEmpty, s"No files in the directory $source") 76 | val (loadedCount, loadTime) = durationOf(loadDF(spark, source, srcFormat) 77 | .count()) 78 | logInfo(s"Loaded $source in $loadTime ns") 79 | require(rowCount == loadedCount, 80 | s"Expected $rowCount rows, but got $loadedCount from $source formatted as $srcFormat") 81 | loadTime 82 | } 83 | 84 | } 85 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/s3/S3ATestSetup.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3 19 | 20 | import java.net.URI 21 | 22 | import com.cloudera.spark.cloud.common.{CloudSuiteTrait, CsvDatasourceSupport} 23 | import org.apache.hadoop.conf.Configuration 24 | import org.apache.hadoop.fs.{FileSystem, Path} 25 | 26 | /** 27 | * Trait for S3A tests. 28 | */ 29 | trait S3ATestSetup extends CloudSuiteTrait with RandomIOPolicy with 30 | CsvDatasourceSupport { 31 | 32 | override def enabled: Boolean = { 33 | getConf.getBoolean(S3A_TESTS_ENABLED, false) && super.enabled 34 | 35 | } 36 | 37 | /** 38 | * This is *not* true, but is here to make sure the tests 39 | * fail the way they are meant to.
40 | * @return true if the store committer is expected to support dynamic partition overwrite 41 | */ 42 | override def dynamicPartitioning: Boolean = false 43 | 44 | def initFS(): FileSystem = { 45 | setupFilesystemConfiguration(getConf) 46 | createTestS3AFS 47 | } 48 | 49 | /** 50 | * Do the work of setting up the S3A test FS. 51 | * @return the filesystem 52 | */ 53 | protected def createTestS3AFS: FileSystem = { 54 | val s3aURI = new URI(requiredOption(S3A_TEST_URI)) 55 | logInfo(s"Executing S3 tests against $s3aURI with read policy $inputPolicy") 56 | createFilesystem(s3aURI) 57 | } 58 | 59 | /** 60 | * Override point: set up the configuration for the filesystem. 61 | * The base implementation sets up buffer directory, block size and IO policy. 62 | * @param config configuration to set up 63 | */ 64 | def setupFilesystemConfiguration(config: Configuration): Unit = { 65 | config.set(BUFFER_DIR, localTmpDir.getAbsolutePath) 66 | // a block size of 1MB 67 | config.set(BLOCK_SIZE, (1024 * 1024).toString) 68 | // the input policy 69 | config.set(INPUT_FADVISE, inputPolicy) 70 | } 71 | 72 | lazy val CSV_TESTFILE: Option[Path] = { 73 | val pathname = getConf.get(S3A_CSVFILE_PATH, S3A_CSV_PATH_DEFAULT) 74 | if (!pathname.isEmpty) Some(new Path(pathname)) else None 75 | } 76 | 77 | /** 78 | * Predicate to define whether or not there's a CSV file to work with. 79 | * @return true if the CSV test file is defined. 80 | */ 81 | override def hasCSVTestFile(): Boolean = CSV_TESTFILE.isDefined 82 | 83 | /** 84 | * Path to the CSV file's original source. 85 | * 86 | * @return a path 87 | */ 88 | override def sourceCSVFilePath: Option[Path] = CSV_TESTFILE 89 | } 90 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/commit/S3ACommitDataframeSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3.commit 19 | 20 | import com.cloudera.spark.cloud.CommitterBinding._ 21 | import com.cloudera.spark.cloud.committers.AbstractCommitDataframeSuite 22 | import com.cloudera.spark.cloud.s3.S3ATestSetup 23 | import com.cloudera.spark.cloud.CommitterInfo 24 | import org.apache.hadoop.fs.{FileSystem, Path} 25 | 26 | import org.apache.spark.sql.{Dataset, SparkSession} 27 | import org.apache.spark.SparkConf 28 | 29 | /** 30 | * Tests different data formats through the committers.
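 * The committers exercised are those named in the `committers` sequence
 * below: directory, partitioned and magic.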
31 | */ 32 | class S3ACommitDataframeSuite 33 | extends AbstractCommitDataframeSuite with S3ATestSetup { 34 | 35 | init() 36 | 37 | def init(): Unit = { 38 | // propagate S3 credentials 39 | if (enabled) { 40 | initFS() 41 | } 42 | } 43 | 44 | override def schema: String = "s3a" 45 | 46 | 47 | // there's an empty string at the end to aid with commenting out different 48 | // committers and not have to worry about any trailing commas 49 | override def committers: Seq[String] = Seq( 50 | DIRECTORY, 51 | PARTITIONED, 52 | MAGIC, 53 | "" 54 | ) 55 | 56 | 57 | override protected def setDynamicPartitioningOptions( 58 | sparkConf: SparkConf, 59 | committerInfo: CommitterInfo): Unit = { 60 | if (committerInfo.name == PARTITIONED) { 61 | hconf(sparkConf, S3A_CONFLICT_MODE, CONFLICT_MODE_REPLACE) 62 | } else { 63 | super 64 | .setDynamicPartitioningOptions(sparkConf, committerInfo) 65 | } 66 | } 67 | 68 | 69 | override protected def expectDynamicPartitioningToSucceed( 70 | committerInfo: CommitterInfo): Boolean = { 71 | committerInfo.name == PARTITIONED 72 | } 73 | 74 | override def anyOtherTests(spark: SparkSession, 75 | filesystem: FileSystem, 76 | subdir: Path, format: String, 77 | sourceData: Dataset[Event], 78 | eventData2: Dataset[Event], 79 | committerInfo: CommitterInfo): Unit = { 80 | if (committerInfo.name == PARTITIONED) { 81 | logInfo("Executing partitioned committer tests") 82 | // although the dynamic partition overwrite command doesn't work, 83 | // a normal query will trigger overwrite 84 | logDuration(s"overwrite dataset2 to $subdir in format $format") { 85 | eventData2 86 | .write 87 | .mode("overwrite") 88 | .partitionBy("year", "month") 89 | .format(format) 90 | .save(subdir.toString) 91 | } 92 | } 93 | 94 | 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /spark-cloud-integration/src/main/scala/com/cloudera/spark/cloud/CommitterBinding.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud 19 | 20 | import com.cloudera.spark.cloud.GeneralCommitterConstants.{ABFS_MANIFEST_COMMITTER_FACTORY, DEFAULT_COMMITTER_FACTORY, MANIFEST_COMMITTER_FACTORY, MANIFEST_COMMITTER_NAME} 21 | import org.apache.hadoop.fs.s3a.commit.CommitConstants 22 | 23 | /** 24 | * Constants related to the S3A committers. 25 | * Originally a copy & paste of the java values, it's now just a reference, 26 | * though retained to reserve the option of moving back to copied values.
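 * For example, {{{factoryForSchema("s3a")}}} builds the name of the
 * per-schema committer factory option for the "s3a" URI schema.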
27 | */ 28 | object CommitterBinding { 29 | 30 | def factoryForSchema(s: String): String = 31 | String.format( 32 | GeneralCommitterConstants.OUTPUTCOMMITTER_FACTORY_SCHEME_PATTERN, 33 | s) 34 | 35 | 36 | val S3A_SCHEME_COMMITTER_FACTORY: String = factoryForSchema("s3a") 37 | val STAGING_PACKAGE = "org.apache.hadoop.fs.s3a.commit.staging." 38 | val S3A_COMMITTER_FACTORY: String = CommitConstants.S3A_COMMITTER_FACTORY 39 | 40 | val S3A_COMMITTER_NAME: String = CommitConstants.FS_S3A_COMMITTER_NAME 41 | 42 | val MAGIC = "magic" 43 | val STAGING = "staging" 44 | val DIRECTORY = "directory" 45 | val PARTITIONED = "partitioned" 46 | val MANIFEST = "manifest" 47 | val MANIFEST_ABFS = "manifest_abfs" 48 | val FILE = "file" 49 | 50 | val S3A_CONFLICT_MODE: String = 51 | CommitConstants.FS_S3A_COMMITTER_STAGING_CONFLICT_MODE 52 | 53 | /** Conflict mode */ 54 | val CONFLICT_MODE_FAIL: String = "fail" 55 | 56 | val CONFLICT_MODE_APPEND: String = "append" 57 | 58 | val CONFLICT_MODE_REPLACE: String = "replace" 59 | 60 | /** 61 | * Maps each committer name to its CommitterInfo: the name recorded in _SUCCESS 62 | * and the factory classname. 63 | * If the name field is "", the committer doesn't put its name into 64 | * the success file (or the file isn't actually created). 65 | */ 66 | val COMMITTERS_BY_NAME: Map[String, CommitterInfo] = Map( 67 | MAGIC -> CommitterInfo(MAGIC, S3A_COMMITTER_FACTORY), 68 | STAGING -> CommitterInfo(STAGING, S3A_COMMITTER_FACTORY), 69 | DIRECTORY -> CommitterInfo(DIRECTORY, S3A_COMMITTER_FACTORY), 70 | PARTITIONED -> CommitterInfo(PARTITIONED, S3A_COMMITTER_FACTORY), 71 | MANIFEST -> CommitterInfo(MANIFEST_COMMITTER_NAME, 72 | MANIFEST_COMMITTER_FACTORY), 73 | MANIFEST_ABFS -> CommitterInfo(MANIFEST_COMMITTER_NAME, 74 | ABFS_MANIFEST_COMMITTER_FACTORY), 75 | FILE -> CommitterInfo("", DEFAULT_COMMITTER_FACTORY) 76 | ) 77 | 78 | } 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/common/CloudSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.common 19 | 20 | import java.io.{File, FileNotFoundException} 21 | 22 | import com.cloudera.spark.cloud.s3.S3AConstants 23 | import com.cloudera.spark.cloud.CommitterBinding 24 | import org.apache.hadoop.conf.Configuration 25 | import org.scalatest.concurrent.Eventually 26 | import org.scalatest.BeforeAndAfter 27 | 28 | import org.apache.spark.{LocalSparkContext, SparkFunSuite} 29 | import org.apache.spark.internal.Logging 30 | 31 | /** 32 | * A cloud suite.
33 | * Adds automatic loading of a Hadoop configuration file with login credentials and 34 | * options to enable/disable tests, and a mechanism to conditionally declare tests 35 | * based on these details. 36 | */ 37 | abstract class CloudSuite extends ContextFreeCloudSuite 38 | with LocalSparkContext { 39 | } 40 | 41 | object CloudSuite extends Logging with S3AConstants 42 | with CloudSuiteTrait { 43 | 44 | private var configLogged = false 45 | 46 | /** 47 | * Load the configuration file from the system property `SYSPROP_CLOUD_TEST_CONFIGURATION_FILE`. 48 | * Throws FileNotFoundException if a configuration is named but not present. 49 | * @return the configuration 50 | */ 51 | def loadConfiguration(): Configuration = { 52 | val config = new Configuration(true) 53 | getKnownSysprop(SYSPROP_CLOUD_TEST_CONFIGURATION_FILE).foreach { filename => 54 | logDebug(s"Configuration property = `$filename`") 55 | val f = new File(filename) 56 | if (f.exists()) { 57 | // unsynced, but it's only a log statement 58 | if (!configLogged) { 59 | configLogged = true 60 | logInfo(s"Loading configuration from $f") 61 | } 62 | config.addResource(f.toURI.toURL) 63 | } else { 64 | throw new FileNotFoundException(s"No file '$filename'" + 65 | s" declared in property $SYSPROP_CLOUD_TEST_CONFIGURATION_FILE") 66 | } 67 | } 68 | overlayConfiguration( 69 | config, 70 | Seq( 71 | HIVE_TESTS_DISABLED, 72 | REQUIRED_HADOOP_VERSION, 73 | SCALE_TEST_ENABLED, 74 | SCALE_TEST_SIZE_FACTOR, 75 | S3A_CLIENT_FACTORY_IMPL, 76 | S3A_COMMITTER_TEST_ENABLED, 77 | S3A_ENCRYPTION_KEY_1, 78 | S3A_ENCRYPTION_KEY_2 79 | ) 80 | ) 81 | 82 | // setup the committer from any property passed in 83 | getKnownSysprop(S3A_COMMITTER_NAME).foreach(committer => { 84 | val binding = CommitterBinding.COMMITTERS_BY_NAME(committer.toLowerCase()) 85 | binding.bind(config) 86 | logInfo(s"Using committer binding $binding") 87 | }) 88 | config 89 | } 90 | 91 | } 92 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/S3AEncryptionSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3 19 | 20 | import com.cloudera.spark.cloud.common.CloudSuite 21 | import org.apache.hadoop.conf.Configuration 22 | import org.apache.hadoop.fs._ 23 | 24 | /** 25 | * A suite of tests working with encryption. 26 | * Needs multiple encryption keys to work with.
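 * The suite only runs when both encryption keys are present in the test
 * configuration; see `enabled` below.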
27 | */ 28 | class S3AEncryptionSuite extends CloudSuite with S3ATestSetup { 29 | 30 | override def enabled: Boolean = { 31 | val conf = getConf 32 | super.enabled && hasConf(conf, S3A_ENCRYPTION_KEY_1) && 33 | hasConf(conf, S3A_ENCRYPTION_KEY_2) 34 | } 35 | 36 | init() 37 | 38 | def init(): Unit = { 39 | if (enabled) { 40 | initFS() 41 | } 42 | } 43 | 44 | override def setupFilesystemConfiguration(config: Configuration): Unit = { 45 | super.setupFilesystemConfiguration(config) 46 | config.set(SERVER_SIDE_ENCRYPTION_ALGORITHM, SSE_KMS) 47 | config.set(SERVER_SIDE_ENCRYPTION_KEY, config.getTrimmed(S3A_ENCRYPTION_KEY_1)) 48 | } 49 | 50 | /** 51 | * Create an FS with key2. 52 | */ 53 | def createKey2FS(): FileSystem = { 54 | val config = getConf 55 | config.set(SERVER_SIDE_ENCRYPTION_ALGORITHM, SSE_KMS) 56 | config.set(SERVER_SIDE_ENCRYPTION_KEY, config.getTrimmed(S3A_ENCRYPTION_KEY_2)) 57 | FileSystem.newInstance(filesystemURI, config) 58 | } 59 | 60 | /** 61 | * Create an FS with no encryption settings. 62 | */ 63 | def createUnencryptedFS(): FileSystem = { 64 | val config = getConf 65 | config.unset(SERVER_SIDE_ENCRYPTION_ALGORITHM) 66 | FileSystem.newInstance(filesystemURI, config) 67 | } 68 | 69 | ctest("TwoKeys", "read and write with two keys") { 70 | val key1 = filesystem.getConf.get(SERVER_SIDE_ENCRYPTION_KEY) 71 | logInfo(s"Test key 1 = $key1") 72 | 73 | val dir = path("TwoKeys") 74 | val key1File = new Path(dir, "key1") 75 | val hello: String = "hello" 76 | write(filesystem, key1File, hello) 77 | 78 | val fs2 = createKey2FS() 79 | val key2 = fs2.getConf.get(SERVER_SIDE_ENCRYPTION_KEY) 80 | logInfo(s"Test key 2 = $key2") 81 | assert(key1 != key2, "same key is used for both filesystems") 82 | 83 | val status = fs2.getFileStatus(key1File) 84 | assert(hello.length === status.getLen, s"wrong length in $status") 85 | 86 | fs2.listStatus(dir) 87 | val data = read(fs2, key1File, 128) 88 | assert(hello.length === data.length) 89 | assert(hello === data) 90 | 91 | val unencryptedFS = createUnencryptedFS() 92 | val dataUnencrypted = read(unencryptedFS, key1File, 128) 93 | assert(hello === dataUnencrypted) 94 | 95 | unencryptedFS.delete(key1File, false) 96 | fs2.delete(dir, true) 97 | 98 | } 99 | 100 | 101 | } 102 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/s3/S3ALineCountWritebackSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | 18 | package com.cloudera.spark.cloud.s3 19 | 20 | import scala.concurrent.duration._ 21 | import scala.language.postfixOps 22 | 23 | import com.cloudera.spark.cloud.common.CloudSuiteWithCSVDatasource 24 | import org.apache.hadoop.fs.{FileStatus, Path} 25 | 26 | /** 27 | * Test the `S3LineCount` entry point. 28 | */ 29 | class S3ALineCountWritebackSuite extends CloudSuiteWithCSVDatasource with S3ATestSetup { 30 | 31 | init() 32 | 33 | def init(): Unit = { 34 | // propagate S3 credentials 35 | if (enabled) { 36 | initFS() 37 | } 38 | } 39 | 40 | override def enabled: Boolean = super.enabled && hasCSVTestFile 41 | 42 | override def cleanFSInTeardownEnabled: Boolean = true 43 | 44 | after { 45 | cleanFilesystemInTeardown() 46 | } 47 | 48 | ctest("LineCountWriteback", 49 | "Execute the LineCount example with the results written back to the test filesystem.") { 50 | val sourceFile = getTestCSVPath() 51 | val sourceFS = sourceFile.getFileSystem(getConf) 52 | val sourceInfo = sourceFS.getFileStatus(sourceFile) 53 | val sparkConf = newSparkConf() 54 | sparkConf.setAppName("LineCount") 55 | val destDir = testPath(filesystem, "LineCountWriteback") 56 | assert(0 === S3ALineCount.action(sparkConf, 57 | Array(sourceFile.toString, destDir.toString))) 58 | 59 | 60 | val status = filesystem.getFileStatus(destDir) 61 | assert(status.isDirectory, s"Not a directory: $status") 62 | 63 | // only a small fraction of the source data is needed 64 | val expectedLen = sourceInfo.getLen / 1024 65 | 66 | def validateChildSize(qualifier: String, files: Seq[FileStatus]) = { 67 | val (filenames, size) = enumFileSize(destDir, files) 68 | logInfo(s"total size of $qualifier = $size bytes from ${files.length} files: $filenames") 69 | assert(size >= expectedLen, s"$qualifier size $size in files $filenames" + 70 | s" smaller than expected length $expectedLen") 71 | } 72 | 73 | val stdInterval = interval(100 milliseconds) 74 | eventually(timeout(20 seconds), stdInterval) { 75 | validateChildSize("descendants", 76 | listFiles(filesystem, destDir, true) 77 | .filter(f => f.getPath.getName != "_SUCCESS")) 78 | 79 | validateChildSize("children", 80 | filesystem.listStatus(destDir, 81 | pathFilter(p => p.getName != "_SUCCESS")).toSeq) 82 | } 83 | } 84 | 85 | private def enumFileSize(destDir: Path, files: Seq[FileStatus]): (String, Long) = { 86 | assert(files.nonEmpty, s"No files in destination directory $destDir") 87 | var size = 0L 88 | val filenames = new StringBuffer() 89 | files.foreach { f => 90 | size += f.getLen 91 | filenames.append(" ").append(f.getPath) 92 | } 93 | (filenames.toString, size) 94 | } 95 | 96 | } 97 | -------------------------------------------------------------------------------- /cloud-examples/src/main/scala/com/cloudera/spark/cloud/examples/AzureStreamingExample.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License.
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.examples 19 | 20 | import com.cloudera.spark.cloud.ObjectStoreExample 21 | import org.apache.hadoop.fs.Path 22 | 23 | import org.apache.spark.SparkConf 24 | import org.apache.spark.streaming.{Seconds, StreamingContext} 25 | 26 | /** 27 | * Simple example of streaming on Azure. 28 | */ 29 | class AzureStreamingExample extends ObjectStoreExample { 30 | 31 | /** 32 | * List of the command args for the current example. 33 | * @return a string 34 | */ 35 | override protected def usageArgs(): String = { 36 | "<dest> <delay> <interval>" 37 | } 38 | 39 | /** 40 | * Action to execute. 41 | * 42 | * @param sparkConf configuration to use 43 | * @param args argument array 44 | * @return an exit code 45 | */ 46 | override def action( 47 | sparkConf: SparkConf, 48 | args: Array[String]): Int = { 49 | if (args.length != 3) { 50 | return usage() 51 | } 52 | sparkConf.setAppName("CloudStreaming") 53 | applyObjectStoreConfigurationOptions(sparkConf, false) 54 | val dest = args(0) 55 | val delay = Integer.valueOf(args(1)) 56 | val interval = Integer.valueOf(args(2)) 57 | 58 | // Create the context, with the batch interval taken from the command line 59 | val streaming = new StreamingContext(sparkConf, Seconds(interval.toLong)) 60 | 61 | try { 62 | // Create the FileInputDStream on the directory regexp and use the 63 | // stream to look for a new file renamed into it 64 | val destPath = new Path(dest) 65 | val sc = streaming.sparkContext 66 | val hc = sc.hadoopConfiguration 67 | 68 | val fs = destPath.getFileSystem(hc) 69 | rm(fs, destPath) 70 | fs.mkdirs(destPath) 71 | 72 | val sightings = sc.longAccumulator("sightings") 73 | 74 | println("===================================") 75 | println(s"Looking for text files under ${destPath}") 76 | println("===================================") 77 | 78 | val lines = streaming.textFileStream(dest) 79 | 80 | val matches = lines.map(line => { 81 | sightings.add(1) 82 | println(s"[${sightings.value}]: $line") 83 | line 84 | }) 85 | 86 | // materialize the operation 87 | matches.print() 88 | 89 | // start the streaming 90 | streaming.start() 91 | 92 | // sleep a bit to get streaming up and running 93 | Thread.sleep(delay * 1000) 94 | println("===================================") 95 | println(s"Seen ${sightings.value} lines") 96 | 0 97 | } finally { 98 | streaming.stop(true) 99 | } 100 | } 101 | 102 | } 103 | 104 | object AzureStreamingExample { 105 | 106 | def main(args: Array[String]) { 107 | new AzureStreamingExample().run(args) 108 | } 109 | } 110 | 111 | 112 | -------------------------------------------------------------------------------- /cloud-examples/src/test/scala/com/cloudera/spark/cloud/common/SeekReadTests.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.cloudera.spark.cloud.common 19 | 20 | import org.apache.hadoop.fs.FileSystem 21 | 22 | /** 23 | * Tests reading in the CSV file using sequential and Random IO. 24 | */ 25 | class SeekReadTests extends CloudSuiteWithCSVDatasource { 26 | 27 | override def enabled: Boolean = super.enabled && hasCSVTestFile 28 | 29 | 30 | ctest("SeekReadFully", 31 | """Assess cost of seek and read operations. 32 | | When moving the cursor in an input stream, an HTTP connection may be closed and 33 | | then re-opened. This can be very expensive; tactics like streaming forwards instead 34 | | of seeking, and/or postponing movement until the following read ('lazy seek') try 35 | | to address this. Logging these operation times helps track performance. 36 | | This test also tries to catch out a regression, where a `close()` operation 37 | | is implemented through reading through the entire input stream. This is exhibited 38 | | in the time to `close()` while at offset 0 being `O(len(file))`. 39 | | 40 | | Note also the cost of `readFully()`; this method call is common inside libraries 41 | | like Orc and Parquet.""".stripMargin) { 42 | val (source, fs) = getCSVSourceAndFileSystem() 43 | FileSystem.clearStatistics() 44 | fs.getStorageStatistics.reset() 45 | val st = logDuration("stat") { 46 | fs.getFileStatus(source) 47 | } 48 | val in = logDuration("open") { 49 | fs.open(source) 50 | } 51 | def time[T](operation: String)(testFun: => T): T = { 52 | logInfo("") 53 | val r = logDuration(operation + s" [pos = ${in.getPos}]")(testFun) 54 | logInfo(s" ${in.getWrappedStream}") 55 | r 56 | } 57 | 58 | val eof = st.getLen 59 | 60 | time("read()") { 61 | assert(-1 !== in.read()) 62 | } 63 | time("seek(256)") { 64 | in.seek(256) 65 | } 66 | time("seek(256)") { 67 | in.seek(256) 68 | } 69 | time("seek(EOF-2)") { 70 | in.seek(eof - 2) 71 | } 72 | time("read()") { 73 | assert(-1 !== in.read()) 74 | } 75 | 76 | def readFully(offset: Long, len: Int): Unit = { 77 | time(s"readFully($offset, byte[$len])") { 78 | val bytes = new Array[Byte](len) 79 | in.readFully(offset, bytes) // reads the full range or throws EOFException 80 | } 81 | } 82 | readFully(1L, 1) 83 | readFully(1L, 256) 84 | readFully(eof - 350, 300) 85 | readFully(260L, 256) 86 | readFully(1024L, 256) 87 | readFully(1536L, 256) 88 | readFully(8192L, 1024) 89 | readFully(8192L + 1024 + 512, 1024) 90 | readFully(0L, 1024) 91 | readFully(eof - 1024, 1024) 92 | 93 | time("seek(getPos)") { 94 | in.seek(in.getPos()) 95 | } 96 | time("read()") { 97 | assert(-1 !== in.read()) 98 | } 99 | logDuration("close()") { 100 | in.close() 101 | } 102 | dumpFileSystemStatistics(fs.getStorageStatistics) 103 | 104 | } 105 | 106 | } 107 | --------------------------------------------------------------------------------