├── connector
│   ├── iterator
│   │   ├── dependency-reduced-pom.xml
│   │   └── src
│   │       ├── test
│   │       │   ├── resources/com/microsoft/accumulo/spark
│   │       │   │   ├── pyspark.lr.zip
│   │       │   │   ├── sentiment.zip
│   │       │   │   └── twitter.model.lr.zip
│   │       │   └── java/com/microsoft/accumulo/spark
│   │       │       ├── AvroUtil.java
│   │       │       ├── DefaultIteratorEnvironment.java
│   │       │       ├── AvroColumnPruningTest.java
│   │       │       ├── AvroMLeapTest.java
│   │       │       ├── AvroFilterTest.java
│   │       │       ├── AvroRowEncoderIteratorTest.java
│   │       │       ├── AvroRowTopLevelTest.java
│   │       │       └── AvroJuelTest.java
│   │       └── main
│   │           └── java/com/microsoft/accumulo/spark
│   │               ├── record
│   │               │   ├── RowBuilderCellConsumer.java
│   │               │   ├── RowBuilderType.java
│   │               │   ├── RowBuilderField.java
│   │               │   ├── AvroFastRecord.java
│   │               │   └── AvroSchemaBuilder.java
│   │               ├── util
│   │               │   └── StopWatch.java
│   │               ├── juel
│   │               │   ├── AvroUtf8Wrapper.java
│   │               │   ├── AvroELContext.java
│   │               │   ├── expressions
│   │               │   │   ├── RowKeyVariableExpression.java
│   │               │   │   ├── AvroObjectExpression.java
│   │               │   │   └── AvroVariableExpression.java
│   │               │   ├── AvroVariableMapper.java
│   │               │   └── AvroResolver.java
│   │               └── processors
│   │                   ├── AvroRowConsumer.java
│   │                   ├── AvroRowSerializer.java
│   │                   ├── AvroRowFilter.java
│   │                   └── AvroRowComputedColumns.java
│   ├── integration-test
│   │   ├── src/test/resources
│   │   │   ├── samplenullable.txt
│   │   │   ├── sample.txt
│   │   │   └── sample_more.txt
│   │   └── pom.xml
│   ├── zipfs
│   │   ├── src
│   │   │   ├── test
│   │   │   │   ├── resources/com/microsoft/accumulo/zipfs
│   │   │   │   │   └── sample.zip
│   │   │   │   └── java/com/microsoft/accumulo/zipfs
│   │   │   │       └── JimfsZipfsTest.java
│   │   │   └── main/java/com/microsoft/accumulo/zipfs
│   │   │       ├── ZipFileAttributes.java
│   │   │       ├── ZipPosixFileAttributeView.java
│   │   │       ├── ZipDirectoryStream.java
│   │   │       ├── ZipFileStore.java
│   │   │       ├── ZipCoder.java
│   │   │       ├── ZipFileAttributeView.java
│   │   │       └── ByteArrayChannel.java
│   │   └── pom.xml
│   ├── datasource
│   │   └── src
│   │       ├── test
│   │       │   ├── resources/com/microsoft/accumulo
│   │       │   │   └── sentiment.zip
│   │       │   └── scala/com/microsoft/accumulo
│   │       │       ├── VerifyMleapSchema.scala
│   │       │       ├── VerifyFilterToJuel.scala
│   │       │       └── VerifyAccumuloSchema.scala
│   │       └── main/scala/com/microsoft/accumulo
│   │           ├── DefaultSource.scala
│   │           ├── FilterToJuel.scala
│   │           ├── MLeapUtil.scala
│   │           ├── AccumuloDataSourceWriter.scala
│   │           ├── AccumuloDataWriter.scala
│   │           ├── AvroUtil.scala
│   │           ├── AccumuloInputPartitionReader.scala
│   │           └── AccumuloDataSourceReader.scala
│   ├── publish
│   │   ├── settings.xml
│   │   └── publish.sh
│   └── README.md
├── .github
│   └── workflows
│       └── maven.yml
├── CODE_OF_CONDUCT.md
├── OpenSource
│   └── JDK-ZipFileSystem
│       └── README.md
├── azure-pipelines.yml
├── SECURITY.md
├── README.md
└── .gitignore

--------------------------------------------------------------------------------
/connector/iterator/dependency-reduced-pom.xml:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/connector/integration-test/src/test/resources/samplenullable.txt:
--------------------------------------------------------------------------------
key,label,text,count
r0,0.0,,5
,1.0,this is good,
r2,,we don't know yet,2
--------------------------------------------------------------------------------
/connector/integration-test/src/test/resources/sample.txt:
--------------------------------------------------------------------------------
key,label,text,count
r0,0.0,this is bad,5
r1,1.0,this is good,3
r2,0.0,we don't know yet,2
--------------------------------------------------------------------------------
/connector/integration-test/src/test/resources/sample_more.txt:
--------------------------------------------------------------------------------
key,label,text,count
r3,0.0,this is still bad,5
r4,1.0,this is still good,3
r5,0.0,we still don't know yet,2
--------------------------------------------------------------------------------
/connector/zipfs/src/test/resources/com/microsoft/accumulo/zipfs/sample.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/masc/HEAD/connector/zipfs/src/test/resources/com/microsoft/accumulo/zipfs/sample.zip
--------------------------------------------------------------------------------
/connector/datasource/src/test/resources/com/microsoft/accumulo/sentiment.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/masc/HEAD/connector/datasource/src/test/resources/com/microsoft/accumulo/sentiment.zip
--------------------------------------------------------------------------------
/connector/iterator/src/test/resources/com/microsoft/accumulo/spark/pyspark.lr.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/masc/HEAD/connector/iterator/src/test/resources/com/microsoft/accumulo/spark/pyspark.lr.zip
--------------------------------------------------------------------------------
/connector/iterator/src/test/resources/com/microsoft/accumulo/spark/sentiment.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/masc/HEAD/connector/iterator/src/test/resources/com/microsoft/accumulo/spark/sentiment.zip
--------------------------------------------------------------------------------
/connector/iterator/src/test/resources/com/microsoft/accumulo/spark/twitter.model.lr.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/masc/HEAD/connector/iterator/src/test/resources/com/microsoft/accumulo/spark/twitter.model.lr.zip
--------------------------------------------------------------------------------
/.github/workflows/maven.yml:
--------------------------------------------------------------------------------
name: Java CI

on: [push]

jobs:
  build:

    runs-on: ubuntu-16.04

    steps:
    - uses: actions/checkout@v1
    - name: Set up JDK 1.8
      uses: actions/setup-java@v1
      with:
        java-version: 1.8
    - name: Build with Maven
      run: mvn -B package --file connector/pom.xml
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
# Microsoft Open Source Code of Conduct

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).

Resources:

- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
--------------------------------------------------------------------------------
/connector/publish/settings.xml:
--------------------------------------------------------------------------------
<settings xmlns="http://maven.apache.org/SETTINGS/1.0.0"
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/SETTINGS/1.0.0
                              https://maven.apache.org/xsd/settings-1.0.0.xsd">
  <servers>
    <server>
      <id>ossrh</id>
      <username>${ossrh.username}</username>
      <password>${ossrh.password}</password>
    </server>
  </servers>
</settings>
--------------------------------------------------------------------------------
/OpenSource/JDK-ZipFileSystem/README.md:
--------------------------------------------------------------------------------
# Open Source Information

## JRE ZipFileSystem Component
Code found under [connector/zipfs](/connector/zipfs) originates from [https://github.com/openjdk/jdk/blob/515db21790d589cf636ec8b6592b865ca492e887/src/jdk.zipfs/share/classes/jdk/nio/zipfs/ZipFileSystem.java](https://github.com/openjdk/jdk/blob/515db21790d589cf636ec8b6592b865ca492e887/src/jdk.zipfs/share/classes/jdk/nio/zipfs/ZipFileSystem.java).
It was modified to overcome issues with Apache Spark and Accumulo.

The commit version used was d948bfd.

See [NOTICE](./NOTICE) for license information.
--------------------------------------------------------------------------------
/connector/iterator/src/main/java/com/microsoft/accumulo/spark/record/RowBuilderCellConsumer.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.accumulo.spark.record;

import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Value;

/**
 * Called for each cell found as the iterator goes over the data.
 * Implementations decode the cell data and move it into the Avro record.
 */
public interface RowBuilderCellConsumer {
  void consume(Key key, Value value);
}
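
Since `RowBuilderCellConsumer` has a single abstract method, implementations can be supplied as lambdas. A hypothetical sketch (the map and the lexicoder choice here are illustrative, not part of this repository):

```java
import java.util.HashMap;
import java.util.Map;

import com.microsoft.accumulo.spark.record.RowBuilderCellConsumer;
import org.apache.accumulo.core.client.lexicoder.DoubleLexicoder;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Value;

public class ExampleCellConsumer {
  public static void main(String[] args) {
    Map<String, Double> decoded = new HashMap<>();
    DoubleLexicoder doubleLexicoder = new DoubleLexicoder();

    // a consumer that decodes each cell value as a lexicoded double,
    // keyed by the cell's column qualifier
    RowBuilderCellConsumer consumer = (Key key, Value value) ->
        decoded.put(key.getColumnQualifier().toString(),
                    doubleLexicoder.decode(value.get()));
  }
}
```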
--------------------------------------------------------------------------------
/connector/publish/publish.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# template for username/password for sonatype repository server
cp settings.xml ~/.m2/settings.xml

# ## generate gpg
# gpg --full-generate-key
# ## send to gpg server
# gpg --keyserver pool.sks-keyservers.net --send-key 2E5260D120241F6F8E35370D293C0...
# ## import signing key
# ## this is how to export them
# gpg --export-secret-keys 'Markus Cozowicz (com.microsoft.accumulo) ' | base64 -w 0
echo $ossrh_gpg | base64 -d | gpg --import -

MAVEN_OPTS="verify gpg:sign deploy:deploy -Dmaven.test.skip=true -DskipITs -Dossrh.username=$ossrh_username -Dossrh.password=$ossrh_password"

# to use the snapshot from oss.sonatype.org
# * add http://oss.sonatype.org/content/repositories/snapshots
# * reference com.microsoft.accumulo:accumulo-spark-connector:1.0.0
# * reference com.microsoft.accumulo:accumulo-spark-datasource:1.0.0
#
# more details at https://stackoverflow.com/questions/7715321/how-to-download-snapshot-version-from-maven-snapshot-repository

# For a proper release:
# * remove -SNAPSHOT in pom.xml
# * visit https://oss.sonatype.org/#stagingRepositories and "close & release" the staged .jar
# * see https://oss.sonatype.org/#stagingRepositories
mvn -f ../pom.xml install
mvn -f ../pom.xml $MAVEN_OPTS -N # don't recurse
mvn -f ../datasource/pom.xml $MAVEN_OPTS -DshadedArtifactAttached=false
mvn -f ../iterator/pom.xml $MAVEN_OPTS -DshadedArtifactAttached=false
--------------------------------------------------------------------------------
/connector/datasource/src/test/scala/com/microsoft/accumulo/VerifyMleapSchema.scala:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.accumulo

import org.junit.runner.RunWith
import org.scalatest.FunSuite
import org.scalatest.junit.JUnitRunner
import com.google.common.io.Resources
import java.util.Base64

import org.apache.spark.sql.types.{DataTypes, StructField}

@RunWith(classOf[JUnitRunner])
class VerifyMleapSchema extends FunSuite {
  test("Validate mleap schema extraction") {
    val mleapBundle = Resources.toByteArray(classOf[VerifyMleapSchema].getResource("sentiment.zip"))
    val mleapBundleBase64 = Base64.getEncoder().encodeToString(mleapBundle)

    val fields = MLeapUtil.mleapSchemaToCatalyst(mleapBundleBase64)

    assert(Seq(StructField("prediction", DataTypes.DoubleType, false)) == fields)
  }
}
--------------------------------------------------------------------------------
/azure-pipelines.yml:
--------------------------------------------------------------------------------
pr:
- master

stages:
- stage: Compliance
  jobs:
  - job:
    steps:
    - task: ComponentGovernanceComponentDetection@0
      inputs:
        scanType: 'Register'
        verbosity: 'Verbose'
        alertWarningLevel: 'High'

- stage: AccumuloSparkConnector
  jobs:
  - job:
    pool:
      vmImage: 'ubuntu-16.04'
    steps:
    - task: Maven@3
      displayName: 'Accumulo Spark Connector components'
      inputs:
        mavenPomFile: 'connector/pom.xml'
        javaHomeOption: 'JDKVersion'
        jdkVersionOption: '1.8'
        jdkArchitectureOption: 'x64'
        publishJUnitResults: true
        testResultsFiles: '**/TEST-*.xml'
        goals: 'package'
        options: '-B' # batch mode for non-interactive release

    - task: PublishPipelineArtifact@1
      inputs:
        targetPath: connector/iterator/target/microsoft-accumulo-spark-iterator-1.0.4-shaded.jar
        artifactName: accumulo-spark-iterator

    - task: PublishPipelineArtifact@1
      inputs:
        targetPath: connector/datasource/target/microsoft-accumulo-spark-datasource-1.0.4-shaded.jar
        artifactName: accumulo-spark-datasource

    - bash: cd connector/publish && ./publish.sh
      displayName: Publish to Sonatype
      condition: variables['ossrh_gpg']
      env:
        # these are credentials used to publish to oss.sonatype.org
        # credentials are stored as secrets in the build definition
        ossrh_gpg: $(ossrh_gpg)
        ossrh_username: $(ossrh_username)
        ossrh_password: $(ossrh_password)
--------------------------------------------------------------------------------
/connector/iterator/src/test/java/com/microsoft/accumulo/spark/AvroUtil.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.accumulo.spark;

import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;

import org.apache.accumulo.core.data.ByteSequence;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.specific.SpecificDatumReader;

public class AvroUtil {
  public static final Collection<ByteSequence> EMPTY_SET = new HashSet<>();

  public static GenericRecord deserialize(byte[] data, Schema schema) throws IOException {
    SpecificDatumReader<GenericRecord> reader = new SpecificDatumReader<>(schema);
    BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(data, null);

    return reader.read(null, decoder);
  }
}
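
A minimal sketch of round-tripping a record through this helper, using only stock Avro APIs; the schema and field names are made up for illustration:

```java
import java.io.ByteArrayOutputStream;

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryEncoder;
import org.apache.avro.io.EncoderFactory;

public class AvroRoundTrip {
  public static void main(String[] args) throws Exception {
    Schema schema = SchemaBuilder.record("row").fields()
        .requiredString("text")
        .requiredDouble("label")
        .endRecord();

    GenericRecord record = new GenericData.Record(schema);
    record.put("text", "this is good");
    record.put("label", 1.0);

    // serialize with the stock Avro binary encoder...
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(out, null);
    new GenericDatumWriter<GenericRecord>(schema).write(record, encoder);
    encoder.flush();

    // ...and read it back through the test helper above
    GenericRecord copy = AvroUtil.deserialize(out.toByteArray(), schema);
    System.out.println(copy.get("text")); // this is good
  }
}
```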
--------------------------------------------------------------------------------
/connector/zipfs/src/main/java/com/microsoft/accumulo/zipfs/ZipFileAttributes.java:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package com.microsoft.accumulo.zipfs;

import java.nio.file.attribute.BasicFileAttributes;
import java.nio.file.attribute.PosixFilePermission;
import java.util.Optional;
import java.util.Set;

/**
 * The attributes of a file stored in a zip file.
 *
 * @author Xueming Shen, Rajendra Gutupalli, Jaya Hangal
 */
interface ZipFileAttributes extends BasicFileAttributes {
  long compressedSize();
  long crc();
  int method();
  byte[] extra();
  byte[] comment();
  Optional<Set<PosixFilePermission>> storedPermissions();
}
--------------------------------------------------------------------------------
/connector/datasource/src/main/scala/com/microsoft/accumulo/DefaultSource.scala:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.accumulo

import java.util.Optional

import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.sources.v2.reader.DataSourceReader
import org.apache.spark.sql.sources.v2.writer.DataSourceWriter
import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, ReadSupport, WriteSupport}
import org.apache.spark.sql.types.StructType

class DefaultSource extends DataSourceV2 with ReadSupport with WriteSupport {

  override def createReader(schema: StructType, options: DataSourceOptions): DataSourceReader = {
    new AccumuloDataSourceReader(schema, options)
  }

  override def createReader(options: DataSourceOptions): DataSourceReader = {
    throw new UnsupportedOperationException("Must supply schema")
  }

  override def createWriter(jobId: String,
                            schema: StructType,
                            mode: SaveMode,
                            options: DataSourceOptions): Optional[DataSourceWriter] = {
    Optional.of(new AccumuloDataSourceWriter(schema, mode, options))
  }
}
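
For reference, this is roughly how the schema requirement above surfaces on the Spark side. A minimal Java sketch; the connection options are omitted, so this is illustrative rather than runnable against a real cluster:

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.StructType;

public class ReadExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().getOrCreate();

    StructType schema = new StructType()
        .add("label", "double")
        .add("text", "string");

    // works: createReader(schema, options) is invoked
    Dataset<Row> df = spark.read()
        .format("com.microsoft.accumulo")
        .schema(schema)
        .load();

    // without .schema(...) the source throws
    // UnsupportedOperationException("Must supply schema")
  }
}
```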
--------------------------------------------------------------------------------
/connector/iterator/src/main/java/com/microsoft/accumulo/spark/util/StopWatch.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.accumulo.spark.util;

/**
 * Stop watch functionality optimized for a large number of segments.
 */
public class StopWatch {
  private long start;
  private double avg;
  private long n = 1;

  /**
   * Start recording a new segment. If stop() is not called subsequently this
   * cancels the previous run.
   */
  public void start() {
    this.start = System.nanoTime();
  }

  /**
   * Stops the current run.
   */
  public void stop() {
    double time = System.nanoTime() - this.start;

    // see
    // https://stackoverflow.com/questions/1930454/what-is-a-good-solution-for-calculating-an-average-where-the-sum-of-all-values-e
    this.avg += (time - this.avg) / n;

    // important that we only count here as callers might repeatedly call start()
    // when a run was cancelled
    this.n++;
  }

  /**
   * @return average segment time in microseconds.
   */
  public double getAverage() {
    return this.avg / 1000;
  }

  /**
   * @return the number of segments (= stop() calls).
   */
  public long getN() {
    // n is seeded with 1 to keep the running average well-defined,
    // so subtract one to report the actual number of stop() calls
    return this.n - 1;
  }
}
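
A usage sketch for the stop watch; the timed loop body is a placeholder:

```java
import com.microsoft.accumulo.spark.util.StopWatch;

public class StopWatchExample {
  public static void main(String[] args) {
    StopWatch watch = new StopWatch();

    for (int i = 0; i < 1000; i++) {
      watch.start();
      // ... segment under measurement ...
      watch.stop();
    }

    // getAverage() reports microseconds (avg is accumulated in nanoseconds)
    System.out.println(watch.getAverage() + " us over " + watch.getN() + " segments");
  }
}
```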
--------------------------------------------------------------------------------
/connector/iterator/src/main/java/com/microsoft/accumulo/spark/juel/AvroUtf8Wrapper.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.accumulo.spark.juel;

import java.nio.charset.StandardCharsets;

import org.apache.avro.util.Utf8;

/**
 * Wraps the zero-copy Avro Utf8 class and extends it with string filter support.
 */
public class AvroUtf8Wrapper extends Utf8 {

  // lazily initialized string
  private String string;

  public AvroUtf8Wrapper(byte[] data) {
    super(data);
  }

  public String getString() {
    if (this.string == null) {
      byte[] bytes = getBytes();
      this.string = new String(bytes, 0, bytes.length, StandardCharsets.UTF_8);
    }

    return this.string;
  }

  public boolean endsWith(String postfix) {
    return getString().endsWith(postfix);
  }

  public boolean startsWith(String prefix) {
    return getString().startsWith(prefix);
  }

  public boolean contains(String text) {
    return getString().contains(text);
  }

  @Override
  public boolean equals(Object other) {
    if (other instanceof String) {
      return getString().equals(other);
    }

    return super.equals(other);
  }
}
--------------------------------------------------------------------------------
/connector/zipfs/src/test/java/com/microsoft/accumulo/zipfs/JimfsZipfsTest.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.accumulo.zipfs;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import com.google.common.jimfs.Configuration;
import com.google.common.jimfs.Jimfs;
import com.google.common.io.Resources;
import java.nio.file.*;
import org.junit.Test;
import java.util.*;

public class JimfsZipfsTest {
  @Test
  public void testJimfsAndZipFs() throws Exception {
    // read .zip file into memory
    byte[] data = Resources.toByteArray(JimfsZipfsTest.class.getResource("sample.zip"));

    // create in-memory filesystem
    FileSystem fs = Jimfs.newFileSystem(Configuration.unix());
    Path sampleFilePath = fs.getPath("/sample.zip");

    Files.write(sampleFilePath, data, StandardOpenOption.CREATE);

    // get zip file system
    ZipFileSystem zfs = new ZipFileSystem(new ZipFileSystemProvider(), sampleFilePath, new HashMap<>());

    Path pathInZip = zfs.getPath("/sample.txt");

    List<String> lines = Files.readAllLines(pathInZip);

    assertEquals(1, lines.size());
    assertEquals("Hello World", lines.get(0));
  }
}
--------------------------------------------------------------------------------
/connector/iterator/src/test/java/com/microsoft/accumulo/spark/DefaultIteratorEnvironment.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.microsoft.accumulo.spark;

import java.io.IOException;

import org.apache.accumulo.core.conf.AccumuloConfiguration;
import org.apache.accumulo.core.conf.DefaultConfiguration;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.iterators.IteratorEnvironment;
import org.apache.accumulo.core.iterators.SortedKeyValueIterator;
import org.apache.accumulo.core.iterators.system.MapFileIterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

public class DefaultIteratorEnvironment implements IteratorEnvironment {

  AccumuloConfiguration conf;
  Configuration hadoopConf = new Configuration();

  public DefaultIteratorEnvironment(AccumuloConfiguration conf) {
    this.conf = conf;
  }

  public DefaultIteratorEnvironment() {
    this.conf = DefaultConfiguration.getInstance();
  }

  @Deprecated
  @Override
  public SortedKeyValueIterator<Key, Value> reserveMapFileReader(String mapFileName) throws IOException {
    FileSystem fs = FileSystem.get(hadoopConf);
    return new MapFileIterator(fs, mapFileName, hadoopConf);
  }

  @Override
  public boolean isSamplingEnabled() {
    return false;
  }
}
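
A sketch of the test pattern this environment enables: Accumulo's `SortedMapIterator` serves in-memory data as the parent source, so a server-side iterator can be driven without a tablet server. The iterator instance passed in is left abstract here since the concrete iterator class is not shown in this listing:

```java
package com.microsoft.accumulo.spark;

import java.util.HashMap;
import java.util.TreeMap;

import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.iterators.SortedKeyValueIterator;
import org.apache.accumulo.core.iterators.SortedMapIterator;

public class IteratorHarnessSketch {
  public static void run(SortedKeyValueIterator<Key, Value> iteratorUnderTest) throws Exception {
    TreeMap<Key, Value> data = new TreeMap<>();
    data.put(new Key("r0", "cf1", "text"), new Value("this is bad".getBytes()));

    // DefaultIteratorEnvironment stands in for the tablet server environment
    iteratorUnderTest.init(new SortedMapIterator(data), new HashMap<>(), new DefaultIteratorEnvironment());
    iteratorUnderTest.seek(new Range(), AvroUtil.EMPTY_SET, true);

    while (iteratorUnderTest.hasTop()) {
      System.out.println(iteratorUnderTest.getTopKey() + " -> " + iteratorUnderTest.getTopValue());
      iteratorUnderTest.next();
    }
  }
}
```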
--------------------------------------------------------------------------------
/connector/iterator/src/main/java/com/microsoft/accumulo/spark/juel/AvroELContext.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.accumulo.spark.juel;

import javax.el.ELContext;
import javax.el.ELResolver;
import javax.el.FunctionMapper;
import javax.el.VariableMapper;

import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.io.Text;

/**
 * Exposes an Avro GenericRecord as an Expression Language (EL) context for
 * filtering and column computation.
 */
public class AvroELContext extends ELContext {

  private IndexedRecord avroRecord;
  private Text rowKey;
  private VariableMapper variableMapper;
  private ELResolver resolver;

  public AvroELContext(Schema schema) {
    variableMapper = new AvroVariableMapper(schema);
    resolver = new AvroResolver();
  }

  @Override
  public ELResolver getELResolver() {
    return resolver;
  }

  @Override
  public FunctionMapper getFunctionMapper() {
    return null;
  }

  @Override
  public VariableMapper getVariableMapper() {
    return variableMapper;
  }

  public IndexedRecord getAvroRecord() {
    return avroRecord;
  }

  public Text getRowKey() {
    return rowKey;
  }

  public void setCurrent(Text rowKey, IndexedRecord avroRecord) {
    this.rowKey = rowKey;
    this.avroRecord = avroRecord;
  }
}
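
A hedged sketch of how a JUEL expression could be evaluated against this context. The variable names `label` and `rowKey` are assumptions about what `AvroVariableMapper` exposes (suggested by `RowKeyVariableExpression`, but not confirmed in this listing):

```java
import javax.el.ExpressionFactory;
import javax.el.ValueExpression;

import com.microsoft.accumulo.spark.juel.AvroELContext;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.hadoop.io.Text;

public class JuelFilterSketch {
  public static void main(String[] args) {
    Schema schema = SchemaBuilder.record("row").fields()
        .requiredDouble("label").endRecord();

    GenericData.Record record = new GenericData.Record(schema);
    record.put("label", 1.0);

    // bind the current row key and record into the EL context
    AvroELContext context = new AvroELContext(schema);
    context.setCurrent(new Text("r1"), record);

    // "label" and "rowKey" are assumed variable names, not confirmed here
    ExpressionFactory factory = ExpressionFactory.newInstance();
    ValueExpression expr = factory.createValueExpression(
        context, "${label > 0.5 && rowKey == 'r1'}", boolean.class);

    System.out.println(expr.getValue(context)); // expected: true
  }
}
```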
--------------------------------------------------------------------------------
/connector/zipfs/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Licensed to the Apache Software Foundation (ASF) under one or more
  contributor license agreements. See the NOTICE file distributed with
  this work for additional information regarding copyright ownership.
  The ASF licenses this file to You under the Apache License, Version 2.0
  (the "License"); you may not use this file except in compliance with
  the License. You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <parent>
    <groupId>com.microsoft.masc</groupId>
    <artifactId>microsoft-accumulo-spark</artifactId>
    <version>1.0.4</version>
  </parent>
  <groupId>com.microsoft.masc</groupId>
  <artifactId>microsoft-accumulo-spark-zipfs</artifactId>
  <version>1.0.4</version>
  <name>Zip File System</name>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>
  <dependencies>
    <dependency>
      <groupId>com.google.guava</groupId>
      <artifactId>guava</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>com.google.jimfs</groupId>
      <artifactId>jimfs</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <scope>test</scope>
    </dependency>
  </dependencies>
  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-enforcer-plugin</artifactId>
      </plugin>
    </plugins>
  </build>
</project>
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
## Security

Microsoft takes the security of our software products and services seriously. This includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [many more](https://opensource.microsoft.com/).

If you believe you have found a security vulnerability in any Microsoft-owned repository that meets Microsoft's [definition](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)) of a security vulnerability, please report it to us as described below.

## Reporting Security Issues

**Please do not report security vulnerabilities through public GitHub issues.** Instead, please report them to the Microsoft Security Response Center at [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://technet.microsoft.com/en-us/security/dn606155).

You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).

Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:

* Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
* Full paths of source file(s) related to the manifestation of the issue
* The location of the affected source code (tag/branch/commit or direct URL)
* Any special configuration required to reproduce the issue
* Step-by-step instructions to reproduce the issue
* Proof-of-concept or exploit code (if possible)
* Impact of the issue, including how an attacker might exploit the issue

This information will help us triage your report more quickly.

## Preferred Languages

We prefer all communications to be in English.

## Policy

Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd).
--------------------------------------------------------------------------------
/connector/iterator/src/main/java/com/microsoft/accumulo/spark/juel/expressions/RowKeyVariableExpression.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.accumulo.spark.juel.expressions;

import javax.el.ELContext;
import javax.el.ELException;
import javax.el.ValueExpression;

import com.microsoft.accumulo.spark.juel.AvroELContext;
import org.apache.hadoop.io.Text;

/**
 * JUEL ValueExpression resolving to the row key.
 */
public class RowKeyVariableExpression extends ValueExpression {

  public static final RowKeyVariableExpression INSTANCE = new RowKeyVariableExpression();

  private static final long serialVersionUID = 1L;

  @Override
  public Class<?> getExpectedType() {
    return String.class;
  }

  @Override
  public Class<?> getType(ELContext context) {
    return String.class;
  }

  @Override
  public Object getValue(ELContext context) {
    Text text = ((AvroELContext) context).getRowKey();

    return text.toString();
  }

  @Override
  public boolean isReadOnly(ELContext context) {
    return true;
  }

  @Override
  public void setValue(ELContext context, Object value) {
    throw new ELException("setValue not supported");
  }

  @Override
  public boolean equals(Object obj) {
    return obj instanceof RowKeyVariableExpression;
  }

  @Override
  public String getExpressionString() {
    throw new ELException("getExpressionString() is not supported");
  }

  @Override
  public int hashCode() {
    return 42;
  }

  @Override
  public boolean isLiteralText() {
    return false;
  }
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Microsoft MASC, an Apache Spark connector for Apache Accumulo

The goal of this repository is to facilitate the use of [Apache Spark](https://spark.apache.org/) and its machine learning ecosystem with [Apache Accumulo](https://accumulo.apache.org/) as an external data source.

# Contents
- The [connector](connector) provides connectivity to read from / write to Accumulo using Spark. See the [README](connector/README.md) for more details about supported functionality.

# Contributing

This project welcomes contributions and suggestions. Most contributions require you to agree to a
Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.

When you submit a pull request, a CLA bot will automatically determine whether you need to provide
a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
provided by the bot. You will only need to do this once across all repos using our CLA.

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

# Build

[![Build Status](https://dev.azure.com/AZGlobal/Azure%20Global%20CAT%20Engineering/_apis/build/status/AGCE%20AI/Web%20Scale%20AI/microsoft.Accumulo?branchName=master)](https://dev.azure.com/AZGlobal/Azure%20Global%20CAT%20Engineering/_build/latest?definitionId=84&branchName=master)
[![Maven Central](https://maven-badges.herokuapp.com/maven-central/com.microsoft.masc/accumulo-spark-datasource/badge.svg)](https://maven-badges.herokuapp.com/maven-central/com.microsoft.masc/accumulo-spark-datasource)
[![Maven Central](https://maven-badges.herokuapp.com/maven-central/com.microsoft.masc/accumulo-spark-iterator/badge.svg)](https://maven-badges.herokuapp.com/maven-central/com.microsoft.masc/accumulo-spark-iterator)

# License
All code provided, except where otherwise documented in [OpenSource](OpenSource) and [NOTICE](NOTICE), is covered by the [Apache License 2.0](LICENSE).

# Trademarks

Apache®, [Apache Spark](https://spark.apache.org/), [Apache Accumulo](https://accumulo.apache.org/) and Accumulo are either registered trademarks or trademarks of the [Apache Software Foundation](https://www.apache.org/) in the United States and/or other countries.
--------------------------------------------------------------------------------
/connector/iterator/src/main/java/com/microsoft/accumulo/spark/juel/expressions/AvroObjectExpression.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.accumulo.spark.juel.expressions;

import javax.el.ELContext;
import javax.el.ELException;
import javax.el.ValueExpression;

import com.microsoft.accumulo.spark.juel.AvroELContext;
import org.apache.avro.Schema.Field;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.generic.IndexedRecord;

/**
 * JUEL ValueExpression resolving to a nested record.
 */
public class AvroObjectExpression extends ValueExpression {

  private static final long serialVersionUID = 1L;

  private Field field;

  public AvroObjectExpression(Field field) {
    this.field = field;
  }

  @Override
  public Class<?> getExpectedType() {
    return Record.class;
  }

  @Override
  public Class<?> getType(ELContext context) {
    return Record.class;
  }

  @Override
  public Object getValue(ELContext context) {
    IndexedRecord record = ((AvroELContext) context).getAvroRecord();

    return record.get(this.field.pos());
  }

  @Override
  public boolean isReadOnly(ELContext context) {
    return true;
  }

  @Override
  public void setValue(ELContext context, Object value) {
    throw new ELException("setValue not supported");
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof AvroObjectExpression))
      return false;

    AvroObjectExpression other = (AvroObjectExpression) obj;

    return this.field.equals(other.field);
  }

  @Override
  public String getExpressionString() {
    throw new ELException("getExpressionString() is not supported");
  }

  @Override
  public int hashCode() {
    return this.field.hashCode();
  }

  @Override
  public boolean isLiteralText() {
    return false;
  }
}
--------------------------------------------------------------------------------
/connector/iterator/src/main/java/com/microsoft/accumulo/spark/record/RowBuilderType.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.accumulo.spark.record;

import java.util.Date;

import org.apache.accumulo.core.client.lexicoder.DateLexicoder;
import org.apache.accumulo.core.client.lexicoder.DoubleLexicoder;
import org.apache.accumulo.core.client.lexicoder.Encoder;
import org.apache.accumulo.core.client.lexicoder.FloatLexicoder;
import org.apache.accumulo.core.client.lexicoder.IntegerLexicoder;
import org.apache.accumulo.core.client.lexicoder.LongLexicoder;
import org.apache.accumulo.core.client.lexicoder.StringLexicoder;

/**
 * Links a JSON type string with its Java class and Lexicoder.
 */
public enum RowBuilderType {
  String(String.class, new StringLexicoder()), Integer(int.class, new IntegerLexicoder()),
  Long(long.class, new LongLexicoder()), Float(float.class, new FloatLexicoder()),
  Double(double.class, new DoubleLexicoder()), Date(Date.class, new DateLexicoder()), Boolean(boolean.class, null),
  Bytes(byte[].class, new ByteEncoder()), Unknown(null, null);

  private static class ByteEncoder implements Encoder<byte[]> {

    @Override
    public byte[] encode(byte[] object) {
      return object;
    }

    @Override
    public byte[] decode(byte[] bytes) throws IllegalArgumentException {
      return bytes;
    }
  }

  private Class<?> javaClass;
  private Encoder<?> encoder;

  RowBuilderType(Class<?> javaClass, Encoder<?> encoder) {
    this.javaClass = javaClass;
    this.encoder = encoder;
  }

  public static RowBuilderType valueOfIgnoreCase(String name) {
    for (RowBuilderType type : RowBuilderType.values()) {
      if (name.equalsIgnoreCase(type.name()))
        return type;
    }

    return null;
  }

  /**
   * @return the javaClass
   */
  public Class<?> getJavaClass() {
    return javaClass;
  }

  /**
   * @return the encoder
   */
  public Encoder<?> getEncoder() {
    return encoder;
  }
}
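
A small sketch of the type lookup and the sort-order-preserving encoding that the lexicoders provide, which is what Accumulo range scans rely on; the unchecked cast is for illustration only:

```java
import com.microsoft.accumulo.spark.record.RowBuilderType;
import org.apache.accumulo.core.client.lexicoder.Encoder;

public class LexicoderExample {
  public static void main(String[] args) {
    // case-insensitive lookup of the JSON type name
    RowBuilderType type = RowBuilderType.valueOfIgnoreCase("double");

    @SuppressWarnings("unchecked")
    Encoder<Double> encoder = (Encoder<Double>) type.getEncoder();

    byte[] a = encoder.encode(1.5);
    byte[] b = encoder.encode(2.5);
    // unsigned byte-wise comparison of a and b matches the numeric order
  }
}
```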
--------------------------------------------------------------------------------
/connector/iterator/src/main/java/com/microsoft/accumulo/spark/processors/AvroRowConsumer.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.accumulo.spark.processors;

import java.io.IOException;
import java.util.Collection;

import com.microsoft.accumulo.spark.record.RowBuilderField;
import com.microsoft.accumulo.spark.util.StopWatch;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.io.Text;

public abstract class AvroRowConsumer {

  // private StopWatch stopWatchConsume = new StopWatch();

  /**
   * Process the row.
   *
   * @param rowKey The row key.
   * @param record The AVRO record.
   * @return true to continue processing; false if processing should be stopped
   *         (e.g. the row does not match a filter).
   */
  public boolean consume(Text rowKey, IndexedRecord record) throws IOException {
    // this.stopWatchConsume.start();

    boolean ret = this.consumeInternal(rowKey, record);
    // if (ret)
    //   this.stopWatchConsume.stop();

    return ret;
  }

  // public double getAverageConsumeTime() {
  //   return this.stopWatchConsume.getAverage();
  // }

  public String getName() {
    return getClass().getSimpleName();
  }

  /**
   * Process the row.
   *
   * @param rowKey The row key.
   * @param record The AVRO record.
   * @return true to continue processing; false if processing should be stopped
   *         (e.g. the row does not match a filter).
   */
  protected abstract boolean consumeInternal(Text rowKey, IndexedRecord record) throws IOException;

  /**
   * Support copying of the object as the iterator needs to be copyable.
   *
   * @return The cloned object.
   */
  public abstract AvroRowConsumer clone();

  /**
   * Any additional fields this consumer wants to populate.
   *
   * @return additional fields added to the main schema.
   */
  public abstract Collection<RowBuilderField> getSchemaFields();

  /**
   * Final initialization of the consumer once the entire schema has been
   * discovered.
   *
   * @param schema The final schema.
   */
  public abstract void initialize(Schema schema);
}
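
A hypothetical minimal subclass, to make the contract concrete (not part of this repository): it rejects rows whose key starts with "tmp", contributes no extra schema fields, and needs no initialization.

```java
import java.util.Collection;
import java.util.Collections;

import com.microsoft.accumulo.spark.processors.AvroRowConsumer;
import com.microsoft.accumulo.spark.record.RowBuilderField;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.io.Text;

public class RowKeyPrefixFilter extends AvroRowConsumer {
  @Override
  protected boolean consumeInternal(Text rowKey, IndexedRecord record) {
    // false stops processing of this row, i.e. the row is filtered out
    return !rowKey.toString().startsWith("tmp");
  }

  @Override
  public AvroRowConsumer clone() {
    // stateless, so a fresh instance suffices
    return new RowKeyPrefixFilter();
  }

  @Override
  public Collection<RowBuilderField> getSchemaFields() {
    return Collections.emptyList();
  }

  @Override
  public void initialize(Schema schema) {
    // nothing to prepare
  }
}
```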
--------------------------------------------------------------------------------
/connector/zipfs/src/main/java/com/microsoft/accumulo/zipfs/ZipPosixFileAttributeView.java:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package com.microsoft.accumulo.zipfs;

import java.io.IOException;
import java.nio.file.attribute.GroupPrincipal;
import java.nio.file.attribute.PosixFileAttributeView;
import java.nio.file.attribute.PosixFileAttributes;
import java.nio.file.attribute.UserPrincipal;

/**
 * The zip file system attribute view with POSIX support.
 */
class ZipPosixFileAttributeView extends ZipFileAttributeView implements PosixFileAttributeView {
  private final boolean isOwnerView;

  ZipPosixFileAttributeView(ZipPath path, boolean owner) {
    super(path, true);
    this.isOwnerView = owner;
  }

  @Override
  public String name() {
    return isOwnerView ? "owner" : "posix";
  }

  @Override
  public PosixFileAttributes readAttributes() throws IOException {
    return (PosixFileAttributes)path.readAttributes();
  }

  @Override
  public UserPrincipal getOwner() throws IOException {
    return readAttributes().owner();
  }

  @Override
  public void setOwner(UserPrincipal owner) throws IOException {
    path.setOwner(owner);
  }

  @Override
  public void setGroup(GroupPrincipal group) throws IOException {
    path.setGroup(group);
  }

  @Override
  Object attribute(AttrID id, ZipFileAttributes zfas) {
    PosixFileAttributes pzfas = (PosixFileAttributes)zfas;
    switch (id) {
      case owner:
        return pzfas.owner();
      case group:
        return pzfas.group();
      case permissions:
        if (!isOwnerView) {
          return pzfas.permissions();
        } else {
          return super.attribute(id, zfas);
        }
      default:
        return super.attribute(id, zfas);
    }
  }
}
--------------------------------------------------------------------------------
/connector/iterator/src/main/java/com/microsoft/accumulo/spark/juel/expressions/AvroVariableExpression.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.accumulo.spark.juel.expressions;

import java.util.Arrays;

import javax.el.ELContext;
import javax.el.ELException;
import javax.el.ValueExpression;

import com.microsoft.accumulo.spark.juel.AvroELContext;
import org.apache.avro.generic.IndexedRecord;

/**
 * Exposes Avro top-level record fields as a JUEL ValueExpression.
 */
public class AvroVariableExpression extends ValueExpression {
  private static final long serialVersionUID = 1L;

  private Class<?> type;
  // indices to walk through nested avro records
  private int[] fieldPositions;

  public AvroVariableExpression(Class<?> type, int... fieldPositions) {
    this.type = type;
    this.fieldPositions = fieldPositions;
  }

  @Override
  public Class<?> getExpectedType() {
    return type;
  }

  @Override
  public Class<?> getType(ELContext context) {
    return type;
  }

  @Override
  public Object getValue(ELContext context) {
    IndexedRecord record = ((AvroELContext) context).getAvroRecord();

    // support nested records (e.g. column family/column qualifier)
    for (int i = 0; i < fieldPositions.length - 1; i++)
      record = (IndexedRecord) record.get(fieldPositions[i]);

    return record.get(fieldPositions[fieldPositions.length - 1]);
  }

  @Override
  public boolean isReadOnly(ELContext context) {
    return true;
  }

  @Override
  public void setValue(ELContext context, Object value) {
    throw new ELException("setValue not supported");
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof AvroVariableExpression))
      return false;

    AvroVariableExpression other = (AvroVariableExpression) obj;

    return type.equals(other.type) && Arrays.equals(fieldPositions, other.fieldPositions);
  }

  @Override
  public String getExpressionString() {
    throw new ELException("getExpressionString() is not supported");
  }

  @Override
  public int hashCode() {
    return type.hashCode() + Arrays.hashCode(fieldPositions);
  }

  @Override
  public boolean isLiteralText() {
    return false;
  }
}
--------------------------------------------------------------------------------
/connector/README.md:
--------------------------------------------------------------------------------
# Microsoft MASC, an Apache Spark connector for Apache Accumulo
[![Build Status](https://dev.azure.com/AZGlobal/Azure%20Global%20CAT%20Engineering/_apis/build/status/AGCE%20AI/Web%20Scale%20AI/microsoft.Accumulo?branchName=master)](https://dev.azure.com/AZGlobal/Azure%20Global%20CAT%20Engineering/_build/latest?definitionId=84&branchName=master)
[![Maven Central](https://maven-badges.herokuapp.com/maven-central/com.microsoft.masc/microsoft-accumulo-spark-datasource/badge.svg)](https://maven-badges.herokuapp.com/maven-central/com.microsoft.masc/microsoft-accumulo-spark-datasource)
[![Maven Central](https://maven-badges.herokuapp.com/maven-central/com.microsoft.masc/microsoft-accumulo-spark-iterator/badge.svg)](https://maven-badges.herokuapp.com/maven-central/com.microsoft.masc/microsoft-accumulo-spark-iterator)

This code provides connectivity between Apache Accumulo and Apache Spark.
7 | 8 | ## Main Goals 9 | - Provide native Spark interface to connect to Accumulo 10 | - Minimize data transfer between Spark and Accumulo 11 | - Enable use of Machine Learning with Accumulo as the datastore 12 | 13 | ## Examples 14 | ```python 15 | # Read from Accumulo 16 | df = (spark 17 | .read 18 | .format("com.microsoft.accumulo") 19 | .options(**options) # define Accumulo properties 20 | .schema(schema).load()) # define schema for data retrieval 21 | 22 | # Write to Accumulo 23 | (df 24 | .write 25 | .format("com.microsoft.accumulo") 26 | .options(**options) 27 | .save()) 28 | ``` 29 | 30 | See the PySpark [notebook](examples/AccumuloSparkConnector.ipynb) for a more detailed example. 31 | 32 | See the Scala benchmark [notebook](examples/AccumuloSparkConnectorBenchmark.ipynb) for details on how we ran our evaluation. 33 | 34 | ## Capabilities 35 | - Native Spark [Datasource V2](http://shzhangji.com/blog/2018/12/08/spark-datasource-api-v2/) API 36 | - Row serialization using [Avro](https://avro.apache.org/) 37 | - Filter pushdown (server-side) 38 | - Expressive filter language using [JUEL](http://juel.sourceforge.net/) 39 | - ML inference pushdown (server-side) using [MLeap](http://mleap-docs.combust.ml/) 40 | - Support for Spark ML pipelines 41 | - Minimal Java runtime dependencies 42 | 43 | ## Installation 44 | 45 | The connector is composed of two components: 46 | - The [Datasource](datasource) component provides the interface used on the Spark side 47 | - The [Iterator](iterator) component provides server-side functionality on the Accumulo side 48 | 49 | The components can be built and tested with Maven (version 3.3.9 or higher) using Java version 8. 50 | ``` 51 | mvn clean install 52 | ``` 53 | 54 | Alternatively, the JARs are published to the Maven Central Repository: 55 | - [Datasource](https://mvnrepository.com/artifact/com.microsoft.masc/microsoft-accumulo-spark-datasource) 56 | - [Iterator](https://mvnrepository.com/artifact/com.microsoft.masc/microsoft-accumulo-spark-iterator) 57 | 58 | The following steps are needed to deploy the connector: 59 | 1) Deploy the iterator JAR to the Accumulo lib folder on all nodes and restart the cluster 60 | ``` 61 | # use the locally built shaded jar in the connector/iterator/target folder 62 | # or 63 | # use Maven to download the iterator from the central repository 64 | mvn dependency:get -Dartifact=com.microsoft.masc:microsoft-accumulo-spark-iterator:[VERSION] 65 | ``` 66 | 2) Add the Datasource JAR in Spark 67 | ``` 68 | # use the locally built shaded jar in the connector/datasource/target folder 69 | # or 70 | # pull in the package from the Maven Central Repository 71 | com.microsoft.masc:microsoft-accumulo-spark-datasource:[VERSION] 72 | ``` 73 | 74 | ## Spark Runtime Java Version 75 | 76 | While the iterator JAR can run on Accumulo tablet servers using JDK versions >= 1.8, the Spark Datasource component is only compatible with JDK version 1.8 (not higher) due to [Spark's Java support](https://spark.apache.org/docs/latest/). 77 | -------------------------------------------------------------------------------- /connector/iterator/src/main/java/com/microsoft/accumulo/spark/processors/AvroRowSerializer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo.spark.processors; 19 | 20 | import java.io.ByteArrayOutputStream; 21 | import java.io.IOException; 22 | import java.util.List; 23 | import java.util.stream.Collectors; 24 | 25 | import com.microsoft.accumulo.spark.record.AvroFastRecord; 26 | import com.microsoft.accumulo.spark.record.AvroSchemaBuilder; 27 | 28 | import org.apache.avro.Schema; 29 | import org.apache.avro.Schema.Field; 30 | import org.apache.avro.generic.IndexedRecord; 31 | import org.apache.avro.io.BinaryEncoder; 32 | import org.apache.avro.io.DatumWriter; 33 | import org.apache.avro.io.EncoderFactory; 34 | import org.apache.avro.specific.SpecificDatumWriter; 35 | import org.apache.log4j.Logger; 36 | 37 | public class AvroRowSerializer { 38 | private final static Logger logger = Logger.getLogger(AvroRowSerializer.class); 39 | 40 | // avro writer infra 41 | private ByteArrayOutputStream binaryBuffer = new ByteArrayOutputStream(); 42 | private DatumWriter writer; 43 | private BinaryEncoder encoder; 44 | 45 | private AvroFastRecord finalRecord; 46 | private int[] sourceIndicies; 47 | 48 | public AvroRowSerializer(Schema schema) { 49 | List fieldList = schema.getFields().stream() 50 | // AVRO 1.8.2 doesn't support getObjectProp 51 | .filter(f -> Boolean.parseBoolean(f.getProp(AvroSchemaBuilder.PROPERTY_OUTPUT))) 52 | .map(f -> new Schema.Field(f.name(), f.schema(), f.doc(), f.defaultVal())) 53 | // create the list 54 | .collect(Collectors.toList()); 55 | 56 | // check if the schema pruned fields? 
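// If any field is not flagged for output ("o": false), it was only needed for
// intermediate processing (e.g. as a filter variable). In that case build a pruned
// schema and a reusable output record, plus a target-to-source index map so
// serialize() can copy values positionally instead of looking them up by name per row.
// Example (cf. AvroColumnPruningTest): a schema of [cf1 (o=false), cf2 (o=true)]
// yields a pruned output schema containing only cf2.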
57 | if (fieldList.size() != schema.getFields().size()) { 58 | Schema prunedSchema = Schema.createRecord(fieldList); 59 | this.finalRecord = new AvroFastRecord(prunedSchema); 60 | 61 | // initialize source to target mapping 62 | this.sourceIndicies = new int[fieldList.size()]; 63 | for (Field field : prunedSchema.getFields()) { 64 | logger.info("Pruned field: " + field.name()); 65 | this.sourceIndicies[field.pos()] = schema.getField(field.name()).pos(); 66 | } 67 | 68 | schema = prunedSchema; 69 | } 70 | 71 | this.writer = new SpecificDatumWriter<>(schema); 72 | this.encoder = EncoderFactory.get().binaryEncoder(binaryBuffer, null); 73 | } 74 | 75 | public byte[] serialize(IndexedRecord record) throws IOException { 76 | // make sure we're at the beginning again 77 | this.binaryBuffer.reset(); 78 | 79 | // copying to final output schema 80 | if (this.sourceIndicies != null) { 81 | for (int i = 0; i < this.sourceIndicies.length; i++) 82 | this.finalRecord.put(i, record.get(this.sourceIndicies[i])); 83 | 84 | record = this.finalRecord; 85 | } 86 | // serialize the record 87 | this.writer.write(record, encoder); 88 | 89 | this.encoder.flush(); 90 | this.binaryBuffer.flush(); 91 | 92 | return this.binaryBuffer.toByteArray(); 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /connector/iterator/src/test/java/com/microsoft/accumulo/spark/AvroColumnPruningTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.microsoft.accumulo.spark; 19 | 20 | import static org.junit.Assert.assertEquals; 21 | import static org.junit.Assert.assertFalse; 22 | import static org.junit.Assert.assertTrue; 23 | 24 | import java.io.IOException; 25 | import java.util.Arrays; 26 | import java.util.HashMap; 27 | import java.util.Map; 28 | import java.util.SortedMap; 29 | import java.util.TreeMap; 30 | 31 | import org.apache.accumulo.core.client.lexicoder.LongLexicoder; 32 | import org.apache.accumulo.core.data.Key; 33 | import org.apache.accumulo.core.data.Range; 34 | import org.apache.accumulo.core.data.Value; 35 | import org.apache.accumulo.core.iterators.SortedMapIterator; 36 | import com.microsoft.accumulo.spark.record.AvroSchemaBuilder; 37 | import com.microsoft.accumulo.spark.record.RowBuilderField; 38 | import org.apache.avro.Schema; 39 | import org.apache.avro.generic.GenericRecord; 40 | import org.apache.avro.util.Utf8; 41 | import org.junit.Test; 42 | 43 | public class AvroColumnPruningTest { 44 | @Test 45 | public void testColumnPruning() throws IOException { 46 | SortedMap map = new TreeMap<>(); 47 | map.put(new Key("key1", "cf1", "cq1"), new Value(new LongLexicoder().encode(3L))); 48 | map.put(new Key("key1", "cf2", ""), new Value("abc")); 49 | map.put(new Key("key2", "cf2"), new Value("def")); 50 | 51 | SortedMapIterator parentIterator = new SortedMapIterator(map); 52 | AvroRowEncoderIterator iterator = new AvroRowEncoderIterator(); 53 | 54 | Map options = new HashMap<>(); 55 | options.put(AvroRowEncoderIterator.SCHEMA, 56 | "[{\"cf\":\"cf1\",\"cq\":\"cq1\",\"t\":\"long\",\"o\":false},{\"cf\":\"cf2\",\"t\":\"STRING\",\"o\":true}]"); 57 | 58 | iterator.init(parentIterator, options, new DefaultIteratorEnvironment()); 59 | iterator.seek(new Range(), AvroUtil.EMPTY_SET, false); 60 | 61 | RowBuilderField[] schemaMappingFields = new RowBuilderField[] { 62 | new RowBuilderField("cf2", null, "string", "v1") }; 63 | 64 | Schema schema = AvroSchemaBuilder.buildSchema(Arrays.asList(schemaMappingFields)); 65 | 66 | // ############################## ROW 1 67 | assertTrue(iterator.hasTop()); 68 | assertEquals("key1", iterator.getTopKey().getRow().toString()); 69 | 70 | // validate value 71 | byte[] data = iterator.getTopValue().get(); 72 | 73 | GenericRecord record = AvroUtil.deserialize(data, schema); 74 | 75 | assertEquals("abc", record.get("cf2").toString()); 76 | assertTrue(record.get("cf2") instanceof Utf8); 77 | 78 | // ############################## ROW 2 79 | iterator.next(); 80 | 81 | assertTrue(iterator.hasTop()); 82 | assertEquals("key2", iterator.getTopKey().getRow().toString()); 83 | 84 | // validate value 85 | data = iterator.getTopValue().get(); 86 | 87 | record = AvroUtil.deserialize(data, schema); 88 | 89 | assertEquals("def", record.get("cf2").toString()); 90 | assertTrue(record.get("cf2") instanceof Utf8); 91 | 92 | // End of data 93 | iterator.next(); 94 | assertFalse(iterator.hasTop()); 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /connector/zipfs/src/main/java/com/microsoft/accumulo/zipfs/ZipDirectoryStream.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved. 3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
4 | * 5 | * This code is free software; you can redistribute it and/or modify it 6 | * under the terms of the GNU General Public License version 2 only, as 7 | * published by the Free Software Foundation. Oracle designates this 8 | * particular file as subject to the "Classpath" exception as provided 9 | * by Oracle in the LICENSE file that accompanied this code. 10 | * 11 | * This code is distributed in the hope that it will be useful, but WITHOUT 12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 | * version 2 for more details (a copy is included in the LICENSE file that 15 | * accompanied this code). 16 | * 17 | * You should have received a copy of the GNU General Public License version 18 | * 2 along with this work; if not, write to the Free Software Foundation, 19 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 | * 21 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 | * or visit www.oracle.com if you need additional information or have any 23 | * questions. 24 | */ 25 | 26 | package com.microsoft.accumulo.zipfs; 27 | 28 | import java.io.IOException; 29 | import java.nio.file.ClosedDirectoryStreamException; 30 | import java.nio.file.DirectoryIteratorException; 31 | import java.nio.file.DirectoryStream; 32 | import java.nio.file.NotDirectoryException; 33 | import java.nio.file.Path; 34 | import java.util.Iterator; 35 | import java.util.NoSuchElementException; 36 | 37 | /** 38 | * 39 | * @author Xueming Shen, Rajendra Gutupalli, Jaya Hangal 40 | */ 41 | class ZipDirectoryStream implements DirectoryStream { 42 | 43 | private final ZipFileSystem zipfs; 44 | private final ZipPath dir; 45 | private final DirectoryStream.Filter filter; 46 | private volatile boolean isClosed; 47 | private volatile Iterator itr; 48 | 49 | ZipDirectoryStream(ZipPath dir, 50 | DirectoryStream.Filter filter) 51 | throws IOException 52 | { 53 | this.zipfs = dir.getFileSystem(); 54 | this.dir = dir; 55 | this.filter = filter; 56 | // sanity check 57 | if (!zipfs.isDirectory(dir.getResolvedPath())) 58 | throw new NotDirectoryException(dir.toString()); 59 | } 60 | 61 | @Override 62 | public synchronized Iterator iterator() { 63 | if (isClosed) 64 | throw new ClosedDirectoryStreamException(); 65 | if (itr != null) 66 | throw new IllegalStateException("Iterator has already been returned"); 67 | 68 | try { 69 | itr = zipfs.iteratorOf(dir, filter); 70 | } catch (IOException e) { 71 | throw new DirectoryIteratorException(e); 72 | } 73 | 74 | return new Iterator() { 75 | @Override 76 | public boolean hasNext() { 77 | if (isClosed) 78 | return false; 79 | return itr.hasNext(); 80 | } 81 | 82 | @Override 83 | public synchronized Path next() { 84 | if (isClosed) 85 | throw new NoSuchElementException(); 86 | return itr.next(); 87 | } 88 | 89 | @Override 90 | public void remove() { 91 | throw new UnsupportedOperationException(); 92 | } 93 | }; 94 | } 95 | 96 | @Override 97 | public synchronized void close() throws IOException { 98 | isClosed = true; 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /connector/iterator/src/main/java/com/microsoft/accumulo/spark/juel/AvroVariableMapper.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo.spark.juel; 19 | 20 | import java.util.HashMap; 21 | import java.util.Map; 22 | 23 | import javax.el.ValueExpression; 24 | import javax.el.VariableMapper; 25 | 26 | import com.microsoft.accumulo.spark.juel.expressions.AvroObjectExpression; 27 | import com.microsoft.accumulo.spark.juel.expressions.AvroVariableExpression; 28 | import com.microsoft.accumulo.spark.juel.expressions.RowKeyVariableExpression; 29 | import com.microsoft.accumulo.spark.record.AvroSchemaBuilder; 30 | import com.microsoft.accumulo.spark.record.RowBuilderType; 31 | import org.apache.avro.Schema; 32 | import org.apache.avro.Schema.Field; 33 | import org.apache.avro.Schema.Type; 34 | 35 | /** 36 | * Resolve JUEL variables against Avro schema. 37 | */ 38 | public class AvroVariableMapper extends VariableMapper { 39 | 40 | private static final String ROWKEY_VARIABLE_NAME = "rowKey"; 41 | 42 | private Schema schema; 43 | 44 | /** 45 | * fast lookup for variable names modelled by Avro aliases. 46 | */ 47 | private Map aliasMap; 48 | 49 | public AvroVariableMapper(Schema schema) { 50 | this.schema = schema; 51 | 52 | // build alias to VariableExpression map 53 | this.aliasMap = new HashMap<>(); 54 | for (Field field : schema.getFields()) { 55 | 56 | if (field.schema().getType() == Type.RECORD) { 57 | for (Field nestedField : field.schema().getFields()) { 58 | // find the corresponding java class 59 | Class nestedFieldClass = RowBuilderType 60 | .valueOf(nestedField.getProp(AvroSchemaBuilder.PROPERTY_ROWBUILDERTYPE)).getJavaClass(); 61 | 62 | for (String alias : nestedField.aliases()) 63 | this.aliasMap.put(alias, new AvroVariableExpression(nestedFieldClass, field.pos(), nestedField.pos())); 64 | } 65 | } else { 66 | // find the corresponding java class 67 | Class fieldClass = RowBuilderType.valueOf(field.getProp(AvroSchemaBuilder.PROPERTY_ROWBUILDERTYPE)) 68 | .getJavaClass(); 69 | for (String alias : field.aliases()) 70 | this.aliasMap.put(alias, new AvroVariableExpression(fieldClass, field.pos())); 71 | } 72 | } 73 | } 74 | 75 | /** 76 | * Resolve variables in this order: rowKey, mapped variables (e.g. v2 = cf1.cq1) 77 | * and finally using variable expressions. 78 | */ 79 | @Override 80 | public ValueExpression resolveVariable(String variable) { 81 | if (variable.equals(ROWKEY_VARIABLE_NAME)) 82 | return RowKeyVariableExpression.INSTANCE; 83 | 84 | // check if this is a statically resolved variable (e.g. v2 = cf1.cq1) 85 | AvroVariableExpression expr = this.aliasMap.get(variable); 86 | 87 | // otherwise default to dynamic lookup 88 | return expr != null ? 
expr : new AvroObjectExpression(this.schema.getField(variable)); 89 | } 90 | 91 | @Override 92 | public ValueExpression setVariable(String variable, ValueExpression expression) { 93 | return null; 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /connector/iterator/src/main/java/com/microsoft/accumulo/spark/processors/AvroRowFilter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo.spark.processors; 19 | 20 | import java.io.IOException; 21 | import java.util.Collection; 22 | import java.util.Collections; 23 | import java.util.Map; 24 | 25 | import javax.el.ExpressionFactory; 26 | import javax.el.ValueExpression; 27 | 28 | import com.microsoft.accumulo.spark.juel.AvroELContext; 29 | import com.microsoft.accumulo.spark.record.RowBuilderField; 30 | import org.apache.avro.Schema; 31 | import org.apache.avro.generic.IndexedRecord; 32 | import org.apache.commons.lang3.StringUtils; 33 | import org.apache.hadoop.io.Text; 34 | import org.apache.log4j.Logger; 35 | 36 | /** 37 | * Evaluates the user-supplied filter (JUEL syntax) against the constructed AVRO 38 | * record. 39 | * 40 | * Note: filter operates on AVRO Record object, not on the serialized version. 41 | */ 42 | public class AvroRowFilter extends AvroRowConsumer { 43 | private final static Logger logger = Logger.getLogger(AvroRowFilter.class); 44 | 45 | public static AvroRowFilter create(Map options, String optionKey) { 46 | String filter = options.get(optionKey); 47 | 48 | return StringUtils.isEmpty(filter) ? null : new AvroRowFilter(filter, optionKey); 49 | } 50 | 51 | /** 52 | * Required for cloning. 53 | */ 54 | private Schema schema; 55 | 56 | /** 57 | * Required for cloning. 
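* clone() rebuilds the compiled JUEL expression from this raw string by calling initialize(Schema) on the copy.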
58 | */ 59 | private String filter; 60 | 61 | private String optionKey; 62 | 63 | /** 64 | * JUEL expression context exposing AVRO GenericRecord 65 | */ 66 | private AvroELContext expressionContext; 67 | 68 | /** 69 | * JUEL filter expression 70 | */ 71 | private ValueExpression filterExpression; 72 | 73 | private AvroRowFilter(String filter, String optionKey) { 74 | logger.info(optionKey + " filter '" + filter + "'"); 75 | 76 | this.filter = filter; 77 | this.optionKey = optionKey; 78 | } 79 | 80 | @Override 81 | public String getName() { 82 | return super.getName() + " " + this.optionKey; 83 | } 84 | 85 | @Override 86 | protected boolean consumeInternal(Text rowKey, IndexedRecord record) throws IOException { 87 | // link AVRO record with JUEL expression context 88 | this.expressionContext.setCurrent(rowKey, record); 89 | 90 | return (boolean) filterExpression.getValue(this.expressionContext); 91 | } 92 | 93 | @Override 94 | public AvroRowConsumer clone() { 95 | AvroRowFilter copy = new AvroRowFilter(this.filter, this.optionKey); 96 | 97 | copy.initialize(schema); 98 | 99 | return copy; 100 | } 101 | 102 | @Override 103 | public Collection getSchemaFields() { 104 | return Collections.emptyList(); 105 | } 106 | 107 | @Override 108 | public void initialize(Schema schema) { 109 | this.schema = schema; 110 | this.expressionContext = new AvroELContext(schema); 111 | 112 | ExpressionFactory factory = ExpressionFactory.newInstance(); 113 | 114 | this.filterExpression = factory.createValueExpression(expressionContext, filter, boolean.class); 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /connector/iterator/src/main/java/com/microsoft/accumulo/spark/record/RowBuilderField.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo.spark.record; 19 | 20 | import com.google.gson.annotations.SerializedName; 21 | 22 | /** 23 | * POJO for the user-supplied schema fields. 
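* Instances are deserialized by Gson from the JSON schema option; the short @SerializedName keys (cf, cq, t, fvn, o) keep that JSON compact.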
24 | */ 25 | public class RowBuilderField { 26 | @SerializedName("cf") 27 | private String columnFamily; 28 | 29 | @SerializedName("cq") 30 | private String columnQualifier; 31 | 32 | @SerializedName("t") 33 | private String type; 34 | 35 | @SerializedName("fvn") 36 | private String filterVariableName; 37 | 38 | @SerializedName("o") 39 | private boolean output = true; 40 | 41 | private boolean nullable = true; 42 | 43 | public RowBuilderField() { 44 | } 45 | 46 | public RowBuilderField(String columnFamily, String columnQualifier, String type, String filterVariableName) { 47 | this.columnFamily = columnFamily; 48 | this.columnQualifier = columnQualifier; 49 | this.type = type; 50 | this.filterVariableName = filterVariableName; 51 | } 52 | 53 | public RowBuilderType getRowBuilderType() { 54 | return RowBuilderType.valueOfIgnoreCase(this.type); 55 | } 56 | 57 | /** 58 | * @return true if this field should be output, otherwise it's just needed for 59 | * intermediate processing (e.g. filtering). 60 | */ 61 | public boolean isOutput() { 62 | return output; 63 | } 64 | 65 | public void setOutput(boolean output) { 66 | this.output = output; 67 | } 68 | 69 | /** 70 | * @return the nullable 71 | */ 72 | public boolean isNullable() { 73 | return nullable; 74 | } 75 | 76 | /** 77 | * @param nullable the nullable to set 78 | */ 79 | public void setNullable(boolean nullable) { 80 | this.nullable = nullable; 81 | } 82 | 83 | /** 84 | * @param filterVariableName the filterVariableName to set 85 | */ 86 | public void setFilterVariableName(String filterVariableName) { 87 | this.filterVariableName = filterVariableName; 88 | } 89 | 90 | /** 91 | * @return the filterVariableName 92 | */ 93 | public String getFilterVariableName() { 94 | return filterVariableName; 95 | } 96 | 97 | /** 98 | * @param columnFamily the columnFamily to set 99 | */ 100 | public void setColumnFamily(String columnFamily) { 101 | this.columnFamily = columnFamily; 102 | } 103 | 104 | /** 105 | * @return the columnFamily 106 | */ 107 | public String getColumnFamily() { 108 | return columnFamily; 109 | } 110 | 111 | /** 112 | * @return the columnQualifier 113 | */ 114 | public String getColumnQualifier() { 115 | return columnQualifier; 116 | } 117 | 118 | /** 119 | * @param columnQualifier the columnQualifier to set 120 | */ 121 | public void setColumnQualifier(String columnQualifier) { 122 | this.columnQualifier = columnQualifier; 123 | } 124 | 125 | /** 126 | * @return the type 127 | */ 128 | public String getType() { 129 | return type; 130 | } 131 | 132 | /** 133 | * @param type the type to set 134 | */ 135 | public void setType(String type) { 136 | this.type = type; 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /connector/datasource/src/main/scala/com/microsoft/accumulo/FilterToJuel.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo 19 | 20 | import org.apache.spark.sql.sources._ 21 | 22 | case class AccumuloFilterResult(serializedFilter: String, 23 | supportedFilters: Seq[Filter], 24 | unsupportedFilters: Seq[Filter]) 25 | 26 | class FilterToJuel(val attributeToVariableMapping: Map[String, String], val rowKeyColumn: String = "rowKey") { 27 | def mapAttribute(attribute: String): String = { 28 | if (attribute == rowKeyColumn) 29 | "rowKey" 30 | else 31 | attributeToVariableMapping.getOrElse(attribute, attribute) 32 | } 33 | 34 | def serializeValue(value: Any): String = { 35 | value match { 36 | case str: String => 37 | // properly escape \ and ' 38 | val strEscaped = str 39 | .replace("\\", "\\\\") 40 | .replace("'", "\\'") 41 | 42 | "'" + strEscaped + "'" 43 | case other: Any => other.toString 44 | } 45 | } 46 | 47 | def serializeFilter(filter: Filter): String = { 48 | filter match { 49 | case op: And => s"(${serializeFilter(op.left)} && ${serializeFilter(op.right)})" 50 | case op: Or => s"(${serializeFilter(op.left)} || ${serializeFilter(op.right)})" 51 | case op: EqualTo => s"(${mapAttribute(op.attribute)} == ${serializeValue(op.value)})" 52 | case op: GreaterThan => s"(${mapAttribute(op.attribute)} > ${serializeValue(op.value)})" 53 | case op: GreaterThanOrEqual => s"(${mapAttribute(op.attribute)} >= ${serializeValue(op.value)})" 54 | case op: LessThan => s"(${mapAttribute(op.attribute)} < ${serializeValue(op.value)})" 55 | case op: LessThanOrEqual => s"(${mapAttribute(op.attribute)} <= ${serializeValue(op.value)})" 56 | case op: Not => s"(!${serializeFilter(op.child)})" 57 | case op: IsNull => s"(${mapAttribute(op.attribute)} == null)" 58 | case op: IsNotNull => 59 | // IsNotNull(cf1) will be generated for conditions like cf1.cq1 > 5 60 | // since we always create the struct, it's always true 61 | val variable = attributeToVariableMapping.get(op.attribute) 62 | 63 | if (variable.isEmpty) 64 | // assuming this comes for a nested column family, will always be true 65 | "true" 66 | else 67 | s"(${variable.get} != null)" 68 | case op: StringContains => s"${mapAttribute(op.attribute)}.contains(${serializeValue(op.value)})" 69 | case op: StringStartsWith => s"${mapAttribute(op.attribute)}.startsWith(${serializeValue(op.value)})" 70 | case op: StringEndsWith => s"${mapAttribute(op.attribute)}.endsWith(${serializeValue(op.value)})" 71 | case op: In => 72 | val values = op.values.map { v => serializeValue(v) } .mkString(",") 73 | s"${mapAttribute(op.attribute)}.in($values)" 74 | // TODO: not sure if null handling is properly done 75 | // TODO: EqualNullSafe 76 | case _ => throw new UnsupportedOperationException(s"Filter $filter not supported") 77 | } 78 | } 79 | 80 | def serializeFilters(filters: Array[Filter], filterStr: String): AccumuloFilterResult = 81 | { 82 | val (supported, unsupported) = filters.map({ f => { 83 | 84 | try { 85 | (serializeFilter(f), f) 86 | } catch { 87 | case _: UnsupportedOperationException => ("", f) 88 | } 89 | }}).partition(!_._1.isEmpty) 90 | 91 | var filter = supported.map(_._1) 92 | 93 | // append if 
provided 94 | if (filterStr.length > 0) 95 | filter = filter :+ s"($filterStr)" 96 | 97 | val finalFilter = filter.mkString(" && ") 98 | 99 | AccumuloFilterResult( 100 | finalFilter, 101 | supported.map(_._2), 102 | unsupported.map(_._2) 103 | ) 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /connector/datasource/src/main/scala/com/microsoft/accumulo/MLeapUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo 19 | 20 | import org.apache.spark.sql.types.StructField 21 | import ml.combust.bundle.BundleFile 22 | import ml.combust.mleap.runtime.MleapSupport._ 23 | import ml.combust.mleap.runtime.MleapContext.defaultContext 24 | import org.apache.spark.sql.mleap.TypeConverters 25 | import java.io.File 26 | import java.util.{Base64, HashMap} 27 | import java.net.URI 28 | import java.nio.file.{Files, FileSystem, FileSystems, Path, StandardOpenOption} 29 | import resource._ 30 | import ml.combust.mleap.core.types.ScalarType 31 | import com.google.common.jimfs.{Jimfs, Configuration} 32 | import com.microsoft.accumulo.zipfs.{ZipFileSystem, ZipFileSystemProvider} 33 | 34 | @SerialVersionUID(1L) 35 | object MLeapUtil { 36 | 37 | // load the Spark pipeline we saved in the previous section 38 | def mleapSchemaToCatalyst(modelBase64: String): Seq[StructField] = { 39 | if (modelBase64.isEmpty) 40 | Seq.empty[StructField] 41 | else { 42 | val mleapBundleArr = Base64.getDecoder().decode(modelBase64) 43 | 44 | val fs = Jimfs.newFileSystem(Configuration.unix()) 45 | val mleapFilePath = fs.getPath("/mleap.zip") 46 | Files.write(mleapFilePath, mleapBundleArr, StandardOpenOption.CREATE) 47 | 48 | // Why do we access a private constructor??? 49 | // 1. MLeap only exposes a FileSystem layer to load models. 50 | // 2. We don't want to write to the local file system 51 | // 2a. We use Google JimFS 52 | // 2b. We can't use https://github.com/marschall/memoryfilesystem at it has a 16MB file size limitation 53 | // 2c. We can't use Apache common-vfs as it doesn't support directory listing 54 | // 3. Usually one triggers the ZFS implementation by prefixing the URI with jar: 55 | // Unfortunately on Spark the file system provider disappears from the installed list https://stackoverflow.com/questions/39500445/filesystem-provider-disappearing-in-spark 56 | // thus it cannot be found by the ZFS implementation when looking up the jimfs: protocol 57 | // 4. The public methods (e.g. 
FileSystems.newFileSystem(), new ZipFileSystemProvider().newFileSystem()) have checks that limit the incoming FileSystemProvider 58 | 59 | // Attempt 10: try to find the jar provider, but then we don't know if the same methods exists :( 60 | // val zfsProvider = FileSystemProvider.installedProviders().asScala.filter(_.getScheme == "jar") 61 | // FileSystemProvider.installedProviders().asScala.foreach(p => println(p.getScheme)) 62 | 63 | // Attempt 9: hard dependency on Oracle JDK, fails on OpenJDK 64 | // package private ctor... *sigh* 65 | // import com.sun.nio.zipfs.{ZipFileSystem, ZipFileSystemProvider} 66 | // val zfsCtor = classOf[ZipFileSystem].getDeclaredConstructor( 67 | // classOf[ZipFileSystemProvider], 68 | // classOf[java.nio.file.Path], 69 | // classOf[java.util.Map[String, Object]]) 70 | 71 | // zfsCtor.setAccessible(true) 72 | // val zfs = zfsCtor.newInstance(new ZipFileSystemProvider, mleapFilePath, new java.util.HashMap[String, Object]) 73 | 74 | // moving to modified OpenJDK ZipFileSystem 75 | val zfs = new ZipFileSystem(new ZipFileSystemProvider, mleapFilePath, new HashMap[String, Object]) 76 | 77 | val mleapPipeline = (for(bf <- managed(BundleFile(zfs, zfs.getPath("/")))) yield { 78 | bf.loadMleapBundle().get.root 79 | }).tried.get 80 | 81 | // TODO: also process mleapPipeline.inputSchema to determine the required fields 82 | 83 | mleapPipeline.outputSchema.fields.flatMap { 84 | mleapField => { 85 | mleapField.dataType match { 86 | case _: ScalarType => Some(TypeConverters.mleapFieldToSparkField(mleapField)) 87 | case _ => None 88 | } 89 | } 90 | } 91 | } 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /connector/iterator/src/main/java/com/microsoft/accumulo/spark/juel/AvroResolver.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo.spark.juel; 19 | 20 | import java.beans.FeatureDescriptor; 21 | import java.util.Arrays; 22 | import java.util.Iterator; 23 | 24 | import javax.el.ELContext; 25 | import javax.el.ELException; 26 | import javax.el.ELResolver; 27 | 28 | import org.apache.avro.Schema.Field; 29 | import org.apache.avro.Schema.Type; 30 | import org.apache.avro.generic.GenericContainer; 31 | import org.apache.avro.generic.IndexedRecord; 32 | 33 | /** 34 | * Resolves variables and properties from AVRO GenericRecord. 
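* Also implements invoke() so JUEL filters can call in(), startsWith(), endsWith() and contains() on string-valued fields (Utf8 values are unwrapped first).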
35 | */ 36 | public class AvroResolver extends ELResolver { 37 | 38 | private static Class avroTypeToJavaType(Field field) { 39 | Type type = field.schema().getType(); 40 | 41 | if (type == Type.BOOLEAN) 42 | return boolean.class; 43 | else if (type == Type.DOUBLE) 44 | return double.class; 45 | else if (type == Type.FLOAT) 46 | return float.class; 47 | else if (type == Type.INT) 48 | return int.class; 49 | else if (type == Type.LONG) 50 | return long.class; 51 | else 52 | throw new IllegalArgumentException("Unsupported type: " + type); 53 | } 54 | 55 | @Override 56 | public Class getCommonPropertyType(ELContext context, Object base) { 57 | throw new ELException("getCommonPropertyType is not supported"); 58 | } 59 | 60 | @Override 61 | public Iterator getFeatureDescriptors(ELContext context, Object base) { 62 | return null; 63 | } 64 | 65 | @Override 66 | public Class getType(ELContext context, Object base, Object property) { 67 | return avroTypeToJavaType(((GenericContainer) base).getSchema().getField((String) property)); 68 | } 69 | 70 | @Override 71 | public Object getValue(ELContext context, Object base, Object property) { 72 | IndexedRecord record = (IndexedRecord) base; 73 | 74 | context.setPropertyResolved(true); 75 | 76 | // lookup field 77 | return record.get(record.getSchema().getField((String) property).pos()); 78 | } 79 | 80 | @Override 81 | public boolean isReadOnly(ELContext context, Object base, Object property) { 82 | return true; 83 | } 84 | 85 | @Override 86 | public void setValue(ELContext context, Object base, Object property, Object value) { 87 | throw new ELException("setValue is not supported"); 88 | } 89 | 90 | @Override 91 | public Object invoke(ELContext context, Object base, Object method, Class[] paramTypes, Object[] params) { 92 | if (method.equals("in")) { 93 | if (base instanceof AvroUtf8Wrapper) 94 | base = ((AvroUtf8Wrapper) base).getString(); 95 | 96 | context.setPropertyResolved(true); 97 | return Arrays.binarySearch(params, base) >= 0; 98 | } else if (params.length == 1) { 99 | if (base instanceof AvroUtf8Wrapper) 100 | base = ((AvroUtf8Wrapper) base).getString(); 101 | 102 | if (base instanceof String) { 103 | String baseStr = (String) base; 104 | String paramStr = (String) params[0]; 105 | 106 | // Spark methods available for pushdown 107 | if (method.equals("endsWith")) { 108 | context.setPropertyResolved(true); 109 | return baseStr.endsWith(paramStr); 110 | } 111 | 112 | if (method.equals("startsWith")) { 113 | context.setPropertyResolved(true); 114 | return baseStr.startsWith(paramStr); 115 | } 116 | 117 | if (method.equals("contains")) { 118 | context.setPropertyResolved(true); 119 | return baseStr.contains(paramStr); 120 | } 121 | } 122 | } 123 | 124 | return null; 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /connector/iterator/src/test/java/com/microsoft/accumulo/spark/AvroMLeapTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo.spark; 19 | 20 | import static org.junit.Assert.assertEquals; 21 | import static org.junit.Assert.assertFalse; 22 | import static org.junit.Assert.assertTrue; 23 | 24 | import java.io.IOException; 25 | import java.util.*; 26 | 27 | import org.apache.accumulo.core.client.lexicoder.DoubleLexicoder; 28 | import org.apache.accumulo.core.data.Key; 29 | import org.apache.accumulo.core.data.Range; 30 | import org.apache.accumulo.core.data.Value; 31 | import org.apache.accumulo.core.iterators.SortedMapIterator; 32 | import com.microsoft.accumulo.spark.processors.AvroRowMLeap; 33 | import org.apache.avro.generic.GenericRecord; 34 | import org.apache.commons.lang3.StringUtils; 35 | import org.junit.Test; 36 | 37 | import com.google.common.io.Resources; 38 | 39 | public class AvroMLeapTest { 40 | 41 | private AvroRowEncoderIterator createIterator(String mleapFilter) throws IOException { 42 | // load mleap model 43 | byte[] mleapBundle = Resources.toByteArray(AvroMLeapTest.class.getResource("pyspark.lr.zip")); 44 | String mleapBundleBase64 = Base64.getEncoder().encodeToString(mleapBundle); 45 | 46 | SortedMap map = new TreeMap<>(); 47 | map.put(new Key("key1", "cf1", "cq1"), new Value(new DoubleLexicoder().encode(0.0))); 48 | map.put(new Key("key2", "cf1", "cq1"), new Value(new DoubleLexicoder().encode(8.2))); 49 | 50 | SortedMapIterator parentIterator = new SortedMapIterator(map); 51 | AvroRowEncoderIterator iterator = new AvroRowEncoderIterator(); 52 | 53 | Map options = new HashMap<>(); 54 | options.put(AvroRowEncoderIterator.SCHEMA, 55 | "[{\"cf\":\"cf1\",\"cq\":\"cq1\",\"t\":\"double\",\"fvn\":\"v0\"},{\"cf\":\"cf1\",\"cq\":\"cq2\",\"t\":\"string\",\"o\":true}]"); 56 | 57 | // pass the model to the iterator 58 | options.put(AvroRowMLeap.MLEAP_BUNDLE, mleapBundleBase64); 59 | options.put(AvroRowMLeap.MLEAP_GUID, UUID.randomUUID().toString()); 60 | 61 | // map cf1.cq1 to fit the models input data frame 62 | options.put("column.feature.double", "${cf1.cq1}"); 63 | 64 | if (StringUtils.isNotBlank(mleapFilter)) 65 | options.put(AvroRowEncoderIterator.MLEAP_FILTER, mleapFilter); 66 | 67 | iterator.init(parentIterator, options, new DefaultIteratorEnvironment()); 68 | iterator.seek(new Range(), AvroUtil.EMPTY_SET, false); 69 | 70 | return iterator; 71 | } 72 | 73 | @Test 74 | public void testMLeapModelExecution() throws IOException { 75 | AvroRowEncoderIterator iterator = createIterator(null); 76 | 77 | // row 1 78 | assertTrue(iterator.hasTop()); 79 | GenericRecord record = AvroUtil.deserialize(iterator.getTopValue().get(), iterator.getSchema()); 80 | 81 | assertEquals("key1", iterator.getTopKey().getRow().toString()); 82 | assertEquals(-0.08748407856807701, (double) record.get("prediction"), 0.00001); 83 | 84 | // row2 85 | iterator.next(); 86 | 87 | assertTrue(iterator.hasTop()); 88 | record = AvroUtil.deserialize(iterator.getTopValue().get(), iterator.getSchema()); 89 | 90 | assertEquals("key2", iterator.getTopKey().getRow().toString()); 91 | assertEquals(0.8827512234363478, (double) record.get("prediction"), 
0.00001); 92 | 93 | // end 94 | iterator.next(); 95 | assertFalse(iterator.hasTop()); 96 | } 97 | 98 | @Test 99 | public void testMLeapModelPredictionFiltering() throws IOException { 100 | AvroRowEncoderIterator iterator = createIterator("${prediction > 0.7}"); 101 | 102 | assertTrue(iterator.hasTop()); 103 | assertEquals("key2", iterator.getTopKey().getRow().toString()); 104 | 105 | // end 106 | iterator.next(); 107 | assertFalse(iterator.hasTop()); 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /connector/datasource/src/main/scala/com/microsoft/accumulo/AccumuloDataSourceWriter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo 19 | 20 | import org.apache.spark.sql.SaveMode 21 | import org.apache.spark.sql.catalyst.InternalRow 22 | import org.apache.spark.sql.sources.v2.DataSourceOptions 23 | import org.apache.spark.sql.sources.v2.writer.{DataSourceWriter, DataWriter, DataWriterFactory, WriterCommitMessage} 24 | import org.apache.spark.sql.types.StructType 25 | import org.apache.accumulo.core.client.{Accumulo, AccumuloClient} 26 | import org.apache.hadoop.io.Text 27 | 28 | import scala.collection.JavaConverters._ 29 | import org.apache.log4j.Logger 30 | 31 | class AccumuloDataSourceWriter(schema: StructType, mode: SaveMode, options: DataSourceOptions) 32 | extends DataSourceWriter { 33 | 34 | private val logger = Logger.getLogger(classOf[AccumuloDataSourceWriter]) 35 | 36 | val tableName: String = options.tableName.get 37 | val properties = new java.util.Properties() 38 | // cannot use .putAll(options.asMap()) due to https://github.com/scala/bug/issues/10418 39 | options.asMap.asScala.foreach { case (k, v) => properties.setProperty(k, v) } 40 | 41 | // defaults based on https://accumulo.apache.org/docs/2.x/configuration/client-properties 42 | val batchThread: Int = options.get("batchThread").orElse("3").toInt 43 | val batchMemory: Long = options.get("batchMemory").orElse("50000000").toLong 44 | 45 | val client: AccumuloClient = Accumulo.newClient().from(properties).build() 46 | val tableExists: Boolean = client.tableOperations.exists(tableName) 47 | val ignore: Boolean = mode == SaveMode.Ignore && tableExists 48 | 49 | // enforce write mode 50 | try { 51 | if (tableExists) { 52 | if (mode == SaveMode.ErrorIfExists) 53 | // this should throw an error 54 | createTable() 55 | else if (mode == SaveMode.Overwrite) { 56 | client.tableOperations.delete(tableName) 57 | createTable() 58 | } 59 | } else { 60 | createTable() 61 | } 62 | } catch { 63 | // re-throw exception 64 | case exception: Throwable => throw exception 65 | } 
finally { 66 | // always close the client 67 | client.close() 68 | } 69 | 70 | def createTable(): Unit = { 71 | // adding splits to a newly created table 72 | val splits = new java.util.TreeSet( 73 | properties.getProperty("splits", "") 74 | .split(",") 75 | .map(new Text(_)) 76 | .toSeq 77 | .asJava) 78 | 79 | logger.info(s"Creating table: $tableName") 80 | client.tableOperations.create(tableName) 81 | 82 | if (!splits.isEmpty) { 83 | logger.info(s"Adding splits: $splits") 84 | client.tableOperations.addSplits(tableName, splits) 85 | } 86 | } 87 | 88 | override def createWriterFactory(): DataWriterFactory[InternalRow] = { 89 | new AccumuloDataWriterFactory(tableName, schema, mode, properties, batchThread, batchMemory, ignore) 90 | } 91 | 92 | override def commit(messages: Array[WriterCommitMessage]): Unit = { 93 | } 94 | 95 | override def abort(messages: Array[WriterCommitMessage]): Unit = { 96 | } 97 | } 98 | 99 | class AccumuloDataWriterFactory(tableName: String, 100 | schema: StructType, 101 | mode: SaveMode, 102 | properties: java.util.Properties, 103 | batchThread: Int, 104 | batchMemory: Long, 105 | ignore: Boolean) 106 | extends DataWriterFactory[InternalRow] { 107 | override def createDataWriter(partitionId: Int, taskId: Long, epochId: Long): DataWriter[InternalRow] = { 108 | new AccumuloDataWriter(tableName, schema, mode, properties, batchThread, batchMemory, ignore) 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /connector/iterator/src/test/java/com/microsoft/accumulo/spark/AvroFilterTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo.spark; 19 | 20 | import static org.junit.Assert.assertArrayEquals; 21 | 22 | import java.io.IOException; 23 | import java.util.ArrayList; 24 | import java.util.HashMap; 25 | import java.util.List; 26 | import java.util.Map; 27 | import java.util.SortedMap; 28 | import java.util.TreeMap; 29 | 30 | import org.apache.accumulo.core.client.lexicoder.LongLexicoder; 31 | import org.apache.accumulo.core.data.Key; 32 | import org.apache.accumulo.core.data.Range; 33 | import org.apache.accumulo.core.data.Value; 34 | import org.apache.accumulo.core.iterators.SortedMapIterator; 35 | import org.junit.Test; 36 | 37 | public class AvroFilterTest { 38 | 39 | private static void validateFilter(String filter, String... 
expectedKeys) throws IOException { 40 | SortedMap map = new TreeMap<>(); 41 | map.put(new Key("key1", "cf1", "cq1"), new Value(new LongLexicoder().encode(3L))); 42 | map.put(new Key("key1", "cf1", "cq2"), new Value("Hello")); 43 | map.put(new Key("key1", "cf2", ""), new Value("abc")); 44 | 45 | map.put(new Key("key2", "cf2"), new Value("def")); 46 | 47 | SortedMapIterator parentIterator = new SortedMapIterator(map); 48 | AvroRowEncoderIterator iterator = new AvroRowEncoderIterator(); 49 | 50 | Map options = new HashMap<>(); 51 | options.put(AvroRowEncoderIterator.SCHEMA, 52 | "[{\"cf\":\"cf1\",\"cq\":\"cq1\",\"t\":\"long\",\"fvn\":\"v0\",\"o\":true},{\"cf\":\"cf1\",\"cq\":\"cq2\",\"t\":\"string\",\"o\":true},{\"cf\":\"cf2\",\"t\":\"STRING\",\"o\":true,\"fvn\":\"v1\",\"o\":true}]"); 53 | 54 | options.put(AvroRowEncoderIterator.FILTER, filter); 55 | 56 | // include computed column 57 | options.put("column.vc1.long", "${cf1.cq1 + 5}"); 58 | 59 | iterator.init(parentIterator, options, new DefaultIteratorEnvironment()); 60 | iterator.seek(new Range(), AvroUtil.EMPTY_SET, false); 61 | 62 | // collect rows 63 | List foundRows = new ArrayList<>(); 64 | for (; iterator.hasTop(); iterator.next()) 65 | foundRows.add(iterator.getTopKey().getRow().toString()); 66 | 67 | assertArrayEquals(expectedKeys, foundRows.toArray(new String[0])); 68 | } 69 | 70 | @Test 71 | public void testComputedColumn() throws IOException { 72 | validateFilter("${vc1 == 8}", "key1"); 73 | } 74 | 75 | @Test 76 | public void testEquals() throws IOException { 77 | validateFilter("${v0 == 3}", "key1"); 78 | validateFilter("${v0 != 2}", "key1", "key2"); 79 | validateFilter("${v1 == 'def'}", "key2"); 80 | } 81 | 82 | @Test 83 | public void testIsNull() throws IOException { 84 | validateFilter("${v0 == null}", "key2"); 85 | } 86 | 87 | @Test 88 | public void testEndsWith() throws IOException { 89 | // test on variable 90 | validateFilter("${v1.endsWith('ef')}", "key2"); 91 | 92 | // test on object/property combination 93 | validateFilter("${cf1.cq2.endsWith('ello')}", "key1"); 94 | } 95 | 96 | @Test 97 | public void testStartsWith() throws IOException { 98 | // test on variable 99 | validateFilter("${v1.startsWith('de')}", "key2"); 100 | 101 | // test on object/property combination 102 | validateFilter("${cf1.cq2.startsWith('Hel')}", "key1"); 103 | } 104 | 105 | @Test 106 | public void testContains() throws IOException { 107 | // test on variable 108 | validateFilter("${v1.contains('d')}", "key2"); 109 | 110 | // test on object/property combination 111 | validateFilter("${cf1.cq2.contains('ell')}", "key1"); 112 | } 113 | 114 | @Test 115 | public void testIn() throws IOException { 116 | // test on variable 117 | validateFilter("${v1.in('aaa','def')}", "key2"); 118 | 119 | // test on object/property combination 120 | validateFilter("${cf1.cq2.in('A','Hello','xxx')}", "key1"); 121 | 122 | // test on object/property combination 123 | validateFilter("${cf1.cq2.in('Hello')}", "key1"); 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /connector/datasource/src/test/scala/com/microsoft/accumulo/VerifyFilterToJuel.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo 19 | 20 | import java.io.ByteArrayOutputStream 21 | 22 | import org.apache.spark.sql.sources._ 23 | import org.junit.runner.RunWith 24 | 25 | import org.scalatest.FunSuite 26 | import org.scalatest.junit.JUnitRunner 27 | 28 | @RunWith(classOf[JUnitRunner]) 29 | class VerifyFilterToJuel extends FunSuite { 30 | val map = Map[String, String]( 31 | "i" -> "i0", 32 | "x" -> "x", 33 | "j" -> "j", 34 | "k" -> "k", 35 | "x.yZ" -> "xyZ") 36 | 37 | test("Validate filter to juel operators") { 38 | assert("(i0 == 5)".equals(new FilterToJuel(map).serializeFilter(new EqualTo("i", 5)))) 39 | assert("(i0 >= 5)".equals(new FilterToJuel(map).serializeFilter(new GreaterThanOrEqual("i", 5)))) 40 | assert("(i0 > 5)".equals(new FilterToJuel(map).serializeFilter(new GreaterThan("i", 5)))) 41 | assert("(i0 <= 5)".equals(new FilterToJuel(map).serializeFilter(new LessThanOrEqual("i", 5)))) 42 | assert("(i0 < 5)".equals(new FilterToJuel(map).serializeFilter(new LessThan("i", 5)))) 43 | assert("(i0 == null)".equals(new FilterToJuel(map).serializeFilter(new IsNull("i")))) 44 | assert("(i0 != null)".equals(new FilterToJuel(map).serializeFilter(new IsNotNull("i")))) 45 | } 46 | 47 | test("Validate filter to juel composed operators") { 48 | assert("(!(i0 == 5))".equals(new FilterToJuel(map).serializeFilter( 49 | new Not(new EqualTo("i", 5))))) 50 | 51 | assert("((i0 == 5) && (x == 3.0))".equals(new FilterToJuel(map).serializeFilter( 52 | new And(new EqualTo("i", 5), new EqualTo("x", 3.0))))) 53 | 54 | assert("((i0 == 5) || (x == 3.0))".equals(new FilterToJuel(map).serializeFilter( 55 | new Or(new EqualTo("i", 5), new EqualTo("x", 3.0))))) 56 | } 57 | 58 | test("Validate filter to juel string operators") { 59 | assert("x.contains('abc')".equals(new FilterToJuel(map).serializeFilter( 60 | new StringContains("x", "abc")))) 61 | assert("x.startsWith('abc')".equals(new FilterToJuel(map).serializeFilter( 62 | new StringStartsWith("x", "abc")))) 63 | assert("x.endsWith('abc')".equals(new FilterToJuel(map).serializeFilter( 64 | new StringEndsWith("x", "abc")))) 65 | } 66 | 67 | test("Validate filter to juel in operator") { 68 | assert("xyZ.in('abc','def','ghi')".equals(new FilterToJuel(map).serializeFilter( 69 | new In("x.yZ", Array("abc", "def", "ghi"))))) 70 | } 71 | 72 | test("Validate filter string escape") { 73 | assert("(i0 == '\\'')".equals(new FilterToJuel(map).serializeFilter(new EqualTo("i", "'")))) 74 | assert("(i0 == '\\\\')".equals(new FilterToJuel(map).serializeFilter(new EqualTo("i", "\\")))) 75 | assert("(i0 == '\\\\\\'')".equals(new FilterToJuel(map).serializeFilter(new EqualTo("i", "\\'")))) 76 | } 77 | 78 | test("Validate filter combining") { 79 | val filters = Array[Filter]( 80 | new EqualTo("i", 5), 81 | new EqualTo("j", 3), 82 | new EqualTo("k", 4) 83 | ) 84 | 85 | val result = new FilterToJuel(map).serializeFilters(filters, "") 86 | 87 | assert("(i0 == 5) 
&& (j == 3) && (k == 4)".equals(result.serializedFilter)) 88 | assert(filters.length == result.supportedFilters.length) 89 | 90 | assert(result.unsupportedFilters.isEmpty) 91 | } 92 | 93 | test("Validate filter with rowKey and manual filter") { 94 | val filters = Array[Filter]( 95 | new EqualTo("i", 5), 96 | new EqualTo("j", 3), 97 | new EqualTo("k", 4), 98 | new EqualTo("rowKey", "foo") 99 | ) 100 | 101 | val result = new FilterToJuel(map).serializeFilters(filters, "a.b == 3") 102 | 103 | assert("(i0 == 5) && (j == 3) && (k == 4) && (rowKey == 'foo') && (a.b == 3)".equals(result.serializedFilter)) 104 | assert(filters.length == result.supportedFilters.length) 105 | 106 | assert(result.unsupportedFilters.isEmpty) 107 | } 108 | 109 | test("Validate filter supports unknown attributes (e.g. for prediction)") { 110 | assert("(prediction > 5)".equals(new FilterToJuel(map).serializeFilter(new GreaterThan("prediction", 5)))) 111 | } 112 | } -------------------------------------------------------------------------------- /connector/integration-test/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 4.0.0 20 | 21 | com.microsoft.masc 22 | microsoft-accumulo-spark 23 | 1.0.4 24 | 25 | com.microsoft.masc 26 | microsoft-accumulo-spark-integration-test 27 | 1.0.4 28 | Microsoft MASC, an Apache Spark connector for Apache Accumulo - Integration Test 29 | 30 | 31 | com.microsoft.masc 32 | microsoft-accumulo-spark-datasource 33 | ${project.version} 34 | test 35 | 36 | 37 | com.microsoft.masc 38 | microsoft-accumulo-spark-iterator 39 | ${project.version} 40 | test 41 | 42 | 43 | org.apache.accumulo 44 | accumulo-core 45 | test 46 | 47 | 48 | org.apache.accumulo 49 | accumulo-minicluster 50 | ${accumulo.version} 51 | test 52 | 53 | 54 | ml.combust.mleap 55 | mleap-spark_${scala.compat.version} 56 | test 57 | 58 | 59 | org.apache.spark 60 | spark-core_${scala.compat.version} 61 | test 62 | 63 | 64 | org.apache.spark 65 | spark-mllib_${scala.compat.version} 66 | test 67 | 68 | 69 | com.fasterxml.jackson.core 70 | jackson-databind 71 | 2.6.7.1 72 | test 73 | 74 | 75 | junit 76 | junit 77 | test 78 | 79 | 80 | 81 | 82 | 83 | org.apache.maven.plugins 84 | maven-compiler-plugin 85 | 86 | 87 | org.apache.maven.plugins 88 | maven-failsafe-plugin 89 | 2.22.2 90 | 91 | false 92 | 93 | 94 | 95 | 96 | integration-test 97 | verify 98 | 99 | 100 | 101 | 102 | 103 | 104 | org.apache.accumulo 105 | accumulo2-maven-plugin 106 | 1.0.0 107 | 108 | spark-connector-instance 109 | ITSecret 110 | 111 | 112 | 113 | run-plugin 114 | 115 | start 116 | stop 117 | 118 | 119 | 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /connector/iterator/src/test/java/com/microsoft/accumulo/spark/AvroRowEncoderIteratorTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo.spark; 19 | 20 | import static org.junit.Assert.assertEquals; 21 | import static org.junit.Assert.assertFalse; 22 | import static org.junit.Assert.assertTrue; 23 | 24 | import java.io.IOException; 25 | import java.util.HashMap; 26 | import java.util.Map; 27 | import java.util.SortedMap; 28 | import java.util.TreeMap; 29 | 30 | import org.apache.accumulo.core.data.Key; 31 | import org.apache.accumulo.core.data.Range; 32 | import org.apache.accumulo.core.data.Value; 33 | import org.apache.accumulo.core.iterators.SortedMapIterator; 34 | import org.apache.avro.Schema; 35 | import org.apache.avro.SchemaBuilder; 36 | import org.apache.avro.generic.GenericRecord; 37 | import org.junit.Test; 38 | 39 | public class AvroRowEncoderIteratorTest { 40 | class MyRow { 41 | public String key; 42 | 43 | public String cf1cq1; 44 | 45 | MyRow(String key, String cf1cq1) { 46 | this.key = key; 47 | this.cf1cq1 = cf1cq1; 48 | } 49 | } 50 | 51 | private void validateSingleRowSimpleSchema(SortedMap map, MyRow... expectedRows) throws IOException { 52 | SortedMapIterator parentIterator = new SortedMapIterator(map); 53 | 54 | // setup avro encoder iterator 55 | AvroRowEncoderIterator iterator = new AvroRowEncoderIterator(); 56 | 57 | Map options = new HashMap<>(); 58 | options.put(AvroRowEncoderIterator.SCHEMA, "[{\"cf\":\"cf1\",\"cq\":\"cq1\",\"t\":\"STRING\",\"o\":true}]"); 59 | 60 | iterator.init(parentIterator, options, new DefaultIteratorEnvironment()); 61 | iterator.seek(new Range(), AvroUtil.EMPTY_SET, false); 62 | 63 | // the expected avro schema 64 | Schema schema = SchemaBuilder.record("root").fields().name("cf1") 65 | .type(SchemaBuilder.record("cf1").fields().optionalString("cq1").endRecord()).noDefault().endRecord(); 66 | 67 | for (MyRow row : expectedRows) { 68 | assertTrue(iterator.hasTop()); 69 | 70 | // validate key 71 | assertEquals(row.key, iterator.getTopKey().getRow().toString()); 72 | 73 | // validate value 74 | byte[] data = iterator.getTopValue().get(); 75 | 76 | GenericRecord record = AvroUtil.deserialize(data, schema); 77 | GenericRecord cf1Record = (GenericRecord) record.get("cf1"); 78 | 79 | assertEquals(row.cf1cq1, cf1Record.get("cq1").toString()); 80 | 81 | // move to next 82 | iterator.next(); 83 | } 84 | 85 | assertFalse(iterator.hasTop()); 86 | } 87 | 88 | @Test 89 | public void testSingleFieldString() throws IOException { 90 | // setup input iterator 91 | SortedMap map = new TreeMap<>(); 92 | map.put(new Key("key1", "cf1", "cq1"), new Value("abc")); 93 | 94 | validateSingleRowSimpleSchema(map, new MyRow("key1", "abc")); 95 | } 96 | 97 | @Test 98 | public void testSkippedField1() throws IOException { 99 | // setup input iterator 100 | SortedMap map = new TreeMap<>(); 101 | map.put(new Key("key1", "cf1", "cq1"), new Value("abc")); 102 | map.put(new Key("key1", "cf1", "cq2"), new Value("def")); 103 | 104 | validateSingleRowSimpleSchema(map, new MyRow("key1", "abc")); 105 | } 106 | 107 | @Test 108 | public void testSkippedField2() throws IOException { 109 | // setup input iterator 110 | SortedMap map = 
new TreeMap<>(); 111 | map.put(new Key("key1", "cf0", "cq1"), new Value("xxx")); 112 | map.put(new Key("key1", "cf1", "cq1"), new Value("abc")); 113 | map.put(new Key("key1", "cf1", "cq2"), new Value("def")); 114 | 115 | validateSingleRowSimpleSchema(map, new MyRow("key1", "abc")); 116 | } 117 | 118 | @Test 119 | public void testMultipleRows() throws IOException { 120 | // setup input iterator 121 | SortedMap map = new TreeMap<>(); 122 | map.put(new Key("key1", "cf1", "cq1"), new Value("xxx")); 123 | map.put(new Key("key2", "cf0", "cq1"), new Value("abc")); 124 | map.put(new Key("key3", "cf1", "cq1"), new Value("yyy")); 125 | 126 | validateSingleRowSimpleSchema(map, new MyRow("key1", "xxx"), new MyRow("key3", "yyy")); 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /connector/zipfs/src/main/java/com/microsoft/accumulo/zipfs/ZipFileStore.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved. 3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 | * 5 | * This code is free software; you can redistribute it and/or modify it 6 | * under the terms of the GNU General Public License version 2 only, as 7 | * published by the Free Software Foundation. Oracle designates this 8 | * particular file as subject to the "Classpath" exception as provided 9 | * by Oracle in the LICENSE file that accompanied this code. 10 | * 11 | * This code is distributed in the hope that it will be useful, but WITHOUT 12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 | * version 2 for more details (a copy is included in the LICENSE file that 15 | * accompanied this code). 16 | * 17 | * You should have received a copy of the GNU General Public License version 18 | * 2 along with this work; if not, write to the Free Software Foundation, 19 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 | * 21 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 | * or visit www.oracle.com if you need additional information or have any 23 | * questions. 
24 | */ 25 | 26 | package com.microsoft.accumulo.zipfs; 27 | 28 | import java.io.IOException; 29 | import java.nio.file.FileStore; 30 | import java.nio.file.FileSystems; 31 | import java.nio.file.Files; 32 | import java.nio.file.Path; 33 | import java.nio.file.attribute.BasicFileAttributeView; 34 | import java.nio.file.attribute.FileAttributeView; 35 | import java.nio.file.attribute.FileOwnerAttributeView; 36 | import java.nio.file.attribute.FileStoreAttributeView; 37 | import java.nio.file.attribute.PosixFileAttributeView; 38 | 39 | /** 40 | * @author Xueming Shen, Rajendra Gutupalli, Jaya Hangal 41 | */ 42 | class ZipFileStore extends FileStore { 43 | 44 | private final ZipFileSystem zfs; 45 | 46 | ZipFileStore(ZipPath zpath) { 47 | this.zfs = zpath.getFileSystem(); 48 | } 49 | 50 | @Override 51 | public String name() { 52 | return zfs.toString() + "/"; 53 | } 54 | 55 | @Override 56 | public String type() { 57 | return "zipfs"; 58 | } 59 | 60 | @Override 61 | public boolean isReadOnly() { 62 | return zfs.isReadOnly(); 63 | } 64 | 65 | @Override 66 | public boolean supportsFileAttributeView(Class type) { 67 | return (type == BasicFileAttributeView.class || 68 | type == ZipFileAttributeView.class || 69 | ((type == FileOwnerAttributeView.class || 70 | type == PosixFileAttributeView.class) && zfs.supportPosix)); 71 | } 72 | 73 | @Override 74 | public boolean supportsFileAttributeView(String name) { 75 | return "basic".equals(name) || "zip".equals(name) || 76 | (("owner".equals(name) || "posix".equals(name)) && zfs.supportPosix); 77 | } 78 | 79 | @Override 80 | public V getFileStoreAttributeView(Class type) { 81 | if (type == null) 82 | throw new NullPointerException(); 83 | return null; 84 | } 85 | 86 | @Override 87 | public long getTotalSpace() throws IOException { 88 | return new ZipFileStoreAttributes(this).totalSpace(); 89 | } 90 | 91 | @Override 92 | public long getUsableSpace() throws IOException { 93 | return new ZipFileStoreAttributes(this).usableSpace(); 94 | } 95 | 96 | @Override 97 | public long getUnallocatedSpace() throws IOException { 98 | return new ZipFileStoreAttributes(this).unallocatedSpace(); 99 | } 100 | 101 | @Override 102 | public Object getAttribute(String attribute) throws IOException { 103 | if (attribute.equals("totalSpace")) 104 | return getTotalSpace(); 105 | if (attribute.equals("usableSpace")) 106 | return getUsableSpace(); 107 | if (attribute.equals("unallocatedSpace")) 108 | return getUnallocatedSpace(); 109 | throw new UnsupportedOperationException("does not support the given attribute"); 110 | } 111 | 112 | private static class ZipFileStoreAttributes { 113 | final FileStore fstore; 114 | final long size; 115 | 116 | ZipFileStoreAttributes(ZipFileStore fileStore) 117 | throws IOException 118 | { 119 | Path path = FileSystems.getDefault().getPath(fileStore.name()); 120 | this.size = Files.size(path); 121 | this.fstore = Files.getFileStore(path); 122 | } 123 | 124 | long totalSpace() { 125 | return size; 126 | } 127 | 128 | long usableSpace() throws IOException { 129 | if (!fstore.isReadOnly()) 130 | return fstore.getUsableSpace(); 131 | return 0; 132 | } 133 | 134 | long unallocatedSpace() throws IOException { 135 | if (!fstore.isReadOnly()) 136 | return fstore.getUnallocatedSpace(); 137 | return 0; 138 | } 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /connector/zipfs/src/main/java/com/microsoft/accumulo/zipfs/ZipCoder.java: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2009, 2018, Oracle and/or its affiliates. All rights reserved. 3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 | * 5 | * This code is free software; you can redistribute it and/or modify it 6 | * under the terms of the GNU General Public License version 2 only, as 7 | * published by the Free Software Foundation. Oracle designates this 8 | * particular file as subject to the "Classpath" exception as provided 9 | * by Oracle in the LICENSE file that accompanied this code. 10 | * 11 | * This code is distributed in the hope that it will be useful, but WITHOUT 12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 | * version 2 for more details (a copy is included in the LICENSE file that 15 | * accompanied this code). 16 | * 17 | * You should have received a copy of the GNU General Public License version 18 | * 2 along with this work; if not, write to the Free Software Foundation, 19 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 | * 21 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 | * or visit www.oracle.com if you need additional information or have any 23 | * questions. 24 | */ 25 | 26 | package com.microsoft.accumulo.zipfs; 27 | 28 | import java.nio.ByteBuffer; 29 | import java.nio.CharBuffer; 30 | import java.nio.charset.Charset; 31 | import java.nio.charset.CharsetDecoder; 32 | import java.nio.charset.CharsetEncoder; 33 | import java.nio.charset.CoderResult; 34 | import java.nio.charset.CodingErrorAction; 35 | import java.util.Arrays; 36 | 37 | import static java.nio.charset.StandardCharsets.ISO_8859_1; 38 | import static java.nio.charset.StandardCharsets.UTF_8; 39 | 40 | /** 41 | * Utility class for zipfile name and comment decoding and encoding 42 | * 43 | * @author Xueming Shen 44 | */ 45 | class ZipCoder { 46 | 47 | static class UTF8 extends ZipCoder { 48 | UTF8() { 49 | super(UTF_8); 50 | } 51 | 52 | @Override 53 | byte[] getBytes(String s) { // fast pass for ascii 54 | for (int i = 0; i < s.length(); i++) { 55 | if (s.charAt(i) > 0x7f) return super.getBytes(s); 56 | } 57 | return s.getBytes(ISO_8859_1); 58 | } 59 | 60 | @Override 61 | String toString(byte[] ba) { 62 | for (byte b : ba) { 63 | if (b < 0) return super.toString(ba); 64 | } 65 | return new String(ba, ISO_8859_1); 66 | } 67 | } 68 | 69 | private static final ZipCoder utf8 = new UTF8(); 70 | 71 | public static ZipCoder get(String csn) { 72 | Charset cs = Charset.forName(csn); 73 | if (cs.name().equals("UTF-8")) { 74 | return utf8; 75 | } 76 | return new ZipCoder(cs); 77 | } 78 | 79 | String toString(byte[] ba) { 80 | CharsetDecoder cd = decoder().reset(); 81 | int clen = (int)(ba.length * cd.maxCharsPerByte()); 82 | char[] ca = new char[clen]; 83 | if (clen == 0) 84 | return new String(ca); 85 | ByteBuffer bb = ByteBuffer.wrap(ba, 0, ba.length); 86 | CharBuffer cb = CharBuffer.wrap(ca); 87 | CoderResult cr = cd.decode(bb, cb, true); 88 | if (!cr.isUnderflow()) 89 | throw new IllegalArgumentException(cr.toString()); 90 | cr = cd.flush(cb); 91 | if (!cr.isUnderflow()) 92 | throw new IllegalArgumentException(cr.toString()); 93 | return new String(ca, 0, cb.position()); 94 | } 95 | 96 | byte[] getBytes(String s) { 97 | CharsetEncoder ce = encoder().reset(); 98 | char[] ca = s.toCharArray(); 99 | int len = (int)(ca.length * 
ce.maxBytesPerChar()); 100 | byte[] ba = new byte[len]; 101 | if (len == 0) 102 | return ba; 103 | ByteBuffer bb = ByteBuffer.wrap(ba); 104 | CharBuffer cb = CharBuffer.wrap(ca); 105 | CoderResult cr = ce.encode(cb, bb, true); 106 | if (!cr.isUnderflow()) 107 | throw new IllegalArgumentException(cr.toString()); 108 | cr = ce.flush(bb); 109 | if (!cr.isUnderflow()) 110 | throw new IllegalArgumentException(cr.toString()); 111 | if (bb.position() == ba.length) // defensive copy? 112 | return ba; 113 | else 114 | return Arrays.copyOf(ba, bb.position()); 115 | } 116 | 117 | boolean isUTF8() { 118 | return cs == UTF_8; 119 | } 120 | 121 | private Charset cs; 122 | 123 | private ZipCoder(Charset cs) { 124 | this.cs = cs; 125 | } 126 | 127 | private final ThreadLocal decTL = new ThreadLocal<>(); 128 | private final ThreadLocal encTL = new ThreadLocal<>(); 129 | 130 | private CharsetDecoder decoder() { 131 | CharsetDecoder dec = decTL.get(); 132 | if (dec == null) { 133 | dec = cs.newDecoder() 134 | .onMalformedInput(CodingErrorAction.REPORT) 135 | .onUnmappableCharacter(CodingErrorAction.REPORT); 136 | decTL.set(dec); 137 | } 138 | return dec; 139 | } 140 | 141 | private CharsetEncoder encoder() { 142 | CharsetEncoder enc = encTL.get(); 143 | if (enc == null) { 144 | enc = cs.newEncoder() 145 | .onMalformedInput(CodingErrorAction.REPORT) 146 | .onUnmappableCharacter(CodingErrorAction.REPORT); 147 | encTL.set(enc); 148 | } 149 | return enc; 150 | } 151 | } 152 | -------------------------------------------------------------------------------- /connector/iterator/src/test/java/com/microsoft/accumulo/spark/AvroRowTopLevelTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.microsoft.accumulo.spark; 19 | 20 | import static org.junit.Assert.assertEquals; 21 | import static org.junit.Assert.assertFalse; 22 | import static org.junit.Assert.assertNull; 23 | import static org.junit.Assert.assertTrue; 24 | 25 | import java.io.IOException; 26 | import java.util.Arrays; 27 | import java.util.HashMap; 28 | import java.util.Map; 29 | import java.util.SortedMap; 30 | import java.util.TreeMap; 31 | 32 | import org.apache.accumulo.core.client.lexicoder.LongLexicoder; 33 | import org.apache.accumulo.core.data.Key; 34 | import org.apache.accumulo.core.data.Range; 35 | import org.apache.accumulo.core.data.Value; 36 | import org.apache.accumulo.core.iterators.SortedMapIterator; 37 | import com.microsoft.accumulo.spark.record.AvroSchemaBuilder; 38 | import com.microsoft.accumulo.spark.record.RowBuilderField; 39 | import org.apache.avro.Schema; 40 | import org.apache.avro.Schema.Field; 41 | import org.apache.avro.Schema.Type; 42 | import org.apache.avro.generic.GenericRecord; 43 | import org.apache.avro.util.Utf8; 44 | import org.junit.Test; 45 | 46 | public class AvroRowTopLevelTest { 47 | @Test 48 | public void testSchemaGeneration() { 49 | RowBuilderField[] schemaMappingFields = new RowBuilderField[] { 50 | // row 0 51 | new RowBuilderField("cf1", "cq1", "long", "v0"), 52 | // row 1 53 | new RowBuilderField("cf2", null, "double", "v1") }; 54 | 55 | Schema schema = AvroSchemaBuilder.buildSchema(Arrays.asList(schemaMappingFields)); 56 | 57 | assertEquals(Type.RECORD, schema.getType()); 58 | assertEquals(2, schema.getFields().size()); 59 | 60 | Field f0 = schema.getFields().get(0); 61 | 62 | // cf1 nested record 63 | assertEquals(Type.RECORD, f0.schema().getType()); 64 | assertEquals(1, f0.schema().getFields().size()); 65 | 66 | // cf1.cq1 nested field 67 | Field f00 = f0.schema().getFields().get(0); 68 | 69 | // nullable long 70 | assertEquals(2, f00.schema().getTypes().size()); 71 | assertEquals(Type.NULL, f00.schema().getTypes().get(0).getType()); 72 | assertEquals(Type.LONG, f00.schema().getTypes().get(1).getType()); 73 | 74 | // cf2 top-level field 75 | Field f1 = schema.getFields().get(1); 76 | 77 | // nullable double 78 | assertEquals(2, f1.schema().getTypes().size()); 79 | assertEquals(Type.DOUBLE, f1.schema().getTypes().get(1).getType()); 80 | } 81 | 82 | @Test 83 | public void testTopLevelFields() throws IOException { 84 | SortedMap map = new TreeMap<>(); 85 | map.put(new Key("key1", "cf1", "cq1"), new Value(new LongLexicoder().encode(3L))); 86 | map.put(new Key("key1", "cf2", ""), new Value("abc")); 87 | 88 | map.put(new Key("key2", "cf2"), new Value("def")); 89 | 90 | SortedMapIterator parentIterator = new SortedMapIterator(map); 91 | AvroRowEncoderIterator iterator = new AvroRowEncoderIterator(); 92 | 93 | Map options = new HashMap<>(); 94 | options.put(AvroRowEncoderIterator.SCHEMA, 95 | "[{\"cf\":\"cf1\",\"cq\":\"cq1\",\"t\":\"long\",\"o\":true},{\"cf\":\"cf2\",\"t\":\"STRING\",\"o\":true}]"); 96 | 97 | iterator.init(parentIterator, options, new DefaultIteratorEnvironment()); 98 | iterator.seek(new Range(), AvroUtil.EMPTY_SET, false); 99 | 100 | RowBuilderField[] schemaMappingFields = new RowBuilderField[] { 101 | // row 0 102 | new RowBuilderField("cf1", "cq1", "long", "v0"), 103 | // row 1 104 | new RowBuilderField("cf2", null, "string", "v1") }; 105 | 106 | Schema schema = AvroSchemaBuilder.buildSchema(Arrays.asList(schemaMappingFields)); 107 | 108 | // ############################## ROW 1 109 | assertTrue(iterator.hasTop()); 110 
| assertEquals("key1", iterator.getTopKey().getRow().toString()); 111 | 112 | // validate value 113 | byte[] data = iterator.getTopValue().get(); 114 | 115 | GenericRecord record = AvroUtil.deserialize(data, schema); 116 | GenericRecord cf1Record = (GenericRecord) record.get("cf1"); 117 | 118 | assertEquals(3L, cf1Record.get("cq1")); 119 | assertEquals("abc", record.get("cf2").toString()); 120 | assertTrue(record.get("cf2") instanceof Utf8); 121 | 122 | // ############################## ROW 2 123 | iterator.next(); 124 | 125 | assertTrue(iterator.hasTop()); 126 | assertEquals("key2", iterator.getTopKey().getRow().toString()); 127 | 128 | // validate value 129 | data = iterator.getTopValue().get(); 130 | 131 | record = AvroUtil.deserialize(data, schema); 132 | cf1Record = (GenericRecord) record.get("cf1"); 133 | 134 | assertNull(cf1Record.get("cq1")); 135 | assertEquals("def", record.get("cf2").toString()); 136 | assertTrue(record.get("cf2") instanceof Utf8); 137 | 138 | // End of data 139 | iterator.next(); 140 | assertFalse(iterator.hasTop()); 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /connector/datasource/src/test/scala/com/microsoft/accumulo/VerifyAccumuloSchema.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.microsoft.accumulo 19 | 20 | import java.io.ByteArrayOutputStream 21 | 22 | import org.apache.avro.generic.{GenericRecord, GenericRecordBuilder} 23 | import org.apache.avro.io.EncoderFactory 24 | import org.apache.avro.specific.SpecificDatumWriter 25 | import org.apache.spark.sql.types.{DataTypes, StructField, StructType} 26 | 27 | import org.junit.runner.RunWith 28 | 29 | import org.scalatest.FunSuite 30 | import org.scalatest.junit.JUnitRunner 31 | 32 | @RunWith(classOf[JUnitRunner]) 33 | class VerifyAccumuloSchema extends FunSuite { 34 | test("Validate catalyst schema to json serialization") { 35 | val schema = (new StructType) 36 | .add(StructField("cf1", (new StructType) 37 | .add("cq1", DataTypes.StringType, true) 38 | .add("cq2", DataTypes.DoubleType, true) 39 | , true)) 40 | .add(StructField("cf2", (new StructType) 41 | .add("cq_a", DataTypes.IntegerType, true) 42 | .add("cq_b", DataTypes.FloatType, true) 43 | , true)) 44 | .add("cf3", DataTypes.StringType, false) 45 | 46 | val jsonActual = AvroUtil.catalystSchemaToJson(schema).json 47 | val jsonExpected = "[{\"cf\":\"cf1\",\"cq\":\"cq1\",\"fvn\":\"v0\",\"t\":\"STRING\",\"o\":true}" + 48 | ",{\"cf\":\"cf1\",\"cq\":\"cq2\",\"fvn\":\"v1\",\"t\":\"DOUBLE\",\"o\":true}" + 49 | ",{\"cf\":\"cf2\",\"cq\":\"cq_a\",\"fvn\":\"v2\",\"t\":\"INTEGER\",\"o\":true}" + 50 | ",{\"cf\":\"cf2\",\"cq\":\"cq_b\",\"fvn\":\"v3\",\"t\":\"FLOAT\",\"o\":true}" + 51 | ",{\"cf\":\"cf3\",\"fvn\":\"v4\",\"t\":\"STRING\",\"o\":true}]" 52 | 53 | assert(jsonActual == jsonExpected) 54 | } 55 | 56 | test("Validate catalyst schema to json serialization with pruned output schema") { 57 | val inputSchema = (new StructType) 58 | .add(StructField("cf1", (new StructType) 59 | .add("cq1", DataTypes.StringType, true) 60 | .add("cq2", DataTypes.DoubleType, true) 61 | , true)) 62 | .add(StructField("cf2", (new StructType) 63 | .add("cq_a", DataTypes.IntegerType, true) 64 | .add("cq_b", DataTypes.FloatType, true) 65 | , true)) 66 | .add("cf3", DataTypes.StringType, false) 67 | .add("cf4", DataTypes.LongType, false) 68 | 69 | val outputSchema = (new StructType) 70 | .add(StructField("cf1", (new StructType) 71 | .add("cq1", DataTypes.StringType, true) 72 | , true)) 73 | .add("cf3", DataTypes.StringType, false) 74 | 75 | val jsonActual = AvroUtil.catalystSchemaToJson(inputSchema, outputSchema).json 76 | val jsonExpected = "[{\"cf\":\"cf1\",\"cq\":\"cq1\",\"fvn\":\"v0\",\"t\":\"STRING\",\"o\":true}" + 77 | ",{\"cf\":\"cf1\",\"cq\":\"cq2\",\"fvn\":\"v1\",\"t\":\"DOUBLE\",\"o\":false}" + 78 | ",{\"cf\":\"cf2\",\"cq\":\"cq_a\",\"fvn\":\"v2\",\"t\":\"INTEGER\",\"o\":false}" + 79 | ",{\"cf\":\"cf2\",\"cq\":\"cq_b\",\"fvn\":\"v3\",\"t\":\"FLOAT\",\"o\":false}" + 80 | ",{\"cf\":\"cf3\",\"fvn\":\"v4\",\"t\":\"STRING\",\"o\":true}" + 81 | ",{\"cf\":\"cf4\",\"fvn\":\"v5\",\"t\":\"LONG\",\"o\":false}]" 82 | 83 | assert(jsonActual == jsonExpected) 84 | } 85 | 86 | test("Validate catalyst schema to avro serialization") { 87 | val schema = (new StructType) 88 | .add(StructField("cf1", (new StructType) 89 | .add("cq1", DataTypes.StringType, true) 90 | .add("cq2", DataTypes.DoubleType, false) 91 | .add("cq3", DataTypes.DoubleType, true) 92 | , true)) 93 | .add(StructField("cf2", (new StructType) 94 | .add("cq_a", DataTypes.IntegerType, true) 95 | .add("cq_b", DataTypes.FloatType, true) 96 | , true)) 97 | 98 | val avroSchema = AvroUtil.catalystSchemaToAvroSchema(schema) 99 | 100 | val builder = new GenericRecordBuilder(avroSchema) 101 | 102 | val builderCf1 = new 
GenericRecordBuilder(avroSchema.getField("cf1").schema()) 103 | val builderCf2 = new GenericRecordBuilder(avroSchema.getField("cf2").schema()) 104 | // check if clear() helps perf? 105 | 106 | builderCf1.set("cq1", "foo") 107 | builderCf1.set("cq2", 2.3) 108 | 109 | builderCf2.set("cq_a", 1) 110 | builderCf2.set("cq_b", 1.2f) 111 | 112 | builder.set("cf1", builderCf1.build()) 113 | builder.set("cf2", builderCf2.build()) 114 | 115 | val output = new ByteArrayOutputStream() 116 | val encoder = EncoderFactory.get.jsonEncoder(avroSchema, output) 117 | 118 | val writer = new SpecificDatumWriter[GenericRecord](avroSchema) 119 | writer.write(builder.build(), encoder) 120 | 121 | encoder.flush() 122 | 123 | val jsonActual = new String(output.toByteArray) 124 | 125 | val jsonExpected = "{\"cf1\":{\"cq1\":{\"string\":\"foo\"}," + 126 | "\"cq2\":2.3,\"cq3\":null}," + 127 | "\"cf2\":{\"cq_a\":{\"int\":1},\"cq_b\":{\"float\":1.2}}}" 128 | 129 | assert(jsonActual == jsonExpected) 130 | } 131 | 132 | test("Validate unsupported types") { 133 | val schema = (new StructType) 134 | .add("cf3", DataTypes.CalendarIntervalType, false) 135 | 136 | assertThrows[UnsupportedOperationException] { 137 | AvroUtil.catalystSchemaToAvroSchema(schema) 138 | } 139 | } 140 | } -------------------------------------------------------------------------------- /connector/datasource/src/main/scala/com/microsoft/accumulo/AccumuloDataWriter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.microsoft.accumulo 19 | 20 | import org.apache.accumulo.core.client.lexicoder._ 21 | import org.apache.accumulo.core.client.{Accumulo, BatchWriterConfig} 22 | import org.apache.accumulo.core.data.Mutation 23 | import org.apache.spark.sql.SaveMode 24 | import org.apache.spark.sql.catalyst.InternalRow 25 | import org.apache.spark.sql.sources.v2.writer.{DataWriter, WriterCommitMessage} 26 | import org.apache.spark.sql.types._ 27 | import org.apache.spark.unsafe.types.UTF8String 28 | 29 | class AccumuloDataWriter(tableName: String, 30 | schema: StructType, 31 | mode: SaveMode, 32 | properties: java.util.Properties, 33 | batchThread: Int, 34 | batchMemory: Long, 35 | ignore: Boolean) 36 | extends DataWriter[InternalRow] { 37 | 38 | private val rowKeyIdx = schema.fieldIndex(properties.getProperty("rowkey")) 39 | 40 | private val client = Accumulo.newClient().from(properties).build() 41 | 42 | private val batchWriter = client.createBatchWriter( 43 | tableName, 44 | new BatchWriterConfig().setMaxWriteThreads(batchThread).setMaxMemory(batchMemory)) 45 | 46 | private val doubleEncoder = new DoubleLexicoder 47 | private val floatEncoder = new FloatLexicoder 48 | private val longEncoder = new LongLexicoder 49 | private val intEncoder = new IntegerLexicoder 50 | private val stringEncoder = new StringLexicoder 51 | 52 | private val doubleAccessor = InternalRow.getAccessor(DoubleType) 53 | private val floatAccessor = InternalRow.getAccessor(FloatType) 54 | private val longAccessor = InternalRow.getAccessor(LongType) 55 | private val intAccessor = InternalRow.getAccessor(IntegerType) 56 | private val stringAccessor = InternalRow.getAccessor(StringType) 57 | 58 | private def getEncoder(fieldIdx: Int, field: StructField) = { 59 | field.dataType match { 60 | case DoubleType => (record: InternalRow) => doubleEncoder.encode(doubleAccessor(record, fieldIdx).asInstanceOf[Double]) 61 | case FloatType => (record: InternalRow) => floatEncoder.encode(floatAccessor(record, fieldIdx).asInstanceOf[Float]) 62 | case LongType => (record: InternalRow) => longEncoder.encode(longAccessor(record, fieldIdx).asInstanceOf[Long]) 63 | case IntegerType => (record: InternalRow) => intEncoder.encode(intAccessor(record, fieldIdx).asInstanceOf[Integer]) 64 | case StringType => (record: InternalRow) => { 65 | val obj = stringAccessor(record, fieldIdx) 66 | if (obj == null) null else obj.asInstanceOf[UTF8String].getBytes 67 | } 68 | } 69 | } 70 | 71 | private val structAccessor = InternalRow.getAccessor(new StructType()) 72 | 73 | // pre-compute which fields and how to create the mutations... 
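  // (Added sketch, not part of the original source.) Each schema field is compiled
  // once, up front, into a closure of type (Mutation, InternalRow) => Unit, and the
  // per-record write path then only walks the resulting array of closures. For a
  // hypothetical top-level long column "count" at schema index idx, the generated
  // closure is roughly:
  //   (m: Mutation, r: InternalRow) =>
  //     m.put(stringEncoder.encode("count"), Array.empty[Byte],
  //       longEncoder.encode(longAccessor(r, idx).asInstanceOf[Long]))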
74 | private val recordToMutation = schema.fields.zipWithIndex 75 | // exclude rowkey 76 | .filter({ case (_, cfIdx: Int) => cfIdx != rowKeyIdx }) 77 | // loop through the rest of the fields 78 | .map { case (cf: StructField, cfIdx: Int) => 79 | // check which types we have top-level 80 | cf.dataType match { 81 | case ct: StructType => 82 | val nestedFields = ct.fields.zipWithIndex.map { 83 | case (cq: StructField, cqIdx) => 84 | val cfBytes = stringEncoder.encode(cf.name) 85 | val cqBytes = stringEncoder.encode(cq.name) 86 | val encoder = getEncoder(cqIdx, cq) 87 | 88 | (mutation: Mutation, nestedRecord: InternalRow) => { 89 | // not using the fluent interface to provide backward compat 90 | val value = encoder(nestedRecord) 91 | if (value != null) 92 | mutation.put(cfBytes, cqBytes, value) 93 | } 94 | } 95 | 96 | // parent function 97 | (mutation: Mutation, record: InternalRow) => { 98 | val nestedRecord = structAccessor(record, cfIdx).asInstanceOf[InternalRow] 99 | 100 | nestedFields.foreach { _(mutation, nestedRecord) } 101 | } 102 | case _ => 103 | val cfBytes = stringEncoder.encode(cf.name) 104 | val encoder = getEncoder(cfIdx, cf) 105 | 106 | (mutation: Mutation, record: InternalRow) => { 107 | // println(s"\twriting row ${cf.name}") 108 | 109 | // not using the fluent interface to provide backward compatibility 110 | val value = encoder(record) 111 | if (value != null) 112 | mutation.put(cfBytes, Array.empty[Byte], value) 113 | } 114 | } 115 | } 116 | 117 | // TODO: expose this as another input column 118 | // private val columnVisibilityEmpty = new ColumnVisibility 119 | 120 | def write(record: InternalRow): Unit = { 121 | val rowKeyRaw = stringAccessor(record, rowKeyIdx) 122 | 123 | // skip if the rowKey is null or ignore flag is set 124 | if (rowKeyRaw != null && !ignore) { 125 | val rowKey = rowKeyRaw.asInstanceOf[UTF8String].getBytes 126 | 127 | val mutation = new Mutation(rowKey) 128 | recordToMutation.foreach { _(mutation, record) } 129 | batchWriter.addMutation(mutation) 130 | } 131 | } 132 | 133 | def commit(): WriterCommitMessage = { 134 | batchWriter.close() 135 | client.close() 136 | 137 | WriteSucceeded 138 | } 139 | 140 | def abort(): Unit = { 141 | batchWriter.close() 142 | client.close() 143 | } 144 | 145 | object WriteSucceeded extends WriterCommitMessage 146 | } 147 | -------------------------------------------------------------------------------- /connector/zipfs/src/main/java/com/microsoft/accumulo/zipfs/ZipFileAttributeView.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved. 3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 | * 5 | * This code is free software; you can redistribute it and/or modify it 6 | * under the terms of the GNU General Public License version 2 only, as 7 | * published by the Free Software Foundation. Oracle designates this 8 | * particular file as subject to the "Classpath" exception as provided 9 | * by Oracle in the LICENSE file that accompanied this code. 10 | * 11 | * This code is distributed in the hope that it will be useful, but WITHOUT 12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 | * version 2 for more details (a copy is included in the LICENSE file that 15 | * accompanied this code). 
16 | * 17 | * You should have received a copy of the GNU General Public License version 18 | * 2 along with this work; if not, write to the Free Software Foundation, 19 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 | * 21 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 | * or visit www.oracle.com if you need additional information or have any 23 | * questions. 24 | */ 25 | 26 | package com.microsoft.accumulo.zipfs; 27 | 28 | import java.io.IOException; 29 | import java.nio.file.attribute.BasicFileAttributeView; 30 | import java.nio.file.attribute.BasicFileAttributes; 31 | import java.nio.file.attribute.FileTime; 32 | import java.nio.file.attribute.PosixFilePermission; 33 | import java.util.LinkedHashMap; 34 | import java.util.Map; 35 | import java.util.Set; 36 | 37 | /** 38 | * @author Xueming Shen, Rajendra Gutupalli, Jaya Hangal 39 | */ 40 | class ZipFileAttributeView implements BasicFileAttributeView { 41 | static enum AttrID { 42 | size, 43 | creationTime, 44 | lastAccessTime, 45 | lastModifiedTime, 46 | isDirectory, 47 | isRegularFile, 48 | isSymbolicLink, 49 | isOther, 50 | fileKey, 51 | compressedSize, 52 | crc, 53 | method, 54 | owner, 55 | group, 56 | permissions 57 | } 58 | 59 | final ZipPath path; 60 | private final boolean isZipView; 61 | 62 | ZipFileAttributeView(ZipPath path, boolean isZipView) { 63 | this.path = path; 64 | this.isZipView = isZipView; 65 | } 66 | 67 | @Override 68 | public String name() { 69 | return isZipView ? "zip" : "basic"; 70 | } 71 | 72 | @Override 73 | public BasicFileAttributes readAttributes() throws IOException { 74 | return path.readAttributes(); 75 | } 76 | 77 | @Override 78 | public void setTimes(FileTime lastModifiedTime, 79 | FileTime lastAccessTime, 80 | FileTime createTime) 81 | throws IOException 82 | { 83 | path.setTimes(lastModifiedTime, lastAccessTime, createTime); 84 | } 85 | 86 | public void setPermissions(Set perms) throws IOException { 87 | path.setPermissions(perms); 88 | } 89 | 90 | @SuppressWarnings("unchecked") 91 | void setAttribute(String attribute, Object value) 92 | throws IOException 93 | { 94 | try { 95 | if (AttrID.valueOf(attribute) == AttrID.lastModifiedTime) 96 | setTimes((FileTime)value, null, null); 97 | if (AttrID.valueOf(attribute) == AttrID.lastAccessTime) 98 | setTimes(null, (FileTime)value, null); 99 | if (AttrID.valueOf(attribute) == AttrID.creationTime) 100 | setTimes(null, null, (FileTime)value); 101 | if (AttrID.valueOf(attribute) == AttrID.permissions) 102 | setPermissions((Set)value); 103 | } catch (IllegalArgumentException x) { 104 | throw new UnsupportedOperationException("'" + attribute + 105 | "' is unknown or read-only attribute"); 106 | } 107 | } 108 | 109 | Map readAttributes(String attributes) 110 | throws IOException 111 | { 112 | ZipFileAttributes zfas = (ZipFileAttributes)readAttributes(); 113 | LinkedHashMap map = new LinkedHashMap<>(); 114 | if ("*".equals(attributes)) { 115 | for (AttrID id : AttrID.values()) { 116 | try { 117 | map.put(id.name(), attribute(id, zfas)); 118 | } catch (IllegalArgumentException x) {} 119 | } 120 | } else { 121 | String[] as = attributes.split(","); 122 | for (String a : as) { 123 | try { 124 | map.put(a, attribute(AttrID.valueOf(a), zfas)); 125 | } catch (IllegalArgumentException x) {} 126 | } 127 | } 128 | return map; 129 | } 130 | 131 | Object attribute(AttrID id, ZipFileAttributes zfas) { 132 | switch (id) { 133 | case size: 134 | return zfas.size(); 135 | case creationTime: 136 | return 
zfas.creationTime(); 137 | case lastAccessTime: 138 | return zfas.lastAccessTime(); 139 | case lastModifiedTime: 140 | return zfas.lastModifiedTime(); 141 | case isDirectory: 142 | return zfas.isDirectory(); 143 | case isRegularFile: 144 | return zfas.isRegularFile(); 145 | case isSymbolicLink: 146 | return zfas.isSymbolicLink(); 147 | case isOther: 148 | return zfas.isOther(); 149 | case fileKey: 150 | return zfas.fileKey(); 151 | case compressedSize: 152 | if (isZipView) 153 | return zfas.compressedSize(); 154 | break; 155 | case crc: 156 | if (isZipView) 157 | return zfas.crc(); 158 | break; 159 | case method: 160 | if (isZipView) 161 | return zfas.method(); 162 | break; 163 | case permissions: 164 | if (isZipView) { 165 | return zfas.storedPermissions().orElse(null); 166 | } 167 | break; 168 | default: 169 | break; 170 | } 171 | return null; 172 | } 173 | } 174 | -------------------------------------------------------------------------------- /connector/iterator/src/main/java/com/microsoft/accumulo/spark/record/AvroFastRecord.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo.spark.record; 19 | 20 | import java.util.HashMap; 21 | import java.util.Map; 22 | import java.util.stream.Collectors; 23 | 24 | import org.apache.accumulo.core.client.lexicoder.Encoder; 25 | import org.apache.accumulo.core.data.ArrayByteSequence; 26 | import org.apache.accumulo.core.data.ByteSequence; 27 | import com.microsoft.accumulo.spark.juel.AvroUtf8Wrapper; 28 | import org.apache.avro.Schema; 29 | import org.apache.avro.Schema.Field; 30 | import org.apache.avro.Schema.Type; 31 | import org.apache.avro.generic.GenericContainer; 32 | import org.apache.avro.generic.IndexedRecord; 33 | 34 | /** 35 | * This class collects all cells of interest into an AVRO Generic Record. 36 | * 37 | * Cells with non-empty column family and column qualifier are stored in nested 38 | * AVRO records. Cells with empty column qualifier are stored in the top-level 39 | * record. 40 | * 41 | * Example: 42 | * 43 | *
 44 |  * cf1, cq1,  abc
 45 |  * cf1, cq2,  3.2
 46 |  * cf2, null, 6
 47 |  * cf3, cq3,  def
 48 |  * 
49 | * 50 | * Avro Record: 51 | * 52 | *
 53 |  * { 
 54 |  * 	 cf1: { cq1: "abc", cq2: 3.2 }, 
 55 |  * 	 cf2: 6, 
 56 |  *   cf3: { cq3: "def" }
 57 |  * }
 58 |  * 
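 * Note that the row key itself is not stored in the record; it travels as the
 * iterator's top key. A rough usage sketch (added for illustration; the
 * per-cell dispatch is an assumption about the caller, not code in this file):
 *
 *   AvroFastRecord record = new AvroFastRecord(schema);
 *   Map<ByteSequence, Map<ByteSequence, RowBuilderCellConsumer>> lookup =
 *       AvroFastRecord.createCellToFieldMap(record);
 *   record.clear(); // reset field values between rows
 *   // for each cell of the row: resolve the consumer via column family and
 *   // column qualifier in lookup and hand it the cell's key and value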
59 | */ 60 | public class AvroFastRecord implements GenericContainer, IndexedRecord { 61 | 62 | private static ByteSequence EMPTY_SEQUENCE = new ArrayByteSequence(new byte[0]); 63 | 64 | /** 65 | * The Avro schema. 66 | */ 67 | private Schema schema; 68 | 69 | /** 70 | * The data array. 71 | */ 72 | private Object[] values; 73 | 74 | /** 75 | * The nested records. 76 | */ 77 | private AvroFastRecord[] nestedFields; 78 | 79 | /** 80 | * The primitive field indices for fast clearing. 81 | */ 82 | private int[] primitiveFields; 83 | 84 | public AvroFastRecord(Schema schema) { 85 | this.schema = schema; 86 | this.values = new Object[schema.getFields().size()]; 87 | 88 | // find all nested record fields 89 | this.nestedFields = schema.getFields().stream().filter(f -> f.schema().getType() == Type.RECORD).map(f -> { 90 | AvroFastRecord rec = new AvroFastRecord(f.schema()); 91 | this.values[f.pos()] = rec; 92 | return rec; 93 | }).toArray(AvroFastRecord[]::new); 94 | 95 | // find all primitive fields 96 | this.primitiveFields = schema.getFields().stream().filter(f -> f.schema().getType() != Type.RECORD) 97 | .mapToInt(Field::pos).toArray(); 98 | } 99 | 100 | /** 101 | * Clears all primitive fields (including those of nested records). 102 | */ 103 | public void clear() { 104 | for (int idx : this.primitiveFields) 105 | this.values[idx] = null; 106 | 107 | for (AvroFastRecord rec : this.nestedFields) 108 | rec.clear(); 109 | } 110 | 111 | @Override 112 | public void put(int i, Object v) { 113 | this.values[i] = v; 114 | } 115 | 116 | @Override 117 | public Object get(int i) { 118 | return this.values[i]; 119 | } 120 | 121 | @Override 122 | public Schema getSchema() { 123 | return this.schema; 124 | } 125 | 126 | /** 127 | * Create the core lookup map for column family/column qualifier. The leaf 128 | * nodes are consumers that know which record/field to target. 129 | * 130 | * @param rootRecord the root Avro record. 131 | * @return the lookup map. 132 | */ 133 | public static Map<ByteSequence, Map<ByteSequence, RowBuilderCellConsumer>> createCellToFieldMap( 134 | AvroFastRecord rootRecord) { 135 | Map<ByteSequence, Map<ByteSequence, RowBuilderCellConsumer>> map = new HashMap<>(); 136 | 137 | // set up the cell consumer lookup for each column family 138 | for (Field field : rootRecord.getSchema().getFields()) { 139 | Schema nestedSchema = field.schema(); 140 | 141 | ByteSequence columnFamily = new ArrayByteSequence(field.name()); 142 | 143 | // top-level field 144 | if (nestedSchema.getType() != Type.RECORD) { 145 | // would be Map.of(...) on newer JDKs 146 | Map<ByteSequence, RowBuilderCellConsumer> subMap = new HashMap<>(); 147 | subMap.put(EMPTY_SEQUENCE, createAvroCellConsumer(rootRecord, field)); 148 | 149 | map.put(columnFamily, subMap); 150 | 151 | continue; 152 | } 153 | 154 | // nested fields 155 | Map<ByteSequence, RowBuilderCellConsumer> nestedLookupMap = nestedSchema.getFields().stream() 156 | .collect(Collectors.toMap( 157 | // nested name as key 158 | nestedField -> new ArrayByteSequence(nestedField.name()), 159 | // assign cells to field in nested record 160 | nestedField -> createAvroCellConsumer((AvroFastRecord) rootRecord.get(field.pos()), nestedField))); 161 | 162 | map.put(columnFamily, nestedLookupMap); 163 | } 164 | 165 | return map; 166 | } 167 | 168 | /** 169 | * Creates a consumer of cells that copies the data into the corresponding Avro 170 | * record fields. 171 | * 172 | * @param record The record to populate. 173 | * @param field The field to populate. 174 | * @return The closure holding things together. 
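 *
 *         For example (added illustration, restating the method body): for a
 *         STRING field the returned consumer is essentially
 *             (key, value) -> record.put(pos, new AvroUtf8Wrapper(value.get()))
 *         while every other type first decodes the cell bytes with the encoder
 *         registered for its RowBuilderType.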
175 | */ 176 | private static RowBuilderCellConsumer createAvroCellConsumer(AvroFastRecord record, Field field) { 177 | int pos = field.pos(); 178 | 179 | if (field.schema().getType() == Type.STRING) 180 | // avoid byte array copying 181 | return (key, value) -> record.put(pos, new AvroUtf8Wrapper(value.get())); 182 | 183 | // get the fitting encoder 184 | Encoder encoder = RowBuilderType.valueOf(field.getProp(AvroSchemaBuilder.PROPERTY_ROWBUILDERTYPE)).getEncoder(); 185 | return (key, value) -> record.put(pos, encoder.decode(value.get())); 186 | } 187 | } 188 | -------------------------------------------------------------------------------- /connector/datasource/src/main/scala/com/microsoft/accumulo/AvroUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo 19 | 20 | import org.apache.avro.{Schema, SchemaBuilder} 21 | import org.apache.spark.sql.types.{DataType, DataTypes, StructField, StructType} 22 | import org.codehaus.jackson.map.ObjectMapper 23 | import org.codehaus.jackson.map.annotate.JsonSerialize.Inclusion 24 | 25 | import scala.beans.BeanProperty 26 | 27 | // keeping the property names short to not hit any limits 28 | case class RowBuilderField(@BeanProperty cf: String, // column family 29 | @BeanProperty cq: String, // column qualifier 30 | @BeanProperty fvn: String, // filter variable name 31 | @BeanProperty t: String, // type 32 | @BeanProperty o: Boolean // output 33 | ) 34 | 35 | case class JsonSchema(json: String, attributeToVariableMapping: Map[String, String]) 36 | 37 | @SerialVersionUID(1L) 38 | object AvroUtil { 39 | def catalystSchemaToJson(inputSchema: StructType): JsonSchema = catalystSchemaToJson(inputSchema, inputSchema) 40 | 41 | def catalystSchemaToJson(inputSchema: StructType, outputSchema: StructType): JsonSchema = { 42 | 43 | var attributeToVariableMapping = scala.collection.mutable.Map[String, String]() 44 | 45 | var i = 0 46 | val selectedFields = inputSchema.fields.flatMap(cf => { 47 | val outputField = outputSchema.find(f => f.name == cf.name) 48 | 49 | cf.dataType match { 50 | case cft: StructType => cft.fields.map(cq => 51 | RowBuilderField( 52 | cf.name, 53 | cq.name, 54 | { 55 | val variableName = s"v$i" 56 | attributeToVariableMapping += (s"${cf.name}.${cq.name}" -> variableName) 57 | i += 1 58 | 59 | variableName 60 | }, 61 | // TODO: toUpperCase() is weird... 
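                    // (added note) typeName yields lower-case names such as
                    // "string" or "double"; the iterator tests feed both "long"
                    // and "STRING", so the iterator side evidently resolves
                    // RowBuilderType names case-insensitively and the
                    // upper-casing here is cosmetic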
62 | cq.dataType.typeName.toUpperCase, 63 | // either the column family is not needed -> output = false 64 | // otherwise we need to check if the column qualifier is present in the output list 65 | if (outputField.isEmpty) false else outputField.get.dataType.asInstanceOf[StructType].exists(f => f.name == cq.name) 66 | ) 67 | ) 68 | case _: DataType => Seq(RowBuilderField( 69 | cf.name, 70 | null, 71 | { 72 | val variableName = s"v$i" 73 | attributeToVariableMapping += (s"${cf.name}" -> variableName) 74 | i += 1 75 | 76 | variableName 77 | }, 78 | // TODO: toUpperCase() is weird... 79 | cf.dataType.typeName.toUpperCase, 80 | outputField.isDefined 81 | )) 82 | } 83 | }) 84 | 85 | try { 86 | val mapper = new ObjectMapper() 87 | 88 | // disable serialization of null-values 89 | mapper.setSerializationInclusion(Inclusion.NON_NULL) 90 | 91 | JsonSchema(mapper.writeValueAsString(selectedFields), attributeToVariableMapping.toMap) 92 | } catch { 93 | case e: Exception => 94 | throw new IllegalArgumentException(e) 95 | } 96 | } 97 | 98 | implicit class CatalystSchemaToAvroRecordBuilder(builder: SchemaBuilder.FieldAssembler[Schema]) { 99 | def addAvroRecordField(field: StructField): SchemaBuilder.FieldAssembler[Schema] = { 100 | (field.dataType, field.nullable) match { 101 | case (DataTypes.BinaryType, true) => builder.optionalBytes(field.name) 102 | case (DataTypes.BinaryType, false) => builder.requiredBytes(field.name) 103 | case (DataTypes.BooleanType, true) => builder.optionalBoolean(field.name) 104 | case (DataTypes.BooleanType, false) => builder.requiredBoolean(field.name) 105 | case (DataTypes.DoubleType, true) => builder.optionalDouble(field.name) 106 | case (DataTypes.DoubleType, false) => builder.requiredDouble(field.name) 107 | case (DataTypes.FloatType, true) => builder.optionalFloat(field.name) 108 | case (DataTypes.FloatType, false) => builder.requiredFloat(field.name) 109 | case (DataTypes.IntegerType, true) => builder.optionalInt(field.name) 110 | case (DataTypes.IntegerType, false) => builder.requiredInt(field.name) 111 | case (DataTypes.LongType, true) => builder.optionalLong(field.name) 112 | case (DataTypes.LongType, false) => builder.requiredLong(field.name) 113 | case (DataTypes.StringType, true) => builder.optionalString(field.name) 114 | case (DataTypes.StringType, false) => builder.requiredString(field.name) 115 | // TODO: date/time support? 
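        // (added note) any other Catalyst type, e.g. CalendarIntervalType as
        // exercised in VerifyAccumuloSchema, falls through to the failure below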
116 | case _ => throw new UnsupportedOperationException(s"Unsupported type: ${field.dataType}") 117 | } 118 | } 119 | 120 | def addAvroRecordFields(schema: StructType): SchemaBuilder.FieldAssembler[Schema] = { 121 | schema.fields.foldLeft(builder) { (builder, field) => builder.addAvroRecordField(field) } 122 | } 123 | } 124 | 125 | def catalystSchemaToAvroSchema(schema: StructType): Schema = { 126 | val fieldBuilder = SchemaBuilder.record("root") 127 | .fields() 128 | 129 | schema.fields.foldLeft(fieldBuilder) { (_, field) => 130 | field.dataType match { 131 | // nested fields 132 | case cft: StructType => 133 | fieldBuilder 134 | .name(field.name) 135 | .`type`(SchemaBuilder 136 | .record(field.name) 137 | .fields 138 | .addAvroRecordFields(cft) 139 | .endRecord()) 140 | .noDefault() 141 | // top level fields 142 | case _ => fieldBuilder.addAvroRecordField(field) 143 | } 144 | } 145 | .endRecord() 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /connector/iterator/src/main/java/com/microsoft/accumulo/spark/record/AvroSchemaBuilder.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo.spark.record; 19 | 20 | import java.util.Collection; 21 | 22 | import org.apache.avro.Schema; 23 | import org.apache.avro.SchemaBuilder; 24 | 25 | /** 26 | * Builds the AVRO Schema from the user-supplied JSON encoded schema. 
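 *
 * For example, the JSON schema used in the iterator tests,
 *
 *   [{"cf":"cf1","cq":"cq1","t":"STRING","o":true}]
 *
 * yields a record "root" with a nested record "cf1" holding an optional string
 * field "cq1"; an entry without "cq" becomes a top-level field of "root".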
27 | */ 28 | public class AvroSchemaBuilder { 29 | public static final String PROPERTY_ROWBUILDERTYPE = "rowBuilderType"; 30 | 31 | public static final String PROPERTY_OUTPUT = "output"; 32 | 33 | private static SchemaBuilder.FieldAssembler<Schema> addAvroField(SchemaBuilder.FieldAssembler<Schema> builder, 34 | RowBuilderField field, String name) { 35 | 36 | RowBuilderType type = field.getRowBuilderType(); 37 | 38 | SchemaBuilder.FieldBuilder<Schema> fieldBuilder = builder 39 | // configure the field name 40 | .name(name); 41 | 42 | // pass in alias 43 | if (field.getFilterVariableName() != null && field.getFilterVariableName().length() > 0) 44 | fieldBuilder = fieldBuilder.aliases(field.getFilterVariableName()); 45 | 46 | SchemaBuilder.FieldTypeBuilder<Schema> intermediate = fieldBuilder 47 | // encode rowBuilderType so we can operate on the schema alone 48 | .prop(PROPERTY_ROWBUILDERTYPE, type.name()) 49 | // encode if this is an output field 50 | .prop(PROPERTY_OUTPUT, Boolean.toString(field.isOutput())) 51 | // begin type selection; nullable fields become optional below 52 | .type(); 53 | 54 | if (field.isNullable()) { 55 | SchemaBuilder.BaseTypeBuilder optionalType = intermediate.optional(); 56 | switch (type) { 57 | case String: 58 | return optionalType.stringType(); 59 | case Long: 60 | return optionalType.longType(); 61 | case Integer: 62 | return optionalType.intType(); 63 | case Double: 64 | return optionalType.doubleType(); 65 | case Float: 66 | return optionalType.floatType(); 67 | case Boolean: 68 | return optionalType.booleanType(); 69 | case Bytes: 70 | return optionalType.bytesType(); 71 | default: 72 | throw new IllegalArgumentException("Unsupported type '" + type + "'"); 73 | } 74 | } else { 75 | switch (type) { 76 | case String: 77 | return intermediate.stringType().noDefault(); 78 | case Long: 79 | return intermediate.longType().noDefault(); 80 | case Integer: 81 | return intermediate.intType().noDefault(); 82 | case Double: 83 | return intermediate.doubleType().noDefault(); 84 | case Float: 85 | return intermediate.floatType().noDefault(); 86 | case Boolean: 87 | return intermediate.booleanType().noDefault(); 88 | case Bytes: 89 | return intermediate.bytesType().noDefault(); 90 | default: 91 | throw new IllegalArgumentException("Unsupported type '" + type + "'"); 92 | } 93 | } 94 | } 95 | 96 | private static SchemaBuilder.FieldAssembler<Schema> closeFieldAssembler( 97 | SchemaBuilder.FieldAssembler<Schema> rootAssembler, SchemaBuilder.FieldAssembler<Schema> columnFieldsAssembler, 98 | String columnFamily, boolean output) { 99 | 100 | if (columnFieldsAssembler == null) 101 | return rootAssembler; 102 | 103 | // add nested type to root assembler 104 | return rootAssembler 105 | // name the record field 106 | .name(columnFamily) 107 | // any of the column sub fields need to be output? 
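      // (added note) "output" is the OR of the "o" flags of this family's
      // column qualifiers, accumulated in buildSchema below, so the family
      // record is serialized if any of its children is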
108 |         .prop(PROPERTY_OUTPUT, Boolean.toString(output))
109 |         // it's a record type
110 |         .type(columnFieldsAssembler.endRecord()).noDefault();
111 |   }
112 | 
113 |   public static Schema buildSchema(Collection<RowBuilderField> schemaFields) {
114 |     // construct schema
115 |     SchemaBuilder.FieldAssembler<Schema> rootAssembler = SchemaBuilder.record("root").fields();
116 | 
117 |     // note that the order needs to be exactly in-sync with the avro schema
118 |     // generated on the MMLSpark/Scala side
119 |     String lastColumnFamily = null;
120 |     SchemaBuilder.FieldAssembler<Schema> columnFieldsAssembler = null;
121 |     boolean output = false;
122 |     for (RowBuilderField schemaField : schemaFields) {
123 | 
124 |       String columnFamily = schemaField.getColumnFamily();
125 |       String columnQualifier = schemaField.getColumnQualifier();
126 | 
127 |       if (columnQualifier != null) {
128 |         if (lastColumnFamily == null || !lastColumnFamily.equals(columnFamily)) {
129 | 
130 |           // close previous record
131 |           rootAssembler = closeFieldAssembler(rootAssembler, columnFieldsAssembler, lastColumnFamily, output);
132 | 
133 |           // open new record
134 |           columnFieldsAssembler = SchemaBuilder.record(columnFamily).fields();
135 | 
136 |           output = false;
137 |         }
138 | 
139 |         // true if any of the column qualifiers is an output field
140 |         output |= (boolean) schemaField.isOutput();
141 | 
142 |         // add the current field
143 |         columnFieldsAssembler = addAvroField(columnFieldsAssembler, schemaField, columnQualifier);
144 |       } else {
145 |         // close previous record
146 |         rootAssembler = closeFieldAssembler(rootAssembler, columnFieldsAssembler, lastColumnFamily, output);
147 |         columnFieldsAssembler = null;
148 |         output = false;
149 | 
150 |         // add the top-level field
151 |         rootAssembler = addAvroField(rootAssembler, schemaField, columnFamily);
152 |       }
153 | 
154 |       lastColumnFamily = columnFamily;
155 |     }
156 | 
157 |     rootAssembler = closeFieldAssembler(rootAssembler, columnFieldsAssembler, lastColumnFamily, output);
158 | 
159 |     // close the root record and build the schema
160 |     return rootAssembler.endRecord();
161 |   }
162 | }
163 | 
--------------------------------------------------------------------------------
/connector/datasource/src/main/scala/com/microsoft/accumulo/AccumuloInputPartitionReader.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  * http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 | */ 17 | 18 | package com.microsoft.accumulo 19 | 20 | import java.io.IOException 21 | 22 | import org.apache.accumulo.core.client.{Accumulo, IteratorSetting} 23 | import org.apache.accumulo.core.data.{Key, Range} 24 | import org.apache.accumulo.core.security.Authorizations 25 | import org.apache.avro.generic.GenericRecord 26 | import org.apache.avro.io.{BinaryDecoder, DecoderFactory} 27 | import org.apache.avro.specific.SpecificDatumReader 28 | import org.apache.hadoop.io.Text 29 | import org.apache.log4j.Logger 30 | import org.apache.spark.sql.avro.AvroDeserializer 31 | import org.apache.spark.sql.catalyst.InternalRow 32 | import org.apache.spark.sql.sources.v2.reader.InputPartitionReader 33 | import org.apache.spark.sql.types.StructType 34 | import org.apache.spark.unsafe.types.UTF8String 35 | import scala.collection.JavaConverters._ 36 | 37 | @SerialVersionUID(1L) 38 | class AccumuloInputPartitionReader(tableName: String, 39 | ranges: Seq[Seq[Array[Byte]]], 40 | inputSchema: StructType, 41 | outputSchema: StructType, 42 | properties: java.util.Properties, 43 | rowKeyColumn: String, 44 | filterInJuel: Option[String]) 45 | extends InputPartitionReader[InternalRow] with Serializable { 46 | 47 | private val logger = Logger.getLogger(classOf[AccumuloInputPartitionReader]) 48 | 49 | val defaultPriority = "20" 50 | val defaultNumQueryThreads: String = math.min(16, ranges.length).toString 51 | 52 | private val priority = Integer.valueOf(properties.getProperty("priority", defaultPriority)) 53 | // this parameter is impacted by number of accumulo splits and spark partitions and executors 54 | private val numQueryThreads = Integer.valueOf(properties.getProperty("numQueryThreads", defaultNumQueryThreads)) 55 | 56 | private val authorizations = new Authorizations() 57 | private val client = Accumulo.newClient().from(properties).build() 58 | private val scanner = client.createBatchScanner(tableName, authorizations, numQueryThreads) 59 | 60 | private def createRange(start: Array[Byte], stop: Array[Byte]) = 61 | new Range( 62 | if (start.length == 0) null else new Key(start), 63 | start.length == 0, 64 | if (stop.length == 0) null else new Key(stop), 65 | true) 66 | 67 | scanner.setRanges(ranges.map(t => createRange(t(0), t(1))).asJava) 68 | 69 | private val avroIterator = new IteratorSetting( 70 | priority, 71 | "AVRO", 72 | "com.microsoft.accumulo.spark.AvroRowEncoderIterator") 73 | 74 | // only fetch column families we care for (and don't filter for the mleapFields which are artificially added later) 75 | inputSchema.fields.foreach(f => scanner.fetchColumnFamily(f.name)) 76 | 77 | private val rowKeyColumnIndex = { 78 | if (outputSchema.fieldNames.contains(rowKeyColumn)) 79 | outputSchema.fieldIndex(rowKeyColumn) 80 | else 81 | -1 82 | } 83 | 84 | // AVRO Iterator setup 85 | val jsonSchema: String = AvroUtil.catalystSchemaToJson(inputSchema, outputSchema).json 86 | 87 | logger.info(s"JSON schema: $jsonSchema") 88 | avroIterator.addOption("schema", jsonSchema) 89 | if (filterInJuel.isDefined) 90 | avroIterator.addOption("filter", filterInJuel.get) 91 | 92 | // list of output columns 93 | // val prunedColumns = schema.map(_.name).mkString(",") 94 | // logger.info(s"Pruned columns: ${prunedColumns}") 95 | // avroIterator.addOption("prunedcolumns", prunedColumns) 96 | 97 | // forward options 98 | Seq("mleap", "mleapfilter", "mleapguid", "exceptionlogfile") 99 | .foreach { key => avroIterator.addOption(key, properties.getProperty(key, "")) } 100 | 101 | scanner.addScanIterator(avroIterator) 102 | 103 
| // TODO: support additional user-supplied iterators
104 |   private val scannerIterator = scanner.iterator()
105 | 
106 |   // filter out row-key target from schema generation
107 |   private val schemaWithoutRowKey = new StructType(outputSchema.fields.filter(_.name != rowKeyColumn))
108 | 
109 |   // the serialized AVRO does not contain the row key as it comes with the key/value pair anyway
110 |   private val avroSchema = AvroUtil.catalystSchemaToAvroSchema(schemaWithoutRowKey)
111 | 
112 |   // pass the schema for the avro input along with the target output schema (incl. row key)
113 |   private val deserializer = new AvroDeserializer(avroSchema, outputSchema)
114 |   private val reader = new SpecificDatumReader[GenericRecord](avroSchema)
115 | 
116 |   private var decoder: BinaryDecoder = _
117 |   private var currentRow: InternalRow = _
118 |   private var datum: GenericRecord = _
119 | 
120 |   private val rowKeyText = new Text()
121 | 
122 |   override def close(): Unit = {
123 |     if (scanner != null)
124 |       scanner.close()
125 | 
126 |     if (client != null)
127 |       client.close()
128 |   }
129 | 
130 |   @throws[IOException]
131 |   override def next: Boolean = {
132 |     if (scannerIterator.hasNext) {
133 |       val entry = scannerIterator.next
134 |       val data = entry.getValue.get
135 | 
136 |       // byte[] -> avro
137 |       decoder = DecoderFactory.get.binaryDecoder(data, decoder)
138 |       datum = reader.read(datum, decoder)
139 | 
140 |       // avro -> catalyst
141 |       currentRow = deserializer.deserialize(datum).asInstanceOf[InternalRow]
142 | 
143 |       if (rowKeyColumnIndex >= 0) {
144 |         // move row key id into internalrow
145 |         entry.getKey.getRow(rowKeyText)
146 | 
147 |         // avoid yet another byte array copy...
148 |         val str = UTF8String.fromBytes(rowKeyText.getBytes, 0, rowKeyText.getLength)
149 |         currentRow.update(rowKeyColumnIndex, str)
150 |       }
151 | 
152 |       true
153 |     } else {
154 |       false
155 |     }
156 |   }
157 | 
158 |   override def get(): InternalRow = currentRow
159 | }
--------------------------------------------------------------------------------
/connector/iterator/src/main/java/com/microsoft/accumulo/spark/processors/AvroRowComputedColumns.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  * http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 | */ 17 | 18 | package com.microsoft.accumulo.spark.processors; 19 | 20 | import java.io.IOException; 21 | import java.util.ArrayList; 22 | import java.util.Collection; 23 | import java.util.List; 24 | import java.util.Map; 25 | import java.util.stream.Collectors; 26 | 27 | import javax.el.ExpressionFactory; 28 | import javax.el.ValueExpression; 29 | 30 | import com.microsoft.accumulo.spark.juel.AvroELContext; 31 | import com.microsoft.accumulo.spark.record.RowBuilderField; 32 | import com.microsoft.accumulo.spark.record.RowBuilderType; 33 | import org.apache.avro.Schema; 34 | import org.apache.avro.Schema.Field; 35 | import org.apache.avro.generic.IndexedRecord; 36 | import org.apache.hadoop.io.Text; 37 | 38 | /** 39 | * Holds all computed columns.
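 * For example, the (hypothetical) option "column.doubled.double" with the
 * value "${cf1.cq1 * 2}" would add a computed double column named "doubled".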
40 |  * Note: setup happens in two phases: we first parse the options to figure out
41 |  * which additional columns exist, return to the caller so the AVRO schema can
42 |  * be set up, and then continue the setup here.
43 |  */
44 | public class AvroRowComputedColumns extends AvroRowConsumer {
45 |   public static final String COLUMN_PREFIX = "column.";
46 | 
47 |   /**
48 |    * Required for copy.
49 |    */
50 |   private Schema schema;
51 | 
52 |   /**
53 |    * JUEL expression context exposing AVRO GenericRecord
54 |    */
55 |   private AvroELContext expressionContext;
56 | 
57 |   /**
58 |    * Definitions created from user-supplied options.
59 |    */
60 |   private List<ExpressionColumnDefinition> expressionColumnDefinitions;
61 | 
62 |   /**
63 |    * The executable column expressions.
64 |    */
65 |   private List<ExpressionColumn> expressionColumns;
66 | 
67 |   /**
68 |    * Just the definition of the expression. Need to collect them all first so the
69 |    * AVRO schema can be built.
70 |    */
71 |   static class ExpressionColumnDefinition {
72 |     private RowBuilderField schemaField;
73 | 
74 |     private String expression;
75 | 
76 |     public ExpressionColumnDefinition(RowBuilderField schemaField, String expression) {
77 |       this.schemaField = schemaField;
78 |       this.expression = expression;
79 |     }
80 | 
81 |     public RowBuilderField getSchemaField() {
82 |       return schemaField;
83 |     }
84 | 
85 |     public String getExpression() {
86 |       return expression;
87 |     }
88 |   }
89 | 
90 |   /**
91 |    * The fully initialized expression ready to be computed.
92 |    */
93 |   class ExpressionColumn {
94 |     private ValueExpression columnExpression;
95 | 
96 |     private int pos;
97 | 
98 |     public ExpressionColumn(ValueExpression columnExpression, int pos) {
99 |       this.columnExpression = columnExpression;
100 |       this.pos = pos;
101 |     }
102 | 
103 |     public void setFieldValue(IndexedRecord record) {
104 |       Object value = this.columnExpression.getValue(AvroRowComputedColumns.this.expressionContext);
105 |       record.put(this.pos, value);
106 |     }
107 |   }
108 | 
109 |   /**
110 |    * Factory method creating the row processor if valid options are supplied, or
111 |    * null if none are found.
112 |    */
113 |   public static AvroRowComputedColumns create(Map<String, String> options) {
114 |     // expression setup
115 |     // options: column.<name>.<type> -> JUEL expression
116 |     List<ExpressionColumnDefinition> expressionColumnDefinitions = new ArrayList<>();
117 | 
118 |     for (Map.Entry<String, String> entry : options.entrySet()) {
119 |       if (!entry.getKey().startsWith(COLUMN_PREFIX))
120 |         continue;
121 | 
122 |       String[] arr = entry.getKey().split("\\.");
123 |       if (arr.length != 3)
124 |         throw new IllegalArgumentException(
125 |             "Unable to parse column specification. Expected column.<name>.<type>: " + entry.getKey());
126 | 
127 |       String column = arr[1];
128 |       String type = RowBuilderType.valueOfIgnoreCase(arr[2]).name();
129 |       String expression = entry.getValue();
130 |       RowBuilderField schemaField = new RowBuilderField(column, null, type, column);
131 | 
132 |       expressionColumnDefinitions.add(new ExpressionColumnDefinition(schemaField, expression));
133 |     }
134 | 
135 |     return expressionColumnDefinitions.isEmpty() ? null : new AvroRowComputedColumns(expressionColumnDefinitions);
136 |   }
137 | 
138 |   private AvroRowComputedColumns(List<ExpressionColumnDefinition> expressionColumnDefinitions) {
139 |     this.expressionColumnDefinitions = expressionColumnDefinitions;
140 |   }
141 | 
142 |   /**
143 |    *
144 |    * @return a collection of RowBuilderFields based on the column expression
145 |    *         definitions.
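 *         (one per "column.<name>.<type>" option supplied by the caller)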
146 |    */
147 |   @Override
148 |   public Collection<RowBuilderField> getSchemaFields() {
149 |     return this.expressionColumnDefinitions.stream().map(ExpressionColumnDefinition::getSchemaField)
150 |         .collect(Collectors.toList());
151 |   }
152 | 
153 |   /**
154 |    * Initialize the column expressions. This can't be done in the constructor as
155 |    * the schema isn't ready yet.
156 |    *
157 |    * @param schema the AVRO input schema.
158 |    */
159 |   @Override
160 |   public void initialize(Schema schema) {
161 |     this.schema = schema;
162 |     this.expressionContext = new AvroELContext(schema);
163 | 
164 |     ExpressionFactory factory = ExpressionFactory.newInstance();
165 | 
166 |     this.expressionColumns = this.expressionColumnDefinitions.stream().map(expr -> {
167 |       Field field = schema.getField(expr.getSchemaField().getColumnFamily());
168 | 
169 |       RowBuilderType type = expr.getSchemaField().getRowBuilderType();
170 |       ValueExpression columnExpression = factory.createValueExpression(expressionContext, expr.getExpression(),
171 |           type.getJavaClass());
172 | 
173 |       return new ExpressionColumn(columnExpression, field.pos());
174 |     }).collect(Collectors.toList());
175 |   }
176 | 
177 |   @Override
178 |   protected boolean consumeInternal(Text rowKey, IndexedRecord record) throws IOException {
179 |     this.expressionContext.setCurrent(rowKey, record);
180 | 
181 |     // compute each expression
182 |     for (ExpressionColumn expr : this.expressionColumns)
183 |       expr.setFieldValue(record);
184 | 
185 |     return true;
186 |   }
187 | 
188 |   @Override
189 |   public AvroRowConsumer clone() {
190 |     AvroRowComputedColumns copy = new AvroRowComputedColumns(this.expressionColumnDefinitions);
191 | 
192 |     copy.initialize(this.schema);
193 | 
194 |     return copy;
195 |   }
196 | }
197 | 
--------------------------------------------------------------------------------
/connector/datasource/src/main/scala/com/microsoft/accumulo/AccumuloDataSourceReader.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  * http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package com.microsoft.accumulo
19 | 
20 | import org.apache.accumulo.core.client.Accumulo
21 | import org.apache.spark.sql.catalyst.InternalRow
22 | import org.apache.spark.sql.sources.Filter
23 | import org.apache.spark.sql.sources.v2.DataSourceOptions
24 | import org.apache.spark.sql.sources.v2.reader.{DataSourceReader, InputPartition, InputPartitionReader}
25 | import org.apache.spark.sql.types.{DataTypes, StructType}
26 | import scala.collection.JavaConverters._
27 | import scala.collection.mutable.ArrayBuffer
28 | import org.apache.log4j.Logger
29 | import java.util.UUID
30 | 
31 | // TODO: https://github.com/apache/spark/blob/053dd858d38e6107bc71e0aa3a4954291b74f8c8/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/SupportsReportPartitioning.java
32 | // in head of spark github repo
33 | // import org.apache.spark.sql.connector.read.{SupportsPushDownFilters, SupportsPushDownRequiredColumns}
34 | import org.apache.spark.sql.sources.v2.reader.{SupportsPushDownFilters, SupportsPushDownRequiredColumns}
35 | 
36 | 
37 | @SerialVersionUID(1L)
38 | class AccumuloDataSourceReader(schema: StructType, options: DataSourceOptions)
39 |   extends DataSourceReader with Serializable with SupportsPushDownRequiredColumns with SupportsPushDownFilters {
40 |   private val logger = Logger.getLogger(classOf[AccumuloDataSourceReader])
41 | 
42 |   private val defaultMaxPartitions = 200
43 | 
44 |   var filters = Array.empty[Filter]
45 | 
46 |   val rowKeyColumn: String = options.get("rowkey").orElse("rowkey")
47 |   val schemaWithOutRowKey = new StructType(schema.filter { _.name != rowKeyColumn }.toArray)
48 | 
49 |   // initialize output schema with full schema
50 |   private var requiredSchema = {
51 |     // adding rowKey
52 |     val baseSchema = schemaWithOutRowKey.add(rowKeyColumn, DataTypes.StringType, nullable = true)
53 | 
54 |     // add any output fields we find in a mleap pipeline
55 |     val mleapFields = MLeapUtil.mleapSchemaToCatalyst(options.get("mleap").orElse(""))
56 | 
57 |     StructType(baseSchema ++ mleapFields)
58 |   }
59 | 
60 |   private var filterInJuel: Option[String] = None
61 | 
62 |   override def pruneColumns(requiredSchema: StructType): Unit = {
63 |     this.requiredSchema = requiredSchema
64 |   }
65 | 
66 |   def readSchema: StructType = requiredSchema
67 | 
68 |   override def pushFilters(filters: Array[Filter]): Array[Filter] = {
69 |     // unfortunately predicates on nested elements are not pushed down by Spark
70 |     // https://issues.apache.org/jira/browse/SPARK-17636
71 |     // https://github.com/apache/spark/pull/22535
72 | 
73 |     val jsonSchema = AvroUtil.catalystSchemaToJson(schemaWithOutRowKey)
74 |     val result = new FilterToJuel(jsonSchema.attributeToVariableMapping, rowKeyColumn)
75 |       .serializeFilters(filters, options.get("filter").orElse(""))
76 | 
77 |     this.filters = result.supportedFilters.toArray
78 | 
79 |     if (result.serializedFilter.length > 0) {
80 |       this.filterInJuel = Some("${" + result.serializedFilter + "}")
81 |       logger.info(s"JUEL filter: ${this.filterInJuel}")
82 |     }
83 | 
84 |     result.unsupportedFilters.toArray
85 |   }
86 | 
87 |   override def pushedFilters(): Array[Filter] = filters
88 | 
89 |   def planInputPartitions: java.util.List[InputPartition[InternalRow]] = {
90 |     val tableName = options.tableName.get
91 |     val maxPartitions = options.getInt("maxPartitions", defaultMaxPartitions)
92 |     val properties = new java.util.Properties()
93 |     // can't use .putAll(options.asMap()) due to https://github.com/scala/bug/issues/10418
94 |     options.asMap.asScala.foreach { case (k, v) =>
properties.setProperty(k, v) }
95 | 
96 |     // pass GUID to iterator so we can perform fast cache lookup
97 |     // needs to be done on the head node so that all have the same guid
98 |     properties.setProperty("mleapguid", UUID.randomUUID.toString)
99 | 
100 |     val splits = ArrayBuffer(Array.empty[Byte], Array.empty[Byte])
101 | 
102 |     val client = Accumulo.newClient().from(properties).build()
103 |     // it's possible to merge on the accumulo side
104 |     // val tableSplits = client.tableOperations().listSplits(tableName, maxPartitions)
105 |     val tableSplits = try {
106 |       client.tableOperations().listSplits(tableName)
107 |     }
108 |     finally {
109 |       client.close()
110 |     }
111 | 
112 |     // on deployed clusters a table with no split will return a single empty Text instance
113 |     val containsSingleEmptySplit =
114 |       tableSplits.size == 1 &&
115 |       tableSplits.iterator.next.getLength == 0
116 | 
117 |     if (tableSplits.size > 1 || !containsSingleEmptySplit)
118 |       splits.insertAll(1, tableSplits.asScala.map(_.getBytes))
119 | 
120 |     // convert splits to ranges
121 |     var ranges = splits.sliding(2).toSeq
122 | 
123 |     // optionally shuffle
124 |     if (options.getBoolean("shuffle.ranges", true))
125 |       ranges = scala.util.Random.shuffle(ranges)
126 | 
127 |     // create groups of ranges
128 |     val numReaders = scala.math.min(ranges.length, maxPartitions)
129 |     val batchSize = ranges.length / numReaders
130 |     val batchRanges = ranges.sliding(batchSize, batchSize)
131 | 
132 |     logger.info(s"Creating $numReaders readers for ${ranges.length} ranges (batch size $batchSize)")
133 | 
134 |     val partitions = batchRanges.map(r => new PartitionReaderFactory(tableName, r,
135 |       schemaWithOutRowKey, requiredSchema, properties, rowKeyColumn, filterInJuel))
136 |       .toSeq.asJava
137 | 
138 |     new java.util.ArrayList[InputPartition[InternalRow]](partitions)
139 |   }
140 | }
141 | 
142 | class PartitionReaderFactory(tableName: String,
143 |                              ranges: Seq[Seq[Array[Byte]]],
144 |                              inputSchema: StructType,
145 |                              outputSchema: StructType,
146 |                              properties: java.util.Properties,
147 |                              rowKeyColumn: String,
148 |                              filterInJuel: Option[String])
149 |   extends InputPartition[InternalRow] {
150 | 
151 |   def createPartitionReader: InputPartitionReader[InternalRow] = {
152 | 
153 |     Logger.getLogger(classOf[AccumuloDataSourceReader]).info(s"Partition reader for $ranges")
154 | 
155 |     new AccumuloInputPartitionReader(tableName, ranges, inputSchema, outputSchema, properties, rowKeyColumn, filterInJuel)
156 |   }
157 | 
158 |   // override def preferredLocations(): Array[String] = Array("ab", "c")
159 | }
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.suo 8 | *.user 9 | *.userosscache 10 | *.sln.docstates 11 | 12 | # User-specific files (MonoDevelop/Xamarin Studio) 13 | *.userprefs 14 | 15 | # Build results 16 | [Dd]ebug/ 17 | [Dd]ebugPublic/ 18 | [Rr]elease/ 19 | [Rr]eleases/ 20 | x64/ 21 | x86/ 22 | bld/ 23 | [Bb]in/ 24 | [Oo]bj/ 25 | [Ll]og/ 26 | 27 | # Visual Studio 2015/2017 cache/options directory 28 | .vs/ 29 | # Uncomment if you have tasks that create the project's static files in wwwroot 30 | #wwwroot/ 31 | 32 | # Visual Studio 2017 auto generated files 33 | Generated\ Files/ 34 | 35 | # MSTest test Results 36 | [Tt]est[Rr]esult*/ 37 | [Bb]uild[Ll]og.* 38 | 39 | # NUNIT 40 | *.VisualState.xml 41 | TestResult.xml 42 | 43 | # Build Results of an ATL Project 44 | [Dd]ebugPS/ 45 | [Rr]eleasePS/ 46 | dlldata.c 47 | 48 | # Benchmark Results 49 | BenchmarkDotNet.Artifacts/ 50 | 51 | # .NET Core 52 | project.lock.json 53 | project.fragment.lock.json 54 | artifacts/ 55 | **/Properties/launchSettings.json 56 | 57 | # StyleCop 58 | StyleCopReport.xml 59 | 60 | # Files built by Visual Studio 61 | *_i.c 62 | *_p.c 63 | *_i.h 64 | *.ilk 65 | *.meta 66 | *.obj 67 | *.iobj 68 | *.pch 69 | *.pdb 70 | *.ipdb 71 | *.pgc 72 | *.pgd 73 | *.rsp 74 | *.sbr 75 | *.tlb 76 | *.tli 77 | *.tlh 78 | *.tmp 79 | *.tmp_proj 80 | *.log 81 | *.vspscc 82 | *.vssscc 83 | .builds 84 | *.pidb 85 | *.svclog 86 | *.scc 87 | 88 | # Chutzpah Test files 89 | _Chutzpah* 90 | 91 | # Visual C++ cache files 92 | ipch/ 93 | *.aps 94 | *.ncb 95 | *.opendb 96 | *.opensdf 97 | *.sdf 98 | *.cachefile 99 | *.VC.db 100 | *.VC.VC.opendb 101 | 102 | # Visual Studio profiler 103 | *.psess 104 | *.vsp 105 | *.vspx 106 | *.sap 107 | 108 | # Visual Studio Trace Files 109 | *.e2e 110 | 111 | # TFS 2012 Local Workspace 112 | $tf/ 113 | 114 | # Guidance Automation Toolkit 115 | *.gpState 116 | 117 | # ReSharper is a .NET coding add-in 118 | _ReSharper*/ 119 | *.[Rr]e[Ss]harper 120 | *.DotSettings.user 121 | 122 | # JustCode is a .NET coding add-in 123 | .JustCode 124 | 125 | # TeamCity is a build add-in 126 | _TeamCity* 127 | 128 | # DotCover is a Code Coverage Tool 129 | *.dotCover 130 | 131 | # AxoCover is a Code Coverage Tool 132 | .axoCover/* 133 | !.axoCover/settings.json 134 | 135 | # Visual Studio code coverage results 136 | *.coverage 137 | *.coveragexml 138 | 139 | # NCrunch 140 | _NCrunch_* 141 | .*crunch*.local.xml 142 | nCrunchTemp_* 143 | 144 | # MightyMoose 145 | *.mm.* 146 | AutoTest.Net/ 147 | 148 | # Web workbench (sass) 149 | .sass-cache/ 150 | 151 | # Installshield output folder 152 | [Ee]xpress/ 153 | 154 | # DocProject is a documentation generator add-in 155 | DocProject/buildhelp/ 156 | DocProject/Help/*.HxT 157 | DocProject/Help/*.HxC 158 | DocProject/Help/*.hhc 159 | DocProject/Help/*.hhk 160 | DocProject/Help/*.hhp 161 | DocProject/Help/Html2 162 | DocProject/Help/html 163 | 164 | # Click-Once directory 165 | publish/ 166 | 167 | # Publish Web Output 168 | *.[Pp]ublish.xml 169 | *.azurePubxml 170 | # Note: Comment the next line if you want to checkin your web deploy settings, 171 | # but database connection strings (with potential passwords) will be unencrypted 172 | *.pubxml 173 | *.publishproj 174 | 175 | # Microsoft Azure Web App publish settings. 
Comment the next line if you want to 176 | # checkin your Azure Web App publish settings, but sensitive information contained 177 | # in these scripts will be unencrypted 178 | PublishScripts/ 179 | 180 | # NuGet Packages 181 | *.nupkg 182 | # The packages folder can be ignored because of Package Restore 183 | **/[Pp]ackages/* 184 | # except build/, which is used as an MSBuild target. 185 | !**/[Pp]ackages/build/ 186 | # Uncomment if necessary however generally it will be regenerated when needed 187 | #!**/[Pp]ackages/repositories.config 188 | # NuGet v3's project.json files produces more ignorable files 189 | *.nuget.props 190 | *.nuget.targets 191 | 192 | # Microsoft Azure Build Output 193 | csx/ 194 | *.build.csdef 195 | 196 | # Microsoft Azure Emulator 197 | ecf/ 198 | rcf/ 199 | 200 | # Windows Store app package directories and files 201 | AppPackages/ 202 | BundleArtifacts/ 203 | Package.StoreAssociation.xml 204 | _pkginfo.txt 205 | *.appx 206 | 207 | # Visual Studio cache files 208 | # files ending in .cache can be ignored 209 | *.[Cc]ache 210 | # but keep track of directories ending in .cache 211 | !*.[Cc]ache/ 212 | 213 | # Others 214 | ClientBin/ 215 | ~$* 216 | *~ 217 | *.dbmdl 218 | *.dbproj.schemaview 219 | *.jfm 220 | *.pfx 221 | *.publishsettings 222 | orleans.codegen.cs 223 | 224 | # Including strong name files can present a security risk 225 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 226 | #*.snk 227 | 228 | # Since there are multiple workflows, uncomment next line to ignore bower_components 229 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 230 | #bower_components/ 231 | 232 | # RIA/Silverlight projects 233 | Generated_Code/ 234 | 235 | # Backup & report files from converting an old project file 236 | # to a newer Visual Studio version. Backup files are not needed, 237 | # because we have git ;-) 238 | _UpgradeReport_Files/ 239 | Backup*/ 240 | UpgradeLog*.XML 241 | UpgradeLog*.htm 242 | ServiceFabricBackup/ 243 | *.rptproj.bak 244 | 245 | # SQL Server files 246 | *.mdf 247 | *.ldf 248 | *.ndf 249 | 250 | # Business Intelligence projects 251 | *.rdl.data 252 | *.bim.layout 253 | *.bim_*.settings 254 | *.rptproj.rsuser 255 | 256 | # Microsoft Fakes 257 | FakesAssemblies/ 258 | 259 | # GhostDoc plugin setting file 260 | *.GhostDoc.xml 261 | 262 | # Node.js Tools for Visual Studio 263 | .ntvs_analysis.dat 264 | node_modules/ 265 | 266 | # Visual Studio 6 build log 267 | *.plg 268 | 269 | # Visual Studio 6 workspace options file 270 | *.opt 271 | 272 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
273 | *.vbw 274 | 275 | # Visual Studio LightSwitch build output 276 | **/*.HTMLClient/GeneratedArtifacts 277 | **/*.DesktopClient/GeneratedArtifacts 278 | **/*.DesktopClient/ModelManifest.xml 279 | **/*.Server/GeneratedArtifacts 280 | **/*.Server/ModelManifest.xml 281 | _Pvt_Extensions 282 | 283 | # Paket dependency manager 284 | .paket/paket.exe 285 | paket-files/ 286 | 287 | # FAKE - F# Make 288 | .fake/ 289 | 290 | # JetBrains Rider 291 | .idea/ 292 | *.sln 293 | *.iml 294 | 295 | # CodeRush 296 | .cr/ 297 | 298 | # Python Tools for Visual Studio (PTVS) 299 | __pycache__/ 300 | *.pyc 301 | 302 | # Cake - Uncomment if you are using it 303 | # tools/** 304 | # !tools/packages.config 305 | 306 | # Tabs Studio 307 | *.tss 308 | 309 | # Telerik's JustMock configuration file 310 | *.jmconfig 311 | 312 | # BizTalk build output 313 | *.btp.cs 314 | *.btm.cs 315 | *.odx.cs 316 | *.xsd.cs 317 | 318 | # OpenCover UI analysis results 319 | OpenCover/ 320 | 321 | # Azure Stream Analytics local run output 322 | ASALocalRun/ 323 | 324 | # MSBuild Binary and Structured Log 325 | *.binlog 326 | 327 | # NVidia Nsight GPU debugger configuration file 328 | *.nvuser 329 | 330 | # MFractors (Xamarin productivity tool) working folder 331 | .mfractor/ 332 | 333 | .project 334 | .classpath 335 | .vscode 336 | target/ 337 | .settings/ 338 | .ipynb_checkpoints/ -------------------------------------------------------------------------------- /connector/iterator/src/test/java/com/microsoft/accumulo/spark/AvroJuelTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.microsoft.accumulo.spark; 19 | 20 | import java.util.Arrays; 21 | 22 | import javax.el.ExpressionFactory; 23 | import javax.el.ValueExpression; 24 | 25 | import com.microsoft.accumulo.spark.juel.AvroELContext; 26 | import com.microsoft.accumulo.spark.record.AvroSchemaBuilder; 27 | import com.microsoft.accumulo.spark.record.RowBuilderField; 28 | import org.apache.avro.Schema; 29 | import org.apache.avro.generic.GenericRecordBuilder; 30 | import org.apache.hadoop.io.Text; 31 | import org.junit.Test; 32 | 33 | import junit.framework.TestCase; 34 | 35 | public class AvroJuelTest extends TestCase { 36 | 37 | private AvroELContext context; 38 | private ExpressionFactory factory; 39 | private Schema schema; 40 | 41 | @Override 42 | public void setUp() throws Exception { 43 | factory = ExpressionFactory.newInstance(); 44 | 45 | RowBuilderField[] schemaMappingFields = new RowBuilderField[] { 46 | // row 0 47 | new RowBuilderField("cf1", "cq1", "long", "v0"), 48 | // row 1 49 | new RowBuilderField("cf2", "cq2", "double", "v1"), 50 | // row 2 51 | new RowBuilderField("cf2", "cq3", "string", "v2") }; 52 | 53 | schema = AvroSchemaBuilder.buildSchema(Arrays.asList(schemaMappingFields)); 54 | 55 | context = new AvroELContext(schema); 56 | } 57 | 58 | private void setRecordValues(String rowKey, long cq1, double cq2, String cq3) { 59 | GenericRecordBuilder cf1RecordBuilder = new GenericRecordBuilder(schema.getField("cf1").schema()); 60 | GenericRecordBuilder cf2RecordBuilder = new GenericRecordBuilder(schema.getField("cf2").schema()); 61 | 62 | cf1RecordBuilder.set("cq1", cq1); 63 | cf2RecordBuilder.set("cq2", cq2); 64 | cf2RecordBuilder.set("cq3", cq3); 65 | 66 | GenericRecordBuilder rootRecordBuilder = new GenericRecordBuilder(schema); 67 | rootRecordBuilder.set("cf1", cf1RecordBuilder.build()); 68 | rootRecordBuilder.set("cf2", cf2RecordBuilder.build()); 69 | 70 | context.setCurrent(new Text(rowKey), rootRecordBuilder.build()); 71 | } 72 | 73 | @Test 74 | public void testVariableExpressions() { 75 | ValueExpression exprV0 = factory.createValueExpression(context, "${v0}", long.class); 76 | 77 | // set the values after the expression is created 78 | setRecordValues("key1", 3L, 2.0, ""); 79 | assertEquals(3L, exprV0.getValue(context)); 80 | 81 | // test if we can reset it 82 | setRecordValues("key1", 4L, 2.5, ""); 83 | assertEquals(4L, exprV0.getValue(context)); 84 | 85 | // check for the second variable 86 | ValueExpression exprV1 = factory.createValueExpression(context, "${v1}", double.class); 87 | assertEquals(2.5, exprV1.getValue(context)); 88 | } 89 | 90 | @Test 91 | public void testVariableConditions() { 92 | ValueExpression expr = factory.createValueExpression(context, "${v0 > 2.1 && v1 < 3}", boolean.class); 93 | 94 | setRecordValues("key1", 3L, 2.0, ""); 95 | 96 | assertTrue((boolean) expr.getValue(context)); 97 | } 98 | 99 | @Test 100 | public void testStringEndsWith() { 101 | ValueExpression expr = factory.createValueExpression(context, "${v2.endsWith('test')}", boolean.class); 102 | setRecordValues("key1", 3L, 2.0, "This is a test"); 103 | assertTrue((boolean) expr.getValue(context)); 104 | 105 | expr = factory.createValueExpression(context, "${!v2.endsWith('foo')}", boolean.class); 106 | assertTrue((boolean) expr.getValue(context)); 107 | } 108 | 109 | @Test 110 | public void testStringStartsWith() { 111 | ValueExpression expr = factory.createValueExpression(context, "${v2.startsWith('This')}", boolean.class); 112 | setRecordValues("key1", 3L, 2.0, "This is a 
test"); 113 | assertTrue((boolean) expr.getValue(context)); 114 | 115 | expr = factory.createValueExpression(context, "${!v2.startsWith('this')}", boolean.class); 116 | assertTrue((boolean) expr.getValue(context)); 117 | } 118 | 119 | @Test 120 | public void testStringContains() { 121 | ValueExpression expr = factory.createValueExpression(context, "${v2.contains('is')}", boolean.class); 122 | setRecordValues("key1", 3L, 2.0, "This is a test"); 123 | assertTrue((boolean) expr.getValue(context)); 124 | 125 | expr = factory.createValueExpression(context, "${!v2.contains('IS')}", boolean.class); 126 | assertTrue((boolean) expr.getValue(context)); 127 | } 128 | 129 | @Test 130 | public void testStringIn() { 131 | ValueExpression expr = factory.createValueExpression(context, "${v2.in('a','b','c')}", boolean.class); 132 | setRecordValues("key1", 3L, 2.0, "b"); 133 | assertTrue((boolean) expr.getValue(context)); 134 | } 135 | 136 | @Test 137 | public void testIntIn() { 138 | ValueExpression expr = factory.createValueExpression(context, "${v0.in(0, 1, 3)}", boolean.class); 139 | setRecordValues("key1", 3L, 2.0, "b"); 140 | assertTrue((boolean) expr.getValue(context)); 141 | 142 | expr = factory.createValueExpression(context, "${v0.in(0, 1)}", boolean.class); 143 | setRecordValues("key1", 3L, 2.0, "b"); 144 | assertFalse((boolean) expr.getValue(context)); 145 | } 146 | 147 | @Test 148 | public void testStringQuoteEscape() { 149 | ValueExpression expr = factory.createValueExpression(context, "${v2 == 'a\\'bc'}", boolean.class); 150 | setRecordValues("key1", 3L, 2.0, "a'bc"); 151 | assertTrue((boolean) expr.getValue(context)); 152 | } 153 | 154 | @Test 155 | public void testStringDoubleQuoteEscape() { 156 | ValueExpression expr = factory.createValueExpression(context, "${v2 == 'a\"bc'}", boolean.class); 157 | setRecordValues("key1", 3L, 2.0, "a\"bc"); 158 | assertTrue((boolean) expr.getValue(context)); 159 | } 160 | 161 | @Test 162 | public void testStringBackslash() { 163 | ValueExpression expr = factory.createValueExpression(context, "${v2 == 'a\\\\bc'}", boolean.class); 164 | setRecordValues("key1", 3L, 2.0, "a\\bc"); 165 | assertTrue((boolean) expr.getValue(context)); 166 | } 167 | 168 | @Test 169 | public void testRowKey() { 170 | ValueExpression expr = factory.createValueExpression(context, "${rowKey == 'key1'}", boolean.class); 171 | setRecordValues("key1", 3L, 2.0, "abc"); 172 | assertTrue((boolean) expr.getValue(context)); 173 | 174 | setRecordValues("key2", 3L, 2.0, "abc"); 175 | assertFalse((boolean) expr.getValue(context)); 176 | } 177 | 178 | @Test 179 | public void testObjectPropertyBased() { 180 | ValueExpression expr = factory.createValueExpression(context, "${cf1.cq1 == 3}", boolean.class); 181 | setRecordValues("key1", 3L, 2.0, "abc"); 182 | assertTrue((boolean) expr.getValue(context)); 183 | } 184 | 185 | @Test 186 | public void testColumnRemapping() { 187 | ValueExpression expr = factory.createValueExpression(context, "${(cf1.cq1 + 1)/2.0}", Object.class); 188 | 189 | setRecordValues("key1", 3L, 2.0, "abc"); 190 | 191 | assertEquals((3 + 1) / 2.0, expr.getValue(context)); 192 | } 193 | } 194 | -------------------------------------------------------------------------------- /connector/zipfs/src/main/java/com/microsoft/accumulo/zipfs/ByteArrayChannel.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. 
3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 | * 5 | * This code is free software; you can redistribute it and/or modify it 6 | * under the terms of the GNU General Public License version 2 only, as 7 | * published by the Free Software Foundation. Oracle designates this 8 | * particular file as subject to the "Classpath" exception as provided 9 | * by Oracle in the LICENSE file that accompanied this code. 10 | * 11 | * This code is distributed in the hope that it will be useful, but WITHOUT 12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 | * version 2 for more details (a copy is included in the LICENSE file that 15 | * accompanied this code). 16 | * 17 | * You should have received a copy of the GNU General Public License version 18 | * 2 along with this work; if not, write to the Free Software Foundation, 19 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 | * 21 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 | * or visit www.oracle.com if you need additional information or have any 23 | * questions. 24 | */ 25 | 26 | package com.microsoft.accumulo.zipfs; 27 | 28 | import java.io.IOException; 29 | import java.nio.ByteBuffer; 30 | import java.nio.channels.ClosedChannelException; 31 | import java.nio.channels.NonWritableChannelException; 32 | import java.nio.channels.SeekableByteChannel; 33 | import java.util.Arrays; 34 | import java.util.concurrent.locks.ReadWriteLock; 35 | import java.util.concurrent.locks.ReentrantReadWriteLock; 36 | 37 | public class ByteArrayChannel implements SeekableByteChannel { 38 | 39 | private final ReadWriteLock rwlock = new ReentrantReadWriteLock(); 40 | private byte buf[]; 41 | 42 | /* 43 | * The current position of this channel. 44 | */ 45 | private int pos; 46 | 47 | /* 48 | * The index that is one greater than the last valid byte in the channel. 49 | */ 50 | private int last; 51 | 52 | private boolean closed; 53 | private boolean readonly; 54 | 55 | /* 56 | * Creates a {@code ByteArrayChannel} with size {@code sz}. 57 | */ 58 | ByteArrayChannel(int sz, boolean readonly) { 59 | this.buf = new byte[sz]; 60 | this.pos = this.last = 0; 61 | this.readonly = readonly; 62 | } 63 | 64 | /* 65 | * Creates a ByteArrayChannel with its 'pos' at 0 and its 'last' at buf's end. 66 | * Note: no defensive copy of the 'buf', used directly. 
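 * Callers must therefore not mutate the array after handing it over.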
67 | */ 68 | ByteArrayChannel(byte[] buf, boolean readonly) { 69 | this.buf = buf; 70 | this.pos = 0; 71 | this.last = buf.length; 72 | this.readonly = readonly; 73 | } 74 | 75 | @Override 76 | public boolean isOpen() { 77 | return !closed; 78 | } 79 | 80 | @Override 81 | public long position() throws IOException { 82 | beginRead(); 83 | try { 84 | ensureOpen(); 85 | return pos; 86 | } finally { 87 | endRead(); 88 | } 89 | } 90 | 91 | @Override 92 | public SeekableByteChannel position(long pos) throws IOException { 93 | beginWrite(); 94 | try { 95 | ensureOpen(); 96 | if (pos < 0 || pos >= Integer.MAX_VALUE) 97 | throw new IllegalArgumentException("Illegal position " + pos); 98 | this.pos = Math.min((int)pos, last); 99 | return this; 100 | } finally { 101 | endWrite(); 102 | } 103 | } 104 | 105 | @Override 106 | public int read(ByteBuffer dst) throws IOException { 107 | beginWrite(); 108 | try { 109 | ensureOpen(); 110 | if (pos == last) 111 | return -1; 112 | int n = Math.min(dst.remaining(), last - pos); 113 | dst.put(buf, pos, n); 114 | pos += n; 115 | return n; 116 | } finally { 117 | endWrite(); 118 | } 119 | } 120 | 121 | @Override 122 | public SeekableByteChannel truncate(long size) throws IOException { 123 | if (readonly) 124 | throw new NonWritableChannelException(); 125 | ensureOpen(); 126 | throw new UnsupportedOperationException(); 127 | } 128 | 129 | @Override 130 | public int write(ByteBuffer src) throws IOException { 131 | if (readonly) 132 | throw new NonWritableChannelException(); 133 | beginWrite(); 134 | try { 135 | ensureOpen(); 136 | int n = src.remaining(); 137 | ensureCapacity(pos + n); 138 | src.get(buf, pos, n); 139 | pos += n; 140 | if (pos > last) { 141 | last = pos; 142 | } 143 | return n; 144 | } finally { 145 | endWrite(); 146 | } 147 | } 148 | 149 | @Override 150 | public long size() throws IOException { 151 | beginRead(); 152 | try { 153 | ensureOpen(); 154 | return last; 155 | } finally { 156 | endRead(); 157 | } 158 | } 159 | 160 | @Override 161 | public void close() throws IOException { 162 | if (closed) 163 | return; 164 | beginWrite(); 165 | try { 166 | closed = true; 167 | buf = null; 168 | pos = 0; 169 | last = 0; 170 | } finally { 171 | endWrite(); 172 | } 173 | } 174 | 175 | /** 176 | * Creates a newly allocated byte array. Its size is the current 177 | * size of this channel and the valid contents of the buffer 178 | * have been copied into it. 179 | * 180 | * @return the current contents of this channel, as a byte array. 181 | */ 182 | public byte[] toByteArray() { 183 | beginRead(); 184 | try { 185 | // avoid copy if last == bytes.length? 186 | return Arrays.copyOf(buf, last); 187 | } finally { 188 | endRead(); 189 | } 190 | } 191 | 192 | private void ensureOpen() throws IOException { 193 | if (closed) 194 | throw new ClosedChannelException(); 195 | } 196 | 197 | private final void beginWrite() { 198 | rwlock.writeLock().lock(); 199 | } 200 | 201 | private final void endWrite() { 202 | rwlock.writeLock().unlock(); 203 | } 204 | 205 | private final void beginRead() { 206 | rwlock.readLock().lock(); 207 | } 208 | 209 | private final void endRead() { 210 | rwlock.readLock().unlock(); 211 | } 212 | 213 | private void ensureCapacity(int minCapacity) { 214 | // overflow-conscious code 215 | if (minCapacity - buf.length > 0) { 216 | grow(minCapacity); 217 | } 218 | } 219 | 220 | /** 221 | * The maximum size of array to allocate. 222 | * Some VMs reserve some header words in an array. 
223 | * Attempts to allocate larger arrays may result in 224 | * OutOfMemoryError: Requested array size exceeds VM limit 225 | */ 226 | private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8; 227 | 228 | /** 229 | * Increases the capacity to ensure that it can hold at least the 230 | * number of elements specified by the minimum capacity argument. 231 | * 232 | * @param minCapacity the desired minimum capacity 233 | */ 234 | private void grow(int minCapacity) { 235 | // overflow-conscious code 236 | int oldCapacity = buf.length; 237 | int newCapacity = oldCapacity << 1; 238 | if (newCapacity - minCapacity < 0) 239 | newCapacity = minCapacity; 240 | if (newCapacity - MAX_ARRAY_SIZE > 0) 241 | newCapacity = hugeCapacity(minCapacity); 242 | buf = Arrays.copyOf(buf, newCapacity); 243 | } 244 | 245 | private static int hugeCapacity(int minCapacity) { 246 | if (minCapacity < 0) // overflow 247 | throw new OutOfMemoryError(); 248 | return (minCapacity > MAX_ARRAY_SIZE) ? 249 | Integer.MAX_VALUE : 250 | MAX_ARRAY_SIZE; 251 | } 252 | } 253 | --------------------------------------------------------------------------------