├── connector
│   ├── iterator
│   │   ├── dependency-reduced-pom.xml
│   │   └── src
│   │       ├── test
│   │       │   ├── resources/com/microsoft/accumulo/spark
│   │       │   │   ├── pyspark.lr.zip
│   │       │   │   ├── sentiment.zip
│   │       │   │   └── twitter.model.lr.zip
│   │       │   └── java/com/microsoft/accumulo/spark
│   │       │       ├── AvroUtil.java
│   │       │       ├── DefaultIteratorEnvironment.java
│   │       │       ├── AvroColumnPruningTest.java
│   │       │       ├── AvroMLeapTest.java
│   │       │       ├── AvroFilterTest.java
│   │       │       ├── AvroRowEncoderIteratorTest.java
│   │       │       ├── AvroRowTopLevelTest.java
│   │       │       └── AvroJuelTest.java
│   │       └── main
│   │           └── java/com/microsoft/accumulo/spark
│   │               ├── record
│   │               │   ├── RowBuilderCellConsumer.java
│   │               │   ├── RowBuilderType.java
│   │               │   ├── RowBuilderField.java
│   │               │   ├── AvroFastRecord.java
│   │               │   └── AvroSchemaBuilder.java
│   │               ├── util
│   │               │   └── StopWatch.java
│   │               ├── juel
│   │               │   ├── AvroUtf8Wrapper.java
│   │               │   ├── AvroELContext.java
│   │               │   ├── expressions
│   │               │   │   ├── RowKeyVariableExpression.java
│   │               │   │   ├── AvroObjectExpression.java
│   │               │   │   └── AvroVariableExpression.java
│   │               │   ├── AvroVariableMapper.java
│   │               │   └── AvroResolver.java
│   │               └── processors
│   │                   ├── AvroRowConsumer.java
│   │                   ├── AvroRowSerializer.java
│   │                   ├── AvroRowFilter.java
│   │                   └── AvroRowComputedColumns.java
│   ├── integration-test
│   │   ├── src/test/resources
│   │   │   ├── samplenullable.txt
│   │   │   ├── sample.txt
│   │   │   └── sample_more.txt
│   │   └── pom.xml
│   ├── zipfs
│   │   ├── src
│   │   │   ├── test
│   │   │   │   ├── resources/com/microsoft/accumulo/zipfs
│   │   │   │   │   └── sample.zip
│   │   │   │   └── java/com/microsoft/accumulo/zipfs
│   │   │   │       └── JimfsZipfsTest.java
│   │   │   └── main/java/com/microsoft/accumulo/zipfs
│   │   │       ├── ZipFileAttributes.java
│   │   │       ├── ZipPosixFileAttributeView.java
│   │   │       ├── ZipDirectoryStream.java
│   │   │       ├── ZipFileStore.java
│   │   │       ├── ZipCoder.java
│   │   │       ├── ZipFileAttributeView.java
│   │   │       └── ByteArrayChannel.java
│   │   └── pom.xml
│   ├── datasource
│   │   └── src
│   │       ├── test
│   │       │   ├── resources/com/microsoft/accumulo
│   │       │   │   └── sentiment.zip
│   │       │   └── scala/com/microsoft/accumulo
│   │       │       ├── VerifyMleapSchema.scala
│   │       │       ├── VerifyFilterToJuel.scala
│   │       │       └── VerifyAccumuloSchema.scala
│   │       └── main/scala/com/microsoft/accumulo
│   │           ├── DefaultSource.scala
│   │           ├── FilterToJuel.scala
│   │           ├── MLeapUtil.scala
│   │           ├── AccumuloDataSourceWriter.scala
│   │           ├── AccumuloDataWriter.scala
│   │           ├── AvroUtil.scala
│   │           ├── AccumuloInputPartitionReader.scala
│   │           └── AccumuloDataSourceReader.scala
│   ├── publish
│   │   ├── settings.xml
│   │   └── publish.sh
│   └── README.md
├── .github
│   └── workflows
│       └── maven.yml
├── CODE_OF_CONDUCT.md
├── OpenSource
│   └── JDK-ZipFileSystem
│       └── README.md
├── azure-pipelines.yml
├── SECURITY.md
├── README.md
└── .gitignore

--------------------------------------------------------------------------------
/connector/iterator/dependency-reduced-pom.xml:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/connector/integration-test/src/test/resources/samplenullable.txt:
--------------------------------------------------------------------------------
key,label,text,count
r0,0.0,,5
,1.0,this is good,
r2,,we don't know yet,2
--------------------------------------------------------------------------------
/connector/integration-test/src/test/resources/sample.txt:
--------------------------------------------------------------------------------
key,label,text,count
r0,0.0,this is bad,5
r1,1.0,this is good,3
r2,0.0,we don't know yet,2
--------------------------------------------------------------------------------
/connector/integration-test/src/test/resources/sample_more.txt:
--------------------------------------------------------------------------------
key,label,text,count
r3,0.0,this is still bad,5
r4,1.0,this is still good,3
r5,0.0,we still don't know yet,2
--------------------------------------------------------------------------------
/connector/zipfs/src/test/resources/com/microsoft/accumulo/zipfs/sample.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/masc/HEAD/connector/zipfs/src/test/resources/com/microsoft/accumulo/zipfs/sample.zip
--------------------------------------------------------------------------------
/connector/datasource/src/test/resources/com/microsoft/accumulo/sentiment.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/masc/HEAD/connector/datasource/src/test/resources/com/microsoft/accumulo/sentiment.zip
--------------------------------------------------------------------------------
/connector/iterator/src/test/resources/com/microsoft/accumulo/spark/pyspark.lr.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/masc/HEAD/connector/iterator/src/test/resources/com/microsoft/accumulo/spark/pyspark.lr.zip
--------------------------------------------------------------------------------
/connector/iterator/src/test/resources/com/microsoft/accumulo/spark/sentiment.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/masc/HEAD/connector/iterator/src/test/resources/com/microsoft/accumulo/spark/sentiment.zip
--------------------------------------------------------------------------------
/connector/iterator/src/test/resources/com/microsoft/accumulo/spark/twitter.model.lr.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/masc/HEAD/connector/iterator/src/test/resources/com/microsoft/accumulo/spark/twitter.model.lr.zip
--------------------------------------------------------------------------------
/.github/workflows/maven.yml:
--------------------------------------------------------------------------------
name: Java CI

on: [push]

jobs:
  build:

    runs-on: ubuntu-16.04

    steps:
    - uses: actions/checkout@v1
    - name: Set up JDK 1.8
      uses: actions/setup-java@v1
      with:
        java-version: 1.8
    - name: Build with Maven
      run: mvn -B package --file connector/pom.xml
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
# Microsoft Open Source Code of Conduct

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).

Resources:

- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
--------------------------------------------------------------------------------
/connector/publish/settings.xml:
--------------------------------------------------------------------------------
<settings xmlns="http://maven.apache.org/SETTINGS/1.0.0"
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/SETTINGS/1.0.0
                              https://maven.apache.org/xsd/settings-1.0.0.xsd">
  <servers>
    <server>
      <id>ossrh</id>
      <username>${ossrh.username}</username>
      <password>${ossrh.password}</password>
    </server>
  </servers>
</settings>
--------------------------------------------------------------------------------
/OpenSource/JDK-ZipFileSystem/README.md:
--------------------------------------------------------------------------------
# Open Source Information

## JRE ZipFileSystem Component
Code found under [connector/zipfs](/connector/zipfs) originates from [https://github.com/openjdk/jdk/blob/515db21790d589cf636ec8b6592b865ca492e887/src/jdk.zipfs/share/classes/jdk/nio/zipfs/ZipFileSystem.java](https://github.com/openjdk/jdk/blob/515db21790d589cf636ec8b6592b865ca492e887/src/jdk.zipfs/share/classes/jdk/nio/zipfs/ZipFileSystem.java).
It was modified to overcome issues with Apache Spark and Accumulo.

The commit version used was d948bfd.

See [NOTICE](./NOTICE) for license information.
--------------------------------------------------------------------------------
/connector/iterator/src/main/java/com/microsoft/accumulo/spark/record/RowBuilderCellConsumer.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.accumulo.spark.record;

import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Value;

/**
 * Called for each cell found as the iterator goes over the data.
 * Implementations decode the cell data and move it into the Avro record.
 */
public interface RowBuilderCellConsumer {
  void consume(Key key, Value value);
}
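
Since `RowBuilderCellConsumer` has a single abstract method, implementations can be supplied as lambdas. A hypothetical sketch (the map and the lexicoder choice here are illustrative, not part of this repository):

```java
import java.util.HashMap;
import java.util.Map;

import com.microsoft.accumulo.spark.record.RowBuilderCellConsumer;
import org.apache.accumulo.core.client.lexicoder.DoubleLexicoder;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Value;

public class ExampleCellConsumer {
  public static void main(String[] args) {
    Map<String, Double> decoded = new HashMap<>();
    DoubleLexicoder doubleLexicoder = new DoubleLexicoder();

    // a consumer that decodes each cell value as a lexicoded double,
    // keyed by the cell's column qualifier
    RowBuilderCellConsumer consumer = (Key key, Value value) ->
        decoded.put(key.getColumnQualifier().toString(),
                    doubleLexicoder.decode(value.get()));
  }
}
```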
--------------------------------------------------------------------------------
/connector/publish/publish.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# template for username/password for sonatype repository server
cp settings.xml ~/.m2/settings.xml

# ## generate gpg
# gpg --full-generate-key
# ## send to gpg server
# gpg --keyserver pool.sks-keyservers.net --send-key 2E5260D120241F6F8E35370D293C0...
# ## import signing key
# ## this is how to export them
# gpg --export-secret-keys 'Markus Cozowicz (com.microsoft.accumulo) ' | base64 -w 0
echo $ossrh_gpg | base64 -d | gpg --import -

MAVEN_OPTS="verify gpg:sign deploy:deploy -Dmaven.test.skip=true -DskipITs -Dossrh.username=$ossrh_username -Dossrh.password=$ossrh_password"

# to use the snapshot from oss.sonatype.org
# * add http://oss.sonatype.org/content/repositories/snapshots
# * reference com.microsoft.accumulo:accumulo-spark-connector:1.0.0
# * reference com.microsoft.accumulo:accumulo-spark-datasource:1.0.0
#
# more details at https://stackoverflow.com/questions/7715321/how-to-download-snapshot-version-from-maven-snapshot-repository

# For a proper release:
# * remove -SNAPSHOT in pom.xml
# * visit https://oss.sonatype.org/#stagingRepositories and "close & release" the staged .jar
# * see https://oss.sonatype.org/#stagingRepositories
mvn -f ../pom.xml install
mvn -f ../pom.xml $MAVEN_OPTS -N # don't recurse
mvn -f ../datasource/pom.xml $MAVEN_OPTS -DshadedArtifactAttached=false
mvn -f ../iterator/pom.xml $MAVEN_OPTS -DshadedArtifactAttached=false
--------------------------------------------------------------------------------
/connector/datasource/src/test/scala/com/microsoft/accumulo/VerifyMleapSchema.scala:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.accumulo

import org.junit.runner.RunWith
import org.scalatest.FunSuite
import org.scalatest.junit.JUnitRunner
import com.google.common.io.Resources
import java.util.Base64

import org.apache.spark.sql.types.{DataTypes, StructField}

@RunWith(classOf[JUnitRunner])
class VerifyMleapSchema extends FunSuite {
  test("Validate mleap schema extraction") {
    val mleapBundle = Resources.toByteArray(classOf[VerifyMleapSchema].getResource("sentiment.zip"))
    val mleapBundleBase64 = Base64.getEncoder().encodeToString(mleapBundle)

    val fields = MLeapUtil.mleapSchemaToCatalyst(mleapBundleBase64)

    assert(Seq(StructField("prediction", DataTypes.DoubleType, false)) == fields)
  }
}
--------------------------------------------------------------------------------
/azure-pipelines.yml:
--------------------------------------------------------------------------------
pr:
- master

stages:
- stage: Compliance
  jobs:
  - job:
    steps:
    - task: ComponentGovernanceComponentDetection@0
      inputs:
        scanType: 'Register'
        verbosity: 'Verbose'
        alertWarningLevel: 'High'

- stage: AccumuloSparkConnector
  jobs:
  - job:
    pool:
      vmImage: 'ubuntu-16.04'
    steps:
    - task: Maven@3
      displayName: 'Accumulo Spark Connector components'
      inputs:
        mavenPomFile: 'connector/pom.xml'
        javaHomeOption: 'JDKVersion'
        jdkVersionOption: '1.8'
        jdkArchitectureOption: 'x64'
        publishJUnitResults: true
        testResultsFiles: '**/TEST-*.xml'
        goals: 'package'
        options: '-B' # batch mode for non-interactive release

    - task: PublishPipelineArtifact@1
      inputs:
        targetPath: connector/iterator/target/microsoft-accumulo-spark-iterator-1.0.4-shaded.jar
        artifactName: accumulo-spark-iterator

    - task: PublishPipelineArtifact@1
      inputs:
        targetPath: connector/datasource/target/microsoft-accumulo-spark-datasource-1.0.4-shaded.jar
        artifactName: accumulo-spark-datasource

    - bash: cd connector/publish && ./publish.sh
      displayName: Publish to Sonatype
      condition: variables['ossrh_gpg']
      env:
        # these are credentials used to publish to oss.sonatype.org
        # credentials are stored as secrets in the build definition
        ossrh_gpg: $(ossrh_gpg)
        ossrh_username: $(ossrh_username)
        ossrh_password: $(ossrh_password)
--------------------------------------------------------------------------------
/connector/iterator/src/test/java/com/microsoft/accumulo/spark/AvroUtil.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.accumulo.spark;

import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;

import org.apache.accumulo.core.data.ByteSequence;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.specific.SpecificDatumReader;

public class AvroUtil {
  public static final Collection<ByteSequence> EMPTY_SET = new HashSet<>();

  public static GenericRecord deserialize(byte[] data, Schema schema) throws IOException {
    SpecificDatumReader<GenericRecord> reader = new SpecificDatumReader<>(schema);
    BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(data, null);

    return reader.read(null, decoder);
  }
}
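
A minimal sketch of round-tripping a record through this helper, using only stock Avro APIs; the schema and field names are made up for illustration:

```java
import java.io.ByteArrayOutputStream;

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryEncoder;
import org.apache.avro.io.EncoderFactory;

public class AvroRoundTrip {
  public static void main(String[] args) throws Exception {
    Schema schema = SchemaBuilder.record("row").fields()
        .requiredString("text")
        .requiredDouble("label")
        .endRecord();

    GenericRecord record = new GenericData.Record(schema);
    record.put("text", "this is good");
    record.put("label", 1.0);

    // serialize with the stock Avro binary encoder...
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(out, null);
    new GenericDatumWriter<GenericRecord>(schema).write(record, encoder);
    encoder.flush();

    // ...and read it back through the test helper above
    GenericRecord copy = AvroUtil.deserialize(out.toByteArray(), schema);
    System.out.println(copy.get("text")); // this is good
  }
}
```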
--------------------------------------------------------------------------------
/connector/zipfs/src/main/java/com/microsoft/accumulo/zipfs/ZipFileAttributes.java:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package com.microsoft.accumulo.zipfs;

import java.nio.file.attribute.BasicFileAttributes;
import java.nio.file.attribute.PosixFilePermission;
import java.util.Optional;
import java.util.Set;

/**
 * The attributes of a file stored in a zip file.
 *
 * @author Xueming Shen, Rajendra Gutupalli, Jaya Hangal
 */
interface ZipFileAttributes extends BasicFileAttributes {
  long compressedSize();
  long crc();
  int method();
  byte[] extra();
  byte[] comment();
  Optional<Set<PosixFilePermission>> storedPermissions();
}
--------------------------------------------------------------------------------
/connector/datasource/src/main/scala/com/microsoft/accumulo/DefaultSource.scala:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.accumulo

import java.util.Optional

import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.sources.v2.reader.DataSourceReader
import org.apache.spark.sql.sources.v2.writer.DataSourceWriter
import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, ReadSupport, WriteSupport}
import org.apache.spark.sql.types.StructType

class DefaultSource extends DataSourceV2 with ReadSupport with WriteSupport {

  override def createReader(schema: StructType, options: DataSourceOptions): DataSourceReader = {
    new AccumuloDataSourceReader(schema, options)
  }

  override def createReader(options: DataSourceOptions): DataSourceReader = {
    throw new UnsupportedOperationException("Must supply schema")
  }

  override def createWriter(jobId: String,
                            schema: StructType,
                            mode: SaveMode,
                            options: DataSourceOptions): Optional[DataSourceWriter] = {
    Optional.of(new AccumuloDataSourceWriter(schema, mode, options))
  }
}
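
For reference, this is roughly how the schema requirement above surfaces on the Spark side. A minimal Java sketch; the connection options are omitted, so this is illustrative rather than runnable against a real cluster:

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.StructType;

public class ReadExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().getOrCreate();

    StructType schema = new StructType()
        .add("label", "double")
        .add("text", "string");

    // works: createReader(schema, options) is invoked
    Dataset<Row> df = spark.read()
        .format("com.microsoft.accumulo")
        .schema(schema)
        .load();

    // without .schema(...) the source throws
    // UnsupportedOperationException("Must supply schema")
  }
}
```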
--------------------------------------------------------------------------------
/connector/iterator/src/main/java/com/microsoft/accumulo/spark/util/StopWatch.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.accumulo.spark.util;

/**
 * Stop watch functionality optimized for a large number of segments.
 */
public class StopWatch {
  private long start;
  private double avg;
  private long n = 1;

  /**
   * Start recording a new segment. If stop() is not called subsequently this
   * cancels the previous run.
   */
  public void start() {
    this.start = System.nanoTime();
  }

  /**
   * Stops the current run.
   */
  public void stop() {
    double time = System.nanoTime() - this.start;

    // see
    // https://stackoverflow.com/questions/1930454/what-is-a-good-solution-for-calculating-an-average-where-the-sum-of-all-values-e
    this.avg += (time - this.avg) / n;

    // important that we only count here as callers might repeatedly call start()
    // when a run was cancelled
    this.n++;
  }

  /**
   * @return average segment time in microseconds.
   */
  public double getAverage() {
    return this.avg / 1000;
  }

  /**
   * @return the number of segments (= stop() calls).
   */
  public long getN() {
    // n is seeded with 1 to keep the running average well-defined,
    // so subtract one to report the actual number of stop() calls
    return this.n - 1;
  }
}
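
A usage sketch for the stop watch; the timed loop body is a placeholder:

```java
import com.microsoft.accumulo.spark.util.StopWatch;

public class StopWatchExample {
  public static void main(String[] args) {
    StopWatch watch = new StopWatch();

    for (int i = 0; i < 1000; i++) {
      watch.start();
      // ... segment under measurement ...
      watch.stop();
    }

    // getAverage() reports microseconds (avg is accumulated in nanoseconds)
    System.out.println(watch.getAverage() + " us over " + watch.getN() + " segments");
  }
}
```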
--------------------------------------------------------------------------------
/connector/iterator/src/main/java/com/microsoft/accumulo/spark/juel/AvroUtf8Wrapper.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.accumulo.spark.juel;

import java.nio.charset.StandardCharsets;

import org.apache.avro.util.Utf8;

/**
 * Wraps the zero-copy Avro Utf8 class and extends it with string filter support.
 */
public class AvroUtf8Wrapper extends Utf8 {

  // lazily initialized string
  private String string;

  public AvroUtf8Wrapper(byte[] data) {
    super(data);
  }

  public String getString() {
    if (this.string == null) {
      byte[] bytes = getBytes();
      this.string = new String(bytes, 0, bytes.length, StandardCharsets.UTF_8);
    }

    return this.string;
  }

  public boolean endsWith(String postfix) {
    return getString().endsWith(postfix);
  }

  public boolean startsWith(String prefix) {
    return getString().startsWith(prefix);
  }

  public boolean contains(String text) {
    return getString().contains(text);
  }

  @Override
  public boolean equals(Object other) {
    if (other instanceof String) {
      return getString().equals(other);
    }

    return super.equals(other);
  }
}
--------------------------------------------------------------------------------
/connector/zipfs/src/test/java/com/microsoft/accumulo/zipfs/JimfsZipfsTest.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.accumulo.zipfs;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import com.google.common.jimfs.Configuration;
import com.google.common.jimfs.Jimfs;
import com.google.common.io.Resources;
import java.nio.file.*;
import org.junit.Test;
import java.util.*;

public class JimfsZipfsTest {
  @Test
  public void testJimfsAndZipFs() throws Exception {
    // read .zip file into memory
    byte[] data = Resources.toByteArray(JimfsZipfsTest.class.getResource("sample.zip"));

    // create in-memory filesystem
    FileSystem fs = Jimfs.newFileSystem(Configuration.unix());
    Path sampleFilePath = fs.getPath("/sample.zip");

    Files.write(sampleFilePath, data, StandardOpenOption.CREATE);

    // get zip file system
    ZipFileSystem zfs = new ZipFileSystem(new ZipFileSystemProvider(), sampleFilePath, new HashMap<>());

    Path pathInZip = zfs.getPath("/sample.txt");

    List<String> lines = Files.readAllLines(pathInZip);

    assertEquals(1, lines.size());
    assertEquals("Hello World", lines.get(0));
  }
}
--------------------------------------------------------------------------------
/connector/iterator/src/test/java/com/microsoft/accumulo/spark/DefaultIteratorEnvironment.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.microsoft.accumulo.spark;

import java.io.IOException;

import org.apache.accumulo.core.conf.AccumuloConfiguration;
import org.apache.accumulo.core.conf.DefaultConfiguration;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.iterators.IteratorEnvironment;
import org.apache.accumulo.core.iterators.SortedKeyValueIterator;
import org.apache.accumulo.core.iterators.system.MapFileIterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

public class DefaultIteratorEnvironment implements IteratorEnvironment {

  AccumuloConfiguration conf;
  Configuration hadoopConf = new Configuration();

  public DefaultIteratorEnvironment(AccumuloConfiguration conf) {
    this.conf = conf;
  }

  public DefaultIteratorEnvironment() {
    this.conf = DefaultConfiguration.getInstance();
  }

  @Deprecated
  @Override
  public SortedKeyValueIterator<Key, Value> reserveMapFileReader(String mapFileName) throws IOException {
    FileSystem fs = FileSystem.get(hadoopConf);
    return new MapFileIterator(fs, mapFileName, hadoopConf);
  }

  @Override
  public boolean isSamplingEnabled() {
    return false;
  }
}
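
A sketch of the test pattern this environment enables: Accumulo's `SortedMapIterator` serves in-memory data as the parent source, so a server-side iterator can be driven without a tablet server. The iterator instance passed in is left abstract here since the concrete iterator class is not shown in this listing:

```java
package com.microsoft.accumulo.spark;

import java.util.HashMap;
import java.util.TreeMap;

import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.iterators.SortedKeyValueIterator;
import org.apache.accumulo.core.iterators.SortedMapIterator;

public class IteratorHarnessSketch {
  public static void run(SortedKeyValueIterator<Key, Value> iteratorUnderTest) throws Exception {
    TreeMap<Key, Value> data = new TreeMap<>();
    data.put(new Key("r0", "cf1", "text"), new Value("this is bad".getBytes()));

    // DefaultIteratorEnvironment stands in for the tablet server environment
    iteratorUnderTest.init(new SortedMapIterator(data), new HashMap<>(), new DefaultIteratorEnvironment());
    iteratorUnderTest.seek(new Range(), AvroUtil.EMPTY_SET, true);

    while (iteratorUnderTest.hasTop()) {
      System.out.println(iteratorUnderTest.getTopKey() + " -> " + iteratorUnderTest.getTopValue());
      iteratorUnderTest.next();
    }
  }
}
```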
--------------------------------------------------------------------------------
/connector/iterator/src/main/java/com/microsoft/accumulo/spark/juel/AvroELContext.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.accumulo.spark.juel;

import javax.el.ELContext;
import javax.el.ELResolver;
import javax.el.FunctionMapper;
import javax.el.VariableMapper;

import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.io.Text;

/**
 * Exposes an Avro GenericRecord as an Expression Language (EL) context for
 * filtering and column computation.
 */
public class AvroELContext extends ELContext {

  private IndexedRecord avroRecord;
  private Text rowKey;
  private VariableMapper variableMapper;
  private ELResolver resolver;

  public AvroELContext(Schema schema) {
    variableMapper = new AvroVariableMapper(schema);
    resolver = new AvroResolver();
  }

  @Override
  public ELResolver getELResolver() {
    return resolver;
  }

  @Override
  public FunctionMapper getFunctionMapper() {
    return null;
  }

  @Override
  public VariableMapper getVariableMapper() {
    return variableMapper;
  }

  public IndexedRecord getAvroRecord() {
    return avroRecord;
  }

  public Text getRowKey() {
    return rowKey;
  }

  public void setCurrent(Text rowKey, IndexedRecord avroRecord) {
    this.rowKey = rowKey;
    this.avroRecord = avroRecord;
  }
}
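
A hedged sketch of how a JUEL expression could be evaluated against this context. The variable names `label` and `rowKey` are assumptions about what `AvroVariableMapper` exposes (suggested by `RowKeyVariableExpression`, but not confirmed in this listing):

```java
import javax.el.ExpressionFactory;
import javax.el.ValueExpression;

import com.microsoft.accumulo.spark.juel.AvroELContext;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.hadoop.io.Text;

public class JuelFilterSketch {
  public static void main(String[] args) {
    Schema schema = SchemaBuilder.record("row").fields()
        .requiredDouble("label").endRecord();

    GenericData.Record record = new GenericData.Record(schema);
    record.put("label", 1.0);

    // bind the current row key and record into the EL context
    AvroELContext context = new AvroELContext(schema);
    context.setCurrent(new Text("r1"), record);

    // "label" and "rowKey" are assumed variable names, not confirmed here
    ExpressionFactory factory = ExpressionFactory.newInstance();
    ValueExpression expr = factory.createValueExpression(
        context, "${label > 0.5 && rowKey == 'r1'}", boolean.class);

    System.out.println(expr.getValue(context)); // expected: true
  }
}
```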
--------------------------------------------------------------------------------
/connector/zipfs/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Licensed to the Apache Software Foundation (ASF) under one or more
  contributor license agreements. See the NOTICE file distributed with
  this work for additional information regarding copyright ownership.
  The ASF licenses this file to You under the Apache License, Version 2.0
  (the "License"); you may not use this file except in compliance with
  the License. You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <parent>
    <groupId>com.microsoft.masc</groupId>
    <artifactId>microsoft-accumulo-spark</artifactId>
    <version>1.0.4</version>
  </parent>
  <groupId>com.microsoft.masc</groupId>
  <artifactId>microsoft-accumulo-spark-zipfs</artifactId>
  <version>1.0.4</version>
  <name>Zip File System</name>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>
  <dependencies>
    <dependency>
      <groupId>com.google.guava</groupId>
      <artifactId>guava</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>com.google.jimfs</groupId>
      <artifactId>jimfs</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <scope>test</scope>
    </dependency>
  </dependencies>
  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-enforcer-plugin</artifactId>
      </plugin>
    </plugins>
  </build>
</project>
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
## Security

Microsoft takes the security of our software products and services seriously. This includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [many more](https://opensource.microsoft.com/).

If you believe you have found a security vulnerability in any Microsoft-owned repository that meets Microsoft's [definition](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)) of a security vulnerability, please report it to us as described below.

## Reporting Security Issues

**Please do not report security vulnerabilities through public GitHub issues.** Instead, please report them to the Microsoft Security Response Center at [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://technet.microsoft.com/en-us/security/dn606155).

You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).

Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:

* Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
* Full paths of source file(s) related to the manifestation of the issue
* The location of the affected source code (tag/branch/commit or direct URL)
* Any special configuration required to reproduce the issue
* Step-by-step instructions to reproduce the issue
* Proof-of-concept or exploit code (if possible)
* Impact of the issue, including how an attacker might exploit the issue

This information will help us triage your report more quickly.

## Preferred Languages

We prefer all communications to be in English.

## Policy

Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd).
--------------------------------------------------------------------------------
/connector/iterator/src/main/java/com/microsoft/accumulo/spark/juel/expressions/RowKeyVariableExpression.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.accumulo.spark.juel.expressions;

import javax.el.ELContext;
import javax.el.ELException;
import javax.el.ValueExpression;

import com.microsoft.accumulo.spark.juel.AvroELContext;
import org.apache.hadoop.io.Text;

/**
 * JUEL ValueExpression resolving to the row key.
 */
public class RowKeyVariableExpression extends ValueExpression {

  public static final RowKeyVariableExpression INSTANCE = new RowKeyVariableExpression();

  private static final long serialVersionUID = 1L;

  @Override
  public Class<?> getExpectedType() {
    return String.class;
  }

  @Override
  public Class<?> getType(ELContext context) {
    return String.class;
  }

  @Override
  public Object getValue(ELContext context) {
    Text text = ((AvroELContext) context).getRowKey();

    return text.toString();
  }

  @Override
  public boolean isReadOnly(ELContext context) {
    return true;
  }

  @Override
  public void setValue(ELContext context, Object value) {
    throw new ELException("setValue not supported");
  }

  @Override
  public boolean equals(Object obj) {
    return obj instanceof RowKeyVariableExpression;
  }

  @Override
  public String getExpressionString() {
    throw new ELException("getExpressionString() is not supported");
  }

  @Override
  public int hashCode() {
    return 42;
  }

  @Override
  public boolean isLiteralText() {
    return false;
  }
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Microsoft MASC, an Apache Spark connector for Apache Accumulo

The goal of this repository is to facilitate the use of [Apache Spark](https://spark.apache.org/) and its machine learning ecosystem with [Apache Accumulo](https://accumulo.apache.org/) as an external data source.

# Contents
- The [connector](connector) provides connectivity to read from / write to Accumulo using Spark. See the [README](connector/README.md) for more details about supported functionality.

# Contributing

This project welcomes contributions and suggestions. Most contributions require you to agree to a
Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.

When you submit a pull request, a CLA bot will automatically determine whether you need to provide
a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
provided by the bot. You will only need to do this once across all repos using our CLA.

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

# Build

[![Build Status](https://dev.azure.com/AZGlobal/Azure%20Global%20CAT%20Engineering/_apis/build/status/AGCE%20AI/Web%20Scale%20AI/microsoft.Accumulo?branchName=master)](https://dev.azure.com/AZGlobal/Azure%20Global%20CAT%20Engineering/_build/latest?definitionId=84&branchName=master)
[![Maven Central](https://maven-badges.herokuapp.com/maven-central/com.microsoft.masc/accumulo-spark-datasource/badge.svg)](https://maven-badges.herokuapp.com/maven-central/com.microsoft.masc/accumulo-spark-datasource)
[![Maven Central](https://maven-badges.herokuapp.com/maven-central/com.microsoft.masc/accumulo-spark-iterator/badge.svg)](https://maven-badges.herokuapp.com/maven-central/com.microsoft.masc/accumulo-spark-iterator)

# License
All code provided, except where otherwise documented in [OpenSource](OpenSource) and [NOTICE](NOTICE), is covered by the [Apache License 2.0](LICENSE).

# Trademarks

Apache®, [Apache Spark](https://spark.apache.org/), [Apache Accumulo](https://accumulo.apache.org/) and Accumulo are either registered trademarks or trademarks of the [Apache Software Foundation](https://www.apache.org/) in the United States and/or other countries.
--------------------------------------------------------------------------------
/connector/iterator/src/main/java/com/microsoft/accumulo/spark/juel/expressions/AvroObjectExpression.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.accumulo.spark.juel.expressions;

import javax.el.ELContext;
import javax.el.ELException;
import javax.el.ValueExpression;

import com.microsoft.accumulo.spark.juel.AvroELContext;
import org.apache.avro.Schema.Field;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.generic.IndexedRecord;

/**
 * JUEL ValueExpression resolving to a nested record.
 */
public class AvroObjectExpression extends ValueExpression {

  private static final long serialVersionUID = 1L;

  private Field field;

  public AvroObjectExpression(Field field) {
    this.field = field;
  }

  @Override
  public Class<?> getExpectedType() {
    return Record.class;
  }

  @Override
  public Class<?> getType(ELContext context) {
    return Record.class;
  }

  @Override
  public Object getValue(ELContext context) {
    IndexedRecord record = ((AvroELContext) context).getAvroRecord();

    return record.get(this.field.pos());
  }

  @Override
  public boolean isReadOnly(ELContext context) {
    return true;
  }

  @Override
  public void setValue(ELContext context, Object value) {
    throw new ELException("setValue not supported");
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof AvroObjectExpression))
      return false;

    AvroObjectExpression other = (AvroObjectExpression) obj;

    return this.field.equals(other.field);
  }

  @Override
  public String getExpressionString() {
    throw new ELException("getExpressionString() is not supported");
  }

  @Override
  public int hashCode() {
    return this.field.hashCode();
  }

  @Override
  public boolean isLiteralText() {
    return false;
  }
}
--------------------------------------------------------------------------------
/connector/iterator/src/main/java/com/microsoft/accumulo/spark/record/RowBuilderType.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.accumulo.spark.record;

import java.util.Date;

import org.apache.accumulo.core.client.lexicoder.DateLexicoder;
import org.apache.accumulo.core.client.lexicoder.DoubleLexicoder;
import org.apache.accumulo.core.client.lexicoder.Encoder;
import org.apache.accumulo.core.client.lexicoder.FloatLexicoder;
import org.apache.accumulo.core.client.lexicoder.IntegerLexicoder;
import org.apache.accumulo.core.client.lexicoder.LongLexicoder;
import org.apache.accumulo.core.client.lexicoder.StringLexicoder;

/**
 * Links a JSON type string with its Java class and Lexicoder.
 */
public enum RowBuilderType {
  String(String.class, new StringLexicoder()), Integer(int.class, new IntegerLexicoder()),
  Long(long.class, new LongLexicoder()), Float(float.class, new FloatLexicoder()),
  Double(double.class, new DoubleLexicoder()), Date(Date.class, new DateLexicoder()), Boolean(boolean.class, null),
  Bytes(byte[].class, new ByteEncoder()), Unknown(null, null);

  private static class ByteEncoder implements Encoder<byte[]> {

    @Override
    public byte[] encode(byte[] object) {
      return object;
    }

    @Override
    public byte[] decode(byte[] bytes) throws IllegalArgumentException {
      return bytes;
    }
  }

  private Class<?> javaClass;
  private Encoder<?> encoder;

  RowBuilderType(Class<?> javaClass, Encoder<?> encoder) {
    this.javaClass = javaClass;
    this.encoder = encoder;
  }

  public static RowBuilderType valueOfIgnoreCase(String name) {
    for (RowBuilderType type : RowBuilderType.values()) {
      if (name.equalsIgnoreCase(type.name()))
        return type;
    }

    return null;
  }

  /**
   * @return the javaClass
   */
  public Class<?> getJavaClass() {
    return javaClass;
  }

  /**
   * @return the encoder
   */
  public Encoder<?> getEncoder() {
    return encoder;
  }
}
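
A small sketch of the type lookup and the sort-order-preserving encoding that the lexicoders provide, which is what Accumulo range scans rely on; the unchecked cast is for illustration only:

```java
import com.microsoft.accumulo.spark.record.RowBuilderType;
import org.apache.accumulo.core.client.lexicoder.Encoder;

public class LexicoderExample {
  public static void main(String[] args) {
    // case-insensitive lookup of the JSON type name
    RowBuilderType type = RowBuilderType.valueOfIgnoreCase("double");

    @SuppressWarnings("unchecked")
    Encoder<Double> encoder = (Encoder<Double>) type.getEncoder();

    byte[] a = encoder.encode(1.5);
    byte[] b = encoder.encode(2.5);
    // unsigned byte-wise comparison of a and b matches the numeric order
  }
}
```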
--------------------------------------------------------------------------------
/connector/iterator/src/main/java/com/microsoft/accumulo/spark/processors/AvroRowConsumer.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.accumulo.spark.processors;

import java.io.IOException;
import java.util.Collection;

import com.microsoft.accumulo.spark.record.RowBuilderField;
import com.microsoft.accumulo.spark.util.StopWatch;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.io.Text;

public abstract class AvroRowConsumer {

  // private StopWatch stopWatchConsume = new StopWatch();

  /**
   * Process the row.
   *
   * @param rowKey The row key.
   * @param record The AVRO record.
   * @return true to continue processing; false if processing should be stopped
   *         (e.g. the row does not match a filter).
   */
  public boolean consume(Text rowKey, IndexedRecord record) throws IOException {
    // this.stopWatchConsume.start();

    boolean ret = this.consumeInternal(rowKey, record);
    // if (ret)
    //   this.stopWatchConsume.stop();

    return ret;
  }

  // public double getAverageConsumeTime() {
  //   return this.stopWatchConsume.getAverage();
  // }

  public String getName() {
    return getClass().getSimpleName();
  }

  /**
   * Process the row.
   *
   * @param rowKey The row key.
   * @param record The AVRO record.
   * @return true to continue processing; false if processing should be stopped
   *         (e.g. the row does not match a filter).
   */
  protected abstract boolean consumeInternal(Text rowKey, IndexedRecord record) throws IOException;

  /**
   * Support copying of the object as the iterator needs to be copyable.
   *
   * @return The cloned object.
   */
  public abstract AvroRowConsumer clone();

  /**
   * Any additional fields this consumer wants to populate.
   *
   * @return additional fields added to the main schema.
   */
  public abstract Collection<RowBuilderField> getSchemaFields();

  /**
   * Final initialization of the consumer once the entire schema has been
   * discovered.
   *
   * @param schema The final schema.
   */
  public abstract void initialize(Schema schema);
}
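
A hypothetical minimal subclass, to make the contract concrete (not part of this repository): it rejects rows whose key starts with "tmp", contributes no extra schema fields, and needs no initialization.

```java
import java.util.Collection;
import java.util.Collections;

import com.microsoft.accumulo.spark.processors.AvroRowConsumer;
import com.microsoft.accumulo.spark.record.RowBuilderField;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.io.Text;

public class RowKeyPrefixFilter extends AvroRowConsumer {
  @Override
  protected boolean consumeInternal(Text rowKey, IndexedRecord record) {
    // false stops processing of this row, i.e. the row is filtered out
    return !rowKey.toString().startsWith("tmp");
  }

  @Override
  public AvroRowConsumer clone() {
    // stateless, so a fresh instance suffices
    return new RowKeyPrefixFilter();
  }

  @Override
  public Collection<RowBuilderField> getSchemaFields() {
    return Collections.emptyList();
  }

  @Override
  public void initialize(Schema schema) {
    // nothing to prepare
  }
}
```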
--------------------------------------------------------------------------------
/connector/zipfs/src/main/java/com/microsoft/accumulo/zipfs/ZipPosixFileAttributeView.java:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package com.microsoft.accumulo.zipfs;

import java.io.IOException;
import java.nio.file.attribute.GroupPrincipal;
import java.nio.file.attribute.PosixFileAttributeView;
import java.nio.file.attribute.PosixFileAttributes;
import java.nio.file.attribute.UserPrincipal;

/**
 * The zip file system attribute view with POSIX support.
 */
class ZipPosixFileAttributeView extends ZipFileAttributeView implements PosixFileAttributeView {
  private final boolean isOwnerView;

  ZipPosixFileAttributeView(ZipPath path, boolean owner) {
    super(path, true);
    this.isOwnerView = owner;
  }

  @Override
  public String name() {
    return isOwnerView ? "owner" : "posix";
  }

  @Override
  public PosixFileAttributes readAttributes() throws IOException {
    return (PosixFileAttributes)path.readAttributes();
  }

  @Override
  public UserPrincipal getOwner() throws IOException {
    return readAttributes().owner();
  }

  @Override
  public void setOwner(UserPrincipal owner) throws IOException {
    path.setOwner(owner);
  }

  @Override
  public void setGroup(GroupPrincipal group) throws IOException {
    path.setGroup(group);
  }

  @Override
  Object attribute(AttrID id, ZipFileAttributes zfas) {
    PosixFileAttributes pzfas = (PosixFileAttributes)zfas;
    switch (id) {
      case owner:
        return pzfas.owner();
      case group:
        return pzfas.group();
      case permissions:
        if (!isOwnerView) {
          return pzfas.permissions();
        } else {
          return super.attribute(id, zfas);
        }
      default:
        return super.attribute(id, zfas);
    }
  }
}
--------------------------------------------------------------------------------
/connector/iterator/src/main/java/com/microsoft/accumulo/spark/juel/expressions/AvroVariableExpression.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.accumulo.spark.juel.expressions;

import java.util.Arrays;

import javax.el.ELContext;
import javax.el.ELException;
import javax.el.ValueExpression;

import com.microsoft.accumulo.spark.juel.AvroELContext;
import org.apache.avro.generic.IndexedRecord;

/**
 * Exposes Avro top-level record fields as a JUEL ValueExpression.
 */
public class AvroVariableExpression extends ValueExpression {
  private static final long serialVersionUID = 1L;

  private Class<?> type;
  // indices to walk through nested avro records
  private int[] fieldPositions;

  public AvroVariableExpression(Class<?> type, int... fieldPositions) {
    this.type = type;
    this.fieldPositions = fieldPositions;
  }

  @Override
  public Class<?> getExpectedType() {
    return type;
  }

  @Override
  public Class<?> getType(ELContext context) {
    return type;
  }

  @Override
  public Object getValue(ELContext context) {
    IndexedRecord record = ((AvroELContext) context).getAvroRecord();

    // support nested records (e.g. column family/column qualifier)
    for (int i = 0; i < fieldPositions.length - 1; i++)
      record = (IndexedRecord) record.get(fieldPositions[i]);

    return record.get(fieldPositions[fieldPositions.length - 1]);
  }

  @Override
  public boolean isReadOnly(ELContext context) {
    return true;
  }

  @Override
  public void setValue(ELContext context, Object value) {
    throw new ELException("setValue not supported");
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof AvroVariableExpression))
      return false;

    AvroVariableExpression other = (AvroVariableExpression) obj;

    return type.equals(other.type) && Arrays.equals(fieldPositions, other.fieldPositions);
  }

  @Override
  public String getExpressionString() {
    throw new ELException("getExpressionString() is not supported");
  }

  @Override
  public int hashCode() {
    return type.hashCode() + Arrays.hashCode(fieldPositions);
  }

  @Override
  public boolean isLiteralText() {
    return false;
  }
}
--------------------------------------------------------------------------------
/connector/README.md:
--------------------------------------------------------------------------------
# Microsoft MASC, an Apache Spark connector for Apache Accumulo
[![Build Status](https://dev.azure.com/AZGlobal/Azure%20Global%20CAT%20Engineering/_apis/build/status/AGCE%20AI/Web%20Scale%20AI/microsoft.Accumulo?branchName=master)](https://dev.azure.com/AZGlobal/Azure%20Global%20CAT%20Engineering/_build/latest?definitionId=84&branchName=master)
[![Maven Central](https://maven-badges.herokuapp.com/maven-central/com.microsoft.masc/microsoft-accumulo-spark-datasource/badge.svg)](https://maven-badges.herokuapp.com/maven-central/com.microsoft.masc/microsoft-accumulo-spark-datasource)
[![Maven Central](https://maven-badges.herokuapp.com/maven-central/com.microsoft.masc/microsoft-accumulo-spark-iterator/badge.svg)](https://maven-badges.herokuapp.com/maven-central/com.microsoft.masc/microsoft-accumulo-spark-iterator)

This code provides connectivity between Apache Accumulo and Apache Spark.
7 | 8 | ## Main Goals 9 | - Provide native Spark interface to connect to Accumulo 10 | - Minimize data transfer between Spark and Accumulo 11 | - Enable use of Machine Learning with Accumulo as the datastore 12 | 13 | ## Examples 14 | ```python 15 | # Read from Accumulo 16 | df = (spark 17 | .read 18 | .format("com.microsoft.accumulo") 19 | .options(**options) # define Accumulo properties 20 | .schema(schema).load()) # define schema for data retrieval 21 | 22 | # Write to Accumulo 23 | (df 24 | .write 25 | .format("com.microsoft.accumulo") 26 | .options(**options) 27 | .save()) 28 | ``` 29 | 30 | See the PySpark [notebook](examples/AccumuloSparkConnector.ipynb) for a more detailed example. 31 | 32 | See the Scala benchmark [notebook](examples/AccumuloSparkConnectorBenchmark.ipynb) for details on how we ran our evaluation. 33 | 34 | ## Capabilities 35 | - Native Spark [Datasource V2](http://shzhangji.com/blog/2018/12/08/spark-datasource-api-v2/) API 36 | - Row serialization using [Avro](https://avro.apache.org/) 37 | - Filter pushdown (server-side) 38 | - Expressive filter language using [JUEL](http://juel.sourceforge.net/) 39 | - ML inference pushdown (server-side) using [MLeap](http://mleap-docs.combust.ml/) 40 | - Support for Spark ML pipelines 41 | - Minimal Java runtime dependencies 42 | 43 | ## Installation 44 | 45 | The connector is composed of two components: 46 | - The [Datasource](datasource) component provides the interface used on the Spark side 47 | - The [Iterator](iterator) component provides server-side functionality on the Accumulo side 48 | 49 | The components can be built and tested with Maven (version 3.3.9 or higher) using Java version 8. 50 | ``` 51 | mvn clean install 52 | ``` 53 | 54 | Alternatively, the JARs are published to the Maven Central Repository: 55 | - [Datasource](https://mvnrepository.com/artifact/com.microsoft.masc/microsoft-accumulo-spark-datasource) 56 | - [Iterator](https://mvnrepository.com/artifact/com.microsoft.masc/microsoft-accumulo-spark-iterator) 57 | 58 | The following steps are needed to deploy the connector: 59 | 1) Deploy the iterator JAR to the Accumulo lib folder on all nodes and restart the cluster 60 | ``` 61 | # use the locally built shaded jar in the connector/iterator/target folder 62 | # or 63 | # use Maven to download the iterator from the central repository 64 | mvn dependency:get -Dartifact=com.microsoft.masc:microsoft-accumulo-spark-iterator:[VERSION] 65 | ``` 66 | 2) Add the Datasource JAR in Spark 67 | ``` 68 | # use the locally built shaded jar in the connector/datasource/target folder 69 | # or 70 | # pull in the package from the Maven Central Repository 71 | com.microsoft.masc:microsoft-accumulo-spark-datasource:[VERSION] 72 | ``` 73 | 74 | ## Spark Runtime Java Version 75 | 76 | While the iterator JAR can run on Accumulo tablet servers using JDK versions >= 1.8, the Spark Datasource component is only compatible with JDK version 1.8 (not higher) due to [Spark's Java support](https://spark.apache.org/docs/latest/). 77 | -------------------------------------------------------------------------------- /connector/iterator/src/main/java/com/microsoft/accumulo/spark/processors/AvroRowSerializer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo.spark.processors; 19 | 20 | import java.io.ByteArrayOutputStream; 21 | import java.io.IOException; 22 | import java.util.List; 23 | import java.util.stream.Collectors; 24 | 25 | import com.microsoft.accumulo.spark.record.AvroFastRecord; 26 | import com.microsoft.accumulo.spark.record.AvroSchemaBuilder; 27 | 28 | import org.apache.avro.Schema; 29 | import org.apache.avro.Schema.Field; 30 | import org.apache.avro.generic.IndexedRecord; 31 | import org.apache.avro.io.BinaryEncoder; 32 | import org.apache.avro.io.DatumWriter; 33 | import org.apache.avro.io.EncoderFactory; 34 | import org.apache.avro.specific.SpecificDatumWriter; 35 | import org.apache.log4j.Logger; 36 | 37 | public class AvroRowSerializer { 38 | private final static Logger logger = Logger.getLogger(AvroRowSerializer.class); 39 | 40 | // avro writer infra 41 | private ByteArrayOutputStream binaryBuffer = new ByteArrayOutputStream(); 42 | private DatumWriter writer; 43 | private BinaryEncoder encoder; 44 | 45 | private AvroFastRecord finalRecord; 46 | private int[] sourceIndicies; 47 | 48 | public AvroRowSerializer(Schema schema) { 49 | List fieldList = schema.getFields().stream() 50 | // AVRO 1.8.2 doesn't support getObjectProp 51 | .filter(f -> Boolean.parseBoolean(f.getProp(AvroSchemaBuilder.PROPERTY_OUTPUT))) 52 | .map(f -> new Schema.Field(f.name(), f.schema(), f.doc(), f.defaultVal())) 53 | // create the list 54 | .collect(Collectors.toList()); 55 | 56 | // check if the schema pruned fields? 
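// If any field is not flagged for output ("o": false), it was only needed for
// intermediate processing (e.g. as a filter variable). In that case build a pruned
// schema and a reusable output record, plus a target-to-source index map so
// serialize() can copy values positionally instead of looking them up by name per row.
// Example (cf. AvroColumnPruningTest): a schema of [cf1 (o=false), cf2 (o=true)]
// yields a pruned output schema containing only cf2.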
57 | if (fieldList.size() != schema.getFields().size()) { 58 | Schema prunedSchema = Schema.createRecord(fieldList); 59 | this.finalRecord = new AvroFastRecord(prunedSchema); 60 | 61 | // initialize source to target mapping 62 | this.sourceIndicies = new int[fieldList.size()]; 63 | for (Field field : prunedSchema.getFields()) { 64 | logger.info("Pruned field: " + field.name()); 65 | this.sourceIndicies[field.pos()] = schema.getField(field.name()).pos(); 66 | } 67 | 68 | schema = prunedSchema; 69 | } 70 | 71 | this.writer = new SpecificDatumWriter<>(schema); 72 | this.encoder = EncoderFactory.get().binaryEncoder(binaryBuffer, null); 73 | } 74 | 75 | public byte[] serialize(IndexedRecord record) throws IOException { 76 | // make sure we're at the beginning again 77 | this.binaryBuffer.reset(); 78 | 79 | // copying to final output schema 80 | if (this.sourceIndicies != null) { 81 | for (int i = 0; i < this.sourceIndicies.length; i++) 82 | this.finalRecord.put(i, record.get(this.sourceIndicies[i])); 83 | 84 | record = this.finalRecord; 85 | } 86 | // serialize the record 87 | this.writer.write(record, encoder); 88 | 89 | this.encoder.flush(); 90 | this.binaryBuffer.flush(); 91 | 92 | return this.binaryBuffer.toByteArray(); 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /connector/iterator/src/test/java/com/microsoft/accumulo/spark/AvroColumnPruningTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.microsoft.accumulo.spark; 19 | 20 | import static org.junit.Assert.assertEquals; 21 | import static org.junit.Assert.assertFalse; 22 | import static org.junit.Assert.assertTrue; 23 | 24 | import java.io.IOException; 25 | import java.util.Arrays; 26 | import java.util.HashMap; 27 | import java.util.Map; 28 | import java.util.SortedMap; 29 | import java.util.TreeMap; 30 | 31 | import org.apache.accumulo.core.client.lexicoder.LongLexicoder; 32 | import org.apache.accumulo.core.data.Key; 33 | import org.apache.accumulo.core.data.Range; 34 | import org.apache.accumulo.core.data.Value; 35 | import org.apache.accumulo.core.iterators.SortedMapIterator; 36 | import com.microsoft.accumulo.spark.record.AvroSchemaBuilder; 37 | import com.microsoft.accumulo.spark.record.RowBuilderField; 38 | import org.apache.avro.Schema; 39 | import org.apache.avro.generic.GenericRecord; 40 | import org.apache.avro.util.Utf8; 41 | import org.junit.Test; 42 | 43 | public class AvroColumnPruningTest { 44 | @Test 45 | public void testColumnPruning() throws IOException { 46 | SortedMap map = new TreeMap<>(); 47 | map.put(new Key("key1", "cf1", "cq1"), new Value(new LongLexicoder().encode(3L))); 48 | map.put(new Key("key1", "cf2", ""), new Value("abc")); 49 | map.put(new Key("key2", "cf2"), new Value("def")); 50 | 51 | SortedMapIterator parentIterator = new SortedMapIterator(map); 52 | AvroRowEncoderIterator iterator = new AvroRowEncoderIterator(); 53 | 54 | Map options = new HashMap<>(); 55 | options.put(AvroRowEncoderIterator.SCHEMA, 56 | "[{\"cf\":\"cf1\",\"cq\":\"cq1\",\"t\":\"long\",\"o\":false},{\"cf\":\"cf2\",\"t\":\"STRING\",\"o\":true}]"); 57 | 58 | iterator.init(parentIterator, options, new DefaultIteratorEnvironment()); 59 | iterator.seek(new Range(), AvroUtil.EMPTY_SET, false); 60 | 61 | RowBuilderField[] schemaMappingFields = new RowBuilderField[] { 62 | new RowBuilderField("cf2", null, "string", "v1") }; 63 | 64 | Schema schema = AvroSchemaBuilder.buildSchema(Arrays.asList(schemaMappingFields)); 65 | 66 | // ############################## ROW 1 67 | assertTrue(iterator.hasTop()); 68 | assertEquals("key1", iterator.getTopKey().getRow().toString()); 69 | 70 | // validate value 71 | byte[] data = iterator.getTopValue().get(); 72 | 73 | GenericRecord record = AvroUtil.deserialize(data, schema); 74 | 75 | assertEquals("abc", record.get("cf2").toString()); 76 | assertTrue(record.get("cf2") instanceof Utf8); 77 | 78 | // ############################## ROW 2 79 | iterator.next(); 80 | 81 | assertTrue(iterator.hasTop()); 82 | assertEquals("key2", iterator.getTopKey().getRow().toString()); 83 | 84 | // validate value 85 | data = iterator.getTopValue().get(); 86 | 87 | record = AvroUtil.deserialize(data, schema); 88 | 89 | assertEquals("def", record.get("cf2").toString()); 90 | assertTrue(record.get("cf2") instanceof Utf8); 91 | 92 | // End of data 93 | iterator.next(); 94 | assertFalse(iterator.hasTop()); 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /connector/zipfs/src/main/java/com/microsoft/accumulo/zipfs/ZipDirectoryStream.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved. 3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
4 | * 5 | * This code is free software; you can redistribute it and/or modify it 6 | * under the terms of the GNU General Public License version 2 only, as 7 | * published by the Free Software Foundation. Oracle designates this 8 | * particular file as subject to the "Classpath" exception as provided 9 | * by Oracle in the LICENSE file that accompanied this code. 10 | * 11 | * This code is distributed in the hope that it will be useful, but WITHOUT 12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 | * version 2 for more details (a copy is included in the LICENSE file that 15 | * accompanied this code). 16 | * 17 | * You should have received a copy of the GNU General Public License version 18 | * 2 along with this work; if not, write to the Free Software Foundation, 19 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 | * 21 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 | * or visit www.oracle.com if you need additional information or have any 23 | * questions. 24 | */ 25 | 26 | package com.microsoft.accumulo.zipfs; 27 | 28 | import java.io.IOException; 29 | import java.nio.file.ClosedDirectoryStreamException; 30 | import java.nio.file.DirectoryIteratorException; 31 | import java.nio.file.DirectoryStream; 32 | import java.nio.file.NotDirectoryException; 33 | import java.nio.file.Path; 34 | import java.util.Iterator; 35 | import java.util.NoSuchElementException; 36 | 37 | /** 38 | * 39 | * @author Xueming Shen, Rajendra Gutupalli, Jaya Hangal 40 | */ 41 | class ZipDirectoryStream implements DirectoryStream { 42 | 43 | private final ZipFileSystem zipfs; 44 | private final ZipPath dir; 45 | private final DirectoryStream.Filter filter; 46 | private volatile boolean isClosed; 47 | private volatile Iterator itr; 48 | 49 | ZipDirectoryStream(ZipPath dir, 50 | DirectoryStream.Filter filter) 51 | throws IOException 52 | { 53 | this.zipfs = dir.getFileSystem(); 54 | this.dir = dir; 55 | this.filter = filter; 56 | // sanity check 57 | if (!zipfs.isDirectory(dir.getResolvedPath())) 58 | throw new NotDirectoryException(dir.toString()); 59 | } 60 | 61 | @Override 62 | public synchronized Iterator iterator() { 63 | if (isClosed) 64 | throw new ClosedDirectoryStreamException(); 65 | if (itr != null) 66 | throw new IllegalStateException("Iterator has already been returned"); 67 | 68 | try { 69 | itr = zipfs.iteratorOf(dir, filter); 70 | } catch (IOException e) { 71 | throw new DirectoryIteratorException(e); 72 | } 73 | 74 | return new Iterator() { 75 | @Override 76 | public boolean hasNext() { 77 | if (isClosed) 78 | return false; 79 | return itr.hasNext(); 80 | } 81 | 82 | @Override 83 | public synchronized Path next() { 84 | if (isClosed) 85 | throw new NoSuchElementException(); 86 | return itr.next(); 87 | } 88 | 89 | @Override 90 | public void remove() { 91 | throw new UnsupportedOperationException(); 92 | } 93 | }; 94 | } 95 | 96 | @Override 97 | public synchronized void close() throws IOException { 98 | isClosed = true; 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /connector/iterator/src/main/java/com/microsoft/accumulo/spark/juel/AvroVariableMapper.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo.spark.juel; 19 | 20 | import java.util.HashMap; 21 | import java.util.Map; 22 | 23 | import javax.el.ValueExpression; 24 | import javax.el.VariableMapper; 25 | 26 | import com.microsoft.accumulo.spark.juel.expressions.AvroObjectExpression; 27 | import com.microsoft.accumulo.spark.juel.expressions.AvroVariableExpression; 28 | import com.microsoft.accumulo.spark.juel.expressions.RowKeyVariableExpression; 29 | import com.microsoft.accumulo.spark.record.AvroSchemaBuilder; 30 | import com.microsoft.accumulo.spark.record.RowBuilderType; 31 | import org.apache.avro.Schema; 32 | import org.apache.avro.Schema.Field; 33 | import org.apache.avro.Schema.Type; 34 | 35 | /** 36 | * Resolve JUEL variables against Avro schema. 37 | */ 38 | public class AvroVariableMapper extends VariableMapper { 39 | 40 | private static final String ROWKEY_VARIABLE_NAME = "rowKey"; 41 | 42 | private Schema schema; 43 | 44 | /** 45 | * fast lookup for variable names modelled by Avro aliases. 46 | */ 47 | private Map aliasMap; 48 | 49 | public AvroVariableMapper(Schema schema) { 50 | this.schema = schema; 51 | 52 | // build alias to VariableExpression map 53 | this.aliasMap = new HashMap<>(); 54 | for (Field field : schema.getFields()) { 55 | 56 | if (field.schema().getType() == Type.RECORD) { 57 | for (Field nestedField : field.schema().getFields()) { 58 | // find the corresponding java class 59 | Class nestedFieldClass = RowBuilderType 60 | .valueOf(nestedField.getProp(AvroSchemaBuilder.PROPERTY_ROWBUILDERTYPE)).getJavaClass(); 61 | 62 | for (String alias : nestedField.aliases()) 63 | this.aliasMap.put(alias, new AvroVariableExpression(nestedFieldClass, field.pos(), nestedField.pos())); 64 | } 65 | } else { 66 | // find the corresponding java class 67 | Class fieldClass = RowBuilderType.valueOf(field.getProp(AvroSchemaBuilder.PROPERTY_ROWBUILDERTYPE)) 68 | .getJavaClass(); 69 | for (String alias : field.aliases()) 70 | this.aliasMap.put(alias, new AvroVariableExpression(fieldClass, field.pos())); 71 | } 72 | } 73 | } 74 | 75 | /** 76 | * Resolve variables in this order: rowKey, mapped variables (e.g. v2 = cf1.cq1) 77 | * and finally using variable expressions. 78 | */ 79 | @Override 80 | public ValueExpression resolveVariable(String variable) { 81 | if (variable.equals(ROWKEY_VARIABLE_NAME)) 82 | return RowKeyVariableExpression.INSTANCE; 83 | 84 | // check if this is a statically resolved variable (e.g. v2 = cf1.cq1) 85 | AvroVariableExpression expr = this.aliasMap.get(variable); 86 | 87 | // otherwise default to dynamic lookup 88 | return expr != null ? 
expr : new AvroObjectExpression(this.schema.getField(variable)); 89 | } 90 | 91 | @Override 92 | public ValueExpression setVariable(String variable, ValueExpression expression) { 93 | return null; 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /connector/iterator/src/main/java/com/microsoft/accumulo/spark/processors/AvroRowFilter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo.spark.processors; 19 | 20 | import java.io.IOException; 21 | import java.util.Collection; 22 | import java.util.Collections; 23 | import java.util.Map; 24 | 25 | import javax.el.ExpressionFactory; 26 | import javax.el.ValueExpression; 27 | 28 | import com.microsoft.accumulo.spark.juel.AvroELContext; 29 | import com.microsoft.accumulo.spark.record.RowBuilderField; 30 | import org.apache.avro.Schema; 31 | import org.apache.avro.generic.IndexedRecord; 32 | import org.apache.commons.lang3.StringUtils; 33 | import org.apache.hadoop.io.Text; 34 | import org.apache.log4j.Logger; 35 | 36 | /** 37 | * Evaluates the user-supplied filter (JUEL syntax) against the constructed AVRO 38 | * record. 39 | * 40 | * Note: filter operates on AVRO Record object, not on the serialized version. 41 | */ 42 | public class AvroRowFilter extends AvroRowConsumer { 43 | private final static Logger logger = Logger.getLogger(AvroRowFilter.class); 44 | 45 | public static AvroRowFilter create(Map options, String optionKey) { 46 | String filter = options.get(optionKey); 47 | 48 | return StringUtils.isEmpty(filter) ? null : new AvroRowFilter(filter, optionKey); 49 | } 50 | 51 | /** 52 | * Required for cloning. 53 | */ 54 | private Schema schema; 55 | 56 | /** 57 | * Required for cloning. 
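* clone() rebuilds the compiled JUEL expression from this raw string by calling initialize(Schema) on the copy.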
58 | */ 59 | private String filter; 60 | 61 | private String optionKey; 62 | 63 | /** 64 | * JUEL expression context exposing AVRO GenericRecord 65 | */ 66 | private AvroELContext expressionContext; 67 | 68 | /** 69 | * JUEL filter expression 70 | */ 71 | private ValueExpression filterExpression; 72 | 73 | private AvroRowFilter(String filter, String optionKey) { 74 | logger.info(optionKey + " filter '" + filter + "'"); 75 | 76 | this.filter = filter; 77 | this.optionKey = optionKey; 78 | } 79 | 80 | @Override 81 | public String getName() { 82 | return super.getName() + " " + this.optionKey; 83 | } 84 | 85 | @Override 86 | protected boolean consumeInternal(Text rowKey, IndexedRecord record) throws IOException { 87 | // link AVRO record with JUEL expression context 88 | this.expressionContext.setCurrent(rowKey, record); 89 | 90 | return (boolean) filterExpression.getValue(this.expressionContext); 91 | } 92 | 93 | @Override 94 | public AvroRowConsumer clone() { 95 | AvroRowFilter copy = new AvroRowFilter(this.filter, this.optionKey); 96 | 97 | copy.initialize(schema); 98 | 99 | return copy; 100 | } 101 | 102 | @Override 103 | public Collection getSchemaFields() { 104 | return Collections.emptyList(); 105 | } 106 | 107 | @Override 108 | public void initialize(Schema schema) { 109 | this.schema = schema; 110 | this.expressionContext = new AvroELContext(schema); 111 | 112 | ExpressionFactory factory = ExpressionFactory.newInstance(); 113 | 114 | this.filterExpression = factory.createValueExpression(expressionContext, filter, boolean.class); 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /connector/iterator/src/main/java/com/microsoft/accumulo/spark/record/RowBuilderField.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo.spark.record; 19 | 20 | import com.google.gson.annotations.SerializedName; 21 | 22 | /** 23 | * POJO for the user-supplied schema fields. 
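* Instances are deserialized by Gson from the JSON schema option; the short @SerializedName keys (cf, cq, t, fvn, o) keep that JSON compact.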
24 | */ 25 | public class RowBuilderField { 26 | @SerializedName("cf") 27 | private String columnFamily; 28 | 29 | @SerializedName("cq") 30 | private String columnQualifier; 31 | 32 | @SerializedName("t") 33 | private String type; 34 | 35 | @SerializedName("fvn") 36 | private String filterVariableName; 37 | 38 | @SerializedName("o") 39 | private boolean output = true; 40 | 41 | private boolean nullable = true; 42 | 43 | public RowBuilderField() { 44 | } 45 | 46 | public RowBuilderField(String columnFamily, String columnQualifier, String type, String filterVariableName) { 47 | this.columnFamily = columnFamily; 48 | this.columnQualifier = columnQualifier; 49 | this.type = type; 50 | this.filterVariableName = filterVariableName; 51 | } 52 | 53 | public RowBuilderType getRowBuilderType() { 54 | return RowBuilderType.valueOfIgnoreCase(this.type); 55 | } 56 | 57 | /** 58 | * @return true if this field should be output, otherwise it's just needed for 59 | * intermediate processing (e.g. filtering). 60 | */ 61 | public boolean isOutput() { 62 | return output; 63 | } 64 | 65 | public void setOutput(boolean output) { 66 | this.output = output; 67 | } 68 | 69 | /** 70 | * @return the nullable 71 | */ 72 | public boolean isNullable() { 73 | return nullable; 74 | } 75 | 76 | /** 77 | * @param nullable the nullable to set 78 | */ 79 | public void setNullable(boolean nullable) { 80 | this.nullable = nullable; 81 | } 82 | 83 | /** 84 | * @param filterVariableName the filterVariableName to set 85 | */ 86 | public void setFilterVariableName(String filterVariableName) { 87 | this.filterVariableName = filterVariableName; 88 | } 89 | 90 | /** 91 | * @return the filterVariableName 92 | */ 93 | public String getFilterVariableName() { 94 | return filterVariableName; 95 | } 96 | 97 | /** 98 | * @param columnFamily the columnFamily to set 99 | */ 100 | public void setColumnFamily(String columnFamily) { 101 | this.columnFamily = columnFamily; 102 | } 103 | 104 | /** 105 | * @return the columnFamily 106 | */ 107 | public String getColumnFamily() { 108 | return columnFamily; 109 | } 110 | 111 | /** 112 | * @return the columnQualifier 113 | */ 114 | public String getColumnQualifier() { 115 | return columnQualifier; 116 | } 117 | 118 | /** 119 | * @param columnQualifier the columnQualifier to set 120 | */ 121 | public void setColumnQualifier(String columnQualifier) { 122 | this.columnQualifier = columnQualifier; 123 | } 124 | 125 | /** 126 | * @return the type 127 | */ 128 | public String getType() { 129 | return type; 130 | } 131 | 132 | /** 133 | * @param type the type to set 134 | */ 135 | public void setType(String type) { 136 | this.type = type; 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /connector/datasource/src/main/scala/com/microsoft/accumulo/FilterToJuel.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo 19 | 20 | import org.apache.spark.sql.sources._ 21 | 22 | case class AccumuloFilterResult(serializedFilter: String, 23 | supportedFilters: Seq[Filter], 24 | unsupportedFilters: Seq[Filter]) 25 | 26 | class FilterToJuel(val attributeToVariableMapping: Map[String, String], val rowKeyColumn: String = "rowKey") { 27 | def mapAttribute(attribute: String): String = { 28 | if (attribute == rowKeyColumn) 29 | "rowKey" 30 | else 31 | attributeToVariableMapping.getOrElse(attribute, attribute) 32 | } 33 | 34 | def serializeValue(value: Any): String = { 35 | value match { 36 | case str: String => 37 | // properly escape \ and ' 38 | val strEscaped = str 39 | .replace("\\", "\\\\") 40 | .replace("'", "\\'") 41 | 42 | "'" + strEscaped + "'" 43 | case other: Any => other.toString 44 | } 45 | } 46 | 47 | def serializeFilter(filter: Filter): String = { 48 | filter match { 49 | case op: And => s"(${serializeFilter(op.left)} && ${serializeFilter(op.right)})" 50 | case op: Or => s"(${serializeFilter(op.left)} || ${serializeFilter(op.right)})" 51 | case op: EqualTo => s"(${mapAttribute(op.attribute)} == ${serializeValue(op.value)})" 52 | case op: GreaterThan => s"(${mapAttribute(op.attribute)} > ${serializeValue(op.value)})" 53 | case op: GreaterThanOrEqual => s"(${mapAttribute(op.attribute)} >= ${serializeValue(op.value)})" 54 | case op: LessThan => s"(${mapAttribute(op.attribute)} < ${serializeValue(op.value)})" 55 | case op: LessThanOrEqual => s"(${mapAttribute(op.attribute)} <= ${serializeValue(op.value)})" 56 | case op: Not => s"(!${serializeFilter(op.child)})" 57 | case op: IsNull => s"(${mapAttribute(op.attribute)} == null)" 58 | case op: IsNotNull => 59 | // IsNotNull(cf1) will be generated for conditions like cf1.cq1 > 5 60 | // since we always create the struct, it's always true 61 | val variable = attributeToVariableMapping.get(op.attribute) 62 | 63 | if (variable.isEmpty) 64 | // assuming this comes for a nested column family, will always be true 65 | "true" 66 | else 67 | s"(${variable.get} != null)" 68 | case op: StringContains => s"${mapAttribute(op.attribute)}.contains(${serializeValue(op.value)})" 69 | case op: StringStartsWith => s"${mapAttribute(op.attribute)}.startsWith(${serializeValue(op.value)})" 70 | case op: StringEndsWith => s"${mapAttribute(op.attribute)}.endsWith(${serializeValue(op.value)})" 71 | case op: In => 72 | val values = op.values.map { v => serializeValue(v) } .mkString(",") 73 | s"${mapAttribute(op.attribute)}.in($values)" 74 | // TODO: not sure if null handling is properly done 75 | // TODO: EqualNullSafe 76 | case _ => throw new UnsupportedOperationException(s"Filter $filter not supported") 77 | } 78 | } 79 | 80 | def serializeFilters(filters: Array[Filter], filterStr: String): AccumuloFilterResult = 81 | { 82 | val (supported, unsupported) = filters.map({ f => { 83 | 84 | try { 85 | (serializeFilter(f), f) 86 | } catch { 87 | case _: UnsupportedOperationException => ("", f) 88 | } 89 | }}).partition(!_._1.isEmpty) 90 | 91 | var filter = supported.map(_._1) 92 | 93 | // append if 
provided 94 | if (filterStr.length > 0) 95 | filter = filter :+ s"($filterStr)" 96 | 97 | val finalFilter = filter.mkString(" && ") 98 | 99 | AccumuloFilterResult( 100 | finalFilter, 101 | supported.map(_._2), 102 | unsupported.map(_._2) 103 | ) 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /connector/datasource/src/main/scala/com/microsoft/accumulo/MLeapUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo 19 | 20 | import org.apache.spark.sql.types.StructField 21 | import ml.combust.bundle.BundleFile 22 | import ml.combust.mleap.runtime.MleapSupport._ 23 | import ml.combust.mleap.runtime.MleapContext.defaultContext 24 | import org.apache.spark.sql.mleap.TypeConverters 25 | import java.io.File 26 | import java.util.{Base64, HashMap} 27 | import java.net.URI 28 | import java.nio.file.{Files, FileSystem, FileSystems, Path, StandardOpenOption} 29 | import resource._ 30 | import ml.combust.mleap.core.types.ScalarType 31 | import com.google.common.jimfs.{Jimfs, Configuration} 32 | import com.microsoft.accumulo.zipfs.{ZipFileSystem, ZipFileSystemProvider} 33 | 34 | @SerialVersionUID(1L) 35 | object MLeapUtil { 36 | 37 | // load the Spark pipeline we saved in the previous section 38 | def mleapSchemaToCatalyst(modelBase64: String): Seq[StructField] = { 39 | if (modelBase64.isEmpty) 40 | Seq.empty[StructField] 41 | else { 42 | val mleapBundleArr = Base64.getDecoder().decode(modelBase64) 43 | 44 | val fs = Jimfs.newFileSystem(Configuration.unix()) 45 | val mleapFilePath = fs.getPath("/mleap.zip") 46 | Files.write(mleapFilePath, mleapBundleArr, StandardOpenOption.CREATE) 47 | 48 | // Why do we access a private constructor??? 49 | // 1. MLeap only exposes a FileSystem layer to load models. 50 | // 2. We don't want to write to the local file system 51 | // 2a. We use Google JimFS 52 | // 2b. We can't use https://github.com/marschall/memoryfilesystem at it has a 16MB file size limitation 53 | // 2c. We can't use Apache common-vfs as it doesn't support directory listing 54 | // 3. Usually one triggers the ZFS implementation by prefixing the URI with jar: 55 | // Unfortunately on Spark the file system provider disappears from the installed list https://stackoverflow.com/questions/39500445/filesystem-provider-disappearing-in-spark 56 | // thus it cannot be found by the ZFS implementation when looking up the jimfs: protocol 57 | // 4. The public methods (e.g. 
FileSystems.newFileSystem(), new ZipFileSystemProvider().newFileSystem()) have checks that limit the incoming FileSystemProvider 58 | 59 | // Attempt 10: try to find the jar provider, but then we don't know if the same methods exists :( 60 | // val zfsProvider = FileSystemProvider.installedProviders().asScala.filter(_.getScheme == "jar") 61 | // FileSystemProvider.installedProviders().asScala.foreach(p => println(p.getScheme)) 62 | 63 | // Attempt 9: hard dependency on Oracle JDK, fails on OpenJDK 64 | // package private ctor... *sigh* 65 | // import com.sun.nio.zipfs.{ZipFileSystem, ZipFileSystemProvider} 66 | // val zfsCtor = classOf[ZipFileSystem].getDeclaredConstructor( 67 | // classOf[ZipFileSystemProvider], 68 | // classOf[java.nio.file.Path], 69 | // classOf[java.util.Map[String, Object]]) 70 | 71 | // zfsCtor.setAccessible(true) 72 | // val zfs = zfsCtor.newInstance(new ZipFileSystemProvider, mleapFilePath, new java.util.HashMap[String, Object]) 73 | 74 | // moving to modified OpenJDK ZipFileSystem 75 | val zfs = new ZipFileSystem(new ZipFileSystemProvider, mleapFilePath, new HashMap[String, Object]) 76 | 77 | val mleapPipeline = (for(bf <- managed(BundleFile(zfs, zfs.getPath("/")))) yield { 78 | bf.loadMleapBundle().get.root 79 | }).tried.get 80 | 81 | // TODO: also process mleapPipeline.inputSchema to determine the required fields 82 | 83 | mleapPipeline.outputSchema.fields.flatMap { 84 | mleapField => { 85 | mleapField.dataType match { 86 | case _: ScalarType => Some(TypeConverters.mleapFieldToSparkField(mleapField)) 87 | case _ => None 88 | } 89 | } 90 | } 91 | } 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /connector/iterator/src/main/java/com/microsoft/accumulo/spark/juel/AvroResolver.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo.spark.juel; 19 | 20 | import java.beans.FeatureDescriptor; 21 | import java.util.Arrays; 22 | import java.util.Iterator; 23 | 24 | import javax.el.ELContext; 25 | import javax.el.ELException; 26 | import javax.el.ELResolver; 27 | 28 | import org.apache.avro.Schema.Field; 29 | import org.apache.avro.Schema.Type; 30 | import org.apache.avro.generic.GenericContainer; 31 | import org.apache.avro.generic.IndexedRecord; 32 | 33 | /** 34 | * Resolves variables and properties from AVRO GenericRecord. 
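* Also implements invoke() so JUEL filters can call in(), startsWith(), endsWith() and contains() on string-valued fields (Utf8 values are unwrapped first).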
35 | */ 36 | public class AvroResolver extends ELResolver { 37 | 38 | private static Class avroTypeToJavaType(Field field) { 39 | Type type = field.schema().getType(); 40 | 41 | if (type == Type.BOOLEAN) 42 | return boolean.class; 43 | else if (type == Type.DOUBLE) 44 | return double.class; 45 | else if (type == Type.FLOAT) 46 | return float.class; 47 | else if (type == Type.INT) 48 | return int.class; 49 | else if (type == Type.LONG) 50 | return long.class; 51 | else 52 | throw new IllegalArgumentException("Unsupported type: " + type); 53 | } 54 | 55 | @Override 56 | public Class getCommonPropertyType(ELContext context, Object base) { 57 | throw new ELException("getCommonPropertyType is not supported"); 58 | } 59 | 60 | @Override 61 | public Iterator getFeatureDescriptors(ELContext context, Object base) { 62 | return null; 63 | } 64 | 65 | @Override 66 | public Class getType(ELContext context, Object base, Object property) { 67 | return avroTypeToJavaType(((GenericContainer) base).getSchema().getField((String) property)); 68 | } 69 | 70 | @Override 71 | public Object getValue(ELContext context, Object base, Object property) { 72 | IndexedRecord record = (IndexedRecord) base; 73 | 74 | context.setPropertyResolved(true); 75 | 76 | // lookup field 77 | return record.get(record.getSchema().getField((String) property).pos()); 78 | } 79 | 80 | @Override 81 | public boolean isReadOnly(ELContext context, Object base, Object property) { 82 | return true; 83 | } 84 | 85 | @Override 86 | public void setValue(ELContext context, Object base, Object property, Object value) { 87 | throw new ELException("setValue is not supported"); 88 | } 89 | 90 | @Override 91 | public Object invoke(ELContext context, Object base, Object method, Class[] paramTypes, Object[] params) { 92 | if (method.equals("in")) { 93 | if (base instanceof AvroUtf8Wrapper) 94 | base = ((AvroUtf8Wrapper) base).getString(); 95 | 96 | context.setPropertyResolved(true); 97 | return Arrays.binarySearch(params, base) >= 0; 98 | } else if (params.length == 1) { 99 | if (base instanceof AvroUtf8Wrapper) 100 | base = ((AvroUtf8Wrapper) base).getString(); 101 | 102 | if (base instanceof String) { 103 | String baseStr = (String) base; 104 | String paramStr = (String) params[0]; 105 | 106 | // Spark methods available for pushdown 107 | if (method.equals("endsWith")) { 108 | context.setPropertyResolved(true); 109 | return baseStr.endsWith(paramStr); 110 | } 111 | 112 | if (method.equals("startsWith")) { 113 | context.setPropertyResolved(true); 114 | return baseStr.startsWith(paramStr); 115 | } 116 | 117 | if (method.equals("contains")) { 118 | context.setPropertyResolved(true); 119 | return baseStr.contains(paramStr); 120 | } 121 | } 122 | } 123 | 124 | return null; 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /connector/iterator/src/test/java/com/microsoft/accumulo/spark/AvroMLeapTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo.spark; 19 | 20 | import static org.junit.Assert.assertEquals; 21 | import static org.junit.Assert.assertFalse; 22 | import static org.junit.Assert.assertTrue; 23 | 24 | import java.io.IOException; 25 | import java.util.*; 26 | 27 | import org.apache.accumulo.core.client.lexicoder.DoubleLexicoder; 28 | import org.apache.accumulo.core.data.Key; 29 | import org.apache.accumulo.core.data.Range; 30 | import org.apache.accumulo.core.data.Value; 31 | import org.apache.accumulo.core.iterators.SortedMapIterator; 32 | import com.microsoft.accumulo.spark.processors.AvroRowMLeap; 33 | import org.apache.avro.generic.GenericRecord; 34 | import org.apache.commons.lang3.StringUtils; 35 | import org.junit.Test; 36 | 37 | import com.google.common.io.Resources; 38 | 39 | public class AvroMLeapTest { 40 | 41 | private AvroRowEncoderIterator createIterator(String mleapFilter) throws IOException { 42 | // load mleap model 43 | byte[] mleapBundle = Resources.toByteArray(AvroMLeapTest.class.getResource("pyspark.lr.zip")); 44 | String mleapBundleBase64 = Base64.getEncoder().encodeToString(mleapBundle); 45 | 46 | SortedMap map = new TreeMap<>(); 47 | map.put(new Key("key1", "cf1", "cq1"), new Value(new DoubleLexicoder().encode(0.0))); 48 | map.put(new Key("key2", "cf1", "cq1"), new Value(new DoubleLexicoder().encode(8.2))); 49 | 50 | SortedMapIterator parentIterator = new SortedMapIterator(map); 51 | AvroRowEncoderIterator iterator = new AvroRowEncoderIterator(); 52 | 53 | Map options = new HashMap<>(); 54 | options.put(AvroRowEncoderIterator.SCHEMA, 55 | "[{\"cf\":\"cf1\",\"cq\":\"cq1\",\"t\":\"double\",\"fvn\":\"v0\"},{\"cf\":\"cf1\",\"cq\":\"cq2\",\"t\":\"string\",\"o\":true}]"); 56 | 57 | // pass the model to the iterator 58 | options.put(AvroRowMLeap.MLEAP_BUNDLE, mleapBundleBase64); 59 | options.put(AvroRowMLeap.MLEAP_GUID, UUID.randomUUID().toString()); 60 | 61 | // map cf1.cq1 to fit the models input data frame 62 | options.put("column.feature.double", "${cf1.cq1}"); 63 | 64 | if (StringUtils.isNotBlank(mleapFilter)) 65 | options.put(AvroRowEncoderIterator.MLEAP_FILTER, mleapFilter); 66 | 67 | iterator.init(parentIterator, options, new DefaultIteratorEnvironment()); 68 | iterator.seek(new Range(), AvroUtil.EMPTY_SET, false); 69 | 70 | return iterator; 71 | } 72 | 73 | @Test 74 | public void testMLeapModelExecution() throws IOException { 75 | AvroRowEncoderIterator iterator = createIterator(null); 76 | 77 | // row 1 78 | assertTrue(iterator.hasTop()); 79 | GenericRecord record = AvroUtil.deserialize(iterator.getTopValue().get(), iterator.getSchema()); 80 | 81 | assertEquals("key1", iterator.getTopKey().getRow().toString()); 82 | assertEquals(-0.08748407856807701, (double) record.get("prediction"), 0.00001); 83 | 84 | // row2 85 | iterator.next(); 86 | 87 | assertTrue(iterator.hasTop()); 88 | record = AvroUtil.deserialize(iterator.getTopValue().get(), iterator.getSchema()); 89 | 90 | assertEquals("key2", iterator.getTopKey().getRow().toString()); 91 | assertEquals(0.8827512234363478, (double) record.get("prediction"), 
0.00001); 92 | 93 | // end 94 | iterator.next(); 95 | assertFalse(iterator.hasTop()); 96 | } 97 | 98 | @Test 99 | public void testMLeapModelPredictionFiltering() throws IOException { 100 | AvroRowEncoderIterator iterator = createIterator("${prediction > 0.7}"); 101 | 102 | assertTrue(iterator.hasTop()); 103 | assertEquals("key2", iterator.getTopKey().getRow().toString()); 104 | 105 | // end 106 | iterator.next(); 107 | assertFalse(iterator.hasTop()); 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /connector/datasource/src/main/scala/com/microsoft/accumulo/AccumuloDataSourceWriter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo 19 | 20 | import org.apache.spark.sql.SaveMode 21 | import org.apache.spark.sql.catalyst.InternalRow 22 | import org.apache.spark.sql.sources.v2.DataSourceOptions 23 | import org.apache.spark.sql.sources.v2.writer.{DataSourceWriter, DataWriter, DataWriterFactory, WriterCommitMessage} 24 | import org.apache.spark.sql.types.StructType 25 | import org.apache.accumulo.core.client.{Accumulo, AccumuloClient} 26 | import org.apache.hadoop.io.Text 27 | 28 | import scala.collection.JavaConverters._ 29 | import org.apache.log4j.Logger 30 | 31 | class AccumuloDataSourceWriter(schema: StructType, mode: SaveMode, options: DataSourceOptions) 32 | extends DataSourceWriter { 33 | 34 | private val logger = Logger.getLogger(classOf[AccumuloDataSourceWriter]) 35 | 36 | val tableName: String = options.tableName.get 37 | val properties = new java.util.Properties() 38 | // cannot use .putAll(options.asMap()) due to https://github.com/scala/bug/issues/10418 39 | options.asMap.asScala.foreach { case (k, v) => properties.setProperty(k, v) } 40 | 41 | // defaults based on https://accumulo.apache.org/docs/2.x/configuration/client-properties 42 | val batchThread: Int = options.get("batchThread").orElse("3").toInt 43 | val batchMemory: Long = options.get("batchMemory").orElse("50000000").toLong 44 | 45 | val client: AccumuloClient = Accumulo.newClient().from(properties).build() 46 | val tableExists: Boolean = client.tableOperations.exists(tableName) 47 | val ignore: Boolean = mode == SaveMode.Ignore && tableExists 48 | 49 | // enforce write mode 50 | try { 51 | if (tableExists) { 52 | if (mode == SaveMode.ErrorIfExists) 53 | // this should throw an error 54 | createTable() 55 | else if (mode == SaveMode.Overwrite) { 56 | client.tableOperations.delete(tableName) 57 | createTable() 58 | } 59 | } else { 60 | createTable() 61 | } 62 | } catch { 63 | // re-throw exception 64 | case exception: Throwable => throw exception 65 | } 
finally { 66 | // always close the client 67 | client.close() 68 | } 69 | 70 | def createTable(): Unit = { 71 | // adding splits to a newly created table 72 | val splits = new java.util.TreeSet( 73 | properties.getProperty("splits", "") 74 | .split(",") 75 | .map(new Text(_)) 76 | .toSeq 77 | .asJava) 78 | 79 | logger.info(s"Creating table: $tableName") 80 | client.tableOperations.create(tableName) 81 | 82 | if (!splits.isEmpty) { 83 | logger.info(s"Adding splits: $splits") 84 | client.tableOperations.addSplits(tableName, splits) 85 | } 86 | } 87 | 88 | override def createWriterFactory(): DataWriterFactory[InternalRow] = { 89 | new AccumuloDataWriterFactory(tableName, schema, mode, properties, batchThread, batchMemory, ignore) 90 | } 91 | 92 | override def commit(messages: Array[WriterCommitMessage]): Unit = { 93 | } 94 | 95 | override def abort(messages: Array[WriterCommitMessage]): Unit = { 96 | } 97 | } 98 | 99 | class AccumuloDataWriterFactory(tableName: String, 100 | schema: StructType, 101 | mode: SaveMode, 102 | properties: java.util.Properties, 103 | batchThread: Int, 104 | batchMemory: Long, 105 | ignore: Boolean) 106 | extends DataWriterFactory[InternalRow] { 107 | override def createDataWriter(partitionId: Int, taskId: Long, epochId: Long): DataWriter[InternalRow] = { 108 | new AccumuloDataWriter(tableName, schema, mode, properties, batchThread, batchMemory, ignore) 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /connector/iterator/src/test/java/com/microsoft/accumulo/spark/AvroFilterTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo.spark; 19 | 20 | import static org.junit.Assert.assertArrayEquals; 21 | 22 | import java.io.IOException; 23 | import java.util.ArrayList; 24 | import java.util.HashMap; 25 | import java.util.List; 26 | import java.util.Map; 27 | import java.util.SortedMap; 28 | import java.util.TreeMap; 29 | 30 | import org.apache.accumulo.core.client.lexicoder.LongLexicoder; 31 | import org.apache.accumulo.core.data.Key; 32 | import org.apache.accumulo.core.data.Range; 33 | import org.apache.accumulo.core.data.Value; 34 | import org.apache.accumulo.core.iterators.SortedMapIterator; 35 | import org.junit.Test; 36 | 37 | public class AvroFilterTest { 38 | 39 | private static void validateFilter(String filter, String... 
expectedKeys) throws IOException { 40 | SortedMap map = new TreeMap<>(); 41 | map.put(new Key("key1", "cf1", "cq1"), new Value(new LongLexicoder().encode(3L))); 42 | map.put(new Key("key1", "cf1", "cq2"), new Value("Hello")); 43 | map.put(new Key("key1", "cf2", ""), new Value("abc")); 44 | 45 | map.put(new Key("key2", "cf2"), new Value("def")); 46 | 47 | SortedMapIterator parentIterator = new SortedMapIterator(map); 48 | AvroRowEncoderIterator iterator = new AvroRowEncoderIterator(); 49 | 50 | Map options = new HashMap<>(); 51 | options.put(AvroRowEncoderIterator.SCHEMA, 52 | "[{\"cf\":\"cf1\",\"cq\":\"cq1\",\"t\":\"long\",\"fvn\":\"v0\",\"o\":true},{\"cf\":\"cf1\",\"cq\":\"cq2\",\"t\":\"string\",\"o\":true},{\"cf\":\"cf2\",\"t\":\"STRING\",\"o\":true,\"fvn\":\"v1\",\"o\":true}]"); 53 | 54 | options.put(AvroRowEncoderIterator.FILTER, filter); 55 | 56 | // include computed column 57 | options.put("column.vc1.long", "${cf1.cq1 + 5}"); 58 | 59 | iterator.init(parentIterator, options, new DefaultIteratorEnvironment()); 60 | iterator.seek(new Range(), AvroUtil.EMPTY_SET, false); 61 | 62 | // collect rows 63 | List foundRows = new ArrayList<>(); 64 | for (; iterator.hasTop(); iterator.next()) 65 | foundRows.add(iterator.getTopKey().getRow().toString()); 66 | 67 | assertArrayEquals(expectedKeys, foundRows.toArray(new String[0])); 68 | } 69 | 70 | @Test 71 | public void testComputedColumn() throws IOException { 72 | validateFilter("${vc1 == 8}", "key1"); 73 | } 74 | 75 | @Test 76 | public void testEquals() throws IOException { 77 | validateFilter("${v0 == 3}", "key1"); 78 | validateFilter("${v0 != 2}", "key1", "key2"); 79 | validateFilter("${v1 == 'def'}", "key2"); 80 | } 81 | 82 | @Test 83 | public void testIsNull() throws IOException { 84 | validateFilter("${v0 == null}", "key2"); 85 | } 86 | 87 | @Test 88 | public void testEndsWith() throws IOException { 89 | // test on variable 90 | validateFilter("${v1.endsWith('ef')}", "key2"); 91 | 92 | // test on object/property combination 93 | validateFilter("${cf1.cq2.endsWith('ello')}", "key1"); 94 | } 95 | 96 | @Test 97 | public void testStartsWith() throws IOException { 98 | // test on variable 99 | validateFilter("${v1.startsWith('de')}", "key2"); 100 | 101 | // test on object/property combination 102 | validateFilter("${cf1.cq2.startsWith('Hel')}", "key1"); 103 | } 104 | 105 | @Test 106 | public void testContains() throws IOException { 107 | // test on variable 108 | validateFilter("${v1.contains('d')}", "key2"); 109 | 110 | // test on object/property combination 111 | validateFilter("${cf1.cq2.contains('ell')}", "key1"); 112 | } 113 | 114 | @Test 115 | public void testIn() throws IOException { 116 | // test on variable 117 | validateFilter("${v1.in('aaa','def')}", "key2"); 118 | 119 | // test on object/property combination 120 | validateFilter("${cf1.cq2.in('A','Hello','xxx')}", "key1"); 121 | 122 | // test on object/property combination 123 | validateFilter("${cf1.cq2.in('Hello')}", "key1"); 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /connector/datasource/src/test/scala/com/microsoft/accumulo/VerifyFilterToJuel.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo 19 | 20 | import java.io.ByteArrayOutputStream 21 | 22 | import org.apache.spark.sql.sources._ 23 | import org.junit.runner.RunWith 24 | 25 | import org.scalatest.FunSuite 26 | import org.scalatest.junit.JUnitRunner 27 | 28 | @RunWith(classOf[JUnitRunner]) 29 | class VerifyFilterToJuel extends FunSuite { 30 | val map = Map[String, String]( 31 | "i" -> "i0", 32 | "x" -> "x", 33 | "j" -> "j", 34 | "k" -> "k", 35 | "x.yZ" -> "xyZ") 36 | 37 | test("Validate filter to juel operators") { 38 | assert("(i0 == 5)".equals(new FilterToJuel(map).serializeFilter(new EqualTo("i", 5)))) 39 | assert("(i0 >= 5)".equals(new FilterToJuel(map).serializeFilter(new GreaterThanOrEqual("i", 5)))) 40 | assert("(i0 > 5)".equals(new FilterToJuel(map).serializeFilter(new GreaterThan("i", 5)))) 41 | assert("(i0 <= 5)".equals(new FilterToJuel(map).serializeFilter(new LessThanOrEqual("i", 5)))) 42 | assert("(i0 < 5)".equals(new FilterToJuel(map).serializeFilter(new LessThan("i", 5)))) 43 | assert("(i0 == null)".equals(new FilterToJuel(map).serializeFilter(new IsNull("i")))) 44 | assert("(i0 != null)".equals(new FilterToJuel(map).serializeFilter(new IsNotNull("i")))) 45 | } 46 | 47 | test("Validate filter to juel composed operators") { 48 | assert("(!(i0 == 5))".equals(new FilterToJuel(map).serializeFilter( 49 | new Not(new EqualTo("i", 5))))) 50 | 51 | assert("((i0 == 5) && (x == 3.0))".equals(new FilterToJuel(map).serializeFilter( 52 | new And(new EqualTo("i", 5), new EqualTo("x", 3.0))))) 53 | 54 | assert("((i0 == 5) || (x == 3.0))".equals(new FilterToJuel(map).serializeFilter( 55 | new Or(new EqualTo("i", 5), new EqualTo("x", 3.0))))) 56 | } 57 | 58 | test("Validate filter to juel string operators") { 59 | assert("x.contains('abc')".equals(new FilterToJuel(map).serializeFilter( 60 | new StringContains("x", "abc")))) 61 | assert("x.startsWith('abc')".equals(new FilterToJuel(map).serializeFilter( 62 | new StringStartsWith("x", "abc")))) 63 | assert("x.endsWith('abc')".equals(new FilterToJuel(map).serializeFilter( 64 | new StringEndsWith("x", "abc")))) 65 | } 66 | 67 | test("Validate filter to juel in operator") { 68 | assert("xyZ.in('abc','def','ghi')".equals(new FilterToJuel(map).serializeFilter( 69 | new In("x.yZ", Array("abc", "def", "ghi"))))) 70 | } 71 | 72 | test("Validate filter string escape") { 73 | assert("(i0 == '\\'')".equals(new FilterToJuel(map).serializeFilter(new EqualTo("i", "'")))) 74 | assert("(i0 == '\\\\')".equals(new FilterToJuel(map).serializeFilter(new EqualTo("i", "\\")))) 75 | assert("(i0 == '\\\\\\'')".equals(new FilterToJuel(map).serializeFilter(new EqualTo("i", "\\'")))) 76 | } 77 | 78 | test("Validate filter combining") { 79 | val filters = Array[Filter]( 80 | new EqualTo("i", 5), 81 | new EqualTo("j", 3), 82 | new EqualTo("k", 4) 83 | ) 84 | 85 | val result = new FilterToJuel(map).serializeFilters(filters, "") 86 | 87 | assert("(i0 == 5) 
&& (j == 3) && (k == 4)".equals(result.serializedFilter)) 88 | assert(filters.length == result.supportedFilters.length) 89 | 90 | assert(result.unsupportedFilters.isEmpty) 91 | } 92 | 93 | test("Validate filter with rowKey and manual filter") { 94 | val filters = Array[Filter]( 95 | new EqualTo("i", 5), 96 | new EqualTo("j", 3), 97 | new EqualTo("k", 4), 98 | new EqualTo("rowKey", "foo") 99 | ) 100 | 101 | val result = new FilterToJuel(map).serializeFilters(filters, "a.b == 3") 102 | 103 | assert("(i0 == 5) && (j == 3) && (k == 4) && (rowKey == 'foo') && (a.b == 3)".equals(result.serializedFilter)) 104 | assert(filters.length == result.supportedFilters.length) 105 | 106 | assert(result.unsupportedFilters.isEmpty) 107 | } 108 | 109 | test("Validate filter supports unknown attributes (e.g. for prediction)") { 110 | assert("(prediction > 5)".equals(new FilterToJuel(map).serializeFilter(new GreaterThan("prediction", 5)))) 111 | } 112 | } -------------------------------------------------------------------------------- /connector/integration-test/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 4.0.0 20 | 21 | com.microsoft.masc 22 | microsoft-accumulo-spark 23 | 1.0.4 24 | 25 | com.microsoft.masc 26 | microsoft-accumulo-spark-integration-test 27 | 1.0.4 28 | Microsoft MASC, an Apache Spark connector for Apache Accumulo - Integration Test 29 | 30 | 31 | com.microsoft.masc 32 | microsoft-accumulo-spark-datasource 33 | ${project.version} 34 | test 35 | 36 | 37 | com.microsoft.masc 38 | microsoft-accumulo-spark-iterator 39 | ${project.version} 40 | test 41 | 42 | 43 | org.apache.accumulo 44 | accumulo-core 45 | test 46 | 47 | 48 | org.apache.accumulo 49 | accumulo-minicluster 50 | ${accumulo.version} 51 | test 52 | 53 | 54 | ml.combust.mleap 55 | mleap-spark_${scala.compat.version} 56 | test 57 | 58 | 59 | org.apache.spark 60 | spark-core_${scala.compat.version} 61 | test 62 | 63 | 64 | org.apache.spark 65 | spark-mllib_${scala.compat.version} 66 | test 67 | 68 | 69 | com.fasterxml.jackson.core 70 | jackson-databind 71 | 2.6.7.1 72 | test 73 | 74 | 75 | junit 76 | junit 77 | test 78 | 79 | 80 | 81 | 82 | 83 | org.apache.maven.plugins 84 | maven-compiler-plugin 85 | 86 | 87 | org.apache.maven.plugins 88 | maven-failsafe-plugin 89 | 2.22.2 90 | 91 | false 92 | 93 | 94 | 95 | 96 | integration-test 97 | verify 98 | 99 | 100 | 101 | 102 | 103 | 104 | org.apache.accumulo 105 | accumulo2-maven-plugin 106 | 1.0.0 107 | 108 | spark-connector-instance 109 | ITSecret 110 | 111 | 112 | 113 | run-plugin 114 | 115 | start 116 | stop 117 | 118 | 119 | 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /connector/iterator/src/test/java/com/microsoft/accumulo/spark/AvroRowEncoderIteratorTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo.spark; 19 | 20 | import static org.junit.Assert.assertEquals; 21 | import static org.junit.Assert.assertFalse; 22 | import static org.junit.Assert.assertTrue; 23 | 24 | import java.io.IOException; 25 | import java.util.HashMap; 26 | import java.util.Map; 27 | import java.util.SortedMap; 28 | import java.util.TreeMap; 29 | 30 | import org.apache.accumulo.core.data.Key; 31 | import org.apache.accumulo.core.data.Range; 32 | import org.apache.accumulo.core.data.Value; 33 | import org.apache.accumulo.core.iterators.SortedMapIterator; 34 | import org.apache.avro.Schema; 35 | import org.apache.avro.SchemaBuilder; 36 | import org.apache.avro.generic.GenericRecord; 37 | import org.junit.Test; 38 | 39 | public class AvroRowEncoderIteratorTest { 40 | class MyRow { 41 | public String key; 42 | 43 | public String cf1cq1; 44 | 45 | MyRow(String key, String cf1cq1) { 46 | this.key = key; 47 | this.cf1cq1 = cf1cq1; 48 | } 49 | } 50 | 51 | private void validateSingleRowSimpleSchema(SortedMap map, MyRow... expectedRows) throws IOException { 52 | SortedMapIterator parentIterator = new SortedMapIterator(map); 53 | 54 | // setup avro encoder iterator 55 | AvroRowEncoderIterator iterator = new AvroRowEncoderIterator(); 56 | 57 | Map options = new HashMap<>(); 58 | options.put(AvroRowEncoderIterator.SCHEMA, "[{\"cf\":\"cf1\",\"cq\":\"cq1\",\"t\":\"STRING\",\"o\":true}]"); 59 | 60 | iterator.init(parentIterator, options, new DefaultIteratorEnvironment()); 61 | iterator.seek(new Range(), AvroUtil.EMPTY_SET, false); 62 | 63 | // the expected avro schema 64 | Schema schema = SchemaBuilder.record("root").fields().name("cf1") 65 | .type(SchemaBuilder.record("cf1").fields().optionalString("cq1").endRecord()).noDefault().endRecord(); 66 | 67 | for (MyRow row : expectedRows) { 68 | assertTrue(iterator.hasTop()); 69 | 70 | // validate key 71 | assertEquals(row.key, iterator.getTopKey().getRow().toString()); 72 | 73 | // validate value 74 | byte[] data = iterator.getTopValue().get(); 75 | 76 | GenericRecord record = AvroUtil.deserialize(data, schema); 77 | GenericRecord cf1Record = (GenericRecord) record.get("cf1"); 78 | 79 | assertEquals(row.cf1cq1, cf1Record.get("cq1").toString()); 80 | 81 | // move to next 82 | iterator.next(); 83 | } 84 | 85 | assertFalse(iterator.hasTop()); 86 | } 87 | 88 | @Test 89 | public void testSingleFieldString() throws IOException { 90 | // setup input iterator 91 | SortedMap map = new TreeMap<>(); 92 | map.put(new Key("key1", "cf1", "cq1"), new Value("abc")); 93 | 94 | validateSingleRowSimpleSchema(map, new MyRow("key1", "abc")); 95 | } 96 | 97 | @Test 98 | public void testSkippedField1() throws IOException { 99 | // setup input iterator 100 | SortedMap map = new TreeMap<>(); 101 | map.put(new Key("key1", "cf1", "cq1"), new Value("abc")); 102 | map.put(new Key("key1", "cf1", "cq2"), new Value("def")); 103 | 104 | validateSingleRowSimpleSchema(map, new MyRow("key1", "abc")); 105 | } 106 | 107 | @Test 108 | public void testSkippedField2() throws IOException { 109 | // setup input iterator 110 | SortedMap map = 
new TreeMap<>(); 111 | map.put(new Key("key1", "cf0", "cq1"), new Value("xxx")); 112 | map.put(new Key("key1", "cf1", "cq1"), new Value("abc")); 113 | map.put(new Key("key1", "cf1", "cq2"), new Value("def")); 114 | 115 | validateSingleRowSimpleSchema(map, new MyRow("key1", "abc")); 116 | } 117 | 118 | @Test 119 | public void testMultipleRows() throws IOException { 120 | // setup input iterator 121 | SortedMap map = new TreeMap<>(); 122 | map.put(new Key("key1", "cf1", "cq1"), new Value("xxx")); 123 | map.put(new Key("key2", "cf0", "cq1"), new Value("abc")); 124 | map.put(new Key("key3", "cf1", "cq1"), new Value("yyy")); 125 | 126 | validateSingleRowSimpleSchema(map, new MyRow("key1", "xxx"), new MyRow("key3", "yyy")); 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /connector/zipfs/src/main/java/com/microsoft/accumulo/zipfs/ZipFileStore.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved. 3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 | * 5 | * This code is free software; you can redistribute it and/or modify it 6 | * under the terms of the GNU General Public License version 2 only, as 7 | * published by the Free Software Foundation. Oracle designates this 8 | * particular file as subject to the "Classpath" exception as provided 9 | * by Oracle in the LICENSE file that accompanied this code. 10 | * 11 | * This code is distributed in the hope that it will be useful, but WITHOUT 12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 | * version 2 for more details (a copy is included in the LICENSE file that 15 | * accompanied this code). 16 | * 17 | * You should have received a copy of the GNU General Public License version 18 | * 2 along with this work; if not, write to the Free Software Foundation, 19 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 | * 21 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 | * or visit www.oracle.com if you need additional information or have any 23 | * questions. 
24 | */ 25 | 26 | package com.microsoft.accumulo.zipfs; 27 | 28 | import java.io.IOException; 29 | import java.nio.file.FileStore; 30 | import java.nio.file.FileSystems; 31 | import java.nio.file.Files; 32 | import java.nio.file.Path; 33 | import java.nio.file.attribute.BasicFileAttributeView; 34 | import java.nio.file.attribute.FileAttributeView; 35 | import java.nio.file.attribute.FileOwnerAttributeView; 36 | import java.nio.file.attribute.FileStoreAttributeView; 37 | import java.nio.file.attribute.PosixFileAttributeView; 38 | 39 | /** 40 | * @author Xueming Shen, Rajendra Gutupalli, Jaya Hangal 41 | */ 42 | class ZipFileStore extends FileStore { 43 | 44 | private final ZipFileSystem zfs; 45 | 46 | ZipFileStore(ZipPath zpath) { 47 | this.zfs = zpath.getFileSystem(); 48 | } 49 | 50 | @Override 51 | public String name() { 52 | return zfs.toString() + "/"; 53 | } 54 | 55 | @Override 56 | public String type() { 57 | return "zipfs"; 58 | } 59 | 60 | @Override 61 | public boolean isReadOnly() { 62 | return zfs.isReadOnly(); 63 | } 64 | 65 | @Override 66 | public boolean supportsFileAttributeView(Class type) { 67 | return (type == BasicFileAttributeView.class || 68 | type == ZipFileAttributeView.class || 69 | ((type == FileOwnerAttributeView.class || 70 | type == PosixFileAttributeView.class) && zfs.supportPosix)); 71 | } 72 | 73 | @Override 74 | public boolean supportsFileAttributeView(String name) { 75 | return "basic".equals(name) || "zip".equals(name) || 76 | (("owner".equals(name) || "posix".equals(name)) && zfs.supportPosix); 77 | } 78 | 79 | @Override 80 | public V getFileStoreAttributeView(Class type) { 81 | if (type == null) 82 | throw new NullPointerException(); 83 | return null; 84 | } 85 | 86 | @Override 87 | public long getTotalSpace() throws IOException { 88 | return new ZipFileStoreAttributes(this).totalSpace(); 89 | } 90 | 91 | @Override 92 | public long getUsableSpace() throws IOException { 93 | return new ZipFileStoreAttributes(this).usableSpace(); 94 | } 95 | 96 | @Override 97 | public long getUnallocatedSpace() throws IOException { 98 | return new ZipFileStoreAttributes(this).unallocatedSpace(); 99 | } 100 | 101 | @Override 102 | public Object getAttribute(String attribute) throws IOException { 103 | if (attribute.equals("totalSpace")) 104 | return getTotalSpace(); 105 | if (attribute.equals("usableSpace")) 106 | return getUsableSpace(); 107 | if (attribute.equals("unallocatedSpace")) 108 | return getUnallocatedSpace(); 109 | throw new UnsupportedOperationException("does not support the given attribute"); 110 | } 111 | 112 | private static class ZipFileStoreAttributes { 113 | final FileStore fstore; 114 | final long size; 115 | 116 | ZipFileStoreAttributes(ZipFileStore fileStore) 117 | throws IOException 118 | { 119 | Path path = FileSystems.getDefault().getPath(fileStore.name()); 120 | this.size = Files.size(path); 121 | this.fstore = Files.getFileStore(path); 122 | } 123 | 124 | long totalSpace() { 125 | return size; 126 | } 127 | 128 | long usableSpace() throws IOException { 129 | if (!fstore.isReadOnly()) 130 | return fstore.getUsableSpace(); 131 | return 0; 132 | } 133 | 134 | long unallocatedSpace() throws IOException { 135 | if (!fstore.isReadOnly()) 136 | return fstore.getUnallocatedSpace(); 137 | return 0; 138 | } 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /connector/zipfs/src/main/java/com/microsoft/accumulo/zipfs/ZipCoder.java: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2009, 2018, Oracle and/or its affiliates. All rights reserved. 3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 | * 5 | * This code is free software; you can redistribute it and/or modify it 6 | * under the terms of the GNU General Public License version 2 only, as 7 | * published by the Free Software Foundation. Oracle designates this 8 | * particular file as subject to the "Classpath" exception as provided 9 | * by Oracle in the LICENSE file that accompanied this code. 10 | * 11 | * This code is distributed in the hope that it will be useful, but WITHOUT 12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 | * version 2 for more details (a copy is included in the LICENSE file that 15 | * accompanied this code). 16 | * 17 | * You should have received a copy of the GNU General Public License version 18 | * 2 along with this work; if not, write to the Free Software Foundation, 19 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 | * 21 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 | * or visit www.oracle.com if you need additional information or have any 23 | * questions. 24 | */ 25 | 26 | package com.microsoft.accumulo.zipfs; 27 | 28 | import java.nio.ByteBuffer; 29 | import java.nio.CharBuffer; 30 | import java.nio.charset.Charset; 31 | import java.nio.charset.CharsetDecoder; 32 | import java.nio.charset.CharsetEncoder; 33 | import java.nio.charset.CoderResult; 34 | import java.nio.charset.CodingErrorAction; 35 | import java.util.Arrays; 36 | 37 | import static java.nio.charset.StandardCharsets.ISO_8859_1; 38 | import static java.nio.charset.StandardCharsets.UTF_8; 39 | 40 | /** 41 | * Utility class for zipfile name and comment decoding and encoding 42 | * 43 | * @author Xueming Shen 44 | */ 45 | class ZipCoder { 46 | 47 | static class UTF8 extends ZipCoder { 48 | UTF8() { 49 | super(UTF_8); 50 | } 51 | 52 | @Override 53 | byte[] getBytes(String s) { // fast pass for ascii 54 | for (int i = 0; i < s.length(); i++) { 55 | if (s.charAt(i) > 0x7f) return super.getBytes(s); 56 | } 57 | return s.getBytes(ISO_8859_1); 58 | } 59 | 60 | @Override 61 | String toString(byte[] ba) { 62 | for (byte b : ba) { 63 | if (b < 0) return super.toString(ba); 64 | } 65 | return new String(ba, ISO_8859_1); 66 | } 67 | } 68 | 69 | private static final ZipCoder utf8 = new UTF8(); 70 | 71 | public static ZipCoder get(String csn) { 72 | Charset cs = Charset.forName(csn); 73 | if (cs.name().equals("UTF-8")) { 74 | return utf8; 75 | } 76 | return new ZipCoder(cs); 77 | } 78 | 79 | String toString(byte[] ba) { 80 | CharsetDecoder cd = decoder().reset(); 81 | int clen = (int)(ba.length * cd.maxCharsPerByte()); 82 | char[] ca = new char[clen]; 83 | if (clen == 0) 84 | return new String(ca); 85 | ByteBuffer bb = ByteBuffer.wrap(ba, 0, ba.length); 86 | CharBuffer cb = CharBuffer.wrap(ca); 87 | CoderResult cr = cd.decode(bb, cb, true); 88 | if (!cr.isUnderflow()) 89 | throw new IllegalArgumentException(cr.toString()); 90 | cr = cd.flush(cb); 91 | if (!cr.isUnderflow()) 92 | throw new IllegalArgumentException(cr.toString()); 93 | return new String(ca, 0, cb.position()); 94 | } 95 | 96 | byte[] getBytes(String s) { 97 | CharsetEncoder ce = encoder().reset(); 98 | char[] ca = s.toCharArray(); 99 | int len = (int)(ca.length * 
ce.maxBytesPerChar()); 100 | byte[] ba = new byte[len]; 101 | if (len == 0) 102 | return ba; 103 | ByteBuffer bb = ByteBuffer.wrap(ba); 104 | CharBuffer cb = CharBuffer.wrap(ca); 105 | CoderResult cr = ce.encode(cb, bb, true); 106 | if (!cr.isUnderflow()) 107 | throw new IllegalArgumentException(cr.toString()); 108 | cr = ce.flush(bb); 109 | if (!cr.isUnderflow()) 110 | throw new IllegalArgumentException(cr.toString()); 111 | if (bb.position() == ba.length) // defensive copy? 112 | return ba; 113 | else 114 | return Arrays.copyOf(ba, bb.position()); 115 | } 116 | 117 | boolean isUTF8() { 118 | return cs == UTF_8; 119 | } 120 | 121 | private Charset cs; 122 | 123 | private ZipCoder(Charset cs) { 124 | this.cs = cs; 125 | } 126 | 127 | private final ThreadLocal decTL = new ThreadLocal<>(); 128 | private final ThreadLocal encTL = new ThreadLocal<>(); 129 | 130 | private CharsetDecoder decoder() { 131 | CharsetDecoder dec = decTL.get(); 132 | if (dec == null) { 133 | dec = cs.newDecoder() 134 | .onMalformedInput(CodingErrorAction.REPORT) 135 | .onUnmappableCharacter(CodingErrorAction.REPORT); 136 | decTL.set(dec); 137 | } 138 | return dec; 139 | } 140 | 141 | private CharsetEncoder encoder() { 142 | CharsetEncoder enc = encTL.get(); 143 | if (enc == null) { 144 | enc = cs.newEncoder() 145 | .onMalformedInput(CodingErrorAction.REPORT) 146 | .onUnmappableCharacter(CodingErrorAction.REPORT); 147 | encTL.set(enc); 148 | } 149 | return enc; 150 | } 151 | } 152 | -------------------------------------------------------------------------------- /connector/iterator/src/test/java/com/microsoft/accumulo/spark/AvroRowTopLevelTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.microsoft.accumulo.spark; 19 | 20 | import static org.junit.Assert.assertEquals; 21 | import static org.junit.Assert.assertFalse; 22 | import static org.junit.Assert.assertNull; 23 | import static org.junit.Assert.assertTrue; 24 | 25 | import java.io.IOException; 26 | import java.util.Arrays; 27 | import java.util.HashMap; 28 | import java.util.Map; 29 | import java.util.SortedMap; 30 | import java.util.TreeMap; 31 | 32 | import org.apache.accumulo.core.client.lexicoder.LongLexicoder; 33 | import org.apache.accumulo.core.data.Key; 34 | import org.apache.accumulo.core.data.Range; 35 | import org.apache.accumulo.core.data.Value; 36 | import org.apache.accumulo.core.iterators.SortedMapIterator; 37 | import com.microsoft.accumulo.spark.record.AvroSchemaBuilder; 38 | import com.microsoft.accumulo.spark.record.RowBuilderField; 39 | import org.apache.avro.Schema; 40 | import org.apache.avro.Schema.Field; 41 | import org.apache.avro.Schema.Type; 42 | import org.apache.avro.generic.GenericRecord; 43 | import org.apache.avro.util.Utf8; 44 | import org.junit.Test; 45 | 46 | public class AvroRowTopLevelTest { 47 | @Test 48 | public void testSchemaGeneration() { 49 | RowBuilderField[] schemaMappingFields = new RowBuilderField[] { 50 | // row 0 51 | new RowBuilderField("cf1", "cq1", "long", "v0"), 52 | // row 1 53 | new RowBuilderField("cf2", null, "double", "v1") }; 54 | 55 | Schema schema = AvroSchemaBuilder.buildSchema(Arrays.asList(schemaMappingFields)); 56 | 57 | assertEquals(Type.RECORD, schema.getType()); 58 | assertEquals(2, schema.getFields().size()); 59 | 60 | Field f0 = schema.getFields().get(0); 61 | 62 | // cf1 nested record 63 | assertEquals(Type.RECORD, f0.schema().getType()); 64 | assertEquals(1, f0.schema().getFields().size()); 65 | 66 | // cf1.cq1 nested field 67 | Field f00 = f0.schema().getFields().get(0); 68 | 69 | // nullable long 70 | assertEquals(2, f00.schema().getTypes().size()); 71 | assertEquals(Type.NULL, f00.schema().getTypes().get(0).getType()); 72 | assertEquals(Type.LONG, f00.schema().getTypes().get(1).getType()); 73 | 74 | // cf2 top-level field 75 | Field f1 = schema.getFields().get(1); 76 | 77 | // nullable double 78 | assertEquals(2, f1.schema().getTypes().size()); 79 | assertEquals(Type.DOUBLE, f1.schema().getTypes().get(1).getType()); 80 | } 81 | 82 | @Test 83 | public void testTopLevelFields() throws IOException { 84 | SortedMap map = new TreeMap<>(); 85 | map.put(new Key("key1", "cf1", "cq1"), new Value(new LongLexicoder().encode(3L))); 86 | map.put(new Key("key1", "cf2", ""), new Value("abc")); 87 | 88 | map.put(new Key("key2", "cf2"), new Value("def")); 89 | 90 | SortedMapIterator parentIterator = new SortedMapIterator(map); 91 | AvroRowEncoderIterator iterator = new AvroRowEncoderIterator(); 92 | 93 | Map options = new HashMap<>(); 94 | options.put(AvroRowEncoderIterator.SCHEMA, 95 | "[{\"cf\":\"cf1\",\"cq\":\"cq1\",\"t\":\"long\",\"o\":true},{\"cf\":\"cf2\",\"t\":\"STRING\",\"o\":true}]"); 96 | 97 | iterator.init(parentIterator, options, new DefaultIteratorEnvironment()); 98 | iterator.seek(new Range(), AvroUtil.EMPTY_SET, false); 99 | 100 | RowBuilderField[] schemaMappingFields = new RowBuilderField[] { 101 | // row 0 102 | new RowBuilderField("cf1", "cq1", "long", "v0"), 103 | // row 1 104 | new RowBuilderField("cf2", null, "string", "v1") }; 105 | 106 | Schema schema = AvroSchemaBuilder.buildSchema(Arrays.asList(schemaMappingFields)); 107 | 108 | // ############################## ROW 1 109 | assertTrue(iterator.hasTop()); 110 
| assertEquals("key1", iterator.getTopKey().getRow().toString()); 111 | 112 | // validate value 113 | byte[] data = iterator.getTopValue().get(); 114 | 115 | GenericRecord record = AvroUtil.deserialize(data, schema); 116 | GenericRecord cf1Record = (GenericRecord) record.get("cf1"); 117 | 118 | assertEquals(3L, cf1Record.get("cq1")); 119 | assertEquals("abc", record.get("cf2").toString()); 120 | assertTrue(record.get("cf2") instanceof Utf8); 121 | 122 | // ############################## ROW 2 123 | iterator.next(); 124 | 125 | assertTrue(iterator.hasTop()); 126 | assertEquals("key2", iterator.getTopKey().getRow().toString()); 127 | 128 | // validate value 129 | data = iterator.getTopValue().get(); 130 | 131 | record = AvroUtil.deserialize(data, schema); 132 | cf1Record = (GenericRecord) record.get("cf1"); 133 | 134 | assertNull(cf1Record.get("cq1")); 135 | assertEquals("def", record.get("cf2").toString()); 136 | assertTrue(record.get("cf2") instanceof Utf8); 137 | 138 | // End of data 139 | iterator.next(); 140 | assertFalse(iterator.hasTop()); 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /connector/datasource/src/test/scala/com/microsoft/accumulo/VerifyAccumuloSchema.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.microsoft.accumulo 19 | 20 | import java.io.ByteArrayOutputStream 21 | 22 | import org.apache.avro.generic.{GenericRecord, GenericRecordBuilder} 23 | import org.apache.avro.io.EncoderFactory 24 | import org.apache.avro.specific.SpecificDatumWriter 25 | import org.apache.spark.sql.types.{DataTypes, StructField, StructType} 26 | 27 | import org.junit.runner.RunWith 28 | 29 | import org.scalatest.FunSuite 30 | import org.scalatest.junit.JUnitRunner 31 | 32 | @RunWith(classOf[JUnitRunner]) 33 | class VerifyAccumuloSchema extends FunSuite { 34 | test("Validate catalyst schema to json serialization") { 35 | val schema = (new StructType) 36 | .add(StructField("cf1", (new StructType) 37 | .add("cq1", DataTypes.StringType, true) 38 | .add("cq2", DataTypes.DoubleType, true) 39 | , true)) 40 | .add(StructField("cf2", (new StructType) 41 | .add("cq_a", DataTypes.IntegerType, true) 42 | .add("cq_b", DataTypes.FloatType, true) 43 | , true)) 44 | .add("cf3", DataTypes.StringType, false) 45 | 46 | val jsonActual = AvroUtil.catalystSchemaToJson(schema).json 47 | val jsonExpected = "[{\"cf\":\"cf1\",\"cq\":\"cq1\",\"fvn\":\"v0\",\"t\":\"STRING\",\"o\":true}" + 48 | ",{\"cf\":\"cf1\",\"cq\":\"cq2\",\"fvn\":\"v1\",\"t\":\"DOUBLE\",\"o\":true}" + 49 | ",{\"cf\":\"cf2\",\"cq\":\"cq_a\",\"fvn\":\"v2\",\"t\":\"INTEGER\",\"o\":true}" + 50 | ",{\"cf\":\"cf2\",\"cq\":\"cq_b\",\"fvn\":\"v3\",\"t\":\"FLOAT\",\"o\":true}" + 51 | ",{\"cf\":\"cf3\",\"fvn\":\"v4\",\"t\":\"STRING\",\"o\":true}]" 52 | 53 | assert(jsonActual == jsonExpected) 54 | } 55 | 56 | test("Validate catalyst schema to json serialization with pruned output schema") { 57 | val inputSchema = (new StructType) 58 | .add(StructField("cf1", (new StructType) 59 | .add("cq1", DataTypes.StringType, true) 60 | .add("cq2", DataTypes.DoubleType, true) 61 | , true)) 62 | .add(StructField("cf2", (new StructType) 63 | .add("cq_a", DataTypes.IntegerType, true) 64 | .add("cq_b", DataTypes.FloatType, true) 65 | , true)) 66 | .add("cf3", DataTypes.StringType, false) 67 | .add("cf4", DataTypes.LongType, false) 68 | 69 | val outputSchema = (new StructType) 70 | .add(StructField("cf1", (new StructType) 71 | .add("cq1", DataTypes.StringType, true) 72 | , true)) 73 | .add("cf3", DataTypes.StringType, false) 74 | 75 | val jsonActual = AvroUtil.catalystSchemaToJson(inputSchema, outputSchema).json 76 | val jsonExpected = "[{\"cf\":\"cf1\",\"cq\":\"cq1\",\"fvn\":\"v0\",\"t\":\"STRING\",\"o\":true}" + 77 | ",{\"cf\":\"cf1\",\"cq\":\"cq2\",\"fvn\":\"v1\",\"t\":\"DOUBLE\",\"o\":false}" + 78 | ",{\"cf\":\"cf2\",\"cq\":\"cq_a\",\"fvn\":\"v2\",\"t\":\"INTEGER\",\"o\":false}" + 79 | ",{\"cf\":\"cf2\",\"cq\":\"cq_b\",\"fvn\":\"v3\",\"t\":\"FLOAT\",\"o\":false}" + 80 | ",{\"cf\":\"cf3\",\"fvn\":\"v4\",\"t\":\"STRING\",\"o\":true}" + 81 | ",{\"cf\":\"cf4\",\"fvn\":\"v5\",\"t\":\"LONG\",\"o\":false}]" 82 | 83 | assert(jsonActual == jsonExpected) 84 | } 85 | 86 | test("Validate catalyst schema to avro serialization") { 87 | val schema = (new StructType) 88 | .add(StructField("cf1", (new StructType) 89 | .add("cq1", DataTypes.StringType, true) 90 | .add("cq2", DataTypes.DoubleType, false) 91 | .add("cq3", DataTypes.DoubleType, true) 92 | , true)) 93 | .add(StructField("cf2", (new StructType) 94 | .add("cq_a", DataTypes.IntegerType, true) 95 | .add("cq_b", DataTypes.FloatType, true) 96 | , true)) 97 | 98 | val avroSchema = AvroUtil.catalystSchemaToAvroSchema(schema) 99 | 100 | val builder = new GenericRecordBuilder(avroSchema) 101 | 102 | val builderCf1 = new 
GenericRecordBuilder(avroSchema.getField("cf1").schema()) 103 | val builderCf2 = new GenericRecordBuilder(avroSchema.getField("cf2").schema()) 104 | // check if clear() helps perf? 105 | 106 | builderCf1.set("cq1", "foo") 107 | builderCf1.set("cq2", 2.3) 108 | 109 | builderCf2.set("cq_a", 1) 110 | builderCf2.set("cq_b", 1.2f) 111 | 112 | builder.set("cf1", builderCf1.build()) 113 | builder.set("cf2", builderCf2.build()) 114 | 115 | val output = new ByteArrayOutputStream() 116 | val encoder = EncoderFactory.get.jsonEncoder(avroSchema, output) 117 | 118 | val writer = new SpecificDatumWriter[GenericRecord](avroSchema) 119 | writer.write(builder.build(), encoder) 120 | 121 | encoder.flush() 122 | 123 | val jsonActual = new String(output.toByteArray) 124 | 125 | val jsonExpected = "{\"cf1\":{\"cq1\":{\"string\":\"foo\"}," + 126 | "\"cq2\":2.3,\"cq3\":null}," + 127 | "\"cf2\":{\"cq_a\":{\"int\":1},\"cq_b\":{\"float\":1.2}}}" 128 | 129 | assert(jsonActual == jsonExpected) 130 | } 131 | 132 | test("Validate unsupported types") { 133 | val schema = (new StructType) 134 | .add("cf3", DataTypes.CalendarIntervalType, false) 135 | 136 | assertThrows[UnsupportedOperationException] { 137 | AvroUtil.catalystSchemaToAvroSchema(schema) 138 | } 139 | } 140 | } -------------------------------------------------------------------------------- /connector/datasource/src/main/scala/com/microsoft/accumulo/AccumuloDataWriter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.microsoft.accumulo 19 | 20 | import org.apache.accumulo.core.client.lexicoder._ 21 | import org.apache.accumulo.core.client.{Accumulo, BatchWriterConfig} 22 | import org.apache.accumulo.core.data.Mutation 23 | import org.apache.spark.sql.SaveMode 24 | import org.apache.spark.sql.catalyst.InternalRow 25 | import org.apache.spark.sql.sources.v2.writer.{DataWriter, WriterCommitMessage} 26 | import org.apache.spark.sql.types._ 27 | import org.apache.spark.unsafe.types.UTF8String 28 | 29 | class AccumuloDataWriter(tableName: String, 30 | schema: StructType, 31 | mode: SaveMode, 32 | properties: java.util.Properties, 33 | batchThread: Int, 34 | batchMemory: Long, 35 | ignore: Boolean) 36 | extends DataWriter[InternalRow] { 37 | 38 | private val rowKeyIdx = schema.fieldIndex(properties.getProperty("rowkey")) 39 | 40 | private val client = Accumulo.newClient().from(properties).build() 41 | 42 | private val batchWriter = client.createBatchWriter( 43 | tableName, 44 | new BatchWriterConfig().setMaxWriteThreads(batchThread).setMaxMemory(batchMemory)) 45 | 46 | private val doubleEncoder = new DoubleLexicoder 47 | private val floatEncoder = new FloatLexicoder 48 | private val longEncoder = new LongLexicoder 49 | private val intEncoder = new IntegerLexicoder 50 | private val stringEncoder = new StringLexicoder 51 | 52 | private val doubleAccessor = InternalRow.getAccessor(DoubleType) 53 | private val floatAccessor = InternalRow.getAccessor(FloatType) 54 | private val longAccessor = InternalRow.getAccessor(LongType) 55 | private val intAccessor = InternalRow.getAccessor(IntegerType) 56 | private val stringAccessor = InternalRow.getAccessor(StringType) 57 | 58 | private def getEncoder(fieldIdx: Int, field: StructField) = { 59 | field.dataType match { 60 | case DoubleType => (record: InternalRow) => doubleEncoder.encode(doubleAccessor(record, fieldIdx).asInstanceOf[Double]) 61 | case FloatType => (record: InternalRow) => floatEncoder.encode(floatAccessor(record, fieldIdx).asInstanceOf[Float]) 62 | case LongType => (record: InternalRow) => longEncoder.encode(longAccessor(record, fieldIdx).asInstanceOf[Long]) 63 | case IntegerType => (record: InternalRow) => intEncoder.encode(intAccessor(record, fieldIdx).asInstanceOf[Integer]) 64 | case StringType => (record: InternalRow) => { 65 | val obj = stringAccessor(record, fieldIdx) 66 | if (obj == null) null else obj.asInstanceOf[UTF8String].getBytes 67 | } 68 | } 69 | } 70 | 71 | private val structAccessor = InternalRow.getAccessor(new StructType()) 72 | 73 | // pre-compute which fields and how to create the mutations... 
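  // (Added sketch, not part of the original source.) Each schema field is compiled
  // once, up front, into a closure of type (Mutation, InternalRow) => Unit, and the
  // per-record write path then only walks the resulting array of closures. For a
  // hypothetical top-level long column "count" at schema index idx, the generated
  // closure is roughly:
  //   (m: Mutation, r: InternalRow) =>
  //     m.put(stringEncoder.encode("count"), Array.empty[Byte],
  //       longEncoder.encode(longAccessor(r, idx).asInstanceOf[Long]))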
74 | private val recordToMutation = schema.fields.zipWithIndex 75 | // exclude rowkey 76 | .filter({ case (_, cfIdx: Int) => cfIdx != rowKeyIdx }) 77 | // loop through the rest of the fields 78 | .map { case (cf: StructField, cfIdx: Int) => 79 | // check which types we have top-level 80 | cf.dataType match { 81 | case ct: StructType => 82 | val nestedFields = ct.fields.zipWithIndex.map { 83 | case (cq: StructField, cqIdx) => 84 | val cfBytes = stringEncoder.encode(cf.name) 85 | val cqBytes = stringEncoder.encode(cq.name) 86 | val encoder = getEncoder(cqIdx, cq) 87 | 88 | (mutation: Mutation, nestedRecord: InternalRow) => { 89 | // not using the fluent interface to provide backward compat 90 | val value = encoder(nestedRecord) 91 | if (value != null) 92 | mutation.put(cfBytes, cqBytes, value) 93 | } 94 | } 95 | 96 | // parent function 97 | (mutation: Mutation, record: InternalRow) => { 98 | val nestedRecord = structAccessor(record, cfIdx).asInstanceOf[InternalRow] 99 | 100 | nestedFields.foreach { _(mutation, nestedRecord) } 101 | } 102 | case _ => 103 | val cfBytes = stringEncoder.encode(cf.name) 104 | val encoder = getEncoder(cfIdx, cf) 105 | 106 | (mutation: Mutation, record: InternalRow) => { 107 | // println(s"\twriting row ${cf.name}") 108 | 109 | // not using the fluent interface to provide backward compatibility 110 | val value = encoder(record) 111 | if (value != null) 112 | mutation.put(cfBytes, Array.empty[Byte], value) 113 | } 114 | } 115 | } 116 | 117 | // TODO: expose this as another input column 118 | // private val columnVisibilityEmpty = new ColumnVisibility 119 | 120 | def write(record: InternalRow): Unit = { 121 | val rowKeyRaw = stringAccessor(record, rowKeyIdx) 122 | 123 | // skip if the rowKey is null or ignore flag is set 124 | if (rowKeyRaw != null && !ignore) { 125 | val rowKey = rowKeyRaw.asInstanceOf[UTF8String].getBytes 126 | 127 | val mutation = new Mutation(rowKey) 128 | recordToMutation.foreach { _(mutation, record) } 129 | batchWriter.addMutation(mutation) 130 | } 131 | } 132 | 133 | def commit(): WriterCommitMessage = { 134 | batchWriter.close() 135 | client.close() 136 | 137 | WriteSucceeded 138 | } 139 | 140 | def abort(): Unit = { 141 | batchWriter.close() 142 | client.close() 143 | } 144 | 145 | object WriteSucceeded extends WriterCommitMessage 146 | } 147 | -------------------------------------------------------------------------------- /connector/zipfs/src/main/java/com/microsoft/accumulo/zipfs/ZipFileAttributeView.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved. 3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 | * 5 | * This code is free software; you can redistribute it and/or modify it 6 | * under the terms of the GNU General Public License version 2 only, as 7 | * published by the Free Software Foundation. Oracle designates this 8 | * particular file as subject to the "Classpath" exception as provided 9 | * by Oracle in the LICENSE file that accompanied this code. 10 | * 11 | * This code is distributed in the hope that it will be useful, but WITHOUT 12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 | * version 2 for more details (a copy is included in the LICENSE file that 15 | * accompanied this code). 
16 | * 17 | * You should have received a copy of the GNU General Public License version 18 | * 2 along with this work; if not, write to the Free Software Foundation, 19 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 | * 21 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 | * or visit www.oracle.com if you need additional information or have any 23 | * questions. 24 | */ 25 | 26 | package com.microsoft.accumulo.zipfs; 27 | 28 | import java.io.IOException; 29 | import java.nio.file.attribute.BasicFileAttributeView; 30 | import java.nio.file.attribute.BasicFileAttributes; 31 | import java.nio.file.attribute.FileTime; 32 | import java.nio.file.attribute.PosixFilePermission; 33 | import java.util.LinkedHashMap; 34 | import java.util.Map; 35 | import java.util.Set; 36 | 37 | /** 38 | * @author Xueming Shen, Rajendra Gutupalli, Jaya Hangal 39 | */ 40 | class ZipFileAttributeView implements BasicFileAttributeView { 41 | static enum AttrID { 42 | size, 43 | creationTime, 44 | lastAccessTime, 45 | lastModifiedTime, 46 | isDirectory, 47 | isRegularFile, 48 | isSymbolicLink, 49 | isOther, 50 | fileKey, 51 | compressedSize, 52 | crc, 53 | method, 54 | owner, 55 | group, 56 | permissions 57 | } 58 | 59 | final ZipPath path; 60 | private final boolean isZipView; 61 | 62 | ZipFileAttributeView(ZipPath path, boolean isZipView) { 63 | this.path = path; 64 | this.isZipView = isZipView; 65 | } 66 | 67 | @Override 68 | public String name() { 69 | return isZipView ? "zip" : "basic"; 70 | } 71 | 72 | @Override 73 | public BasicFileAttributes readAttributes() throws IOException { 74 | return path.readAttributes(); 75 | } 76 | 77 | @Override 78 | public void setTimes(FileTime lastModifiedTime, 79 | FileTime lastAccessTime, 80 | FileTime createTime) 81 | throws IOException 82 | { 83 | path.setTimes(lastModifiedTime, lastAccessTime, createTime); 84 | } 85 | 86 | public void setPermissions(Set perms) throws IOException { 87 | path.setPermissions(perms); 88 | } 89 | 90 | @SuppressWarnings("unchecked") 91 | void setAttribute(String attribute, Object value) 92 | throws IOException 93 | { 94 | try { 95 | if (AttrID.valueOf(attribute) == AttrID.lastModifiedTime) 96 | setTimes((FileTime)value, null, null); 97 | if (AttrID.valueOf(attribute) == AttrID.lastAccessTime) 98 | setTimes(null, (FileTime)value, null); 99 | if (AttrID.valueOf(attribute) == AttrID.creationTime) 100 | setTimes(null, null, (FileTime)value); 101 | if (AttrID.valueOf(attribute) == AttrID.permissions) 102 | setPermissions((Set)value); 103 | } catch (IllegalArgumentException x) { 104 | throw new UnsupportedOperationException("'" + attribute + 105 | "' is unknown or read-only attribute"); 106 | } 107 | } 108 | 109 | Map readAttributes(String attributes) 110 | throws IOException 111 | { 112 | ZipFileAttributes zfas = (ZipFileAttributes)readAttributes(); 113 | LinkedHashMap map = new LinkedHashMap<>(); 114 | if ("*".equals(attributes)) { 115 | for (AttrID id : AttrID.values()) { 116 | try { 117 | map.put(id.name(), attribute(id, zfas)); 118 | } catch (IllegalArgumentException x) {} 119 | } 120 | } else { 121 | String[] as = attributes.split(","); 122 | for (String a : as) { 123 | try { 124 | map.put(a, attribute(AttrID.valueOf(a), zfas)); 125 | } catch (IllegalArgumentException x) {} 126 | } 127 | } 128 | return map; 129 | } 130 | 131 | Object attribute(AttrID id, ZipFileAttributes zfas) { 132 | switch (id) { 133 | case size: 134 | return zfas.size(); 135 | case creationTime: 136 | return 
zfas.creationTime(); 137 | case lastAccessTime: 138 | return zfas.lastAccessTime(); 139 | case lastModifiedTime: 140 | return zfas.lastModifiedTime(); 141 | case isDirectory: 142 | return zfas.isDirectory(); 143 | case isRegularFile: 144 | return zfas.isRegularFile(); 145 | case isSymbolicLink: 146 | return zfas.isSymbolicLink(); 147 | case isOther: 148 | return zfas.isOther(); 149 | case fileKey: 150 | return zfas.fileKey(); 151 | case compressedSize: 152 | if (isZipView) 153 | return zfas.compressedSize(); 154 | break; 155 | case crc: 156 | if (isZipView) 157 | return zfas.crc(); 158 | break; 159 | case method: 160 | if (isZipView) 161 | return zfas.method(); 162 | break; 163 | case permissions: 164 | if (isZipView) { 165 | return zfas.storedPermissions().orElse(null); 166 | } 167 | break; 168 | default: 169 | break; 170 | } 171 | return null; 172 | } 173 | } 174 | -------------------------------------------------------------------------------- /connector/iterator/src/main/java/com/microsoft/accumulo/spark/record/AvroFastRecord.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo.spark.record; 19 | 20 | import java.util.HashMap; 21 | import java.util.Map; 22 | import java.util.stream.Collectors; 23 | 24 | import org.apache.accumulo.core.client.lexicoder.Encoder; 25 | import org.apache.accumulo.core.data.ArrayByteSequence; 26 | import org.apache.accumulo.core.data.ByteSequence; 27 | import com.microsoft.accumulo.spark.juel.AvroUtf8Wrapper; 28 | import org.apache.avro.Schema; 29 | import org.apache.avro.Schema.Field; 30 | import org.apache.avro.Schema.Type; 31 | import org.apache.avro.generic.GenericContainer; 32 | import org.apache.avro.generic.IndexedRecord; 33 | 34 | /** 35 | * This class collects all cells of interest into an AVRO Generic Record. 36 | * 37 | * Cells with non-empty column family and column qualifier are stored in nested 38 | * AVRO records. Cells with empty column qualifier are stored in the top-level 39 | * record. 40 | * 41 | * Example: 42 | * 43 | *
 44 |  * cf1, cq1,  abc
 45 |  * cf1, cq2,  3.2
 46 |  * cf2, null, 6
 47 |  * cf3, cq3,  def
 48 |  * 
49 | * 50 | * Avro Record: 51 | * 52 | *
 53 |  * { 
 54 |  * 	 cf1: { cq1: "abc", cq2: 3.2 }, 
 55 |  * 	 cf2: 6, 
 56 |  *   cf3: { cq3: "def" }
 57 |  * }
 58 |  * 
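 * Note that the row key itself is not stored in the record; it travels as the
 * iterator's top key. A rough usage sketch (added for illustration; the
 * per-cell dispatch is an assumption about the caller, not code in this file):
 *
 *   AvroFastRecord record = new AvroFastRecord(schema);
 *   Map<ByteSequence, Map<ByteSequence, RowBuilderCellConsumer>> lookup =
 *       AvroFastRecord.createCellToFieldMap(record);
 *   record.clear(); // reset field values between rows
 *   // for each cell of the row: resolve the consumer via column family and
 *   // column qualifier in lookup and hand it the cell's key and value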
59 | */ 60 | public class AvroFastRecord implements GenericContainer, IndexedRecord { 61 | 62 | private static ByteSequence EMPTY_SEQUENCE = new ArrayByteSequence(new byte[0]); 63 | 64 | /** 65 | * The Avro schema. 66 | */ 67 | private Schema schema; 68 | 69 | /** 70 | * The data array. 71 | */ 72 | private Object[] values; 73 | 74 | /** 75 | * The nested records. 76 | */ 77 | private AvroFastRecord[] nestedFields; 78 | 79 | /** 80 | * The primitive field indices for fast clearing. 81 | */ 82 | private int[] primitiveFields; 83 | 84 | public AvroFastRecord(Schema schema) { 85 | this.schema = schema; 86 | this.values = new Object[schema.getFields().size()]; 87 | 88 | // find all nested record fields 89 | this.nestedFields = schema.getFields().stream().filter(f -> f.schema().getType() == Type.RECORD).map(f -> { 90 | AvroFastRecord rec = new AvroFastRecord(f.schema()); 91 | this.values[f.pos()] = rec; 92 | return rec; 93 | }).toArray(AvroFastRecord[]::new); 94 | 95 | // find all primitive fields 96 | this.primitiveFields = schema.getFields().stream().filter(f -> f.schema().getType() != Type.RECORD) 97 | .mapToInt(Field::pos).toArray(); 98 | } 99 | 100 | /** 101 | * Clears all primitive fields (including those of nested records). 102 | */ 103 | public void clear() { 104 | for (int idx : this.primitiveFields) 105 | this.values[idx] = null; 106 | 107 | for (AvroFastRecord rec : this.nestedFields) 108 | rec.clear(); 109 | } 110 | 111 | @Override 112 | public void put(int i, Object v) { 113 | this.values[i] = v; 114 | } 115 | 116 | @Override 117 | public Object get(int i) { 118 | return this.values[i]; 119 | } 120 | 121 | @Override 122 | public Schema getSchema() { 123 | return this.schema; 124 | } 125 | 126 | /** 127 | * Create the core lookup map for column family/column qualifier. The leaf 128 | * nodes are consumers that know which record/field to target. 129 | * 130 | * @param rootRecord the root Avro record. 131 | * @return the lookup map. 132 | */ 133 | public static Map<ByteSequence, Map<ByteSequence, RowBuilderCellConsumer>> createCellToFieldMap( 134 | AvroFastRecord rootRecord) { 135 | Map<ByteSequence, Map<ByteSequence, RowBuilderCellConsumer>> map = new HashMap<>(); 136 | 137 | // set up the cell consumer lookup for each column family 138 | for (Field field : rootRecord.getSchema().getFields()) { 139 | Schema nestedSchema = field.schema(); 140 | 141 | ByteSequence columnFamily = new ArrayByteSequence(field.name()); 142 | 143 | // top-level field 144 | if (nestedSchema.getType() != Type.RECORD) { 145 | // would be Map.of(...) on newer JDKs 146 | Map<ByteSequence, RowBuilderCellConsumer> subMap = new HashMap<>(); 147 | subMap.put(EMPTY_SEQUENCE, createAvroCellConsumer(rootRecord, field)); 148 | 149 | map.put(columnFamily, subMap); 150 | 151 | continue; 152 | } 153 | 154 | // nested fields 155 | Map<ByteSequence, RowBuilderCellConsumer> nestedLookupMap = nestedSchema.getFields().stream() 156 | .collect(Collectors.toMap( 157 | // nested name as key 158 | nestedField -> new ArrayByteSequence(nestedField.name()), 159 | // assign cells to field in nested record 160 | nestedField -> createAvroCellConsumer((AvroFastRecord) rootRecord.get(field.pos()), nestedField))); 161 | 162 | map.put(columnFamily, nestedLookupMap); 163 | } 164 | 165 | return map; 166 | } 167 | 168 | /** 169 | * Creates a consumer of cells that copies the data into the corresponding Avro 170 | * record fields. 171 | * 172 | * @param record The record to populate. 173 | * @param field The field to populate. 174 | * @return The closure holding things together. 
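 *
 *         For example (added illustration, restating the method body): for a
 *         STRING field the returned consumer is essentially
 *             (key, value) -> record.put(pos, new AvroUtf8Wrapper(value.get()))
 *         while every other type first decodes the cell bytes with the encoder
 *         registered for its RowBuilderType.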
175 | */ 176 | private static RowBuilderCellConsumer createAvroCellConsumer(AvroFastRecord record, Field field) { 177 | int pos = field.pos(); 178 | 179 | if (field.schema().getType() == Type.STRING) 180 | // avoid byte array copying 181 | return (key, value) -> record.put(pos, new AvroUtf8Wrapper(value.get())); 182 | 183 | // get the fitting encoder 184 | Encoder encoder = RowBuilderType.valueOf(field.getProp(AvroSchemaBuilder.PROPERTY_ROWBUILDERTYPE)).getEncoder(); 185 | return (key, value) -> record.put(pos, encoder.decode(value.get())); 186 | } 187 | } 188 | -------------------------------------------------------------------------------- /connector/datasource/src/main/scala/com/microsoft/accumulo/AvroUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo 19 | 20 | import org.apache.avro.{Schema, SchemaBuilder} 21 | import org.apache.spark.sql.types.{DataType, DataTypes, StructField, StructType} 22 | import org.codehaus.jackson.map.ObjectMapper 23 | import org.codehaus.jackson.map.annotate.JsonSerialize.Inclusion 24 | 25 | import scala.beans.BeanProperty 26 | 27 | // keeping the property names short to not hit any limits 28 | case class RowBuilderField(@BeanProperty cf: String, // column family 29 | @BeanProperty cq: String, // column qualifier 30 | @BeanProperty fvn: String, // filter variable name 31 | @BeanProperty t: String, // type 32 | @BeanProperty o: Boolean // output 33 | ) 34 | 35 | case class JsonSchema(json: String, attributeToVariableMapping: Map[String, String]) 36 | 37 | @SerialVersionUID(1L) 38 | object AvroUtil { 39 | def catalystSchemaToJson(inputSchema: StructType): JsonSchema = catalystSchemaToJson(inputSchema, inputSchema) 40 | 41 | def catalystSchemaToJson(inputSchema: StructType, outputSchema: StructType): JsonSchema = { 42 | 43 | var attributeToVariableMapping = scala.collection.mutable.Map[String, String]() 44 | 45 | var i = 0 46 | val selectedFields = inputSchema.fields.flatMap(cf => { 47 | val outputField = outputSchema.find(f => f.name == cf.name) 48 | 49 | cf.dataType match { 50 | case cft: StructType => cft.fields.map(cq => 51 | RowBuilderField( 52 | cf.name, 53 | cq.name, 54 | { 55 | val variableName = s"v$i" 56 | attributeToVariableMapping += (s"${cf.name}.${cq.name}" -> variableName) 57 | i += 1 58 | 59 | variableName 60 | }, 61 | // TODO: toUpperCase() is weird... 
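                    // (added note) typeName yields lower-case names such as
                    // "string" or "double"; the iterator tests feed both "long"
                    // and "STRING", so the iterator side evidently resolves
                    // RowBuilderType names case-insensitively and the
                    // upper-casing here is cosmetic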
62 | cq.dataType.typeName.toUpperCase, 63 | // either the column family is not needed -> output = false 64 | // otherwise we need to check if the column qualifier is present in the output list 65 | if (outputField.isEmpty) false else outputField.get.dataType.asInstanceOf[StructType].exists(f => f.name == cq.name) 66 | ) 67 | ) 68 | case _: DataType => Seq(RowBuilderField( 69 | cf.name, 70 | null, 71 | { 72 | val variableName = s"v$i" 73 | attributeToVariableMapping += (s"${cf.name}" -> variableName) 74 | i += 1 75 | 76 | variableName 77 | }, 78 | // TODO: toUpperCase() is weird... 79 | cf.dataType.typeName.toUpperCase, 80 | outputField.isDefined 81 | )) 82 | } 83 | }) 84 | 85 | try { 86 | val mapper = new ObjectMapper() 87 | 88 | // disable serialization of null-values 89 | mapper.setSerializationInclusion(Inclusion.NON_NULL) 90 | 91 | JsonSchema(mapper.writeValueAsString(selectedFields), attributeToVariableMapping.toMap) 92 | } catch { 93 | case e: Exception => 94 | throw new IllegalArgumentException(e) 95 | } 96 | } 97 | 98 | implicit class CatalystSchemaToAvroRecordBuilder(builder: SchemaBuilder.FieldAssembler[Schema]) { 99 | def addAvroRecordField(field: StructField): SchemaBuilder.FieldAssembler[Schema] = { 100 | (field.dataType, field.nullable) match { 101 | case (DataTypes.BinaryType, true) => builder.optionalBytes(field.name) 102 | case (DataTypes.BinaryType, false) => builder.requiredBytes(field.name) 103 | case (DataTypes.BooleanType, true) => builder.optionalBoolean(field.name) 104 | case (DataTypes.BooleanType, false) => builder.requiredBoolean(field.name) 105 | case (DataTypes.DoubleType, true) => builder.optionalDouble(field.name) 106 | case (DataTypes.DoubleType, false) => builder.requiredDouble(field.name) 107 | case (DataTypes.FloatType, true) => builder.optionalFloat(field.name) 108 | case (DataTypes.FloatType, false) => builder.requiredFloat(field.name) 109 | case (DataTypes.IntegerType, true) => builder.optionalInt(field.name) 110 | case (DataTypes.IntegerType, false) => builder.requiredInt(field.name) 111 | case (DataTypes.LongType, true) => builder.optionalLong(field.name) 112 | case (DataTypes.LongType, false) => builder.requiredLong(field.name) 113 | case (DataTypes.StringType, true) => builder.optionalString(field.name) 114 | case (DataTypes.StringType, false) => builder.requiredString(field.name) 115 | // TODO: date/time support? 
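        // (added note) any other Catalyst type, e.g. CalendarIntervalType as
        // exercised in VerifyAccumuloSchema, falls through to the failure below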
116 | case _ => throw new UnsupportedOperationException(s"Unsupported type: ${field.dataType}") 117 | } 118 | } 119 | 120 | def addAvroRecordFields(schema: StructType): SchemaBuilder.FieldAssembler[Schema] = { 121 | schema.fields.foldLeft(builder) { (builder, field) => builder.addAvroRecordField(field) } 122 | } 123 | } 124 | 125 | def catalystSchemaToAvroSchema(schema: StructType): Schema = { 126 | val fieldBuilder = SchemaBuilder.record("root") 127 | .fields() 128 | 129 | schema.fields.foldLeft(fieldBuilder) { (_, field) => 130 | field.dataType match { 131 | // nested fields 132 | case cft: StructType => 133 | fieldBuilder 134 | .name(field.name) 135 | .`type`(SchemaBuilder 136 | .record(field.name) 137 | .fields 138 | .addAvroRecordFields(cft) 139 | .endRecord()) 140 | .noDefault() 141 | // top level fields 142 | case _ => fieldBuilder.addAvroRecordField(field) 143 | } 144 | } 145 | .endRecord() 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /connector/iterator/src/main/java/com/microsoft/accumulo/spark/record/AvroSchemaBuilder.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.microsoft.accumulo.spark.record; 19 | 20 | import java.util.Collection; 21 | 22 | import org.apache.avro.Schema; 23 | import org.apache.avro.SchemaBuilder; 24 | 25 | /** 26 | * Builds the AVRO Schema from the user-supplied JSON encoded schema. 
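 *
 * For example, the JSON schema used in the iterator tests,
 *
 *   [{"cf":"cf1","cq":"cq1","t":"STRING","o":true}]
 *
 * yields a record "root" with a nested record "cf1" holding an optional string
 * field "cq1"; an entry without "cq" becomes a top-level field of "root".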
27 | */ 28 | public class AvroSchemaBuilder { 29 | public static final String PROPERTY_ROWBUILDERTYPE = "rowBuilderType"; 30 | 31 | public static final String PROPERTY_OUTPUT = "output"; 32 | 33 | private static SchemaBuilder.FieldAssembler<Schema> addAvroField(SchemaBuilder.FieldAssembler<Schema> builder, 34 | RowBuilderField field, String name) { 35 | 36 | RowBuilderType type = field.getRowBuilderType(); 37 | 38 | SchemaBuilder.FieldBuilder<Schema> fieldBuilder = builder 39 | // configure the field name 40 | .name(name); 41 | 42 | // pass in alias 43 | if (field.getFilterVariableName() != null && field.getFilterVariableName().length() > 0) 44 | fieldBuilder = fieldBuilder.aliases(field.getFilterVariableName()); 45 | 46 | SchemaBuilder.FieldTypeBuilder<Schema> intermediate = fieldBuilder 47 | // encode rowBuilderType so we can operate on the schema alone 48 | .prop(PROPERTY_ROWBUILDERTYPE, type.name()) 49 | // encode if this is an output field 50 | .prop(PROPERTY_OUTPUT, Boolean.toString(field.isOutput())) 51 | // begin type selection; nullable fields become optional below 52 | .type(); 53 | 54 | if (field.isNullable()) { 55 | SchemaBuilder.BaseTypeBuilder optionalType = intermediate.optional(); 56 | switch (type) { 57 | case String: 58 | return optionalType.stringType(); 59 | case Long: 60 | return optionalType.longType(); 61 | case Integer: 62 | return optionalType.intType(); 63 | case Double: 64 | return optionalType.doubleType(); 65 | case Float: 66 | return optionalType.floatType(); 67 | case Boolean: 68 | return optionalType.booleanType(); 69 | case Bytes: 70 | return optionalType.bytesType(); 71 | default: 72 | throw new IllegalArgumentException("Unsupported type '" + type + "'"); 73 | } 74 | } else { 75 | switch (type) { 76 | case String: 77 | return intermediate.stringType().noDefault(); 78 | case Long: 79 | return intermediate.longType().noDefault(); 80 | case Integer: 81 | return intermediate.intType().noDefault(); 82 | case Double: 83 | return intermediate.doubleType().noDefault(); 84 | case Float: 85 | return intermediate.floatType().noDefault(); 86 | case Boolean: 87 | return intermediate.booleanType().noDefault(); 88 | case Bytes: 89 | return intermediate.bytesType().noDefault(); 90 | default: 91 | throw new IllegalArgumentException("Unsupported type '" + type + "'"); 92 | } 93 | } 94 | } 95 | 96 | private static SchemaBuilder.FieldAssembler<Schema> closeFieldAssembler( 97 | SchemaBuilder.FieldAssembler<Schema> rootAssembler, SchemaBuilder.FieldAssembler<Schema> columnFieldsAssembler, 98 | String columnFamily, boolean output) { 99 | 100 | if (columnFieldsAssembler == null) 101 | return rootAssembler; 102 | 103 | // add nested type to root assembler 104 | return rootAssembler 105 | // name the record field 106 | .name(columnFamily) 107 | // any of the column sub fields need to be output? 
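      // (added note) "output" is the OR of the "o" flags of this family's
      // column qualifiers, accumulated in buildSchema below, so the family
      // record is serialized if any of its children is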
108 |         .prop(PROPERTY_OUTPUT, Boolean.toString(output))
109 |         // it's a record type
110 |         .type(columnFieldsAssembler.endRecord()).noDefault();
111 |   }
112 | 
113 |   public static Schema buildSchema(Collection<RowBuilderField> schemaFields) {
114 |     // construct schema
115 |     SchemaBuilder.FieldAssembler<Schema> rootAssembler = SchemaBuilder.record("root").fields();
116 | 
117 |     // note that the order needs to be exactly in-sync with the avro schema
118 |     // generated on the MMLSpark/Scala side
119 |     String lastColumnFamily = null;
120 |     SchemaBuilder.FieldAssembler<Schema> columnFieldsAssembler = null;
121 |     boolean output = false;
122 |     for (RowBuilderField schemaField : schemaFields) {
123 | 
124 |       String columnFamily = schemaField.getColumnFamily();
125 |       String columnQualifier = schemaField.getColumnQualifier();
126 | 
127 |       if (columnQualifier != null) {
128 |         if (lastColumnFamily == null || !lastColumnFamily.equals(columnFamily)) {
129 | 
130 |           // close previous record
131 |           rootAssembler = closeFieldAssembler(rootAssembler, columnFieldsAssembler, lastColumnFamily, output);
132 | 
133 |           // open new record
134 |           columnFieldsAssembler = SchemaBuilder.record(columnFamily).fields();
135 | 
136 |           output = false;
137 |         }
138 | 
139 |         // true if any of the column qualifiers is an output field
140 |         output |= (boolean) schemaField.isOutput();
141 | 
142 |         // add the current field
143 |         columnFieldsAssembler = addAvroField(columnFieldsAssembler, schemaField, columnQualifier);
144 |       } else {
145 |         // close previous record
146 |         rootAssembler = closeFieldAssembler(rootAssembler, columnFieldsAssembler, lastColumnFamily, output);
147 |         columnFieldsAssembler = null;
148 |         output = false;
149 | 
150 |         // add the top-level field
151 |         rootAssembler = addAvroField(rootAssembler, schemaField, columnFamily);
152 |       }
153 | 
154 |       lastColumnFamily = columnFamily;
155 |     }
156 | 
157 |     rootAssembler = closeFieldAssembler(rootAssembler, columnFieldsAssembler, lastColumnFamily, output);
158 | 
159 |     // close the root record and build the schema
160 |     return rootAssembler.endRecord();
161 |   }
162 | }
163 | 
--------------------------------------------------------------------------------
/connector/datasource/src/main/scala/com/microsoft/accumulo/AccumuloInputPartitionReader.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  * http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 | */ 17 | 18 | package com.microsoft.accumulo 19 | 20 | import java.io.IOException 21 | 22 | import org.apache.accumulo.core.client.{Accumulo, IteratorSetting} 23 | import org.apache.accumulo.core.data.{Key, Range} 24 | import org.apache.accumulo.core.security.Authorizations 25 | import org.apache.avro.generic.GenericRecord 26 | import org.apache.avro.io.{BinaryDecoder, DecoderFactory} 27 | import org.apache.avro.specific.SpecificDatumReader 28 | import org.apache.hadoop.io.Text 29 | import org.apache.log4j.Logger 30 | import org.apache.spark.sql.avro.AvroDeserializer 31 | import org.apache.spark.sql.catalyst.InternalRow 32 | import org.apache.spark.sql.sources.v2.reader.InputPartitionReader 33 | import org.apache.spark.sql.types.StructType 34 | import org.apache.spark.unsafe.types.UTF8String 35 | import scala.collection.JavaConverters._ 36 | 37 | @SerialVersionUID(1L) 38 | class AccumuloInputPartitionReader(tableName: String, 39 | ranges: Seq[Seq[Array[Byte]]], 40 | inputSchema: StructType, 41 | outputSchema: StructType, 42 | properties: java.util.Properties, 43 | rowKeyColumn: String, 44 | filterInJuel: Option[String]) 45 | extends InputPartitionReader[InternalRow] with Serializable { 46 | 47 | private val logger = Logger.getLogger(classOf[AccumuloInputPartitionReader]) 48 | 49 | val defaultPriority = "20" 50 | val defaultNumQueryThreads: String = math.min(16, ranges.length).toString 51 | 52 | private val priority = Integer.valueOf(properties.getProperty("priority", defaultPriority)) 53 | // this parameter is impacted by number of accumulo splits and spark partitions and executors 54 | private val numQueryThreads = Integer.valueOf(properties.getProperty("numQueryThreads", defaultNumQueryThreads)) 55 | 56 | private val authorizations = new Authorizations() 57 | private val client = Accumulo.newClient().from(properties).build() 58 | private val scanner = client.createBatchScanner(tableName, authorizations, numQueryThreads) 59 | 60 | private def createRange(start: Array[Byte], stop: Array[Byte]) = 61 | new Range( 62 | if (start.length == 0) null else new Key(start), 63 | start.length == 0, 64 | if (stop.length == 0) null else new Key(stop), 65 | true) 66 | 67 | scanner.setRanges(ranges.map(t => createRange(t(0), t(1))).asJava) 68 | 69 | private val avroIterator = new IteratorSetting( 70 | priority, 71 | "AVRO", 72 | "com.microsoft.accumulo.spark.AvroRowEncoderIterator") 73 | 74 | // only fetch column families we care for (and don't filter for the mleapFields which are artificially added later) 75 | inputSchema.fields.foreach(f => scanner.fetchColumnFamily(f.name)) 76 | 77 | private val rowKeyColumnIndex = { 78 | if (outputSchema.fieldNames.contains(rowKeyColumn)) 79 | outputSchema.fieldIndex(rowKeyColumn) 80 | else 81 | -1 82 | } 83 | 84 | // AVRO Iterator setup 85 | val jsonSchema: String = AvroUtil.catalystSchemaToJson(inputSchema, outputSchema).json 86 | 87 | logger.info(s"JSON schema: $jsonSchema") 88 | avroIterator.addOption("schema", jsonSchema) 89 | if (filterInJuel.isDefined) 90 | avroIterator.addOption("filter", filterInJuel.get) 91 | 92 | // list of output columns 93 | // val prunedColumns = schema.map(_.name).mkString(",") 94 | // logger.info(s"Pruned columns: ${prunedColumns}") 95 | // avroIterator.addOption("prunedcolumns", prunedColumns) 96 | 97 | // forward options 98 | Seq("mleap", "mleapfilter", "mleapguid", "exceptionlogfile") 99 | .foreach { key => avroIterator.addOption(key, properties.getProperty(key, "")) } 100 | 101 | scanner.addScanIterator(avroIterator) 102 | 103 
| // TODO: support additional user-supplied iterators
104 |   private val scannerIterator = scanner.iterator()
105 | 
106 |   // filter out row-key target from schema generation
107 |   private val schemaWithoutRowKey = new StructType(outputSchema.fields.filter(_.name != rowKeyColumn))
108 | 
109 |   // the serialized AVRO does not contain the row key as it comes with the key/value pair anyway
110 |   private val avroSchema = AvroUtil.catalystSchemaToAvroSchema(schemaWithoutRowKey)
111 | 
112 |   // pass the schema for the avro input along with the target output schema (incl. row key)
113 |   private val deserializer = new AvroDeserializer(avroSchema, outputSchema)
114 |   private val reader = new SpecificDatumReader[GenericRecord](avroSchema)
115 | 
116 |   private var decoder: BinaryDecoder = _
117 |   private var currentRow: InternalRow = _
118 |   private var datum: GenericRecord = _
119 | 
120 |   private val rowKeyText = new Text()
121 | 
122 |   override def close(): Unit = {
123 |     if (scanner != null)
124 |       scanner.close()
125 | 
126 |     if (client != null)
127 |       client.close()
128 |   }
129 | 
130 |   @throws[IOException]
131 |   override def next: Boolean = {
132 |     if (scannerIterator.hasNext) {
133 |       val entry = scannerIterator.next
134 |       val data = entry.getValue.get
135 | 
136 |       // byte[] -> avro
137 |       decoder = DecoderFactory.get.binaryDecoder(data, decoder)
138 |       datum = reader.read(datum, decoder)
139 | 
140 |       // avro -> catalyst
141 |       currentRow = deserializer.deserialize(datum).asInstanceOf[InternalRow]
142 | 
143 |       if (rowKeyColumnIndex >= 0) {
144 |         // move row key id into internalrow
145 |         entry.getKey.getRow(rowKeyText)
146 | 
147 |         // avoid yet another byte array copy...
148 |         val str = UTF8String.fromBytes(rowKeyText.getBytes, 0, rowKeyText.getLength)
149 |         currentRow.update(rowKeyColumnIndex, str)
150 |       }
151 | 
152 |       true
153 |     } else {
154 |       false
155 |     }
156 |   }
157 | 
158 |   override def get(): InternalRow = currentRow
159 | }
--------------------------------------------------------------------------------
/connector/iterator/src/main/java/com/microsoft/accumulo/spark/processors/AvroRowComputedColumns.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  * http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 | */ 17 | 18 | package com.microsoft.accumulo.spark.processors; 19 | 20 | import java.io.IOException; 21 | import java.util.ArrayList; 22 | import java.util.Collection; 23 | import java.util.List; 24 | import java.util.Map; 25 | import java.util.stream.Collectors; 26 | 27 | import javax.el.ExpressionFactory; 28 | import javax.el.ValueExpression; 29 | 30 | import com.microsoft.accumulo.spark.juel.AvroELContext; 31 | import com.microsoft.accumulo.spark.record.RowBuilderField; 32 | import com.microsoft.accumulo.spark.record.RowBuilderType; 33 | import org.apache.avro.Schema; 34 | import org.apache.avro.Schema.Field; 35 | import org.apache.avro.generic.IndexedRecord; 36 | import org.apache.hadoop.io.Text; 37 | 38 | /** 39 | * Holds all computed columns.
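 * For example, the (hypothetical) option "column.doubled.double" with the
 * value "${cf1.cq1 * 2}" would add a computed double column named "doubled".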
40 |  * Note: setup happens in two phases: we first parse the options to figure out
41 |  * which additional columns exist, return to the caller so the AVRO schema can
42 |  * be set up, and then continue the setup here.
43 |  */
44 | public class AvroRowComputedColumns extends AvroRowConsumer {
45 |   public static final String COLUMN_PREFIX = "column.";
46 | 
47 |   /**
48 |    * Required for copy.
49 |    */
50 |   private Schema schema;
51 | 
52 |   /**
53 |    * JUEL expression context exposing AVRO GenericRecord
54 |    */
55 |   private AvroELContext expressionContext;
56 | 
57 |   /**
58 |    * Definitions created from user-supplied options.
59 |    */
60 |   private List<ExpressionColumnDefinition> expressionColumnDefinitions;
61 | 
62 |   /**
63 |    * The executable column expressions.
64 |    */
65 |   private List<ExpressionColumn> expressionColumns;
66 | 
67 |   /**
68 |    * Just the definition of the expression. Need to collect them all first so the
69 |    * AVRO schema can be built.
70 |    */
71 |   static class ExpressionColumnDefinition {
72 |     private RowBuilderField schemaField;
73 | 
74 |     private String expression;
75 | 
76 |     public ExpressionColumnDefinition(RowBuilderField schemaField, String expression) {
77 |       this.schemaField = schemaField;
78 |       this.expression = expression;
79 |     }
80 | 
81 |     public RowBuilderField getSchemaField() {
82 |       return schemaField;
83 |     }
84 | 
85 |     public String getExpression() {
86 |       return expression;
87 |     }
88 |   }
89 | 
90 |   /**
91 |    * The fully initialized expression ready to be computed.
92 |    */
93 |   class ExpressionColumn {
94 |     private ValueExpression columnExpression;
95 | 
96 |     private int pos;
97 | 
98 |     public ExpressionColumn(ValueExpression columnExpression, int pos) {
99 |       this.columnExpression = columnExpression;
100 |       this.pos = pos;
101 |     }
102 | 
103 |     public void setFieldValue(IndexedRecord record) {
104 |       Object value = this.columnExpression.getValue(AvroRowComputedColumns.this.expressionContext);
105 |       record.put(this.pos, value);
106 |     }
107 |   }
108 | 
109 |   /**
110 |    * Factory method creating the row processor if valid options are supplied, or
111 |    * null if none are found.
112 |    */
113 |   public static AvroRowComputedColumns create(Map<String, String> options) {
114 |     // expression setup
115 |     // options: column.<name>.<type> -> JUEL expression
116 |     List<ExpressionColumnDefinition> expressionColumnDefinitions = new ArrayList<>();
117 | 
118 |     for (Map.Entry<String, String> entry : options.entrySet()) {
119 |       if (!entry.getKey().startsWith(COLUMN_PREFIX))
120 |         continue;
121 | 
122 |       String[] arr = entry.getKey().split("\\.");
123 |       if (arr.length != 3)
124 |         throw new IllegalArgumentException(
125 |             "Unable to parse column specification. Expected column.<name>.<type>: " + entry.getKey());
126 | 
127 |       String column = arr[1];
128 |       String type = RowBuilderType.valueOfIgnoreCase(arr[2]).name();
129 |       String expression = entry.getValue();
130 |       RowBuilderField schemaField = new RowBuilderField(column, null, type, column);
131 | 
132 |       expressionColumnDefinitions.add(new ExpressionColumnDefinition(schemaField, expression));
133 |     }
134 | 
135 |     return expressionColumnDefinitions.isEmpty() ? null : new AvroRowComputedColumns(expressionColumnDefinitions);
136 |   }
137 | 
138 |   private AvroRowComputedColumns(List<ExpressionColumnDefinition> expressionColumnDefinitions) {
139 |     this.expressionColumnDefinitions = expressionColumnDefinitions;
140 |   }
141 | 
142 |   /**
143 |    *
144 |    * @return a collection of RowBuilderFields based on the column expression
145 |    *         definitions.
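 *         (one per "column.<name>.<type>" option supplied by the caller)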
146 |    */
147 |   @Override
148 |   public Collection<RowBuilderField> getSchemaFields() {
149 |     return this.expressionColumnDefinitions.stream().map(ExpressionColumnDefinition::getSchemaField)
150 |         .collect(Collectors.toList());
151 |   }
152 | 
153 |   /**
154 |    * Initialize the column expressions. This can't be done in the constructor as
155 |    * the schema isn't ready yet.
156 |    *
157 |    * @param schema the AVRO input schema.
158 |    */
159 |   @Override
160 |   public void initialize(Schema schema) {
161 |     this.schema = schema;
162 |     this.expressionContext = new AvroELContext(schema);
163 | 
164 |     ExpressionFactory factory = ExpressionFactory.newInstance();
165 | 
166 |     this.expressionColumns = this.expressionColumnDefinitions.stream().map(expr -> {
167 |       Field field = schema.getField(expr.getSchemaField().getColumnFamily());
168 | 
169 |       RowBuilderType type = expr.getSchemaField().getRowBuilderType();
170 |       ValueExpression columnExpression = factory.createValueExpression(expressionContext, expr.getExpression(),
171 |           type.getJavaClass());
172 | 
173 |       return new ExpressionColumn(columnExpression, field.pos());
174 |     }).collect(Collectors.toList());
175 |   }
176 | 
177 |   @Override
178 |   protected boolean consumeInternal(Text rowKey, IndexedRecord record) throws IOException {
179 |     this.expressionContext.setCurrent(rowKey, record);
180 | 
181 |     // compute each expression
182 |     for (ExpressionColumn expr : this.expressionColumns)
183 |       expr.setFieldValue(record);
184 | 
185 |     return true;
186 |   }
187 | 
188 |   @Override
189 |   public AvroRowConsumer clone() {
190 |     AvroRowComputedColumns copy = new AvroRowComputedColumns(this.expressionColumnDefinitions);
191 | 
192 |     copy.initialize(this.schema);
193 | 
194 |     return copy;
195 |   }
196 | }
197 | 
--------------------------------------------------------------------------------
/connector/datasource/src/main/scala/com/microsoft/accumulo/AccumuloDataSourceReader.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  * http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package com.microsoft.accumulo
19 | 
20 | import org.apache.accumulo.core.client.Accumulo
21 | import org.apache.spark.sql.catalyst.InternalRow
22 | import org.apache.spark.sql.sources.Filter
23 | import org.apache.spark.sql.sources.v2.DataSourceOptions
24 | import org.apache.spark.sql.sources.v2.reader.{DataSourceReader, InputPartition, InputPartitionReader}
25 | import org.apache.spark.sql.types.{DataTypes, StructType}
26 | import scala.collection.JavaConverters._
27 | import scala.collection.mutable.ArrayBuffer
28 | import org.apache.log4j.Logger
29 | import java.util.UUID
30 | 
31 | // TODO: https://github.com/apache/spark/blob/053dd858d38e6107bc71e0aa3a4954291b74f8c8/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/SupportsReportPartitioning.java
32 | // in head of spark github repo
33 | // import org.apache.spark.sql.connector.read.{SupportsPushDownFilters, SupportsPushDownRequiredColumns}
34 | import org.apache.spark.sql.sources.v2.reader.{SupportsPushDownFilters, SupportsPushDownRequiredColumns}
35 | 
36 | 
37 | @SerialVersionUID(1L)
38 | class AccumuloDataSourceReader(schema: StructType, options: DataSourceOptions)
39 |   extends DataSourceReader with Serializable with SupportsPushDownRequiredColumns with SupportsPushDownFilters {
40 |   private val logger = Logger.getLogger(classOf[AccumuloDataSourceReader])
41 | 
42 |   private val defaultMaxPartitions = 200
43 | 
44 |   var filters = Array.empty[Filter]
45 | 
46 |   val rowKeyColumn: String = options.get("rowkey").orElse("rowkey")
47 |   val schemaWithOutRowKey = new StructType(schema.filter { _.name != rowKeyColumn }.toArray)
48 | 
49 |   // initialize output schema with full schema
50 |   private var requiredSchema = {
51 |     // adding rowKey
52 |     val baseSchema = schemaWithOutRowKey.add(rowKeyColumn, DataTypes.StringType, nullable = true)
53 | 
54 |     // add any output fields we find in a mleap pipeline
55 |     val mleapFields = MLeapUtil.mleapSchemaToCatalyst(options.get("mleap").orElse(""))
56 | 
57 |     StructType(baseSchema ++ mleapFields)
58 |   }
59 | 
60 |   private var filterInJuel: Option[String] = None
61 | 
62 |   override def pruneColumns(requiredSchema: StructType): Unit = {
63 |     this.requiredSchema = requiredSchema
64 |   }
65 | 
66 |   def readSchema: StructType = requiredSchema
67 | 
68 |   override def pushFilters(filters: Array[Filter]): Array[Filter] = {
69 |     // unfortunately predicates on nested elements are not pushed down by Spark
70 |     // https://issues.apache.org/jira/browse/SPARK-17636
71 |     // https://github.com/apache/spark/pull/22535
72 | 
73 |     val jsonSchema = AvroUtil.catalystSchemaToJson(schemaWithOutRowKey)
74 |     val result = new FilterToJuel(jsonSchema.attributeToVariableMapping, rowKeyColumn)
75 |       .serializeFilters(filters, options.get("filter").orElse(""))
76 | 
77 |     this.filters = result.supportedFilters.toArray
78 | 
79 |     if (result.serializedFilter.length > 0) {
80 |       this.filterInJuel = Some("${" + result.serializedFilter + "}")
81 |       logger.info(s"JUEL filter: ${this.filterInJuel}")
82 |     }
83 | 
84 |     result.unsupportedFilters.toArray
85 |   }
86 | 
87 |   override def pushedFilters(): Array[Filter] = filters
88 | 
89 |   def planInputPartitions: java.util.List[InputPartition[InternalRow]] = {
90 |     val tableName = options.tableName.get
91 |     val maxPartitions = options.getInt("maxPartitions", defaultMaxPartitions)
92 |     val properties = new java.util.Properties()
93 |     // can't use .putAll(options.asMap()) due to https://github.com/scala/bug/issues/10418
94 |     options.asMap.asScala.foreach { case (k, v) =>
properties.setProperty(k, v) }
95 | 
96 |     // pass GUID to iterator so we can perform fast cache lookup
97 |     // needs to be done on the head node so that all have the same guid
98 |     properties.setProperty("mleapguid", UUID.randomUUID.toString)
99 | 
100 |     val splits = ArrayBuffer(Array.empty[Byte], Array.empty[Byte])
101 | 
102 |     val client = Accumulo.newClient().from(properties).build()
103 |     // it's possible to merge on the accumulo side
104 |     // val tableSplits = client.tableOperations().listSplits(tableName, maxPartitions)
105 |     val tableSplits = try {
106 |       client.tableOperations().listSplits(tableName)
107 |     }
108 |     finally {
109 |       client.close()
110 |     }
111 | 
112 |     // on deployed clusters a table with no split will return a single empty Text instance
113 |     val containsSingleEmptySplit =
114 |       tableSplits.size == 1 &&
115 |       tableSplits.iterator.next.getLength == 0
116 | 
117 |     if (tableSplits.size > 1 || !containsSingleEmptySplit)
118 |       splits.insertAll(1, tableSplits.asScala.map(_.getBytes))
119 | 
120 |     // convert splits to ranges
121 |     var ranges = splits.sliding(2).toSeq
122 | 
123 |     // optionally shuffle
124 |     if (options.getBoolean("shuffle.ranges", true))
125 |       ranges = scala.util.Random.shuffle(ranges)
126 | 
127 |     // create groups of ranges
128 |     val numReaders = scala.math.min(ranges.length, maxPartitions)
129 |     val batchSize = ranges.length / numReaders
130 |     val batchRanges = ranges.sliding(batchSize, batchSize)
131 | 
132 |     logger.info(s"Creating $numReaders readers for ${ranges.length} ranges (batch size $batchSize)")
133 | 
134 |     val partitions = batchRanges.map(r => new PartitionReaderFactory(tableName, r,
135 |       schemaWithOutRowKey, requiredSchema, properties, rowKeyColumn, filterInJuel))
136 |       .toSeq.asJava
137 | 
138 |     new java.util.ArrayList[InputPartition[InternalRow]](partitions)
139 |   }
140 | }
141 | 
142 | class PartitionReaderFactory(tableName: String,
143 |                              ranges: Seq[Seq[Array[Byte]]],
144 |                              inputSchema: StructType,
145 |                              outputSchema: StructType,
146 |                              properties: java.util.Properties,
147 |                              rowKeyColumn: String,
148 |                              filterInJuel: Option[String])
149 |   extends InputPartition[InternalRow] {
150 | 
151 |   def createPartitionReader: InputPartitionReader[InternalRow] = {
152 | 
153 |     Logger.getLogger(classOf[AccumuloDataSourceReader]).info(s"Partition reader for $ranges")
154 | 
155 |     new AccumuloInputPartitionReader(tableName, ranges, inputSchema, outputSchema, properties, rowKeyColumn, filterInJuel)
156 |   }
157 | 
158 |   // override def preferredLocations(): Array[String] = Array("ab", "c")
159 | }
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.suo 8 | *.user 9 | *.userosscache 10 | *.sln.docstates 11 | 12 | # User-specific files (MonoDevelop/Xamarin Studio) 13 | *.userprefs 14 | 15 | # Build results 16 | [Dd]ebug/ 17 | [Dd]ebugPublic/ 18 | [Rr]elease/ 19 | [Rr]eleases/ 20 | x64/ 21 | x86/ 22 | bld/ 23 | [Bb]in/ 24 | [Oo]bj/ 25 | [Ll]og/ 26 | 27 | # Visual Studio 2015/2017 cache/options directory 28 | .vs/ 29 | # Uncomment if you have tasks that create the project's static files in wwwroot 30 | #wwwroot/ 31 | 32 | # Visual Studio 2017 auto generated files 33 | Generated\ Files/ 34 | 35 | # MSTest test Results 36 | [Tt]est[Rr]esult*/ 37 | [Bb]uild[Ll]og.* 38 | 39 | # NUNIT 40 | *.VisualState.xml 41 | TestResult.xml 42 | 43 | # Build Results of an ATL Project 44 | [Dd]ebugPS/ 45 | [Rr]eleasePS/ 46 | dlldata.c 47 | 48 | # Benchmark Results 49 | BenchmarkDotNet.Artifacts/ 50 | 51 | # .NET Core 52 | project.lock.json 53 | project.fragment.lock.json 54 | artifacts/ 55 | **/Properties/launchSettings.json 56 | 57 | # StyleCop 58 | StyleCopReport.xml 59 | 60 | # Files built by Visual Studio 61 | *_i.c 62 | *_p.c 63 | *_i.h 64 | *.ilk 65 | *.meta 66 | *.obj 67 | *.iobj 68 | *.pch 69 | *.pdb 70 | *.ipdb 71 | *.pgc 72 | *.pgd 73 | *.rsp 74 | *.sbr 75 | *.tlb 76 | *.tli 77 | *.tlh 78 | *.tmp 79 | *.tmp_proj 80 | *.log 81 | *.vspscc 82 | *.vssscc 83 | .builds 84 | *.pidb 85 | *.svclog 86 | *.scc 87 | 88 | # Chutzpah Test files 89 | _Chutzpah* 90 | 91 | # Visual C++ cache files 92 | ipch/ 93 | *.aps 94 | *.ncb 95 | *.opendb 96 | *.opensdf 97 | *.sdf 98 | *.cachefile 99 | *.VC.db 100 | *.VC.VC.opendb 101 | 102 | # Visual Studio profiler 103 | *.psess 104 | *.vsp 105 | *.vspx 106 | *.sap 107 | 108 | # Visual Studio Trace Files 109 | *.e2e 110 | 111 | # TFS 2012 Local Workspace 112 | $tf/ 113 | 114 | # Guidance Automation Toolkit 115 | *.gpState 116 | 117 | # ReSharper is a .NET coding add-in 118 | _ReSharper*/ 119 | *.[Rr]e[Ss]harper 120 | *.DotSettings.user 121 | 122 | # JustCode is a .NET coding add-in 123 | .JustCode 124 | 125 | # TeamCity is a build add-in 126 | _TeamCity* 127 | 128 | # DotCover is a Code Coverage Tool 129 | *.dotCover 130 | 131 | # AxoCover is a Code Coverage Tool 132 | .axoCover/* 133 | !.axoCover/settings.json 134 | 135 | # Visual Studio code coverage results 136 | *.coverage 137 | *.coveragexml 138 | 139 | # NCrunch 140 | _NCrunch_* 141 | .*crunch*.local.xml 142 | nCrunchTemp_* 143 | 144 | # MightyMoose 145 | *.mm.* 146 | AutoTest.Net/ 147 | 148 | # Web workbench (sass) 149 | .sass-cache/ 150 | 151 | # Installshield output folder 152 | [Ee]xpress/ 153 | 154 | # DocProject is a documentation generator add-in 155 | DocProject/buildhelp/ 156 | DocProject/Help/*.HxT 157 | DocProject/Help/*.HxC 158 | DocProject/Help/*.hhc 159 | DocProject/Help/*.hhk 160 | DocProject/Help/*.hhp 161 | DocProject/Help/Html2 162 | DocProject/Help/html 163 | 164 | # Click-Once directory 165 | publish/ 166 | 167 | # Publish Web Output 168 | *.[Pp]ublish.xml 169 | *.azurePubxml 170 | # Note: Comment the next line if you want to checkin your web deploy settings, 171 | # but database connection strings (with potential passwords) will be unencrypted 172 | *.pubxml 173 | *.publishproj 174 | 175 | # Microsoft Azure Web App publish settings. 
Comment the next line if you want to 176 | # checkin your Azure Web App publish settings, but sensitive information contained 177 | # in these scripts will be unencrypted 178 | PublishScripts/ 179 | 180 | # NuGet Packages 181 | *.nupkg 182 | # The packages folder can be ignored because of Package Restore 183 | **/[Pp]ackages/* 184 | # except build/, which is used as an MSBuild target. 185 | !**/[Pp]ackages/build/ 186 | # Uncomment if necessary however generally it will be regenerated when needed 187 | #!**/[Pp]ackages/repositories.config 188 | # NuGet v3's project.json files produces more ignorable files 189 | *.nuget.props 190 | *.nuget.targets 191 | 192 | # Microsoft Azure Build Output 193 | csx/ 194 | *.build.csdef 195 | 196 | # Microsoft Azure Emulator 197 | ecf/ 198 | rcf/ 199 | 200 | # Windows Store app package directories and files 201 | AppPackages/ 202 | BundleArtifacts/ 203 | Package.StoreAssociation.xml 204 | _pkginfo.txt 205 | *.appx 206 | 207 | # Visual Studio cache files 208 | # files ending in .cache can be ignored 209 | *.[Cc]ache 210 | # but keep track of directories ending in .cache 211 | !*.[Cc]ache/ 212 | 213 | # Others 214 | ClientBin/ 215 | ~$* 216 | *~ 217 | *.dbmdl 218 | *.dbproj.schemaview 219 | *.jfm 220 | *.pfx 221 | *.publishsettings 222 | orleans.codegen.cs 223 | 224 | # Including strong name files can present a security risk 225 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 226 | #*.snk 227 | 228 | # Since there are multiple workflows, uncomment next line to ignore bower_components 229 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 230 | #bower_components/ 231 | 232 | # RIA/Silverlight projects 233 | Generated_Code/ 234 | 235 | # Backup & report files from converting an old project file 236 | # to a newer Visual Studio version. Backup files are not needed, 237 | # because we have git ;-) 238 | _UpgradeReport_Files/ 239 | Backup*/ 240 | UpgradeLog*.XML 241 | UpgradeLog*.htm 242 | ServiceFabricBackup/ 243 | *.rptproj.bak 244 | 245 | # SQL Server files 246 | *.mdf 247 | *.ldf 248 | *.ndf 249 | 250 | # Business Intelligence projects 251 | *.rdl.data 252 | *.bim.layout 253 | *.bim_*.settings 254 | *.rptproj.rsuser 255 | 256 | # Microsoft Fakes 257 | FakesAssemblies/ 258 | 259 | # GhostDoc plugin setting file 260 | *.GhostDoc.xml 261 | 262 | # Node.js Tools for Visual Studio 263 | .ntvs_analysis.dat 264 | node_modules/ 265 | 266 | # Visual Studio 6 build log 267 | *.plg 268 | 269 | # Visual Studio 6 workspace options file 270 | *.opt 271 | 272 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
273 | *.vbw 274 | 275 | # Visual Studio LightSwitch build output 276 | **/*.HTMLClient/GeneratedArtifacts 277 | **/*.DesktopClient/GeneratedArtifacts 278 | **/*.DesktopClient/ModelManifest.xml 279 | **/*.Server/GeneratedArtifacts 280 | **/*.Server/ModelManifest.xml 281 | _Pvt_Extensions 282 | 283 | # Paket dependency manager 284 | .paket/paket.exe 285 | paket-files/ 286 | 287 | # FAKE - F# Make 288 | .fake/ 289 | 290 | # JetBrains Rider 291 | .idea/ 292 | *.sln 293 | *.iml 294 | 295 | # CodeRush 296 | .cr/ 297 | 298 | # Python Tools for Visual Studio (PTVS) 299 | __pycache__/ 300 | *.pyc 301 | 302 | # Cake - Uncomment if you are using it 303 | # tools/** 304 | # !tools/packages.config 305 | 306 | # Tabs Studio 307 | *.tss 308 | 309 | # Telerik's JustMock configuration file 310 | *.jmconfig 311 | 312 | # BizTalk build output 313 | *.btp.cs 314 | *.btm.cs 315 | *.odx.cs 316 | *.xsd.cs 317 | 318 | # OpenCover UI analysis results 319 | OpenCover/ 320 | 321 | # Azure Stream Analytics local run output 322 | ASALocalRun/ 323 | 324 | # MSBuild Binary and Structured Log 325 | *.binlog 326 | 327 | # NVidia Nsight GPU debugger configuration file 328 | *.nvuser 329 | 330 | # MFractors (Xamarin productivity tool) working folder 331 | .mfractor/ 332 | 333 | .project 334 | .classpath 335 | .vscode 336 | target/ 337 | .settings/ 338 | .ipynb_checkpoints/ -------------------------------------------------------------------------------- /connector/iterator/src/test/java/com/microsoft/accumulo/spark/AvroJuelTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.microsoft.accumulo.spark; 19 | 20 | import java.util.Arrays; 21 | 22 | import javax.el.ExpressionFactory; 23 | import javax.el.ValueExpression; 24 | 25 | import com.microsoft.accumulo.spark.juel.AvroELContext; 26 | import com.microsoft.accumulo.spark.record.AvroSchemaBuilder; 27 | import com.microsoft.accumulo.spark.record.RowBuilderField; 28 | import org.apache.avro.Schema; 29 | import org.apache.avro.generic.GenericRecordBuilder; 30 | import org.apache.hadoop.io.Text; 31 | import org.junit.Test; 32 | 33 | import junit.framework.TestCase; 34 | 35 | public class AvroJuelTest extends TestCase { 36 | 37 | private AvroELContext context; 38 | private ExpressionFactory factory; 39 | private Schema schema; 40 | 41 | @Override 42 | public void setUp() throws Exception { 43 | factory = ExpressionFactory.newInstance(); 44 | 45 | RowBuilderField[] schemaMappingFields = new RowBuilderField[] { 46 | // row 0 47 | new RowBuilderField("cf1", "cq1", "long", "v0"), 48 | // row 1 49 | new RowBuilderField("cf2", "cq2", "double", "v1"), 50 | // row 2 51 | new RowBuilderField("cf2", "cq3", "string", "v2") }; 52 | 53 | schema = AvroSchemaBuilder.buildSchema(Arrays.asList(schemaMappingFields)); 54 | 55 | context = new AvroELContext(schema); 56 | } 57 | 58 | private void setRecordValues(String rowKey, long cq1, double cq2, String cq3) { 59 | GenericRecordBuilder cf1RecordBuilder = new GenericRecordBuilder(schema.getField("cf1").schema()); 60 | GenericRecordBuilder cf2RecordBuilder = new GenericRecordBuilder(schema.getField("cf2").schema()); 61 | 62 | cf1RecordBuilder.set("cq1", cq1); 63 | cf2RecordBuilder.set("cq2", cq2); 64 | cf2RecordBuilder.set("cq3", cq3); 65 | 66 | GenericRecordBuilder rootRecordBuilder = new GenericRecordBuilder(schema); 67 | rootRecordBuilder.set("cf1", cf1RecordBuilder.build()); 68 | rootRecordBuilder.set("cf2", cf2RecordBuilder.build()); 69 | 70 | context.setCurrent(new Text(rowKey), rootRecordBuilder.build()); 71 | } 72 | 73 | @Test 74 | public void testVariableExpressions() { 75 | ValueExpression exprV0 = factory.createValueExpression(context, "${v0}", long.class); 76 | 77 | // set the values after the expression is created 78 | setRecordValues("key1", 3L, 2.0, ""); 79 | assertEquals(3L, exprV0.getValue(context)); 80 | 81 | // test if we can reset it 82 | setRecordValues("key1", 4L, 2.5, ""); 83 | assertEquals(4L, exprV0.getValue(context)); 84 | 85 | // check for the second variable 86 | ValueExpression exprV1 = factory.createValueExpression(context, "${v1}", double.class); 87 | assertEquals(2.5, exprV1.getValue(context)); 88 | } 89 | 90 | @Test 91 | public void testVariableConditions() { 92 | ValueExpression expr = factory.createValueExpression(context, "${v0 > 2.1 && v1 < 3}", boolean.class); 93 | 94 | setRecordValues("key1", 3L, 2.0, ""); 95 | 96 | assertTrue((boolean) expr.getValue(context)); 97 | } 98 | 99 | @Test 100 | public void testStringEndsWith() { 101 | ValueExpression expr = factory.createValueExpression(context, "${v2.endsWith('test')}", boolean.class); 102 | setRecordValues("key1", 3L, 2.0, "This is a test"); 103 | assertTrue((boolean) expr.getValue(context)); 104 | 105 | expr = factory.createValueExpression(context, "${!v2.endsWith('foo')}", boolean.class); 106 | assertTrue((boolean) expr.getValue(context)); 107 | } 108 | 109 | @Test 110 | public void testStringStartsWith() { 111 | ValueExpression expr = factory.createValueExpression(context, "${v2.startsWith('This')}", boolean.class); 112 | setRecordValues("key1", 3L, 2.0, "This is a 
test"); 113 | assertTrue((boolean) expr.getValue(context)); 114 | 115 | expr = factory.createValueExpression(context, "${!v2.startsWith('this')}", boolean.class); 116 | assertTrue((boolean) expr.getValue(context)); 117 | } 118 | 119 | @Test 120 | public void testStringContains() { 121 | ValueExpression expr = factory.createValueExpression(context, "${v2.contains('is')}", boolean.class); 122 | setRecordValues("key1", 3L, 2.0, "This is a test"); 123 | assertTrue((boolean) expr.getValue(context)); 124 | 125 | expr = factory.createValueExpression(context, "${!v2.contains('IS')}", boolean.class); 126 | assertTrue((boolean) expr.getValue(context)); 127 | } 128 | 129 | @Test 130 | public void testStringIn() { 131 | ValueExpression expr = factory.createValueExpression(context, "${v2.in('a','b','c')}", boolean.class); 132 | setRecordValues("key1", 3L, 2.0, "b"); 133 | assertTrue((boolean) expr.getValue(context)); 134 | } 135 | 136 | @Test 137 | public void testIntIn() { 138 | ValueExpression expr = factory.createValueExpression(context, "${v0.in(0, 1, 3)}", boolean.class); 139 | setRecordValues("key1", 3L, 2.0, "b"); 140 | assertTrue((boolean) expr.getValue(context)); 141 | 142 | expr = factory.createValueExpression(context, "${v0.in(0, 1)}", boolean.class); 143 | setRecordValues("key1", 3L, 2.0, "b"); 144 | assertFalse((boolean) expr.getValue(context)); 145 | } 146 | 147 | @Test 148 | public void testStringQuoteEscape() { 149 | ValueExpression expr = factory.createValueExpression(context, "${v2 == 'a\\'bc'}", boolean.class); 150 | setRecordValues("key1", 3L, 2.0, "a'bc"); 151 | assertTrue((boolean) expr.getValue(context)); 152 | } 153 | 154 | @Test 155 | public void testStringDoubleQuoteEscape() { 156 | ValueExpression expr = factory.createValueExpression(context, "${v2 == 'a\"bc'}", boolean.class); 157 | setRecordValues("key1", 3L, 2.0, "a\"bc"); 158 | assertTrue((boolean) expr.getValue(context)); 159 | } 160 | 161 | @Test 162 | public void testStringBackslash() { 163 | ValueExpression expr = factory.createValueExpression(context, "${v2 == 'a\\\\bc'}", boolean.class); 164 | setRecordValues("key1", 3L, 2.0, "a\\bc"); 165 | assertTrue((boolean) expr.getValue(context)); 166 | } 167 | 168 | @Test 169 | public void testRowKey() { 170 | ValueExpression expr = factory.createValueExpression(context, "${rowKey == 'key1'}", boolean.class); 171 | setRecordValues("key1", 3L, 2.0, "abc"); 172 | assertTrue((boolean) expr.getValue(context)); 173 | 174 | setRecordValues("key2", 3L, 2.0, "abc"); 175 | assertFalse((boolean) expr.getValue(context)); 176 | } 177 | 178 | @Test 179 | public void testObjectPropertyBased() { 180 | ValueExpression expr = factory.createValueExpression(context, "${cf1.cq1 == 3}", boolean.class); 181 | setRecordValues("key1", 3L, 2.0, "abc"); 182 | assertTrue((boolean) expr.getValue(context)); 183 | } 184 | 185 | @Test 186 | public void testColumnRemapping() { 187 | ValueExpression expr = factory.createValueExpression(context, "${(cf1.cq1 + 1)/2.0}", Object.class); 188 | 189 | setRecordValues("key1", 3L, 2.0, "abc"); 190 | 191 | assertEquals((3 + 1) / 2.0, expr.getValue(context)); 192 | } 193 | } 194 | -------------------------------------------------------------------------------- /connector/zipfs/src/main/java/com/microsoft/accumulo/zipfs/ByteArrayChannel.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. 
3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 | * 5 | * This code is free software; you can redistribute it and/or modify it 6 | * under the terms of the GNU General Public License version 2 only, as 7 | * published by the Free Software Foundation. Oracle designates this 8 | * particular file as subject to the "Classpath" exception as provided 9 | * by Oracle in the LICENSE file that accompanied this code. 10 | * 11 | * This code is distributed in the hope that it will be useful, but WITHOUT 12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 | * version 2 for more details (a copy is included in the LICENSE file that 15 | * accompanied this code). 16 | * 17 | * You should have received a copy of the GNU General Public License version 18 | * 2 along with this work; if not, write to the Free Software Foundation, 19 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 | * 21 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 | * or visit www.oracle.com if you need additional information or have any 23 | * questions. 24 | */ 25 | 26 | package com.microsoft.accumulo.zipfs; 27 | 28 | import java.io.IOException; 29 | import java.nio.ByteBuffer; 30 | import java.nio.channels.ClosedChannelException; 31 | import java.nio.channels.NonWritableChannelException; 32 | import java.nio.channels.SeekableByteChannel; 33 | import java.util.Arrays; 34 | import java.util.concurrent.locks.ReadWriteLock; 35 | import java.util.concurrent.locks.ReentrantReadWriteLock; 36 | 37 | public class ByteArrayChannel implements SeekableByteChannel { 38 | 39 | private final ReadWriteLock rwlock = new ReentrantReadWriteLock(); 40 | private byte buf[]; 41 | 42 | /* 43 | * The current position of this channel. 44 | */ 45 | private int pos; 46 | 47 | /* 48 | * The index that is one greater than the last valid byte in the channel. 49 | */ 50 | private int last; 51 | 52 | private boolean closed; 53 | private boolean readonly; 54 | 55 | /* 56 | * Creates a {@code ByteArrayChannel} with size {@code sz}. 57 | */ 58 | ByteArrayChannel(int sz, boolean readonly) { 59 | this.buf = new byte[sz]; 60 | this.pos = this.last = 0; 61 | this.readonly = readonly; 62 | } 63 | 64 | /* 65 | * Creates a ByteArrayChannel with its 'pos' at 0 and its 'last' at buf's end. 66 | * Note: no defensive copy of the 'buf', used directly. 
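 * Callers must therefore not mutate the array after handing it over.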
67 | */ 68 | ByteArrayChannel(byte[] buf, boolean readonly) { 69 | this.buf = buf; 70 | this.pos = 0; 71 | this.last = buf.length; 72 | this.readonly = readonly; 73 | } 74 | 75 | @Override 76 | public boolean isOpen() { 77 | return !closed; 78 | } 79 | 80 | @Override 81 | public long position() throws IOException { 82 | beginRead(); 83 | try { 84 | ensureOpen(); 85 | return pos; 86 | } finally { 87 | endRead(); 88 | } 89 | } 90 | 91 | @Override 92 | public SeekableByteChannel position(long pos) throws IOException { 93 | beginWrite(); 94 | try { 95 | ensureOpen(); 96 | if (pos < 0 || pos >= Integer.MAX_VALUE) 97 | throw new IllegalArgumentException("Illegal position " + pos); 98 | this.pos = Math.min((int)pos, last); 99 | return this; 100 | } finally { 101 | endWrite(); 102 | } 103 | } 104 | 105 | @Override 106 | public int read(ByteBuffer dst) throws IOException { 107 | beginWrite(); 108 | try { 109 | ensureOpen(); 110 | if (pos == last) 111 | return -1; 112 | int n = Math.min(dst.remaining(), last - pos); 113 | dst.put(buf, pos, n); 114 | pos += n; 115 | return n; 116 | } finally { 117 | endWrite(); 118 | } 119 | } 120 | 121 | @Override 122 | public SeekableByteChannel truncate(long size) throws IOException { 123 | if (readonly) 124 | throw new NonWritableChannelException(); 125 | ensureOpen(); 126 | throw new UnsupportedOperationException(); 127 | } 128 | 129 | @Override 130 | public int write(ByteBuffer src) throws IOException { 131 | if (readonly) 132 | throw new NonWritableChannelException(); 133 | beginWrite(); 134 | try { 135 | ensureOpen(); 136 | int n = src.remaining(); 137 | ensureCapacity(pos + n); 138 | src.get(buf, pos, n); 139 | pos += n; 140 | if (pos > last) { 141 | last = pos; 142 | } 143 | return n; 144 | } finally { 145 | endWrite(); 146 | } 147 | } 148 | 149 | @Override 150 | public long size() throws IOException { 151 | beginRead(); 152 | try { 153 | ensureOpen(); 154 | return last; 155 | } finally { 156 | endRead(); 157 | } 158 | } 159 | 160 | @Override 161 | public void close() throws IOException { 162 | if (closed) 163 | return; 164 | beginWrite(); 165 | try { 166 | closed = true; 167 | buf = null; 168 | pos = 0; 169 | last = 0; 170 | } finally { 171 | endWrite(); 172 | } 173 | } 174 | 175 | /** 176 | * Creates a newly allocated byte array. Its size is the current 177 | * size of this channel and the valid contents of the buffer 178 | * have been copied into it. 179 | * 180 | * @return the current contents of this channel, as a byte array. 181 | */ 182 | public byte[] toByteArray() { 183 | beginRead(); 184 | try { 185 | // avoid copy if last == bytes.length? 186 | return Arrays.copyOf(buf, last); 187 | } finally { 188 | endRead(); 189 | } 190 | } 191 | 192 | private void ensureOpen() throws IOException { 193 | if (closed) 194 | throw new ClosedChannelException(); 195 | } 196 | 197 | private final void beginWrite() { 198 | rwlock.writeLock().lock(); 199 | } 200 | 201 | private final void endWrite() { 202 | rwlock.writeLock().unlock(); 203 | } 204 | 205 | private final void beginRead() { 206 | rwlock.readLock().lock(); 207 | } 208 | 209 | private final void endRead() { 210 | rwlock.readLock().unlock(); 211 | } 212 | 213 | private void ensureCapacity(int minCapacity) { 214 | // overflow-conscious code 215 | if (minCapacity - buf.length > 0) { 216 | grow(minCapacity); 217 | } 218 | } 219 | 220 | /** 221 | * The maximum size of array to allocate. 222 | * Some VMs reserve some header words in an array. 
223 | * Attempts to allocate larger arrays may result in 224 | * OutOfMemoryError: Requested array size exceeds VM limit 225 | */ 226 | private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8; 227 | 228 | /** 229 | * Increases the capacity to ensure that it can hold at least the 230 | * number of elements specified by the minimum capacity argument. 231 | * 232 | * @param minCapacity the desired minimum capacity 233 | */ 234 | private void grow(int minCapacity) { 235 | // overflow-conscious code 236 | int oldCapacity = buf.length; 237 | int newCapacity = oldCapacity << 1; 238 | if (newCapacity - minCapacity < 0) 239 | newCapacity = minCapacity; 240 | if (newCapacity - MAX_ARRAY_SIZE > 0) 241 | newCapacity = hugeCapacity(minCapacity); 242 | buf = Arrays.copyOf(buf, newCapacity); 243 | } 244 | 245 | private static int hugeCapacity(int minCapacity) { 246 | if (minCapacity < 0) // overflow 247 | throw new OutOfMemoryError(); 248 | return (minCapacity > MAX_ARRAY_SIZE) ? 249 | Integer.MAX_VALUE : 250 | MAX_ARRAY_SIZE; 251 | } 252 | } 253 | --------------------------------------------------------------------------------