map = new LinkedHashMap<>();
114 | if ("*".equals(attributes)) {
115 | for (AttrID id : AttrID.values()) {
116 | try {
117 | map.put(id.name(), attribute(id, zfas));
118 | } catch (IllegalArgumentException x) {}
119 | }
120 | } else {
121 | String[] as = attributes.split(",");
122 | for (String a : as) {
123 | try {
124 | map.put(a, attribute(AttrID.valueOf(a), zfas));
125 | } catch (IllegalArgumentException x) {}
126 | }
127 | }
128 | return map;
129 | }
130 |
131 | Object attribute(AttrID id, ZipFileAttributes zfas) {
132 | switch (id) {
133 | case size:
134 | return zfas.size();
135 | case creationTime:
136 | return zfas.creationTime();
137 | case lastAccessTime:
138 | return zfas.lastAccessTime();
139 | case lastModifiedTime:
140 | return zfas.lastModifiedTime();
141 | case isDirectory:
142 | return zfas.isDirectory();
143 | case isRegularFile:
144 | return zfas.isRegularFile();
145 | case isSymbolicLink:
146 | return zfas.isSymbolicLink();
147 | case isOther:
148 | return zfas.isOther();
149 | case fileKey:
150 | return zfas.fileKey();
151 | case compressedSize:
152 | if (isZipView)
153 | return zfas.compressedSize();
154 | break;
155 | case crc:
156 | if (isZipView)
157 | return zfas.crc();
158 | break;
159 | case method:
160 | if (isZipView)
161 | return zfas.method();
162 | break;
163 | case permissions:
164 | if (isZipView) {
165 | return zfas.storedPermissions().orElse(null);
166 | }
167 | break;
168 | default:
169 | break;
170 | }
171 | return null;
172 | }
173 | }
174 |
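A minimal usage sketch for the attribute map assembled above: it assumes this provider is mounted through the standard java.nio.file API and that the attribute view keeps the usual "zip" name; the archive and entry paths are hypothetical.

import java.nio.file.FileSystem;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Map;

public class ZipAttributesSketch {
  public static void main(String[] args) throws Exception {
    // hypothetical archive and entry names
    try (FileSystem zipFs = FileSystems.newFileSystem(Paths.get("example.zip"), (ClassLoader) null)) {
      Path entry = zipFs.getPath("data", "file.txt");
      // "*" selects every AttrID, mirroring the "*".equals(attributes) branch above
      Map<String, Object> all = Files.readAttributes(entry, "zip:*");
      // a comma-separated list goes through the attributes.split(",") branch
      Map<String, Object> some = Files.readAttributes(entry, "zip:size,compressedSize,crc");
      System.out.println(all);
      System.out.println(some);
    }
  }
}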
--------------------------------------------------------------------------------
/connector/iterator/src/main/java/com/microsoft/accumulo/spark/record/AvroFastRecord.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.microsoft.accumulo.spark.record;
19 |
20 | import java.util.HashMap;
21 | import java.util.Map;
22 | import java.util.stream.Collectors;
23 |
24 | import org.apache.accumulo.core.client.lexicoder.Encoder;
25 | import org.apache.accumulo.core.data.ArrayByteSequence;
26 | import org.apache.accumulo.core.data.ByteSequence;
27 | import com.microsoft.accumulo.spark.juel.AvroUtf8Wrapper;
28 | import org.apache.avro.Schema;
29 | import org.apache.avro.Schema.Field;
30 | import org.apache.avro.Schema.Type;
31 | import org.apache.avro.generic.GenericContainer;
32 | import org.apache.avro.generic.IndexedRecord;
33 |
34 | /**
35 | * This class collects all cells of interest into an AVRO Generic Record.
36 | *
37 | * Cells with non-empty column family and column qualifier are stored in nested
38 | * AVRO records. Cells with empty column qualifier are stored in the top-level
39 | * record.
40 | *
41 | * Example:
42 | *
43 | *
44 | * cf1, cq1, abc
45 | * cf1, cq2, 3.2
46 | * cf2, null, 6
47 | * cf3, cq3, def
48 | *
49 | *
50 | * Avro Record:
51 | *
52 | *
53 | * {
54 | * cf1: { cq1: "abc", cq2: 3.2 },
55 | * cf2: 6,
56 | * cf3: { cq3: "def" }
57 | * }
58 | *
59 | */
60 | public class AvroFastRecord implements GenericContainer, IndexedRecord {
61 |
62 | private static ByteSequence EMPTY_SEQUENCE = new ArrayByteSequence(new byte[0]);
63 |
64 | /**
65 | * The Avro schema.
66 | */
67 | private Schema schema;
68 |
69 | /**
70 | * The data array.
71 | */
72 | private Object[] values;
73 |
74 | /**
75 | * The nested records.
76 | */
77 | private AvroFastRecord[] nestedFields;
78 |
79 | /**
80 | * The primitive field indices for fast clearing.
81 | */
82 | private int[] primitiveFields;
83 |
84 | public AvroFastRecord(Schema schema) {
85 | this.schema = schema;
86 | this.values = new Object[schema.getFields().size()];
87 |
88 | // find all nested record fields
89 | this.nestedFields = schema.getFields().stream().filter(f -> f.schema().getType() == Type.RECORD).map(f -> {
90 | AvroFastRecord rec = new AvroFastRecord(f.schema());
91 | this.values[f.pos()] = rec;
92 | return rec;
93 | }).toArray(AvroFastRecord[]::new);
94 |
95 | // find all primitive fields
96 | this.primitiveFields = schema.getFields().stream().filter(f -> f.schema().getType() != Type.RECORD)
97 | .mapToInt(Field::pos).toArray();
98 | }
99 |
100 | /**
101 | * Clears all primitive fields (including those of nested records).
102 | */
103 | public void clear() {
104 | for (int idx : this.primitiveFields)
105 | this.values[idx] = null;
106 |
107 | for (AvroFastRecord rec : this.nestedFields)
108 | rec.clear();
109 | }
110 |
111 | @Override
112 | public void put(int i, Object v) {
113 | this.values[i] = v;
114 | }
115 |
116 | @Override
117 | public Object get(int i) {
118 | return this.values[i];
119 | }
120 |
121 | @Override
122 | public Schema getSchema() {
123 | return this.schema;
124 | }
125 |
126 | /**
127 | * Creates the core lookup map for column family/column qualifier. The leaf
128 | * nodes are consumers that know which record/field to target.
129 | *
130 | * @param rootRecord the root Avro record.
131 | * @return the lookup map.
132 | */
133 | public static Map<ByteSequence, Map<ByteSequence, RowBuilderCellConsumer>> createCellToFieldMap(
134 | AvroFastRecord rootRecord) {
135 | Map<ByteSequence, Map<ByteSequence, RowBuilderCellConsumer>> map = new HashMap<>();
136 |
137 | // setup GenericRecordBuilder for each column family
138 | for (Field field : rootRecord.getSchema().getFields()) {
139 | Schema nestedSchema = field.schema();
140 |
141 | ByteSequence columnFamily = new ArrayByteSequence(field.name());
142 |
143 | // top-level field
144 | if (nestedSchema.getType() != Type.RECORD) {
145 | // Map.of(...) is not available in older JDKs
146 | Map<ByteSequence, RowBuilderCellConsumer> subMap = new HashMap<>();
147 | subMap.put(EMPTY_SEQUENCE, createAvroCellConsumer(rootRecord, field));
148 |
149 | map.put(columnFamily, subMap);
150 |
151 | continue;
152 | }
153 |
154 | // nested fields
155 | Map<ByteSequence, RowBuilderCellConsumer> nestedLookupMap = nestedSchema.getFields().stream()
156 | .collect(Collectors.toMap(
157 | // nested name as key
158 | nestedField -> new ArrayByteSequence(nestedField.name()),
159 | // assign cells to field in nested record
160 | nestedField -> createAvroCellConsumer((AvroFastRecord) rootRecord.get(field.pos()), nestedField)));
161 |
162 | map.put(columnFamily, nestedLookupMap);
163 | }
164 |
165 | return map;
166 | }
167 |
168 | /**
169 | * Creates a consumer of cells that copies the data into the corresponding Avro
170 | * record fields.
171 | *
172 | * @param record The record to populate.
173 | * @param field The field to populate
174 | * @return The closure holding things together.
175 | */
176 | private static RowBuilderCellConsumer createAvroCellConsumer(AvroFastRecord record, Field field) {
177 | int pos = field.pos();
178 |
179 | if (field.schema().getType() == Type.STRING)
180 | // avoid byte array copying
181 | return (key, value) -> record.put(pos, new AvroUtf8Wrapper(value.get()));
182 |
183 | // get the fitting encoder
184 | Encoder<?> encoder = RowBuilderType.valueOf(field.getProp(AvroSchemaBuilder.PROPERTY_ROWBUILDERTYPE)).getEncoder();
185 | return (key, value) -> record.put(pos, encoder.decode(value.get()));
186 | }
187 | }
188 |
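The two-level lookup built by createCellToFieldMap can be pictured with a small stand-alone sketch. RowBuilderCellConsumer is defined elsewhere in the repo, so the sketch substitutes a plain BiConsumer, and the column names are made up for illustration.

import java.util.HashMap;
import java.util.Map;
import java.util.function.BiConsumer;

import org.apache.accumulo.core.data.ArrayByteSequence;
import org.apache.accumulo.core.data.ByteSequence;

public class CellRoutingSketch {
  private static final ByteSequence EMPTY = new ArrayByteSequence(new byte[0]);

  public static void main(String[] args) {
    // two-level lookup: column family -> column qualifier -> consumer
    Map<ByteSequence, Map<ByteSequence, BiConsumer<String, byte[]>>> lookup = new HashMap<>();

    // nested field: family "cf1", qualifier "cq1"
    Map<ByteSequence, BiConsumer<String, byte[]>> cf1 = new HashMap<>();
    cf1.put(new ArrayByteSequence("cq1"), (row, value) -> System.out.println("cf1.cq1 = " + new String(value)));
    lookup.put(new ArrayByteSequence("cf1"), cf1);

    // top-level field: keyed under the empty qualifier, as in createCellToFieldMap above
    Map<ByteSequence, BiConsumer<String, byte[]>> cf2 = new HashMap<>();
    cf2.put(EMPTY, (row, value) -> System.out.println("cf2 = " + new String(value)));
    lookup.put(new ArrayByteSequence("cf2"), cf2);

    // routing a cell: look up the family first, then the qualifier
    lookup.get(new ArrayByteSequence("cf1"))
          .get(new ArrayByteSequence("cq1"))
          .accept("row1", "abc".getBytes());
  }
}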
--------------------------------------------------------------------------------
/connector/datasource/src/main/scala/com/microsoft/accumulo/AvroUtil.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.microsoft.accumulo
19 |
20 | import org.apache.avro.{Schema, SchemaBuilder}
21 | import org.apache.spark.sql.types.{DataType, DataTypes, StructField, StructType}
22 | import org.codehaus.jackson.map.ObjectMapper
23 | import org.codehaus.jackson.map.annotate.JsonSerialize.Inclusion
24 |
25 | import scala.beans.BeanProperty
26 |
27 | // keeping the property names short to not hit any limits
28 | case class RowBuilderField(@BeanProperty cf: String, // column family
29 | @BeanProperty cq: String, // column qualifier
30 | @BeanProperty fvn: String, // filter variable name
31 | @BeanProperty t: String, // type
32 | @BeanProperty o: Boolean // output
33 | )
34 |
35 | case class JsonSchema(json: String, attributeToVariableMapping: Map[String, String])
36 |
37 | @SerialVersionUID(1L)
38 | object AvroUtil {
39 | def catalystSchemaToJson(inputSchema: StructType): JsonSchema = catalystSchemaToJson(inputSchema, inputSchema)
40 |
41 | def catalystSchemaToJson(inputSchema: StructType, outputSchema: StructType): JsonSchema = {
42 |
43 | var attributeToVariableMapping = scala.collection.mutable.Map[String, String]()
44 |
45 | var i = 0
46 | val selectedFields = inputSchema.fields.flatMap(cf => {
47 | val outputField = outputSchema.find(f => f.name == cf.name)
48 |
49 | cf.dataType match {
50 | case cft: StructType => cft.fields.map(cq =>
51 | RowBuilderField(
52 | cf.name,
53 | cq.name,
54 | {
55 | val variableName = s"v$i"
56 | attributeToVariableMapping += (s"${cf.name}.${cq.name}" -> variableName)
57 | i += 1
58 |
59 | variableName
60 | },
61 | // TODO: toUpperCase() is weird...
62 | cq.dataType.typeName.toUpperCase,
63 | // either the column family is not needed -> output = false
64 | // otherwise we need to check if the column qualifier is present in the output list
65 | if (outputField.isEmpty) false else outputField.get.dataType.asInstanceOf[StructType].exists(f => f.name == cq.name)
66 | )
67 | )
68 | case _: DataType => Seq(RowBuilderField(
69 | cf.name,
70 | null,
71 | {
72 | val variableName = s"v$i"
73 | attributeToVariableMapping += (s"${cf.name}" -> variableName)
74 | i += 1
75 |
76 | variableName
77 | },
78 | // TODO: toUpperCase() is weird...
79 | cf.dataType.typeName.toUpperCase,
80 | outputField.isDefined
81 | ))
82 | }
83 | })
84 |
85 | try {
86 | val mapper = new ObjectMapper()
87 |
88 | // disable serialization of null-values
89 | mapper.setSerializationInclusion(Inclusion.NON_NULL)
90 |
91 | JsonSchema(mapper.writeValueAsString(selectedFields), attributeToVariableMapping.toMap)
92 | } catch {
93 | case e: Exception =>
94 | throw new IllegalArgumentException(e)
95 | }
96 | }
97 |
98 | implicit class CatalystSchemaToAvroRecordBuilder(builder: SchemaBuilder.FieldAssembler[Schema]) {
99 | def addAvroRecordField(field: StructField): SchemaBuilder.FieldAssembler[Schema] = {
100 | (field.dataType, field.nullable) match {
101 | case (DataTypes.BinaryType, true) => builder.optionalBytes(field.name)
102 | case (DataTypes.BinaryType, false) => builder.requiredBytes(field.name)
103 | case (DataTypes.BooleanType, true) => builder.optionalBoolean(field.name)
104 | case (DataTypes.BooleanType, false) => builder.requiredBoolean(field.name)
105 | case (DataTypes.DoubleType, true) => builder.optionalDouble(field.name)
106 | case (DataTypes.DoubleType, false) => builder.requiredDouble(field.name)
107 | case (DataTypes.FloatType, true) => builder.optionalFloat(field.name)
108 | case (DataTypes.FloatType, false) => builder.requiredFloat(field.name)
109 | case (DataTypes.IntegerType, true) => builder.optionalInt(field.name)
110 | case (DataTypes.IntegerType, false) => builder.requiredInt(field.name)
111 | case (DataTypes.LongType, true) => builder.optionalLong(field.name)
112 | case (DataTypes.LongType, false) => builder.requiredLong(field.name)
113 | case (DataTypes.StringType, true) => builder.optionalString(field.name)
114 | case (DataTypes.StringType, false) => builder.requiredString(field.name)
115 | // TODO: date/time support?
116 | case _ => throw new UnsupportedOperationException(s"Unsupported type: ${field.dataType}")
117 | }
118 | }
119 |
120 | def addAvroRecordFields(schema: StructType): SchemaBuilder.FieldAssembler[Schema] = {
121 | schema.fields.foldLeft(builder) { (builder, field) => builder.addAvroRecordField(field) }
122 | }
123 | }
124 |
125 | def catalystSchemaToAvroSchema(schema: StructType): Schema = {
126 | val fieldBuilder = SchemaBuilder.record("root")
127 | .fields()
128 |
129 | schema.fields.foldLeft(fieldBuilder) { (_, field) =>
130 | field.dataType match {
131 | // nested fields
132 | case cft: StructType =>
133 | fieldBuilder
134 | .name(field.name)
135 | .`type`(SchemaBuilder
136 | .record(field.name)
137 | .fields
138 | .addAvroRecordFields(cft)
139 | .endRecord())
140 | .noDefault()
141 | // top level fields
142 | case _ => fieldBuilder.addAvroRecordField(field)
143 | }
144 | }
145 | .endRecord()
146 | }
147 | }
148 |
--------------------------------------------------------------------------------
/connector/iterator/src/main/java/com/microsoft/accumulo/spark/record/AvroSchemaBuilder.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.microsoft.accumulo.spark.record;
19 |
20 | import java.util.Collection;
21 |
22 | import org.apache.avro.Schema;
23 | import org.apache.avro.SchemaBuilder;
24 |
25 | /**
26 | * Builds the AVRO Schema from the user-supplied JSON encoded schema.
27 | */
28 | public class AvroSchemaBuilder {
29 | public static final String PROPERTY_ROWBUILDERTYPE = "rowBuilderType";
30 |
31 | public static final String PROPERTY_OUTPUT = "output";
32 |
33 | private static SchemaBuilder.FieldAssembler<Schema> addAvroField(SchemaBuilder.FieldAssembler<Schema> builder,
34 | RowBuilderField field, String name) {
35 |
36 | RowBuilderType type = field.getRowBuilderType();
37 |
38 | SchemaBuilder.FieldBuilder<Schema> fieldBuilder = builder
39 | // configure the field name
40 | .name(name);
41 |
42 | // pass in alias
43 | if (field.getFilterVariableName() != null && field.getFilterVariableName().length() > 0)
44 | fieldBuilder = fieldBuilder.aliases(field.getFilterVariableName());
45 |
46 | SchemaBuilder.FieldTypeBuilder<Schema> intermediate = fieldBuilder
47 | // encode rowBuilderType so we can operate on the schema alone
48 | .prop(PROPERTY_ROWBUILDERTYPE, type.name())
49 | // encode if this is an output field
50 | .prop(PROPERTY_OUTPUT, Boolean.toString(field.isOutput()))
51 | // all fields are optional
52 | .type();
53 |
54 | if (field.isNullable()) {
55 | SchemaBuilder.BaseTypeBuilder<SchemaBuilder.FieldAssembler<Schema>> optionalType = intermediate.optional();
56 | switch (type) {
57 | case String:
58 | return optionalType.stringType();
59 | case Long:
60 | return optionalType.longType();
61 | case Integer:
62 | return optionalType.intType();
63 | case Double:
64 | return optionalType.doubleType();
65 | case Float:
66 | return optionalType.floatType();
67 | case Boolean:
68 | return optionalType.booleanType();
69 | case Bytes:
70 | return optionalType.bytesType();
71 | default:
72 | throw new IllegalArgumentException("Unsupported type '" + type + "'");
73 | }
74 | } else {
75 | switch (type) {
76 | case String:
77 | return intermediate.stringType().noDefault();
78 | case Long:
79 | return intermediate.longType().noDefault();
80 | case Integer:
81 | return intermediate.intType().noDefault();
82 | case Double:
83 | return intermediate.doubleType().noDefault();
84 | case Float:
85 | return intermediate.floatType().noDefault();
86 | case Boolean:
87 | return intermediate.booleanType().noDefault();
88 | case Bytes:
89 | return intermediate.bytesType().noDefault();
90 | default:
91 | throw new IllegalArgumentException("Unsupported type '" + type + "'");
92 | }
93 | }
94 | }
95 |
96 | private static SchemaBuilder.FieldAssembler<Schema> closeFieldAssembler(
97 | SchemaBuilder.FieldAssembler<Schema> rootAssembler, SchemaBuilder.FieldAssembler<Schema> columnFieldsAssembler,
98 | String columnFamily, boolean output) {
99 |
100 | if (columnFieldsAssembler == null)
101 | return rootAssembler;
102 |
103 | // add nested type to root assembler
104 | return rootAssembler
105 | // name the record field
106 | .name(columnFamily)
107 | // any of the column sub fields need to be output?
108 | .prop(PROPERTY_OUTPUT, Boolean.toString(output))
109 | // it's a record type
110 | .type(columnFieldsAssembler.endRecord()).noDefault();
111 | }
112 |
113 | public static Schema buildSchema(Collection<RowBuilderField> schemaFields) {
114 | // construct schema
115 | SchemaBuilder.FieldAssembler<Schema> rootAssembler = SchemaBuilder.record("root").fields();
116 |
117 | // note that the order needs to be exactly in-sync with the avro schema
118 | // generated on the MMLSpark/Scala side
119 | String lastColumnFamily = null;
120 | SchemaBuilder.FieldAssembler<Schema> columnFieldsAssembler = null;
121 | boolean output = false;
122 | for (RowBuilderField schemaField : schemaFields) {
123 |
124 | String columnFamily = schemaField.getColumnFamily();
125 | String columnQualifier = schemaField.getColumnQualifier();
126 |
127 | if (columnQualifier != null) {
128 | if (lastColumnFamily == null || !lastColumnFamily.equals(columnFamily)) {
129 |
130 | // close previous record
131 | rootAssembler = closeFieldAssembler(rootAssembler, columnFieldsAssembler, lastColumnFamily, output);
132 |
133 | // open new record
134 | columnFieldsAssembler = SchemaBuilder.record(columnFamily).fields();
135 |
136 | output = false;
137 | }
138 |
139 | // true if any of the column qualifiers is an output field
140 | output |= (boolean) schemaField.isOutput();
141 |
142 | // add the current field
143 | columnFieldsAssembler = addAvroField(columnFieldsAssembler, schemaField, columnQualifier);
144 | } else {
145 | // close previous record
146 | rootAssembler = closeFieldAssembler(rootAssembler, columnFieldsAssembler, lastColumnFamily, output);
147 | columnFieldsAssembler = null;
148 | output = false;
149 |
150 | // add the top-level field
151 | rootAssembler = addAvroField(rootAssembler, schemaField, columnFamily);
152 | }
153 |
154 | lastColumnFamily = columnFamily;
155 | }
156 |
157 | rootAssembler = closeFieldAssembler(rootAssembler, columnFieldsAssembler, lastColumnFamily, output);
158 |
159 | // setup serialization
160 | return rootAssembler.endRecord();
161 | }
162 | }
163 |
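A minimal sketch of calling buildSchema, mirroring the RowBuilderField constructor usage in AvroJuelTest further below; printing the schema shows one nested record per column family.

import java.util.Arrays;

import com.microsoft.accumulo.spark.record.AvroSchemaBuilder;
import com.microsoft.accumulo.spark.record.RowBuilderField;
import org.apache.avro.Schema;

public class SchemaBuilderSketch {
  public static void main(String[] args) {
    // same constructor shape as in AvroJuelTest: (columnFamily, columnQualifier, type, filterVariableName)
    Schema schema = AvroSchemaBuilder.buildSchema(Arrays.asList(
        new RowBuilderField("cf1", "cq1", "long", "v0"),
        new RowBuilderField("cf2", "cq2", "double", "v1"),
        new RowBuilderField("cf2", "cq3", "string", "v2")));

    // nested records: one per column family, one field per qualifier
    System.out.println(schema.toString(true));
  }
}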
--------------------------------------------------------------------------------
/connector/datasource/src/main/scala/com/microsoft/accumulo/AccumuloInputPartitionReader.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.microsoft.accumulo
19 |
20 | import java.io.IOException
21 |
22 | import org.apache.accumulo.core.client.{Accumulo, IteratorSetting}
23 | import org.apache.accumulo.core.data.{Key, Range}
24 | import org.apache.accumulo.core.security.Authorizations
25 | import org.apache.avro.generic.GenericRecord
26 | import org.apache.avro.io.{BinaryDecoder, DecoderFactory}
27 | import org.apache.avro.specific.SpecificDatumReader
28 | import org.apache.hadoop.io.Text
29 | import org.apache.log4j.Logger
30 | import org.apache.spark.sql.avro.AvroDeserializer
31 | import org.apache.spark.sql.catalyst.InternalRow
32 | import org.apache.spark.sql.sources.v2.reader.InputPartitionReader
33 | import org.apache.spark.sql.types.StructType
34 | import org.apache.spark.unsafe.types.UTF8String
35 | import scala.collection.JavaConverters._
36 |
37 | @SerialVersionUID(1L)
38 | class AccumuloInputPartitionReader(tableName: String,
39 | ranges: Seq[Seq[Array[Byte]]],
40 | inputSchema: StructType,
41 | outputSchema: StructType,
42 | properties: java.util.Properties,
43 | rowKeyColumn: String,
44 | filterInJuel: Option[String])
45 | extends InputPartitionReader[InternalRow] with Serializable {
46 |
47 | private val logger = Logger.getLogger(classOf[AccumuloInputPartitionReader])
48 |
49 | val defaultPriority = "20"
50 | val defaultNumQueryThreads: String = math.min(16, ranges.length).toString
51 |
52 | private val priority = Integer.valueOf(properties.getProperty("priority", defaultPriority))
53 | // this parameter is impacted by number of accumulo splits and spark partitions and executors
54 | private val numQueryThreads = Integer.valueOf(properties.getProperty("numQueryThreads", defaultNumQueryThreads))
55 |
56 | private val authorizations = new Authorizations()
57 | private val client = Accumulo.newClient().from(properties).build()
58 | private val scanner = client.createBatchScanner(tableName, authorizations, numQueryThreads)
59 |
60 | private def createRange(start: Array[Byte], stop: Array[Byte]) =
61 | new Range(
62 | if (start.length == 0) null else new Key(start),
63 | start.length == 0,
64 | if (stop.length == 0) null else new Key(stop),
65 | true)
66 |
67 | scanner.setRanges(ranges.map(t => createRange(t(0), t(1))).asJava)
68 |
69 | private val avroIterator = new IteratorSetting(
70 | priority,
71 | "AVRO",
72 | "com.microsoft.accumulo.spark.AvroRowEncoderIterator")
73 |
74 | // only fetch the column families we care about (don't filter on the mleapFields, which are added artificially later)
75 | inputSchema.fields.foreach(f => scanner.fetchColumnFamily(f.name))
76 |
77 | private val rowKeyColumnIndex = {
78 | if (outputSchema.fieldNames.contains(rowKeyColumn))
79 | outputSchema.fieldIndex(rowKeyColumn)
80 | else
81 | -1
82 | }
83 |
84 | // AVRO Iterator setup
85 | val jsonSchema: String = AvroUtil.catalystSchemaToJson(inputSchema, outputSchema).json
86 |
87 | logger.info(s"JSON schema: $jsonSchema")
88 | avroIterator.addOption("schema", jsonSchema)
89 | if (filterInJuel.isDefined)
90 | avroIterator.addOption("filter", filterInJuel.get)
91 |
92 | // list of output columns
93 | // val prunedColumns = schema.map(_.name).mkString(",")
94 | // logger.info(s"Pruned columns: ${prunedColumns}")
95 | // avroIterator.addOption("prunedcolumns", prunedColumns)
96 |
97 | // forward options
98 | Seq("mleap", "mleapfilter", "mleapguid", "exceptionlogfile")
99 | .foreach { key => avroIterator.addOption(key, properties.getProperty(key, "")) }
100 |
101 | scanner.addScanIterator(avroIterator)
102 |
103 | // TODO: support additional user-supplied iterators
104 | private val scannerIterator = scanner.iterator()
105 |
106 | // filter out row-key target from schema generation
107 | private val schemaWithoutRowKey = new StructType(outputSchema.fields.filter(_.name != rowKeyColumn))
108 |
109 | // the serialized AVRO does not contain the row key as it comes with the key/value pair anyway
110 | private val avroSchema = AvroUtil.catalystSchemaToAvroSchema(schemaWithoutRowKey)
111 |
112 | // pass the schema for the avro input along with the target output schema (incl. row key)
113 | private val deserializer = new AvroDeserializer(avroSchema, outputSchema)
114 | private val reader = new SpecificDatumReader[GenericRecord](avroSchema)
115 |
116 | private var decoder: BinaryDecoder = _
117 | private var currentRow: InternalRow = _
118 | private var datum: GenericRecord = _
119 |
120 | private val rowKeyText = new Text()
121 |
122 | override def close(): Unit = {
123 | if (scanner != null)
124 | scanner.close()
125 |
126 | if (client != null)
127 | client.close()
128 | }
129 |
130 | @throws[IOException]
131 | override def next: Boolean = {
132 | if (scannerIterator.hasNext) {
133 | val entry = scannerIterator.next
134 | val data = entry.getValue.get
135 |
136 | // byte[] -> avro
137 | decoder = DecoderFactory.get.binaryDecoder(data, decoder)
138 | datum = reader.read(datum, decoder)
139 |
140 | // avro -> catalyst
141 | currentRow = deserializer.deserialize(datum).asInstanceOf[InternalRow]
142 |
143 | if (rowKeyColumnIndex >= 0) {
144 | // move row key id into internalrow
145 | entry.getKey.getRow(rowKeyText)
146 |
147 | // avoid yet another byte array copy...
148 | val str = UTF8String.fromBytes(rowKeyText.getBytes, 0, rowKeyText.getLength)
149 | currentRow.update(rowKeyColumnIndex, str)
150 | }
151 |
152 | true
153 | } else {
154 | false
155 | }
156 | }
157 |
158 | override def get(): InternalRow = currentRow
159 | }
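For reference, a hedged plain-Java sketch of the iterator wiring done by this Scala reader; the properties file, table name, and the empty schema string are placeholders, and in practice the schema option carries the JSON produced by AvroUtil.catalystSchemaToJson.

import java.util.Collections;
import java.util.Map;

import org.apache.accumulo.core.client.Accumulo;
import org.apache.accumulo.core.client.AccumuloClient;
import org.apache.accumulo.core.client.BatchScanner;
import org.apache.accumulo.core.client.IteratorSetting;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.security.Authorizations;

public class AvroIteratorSketch {
  public static void main(String[] args) throws Exception {
    // placeholder client properties file and table name
    try (AccumuloClient client = Accumulo.newClient().from("accumulo-client.properties").build();
         BatchScanner scanner = client.createBatchScanner("mytable", new Authorizations(), 4)) {

      scanner.setRanges(Collections.singleton(new Range()));

      // same priority, name, and iterator class as the Scala reader above
      IteratorSetting avro = new IteratorSetting(20, "AVRO",
          "com.microsoft.accumulo.spark.AvroRowEncoderIterator");
      // in the reader this option carries the JSON schema from AvroUtil.catalystSchemaToJson
      avro.addOption("schema", "[]");
      scanner.addScanIterator(avro);

      for (Map.Entry<Key, Value> entry : scanner) {
        System.out.println(entry.getKey());
      }
    }
  }
}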
--------------------------------------------------------------------------------
/connector/iterator/src/main/java/com/microsoft/accumulo/spark/processors/AvroRowComputedColumns.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.microsoft.accumulo.spark.processors;
19 |
20 | import java.io.IOException;
21 | import java.util.ArrayList;
22 | import java.util.Collection;
23 | import java.util.List;
24 | import java.util.Map;
25 | import java.util.stream.Collectors;
26 |
27 | import javax.el.ExpressionFactory;
28 | import javax.el.ValueExpression;
29 |
30 | import com.microsoft.accumulo.spark.juel.AvroELContext;
31 | import com.microsoft.accumulo.spark.record.RowBuilderField;
32 | import com.microsoft.accumulo.spark.record.RowBuilderType;
33 | import org.apache.avro.Schema;
34 | import org.apache.avro.Schema.Field;
35 | import org.apache.avro.generic.IndexedRecord;
36 | import org.apache.hadoop.io.Text;
37 |
38 | /**
39 | * Holds all computed columns.
40 | * Note: it's a bit convoluted as we first have to parse the options to figure
41 | * out which additional columns we have, return to the caller so the AVRO
42 | * schema can be set up, and then continue the setup here.
43 | */
44 | public class AvroRowComputedColumns extends AvroRowConsumer {
45 | public static final String COLUMN_PREFIX = "column.";
46 |
47 | /**
48 | * Required for copy.
49 | */
50 | private Schema schema;
51 |
52 | /**
53 | * JUEL expression context exposing AVRO GenericRecord
54 | */
55 | private AvroELContext expressionContext;
56 |
57 | /**
58 | * Definitions created from user-supplied options.
59 | */
60 | private List<ExpressionColumnDefinition> expressionColumnDefinitions;
61 |
62 | /**
63 | * The executable column expressions.
64 | */
65 | private List<ExpressionColumn> expressionColumns;
66 |
67 | /**
68 | * Just the definition of the expression. Need to collect them all first so the
69 | * AVRO schema can be built.
70 | */
71 | static class ExpressionColumnDefinition {
72 | private RowBuilderField schemaField;
73 |
74 | private String expression;
75 |
76 | public ExpressionColumnDefinition(RowBuilderField schemaField, String expression) {
77 | this.schemaField = schemaField;
78 | this.expression = expression;
79 | }
80 |
81 | public RowBuilderField getSchemaField() {
82 | return schemaField;
83 | }
84 |
85 | public String getExpression() {
86 | return expression;
87 | }
88 | }
89 |
90 | /**
91 | * The fully initialized expression ready to be computed.
92 | */
93 | class ExpressionColumn {
94 | private ValueExpression columnExpression;
95 |
96 | private int pos;
97 |
98 | public ExpressionColumn(ValueExpression columnExpression, int pos) {
99 | this.columnExpression = columnExpression;
100 | this.pos = pos;
101 | }
102 |
103 | public void setFieldValue(IndexedRecord record) {
104 | Object value = this.columnExpression.getValue(AvroRowComputedColumns.this.expressionContext);
105 | record.put(this.pos, value);
106 | }
107 | }
108 |
109 | /**
110 | * Factory method creating the row processor if valid options are supplied or
111 | * null if none are found.
112 | */
113 | public static AvroRowComputedColumns create(Map<String, String> options) {
114 | // expression setup
115 | // options: column.<name>.<type> -> JUEL expression
116 | List<ExpressionColumnDefinition> expressionColumnDefinitions = new ArrayList<>();
117 |
118 | for (Map.Entry<String, String> entry : options.entrySet()) {
119 | if (!entry.getKey().startsWith(COLUMN_PREFIX))
120 | continue;
121 |
122 | String[] arr = entry.getKey().split("\\.");
123 | if (arr.length != 3)
124 | throw new IllegalArgumentException(
125 | "Unable to parse column specification. column..: " + entry.getKey());
126 |
127 | String column = arr[1];
128 | String type = RowBuilderType.valueOfIgnoreCase(arr[2]).name();
129 | String expression = entry.getValue();
130 | RowBuilderField schemaField = new RowBuilderField(column, null, type, column);
131 |
132 | expressionColumnDefinitions.add(new ExpressionColumnDefinition(schemaField, expression));
133 | }
134 |
135 | return expressionColumnDefinitions.isEmpty() ? null : new AvroRowComputedColumns(expressionColumnDefinitions);
136 | }
137 |
138 | private AvroRowComputedColumns(List<ExpressionColumnDefinition> expressionColumnDefinitions) {
139 | this.expressionColumnDefinitions = expressionColumnDefinitions;
140 | }
141 |
142 | /**
143 | *
144 | * @return a collection of RowBuilderFields based on the column expression
145 | * definitions.
146 | */
147 | @Override
148 | public Collection getSchemaFields() {
149 | return this.expressionColumnDefinitions.stream().map(ExpressionColumnDefinition::getSchemaField)
150 | .collect(Collectors.toList());
151 | }
152 |
153 | /**
154 | * Initialize the columns expression. Can't be done in the constructor as the
155 | * schema wasn't ready.
156 | *
157 | * @param schema the AVRO input schema.
158 | */
159 | @Override
160 | public void initialize(Schema schema) {
161 | this.schema = schema;
162 | this.expressionContext = new AvroELContext(schema);
163 |
164 | ExpressionFactory factory = ExpressionFactory.newInstance();
165 |
166 | this.expressionColumns = this.expressionColumnDefinitions.stream().map(expr -> {
167 | Field field = schema.getField(expr.getSchemaField().getColumnFamily());
168 |
169 | RowBuilderType type = expr.getSchemaField().getRowBuilderType();
170 | ValueExpression columnExpression = factory.createValueExpression(expressionContext, expr.getExpression(),
171 | type.getJavaClass());
172 |
173 | return new ExpressionColumn(columnExpression, field.pos());
174 | }).collect(Collectors.toList());
175 | }
176 |
177 | @Override
178 | protected boolean consumeInternal(Text rowKey, IndexedRecord record) throws IOException {
179 | this.expressionContext.setCurrent(rowKey, record);
180 |
181 | // compute each expression
182 | for (ExpressionColumn expr : this.expressionColumns)
183 | expr.setFieldValue(record);
184 |
185 | return true;
186 | }
187 |
188 | @Override
189 | public AvroRowConsumer clone() {
190 | AvroRowComputedColumns copy = new AvroRowComputedColumns(this.expressionColumnDefinitions);
191 |
192 | copy.initialize(this.schema);
193 |
194 | return copy;
195 | }
196 | }
197 |
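A minimal sketch of the option format consumed by create(): the column name and expression are hypothetical, the type segment has to resolve to a RowBuilderType, and the cf.cq style variables follow the JUEL usage shown in AvroJuelTest.

import java.util.HashMap;
import java.util.Map;

import com.microsoft.accumulo.spark.processors.AvroRowComputedColumns;

public class ComputedColumnsSketch {
  public static void main(String[] args) {
    Map<String, String> options = new HashMap<>();

    // key format: column.<name>.<type>, value: a JUEL expression over cf.cq variables
    // (hypothetical column name and expression; "double" must resolve to a RowBuilderType)
    options.put("column.total.double", "${cf1.cq1 + cf2.cq2}");

    // create() returns null when no column.* options are present
    AvroRowComputedColumns computed = AvroRowComputedColumns.create(options);
    System.out.println(computed != null);
  }
}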
--------------------------------------------------------------------------------
/connector/datasource/src/main/scala/com/microsoft/accumulo/AccumuloDataSourceReader.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.microsoft.accumulo
19 |
20 | import org.apache.accumulo.core.client.Accumulo
21 | import org.apache.spark.sql.catalyst.InternalRow
22 | import org.apache.spark.sql.sources.Filter
23 | import org.apache.spark.sql.sources.v2.DataSourceOptions
24 | import org.apache.spark.sql.sources.v2.reader.{DataSourceReader, InputPartition, InputPartitionReader}
25 | import org.apache.spark.sql.types.{DataTypes, StructType}
26 | import scala.collection.JavaConverters._
27 | import scala.collection.mutable.ArrayBuffer
28 | import org.apache.log4j.Logger
29 | import java.util.UUID
30 |
31 | // TODO: https://github.com/apache/spark/blob/053dd858d38e6107bc71e0aa3a4954291b74f8c8/sql/catalyst/src/main/java/org/apache/spark/sql/connector/read/SupportsReportPartitioning.java
32 | // in head of spark github repo
33 | // import org.apache.spark.sql.connector.read.{SupportsPushDownFilters, SupportsPushDownRequiredColumns}
34 | import org.apache.spark.sql.sources.v2.reader.{SupportsPushDownFilters, SupportsPushDownRequiredColumns}
35 |
36 |
37 | @SerialVersionUID(1L)
38 | class AccumuloDataSourceReader(schema: StructType, options: DataSourceOptions)
39 | extends DataSourceReader with Serializable with SupportsPushDownRequiredColumns with SupportsPushDownFilters {
40 | private val logger = Logger.getLogger(classOf[AccumuloDataSourceReader])
41 |
42 | private val defaultMaxPartitions = 200
43 |
44 | var filters = Array.empty[Filter]
45 |
46 | val rowKeyColumn: String = options.get("rowkey").orElse("rowkey")
47 | val schemaWithOutRowKey = new StructType(schema.filter { _.name != rowKeyColumn }.toArray)
48 |
49 | // initialize output schema with full schema
50 | private var requiredSchema = {
51 | // adding rowKey
52 | val baseSchema = schemaWithOutRowKey.add(rowKeyColumn, DataTypes.StringType, nullable = true)
53 |
54 | // add any output fields we find in a mleap pipeline
55 | val mleapFields = MLeapUtil.mleapSchemaToCatalyst(options.get("mleap").orElse(""))
56 |
57 | StructType(baseSchema ++ mleapFields)
58 | }
59 |
60 | private var filterInJuel: Option[String] = None
61 |
62 | override def pruneColumns(requiredSchema: StructType): Unit = {
63 | this.requiredSchema = requiredSchema
64 | }
65 |
66 | def readSchema: StructType = requiredSchema
67 |
68 | override def pushFilters(filters: Array[Filter]): Array[Filter] = {
69 | // unfortunately predicates on nested elements are not pushed down by Spark
70 | // https://issues.apache.org/jira/browse/SPARK-17636
71 | // https://github.com/apache/spark/pull/22535
72 |
73 | val jsonSchema = AvroUtil.catalystSchemaToJson(schemaWithOutRowKey)
74 | val result = new FilterToJuel(jsonSchema.attributeToVariableMapping, rowKeyColumn)
75 | .serializeFilters(filters, options.get("filter").orElse(""))
76 |
77 | this.filters = result.supportedFilters.toArray
78 |
79 | if (result.serializedFilter.length > 0) {
80 | this.filterInJuel = Some("${" + result.serializedFilter + "}")
81 | logger.info(s"JUEL filter: ${this.filterInJuel}")
82 | }
83 |
84 | result.unsupportedFilters.toArray
85 | }
86 |
87 | override def pushedFilters(): Array[Filter] = filters
88 |
89 | def planInputPartitions: java.util.List[InputPartition[InternalRow]] = {
90 | val tableName = options.tableName.get
91 | val maxPartitions = options.getInt("maxPartitions", defaultMaxPartitions)
92 | val properties = new java.util.Properties()
93 | // can't use .putAll(options.asMap()) due to https://github.com/scala/bug/issues/10418
94 | options.asMap.asScala.foreach { case (k, v) => properties.setProperty(k, v) }
95 |
96 | // pass GUID to iterator so we can perform fast cache lookup
97 | // needs to be done on the head node so that all have the same guid
98 | properties.setProperty("mleapguid", UUID.randomUUID.toString)
99 |
100 | val splits = ArrayBuffer(Array.empty[Byte], Array.empty[Byte])
101 |
102 | val client = Accumulo.newClient().from(properties).build()
103 | // it's possible to merge on the accumulo side
104 | // val tableSplits = client.tableOperations().listSplits(tableName, maxPartitions)
105 | val tableSplits = try {
106 | client.tableOperations().listSplits(tableName)
107 | }
108 | finally {
109 | client.close()
110 | }
111 |
112 | // on deployed clusters a table with no splits will return a single empty Text instance
113 | val containsSingleEmptySplit =
114 | tableSplits.size == 1 &&
115 | tableSplits.iterator.next.getLength == 0
116 |
117 | if (tableSplits.size > 1 || !containsSingleEmptySplit)
118 | splits.insertAll(1, tableSplits.asScala.map(_.getBytes))
119 |
120 | // convert adjacent split pairs to ranges, e.g. [empty, a, b, empty] -> (empty,a), (a,b), (b,empty)
121 | var ranges = splits.sliding(2).toSeq
122 |
123 | // optionally shuffle
124 | if (options.getBoolean("shuffle.ranges", true))
125 | ranges = scala.util.Random.shuffle(ranges)
126 |
127 | // create groups of ranges
128 | val numReaders = scala.math.min(ranges.length, maxPartitions)
129 | val batchSize = ranges.length / numReaders
130 | val batchRanges = ranges.sliding(batchSize, batchSize)
131 |
132 | logger.info(s"Splits '$batchRanges' creating $numReaders readers")
133 |
134 | val partitionReaders = batchRanges.map(r => new PartitionReaderFactory(tableName, r,
135 | schemaWithOutRowKey, requiredSchema, properties, rowKeyColumn, filterInJuel))
136 | .toSeq.asJava
137 | 
138 | new java.util.ArrayList[InputPartition[InternalRow]](partitionReaders)
139 | }
140 | }
141 |
142 | class PartitionReaderFactory(tableName: String,
143 | ranges: Seq[Seq[Array[Byte]]],
144 | inputSchema: StructType,
145 | outputSchema: StructType,
146 | properties: java.util.Properties,
147 | rowKeyColumn: String,
148 | filterInJuel: Option[String])
149 | extends InputPartition[InternalRow] {
150 |
151 | def createPartitionReader: InputPartitionReader[InternalRow] = {
152 |
153 | Logger.getLogger(classOf[AccumuloDataSourceReader]).info(s"Partition reader for $ranges")
154 |
155 | new AccumuloInputPartitionReader(tableName, ranges, inputSchema, outputSchema, properties, rowKeyColumn, filterInJuel)
156 | }
157 |
158 | // override def preferredLocations(): Array[String] = Array("ab", "c")
159 | }
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 | ##
4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
5 |
6 | # User-specific files
7 | *.suo
8 | *.user
9 | *.userosscache
10 | *.sln.docstates
11 |
12 | # User-specific files (MonoDevelop/Xamarin Studio)
13 | *.userprefs
14 |
15 | # Build results
16 | [Dd]ebug/
17 | [Dd]ebugPublic/
18 | [Rr]elease/
19 | [Rr]eleases/
20 | x64/
21 | x86/
22 | bld/
23 | [Bb]in/
24 | [Oo]bj/
25 | [Ll]og/
26 |
27 | # Visual Studio 2015/2017 cache/options directory
28 | .vs/
29 | # Uncomment if you have tasks that create the project's static files in wwwroot
30 | #wwwroot/
31 |
32 | # Visual Studio 2017 auto generated files
33 | Generated\ Files/
34 |
35 | # MSTest test Results
36 | [Tt]est[Rr]esult*/
37 | [Bb]uild[Ll]og.*
38 |
39 | # NUNIT
40 | *.VisualState.xml
41 | TestResult.xml
42 |
43 | # Build Results of an ATL Project
44 | [Dd]ebugPS/
45 | [Rr]eleasePS/
46 | dlldata.c
47 |
48 | # Benchmark Results
49 | BenchmarkDotNet.Artifacts/
50 |
51 | # .NET Core
52 | project.lock.json
53 | project.fragment.lock.json
54 | artifacts/
55 | **/Properties/launchSettings.json
56 |
57 | # StyleCop
58 | StyleCopReport.xml
59 |
60 | # Files built by Visual Studio
61 | *_i.c
62 | *_p.c
63 | *_i.h
64 | *.ilk
65 | *.meta
66 | *.obj
67 | *.iobj
68 | *.pch
69 | *.pdb
70 | *.ipdb
71 | *.pgc
72 | *.pgd
73 | *.rsp
74 | *.sbr
75 | *.tlb
76 | *.tli
77 | *.tlh
78 | *.tmp
79 | *.tmp_proj
80 | *.log
81 | *.vspscc
82 | *.vssscc
83 | .builds
84 | *.pidb
85 | *.svclog
86 | *.scc
87 |
88 | # Chutzpah Test files
89 | _Chutzpah*
90 |
91 | # Visual C++ cache files
92 | ipch/
93 | *.aps
94 | *.ncb
95 | *.opendb
96 | *.opensdf
97 | *.sdf
98 | *.cachefile
99 | *.VC.db
100 | *.VC.VC.opendb
101 |
102 | # Visual Studio profiler
103 | *.psess
104 | *.vsp
105 | *.vspx
106 | *.sap
107 |
108 | # Visual Studio Trace Files
109 | *.e2e
110 |
111 | # TFS 2012 Local Workspace
112 | $tf/
113 |
114 | # Guidance Automation Toolkit
115 | *.gpState
116 |
117 | # ReSharper is a .NET coding add-in
118 | _ReSharper*/
119 | *.[Rr]e[Ss]harper
120 | *.DotSettings.user
121 |
122 | # JustCode is a .NET coding add-in
123 | .JustCode
124 |
125 | # TeamCity is a build add-in
126 | _TeamCity*
127 |
128 | # DotCover is a Code Coverage Tool
129 | *.dotCover
130 |
131 | # AxoCover is a Code Coverage Tool
132 | .axoCover/*
133 | !.axoCover/settings.json
134 |
135 | # Visual Studio code coverage results
136 | *.coverage
137 | *.coveragexml
138 |
139 | # NCrunch
140 | _NCrunch_*
141 | .*crunch*.local.xml
142 | nCrunchTemp_*
143 |
144 | # MightyMoose
145 | *.mm.*
146 | AutoTest.Net/
147 |
148 | # Web workbench (sass)
149 | .sass-cache/
150 |
151 | # Installshield output folder
152 | [Ee]xpress/
153 |
154 | # DocProject is a documentation generator add-in
155 | DocProject/buildhelp/
156 | DocProject/Help/*.HxT
157 | DocProject/Help/*.HxC
158 | DocProject/Help/*.hhc
159 | DocProject/Help/*.hhk
160 | DocProject/Help/*.hhp
161 | DocProject/Help/Html2
162 | DocProject/Help/html
163 |
164 | # Click-Once directory
165 | publish/
166 |
167 | # Publish Web Output
168 | *.[Pp]ublish.xml
169 | *.azurePubxml
170 | # Note: Comment the next line if you want to checkin your web deploy settings,
171 | # but database connection strings (with potential passwords) will be unencrypted
172 | *.pubxml
173 | *.publishproj
174 |
175 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
176 | # checkin your Azure Web App publish settings, but sensitive information contained
177 | # in these scripts will be unencrypted
178 | PublishScripts/
179 |
180 | # NuGet Packages
181 | *.nupkg
182 | # The packages folder can be ignored because of Package Restore
183 | **/[Pp]ackages/*
184 | # except build/, which is used as an MSBuild target.
185 | !**/[Pp]ackages/build/
186 | # Uncomment if necessary however generally it will be regenerated when needed
187 | #!**/[Pp]ackages/repositories.config
188 | # NuGet v3's project.json files produces more ignorable files
189 | *.nuget.props
190 | *.nuget.targets
191 |
192 | # Microsoft Azure Build Output
193 | csx/
194 | *.build.csdef
195 |
196 | # Microsoft Azure Emulator
197 | ecf/
198 | rcf/
199 |
200 | # Windows Store app package directories and files
201 | AppPackages/
202 | BundleArtifacts/
203 | Package.StoreAssociation.xml
204 | _pkginfo.txt
205 | *.appx
206 |
207 | # Visual Studio cache files
208 | # files ending in .cache can be ignored
209 | *.[Cc]ache
210 | # but keep track of directories ending in .cache
211 | !*.[Cc]ache/
212 |
213 | # Others
214 | ClientBin/
215 | ~$*
216 | *~
217 | *.dbmdl
218 | *.dbproj.schemaview
219 | *.jfm
220 | *.pfx
221 | *.publishsettings
222 | orleans.codegen.cs
223 |
224 | # Including strong name files can present a security risk
225 | # (https://github.com/github/gitignore/pull/2483#issue-259490424)
226 | #*.snk
227 |
228 | # Since there are multiple workflows, uncomment next line to ignore bower_components
229 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
230 | #bower_components/
231 |
232 | # RIA/Silverlight projects
233 | Generated_Code/
234 |
235 | # Backup & report files from converting an old project file
236 | # to a newer Visual Studio version. Backup files are not needed,
237 | # because we have git ;-)
238 | _UpgradeReport_Files/
239 | Backup*/
240 | UpgradeLog*.XML
241 | UpgradeLog*.htm
242 | ServiceFabricBackup/
243 | *.rptproj.bak
244 |
245 | # SQL Server files
246 | *.mdf
247 | *.ldf
248 | *.ndf
249 |
250 | # Business Intelligence projects
251 | *.rdl.data
252 | *.bim.layout
253 | *.bim_*.settings
254 | *.rptproj.rsuser
255 |
256 | # Microsoft Fakes
257 | FakesAssemblies/
258 |
259 | # GhostDoc plugin setting file
260 | *.GhostDoc.xml
261 |
262 | # Node.js Tools for Visual Studio
263 | .ntvs_analysis.dat
264 | node_modules/
265 |
266 | # Visual Studio 6 build log
267 | *.plg
268 |
269 | # Visual Studio 6 workspace options file
270 | *.opt
271 |
272 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
273 | *.vbw
274 |
275 | # Visual Studio LightSwitch build output
276 | **/*.HTMLClient/GeneratedArtifacts
277 | **/*.DesktopClient/GeneratedArtifacts
278 | **/*.DesktopClient/ModelManifest.xml
279 | **/*.Server/GeneratedArtifacts
280 | **/*.Server/ModelManifest.xml
281 | _Pvt_Extensions
282 |
283 | # Paket dependency manager
284 | .paket/paket.exe
285 | paket-files/
286 |
287 | # FAKE - F# Make
288 | .fake/
289 |
290 | # JetBrains Rider
291 | .idea/
292 | *.sln
293 | *.iml
294 |
295 | # CodeRush
296 | .cr/
297 |
298 | # Python Tools for Visual Studio (PTVS)
299 | __pycache__/
300 | *.pyc
301 |
302 | # Cake - Uncomment if you are using it
303 | # tools/**
304 | # !tools/packages.config
305 |
306 | # Tabs Studio
307 | *.tss
308 |
309 | # Telerik's JustMock configuration file
310 | *.jmconfig
311 |
312 | # BizTalk build output
313 | *.btp.cs
314 | *.btm.cs
315 | *.odx.cs
316 | *.xsd.cs
317 |
318 | # OpenCover UI analysis results
319 | OpenCover/
320 |
321 | # Azure Stream Analytics local run output
322 | ASALocalRun/
323 |
324 | # MSBuild Binary and Structured Log
325 | *.binlog
326 |
327 | # NVidia Nsight GPU debugger configuration file
328 | *.nvuser
329 |
330 | # MFractors (Xamarin productivity tool) working folder
331 | .mfractor/
332 |
333 | .project
334 | .classpath
335 | .vscode
336 | target/
337 | .settings/
338 | .ipynb_checkpoints/
--------------------------------------------------------------------------------
/connector/iterator/src/test/java/com/microsoft/accumulo/spark/AvroJuelTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.microsoft.accumulo.spark;
19 |
20 | import java.util.Arrays;
21 |
22 | import javax.el.ExpressionFactory;
23 | import javax.el.ValueExpression;
24 |
25 | import com.microsoft.accumulo.spark.juel.AvroELContext;
26 | import com.microsoft.accumulo.spark.record.AvroSchemaBuilder;
27 | import com.microsoft.accumulo.spark.record.RowBuilderField;
28 | import org.apache.avro.Schema;
29 | import org.apache.avro.generic.GenericRecordBuilder;
30 | import org.apache.hadoop.io.Text;
31 | import org.junit.Test;
32 |
33 | import junit.framework.TestCase;
34 |
35 | public class AvroJuelTest extends TestCase {
36 |
37 | private AvroELContext context;
38 | private ExpressionFactory factory;
39 | private Schema schema;
40 |
41 | @Override
42 | public void setUp() throws Exception {
43 | factory = ExpressionFactory.newInstance();
44 |
45 | RowBuilderField[] schemaMappingFields = new RowBuilderField[] {
46 | // row 0
47 | new RowBuilderField("cf1", "cq1", "long", "v0"),
48 | // row 1
49 | new RowBuilderField("cf2", "cq2", "double", "v1"),
50 | // row 2
51 | new RowBuilderField("cf2", "cq3", "string", "v2") };
52 |
53 | schema = AvroSchemaBuilder.buildSchema(Arrays.asList(schemaMappingFields));
54 |
55 | context = new AvroELContext(schema);
56 | }
57 |
58 | private void setRecordValues(String rowKey, long cq1, double cq2, String cq3) {
59 | GenericRecordBuilder cf1RecordBuilder = new GenericRecordBuilder(schema.getField("cf1").schema());
60 | GenericRecordBuilder cf2RecordBuilder = new GenericRecordBuilder(schema.getField("cf2").schema());
61 |
62 | cf1RecordBuilder.set("cq1", cq1);
63 | cf2RecordBuilder.set("cq2", cq2);
64 | cf2RecordBuilder.set("cq3", cq3);
65 |
66 | GenericRecordBuilder rootRecordBuilder = new GenericRecordBuilder(schema);
67 | rootRecordBuilder.set("cf1", cf1RecordBuilder.build());
68 | rootRecordBuilder.set("cf2", cf2RecordBuilder.build());
69 |
70 | context.setCurrent(new Text(rowKey), rootRecordBuilder.build());
71 | }
72 |
73 | @Test
74 | public void testVariableExpressions() {
75 | ValueExpression exprV0 = factory.createValueExpression(context, "${v0}", long.class);
76 |
77 | // set the values after the expression is created
78 | setRecordValues("key1", 3L, 2.0, "");
79 | assertEquals(3L, exprV0.getValue(context));
80 |
81 | // test if we can reset it
82 | setRecordValues("key1", 4L, 2.5, "");
83 | assertEquals(4L, exprV0.getValue(context));
84 |
85 | // check for the second variable
86 | ValueExpression exprV1 = factory.createValueExpression(context, "${v1}", double.class);
87 | assertEquals(2.5, exprV1.getValue(context));
88 | }
89 |
90 | @Test
91 | public void testVariableConditions() {
92 | ValueExpression expr = factory.createValueExpression(context, "${v0 > 2.1 && v1 < 3}", boolean.class);
93 |
94 | setRecordValues("key1", 3L, 2.0, "");
95 |
96 | assertTrue((boolean) expr.getValue(context));
97 | }
98 |
99 | @Test
100 | public void testStringEndsWith() {
101 | ValueExpression expr = factory.createValueExpression(context, "${v2.endsWith('test')}", boolean.class);
102 | setRecordValues("key1", 3L, 2.0, "This is a test");
103 | assertTrue((boolean) expr.getValue(context));
104 |
105 | expr = factory.createValueExpression(context, "${!v2.endsWith('foo')}", boolean.class);
106 | assertTrue((boolean) expr.getValue(context));
107 | }
108 |
109 | @Test
110 | public void testStringStartsWith() {
111 | ValueExpression expr = factory.createValueExpression(context, "${v2.startsWith('This')}", boolean.class);
112 | setRecordValues("key1", 3L, 2.0, "This is a test");
113 | assertTrue((boolean) expr.getValue(context));
114 |
115 | expr = factory.createValueExpression(context, "${!v2.startsWith('this')}", boolean.class);
116 | assertTrue((boolean) expr.getValue(context));
117 | }
118 |
119 | @Test
120 | public void testStringContains() {
121 | ValueExpression expr = factory.createValueExpression(context, "${v2.contains('is')}", boolean.class);
122 | setRecordValues("key1", 3L, 2.0, "This is a test");
123 | assertTrue((boolean) expr.getValue(context));
124 |
125 | expr = factory.createValueExpression(context, "${!v2.contains('IS')}", boolean.class);
126 | assertTrue((boolean) expr.getValue(context));
127 | }
128 |
129 | @Test
130 | public void testStringIn() {
131 | ValueExpression expr = factory.createValueExpression(context, "${v2.in('a','b','c')}", boolean.class);
132 | setRecordValues("key1", 3L, 2.0, "b");
133 | assertTrue((boolean) expr.getValue(context));
134 | }
135 |
136 | @Test
137 | public void testIntIn() {
138 | ValueExpression expr = factory.createValueExpression(context, "${v0.in(0, 1, 3)}", boolean.class);
139 | setRecordValues("key1", 3L, 2.0, "b");
140 | assertTrue((boolean) expr.getValue(context));
141 |
142 | expr = factory.createValueExpression(context, "${v0.in(0, 1)}", boolean.class);
143 | setRecordValues("key1", 3L, 2.0, "b");
144 | assertFalse((boolean) expr.getValue(context));
145 | }
146 |
147 | @Test
148 | public void testStringQuoteEscape() {
149 | ValueExpression expr = factory.createValueExpression(context, "${v2 == 'a\\'bc'}", boolean.class);
150 | setRecordValues("key1", 3L, 2.0, "a'bc");
151 | assertTrue((boolean) expr.getValue(context));
152 | }
153 |
154 | @Test
155 | public void testStringDoubleQuoteEscape() {
156 | ValueExpression expr = factory.createValueExpression(context, "${v2 == 'a\"bc'}", boolean.class);
157 | setRecordValues("key1", 3L, 2.0, "a\"bc");
158 | assertTrue((boolean) expr.getValue(context));
159 | }
160 |
161 | @Test
162 | public void testStringBackslash() {
163 | ValueExpression expr = factory.createValueExpression(context, "${v2 == 'a\\\\bc'}", boolean.class);
164 | setRecordValues("key1", 3L, 2.0, "a\\bc");
165 | assertTrue((boolean) expr.getValue(context));
166 | }
167 |
168 | @Test
169 | public void testRowKey() {
170 | ValueExpression expr = factory.createValueExpression(context, "${rowKey == 'key1'}", boolean.class);
171 | setRecordValues("key1", 3L, 2.0, "abc");
172 | assertTrue((boolean) expr.getValue(context));
173 |
174 | setRecordValues("key2", 3L, 2.0, "abc");
175 | assertFalse((boolean) expr.getValue(context));
176 | }
177 |
178 | @Test
179 | public void testObjectPropertyBased() {
180 | ValueExpression expr = factory.createValueExpression(context, "${cf1.cq1 == 3}", boolean.class);
181 | setRecordValues("key1", 3L, 2.0, "abc");
182 | assertTrue((boolean) expr.getValue(context));
183 | }
184 |
185 | @Test
186 | public void testColumnRemapping() {
187 | ValueExpression expr = factory.createValueExpression(context, "${(cf1.cq1 + 1)/2.0}", Object.class);
188 |
189 | setRecordValues("key1", 3L, 2.0, "abc");
190 |
191 | assertEquals((3 + 1) / 2.0, expr.getValue(context));
192 | }
193 | }
194 |
--------------------------------------------------------------------------------
/connector/zipfs/src/main/java/com/microsoft/accumulo/zipfs/ByteArrayChannel.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 | *
5 | * This code is free software; you can redistribute it and/or modify it
6 | * under the terms of the GNU General Public License version 2 only, as
7 | * published by the Free Software Foundation. Oracle designates this
8 | * particular file as subject to the "Classpath" exception as provided
9 | * by Oracle in the LICENSE file that accompanied this code.
10 | *
11 | * This code is distributed in the hope that it will be useful, but WITHOUT
12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 | * version 2 for more details (a copy is included in the LICENSE file that
15 | * accompanied this code).
16 | *
17 | * You should have received a copy of the GNU General Public License version
18 | * 2 along with this work; if not, write to the Free Software Foundation,
19 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 | *
21 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 | * or visit www.oracle.com if you need additional information or have any
23 | * questions.
24 | */
25 |
26 | package com.microsoft.accumulo.zipfs;
27 |
28 | import java.io.IOException;
29 | import java.nio.ByteBuffer;
30 | import java.nio.channels.ClosedChannelException;
31 | import java.nio.channels.NonWritableChannelException;
32 | import java.nio.channels.SeekableByteChannel;
33 | import java.util.Arrays;
34 | import java.util.concurrent.locks.ReadWriteLock;
35 | import java.util.concurrent.locks.ReentrantReadWriteLock;
36 |
37 | public class ByteArrayChannel implements SeekableByteChannel {
38 |
39 | private final ReadWriteLock rwlock = new ReentrantReadWriteLock();
40 | private byte buf[];
41 |
42 | /*
43 | * The current position of this channel.
44 | */
45 | private int pos;
46 |
47 | /*
48 | * The index that is one greater than the last valid byte in the channel.
49 | */
50 | private int last;
51 |
52 | private boolean closed;
53 | private boolean readonly;
54 |
55 | /*
56 | * Creates a {@code ByteArrayChannel} with size {@code sz}.
57 | */
58 | ByteArrayChannel(int sz, boolean readonly) {
59 | this.buf = new byte[sz];
60 | this.pos = this.last = 0;
61 | this.readonly = readonly;
62 | }
63 |
64 | /*
65 | * Creates a ByteArrayChannel with its 'pos' at 0 and its 'last' at buf's end.
66 | * Note: no defensive copy of the 'buf', used directly.
67 | */
68 | ByteArrayChannel(byte[] buf, boolean readonly) {
69 | this.buf = buf;
70 | this.pos = 0;
71 | this.last = buf.length;
72 | this.readonly = readonly;
73 | }
74 |
75 | @Override
76 | public boolean isOpen() {
77 | return !closed;
78 | }
79 |
80 | @Override
81 | public long position() throws IOException {
82 | beginRead();
83 | try {
84 | ensureOpen();
85 | return pos;
86 | } finally {
87 | endRead();
88 | }
89 | }
90 |
91 | @Override
92 | public SeekableByteChannel position(long pos) throws IOException {
93 | beginWrite();
94 | try {
95 | ensureOpen();
96 | if (pos < 0 || pos >= Integer.MAX_VALUE)
97 | throw new IllegalArgumentException("Illegal position " + pos);
98 | this.pos = Math.min((int)pos, last);
99 | return this;
100 | } finally {
101 | endWrite();
102 | }
103 | }
104 |
105 | @Override
106 | public int read(ByteBuffer dst) throws IOException {
107 | beginWrite();
108 | try {
109 | ensureOpen();
110 | if (pos == last)
111 | return -1;
112 | int n = Math.min(dst.remaining(), last - pos);
113 | dst.put(buf, pos, n);
114 | pos += n;
115 | return n;
116 | } finally {
117 | endWrite();
118 | }
119 | }
120 |
121 | @Override
122 | public SeekableByteChannel truncate(long size) throws IOException {
123 | if (readonly)
124 | throw new NonWritableChannelException();
125 | ensureOpen();
126 | throw new UnsupportedOperationException();
127 | }
128 |
129 | @Override
130 | public int write(ByteBuffer src) throws IOException {
131 | if (readonly)
132 | throw new NonWritableChannelException();
133 | beginWrite();
134 | try {
135 | ensureOpen();
136 | int n = src.remaining();
137 | ensureCapacity(pos + n);
138 | src.get(buf, pos, n);
139 | pos += n;
140 | if (pos > last) {
141 | last = pos;
142 | }
143 | return n;
144 | } finally {
145 | endWrite();
146 | }
147 | }
148 |
149 | @Override
150 | public long size() throws IOException {
151 | beginRead();
152 | try {
153 | ensureOpen();
154 | return last;
155 | } finally {
156 | endRead();
157 | }
158 | }
159 |
160 | @Override
161 | public void close() throws IOException {
162 | if (closed)
163 | return;
164 | beginWrite();
165 | try {
166 | closed = true;
167 | buf = null;
168 | pos = 0;
169 | last = 0;
170 | } finally {
171 | endWrite();
172 | }
173 | }
174 |
175 | /**
176 | * Creates a newly allocated byte array. Its size is the current
177 | * size of this channel and the valid contents of the buffer
178 | * have been copied into it.
179 | *
180 | * @return the current contents of this channel, as a byte array.
181 | */
182 | public byte[] toByteArray() {
183 | beginRead();
184 | try {
185 | // avoid copy if last == buf.length?
186 | return Arrays.copyOf(buf, last);
187 | } finally {
188 | endRead();
189 | }
190 | }
191 |
192 | private void ensureOpen() throws IOException {
193 | if (closed)
194 | throw new ClosedChannelException();
195 | }
196 |
197 | private final void beginWrite() {
198 | rwlock.writeLock().lock();
199 | }
200 |
201 | private final void endWrite() {
202 | rwlock.writeLock().unlock();
203 | }
204 |
205 | private final void beginRead() {
206 | rwlock.readLock().lock();
207 | }
208 |
209 | private final void endRead() {
210 | rwlock.readLock().unlock();
211 | }
212 |
213 | private void ensureCapacity(int minCapacity) {
214 | // overflow-conscious code
215 | if (minCapacity - buf.length > 0) {
216 | grow(minCapacity);
217 | }
218 | }
219 |
220 | /**
221 | * The maximum size of array to allocate.
222 | * Some VMs reserve some header words in an array.
223 | * Attempts to allocate larger arrays may result in
224 | * OutOfMemoryError: Requested array size exceeds VM limit
225 | */
226 | private static final int MAX_ARRAY_SIZE = Integer.MAX_VALUE - 8;
227 |
228 | /**
229 | * Increases the capacity to ensure that it can hold at least the
230 | * number of elements specified by the minimum capacity argument.
231 | *
232 | * @param minCapacity the desired minimum capacity
233 | */
234 | private void grow(int minCapacity) {
235 | // overflow-conscious code
236 | int oldCapacity = buf.length;
237 | int newCapacity = oldCapacity << 1;
238 | if (newCapacity - minCapacity < 0)
239 | newCapacity = minCapacity;
240 | if (newCapacity - MAX_ARRAY_SIZE > 0)
241 | newCapacity = hugeCapacity(minCapacity);
242 | buf = Arrays.copyOf(buf, newCapacity);
243 | }
244 |
245 | private static int hugeCapacity(int minCapacity) {
246 | if (minCapacity < 0) // overflow
247 | throw new OutOfMemoryError();
248 | return (minCapacity > MAX_ARRAY_SIZE) ?
249 | Integer.MAX_VALUE :
250 | MAX_ARRAY_SIZE;
251 | }
252 | }
253 |
--------------------------------------------------------------------------------