├── data └── files │ ├── users.txt │ ├── test1.txt │ ├── clicks.txt │ ├── create_nested_type.txt │ └── kv3.txt ├── src ├── main │ ├── resources │ │ ├── dashboard │ │ │ ├── README │ │ │ └── dashboard.css │ │ └── tablerdd │ │ │ ├── generator_utils.py │ │ │ ├── SharkContext_sqlRdd_generator.py │ │ │ ├── rddtable_generator.py │ │ │ └── TableRDDGenerated_generator.py │ ├── scala │ │ └── shark │ │ │ ├── api │ │ │ ├── ClassTags.scala │ │ │ ├── DataType.java │ │ │ ├── QueryExecutionException.scala │ │ │ ├── ColumnDesc.scala │ │ │ ├── ResultSet.scala │ │ │ ├── PythonTableRDD.scala │ │ │ ├── TableRDD.scala │ │ │ ├── JavaTableRDD.scala │ │ │ └── RDDTableFunctions.scala │ │ │ ├── server │ │ │ ├── SharkSessionManager.scala │ │ │ ├── SharkOperationManager.scala │ │ │ ├── SharkCLIService.scala │ │ │ ├── SharkExecuteStatementOperation.scala │ │ │ └── SharkSQLOperation.scala │ │ │ ├── tachyon │ │ │ └── TachyonException.scala │ │ │ ├── memstore2 │ │ │ ├── column │ │ │ │ ├── MemoryStoreException.scala │ │ │ │ ├── NullableColumnIterator.scala │ │ │ │ ├── NullableColumnBuilder.scala │ │ │ │ ├── ColumnIterators.scala │ │ │ │ └── ColumnBuilders.scala │ │ │ ├── TablePartitionStats.scala │ │ │ ├── ColumnarStruct.scala │ │ │ ├── LazySimpleSerDeWrapper.scala │ │ │ ├── Table.scala │ │ │ ├── TablePartitionIterator.scala │ │ │ ├── TablePartitionBuilder.scala │ │ │ ├── CacheType.scala │ │ │ ├── TableRecovery.scala │ │ │ ├── SharkTblProperties.scala │ │ │ ├── MemoryTable.scala │ │ │ ├── ColumnarStructObjectInspector.scala │ │ │ └── TablePartition.scala │ │ │ ├── parse │ │ │ ├── QueryContext.scala │ │ │ ├── SharkSemanticAnalyzerFactory.scala │ │ │ ├── QueryBlock.scala │ │ │ ├── SharkExplainSemanticAnalyzer.scala │ │ │ └── SharkLoadSemanticAnalyzer.scala │ │ │ ├── execution │ │ │ ├── LateralViewForwardOperator.scala │ │ │ ├── ForwardOperator.scala │ │ │ ├── MapSplitPruningHelper.scala │ │ │ ├── serialization │ │ │ │ ├── JavaSerializer.scala │ │ │ │ ├── KryoSerializer.scala │ │ │ │ ├── HiveStructSerializer.scala │ │ │ │ ├── HiveConfPersistenceDelegate.scala │ │ │ │ ├── KryoSerializationWrapper.scala │ │ │ │ ├── SerializableWritable.scala │ │ │ │ ├── HiveStructDeserializer.scala │ │ │ │ ├── OperatorSerializationWrapper.scala │ │ │ │ └── XmlSerializer.scala │ │ │ ├── package.scala │ │ │ ├── ReduceSinkTableDesc.scala │ │ │ ├── GroupByOperator.scala │ │ │ ├── LimitOperator.scala │ │ │ ├── ScriptOperatorHelper.scala │ │ │ ├── FilterOperator.scala │ │ │ ├── TerminalOperator.scala │ │ │ ├── SelectOperator.scala │ │ │ ├── UDTFOperator.scala │ │ │ ├── JoinUtil.scala │ │ │ └── SharkExplainTask.scala │ │ │ ├── repl │ │ │ ├── Main.scala │ │ │ └── SharkILoop.scala │ │ │ ├── util │ │ │ └── QueryRewriteUtils.scala │ │ │ ├── SharkServer2.scala │ │ │ ├── LogHelper.scala │ │ │ ├── optimizer │ │ │ ├── SharkMapJoinProcessor.scala │ │ │ └── SharkOptimizer.scala │ │ │ └── KryoRegistrator.scala │ └── java │ │ └── shark │ │ └── tgf │ │ └── Schema.java ├── test │ ├── 0.20S-exclude.txt │ ├── scala │ │ └── shark │ │ │ ├── util │ │ │ └── BloomFilterSuite.scala │ │ │ ├── SortSuite.scala │ │ │ ├── UtilsSuite.scala │ │ │ ├── CliSuite.scala │ │ │ ├── execution │ │ │ └── HiveStructSerializerSuite.scala │ │ │ ├── memstore2 │ │ │ └── column │ │ │ │ ├── NullableColumnIteratorSuite.scala │ │ │ │ └── ColumnTypeSuite.scala │ │ │ └── SharkServerSuite.scala │ ├── README.md │ └── 0.20S-include.txt └── tachyon_enabled │ └── scala │ └── shark │ └── tachyon │ └── TachyonOffHeapTableWriter.scala ├── lib ├── pyrolite.jar └── JavaEWAH-0.4.2.jar ├── README.md ├── conf ├── 
log4j.properties.template └── shark-env.sh.template ├── .gitignore ├── bin ├── shark-shell ├── shark-withinfo ├── shark-withdebug ├── beeline ├── ext │ ├── cli.sh │ ├── sharkserver.sh │ ├── beeline.sh │ └── sharkserver2.sh ├── dev │ ├── release_cleanup.sh │ ├── clear-buffer-cache.py │ ├── build_test.xml │ └── test └── shark ├── project ├── build.properties └── plugins.sbt └── run /data/files/users.txt: -------------------------------------------------------------------------------- 1 | 1 A 2 | 2 B 3 | 3 A 4 | -------------------------------------------------------------------------------- /data/files/test1.txt: -------------------------------------------------------------------------------- 1 | 1 012 2 | 2 345 3 | 3 678 4 | -------------------------------------------------------------------------------- /data/files/clicks.txt: -------------------------------------------------------------------------------- 1 | 1 0 2 | 2 1 3 | 1 1 4 | 2 0 5 | 1 1 6 | 7 | -------------------------------------------------------------------------------- /src/main/resources/dashboard/README: -------------------------------------------------------------------------------- 1 | Place static files here. 2 | -------------------------------------------------------------------------------- /lib/pyrolite.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amplab/shark/HEAD/lib/pyrolite.jar -------------------------------------------------------------------------------- /lib/JavaEWAH-0.4.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amplab/shark/HEAD/lib/JavaEWAH-0.4.2.jar -------------------------------------------------------------------------------- /data/files/create_nested_type.txt: -------------------------------------------------------------------------------- 1 | a0b00b01c001C001c002C002c011\Nc012C012d01d011d012d02d021d022 2 | a1b10c001C001c002C002d01d011d012d02\N 3 | a2c001\Nc002C002c011C011c012C012d01\Nd012d02d021d022 4 | a3\N\N\N 5 | -------------------------------------------------------------------------------- /src/test/0.20S-exclude.txt: -------------------------------------------------------------------------------- 1 | testCliDriver_archive_excludeHadoop20 2 | testCliDriver_auto_join14 3 | testCliDriver_combine2 4 | testCliDriver_ctas 5 | testCliDriver_input12 6 | testCliDriver_input39 7 | testCliDriver_join14 8 | testCliDriver_loadpart_err 9 | testCliDriver_sample_islocalmode_hook -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Shark (Hive on Spark) 2 | 3 | 4 | Development in Shark has been ended and subsumed by [Spark SQL](http://spark.apache.org/sql/). Please see [this blog post](http://databricks.com/blog/2014/07/01/shark-spark-sql-hive-on-spark-and-the-future-of-sql-on-spark.html) for more information. 
5 | -------------------------------------------------------------------------------- /data/files/kv3.txt: -------------------------------------------------------------------------------- 1 | 238val_238 2 |  3 | 311val_311 4 | val_27 5 | val_165 6 | val_409 7 | 255val_255 8 | 278val_278 9 | 98val_98 10 | val_484 11 | val_265 12 | val_193 13 | 401val_401 14 | 150val_150 15 | 273val_273 16 | 224 17 | 369 18 | 66val_66 19 | 128 20 | 213val_213 21 | 146val_146 22 | 406val_406 23 |  24 |  25 |  26 | -------------------------------------------------------------------------------- /conf/log4j.properties.template: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=INFO, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 5 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 6 | 7 | # Ignore messages below warning level from Jetty, because it's a bit verbose 8 | log4j.logger.org.eclipse.jetty=WARN 9 | -------------------------------------------------------------------------------- /src/main/scala/shark/api/ClassTags.scala: -------------------------------------------------------------------------------- 1 | package shark.api 2 | 3 | import scala.reflect.classTag 4 | 5 | object ClassTags { 6 | // List of primitive ClassTags. 7 | val jBoolean = classTag[java.lang.Boolean] 8 | val jByte = classTag[java.lang.Byte] 9 | val jShort = classTag[java.lang.Short] 10 | val jInt = classTag[java.lang.Integer] 11 | val jLong = classTag[java.lang.Long] 12 | val jFloat = classTag[java.lang.Float] 13 | val jDouble = classTag[java.lang.Double] 14 | } 15 | -------------------------------------------------------------------------------- /src/main/scala/shark/api/DataType.java: -------------------------------------------------------------------------------- 1 | package shark.api; 2 | 3 | import java.io.Serializable; 4 | 5 | 6 | public class DataType implements Serializable { 7 | 8 | public final String name; 9 | public final String hiveName; 10 | public final boolean isPrimitive; 11 | 12 | DataType(String name, String hiveName, boolean isPrimitive) { 13 | this.name = name; 14 | this.hiveName = hiveName; 15 | this.isPrimitive = isPrimitive; 16 | } 17 | 18 | @Override 19 | public String toString() { 20 | return name; 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/shark/server/SharkSessionManager.scala: -------------------------------------------------------------------------------- 1 | package shark.server 2 | 3 | import org.apache.hadoop.hive.conf.HiveConf 4 | import org.apache.hive.service.cli.session.SessionManager 5 | import shark.Utils 6 | 7 | class SharkSessionManager extends SessionManager { 8 | override def init(hiveConf : HiveConf) { 9 | this.synchronized { 10 | val sharkOpManager = new SharkOperationManager 11 | Utils.setSuperField("operationManager", sharkOpManager, this) 12 | addService(sharkOpManager) 13 | sharkInit(hiveConf) 14 | } 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/main/resources/tablerdd/generator_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import sys 3 | 4 | # e.g. 
createList(1,3, "T[", "]", ",") gives T[1],T[2],T[3] 5 | def createList(start, stop, prefix, suffix="", sep = ",", newlineAfter = 70, indent = 0): 6 | res = "" 7 | oneLine = res 8 | for y in range(start,stop+1): 9 | res += prefix + str(y) + suffix 10 | oneLine += prefix + str(y) + suffix 11 | if y != stop: 12 | res += sep 13 | oneLine += sep 14 | if len(oneLine) > newlineAfter: 15 | res += "\n" + " "*indent 16 | oneLine = "" 17 | return res 18 | 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | build/ 3 | metastore_db/ 4 | project/boot 5 | lib_managed/ 6 | TempStatsStore 7 | work/ 8 | run-tests-from-scratch-workspace/ 9 | sbt/*.jar 10 | conf/shark-env.sh 11 | 12 | # Compiled Source 13 | *.class 14 | 15 | # Packages 16 | #*.jar 17 | 18 | # Log Files 19 | *.log 20 | 21 | # Eclipse project files 22 | .classpath 23 | .project 24 | .settings 25 | 26 | # emacs backup 27 | *~ 28 | 29 | # tmp files 30 | *.swp 31 | .cache 32 | 33 | # mac os file 34 | *.DS_Store 35 | 36 | # latex files 37 | paper.pdf 38 | paper.blg 39 | paper.bbl 40 | paper.aux 41 | 42 | # IntelliJ IDE files 43 | .idea 44 | *.iml 45 | 46 | # Test Reports 47 | TEST*.xml 48 | test_warehouses 49 | 50 | # Ensime files for emacs 51 | .ensime 52 | .ensime_lucene 53 | /eclipse_bin 54 | /.scala_dependencies 55 | -------------------------------------------------------------------------------- /src/main/resources/tablerdd/SharkContext_sqlRdd_generator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from string import Template 3 | import sys 4 | 5 | from generator_utils import * 6 | 7 | ## This script generates functions sqlRdd for SharkContext.scala 8 | 9 | p = sys.stdout 10 | 11 | # The SharkContext declarations 12 | for x in range(2,23): 13 | sqlRddFun = Template( 14 | """ 15 | def sqlRdd[$list1](cmd: String): 16 | RDD[Tuple$num[$list2]] = { 17 | new TableRDD$num[$list2](sql2rdd(cmd), 18 | Seq($list3)) 19 | } 20 | """).substitute(num = x, 21 | list1 = createList(1, x, "T", ": M", ", ", 80, 4), 22 | list2 = createList(1, x, "T", sep=", ", indent = 4), 23 | list3 = createList(1, x, "m[T", "]", sep=", ", indent = 10)) 24 | p.write(sqlRddFun) 25 | -------------------------------------------------------------------------------- /bin/shark-shell: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Copyright (C) 2012 The Regents of The University California. 4 | # All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | BINDIR="`dirname $0`" 19 | FWDIR="`dirname $BINDIR`" 20 | exec $FWDIR/run shark.repl.Main "$@" 21 | -------------------------------------------------------------------------------- /src/main/scala/shark/tachyon/TachyonException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.tachyon 19 | 20 | class TachyonException(msg: String) extends Exception(msg) 21 | -------------------------------------------------------------------------------- /src/main/scala/shark/api/QueryExecutionException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.api 19 | 20 | 21 | class QueryExecutionException(message: String) extends Exception(message) 22 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | sbt.version=0.13.1 18 | 19 | -------------------------------------------------------------------------------- /src/main/scala/shark/memstore2/column/MemoryStoreException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.memstore2.column 19 | 20 | 21 | class MemoryStoreException(message: String) extends Exception(message) 22 | -------------------------------------------------------------------------------- /src/main/scala/shark/server/SharkOperationManager.scala: -------------------------------------------------------------------------------- 1 | package shark.server 2 | 3 | import java.util.{Map => JMap} 4 | import org.apache.hive.service.cli.operation.{ExecuteStatementOperation, OperationManager} 5 | import org.apache.hive.service.cli.session.HiveSession 6 | 7 | class SharkOperationManager extends OperationManager { 8 | override def newExecuteStatementOperation(parentSession: HiveSession, 9 | statement: String, confOverlay: 10 | JMap[String, String]) 11 | : ExecuteStatementOperation = { 12 | val executeStatementOperation = SharkExecuteStatementOperation 13 | .newExecuteStatementOperation(parentSession, statement, confOverlay) 14 | val castOp = executeStatementOperation.asInstanceOf[ExecuteStatementOperation] 15 | addOperation(castOp) 16 | castOp 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /bin/shark-withinfo: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Copyright (C) 2012 The Regents of The University California. 4 | # All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # This is really just a wrapper around bin/shark to pipe INFO log to console. 19 | # Very handy for debugging. 20 | 21 | BINDIR="`dirname $0`" 22 | exec $BINDIR/shark -hiveconf hive.root.logger=INFO,console "$@" 23 | -------------------------------------------------------------------------------- /bin/shark-withdebug: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Copyright (C) 2012 The Regents of The University California. 4 | # All rights reserved. 
5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # This is really just a wrapper around bin/shark to pipe INFO log to console. 19 | # Very handy for debugging. 20 | 21 | BINDIR="`dirname $0`" 22 | exec $BINDIR/shark -hiveconf hive.root.logger=DEBUG,console "$@" 23 | 24 | -------------------------------------------------------------------------------- /bin/beeline: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | bin=`dirname "$0"` 19 | bin=`cd "$bin"; pwd` 20 | 21 | . "$bin"/shark --service beeline "$@" 22 | -------------------------------------------------------------------------------- /bin/ext/cli.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Copyright (C) 2012 The Regents of The University California. 4 | # All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
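# Service registration: each script in bin/ext appends its THISSERVICE name to SERVICE_LIST and defines a matching <service>() entry function plus a <service>_help() function. bin/shark sources every bin/ext/*.sh script and dispatches to the function selected by --service (defaulting to "cli").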
17 | 18 | THISSERVICE=cli 19 | export SERVICE_LIST="${SERVICE_LIST}${THISSERVICE} " 20 | 21 | cli() { 22 | echo "Starting the Shark Command Line Client" 23 | exec $FWDIR/run shark.SharkCliDriver "$@" 24 | } 25 | 26 | cli_help() { 27 | echo "usage ./shark --service cli" 28 | } 29 | -------------------------------------------------------------------------------- /src/test/scala/shark/util/BloomFilterSuite.scala: -------------------------------------------------------------------------------- 1 | package shark.util 2 | 3 | import org.scalatest.FunSuite 4 | 5 | class BloomFilterSuite extends FunSuite{ 6 | 7 | test("Integer") { 8 | val bf = new BloomFilter(0.03, 1000000) 9 | Range(0, 1000000).foreach { 10 | i => bf.add(i) 11 | } 12 | assert(bf.contains(333)) 13 | assert(bf.contains(678)) 14 | assert(!bf.contains(1200000)) 15 | } 16 | 17 | test("Integer FP") { 18 | val bf = new BloomFilter(0.03,1000) 19 | Range(0,700).foreach { 20 | i => bf.add(i) 21 | } 22 | assert(bf.contains(333)) 23 | assert(bf.contains(678)) 24 | //is the fraction of false positives in line with what we expect ? 25 | val e = Range(0, 100).map { 26 | i => bf.contains(i*10) 27 | } 28 | val s = e.groupBy(x => x).map(x => (x._1, x._2.size)) 29 | val t = s(true) 30 | val f = s(false) 31 | assert(f > 25 && f < 35) 32 | assert(t < 75 && t > 65) 33 | // expect false positive to be < 3 % and no false negatives 34 | } 35 | } -------------------------------------------------------------------------------- /bin/ext/sharkserver.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Copyright (C) 2012 The Regents of The University California. 4 | # All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | THISSERVICE=sharkserver 19 | export SERVICE_LIST="${SERVICE_LIST}${THISSERVICE} " 20 | 21 | sharkserver() { 22 | echo "Starting the Shark Server" 23 | exec $FWDIR/run shark.SharkServer "$@" 24 | } 25 | 26 | sharkserver_help() { 27 | echo "usage SHARK_PORT=xxxx ./shark --service sharkserver" 28 | echo "SHARK_PORT : Specify the server port" 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/shark/parse/QueryContext.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.parse 19 | 20 | import org.apache.hadoop.conf.Configuration 21 | import org.apache.hadoop.hive.ql.Context 22 | 23 | /** 24 | * Shark's query context. Adds Shark-specific information to Hive's Context. 25 | */ 26 | class QueryContext(conf: Configuration, val useTableRddSink: Boolean) extends Context(conf) 27 | -------------------------------------------------------------------------------- /src/test/README.md: -------------------------------------------------------------------------------- 1 | ### Hive Compatibility Test Warnings 2 | 3 | #### Test results that rely on tables with `timestamp` fields may differ across JVM versions. 4 | For example, these tests: 5 | * udf5 6 | * timestamp_1, timestamp_2, timestamp_udf 7 | 8 | Pass when running with this JVM: 9 | (Mac 10.9, AMPLab Jenkins) 10 | java version "1.7.0_25" 11 | Java(TM) SE Runtime Environment (build 1.7.0_25-b15) 12 | Java HotSpot(TM) 64-Bit Server VM (build 23.25-b01, mixed mode) 13 | 14 | But fail on EC2 when run with this JVM: 15 | (EC2 c2.2xlarge) 16 | java version "1.7.0_45" 17 | OpenJDK Runtime Environment (amzn-2.4.3.2.32.amzn1-x86_64 u45-b15) 18 | OpenJDK 64-Bit Server VM (build 24.45-b08, mixed mode) 19 | 20 | 21 | A few more tests from test_pass.txt that fall into this category: 22 | TestCliDriver_input_part8 23 | TestSharkCliDriver: testCliDriver_timestamp_1 24 | TestSharkCliDriver: testCliDriver_timestamp_2 25 | TestSharkCliDriver: testCliDriver_timestamp_3 26 | TestSharkCliDriver: testCliDriver_timestamp_udf 27 | TestSharkCliDriver: testCliDriver_udf_to_unix_timestamp 28 | TestSharkCliDriver: testCliDriver_udf5 29 | -------------------------------------------------------------------------------- /src/main/scala/shark/execution/LateralViewForwardOperator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.execution 19 | 20 | import org.apache.hadoop.hive.ql.plan.LateralViewForwardDesc 21 | 22 | import org.apache.spark.rdd.RDD 23 | 24 | 25 | class LateralViewForwardOperator extends UnaryOperator[LateralViewForwardDesc] { 26 | 27 | override def execute(): RDD[_] = executeParents().head._2 28 | 29 | override def processPartition(split: Int, iter: Iterator[_]) = iter 30 | 31 | } 32 | 33 | -------------------------------------------------------------------------------- /bin/ext/beeline.sh: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # Need arguments [host [port [db]]] 17 | THISSERVICE=beeline 18 | export SERVICE_LIST="${SERVICE_LIST}${THISSERVICE} " 19 | 20 | beeline () { 21 | CLASS=org.apache.hive.beeline.BeeLine; 22 | exec $FWDIR/run $CLASS "$@" 23 | } 24 | 25 | beeline_help () { 26 | CLASS=org.apache.hive.beeline.BeeLine; 27 | exec $FWDIR/run "--help" 28 | } 29 | 30 | -------------------------------------------------------------------------------- /src/main/scala/shark/execution/ForwardOperator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.execution 19 | 20 | import org.apache.spark.rdd.RDD 21 | import org.apache.hadoop.hive.ql.plan.ForwardDesc 22 | 23 | 24 | class ForwardOperator extends UnaryOperator[ForwardDesc] { 25 | 26 | override def execute(): RDD[_] = executeParents().head._2 27 | 28 | override def processPartition(split: Int, iter: Iterator[_]) = 29 | throw new UnsupportedOperationException("ForwardOperator.processPartition()") 30 | 31 | } 32 | -------------------------------------------------------------------------------- /src/main/java/shark/tgf/Schema.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2013 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package shark.tgf; 19 | 20 | import java.lang.annotation.Retention; 21 | import java.lang.annotation.RetentionPolicy; 22 | import java.lang.annotation.ElementType; 23 | import java.lang.annotation.Target; 24 | 25 | 26 | /** 27 | * Schema annotation for TGFs, example syntax: @Schema(spec = "name string, age int") 28 | */ 29 | @Retention(RetentionPolicy.RUNTIME) 30 | @Target(ElementType.METHOD) 31 | public @interface Schema { 32 | String spec(); 33 | } 34 | -------------------------------------------------------------------------------- /src/main/resources/dashboard/dashboard.css: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | body { 19 | background-color : #ffffff; 20 | font-family : sans-serif; 21 | } 22 | 23 | th { 24 | padding-bottom : 10px; 25 | padding-top : 10px; 26 | padding-left : 10px; 27 | padding-right : 10px; 28 | } 29 | 30 | td.node { 31 | padding-bottom : 8px; 32 | padding-top : 8px; 33 | padding-left : 8px; 34 | padding-right : 8px; 35 | } 36 | 37 | table.percent_bar { 38 | width: 200px; 39 | height: 15px; 40 | } 41 | 42 | td.percent_used { 43 | background: #AAAAFF; 44 | } 45 | 46 | -------------------------------------------------------------------------------- /bin/ext/sharkserver2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Copyright (C) 2012 The Regents of The University California. 4 | # All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | THISSERVICE=sharkserver2 19 | export SERVICE_LIST="${SERVICE_LIST}${THISSERVICE} " 20 | 21 | # Use Java to launch Shark otherwise the unit tests cannot properly kill 22 | # the server process. 
23 | export SHARK_LAUNCH_WITH_JAVA=1 24 | 25 | sharkserver2() { 26 | echo "Starting the Shark Server" 27 | exec $FWDIR/run shark.SharkServer2 "$@" 28 | } 29 | 30 | sharkserver2_help() { 31 | echo "usage HIVE_SERVER2_THRIFT_PORT=xxxx ./shark --service sharkserver2" 32 | echo "HIVE_SERVER2_THRIFT_PORT : Specify the server port" 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/shark/execution/MapSplitPruningHelper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.hadoop.hive.serde2.objectinspector 19 | 20 | import org.apache.hadoop.hive.serde2.objectinspector.UnionStructObjectInspector.MyField 21 | 22 | 23 | object MapSplitPruningHelper { 24 | 25 | /** 26 | * Extract the UnionStructObjectInspector.MyField's `structField` reference, which is 27 | * package-private. 28 | */ 29 | def getStructFieldFromUnionOIField(unionOIMyField: MyField): StructField = { 30 | unionOIMyField.structField 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/shark/memstore2/TablePartitionStats.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.memstore2 19 | 20 | import shark.memstore2.column.ColumnStats 21 | 22 | 23 | /** 24 | * Stores column statistics for a table partition. 25 | */ 26 | class TablePartitionStats(val stats: Array[ColumnStats[_]], val numRows: Long) 27 | extends Serializable { 28 | 29 | override def toString = 30 | numRows + " rows\n" + 31 | stats.zipWithIndex.map { case (column, index) => 32 | " column " + index + " " + 33 | { if (column != null) column.toString else "no column statistics" } 34 | }.mkString("\n") 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/shark/execution/serialization/JavaSerializer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 
4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.execution.serialization 19 | 20 | import java.nio.ByteBuffer 21 | 22 | import org.apache.spark.SparkEnv 23 | import org.apache.spark.serializer.{JavaSerializer => SparkJavaSerializer} 24 | 25 | 26 | object JavaSerializer { 27 | @transient val ser = new SparkJavaSerializer(SparkEnv.get.conf) 28 | 29 | def serialize[T](o: T): Array[Byte] = { 30 | ser.newInstance().serialize(o).array() 31 | } 32 | 33 | def deserialize[T](bytes: Array[Byte]): T = { 34 | ser.newInstance().deserialize[T](ByteBuffer.wrap(bytes)) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /bin/dev/release_cleanup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Copyright (C) 2012 The Regents of The University California. 4 | # All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | DEVDIR="`dirname $0`" 19 | BINDIR="`dirname $DEVDIR`" 20 | FWDIR="`dirname $BINDIR`" 21 | 22 | rm -rf $FWDIR/run-tests-from-scratch-workspace 23 | rm -rf $FWDIR/test_warehouses 24 | 25 | rm -rf $FWDIR/conf/shark-env.sh 26 | 27 | rm -rf $FWDIR/metastore_db 28 | rm -rf $FWDIR/derby.log 29 | 30 | rm -rf $FWDIR/project/target $FWDIR/project/project/target 31 | 32 | rm -rf $FWDIR/target/resolution-cache 33 | rm -rf $FWDIR/target/streams 34 | rm -rf $FWDIR/target/scala-*/cache 35 | rm -rf $FWDIR/target/scala-*/classes 36 | rm -rf $FWDIR/target/scala-*/test-classes 37 | 38 | find $FWDIR -name ".DS_Store" -exec rm {} \; 39 | find $FWDIR -name ".history" -exec rm {} \; 40 | 41 | -------------------------------------------------------------------------------- /src/main/scala/shark/execution/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark 19 | 20 | import scala.language.implicitConversions 21 | 22 | import org.apache.hadoop.hive.ql.plan.OperatorDesc 23 | 24 | import shark.execution.serialization.KryoSerializationWrapper 25 | import shark.execution.serialization.OperatorSerializationWrapper 26 | 27 | package object execution { 28 | 29 | type HiveDesc = OperatorDesc // XXXDesc in Hive is the subclass of Serializable 30 | 31 | implicit def opSerWrapper2op[T <: Operator[_ <: HiveDesc]]( 32 | wrapper: OperatorSerializationWrapper[T]): T = wrapper.value 33 | 34 | implicit def kryoWrapper2object[T](wrapper: KryoSerializationWrapper[T]): T = wrapper.value 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/shark/execution/ReduceSinkTableDesc.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.execution 19 | 20 | import org.apache.hadoop.hive.ql.plan.TableDesc 21 | import shark.LogHelper 22 | 23 | 24 | trait ReduceSinkTableDesc extends LogHelper { 25 | self: Operator[_ <: HiveDesc] => 26 | 27 | // Seq(tag, (Key TableDesc, Value TableDesc)) 28 | def keyValueDescs(): Seq[(Int, (TableDesc, TableDesc))] = { 29 | // get the parent ReduceSinkOperator and sort it by tag 30 | val reduceSinkOps = 31 | for (op <- self.parentOperators.toSeq if op.isInstanceOf[ReduceSinkOperator]) 32 | yield op.asInstanceOf[ReduceSinkOperator] 33 | 34 | reduceSinkOps.map(f => (f.getTag, f.getKeyValueTableDescs)) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/shark/memstore2/ColumnarStruct.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.memstore2 19 | 20 | import java.util.{List => JList, ArrayList => JArrayList} 21 | 22 | import shark.memstore2.column.ColumnIterator 23 | 24 | 25 | /** 26 | * A struct returned by the TablePartitionIterator. 
It contains references to the same set of 27 | * ColumnIterators and use those to return individual fields back to the object inspectors. 28 | */ 29 | class ColumnarStruct(columnIterators: Array[ColumnIterator]) { 30 | 31 | def getField(columnId: Int): Object = columnIterators(columnId).current 32 | 33 | def getFieldsAsList(): JList[Object] = { 34 | val list = new JArrayList[Object](columnIterators.length) 35 | var i = 0 36 | while (i < columnIterators.length) { 37 | list.add(columnIterators(i).current) 38 | i += 1 39 | } 40 | list 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2012 The Regents of The University California. 2 | // All rights reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | scalaVersion := "2.10.3" 16 | 17 | resolvers += Resolver.url( 18 | "sbt-plugin-releases", 19 | new URL("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases/"))(Resolver.ivyStylePatterns) 20 | 21 | resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/" 22 | 23 | addSbtPlugin("org.ensime" % "ensime-sbt-cmd" % "0.1.2") 24 | 25 | addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.4.0") 26 | 27 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.2.0") 28 | 29 | addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.5.1") 30 | 31 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.7.4") 32 | 33 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.10.2") 34 | 35 | addSbtPlugin("com.typesafe.sbt" % "sbt-pgp" % "0.8.3") 36 | 37 | addSbtPlugin("com.alpinenow" % "junit_xml_listener" % "0.5.0") 38 | 39 | -------------------------------------------------------------------------------- /src/main/scala/shark/execution/GroupByOperator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package shark.execution 19 | 20 | import org.apache.hadoop.hive.ql.exec.{GroupByOperator => HiveGroupByOperator} 21 | import org.apache.hadoop.hive.ql.exec.{ReduceSinkOperator => HiveReduceSinkOperator} 22 | 23 | 24 | /** 25 | * Unlike Hive, group by in Shark is split into two different operators: 26 | * GroupByPostShuffleOperator and GroupByPreShuffleOperator. The pre-shuffle one 27 | * serves as a combiner on each map partition. 28 | * 29 | * These two classes are defined in the org.apache.hadoop.hive.ql.exec package 30 | * (scala files) to get around the problem that some Hive classes are only 31 | * visible within that package. 32 | */ 33 | object GroupByOperator { 34 | 35 | def isPostShuffle(op: HiveGroupByOperator): Boolean = { 36 | op.getParentOperators().get(0).isInstanceOf[HiveReduceSinkOperator] 37 | } 38 | 39 | } 40 | 41 | -------------------------------------------------------------------------------- /bin/dev/clear-buffer-cache.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Copyright (C) 2012 The Regents of The University California. 4 | # All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # Clear OS buffer cache for mesos clusters on EC2. 19 | 20 | import os 21 | import thread 22 | import time 23 | 24 | machinesFile = "/root/spark-ec2/slaves" 25 | machs = open(machinesFile).readlines() 26 | machs = map(lambda s: s.strip(),machs) 27 | machCount = len(machs) 28 | machID = 0 29 | cmd = "sync; echo 3 > /proc/sys/vm/drop_caches" 30 | done = {} 31 | 32 | def dropCachesThread( mach, myID, *args ): 33 | print "SSH to machine %i" % (myID) 34 | os.system("ssh %s '%s'" % (mach, cmd)) 35 | done[mach] = "done" 36 | 37 | for mach in ( machs ): 38 | thread.start_new_thread(dropCachesThread, (mach, machID)) 39 | machID = machID + 1 40 | time.sleep(0.2) 41 | 42 | while (len(done.keys()) < machCount): 43 | print "waiting for %d tasks to finish..." % (machCount - len(done.keys())) 44 | time.sleep(1) 45 | 46 | print "Done with %i threads" % (len(done.keys())) 47 | 48 | -------------------------------------------------------------------------------- /src/main/scala/shark/repl/Main.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.repl 19 | 20 | import org.apache.hadoop.hive.common.LogUtils 21 | import org.apache.hadoop.hive.common.LogUtils.LogInitializationException 22 | 23 | 24 | /** 25 | * Shark's REPL entry point. 26 | */ 27 | object Main { 28 | 29 | try { 30 | LogUtils.initHiveLog4j() 31 | } catch { 32 | case e: LogInitializationException => // Ignore the error. 33 | } 34 | 35 | private var _interp: SharkILoop = null 36 | 37 | def interp = _interp 38 | 39 | private def interp_=(i: SharkILoop) { _interp = i } 40 | 41 | def main(args: Array[String]) { 42 | 43 | _interp = new SharkILoop 44 | 45 | // We need to set spark.repl.InterpAccessor.interp since it is used 46 | // everywhere in spark.repl code. 47 | org.apache.spark.repl.Main.interp = _interp 48 | 49 | // Start an infinite loop ... 50 | _interp.process(args) 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/shark/server/SharkCLIService.scala: -------------------------------------------------------------------------------- 1 | package shark.server 2 | 3 | import org.apache.hive.service.cli.CLIService 4 | import org.apache.hadoop.hive.conf.HiveConf 5 | import org.apache.hadoop.hive.shims.ShimLoader 6 | import org.apache.hive.service.auth.HiveAuthFactory 7 | import java.io.IOException 8 | import org.apache.hive.service.ServiceException 9 | import javax.security.auth.login.LoginException 10 | import org.apache.spark.SparkEnv 11 | import shark.{SharkServer, Utils} 12 | 13 | class SharkCLIService extends CLIService { 14 | override def init(hiveConf: HiveConf) { 15 | this.synchronized { 16 | Utils.setSuperField("hiveConf", hiveConf, this) 17 | val sharkSM = new SharkSessionManager 18 | Utils.setSuperField("sessionManager", sharkSM, this) 19 | addService(sharkSM) 20 | try { 21 | HiveAuthFactory.loginFromKeytab(hiveConf) 22 | val serverUserName = ShimLoader.getHadoopShims 23 | .getShortUserName(ShimLoader.getHadoopShims.getUGIForConf(hiveConf)) 24 | Utils.setSuperField("serverUserName", serverUserName, this) 25 | } catch { 26 | case e: IOException => { 27 | throw new ServiceException("Unable to login to kerberos with given principal/keytab", e) 28 | } 29 | case e: LoginException => { 30 | throw new ServiceException("Unable to login to kerberos with given principal/keytab", e) 31 | } 32 | } 33 | // Make sure the ThreadLocal SparkEnv reference is the same for all threads. 34 | SparkEnv.set(SharkServer.sparkEnv) 35 | sharkInit(hiveConf) 36 | } 37 | } 38 | } 39 | 40 | 41 | -------------------------------------------------------------------------------- /bin/shark: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Copyright (C) 2012 The Regents of The University California. 4 | # All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
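# Dispatch logic: parse --service <name> and --help, source every script in bin/ext to build SERVICE_LIST, then run the matching <service>() function (or <service>_help() when --help is given) with the remaining arguments. When no --service is given, the "cli" service is used; an unknown service prints the available services and exits with status 7.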
17 | 18 | bin="`dirname $0`" 19 | bin=`cd "$bin"; pwd` 20 | export FWDIR="`dirname $bin`" 21 | 22 | SERVICE="" 23 | HELP="" 24 | while [ $# -gt 0 ];do 25 | case "$1" in 26 | --service) 27 | shift 28 | SERVICE=$1 29 | shift 30 | ;; 31 | --help) 32 | HELP=_help 33 | shift 34 | ;; 35 | *) 36 | break 37 | ;; 38 | esac 39 | done 40 | 41 | if [ "$SERVICE" = "" ] ; then 42 | if [ "$HELP" = "_help" ] ; then 43 | SERVICE="help" 44 | else 45 | SERVICE="cli" 46 | fi 47 | fi 48 | SERVICE_LIST="" 49 | 50 | for i in "$bin"/ext/*.sh ; do 51 | . $i 52 | done 53 | 54 | TORUN="" 55 | for j in $SERVICE_LIST ; do 56 | if [ "$j" = "$SERVICE" ] ; then 57 | TORUN=${j}$HELP 58 | fi 59 | done 60 | echo "$@" 61 | if [ "$TORUN" = "" ] ; then 62 | echo "Service $SERVICE not found" 63 | echo "Available Services: $SERVICE_LIST" 64 | exit 7 65 | else 66 | $TORUN "$@" 67 | fi 68 | 69 | -------------------------------------------------------------------------------- /src/main/scala/shark/execution/LimitOperator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.execution 19 | 20 | import org.apache.hadoop.hive.ql.plan.LimitDesc 21 | 22 | import org.apache.spark.rdd.{EmptyRDD, RDD} 23 | 24 | import shark.SharkEnv 25 | 26 | 27 | class LimitOperator extends UnaryOperator[LimitDesc] { 28 | 29 | // Only works on the master program. 30 | def limit = desc.getLimit() 31 | 32 | override def execute(): RDD[_] = { 33 | 34 | val limitNum = desc.getLimit() 35 | 36 | if (limitNum > 0) { 37 | // Take limit on each partition. 38 | val inputRdd = executeParents().head._2 39 | inputRdd.mapPartitions({ iter => iter.take(limitNum) }, preservesPartitioning = true) 40 | } else { 41 | new EmptyRDD(SharkEnv.sc) 42 | } 43 | } 44 | 45 | override def processPartition(split: Int, iter: Iterator[_]) = { 46 | throw new UnsupportedOperationException("LimitOperator.processPartition()") 47 | } 48 | } 49 | 50 | -------------------------------------------------------------------------------- /src/main/scala/shark/execution/serialization/KryoSerializer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.execution.serialization 19 | 20 | import java.nio.ByteBuffer 21 | 22 | import org.apache.spark.{SparkConf, SparkEnv} 23 | import org.apache.spark.serializer.{KryoSerializer => SparkKryoSerializer} 24 | 25 | import shark.SharkContext 26 | 27 | /** 28 | * Java object serialization using Kryo. This is much more efficient, but Kryo 29 | * sometimes is buggy to use. We use this mainly to serialize the object 30 | * inspectors. 31 | */ 32 | object KryoSerializer { 33 | 34 | @transient lazy val ser: SparkKryoSerializer = { 35 | val sparkConf = Option(SparkEnv.get).map(_.conf).getOrElse(new SparkConf()) 36 | new SparkKryoSerializer(sparkConf) 37 | } 38 | 39 | def serialize[T](o: T): Array[Byte] = { 40 | ser.newInstance().serialize(o).array() 41 | } 42 | 43 | def deserialize[T](bytes: Array[Byte]): T = { 44 | ser.newInstance().deserialize[T](ByteBuffer.wrap(bytes)) 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/shark/execution/ScriptOperatorHelper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.hadoop.hive.ql.exec 19 | // Put this file in Hive's exec package to access package level visible fields and methods. 20 | 21 | import java.util.{Map => JMap} 22 | 23 | import org.apache.hadoop.conf.Configuration 24 | 25 | 26 | /** 27 | * A helper class that gets us PathFinder and alias in ScriptOperator. 28 | * This is needed since PathFinder inner class is not declared as 29 | * static/public. 30 | */ 31 | class ScriptOperatorHelper(val op: ScriptOperator) extends ScriptOperator { 32 | 33 | def newPathFinderInstance(envpath: String): op.PathFinder = { 34 | new op.PathFinder(envpath) 35 | } 36 | 37 | def getAlias: String = op.alias 38 | 39 | override def addJobConfToEnvironment(conf: Configuration, env: JMap[String, String]) { 40 | op.addJobConfToEnvironment(conf, env) 41 | } 42 | 43 | override def safeEnvVarName(variable: String): String = { 44 | op.safeEnvVarName(variable) 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/shark/api/ColumnDesc.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.api 19 | 20 | import java.util.{List => JList} 21 | 22 | import scala.collection.JavaConversions._ 23 | 24 | import org.apache.hadoop.hive.metastore.api.FieldSchema 25 | import org.apache.hadoop.hive.metastore.api.Schema 26 | 27 | 28 | class ColumnDesc(val name: String, val dataType: DataType) extends Serializable { 29 | 30 | private[shark] def this(hiveSchema: FieldSchema) { 31 | this(hiveSchema.getName, DataTypes.fromHiveType(hiveSchema.getType)) 32 | } 33 | 34 | override def toString = "ColumnDesc(name: %s, type: %s)".format(name, dataType.name) 35 | } 36 | 37 | 38 | object ColumnDesc { 39 | 40 | def createSchema(fieldSchemas: JList[FieldSchema]): Array[ColumnDesc] = { 41 | if (fieldSchemas == null) Array.empty else fieldSchemas.map(new ColumnDesc(_)).toArray 42 | } 43 | 44 | def createSchema(schema: Schema): Array[ColumnDesc] = { 45 | if (schema == null) Array.empty else createSchema(schema.getFieldSchemas) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/test/0.20S-include.txt: -------------------------------------------------------------------------------- 1 | testCliDriver_archive 2 | testCliDriver_archive_corrupt 3 | testCliDriver_infer_bucket_sort_list_bucket 4 | testCliDriver_list_bucket_dml_1 5 | testCliDriver_list_bucket_dml_11 6 | testCliDriver_list_bucket_dml_12 7 | testCliDriver_list_bucket_dml_13 8 | testCliDriver_list_bucket_dml_2 9 | testCliDriver_list_bucket_dml_3 10 | testCliDriver_list_bucket_dml_4 11 | testCliDriver_list_bucket_dml_5 12 | testCliDriver_list_bucket_dml_6 13 | testCliDriver_list_bucket_dml_7 14 | testCliDriver_list_bucket_dml_8 15 | testCliDriver_list_bucket_dml_9 16 | testCliDriver_list_bucket_query_multiskew_1 17 | testCliDriver_list_bucket_query_multiskew_2 18 | testCliDriver_list_bucket_query_multiskew_3 19 | testCliDriver_list_bucket_query_oneskew_1 20 | testCliDriver_list_bucket_query_oneskew_2 21 | testCliDriver_list_bucket_query_oneskew_3 22 | testCliDriver_recursive_dir 23 | testCliDriver_skewjoin_union_remove_1 24 | testCliDriver_skewjoin_union_remove_2 25 | testCliDriver_split_sample 26 | testCliDriver_union_remove_1 27 | testCliDriver_union_remove_10 28 | testCliDriver_union_remove_11 29 | testCliDriver_union_remove_12 30 | testCliDriver_union_remove_13 31 | testCliDriver_union_remove_14 32 | testCliDriver_union_remove_15 33 | testCliDriver_union_remove_16 34 | testCliDriver_union_remove_17 35 | testCliDriver_union_remove_18 36 | testCliDriver_union_remove_19 37 | testCliDriver_union_remove_2 38 | testCliDriver_union_remove_20 39 | testCliDriver_union_remove_21 40 | testCliDriver_union_remove_22 41 | testCliDriver_union_remove_23 42 | testCliDriver_union_remove_24 43 | testCliDriver_union_remove_3 44 | testCliDriver_union_remove_4 45 | testCliDriver_union_remove_5 46 | testCliDriver_union_remove_7 47 | testCliDriver_union_remove_8 48 | testCliDriver_union_remove_9 -------------------------------------------------------------------------------- /src/main/scala/shark/api/ResultSet.scala: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.api 19 | 20 | import java.util.{Arrays, Collections, List => JList} 21 | 22 | 23 | class ResultSet private[shark](_schema: Array[ColumnDesc], _results: Array[Array[Object]]) { 24 | 25 | /** 26 | * The schema for the query results, for use in Scala. 27 | */ 28 | def schema: Seq[ColumnDesc] = _schema.toSeq 29 | 30 | /** 31 | * Query results, for use in Scala. 32 | */ 33 | def results: Seq[Array[Object]] = _results.toSeq 34 | 35 | /** 36 | * Get the schema for the query results as an immutable list, for use in Java. 37 | */ 38 | def getSchema: JList[ColumnDesc] = Collections.unmodifiableList(Arrays.asList(_schema : _*)) 39 | 40 | /** 41 | * Get the query results as an immutable list, for use in Java. 42 | */ 43 | def getResults: JList[Array[Object]] = Collections.unmodifiableList(Arrays.asList(_results : _*)) 44 | 45 | override def toString: String = { 46 | "ResultSet(" + _schema.map(c => c.name + " " + c.dataType).mkString("\t") + ")\n" + 47 | _results.map(row => row.mkString("\t")).mkString("\n") 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/shark/parse/SharkSemanticAnalyzerFactory.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.parse 19 | 20 | import org.apache.hadoop.hive.conf.HiveConf 21 | import org.apache.hadoop.hive.ql.parse.{ASTNode, BaseSemanticAnalyzer, DDLSemanticAnalyzer, 22 | ExplainSemanticAnalyzer, LoadSemanticAnalyzer, SemanticAnalyzerFactory, SemanticAnalyzer} 23 | 24 | import shark.SharkConfVars 25 | 26 | 27 | object SharkSemanticAnalyzerFactory { 28 | 29 | /** 30 | * Return a semantic analyzer for the given ASTNode. 
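   * Hive's SemanticAnalyzerFactory makes the initial choice; this factory then substitutes
   * the Shark-specific analyzer where one exists: SharkSemanticAnalyzer for ordinary queries
   * and CTAS, SharkExplainSemanticAnalyzer when the Shark explain mode is set to "shark",
   * and the Shark DDL and LOAD analyzers. Any other analyzer is returned unchanged.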
31 | */ 32 | def get(conf: HiveConf, tree:ASTNode): BaseSemanticAnalyzer = { 33 | val explainMode = SharkConfVars.getVar(conf, SharkConfVars.EXPLAIN_MODE) == "shark" 34 | 35 | SemanticAnalyzerFactory.get(conf, tree) match { 36 | case _: SemanticAnalyzer => 37 | new SharkSemanticAnalyzer(conf) 38 | case _: ExplainSemanticAnalyzer if explainMode => 39 | new SharkExplainSemanticAnalyzer(conf) 40 | case _: DDLSemanticAnalyzer => 41 | new SharkDDLSemanticAnalyzer(conf) 42 | case _: LoadSemanticAnalyzer => 43 | new SharkLoadSemanticAnalyzer(conf) 44 | case sem: BaseSemanticAnalyzer => 45 | sem 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/shark/server/SharkExecuteStatementOperation.scala: -------------------------------------------------------------------------------- 1 | package shark.server 2 | 3 | import java.lang.reflect.Constructor 4 | import java.util.{Map => JMap} 5 | import org.apache.hive.service.cli.session.HiveSession 6 | 7 | object SharkExecuteStatementOperation { 8 | def newExecuteStatementOperation(parentSession: HiveSession, 9 | statement: String, 10 | confOverlay: JMap[String, String]) 11 | : Any = { 12 | val tokens = statement.trim().split("\\s+") 13 | val command = tokens{0}.toLowerCase 14 | command match { 15 | case "set" => { 16 | val ctor = accessCtor("org.apache.hive.service.cli.operation.SetOperation") 17 | ctor.newInstance(parentSession, statement, confOverlay) 18 | } 19 | case "dfs" => { 20 | val ctor = accessCtor("org.apache.hive.service.cli.operation.DfsOperation") 21 | ctor.newInstance(parentSession, statement, confOverlay) 22 | } 23 | case "add" => { 24 | val ctor = accessCtor("org.apache.hive.service.cli.operation.AddResourceOperation") 25 | ctor.newInstance(parentSession, statement, confOverlay) 26 | } 27 | case "delete" => { 28 | val ctor = accessCtor("org.apache.hive.service.cli.operation.DeleteResourceOperation") 29 | ctor.newInstance(parentSession, statement, confOverlay) 30 | } 31 | case _ => { 32 | new SharkSQLOperation(parentSession, statement, confOverlay) 33 | } 34 | } 35 | } 36 | 37 | def accessCtor(className : String) : Constructor[_] = { 38 | val setClass = Class.forName(className) 39 | val setConst = 40 | setClass.getDeclaredConstructor( 41 | classOf[HiveSession], 42 | classOf[String], 43 | classOf[JMap[String, String]]) 44 | setConst.setAccessible(true) 45 | setConst 46 | } 47 | } -------------------------------------------------------------------------------- /src/main/scala/shark/memstore2/LazySimpleSerDeWrapper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package shark.memstore2 19 | 20 | import java.util.{List => JList, Properties} 21 | 22 | import org.apache.hadoop.conf.Configuration 23 | import org.apache.hadoop.hive.serde2.{SerDe, SerDeStats} 24 | import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe 25 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector 26 | import org.apache.hadoop.io.Writable 27 | 28 | 29 | class LazySimpleSerDeWrapper extends SerDe { 30 | 31 | val _lazySimpleSerDe = new LazySimpleSerDe() 32 | 33 | override def initialize(conf: Configuration, tbl: Properties) { 34 | _lazySimpleSerDe.initialize(conf, tbl) 35 | } 36 | 37 | override def deserialize(blob: Writable): Object = _lazySimpleSerDe.deserialize(blob) 38 | 39 | override def getSerDeStats(): SerDeStats = _lazySimpleSerDe.getSerDeStats() 40 | 41 | override def getObjectInspector: ObjectInspector = _lazySimpleSerDe.getObjectInspector 42 | 43 | override def getSerializedClass: Class[_ <: Writable] = _lazySimpleSerDe.getSerializedClass 44 | 45 | override def serialize(obj: Object, objInspector: ObjectInspector): Writable = { 46 | _lazySimpleSerDe.serialize(obj, objInspector) 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /src/test/scala/shark/SortSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark 19 | 20 | import org.apache.hadoop.io.BytesWritable 21 | 22 | import org.scalatest.FunSuite 23 | 24 | import org.apache.spark.SparkContext 25 | import org.apache.spark.rdd.RDD 26 | 27 | import shark.execution.{ReduceKey, ReduceKeyMapSide, ReduceKeyReduceSide, RDDUtils} 28 | 29 | 30 | class SortSuite extends FunSuite { 31 | 32 | TestUtils.init() 33 | 34 | var sc: SparkContext = SharkRunner.init() 35 | 36 | test("order by limit") { 37 | val data = Array((4, 14), (1, 11), (7, 17), (0, 10)) 38 | val expected = data.sortWith(_._1 < _._1).toSeq 39 | val rdd: RDD[(ReduceKey, BytesWritable)] = sc.parallelize(data, 50).map { x => 40 | (new ReduceKeyMapSide(new BytesWritable(Array[Byte](x._1.toByte))), 41 | new BytesWritable(Array[Byte](x._2.toByte))) 42 | } 43 | for (k <- 0 to 5) { 44 | val sortedRdd = RDDUtils.topK(rdd, k).asInstanceOf[RDD[(ReduceKeyReduceSide, Array[Byte])]] 45 | val output = sortedRdd.map { case(k, v) => 46 | (k.byteArray(0).toInt, v(0).toInt) 47 | }.collect().toSeq 48 | assert(output.size === math.min(k, 4)) 49 | assert(output === expected.take(math.min(k, 4))) 50 | } 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /src/main/scala/shark/parse/QueryBlock.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 
4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.parse 19 | 20 | import org.apache.hadoop.hive.ql.parse.{QB => HiveQueryBlock} 21 | import org.apache.hadoop.hive.ql.plan.CreateTableDesc 22 | import org.apache.hadoop.hive.ql.plan.TableDesc 23 | 24 | import shark.memstore2.CacheType 25 | import shark.memstore2.CacheType._ 26 | 27 | 28 | /** 29 | * A container for flags and table metadata. Used in SharkSemanticAnalyzer while parsing 30 | * and analyzing ASTs (e.g. in SharkSemanticAnalyzer#analyzeCreateTable()). 31 | */ 32 | class QueryBlock(outerID: String, alias: String, isSubQuery: Boolean) 33 | extends HiveQueryBlock(outerID, alias, isSubQuery) { 34 | 35 | // The CacheType for the table that will be created from CREATE TABLE/CTAS, or updated for an 36 | // INSERT. 37 | var cacheMode = CacheType.NONE 38 | 39 | // Descriptor for the table being updated by an INSERT. 40 | var targetTableDesc: TableDesc = _ 41 | 42 | // Hive's QB uses `tableDesc` to refer to the CreateTableDesc. A direct `createTableDesc` 43 | // makes it easier to differentiate from `_targetTableDesc`. 44 | def createTableDesc: CreateTableDesc = super.getTableDesc 45 | 46 | def createTableDesc_= (desc: CreateTableDesc) = super.setTableDesc(desc) 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/shark/util/QueryRewriteUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package shark.util 19 | 20 | import org.apache.hadoop.hive.ql.parse.SemanticException 21 | 22 | object QueryRewriteUtils { 23 | 24 | def cacheToAlterTable(cmd: String): String = { 25 | val CACHE_TABLE_DEFAULT = "(?i)CACHE ([^ ]+)".r 26 | val CACHE_TABLE_IN = "(?i)CACHE ([^ ]+) IN ([^ ]+)".r 27 | 28 | cmd match { 29 | case CACHE_TABLE_DEFAULT(tableName) => 30 | s"ALTER TABLE $tableName SET TBLPROPERTIES ('shark.cache' = 'memory')" 31 | case CACHE_TABLE_IN(tableName, cacheType) => 32 | s"ALTER TABLE $tableName SET TBLPROPERTIES ('shark.cache' = '$cacheType')" 33 | case _ => 34 | throw new SemanticException( 35 | s"CACHE accepts a single table name: 'CACHE <table name> [IN <cache type>]'" + 36 | s" (received command: '$cmd')") 37 | } 38 | } 39 | 40 | def uncacheToAlterTable(cmd: String): String = { 41 | val cmdSplit = cmd.split(' ') 42 | if (cmdSplit.size == 2) { 43 | val tableName = cmdSplit(1) 44 | "ALTER TABLE %s SET TBLPROPERTIES ('shark.cache' = 'false')".format(tableName) 45 | } else { 46 | throw new SemanticException( 47 | s"UNCACHE accepts a single table name: 'UNCACHE <table name>
' (received command: '$cmd')") 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/shark/execution/serialization/HiveStructSerializer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.hadoop.hive.serde2.binarysortable 19 | 20 | // Putting it in this package so it can access the package level visible function 21 | // static void BinarySortableSerDe.serialize(OutputByteBuffer, Object, ObjectInspector, boolean) 22 | 23 | import java.util.{List => JList} 24 | 25 | import org.apache.hadoop.hive.serde2.objectinspector.{StructField, StructObjectInspector} 26 | 27 | 28 | /** 29 | * Used to serialize a row of data. It needs to be initialized with an object inspector 30 | * for the row. 31 | */ 32 | class HiveStructSerializer(val rowObjectInspector: StructObjectInspector) { 33 | 34 | def serialize(obj: Object): Array[Byte] = { 35 | outputByteBuffer.reset() 36 | var i = 0 37 | while (i < fields.size) { 38 | BinarySortableSerDe.serialize( 39 | outputByteBuffer, 40 | rowObjectInspector.getStructFieldData(obj, fields.get(i)), 41 | fields.get(i).getFieldObjectInspector(), 42 | false) 43 | i += 1 44 | } 45 | val bytes = new Array[Byte](outputByteBuffer.length) 46 | System.arraycopy(outputByteBuffer.getData(), 0, bytes, 0, outputByteBuffer.length) 47 | bytes 48 | } 49 | 50 | private val outputByteBuffer = new OutputByteBuffer 51 | private val fields: JList[_ <: StructField] = rowObjectInspector.getAllStructFieldRefs 52 | } 53 | -------------------------------------------------------------------------------- /conf/shark-env.sh.template: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright (C) 2012 The Regents of The University California. 4 | # All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # (Required) Amount of memory used per slave node. This should be in the same 19 | # format as the JVM's -Xmx option, e.g. 300m or 1g. 20 | export SPARK_MEM=1g 21 | 22 | # (Required) Set the master program's memory 23 | export SHARK_MASTER_MEM=1g 24 | 25 | # (Optional) Specify the location of Hive's configuration directory. 
By default, 26 | # Shark run scripts will point it to $SHARK_HOME/conf 27 | #export HIVE_CONF_DIR="" 28 | 29 | # For running Shark in distributed mode, set the following: 30 | #export HADOOP_HOME="" 31 | #export SPARK_HOME="" 32 | #export MASTER="" 33 | # Only required if using Mesos: 34 | #export MESOS_NATIVE_LIBRARY=/usr/local/lib/libmesos.so 35 | 36 | # Only required if run shark with spark on yarn 37 | #export SHARK_EXEC_MODE=yarn 38 | #export SPARK_ASSEMBLY_JAR= 39 | #export SHARK_ASSEMBLY_JAR= 40 | 41 | # (Optional) Extra classpath 42 | #export SPARK_LIBRARY_PATH="" 43 | 44 | # Java options 45 | # On EC2, change the local.dir to /mnt/tmp 46 | SPARK_JAVA_OPTS=" -Dspark.local.dir=/tmp " 47 | SPARK_JAVA_OPTS+="-Dspark.kryoserializer.buffer.mb=10 " 48 | SPARK_JAVA_OPTS+="-verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps " 49 | export SPARK_JAVA_OPTS 50 | 51 | # (Optional) Tachyon Related Configuration 52 | #export TACHYON_MASTER="" # e.g. "localhost:19998" 53 | #export TACHYON_WAREHOUSE_PATH=/sharktables # Could be any valid path name 54 | 55 | -------------------------------------------------------------------------------- /src/main/scala/shark/api/PythonTableRDD.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.api 19 | 20 | import scala.collection.JavaConversions._ 21 | 22 | import net.razorvine.pickle.Pickler 23 | 24 | import org.apache.spark.api.java.JavaRDD 25 | 26 | class PythonTableRDD( 27 | tableRDD: JavaTableRDD) 28 | extends JavaRDD[Array[Byte]](tableRDD.rdd.mapPartitions(PythonTableRDD.javaRowToPythonRow)) { 29 | val schema: java.util.Map[String, Int] = tableRDD.first.colname2indexMap 30 | } 31 | 32 | /* 33 | * These static methods are to be called by Python to run SQL queries. sql2rdd runs the query and 34 | * attempts to convert the JavaTableRDD to a Python compatible RDD (an RDD of ByteArrays 35 | * that are pickled Python objects). We map the pickle serializer per partition to convert the Java 36 | * objects to python objects, and we return the resulting PythonTableRDD to the caller (presumably 37 | * a Python process). 
38 | */ 39 | object PythonTableRDD { 40 | 41 | def sql2rdd(sc: JavaSharkContext, cmd: String): PythonTableRDD = { 42 | new PythonTableRDD(sc.sql2rdd(cmd)) 43 | } 44 | 45 | // Pickle a row of java objects to a row of pickled python objects (byte arrays) 46 | def javaRowToPythonRow(rows: Iterator[Row]): Iterator[Array[Byte]] = { 47 | // Pickler is not threadsafe, so we use 1 per partition 48 | val pickle = new Pickler 49 | rows.map { r => 50 | pickle.dumps(r.toSeq.toArray) 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/test/scala/shark/UtilsSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark 19 | 20 | import java.util.{HashMap => JHashMap} 21 | 22 | import org.apache.hadoop.conf.Configuration 23 | 24 | import org.scalatest.{BeforeAndAfter, FunSuite} 25 | 26 | 27 | class UtilsSuite extends FunSuite { 28 | 29 | test("set aws credentials") { 30 | var conf = new Configuration 31 | var map = new JHashMap[String, String]() 32 | Utils.setAwsCredentials(conf, map) 33 | assert(conf.get("fs.s3n.awsAccessKeyId") === null) 34 | assert(conf.get("fs.s3n.awsSecretAccessKey") === null) 35 | assert(conf.get("fs.s3.awsAccessKeyId") === null) 36 | assert(conf.get("fs.s3.awsSecretAccessKey") === null) 37 | 38 | map.put("AWS_ACCESS_KEY_ID", "id") 39 | conf = new Configuration 40 | Utils.setAwsCredentials(conf, map) 41 | assert(conf.get("fs.s3n.awsAccessKeyId") === null) 42 | assert(conf.get("fs.s3n.awsSecretAccessKey") === null) 43 | assert(conf.get("fs.s3.awsAccessKeyId") === null) 44 | assert(conf.get("fs.s3.awsSecretAccessKey") === null) 45 | 46 | map.put("AWS_SECRET_ACCESS_KEY", "key") 47 | conf = new Configuration 48 | Utils.setAwsCredentials(conf, map) 49 | assert(conf.get("fs.s3n.awsAccessKeyId") === "id") 50 | assert(conf.get("fs.s3n.awsSecretAccessKey") === "key") 51 | assert(conf.get("fs.s3.awsAccessKeyId") === "id") 52 | assert(conf.get("fs.s3.awsSecretAccessKey") === "key") 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/shark/execution/serialization/HiveConfPersistenceDelegate.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package shark.execution.serialization 18 | 19 | import java.beans.{Statement, Encoder, DefaultPersistenceDelegate} 20 | import scala.collection.JavaConversions._ 21 | import org.apache.hadoop.hive.conf.HiveConf 22 | import org.apache.commons.lang.ObjectUtils 23 | 24 | class HiveConfPersistenceDelegate extends DefaultPersistenceDelegate { 25 | override protected def initialize(clazz: Class[_], oldInst: AnyRef, newInst: AnyRef, out: Encoder) 26 | { 27 | val oldConf = oldInst.asInstanceOf[HiveConf] 28 | val newConf = newInst.asInstanceOf[HiveConf] 29 | 30 | if (!ObjectUtils.equals(oldConf.getAuxJars, newConf.getAuxJars)) { 31 | out.writeStatement(new Statement(oldInst, "setAuxJars", Array(oldConf.getAuxJars))) 32 | } 33 | 34 | val oldConfProps = oldConf.getAllProperties 35 | val newConfProps = newConf.getAllProperties 36 | 37 | val propsToDelete = newConfProps.filter { case(k, v) => !oldConfProps.containsKey(k) } 38 | val propsToAdd = oldConf.getAllProperties.filter { case(k, v) => 39 | !newConfProps.containsKey(k) || !ObjectUtils.equals(newConfProps.get(k), v) 40 | } 41 | 42 | propsToDelete.foreach { case(k, v) => 43 | out.writeStatement(new Statement(oldInst, "unset", Array(k))) 44 | } 45 | propsToAdd.foreach { case(k, v) => 46 | out.writeStatement(new Statement(oldInst, "set", Array(k, v))) 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/scala/shark/execution/FilterOperator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package shark.execution 19 | 20 | import scala.collection.Iterator 21 | import scala.reflect.BeanProperty 22 | 23 | import org.apache.hadoop.hive.ql.exec.{ExprNodeEvaluator, ExprNodeEvaluatorFactory} 24 | import org.apache.hadoop.hive.ql.metadata.HiveException 25 | import org.apache.hadoop.hive.ql.plan.FilterDesc 26 | import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector 27 | 28 | 29 | class FilterOperator extends UnaryOperator[FilterDesc] { 30 | 31 | @transient var conditionEvaluator: ExprNodeEvaluator = _ 32 | @transient var conditionInspector: PrimitiveObjectInspector = _ 33 | 34 | @BeanProperty var conf: FilterDesc = _ 35 | 36 | override def initializeOnMaster() { 37 | super.initializeOnMaster() 38 | 39 | conf = desc 40 | } 41 | 42 | override def initializeOnSlave() { 43 | try { 44 | conditionEvaluator = ExprNodeEvaluatorFactory.get(conf.getPredicate()) 45 | 46 | conditionInspector = conditionEvaluator.initialize(objectInspector) 47 | .asInstanceOf[PrimitiveObjectInspector] 48 | } catch { 49 | case e: Throwable => throw new HiveException(e) 50 | } 51 | } 52 | 53 | override def processPartition(split: Int, iter: Iterator[_]) = { 54 | iter.filter { row => 55 | java.lang.Boolean.TRUE.equals( 56 | conditionInspector.getPrimitiveJavaObject(conditionEvaluator.evaluate(row))) 57 | } 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/shark/execution/serialization/KryoSerializationWrapper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.execution.serialization 19 | 20 | /** 21 | * A wrapper around some unserializable objects that make them both Java 22 | * serializable. Internally, Kryo is used for serialization. 23 | * 24 | * Use KryoSerializationWrapper(value) to create a wrapper. 25 | */ 26 | class KryoSerializationWrapper[T] extends Serializable { 27 | 28 | @transient var value: T = _ 29 | 30 | private var valueSerialized: Array[Byte] = _ 31 | 32 | // The getter and setter for valueSerialized is used for XML serialization. 33 | def getValueSerialized(): Array[Byte] = { 34 | valueSerialized = KryoSerializer.serialize(value) 35 | valueSerialized 36 | } 37 | 38 | def setValueSerialized(bytes: Array[Byte]) = { 39 | valueSerialized = bytes 40 | value = KryoSerializer.deserialize[T](valueSerialized) 41 | } 42 | 43 | // Used for Java serialization. 
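  // These are the standard Java serialization hooks: writeObject refreshes
  // `valueSerialized` from `value` via Kryo before the default write, and readObject
  // restores `value` from the deserialized bytes, so only the byte array ever travels
  // through Java serialization.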
44 | private def writeObject(out: java.io.ObjectOutputStream) { 45 | getValueSerialized() 46 | out.defaultWriteObject() 47 | } 48 | 49 | private def readObject(in: java.io.ObjectInputStream) { 50 | in.defaultReadObject() 51 | setValueSerialized(valueSerialized) 52 | } 53 | } 54 | 55 | 56 | object KryoSerializationWrapper { 57 | def apply[T](value: T): KryoSerializationWrapper[T] = { 58 | val wrapper = new KryoSerializationWrapper[T] 59 | wrapper.value = value 60 | wrapper 61 | } 62 | } 63 | 64 | -------------------------------------------------------------------------------- /src/main/scala/shark/SharkServer2.scala: -------------------------------------------------------------------------------- 1 | package shark 2 | 3 | import org.apache.commons.logging.LogFactory 4 | import org.apache.hadoop.hive.common.LogUtils 5 | import org.apache.hadoop.hive.common.LogUtils.LogInitializationException 6 | import org.apache.hadoop.hive.conf.HiveConf 7 | import org.apache.hive.service.cli.thrift.ThriftCLIService 8 | import org.apache.hive.service.server.{HiveServer2, ServerOptionsProcessor} 9 | import org.apache.spark.SparkEnv 10 | import shark.server.SharkCLIService 11 | 12 | object SharkServer2 extends LogHelper { 13 | SharkEnv.init() 14 | var sparkEnv: SparkEnv = SparkEnv.get 15 | var LOG = LogFactory.getLog(classOf[SharkServer2]) 16 | 17 | def main(args: Array[String]) { 18 | try { 19 | LogUtils.initHiveLog4j() 20 | } catch { 21 | case e: LogInitializationException => { 22 | LOG.warn(e.getMessage) 23 | } 24 | } 25 | val optproc = new ServerOptionsProcessor("sharkserver2") //TODO: include load RDDs 26 | 27 | if (!optproc.process(args)) { 28 | LOG.fatal("Error starting SharkServer2 with given arguments") 29 | System.exit(-1) 30 | } 31 | 32 | Runtime.getRuntime.addShutdownHook( 33 | new Thread() { 34 | override def run() { 35 | SharkEnv.stop() 36 | } 37 | } 38 | ) 39 | } 40 | 41 | try { 42 | val hiveConf = new HiveConf 43 | SharkConfVars.initializeWithDefaults(hiveConf) 44 | val server = new SharkServer2 45 | server.init(hiveConf) 46 | server.start() 47 | logInfo("SharkServer2 started") 48 | } catch { 49 | case t: Throwable => { 50 | LOG.fatal("Error starting SharkServer2", t) 51 | System.exit(-1) 52 | } 53 | } 54 | } 55 | 56 | class SharkServer2 extends HiveServer2 { 57 | override def init(hiveConf: HiveConf) { 58 | this.synchronized { 59 | val sharkCLIService = new SharkCLIService 60 | Utils.setSuperField("cliService", sharkCLIService, this) 61 | addService(sharkCLIService) 62 | val sthriftCLIService = new ThriftCLIService(sharkCLIService) 63 | Utils.setSuperField("thriftCLIService", sthriftCLIService, this) 64 | addService(sthriftCLIService) 65 | sharkInit(hiveConf) 66 | } 67 | } 68 | } 69 | 70 | 71 | -------------------------------------------------------------------------------- /src/main/scala/shark/LogHelper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark 19 | 20 | import java.io.PrintStream 21 | 22 | import org.apache.commons.lang.StringUtils 23 | import org.apache.hadoop.hive.ql.session.SessionState 24 | 25 | import org.apache.spark.Logging 26 | 27 | /** 28 | * Utility trait for classes that want to log data. This wraps around Spark's 29 | * Logging trait. It creates a SLF4J logger for the class and allows logging 30 | * messages at different levels using methods that only evaluate parameters 31 | * lazily if the log level is enabled. 32 | * 33 | * It differs from the Spark's Logging trait in that it can print out the 34 | * error to the specified console of the Hive session. 35 | */ 36 | trait LogHelper extends Logging { 37 | 38 | override def logError(msg: => String) = { 39 | errStream().println(msg) 40 | super.logError(msg) 41 | } 42 | 43 | def logError(msg: String, detail: String) = { 44 | errStream().println(msg) 45 | super.logError(msg + StringUtils.defaultString(detail)) 46 | } 47 | 48 | def logError(msg: String, exception: Throwable) = { 49 | val err = errStream() 50 | err.println(msg) 51 | exception.printStackTrace(err) 52 | super.logError(msg, exception) 53 | } 54 | 55 | def outStream(): PrintStream = { 56 | val ss = SessionState.get() 57 | if (ss != null && ss.out != null) ss.out else System.out 58 | } 59 | 60 | def errStream(): PrintStream = { 61 | val ss = SessionState.get(); 62 | if (ss != null && ss.err != null) ss.err else System.err 63 | } 64 | 65 | 66 | } 67 | -------------------------------------------------------------------------------- /src/main/scala/shark/execution/TerminalOperator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.execution 19 | 20 | import java.util.Date 21 | 22 | import scala.reflect.BeanProperty 23 | 24 | import org.apache.hadoop.hive.conf.HiveConf 25 | import org.apache.hadoop.hive.ql.exec.{FileSinkOperator => HiveFileSinkOperator} 26 | import org.apache.hadoop.hive.ql.plan.FileSinkDesc 27 | 28 | 29 | /** 30 | * File sink operator. It can accomplish one of the three things: 31 | * - write query output to disk 32 | * - cache query output 33 | * - return query as RDD directly (without materializing it) 34 | */ 35 | class TerminalOperator extends UnaryOperator[FileSinkDesc] { 36 | 37 | // Create a local copy of hconf and hiveSinkOp so we can XML serialize it. 38 | @BeanProperty var localHiveOp: HiveFileSinkOperator = _ 39 | @BeanProperty var localHconf: HiveConf = _ 40 | @BeanProperty val now = new Date() 41 | 42 | override def initializeOnMaster() { 43 | super.initializeOnMaster() 44 | localHconf = super.hconf 45 | // Set parent to null so we won't serialize the entire query plan. 
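    // The Hive FileSinkOperator holds references to its parent and child operators and to
    // its input object inspectors; clearing them keeps the serialized operator small. The
    // inspector is supplied again when initializeOnSlave() runs on the slaves.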
46 | localHiveOp.setParentOperators(null) 47 | localHiveOp.setChildOperators(null) 48 | localHiveOp.setInputObjInspectors(null) 49 | } 50 | 51 | override def initializeOnSlave() { 52 | localHiveOp.initialize(localHconf, Array(objectInspector)) 53 | } 54 | 55 | override def processPartition(split: Int, iter: Iterator[_]): Iterator[_] = iter 56 | } 57 | 58 | 59 | /** 60 | * Collect the output as a TableRDD. 61 | */ 62 | class TableRddSinkOperator extends TerminalOperator {} 63 | -------------------------------------------------------------------------------- /src/main/scala/shark/memstore2/Table.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.memstore2 19 | 20 | import scala.collection.mutable.ArrayBuffer 21 | 22 | import org.apache.spark.rdd.RDD 23 | 24 | import scala.collection.mutable.Buffer 25 | 26 | 27 | /** 28 | * A container for table metadata managed by Shark and Spark. Subclasses are responsible for 29 | * how RDDs are set, stored, and accessed. 30 | * 31 | * @param databaseName Namespace for this table. 32 | * @param tableName Name of this table. 33 | * @param cacheMode Type of memory storage used for the table (e.g., the Spark block manager). 34 | */ 35 | private[shark] abstract class Table( 36 | var databaseName: String, 37 | var tableName: String, 38 | var cacheMode: CacheType.CacheType) { 39 | 40 | /** 41 | * A mutable wrapper for an RDD and stats for its partitions. 42 | */ 43 | class RDDValue( 44 | var rdd: RDD[TablePartition], 45 | var stats: collection.Map[Int, TablePartitionStats]) { 46 | 47 | def toTuple = (rdd, stats) 48 | } 49 | } 50 | 51 | object Table { 52 | 53 | /** 54 | * Merges contents of `otherStatsMaps` into `targetStatsMap`. 55 | */ 56 | def mergeStats( 57 | targetStatsMap: Buffer[(Int, TablePartitionStats)], 58 | otherStatsMap: Iterable[(Int, TablePartitionStats)] 59 | ): Buffer[(Int, TablePartitionStats)] = { 60 | val targetStatsMapSize = targetStatsMap.size 61 | for ((otherIndex, tableStats) <- otherStatsMap) { 62 | targetStatsMap.append((otherIndex + targetStatsMapSize, tableStats)) 63 | } 64 | targetStatsMap 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/main/scala/shark/execution/serialization/SerializableWritable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.execution.serialization 19 | 20 | import java.io._ 21 | import org.apache.hadoop.io.ObjectWritable 22 | import org.apache.hadoop.io.Writable 23 | import org.apache.hadoop.mapred.JobConf 24 | import org.apache.hadoop.io.NullWritable 25 | 26 | object SerializableWritable { 27 | val conf = new JobConf() 28 | } 29 | 30 | 31 | class SerializableWritable[T <: Writable](@transient var t: T) extends Serializable { 32 | def value = t 33 | 34 | override def toString = if(null == t) "null" else t.toString 35 | 36 | private def writeObject(out: ObjectOutputStream) { 37 | out.defaultWriteObject() 38 | new ObjectWritable(if (t == null) NullWritable.get() else t).write(out) 39 | } 40 | 41 | private def readObject(in: ObjectInputStream) { 42 | in.defaultReadObject() 43 | val ow = new ObjectWritable() 44 | ow.setConf(SerializableWritable.conf) 45 | ow.readFields(in) 46 | val s = ow.get 47 | if (s == null || s.isInstanceOf[NullWritable]) { 48 | t = null.asInstanceOf[T] 49 | } else { 50 | t = s.asInstanceOf[T] 51 | } 52 | } 53 | 54 | override def hashCode(): Int = if(t == null) 0 else t.hashCode 55 | 56 | override def equals(other: Any) = { 57 | if(other.isInstanceOf[SerializableWritable[_]].unary_!) { 58 | false 59 | } else { 60 | val other_t = other.asInstanceOf[SerializableWritable[_]].t 61 | if (t == null) { 62 | other_t == null 63 | } else { 64 | t.equals(other_t) 65 | } 66 | } 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/main/scala/shark/memstore2/TablePartitionIterator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.memstore2 19 | 20 | import java.util.BitSet 21 | import shark.memstore2.column.ColumnIterator 22 | 23 | 24 | /** 25 | * An iterator for a partition of data. Each element returns a ColumnarStruct 26 | * that can be read by a ColumnarStructObjectInspector. 27 | * 28 | * @param numRows: total number of rows in this partition. 29 | * @param columnIterators: iterators for all columns. 30 | @ @param columnUsed: an optional bitmap indicating whether a column is used. 
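 * Columns whose bit is cleared in columnUsed are never advanced by next(), so only the
 * columns actually read by a query pay the cost of iteration.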
31 | */ 32 | class TablePartitionIterator( 33 | val numRows: Long, 34 | val columnIterators: Array[ColumnIterator], 35 | val columnUsed: BitSet) 36 | extends Iterator[ColumnarStruct] { 37 | 38 | def this(numRows: Long, 39 | columnIterators: Array[ColumnIterator]) { 40 | this(numRows, columnIterators, TablePartitionIterator.newBitSet(columnIterators.size)) 41 | } 42 | 43 | private val _struct = new ColumnarStruct(columnIterators) 44 | 45 | private var _position: Long = 0 46 | 47 | def hasNext: Boolean = _position < numRows 48 | 49 | def next(): ColumnarStruct = { 50 | _position += 1 51 | var i = columnUsed.nextSetBit(0) 52 | while (i > -1) { 53 | columnIterators(i).next() 54 | i = columnUsed.nextSetBit(i + 1) 55 | } 56 | _struct 57 | } 58 | } 59 | 60 | object TablePartitionIterator { 61 | 62 | def newBitSet(numCols: Int): BitSet = { 63 | val b = new BitSet(numCols) 64 | var i = numCols 65 | while (i > 0) { 66 | i -= 1 67 | b.set(i, true) 68 | } 69 | b 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /bin/dev/build_test.xml: -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /src/main/scala/shark/repl/SharkILoop.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.repl 19 | 20 | import java.io.PrintWriter 21 | 22 | import org.apache.spark.{SparkContext, SparkEnv} 23 | import org.apache.spark.repl.SparkILoop 24 | 25 | import shark.{SharkContext, SharkEnv} 26 | 27 | 28 | /** 29 | * Add more Shark specific initializations. 30 | */ 31 | class SharkILoop extends SparkILoop(None, new PrintWriter(Console.out, true), None) { 32 | 33 | override def initializeSpark() { 34 | // Note: shark.SharkEnv.initWithSharkContext must be invoked after spark.repl.Main.interp 35 | // is used because the slaves' executors depend on the environmental variable 36 | // "spark.repl.class.uri" set to invoke Spark's ExecutorClassLoader.
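    // The quoted block below is evaluated inside the interpreter itself: it creates the
    // SharkContext, exposes it to the session as `sparkContext` and `sc`, and aliases `s`
    // to sql2console so SQL can be issued directly from the prompt.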
37 | intp.beQuietDuring { 38 | command(""" 39 | org.apache.spark.repl.Main.interp.out.println("Creating SparkContext..."); 40 | org.apache.spark.repl.Main.interp.out.flush(); 41 | shark.SharkEnv.initWithSharkContext("shark-shell"); 42 | @transient val sparkContext = shark.SharkEnv.sc; 43 | org.apache.spark.repl.Main.interp.sparkContext = sparkContext; 44 | @transient val sc = sparkContext.asInstanceOf[shark.SharkContext]; 45 | org.apache.spark.repl.Main.interp.out.println("Shark context available as sc."); 46 | import sc._; 47 | def s = sql2console _; 48 | org.apache.spark.repl.Main.interp.out.flush(); 49 | """) 50 | command("import org.apache.spark.SparkContext._"); 51 | } 52 | Console.println("Type in expressions to have them evaluated.") 53 | Console.println("Type :help for more information.") 54 | Console.flush() 55 | } 56 | } 57 | 58 | -------------------------------------------------------------------------------- /src/main/scala/shark/memstore2/column/NullableColumnIterator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.memstore2.column 19 | 20 | import java.nio.ByteBuffer 21 | import java.nio.ByteOrder 22 | 23 | /** 24 | * Reads a nullable column. Expects the byte buffer to contain as first element 25 | * the null count, followed by the null indices, and finally the non nulls. 26 | * Reading of non nulls is delegated by setting the buffer position to the first 27 | * non null. 28 | */ 29 | class NullableColumnIterator(buffer: ByteBuffer) extends ColumnIterator { 30 | private var _d: ByteBuffer = _ 31 | private var _nullCount: Int = _ 32 | private var _nulls = 0 33 | 34 | private var _isNull = false 35 | private var _currentNullIndex: Int = _ 36 | private var _pos = 0 37 | 38 | private var _delegate: ColumnIterator = _ 39 | 40 | override def init() { 41 | _d = buffer.duplicate() 42 | _d.order(ByteOrder.nativeOrder()) 43 | _nullCount = _d.getInt() 44 | _currentNullIndex = if (_nullCount > 0) _d.getInt() else Integer.MAX_VALUE 45 | _pos = 0 46 | 47 | // Move the buffer position to the non-null region. 
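    // The header is a 4-byte null count followed by `_nullCount` 4-byte null positions,
    // so skipping 4 + _nullCount * 4 bytes lands the delegate iterator on the first
    // non-null value (see NullableColumnBuilder for the matching layout).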
48 | buffer.position(buffer.position() + 4 + _nullCount * 4) 49 | _delegate = ColumnIterator.newNonNullIterator(buffer) 50 | } 51 | 52 | override def next() { 53 | if (_pos == _currentNullIndex) { 54 | _nulls += 1 55 | if (_nulls < _nullCount) { 56 | _currentNullIndex = _d.getInt() 57 | } 58 | _isNull = true 59 | } else { 60 | _isNull = false 61 | _delegate.next() 62 | } 63 | _pos += 1 64 | } 65 | 66 | override def hasNext: Boolean = (_nulls < _nullCount) || _delegate.hasNext 67 | 68 | def current: Object = if (_isNull) null else _delegate.current 69 | } 70 | -------------------------------------------------------------------------------- /src/test/scala/shark/CliSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark 19 | 20 | import java.io.{BufferedReader, File, InputStreamReader, PrintWriter} 21 | import org.scalatest.{BeforeAndAfterAll, FunSuite} 22 | 23 | 24 | /** 25 | * Test the Shark CLI. 26 | */ 27 | class CliSuite extends FunSuite with BeforeAndAfterAll with TestUtils { 28 | 29 | val WAREHOUSE_PATH = TestUtils.getWarehousePath("cli") 30 | val METASTORE_PATH = TestUtils.getMetastorePath("cli") 31 | 32 | override def beforeAll() { 33 | val pb = new ProcessBuilder( 34 | "./bin/shark", 35 | "-hiveconf", 36 | "javax.jdo.option.ConnectionURL=jdbc:derby:;databaseName=" + METASTORE_PATH + ";create=true", 37 | "-hiveconf", 38 | "hive.metastore.warehouse.dir=" + WAREHOUSE_PATH) 39 | 40 | process = pb.start() 41 | outputWriter = new PrintWriter(process.getOutputStream, true) 42 | inputReader = new BufferedReader(new InputStreamReader(process.getInputStream)) 43 | errorReader = new BufferedReader(new InputStreamReader(process.getErrorStream)) 44 | waitForOutput(inputReader, "shark>") 45 | } 46 | 47 | override def afterAll() { 48 | process.destroy() 49 | process.waitFor() 50 | } 51 | 52 | test("simple select") { 53 | val dataFilePath = TestUtils.dataFilePath + "/kv1.txt" 54 | executeQuery("create table shark_test1(key int, val string);") 55 | executeQuery("load data local inpath '" + dataFilePath+ "' overwrite into table shark_test1;") 56 | executeQuery("""create table shark_test1_cached TBLPROPERTIES ("shark.cache" = "true") as 57 | select * from shark_test1;""") 58 | val out = executeQuery("select * from shark_test1_cached where key = 407;") 59 | assert(out.contains("val_407")) 60 | } 61 | 62 | } 63 | -------------------------------------------------------------------------------- /src/main/scala/shark/parse/SharkExplainSemanticAnalyzer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 
4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.parse 19 | 20 | import java.io.Serializable 21 | import java.util.ArrayList 22 | 23 | import org.apache.hadoop.fs.Path 24 | import org.apache.hadoop.hive.conf.HiveConf 25 | import org.apache.hadoop.hive.ql.exec._ 26 | import org.apache.hadoop.hive.ql.parse._ 27 | 28 | import shark.execution.SharkExplainWork 29 | 30 | 31 | class SharkExplainSemanticAnalyzer(conf: HiveConf) extends ExplainSemanticAnalyzer(conf) { 32 | 33 | var sem: BaseSemanticAnalyzer = null 34 | 35 | /** 36 | * This is basically the same as Hive's except we invoke 37 | * SharkSemanticAnalyzerFactory. We need to do this to get 38 | * SharkSemanticAnalyzer for SELECT and CTAS queries. 39 | */ 40 | override def analyzeInternal(ast: ASTNode): Unit = { 41 | ctx.setExplain(true) 42 | 43 | // Create a semantic analyzer for the query 44 | val childNode = ast.getChild(0).asInstanceOf[ASTNode] 45 | sem = SharkSemanticAnalyzerFactory.get(conf, childNode) 46 | sem.analyze(childNode, ctx) 47 | 48 | val extended = (ast.getChildCount() > 1) 49 | 50 | ctx.setResFile(new Path(ctx.getLocalTmpFileURI())) 51 | var tasks = sem.getRootTasks() 52 | val fetchTask = sem.getFetchTask() 53 | if (tasks == null) { 54 | if (fetchTask != null) { 55 | tasks = new ArrayList[Task[_ <: Serializable]](); 56 | tasks.add(fetchTask) 57 | } 58 | } else if (fetchTask != null) { 59 | tasks.add(fetchTask) 60 | } 61 | 62 | val task = TaskFactory.get( 63 | new SharkExplainWork(ctx.getResFile().toString(), tasks, childNode.toStringTree(), 64 | sem.getInputs(), extended), conf) 65 | 66 | rootTasks.add(task) 67 | } 68 | } 69 | 70 | -------------------------------------------------------------------------------- /src/main/scala/shark/memstore2/column/NullableColumnBuilder.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.memstore2.column 19 | 20 | import java.nio.ByteBuffer 21 | import java.nio.ByteOrder 22 | 23 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector 24 | 25 | 26 | /** 27 | * Builds a nullable column. 
The byte buffer of a nullable column contains: 28 | * - 4 bytes for the null count (number of nulls) 29 | * - positions for each null, in ascending order 30 | * - the non-null data (column data type, compression type, data...) 31 | */ 32 | trait NullableColumnBuilder[T] extends ColumnBuilder[T] { 33 | 34 | private var _nulls: ByteBuffer = _ 35 | 36 | private var _pos: Int = _ 37 | private var _nullCount: Int = _ 38 | 39 | override def initialize(initialSize: Int, cName: String): ByteBuffer = { 40 | _nulls = ByteBuffer.allocate(1024) 41 | _nulls.order(ByteOrder.nativeOrder()) 42 | _pos = 0 43 | _nullCount = 0 44 | super.initialize(initialSize, cName) 45 | } 46 | 47 | override def append(o: Object, oi: ObjectInspector) { 48 | if (o == null) { 49 | _nulls = growIfNeeded(_nulls, 4) 50 | _nulls.putInt(_pos) 51 | _nullCount += 1 52 | } else { 53 | super.append(o, oi) 54 | } 55 | _pos += 1 56 | } 57 | 58 | override def build(): ByteBuffer = { 59 | val nonNulls = super.build() 60 | val nullDataLen = _nulls.position() 61 | _nulls.limit(nullDataLen) 62 | _nulls.rewind() 63 | 64 | // 4 bytes for null count + null positions + non nulls 65 | val newBuffer = ByteBuffer.allocate(4 + nullDataLen + nonNulls.limit) 66 | newBuffer.order(ByteOrder.nativeOrder()) 67 | newBuffer.putInt(_nullCount).put(_nulls).put(nonNulls) 68 | newBuffer.rewind() 69 | newBuffer 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/main/scala/shark/server/SharkSQLOperation.scala: -------------------------------------------------------------------------------- 1 | package shark.server 2 | 3 | import java.util.{Map => JMap} 4 | import org.apache.hadoop.hive.ql.parse.VariableSubstitution 5 | import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse 6 | import org.apache.hive.service.cli.{HiveSQLException, OperationState, TableSchema} 7 | import org.apache.hive.service.cli.operation.SQLOperation 8 | import org.apache.hive.service.cli.session.HiveSession 9 | import shark.{SharkDriver, Utils} 10 | 11 | class SharkSQLOperation( 12 | parentSession: HiveSession, 13 | statement: String, 14 | confOverlay: JMap[String, String]) 15 | extends SQLOperation(parentSession, statement, confOverlay) { 16 | 17 | private val sdriver = { 18 | val d = new SharkDriver(getParentSession.getHiveConf) 19 | d.init() 20 | d 21 | } 22 | 23 | override def run() { 24 | setState(OperationState.RUNNING) 25 | Utils.setSuperField("driver", sdriver, this) 26 | var response: Option[CommandProcessorResponse] = None 27 | sdriver.setTryCount(Integer.MAX_VALUE) //maybe useless? 
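    // Matches Hive's own SQLOperation, which raises the Driver's try count before
    // running the statement; kept here for parity and possibly redundant in Shark.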
28 | var subStatement = "" 29 | try { 30 | //duplicate: this is also done when Driver compiles command 31 | subStatement = new VariableSubstitution().substitute(getParentSession.getHiveConf, statement) 32 | } catch { 33 | case e: IllegalStateException => { 34 | setState(OperationState.ERROR) 35 | throw new HiveSQLException 36 | } 37 | } 38 | 39 | response = Option(sdriver.run(subStatement)) 40 | response match { 41 | case Some(resp: CommandProcessorResponse) => { 42 | val code = resp.getResponseCode 43 | if (code != 0) { 44 | setState(OperationState.ERROR) 45 | throw new HiveSQLException("Error while processing statement: " 46 | + resp.getErrorMessage, resp.getSQLState, code) 47 | } 48 | } 49 | case None => { 50 | setState(OperationState.ERROR) 51 | throw new HiveSQLException 52 | } 53 | } 54 | 55 | val mResultSchema = sdriver.getSchema 56 | Utils.setSuperField("mResultSchema", mResultSchema, this) 57 | if (mResultSchema != null && mResultSchema.isSetFieldSchemas) { 58 | val resultSchema = new TableSchema(mResultSchema) 59 | Utils.setSuperField("resultSchema", resultSchema, this) 60 | setHasResultSet(true) 61 | } else { 62 | setHasResultSet(false) 63 | } 64 | setState(OperationState.FINISHED) 65 | } 66 | 67 | } 68 | -------------------------------------------------------------------------------- /src/main/scala/shark/memstore2/TablePartitionBuilder.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.memstore2 19 | 20 | import java.io.{DataInput, DataOutput} 21 | 22 | import scala.collection.JavaConversions._ 23 | 24 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector 25 | import org.apache.hadoop.hive.serde2.objectinspector.StructField 26 | import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector 27 | import org.apache.hadoop.io.Writable 28 | 29 | import shark.memstore2.column.ColumnBuilder 30 | 31 | 32 | /** 33 | * Used to build a TablePartition. This is used in the serializer to convert a 34 | * partition of data into columnar format and to generate a TablePartition. 
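 *
 * A rough usage sketch (names and sizes below are illustrative, not taken from this file):
 *
 *   val fields = rowOi.getAllStructFieldRefs
 *   val builder = new TablePartitionBuilder(rowOi, initialColumnSize = 1000000)
 *   rows.foreach { row =>
 *     (0 until fields.size).foreach { i =>
 *       builder.append(i, rowOi.getStructFieldData(row, fields.get(i)), fields.get(i).getFieldObjectInspector)
 *     }
 *     builder.incrementRowCount()
 *   }
 *   val partition = builder.build()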
35 | */ 36 | class TablePartitionBuilder( 37 | oi: StructObjectInspector, 38 | initialColumnSize: Int, 39 | shouldCompress: Boolean = true) 40 | extends Writable { 41 | 42 | private var numRows: Long = 0 43 | val fields: Seq[_ <: StructField] = oi.getAllStructFieldRefs 44 | 45 | val columnBuilders = Array.tabulate[ColumnBuilder[_]](fields.size) { i => 46 | val columnBuilder = ColumnBuilder.create(fields(i), shouldCompress) 47 | columnBuilder.initialize(initialColumnSize, fields(i).getFieldName) 48 | columnBuilder 49 | } 50 | 51 | def incrementRowCount() { 52 | numRows += 1 53 | } 54 | 55 | def append(columnIndex: Int, o: Object, oi: ObjectInspector) { 56 | columnBuilders(columnIndex).append(o, oi) 57 | } 58 | 59 | def stats: TablePartitionStats = new TablePartitionStats(columnBuilders.map(_.stats), numRows) 60 | 61 | def build(): TablePartition = new TablePartition(numRows, columnBuilders.map(_.build())) 62 | 63 | // We don't use these, but want to maintain Writable interface for SerDe 64 | override def write(out: DataOutput) {} 65 | override def readFields(in: DataInput) {} 66 | } 67 | -------------------------------------------------------------------------------- /src/main/scala/shark/execution/serialization/HiveStructDeserializer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.hadoop.hive.serde2.binarysortable 19 | 20 | // Putting it in this package so it can access the package level visible function 21 | // static void BinarySortableSerDe.serialize(OutputByteBuffer, Object, ObjectInspector, boolean) 22 | 23 | import java.io.IOException 24 | import java.util.{ArrayList => JArrayList} 25 | 26 | import org.apache.hadoop.hive.serde2.SerDeException 27 | import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector 28 | import org.apache.hadoop.hive.serde2.typeinfo.{TypeInfo, TypeInfoUtils} 29 | 30 | 31 | /** 32 | * Used to deserialize a row of data. It needs to be initialized with an object inspector 33 | * for the row. 
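 *
 * Typically paired with a HiveStructSerializer built from the same inspector; a minimal
 * round-trip sketch (`rowOi` and `row` are assumed to be defined elsewhere):
 *
 *   val ser = new HiveStructSerializer(rowOi)
 *   val deser = new HiveStructDeserializer(rowOi)
 *   val roundTripped = deser.deserialize(ser.serialize(row))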
34 | */ 35 | class HiveStructDeserializer(val rowObjectInspector: StructObjectInspector) { 36 | 37 | def deserialize(bytes: Array[Byte]): JArrayList[Object] = { 38 | inputByteBuffer.reset(bytes, 0, bytes.length) 39 | try { 40 | var i = 0 41 | while (i < types.size) { 42 | reusedRow.set(i, 43 | BinarySortableSerDe.deserialize(inputByteBuffer, types(i), false, reusedRow.get(i))) 44 | i += 1 45 | } 46 | } catch{ 47 | case e: IOException => throw new SerDeException(e) 48 | } 49 | reusedRow 50 | } 51 | 52 | private val inputByteBuffer = new InputByteBuffer 53 | private val types = Array.tabulate[TypeInfo](rowObjectInspector.getAllStructFieldRefs.size) { i => 54 | TypeInfoUtils.getTypeInfoFromObjectInspector( 55 | rowObjectInspector.getAllStructFieldRefs.get(i).getFieldObjectInspector) 56 | } 57 | 58 | private val reusedRow: JArrayList[Object] = { 59 | val row = new JArrayList[Object](rowObjectInspector.getAllStructFieldRefs.size()) 60 | (0 until rowObjectInspector.getAllStructFieldRefs.size).foreach(i => row.add(null)) 61 | row 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/main/scala/shark/optimizer/SharkMapJoinProcessor.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.optimizer 19 | 20 | import java.util.{LinkedHashMap => JavaLinkedHashMap} 21 | 22 | import org.apache.hadoop.hive.ql.exec.{MapJoinOperator, JoinOperator, Operator} 23 | import org.apache.hadoop.hive.ql.optimizer.MapJoinProcessor 24 | import org.apache.hadoop.hive.ql.parse.{ParseContext, QBJoinTree, OpParseContext} 25 | import org.apache.hadoop.hive.ql.plan.OperatorDesc 26 | import org.apache.hadoop.hive.conf.HiveConf 27 | 28 | class SharkMapJoinProcessor extends MapJoinProcessor { 29 | 30 | /** 31 | * Override generateMapJoinOperator to bypass the step of validating Map Join hints int Hive. 32 | */ 33 | override def generateMapJoinOperator( 34 | pctx: ParseContext, 35 | op: JoinOperator, 36 | joinTree: QBJoinTree, 37 | mapJoinPos: Int): MapJoinOperator = { 38 | val hiveConf: HiveConf = pctx.getConf 39 | val noCheckOuterJoin: Boolean = 40 | HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTSORTMERGEBUCKETMAPJOIN) && 41 | HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTBUCKETMAPJOIN) 42 | 43 | val opParseCtxMap: JavaLinkedHashMap[Operator[_ <: OperatorDesc], OpParseContext] = 44 | pctx.getOpParseCtx 45 | 46 | // Explicitly set validateMapJoinTree to false to bypass the step of validating 47 | // Map Join hints in Hive. 
48 | val validateMapJoinTree = false 49 | val mapJoinOp: MapJoinOperator = 50 | MapJoinProcessor.convertMapJoin( 51 | opParseCtxMap, op, joinTree, mapJoinPos, noCheckOuterJoin, validateMapJoinTree) 52 | 53 | // Hive originally uses genSelectPlan to insert an dummy select after the MapJoinOperator. 54 | // We should not need this step. 55 | // create a dummy select to select all columns 56 | // MapJoinProcessor.genSelectPlan(pctx, mapJoinOp) 57 | 58 | return mapJoinOp 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/shark/execution/SelectOperator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.execution 19 | 20 | import scala.collection.JavaConversions._ 21 | import scala.reflect.BeanProperty 22 | 23 | import org.apache.hadoop.hive.ql.exec.{ExprNodeEvaluator, ExprNodeEvaluatorFactory} 24 | import org.apache.hadoop.hive.ql.plan.SelectDesc 25 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector 26 | 27 | 28 | /** 29 | * An operator that does projection, i.e. selecting certain columns and 30 | * filtering out others. 31 | */ 32 | class SelectOperator extends UnaryOperator[SelectDesc] { 33 | 34 | @BeanProperty var conf: SelectDesc = _ 35 | 36 | @transient var evals: Array[ExprNodeEvaluator] = _ 37 | 38 | override def initializeOnMaster() { 39 | super.initializeOnMaster() 40 | conf = desc 41 | initializeEvals(false) 42 | } 43 | 44 | def initializeEvals(initializeEval: Boolean) { 45 | if (!conf.isSelStarNoCompute) { 46 | evals = conf.getColList().map(ExprNodeEvaluatorFactory.get(_)).toArray 47 | if (initializeEval) { 48 | evals.foreach(_.initialize(objectInspector)) 49 | } 50 | } 51 | } 52 | 53 | override def initializeOnSlave() { 54 | initializeEvals(true) 55 | } 56 | 57 | override def processPartition(split: Int, iter: Iterator[_]) = { 58 | if (conf.isSelStarNoCompute) { 59 | iter 60 | } else { 61 | val reusedRow = new Array[Object](evals.length) 62 | iter.map { row => 63 | var i = 0 64 | while (i < evals.length) { 65 | reusedRow(i) = evals(i).evaluate(row) 66 | i += 1 67 | } 68 | reusedRow 69 | } 70 | } 71 | } 72 | 73 | override def outputObjectInspector(): ObjectInspector = { 74 | if (conf.isSelStarNoCompute()) { 75 | super.outputObjectInspector() 76 | } else { 77 | initEvaluatorsAndReturnStruct(evals, conf.getOutputColumnNames(), objectInspector) 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/scala/shark/memstore2/CacheType.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 
4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.memstore2 19 | 20 | import shark.LogHelper 21 | 22 | 23 | /* 24 | * Enumerations and static helper functions for caches supported by Shark. 25 | */ 26 | object CacheType extends Enumeration with LogHelper { 27 | 28 | /* 29 | * The CacheTypes: 30 | * - MEMORY: Stored in memory and on disk (i.e., cache is write-through). Persistent across Shark 31 | * sessions. By default, all such tables are reloaded into memory on restart. 32 | * - MEMORY_ONLY: Stored only in memory and dropped at the end of each Shark session. 33 | * - OFFHEAP: Stored in an off-heap data storage format, specified by the System property 34 | * 'shark.offheap.clientFactory'. Defaults to TachyonStorageClientFactory. 35 | * - NONE: Stored on disk (e.g., HDFS) and managed by Hive. 36 | */ 37 | type CacheType = Value 38 | val MEMORY, MEMORY_ONLY, OFFHEAP, NONE = Value 39 | 40 | def shouldCache(c: CacheType): Boolean = (c != NONE) 41 | 42 | /** Get the cache type object from a string representation. */ 43 | def fromString(name: String): CacheType = Option(name).map(_.toUpperCase) match { 44 | case None | Some("") | Some("FALSE") => NONE 45 | case Some("TRUE") => MEMORY 46 | case Some("HEAP") => 47 | logWarning("The 'HEAP' cache type name is deprecated. Use 'MEMORY' instead.") 48 | MEMORY 49 | case Some("TACHYON") => 50 | logWarning("The 'TACHYON' cache type name is deprecated. Use 'OFFHEAP' instead.") 51 | OFFHEAP 52 | case _ => { 53 | try { 54 | // Try to use Scala's Enumeration::withName() to interpret 'name'. 55 | withName(name.toUpperCase) 56 | } catch { 57 | case e: java.util.NoSuchElementException => throw new InvalidCacheTypeException(name) 58 | } 59 | } 60 | } 61 | 62 | class InvalidCacheTypeException(name: String) 63 | extends Exception("Invalid string representation of cache type: '%s'".format(name)) 64 | } 65 | -------------------------------------------------------------------------------- /src/test/scala/shark/execution/HiveStructSerializerSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package shark.execution 19 | 20 | import java.util.{ArrayList => JArrayList} 21 | 22 | import scala.collection.JavaConversions._ 23 | 24 | import org.apache.hadoop.hive.serde2.binarysortable.{HiveStructSerializer, HiveStructDeserializer} 25 | import org.apache.hadoop.hive.serde2.objectinspector.{PrimitiveObjectInspector, 26 | ObjectInspectorFactory, StandardListObjectInspector, StandardMapObjectInspector, 27 | StructObjectInspector} 28 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.{PrimitiveObjectInspectorUtils, 29 | PrimitiveObjectInspectorFactory} 30 | import org.apache.hadoop.io.{IntWritable, LongWritable, Text} 31 | 32 | import org.scalatest.FunSuite 33 | 34 | 35 | class HiveStructSerializerSuite extends FunSuite { 36 | 37 | test("Testing serializing a simple row") { 38 | val row1 = createRow(1, "test1") 39 | val row2 = createRow(2, "test2") 40 | val ser = new HiveStructSerializer(createObjectInspector) 41 | val deser = new HiveStructDeserializer(createObjectInspector) 42 | val deserRow1 = deser.deserialize(ser.serialize(row1)) 43 | assert(row1.get(0).equals(deserRow1.get(0))) 44 | assert(row1.get(1).equals(deserRow1.get(1))) 45 | } 46 | 47 | def createObjectInspector(): StructObjectInspector = { 48 | val names = List("a", "b") 49 | val ois = List( 50 | createPrimitiveOi(classOf[java.lang.Integer]), 51 | createPrimitiveOi(classOf[String])) 52 | ObjectInspectorFactory.getStandardStructObjectInspector(names, ois) 53 | } 54 | 55 | def createRow(v1: Int, v2: String): JArrayList[Object] = { 56 | val row = new JArrayList[Object](2) 57 | row.add(new IntWritable(v1)) 58 | row.add(new Text(v2)) 59 | row 60 | } 61 | 62 | def createPrimitiveOi(javaClass: Class[_]): PrimitiveObjectInspector = 63 | PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector( 64 | PrimitiveObjectInspectorUtils.getTypeEntryFromPrimitiveJavaClass(javaClass).primitiveCategory) 65 | } 66 | -------------------------------------------------------------------------------- /src/main/scala/shark/execution/serialization/OperatorSerializationWrapper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.execution.serialization 19 | 20 | import shark.execution.HiveDesc 21 | import shark.execution.Operator 22 | 23 | 24 | /** 25 | * A wrapper around our operators so they can be serialized by standard Java 26 | * serialization. This really just delegates the serialization of the operators 27 | * to XML, and that of object inspectors to Kryo. 28 | * 29 | * Use OperatorSerializationWrapper(operator) to create a wrapper. 30 | */ 31 | class OperatorSerializationWrapper[T <: Operator[_ <: HiveDesc]] 32 | extends Serializable with shark.LogHelper { 33 | 34 | /** The operator we are going to serialize. 
*/ 35 | @transient var _value: T = _ 36 | 37 | /** The operator serialized by the XMLEncoder, minus the object inspectors. */ 38 | var opSerialized: Array[Byte] = _ 39 | 40 | /** The object inspectors, serialized by Kryo. */ 41 | var objectInspectorsSerialized: Array[Byte] = _ 42 | 43 | def value: T = { 44 | if (_value == null) { 45 | assert(opSerialized != null) 46 | assert(opSerialized.length > 0) 47 | assert(objectInspectorsSerialized != null) 48 | assert(objectInspectorsSerialized.length > 0) 49 | _value = XmlSerializer.deserialize[T](opSerialized) 50 | _value.objectInspectors = KryoSerializer.deserialize(objectInspectorsSerialized) 51 | } 52 | _value 53 | } 54 | 55 | def value_= (v: T):Unit = { 56 | _value = v 57 | opSerialized = XmlSerializer.serialize(value, v.hconf) 58 | objectInspectorsSerialized = KryoSerializer.serialize(value.objectInspectors) 59 | } 60 | 61 | override def toString(): String = { 62 | if (value != null) { 63 | "OperatorSerializationWrapper[ " + value.toString() + " ]" 64 | } else { 65 | super.toString() 66 | } 67 | } 68 | } 69 | 70 | 71 | object OperatorSerializationWrapper { 72 | def apply[T <: Operator[_ <: HiveDesc]](value: T): OperatorSerializationWrapper[T] = { 73 | val wrapper = new OperatorSerializationWrapper[T] 74 | wrapper.value = value 75 | wrapper 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/main/scala/shark/memstore2/TableRecovery.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.memstore2 19 | 20 | import java.util.{HashMap => JavaHashMap} 21 | 22 | import scala.collection.JavaConversions.asScalaBuffer 23 | 24 | import org.apache.hadoop.hive.ql.metadata.Hive 25 | import org.apache.hadoop.hive.ql.session.SessionState 26 | 27 | import shark.{LogHelper, SharkEnv} 28 | import shark.util.QueryRewriteUtils 29 | 30 | /** 31 | * Singleton used to reload RDDs upon server restarts. 32 | */ 33 | object TableRecovery extends LogHelper { 34 | 35 | val db = Hive.get() 36 | 37 | /** 38 | * Loads any cached tables with MEMORY as its `shark.cache` property. 39 | * @param cmdRunner The runner that is responsible for taking a cached table query and 40 | * a) Creating the table metadata in Hive Meta Store 41 | * b) Loading the table as an RDD in memory 42 | * @see SharkServer for an example usage. 43 | * @param console Optional SessionState.LogHelper used, if present, to log information about 44 | the tables that get reloaded. 45 | */ 46 | def reloadRdds(cmdRunner: String => Unit, console: Option[SessionState.LogHelper] = None) { 47 | // Filter for tables that should be reloaded into the cache. 
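    // Remember the active database so it can be restored after the per-database
    // USE statements issued below.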
48 | val currentDbName = db.getCurrentDatabase() 49 | for (databaseName <- db.getAllDatabases(); tableName <- db.getAllTables(databaseName)) { 50 | val hiveTable = db.getTable(databaseName, tableName) 51 | val tblProps = hiveTable.getParameters 52 | val cacheMode = CacheType.fromString(tblProps.get(SharkTblProperties.CACHE_FLAG.varname)) 53 | if (cacheMode == CacheType.MEMORY) { 54 | val logMessage = "Reloading %s.%s into memory.".format(databaseName, tableName) 55 | if (console.isDefined) { 56 | console.get.printInfo(logMessage) 57 | } else { 58 | logInfo(logMessage) 59 | } 60 | val cmd = QueryRewriteUtils.cacheToAlterTable("CACHE %s".format(tableName)) 61 | cmdRunner(s"use $databaseName") 62 | cmdRunner(cmd) 63 | } 64 | } 65 | db.setCurrentDatabase(currentDbName) 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/scala/shark/api/TableRDD.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.api 19 | 20 | import java.util.{List => JList} 21 | 22 | import org.apache.hadoop.hive.metastore.api.FieldSchema 23 | import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspector, StructObjectInspector} 24 | 25 | import shark.execution.serialization.KryoSerializer 26 | 27 | import org.apache.spark.{Partition, TaskContext} 28 | import org.apache.spark.rdd.RDD 29 | 30 | 31 | class TableRDD( 32 | prev: RDD[Any], 33 | val schema: Array[ColumnDesc], 34 | @transient oi: ObjectInspector, 35 | val limit: Int = -1) 36 | extends RDD[Row](prev) { 37 | 38 | private[shark] 39 | def this(prev: RDD[Any], schema: JList[FieldSchema], oi: ObjectInspector, limit: Int) { 40 | this(prev, ColumnDesc.createSchema(schema), oi, limit) 41 | } 42 | 43 | override def getPartitions = firstParent[Any].partitions 44 | 45 | override def compute(split: Partition, context: TaskContext): Iterator[Row] = { 46 | val structOi = initObjectInspector() 47 | firstParent[Any].iterator(split, context).map { rowData => 48 | new Row(rowData, colname2indexMap, structOi) 49 | } 50 | } 51 | 52 | /** 53 | * ObjectInspector is not Java serializable. We serialize it using Kryo and 54 | * and save it as a byte array. On slave nodes, we deserialize this byte 55 | * array to obtain the ObjectInspector object. 56 | */ 57 | private val serializedObjectInspector: Array[Byte] = KryoSerializer.serialize(oi) 58 | 59 | /** 60 | * Maps the column name to column index. 61 | */ 62 | private val colname2indexMap: Map[String, Int] = 63 | collection.immutable.Map() ++ schema.zipWithIndex.map { case(column, index) => 64 | (column.name, index) 65 | } 66 | 67 | /** 68 | * Initialize object inspector from the serializedObjectInspector. 
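   * Called from compute() on the slaves; the deserialized inspector must be a
   * StructObjectInspector.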
69 | */ 70 | private def initObjectInspector(): StructObjectInspector = { 71 | val oi = KryoSerializer.deserialize[ObjectInspector](serializedObjectInspector) 72 | oi match { 73 | case soi: StructObjectInspector => soi 74 | case _ => throw new Exception("Only basic StructObjectInspector is supposed.") 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/main/scala/shark/optimizer/SharkOptimizer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.optimizer 19 | 20 | import java.util.{List => JavaList} 21 | 22 | import org.apache.hadoop.hive.ql.optimizer.JoinReorder 23 | import org.apache.hadoop.hive.ql.optimizer.{Optimizer => HiveOptimizer, 24 | SimpleFetchOptimizer, Transform, MapJoinProcessor => HiveMapJoinProcessor} 25 | import org.apache.hadoop.hive.ql.parse.ParseContext 26 | import shark.LogHelper 27 | 28 | class SharkOptimizer extends HiveOptimizer with LogHelper { 29 | 30 | /** 31 | * Override Hive optimizer to skip SimpleFetchOptimizer, which is designed 32 | * to let Hive avoid launching MR jobs on simple queries, but rewrites the 33 | * query plan in a way that is inconvenient for Shark (replaces the FS operator 34 | * with a non-terminal ListSink operator). 35 | */ 36 | override def optimize(): ParseContext = { 37 | 38 | // Use reflection to make some private members accessible. 39 | val transformationsField = classOf[HiveOptimizer].getDeclaredField("transformations") 40 | val pctxField = classOf[HiveOptimizer].getDeclaredField("pctx") 41 | pctxField.setAccessible(true) 42 | transformationsField.setAccessible(true) 43 | val transformations = transformationsField.get(this).asInstanceOf[JavaList[Transform]] 44 | var pctx = pctxField.get(this).asInstanceOf[ParseContext] 45 | 46 | // Invoke each optimizer transformation 47 | val it = transformations.iterator 48 | while (it.hasNext()) { 49 | val transformation = it.next() 50 | transformation match { 51 | case _: SimpleFetchOptimizer => {} 52 | case _: JoinReorder => {} 53 | case _: HiveMapJoinProcessor => { 54 | // Use SharkMapJoinProcessor to bypass the step of validating Map Join hints 55 | // in Hive. So, we can use hints to mark tables that will be considered as small 56 | // tables (like Hive 0.9). 57 | val sharkMapJoinProcessor = new SharkMapJoinProcessor 58 | pctx = sharkMapJoinProcessor.transform(pctx) 59 | } 60 | case _ => { 61 | pctx = transformation.transform(pctx) 62 | } 63 | } 64 | } 65 | pctx 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/scala/shark/api/JavaTableRDD.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 
3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.api 19 | 20 | import scala.reflect.ClassTag 21 | 22 | import org.apache.spark.api.java.function.{Function => JFunction} 23 | import org.apache.spark.api.java.JavaRDDLike 24 | import org.apache.spark.rdd.RDD 25 | import org.apache.spark.storage.StorageLevel 26 | 27 | 28 | class JavaTableRDD(val rdd: RDD[Row], val schema: Array[ColumnDesc]) 29 | extends JavaRDDLike[Row, JavaTableRDD] { 30 | 31 | override def wrapRDD(rdd: RDD[Row]): JavaTableRDD = new JavaTableRDD(rdd, schema) 32 | 33 | // Common RDD functions 34 | override val classTag: ClassTag[Row] = implicitly[ClassTag[Row]] 35 | 36 | // This shouldn't be necessary, but we seem to need this to get first() to return Row 37 | // instead of Object; possibly a compiler bug? 38 | override def first(): Row = rdd.first() 39 | 40 | /** Persist this RDD with the default storage level (`MEMORY_ONLY`). */ 41 | def cache(): JavaTableRDD = wrapRDD(rdd.cache()) 42 | 43 | /** 44 | * Set this RDD's storage level to persist its values across operations after the first time 45 | * it is computed. Can only be called once on each RDD. 46 | */ 47 | def persist(newLevel: StorageLevel): JavaTableRDD = wrapRDD(rdd.persist(newLevel)) 48 | 49 | // Transformations (return a new RDD) 50 | 51 | // Note: we didn't implement distinct() because equals() and hashCode() are not defined for Row. 52 | 53 | /** 54 | * Return a new RDD containing only the elements that satisfy a predicate. 55 | */ 56 | def filter(f: JFunction[Row, java.lang.Boolean]): JavaTableRDD = 57 | wrapRDD(rdd.filter((x => f(x).booleanValue()))) 58 | 59 | /** 60 | * Return a sampled subset of this RDD. 61 | */ 62 | def sample(withReplacement: Boolean, fraction: Double, seed: Int): JavaTableRDD = 63 | wrapRDD(rdd.sample(withReplacement, fraction, seed)) 64 | 65 | /** 66 | * Return the union of this RDD and another one. Any identical elements will appear multiple 67 | * times (use `.distinct()` to eliminate them). 68 | * 69 | * Note: the `schema` of a union is this RDD's schema. 70 | */ 71 | def union(other: JavaTableRDD): JavaTableRDD = wrapRDD(rdd.union(other.rdd)) 72 | 73 | } 74 | 75 | 76 | -------------------------------------------------------------------------------- /src/main/scala/shark/memstore2/column/ColumnIterators.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.memstore2.column 19 | 20 | import java.nio.ByteBuffer 21 | import org.apache.hadoop.hive.serde2.`lazy`.LazyObject 22 | import org.apache.hadoop.hive.serde2.`lazy`.LazyFactory 23 | import org.apache.hadoop.hive.serde2.`lazy`.ByteArrayRef 24 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector 25 | 26 | import shark.execution.serialization.KryoSerializer 27 | 28 | 29 | class IntColumnIterator(buffer: ByteBuffer) extends DefaultColumnIterator(buffer, INT) 30 | 31 | class FloatColumnIterator(buffer: ByteBuffer) extends DefaultColumnIterator(buffer, FLOAT) 32 | 33 | class LongColumnIterator(buffer: ByteBuffer) extends DefaultColumnIterator(buffer, LONG) 34 | 35 | class DoubleColumnIterator(buffer: ByteBuffer) extends DefaultColumnIterator(buffer, DOUBLE) 36 | 37 | class BooleanColumnIterator(buffer: ByteBuffer) extends DefaultColumnIterator(buffer, BOOLEAN) 38 | 39 | class ByteColumnIterator(buffer: ByteBuffer) extends DefaultColumnIterator(buffer, BYTE) 40 | 41 | class ShortColumnIterator(buffer: ByteBuffer) extends DefaultColumnIterator(buffer, SHORT) 42 | 43 | class NullColumnIterator(buffer: ByteBuffer) extends DefaultColumnIterator(buffer, VOID) 44 | 45 | class TimestampColumnIterator(buffer: ByteBuffer) extends DefaultColumnIterator(buffer, TIMESTAMP) 46 | 47 | class BinaryColumnIterator(buffer: ByteBuffer) extends DefaultColumnIterator(buffer, BINARY) 48 | 49 | class StringColumnIterator(buffer: ByteBuffer) extends DefaultColumnIterator(buffer, STRING) 50 | 51 | class GenericColumnIterator(buffer: ByteBuffer) extends DefaultColumnIterator(buffer, GENERIC) { 52 | 53 | private var _obj: LazyObject[_] = _ 54 | 55 | override def init() { 56 | super.init() 57 | val oiSize = buffer.getInt() 58 | val oiSerialized = new Array[Byte](oiSize) 59 | buffer.get(oiSerialized, 0, oiSize) 60 | val oi = KryoSerializer.deserialize[ObjectInspector](oiSerialized) 61 | _obj = LazyFactory.createLazyObject(oi) 62 | } 63 | 64 | override def current = { 65 | val v = super.current.asInstanceOf[ByteArrayRef] 66 | _obj.init(v, 0, v.getData().length) 67 | _obj 68 | } 69 | } 70 | 71 | class VoidColumnIterator(buffer: ByteBuffer) extends DefaultColumnIterator(buffer, VOID) 72 | -------------------------------------------------------------------------------- /src/main/scala/shark/memstore2/column/ColumnBuilders.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.memstore2.column 19 | 20 | import java.nio.ByteBuffer 21 | import java.sql.Timestamp 22 | 23 | import org.apache.hadoop.hive.serde2.ByteStream 24 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector 25 | import org.apache.hadoop.io.BytesWritable 26 | import org.apache.hadoop.io.Text 27 | 28 | 29 | import shark.execution.serialization.KryoSerializer 30 | import shark.memstore2.column.ColumnStats._ 31 | 32 | 33 | class BooleanColumnBuilder extends DefaultColumnBuilder[Boolean](new BooleanColumnStats(), BOOLEAN) 34 | 35 | class IntColumnBuilder extends DefaultColumnBuilder[Int](new IntColumnStats(), INT) 36 | 37 | class LongColumnBuilder extends DefaultColumnBuilder[Long](new LongColumnStats(), LONG) 38 | 39 | class FloatColumnBuilder extends DefaultColumnBuilder[Float](new FloatColumnStats(), FLOAT) 40 | 41 | class DoubleColumnBuilder extends DefaultColumnBuilder[Double](new DoubleColumnStats(), DOUBLE) 42 | 43 | class StringColumnBuilder extends DefaultColumnBuilder[Text](new StringColumnStats(), STRING) 44 | 45 | class ByteColumnBuilder extends DefaultColumnBuilder[Byte](new ByteColumnStats(), BYTE) 46 | 47 | class ShortColumnBuilder extends DefaultColumnBuilder[Short](new ShortColumnStats(), SHORT) 48 | 49 | class TimestampColumnBuilder 50 | extends DefaultColumnBuilder[Timestamp](new TimestampColumnStats(), TIMESTAMP) 51 | 52 | class BinaryColumnBuilder extends DefaultColumnBuilder[BytesWritable](new NoOpStats(), BINARY) 53 | 54 | class VoidColumnBuilder extends DefaultColumnBuilder[Void](new NoOpStats(), VOID) 55 | 56 | /** 57 | * Generic columns that we can serialize, including maps, structs, and other complex types. 58 | */ 59 | class GenericColumnBuilder(oi: ObjectInspector) 60 | extends DefaultColumnBuilder[ByteStream.Output](new NoOpStats(), GENERIC) { 61 | 62 | // Complex data types cannot be null. Override the initialize in NullableColumnBuilder. 63 | override def initialize(initialSize: Int, columnName: String): ByteBuffer = { 64 | val buffer = super.initialize(initialSize, columnName) 65 | val objectInspectorSerialized = KryoSerializer.serialize(oi) 66 | buffer.putInt(objectInspectorSerialized.size) 67 | buffer.put(objectInspectorSerialized) 68 | buffer 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /bin/dev/test: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (C) 2012 The Regents of The University California. 4 | # All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
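# Runs Shark's Hive CLI driver tests (shark.TestSharkCliDriver, or via ant when
# TEST_WITH_ANT=1) against a Hive development tree. Example invocation (paths are
# illustrative):
#   export HIVE_DEV_HOME=/path/to/hive
#   export HADOOP_HOME=/path/to/hadoop
#   ./bin/dev/test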
17 | 18 | get_abs_path() { 19 | local PARENT_DIR=$(dirname "$1") 20 | cd "$PARENT_DIR" 21 | local ABS_PATH="$(pwd)"/"$(basename $1)" 22 | cd - >/dev/null 23 | echo $ABS_PATH 24 | } 25 | 26 | CURRENTFILE=`get_abs_path $0` 27 | BINDIR="`dirname $CURRENTFILE`" 28 | FWDIR="`dirname $BINDIR`/.." 29 | 30 | # Load environment variables from conf/shark-env.sh, if it exists 31 | if [ -e $FWDIR/conf/shark-env.sh ] ; then 32 | . $FWDIR/conf/shark-env.sh 33 | fi 34 | 35 | # Hive related section. 36 | if [ -z $HIVE_DEV_HOME ] ; then 37 | echo "No HIVE_DEV_HOME specified. Please set HIVE_DEV_HOME" 38 | exit 1 39 | fi 40 | 41 | # Hive related section. 42 | if [ -z $HADOOP_HOME ] ; then 43 | echo "No HADOOP_HOME specified. Please set HADOOP_HOME" 44 | exit 1 45 | fi 46 | 47 | if [ -n "$TEST_FILE" ] ; then 48 | TEST_FILE=`get_abs_path $TEST_FILE` 49 | export TEST_FILE 50 | fi 51 | 52 | 53 | SPARK_CLASSPATH+=":${HIVE_DEV_HOME}/build/ql/test/classes" 54 | SPARK_CLASSPATH+=":${HIVE_DEV_HOME}/data/conf" 55 | export SPARK_CLASSPATH 56 | 57 | BUILD_PATH=$HIVE_DEV_HOME/build/ql 58 | 59 | # Set variables used by unit tests (ex. create_like.q). 60 | TEST_JAVA_OPTS="-Dbuild.dir=${HIVE_DEV_HOME}/build/ql " 61 | TEST_JAVA_OPTS+="-Dbuild.dir.hive=${HIVE_DEV_HOME}/build " 62 | TEST_JAVA_OPTS+="-Dbuild.ivy.lib.dir=${HIVE_DEV_HOME}/build/ivy/lib " 63 | TEST_JAVA_OPTS+="-Dderby.version=10.4.2.0 " 64 | TEST_JAVA_OPTS+="-Dlog4j.configuration=file://${HIVE_DEV_HOME}/data/conf/hive-log4j.properties " 65 | TEST_JAVA_OPTS+="-Dtest.log.dir=${BUILD_PATH}/test/logs " 66 | TEST_JAVA_OPTS+="-Dtest.output.overwrite=false " 67 | TEST_JAVA_OPTS+="-Dtest.src.data.dir=${HIVE_DEV_HOME}/data " 68 | TEST_JAVA_OPTS+="-Dtest.tmp.dir=${BUILD_PATH}/tmp " 69 | TEST_JAVA_OPTS+="-Dtest.warehouse.dir=${BUILD_PATH}/test/data/warehouse " 70 | #TEST_JAVA_OPTS+="-Duser.dir=${HIVE_DEV_HOME}/ql " 71 | 72 | export TEST_JAVA_OPTS 73 | 74 | # Set the current directory to hive/ql since lots of tests use relative path. 75 | cd ${HIVE_DEV_HOME}/ql 76 | 77 | if [ "$TEST_WITH_ANT" == "1" ] ; then 78 | export CLASSPATH 79 | export RUNNER="ant -noclasspath -nouserlib -f $FWDIR/bin/dev/build_test.xml test" 80 | exec $FWDIR/run "$@" 81 | else 82 | export SHARK_LAUNCH_WITH_JAVA=1 83 | exec $FWDIR/run junit.textui.TestRunner shark.TestSharkCliDriver "$@" 84 | fi 85 | -------------------------------------------------------------------------------- /src/main/resources/tablerdd/rddtable_generator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from string import Template 3 | import sys 4 | from generator_utils import * 5 | 6 | ## This script generates RDDtable.scala 7 | 8 | p = sys.stdout 9 | 10 | # e.g. createList(1,3, "T[", "]", ",") gives T[1],T[2],T[3] 11 | def createList(start, stop, prefix, suffix="", sep = ",", newlineAfter = 70, indent = 0): 12 | res = "" 13 | oneLine = res 14 | for y in range(start,stop+1): 15 | res += prefix + str(y) + suffix 16 | oneLine += prefix + str(y) + suffix 17 | if y != stop: 18 | res += sep 19 | oneLine += sep 20 | if len(oneLine) > newlineAfter: 21 | res += "\n" + " "*indent 22 | oneLine = "" 23 | return res 24 | 25 | ### The SparkContext declaration 26 | 27 | prefix = """ 28 | /* 29 | * Copyright (C) 2012 The Regents of The University California. 30 | * All rights reserved. 31 | * 32 | * Licensed under the Apache License, Version 2.0 (the "License"); 33 | * you may not use this file except in compliance with the License. 
34 | * You may obtain a copy of the License at 35 | * 36 | * http://www.apache.org/licenses/LICENSE-2.0 37 | * 38 | * Unless required by applicable law or agreed to in writing, software 39 | * distributed under the License is distributed on an "AS IS" BASIS, 40 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 41 | * See the License for the specific language governing permissions and 42 | * limitations under the License. 43 | */ 44 | 45 | package shark.api 46 | 47 | // *** This file is auto-generated from RDDTable_generator.py *** 48 | 49 | import scala.language.implicitConversions 50 | import scala.reflect.ClassTag 51 | import org.apache.spark.rdd.RDD 52 | 53 | object RDDTableImplicits { 54 | private type C[T] = ClassTag[T] 55 | 56 | """ 57 | 58 | p.write(prefix) 59 | 60 | for x in range(2,23): 61 | 62 | tableClass = Template( 63 | """ 64 | implicit def rddToTable$num[$tmlist] 65 | (rdd: RDD[($tlist)]): RDDTableFunctions = RDDTable(rdd) 66 | 67 | """).substitute(num = x, tmlist = createList(1, x, "T", ": C", ", ", indent=4), tlist = createList(1, x, "T", "", ", ", indent=4)) 68 | p.write(tableClass) 69 | 70 | prefix = """ 71 | } 72 | 73 | object RDDTable { 74 | 75 | private type C[T] = ClassTag[T] 76 | private def ct[T](implicit c: ClassTag[T]) = c 77 | """ 78 | 79 | p.write(prefix) 80 | 81 | for x in range(2,23): 82 | 83 | tableClass = Template( 84 | """ 85 | def apply[$tmlist] 86 | (rdd: RDD[($tlist)]) = { 87 | val classTag = implicitly[ClassTag[Seq[Any]]] 88 | val rddSeq: RDD[Seq[_]] = rdd.map(t => t.productIterator.toList.asInstanceOf[Seq[Any]])(classTag) 89 | new RDDTableFunctions(rddSeq, Seq($mtlist)) 90 | } 91 | 92 | """).substitute(tmlist = createList(1, x, "T", ": C", ", ", indent=4), tlist = createList(1, x, "T", "", ", ", indent=4), 93 | mtlist = createList(1, x, "ct[T", "]", ", ", indent=4)) 94 | p.write(tableClass) 95 | 96 | 97 | p.write("}\n") 98 | -------------------------------------------------------------------------------- /src/main/scala/shark/memstore2/SharkTblProperties.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.memstore2 19 | 20 | import java.util.{Map => JavaMap} 21 | 22 | 23 | /** 24 | * Collection of static fields and helpers for table properties (i.e., from A 25 | * CREATE TABLE TBLPROPERTIES( ... ) used by Shark. 26 | */ 27 | object SharkTblProperties { 28 | 29 | case class TableProperty(varname: String, defaultVal: String) 30 | 31 | // Class name of the default cache policy used to manage partition evictions for cached, 32 | // Hive-partitioned tables. 
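  // Overridden per table via TBLPROPERTIES, e.g.
  //   TBLPROPERTIES ("shark.cache.policy" = "shark.memstore2.LRUCachePolicy")
  // (the policy class name above is illustrative).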
33 | val CACHE_POLICY = new TableProperty("shark.cache.policy", "shark.memstore2.CacheAllPolicy") 34 | 35 | // Maximum size - in terms of the number of objects - of the cache specified by the 36 | // "shark.cache.partition.cachePolicy" property above. 37 | val MAX_PARTITION_CACHE_SIZE = new TableProperty("shark.cache.policy.maxSize", "10") 38 | 39 | // Default value for the "shark.cache" table property 40 | val CACHE_FLAG = new TableProperty("shark.cache", "true") 41 | 42 | // Whether we are currently in the process of caching the table (meaning it cannot be accessed). 43 | val CACHE_IN_PROGRESS_FLAG = new TableProperty("shark.cache.inProgress", "false") 44 | 45 | def getOrSetDefault(tblProps: JavaMap[String, String], variable: TableProperty): String = { 46 | if (!tblProps.containsKey(variable.varname)) { 47 | tblProps.put(variable.varname, variable.defaultVal) 48 | } 49 | tblProps.get(variable.varname) 50 | } 51 | 52 | /** 53 | * Returns value for the `variable` table property. If a value isn't present in `tblProps`, then 54 | * the default for `variable` will be returned. 55 | */ 56 | def initializeWithDefaults( 57 | tblProps: JavaMap[String, String], 58 | isPartitioned: Boolean = false): JavaMap[String, String] = { 59 | tblProps.put(CACHE_FLAG.varname, CACHE_FLAG.defaultVal) 60 | tblProps.put(CACHE_IN_PROGRESS_FLAG.varname, CACHE_IN_PROGRESS_FLAG.defaultVal) 61 | if (isPartitioned) { 62 | tblProps.put(CACHE_POLICY.varname, CACHE_POLICY.defaultVal) 63 | } 64 | tblProps 65 | } 66 | 67 | def removeSharkProperties(tblProps: JavaMap[String, String]) { 68 | tblProps.remove(CACHE_FLAG.varname) 69 | tblProps.remove(CACHE_IN_PROGRESS_FLAG.varname) 70 | tblProps.remove(CACHE_POLICY.varname) 71 | tblProps.remove(MAX_PARTITION_CACHE_SIZE.varname) 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/main/resources/tablerdd/TableRDDGenerated_generator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from string import Template 3 | import sys 4 | from generator_utils import * 5 | 6 | ## This script generates TableRDDGenerated.scala 7 | 8 | p = sys.stdout 9 | 10 | p.write( 11 | """ 12 | /* 13 | * Copyright (C) 2013 The Regents of The University California. 14 | * All rights reserved. 15 | * 16 | * Licensed under the Apache License, Version 2.0 (the "License"); 17 | * you may not use this file except in compliance with the License. 18 | * You may obtain a copy of the License at 19 | * 20 | * http://www.apache.org/licenses/LICENSE-2.0 21 | * 22 | * Unless required by applicable law or agreed to in writing, software 23 | * distributed under the License is distributed on an "AS IS" BASIS, 24 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 25 | * See the License for the specific language governing permissions and 26 | * limitations under the License. 
27 | */ 28 | 29 | 30 | 31 | package shark.api 32 | 33 | // *** This file is auto-generated from TableRDDGenerated_generator.py *** 34 | import scala.language.implicitConversions 35 | import org.apache.spark.rdd.RDD 36 | import org.apache.spark.{TaskContext, Partition} 37 | 38 | import scala.reflect.ClassTag 39 | 40 | class TableSeqRDD(prev: TableRDD) 41 | extends RDD[Seq[Any]](prev) { 42 | 43 | def getSchema = prev.schema 44 | 45 | override def getPartitions = prev.getPartitions 46 | 47 | override def compute(split: Partition, context: TaskContext): Iterator[Seq[Any]] = { 48 | prev.compute(split, context).map( row => 49 | (0 until prev.schema.size).map(i => row.getPrimitive(i)) ) 50 | } 51 | } 52 | 53 | """) 54 | 55 | for x in range(1,23): 56 | 57 | inner = "" 58 | for y in range(1,x+1): 59 | if y % 3 == 1: inner += " " 60 | inner += Template(" row.getPrimitiveGeneric[T$num1]($num2)").substitute(num1=y, num2=y-1) 61 | if y != x: inner += "," 62 | if y % 3 == 0: inner += "\n" 63 | inner += " ) )\n" 64 | 65 | tableClass = Template( 66 | """ 67 | class TableRDD$num[$list](prev: TableRDD, 68 | tags: Seq[ClassTag[_]]) 69 | extends RDD[Tuple$num[$list]](prev) { 70 | def schema = prev.schema 71 | 72 | private val tableCols = schema.size 73 | require(tableCols == $num, "Table only has " + tableCols + " columns, expecting $num") 74 | 75 | tags.zipWithIndex.foreach{ case (m, i) => if (DataTypes.fromClassTag(m) != schema(i).dataType) 76 | throw new IllegalArgumentException( 77 | "Type mismatch on column " + (i + 1) + ", expected " + DataTypes.fromClassTag(m) + " got " + schema(i).dataType) } 78 | 79 | override def getPartitions = prev.getPartitions 80 | 81 | override def compute(split: Partition, context: TaskContext): 82 | Iterator[Tuple$num[$list]] = { 83 | prev.compute(split, context).map( row => 84 | new Tuple$num[$list]( 85 | $innerfatlist 86 | } 87 | } 88 | """).substitute(num = x, list = createList(1, x, "T", "", ", ", indent=4), innerfatlist = inner) 89 | 90 | 91 | p.write(tableClass) 92 | -------------------------------------------------------------------------------- /src/main/scala/shark/memstore2/MemoryTable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.memstore2 19 | 20 | import org.apache.spark.rdd.RDD 21 | 22 | import scala.collection.mutable.{Buffer, HashMap} 23 | 24 | import shark.execution.RDDUtils 25 | 26 | 27 | /** 28 | * A metadata container for a table in Shark that's backed by an RDD. 29 | */ 30 | private[shark] class MemoryTable( 31 | databaseName: String, 32 | tableName: String, 33 | cacheMode: CacheType.CacheType) 34 | extends Table(databaseName, tableName, cacheMode) { 35 | 36 | private var _rddValueOpt: Option[RDDValue] = None 37 | 38 | /** 39 | * Sets the RDD and stats fields the `_rddValueOpt`. 
Used for INSERT/LOAD OVERWRITE. 40 | * @param newRDD The table's data. 41 | * @param newStats Stats for each TablePartition in `newRDD`. 42 | * @return The previous (RDD, stats) pair for this table. 43 | */ 44 | def put( 45 | newRDD: RDD[TablePartition], 46 | newStats: collection.Map[Int, TablePartitionStats] = new HashMap[Int, TablePartitionStats]() 47 | ): Option[(RDD[TablePartition], collection.Map[Int, TablePartitionStats])] = { 48 | val prevRDDAndStatsOpt = _rddValueOpt.map(_.toTuple) 49 | if (_rddValueOpt.isDefined) { 50 | _rddValueOpt.foreach { rddValue => 51 | rddValue.rdd = newRDD 52 | rddValue.stats = newStats 53 | } 54 | } else { 55 | _rddValueOpt = Some(new RDDValue(newRDD, newStats)) 56 | } 57 | prevRDDAndStatsOpt 58 | } 59 | 60 | /** 61 | * Used for append operations, such as INSERT and LOAD INTO. 62 | * 63 | * @param newRDD Data to append to the table. 64 | * @param newStats Stats for each TablePartition in `newRDD`. 65 | * @return The previous (RDD, stats) pair for this table. 66 | */ 67 | def update( 68 | newRDD: RDD[TablePartition], 69 | newStats: Buffer[(Int, TablePartitionStats)] 70 | ): Option[(RDD[TablePartition], collection.Map[Int, TablePartitionStats])] = { 71 | val prevRDDAndStatsOpt = _rddValueOpt.map(_.toTuple) 72 | if (_rddValueOpt.isDefined) { 73 | val (prevRDD, prevStats) = (prevRDDAndStatsOpt.get._1, prevRDDAndStatsOpt.get._2) 74 | val updatedRDDValue = _rddValueOpt.get 75 | updatedRDDValue.rdd = RDDUtils.unionAndFlatten(newRDD, prevRDD) 76 | updatedRDDValue.stats = Table.mergeStats(newStats, prevStats).toMap 77 | } else { 78 | put(newRDD, newStats.toMap) 79 | } 80 | prevRDDAndStatsOpt 81 | } 82 | 83 | def getRDD = _rddValueOpt.map(_.rdd) 84 | 85 | def getStats = _rddValueOpt.map(_.stats) 86 | 87 | } 88 | -------------------------------------------------------------------------------- /src/main/scala/shark/KryoRegistrator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package shark 19 | 20 | import java.io.{DataInputStream, DataOutputStream} 21 | import java.util.Arrays 22 | import com.esotericsoftware.kryo.{Kryo, Serializer => KSerializer} 23 | import com.esotericsoftware.kryo.io.{Input => KryoInput, Output => KryoOutput} 24 | import com.esotericsoftware.kryo.serializers.{JavaSerializer => KryoJavaSerializer} 25 | import org.apache.hadoop.io.Writable 26 | import org.apache.hadoop.hive.ql.exec.persistence.{MapJoinSingleKey, MapJoinObjectKey, 27 | MapJoinDoubleKeys, MapJoinObjectValue} 28 | import org.apache.spark.serializer.{KryoRegistrator => SparkKryoRegistrator} 29 | import shark.execution.serialization.SerializableWritable 30 | 31 | 32 | class KryoRegistrator extends SparkKryoRegistrator { 33 | def registerClasses(kryo: Kryo) { 34 | 35 | kryo.register(classOf[execution.ReduceKey]) 36 | 37 | // The map join data structures are Java serializable. 38 | kryo.register(classOf[MapJoinSingleKey], new KryoJavaSerializer) 39 | kryo.register(classOf[MapJoinObjectKey], new KryoJavaSerializer) 40 | kryo.register(classOf[MapJoinDoubleKeys], new KryoJavaSerializer) 41 | kryo.register(classOf[MapJoinObjectValue], new KryoJavaSerializer) 42 | 43 | kryo.register(classOf[SerializableWritable[_]], new KryoSWSerializer) 44 | 45 | // As far as I (rxin) know, among all Hadoop writables only TimestampWritable 46 | // cannot be serialized by Kryo out of the box. 47 | kryo.register(classOf[org.apache.hadoop.hive.serde2.io.TimestampWritable], 48 | new KryoWritableSerializer[org.apache.hadoop.hive.serde2.io.TimestampWritable]) 49 | } 50 | } 51 | 52 | class KryoSWSerializer[T <: Writable] extends KSerializer[SerializableWritable[T]] { 53 | def write(kryo : Kryo, out : KryoOutput, obj : SerializableWritable[T]) { 54 | kryo.writeClassAndObject(out, obj.t); out.flush; 55 | } 56 | def read(kryo : Kryo, in : KryoInput, cls : Class[SerializableWritable[T]]) : SerializableWritable[T] = { 57 | new SerializableWritable( 58 | kryo.readClassAndObject(in).asInstanceOf[T] 59 | ) 60 | } 61 | } 62 | 63 | /** A Kryo serializer for Hadoop writables. */ 64 | class KryoWritableSerializer[T <: Writable] extends KSerializer[T] { 65 | override def write(kryo: Kryo, output: KryoOutput, writable: T) { 66 | val ouputStream = new DataOutputStream(output) 67 | writable.write(ouputStream) 68 | } 69 | 70 | override def read(kryo: Kryo, input: KryoInput, cls: java.lang.Class[T]): T = { 71 | val writable = cls.newInstance() 72 | val inputStream = new DataInputStream(input) 73 | writable.readFields(inputStream) 74 | writable 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/tachyon_enabled/scala/shark/tachyon/TachyonOffHeapTableWriter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package shark.tachyon 19 | 20 | import java.nio.ByteBuffer 21 | 22 | import scala.reflect.BeanProperty 23 | 24 | import shark.{LogHelper, SharkConfVars} 25 | import shark.execution.serialization.JavaSerializer 26 | import shark.memstore2.{OffHeapStorageClient, OffHeapTableWriter, TablePartitionStats} 27 | 28 | import tachyon.client.WriteType 29 | import tachyon.master.MasterInfo 30 | import tachyon.util.CommonUtils 31 | 32 | class TachyonOffHeapTableWriter(@transient path: String, @transient numColumns: Int) 33 | extends OffHeapTableWriter with LogHelper { 34 | 35 | // Re-instantiated upon deserialization, the first time it's referenced. 36 | @transient lazy val tfs = OffHeapStorageClient.client.asInstanceOf[TachyonStorageClient].tfs 37 | val TEMP = "_temperary" 38 | var rawTableId: Int = -1 39 | 40 | override def createTable() { 41 | val metadata = ByteBuffer.allocate(0) 42 | rawTableId = tfs.createRawTable(path, numColumns, metadata) 43 | } 44 | 45 | override def setStats(indexToStats: collection.Map[Int, TablePartitionStats]) { 46 | val buffer = ByteBuffer.wrap(JavaSerializer.serialize(indexToStats)) 47 | tfs.updateRawTableMetadata(rawTableId, buffer) 48 | } 49 | 50 | // rawTable is a lazy val so it gets created the first time it is referenced. 51 | // This is only used on worker nodes. 52 | @transient lazy val rawTable = tfs.getRawTable(rawTableId) 53 | 54 | override def writePartitionColumn(part: Int, column: Int, data: ByteBuffer, tempDir: String) { 55 | val tmpPath = CommonUtils.concat(rawTable.getPath(), TEMP) 56 | val fid = tfs.createFile(CommonUtils.concat(tmpPath, tempDir, column + "", part + "")) 57 | val file = tfs.getFile(fid) 58 | val writeType: WriteType = WriteType.valueOf( 59 | SharkConfVars.getVar(localHconf, SharkConfVars.TACHYON_WRITER_WRITETYPE)) 60 | val outStream = file.getOutStream(writeType) 61 | outStream.write(data.array(), 0, data.limit()) 62 | outStream.close() 63 | } 64 | 65 | override def commitPartition(part: Int, numColumns: Int, tempDir: String) { 66 | val tmpPath = CommonUtils.concat(rawTable.getPath(), TEMP) 67 | (0 until numColumns).reverse.foreach { column => 68 | val srcPath = CommonUtils.concat(tmpPath, tempDir, column + "", part + "") 69 | val destPath = CommonUtils.concat(rawTable.getPath(), MasterInfo.COL, column + "", part + "") 70 | tfs.rename(srcPath, destPath) 71 | } 72 | tfs.delete(CommonUtils.concat(tmpPath, tempDir), true) 73 | } 74 | 75 | override def cleanTmpPath() { 76 | val tmpPath = CommonUtils.concat(rawTable.getPath(), TEMP) 77 | tfs.delete(tmpPath, true) 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/scala/shark/execution/serialization/XmlSerializer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package shark.execution.serialization 19 | 20 | import java.beans.{XMLDecoder, XMLEncoder} 21 | import java.io.{ByteArrayInputStream, ByteArrayOutputStream} 22 | 23 | import com.ning.compress.lzf.{LZFEncoder, LZFDecoder} 24 | 25 | import org.apache.hadoop.conf.Configuration 26 | import org.apache.hadoop.hive.conf.HiveConf 27 | import org.apache.hadoop.hive.ql.exec.Utilities.EnumDelegate 28 | import org.apache.hadoop.hive.ql.plan.GroupByDesc 29 | import org.apache.hadoop.hive.ql.plan.PlanUtils.ExpressionTypes 30 | 31 | import shark.SharkConfVars 32 | 33 | 34 | /** 35 | * Java object serialization using XML encoder/decoder. Avoid using this to 36 | * serialize byte arrays because it is extremely inefficient. 37 | */ 38 | object XmlSerializer { 39 | // We prepend the buffer with a byte indicating whether payload is compressed 40 | val COMPRESSION_ENABLED: Byte = 1 41 | val COMPRESSION_DISABLED: Byte = 0 42 | 43 | def serialize[T](o: T, conf: Configuration): Array[Byte] = { 44 | val byteStream = new ByteArrayOutputStream() 45 | val e = new XMLEncoder(byteStream) 46 | // workaround for java 1.5 47 | e.setPersistenceDelegate(classOf[ExpressionTypes], new EnumDelegate()) 48 | e.setPersistenceDelegate(classOf[GroupByDesc.Mode], new EnumDelegate()) 49 | // workaround for HiveConf-not-a-javabean 50 | e.setPersistenceDelegate(classOf[HiveConf], new HiveConfPersistenceDelegate ) 51 | e.writeObject(o) 52 | e.close() 53 | 54 | val useCompression = conf match { 55 | case null => SharkConfVars.COMPRESS_QUERY_PLAN.defaultBoolVal 56 | case _ => SharkConfVars.getBoolVar(conf, SharkConfVars.COMPRESS_QUERY_PLAN) 57 | } 58 | 59 | if (useCompression) { 60 | COMPRESSION_ENABLED +: LZFEncoder.encode(byteStream.toByteArray()) 61 | } else { 62 | COMPRESSION_DISABLED +: byteStream.toByteArray 63 | } 64 | } 65 | 66 | def deserialize[T](bytes: Array[Byte]): T = { 67 | val cl = Thread.currentThread.getContextClassLoader 68 | val decodedStream = 69 | if (bytes(0) == COMPRESSION_ENABLED) { 70 | new ByteArrayInputStream(LZFDecoder.decode(bytes.slice(1, bytes.size))) 71 | } else { 72 | new ByteArrayInputStream(bytes.slice(1, bytes.size)) 73 | } 74 | 75 | // Occasionally an object inspector is created from the decoding. 76 | // Need to put a lock on the process. 77 | val ret = { 78 | val d: XMLDecoder = new XMLDecoder(decodedStream, null, null, cl) 79 | classOf[XMLDecoder].synchronized { 80 | val ret = d.readObject() 81 | d.close() 82 | ret 83 | } 84 | } 85 | ret.asInstanceOf[T] 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/main/scala/shark/execution/UDTFOperator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package shark.execution 19 | 20 | import java.util.{List => JavaList} 21 | 22 | import scala.collection.mutable.ArrayBuffer 23 | import scala.collection.JavaConversions._ 24 | import scala.reflect.BeanProperty 25 | 26 | import org.apache.hadoop.hive.ql.plan.UDTFDesc 27 | import org.apache.hadoop.hive.ql.udf.generic.Collector 28 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector 29 | import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector 30 | import org.apache.hadoop.hive.serde2.objectinspector.StructField 31 | 32 | 33 | class UDTFOperator extends UnaryOperator[UDTFDesc] { 34 | 35 | @BeanProperty var conf: UDTFDesc = _ 36 | 37 | @transient var objToSendToUDTF: Array[java.lang.Object] = _ 38 | @transient var soi: StandardStructObjectInspector = _ 39 | @transient var inputFields: JavaList[_ <: StructField] = _ 40 | @transient var collector: UDTFCollector = _ 41 | @transient var outputObjInspector: ObjectInspector = _ 42 | 43 | override def initializeOnMaster() { 44 | super.initializeOnMaster() 45 | 46 | conf = desc 47 | 48 | initializeOnSlave() 49 | } 50 | 51 | override def initializeOnSlave() { 52 | collector = new UDTFCollector 53 | conf.getGenericUDTF().setCollector(collector) 54 | 55 | // Make an object inspector [] of the arguments to the UDTF 56 | soi = objectInspectors.head.asInstanceOf[StandardStructObjectInspector] 57 | inputFields = soi.getAllStructFieldRefs() 58 | 59 | val udtfInputOIs = inputFields.map { case inputField => 60 | inputField.getFieldObjectInspector() 61 | }.toArray 62 | 63 | objToSendToUDTF = new Array[java.lang.Object](inputFields.size) 64 | outputObjInspector = conf.getGenericUDTF().initialize(udtfInputOIs) 65 | } 66 | 67 | override def outputObjectInspector() = outputObjInspector 68 | 69 | override def processPartition(split: Int, iter: Iterator[_]): Iterator[_] = { 70 | iter.flatMap { row => 71 | explode(row) 72 | } 73 | } 74 | 75 | def explode[T](row: T): ArrayBuffer[java.lang.Object] = { 76 | (0 until inputFields.size).foreach { case i => 77 | objToSendToUDTF(i) = soi.getStructFieldData(row, inputFields.get(i)) 78 | } 79 | conf.getGenericUDTF().process(objToSendToUDTF) 80 | collector.collectRows() 81 | } 82 | } 83 | 84 | class UDTFCollector extends Collector { 85 | 86 | var collected = new ArrayBuffer[java.lang.Object] 87 | 88 | override def collect(input: java.lang.Object) { 89 | // We need to clone the input here because implementations of 90 | // GenericUDTF reuse the same object. Luckily they are always an array, so 91 | // it is easy to clone. 92 | collected += input.asInstanceOf[Array[_]].clone 93 | } 94 | 95 | def collectRows() = { 96 | val toCollect = collected 97 | collected = new ArrayBuffer[java.lang.Object] 98 | toCollect 99 | } 100 | 101 | } 102 | -------------------------------------------------------------------------------- /src/main/scala/shark/api/RDDTableFunctions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.api 19 | 20 | import scala.collection.mutable.ArrayBuffer 21 | import scala.reflect.ClassTag 22 | 23 | import org.apache.hadoop.hive.ql.metadata.Hive 24 | 25 | import org.apache.spark.rdd.RDD 26 | 27 | import shark.{SharkContext, SharkEnv} 28 | import shark.memstore2.{CacheType, TablePartitionStats, TablePartition, TablePartitionBuilder} 29 | import shark.util.HiveUtils 30 | 31 | 32 | class RDDTableFunctions(self: RDD[Seq[_]], classTags: Seq[ClassTag[_]]) { 33 | 34 | def saveAsTable(tableName: String, fields: Seq[String]): Boolean = { 35 | require(fields.size == this.classTags.size, 36 | "Number of column names != number of fields in the RDD.") 37 | 38 | // Get a local copy of the classTags so we don't need to serialize this object. 39 | val classTags = this.classTags 40 | 41 | val statsAcc = SharkEnv.sc.accumulableCollection(ArrayBuffer[(Int, TablePartitionStats)]()) 42 | 43 | // Create the RDD object. 44 | val rdd = self.mapPartitionsWithIndex { case(partitionIndex, iter) => 45 | val ois = classTags.map(HiveUtils.getJavaPrimitiveObjectInspector) 46 | val builder = new TablePartitionBuilder( 47 | HiveUtils.makeStandardStructObjectInspector(fields, ois), 48 | 1000000, 49 | shouldCompress = false) 50 | 51 | for (p <- iter) { 52 | builder.incrementRowCount() 53 | // TODO: this is not the most efficient code to do the insertion ... 54 | p.zipWithIndex.foreach { case (v, i) => 55 | builder.append(i, v.asInstanceOf[Object], ois(i)) 56 | } 57 | } 58 | 59 | statsAcc += Tuple2(partitionIndex, builder.asInstanceOf[TablePartitionBuilder].stats) 60 | Iterator(builder.build()) 61 | }.persist() 62 | 63 | var isSucessfulCreateTable = HiveUtils.createTableInHive( 64 | tableName, fields, classTags, Hive.get().getConf()) 65 | 66 | // Put the table in the metastore. Only proceed if the DDL statement is executed successfully. 67 | val databaseName = Hive.get(SharkContext.hiveconf).getCurrentDatabase() 68 | if (isSucessfulCreateTable) { 69 | // Create an entry in the MemoryMetadataManager. 70 | val newTable = SharkEnv.memoryMetadataManager.createMemoryTable( 71 | databaseName, tableName, CacheType.MEMORY) 72 | try { 73 | // Force evaluate to put the data in memory. 74 | rdd.context.runJob(rdd, (iter: Iterator[TablePartition]) => iter.foreach(_ => Unit)) 75 | } catch { 76 | case _: Exception => { 77 | // Intercept the exception thrown by SparkContext#runJob() and handle it silently. The 78 | // exception message should already be printed to the console by DDLTask#execute(). 79 | HiveUtils.dropTableInHive(tableName) 80 | // Drop the table entry from MemoryMetadataManager. 
81 | SharkEnv.memoryMetadataManager.removeTable(databaseName, tableName) 82 | isSucessfulCreateTable = false 83 | } 84 | } 85 | newTable.put(rdd, statsAcc.value.toMap) 86 | } 87 | return isSucessfulCreateTable 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/main/scala/shark/parse/SharkLoadSemanticAnalyzer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.parse 19 | 20 | import scala.collection.JavaConversions._ 21 | 22 | import org.apache.hadoop.hive.conf.HiveConf 23 | import org.apache.hadoop.hive.ql.exec.{CopyTask, MoveTask, TaskFactory} 24 | import org.apache.hadoop.hive.ql.metadata.{Partition, Table => HiveTable} 25 | import org.apache.hadoop.hive.ql.parse.{ASTNode, BaseSemanticAnalyzer, LoadSemanticAnalyzer} 26 | import org.apache.hadoop.hive.ql.plan._ 27 | 28 | import shark.{LogHelper, SharkEnv} 29 | import shark.execution.SparkLoadWork 30 | import shark.memstore2.{CacheType, SharkTblProperties} 31 | 32 | 33 | class SharkLoadSemanticAnalyzer(conf: HiveConf) extends LoadSemanticAnalyzer(conf) { 34 | 35 | override def analyzeInternal(ast: ASTNode): Unit = { 36 | // Delegate to the LoadSemanticAnalyzer parent for error checking the source path formatting. 37 | super.analyzeInternal(ast) 38 | 39 | // Children of the AST root created for a LOAD DATA [LOCAL] INPATH ... statement are, in order: 40 | // 1. node containing the path specified by INPATH. 41 | // 2. internal TOK_TABNAME node that contains the table's name. 42 | // 3. (optional) node representing the LOCAL modifier. 43 | val tableASTNode = ast.getChild(1).asInstanceOf[ASTNode] 44 | val tableName = getTableName(tableASTNode) 45 | val hiveTable = db.getTable(tableName) 46 | val cacheMode = CacheType.fromString( 47 | hiveTable.getProperty(SharkTblProperties.CACHE_FLAG.varname)) 48 | 49 | if (CacheType.shouldCache(cacheMode)) { 50 | // Find the arguments needed to instantiate a SparkLoadWork. 51 | val tableSpec = new BaseSemanticAnalyzer.tableSpec(db, conf, tableASTNode) 52 | val hiveTable = tableSpec.tableHandle 53 | val moveTask = getMoveTask() 54 | val partSpecOpt = Option(tableSpec.getPartSpec) 55 | val sparkLoadWork = SparkLoadWork( 56 | db, 57 | conf, 58 | hiveTable, 59 | partSpecOpt, 60 | isOverwrite = moveTask.getWork.getLoadTableWork.getReplace) 61 | 62 | // Create a SparkLoadTask that will read from the table's data directory. Make it a dependent 63 | // task of the LoadTask so that it's executed only if the LoadTask executes successfully. 64 | moveTask.addDependentTask(TaskFactory.get(sparkLoadWork, conf)) 65 | } 66 | } 67 | 68 | private def getMoveTask(): MoveTask = { 69 | assert(rootTasks.size == 1) 70 | 71 | // If the execution is local, then the root task is a CopyTask with a MoveTask child. 
72 | // Otherwise, the root is a MoveTask. 73 | var rootTask = rootTasks.head 74 | val moveTask = if (rootTask.isInstanceOf[CopyTask]) { 75 | val firstChildTask = rootTask.getChildTasks.head 76 | assert(firstChildTask.isInstanceOf[MoveTask]) 77 | firstChildTask 78 | } else { 79 | rootTask 80 | } 81 | 82 | // In Hive, LoadTableDesc is referred to as LoadTableWork ... 83 | moveTask.asInstanceOf[MoveTask] 84 | } 85 | 86 | private def getTableName(node: ASTNode): String = { 87 | BaseSemanticAnalyzer.getUnescapedName(node.getChild(0).asInstanceOf[ASTNode]) 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/test/scala/shark/memstore2/column/NullableColumnIteratorSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.memstore2.column 19 | 20 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory 21 | import org.apache.hadoop.io.Text 22 | import org.apache.hadoop.io.IntWritable 23 | 24 | import org.scalatest.FunSuite 25 | 26 | 27 | class NullableColumnIteratorSuite extends FunSuite { 28 | 29 | test("String Growth") { 30 | val c = new StringColumnBuilder 31 | c.initialize(4, "") 32 | val oi = PrimitiveObjectInspectorFactory.writableStringObjectInspector 33 | 34 | val a = Array[Text]( 35 | new Text("a"), null, 36 | new Text("b"), null, 37 | new Text("abc"), null, 38 | null, null, new Text("efg") 39 | ) 40 | a.foreach { 41 | t => c.append(t, oi) 42 | } 43 | val b = c.build() 44 | val i = ColumnIterator.newIterator(b) 45 | Range(0, a.length).foreach { x => 46 | if (x > 0) assert(i.hasNext) 47 | i.next() 48 | val v = i.current 49 | if (a(x) == null) { 50 | assert(v == null) 51 | } else { 52 | assert(v.toString == a(x).toString) 53 | } 54 | } 55 | assert(!i.hasNext) 56 | } 57 | 58 | test("Iterate Strings") { 59 | val c = new StringColumnBuilder 60 | c.initialize(4, "") 61 | val oi = PrimitiveObjectInspectorFactory.writableStringObjectInspector 62 | 63 | c.append(new Text("a"), oi) 64 | c.append(new Text(""), oi) 65 | c.append(null, oi) 66 | c.append(new Text("b"), oi) 67 | c.append(new Text("Abcdz"), oi) 68 | c.append(null, oi) 69 | val b = c.build() 70 | val i = ColumnIterator.newIterator(b) 71 | i.next() 72 | assert(i.current.toString() == "a") 73 | i.next() 74 | assert(i.current.toString() == "") 75 | i.next() 76 | assert(i.current == null) 77 | i.next() 78 | assert(i.current.toString() == "b") 79 | i.next() 80 | assert(i.current.toString() == "Abcdz") 81 | i.next() 82 | assert(i.current == null) 83 | assert(false === i.hasNext) 84 | } 85 | 86 | test("Iterate Ints") { 87 | def testList(l: Seq[AnyRef]) { 88 | val c = new IntColumnBuilder 89 | c.initialize(l.size, "") 90 | val oi = PrimitiveObjectInspectorFactory.javaIntObjectInspector 91 | 92 | l.foreach 
{ item => 93 | if (item == null) { 94 | c.append(null, oi) 95 | } else { 96 | c.append(item.asInstanceOf[Object], oi) 97 | } 98 | } 99 | 100 | val b = c.build() 101 | val i = ColumnIterator.newIterator(b) 102 | 103 | l.foreach { x => 104 | i.next() 105 | if (x == null) { 106 | assert(i.current === x) 107 | } else { 108 | assert(i.current.asInstanceOf[IntWritable].get === x) 109 | } 110 | } 111 | assert(false === i.hasNext) 112 | } 113 | 114 | testList(List(null, null, 123.asInstanceOf[AnyRef])) 115 | testList(List(123.asInstanceOf[AnyRef], 4.asInstanceOf[AnyRef], null)) 116 | testList(List(null)) 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /src/test/scala/shark/memstore2/column/ColumnTypeSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.memstore2.column 19 | 20 | import java.nio.ByteBuffer 21 | 22 | import org.apache.hadoop.io.IntWritable 23 | import org.apache.hadoop.io.LongWritable 24 | import org.apache.hadoop.hive.serde2.io._ 25 | 26 | import org.scalatest.FunSuite 27 | 28 | class ColumnTypeSuite extends FunSuite { 29 | 30 | test("Int") { 31 | assert(INT.defaultSize == 4) 32 | var buffer = ByteBuffer.allocate(32) 33 | var a: Seq[Int] = Array[Int](35, 67, 899, 4569001) 34 | a.foreach {i => buffer.putInt(i)} 35 | buffer.rewind() 36 | a.foreach {i => 37 | val v = INT.extract(buffer) 38 | assert(v == i) 39 | } 40 | buffer = ByteBuffer.allocate(32) 41 | a = Range(0, 4) 42 | a.foreach { i => 43 | INT.append(i, buffer) 44 | } 45 | buffer.rewind() 46 | a.foreach { i => assert(buffer.getInt() == i)} 47 | 48 | buffer = ByteBuffer.allocate(32) 49 | a =Range(0,4) 50 | a.foreach { i => buffer.putInt(i)} 51 | buffer.rewind() 52 | val writable = new IntWritable() 53 | a.foreach { i => 54 | INT.extractInto(buffer, writable) 55 | assert(writable.get == i) 56 | } 57 | 58 | } 59 | 60 | test("Short") { 61 | assert(SHORT.defaultSize == 2) 62 | assert(SHORT.actualSize(8) == 2) 63 | var buffer = ByteBuffer.allocate(32) 64 | var a = Array[Short](35, 67, 87, 45) 65 | a.foreach {i => buffer.putShort(i)} 66 | buffer.rewind() 67 | a.foreach {i => 68 | val v = SHORT.extract(buffer) 69 | assert(v == i) 70 | } 71 | 72 | buffer = ByteBuffer.allocate(32) 73 | a = Array[Short](0,1,2,3) 74 | a.foreach { i => 75 | SHORT.append(i, buffer) 76 | } 77 | buffer.rewind() 78 | a.foreach { i => assert(buffer.getShort() == i)} 79 | 80 | buffer = ByteBuffer.allocate(32) 81 | a =Array[Short](0,1,2,3) 82 | a.foreach { i => buffer.putShort(i)} 83 | buffer.rewind() 84 | val writable = new ShortWritable() 85 | a.foreach { i => 86 | SHORT.extractInto(buffer, writable) 87 | assert(writable.get == i) 88 | } 89 | } 90 | 91 | test("Long") { 92 | assert(LONG.defaultSize == 8) 93 | assert(LONG.actualSize(45L) == 8) 94 | var 
buffer = ByteBuffer.allocate(64) 95 | var a = Array[Long](35L, 67L, 8799000880L, 45000999090L) 96 | a.foreach {i => buffer.putLong(i)} 97 | buffer.rewind() 98 | a.foreach {i => 99 | val v = LONG.extract(buffer) 100 | assert(v == i) 101 | } 102 | 103 | buffer = ByteBuffer.allocate(32) 104 | a = Array[Long](0,1,2,3) 105 | a.foreach { i => 106 | LONG.append(i, buffer) 107 | } 108 | buffer.rewind() 109 | a.foreach { i => assert(buffer.getLong() == i)} 110 | 111 | buffer = ByteBuffer.allocate(32) 112 | a =Array[Long](0,1,2,3) 113 | a.foreach { i => buffer.putLong(i)} 114 | buffer.rewind() 115 | val writable = new LongWritable() 116 | a.foreach { i => 117 | LONG.extractInto(buffer, writable) 118 | assert(writable.get == i) 119 | } 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /src/main/scala/shark/memstore2/ColumnarStructObjectInspector.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.memstore2 19 | 20 | import java.util.{ArrayList => JArrayList, List => JList} 21 | 22 | import org.apache.hadoop.hive.serde2.`lazy`.LazyFactory 23 | import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe.SerDeParameters 24 | import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspector, ObjectInspectorUtils, 25 | StructField, StructObjectInspector} 26 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category 27 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory 28 | import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo 29 | 30 | 31 | class ColumnarStructObjectInspector(fields: JList[StructField]) extends StructObjectInspector { 32 | 33 | override def getCategory: Category = Category.STRUCT 34 | 35 | override def getTypeName: String = ObjectInspectorUtils.getStandardStructTypeName(this) 36 | 37 | override def getStructFieldRef(fieldName: String): StructField = 38 | ObjectInspectorUtils.getStandardStructFieldRef(fieldName, fields) 39 | 40 | override def getAllStructFieldRefs: JList[_ <: StructField] = fields 41 | 42 | override def getStructFieldData(data: Object, fieldRef: StructField): Object = 43 | data.asInstanceOf[ColumnarStruct].getField( 44 | fieldRef.asInstanceOf[ColumnarStructObjectInspector.IDStructField].fieldID) 45 | 46 | override def getStructFieldsDataAsList(data: Object): JList[Object] = 47 | if (data == null) null else data.asInstanceOf[ColumnarStruct].getFieldsAsList() 48 | } 49 | 50 | 51 | object ColumnarStructObjectInspector { 52 | 53 | def apply(serDeParams: SerDeParameters): ColumnarStructObjectInspector = { 54 | 55 | val columnNames = serDeParams.getColumnNames() 56 | val columnTypes = serDeParams.getColumnTypes() 57 | val fields = new JArrayList[StructField]() 58 | for (i <- 0 until 
columnNames.size) { 59 | val typeInfo = columnTypes.get(i) 60 | val fieldOI = typeInfo.getCategory match { 61 | case Category.PRIMITIVE => 62 | PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector( 63 | typeInfo.asInstanceOf[PrimitiveTypeInfo].getPrimitiveCategory) 64 | case _ => LazyFactory.createLazyObjectInspector( 65 | typeInfo, serDeParams.getSeparators(), 1, serDeParams.getNullSequence(), 66 | serDeParams.isEscaped(), serDeParams.getEscapeChar()) 67 | } 68 | fields.add(new IDStructField(i, columnNames.get(i), fieldOI)) 69 | } 70 | new ColumnarStructObjectInspector(fields) 71 | } 72 | 73 | class IDStructField( 74 | val fieldID: Int, 75 | val fieldName: String, 76 | val fieldObjectInspector: ObjectInspector, 77 | val fieldComment: String) 78 | extends StructField { 79 | 80 | def this(fieldID: Int, fieldName: String, fieldObjectInspector: ObjectInspector) = 81 | this(fieldID, fieldName, fieldObjectInspector, null) 82 | 83 | override def getFieldName: String = fieldName 84 | override def getFieldObjectInspector: ObjectInspector = fieldObjectInspector 85 | override def toString(): String = "" + fieldID + ":" + fieldName 86 | override def getFieldComment() : String = fieldComment 87 | } 88 | } 89 | 90 | -------------------------------------------------------------------------------- /src/main/scala/shark/execution/JoinUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.execution 19 | 20 | import java.util.{List => JavaList} 21 | 22 | import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator 23 | import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspector => OI} 24 | import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspectorUtils => OIUtils} 25 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.{ObjectInspectorCopyOption => CopyOption} 26 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector 27 | 28 | import org.apache.hadoop.io.BooleanWritable 29 | import org.apache.hadoop.io.NullWritable 30 | import org.apache.hadoop.io.Writable 31 | 32 | import shark.execution.serialization.SerializableWritable 33 | 34 | 35 | object JoinUtil { 36 | 37 | def computeJoinKey(row: Any, keyFields: JavaList[ExprNodeEvaluator], keyFieldsOI: JavaList[OI]) 38 | : Seq[SerializableWritable[_]] = { 39 | Range(0, keyFields.size).map { i => 40 | val c = copy(row, keyFields.get(i), keyFieldsOI.get(i), CopyOption.WRITABLE) 41 | val s = if (c == null) NullWritable.get else c 42 | new SerializableWritable(s.asInstanceOf[Writable]) 43 | } 44 | } 45 | 46 | def joinKeyHasAnyNulls(joinKey: Seq[AnyRef], nullSafes: Array[Boolean]): Boolean = { 47 | joinKey.zipWithIndex.exists { x => 48 | (nullSafes == null || nullSafes(x._2).unary_!) 
&& (x._1 == null) 49 | } 50 | } 51 | 52 | def computeJoinValues(row: Any, 53 | valueFields: JavaList[ExprNodeEvaluator], 54 | valueFieldsOI: JavaList[OI], 55 | filters: JavaList[ExprNodeEvaluator], 56 | filtersOI: JavaList[OI], 57 | noOuterJoin: Boolean, 58 | serializable: Boolean = false) 59 | : Array[AnyRef] = { 60 | 61 | // isFiltered = true means failed in the join filter testing 62 | val isFiltered: Boolean = { 63 | if (filters == null) { 64 | false 65 | } else { 66 | var x = 0 67 | var exists = false 68 | while (x < filters.size() && !exists) { 69 | val cond = filters.get(x).evaluate(row) 70 | if (cond == null) { 71 | exists = true 72 | } else { 73 | exists = !filtersOI.get(x).asInstanceOf[BooleanObjectInspector].get(cond) 74 | } 75 | x += 1 76 | } 77 | 78 | exists 79 | } 80 | } 81 | val size = valueFields.size 82 | val a = new Array[AnyRef](size) 83 | var i = 0 84 | while (i < size) { 85 | a(i) = copy(row, valueFields.get(i), valueFieldsOI.get(i), CopyOption.WRITABLE) 86 | i += 1 87 | } 88 | 89 | val result = if (noOuterJoin) { 90 | a 91 | } else { 92 | val n = new Array[AnyRef](size + 1) 93 | Array.copy(a, 0, n, 0, size) 94 | n(size) = new BooleanWritable(isFiltered) 95 | n 96 | } 97 | 98 | if (serializable) { 99 | result.map(e => new SerializableWritable(e.asInstanceOf[Writable])) 100 | } else { 101 | result 102 | } 103 | } 104 | 105 | private def copy(row: Any, evaluator: ExprNodeEvaluator, oi: OI, copyOption: CopyOption) = { 106 | OIUtils.copyToStandardObject(evaluator.evaluate(row), oi, copyOption) 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /src/main/scala/shark/memstore2/TablePartition.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package shark.memstore2 19 | 20 | import java.io.{Externalizable, ObjectInput, ObjectOutput} 21 | import java.nio.ByteBuffer 22 | import java.nio.ByteOrder 23 | import java.util.BitSet 24 | import shark.memstore2.column.ColumnIterator 25 | 26 | 27 | /** 28 | * TablePartition contains a whole partition of data in columnar format. It 29 | * simply contains a list of columns and their meta data. It should be built 30 | * using a TablePartitionBuilder. 31 | */ 32 | class TablePartition(private var _numRows: Long, private var _columns: Array[ByteBuffer]) 33 | extends Externalizable { 34 | 35 | // Empty constructor for Externalizable 36 | def this() { 37 | this(0, null) 38 | } 39 | 40 | def this(columns: Array[ByteBuffer]) { 41 | this(columns(0).getLong(), columns.tail) 42 | } 43 | 44 | def numRows: Long = _numRows 45 | 46 | def columns: Array[ByteBuffer] = _columns 47 | 48 | /** We store our per-partition metadata in a fake "column 0" for off-heap storage. 
*/ 49 | def toOffHeap: Array[ByteBuffer] = { 50 | val buffers = new Array[ByteBuffer](1 + _columns.size) 51 | buffers(0) = metadata 52 | System.arraycopy(_columns, 0, buffers, 1, _columns.size) 53 | buffers 54 | } 55 | 56 | def metadata: ByteBuffer = { 57 | val buffer = ByteBuffer.allocate(8) 58 | buffer.order(ByteOrder.nativeOrder()) 59 | buffer.putLong(_numRows) 60 | buffer.rewind() 61 | buffer 62 | } 63 | 64 | /** 65 | * Return an iterator for the partition. 66 | */ 67 | def iterator: TablePartitionIterator = { 68 | val columnIterators: Array[ColumnIterator] = _columns.map { case buffer: ByteBuffer => 69 | val iter = ColumnIterator.newIterator(buffer) 70 | iter 71 | } 72 | new TablePartitionIterator(_numRows, columnIterators) 73 | } 74 | 75 | def prunedIterator(columnsUsed: BitSet) = { 76 | val columnIterators: Array[ColumnIterator] = _columns.map { 77 | case buffer: ByteBuffer => 78 | ColumnIterator.newIterator(buffer) 79 | case _ => 80 | // The buffer might be null if it is pruned in off-heap storage. 81 | null 82 | } 83 | new TablePartitionIterator(_numRows, columnIterators, columnsUsed) 84 | } 85 | 86 | override def readExternal(in: ObjectInput) { 87 | _numRows = in.readLong() 88 | val numColumns = in.readInt() 89 | _columns = Array.fill[ByteBuffer](numColumns) { 90 | val columnLen = in.readInt() 91 | val buf = ByteBuffer.allocate(columnLen) 92 | in.readFully(buf.array(), 0, columnLen) 93 | buf 94 | } 95 | } 96 | 97 | override def writeExternal(out: ObjectOutput) { 98 | out.writeLong(numRows) 99 | out.writeInt(columns.length) 100 | for (column <- columns) { 101 | val buf = column.duplicate() 102 | buf.rewind() 103 | // If the ByteBuffer is backed by a byte array, just write the byte array out. 104 | // Otherwise, write each byte one by one. 105 | if (buf.hasArray()) { 106 | val byteArray = buf.array() 107 | out.writeInt(byteArray.length) 108 | out.write(byteArray, 0, byteArray.length) 109 | } else { 110 | out.writeInt(buf.remaining()) 111 | while (buf.hasRemaining()) { 112 | out.write(buf.get()) 113 | } 114 | } 115 | } 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /src/main/scala/shark/execution/SharkExplainTask.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 The Regents of The University California. 3 | * All rights reserved. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package shark.execution 19 | 20 | import java.io.PrintStream 21 | import java.util.{HashSet => JHashSet, List => JList} 22 | 23 | import scala.collection.JavaConversions._ 24 | 25 | import org.apache.hadoop.fs.Path 26 | import org.apache.hadoop.hive.conf.HiveConf 27 | import org.apache.hadoop.hive.ql.exec.{ExplainTask, Task} 28 | import org.apache.hadoop.hive.ql.hooks.ReadEntity; 29 | import org.apache.hadoop.hive.ql.{Context, DriverContext, QueryPlan} 30 | import org.apache.hadoop.hive.ql.exec.{ExplainTask, Task} 31 | import org.apache.hadoop.hive.ql.plan.ExplainWork 32 | import org.apache.hadoop.util.StringUtils 33 | 34 | import shark.LogHelper 35 | 36 | 37 | class SharkExplainWork( 38 | resFile: String, 39 | rootTasks: JList[Task[_ <: java.io.Serializable]], 40 | astStringTree: String, 41 | inputs: JHashSet[ReadEntity], 42 | extended: Boolean) 43 | extends ExplainWork(resFile, rootTasks, astStringTree, inputs, extended, false, false) 44 | 45 | 46 | /** 47 | * SharkExplainTask executes EXPLAIN for RDD operators. 48 | */ 49 | class SharkExplainTask extends Task[SharkExplainWork] with java.io.Serializable with LogHelper { 50 | 51 | val hiveExplainTask = new ExplainTask 52 | 53 | override def execute(driverContext: DriverContext): Int = { 54 | logDebug("Executing " + this.getClass.getName()) 55 | hiveExplainTask.setWork(work) 56 | 57 | try { 58 | val resFile = new Path(work.getResFile()) 59 | val outS = resFile.getFileSystem(conf).create(resFile) 60 | val out = new PrintStream(outS) 61 | 62 | // Print out the parse AST 63 | ExplainTask.outputAST(work.getAstStringTree, out, false, 0) 64 | out.println() 65 | 66 | ExplainTask.outputDependencies(out, work.isFormatted(), work.getRootTasks, 0) 67 | out.println() 68 | 69 | // Go over all the tasks and dump out the plans 70 | ExplainTask.outputStagePlans(out, work, work.getRootTasks, 0) 71 | 72 | // Print the Shark query plan if applicable. 73 | if (work != null && work.getRootTasks != null && work.getRootTasks.size > 0) { 74 | work.getRootTasks.zipWithIndex.foreach { case(task, taskIndex) => 75 | task match { 76 | case sparkTask: SparkTask => { 77 | out.println("SHARK QUERY PLAN #%d:".format(taskIndex)) 78 | val terminalOp = sparkTask.getWork().terminalOperator 79 | ExplainTaskHelper.outputPlan(terminalOp, out, work.getExtended, 2) 80 | out.println() 81 | } 82 | case _ => null 83 | } 84 | } 85 | } 86 | 87 | out.close() 88 | 0 89 | } catch { 90 | case e: Exception => { 91 | console.printError("Failed with exception " + e.getMessage(), "\n" + 92 | StringUtils.stringifyException(e)) 93 | throw e 94 | 1 95 | } 96 | } 97 | } 98 | 99 | override def initialize(conf: HiveConf, queryPlan: QueryPlan, driverContext: DriverContext) { 100 | hiveExplainTask.initialize(conf, queryPlan, driverContext) 101 | super.initialize(conf, queryPlan, driverContext) 102 | } 103 | 104 | override def getType = hiveExplainTask.getType 105 | 106 | override def getName = hiveExplainTask.getName 107 | 108 | override def localizeMRTmpFilesImpl(ctx: Context) { 109 | // explain task has nothing to localize 110 | // we don't expect to enter this code path at all 111 | throw new RuntimeException ("Unexpected call") 112 | } 113 | 114 | } 115 | 116 | -------------------------------------------------------------------------------- /run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This file is used to launch Shark on the master. 
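# A minimal usage sketch, assuming Shark has already been compiled and conf/shark-env.sh is
# configured. The class name below is an assumption for illustration; pass whichever Shark
# main class you intend to launch as the script's arguments:
#   ./run shark.SharkCliDriver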
4 | export SCALA_VERSION=2.10 5 | SHARK_VERSION=0.9.2 6 | 7 | # Figure out where the framework is installed 8 | FWDIR="$(cd `dirname $0`; pwd)" 9 | 10 | export SHARK_HOME="$FWDIR" 11 | 12 | # Load environment variables from conf/shark-env.sh, if it exists 13 | if [ -e $SHARK_HOME/conf/shark-env.sh ] ; then 14 | . $SHARK_HOME/conf/shark-env.sh 15 | fi 16 | 17 | if [ -n "$MASTER" ] ; then 18 | if [ -z $SPARK_HOME ] ; then 19 | echo "No SPARK_HOME specified. Please set SPARK_HOME for cluster mode." 20 | exit 1 21 | fi 22 | fi 23 | 24 | # check for shark with spark on yarn params 25 | if [ "x$SHARK_EXEC_MODE" == "xyarn" ] ; then 26 | if [ "x$SPARK_ASSEMBLY_JAR" == "x" ] ; then 27 | echo "No SPARK_ASSEMBLY_JAR specified. Please set SPARK_ASSEMBLY_JAR for spark on yarn mode." 28 | exit 1 29 | else 30 | export SPARK_JAR=$SPARK_ASSEMBLY_JAR 31 | fi 32 | 33 | if [ "x$SHARK_ASSEMBLY_JAR" == "x" ] ; then 34 | echo "No SHARK_ASSEMBLY_JAR specified. please set SHARK_ASSEMBLY_JAR for spark on yarn mode." 35 | exit 1 36 | else 37 | export SPARK_YARN_APP_JAR=$SHARK_ASSEMBLY_JAR 38 | fi 39 | 40 | # use yarn-client mode for interactive shell. 41 | export MASTER=yarn-client 42 | fi 43 | 44 | # Check for optionally specified configuration file path 45 | if [ "x$HIVE_CONF_DIR" == "x" ] ; then 46 | HIVE_CONF_DIR="$SHARK_HOME/conf" 47 | fi 48 | 49 | if [ -f "${HIVE_CONF_DIR}/hive-env.sh" ]; then 50 | . "${HIVE_CONF_DIR}/hive-env.sh" 51 | fi 52 | 53 | # Add Shark jars. 54 | for jar in `find $SHARK_HOME/lib -name '*jar'`; do 55 | SPARK_CLASSPATH+=:$jar 56 | done 57 | for jar in `find $SHARK_HOME/lib_managed/jars -name '*jar'`; do 58 | SPARK_CLASSPATH+=:$jar 59 | done 60 | for jar in `find $SHARK_HOME/lib_managed/bundles -name '*jar'`; do 61 | SPARK_CLASSPATH+=:$jar 62 | done 63 | 64 | SPARK_CLASSPATH+=:$HIVE_CONF_DIR 65 | 66 | # Build up Shark's jar or classes. 67 | SHARK_CLASSES="$SHARK_HOME/target/scala-$SCALA_VERSION/classes" 68 | SHARK_JAR="$SHARK_HOME/target/scala-$SCALA_VERSION/shark_$SCALA_VERSION-$SHARK_VERSION.jar" 69 | if [ -d "$SHARK_CLASSES/shark" ] ; then 70 | SPARK_CLASSPATH+=":$SHARK_CLASSES" 71 | else 72 | if [ -f "$SHARK_JAR" ] ; then 73 | SPARK_CLASSPATH+=":$SHARK_JAR" 74 | else 75 | echo "Cannot find either compiled classes or compiled jar package for Shark." 76 | echo "Have you compiled Shark yet?" 77 | exit 1 78 | fi 79 | fi 80 | 81 | SPARK_CLASSPATH+=":$SHARK_HOME/target/scala-$SCALA_VERSION/test-classes" 82 | 83 | 84 | SHARK_JAR="$SHARK_HOME/target/scala-$SCALA_VERSION/shark_$SCALA_VERSION-$SHARK_VERSION.jar" 85 | if [ -f "$SHARK_JAR" ] ; then 86 | SPARK_CLASSPATH+=":$SHARK_JAR" 87 | else 88 | SPARK_CLASSPATH+=":$SHARK_HOME/target/scala-$SCALA_VERSION/classes" 89 | fi 90 | 91 | SPARK_CLASSPATH+=":$SHARK_HOME/target/scala-$SCALA_VERSION/test-classes" 92 | 93 | 94 | if [ "x$HADOOP_HOME" == "x" ] ; then 95 | echo "No HADOOP_HOME specified. Shark will run in local-mode" 96 | else 97 | SPARK_CLASSPATH+=:$HADOOP_HOME/etc/hadoop 98 | SPARK_CLASSPATH+=:$HADOOP_HOME/conf 99 | fi 100 | 101 | 102 | # TODO(rxin): Check aux classpath and aux java opts. 
103 | #CLASSPATH=${CLASSPATH}:${AUX_CLASSPATH} 104 | 105 | export SPARK_CLASSPATH 106 | export CLASSPATH+=$SPARK_CLASSPATH # Needed for spark-shell 107 | 108 | export SPARK_JAVA_OPTS+=" $TEST_JAVA_OPTS" 109 | 110 | # supress the HADOOP_HOME warnings in 1.x.x 111 | export HADOOP_HOME_WARN_SUPPRESS=true 112 | 113 | if [ "x$SHARK_MASTER_MEM" == "x" ] ; then 114 | SHARK_MASTER_MEM="512m" 115 | fi 116 | 117 | # Set JAVA_OPTS to be able to load native libraries and to set heap size 118 | JAVA_OPTS+="$SPARK_JAVA_OPTS" 119 | JAVA_OPTS+=" -Djava.library.path=$SPARK_LIBRARY_PATH" 120 | JAVA_OPTS+=" -Xms$SHARK_MASTER_MEM -Xmx$SHARK_MASTER_MEM" 121 | export JAVA_OPTS 122 | 123 | # In case we are running Ant 124 | export ANT_OPTS=$JAVA_OPTS 125 | 126 | if [ "x$RUNNER" == "x" ] ; then 127 | if [ -n "$JAVA_HOME" ]; then 128 | RUNNER="${JAVA_HOME}/bin/java" 129 | else 130 | RUNNER=java 131 | fi 132 | # The JVM doesn't read JAVA_OPTS by default so we need to pass it in 133 | EXTRA_ARGS="$JAVA_OPTS" 134 | fi 135 | 136 | exec $RUNNER $EXTRA_ARGS "$@" 137 | -------------------------------------------------------------------------------- /src/test/scala/shark/SharkServerSuite.scala: -------------------------------------------------------------------------------- 1 | package shark 2 | 3 | import java.io.{BufferedReader, InputStreamReader} 4 | import java.sql.DriverManager 5 | import java.sql.Statement 6 | import java.sql.Connection 7 | 8 | import scala.collection.JavaConversions._ 9 | 10 | import org.scalatest.{BeforeAndAfterAll, FunSuite} 11 | import org.scalatest.matchers.ShouldMatchers 12 | 13 | import scala.concurrent._ 14 | import ExecutionContext.Implicits.global 15 | 16 | /** 17 | * Test for the Shark server. 18 | */ 19 | class SharkServerSuite extends FunSuite with BeforeAndAfterAll with ShouldMatchers with TestUtils { 20 | 21 | val WAREHOUSE_PATH = TestUtils.getWarehousePath("server") 22 | val METASTORE_PATH = TestUtils.getMetastorePath("server") 23 | val DRIVER_NAME = "org.apache.hadoop.hive.jdbc.HiveDriver" 24 | val TABLE = "test" 25 | // use a different port, than the hive standard 10000, 26 | // for tests to avoid issues with the port being taken on some machines 27 | val PORT = "9011" 28 | 29 | // If verbose is true, the testing program will print all outputs coming from the shark server. 30 | val VERBOSE = Option(System.getenv("SHARK_TEST_VERBOSE")).getOrElse("false").toBoolean 31 | 32 | Class.forName(DRIVER_NAME) 33 | 34 | override def beforeAll() { launchServer() } 35 | 36 | override def afterAll() { stopServer() } 37 | 38 | private def launchServer(args: Seq[String] = Seq.empty) { 39 | // Forking a new process to start the Shark server. The reason to do this is it is 40 | // hard to clean up Hive resources entirely, so we just start a new process and kill 41 | // that process for cleanup. 
42 | val defaultArgs = Seq("./bin/shark", "--service", "sharkserver", 43 | "--verbose", 44 | "-p", 45 | PORT, 46 | "--hiveconf", 47 | "hive.root.logger=INFO,console", 48 | "--hiveconf", 49 | "\"javax.jdo.option.ConnectionURL=jdbc:derby:;databaseName=" + METASTORE_PATH + ";create=true\"", 50 | "--hiveconf", 51 | "\"hive.metastore.warehouse.dir=" + WAREHOUSE_PATH + "\"") 52 | val pb = new ProcessBuilder(defaultArgs ++ args) 53 | process = pb.start() 54 | inputReader = new BufferedReader(new InputStreamReader(process.getInputStream)) 55 | errorReader = new BufferedReader(new InputStreamReader(process.getErrorStream)) 56 | waitForOutput(inputReader, "Starting Shark server") 57 | 58 | // Spawn a thread to read the output from the forked process. 59 | // Note that this is necessary since in some configurations, log4j could be blocked 60 | // if its output to stderr are not read, and eventually blocking the entire test suite. 61 | future { 62 | while (true) { 63 | val stdout = readFrom(inputReader) 64 | val stderr = readFrom(errorReader) 65 | if (VERBOSE && stdout.length > 0) { 66 | println(stdout) 67 | } 68 | if (VERBOSE && stderr.length > 0) { 69 | println(stderr) 70 | } 71 | Thread.sleep(50) 72 | } 73 | } 74 | } 75 | 76 | private def stopServer() { 77 | process.destroy() 78 | process.waitFor() 79 | } 80 | 81 | test("test query execution against a shark server") { 82 | Thread.sleep(5*1000) // I know... Gross. However, without this the tests fail non-deterministically. 83 | 84 | val dataFilePath = TestUtils.dataFilePath + "/kv1.txt" 85 | val stmt = createStatement() 86 | stmt.executeQuery("DROP TABLE IF EXISTS test") 87 | stmt.executeQuery("DROP TABLE IF EXISTS test_cached") 88 | stmt.executeQuery("CREATE TABLE test(key int, val string)") 89 | stmt.executeQuery("LOAD DATA LOCAL INPATH '" + dataFilePath+ "' OVERWRITE INTO TABLE test") 90 | stmt.executeQuery("CREATE TABLE test_cached as select * from test limit 499") 91 | 92 | var rs = stmt.executeQuery("select count(*) from test") 93 | rs.next() 94 | rs.getInt(1) should equal (500) 95 | 96 | rs = stmt.executeQuery("select count(*) from test_cached") 97 | rs.next() 98 | rs.getInt(1) should equal (499) 99 | 100 | stmt.close() 101 | } 102 | 103 | def getConnection(): Connection = { 104 | DriverManager.getConnection("jdbc:hive://localhost:" + PORT + "/default", "", "") 105 | } 106 | 107 | def createStatement(): Statement = getConnection().createStatement() 108 | } --------------------------------------------------------------------------------