├── data
│   └── files
│       ├── users.txt
│       ├── test1.txt
│       ├── clicks.txt
│       ├── create_nested_type.txt
│       └── kv3.txt
├── src
│   ├── main
│   │   ├── resources
│   │   │   ├── dashboard
│   │   │   │   ├── README
│   │   │   │   └── dashboard.css
│   │   │   └── tablerdd
│   │   │       ├── generator_utils.py
│   │   │       ├── SharkContext_sqlRdd_generator.py
│   │   │       ├── rddtable_generator.py
│   │   │       └── TableRDDGenerated_generator.py
│   │   ├── scala
│   │   │   └── shark
│   │   │       ├── api
│   │   │       │   ├── ClassTags.scala
│   │   │       │   ├── DataType.java
│   │   │       │   ├── QueryExecutionException.scala
│   │   │       │   ├── ColumnDesc.scala
│   │   │       │   ├── ResultSet.scala
│   │   │       │   ├── PythonTableRDD.scala
│   │   │       │   ├── TableRDD.scala
│   │   │       │   ├── JavaTableRDD.scala
│   │   │       │   └── RDDTableFunctions.scala
│   │   │       ├── server
│   │   │       │   ├── SharkSessionManager.scala
│   │   │       │   ├── SharkOperationManager.scala
│   │   │       │   ├── SharkCLIService.scala
│   │   │       │   ├── SharkExecuteStatementOperation.scala
│   │   │       │   └── SharkSQLOperation.scala
│   │   │       ├── tachyon
│   │   │       │   └── TachyonException.scala
│   │   │       ├── memstore2
│   │   │       │   ├── column
│   │   │       │   │   ├── MemoryStoreException.scala
│   │   │       │   │   ├── NullableColumnIterator.scala
│   │   │       │   │   ├── NullableColumnBuilder.scala
│   │   │       │   │   ├── ColumnIterators.scala
│   │   │       │   │   └── ColumnBuilders.scala
│   │   │       │   ├── TablePartitionStats.scala
│   │   │       │   ├── ColumnarStruct.scala
│   │   │       │   ├── LazySimpleSerDeWrapper.scala
│   │   │       │   ├── Table.scala
│   │   │       │   ├── TablePartitionIterator.scala
│   │   │       │   ├── TablePartitionBuilder.scala
│   │   │       │   ├── CacheType.scala
│   │   │       │   ├── TableRecovery.scala
│   │   │       │   ├── SharkTblProperties.scala
│   │   │       │   ├── MemoryTable.scala
│   │   │       │   ├── ColumnarStructObjectInspector.scala
│   │   │       │   └── TablePartition.scala
│   │   │       ├── parse
│   │   │       │   ├── QueryContext.scala
│   │   │       │   ├── SharkSemanticAnalyzerFactory.scala
│   │   │       │   ├── QueryBlock.scala
│   │   │       │   ├── SharkExplainSemanticAnalyzer.scala
│   │   │       │   └── SharkLoadSemanticAnalyzer.scala
│   │   │       ├── execution
│   │   │       │   ├── LateralViewForwardOperator.scala
│   │   │       │   ├── ForwardOperator.scala
│   │   │       │   ├── MapSplitPruningHelper.scala
│   │   │       │   ├── serialization
│   │   │       │   │   ├── JavaSerializer.scala
│   │   │       │   │   ├── KryoSerializer.scala
│   │   │       │   │   ├── HiveStructSerializer.scala
│   │   │       │   │   ├── HiveConfPersistenceDelegate.scala
│   │   │       │   │   ├── KryoSerializationWrapper.scala
│   │   │       │   │   ├── SerializableWritable.scala
│   │   │       │   │   ├── HiveStructDeserializer.scala
│   │   │       │   │   ├── OperatorSerializationWrapper.scala
│   │   │       │   │   └── XmlSerializer.scala
│   │   │       │   ├── package.scala
│   │   │       │   ├── ReduceSinkTableDesc.scala
│   │   │       │   ├── GroupByOperator.scala
│   │   │       │   ├── LimitOperator.scala
│   │   │       │   ├── ScriptOperatorHelper.scala
│   │   │       │   ├── FilterOperator.scala
│   │   │       │   ├── TerminalOperator.scala
│   │   │       │   ├── SelectOperator.scala
│   │   │       │   ├── UDTFOperator.scala
│   │   │       │   ├── JoinUtil.scala
│   │   │       │   └── SharkExplainTask.scala
│   │   │       ├── repl
│   │   │       │   ├── Main.scala
│   │   │       │   └── SharkILoop.scala
│   │   │       ├── util
│   │   │       │   └── QueryRewriteUtils.scala
│   │   │       ├── SharkServer2.scala
│   │   │       ├── LogHelper.scala
│   │   │       ├── optimizer
│   │   │       │   ├── SharkMapJoinProcessor.scala
│   │   │       │   └── SharkOptimizer.scala
│   │   │       └── KryoRegistrator.scala
│   │   └── java
│   │       └── shark
│   │           └── tgf
│   │               └── Schema.java
│   ├── test
│   │   ├── 0.20S-exclude.txt
│   │   ├── scala
│   │   │   └── shark
│   │   │       ├── util
│   │   │       │   └── BloomFilterSuite.scala
│   │   │       ├── SortSuite.scala
│   │   │       ├── UtilsSuite.scala
│   │   │       ├── CliSuite.scala
│   │   │       ├── execution
│   │   │       │   └── HiveStructSerializerSuite.scala
│   │   │       ├── memstore2
│   │   │       │   └── column
│   │   │       │       ├── NullableColumnIteratorSuite.scala
│   │   │       │       └── ColumnTypeSuite.scala
│   │   │       └── SharkServerSuite.scala
│   │   ├── README.md
│   │   └── 0.20S-include.txt
│   └── tachyon_enabled
│       └── scala
│           └── shark
│               └── tachyon
│                   └── TachyonOffHeapTableWriter.scala
├── lib
│   ├── pyrolite.jar
│   └── JavaEWAH-0.4.2.jar
├── README.md
├── conf
│   ├── log4j.properties.template
│   └── shark-env.sh.template
├── .gitignore
├── bin
│   ├── shark-shell
│   ├── shark-withinfo
│   ├── shark-withdebug
│   ├── beeline
│   ├── ext
│   │   ├── cli.sh
│   │   ├── sharkserver.sh
│   │   ├── beeline.sh
│   │   └── sharkserver2.sh
│   ├── dev
│   │   ├── release_cleanup.sh
│   │   ├── clear-buffer-cache.py
│   │   ├── build_test.xml
│   │   └── test
│   └── shark
├── project
│   ├── build.properties
│   └── plugins.sbt
└── run
/data/files/users.txt:
--------------------------------------------------------------------------------
1 | 1 A
2 | 2 B
3 | 3 A
4 |
--------------------------------------------------------------------------------
/data/files/test1.txt:
--------------------------------------------------------------------------------
1 | 1 012
2 | 2 345
3 | 3 678
4 |
--------------------------------------------------------------------------------
/data/files/clicks.txt:
--------------------------------------------------------------------------------
1 | 1 0
2 | 2 1
3 | 1 1
4 | 2 0
5 | 1 1
6 |
7 |
--------------------------------------------------------------------------------
/src/main/resources/dashboard/README:
--------------------------------------------------------------------------------
1 | Place static files here.
2 |
--------------------------------------------------------------------------------
/lib/pyrolite.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amplab/shark/HEAD/lib/pyrolite.jar
--------------------------------------------------------------------------------
/lib/JavaEWAH-0.4.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amplab/shark/HEAD/lib/JavaEWAH-0.4.2.jar
--------------------------------------------------------------------------------
/data/files/create_nested_type.txt:
--------------------------------------------------------------------------------
1 | a0b00b01c001C001c002C002c011\Nc012C012d01d011d012d02d021d022
2 | a1b10c001C001c002C002d01d011d012d02\N
3 | a2c001\Nc002C002c011C011c012C012d01\Nd012d02d021d022
4 | a3\N\N\N
5 |
--------------------------------------------------------------------------------
/src/test/0.20S-exclude.txt:
--------------------------------------------------------------------------------
1 | testCliDriver_archive_excludeHadoop20
2 | testCliDriver_auto_join14
3 | testCliDriver_combine2
4 | testCliDriver_ctas
5 | testCliDriver_input12
6 | testCliDriver_input39
7 | testCliDriver_join14
8 | testCliDriver_loadpart_err
9 | testCliDriver_sample_islocalmode_hook
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Shark (Hive on Spark)
2 |
3 |
4 | Development of Shark has ended, and the project has been subsumed by [Spark SQL](http://spark.apache.org/sql/). Please see [this blog post](http://databricks.com/blog/2014/07/01/shark-spark-sql-hive-on-spark-and-the-future-of-sql-on-spark.html) for more information.
5 |
--------------------------------------------------------------------------------
/data/files/kv3.txt:
--------------------------------------------------------------------------------
1 | 238val_238
2 |
3 | 311val_311
4 | val_27
5 | val_165
6 | val_409
7 | 255val_255
8 | 278val_278
9 | 98val_98
10 | val_484
11 | val_265
12 | val_193
13 | 401val_401
14 | 150val_150
15 | 273val_273
16 | 224
17 | 369
18 | 66val_66
19 | 128
20 | 213val_213
21 | 146val_146
22 | 406val_406
23 |
24 |
25 |
26 |
--------------------------------------------------------------------------------
/conf/log4j.properties.template:
--------------------------------------------------------------------------------
1 | # Set everything to be logged to the console
2 | log4j.rootCategory=INFO, console
3 | log4j.appender.console=org.apache.log4j.ConsoleAppender
4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
5 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
6 |
7 | # Ignore messages below warning level from Jetty, because it's a bit verbose
8 | log4j.logger.org.eclipse.jetty=WARN
9 |
--------------------------------------------------------------------------------
/src/main/scala/shark/api/ClassTags.scala:
--------------------------------------------------------------------------------
1 | package shark.api
2 |
3 | import scala.reflect.classTag
4 |
5 | object ClassTags {
6 | // List of primitive ClassTags.
7 | val jBoolean = classTag[java.lang.Boolean]
8 | val jByte = classTag[java.lang.Byte]
9 | val jShort = classTag[java.lang.Short]
10 | val jInt = classTag[java.lang.Integer]
11 | val jLong = classTag[java.lang.Long]
12 | val jFloat = classTag[java.lang.Float]
13 | val jDouble = classTag[java.lang.Double]
14 | }
15 |
--------------------------------------------------------------------------------
/src/main/scala/shark/api/DataType.java:
--------------------------------------------------------------------------------
1 | package shark.api;
2 |
3 | import java.io.Serializable;
4 |
5 |
6 | public class DataType implements Serializable {
7 |
8 | public final String name;
9 | public final String hiveName;
10 | public final boolean isPrimitive;
11 |
12 | DataType(String name, String hiveName, boolean isPrimitive) {
13 | this.name = name;
14 | this.hiveName = hiveName;
15 | this.isPrimitive = isPrimitive;
16 | }
17 |
18 | @Override
19 | public String toString() {
20 | return name;
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/src/main/scala/shark/server/SharkSessionManager.scala:
--------------------------------------------------------------------------------
1 | package shark.server
2 |
3 | import org.apache.hadoop.hive.conf.HiveConf
4 | import org.apache.hive.service.cli.session.SessionManager
5 | import shark.Utils
6 |
7 | class SharkSessionManager extends SessionManager {
8 | override def init(hiveConf : HiveConf) {
9 | this.synchronized {
10 | val sharkOpManager = new SharkOperationManager
11 | Utils.setSuperField("operationManager", sharkOpManager, this)
12 | addService(sharkOpManager)
13 | sharkInit(hiveConf)
14 | }
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
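Note: SharkSessionManager above swaps in a SharkOperationManager by writing a private superclass field through shark.Utils.setSuperField, whose source is not included in this dump. A minimal reflection-based sketch of such a helper, matching only the call shape setSuperField(fieldName, value, target) seen above (an illustrative assumption, not the actual shark.Utils code):

    import java.lang.reflect.Field

    object ReflectionUtils {
      // Write a (possibly private) field declared on the target's superclass.
      def setSuperField(fieldName: String, value: AnyRef, target: AnyRef) {
        val field: Field = target.getClass.getSuperclass.getDeclaredField(fieldName)
        field.setAccessible(true)  // bypass the private access modifier
        field.set(target, value)
      }
    }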
/src/main/resources/tablerdd/generator_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | import sys
3 |
4 | # e.g. createList(1,3, "T[", "]", ",") gives T[1],T[2],T[3]
5 | def createList(start, stop, prefix, suffix="", sep = ",", newlineAfter = 70, indent = 0):
6 | res = ""
7 | oneLine = res
8 | for y in range(start,stop+1):
9 | res += prefix + str(y) + suffix
10 | oneLine += prefix + str(y) + suffix
11 | if y != stop:
12 | res += sep
13 | oneLine += sep
14 | if len(oneLine) > newlineAfter:
15 | res += "\n" + " "*indent
16 | oneLine = ""
17 | return res
18 |
19 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | target/
2 | build/
3 | metastore_db/
4 | project/boot
5 | lib_managed/
6 | TempStatsStore
7 | work/
8 | run-tests-from-scratch-workspace/
9 | sbt/*.jar
10 | conf/shark-env.sh
11 |
12 | # Compiled Source
13 | *.class
14 |
15 | # Packages
16 | #*.jar
17 |
18 | # Log Files
19 | *.log
20 |
21 | # Eclipse project files
22 | .classpath
23 | .project
24 | .settings
25 |
26 | # emacs backup
27 | *~
28 |
29 | # tmp files
30 | *.swp
31 | .cache
32 |
33 | # mac os file
34 | *.DS_Store
35 |
36 | # latex files
37 | paper.pdf
38 | paper.blg
39 | paper.bbl
40 | paper.aux
41 |
42 | # IntelliJ IDE files
43 | .idea
44 | *.iml
45 |
46 | # Test Reports
47 | TEST*.xml
48 | test_warehouses
49 |
50 | # Ensime files for emacs
51 | .ensime
52 | .ensime_lucene
53 | /eclipse_bin
54 | /.scala_dependencies
55 |
--------------------------------------------------------------------------------
/src/main/resources/tablerdd/SharkContext_sqlRdd_generator.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | from string import Template
3 | import sys
4 |
5 | from generator_utils import *
6 |
7 | ## This script generates the sqlRdd functions for SharkContext.scala
8 |
9 | p = sys.stdout
10 |
11 | # The SharkContext declarations
12 | for x in range(2,23):
13 | sqlRddFun = Template(
14 | """
15 | def sqlRdd[$list1](cmd: String):
16 | RDD[Tuple$num[$list2]] = {
17 | new TableRDD$num[$list2](sql2rdd(cmd),
18 | Seq($list3))
19 | }
20 | """).substitute(num = x,
21 | list1 = createList(1, x, "T", ": M", ", ", 80, 4),
22 | list2 = createList(1, x, "T", sep=", ", indent = 4),
23 | list3 = createList(1, x, "m[T", "]", sep=", ", indent = 10))
24 | p.write(sqlRddFun)
25 |
--------------------------------------------------------------------------------
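For reference, substituting num = 2 into the template above yields roughly the following Scala (the context bound M, the m[...] helper, sql2rdd, and TableRDD2 are all defined elsewhere in the SharkContext/TableRDD sources, which are not shown in this dump):

    def sqlRdd[T1: M, T2: M](cmd: String):
      RDD[Tuple2[T1, T2]] = {
      new TableRDD2[T1, T2](sql2rdd(cmd),
        Seq(m[T1], m[T2]))
    }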
/bin/shark-shell:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | # Copyright (C) 2012 The Regents of The University California.
4 | # All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | BINDIR="`dirname $0`"
19 | FWDIR="`dirname $BINDIR`"
20 | exec $FWDIR/run shark.repl.Main "$@"
21 |
--------------------------------------------------------------------------------
/src/main/scala/shark/tachyon/TachyonException.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.tachyon
19 |
20 | class TachyonException(msg: String) extends Exception(msg)
21 |
--------------------------------------------------------------------------------
/src/main/scala/shark/api/QueryExecutionException.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.api
19 |
20 |
21 | class QueryExecutionException(message: String) extends Exception(message)
22 |
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | sbt.version=0.13.1
18 |
19 |
--------------------------------------------------------------------------------
/src/main/scala/shark/memstore2/column/MemoryStoreException.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.memstore2.column
19 |
20 |
21 | class MemoryStoreException(message: String) extends Exception(message)
22 |
--------------------------------------------------------------------------------
/src/main/scala/shark/server/SharkOperationManager.scala:
--------------------------------------------------------------------------------
1 | package shark.server
2 |
3 | import java.util.{Map => JMap}
4 | import org.apache.hive.service.cli.operation.{ExecuteStatementOperation, OperationManager}
5 | import org.apache.hive.service.cli.session.HiveSession
6 |
7 | class SharkOperationManager extends OperationManager {
8 | override def newExecuteStatementOperation(parentSession: HiveSession,
9 | statement: String, confOverlay:
10 | JMap[String, String])
11 | : ExecuteStatementOperation = {
12 | val executeStatementOperation = SharkExecuteStatementOperation
13 | .newExecuteStatementOperation(parentSession, statement, confOverlay)
14 | val castOp = executeStatementOperation.asInstanceOf[ExecuteStatementOperation]
15 | addOperation(castOp)
16 | castOp
17 | }
18 |
19 | }
20 |
--------------------------------------------------------------------------------
/bin/shark-withinfo:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | # Copyright (C) 2012 The Regents of The University California.
4 | # All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | # This is really just a wrapper around bin/shark to pipe INFO log to console.
19 | # Very handy for debugging.
20 |
21 | BINDIR="`dirname $0`"
22 | exec $BINDIR/shark -hiveconf hive.root.logger=INFO,console "$@"
23 |
--------------------------------------------------------------------------------
/bin/shark-withdebug:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | # Copyright (C) 2012 The Regents of The University California.
4 | # All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | # This is really just a wrapper around bin/shark to pipe DEBUG log to console.
19 | # Very handy for debugging.
20 |
21 | BINDIR="`dirname $0`"
22 | exec $BINDIR/shark -hiveconf hive.root.logger=DEBUG,console "$@"
23 |
24 |
--------------------------------------------------------------------------------
/bin/beeline:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Licensed to the Apache Software Foundation (ASF) under one or more
4 | # contributor license agreements. See the NOTICE file distributed with
5 | # this work for additional information regarding copyright ownership.
6 | # The ASF licenses this file to You under the Apache License, Version 2.0
7 | # (the "License"); you may not use this file except in compliance with
8 | # the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | bin=`dirname "$0"`
19 | bin=`cd "$bin"; pwd`
20 |
21 | . "$bin"/shark --service beeline "$@"
22 |
--------------------------------------------------------------------------------
/bin/ext/cli.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | # Copyright (C) 2012 The Regents of The University California.
4 | # All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | THISSERVICE=cli
19 | export SERVICE_LIST="${SERVICE_LIST}${THISSERVICE} "
20 |
21 | cli() {
22 | echo "Starting the Shark Command Line Client"
23 | exec $FWDIR/run shark.SharkCliDriver "$@"
24 | }
25 |
26 | cli_help() {
27 | echo "usage ./shark --service cli"
28 | }
29 |
--------------------------------------------------------------------------------
/src/test/scala/shark/util/BloomFilterSuite.scala:
--------------------------------------------------------------------------------
1 | package shark.util
2 |
3 | import org.scalatest.FunSuite
4 |
5 | class BloomFilterSuite extends FunSuite{
6 |
7 | test("Integer") {
8 | val bf = new BloomFilter(0.03, 1000000)
9 | Range(0, 1000000).foreach {
10 | i => bf.add(i)
11 | }
12 | assert(bf.contains(333))
13 | assert(bf.contains(678))
14 | assert(!bf.contains(1200000))
15 | }
16 |
17 | test("Integer FP") {
18 | val bf = new BloomFilter(0.03,1000)
19 | Range(0,700).foreach {
20 | i => bf.add(i)
21 | }
22 | assert(bf.contains(333))
23 | assert(bf.contains(678))
24 | // Is the fraction of false positives in line with what we expect?
25 | val e = Range(0, 100).map {
26 | i => bf.contains(i*10)
27 | }
28 | val s = e.groupBy(x => x).map(x => (x._1, x._2.size))
29 | val t = s(true)
30 | val f = s(false)
31 | assert(f > 25 && f < 35)
32 | assert(t < 75 && t > 65)
33 | // expect false positive to be < 3 % and no false negatives
34 | }
35 | }
--------------------------------------------------------------------------------
/bin/ext/sharkserver.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | # Copyright (C) 2012 The Regents of The University California.
4 | # All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | THISSERVICE=sharkserver
19 | export SERVICE_LIST="${SERVICE_LIST}${THISSERVICE} "
20 |
21 | sharkserver() {
22 | echo "Starting the Shark Server"
23 | exec $FWDIR/run shark.SharkServer "$@"
24 | }
25 |
26 | sharkserver_help() {
27 | echo "usage SHARK_PORT=xxxx ./shark --service sharkserver"
28 | echo "SHARK_PORT : Specify the server port"
29 | }
30 |
--------------------------------------------------------------------------------
/src/main/scala/shark/parse/QueryContext.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.parse
19 |
20 | import org.apache.hadoop.conf.Configuration
21 | import org.apache.hadoop.hive.ql.Context
22 |
23 | /**
24 | * Shark's query context. Adds Shark-specific information to Hive's Context.
25 | */
26 | class QueryContext(conf: Configuration, val useTableRddSink: Boolean) extends Context(conf)
27 |
--------------------------------------------------------------------------------
/src/test/README.md:
--------------------------------------------------------------------------------
1 | ### Hive Compatibility Test Warnings
2 |
3 | #### Test results that rely on tables with `timestamp` fields may differ across JVM versions.
4 | For example, these tests:
5 | * udf5
6 | * timestamp_1, timestamp_2, timestamp_udf
7 |
8 | Pass when running with this JVM:
9 | (Mac 10.9, AMPLab Jenkins)
10 | java version "1.7.0_25"
11 | Java(TM) SE Runtime Environment (build 1.7.0_25-b15)
12 | Java HotSpot(TM) 64-Bit Server VM (build 23.25-b01, mixed mode)
13 |
14 | But fail on EC2 when run with this JVM:
15 | (EC2 c2.2xlarge)
16 | java version "1.7.0_45"
17 | OpenJDK Runtime Environment (amzn-2.4.3.2.32.amzn1-x86_64 u45-b15)
18 | OpenJDK 64-Bit Server VM (build 24.45-b08, mixed mode)
19 |
20 |
21 | A few more tests from test_pass.txt that fall into this category:
22 | TestCliDriver_input_part8
23 | TestSharkCliDriver: testCliDriver_timestamp_1
24 | TestSharkCliDriver: testCliDriver_timestamp_2
25 | TestSharkCliDriver: testCliDriver_timestamp_3
26 | TestSharkCliDriver: testCliDriver_timestamp_udf
27 | TestSharkCliDriver: testCliDriver_udf_to_unix_timestamp
28 | TestSharkCliDriver: testCliDriver_udf5
29 |
--------------------------------------------------------------------------------
/src/main/scala/shark/execution/LateralViewForwardOperator.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.execution
19 |
20 | import org.apache.hadoop.hive.ql.plan.LateralViewForwardDesc
21 |
22 | import org.apache.spark.rdd.RDD
23 |
24 |
25 | class LateralViewForwardOperator extends UnaryOperator[LateralViewForwardDesc] {
26 |
27 | override def execute(): RDD[_] = executeParents().head._2
28 |
29 | override def processPartition(split: Int, iter: Iterator[_]) = iter
30 |
31 | }
32 |
33 |
--------------------------------------------------------------------------------
/bin/ext/beeline.sh:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | # Need arguments [host [port [db]]]
17 | THISSERVICE=beeline
18 | export SERVICE_LIST="${SERVICE_LIST}${THISSERVICE} "
19 |
20 | beeline () {
21 | CLASS=org.apache.hive.beeline.BeeLine;
22 | exec $FWDIR/run $CLASS "$@"
23 | }
24 |
25 | beeline_help () {
26 | CLASS=org.apache.hive.beeline.BeeLine;
27 | exec $FWDIR/run $CLASS "--help"
28 | }
29 |
30 |
--------------------------------------------------------------------------------
/src/main/scala/shark/execution/ForwardOperator.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.execution
19 |
20 | import org.apache.spark.rdd.RDD
21 | import org.apache.hadoop.hive.ql.plan.ForwardDesc
22 |
23 |
24 | class ForwardOperator extends UnaryOperator[ForwardDesc] {
25 |
26 | override def execute(): RDD[_] = executeParents().head._2
27 |
28 | override def processPartition(split: Int, iter: Iterator[_]) =
29 | throw new UnsupportedOperationException("ForwardOperator.processPartition()")
30 |
31 | }
32 |
--------------------------------------------------------------------------------
/src/main/java/shark/tgf/Schema.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2013 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.tgf;
19 |
20 | import java.lang.annotation.Retention;
21 | import java.lang.annotation.RetentionPolicy;
22 | import java.lang.annotation.ElementType;
23 | import java.lang.annotation.Target;
24 |
25 |
26 | /**
27 | * Schema annotation for TGFs, example syntax: @Schema(spec = "name string, age int")
28 | */
29 | @Retention(RetentionPolicy.RUNTIME)
30 | @Target(ElementType.METHOD)
31 | public @interface Schema {
32 | String spec();
33 | }
34 |
--------------------------------------------------------------------------------
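The annotation above only declares a TGF's output schema as a string. A rough, hypothetical usage sketch follows; the object name, method signature, and RDD types are illustrative only, and the actual TGF contract lives elsewhere in shark.tgf, which is not shown in this dump:

    import org.apache.spark.rdd.RDD
    import shark.tgf.Schema

    object AdultsTGF {
      // Declares that rows produced by this TGF have columns (name string, age int).
      @Schema(spec = "name string, age int")
      def apply(people: RDD[(String, Int)]): RDD[(String, Int)] =
        people.filter { case (_, age) => age >= 18 }
    }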
/src/main/resources/dashboard/dashboard.css:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | body {
19 | background-color : #ffffff;
20 | font-family : sans-serif;
21 | }
22 |
23 | th {
24 | padding-bottom : 10px;
25 | padding-top : 10px;
26 | padding-left : 10px;
27 | padding-right : 10px;
28 | }
29 |
30 | td.node {
31 | padding-bottom : 8px;
32 | padding-top : 8px;
33 | padding-left : 8px;
34 | padding-right : 8px;
35 | }
36 |
37 | table.percent_bar {
38 | width: 200px;
39 | height: 15px;
40 | }
41 |
42 | td.percent_used {
43 | background: #AAAAFF;
44 | }
45 |
46 |
--------------------------------------------------------------------------------
/bin/ext/sharkserver2.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | # Copyright (C) 2012 The Regents of The University California.
4 | # All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | THISSERVICE=sharkserver2
19 | export SERVICE_LIST="${SERVICE_LIST}${THISSERVICE} "
20 |
21 | # Use Java to launch Shark; otherwise the unit tests cannot properly kill
22 | # the server process.
23 | export SHARK_LAUNCH_WITH_JAVA=1
24 |
25 | sharkserver2() {
26 | echo "Starting the Shark Server"
27 | exec $FWDIR/run shark.SharkServer2 "$@"
28 | }
29 |
30 | sharkserver2_help() {
31 | echo "usage HIVE_SERVER2_THRIFT_PORT=xxxx ./shark --service sharkserver2"
32 | echo "HIVE_SERVER2_THRIFT_PORT : Specify the server port"
33 | }
34 |
--------------------------------------------------------------------------------
/src/main/scala/shark/execution/MapSplitPruningHelper.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.hadoop.hive.serde2.objectinspector
19 |
20 | import org.apache.hadoop.hive.serde2.objectinspector.UnionStructObjectInspector.MyField
21 |
22 |
23 | object MapSplitPruningHelper {
24 |
25 | /**
26 | * Extract the UnionStructObjectInspector.MyField's `structField` reference, which is
27 | * package-private.
28 | */
29 | def getStructFieldFromUnionOIField(unionOIMyField: MyField): StructField = {
30 | unionOIMyField.structField
31 | }
32 |
33 | }
34 |
--------------------------------------------------------------------------------
/src/main/scala/shark/memstore2/TablePartitionStats.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.memstore2
19 |
20 | import shark.memstore2.column.ColumnStats
21 |
22 |
23 | /**
24 | * Stores column statistics for a table partition.
25 | */
26 | class TablePartitionStats(val stats: Array[ColumnStats[_]], val numRows: Long)
27 | extends Serializable {
28 |
29 | override def toString =
30 | numRows + " rows\n" +
31 | stats.zipWithIndex.map { case (column, index) =>
32 | " column " + index + " " +
33 | { if (column != null) column.toString else "no column statistics" }
34 | }.mkString("\n")
35 | }
36 |
--------------------------------------------------------------------------------
/src/main/scala/shark/execution/serialization/JavaSerializer.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.execution.serialization
19 |
20 | import java.nio.ByteBuffer
21 |
22 | import org.apache.spark.SparkEnv
23 | import org.apache.spark.serializer.{JavaSerializer => SparkJavaSerializer}
24 |
25 |
26 | object JavaSerializer {
27 | @transient val ser = new SparkJavaSerializer(SparkEnv.get.conf)
28 |
29 | def serialize[T](o: T): Array[Byte] = {
30 | ser.newInstance().serialize(o).array()
31 | }
32 |
33 | def deserialize[T](bytes: Array[Byte]): T = {
34 | ser.newInstance().deserialize[T](ByteBuffer.wrap(bytes))
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/bin/dev/release_cleanup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | # Copyright (C) 2012 The Regents of The University California.
4 | # All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | DEVDIR="`dirname $0`"
19 | BINDIR="`dirname $DEVDIR`"
20 | FWDIR="`dirname $BINDIR`"
21 |
22 | rm -rf $FWDIR/run-tests-from-scratch-workspace
23 | rm -rf $FWDIR/test_warehouses
24 |
25 | rm -rf $FWDIR/conf/shark-env.sh
26 |
27 | rm -rf $FWDIR/metastore_db
28 | rm -rf $FWDIR/derby.log
29 |
30 | rm -rf $FWDIR/project/target $FWDIR/project/project/target
31 |
32 | rm -rf $FWDIR/target/resolution-cache
33 | rm -rf $FWDIR/target/streams
34 | rm -rf $FWDIR/target/scala-*/cache
35 | rm -rf $FWDIR/target/scala-*/classes
36 | rm -rf $FWDIR/target/scala-*/test-classes
37 |
38 | find $FWDIR -name ".DS_Store" -exec rm {} \;
39 | find $FWDIR -name ".history" -exec rm {} \;
40 |
41 |
--------------------------------------------------------------------------------
/src/main/scala/shark/execution/package.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark
19 |
20 | import scala.language.implicitConversions
21 |
22 | import org.apache.hadoop.hive.ql.plan.OperatorDesc
23 |
24 | import shark.execution.serialization.KryoSerializationWrapper
25 | import shark.execution.serialization.OperatorSerializationWrapper
26 |
27 | package object execution {
28 |
29 | type HiveDesc = OperatorDesc // each XXXDesc in Hive is a subclass of Serializable
30 |
31 | implicit def opSerWrapper2op[T <: Operator[_ <: HiveDesc]](
32 | wrapper: OperatorSerializationWrapper[T]): T = wrapper.value
33 |
34 | implicit def kryoWrapper2object[T](wrapper: KryoSerializationWrapper[T]): T = wrapper.value
35 | }
36 |
--------------------------------------------------------------------------------
/src/main/scala/shark/execution/ReduceSinkTableDesc.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.execution
19 |
20 | import org.apache.hadoop.hive.ql.plan.TableDesc
21 | import shark.LogHelper
22 |
23 |
24 | trait ReduceSinkTableDesc extends LogHelper {
25 | self: Operator[_ <: HiveDesc] =>
26 |
27 | // Seq(tag, (Key TableDesc, Value TableDesc))
28 | def keyValueDescs(): Seq[(Int, (TableDesc, TableDesc))] = {
29 | // Collect the parent ReduceSinkOperators and pair each one's tag with its key/value TableDescs.
30 | val reduceSinkOps =
31 | for (op <- self.parentOperators.toSeq if op.isInstanceOf[ReduceSinkOperator])
32 | yield op.asInstanceOf[ReduceSinkOperator]
33 |
34 | reduceSinkOps.map(f => (f.getTag, f.getKeyValueTableDescs))
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/scala/shark/memstore2/ColumnarStruct.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.memstore2
19 |
20 | import java.util.{List => JList, ArrayList => JArrayList}
21 |
22 | import shark.memstore2.column.ColumnIterator
23 |
24 |
25 | /**
26 | * A struct returned by the TablePartitionIterator. It contains references to the same set of
27 | * ColumnIterators and uses those to return individual fields back to the object inspectors.
28 | */
29 | class ColumnarStruct(columnIterators: Array[ColumnIterator]) {
30 |
31 | def getField(columnId: Int): Object = columnIterators(columnId).current
32 |
33 | def getFieldsAsList(): JList[Object] = {
34 | val list = new JArrayList[Object](columnIterators.length)
35 | var i = 0
36 | while (i < columnIterators.length) {
37 | list.add(columnIterators(i).current)
38 | i += 1
39 | }
40 | list
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2012 The Regents of The University California.
2 | // All rights reserved.
3 | //
4 | // Licensed under the Apache License, Version 2.0 (the "License");
5 | // you may not use this file except in compliance with the License.
6 | // You may obtain a copy of the License at
7 | //
8 | // http://www.apache.org/licenses/LICENSE-2.0
9 | //
10 | // Unless required by applicable law or agreed to in writing, software
11 | // distributed under the License is distributed on an "AS IS" BASIS,
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | // See the License for the specific language governing permissions and
14 | // limitations under the License.
15 | scalaVersion := "2.10.3"
16 |
17 | resolvers += Resolver.url(
18 | "sbt-plugin-releases",
19 | new URL("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases/"))(Resolver.ivyStylePatterns)
20 |
21 | resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/"
22 |
23 | addSbtPlugin("org.ensime" % "ensime-sbt-cmd" % "0.1.2")
24 |
25 | addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.4.0")
26 |
27 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.2.0")
28 |
29 | addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.5.1")
30 |
31 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.7.4")
32 |
33 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.10.2")
34 |
35 | addSbtPlugin("com.typesafe.sbt" % "sbt-pgp" % "0.8.3")
36 |
37 | addSbtPlugin("com.alpinenow" % "junit_xml_listener" % "0.5.0")
38 |
39 |
--------------------------------------------------------------------------------
/src/main/scala/shark/execution/GroupByOperator.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.execution
19 |
20 | import org.apache.hadoop.hive.ql.exec.{GroupByOperator => HiveGroupByOperator}
21 | import org.apache.hadoop.hive.ql.exec.{ReduceSinkOperator => HiveReduceSinkOperator}
22 |
23 |
24 | /**
25 | * Unlike Hive, group by in Shark is split into two different operators:
26 | * GroupByPostShuffleOperator and GroupByPreShuffleOperator. The pre-shuffle one
27 | * serves as a combiner on each map partition.
28 | *
29 | * These two classes are defined in org.apache.hadoop.hive.ql.exec package
30 | * (scala files) to get around the problem that some Hive classes are only
31 | * visible within that class.
32 | */
33 | object GroupByOperator {
34 |
35 | def isPostShuffle(op: HiveGroupByOperator): Boolean = {
36 | op.getParentOperators().get(0).isInstanceOf[HiveReduceSinkOperator]
37 | }
38 |
39 | }
40 |
41 |
--------------------------------------------------------------------------------
/bin/dev/clear-buffer-cache.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | # Copyright (C) 2012 The Regents of The University California.
4 | # All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | # Clear OS buffer cache for mesos clusters on EC2.
19 |
20 | import os
21 | import thread
22 | import time
23 |
24 | machinesFile = "/root/spark-ec2/slaves"
25 | machs = open(machinesFile).readlines()
26 | machs = map(lambda s: s.strip(),machs)
27 | machCount = len(machs)
28 | machID = 0
29 | cmd = "sync; echo 3 > /proc/sys/vm/drop_caches"
30 | done = {}
31 |
32 | def dropCachesThread( mach, myID, *args ):
33 | print "SSH to machine %i" % (myID)
34 | os.system("ssh %s '%s'" % (mach, cmd))
35 | done[mach] = "done"
36 |
37 | for mach in ( machs ):
38 | thread.start_new_thread(dropCachesThread, (mach, machID))
39 | machID = machID + 1
40 | time.sleep(0.2)
41 |
42 | while (len(done.keys()) < machCount):
43 | print "waiting for %d tasks to finish..." % (machCount - len(done.keys()))
44 | time.sleep(1)
45 |
46 | print "Done with %i threads" % (len(done.keys()))
47 |
48 |
--------------------------------------------------------------------------------
/src/main/scala/shark/repl/Main.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.repl
19 |
20 | import org.apache.hadoop.hive.common.LogUtils
21 | import org.apache.hadoop.hive.common.LogUtils.LogInitializationException
22 |
23 |
24 | /**
25 | * Shark's REPL entry point.
26 | */
27 | object Main {
28 |
29 | try {
30 | LogUtils.initHiveLog4j()
31 | } catch {
32 | case e: LogInitializationException => // Ignore the error.
33 | }
34 |
35 | private var _interp: SharkILoop = null
36 |
37 | def interp = _interp
38 |
39 | private def interp_=(i: SharkILoop) { _interp = i }
40 |
41 | def main(args: Array[String]) {
42 |
43 | _interp = new SharkILoop
44 |
45 | // We need to set spark.repl.InterpAccessor.interp since it is used
46 | // everywhere in spark.repl code.
47 | org.apache.spark.repl.Main.interp = _interp
48 |
49 | // Start an infinite loop ...
50 | _interp.process(args)
51 | }
52 | }
53 |
--------------------------------------------------------------------------------
/src/main/scala/shark/server/SharkCLIService.scala:
--------------------------------------------------------------------------------
1 | package shark.server
2 |
3 | import org.apache.hive.service.cli.CLIService
4 | import org.apache.hadoop.hive.conf.HiveConf
5 | import org.apache.hadoop.hive.shims.ShimLoader
6 | import org.apache.hive.service.auth.HiveAuthFactory
7 | import java.io.IOException
8 | import org.apache.hive.service.ServiceException
9 | import javax.security.auth.login.LoginException
10 | import org.apache.spark.SparkEnv
11 | import shark.{SharkServer, Utils}
12 |
13 | class SharkCLIService extends CLIService {
14 | override def init(hiveConf: HiveConf) {
15 | this.synchronized {
16 | Utils.setSuperField("hiveConf", hiveConf, this)
17 | val sharkSM = new SharkSessionManager
18 | Utils.setSuperField("sessionManager", sharkSM, this)
19 | addService(sharkSM)
20 | try {
21 | HiveAuthFactory.loginFromKeytab(hiveConf)
22 | val serverUserName = ShimLoader.getHadoopShims
23 | .getShortUserName(ShimLoader.getHadoopShims.getUGIForConf(hiveConf))
24 | Utils.setSuperField("serverUserName", serverUserName, this)
25 | } catch {
26 | case e: IOException => {
27 | throw new ServiceException("Unable to login to kerberos with given principal/keytab", e)
28 | }
29 | case e: LoginException => {
30 | throw new ServiceException("Unable to login to kerberos with given principal/keytab", e)
31 | }
32 | }
33 | // Make sure the ThreadLocal SparkEnv reference is the same for all threads.
34 | SparkEnv.set(SharkServer.sparkEnv)
35 | sharkInit(hiveConf)
36 | }
37 | }
38 | }
39 |
40 |
41 |
--------------------------------------------------------------------------------
/bin/shark:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | # Copyright (C) 2012 The Regents of The University California.
4 | # All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | bin="`dirname $0`"
19 | bin=`cd "$bin"; pwd`
20 | export FWDIR="`dirname $bin`"
21 |
22 | SERVICE=""
23 | HELP=""
24 | while [ $# -gt 0 ];do
25 | case "$1" in
26 | --service)
27 | shift
28 | SERVICE=$1
29 | shift
30 | ;;
31 | --help)
32 | HELP=_help
33 | shift
34 | ;;
35 | *)
36 | break
37 | ;;
38 | esac
39 | done
40 |
41 | if [ "$SERVICE" = "" ] ; then
42 | if [ "$HELP" = "_help" ] ; then
43 | SERVICE="help"
44 | else
45 | SERVICE="cli"
46 | fi
47 | fi
48 | SERVICE_LIST=""
49 |
50 | for i in "$bin"/ext/*.sh ; do
51 | . $i
52 | done
53 |
54 | TORUN=""
55 | for j in $SERVICE_LIST ; do
56 | if [ "$j" = "$SERVICE" ] ; then
57 | TORUN=${j}$HELP
58 | fi
59 | done
60 | echo "$@"
61 | if [ "$TORUN" = "" ] ; then
62 | echo "Service $SERVICE not found"
63 | echo "Available Services: $SERVICE_LIST"
64 | exit 7
65 | else
66 | $TORUN "$@"
67 | fi
68 |
69 |
--------------------------------------------------------------------------------
/src/main/scala/shark/execution/LimitOperator.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.execution
19 |
20 | import org.apache.hadoop.hive.ql.plan.LimitDesc
21 |
22 | import org.apache.spark.rdd.{EmptyRDD, RDD}
23 |
24 | import shark.SharkEnv
25 |
26 |
27 | class LimitOperator extends UnaryOperator[LimitDesc] {
28 |
29 | // Only works on the master program.
30 | def limit = desc.getLimit()
31 |
32 | override def execute(): RDD[_] = {
33 |
34 | val limitNum = desc.getLimit()
35 |
36 | if (limitNum > 0) {
37 | // Take limit on each partition.
38 | val inputRdd = executeParents().head._2
39 | inputRdd.mapPartitions({ iter => iter.take(limitNum) }, preservesPartitioning = true)
40 | } else {
41 | new EmptyRDD(SharkEnv.sc)
42 | }
43 | }
44 |
45 | override def processPartition(split: Int, iter: Iterator[_]) = {
46 | throw new UnsupportedOperationException("LimitOperator.processPartition()")
47 | }
48 | }
49 |
50 |
--------------------------------------------------------------------------------
/src/main/scala/shark/execution/serialization/KryoSerializer.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.execution.serialization
19 |
20 | import java.nio.ByteBuffer
21 |
22 | import org.apache.spark.{SparkConf, SparkEnv}
23 | import org.apache.spark.serializer.{KryoSerializer => SparkKryoSerializer}
24 |
25 | import shark.SharkContext
26 |
27 | /**
28 |  * Java object serialization using Kryo. This is much more efficient than plain Java
29 |  * serialization, but Kryo can be finicky to use. We use it mainly to serialize the
30 |  * object inspectors.
31 | */
32 | object KryoSerializer {
33 |
34 | @transient lazy val ser: SparkKryoSerializer = {
35 | val sparkConf = Option(SparkEnv.get).map(_.conf).getOrElse(new SparkConf())
36 | new SparkKryoSerializer(sparkConf)
37 | }
38 |
39 | def serialize[T](o: T): Array[Byte] = {
40 | ser.newInstance().serialize(o).array()
41 | }
42 |
43 | def deserialize[T](bytes: Array[Byte]): T = {
44 | ser.newInstance().deserialize[T](ByteBuffer.wrap(bytes))
45 | }
46 | }
47 |
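48 | // Usage sketch (illustrative, not part of the original file). Both calls go through a fresh
49 | // SerializerInstance, so the round trip below works for any Kryo-serializable value:
50 | //
51 | //   val bytes = KryoSerializer.serialize(Map("a" -> 1))
52 | //   val back  = KryoSerializer.deserialize[Map[String, Int]](bytes)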
--------------------------------------------------------------------------------
/src/main/scala/shark/execution/ScriptOperatorHelper.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.hadoop.hive.ql.exec
19 | // Put this file in Hive's exec package to access package level visible fields and methods.
20 |
21 | import java.util.{Map => JMap}
22 |
23 | import org.apache.hadoop.conf.Configuration
24 |
25 |
26 | /**
27 |  * A helper class that gives us access to PathFinder and alias in ScriptOperator.
28 |  * This is needed since the PathFinder inner class is not declared
29 |  * static/public.
30 | */
31 | class ScriptOperatorHelper(val op: ScriptOperator) extends ScriptOperator {
32 |
33 | def newPathFinderInstance(envpath: String): op.PathFinder = {
34 | new op.PathFinder(envpath)
35 | }
36 |
37 | def getAlias: String = op.alias
38 |
39 | override def addJobConfToEnvironment(conf: Configuration, env: JMap[String, String]) {
40 | op.addJobConfToEnvironment(conf, env)
41 | }
42 |
43 | override def safeEnvVarName(variable: String): String = {
44 | op.safeEnvVarName(variable)
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/src/main/scala/shark/api/ColumnDesc.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.api
19 |
20 | import java.util.{List => JList}
21 |
22 | import scala.collection.JavaConversions._
23 |
24 | import org.apache.hadoop.hive.metastore.api.FieldSchema
25 | import org.apache.hadoop.hive.metastore.api.Schema
26 |
27 |
28 | class ColumnDesc(val name: String, val dataType: DataType) extends Serializable {
29 |
30 | private[shark] def this(hiveSchema: FieldSchema) {
31 | this(hiveSchema.getName, DataTypes.fromHiveType(hiveSchema.getType))
32 | }
33 |
34 | override def toString = "ColumnDesc(name: %s, type: %s)".format(name, dataType.name)
35 | }
36 |
37 |
38 | object ColumnDesc {
39 |
40 | def createSchema(fieldSchemas: JList[FieldSchema]): Array[ColumnDesc] = {
41 | if (fieldSchemas == null) Array.empty else fieldSchemas.map(new ColumnDesc(_)).toArray
42 | }
43 |
44 | def createSchema(schema: Schema): Array[ColumnDesc] = {
45 | if (schema == null) Array.empty else createSchema(schema.getFieldSchemas)
46 | }
47 | }
48 |
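49 | // Usage sketch (illustrative, not part of the original file): building a schema from the
50 | // FieldSchemas a Hive metastore call might return for a two-column table.
51 | //
52 | //   val fields = java.util.Arrays.asList(
53 | //     new FieldSchema("id", "int", null),
54 | //     new FieldSchema("name", "string", null))
55 | //   val schema: Array[ColumnDesc] = ColumnDesc.createSchema(fields)
56 | //   // schema(0).name == "id"; schema(0).dataType is derived via DataTypes.fromHiveType("int")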
--------------------------------------------------------------------------------
/src/test/0.20S-include.txt:
--------------------------------------------------------------------------------
1 | testCliDriver_archive
2 | testCliDriver_archive_corrupt
3 | testCliDriver_infer_bucket_sort_list_bucket
4 | testCliDriver_list_bucket_dml_1
5 | testCliDriver_list_bucket_dml_11
6 | testCliDriver_list_bucket_dml_12
7 | testCliDriver_list_bucket_dml_13
8 | testCliDriver_list_bucket_dml_2
9 | testCliDriver_list_bucket_dml_3
10 | testCliDriver_list_bucket_dml_4
11 | testCliDriver_list_bucket_dml_5
12 | testCliDriver_list_bucket_dml_6
13 | testCliDriver_list_bucket_dml_7
14 | testCliDriver_list_bucket_dml_8
15 | testCliDriver_list_bucket_dml_9
16 | testCliDriver_list_bucket_query_multiskew_1
17 | testCliDriver_list_bucket_query_multiskew_2
18 | testCliDriver_list_bucket_query_multiskew_3
19 | testCliDriver_list_bucket_query_oneskew_1
20 | testCliDriver_list_bucket_query_oneskew_2
21 | testCliDriver_list_bucket_query_oneskew_3
22 | testCliDriver_recursive_dir
23 | testCliDriver_skewjoin_union_remove_1
24 | testCliDriver_skewjoin_union_remove_2
25 | testCliDriver_split_sample
26 | testCliDriver_union_remove_1
27 | testCliDriver_union_remove_10
28 | testCliDriver_union_remove_11
29 | testCliDriver_union_remove_12
30 | testCliDriver_union_remove_13
31 | testCliDriver_union_remove_14
32 | testCliDriver_union_remove_15
33 | testCliDriver_union_remove_16
34 | testCliDriver_union_remove_17
35 | testCliDriver_union_remove_18
36 | testCliDriver_union_remove_19
37 | testCliDriver_union_remove_2
38 | testCliDriver_union_remove_20
39 | testCliDriver_union_remove_21
40 | testCliDriver_union_remove_22
41 | testCliDriver_union_remove_23
42 | testCliDriver_union_remove_24
43 | testCliDriver_union_remove_3
44 | testCliDriver_union_remove_4
45 | testCliDriver_union_remove_5
46 | testCliDriver_union_remove_7
47 | testCliDriver_union_remove_8
48 | testCliDriver_union_remove_9
--------------------------------------------------------------------------------
/src/main/scala/shark/api/ResultSet.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.api
19 |
20 | import java.util.{Arrays, Collections, List => JList}
21 |
22 |
23 | class ResultSet private[shark](_schema: Array[ColumnDesc], _results: Array[Array[Object]]) {
24 |
25 | /**
26 | * The schema for the query results, for use in Scala.
27 | */
28 | def schema: Seq[ColumnDesc] = _schema.toSeq
29 |
30 | /**
31 | * Query results, for use in Scala.
32 | */
33 | def results: Seq[Array[Object]] = _results.toSeq
34 |
35 | /**
36 | * Get the schema for the query results as an immutable list, for use in Java.
37 | */
38 | def getSchema: JList[ColumnDesc] = Collections.unmodifiableList(Arrays.asList(_schema : _*))
39 |
40 | /**
41 | * Get the query results as an immutable list, for use in Java.
42 | */
43 | def getResults: JList[Array[Object]] = Collections.unmodifiableList(Arrays.asList(_results : _*))
44 |
45 | override def toString: String = {
46 | "ResultSet(" + _schema.map(c => c.name + " " + c.dataType).mkString("\t") + ")\n" +
47 | _results.map(row => row.mkString("\t")).mkString("\n")
48 | }
49 |
50 | }
51 |
--------------------------------------------------------------------------------
/src/main/scala/shark/parse/SharkSemanticAnalyzerFactory.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.parse
19 |
20 | import org.apache.hadoop.hive.conf.HiveConf
21 | import org.apache.hadoop.hive.ql.parse.{ASTNode, BaseSemanticAnalyzer, DDLSemanticAnalyzer,
22 | ExplainSemanticAnalyzer, LoadSemanticAnalyzer, SemanticAnalyzerFactory, SemanticAnalyzer}
23 |
24 | import shark.SharkConfVars
25 |
26 |
27 | object SharkSemanticAnalyzerFactory {
28 |
29 | /**
30 | * Return a semantic analyzer for the given ASTNode.
31 | */
32 |   def get(conf: HiveConf, tree: ASTNode): BaseSemanticAnalyzer = {
33 | val explainMode = SharkConfVars.getVar(conf, SharkConfVars.EXPLAIN_MODE) == "shark"
34 |
35 | SemanticAnalyzerFactory.get(conf, tree) match {
36 | case _: SemanticAnalyzer =>
37 | new SharkSemanticAnalyzer(conf)
38 | case _: ExplainSemanticAnalyzer if explainMode =>
39 | new SharkExplainSemanticAnalyzer(conf)
40 | case _: DDLSemanticAnalyzer =>
41 | new SharkDDLSemanticAnalyzer(conf)
42 | case _: LoadSemanticAnalyzer =>
43 | new SharkLoadSemanticAnalyzer(conf)
44 | case sem: BaseSemanticAnalyzer =>
45 | sem
46 | }
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/src/main/scala/shark/server/SharkExecuteStatementOperation.scala:
--------------------------------------------------------------------------------
1 | package shark.server
2 |
3 | import java.lang.reflect.Constructor
4 | import java.util.{Map => JMap}
5 | import org.apache.hive.service.cli.session.HiveSession
6 |
7 | object SharkExecuteStatementOperation {
8 | def newExecuteStatementOperation(parentSession: HiveSession,
9 | statement: String,
10 | confOverlay: JMap[String, String])
11 | : Any = {
12 | val tokens = statement.trim().split("\\s+")
13 |     val command = tokens(0).toLowerCase
14 | command match {
15 | case "set" => {
16 | val ctor = accessCtor("org.apache.hive.service.cli.operation.SetOperation")
17 | ctor.newInstance(parentSession, statement, confOverlay)
18 | }
19 | case "dfs" => {
20 | val ctor = accessCtor("org.apache.hive.service.cli.operation.DfsOperation")
21 | ctor.newInstance(parentSession, statement, confOverlay)
22 | }
23 | case "add" => {
24 | val ctor = accessCtor("org.apache.hive.service.cli.operation.AddResourceOperation")
25 | ctor.newInstance(parentSession, statement, confOverlay)
26 | }
27 | case "delete" => {
28 | val ctor = accessCtor("org.apache.hive.service.cli.operation.DeleteResourceOperation")
29 | ctor.newInstance(parentSession, statement, confOverlay)
30 | }
31 | case _ => {
32 | new SharkSQLOperation(parentSession, statement, confOverlay)
33 | }
34 | }
35 | }
36 |
37 | def accessCtor(className : String) : Constructor[_] = {
38 | val setClass = Class.forName(className)
39 | val setConst =
40 | setClass.getDeclaredConstructor(
41 | classOf[HiveSession],
42 | classOf[String],
43 | classOf[JMap[String, String]])
44 | setConst.setAccessible(true)
45 | setConst
46 | }
47 | }
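48 |
49 | // Dispatch sketch (illustrative, not part of the original file). The first whitespace-delimited
50 | // token of the statement decides which operation handles it:
51 | //   "set ..."     -> Hive's SetOperation            (constructed reflectively via accessCtor)
52 | //   "dfs ..."     -> Hive's DfsOperation            (constructed reflectively via accessCtor)
53 | //   "add ..."     -> Hive's AddResourceOperation    (constructed reflectively via accessCtor)
54 | //   "delete ..."  -> Hive's DeleteResourceOperation (constructed reflectively via accessCtor)
55 | //   anything else -> SharkSQLOperation, which runs the statement through Shark.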
--------------------------------------------------------------------------------
/src/main/scala/shark/memstore2/LazySimpleSerDeWrapper.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.memstore2
19 |
20 | import java.util.{List => JList, Properties}
21 |
22 | import org.apache.hadoop.conf.Configuration
23 | import org.apache.hadoop.hive.serde2.{SerDe, SerDeStats}
24 | import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe
25 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector
26 | import org.apache.hadoop.io.Writable
27 |
28 |
29 | class LazySimpleSerDeWrapper extends SerDe {
30 |
31 | val _lazySimpleSerDe = new LazySimpleSerDe()
32 |
33 | override def initialize(conf: Configuration, tbl: Properties) {
34 | _lazySimpleSerDe.initialize(conf, tbl)
35 | }
36 |
37 | override def deserialize(blob: Writable): Object = _lazySimpleSerDe.deserialize(blob)
38 |
39 | override def getSerDeStats(): SerDeStats = _lazySimpleSerDe.getSerDeStats()
40 |
41 | override def getObjectInspector: ObjectInspector = _lazySimpleSerDe.getObjectInspector
42 |
43 | override def getSerializedClass: Class[_ <: Writable] = _lazySimpleSerDe.getSerializedClass
44 |
45 | override def serialize(obj: Object, objInspector: ObjectInspector): Writable = {
46 | _lazySimpleSerDe.serialize(obj, objInspector)
47 | }
48 |
49 | }
50 |
--------------------------------------------------------------------------------
/src/test/scala/shark/SortSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark
19 |
20 | import org.apache.hadoop.io.BytesWritable
21 |
22 | import org.scalatest.FunSuite
23 |
24 | import org.apache.spark.SparkContext
25 | import org.apache.spark.rdd.RDD
26 |
27 | import shark.execution.{ReduceKey, ReduceKeyMapSide, ReduceKeyReduceSide, RDDUtils}
28 |
29 |
30 | class SortSuite extends FunSuite {
31 |
32 | TestUtils.init()
33 |
34 | var sc: SparkContext = SharkRunner.init()
35 |
36 | test("order by limit") {
37 | val data = Array((4, 14), (1, 11), (7, 17), (0, 10))
38 | val expected = data.sortWith(_._1 < _._1).toSeq
39 | val rdd: RDD[(ReduceKey, BytesWritable)] = sc.parallelize(data, 50).map { x =>
40 | (new ReduceKeyMapSide(new BytesWritable(Array[Byte](x._1.toByte))),
41 | new BytesWritable(Array[Byte](x._2.toByte)))
42 | }
43 | for (k <- 0 to 5) {
44 | val sortedRdd = RDDUtils.topK(rdd, k).asInstanceOf[RDD[(ReduceKeyReduceSide, Array[Byte])]]
45 | val output = sortedRdd.map { case(k, v) =>
46 | (k.byteArray(0).toInt, v(0).toInt)
47 | }.collect().toSeq
48 | assert(output.size === math.min(k, 4))
49 | assert(output === expected.take(math.min(k, 4)))
50 | }
51 | }
52 |
53 | }
54 |
--------------------------------------------------------------------------------
/src/main/scala/shark/parse/QueryBlock.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.parse
19 |
20 | import org.apache.hadoop.hive.ql.parse.{QB => HiveQueryBlock}
21 | import org.apache.hadoop.hive.ql.plan.CreateTableDesc
22 | import org.apache.hadoop.hive.ql.plan.TableDesc
23 |
24 | import shark.memstore2.CacheType
25 | import shark.memstore2.CacheType._
26 |
27 |
28 | /**
29 | * A container for flags and table metadata. Used in SharkSemanticAnalyzer while parsing
30 | * and analyzing ASTs (e.g. in SharkSemanticAnalyzer#analyzeCreateTable()).
31 | */
32 | class QueryBlock(outerID: String, alias: String, isSubQuery: Boolean)
33 | extends HiveQueryBlock(outerID, alias, isSubQuery) {
34 |
35 | // The CacheType for the table that will be created from CREATE TABLE/CTAS, or updated for an
36 | // INSERT.
37 | var cacheMode = CacheType.NONE
38 |
39 | // Descriptor for the table being updated by an INSERT.
40 | var targetTableDesc: TableDesc = _
41 |
42 | // Hive's QB uses `tableDesc` to refer to the CreateTableDesc. A direct `createTableDesc`
43 |   // makes it easier to differentiate from `targetTableDesc`.
44 | def createTableDesc: CreateTableDesc = super.getTableDesc
45 |
46 | def createTableDesc_= (desc: CreateTableDesc) = super.setTableDesc(desc)
47 | }
48 |
--------------------------------------------------------------------------------
/src/main/scala/shark/util/QueryRewriteUtils.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.util
19 |
20 | import org.apache.hadoop.hive.ql.parse.SemanticException
21 |
22 | object QueryRewriteUtils {
23 |
24 | def cacheToAlterTable(cmd: String): String = {
25 | val CACHE_TABLE_DEFAULT = "(?i)CACHE ([^ ]+)".r
26 | val CACHE_TABLE_IN = "(?i)CACHE ([^ ]+) IN ([^ ]+)".r
27 |
28 | cmd match {
29 | case CACHE_TABLE_DEFAULT(tableName) =>
30 | s"ALTER TABLE $tableName SET TBLPROPERTIES ('shark.cache' = 'memory')"
31 | case CACHE_TABLE_IN(tableName, cacheType) =>
32 | s"ALTER TABLE $tableName SET TBLPROPERTIES ('shark.cache' = '$cacheType')"
33 | case _ =>
34 | throw new SemanticException(
35 |           s"CACHE accepts a single table name: 'CACHE <table name> [IN <cache type>]'" +
36 | s" (received command: '$cmd')")
37 | }
38 | }
39 |
40 | def uncacheToAlterTable(cmd: String): String = {
41 | val cmdSplit = cmd.split(' ')
42 | if (cmdSplit.size == 2) {
43 | val tableName = cmdSplit(1)
44 | "ALTER TABLE %s SET TBLPROPERTIES ('shark.cache' = 'false')".format(tableName)
45 | } else {
46 | throw new SemanticException(
47 |         s"UNCACHE accepts a single table name: 'UNCACHE <table name>' (received command: '$cmd')")
48 | }
49 | }
50 | }
51 |
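52 | // Usage sketch (illustrative, not part of the original file; the example object name is made up).
53 | // The rewrites are plain string-to-string transformations, so they can be exercised without a
54 | // Hive session:
55 | object QueryRewriteUtilsExample {
56 |   def main(args: Array[String]) {
57 |     // -> ALTER TABLE users SET TBLPROPERTIES ('shark.cache' = 'memory')
58 |     println(QueryRewriteUtils.cacheToAlterTable("CACHE users"))
59 |     // -> ALTER TABLE users SET TBLPROPERTIES ('shark.cache' = 'tachyon')
60 |     println(QueryRewriteUtils.cacheToAlterTable("CACHE users IN tachyon"))
61 |     // -> ALTER TABLE users SET TBLPROPERTIES ('shark.cache' = 'false')
62 |     println(QueryRewriteUtils.uncacheToAlterTable("UNCACHE users"))
63 |   }
64 | }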
--------------------------------------------------------------------------------
/src/main/scala/shark/execution/serialization/HiveStructSerializer.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.hadoop.hive.serde2.binarysortable
19 |
20 | // Putting it in this package so it can access the package level visible function
21 | // static void BinarySortableSerDe.serialize(OutputByteBuffer, Object, ObjectInspector, boolean)
22 |
23 | import java.util.{List => JList}
24 |
25 | import org.apache.hadoop.hive.serde2.objectinspector.{StructField, StructObjectInspector}
26 |
27 |
28 | /**
29 | * Used to serialize a row of data. It needs to be initialized with an object inspector
30 | * for the row.
31 | */
32 | class HiveStructSerializer(val rowObjectInspector: StructObjectInspector) {
33 |
34 | def serialize(obj: Object): Array[Byte] = {
35 | outputByteBuffer.reset()
36 | var i = 0
37 | while (i < fields.size) {
38 | BinarySortableSerDe.serialize(
39 | outputByteBuffer,
40 | rowObjectInspector.getStructFieldData(obj, fields.get(i)),
41 | fields.get(i).getFieldObjectInspector(),
42 | false)
43 | i += 1
44 | }
45 | val bytes = new Array[Byte](outputByteBuffer.length)
46 | System.arraycopy(outputByteBuffer.getData(), 0, bytes, 0, outputByteBuffer.length)
47 | bytes
48 | }
49 |
50 | private val outputByteBuffer = new OutputByteBuffer
51 | private val fields: JList[_ <: StructField] = rowObjectInspector.getAllStructFieldRefs
52 | }
53 |
--------------------------------------------------------------------------------
/conf/shark-env.sh.template:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Copyright (C) 2012 The Regents of The University California.
4 | # All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | # (Required) Amount of memory used per slave node. This should be in the same
19 | # format as the JVM's -Xmx option, e.g. 300m or 1g.
20 | export SPARK_MEM=1g
21 |
22 | # (Required) Set the master program's memory
23 | export SHARK_MASTER_MEM=1g
24 |
25 | # (Optional) Specify the location of Hive's configuration directory. By default,
26 | # Shark run scripts will point it to $SHARK_HOME/conf
27 | #export HIVE_CONF_DIR=""
28 |
29 | # For running Shark in distributed mode, set the following:
30 | #export HADOOP_HOME=""
31 | #export SPARK_HOME=""
32 | #export MASTER=""
33 | # Only required if using Mesos:
34 | #export MESOS_NATIVE_LIBRARY=/usr/local/lib/libmesos.so
35 |
36 | # Only required if running Shark with Spark on YARN
37 | #export SHARK_EXEC_MODE=yarn
38 | #export SPARK_ASSEMBLY_JAR=
39 | #export SHARK_ASSEMBLY_JAR=
40 |
41 | # (Optional) Extra classpath
42 | #export SPARK_LIBRARY_PATH=""
43 |
44 | # Java options
45 | # On EC2, change the local.dir to /mnt/tmp
46 | SPARK_JAVA_OPTS=" -Dspark.local.dir=/tmp "
47 | SPARK_JAVA_OPTS+="-Dspark.kryoserializer.buffer.mb=10 "
48 | SPARK_JAVA_OPTS+="-verbose:gc -XX:-PrintGCDetails -XX:+PrintGCTimeStamps "
49 | export SPARK_JAVA_OPTS
50 |
51 | # (Optional) Tachyon Related Configuration
52 | #export TACHYON_MASTER="" # e.g. "localhost:19998"
53 | #export TACHYON_WAREHOUSE_PATH=/sharktables # Could be any valid path name
54 |
55 |
--------------------------------------------------------------------------------
/src/main/scala/shark/api/PythonTableRDD.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.api
19 |
20 | import scala.collection.JavaConversions._
21 |
22 | import net.razorvine.pickle.Pickler
23 |
24 | import org.apache.spark.api.java.JavaRDD
25 |
26 | class PythonTableRDD(
27 | tableRDD: JavaTableRDD)
28 | extends JavaRDD[Array[Byte]](tableRDD.rdd.mapPartitions(PythonTableRDD.javaRowToPythonRow)) {
29 | val schema: java.util.Map[String, Int] = tableRDD.first.colname2indexMap
30 | }
31 |
32 | /*
33 | * These static methods are to be called by Python to run SQL queries. sql2rdd runs the query and
34 |  * attempts to convert the JavaTableRDD to a Python-compatible RDD (an RDD of byte arrays
35 |  * that are pickled Python objects). We apply the pickle serializer per partition to convert the
36 |  * Java objects to Python objects, and we return the resulting PythonTableRDD to the caller
37 |  * (presumably a Python process).
38 | */
39 | object PythonTableRDD {
40 |
41 | def sql2rdd(sc: JavaSharkContext, cmd: String): PythonTableRDD = {
42 | new PythonTableRDD(sc.sql2rdd(cmd))
43 | }
44 |
45 | // Pickle a row of java objects to a row of pickled python objects (byte arrays)
46 | def javaRowToPythonRow(rows: Iterator[Row]): Iterator[Array[Byte]] = {
47 |     // Pickler is not thread-safe, so we use one per partition
48 | val pickle = new Pickler
49 | rows.map { r =>
50 | pickle.dumps(r.toSeq.toArray)
51 | }
52 | }
53 | }
54 |
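55 | // Flow sketch (illustrative, not part of the original file), tracing a query issued from Python:
56 | //
57 | //   sql2rdd(jsc, "SELECT key, value FROM src")
58 | //     -> JavaTableRDD of Row objects
59 | //     -> mapPartitions(javaRowToPythonRow): one Pickler per partition turns each Row into a
60 | //        pickled Array[Byte]
61 | //     -> PythonTableRDD handed back to the Python caller, along with `schema`, the
62 | //        column-name-to-index map taken from the first Row.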
--------------------------------------------------------------------------------
/src/test/scala/shark/UtilsSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark
19 |
20 | import java.util.{HashMap => JHashMap}
21 |
22 | import org.apache.hadoop.conf.Configuration
23 |
24 | import org.scalatest.{BeforeAndAfter, FunSuite}
25 |
26 |
27 | class UtilsSuite extends FunSuite {
28 |
29 | test("set aws credentials") {
30 | var conf = new Configuration
31 | var map = new JHashMap[String, String]()
32 | Utils.setAwsCredentials(conf, map)
33 | assert(conf.get("fs.s3n.awsAccessKeyId") === null)
34 | assert(conf.get("fs.s3n.awsSecretAccessKey") === null)
35 | assert(conf.get("fs.s3.awsAccessKeyId") === null)
36 | assert(conf.get("fs.s3.awsSecretAccessKey") === null)
37 |
38 | map.put("AWS_ACCESS_KEY_ID", "id")
39 | conf = new Configuration
40 | Utils.setAwsCredentials(conf, map)
41 | assert(conf.get("fs.s3n.awsAccessKeyId") === null)
42 | assert(conf.get("fs.s3n.awsSecretAccessKey") === null)
43 | assert(conf.get("fs.s3.awsAccessKeyId") === null)
44 | assert(conf.get("fs.s3.awsSecretAccessKey") === null)
45 |
46 | map.put("AWS_SECRET_ACCESS_KEY", "key")
47 | conf = new Configuration
48 | Utils.setAwsCredentials(conf, map)
49 | assert(conf.get("fs.s3n.awsAccessKeyId") === "id")
50 | assert(conf.get("fs.s3n.awsSecretAccessKey") === "key")
51 | assert(conf.get("fs.s3.awsAccessKeyId") === "id")
52 | assert(conf.get("fs.s3.awsSecretAccessKey") === "key")
53 | }
54 |
55 | }
56 |
--------------------------------------------------------------------------------
/src/main/scala/shark/execution/serialization/HiveConfPersistenceDelegate.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package shark.execution.serialization
18 |
19 | import java.beans.{Statement, Encoder, DefaultPersistenceDelegate}
20 | import scala.collection.JavaConversions._
21 | import org.apache.hadoop.hive.conf.HiveConf
22 | import org.apache.commons.lang.ObjectUtils
23 |
24 | class HiveConfPersistenceDelegate extends DefaultPersistenceDelegate {
25 | override protected def initialize(clazz: Class[_], oldInst: AnyRef, newInst: AnyRef, out: Encoder)
26 | {
27 | val oldConf = oldInst.asInstanceOf[HiveConf]
28 | val newConf = newInst.asInstanceOf[HiveConf]
29 |
30 | if (!ObjectUtils.equals(oldConf.getAuxJars, newConf.getAuxJars)) {
31 | out.writeStatement(new Statement(oldInst, "setAuxJars", Array(oldConf.getAuxJars)))
32 | }
33 |
34 | val oldConfProps = oldConf.getAllProperties
35 | val newConfProps = newConf.getAllProperties
36 |
37 | val propsToDelete = newConfProps.filter { case(k, v) => !oldConfProps.containsKey(k) }
38 | val propsToAdd = oldConf.getAllProperties.filter { case(k, v) =>
39 | !newConfProps.containsKey(k) || !ObjectUtils.equals(newConfProps.get(k), v)
40 | }
41 |
42 | propsToDelete.foreach { case(k, v) =>
43 | out.writeStatement(new Statement(oldInst, "unset", Array(k)))
44 | }
45 | propsToAdd.foreach { case(k, v) =>
46 | out.writeStatement(new Statement(oldInst, "set", Array(k, v)))
47 | }
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/src/main/scala/shark/execution/FilterOperator.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.execution
19 |
20 | import scala.collection.Iterator
21 | import scala.reflect.BeanProperty
22 |
23 | import org.apache.hadoop.hive.ql.exec.{ExprNodeEvaluator, ExprNodeEvaluatorFactory}
24 | import org.apache.hadoop.hive.ql.metadata.HiveException
25 | import org.apache.hadoop.hive.ql.plan.FilterDesc
26 | import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector
27 |
28 |
29 | class FilterOperator extends UnaryOperator[FilterDesc] {
30 |
31 | @transient var conditionEvaluator: ExprNodeEvaluator = _
32 | @transient var conditionInspector: PrimitiveObjectInspector = _
33 |
34 | @BeanProperty var conf: FilterDesc = _
35 |
36 | override def initializeOnMaster() {
37 | super.initializeOnMaster()
38 |
39 | conf = desc
40 | }
41 |
42 | override def initializeOnSlave() {
43 | try {
44 | conditionEvaluator = ExprNodeEvaluatorFactory.get(conf.getPredicate())
45 |
46 | conditionInspector = conditionEvaluator.initialize(objectInspector)
47 | .asInstanceOf[PrimitiveObjectInspector]
48 | } catch {
49 | case e: Throwable => throw new HiveException(e)
50 | }
51 | }
52 |
53 | override def processPartition(split: Int, iter: Iterator[_]) = {
54 | iter.filter { row =>
55 | java.lang.Boolean.TRUE.equals(
56 | conditionInspector.getPrimitiveJavaObject(conditionEvaluator.evaluate(row)))
57 | }
58 | }
59 |
60 | }
61 |
--------------------------------------------------------------------------------
/src/main/scala/shark/execution/serialization/KryoSerializationWrapper.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.execution.serialization
19 |
20 | /**
21 |  * A wrapper around otherwise unserializable objects that makes them Java
22 |  * serializable. Internally, Kryo is used for serialization.
23 | *
24 | * Use KryoSerializationWrapper(value) to create a wrapper.
25 | */
26 | class KryoSerializationWrapper[T] extends Serializable {
27 |
28 | @transient var value: T = _
29 |
30 | private var valueSerialized: Array[Byte] = _
31 |
32 | // The getter and setter for valueSerialized is used for XML serialization.
33 | def getValueSerialized(): Array[Byte] = {
34 | valueSerialized = KryoSerializer.serialize(value)
35 | valueSerialized
36 | }
37 |
38 | def setValueSerialized(bytes: Array[Byte]) = {
39 | valueSerialized = bytes
40 | value = KryoSerializer.deserialize[T](valueSerialized)
41 | }
42 |
43 | // Used for Java serialization.
44 | private def writeObject(out: java.io.ObjectOutputStream) {
45 | getValueSerialized()
46 | out.defaultWriteObject()
47 | }
48 |
49 | private def readObject(in: java.io.ObjectInputStream) {
50 | in.defaultReadObject()
51 | setValueSerialized(valueSerialized)
52 | }
53 | }
54 |
55 |
56 | object KryoSerializationWrapper {
57 | def apply[T](value: T): KryoSerializationWrapper[T] = {
58 | val wrapper = new KryoSerializationWrapper[T]
59 | wrapper.value = value
60 | wrapper
61 | }
62 | }
63 |
64 |
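65 | // Usage sketch (illustrative, not part of the original file; the example object name is made up).
66 | // Any Kryo-serializable value can be wrapped; a String stands in here for something like an
67 | // object inspector that is not Java-serializable on its own.
68 | object KryoSerializationWrapperExample {
69 |   def main(args: Array[String]) {
70 |     val wrapper = KryoSerializationWrapper("payload that only Kryo needs to understand")
71 |     // Simulate what Java/XML serialization does: move the bytes, then rebuild the value.
72 |     val copy = new KryoSerializationWrapper[String]
73 |     copy.setValueSerialized(wrapper.getValueSerialized())
74 |     println(copy.value)  // prints the original payload
75 |   }
76 | }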
--------------------------------------------------------------------------------
/src/main/scala/shark/SharkServer2.scala:
--------------------------------------------------------------------------------
1 | package shark
2 |
3 | import org.apache.commons.logging.LogFactory
4 | import org.apache.hadoop.hive.common.LogUtils
5 | import org.apache.hadoop.hive.common.LogUtils.LogInitializationException
6 | import org.apache.hadoop.hive.conf.HiveConf
7 | import org.apache.hive.service.cli.thrift.ThriftCLIService
8 | import org.apache.hive.service.server.{HiveServer2, ServerOptionsProcessor}
9 | import org.apache.spark.SparkEnv
10 | import shark.server.SharkCLIService
11 |
12 | object SharkServer2 extends LogHelper {
13 | SharkEnv.init()
14 | var sparkEnv: SparkEnv = SparkEnv.get
15 | var LOG = LogFactory.getLog(classOf[SharkServer2])
16 |
17 | def main(args: Array[String]) {
18 | try {
19 | LogUtils.initHiveLog4j()
20 | } catch {
21 | case e: LogInitializationException => {
22 | LOG.warn(e.getMessage)
23 | }
24 | }
25 | val optproc = new ServerOptionsProcessor("sharkserver2") //TODO: include load RDDs
26 |
27 | if (!optproc.process(args)) {
28 | LOG.fatal("Error starting SharkServer2 with given arguments")
29 | System.exit(-1)
30 | }
31 |
32 | Runtime.getRuntime.addShutdownHook(
33 | new Thread() {
34 | override def run() {
35 | SharkEnv.stop()
36 | }
37 | }
38 | )
39 |
40 |
41 | try {
42 | val hiveConf = new HiveConf
43 | SharkConfVars.initializeWithDefaults(hiveConf)
44 | val server = new SharkServer2
45 | server.init(hiveConf)
46 | server.start()
47 | logInfo("SharkServer2 started")
48 | } catch {
49 | case t: Throwable => {
50 | LOG.fatal("Error starting SharkServer2", t)
51 | System.exit(-1)
52 | }
53 | }
54 |   }
55 | }
56 | class SharkServer2 extends HiveServer2 {
57 | override def init(hiveConf: HiveConf) {
58 | this.synchronized {
59 | val sharkCLIService = new SharkCLIService
60 | Utils.setSuperField("cliService", sharkCLIService, this)
61 | addService(sharkCLIService)
62 | val sthriftCLIService = new ThriftCLIService(sharkCLIService)
63 | Utils.setSuperField("thriftCLIService", sthriftCLIService, this)
64 | addService(sthriftCLIService)
65 | sharkInit(hiveConf)
66 | }
67 | }
68 | }
69 |
70 |
71 |
--------------------------------------------------------------------------------
/src/main/scala/shark/LogHelper.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark
19 |
20 | import java.io.PrintStream
21 |
22 | import org.apache.commons.lang.StringUtils
23 | import org.apache.hadoop.hive.ql.session.SessionState
24 |
25 | import org.apache.spark.Logging
26 |
27 | /**
28 | * Utility trait for classes that want to log data. This wraps around Spark's
29 |  * Logging trait. It creates an SLF4J logger for the class and allows logging
30 |  * messages at different levels using methods that evaluate their parameters
31 |  * lazily, only when the corresponding log level is enabled.
32 |  *
33 |  * It differs from Spark's Logging trait in that it can also print errors
34 |  * to the console of the current Hive session.
35 | */
36 | trait LogHelper extends Logging {
37 |
38 | override def logError(msg: => String) = {
39 | errStream().println(msg)
40 | super.logError(msg)
41 | }
42 |
43 | def logError(msg: String, detail: String) = {
44 | errStream().println(msg)
45 | super.logError(msg + StringUtils.defaultString(detail))
46 | }
47 |
48 | def logError(msg: String, exception: Throwable) = {
49 | val err = errStream()
50 | err.println(msg)
51 | exception.printStackTrace(err)
52 | super.logError(msg, exception)
53 | }
54 |
55 | def outStream(): PrintStream = {
56 | val ss = SessionState.get()
57 | if (ss != null && ss.out != null) ss.out else System.out
58 | }
59 |
60 | def errStream(): PrintStream = {
61 | val ss = SessionState.get();
62 | if (ss != null && ss.err != null) ss.err else System.err
63 | }
64 |
65 |
66 | }
67 |
--------------------------------------------------------------------------------
/src/main/scala/shark/execution/TerminalOperator.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.execution
19 |
20 | import java.util.Date
21 |
22 | import scala.reflect.BeanProperty
23 |
24 | import org.apache.hadoop.hive.conf.HiveConf
25 | import org.apache.hadoop.hive.ql.exec.{FileSinkOperator => HiveFileSinkOperator}
26 | import org.apache.hadoop.hive.ql.plan.FileSinkDesc
27 |
28 |
29 | /**
30 |  * File sink operator. It can accomplish one of three things:
31 | * - write query output to disk
32 | * - cache query output
33 | * - return query as RDD directly (without materializing it)
34 | */
35 | class TerminalOperator extends UnaryOperator[FileSinkDesc] {
36 |
37 |   // Create a local copy of hconf and hiveSinkOp so we can XML-serialize them.
38 | @BeanProperty var localHiveOp: HiveFileSinkOperator = _
39 | @BeanProperty var localHconf: HiveConf = _
40 | @BeanProperty val now = new Date()
41 |
42 | override def initializeOnMaster() {
43 | super.initializeOnMaster()
44 | localHconf = super.hconf
45 | // Set parent to null so we won't serialize the entire query plan.
46 | localHiveOp.setParentOperators(null)
47 | localHiveOp.setChildOperators(null)
48 | localHiveOp.setInputObjInspectors(null)
49 | }
50 |
51 | override def initializeOnSlave() {
52 | localHiveOp.initialize(localHconf, Array(objectInspector))
53 | }
54 |
55 | override def processPartition(split: Int, iter: Iterator[_]): Iterator[_] = iter
56 | }
57 |
58 |
59 | /**
60 | * Collect the output as a TableRDD.
61 | */
62 | class TableRddSinkOperator extends TerminalOperator {}
63 |
--------------------------------------------------------------------------------
/src/main/scala/shark/memstore2/Table.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.memstore2
19 |
20 | import scala.collection.mutable.ArrayBuffer
21 |
22 | import org.apache.spark.rdd.RDD
23 |
24 | import scala.collection.mutable.Buffer
25 |
26 |
27 | /**
28 | * A container for table metadata managed by Shark and Spark. Subclasses are responsible for
29 | * how RDDs are set, stored, and accessed.
30 | *
31 | * @param databaseName Namespace for this table.
32 | * @param tableName Name of this table.
33 | * @param cacheMode Type of memory storage used for the table (e.g., the Spark block manager).
34 | */
35 | private[shark] abstract class Table(
36 | var databaseName: String,
37 | var tableName: String,
38 | var cacheMode: CacheType.CacheType) {
39 |
40 | /**
41 | * A mutable wrapper for an RDD and stats for its partitions.
42 | */
43 | class RDDValue(
44 | var rdd: RDD[TablePartition],
45 | var stats: collection.Map[Int, TablePartitionStats]) {
46 |
47 | def toTuple = (rdd, stats)
48 | }
49 | }
50 |
51 | object Table {
52 |
53 | /**
54 |    * Merges the contents of `otherStatsMap` into `targetStatsMap`.
55 | */
56 | def mergeStats(
57 | targetStatsMap: Buffer[(Int, TablePartitionStats)],
58 | otherStatsMap: Iterable[(Int, TablePartitionStats)]
59 | ): Buffer[(Int, TablePartitionStats)] = {
60 | val targetStatsMapSize = targetStatsMap.size
61 | for ((otherIndex, tableStats) <- otherStatsMap) {
62 | targetStatsMap.append((otherIndex + targetStatsMapSize, tableStats))
63 | }
64 | targetStatsMap
65 | }
66 | }
67 |
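68 | // Worked sketch (illustrative, not part of the original file): merging the stats of an RDD with
69 | // two partitions into a table that already has three re-indexes the incoming entries by the
70 | // current size of the target:
71 | //
72 | //   target: [(0, s0), (1, s1), (2, s2)]
73 | //   other:  [(0, t0), (1, t1)]
74 | //   Table.mergeStats(target, other)  // => [(0, s0), (1, s1), (2, s2), (3, t0), (4, t1)]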
--------------------------------------------------------------------------------
/src/main/scala/shark/execution/serialization/SerializableWritable.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.execution.serialization
19 |
20 | import java.io._
21 | import org.apache.hadoop.io.ObjectWritable
22 | import org.apache.hadoop.io.Writable
23 | import org.apache.hadoop.mapred.JobConf
24 | import org.apache.hadoop.io.NullWritable
25 |
26 | object SerializableWritable {
27 | val conf = new JobConf()
28 | }
29 |
30 |
31 | class SerializableWritable[T <: Writable](@transient var t: T) extends Serializable {
32 | def value = t
33 |
34 | override def toString = if(null == t) "null" else t.toString
35 |
36 | private def writeObject(out: ObjectOutputStream) {
37 | out.defaultWriteObject()
38 | new ObjectWritable(if (t == null) NullWritable.get() else t).write(out)
39 | }
40 |
41 | private def readObject(in: ObjectInputStream) {
42 | in.defaultReadObject()
43 | val ow = new ObjectWritable()
44 | ow.setConf(SerializableWritable.conf)
45 | ow.readFields(in)
46 | val s = ow.get
47 | if (s == null || s.isInstanceOf[NullWritable]) {
48 | t = null.asInstanceOf[T]
49 | } else {
50 | t = s.asInstanceOf[T]
51 | }
52 | }
53 |
54 | override def hashCode(): Int = if(t == null) 0 else t.hashCode
55 |
56 | override def equals(other: Any) = {
57 |     if (!other.isInstanceOf[SerializableWritable[_]]) {
58 | false
59 | } else {
60 | val other_t = other.asInstanceOf[SerializableWritable[_]].t
61 | if (t == null) {
62 | other_t == null
63 | } else {
64 | t.equals(other_t)
65 | }
66 | }
67 | }
68 | }
69 |
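70 | // Usage sketch (illustrative, not part of the original file): wrapping a Hadoop Writable (here a
71 | // Text) so it can travel through Java serialization, e.g. inside a task closure.
72 | //
73 | //   val wrapped = new SerializableWritable(new org.apache.hadoop.io.Text("hello"))
74 | //   // ... Java-serialize and deserialize `wrapped` ...
75 | //   wrapped.value  // a Text rebuilt via ObjectWritable on the receiving side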
--------------------------------------------------------------------------------
/src/main/scala/shark/memstore2/TablePartitionIterator.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.memstore2
19 |
20 | import java.util.BitSet
21 | import shark.memstore2.column.ColumnIterator
22 |
23 |
24 | /**
25 | * An iterator for a partition of data. Each element returns a ColumnarStruct
26 | * that can be read by a ColumnarStructObjectInspector.
27 | *
28 | * @param numRows: total number of rows in this partition.
29 | * @param columnIterators: iterators for all columns.
30 |  * @param columnUsed: an optional bitmap indicating whether a column is used.
31 | */
32 | class TablePartitionIterator(
33 | val numRows: Long,
34 | val columnIterators: Array[ColumnIterator],
35 | val columnUsed: BitSet)
36 | extends Iterator[ColumnarStruct] {
37 |
38 | def this(numRows: Long,
39 | columnIterators: Array[ColumnIterator]) {
40 | this(numRows, columnIterators, TablePartitionIterator.newBitSet(columnIterators.size))
41 | }
42 |
43 | private val _struct = new ColumnarStruct(columnIterators)
44 |
45 | private var _position: Long = 0
46 |
47 | def hasNext: Boolean = _position < numRows
48 |
49 | def next(): ColumnarStruct = {
50 | _position += 1
51 | var i = columnUsed.nextSetBit(0)
52 | while (i > -1) {
53 | columnIterators(i).next()
54 | i = columnUsed.nextSetBit(i + 1)
55 | }
56 | _struct
57 | }
58 | }
59 |
60 | object TablePartitionIterator {
61 |
62 | def newBitSet(numCols: Int): BitSet = {
63 | val b = new BitSet(numCols)
64 | var i = numCols
65 | while (i > 0) {
66 | i -= 1
67 | b.set(i, true)
68 | }
69 | b
70 | }
71 | }
72 |
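73 | // Usage sketch (illustrative, not part of the original file): when a query only touches a subset
74 | // of columns, pass a BitSet with just those bits set so the other column iterators are never
75 | // advanced.
76 | //
77 | //   val used = new java.util.BitSet(columnIterators.length)
78 | //   used.set(0)  // only column 0 is read
79 | //   val iter = new TablePartitionIterator(numRows, columnIterators, used)
80 | //   while (iter.hasNext) { val struct = iter.next() /* read column 0 from struct */ }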
--------------------------------------------------------------------------------
/bin/dev/build_test.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/src/main/scala/shark/repl/SharkILoop.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.repl
19 |
20 | import java.io.PrintWriter
21 |
22 | import org.apache.spark.{SparkContext, SparkEnv}
23 | import org.apache.spark.repl.SparkILoop
24 |
25 | import shark.{SharkContext, SharkEnv}
26 |
27 |
28 | /**
29 |  * Adds Shark-specific initializations to the Spark REPL.
30 | */
31 | class SharkILoop extends SparkILoop(None, new PrintWriter(Console.out, true), None) {
32 |
33 | override def initializeSpark() {
34 | // Note: shark.SharkEnv.initWithSharkContext must be invoked after spark.repl.Main.interp
35 |     // is set, because the executors on the slaves depend on the "spark.repl.class.uri"
36 |     // property being set in order to use Spark's ExecutorClassLoader.
37 | intp.beQuietDuring {
38 | command("""
39 | org.apache.spark.repl.Main.interp.out.println("Creating SparkContext...");
40 | org.apache.spark.repl.Main.interp.out.flush();
41 | shark.SharkEnv.initWithSharkContext("shark-shell");
42 | @transient val sparkContext = shark.SharkEnv.sc;
43 | org.apache.spark.repl.Main.interp.sparkContext = sparkContext;
44 | @transient val sc = sparkContext.asInstanceOf[shark.SharkContext];
45 | org.apache.spark.repl.Main.interp.out.println("Shark context available as sc.");
46 | import sc._;
47 | def s = sql2console _;
48 | org.apache.spark.repl.Main.interp.out.flush();
49 | """)
50 | command("import org.apache.spark.SparkContext._");
51 | }
52 | Console.println("Type in expressions to have them evaluated.")
53 | Console.println("Type :help for more information.")
54 | Console.flush()
55 | }
56 | }
57 |
58 |
--------------------------------------------------------------------------------
/src/main/scala/shark/memstore2/column/NullableColumnIterator.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.memstore2.column
19 |
20 | import java.nio.ByteBuffer
21 | import java.nio.ByteOrder
22 |
23 | /**
24 | * Reads a nullable column. Expects the byte buffer to contain the null count as its
25 | * first element, followed by the null indices, and finally the non-null values.
26 | * Reading of the non-null values is delegated to a wrapped iterator, created after
27 | * advancing the buffer position to the first non-null entry.
28 | */
29 | class NullableColumnIterator(buffer: ByteBuffer) extends ColumnIterator {
30 | private var _d: ByteBuffer = _
31 | private var _nullCount: Int = _
32 | private var _nulls = 0
33 |
34 | private var _isNull = false
35 | private var _currentNullIndex: Int = _
36 | private var _pos = 0
37 |
38 | private var _delegate: ColumnIterator = _
39 |
40 | override def init() {
41 | _d = buffer.duplicate()
42 | _d.order(ByteOrder.nativeOrder())
43 | _nullCount = _d.getInt()
44 | _currentNullIndex = if (_nullCount > 0) _d.getInt() else Integer.MAX_VALUE
45 | _pos = 0
46 |
47 | // Move the buffer position to the non-null region.
48 | buffer.position(buffer.position() + 4 + _nullCount * 4)
49 | _delegate = ColumnIterator.newNonNullIterator(buffer)
50 | }
51 |
52 | override def next() {
53 | if (_pos == _currentNullIndex) {
54 | _nulls += 1
55 | if (_nulls < _nullCount) {
56 | _currentNullIndex = _d.getInt()
57 | }
58 | _isNull = true
59 | } else {
60 | _isNull = false
61 | _delegate.next()
62 | }
63 | _pos += 1
64 | }
65 |
66 | override def hasNext: Boolean = (_nulls < _nullCount) || _delegate.hasNext
67 |
68 | def current: Object = if (_isNull) null else _delegate.current
69 | }
70 |
--------------------------------------------------------------------------------
/src/test/scala/shark/CliSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark
19 |
20 | import java.io.{BufferedReader, File, InputStreamReader, PrintWriter}
21 | import org.scalatest.{BeforeAndAfterAll, FunSuite}
22 |
23 |
24 | /**
25 | * Test the Shark CLI.
26 | */
27 | class CliSuite extends FunSuite with BeforeAndAfterAll with TestUtils {
28 |
29 | val WAREHOUSE_PATH = TestUtils.getWarehousePath("cli")
30 | val METASTORE_PATH = TestUtils.getMetastorePath("cli")
31 |
32 | override def beforeAll() {
33 | val pb = new ProcessBuilder(
34 | "./bin/shark",
35 | "-hiveconf",
36 | "javax.jdo.option.ConnectionURL=jdbc:derby:;databaseName=" + METASTORE_PATH + ";create=true",
37 | "-hiveconf",
38 | "hive.metastore.warehouse.dir=" + WAREHOUSE_PATH)
39 |
40 | process = pb.start()
41 | outputWriter = new PrintWriter(process.getOutputStream, true)
42 | inputReader = new BufferedReader(new InputStreamReader(process.getInputStream))
43 | errorReader = new BufferedReader(new InputStreamReader(process.getErrorStream))
44 | waitForOutput(inputReader, "shark>")
45 | }
46 |
47 | override def afterAll() {
48 | process.destroy()
49 | process.waitFor()
50 | }
51 |
52 | test("simple select") {
53 | val dataFilePath = TestUtils.dataFilePath + "/kv1.txt"
54 | executeQuery("create table shark_test1(key int, val string);")
55 | executeQuery("load data local inpath '" + dataFilePath+ "' overwrite into table shark_test1;")
56 | executeQuery("""create table shark_test1_cached TBLPROPERTIES ("shark.cache" = "true") as
57 | select * from shark_test1;""")
58 | val out = executeQuery("select * from shark_test1_cached where key = 407;")
59 | assert(out.contains("val_407"))
60 | }
61 |
62 | }
63 |
--------------------------------------------------------------------------------
/src/main/scala/shark/parse/SharkExplainSemanticAnalyzer.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.parse
19 |
20 | import java.io.Serializable
21 | import java.util.ArrayList
22 |
23 | import org.apache.hadoop.fs.Path
24 | import org.apache.hadoop.hive.conf.HiveConf
25 | import org.apache.hadoop.hive.ql.exec._
26 | import org.apache.hadoop.hive.ql.parse._
27 |
28 | import shark.execution.SharkExplainWork
29 |
30 |
31 | class SharkExplainSemanticAnalyzer(conf: HiveConf) extends ExplainSemanticAnalyzer(conf) {
32 |
33 | var sem: BaseSemanticAnalyzer = null
34 |
35 | /**
36 | * This is basically the same as Hive's except we invoke
37 | * SharkSemanticAnalyzerFactory. We need to do this to get
38 | * SharkSemanticAnalyzer for SELECT and CTAS queries.
39 | */
40 | override def analyzeInternal(ast: ASTNode): Unit = {
41 | ctx.setExplain(true)
42 |
43 | // Create a semantic analyzer for the query
44 | val childNode = ast.getChild(0).asInstanceOf[ASTNode]
45 | sem = SharkSemanticAnalyzerFactory.get(conf, childNode)
46 | sem.analyze(childNode, ctx)
47 |
48 | val extended = (ast.getChildCount() > 1)
49 |
50 | ctx.setResFile(new Path(ctx.getLocalTmpFileURI()))
51 | var tasks = sem.getRootTasks()
52 | val fetchTask = sem.getFetchTask()
53 | if (tasks == null) {
54 | if (fetchTask != null) {
55 | tasks = new ArrayList[Task[_ <: Serializable]]();
56 | tasks.add(fetchTask)
57 | }
58 | } else if (fetchTask != null) {
59 | tasks.add(fetchTask)
60 | }
61 |
62 | val task = TaskFactory.get(
63 | new SharkExplainWork(ctx.getResFile().toString(), tasks, childNode.toStringTree(),
64 | sem.getInputs(), extended), conf)
65 |
66 | rootTasks.add(task)
67 | }
68 | }
69 |
70 |
--------------------------------------------------------------------------------
/src/main/scala/shark/memstore2/column/NullableColumnBuilder.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.memstore2.column
19 |
20 | import java.nio.ByteBuffer
21 | import java.nio.ByteOrder
22 |
23 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector
24 |
25 |
26 | /**
27 | * Builds a nullable column. The byte buffer of a nullable column contains:
28 | * - 4 bytes for the null count (number of nulls)
29 | * - positions for each null, in ascending order
30 | * - the non-null data (column data type, compression type, data...)
31 | */
32 | trait NullableColumnBuilder[T] extends ColumnBuilder[T] {
33 |
34 | private var _nulls: ByteBuffer = _
35 |
36 | private var _pos: Int = _
37 | private var _nullCount: Int = _
38 |
39 | override def initialize(initialSize: Int, cName: String): ByteBuffer = {
40 | _nulls = ByteBuffer.allocate(1024)
41 | _nulls.order(ByteOrder.nativeOrder())
42 | _pos = 0
43 | _nullCount = 0
44 | super.initialize(initialSize, cName)
45 | }
46 |
47 | override def append(o: Object, oi: ObjectInspector) {
48 | if (o == null) {
49 | _nulls = growIfNeeded(_nulls, 4)
50 | _nulls.putInt(_pos)
51 | _nullCount += 1
52 | } else {
53 | super.append(o, oi)
54 | }
55 | _pos += 1
56 | }
57 |
58 | override def build(): ByteBuffer = {
59 | val nonNulls = super.build()
60 | val nullDataLen = _nulls.position()
61 | _nulls.limit(nullDataLen)
62 | _nulls.rewind()
63 |
64 | // 4 bytes for null count + null positions + non nulls
65 | val newBuffer = ByteBuffer.allocate(4 + nullDataLen + nonNulls.limit)
66 | newBuffer.order(ByteOrder.nativeOrder())
67 | newBuffer.putInt(_nullCount).put(_nulls).put(nonNulls)
68 | newBuffer.rewind()
69 | newBuffer
70 | }
71 | }
72 |
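A sketch of reading back the layout that build() produces, assuming `buf` is the ByteBuffer returned by build() for a column that was appended the values 1, null, 2 (so a single null at 0-based position 1):

    import java.nio.ByteOrder

    val dup = buf.duplicate()
    dup.order(ByteOrder.nativeOrder())
    val nullCount = dup.getInt()                              // 1
    val nullPositions = Array.fill(nullCount)(dup.getInt())   // Array(1)
    // dup.position() now marks the start of the non-null region, which is exactly where
    // NullableColumnIterator hands off to its non-null delegate iterator.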
--------------------------------------------------------------------------------
/src/main/scala/shark/server/SharkSQLOperation.scala:
--------------------------------------------------------------------------------
1 | package shark.server
2 |
3 | import java.util.{Map => JMap}
4 | import org.apache.hadoop.hive.ql.parse.VariableSubstitution
5 | import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse
6 | import org.apache.hive.service.cli.{HiveSQLException, OperationState, TableSchema}
7 | import org.apache.hive.service.cli.operation.SQLOperation
8 | import org.apache.hive.service.cli.session.HiveSession
9 | import shark.{SharkDriver, Utils}
10 |
11 | class SharkSQLOperation(
12 | parentSession: HiveSession,
13 | statement: String,
14 | confOverlay: JMap[String, String])
15 | extends SQLOperation(parentSession, statement, confOverlay) {
16 |
17 | private val sdriver = {
18 | val d = new SharkDriver(getParentSession.getHiveConf)
19 | d.init()
20 | d
21 | }
22 |
23 | override def run() {
24 | setState(OperationState.RUNNING)
25 | Utils.setSuperField("driver", sdriver, this)
26 | var response: Option[CommandProcessorResponse] = None
27 | sdriver.setTryCount(Integer.MAX_VALUE) //maybe useless?
28 | var subStatement = ""
29 | try {
30 | //duplicate: this is also done when Driver compiles command
31 | subStatement = new VariableSubstitution().substitute(getParentSession.getHiveConf, statement)
32 | } catch {
33 | case e: IllegalStateException => {
34 | setState(OperationState.ERROR)
35 | throw new HiveSQLException
36 | }
37 | }
38 |
39 | response = Option(sdriver.run(subStatement))
40 | response match {
41 | case Some(resp: CommandProcessorResponse) => {
42 | val code = resp.getResponseCode
43 | if (code != 0) {
44 | setState(OperationState.ERROR)
45 | throw new HiveSQLException("Error while processing statement: "
46 | + resp.getErrorMessage, resp.getSQLState, code)
47 | }
48 | }
49 | case None => {
50 | setState(OperationState.ERROR)
51 | throw new HiveSQLException
52 | }
53 | }
54 |
55 | val mResultSchema = sdriver.getSchema
56 | Utils.setSuperField("mResultSchema", mResultSchema, this)
57 | if (mResultSchema != null && mResultSchema.isSetFieldSchemas) {
58 | val resultSchema = new TableSchema(mResultSchema)
59 | Utils.setSuperField("resultSchema", resultSchema, this)
60 | setHasResultSet(true)
61 | } else {
62 | setHasResultSet(false)
63 | }
64 | setState(OperationState.FINISHED)
65 | }
66 |
67 | }
68 |
--------------------------------------------------------------------------------
/src/main/scala/shark/memstore2/TablePartitionBuilder.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.memstore2
19 |
20 | import java.io.{DataInput, DataOutput}
21 |
22 | import scala.collection.JavaConversions._
23 |
24 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector
25 | import org.apache.hadoop.hive.serde2.objectinspector.StructField
26 | import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector
27 | import org.apache.hadoop.io.Writable
28 |
29 | import shark.memstore2.column.ColumnBuilder
30 |
31 |
32 | /**
33 | * Used to build a TablePartition. This is used in the serializer to convert a
34 | * partition of data into columnar format and to generate a TablePartition.
35 | */
36 | class TablePartitionBuilder(
37 | oi: StructObjectInspector,
38 | initialColumnSize: Int,
39 | shouldCompress: Boolean = true)
40 | extends Writable {
41 |
42 | private var numRows: Long = 0
43 | val fields: Seq[_ <: StructField] = oi.getAllStructFieldRefs
44 |
45 | val columnBuilders = Array.tabulate[ColumnBuilder[_]](fields.size) { i =>
46 | val columnBuilder = ColumnBuilder.create(fields(i), shouldCompress)
47 | columnBuilder.initialize(initialColumnSize, fields(i).getFieldName)
48 | columnBuilder
49 | }
50 |
51 | def incrementRowCount() {
52 | numRows += 1
53 | }
54 |
55 | def append(columnIndex: Int, o: Object, oi: ObjectInspector) {
56 | columnBuilders(columnIndex).append(o, oi)
57 | }
58 |
59 | def stats: TablePartitionStats = new TablePartitionStats(columnBuilders.map(_.stats), numRows)
60 |
61 | def build(): TablePartition = new TablePartition(numRows, columnBuilders.map(_.build()))
62 |
63 | // We don't use these, but want to maintain Writable interface for SerDe
64 | override def write(out: DataOutput) {}
65 | override def readFields(in: DataInput) {}
66 | }
67 |
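A minimal sketch of the intended call pattern, assuming `structOI` is a StructObjectInspector for (int, string) rows (for example one built with ObjectInspectorFactory.getStandardStructObjectInspector, as in HiveStructSerializerSuite below) and `rows` is a Seq[Seq[Object]] of matching values:

    val builder = new TablePartitionBuilder(structOI, initialColumnSize = 1024, shouldCompress = false)
    val fieldRefs = structOI.getAllStructFieldRefs

    for (row <- rows) {
      var i = 0
      while (i < row.size) {
        builder.append(i, row(i), fieldRefs.get(i).getFieldObjectInspector)
        i += 1
      }
      builder.incrementRowCount()
    }

    val partition = builder.build()   // TablePartition backed by one ByteBuffer per column
    val partStats = builder.stats     // per-column stats plus the row count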
--------------------------------------------------------------------------------
/src/main/scala/shark/execution/serialization/HiveStructDeserializer.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.hadoop.hive.serde2.binarysortable
19 |
20 | // Putting it in this package so it can access the package-level visible function
21 | // static Object BinarySortableSerDe.deserialize(InputByteBuffer, TypeInfo, boolean, Object)
22 |
23 | import java.io.IOException
24 | import java.util.{ArrayList => JArrayList}
25 |
26 | import org.apache.hadoop.hive.serde2.SerDeException
27 | import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector
28 | import org.apache.hadoop.hive.serde2.typeinfo.{TypeInfo, TypeInfoUtils}
29 |
30 |
31 | /**
32 | * Used to deserialize a row of data. It needs to be initialized with an object inspector
33 | * for the row.
34 | */
35 | class HiveStructDeserializer(val rowObjectInspector: StructObjectInspector) {
36 |
37 | def deserialize(bytes: Array[Byte]): JArrayList[Object] = {
38 | inputByteBuffer.reset(bytes, 0, bytes.length)
39 | try {
40 | var i = 0
41 | while (i < types.size) {
42 | reusedRow.set(i,
43 | BinarySortableSerDe.deserialize(inputByteBuffer, types(i), false, reusedRow.get(i)))
44 | i += 1
45 | }
46 | } catch{
47 | case e: IOException => throw new SerDeException(e)
48 | }
49 | reusedRow
50 | }
51 |
52 | private val inputByteBuffer = new InputByteBuffer
53 | private val types = Array.tabulate[TypeInfo](rowObjectInspector.getAllStructFieldRefs.size) { i =>
54 | TypeInfoUtils.getTypeInfoFromObjectInspector(
55 | rowObjectInspector.getAllStructFieldRefs.get(i).getFieldObjectInspector)
56 | }
57 |
58 | private val reusedRow: JArrayList[Object] = {
59 | val row = new JArrayList[Object](rowObjectInspector.getAllStructFieldRefs.size())
60 | (0 until rowObjectInspector.getAllStructFieldRefs.size).foreach(i => row.add(null))
61 | row
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/src/main/scala/shark/optimizer/SharkMapJoinProcessor.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.optimizer
19 |
20 | import java.util.{LinkedHashMap => JavaLinkedHashMap}
21 |
22 | import org.apache.hadoop.hive.ql.exec.{MapJoinOperator, JoinOperator, Operator}
23 | import org.apache.hadoop.hive.ql.optimizer.MapJoinProcessor
24 | import org.apache.hadoop.hive.ql.parse.{ParseContext, QBJoinTree, OpParseContext}
25 | import org.apache.hadoop.hive.ql.plan.OperatorDesc
26 | import org.apache.hadoop.hive.conf.HiveConf
27 |
28 | class SharkMapJoinProcessor extends MapJoinProcessor {
29 |
30 | /**
31 | * Override generateMapJoinOperator to bypass the step of validating Map Join hints in Hive.
32 | */
33 | override def generateMapJoinOperator(
34 | pctx: ParseContext,
35 | op: JoinOperator,
36 | joinTree: QBJoinTree,
37 | mapJoinPos: Int): MapJoinOperator = {
38 | val hiveConf: HiveConf = pctx.getConf
39 | val noCheckOuterJoin: Boolean =
40 | HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTSORTMERGEBUCKETMAPJOIN) &&
41 | HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTBUCKETMAPJOIN)
42 |
43 | val opParseCtxMap: JavaLinkedHashMap[Operator[_ <: OperatorDesc], OpParseContext] =
44 | pctx.getOpParseCtx
45 |
46 | // Explicitly set validateMapJoinTree to false to bypass the step of validating
47 | // Map Join hints in Hive.
48 | val validateMapJoinTree = false
49 | val mapJoinOp: MapJoinOperator =
50 | MapJoinProcessor.convertMapJoin(
51 | opParseCtxMap, op, joinTree, mapJoinPos, noCheckOuterJoin, validateMapJoinTree)
52 |
53 | // Hive originally uses genSelectPlan to insert a dummy select after the MapJoinOperator.
54 | // We should not need this step.
55 | // create a dummy select to select all columns
56 | // MapJoinProcessor.genSelectPlan(pctx, mapJoinOp)
57 |
58 | return mapJoinOp
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/src/main/scala/shark/execution/SelectOperator.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.execution
19 |
20 | import scala.collection.JavaConversions._
21 | import scala.reflect.BeanProperty
22 |
23 | import org.apache.hadoop.hive.ql.exec.{ExprNodeEvaluator, ExprNodeEvaluatorFactory}
24 | import org.apache.hadoop.hive.ql.plan.SelectDesc
25 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector
26 |
27 |
28 | /**
29 | * An operator that does projection, i.e. selecting certain columns and
30 | * filtering out others.
31 | */
32 | class SelectOperator extends UnaryOperator[SelectDesc] {
33 |
34 | @BeanProperty var conf: SelectDesc = _
35 |
36 | @transient var evals: Array[ExprNodeEvaluator] = _
37 |
38 | override def initializeOnMaster() {
39 | super.initializeOnMaster()
40 | conf = desc
41 | initializeEvals(false)
42 | }
43 |
44 | def initializeEvals(initializeEval: Boolean) {
45 | if (!conf.isSelStarNoCompute) {
46 | evals = conf.getColList().map(ExprNodeEvaluatorFactory.get(_)).toArray
47 | if (initializeEval) {
48 | evals.foreach(_.initialize(objectInspector))
49 | }
50 | }
51 | }
52 |
53 | override def initializeOnSlave() {
54 | initializeEvals(true)
55 | }
56 |
57 | override def processPartition(split: Int, iter: Iterator[_]) = {
58 | if (conf.isSelStarNoCompute) {
59 | iter
60 | } else {
61 | val reusedRow = new Array[Object](evals.length)
62 | iter.map { row =>
63 | var i = 0
64 | while (i < evals.length) {
65 | reusedRow(i) = evals(i).evaluate(row)
66 | i += 1
67 | }
68 | reusedRow
69 | }
70 | }
71 | }
72 |
73 | override def outputObjectInspector(): ObjectInspector = {
74 | if (conf.isSelStarNoCompute()) {
75 | super.outputObjectInspector()
76 | } else {
77 | initEvaluatorsAndReturnStruct(evals, conf.getOutputColumnNames(), objectInspector)
78 | }
79 | }
80 | }
81 |
--------------------------------------------------------------------------------
/src/main/scala/shark/memstore2/CacheType.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.memstore2
19 |
20 | import shark.LogHelper
21 |
22 |
23 | /*
24 | * Enumerations and static helper functions for caches supported by Shark.
25 | */
26 | object CacheType extends Enumeration with LogHelper {
27 |
28 | /*
29 | * The CacheTypes:
30 | * - MEMORY: Stored in memory and on disk (i.e., cache is write-through). Persistent across Shark
31 | * sessions. By default, all such tables are reloaded into memory on restart.
32 | * - MEMORY_ONLY: Stored only in memory and dropped at the end of each Shark session.
33 | * - OFFHEAP: Stored in an off-heap data storage format, specified by the System property
34 | * 'shark.offheap.clientFactory'. Defaults to TachyonStorageClientFactory.
35 | * - NONE: Stored on disk (e.g., HDFS) and managed by Hive.
36 | */
37 | type CacheType = Value
38 | val MEMORY, MEMORY_ONLY, OFFHEAP, NONE = Value
39 |
40 | def shouldCache(c: CacheType): Boolean = (c != NONE)
41 |
42 | /** Get the cache type object from a string representation. */
43 | def fromString(name: String): CacheType = Option(name).map(_.toUpperCase) match {
44 | case None | Some("") | Some("FALSE") => NONE
45 | case Some("TRUE") => MEMORY
46 | case Some("HEAP") =>
47 | logWarning("The 'HEAP' cache type name is deprecated. Use 'MEMORY' instead.")
48 | MEMORY
49 | case Some("TACHYON") =>
50 | logWarning("The 'TACHYON' cache type name is deprecated. Use 'OFFHEAP' instead.")
51 | OFFHEAP
52 | case _ => {
53 | try {
54 | // Try to use Scala's Enumeration::withName() to interpret 'name'.
55 | withName(name.toUpperCase)
56 | } catch {
57 | case e: java.util.NoSuchElementException => throw new InvalidCacheTypeException(name)
58 | }
59 | }
60 | }
61 |
62 | class InvalidCacheTypeException(name: String)
63 | extends Exception("Invalid string representation of cache type: '%s'".format(name))
64 | }
65 |
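A few illustrative calls against fromString, following the cases above:

    import shark.memstore2.CacheType._

    assert(fromString(null) == NONE)             // unset property defaults to NONE
    assert(fromString("true") == MEMORY)         // legacy "true" maps to write-through MEMORY
    assert(fromString("MEMORY_ONLY") == MEMORY_ONLY)
    assert(fromString("tachyon") == OFFHEAP)     // deprecated alias; logs a warning
    // fromString("bogus") throws InvalidCacheTypeException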
--------------------------------------------------------------------------------
/src/test/scala/shark/execution/HiveStructSerializerSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.execution
19 |
20 | import java.util.{ArrayList => JArrayList}
21 |
22 | import scala.collection.JavaConversions._
23 |
24 | import org.apache.hadoop.hive.serde2.binarysortable.{HiveStructSerializer, HiveStructDeserializer}
25 | import org.apache.hadoop.hive.serde2.objectinspector.{PrimitiveObjectInspector,
26 | ObjectInspectorFactory, StandardListObjectInspector, StandardMapObjectInspector,
27 | StructObjectInspector}
28 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.{PrimitiveObjectInspectorUtils,
29 | PrimitiveObjectInspectorFactory}
30 | import org.apache.hadoop.io.{IntWritable, LongWritable, Text}
31 |
32 | import org.scalatest.FunSuite
33 |
34 |
35 | class HiveStructSerializerSuite extends FunSuite {
36 |
37 | test("Testing serializing a simple row") {
38 | val row1 = createRow(1, "test1")
39 | val row2 = createRow(2, "test2")
40 | val ser = new HiveStructSerializer(createObjectInspector)
41 | val deser = new HiveStructDeserializer(createObjectInspector)
42 | val deserRow1 = deser.deserialize(ser.serialize(row1))
43 | assert(row1.get(0).equals(deserRow1.get(0)))
44 | assert(row1.get(1).equals(deserRow1.get(1)))
45 | }
46 |
47 | def createObjectInspector(): StructObjectInspector = {
48 | val names = List("a", "b")
49 | val ois = List(
50 | createPrimitiveOi(classOf[java.lang.Integer]),
51 | createPrimitiveOi(classOf[String]))
52 | ObjectInspectorFactory.getStandardStructObjectInspector(names, ois)
53 | }
54 |
55 | def createRow(v1: Int, v2: String): JArrayList[Object] = {
56 | val row = new JArrayList[Object](2)
57 | row.add(new IntWritable(v1))
58 | row.add(new Text(v2))
59 | row
60 | }
61 |
62 | def createPrimitiveOi(javaClass: Class[_]): PrimitiveObjectInspector =
63 | PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(
64 | PrimitiveObjectInspectorUtils.getTypeEntryFromPrimitiveJavaClass(javaClass).primitiveCategory)
65 | }
66 |
--------------------------------------------------------------------------------
/src/main/scala/shark/execution/serialization/OperatorSerializationWrapper.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.execution.serialization
19 |
20 | import shark.execution.HiveDesc
21 | import shark.execution.Operator
22 |
23 |
24 | /**
25 | * A wrapper around our operators so they can be serialized by standard Java
26 | * serialization. This really just delegates the serialization of the operators
27 | * to XML, and that of object inspectors to Kryo.
28 | *
29 | * Use OperatorSerializationWrapper(operator) to create a wrapper.
30 | */
31 | class OperatorSerializationWrapper[T <: Operator[_ <: HiveDesc]]
32 | extends Serializable with shark.LogHelper {
33 |
34 | /** The operator we are going to serialize. */
35 | @transient var _value: T = _
36 |
37 | /** The operator serialized by the XMLEncoder, minus the object inspectors. */
38 | var opSerialized: Array[Byte] = _
39 |
40 | /** The object inspectors, serialized by Kryo. */
41 | var objectInspectorsSerialized: Array[Byte] = _
42 |
43 | def value: T = {
44 | if (_value == null) {
45 | assert(opSerialized != null)
46 | assert(opSerialized.length > 0)
47 | assert(objectInspectorsSerialized != null)
48 | assert(objectInspectorsSerialized.length > 0)
49 | _value = XmlSerializer.deserialize[T](opSerialized)
50 | _value.objectInspectors = KryoSerializer.deserialize(objectInspectorsSerialized)
51 | }
52 | _value
53 | }
54 |
55 | def value_= (v: T):Unit = {
56 | _value = v
57 | opSerialized = XmlSerializer.serialize(value, v.hconf)
58 | objectInspectorsSerialized = KryoSerializer.serialize(value.objectInspectors)
59 | }
60 |
61 | override def toString(): String = {
62 | if (value != null) {
63 | "OperatorSerializationWrapper[ " + value.toString() + " ]"
64 | } else {
65 | super.toString()
66 | }
67 | }
68 | }
69 |
70 |
71 | object OperatorSerializationWrapper {
72 | def apply[T <: Operator[_ <: HiveDesc]](value: T): OperatorSerializationWrapper[T] = {
73 | val wrapper = new OperatorSerializationWrapper[T]
74 | wrapper.value = value
75 | wrapper
76 | }
77 | }
78 |
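A sketch of the intended round trip, assuming `op` is an initialized shark.execution.Operator whose hconf and objectInspectors are already populated:

    // Driver side: XML-encode the operator and Kryo-encode its object inspectors.
    val wrapper = OperatorSerializationWrapper(op)

    // The wrapper is java.io.Serializable, so it can travel inside a Spark closure.
    // Executor side: the first access to .value lazily deserializes both pieces.
    val restored = wrapper.value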
--------------------------------------------------------------------------------
/src/main/scala/shark/memstore2/TableRecovery.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.memstore2
19 |
20 | import java.util.{HashMap => JavaHashMap}
21 |
22 | import scala.collection.JavaConversions.asScalaBuffer
23 |
24 | import org.apache.hadoop.hive.ql.metadata.Hive
25 | import org.apache.hadoop.hive.ql.session.SessionState
26 |
27 | import shark.{LogHelper, SharkEnv}
28 | import shark.util.QueryRewriteUtils
29 |
30 | /**
31 | * Singleton used to reload RDDs upon server restarts.
32 | */
33 | object TableRecovery extends LogHelper {
34 |
35 | val db = Hive.get()
36 |
37 | /**
38 | * Loads into memory any cached tables whose `shark.cache` property is MEMORY.
39 | * @param cmdRunner The runner that is responsible for taking a cached table query and
40 | * a) Creating the table metadata in Hive Meta Store
41 | * b) Loading the table as an RDD in memory
42 | * @see SharkServer for an example usage.
43 | * @param console Optional SessionState.LogHelper used, if present, to log information about
44 | *        the tables that get reloaded.
45 | */
46 | def reloadRdds(cmdRunner: String => Unit, console: Option[SessionState.LogHelper] = None) {
47 | // Filter for tables that should be reloaded into the cache.
48 | val currentDbName = db.getCurrentDatabase()
49 | for (databaseName <- db.getAllDatabases(); tableName <- db.getAllTables(databaseName)) {
50 | val hiveTable = db.getTable(databaseName, tableName)
51 | val tblProps = hiveTable.getParameters
52 | val cacheMode = CacheType.fromString(tblProps.get(SharkTblProperties.CACHE_FLAG.varname))
53 | if (cacheMode == CacheType.MEMORY) {
54 | val logMessage = "Reloading %s.%s into memory.".format(databaseName, tableName)
55 | if (console.isDefined) {
56 | console.get.printInfo(logMessage)
57 | } else {
58 | logInfo(logMessage)
59 | }
60 | val cmd = QueryRewriteUtils.cacheToAlterTable("CACHE %s".format(tableName))
61 | cmdRunner(s"use $databaseName")
62 | cmdRunner(cmd)
63 | }
64 | }
65 | db.setCurrentDatabase(currentDbName)
66 | }
67 | }
68 |
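A sketch of invoking the recovery path at server startup, assuming `sharkContext.sql` is whatever command runner the server already uses to execute a HiveQL string:

    // Reload every table whose shark.cache property is MEMORY.
    def runSql(cmd: String): Unit = { sharkContext.sql(cmd) }

    TableRecovery.reloadRdds(runSql)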
--------------------------------------------------------------------------------
/src/main/scala/shark/api/TableRDD.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.api
19 |
20 | import java.util.{List => JList}
21 |
22 | import org.apache.hadoop.hive.metastore.api.FieldSchema
23 | import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspector, StructObjectInspector}
24 |
25 | import shark.execution.serialization.KryoSerializer
26 |
27 | import org.apache.spark.{Partition, TaskContext}
28 | import org.apache.spark.rdd.RDD
29 |
30 |
31 | class TableRDD(
32 | prev: RDD[Any],
33 | val schema: Array[ColumnDesc],
34 | @transient oi: ObjectInspector,
35 | val limit: Int = -1)
36 | extends RDD[Row](prev) {
37 |
38 | private[shark]
39 | def this(prev: RDD[Any], schema: JList[FieldSchema], oi: ObjectInspector, limit: Int) {
40 | this(prev, ColumnDesc.createSchema(schema), oi, limit)
41 | }
42 |
43 | override def getPartitions = firstParent[Any].partitions
44 |
45 | override def compute(split: Partition, context: TaskContext): Iterator[Row] = {
46 | val structOi = initObjectInspector()
47 | firstParent[Any].iterator(split, context).map { rowData =>
48 | new Row(rowData, colname2indexMap, structOi)
49 | }
50 | }
51 |
52 | /**
53 | * ObjectInspector is not Java serializable. We serialize it using Kryo and
54 | * save it as a byte array. On slave nodes, we deserialize this byte
55 | * array to obtain the ObjectInspector object.
56 | */
57 | private val serializedObjectInspector: Array[Byte] = KryoSerializer.serialize(oi)
58 |
59 | /**
60 | * Maps the column name to column index.
61 | */
62 | private val colname2indexMap: Map[String, Int] =
63 | collection.immutable.Map() ++ schema.zipWithIndex.map { case(column, index) =>
64 | (column.name, index)
65 | }
66 |
67 | /**
68 | * Initialize object inspector from the serializedObjectInspector.
69 | */
70 | private def initObjectInspector(): StructObjectInspector = {
71 | val oi = KryoSerializer.deserialize[ObjectInspector](serializedObjectInspector)
72 | oi match {
73 | case soi: StructObjectInspector => soi
74 | case _ => throw new Exception("Only basic StructObjectInspector is supported.")
75 | }
76 | }
77 | }
78 |
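The Kryo round trip used above for the ObjectInspector, as a standalone sketch (`oi` is any ObjectInspector):

    val bytes: Array[Byte] = KryoSerializer.serialize(oi)
    val restored = KryoSerializer.deserialize[ObjectInspector](bytes)
    // compute() expects a StructObjectInspector; anything else raises an exception
    // in initObjectInspector().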
--------------------------------------------------------------------------------
/src/main/scala/shark/optimizer/SharkOptimizer.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.optimizer
19 |
20 | import java.util.{List => JavaList}
21 |
22 | import org.apache.hadoop.hive.ql.optimizer.JoinReorder
23 | import org.apache.hadoop.hive.ql.optimizer.{Optimizer => HiveOptimizer,
24 | SimpleFetchOptimizer, Transform, MapJoinProcessor => HiveMapJoinProcessor}
25 | import org.apache.hadoop.hive.ql.parse.ParseContext
26 | import shark.LogHelper
27 |
28 | class SharkOptimizer extends HiveOptimizer with LogHelper {
29 |
30 | /**
31 | * Override Hive optimizer to skip SimpleFetchOptimizer, which is designed
32 | * to let Hive avoid launching MR jobs on simple queries, but rewrites the
33 | * query plan in a way that is inconvenient for Shark (replaces the FS operator
34 | * with a non-terminal ListSink operator).
35 | */
36 | override def optimize(): ParseContext = {
37 |
38 | // Use reflection to make some private members accessible.
39 | val transformationsField = classOf[HiveOptimizer].getDeclaredField("transformations")
40 | val pctxField = classOf[HiveOptimizer].getDeclaredField("pctx")
41 | pctxField.setAccessible(true)
42 | transformationsField.setAccessible(true)
43 | val transformations = transformationsField.get(this).asInstanceOf[JavaList[Transform]]
44 | var pctx = pctxField.get(this).asInstanceOf[ParseContext]
45 |
46 | // Invoke each optimizer transformation
47 | val it = transformations.iterator
48 | while (it.hasNext()) {
49 | val transformation = it.next()
50 | transformation match {
51 | case _: SimpleFetchOptimizer => {}
52 | case _: JoinReorder => {}
53 | case _: HiveMapJoinProcessor => {
54 | // Use SharkMapJoinProcessor to bypass the step of validating Map Join hints
55 | // in Hive. So, we can use hints to mark tables that will be considered as small
56 | // tables (like Hive 0.9).
57 | val sharkMapJoinProcessor = new SharkMapJoinProcessor
58 | pctx = sharkMapJoinProcessor.transform(pctx)
59 | }
60 | case _ => {
61 | pctx = transformation.transform(pctx)
62 | }
63 | }
64 | }
65 | pctx
66 | }
67 | }
68 |
--------------------------------------------------------------------------------
/src/main/scala/shark/api/JavaTableRDD.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.api
19 |
20 | import scala.reflect.ClassTag
21 |
22 | import org.apache.spark.api.java.function.{Function => JFunction}
23 | import org.apache.spark.api.java.JavaRDDLike
24 | import org.apache.spark.rdd.RDD
25 | import org.apache.spark.storage.StorageLevel
26 |
27 |
28 | class JavaTableRDD(val rdd: RDD[Row], val schema: Array[ColumnDesc])
29 | extends JavaRDDLike[Row, JavaTableRDD] {
30 |
31 | override def wrapRDD(rdd: RDD[Row]): JavaTableRDD = new JavaTableRDD(rdd, schema)
32 |
33 | // Common RDD functions
34 | override val classTag: ClassTag[Row] = implicitly[ClassTag[Row]]
35 |
36 | // This shouldn't be necessary, but we seem to need this to get first() to return Row
37 | // instead of Object; possibly a compiler bug?
38 | override def first(): Row = rdd.first()
39 |
40 | /** Persist this RDD with the default storage level (`MEMORY_ONLY`). */
41 | def cache(): JavaTableRDD = wrapRDD(rdd.cache())
42 |
43 | /**
44 | * Set this RDD's storage level to persist its values across operations after the first time
45 | * it is computed. Can only be called once on each RDD.
46 | */
47 | def persist(newLevel: StorageLevel): JavaTableRDD = wrapRDD(rdd.persist(newLevel))
48 |
49 | // Transformations (return a new RDD)
50 |
51 | // Note: we didn't implement distinct() because equals() and hashCode() are not defined for Row.
52 |
53 | /**
54 | * Return a new RDD containing only the elements that satisfy a predicate.
55 | */
56 | def filter(f: JFunction[Row, java.lang.Boolean]): JavaTableRDD =
57 | wrapRDD(rdd.filter((x => f(x).booleanValue())))
58 |
59 | /**
60 | * Return a sampled subset of this RDD.
61 | */
62 | def sample(withReplacement: Boolean, fraction: Double, seed: Int): JavaTableRDD =
63 | wrapRDD(rdd.sample(withReplacement, fraction, seed))
64 |
65 | /**
66 | * Return the union of this RDD and another one. Any identical elements will appear multiple
67 | * times (use `.distinct()` to eliminate them).
68 | *
69 | * Note: the `schema` of a union is this RDD's schema.
70 | */
71 | def union(other: JavaTableRDD): JavaTableRDD = wrapRDD(rdd.union(other.rdd))
72 |
73 | }
74 |
75 |
76 |
--------------------------------------------------------------------------------
/src/main/scala/shark/memstore2/column/ColumnIterators.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.memstore2.column
19 |
20 | import java.nio.ByteBuffer
21 | import org.apache.hadoop.hive.serde2.`lazy`.LazyObject
22 | import org.apache.hadoop.hive.serde2.`lazy`.LazyFactory
23 | import org.apache.hadoop.hive.serde2.`lazy`.ByteArrayRef
24 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector
25 |
26 | import shark.execution.serialization.KryoSerializer
27 |
28 |
29 | class IntColumnIterator(buffer: ByteBuffer) extends DefaultColumnIterator(buffer, INT)
30 |
31 | class FloatColumnIterator(buffer: ByteBuffer) extends DefaultColumnIterator(buffer, FLOAT)
32 |
33 | class LongColumnIterator(buffer: ByteBuffer) extends DefaultColumnIterator(buffer, LONG)
34 |
35 | class DoubleColumnIterator(buffer: ByteBuffer) extends DefaultColumnIterator(buffer, DOUBLE)
36 |
37 | class BooleanColumnIterator(buffer: ByteBuffer) extends DefaultColumnIterator(buffer, BOOLEAN)
38 |
39 | class ByteColumnIterator(buffer: ByteBuffer) extends DefaultColumnIterator(buffer, BYTE)
40 |
41 | class ShortColumnIterator(buffer: ByteBuffer) extends DefaultColumnIterator(buffer, SHORT)
42 |
43 | class NullColumnIterator(buffer: ByteBuffer) extends DefaultColumnIterator(buffer, VOID)
44 |
45 | class TimestampColumnIterator(buffer: ByteBuffer) extends DefaultColumnIterator(buffer, TIMESTAMP)
46 |
47 | class BinaryColumnIterator(buffer: ByteBuffer) extends DefaultColumnIterator(buffer, BINARY)
48 |
49 | class StringColumnIterator(buffer: ByteBuffer) extends DefaultColumnIterator(buffer, STRING)
50 |
51 | class GenericColumnIterator(buffer: ByteBuffer) extends DefaultColumnIterator(buffer, GENERIC) {
52 |
53 | private var _obj: LazyObject[_] = _
54 |
55 | override def init() {
56 | super.init()
57 | val oiSize = buffer.getInt()
58 | val oiSerialized = new Array[Byte](oiSize)
59 | buffer.get(oiSerialized, 0, oiSize)
60 | val oi = KryoSerializer.deserialize[ObjectInspector](oiSerialized)
61 | _obj = LazyFactory.createLazyObject(oi)
62 | }
63 |
64 | override def current = {
65 | val v = super.current.asInstanceOf[ByteArrayRef]
66 | _obj.init(v, 0, v.getData().length)
67 | _obj
68 | }
69 | }
70 |
71 | class VoidColumnIterator(buffer: ByteBuffer) extends DefaultColumnIterator(buffer, VOID)
72 |
--------------------------------------------------------------------------------
/src/main/scala/shark/memstore2/column/ColumnBuilders.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.memstore2.column
19 |
20 | import java.nio.ByteBuffer
21 | import java.sql.Timestamp
22 |
23 | import org.apache.hadoop.hive.serde2.ByteStream
24 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector
25 | import org.apache.hadoop.io.BytesWritable
26 | import org.apache.hadoop.io.Text
27 |
28 |
29 | import shark.execution.serialization.KryoSerializer
30 | import shark.memstore2.column.ColumnStats._
31 |
32 |
33 | class BooleanColumnBuilder extends DefaultColumnBuilder[Boolean](new BooleanColumnStats(), BOOLEAN)
34 |
35 | class IntColumnBuilder extends DefaultColumnBuilder[Int](new IntColumnStats(), INT)
36 |
37 | class LongColumnBuilder extends DefaultColumnBuilder[Long](new LongColumnStats(), LONG)
38 |
39 | class FloatColumnBuilder extends DefaultColumnBuilder[Float](new FloatColumnStats(), FLOAT)
40 |
41 | class DoubleColumnBuilder extends DefaultColumnBuilder[Double](new DoubleColumnStats(), DOUBLE)
42 |
43 | class StringColumnBuilder extends DefaultColumnBuilder[Text](new StringColumnStats(), STRING)
44 |
45 | class ByteColumnBuilder extends DefaultColumnBuilder[Byte](new ByteColumnStats(), BYTE)
46 |
47 | class ShortColumnBuilder extends DefaultColumnBuilder[Short](new ShortColumnStats(), SHORT)
48 |
49 | class TimestampColumnBuilder
50 | extends DefaultColumnBuilder[Timestamp](new TimestampColumnStats(), TIMESTAMP)
51 |
52 | class BinaryColumnBuilder extends DefaultColumnBuilder[BytesWritable](new NoOpStats(), BINARY)
53 |
54 | class VoidColumnBuilder extends DefaultColumnBuilder[Void](new NoOpStats(), VOID)
55 |
56 | /**
57 | * Generic columns that we can serialize, including maps, structs, and other complex types.
58 | */
59 | class GenericColumnBuilder(oi: ObjectInspector)
60 | extends DefaultColumnBuilder[ByteStream.Output](new NoOpStats(), GENERIC) {
61 |
62 | // Complex data types cannot be null. Override the initialize in NullableColumnBuilder.
63 | override def initialize(initialSize: Int, columnName: String): ByteBuffer = {
64 | val buffer = super.initialize(initialSize, columnName)
65 | val objectInspectorSerialized = KryoSerializer.serialize(oi)
66 | buffer.putInt(objectInspectorSerialized.size)
67 | buffer.put(objectInspectorSerialized)
68 | buffer
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
/bin/dev/test:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright (C) 2012 The Regents of The University California.
4 | # All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | get_abs_path() {
19 | local PARENT_DIR=$(dirname "$1")
20 | cd "$PARENT_DIR"
21 | local ABS_PATH="$(pwd)"/"$(basename $1)"
22 | cd - >/dev/null
23 | echo $ABS_PATH
24 | }
25 |
26 | CURRENTFILE=`get_abs_path $0`
27 | BINDIR="`dirname $CURRENTFILE`"
28 | FWDIR="`dirname $BINDIR`/.."
29 |
30 | # Load environment variables from conf/shark-env.sh, if it exists
31 | if [ -e $FWDIR/conf/shark-env.sh ] ; then
32 | . $FWDIR/conf/shark-env.sh
33 | fi
34 |
35 | # Hive related section.
36 | if [ -z $HIVE_DEV_HOME ] ; then
37 | echo "No HIVE_DEV_HOME specified. Please set HIVE_DEV_HOME"
38 | exit 1
39 | fi
40 |
41 | # Hive related section.
42 | if [ -z $HADOOP_HOME ] ; then
43 | echo "No HADOOP_HOME specified. Please set HADOOP_HOME"
44 | exit 1
45 | fi
46 |
47 | if [ -n "$TEST_FILE" ] ; then
48 | TEST_FILE=`get_abs_path $TEST_FILE`
49 | export TEST_FILE
50 | fi
51 |
52 |
53 | SPARK_CLASSPATH+=":${HIVE_DEV_HOME}/build/ql/test/classes"
54 | SPARK_CLASSPATH+=":${HIVE_DEV_HOME}/data/conf"
55 | export SPARK_CLASSPATH
56 |
57 | BUILD_PATH=$HIVE_DEV_HOME/build/ql
58 |
59 | # Set variables used by unit tests (ex. create_like.q).
60 | TEST_JAVA_OPTS="-Dbuild.dir=${HIVE_DEV_HOME}/build/ql "
61 | TEST_JAVA_OPTS+="-Dbuild.dir.hive=${HIVE_DEV_HOME}/build "
62 | TEST_JAVA_OPTS+="-Dbuild.ivy.lib.dir=${HIVE_DEV_HOME}/build/ivy/lib "
63 | TEST_JAVA_OPTS+="-Dderby.version=10.4.2.0 "
64 | TEST_JAVA_OPTS+="-Dlog4j.configuration=file://${HIVE_DEV_HOME}/data/conf/hive-log4j.properties "
65 | TEST_JAVA_OPTS+="-Dtest.log.dir=${BUILD_PATH}/test/logs "
66 | TEST_JAVA_OPTS+="-Dtest.output.overwrite=false "
67 | TEST_JAVA_OPTS+="-Dtest.src.data.dir=${HIVE_DEV_HOME}/data "
68 | TEST_JAVA_OPTS+="-Dtest.tmp.dir=${BUILD_PATH}/tmp "
69 | TEST_JAVA_OPTS+="-Dtest.warehouse.dir=${BUILD_PATH}/test/data/warehouse "
70 | #TEST_JAVA_OPTS+="-Duser.dir=${HIVE_DEV_HOME}/ql "
71 |
72 | export TEST_JAVA_OPTS
73 |
74 | # Set the current directory to hive/ql since lots of tests use relative path.
75 | cd ${HIVE_DEV_HOME}/ql
76 |
77 | if [ "$TEST_WITH_ANT" == "1" ] ; then
78 | export CLASSPATH
79 | export RUNNER="ant -noclasspath -nouserlib -f $FWDIR/bin/dev/build_test.xml test"
80 | exec $FWDIR/run "$@"
81 | else
82 | export SHARK_LAUNCH_WITH_JAVA=1
83 | exec $FWDIR/run junit.textui.TestRunner shark.TestSharkCliDriver "$@"
84 | fi
85 |
--------------------------------------------------------------------------------
/src/main/resources/tablerdd/rddtable_generator.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | from string import Template
3 | import sys
4 | from generator_utils import *
5 |
6 | ## This script generates RDDTable.scala
7 |
8 | p = sys.stdout
9 |
10 | # e.g. createList(1,3, "T[", "]", ",") gives T[1],T[2],T[3]
11 | def createList(start, stop, prefix, suffix="", sep = ",", newlineAfter = 70, indent = 0):
12 | res = ""
13 | oneLine = res
14 | for y in range(start,stop+1):
15 | res += prefix + str(y) + suffix
16 | oneLine += prefix + str(y) + suffix
17 | if y != stop:
18 | res += sep
19 | oneLine += sep
20 | if len(oneLine) > newlineAfter:
21 | res += "\n" + " "*indent
22 | oneLine = ""
23 | return res
24 |
25 | ### The SparkContext declaration
26 |
27 | prefix = """
28 | /*
29 | * Copyright (C) 2012 The Regents of The University California.
30 | * All rights reserved.
31 | *
32 | * Licensed under the Apache License, Version 2.0 (the "License");
33 | * you may not use this file except in compliance with the License.
34 | * You may obtain a copy of the License at
35 | *
36 | * http://www.apache.org/licenses/LICENSE-2.0
37 | *
38 | * Unless required by applicable law or agreed to in writing, software
39 | * distributed under the License is distributed on an "AS IS" BASIS,
40 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
41 | * See the License for the specific language governing permissions and
42 | * limitations under the License.
43 | */
44 |
45 | package shark.api
46 |
47 | // *** This file is auto-generated from rddtable_generator.py ***
48 |
49 | import scala.language.implicitConversions
50 | import scala.reflect.ClassTag
51 | import org.apache.spark.rdd.RDD
52 |
53 | object RDDTableImplicits {
54 | private type C[T] = ClassTag[T]
55 |
56 | """
57 |
58 | p.write(prefix)
59 |
60 | for x in range(2,23):
61 |
62 | tableClass = Template(
63 | """
64 | implicit def rddToTable$num[$tmlist]
65 | (rdd: RDD[($tlist)]): RDDTableFunctions = RDDTable(rdd)
66 |
67 | """).substitute(num = x, tmlist = createList(1, x, "T", ": C", ", ", indent=4), tlist = createList(1, x, "T", "", ", ", indent=4))
68 | p.write(tableClass)
69 |
70 | prefix = """
71 | }
72 |
73 | object RDDTable {
74 |
75 | private type C[T] = ClassTag[T]
76 | private def ct[T](implicit c: ClassTag[T]) = c
77 | """
78 |
79 | p.write(prefix)
80 |
81 | for x in range(2,23):
82 |
83 | tableClass = Template(
84 | """
85 | def apply[$tmlist]
86 | (rdd: RDD[($tlist)]) = {
87 | val classTag = implicitly[ClassTag[Seq[Any]]]
88 | val rddSeq: RDD[Seq[_]] = rdd.map(t => t.productIterator.toList.asInstanceOf[Seq[Any]])(classTag)
89 | new RDDTableFunctions(rddSeq, Seq($mtlist))
90 | }
91 |
92 | """).substitute(tmlist = createList(1, x, "T", ": C", ", ", indent=4), tlist = createList(1, x, "T", "", ", ", indent=4),
93 | mtlist = createList(1, x, "ct[T", "]", ", ", indent=4))
94 | p.write(tableClass)
95 |
96 |
97 | p.write("}\n")
98 |
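For reference, the code this script emits for arity 2 looks like the following (reconstructed by substituting x = 2 into the two templates above; the snippets live inside the generated RDDTableImplicits and RDDTable objects respectively):

    implicit def rddToTable2[T1: C, T2: C]
      (rdd: RDD[(T1, T2)]): RDDTableFunctions = RDDTable(rdd)

    def apply[T1: C, T2: C]
      (rdd: RDD[(T1, T2)]) = {
      val classTag = implicitly[ClassTag[Seq[Any]]]
      val rddSeq: RDD[Seq[_]] = rdd.map(t => t.productIterator.toList.asInstanceOf[Seq[Any]])(classTag)
      new RDDTableFunctions(rddSeq, Seq(ct[T1], ct[T2]))
    }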
--------------------------------------------------------------------------------
/src/main/scala/shark/memstore2/SharkTblProperties.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.memstore2
19 |
20 | import java.util.{Map => JavaMap}
21 |
22 |
23 | /**
24 |  * Collection of static fields and helpers for the table properties (i.e., from a
25 |  * CREATE TABLE ... TBLPROPERTIES( ... ) clause) used by Shark.
26 | */
27 | object SharkTblProperties {
28 |
29 | case class TableProperty(varname: String, defaultVal: String)
30 |
31 | // Class name of the default cache policy used to manage partition evictions for cached,
32 | // Hive-partitioned tables.
33 | val CACHE_POLICY = new TableProperty("shark.cache.policy", "shark.memstore2.CacheAllPolicy")
34 |
35 | // Maximum size - in terms of the number of objects - of the cache specified by the
36 | // "shark.cache.partition.cachePolicy" property above.
37 | val MAX_PARTITION_CACHE_SIZE = new TableProperty("shark.cache.policy.maxSize", "10")
38 |
39 | // Default value for the "shark.cache" table property
40 | val CACHE_FLAG = new TableProperty("shark.cache", "true")
41 |
42 | // Whether we are currently in the process of caching the table (meaning it cannot be accessed).
43 | val CACHE_IN_PROGRESS_FLAG = new TableProperty("shark.cache.inProgress", "false")
44 |
45 | def getOrSetDefault(tblProps: JavaMap[String, String], variable: TableProperty): String = {
46 | if (!tblProps.containsKey(variable.varname)) {
47 | tblProps.put(variable.varname, variable.defaultVal)
48 | }
49 | tblProps.get(variable.varname)
50 | }
51 |
52 | /**
53 |    * Sets the default Shark table properties (e.g., "shark.cache") in `tblProps` and returns
54 |    * the updated map.
55 | */
56 | def initializeWithDefaults(
57 | tblProps: JavaMap[String, String],
58 | isPartitioned: Boolean = false): JavaMap[String, String] = {
59 | tblProps.put(CACHE_FLAG.varname, CACHE_FLAG.defaultVal)
60 | tblProps.put(CACHE_IN_PROGRESS_FLAG.varname, CACHE_IN_PROGRESS_FLAG.defaultVal)
61 | if (isPartitioned) {
62 | tblProps.put(CACHE_POLICY.varname, CACHE_POLICY.defaultVal)
63 | }
64 | tblProps
65 | }
66 |
67 | def removeSharkProperties(tblProps: JavaMap[String, String]) {
68 | tblProps.remove(CACHE_FLAG.varname)
69 | tblProps.remove(CACHE_IN_PROGRESS_FLAG.varname)
70 | tblProps.remove(CACHE_POLICY.varname)
71 | tblProps.remove(MAX_PARTITION_CACHE_SIZE.varname)
72 | }
73 | }
74 |
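A minimal usage sketch of the helpers above, assuming a plain java.util.HashMap stands in for a table's TBLPROPERTIES map (the map and values below are illustrative, not taken from the sources):

    import java.util.{HashMap => JHashMap}
    import shark.memstore2.SharkTblProperties

    val tblProps = new JHashMap[String, String]()

    // Populate the Shark defaults for a Hive-partitioned table.
    SharkTblProperties.initializeWithDefaults(tblProps, isPartitioned = true)

    // Read a property, recording and returning its default when absent.
    val policy = SharkTblProperties.getOrSetDefault(tblProps, SharkTblProperties.CACHE_POLICY)
    // policy == "shark.memstore2.CacheAllPolicy"

    // Strip the Shark-specific keys, e.g. before handing the map back to Hive.
    SharkTblProperties.removeSharkProperties(tblProps)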
--------------------------------------------------------------------------------
/src/main/resources/tablerdd/TableRDDGenerated_generator.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | from string import Template
3 | import sys
4 | from generator_utils import *
5 |
6 | ## This script generates TableRDDGenerated.scala
7 |
8 | p = sys.stdout
9 |
10 | p.write(
11 | """
12 | /*
13 | * Copyright (C) 2013 The Regents of The University California.
14 | * All rights reserved.
15 | *
16 | * Licensed under the Apache License, Version 2.0 (the "License");
17 | * you may not use this file except in compliance with the License.
18 | * You may obtain a copy of the License at
19 | *
20 | * http://www.apache.org/licenses/LICENSE-2.0
21 | *
22 | * Unless required by applicable law or agreed to in writing, software
23 | * distributed under the License is distributed on an "AS IS" BASIS,
24 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 | * See the License for the specific language governing permissions and
26 | * limitations under the License.
27 | */
28 |
29 |
30 |
31 | package shark.api
32 |
33 | // *** This file is auto-generated from TableRDDGenerated_generator.py ***
34 | import scala.language.implicitConversions
35 | import org.apache.spark.rdd.RDD
36 | import org.apache.spark.{TaskContext, Partition}
37 |
38 | import scala.reflect.ClassTag
39 |
40 | class TableSeqRDD(prev: TableRDD)
41 | extends RDD[Seq[Any]](prev) {
42 |
43 | def getSchema = prev.schema
44 |
45 | override def getPartitions = prev.getPartitions
46 |
47 | override def compute(split: Partition, context: TaskContext): Iterator[Seq[Any]] = {
48 | prev.compute(split, context).map( row =>
49 | (0 until prev.schema.size).map(i => row.getPrimitive(i)) )
50 | }
51 | }
52 |
53 | """)
54 |
55 | for x in range(1,23):
56 |
57 | inner = ""
58 | for y in range(1,x+1):
59 | if y % 3 == 1: inner += " "
60 | inner += Template(" row.getPrimitiveGeneric[T$num1]($num2)").substitute(num1=y, num2=y-1)
61 | if y != x: inner += ","
62 | if y % 3 == 0: inner += "\n"
63 | inner += " ) )\n"
64 |
65 | tableClass = Template(
66 | """
67 | class TableRDD$num[$list](prev: TableRDD,
68 | tags: Seq[ClassTag[_]])
69 | extends RDD[Tuple$num[$list]](prev) {
70 | def schema = prev.schema
71 |
72 | private val tableCols = schema.size
73 | require(tableCols == $num, "Table only has " + tableCols + " columns, expecting $num")
74 |
75 | tags.zipWithIndex.foreach{ case (m, i) => if (DataTypes.fromClassTag(m) != schema(i).dataType)
76 | throw new IllegalArgumentException(
77 | "Type mismatch on column " + (i + 1) + ", expected " + DataTypes.fromClassTag(m) + " got " + schema(i).dataType) }
78 |
79 | override def getPartitions = prev.getPartitions
80 |
81 | override def compute(split: Partition, context: TaskContext):
82 | Iterator[Tuple$num[$list]] = {
83 | prev.compute(split, context).map( row =>
84 | new Tuple$num[$list](
85 | $innerfatlist
86 | }
87 | }
88 | """).substitute(num = x, list = createList(1, x, "T", "", ", ", indent=4), innerfatlist = inner)
89 |
90 |
91 | p.write(tableClass)
92 |
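For reference, this is roughly what the template above produces for the two-column case; the generated TableRDDGenerated.scala contains TableRDD1 through TableRDD22 built the same way. An approximate excerpt (the type-check against `tags` is omitted here):

    // Approximate arity-2 output of TableRDDGenerated_generator.py (excerpt).
    class TableRDD2[T1, T2](prev: TableRDD, tags: Seq[ClassTag[_]])
      extends RDD[Tuple2[T1, T2]](prev) {
      def schema = prev.schema

      private val tableCols = schema.size
      require(tableCols == 2, "Table only has " + tableCols + " columns, expecting 2")

      override def getPartitions = prev.getPartitions

      override def compute(split: Partition, context: TaskContext): Iterator[Tuple2[T1, T2]] = {
        prev.compute(split, context).map( row =>
          new Tuple2[T1, T2](
            row.getPrimitiveGeneric[T1](0), row.getPrimitiveGeneric[T2](1) ) )
      }
    }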
--------------------------------------------------------------------------------
/src/main/scala/shark/memstore2/MemoryTable.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.memstore2
19 |
20 | import org.apache.spark.rdd.RDD
21 |
22 | import scala.collection.mutable.{Buffer, HashMap}
23 |
24 | import shark.execution.RDDUtils
25 |
26 |
27 | /**
28 | * A metadata container for a table in Shark that's backed by an RDD.
29 | */
30 | private[shark] class MemoryTable(
31 | databaseName: String,
32 | tableName: String,
33 | cacheMode: CacheType.CacheType)
34 | extends Table(databaseName, tableName, cacheMode) {
35 |
36 | private var _rddValueOpt: Option[RDDValue] = None
37 |
38 | /**
39 |    * Sets the RDD and stats fields in `_rddValueOpt`. Used for INSERT/LOAD OVERWRITE.
40 | * @param newRDD The table's data.
41 | * @param newStats Stats for each TablePartition in `newRDD`.
42 | * @return The previous (RDD, stats) pair for this table.
43 | */
44 | def put(
45 | newRDD: RDD[TablePartition],
46 | newStats: collection.Map[Int, TablePartitionStats] = new HashMap[Int, TablePartitionStats]()
47 | ): Option[(RDD[TablePartition], collection.Map[Int, TablePartitionStats])] = {
48 | val prevRDDAndStatsOpt = _rddValueOpt.map(_.toTuple)
49 | if (_rddValueOpt.isDefined) {
50 | _rddValueOpt.foreach { rddValue =>
51 | rddValue.rdd = newRDD
52 | rddValue.stats = newStats
53 | }
54 | } else {
55 | _rddValueOpt = Some(new RDDValue(newRDD, newStats))
56 | }
57 | prevRDDAndStatsOpt
58 | }
59 |
60 | /**
61 | * Used for append operations, such as INSERT and LOAD INTO.
62 | *
63 | * @param newRDD Data to append to the table.
64 | * @param newStats Stats for each TablePartition in `newRDD`.
65 | * @return The previous (RDD, stats) pair for this table.
66 | */
67 | def update(
68 | newRDD: RDD[TablePartition],
69 | newStats: Buffer[(Int, TablePartitionStats)]
70 | ): Option[(RDD[TablePartition], collection.Map[Int, TablePartitionStats])] = {
71 | val prevRDDAndStatsOpt = _rddValueOpt.map(_.toTuple)
72 | if (_rddValueOpt.isDefined) {
73 | val (prevRDD, prevStats) = (prevRDDAndStatsOpt.get._1, prevRDDAndStatsOpt.get._2)
74 | val updatedRDDValue = _rddValueOpt.get
75 | updatedRDDValue.rdd = RDDUtils.unionAndFlatten(newRDD, prevRDD)
76 | updatedRDDValue.stats = Table.mergeStats(newStats, prevStats).toMap
77 | } else {
78 | put(newRDD, newStats.toMap)
79 | }
80 | prevRDDAndStatsOpt
81 | }
82 |
83 | def getRDD = _rddValueOpt.map(_.rdd)
84 |
85 | def getStats = _rddValueOpt.map(_.stats)
86 |
87 | }
88 |
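An illustrative sketch of the put/update contract above. MemoryTable is private[shark], so this would only compile inside the shark.memstore2 package, and the method, RDDs, and names below are hypothetical:

    import scala.collection.mutable.ArrayBuffer
    import org.apache.spark.rdd.RDD

    def refreshTable(
        table: MemoryTable,
        overwriteData: RDD[TablePartition],
        appendData: RDD[TablePartition],
        appendStats: ArrayBuffer[(Int, TablePartitionStats)]): Unit = {
      // INSERT/LOAD OVERWRITE semantics: replace the table's RDD and stats wholesale.
      table.put(overwriteData)

      // INSERT/LOAD INTO semantics: union with the existing RDD and merge the stats.
      table.update(appendData, appendStats)

      // Both accessors are None until the first put() or update().
      val rddOpt: Option[RDD[TablePartition]] = table.getRDD
      val statsOpt = table.getStats
    }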
--------------------------------------------------------------------------------
/src/main/scala/shark/KryoRegistrator.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark
19 |
20 | import java.io.{DataInputStream, DataOutputStream}
21 | import java.util.Arrays
22 | import com.esotericsoftware.kryo.{Kryo, Serializer => KSerializer}
23 | import com.esotericsoftware.kryo.io.{Input => KryoInput, Output => KryoOutput}
24 | import com.esotericsoftware.kryo.serializers.{JavaSerializer => KryoJavaSerializer}
25 | import org.apache.hadoop.io.Writable
26 | import org.apache.hadoop.hive.ql.exec.persistence.{MapJoinSingleKey, MapJoinObjectKey,
27 | MapJoinDoubleKeys, MapJoinObjectValue}
28 | import org.apache.spark.serializer.{KryoRegistrator => SparkKryoRegistrator}
29 | import shark.execution.serialization.SerializableWritable
30 |
31 |
32 | class KryoRegistrator extends SparkKryoRegistrator {
33 | def registerClasses(kryo: Kryo) {
34 |
35 | kryo.register(classOf[execution.ReduceKey])
36 |
37 | // The map join data structures are Java serializable.
38 | kryo.register(classOf[MapJoinSingleKey], new KryoJavaSerializer)
39 | kryo.register(classOf[MapJoinObjectKey], new KryoJavaSerializer)
40 | kryo.register(classOf[MapJoinDoubleKeys], new KryoJavaSerializer)
41 | kryo.register(classOf[MapJoinObjectValue], new KryoJavaSerializer)
42 |
43 | kryo.register(classOf[SerializableWritable[_]], new KryoSWSerializer)
44 |
45 | // As far as I (rxin) know, among all Hadoop writables only TimestampWritable
46 | // cannot be serialized by Kryo out of the box.
47 | kryo.register(classOf[org.apache.hadoop.hive.serde2.io.TimestampWritable],
48 | new KryoWritableSerializer[org.apache.hadoop.hive.serde2.io.TimestampWritable])
49 | }
50 | }
51 |
52 | class KryoSWSerializer[T <: Writable] extends KSerializer[SerializableWritable[T]] {
53 | def write(kryo : Kryo, out : KryoOutput, obj : SerializableWritable[T]) {
54 | kryo.writeClassAndObject(out, obj.t); out.flush;
55 | }
56 | def read(kryo : Kryo, in : KryoInput, cls : Class[SerializableWritable[T]]) : SerializableWritable[T] = {
57 | new SerializableWritable(
58 | kryo.readClassAndObject(in).asInstanceOf[T]
59 | )
60 | }
61 | }
62 |
63 | /** A Kryo serializer for Hadoop writables. */
64 | class KryoWritableSerializer[T <: Writable] extends KSerializer[T] {
65 | override def write(kryo: Kryo, output: KryoOutput, writable: T) {
66 |     val outputStream = new DataOutputStream(output)
67 |     writable.write(outputStream)
68 | }
69 |
70 | override def read(kryo: Kryo, input: KryoInput, cls: java.lang.Class[T]): T = {
71 | val writable = cls.newInstance()
72 | val inputStream = new DataInputStream(input)
73 | writable.readFields(inputStream)
74 | writable
75 | }
76 | }
77 |
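A sketch of how the registrator above is typically wired in, using Spark's standard Kryo configuration keys (the keys belong to Spark itself, not to this repository):

    import org.apache.spark.SparkConf

    val conf = new SparkConf()
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .set("spark.kryo.registrator", "shark.KryoRegistrator")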
--------------------------------------------------------------------------------
/src/tachyon_enabled/scala/shark/tachyon/TachyonOffHeapTableWriter.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.tachyon
19 |
20 | import java.nio.ByteBuffer
21 |
22 | import scala.reflect.BeanProperty
23 |
24 | import shark.{LogHelper, SharkConfVars}
25 | import shark.execution.serialization.JavaSerializer
26 | import shark.memstore2.{OffHeapStorageClient, OffHeapTableWriter, TablePartitionStats}
27 |
28 | import tachyon.client.WriteType
29 | import tachyon.master.MasterInfo
30 | import tachyon.util.CommonUtils
31 |
32 | class TachyonOffHeapTableWriter(@transient path: String, @transient numColumns: Int)
33 | extends OffHeapTableWriter with LogHelper {
34 |
35 | // Re-instantiated upon deserialization, the first time it's referenced.
36 | @transient lazy val tfs = OffHeapStorageClient.client.asInstanceOf[TachyonStorageClient].tfs
37 | val TEMP = "_temperary"
38 | var rawTableId: Int = -1
39 |
40 | override def createTable() {
41 | val metadata = ByteBuffer.allocate(0)
42 | rawTableId = tfs.createRawTable(path, numColumns, metadata)
43 | }
44 |
45 | override def setStats(indexToStats: collection.Map[Int, TablePartitionStats]) {
46 | val buffer = ByteBuffer.wrap(JavaSerializer.serialize(indexToStats))
47 | tfs.updateRawTableMetadata(rawTableId, buffer)
48 | }
49 |
50 | // rawTable is a lazy val so it gets created the first time it is referenced.
51 | // This is only used on worker nodes.
52 | @transient lazy val rawTable = tfs.getRawTable(rawTableId)
53 |
54 | override def writePartitionColumn(part: Int, column: Int, data: ByteBuffer, tempDir: String) {
55 | val tmpPath = CommonUtils.concat(rawTable.getPath(), TEMP)
56 | val fid = tfs.createFile(CommonUtils.concat(tmpPath, tempDir, column + "", part + ""))
57 | val file = tfs.getFile(fid)
58 | val writeType: WriteType = WriteType.valueOf(
59 | SharkConfVars.getVar(localHconf, SharkConfVars.TACHYON_WRITER_WRITETYPE))
60 | val outStream = file.getOutStream(writeType)
61 | outStream.write(data.array(), 0, data.limit())
62 | outStream.close()
63 | }
64 |
65 | override def commitPartition(part: Int, numColumns: Int, tempDir: String) {
66 | val tmpPath = CommonUtils.concat(rawTable.getPath(), TEMP)
67 | (0 until numColumns).reverse.foreach { column =>
68 | val srcPath = CommonUtils.concat(tmpPath, tempDir, column + "", part + "")
69 | val destPath = CommonUtils.concat(rawTable.getPath(), MasterInfo.COL, column + "", part + "")
70 | tfs.rename(srcPath, destPath)
71 | }
72 | tfs.delete(CommonUtils.concat(tmpPath, tempDir), true)
73 | }
74 |
75 | override def cleanTmpPath() {
76 | val tmpPath = CommonUtils.concat(rawTable.getPath(), TEMP)
77 | tfs.delete(tmpPath, true)
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
/src/main/scala/shark/execution/serialization/XmlSerializer.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.execution.serialization
19 |
20 | import java.beans.{XMLDecoder, XMLEncoder}
21 | import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
22 |
23 | import com.ning.compress.lzf.{LZFEncoder, LZFDecoder}
24 |
25 | import org.apache.hadoop.conf.Configuration
26 | import org.apache.hadoop.hive.conf.HiveConf
27 | import org.apache.hadoop.hive.ql.exec.Utilities.EnumDelegate
28 | import org.apache.hadoop.hive.ql.plan.GroupByDesc
29 | import org.apache.hadoop.hive.ql.plan.PlanUtils.ExpressionTypes
30 |
31 | import shark.SharkConfVars
32 |
33 |
34 | /**
35 | * Java object serialization using XML encoder/decoder. Avoid using this to
36 | * serialize byte arrays because it is extremely inefficient.
37 | */
38 | object XmlSerializer {
39 | // We prepend the buffer with a byte indicating whether payload is compressed
40 | val COMPRESSION_ENABLED: Byte = 1
41 | val COMPRESSION_DISABLED: Byte = 0
42 |
43 | def serialize[T](o: T, conf: Configuration): Array[Byte] = {
44 | val byteStream = new ByteArrayOutputStream()
45 | val e = new XMLEncoder(byteStream)
46 | // workaround for java 1.5
47 | e.setPersistenceDelegate(classOf[ExpressionTypes], new EnumDelegate())
48 | e.setPersistenceDelegate(classOf[GroupByDesc.Mode], new EnumDelegate())
49 | // workaround for HiveConf-not-a-javabean
50 | e.setPersistenceDelegate(classOf[HiveConf], new HiveConfPersistenceDelegate )
51 | e.writeObject(o)
52 | e.close()
53 |
54 | val useCompression = conf match {
55 | case null => SharkConfVars.COMPRESS_QUERY_PLAN.defaultBoolVal
56 | case _ => SharkConfVars.getBoolVar(conf, SharkConfVars.COMPRESS_QUERY_PLAN)
57 | }
58 |
59 | if (useCompression) {
60 | COMPRESSION_ENABLED +: LZFEncoder.encode(byteStream.toByteArray())
61 | } else {
62 | COMPRESSION_DISABLED +: byteStream.toByteArray
63 | }
64 | }
65 |
66 | def deserialize[T](bytes: Array[Byte]): T = {
67 | val cl = Thread.currentThread.getContextClassLoader
68 | val decodedStream =
69 | if (bytes(0) == COMPRESSION_ENABLED) {
70 | new ByteArrayInputStream(LZFDecoder.decode(bytes.slice(1, bytes.size)))
71 | } else {
72 | new ByteArrayInputStream(bytes.slice(1, bytes.size))
73 | }
74 |
75 |     // Decoding can occasionally create an object inspector, so the decode is
76 |     // synchronized to avoid races.
77 | val ret = {
78 | val d: XMLDecoder = new XMLDecoder(decodedStream, null, null, cl)
79 | classOf[XMLDecoder].synchronized {
80 | val ret = d.readObject()
81 | d.close()
82 | ret
83 | }
84 | }
85 | ret.asInstanceOf[T]
86 | }
87 | }
88 |
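A round-trip sketch of the API above. XMLEncoder handles JavaBeans and standard Java collections, so a java.util.ArrayList is used for illustration; passing null for the Configuration falls back to the default compression setting:

    import java.util.{ArrayList => JArrayList}
    import shark.execution.serialization.XmlSerializer

    val plan = new JArrayList[String]()
    plan.add("stage-1")

    val bytes = XmlSerializer.serialize(plan, null)   // first byte flags whether LZF compression was used
    val restored = XmlSerializer.deserialize[JArrayList[String]](bytes)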
--------------------------------------------------------------------------------
/src/main/scala/shark/execution/UDTFOperator.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.execution
19 |
20 | import java.util.{List => JavaList}
21 |
22 | import scala.collection.mutable.ArrayBuffer
23 | import scala.collection.JavaConversions._
24 | import scala.reflect.BeanProperty
25 |
26 | import org.apache.hadoop.hive.ql.plan.UDTFDesc
27 | import org.apache.hadoop.hive.ql.udf.generic.Collector
28 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector
29 | import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector
30 | import org.apache.hadoop.hive.serde2.objectinspector.StructField
31 |
32 |
33 | class UDTFOperator extends UnaryOperator[UDTFDesc] {
34 |
35 | @BeanProperty var conf: UDTFDesc = _
36 |
37 | @transient var objToSendToUDTF: Array[java.lang.Object] = _
38 | @transient var soi: StandardStructObjectInspector = _
39 | @transient var inputFields: JavaList[_ <: StructField] = _
40 | @transient var collector: UDTFCollector = _
41 | @transient var outputObjInspector: ObjectInspector = _
42 |
43 | override def initializeOnMaster() {
44 | super.initializeOnMaster()
45 |
46 | conf = desc
47 |
48 | initializeOnSlave()
49 | }
50 |
51 | override def initializeOnSlave() {
52 | collector = new UDTFCollector
53 | conf.getGenericUDTF().setCollector(collector)
54 |
55 |     // Make an array of object inspectors for the arguments to the UDTF.
56 | soi = objectInspectors.head.asInstanceOf[StandardStructObjectInspector]
57 | inputFields = soi.getAllStructFieldRefs()
58 |
59 | val udtfInputOIs = inputFields.map { case inputField =>
60 | inputField.getFieldObjectInspector()
61 | }.toArray
62 |
63 | objToSendToUDTF = new Array[java.lang.Object](inputFields.size)
64 | outputObjInspector = conf.getGenericUDTF().initialize(udtfInputOIs)
65 | }
66 |
67 | override def outputObjectInspector() = outputObjInspector
68 |
69 | override def processPartition(split: Int, iter: Iterator[_]): Iterator[_] = {
70 | iter.flatMap { row =>
71 | explode(row)
72 | }
73 | }
74 |
75 | def explode[T](row: T): ArrayBuffer[java.lang.Object] = {
76 | (0 until inputFields.size).foreach { case i =>
77 | objToSendToUDTF(i) = soi.getStructFieldData(row, inputFields.get(i))
78 | }
79 | conf.getGenericUDTF().process(objToSendToUDTF)
80 | collector.collectRows()
81 | }
82 | }
83 |
84 | class UDTFCollector extends Collector {
85 |
86 | var collected = new ArrayBuffer[java.lang.Object]
87 |
88 | override def collect(input: java.lang.Object) {
89 | // We need to clone the input here because implementations of
90 | // GenericUDTF reuse the same object. Luckily they are always an array, so
91 | // it is easy to clone.
92 | collected += input.asInstanceOf[Array[_]].clone
93 | }
94 |
95 | def collectRows() = {
96 | val toCollect = collected
97 | collected = new ArrayBuffer[java.lang.Object]
98 | toCollect
99 | }
100 |
101 | }
102 |
--------------------------------------------------------------------------------
/src/main/scala/shark/api/RDDTableFunctions.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.api
19 |
20 | import scala.collection.mutable.ArrayBuffer
21 | import scala.reflect.ClassTag
22 |
23 | import org.apache.hadoop.hive.ql.metadata.Hive
24 |
25 | import org.apache.spark.rdd.RDD
26 |
27 | import shark.{SharkContext, SharkEnv}
28 | import shark.memstore2.{CacheType, TablePartitionStats, TablePartition, TablePartitionBuilder}
29 | import shark.util.HiveUtils
30 |
31 |
32 | class RDDTableFunctions(self: RDD[Seq[_]], classTags: Seq[ClassTag[_]]) {
33 |
34 | def saveAsTable(tableName: String, fields: Seq[String]): Boolean = {
35 | require(fields.size == this.classTags.size,
36 | "Number of column names != number of fields in the RDD.")
37 |
38 | // Get a local copy of the classTags so we don't need to serialize this object.
39 | val classTags = this.classTags
40 |
41 | val statsAcc = SharkEnv.sc.accumulableCollection(ArrayBuffer[(Int, TablePartitionStats)]())
42 |
43 | // Create the RDD object.
44 | val rdd = self.mapPartitionsWithIndex { case(partitionIndex, iter) =>
45 | val ois = classTags.map(HiveUtils.getJavaPrimitiveObjectInspector)
46 | val builder = new TablePartitionBuilder(
47 | HiveUtils.makeStandardStructObjectInspector(fields, ois),
48 | 1000000,
49 | shouldCompress = false)
50 |
51 | for (p <- iter) {
52 | builder.incrementRowCount()
53 | // TODO: this is not the most efficient code to do the insertion ...
54 | p.zipWithIndex.foreach { case (v, i) =>
55 | builder.append(i, v.asInstanceOf[Object], ois(i))
56 | }
57 | }
58 |
59 | statsAcc += Tuple2(partitionIndex, builder.asInstanceOf[TablePartitionBuilder].stats)
60 | Iterator(builder.build())
61 | }.persist()
62 |
63 |     var isSuccessfulCreateTable = HiveUtils.createTableInHive(
64 | tableName, fields, classTags, Hive.get().getConf())
65 |
66 | // Put the table in the metastore. Only proceed if the DDL statement is executed successfully.
67 | val databaseName = Hive.get(SharkContext.hiveconf).getCurrentDatabase()
68 |     if (isSuccessfulCreateTable) {
69 | // Create an entry in the MemoryMetadataManager.
70 | val newTable = SharkEnv.memoryMetadataManager.createMemoryTable(
71 | databaseName, tableName, CacheType.MEMORY)
72 | try {
73 | // Force evaluate to put the data in memory.
74 | rdd.context.runJob(rdd, (iter: Iterator[TablePartition]) => iter.foreach(_ => Unit))
75 | } catch {
76 | case _: Exception => {
77 | // Intercept the exception thrown by SparkContext#runJob() and handle it silently. The
78 | // exception message should already be printed to the console by DDLTask#execute().
79 | HiveUtils.dropTableInHive(tableName)
80 | // Drop the table entry from MemoryMetadataManager.
81 | SharkEnv.memoryMetadataManager.removeTable(databaseName, tableName)
82 |           isSuccessfulCreateTable = false
83 | }
84 | }
85 | newTable.put(rdd, statsAcc.value.toMap)
86 | }
87 |     return isSuccessfulCreateTable
88 | }
89 | }
90 |
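A usage sketch of saveAsTable, assuming a running SharkEnv with a SparkContext `sc` (hypothetical here); the RDDTable entry points come from the generated RDDTable object shown earlier:

    import shark.api.RDDTable

    val pairs = sc.parallelize(Seq((1, "a"), (2, "b")))            // RDD[(Int, String)]
    val created: Boolean = RDDTable(pairs).saveAsTable("pairs_cached", Seq("key", "value"))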
--------------------------------------------------------------------------------
/src/main/scala/shark/parse/SharkLoadSemanticAnalyzer.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.parse
19 |
20 | import scala.collection.JavaConversions._
21 |
22 | import org.apache.hadoop.hive.conf.HiveConf
23 | import org.apache.hadoop.hive.ql.exec.{CopyTask, MoveTask, TaskFactory}
24 | import org.apache.hadoop.hive.ql.metadata.{Partition, Table => HiveTable}
25 | import org.apache.hadoop.hive.ql.parse.{ASTNode, BaseSemanticAnalyzer, LoadSemanticAnalyzer}
26 | import org.apache.hadoop.hive.ql.plan._
27 |
28 | import shark.{LogHelper, SharkEnv}
29 | import shark.execution.SparkLoadWork
30 | import shark.memstore2.{CacheType, SharkTblProperties}
31 |
32 |
33 | class SharkLoadSemanticAnalyzer(conf: HiveConf) extends LoadSemanticAnalyzer(conf) {
34 |
35 | override def analyzeInternal(ast: ASTNode): Unit = {
36 | // Delegate to the LoadSemanticAnalyzer parent for error checking the source path formatting.
37 | super.analyzeInternal(ast)
38 |
39 | // Children of the AST root created for a LOAD DATA [LOCAL] INPATH ... statement are, in order:
40 | // 1. node containing the path specified by INPATH.
41 | // 2. internal TOK_TABNAME node that contains the table's name.
42 | // 3. (optional) node representing the LOCAL modifier.
43 | val tableASTNode = ast.getChild(1).asInstanceOf[ASTNode]
44 | val tableName = getTableName(tableASTNode)
45 | val hiveTable = db.getTable(tableName)
46 | val cacheMode = CacheType.fromString(
47 | hiveTable.getProperty(SharkTblProperties.CACHE_FLAG.varname))
48 |
49 | if (CacheType.shouldCache(cacheMode)) {
50 | // Find the arguments needed to instantiate a SparkLoadWork.
51 | val tableSpec = new BaseSemanticAnalyzer.tableSpec(db, conf, tableASTNode)
52 | val hiveTable = tableSpec.tableHandle
53 | val moveTask = getMoveTask()
54 | val partSpecOpt = Option(tableSpec.getPartSpec)
55 | val sparkLoadWork = SparkLoadWork(
56 | db,
57 | conf,
58 | hiveTable,
59 | partSpecOpt,
60 | isOverwrite = moveTask.getWork.getLoadTableWork.getReplace)
61 |
62 | // Create a SparkLoadTask that will read from the table's data directory. Make it a dependent
63 | // task of the LoadTask so that it's executed only if the LoadTask executes successfully.
64 | moveTask.addDependentTask(TaskFactory.get(sparkLoadWork, conf))
65 | }
66 | }
67 |
68 | private def getMoveTask(): MoveTask = {
69 | assert(rootTasks.size == 1)
70 |
71 | // If the execution is local, then the root task is a CopyTask with a MoveTask child.
72 | // Otherwise, the root is a MoveTask.
73 | var rootTask = rootTasks.head
74 | val moveTask = if (rootTask.isInstanceOf[CopyTask]) {
75 | val firstChildTask = rootTask.getChildTasks.head
76 | assert(firstChildTask.isInstanceOf[MoveTask])
77 | firstChildTask
78 | } else {
79 | rootTask
80 | }
81 |
82 | // In Hive, LoadTableDesc is referred to as LoadTableWork ...
83 | moveTask.asInstanceOf[MoveTask]
84 | }
85 |
86 | private def getTableName(node: ASTNode): String = {
87 | BaseSemanticAnalyzer.getUnescapedName(node.getChild(0).asInstanceOf[ASTNode])
88 | }
89 | }
90 |
--------------------------------------------------------------------------------
/src/test/scala/shark/memstore2/column/NullableColumnIteratorSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.memstore2.column
19 |
20 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory
21 | import org.apache.hadoop.io.Text
22 | import org.apache.hadoop.io.IntWritable
23 |
24 | import org.scalatest.FunSuite
25 |
26 |
27 | class NullableColumnIteratorSuite extends FunSuite {
28 |
29 | test("String Growth") {
30 | val c = new StringColumnBuilder
31 | c.initialize(4, "")
32 | val oi = PrimitiveObjectInspectorFactory.writableStringObjectInspector
33 |
34 | val a = Array[Text](
35 | new Text("a"), null,
36 | new Text("b"), null,
37 | new Text("abc"), null,
38 | null, null, new Text("efg")
39 | )
40 | a.foreach {
41 | t => c.append(t, oi)
42 | }
43 | val b = c.build()
44 | val i = ColumnIterator.newIterator(b)
45 | Range(0, a.length).foreach { x =>
46 | if (x > 0) assert(i.hasNext)
47 | i.next()
48 | val v = i.current
49 | if (a(x) == null) {
50 | assert(v == null)
51 | } else {
52 | assert(v.toString == a(x).toString)
53 | }
54 | }
55 | assert(!i.hasNext)
56 | }
57 |
58 | test("Iterate Strings") {
59 | val c = new StringColumnBuilder
60 | c.initialize(4, "")
61 | val oi = PrimitiveObjectInspectorFactory.writableStringObjectInspector
62 |
63 | c.append(new Text("a"), oi)
64 | c.append(new Text(""), oi)
65 | c.append(null, oi)
66 | c.append(new Text("b"), oi)
67 | c.append(new Text("Abcdz"), oi)
68 | c.append(null, oi)
69 | val b = c.build()
70 | val i = ColumnIterator.newIterator(b)
71 | i.next()
72 | assert(i.current.toString() == "a")
73 | i.next()
74 | assert(i.current.toString() == "")
75 | i.next()
76 | assert(i.current == null)
77 | i.next()
78 | assert(i.current.toString() == "b")
79 | i.next()
80 | assert(i.current.toString() == "Abcdz")
81 | i.next()
82 | assert(i.current == null)
83 | assert(false === i.hasNext)
84 | }
85 |
86 | test("Iterate Ints") {
87 | def testList(l: Seq[AnyRef]) {
88 | val c = new IntColumnBuilder
89 | c.initialize(l.size, "")
90 | val oi = PrimitiveObjectInspectorFactory.javaIntObjectInspector
91 |
92 | l.foreach { item =>
93 | if (item == null) {
94 | c.append(null, oi)
95 | } else {
96 | c.append(item.asInstanceOf[Object], oi)
97 | }
98 | }
99 |
100 | val b = c.build()
101 | val i = ColumnIterator.newIterator(b)
102 |
103 | l.foreach { x =>
104 | i.next()
105 | if (x == null) {
106 | assert(i.current === x)
107 | } else {
108 | assert(i.current.asInstanceOf[IntWritable].get === x)
109 | }
110 | }
111 | assert(false === i.hasNext)
112 | }
113 |
114 | testList(List(null, null, 123.asInstanceOf[AnyRef]))
115 | testList(List(123.asInstanceOf[AnyRef], 4.asInstanceOf[AnyRef], null))
116 | testList(List(null))
117 | }
118 | }
119 |
--------------------------------------------------------------------------------
/src/test/scala/shark/memstore2/column/ColumnTypeSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.memstore2.column
19 |
20 | import java.nio.ByteBuffer
21 |
22 | import org.apache.hadoop.io.IntWritable
23 | import org.apache.hadoop.io.LongWritable
24 | import org.apache.hadoop.hive.serde2.io._
25 |
26 | import org.scalatest.FunSuite
27 |
28 | class ColumnTypeSuite extends FunSuite {
29 |
30 | test("Int") {
31 | assert(INT.defaultSize == 4)
32 | var buffer = ByteBuffer.allocate(32)
33 | var a: Seq[Int] = Array[Int](35, 67, 899, 4569001)
34 | a.foreach {i => buffer.putInt(i)}
35 | buffer.rewind()
36 | a.foreach {i =>
37 | val v = INT.extract(buffer)
38 | assert(v == i)
39 | }
40 | buffer = ByteBuffer.allocate(32)
41 | a = Range(0, 4)
42 | a.foreach { i =>
43 | INT.append(i, buffer)
44 | }
45 | buffer.rewind()
46 | a.foreach { i => assert(buffer.getInt() == i)}
47 |
48 | buffer = ByteBuffer.allocate(32)
49 |     a = Range(0, 4)
50 | a.foreach { i => buffer.putInt(i)}
51 | buffer.rewind()
52 | val writable = new IntWritable()
53 | a.foreach { i =>
54 | INT.extractInto(buffer, writable)
55 | assert(writable.get == i)
56 | }
57 |
58 | }
59 |
60 | test("Short") {
61 | assert(SHORT.defaultSize == 2)
62 | assert(SHORT.actualSize(8) == 2)
63 | var buffer = ByteBuffer.allocate(32)
64 | var a = Array[Short](35, 67, 87, 45)
65 | a.foreach {i => buffer.putShort(i)}
66 | buffer.rewind()
67 | a.foreach {i =>
68 | val v = SHORT.extract(buffer)
69 | assert(v == i)
70 | }
71 |
72 | buffer = ByteBuffer.allocate(32)
73 | a = Array[Short](0,1,2,3)
74 | a.foreach { i =>
75 | SHORT.append(i, buffer)
76 | }
77 | buffer.rewind()
78 | a.foreach { i => assert(buffer.getShort() == i)}
79 |
80 | buffer = ByteBuffer.allocate(32)
81 |     a = Array[Short](0,1,2,3)
82 | a.foreach { i => buffer.putShort(i)}
83 | buffer.rewind()
84 | val writable = new ShortWritable()
85 | a.foreach { i =>
86 | SHORT.extractInto(buffer, writable)
87 | assert(writable.get == i)
88 | }
89 | }
90 |
91 | test("Long") {
92 | assert(LONG.defaultSize == 8)
93 | assert(LONG.actualSize(45L) == 8)
94 | var buffer = ByteBuffer.allocate(64)
95 | var a = Array[Long](35L, 67L, 8799000880L, 45000999090L)
96 | a.foreach {i => buffer.putLong(i)}
97 | buffer.rewind()
98 | a.foreach {i =>
99 | val v = LONG.extract(buffer)
100 | assert(v == i)
101 | }
102 |
103 | buffer = ByteBuffer.allocate(32)
104 | a = Array[Long](0,1,2,3)
105 | a.foreach { i =>
106 | LONG.append(i, buffer)
107 | }
108 | buffer.rewind()
109 | a.foreach { i => assert(buffer.getLong() == i)}
110 |
111 | buffer = ByteBuffer.allocate(32)
112 |     a = Array[Long](0,1,2,3)
113 | a.foreach { i => buffer.putLong(i)}
114 | buffer.rewind()
115 | val writable = new LongWritable()
116 | a.foreach { i =>
117 | LONG.extractInto(buffer, writable)
118 | assert(writable.get == i)
119 | }
120 | }
121 | }
122 |
--------------------------------------------------------------------------------
/src/main/scala/shark/memstore2/ColumnarStructObjectInspector.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.memstore2
19 |
20 | import java.util.{ArrayList => JArrayList, List => JList}
21 |
22 | import org.apache.hadoop.hive.serde2.`lazy`.LazyFactory
23 | import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe.SerDeParameters
24 | import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspector, ObjectInspectorUtils,
25 | StructField, StructObjectInspector}
26 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category
27 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory
28 | import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo
29 |
30 |
31 | class ColumnarStructObjectInspector(fields: JList[StructField]) extends StructObjectInspector {
32 |
33 | override def getCategory: Category = Category.STRUCT
34 |
35 | override def getTypeName: String = ObjectInspectorUtils.getStandardStructTypeName(this)
36 |
37 | override def getStructFieldRef(fieldName: String): StructField =
38 | ObjectInspectorUtils.getStandardStructFieldRef(fieldName, fields)
39 |
40 | override def getAllStructFieldRefs: JList[_ <: StructField] = fields
41 |
42 | override def getStructFieldData(data: Object, fieldRef: StructField): Object =
43 | data.asInstanceOf[ColumnarStruct].getField(
44 | fieldRef.asInstanceOf[ColumnarStructObjectInspector.IDStructField].fieldID)
45 |
46 | override def getStructFieldsDataAsList(data: Object): JList[Object] =
47 | if (data == null) null else data.asInstanceOf[ColumnarStruct].getFieldsAsList()
48 | }
49 |
50 |
51 | object ColumnarStructObjectInspector {
52 |
53 | def apply(serDeParams: SerDeParameters): ColumnarStructObjectInspector = {
54 |
55 | val columnNames = serDeParams.getColumnNames()
56 | val columnTypes = serDeParams.getColumnTypes()
57 | val fields = new JArrayList[StructField]()
58 | for (i <- 0 until columnNames.size) {
59 | val typeInfo = columnTypes.get(i)
60 | val fieldOI = typeInfo.getCategory match {
61 | case Category.PRIMITIVE =>
62 | PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(
63 | typeInfo.asInstanceOf[PrimitiveTypeInfo].getPrimitiveCategory)
64 | case _ => LazyFactory.createLazyObjectInspector(
65 | typeInfo, serDeParams.getSeparators(), 1, serDeParams.getNullSequence(),
66 | serDeParams.isEscaped(), serDeParams.getEscapeChar())
67 | }
68 | fields.add(new IDStructField(i, columnNames.get(i), fieldOI))
69 | }
70 | new ColumnarStructObjectInspector(fields)
71 | }
72 |
73 | class IDStructField(
74 | val fieldID: Int,
75 | val fieldName: String,
76 | val fieldObjectInspector: ObjectInspector,
77 | val fieldComment: String)
78 | extends StructField {
79 |
80 | def this(fieldID: Int, fieldName: String, fieldObjectInspector: ObjectInspector) =
81 | this(fieldID, fieldName, fieldObjectInspector, null)
82 |
83 | override def getFieldName: String = fieldName
84 | override def getFieldObjectInspector: ObjectInspector = fieldObjectInspector
85 | override def toString(): String = "" + fieldID + ":" + fieldName
86 | override def getFieldComment() : String = fieldComment
87 | }
88 | }
89 |
90 |
--------------------------------------------------------------------------------
/src/main/scala/shark/execution/JoinUtil.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.execution
19 |
20 | import java.util.{List => JavaList}
21 |
22 | import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator
23 | import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspector => OI}
24 | import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspectorUtils => OIUtils}
25 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.{ObjectInspectorCopyOption => CopyOption}
26 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector
27 |
28 | import org.apache.hadoop.io.BooleanWritable
29 | import org.apache.hadoop.io.NullWritable
30 | import org.apache.hadoop.io.Writable
31 |
32 | import shark.execution.serialization.SerializableWritable
33 |
34 |
35 | object JoinUtil {
36 |
37 | def computeJoinKey(row: Any, keyFields: JavaList[ExprNodeEvaluator], keyFieldsOI: JavaList[OI])
38 | : Seq[SerializableWritable[_]] = {
39 | Range(0, keyFields.size).map { i =>
40 | val c = copy(row, keyFields.get(i), keyFieldsOI.get(i), CopyOption.WRITABLE)
41 | val s = if (c == null) NullWritable.get else c
42 | new SerializableWritable(s.asInstanceOf[Writable])
43 | }
44 | }
45 |
46 | def joinKeyHasAnyNulls(joinKey: Seq[AnyRef], nullSafes: Array[Boolean]): Boolean = {
47 | joinKey.zipWithIndex.exists { x =>
48 |       (nullSafes == null || !nullSafes(x._2)) && (x._1 == null)
49 | }
50 | }
51 |
52 | def computeJoinValues(row: Any,
53 | valueFields: JavaList[ExprNodeEvaluator],
54 | valueFieldsOI: JavaList[OI],
55 | filters: JavaList[ExprNodeEvaluator],
56 | filtersOI: JavaList[OI],
57 | noOuterJoin: Boolean,
58 | serializable: Boolean = false)
59 | : Array[AnyRef] = {
60 |
61 |     // isFiltered = true means the row failed the join filter test.
62 | val isFiltered: Boolean = {
63 | if (filters == null) {
64 | false
65 | } else {
66 | var x = 0
67 | var exists = false
68 | while (x < filters.size() && !exists) {
69 | val cond = filters.get(x).evaluate(row)
70 | if (cond == null) {
71 | exists = true
72 | } else {
73 | exists = !filtersOI.get(x).asInstanceOf[BooleanObjectInspector].get(cond)
74 | }
75 | x += 1
76 | }
77 |
78 | exists
79 | }
80 | }
81 | val size = valueFields.size
82 | val a = new Array[AnyRef](size)
83 | var i = 0
84 | while (i < size) {
85 | a(i) = copy(row, valueFields.get(i), valueFieldsOI.get(i), CopyOption.WRITABLE)
86 | i += 1
87 | }
88 |
89 | val result = if (noOuterJoin) {
90 | a
91 | } else {
92 | val n = new Array[AnyRef](size + 1)
93 | Array.copy(a, 0, n, 0, size)
94 | n(size) = new BooleanWritable(isFiltered)
95 | n
96 | }
97 |
98 | if (serializable) {
99 | result.map(e => new SerializableWritable(e.asInstanceOf[Writable]))
100 | } else {
101 | result
102 | }
103 | }
104 |
105 | private def copy(row: Any, evaluator: ExprNodeEvaluator, oi: OI, copyOption: CopyOption) = {
106 | OIUtils.copyToStandardObject(evaluator.evaluate(row), oi, copyOption)
107 | }
108 | }
109 |
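A small sketch of the null-safe key check above: a null at some key position only disqualifies the key when the corresponding null-safe flag is false, or when no flags are given at all. The key values below are illustrative:

    import shark.execution.JoinUtil

    val key: Seq[AnyRef] = Seq("a", null)
    JoinUtil.joinKeyHasAnyNulls(key, Array(false, true))   // false: position 1 is null-safe
    JoinUtil.joinKeyHasAnyNulls(key, null)                 // true: no null-safe flags, null present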
--------------------------------------------------------------------------------
/src/main/scala/shark/memstore2/TablePartition.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.memstore2
19 |
20 | import java.io.{Externalizable, ObjectInput, ObjectOutput}
21 | import java.nio.ByteBuffer
22 | import java.nio.ByteOrder
23 | import java.util.BitSet
24 | import shark.memstore2.column.ColumnIterator
25 |
26 |
27 | /**
28 | * TablePartition contains a whole partition of data in columnar format. It
29 | * simply contains a list of columns and their meta data. It should be built
30 | * using a TablePartitionBuilder.
31 | */
32 | class TablePartition(private var _numRows: Long, private var _columns: Array[ByteBuffer])
33 | extends Externalizable {
34 |
35 | // Empty constructor for Externalizable
36 | def this() {
37 | this(0, null)
38 | }
39 |
40 | def this(columns: Array[ByteBuffer]) {
41 | this(columns(0).getLong(), columns.tail)
42 | }
43 |
44 | def numRows: Long = _numRows
45 |
46 | def columns: Array[ByteBuffer] = _columns
47 |
48 | /** We store our per-partition metadata in a fake "column 0" for off-heap storage. */
49 | def toOffHeap: Array[ByteBuffer] = {
50 | val buffers = new Array[ByteBuffer](1 + _columns.size)
51 | buffers(0) = metadata
52 | System.arraycopy(_columns, 0, buffers, 1, _columns.size)
53 | buffers
54 | }
55 |
56 | def metadata: ByteBuffer = {
57 | val buffer = ByteBuffer.allocate(8)
58 | buffer.order(ByteOrder.nativeOrder())
59 | buffer.putLong(_numRows)
60 | buffer.rewind()
61 | buffer
62 | }
63 |
64 | /**
65 | * Return an iterator for the partition.
66 | */
67 | def iterator: TablePartitionIterator = {
68 | val columnIterators: Array[ColumnIterator] = _columns.map { case buffer: ByteBuffer =>
69 | val iter = ColumnIterator.newIterator(buffer)
70 | iter
71 | }
72 | new TablePartitionIterator(_numRows, columnIterators)
73 | }
74 |
75 | def prunedIterator(columnsUsed: BitSet) = {
76 | val columnIterators: Array[ColumnIterator] = _columns.map {
77 | case buffer: ByteBuffer =>
78 | ColumnIterator.newIterator(buffer)
79 | case _ =>
80 | // The buffer might be null if it is pruned in off-heap storage.
81 | null
82 | }
83 | new TablePartitionIterator(_numRows, columnIterators, columnsUsed)
84 | }
85 |
86 | override def readExternal(in: ObjectInput) {
87 | _numRows = in.readLong()
88 | val numColumns = in.readInt()
89 | _columns = Array.fill[ByteBuffer](numColumns) {
90 | val columnLen = in.readInt()
91 | val buf = ByteBuffer.allocate(columnLen)
92 | in.readFully(buf.array(), 0, columnLen)
93 | buf
94 | }
95 | }
96 |
97 | override def writeExternal(out: ObjectOutput) {
98 | out.writeLong(numRows)
99 | out.writeInt(columns.length)
100 | for (column <- columns) {
101 | val buf = column.duplicate()
102 | buf.rewind()
103 | // If the ByteBuffer is backed by a byte array, just write the byte array out.
104 | // Otherwise, write each byte one by one.
105 | if (buf.hasArray()) {
106 | val byteArray = buf.array()
107 | out.writeInt(byteArray.length)
108 | out.write(byteArray, 0, byteArray.length)
109 | } else {
110 | out.writeInt(buf.remaining())
111 | while (buf.hasRemaining()) {
112 | out.write(buf.get())
113 | }
114 | }
115 | }
116 | }
117 | }
118 |
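A sketch of the off-heap round trip implied by toOffHeap and the auxiliary constructor: the fake "column 0" is an 8-byte buffer holding the row count, and the constructor reads it back. The column contents below are placeholders:

    import java.nio.{ByteBuffer, ByteOrder}
    import shark.memstore2.TablePartition

    // Metadata "column 0": just the row count, written the same way `metadata` does.
    val col0 = ByteBuffer.allocate(8).order(ByteOrder.nativeOrder())
    col0.putLong(2L)
    col0.rewind()

    val dataColumns: Array[ByteBuffer] = Array(ByteBuffer.allocate(16))   // placeholder column
    val fromOffHeap = new TablePartition(col0 +: dataColumns)
    // fromOffHeap.numRows == 2; fromOffHeap.columns contains only the data columns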
--------------------------------------------------------------------------------
/src/main/scala/shark/execution/SharkExplainTask.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (C) 2012 The Regents of The University California.
3 | * All rights reserved.
4 | *
5 | * Licensed under the Apache License, Version 2.0 (the "License");
6 | * you may not use this file except in compliance with the License.
7 | * You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package shark.execution
19 |
20 | import java.io.PrintStream
21 | import java.util.{HashSet => JHashSet, List => JList}
22 |
23 | import scala.collection.JavaConversions._
24 |
25 | import org.apache.hadoop.fs.Path
26 | import org.apache.hadoop.hive.conf.HiveConf
27 | import org.apache.hadoop.hive.ql.exec.{ExplainTask, Task}
28 | import org.apache.hadoop.hive.ql.hooks.ReadEntity;
29 | import org.apache.hadoop.hive.ql.{Context, DriverContext, QueryPlan}
30 | import org.apache.hadoop.hive.ql.exec.{ExplainTask, Task}
31 | import org.apache.hadoop.hive.ql.plan.ExplainWork
32 | import org.apache.hadoop.util.StringUtils
33 |
34 | import shark.LogHelper
35 |
36 |
37 | class SharkExplainWork(
38 | resFile: String,
39 | rootTasks: JList[Task[_ <: java.io.Serializable]],
40 | astStringTree: String,
41 | inputs: JHashSet[ReadEntity],
42 | extended: Boolean)
43 | extends ExplainWork(resFile, rootTasks, astStringTree, inputs, extended, false, false)
44 |
45 |
46 | /**
47 | * SharkExplainTask executes EXPLAIN for RDD operators.
48 | */
49 | class SharkExplainTask extends Task[SharkExplainWork] with java.io.Serializable with LogHelper {
50 |
51 | val hiveExplainTask = new ExplainTask
52 |
53 | override def execute(driverContext: DriverContext): Int = {
54 | logDebug("Executing " + this.getClass.getName())
55 | hiveExplainTask.setWork(work)
56 |
57 | try {
58 | val resFile = new Path(work.getResFile())
59 | val outS = resFile.getFileSystem(conf).create(resFile)
60 | val out = new PrintStream(outS)
61 |
62 | // Print out the parse AST
63 | ExplainTask.outputAST(work.getAstStringTree, out, false, 0)
64 | out.println()
65 |
66 | ExplainTask.outputDependencies(out, work.isFormatted(), work.getRootTasks, 0)
67 | out.println()
68 |
69 | // Go over all the tasks and dump out the plans
70 | ExplainTask.outputStagePlans(out, work, work.getRootTasks, 0)
71 |
72 | // Print the Shark query plan if applicable.
73 | if (work != null && work.getRootTasks != null && work.getRootTasks.size > 0) {
74 | work.getRootTasks.zipWithIndex.foreach { case(task, taskIndex) =>
75 | task match {
76 | case sparkTask: SparkTask => {
77 | out.println("SHARK QUERY PLAN #%d:".format(taskIndex))
78 | val terminalOp = sparkTask.getWork().terminalOperator
79 | ExplainTaskHelper.outputPlan(terminalOp, out, work.getExtended, 2)
80 | out.println()
81 | }
82 | case _ => null
83 | }
84 | }
85 | }
86 |
87 | out.close()
88 | 0
89 | } catch {
90 | case e: Exception => {
91 | console.printError("Failed with exception " + e.getMessage(), "\n" +
92 | StringUtils.stringifyException(e))
93 | throw e
94 | 1
95 | }
96 | }
97 | }
98 |
99 | override def initialize(conf: HiveConf, queryPlan: QueryPlan, driverContext: DriverContext) {
100 | hiveExplainTask.initialize(conf, queryPlan, driverContext)
101 | super.initialize(conf, queryPlan, driverContext)
102 | }
103 |
104 | override def getType = hiveExplainTask.getType
105 |
106 | override def getName = hiveExplainTask.getName
107 |
108 | override def localizeMRTmpFilesImpl(ctx: Context) {
109 | // explain task has nothing to localize
110 | // we don't expect to enter this code path at all
111 | throw new RuntimeException ("Unexpected call")
112 | }
113 |
114 | }
115 |
116 |
--------------------------------------------------------------------------------
/run:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # This file is used to launch Shark on the master.
4 | export SCALA_VERSION=2.10
5 | SHARK_VERSION=0.9.2
6 |
7 | # Figure out where the framework is installed
8 | FWDIR="$(cd `dirname $0`; pwd)"
9 |
10 | export SHARK_HOME="$FWDIR"
11 |
12 | # Load environment variables from conf/shark-env.sh, if it exists
13 | if [ -e $SHARK_HOME/conf/shark-env.sh ] ; then
14 | . $SHARK_HOME/conf/shark-env.sh
15 | fi
16 |
17 | if [ -n "$MASTER" ] ; then
18 | if [ -z $SPARK_HOME ] ; then
19 | echo "No SPARK_HOME specified. Please set SPARK_HOME for cluster mode."
20 | exit 1
21 | fi
22 | fi
23 |
24 | # check for shark with spark on yarn params
25 | if [ "x$SHARK_EXEC_MODE" == "xyarn" ] ; then
26 | if [ "x$SPARK_ASSEMBLY_JAR" == "x" ] ; then
27 | echo "No SPARK_ASSEMBLY_JAR specified. Please set SPARK_ASSEMBLY_JAR for spark on yarn mode."
28 | exit 1
29 | else
30 | export SPARK_JAR=$SPARK_ASSEMBLY_JAR
31 | fi
32 |
33 | if [ "x$SHARK_ASSEMBLY_JAR" == "x" ] ; then
34 | echo "No SHARK_ASSEMBLY_JAR specified. please set SHARK_ASSEMBLY_JAR for spark on yarn mode."
35 | exit 1
36 | else
37 | export SPARK_YARN_APP_JAR=$SHARK_ASSEMBLY_JAR
38 | fi
39 |
40 | # use yarn-client mode for interactive shell.
41 | export MASTER=yarn-client
42 | fi
43 |
44 | # Check for optionally specified configuration file path
45 | if [ "x$HIVE_CONF_DIR" == "x" ] ; then
46 | HIVE_CONF_DIR="$SHARK_HOME/conf"
47 | fi
48 |
49 | if [ -f "${HIVE_CONF_DIR}/hive-env.sh" ]; then
50 | . "${HIVE_CONF_DIR}/hive-env.sh"
51 | fi
52 |
53 | # Add Shark jars.
54 | for jar in `find $SHARK_HOME/lib -name '*jar'`; do
55 | SPARK_CLASSPATH+=:$jar
56 | done
57 | for jar in `find $SHARK_HOME/lib_managed/jars -name '*jar'`; do
58 | SPARK_CLASSPATH+=:$jar
59 | done
60 | for jar in `find $SHARK_HOME/lib_managed/bundles -name '*jar'`; do
61 | SPARK_CLASSPATH+=:$jar
62 | done
63 |
64 | SPARK_CLASSPATH+=:$HIVE_CONF_DIR
65 |
66 | # Build up Shark's jar or classes.
67 | SHARK_CLASSES="$SHARK_HOME/target/scala-$SCALA_VERSION/classes"
68 | SHARK_JAR="$SHARK_HOME/target/scala-$SCALA_VERSION/shark_$SCALA_VERSION-$SHARK_VERSION.jar"
69 | if [ -d "$SHARK_CLASSES/shark" ] ; then
70 | SPARK_CLASSPATH+=":$SHARK_CLASSES"
71 | else
72 | if [ -f "$SHARK_JAR" ] ; then
73 | SPARK_CLASSPATH+=":$SHARK_JAR"
74 | else
75 | echo "Cannot find either compiled classes or compiled jar package for Shark."
76 | echo "Have you compiled Shark yet?"
77 | exit 1
78 | fi
79 | fi
80 |
81 | SPARK_CLASSPATH+=":$SHARK_HOME/target/scala-$SCALA_VERSION/test-classes"
82 |
94 | if [ "x$HADOOP_HOME" == "x" ] ; then
95 | echo "No HADOOP_HOME specified. Shark will run in local-mode"
96 | else
97 | SPARK_CLASSPATH+=:$HADOOP_HOME/etc/hadoop
98 | SPARK_CLASSPATH+=:$HADOOP_HOME/conf
99 | fi
100 |
101 |
102 | # TODO(rxin): Check aux classpath and aux java opts.
103 | #CLASSPATH=${CLASSPATH}:${AUX_CLASSPATH}
104 |
105 | export SPARK_CLASSPATH
106 | export CLASSPATH+=$SPARK_CLASSPATH # Needed for spark-shell
107 |
108 | export SPARK_JAVA_OPTS+=" $TEST_JAVA_OPTS"
109 |
110 | # Suppress the HADOOP_HOME deprecation warnings in Hadoop 1.x.x.
111 | export HADOOP_HOME_WARN_SUPPRESS=true
112 |
113 | if [ "x$SHARK_MASTER_MEM" == "x" ] ; then
114 | SHARK_MASTER_MEM="512m"
115 | fi
116 |
117 | # Set JAVA_OPTS to be able to load native libraries and to set heap size
118 | JAVA_OPTS+=" $SPARK_JAVA_OPTS"
119 | JAVA_OPTS+=" -Djava.library.path=$SPARK_LIBRARY_PATH"
120 | JAVA_OPTS+=" -Xms$SHARK_MASTER_MEM -Xmx$SHARK_MASTER_MEM"
121 | export JAVA_OPTS
122 |
123 | # In case we are running Ant
124 | export ANT_OPTS=$JAVA_OPTS
125 |
126 | if [ "x$RUNNER" == "x" ] ; then
127 | if [ -n "$JAVA_HOME" ]; then
128 | RUNNER="${JAVA_HOME}/bin/java"
129 | else
130 | RUNNER=java
131 | fi
132 | # The JVM doesn't read JAVA_OPTS by default so we need to pass it in
133 | EXTRA_ARGS="$JAVA_OPTS"
134 | fi
135 |
136 | exec $RUNNER $EXTRA_ARGS "$@"
137 |
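Everything the script needs (MASTER, SPARK_HOME, SHARK_MASTER_MEM, HADOOP_HOME, and the YARN-related variables) comes from the environment, which is also how the test suite below drives it. A minimal sketch of invoking the script programmatically with an environment override, mirroring the suite's ProcessBuilder usage; the main class, paths, and memory value are assumptions for illustration, not project code:

    import java.io.File
    import scala.collection.JavaConversions._

    object RunScriptLauncher {
      def main(args: Array[String]) {
        // Assumed entry point; substitute whatever main class the script should run.
        val mainClass = "shark.SharkCliDriver"
        val pb = new ProcessBuilder(Seq("./run", mainClass) ++ args)
        pb.directory(new File("."))                     // assumes the Shark checkout is the working directory
        pb.environment().put("SHARK_MASTER_MEM", "1g")  // overrides the script's 512m default
        pb.redirectErrorStream(true)                    // merge stderr into stdout
        val process = pb.start()
        // Echo the child's output, then propagate its exit code.
        scala.io.Source.fromInputStream(process.getInputStream).getLines().foreach(println)
        sys.exit(process.waitFor())
      }
    }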
--------------------------------------------------------------------------------
/src/test/scala/shark/SharkServerSuite.scala:
--------------------------------------------------------------------------------
1 | package shark
2 |
3 | import java.io.{BufferedReader, InputStreamReader}
4 | import java.sql.DriverManager
5 | import java.sql.Statement
6 | import java.sql.Connection
7 |
8 | import scala.collection.JavaConversions._
9 |
10 | import org.scalatest.{BeforeAndAfterAll, FunSuite}
11 | import org.scalatest.matchers.ShouldMatchers
12 |
13 | import scala.concurrent._
14 | import ExecutionContext.Implicits.global
15 |
16 | /**
17 | * Test for the Shark server.
18 | */
19 | class SharkServerSuite extends FunSuite with BeforeAndAfterAll with ShouldMatchers with TestUtils {
20 |
21 | val WAREHOUSE_PATH = TestUtils.getWarehousePath("server")
22 | val METASTORE_PATH = TestUtils.getMetastorePath("server")
23 | val DRIVER_NAME = "org.apache.hadoop.hive.jdbc.HiveDriver"
24 | val TABLE = "test"
25 |   // Use a port other than the Hive standard 10000 for tests,
26 |   // to avoid issues with the port already being taken on some machines.
27 | val PORT = "9011"
28 |
29 | // If verbose is true, the testing program will print all outputs coming from the shark server.
30 | val VERBOSE = Option(System.getenv("SHARK_TEST_VERBOSE")).getOrElse("false").toBoolean
31 |
32 | Class.forName(DRIVER_NAME)
33 |
34 | override def beforeAll() { launchServer() }
35 |
36 | override def afterAll() { stopServer() }
37 |
38 | private def launchServer(args: Seq[String] = Seq.empty) {
39 |     // Fork a new process to start the Shark server. We do this because Hive resources
40 |     // are hard to clean up entirely, so we start a separate process and simply kill
41 |     // that process during cleanup.
42 | val defaultArgs = Seq("./bin/shark", "--service", "sharkserver",
43 | "--verbose",
44 | "-p",
45 | PORT,
46 | "--hiveconf",
47 | "hive.root.logger=INFO,console",
48 | "--hiveconf",
49 | "\"javax.jdo.option.ConnectionURL=jdbc:derby:;databaseName=" + METASTORE_PATH + ";create=true\"",
50 | "--hiveconf",
51 | "\"hive.metastore.warehouse.dir=" + WAREHOUSE_PATH + "\"")
52 | val pb = new ProcessBuilder(defaultArgs ++ args)
53 | process = pb.start()
54 | inputReader = new BufferedReader(new InputStreamReader(process.getInputStream))
55 | errorReader = new BufferedReader(new InputStreamReader(process.getErrorStream))
56 | waitForOutput(inputReader, "Starting Shark server")
57 |
58 |     // Spawn a thread to read the output from the forked process.
59 |     // This is necessary because in some configurations log4j can block if its output
60 |     // to stderr is not read, which would eventually hang the entire test suite.
61 | future {
62 | while (true) {
63 | val stdout = readFrom(inputReader)
64 | val stderr = readFrom(errorReader)
65 | if (VERBOSE && stdout.length > 0) {
66 | println(stdout)
67 | }
68 | if (VERBOSE && stderr.length > 0) {
69 | println(stderr)
70 | }
71 | Thread.sleep(50)
72 | }
73 | }
74 | }
75 |
76 | private def stopServer() {
77 | process.destroy()
78 | process.waitFor()
79 | }
80 |
81 | test("test query execution against a shark server") {
82 | Thread.sleep(5*1000) // I know... Gross. However, without this the tests fail non-deterministically.
83 |
84 | val dataFilePath = TestUtils.dataFilePath + "/kv1.txt"
85 | val stmt = createStatement()
86 | stmt.executeQuery("DROP TABLE IF EXISTS test")
87 | stmt.executeQuery("DROP TABLE IF EXISTS test_cached")
88 | stmt.executeQuery("CREATE TABLE test(key int, val string)")
89 |     stmt.executeQuery("LOAD DATA LOCAL INPATH '" + dataFilePath + "' OVERWRITE INTO TABLE test")
90 | stmt.executeQuery("CREATE TABLE test_cached as select * from test limit 499")
91 |
92 | var rs = stmt.executeQuery("select count(*) from test")
93 | rs.next()
94 | rs.getInt(1) should equal (500)
95 |
96 | rs = stmt.executeQuery("select count(*) from test_cached")
97 | rs.next()
98 | rs.getInt(1) should equal (499)
99 |
100 | stmt.close()
101 | }
102 |
103 | def getConnection(): Connection = {
104 | DriverManager.getConnection("jdbc:hive://localhost:" + PORT + "/default", "", "")
105 | }
106 |
107 | def createStatement(): Statement = getConnection().createStatement()
108 | }
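For reference, the JDBC pattern the suite relies on (register the pre-HiveServer2 Hive driver, connect to the server's port, issue HiveQL through a Statement) also works standalone. A minimal sketch, assuming a Shark server is already listening on localhost:9011, the Hive JDBC driver and its dependencies are on the classpath, and the test table created by the suite exists:

    import java.sql.DriverManager

    object SharkJdbcExample {
      def main(args: Array[String]) {
        // Register the same (pre-HiveServer2) Hive JDBC driver used by the suite above.
        Class.forName("org.apache.hadoop.hive.jdbc.HiveDriver")
        val conn = DriverManager.getConnection("jdbc:hive://localhost:9011/default", "", "")
        val stmt = conn.createStatement()
        try {
          // Any HiveQL the server understands can be issued through the statement.
          val rs = stmt.executeQuery("SELECT count(*) FROM test")
          while (rs.next()) {
            println("row count: " + rs.getInt(1))
          }
        } finally {
          stmt.close()
          conn.close()
        }
      }
    }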
--------------------------------------------------------------------------------