├── .gitignore
├── BuildTestAll.sh
├── LICENSE
├── README.md
├── java
│   ├── gradle
│   │   ├── dse
│   │   │   ├── .gitignore
│   │   │   ├── build.gradle
│   │   │   ├── gradle
│   │   │   │   └── wrapper
│   │   │   │       ├── gradle-wrapper.jar
│   │   │   │       └── gradle-wrapper.properties
│   │   │   ├── gradlew
│   │   │   ├── gradlew.bat
│   │   │   ├── settings.gradle
│   │   │   └── src
│   │   │       └── main
│   │   └── oss
│   │       ├── .gitignore
│   │       ├── build.gradle
│   │       ├── gradle
│   │       │   └── wrapper
│   │       │       ├── gradle-wrapper.jar
│   │       │       └── gradle-wrapper.properties
│   │       ├── gradlew
│   │       ├── gradlew.bat
│   │       ├── settings.gradle
│   │       └── src
│   │           └── main
│   ├── maven
│   │   ├── dse
│   │   │   ├── .gitignore
│   │   │   ├── pom.xml
│   │   │   └── src
│   │   │       └── main
│   │   └── oss
│   │       ├── .gitignore
│   │       ├── pom.xml
│   │       └── src
│   │           └── main
│   └── sbt
│       ├── dse
│       │   ├── .gitignore
│       │   ├── build.sbt
│       │   ├── project
│       │   │   └── assembly.sbt
│       │   └── src
│       │       └── main
│       │           └── java
│       │               └── com
│       │                   └── datastax
│       │                       └── spark
│       │                           └── example
│       │                               └── WriteRead.java
│       └── oss
│           ├── .gitignore
│           ├── build.sbt
│           ├── project
│           │   └── assembly.sbt
│           └── src
│               └── main
└── scala
    ├── gradle
    │   ├── dse
    │   │   ├── .gitignore
    │   │   ├── build.gradle
    │   │   ├── gradle
    │   │   │   └── wrapper
    │   │   │       ├── gradle-wrapper.jar
    │   │   │       └── gradle-wrapper.properties
    │   │   ├── gradlew
    │   │   ├── gradlew.bat
    │   │   ├── settings.gradle
    │   │   └── src
    │   │       ├── main
    │   │       └── test
    │   └── oss
    │       ├── .gitignore
    │       ├── build.gradle
    │       ├── gradle
    │       │   └── wrapper
    │       │       ├── gradle-wrapper.jar
    │       │       └── gradle-wrapper.properties
    │       ├── gradlew
    │       ├── gradlew.bat
    │       ├── settings.gradle
    │       └── src
    │           ├── main
    │           └── test
    ├── maven
    │   ├── dse
    │   │   ├── .gitignore
    │   │   ├── pom.xml
    │   │   └── src
    │   │       ├── main
    │   │       └── test
    │   └── oss
    │       ├── .gitignore
    │       ├── pom.xml
    │       └── src
    │           ├── main
    │           └── test
    └── sbt
        ├── dse
        │   ├── .gitignore
        │   ├── build.sbt
        │   ├── project
        │   │   └── assembly.sbt
        │   └── src
        │       ├── main
        │       │   └── scala
        │       │       └── com
        │       │           └── datastax
        │       │               └── spark
        │       │                   └── example
        │       │                       └── WriteRead.scala
        │       └── test
        │           └── resources
        │               ├── cassandra-3.2.yaml.template
        │               ├── keystore
        │               ├── log4j.properties
        │               ├── logback.xml
        │               ├── metrics.properties
        │               ├── triggers
        │               │   └── README.txt
        │               └── truststore
        └── oss
            ├── .gitignore
            ├── build.sbt
            ├── project
            │   └── assembly.sbt
            └── src
                ├── main
                └── test
/.gitignore:
--------------------------------------------------------------------------------
1 | */build
2 | */target
3 | .gradle
4 | .idea
5 | metastore_db
6 | derby.log
--------------------------------------------------------------------------------
/BuildTestAll.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | start_location=`pwd`
4 | exit_value=0
5 | failures=""
6 |
7 | # First Arg Directory
8 | # Second Arg Command
9 | test_command () {
10 | for sys in "dse" "oss"
11 | do
12 | echo "Testing $1/$sys $2"
13 |     cd "$start_location"
14 |     cd "$1/$sys"
15 | $2 || { exit_value=$?; echo "ERROR: $1/$sys $2 Failed"; failures=$failures+"$1/$sys $2 Failed"+$'\n'; }
16 | done
17 | }
18 |
19 | for language in "java" "scala"
20 | do
21 | echo "Testing $language Builds"
22 | echo "Gradle"
23 | test_command "$language/gradle" "./gradlew -q shadowJar"
24 | echo "SBT"
25 | test_command "$language/sbt" "sbt -Dsbt.log.noformat=true --error assembly"
26 | echo "Maven"
27 | test_command "$language/maven" "mvn -q package"
28 | done
29 | echo "$failures"
30 | exit $exit_value
31 |
32 |
33 |
34 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Example projects for using DSE Analytics
2 |
3 | These are template projects that illustrate how to build Spark applications written in Java or Scala
4 | with Maven, SBT, or Gradle, which can be run on either DataStax Enterprise (DSE) or Apache Spark. Each
5 | project implements a simple write-to-/read-from-Cassandra application for its combination of language
6 | and build tool.
7 |
8 | ## Dependencies
9 |
10 | Compiling Spark applications depends on Apache Spark and optionally on Spark Cassandra Connector
11 | jars. Projects `dse` and `oss` show two different ways of supplying these dependencies. Both
12 | projects are built and executed with similar commands.
13 |
14 | ### DSE
15 |
16 | If you are planning to execute your Spark application on a DSE cluster, you can use the `dse`
17 | project template, which will automatically download (and use during compilation) all jars available
18 | in the DSE cluster. Please mind the DSE version specified in the build file; it should match
19 | the one in your cluster.
20 |
21 | Please note that the DSE project templates are meant to be built with `sbt` 0.13.13 or newer. In case of
22 | unresolved dependency errors, please update `sbt` and then clean the `ivy` cache (with the
23 | `rm ~/.ivy2/cache/com.datastax.dse/dse-spark-dependencies/` command).
24 |
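For example, the sbt `dse` template resolves everything through a single provided dependency
(see e.g. `java/sbt/dse/build.sbt`):

```scala
resolvers += "DataStax Repo" at "https://repo.datastax.com/public-repos/"

val dseVersion = "6.8.35"
libraryDependencies += "com.datastax.dse" % "dse-spark-dependencies" % dseVersion % "provided"
```
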
25 | ### OSS
26 |
27 | If you are planning to execute your Spark application against open source Apache Spark and open
28 | source Apache Cassandra, use the `oss` project template, where all dependencies have to be specified
29 | manually in build files. Please mind the dependency versions; these should match the ones in your
30 | execution environment.
31 |
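For example, the sbt `oss` template declares the Spark and connector artifacts explicitly, all marked
`provided` (see e.g. `java/sbt/oss/build.sbt`):

```scala
val sparkVersion = "2.2.2"
val connectorVersion = "2.0.10"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % sparkVersion % "provided",
  "org.apache.spark" %% "spark-sql" % sparkVersion % "provided",
  "org.apache.spark" %% "spark-hive" % sparkVersion % "provided",
  "com.datastax.spark" %% "spark-cassandra-connector" % connectorVersion % "provided"
)
```
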
32 | For additional info about version compatibility please refer to the Spark Cassandra Connector
33 | [Version Compatibility Table](https://github.com/datastax/spark-cassandra-connector#version-compatibility).
34 |
35 | ### Additional dependencies
36 |
37 | The prepared projects use assembly/shade plugins so that additional dependencies can be included in your
38 | application's jar. All you need to do is add the dependencies in the build configuration file, as shown below.
39 |
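For instance, to bundle Apache Commons CSV (the dependency shown commented out in the build files),
add it without the `provided` scope; in sbt that is simply:

```scala
// Bundled into the assembly jar by sbt-assembly (not marked "provided")
libraryDependencies += "org.apache.commons" % "commons-csv" % "1.0"
```

In the Gradle templates the same dependency goes into the `assembly` configuration so the shadow plugin
includes it in the fat jar, and in Maven it is added without the `provided` scope.
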
40 | ## Building & running
41 |
42 | ### Sbt
43 |
44 | Task | Command
45 | -------------|------------
46 | build | `sbt clean assembly`
47 | run (Scala) | `dse spark-submit --class com.datastax.spark.example.WriteRead target/scala-2.11/writeRead-assembly-0.1.jar`
48 | run (Java) | `dse spark-submit --class com.datastax.spark.example.WriteRead target/writeRead-assembly-0.1.jar`
49 |
50 | ### Gradle
51 |
52 | Task | Command
53 | --------------------|------------
54 | build | `gradle shadowJar`
55 | run (Scala, Java) | `dse spark-submit --class com.datastax.spark.example.WriteRead build/libs/writeRead-0.1-all.jar`
56 |
57 | ### Maven
58 |
59 | Task | Command
60 | --------------------|------------
61 | build | `mvn package`
62 | run (Scala, Java) | `dse spark-submit --class com.datastax.spark.example.WriteRead target/writeRead-0.1.jar`
63 |
64 | Notes:
65 |
66 | 1. The above command examples are for DSE. To run with open source Spark, use `spark-submit` instead.
67 | 2. Also see the included example script [BuildTestAll.sh](BuildTestAll.sh), which builds all combinations.
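3. In the `oss` projects the Spark and connector dependencies are marked `provided`, so when submitting to open source Spark pass the connector along, for example with `--packages com.datastax.spark:spark-cassandra-connector_2.10:2.0.10`.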
68 |
69 |
70 | ## Running Integrated Tests
71 |
72 | Integrated tests have been set up under a `test` task in each build system. To run
73 | the tests, invoke the build system and launch its `test` task. These tests demonstrate
74 | how to run an embedded Cassandra instance as well as a local Spark master from within your testing
75 | environment.
76 |
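The exact invocation mirrors the build commands above (assuming the standard `test` task names):

Build system | Command
-------------|------------
sbt          | `sbt test`
Gradle       | `./gradlew test`
Maven        | `mvn test`
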
77 | Currently, only Scala testing examples are provided.
78 |
79 | These tests should also function inside IDEs that are configured with the ability to run
80 | the build system's tests.
81 |
82 | ## Support
83 |
84 | The code, examples, and snippets provided in this repository are not "Supported Software" under any DataStax subscriptions or other agreements.
85 |
86 | ## License
87 |
88 | Copyright 2016-2023, DataStax
89 |
90 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
91 |
92 | http://www.apache.org/licenses/LICENSE-2.0
93 |
94 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
95 |
96 |
--------------------------------------------------------------------------------
/java/gradle/dse/.gitignore:
--------------------------------------------------------------------------------
1 | build
2 | .idea
3 |
--------------------------------------------------------------------------------
/java/gradle/dse/build.gradle:
--------------------------------------------------------------------------------
1 | plugins {
2 | id "java"
3 | id "com.github.johnrengelman.shadow" version "1.2.3"
4 | }
5 |
6 | group 'com.datastax.spark.example'
7 | version '0.1'
8 |
9 | repositories {
10 | mavenLocal() // for testing
11 | mavenCentral()
12 | maven {
13 | url "https://repo.datastax.com/public-repos/"
14 | }
15 | }
16 |
17 | def dseVersion = "6.8.35"
18 |
19 | // The assembly configuration will cause jar to be included in assembled fat-jar
20 | configurations {
21 | assembly
22 | compile.extendsFrom assembly
23 | }
24 |
25 | // The provided configuration behaves the same as the sbt "provided" keyword which will cause jars to be
26 | // excluded from assembled fat-jar
27 | configurations {
28 | provided
29 | compile.extendsFrom provided
30 | }
31 |
32 | // Please make sure that following dependencies have versions corresponding to the ones in your cluster.
33 | // Note that spark-cassandra-connector should be provided with '--packages' flag to spark-submit command.
34 | dependencies {
35 | provided "com.datastax.dse:dse-spark-dependencies:$dseVersion"
36 | // assembly "org.apache.commons:commons-math3:3.6.1"
37 | // assembly "org.apache.commons:commons-csv:1.0"
38 | }
39 |
40 | shadowJar {
41 | configurations = [project.configurations.assembly]
42 | }
43 |
44 | //shadowJar {
45 | // relocate 'org.apache.commons.csv', 'shaded.org.apache.commons.csv'
46 | //}
47 |
--------------------------------------------------------------------------------
/java/gradle/dse/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataStax-Examples/SparkBuildExamples/554374f755d1f1c381cdcf8ba5e287d7c3204a28/java/gradle/dse/gradle/wrapper/gradle-wrapper.jar
--------------------------------------------------------------------------------
/java/gradle/dse/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | distributionBase=GRADLE_USER_HOME
2 | distributionPath=wrapper/dists
3 | zipStoreBase=GRADLE_USER_HOME
4 | zipStorePath=wrapper/dists
5 | distributionUrl=https\://services.gradle.org/distributions/gradle-4.9-bin.zip
6 |
--------------------------------------------------------------------------------
/java/gradle/dse/gradlew:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 |
3 | ##############################################################################
4 | ##
5 | ## Gradle start up script for UN*X
6 | ##
7 | ##############################################################################
8 |
9 | # Attempt to set APP_HOME
10 | # Resolve links: $0 may be a link
11 | PRG="$0"
12 | # Need this for relative symlinks.
13 | while [ -h "$PRG" ] ; do
14 | ls=`ls -ld "$PRG"`
15 | link=`expr "$ls" : '.*-> \(.*\)$'`
16 | if expr "$link" : '/.*' > /dev/null; then
17 | PRG="$link"
18 | else
19 | PRG=`dirname "$PRG"`"/$link"
20 | fi
21 | done
22 | SAVED="`pwd`"
23 | cd "`dirname \"$PRG\"`/" >/dev/null
24 | APP_HOME="`pwd -P`"
25 | cd "$SAVED" >/dev/null
26 |
27 | APP_NAME="Gradle"
28 | APP_BASE_NAME=`basename "$0"`
29 |
30 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
31 | DEFAULT_JVM_OPTS=""
32 |
33 | # Use the maximum available, or set MAX_FD != -1 to use that value.
34 | MAX_FD="maximum"
35 |
36 | warn () {
37 | echo "$*"
38 | }
39 |
40 | die () {
41 | echo
42 | echo "$*"
43 | echo
44 | exit 1
45 | }
46 |
47 | # OS specific support (must be 'true' or 'false').
48 | cygwin=false
49 | msys=false
50 | darwin=false
51 | nonstop=false
52 | case "`uname`" in
53 | CYGWIN* )
54 | cygwin=true
55 | ;;
56 | Darwin* )
57 | darwin=true
58 | ;;
59 | MINGW* )
60 | msys=true
61 | ;;
62 | NONSTOP* )
63 | nonstop=true
64 | ;;
65 | esac
66 |
67 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
68 |
69 | # Determine the Java command to use to start the JVM.
70 | if [ -n "$JAVA_HOME" ] ; then
71 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
72 | # IBM's JDK on AIX uses strange locations for the executables
73 | JAVACMD="$JAVA_HOME/jre/sh/java"
74 | else
75 | JAVACMD="$JAVA_HOME/bin/java"
76 | fi
77 | if [ ! -x "$JAVACMD" ] ; then
78 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
79 |
80 | Please set the JAVA_HOME variable in your environment to match the
81 | location of your Java installation."
82 | fi
83 | else
84 | JAVACMD="java"
85 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
86 |
87 | Please set the JAVA_HOME variable in your environment to match the
88 | location of your Java installation."
89 | fi
90 |
91 | # Increase the maximum file descriptors if we can.
92 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
93 | MAX_FD_LIMIT=`ulimit -H -n`
94 | if [ $? -eq 0 ] ; then
95 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
96 | MAX_FD="$MAX_FD_LIMIT"
97 | fi
98 | ulimit -n $MAX_FD
99 | if [ $? -ne 0 ] ; then
100 | warn "Could not set maximum file descriptor limit: $MAX_FD"
101 | fi
102 | else
103 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
104 | fi
105 | fi
106 |
107 | # For Darwin, add options to specify how the application appears in the dock
108 | if $darwin; then
109 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
110 | fi
111 |
112 | # For Cygwin, switch paths to Windows format before running java
113 | if $cygwin ; then
114 | APP_HOME=`cygpath --path --mixed "$APP_HOME"`
115 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
116 | JAVACMD=`cygpath --unix "$JAVACMD"`
117 |
118 | # We build the pattern for arguments to be converted via cygpath
119 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
120 | SEP=""
121 | for dir in $ROOTDIRSRAW ; do
122 | ROOTDIRS="$ROOTDIRS$SEP$dir"
123 | SEP="|"
124 | done
125 | OURCYGPATTERN="(^($ROOTDIRS))"
126 | # Add a user-defined pattern to the cygpath arguments
127 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then
128 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
129 | fi
130 | # Now convert the arguments - kludge to limit ourselves to /bin/sh
131 | i=0
132 | for arg in "$@" ; do
133 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
134 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
135 |
136 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
137 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
138 | else
139 | eval `echo args$i`="\"$arg\""
140 | fi
141 | i=$((i+1))
142 | done
143 | case $i in
144 | (0) set -- ;;
145 | (1) set -- "$args0" ;;
146 | (2) set -- "$args0" "$args1" ;;
147 | (3) set -- "$args0" "$args1" "$args2" ;;
148 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
149 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
150 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
151 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
152 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
153 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
154 | esac
155 | fi
156 |
157 | # Escape application args
158 | save () {
159 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
160 | echo " "
161 | }
162 | APP_ARGS=$(save "$@")
163 |
164 | # Collect all arguments for the java command, following the shell quoting and substitution rules
165 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
166 |
167 | # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
168 | if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
169 | cd "$(dirname "$0")"
170 | fi
171 |
172 | exec "$JAVACMD" "$@"
173 |
--------------------------------------------------------------------------------
/java/gradle/dse/gradlew.bat:
--------------------------------------------------------------------------------
1 | @if "%DEBUG%" == "" @echo off
2 | @rem ##########################################################################
3 | @rem
4 | @rem Gradle startup script for Windows
5 | @rem
6 | @rem ##########################################################################
7 |
8 | @rem Set local scope for the variables with windows NT shell
9 | if "%OS%"=="Windows_NT" setlocal
10 |
11 | set DIRNAME=%~dp0
12 | if "%DIRNAME%" == "" set DIRNAME=.
13 | set APP_BASE_NAME=%~n0
14 | set APP_HOME=%DIRNAME%
15 |
16 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
17 | set DEFAULT_JVM_OPTS=
18 |
19 | @rem Find java.exe
20 | if defined JAVA_HOME goto findJavaFromJavaHome
21 |
22 | set JAVA_EXE=java.exe
23 | %JAVA_EXE% -version >NUL 2>&1
24 | if "%ERRORLEVEL%" == "0" goto init
25 |
26 | echo.
27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
28 | echo.
29 | echo Please set the JAVA_HOME variable in your environment to match the
30 | echo location of your Java installation.
31 |
32 | goto fail
33 |
34 | :findJavaFromJavaHome
35 | set JAVA_HOME=%JAVA_HOME:"=%
36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
37 |
38 | if exist "%JAVA_EXE%" goto init
39 |
40 | echo.
41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
42 | echo.
43 | echo Please set the JAVA_HOME variable in your environment to match the
44 | echo location of your Java installation.
45 |
46 | goto fail
47 |
48 | :init
49 | @rem Get command-line arguments, handling Windows variants
50 |
51 | if not "%OS%" == "Windows_NT" goto win9xME_args
52 |
53 | :win9xME_args
54 | @rem Slurp the command line arguments.
55 | set CMD_LINE_ARGS=
56 | set _SKIP=2
57 |
58 | :win9xME_args_slurp
59 | if "x%~1" == "x" goto execute
60 |
61 | set CMD_LINE_ARGS=%*
62 |
63 | :execute
64 | @rem Setup the command line
65 |
66 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
67 |
68 | @rem Execute Gradle
69 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
70 |
71 | :end
72 | @rem End local scope for the variables with windows NT shell
73 | if "%ERRORLEVEL%"=="0" goto mainEnd
74 |
75 | :fail
76 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
77 | rem the _cmd.exe /c_ return code!
78 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
79 | exit /b 1
80 |
81 | :mainEnd
82 | if "%OS%"=="Windows_NT" endlocal
83 |
84 | :omega
85 |
--------------------------------------------------------------------------------
/java/gradle/dse/settings.gradle:
--------------------------------------------------------------------------------
1 | rootProject.name = 'writeRead'
--------------------------------------------------------------------------------
/java/gradle/dse/src/main:
--------------------------------------------------------------------------------
1 | ../../../sbt/dse/src/main/
--------------------------------------------------------------------------------
/java/gradle/oss/.gitignore:
--------------------------------------------------------------------------------
1 | build
2 | .idea
3 |
--------------------------------------------------------------------------------
/java/gradle/oss/build.gradle:
--------------------------------------------------------------------------------
1 | plugins {
2 | id "java"
3 | id "com.github.johnrengelman.shadow" version "1.2.3"
4 | }
5 |
6 | group 'com.datastax.spark.example'
7 | version '0.1'
8 |
9 | repositories {
10 | mavenCentral()
11 | }
12 |
13 | def sparkVersion = "2.2.2"
14 | def connectorVersion = "2.0.10"
15 |
16 | // The assembly configuration will cause jar to be included in assembled fat-jar
17 | configurations {
18 | assembly
19 | compile.extendsFrom assembly
20 | }
21 |
22 | // The provided configuration behaves the same as the sbt "provided" keyword which will cause jars to be
23 | // excluded from assembled fat-jar
24 | configurations {
25 | provided
26 | compile.extendsFrom provided
27 | }
28 |
29 | // Please make sure that following dependencies have versions corresponding to the ones in your cluster.
30 | // Note that spark-cassandra-connector should be provided with '--packages' flag to spark-submit command.
31 | dependencies {
32 | provided "org.apache.spark:spark-core_2.10:$sparkVersion"
33 | provided "org.apache.spark:spark-sql_2.10:$sparkVersion"
34 | provided "org.apache.spark:spark-hive_2.10:$sparkVersion"
35 | provided "com.datastax.spark:spark-cassandra-connector_2.10:$connectorVersion"
36 | // assembly "org.apache.commons:commons-math3:3.6.1"
37 | // assembly "org.apache.commons:commons-csv:1.0"
38 | }
39 |
40 | shadowJar {
41 | configurations = [project.configurations.assembly]
42 | }
43 |
44 | //shadowJar {
45 | // relocate 'org.apache.commons.csv', 'shaded.org.apache.commons.csv'
46 | //}
--------------------------------------------------------------------------------
/java/gradle/oss/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataStax-Examples/SparkBuildExamples/554374f755d1f1c381cdcf8ba5e287d7c3204a28/java/gradle/oss/gradle/wrapper/gradle-wrapper.jar
--------------------------------------------------------------------------------
/java/gradle/oss/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | distributionBase=GRADLE_USER_HOME
2 | distributionPath=wrapper/dists
3 | zipStoreBase=GRADLE_USER_HOME
4 | zipStorePath=wrapper/dists
5 | distributionUrl=https\://services.gradle.org/distributions/gradle-4.9-bin.zip
6 |
--------------------------------------------------------------------------------
/java/gradle/oss/gradlew:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 |
3 | ##############################################################################
4 | ##
5 | ## Gradle start up script for UN*X
6 | ##
7 | ##############################################################################
8 |
9 | # Attempt to set APP_HOME
10 | # Resolve links: $0 may be a link
11 | PRG="$0"
12 | # Need this for relative symlinks.
13 | while [ -h "$PRG" ] ; do
14 | ls=`ls -ld "$PRG"`
15 | link=`expr "$ls" : '.*-> \(.*\)$'`
16 | if expr "$link" : '/.*' > /dev/null; then
17 | PRG="$link"
18 | else
19 | PRG=`dirname "$PRG"`"/$link"
20 | fi
21 | done
22 | SAVED="`pwd`"
23 | cd "`dirname \"$PRG\"`/" >/dev/null
24 | APP_HOME="`pwd -P`"
25 | cd "$SAVED" >/dev/null
26 |
27 | APP_NAME="Gradle"
28 | APP_BASE_NAME=`basename "$0"`
29 |
30 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
31 | DEFAULT_JVM_OPTS=""
32 |
33 | # Use the maximum available, or set MAX_FD != -1 to use that value.
34 | MAX_FD="maximum"
35 |
36 | warn () {
37 | echo "$*"
38 | }
39 |
40 | die () {
41 | echo
42 | echo "$*"
43 | echo
44 | exit 1
45 | }
46 |
47 | # OS specific support (must be 'true' or 'false').
48 | cygwin=false
49 | msys=false
50 | darwin=false
51 | nonstop=false
52 | case "`uname`" in
53 | CYGWIN* )
54 | cygwin=true
55 | ;;
56 | Darwin* )
57 | darwin=true
58 | ;;
59 | MINGW* )
60 | msys=true
61 | ;;
62 | NONSTOP* )
63 | nonstop=true
64 | ;;
65 | esac
66 |
67 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
68 |
69 | # Determine the Java command to use to start the JVM.
70 | if [ -n "$JAVA_HOME" ] ; then
71 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
72 | # IBM's JDK on AIX uses strange locations for the executables
73 | JAVACMD="$JAVA_HOME/jre/sh/java"
74 | else
75 | JAVACMD="$JAVA_HOME/bin/java"
76 | fi
77 | if [ ! -x "$JAVACMD" ] ; then
78 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
79 |
80 | Please set the JAVA_HOME variable in your environment to match the
81 | location of your Java installation."
82 | fi
83 | else
84 | JAVACMD="java"
85 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
86 |
87 | Please set the JAVA_HOME variable in your environment to match the
88 | location of your Java installation."
89 | fi
90 |
91 | # Increase the maximum file descriptors if we can.
92 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
93 | MAX_FD_LIMIT=`ulimit -H -n`
94 | if [ $? -eq 0 ] ; then
95 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
96 | MAX_FD="$MAX_FD_LIMIT"
97 | fi
98 | ulimit -n $MAX_FD
99 | if [ $? -ne 0 ] ; then
100 | warn "Could not set maximum file descriptor limit: $MAX_FD"
101 | fi
102 | else
103 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
104 | fi
105 | fi
106 |
107 | # For Darwin, add options to specify how the application appears in the dock
108 | if $darwin; then
109 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
110 | fi
111 |
112 | # For Cygwin, switch paths to Windows format before running java
113 | if $cygwin ; then
114 | APP_HOME=`cygpath --path --mixed "$APP_HOME"`
115 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
116 | JAVACMD=`cygpath --unix "$JAVACMD"`
117 |
118 | # We build the pattern for arguments to be converted via cygpath
119 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
120 | SEP=""
121 | for dir in $ROOTDIRSRAW ; do
122 | ROOTDIRS="$ROOTDIRS$SEP$dir"
123 | SEP="|"
124 | done
125 | OURCYGPATTERN="(^($ROOTDIRS))"
126 | # Add a user-defined pattern to the cygpath arguments
127 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then
128 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
129 | fi
130 | # Now convert the arguments - kludge to limit ourselves to /bin/sh
131 | i=0
132 | for arg in "$@" ; do
133 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
134 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
135 |
136 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
137 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
138 | else
139 | eval `echo args$i`="\"$arg\""
140 | fi
141 | i=$((i+1))
142 | done
143 | case $i in
144 | (0) set -- ;;
145 | (1) set -- "$args0" ;;
146 | (2) set -- "$args0" "$args1" ;;
147 | (3) set -- "$args0" "$args1" "$args2" ;;
148 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
149 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
150 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
151 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
152 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
153 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
154 | esac
155 | fi
156 |
157 | # Escape application args
158 | save () {
159 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
160 | echo " "
161 | }
162 | APP_ARGS=$(save "$@")
163 |
164 | # Collect all arguments for the java command, following the shell quoting and substitution rules
165 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
166 |
167 | # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
168 | if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
169 | cd "$(dirname "$0")"
170 | fi
171 |
172 | exec "$JAVACMD" "$@"
173 |
--------------------------------------------------------------------------------
/java/gradle/oss/gradlew.bat:
--------------------------------------------------------------------------------
1 | @if "%DEBUG%" == "" @echo off
2 | @rem ##########################################################################
3 | @rem
4 | @rem Gradle startup script for Windows
5 | @rem
6 | @rem ##########################################################################
7 |
8 | @rem Set local scope for the variables with windows NT shell
9 | if "%OS%"=="Windows_NT" setlocal
10 |
11 | set DIRNAME=%~dp0
12 | if "%DIRNAME%" == "" set DIRNAME=.
13 | set APP_BASE_NAME=%~n0
14 | set APP_HOME=%DIRNAME%
15 |
16 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
17 | set DEFAULT_JVM_OPTS=
18 |
19 | @rem Find java.exe
20 | if defined JAVA_HOME goto findJavaFromJavaHome
21 |
22 | set JAVA_EXE=java.exe
23 | %JAVA_EXE% -version >NUL 2>&1
24 | if "%ERRORLEVEL%" == "0" goto init
25 |
26 | echo.
27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
28 | echo.
29 | echo Please set the JAVA_HOME variable in your environment to match the
30 | echo location of your Java installation.
31 |
32 | goto fail
33 |
34 | :findJavaFromJavaHome
35 | set JAVA_HOME=%JAVA_HOME:"=%
36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
37 |
38 | if exist "%JAVA_EXE%" goto init
39 |
40 | echo.
41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
42 | echo.
43 | echo Please set the JAVA_HOME variable in your environment to match the
44 | echo location of your Java installation.
45 |
46 | goto fail
47 |
48 | :init
49 | @rem Get command-line arguments, handling Windows variants
50 |
51 | if not "%OS%" == "Windows_NT" goto win9xME_args
52 |
53 | :win9xME_args
54 | @rem Slurp the command line arguments.
55 | set CMD_LINE_ARGS=
56 | set _SKIP=2
57 |
58 | :win9xME_args_slurp
59 | if "x%~1" == "x" goto execute
60 |
61 | set CMD_LINE_ARGS=%*
62 |
63 | :execute
64 | @rem Setup the command line
65 |
66 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
67 |
68 | @rem Execute Gradle
69 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
70 |
71 | :end
72 | @rem End local scope for the variables with windows NT shell
73 | if "%ERRORLEVEL%"=="0" goto mainEnd
74 |
75 | :fail
76 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
77 | rem the _cmd.exe /c_ return code!
78 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
79 | exit /b 1
80 |
81 | :mainEnd
82 | if "%OS%"=="Windows_NT" endlocal
83 |
84 | :omega
85 |
--------------------------------------------------------------------------------
/java/gradle/oss/settings.gradle:
--------------------------------------------------------------------------------
1 | rootProject.name = 'writeRead'
--------------------------------------------------------------------------------
/java/gradle/oss/src/main:
--------------------------------------------------------------------------------
1 | ../../../sbt/dse/src/main/
--------------------------------------------------------------------------------
/java/maven/dse/.gitignore:
--------------------------------------------------------------------------------
1 | *.iml
2 | .idea/
3 | target
4 |
--------------------------------------------------------------------------------
/java/maven/dse/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |     <modelVersion>4.0.0</modelVersion>
6 |
7 |     <groupId>com.datastax.spark.example</groupId>
8 |     <artifactId>writeRead</artifactId>
9 |     <version>0.1</version>
10 |     <packaging>jar</packaging>
11 |
12 |     <properties>
13 |         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
14 |         <dse.version>6.8.35</dse.version>
15 |     </properties>
16 |
17 |     <!-- Please make sure that the following DSE version matches your DSE cluster version. -->
18 |     <dependencies>
19 |         <dependency>
20 |             <groupId>com.datastax.dse</groupId>
21 |             <artifactId>dse-spark-dependencies</artifactId>
22 |             <version>${dse.version}</version>
23 |             <scope>provided</scope>
24 |         </dependency>
25 |         <!-- Your dependencies, e.g.
26 |         <dependency>
27 |             <groupId>org.apache.commons</groupId>
28 |             <artifactId>commons-csv</artifactId>
29 |             <version>1.0</version>
30 |         </dependency>
31 |         -->
32 |     </dependencies>
33 |
34 |     <repositories>
35 |         <repository>
36 |             <id>DataStax-Repo</id>
37 |             <url>https://repo.datastax.com/public-repos/</url>
38 |         </repository>
39 |     </repositories>
40 |
41 |     <build>
42 |         <plugins>
43 |             <plugin>
44 |                 <groupId>org.apache.maven.plugins</groupId>
45 |                 <artifactId>maven-compiler-plugin</artifactId>
46 |                 <version>3.5.1</version>
47 |                 <configuration>
48 |                     <source>1.8</source>
49 |                     <target>1.8</target>
50 |                 </configuration>
51 |             </plugin>
52 |             <plugin>
53 |                 <groupId>org.apache.maven.plugins</groupId>
54 |                 <artifactId>maven-shade-plugin</artifactId>
55 |                 <version>2.4.3</version>
56 |                 <executions>
57 |                     <execution>
58 |                         <phase>package</phase>
59 |                         <goals>
60 |                             <goal>shade</goal>
61 |                         </goals>
62 |                     </execution>
63 |                 </executions>
64 |             </plugin>
65 |         </plugins>
66 |     </build>
67 | </project>
--------------------------------------------------------------------------------
/java/maven/dse/src/main:
--------------------------------------------------------------------------------
1 | ../../../sbt/dse/src/main/
--------------------------------------------------------------------------------
/java/maven/oss/.gitignore:
--------------------------------------------------------------------------------
1 | *.iml
2 | .idea/
3 | target
4 |
--------------------------------------------------------------------------------
/java/maven/oss/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |     <modelVersion>4.0.0</modelVersion>
6 |
7 |     <groupId>com.datastax.spark.example</groupId>
8 |     <artifactId>writeRead</artifactId>
9 |     <version>0.1</version>
10 |     <packaging>jar</packaging>
11 |
12 |     <properties>
13 |         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
14 |         <spark.version>2.2.2</spark.version>
15 |         <connector.version>2.0.10</connector.version>
16 |     </properties>
17 |
18 |     <!-- Please make sure that following dependencies have versions corresponding to the ones in your cluster.
19 |          Note that spark-cassandra-connector should be provided with '--packages' flag to spark-submit command. -->
20 |     <dependencies>
21 |         <dependency>
22 |             <groupId>org.apache.spark</groupId>
23 |             <artifactId>spark-core_2.10</artifactId>
24 |             <version>${spark.version}</version>
25 |             <scope>provided</scope>
26 |         </dependency>
27 |         <dependency>
28 |             <groupId>org.apache.spark</groupId>
29 |             <artifactId>spark-sql_2.10</artifactId>
30 |             <version>${spark.version}</version>
31 |             <scope>provided</scope>
32 |         </dependency>
33 |         <dependency>
34 |             <groupId>org.apache.spark</groupId>
35 |             <artifactId>spark-hive_2.10</artifactId>
36 |             <version>${spark.version}</version>
37 |             <scope>provided</scope>
38 |         </dependency>
39 |         <dependency>
40 |             <groupId>com.datastax.spark</groupId>
41 |             <artifactId>spark-cassandra-connector_2.10</artifactId>
42 |             <version>${connector.version}</version>
43 |             <scope>provided</scope>
44 |         </dependency>
45 |         <!-- Your dependencies, e.g.
46 |         <dependency>
47 |             <groupId>org.apache.commons</groupId>
48 |             <artifactId>commons-csv</artifactId>
49 |             <version>1.0</version>
50 |         </dependency>
51 |         -->
52 |     </dependencies>
53 |
54 |     <build>
55 |         <plugins>
56 |             <plugin>
57 |                 <groupId>org.apache.maven.plugins</groupId>
58 |                 <artifactId>maven-compiler-plugin</artifactId>
59 |                 <version>3.5.1</version>
60 |                 <configuration>
61 |                     <source>1.8</source>
62 |                     <target>1.8</target>
63 |                 </configuration>
64 |             </plugin>
65 |             <plugin>
66 |                 <groupId>org.apache.maven.plugins</groupId>
67 |                 <artifactId>maven-shade-plugin</artifactId>
68 |                 <version>2.4.3</version>
69 |                 <executions>
70 |                     <execution>
71 |                         <phase>package</phase>
72 |                         <goals>
73 |                             <goal>shade</goal>
74 |                         </goals>
75 |                     </execution>
76 |                 </executions>
77 |             </plugin>
78 |         </plugins>
79 |     </build>
80 | </project>
--------------------------------------------------------------------------------
/java/maven/oss/src/main:
--------------------------------------------------------------------------------
1 | ../../../sbt/dse/src/main/
--------------------------------------------------------------------------------
/java/sbt/dse/.gitignore:
--------------------------------------------------------------------------------
1 | *.iml
2 | .idea/
3 | project/target
4 | target
5 |
--------------------------------------------------------------------------------
/java/sbt/dse/build.sbt:
--------------------------------------------------------------------------------
1 | name := "writeRead"
2 | version := "0.1"
3 |
4 | crossPaths := false
5 |
6 | autoScalaLibrary := false
7 |
8 | scalaVersion := "2.11.8"
9 |
10 | resolvers += Resolver.mavenLocal // for testing
11 | resolvers += "DataStax Repo" at "https://repo.datastax.com/public-repos/"
12 |
13 | val dseVersion = "6.8.35"
14 |
15 | // Please make sure that following DSE version matches your DSE cluster version.
16 | // SBT 0.13.13 or greater required because of a dependency resolution bug
17 | libraryDependencies += "com.datastax.dse" % "dse-spark-dependencies" % dseVersion % "provided"
18 |
19 | //Your dependencies
20 | //libraryDependencies += "org.apache.commons" % "commons-math3" % "3.6.1"
21 | //libraryDependencies += "org.apache.commons" % "commons-csv" % "1.0"
22 |
23 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false)
24 | //assemblyShadeRules in assembly := Seq(
25 | // ShadeRule.rename("org.apache.commons.csv.**" -> "shaded.org.apache.commons.csv.@1").inAll
26 | //)
27 |
--------------------------------------------------------------------------------
/java/sbt/dse/project/assembly.sbt:
--------------------------------------------------------------------------------
1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5")
2 |
--------------------------------------------------------------------------------
/java/sbt/dse/src/main/java/com/datastax/spark/example/WriteRead.java:
--------------------------------------------------------------------------------
1 | package com.datastax.spark.example;
2 |
3 | import com.datastax.driver.core.Session;
4 | import com.datastax.spark.connector.cql.CassandraConnector;
5 | import com.datastax.spark.connector.japi.rdd.CassandraTableScanJavaRDD;
6 | import com.google.common.collect.ImmutableMap;
7 | import org.apache.spark.api.java.JavaRDD;
8 | import org.apache.spark.api.java.function.MapFunction;
9 | import org.apache.spark.rdd.RDD;
10 | import org.apache.spark.sql.Dataset;
11 | import org.apache.spark.sql.Encoders;
12 | import org.apache.spark.sql.Row;
13 | import org.apache.spark.sql.SparkSession;
14 | import scala.Tuple2;
15 | import scala.runtime.AbstractFunction1;
16 |
17 | import java.util.List;
18 | import java.util.stream.Collectors;
19 | import java.util.stream.IntStream;
20 |
21 | import static com.datastax.spark.connector.japi.CassandraJavaUtil.*;
22 |
23 |
24 | // For DSE it is not necessary to set connection parameters for spark.master (since it will be done
25 | // automatically)
26 | public class WriteRead {
27 | public static void main(String[] args) {
28 |
29 | // A SparkSession
30 | SparkSession spark = SparkSession
31 | .builder()
32 | .appName("Datastax Java example")
33 | .getOrCreate();
34 |
35 | CassandraConnector.apply(spark.sparkContext()).withSessionDo(
36 |             new AbstractFunction1<Session, Object>() {
37 | public Object apply(Session session) {
38 | session.execute("CREATE KEYSPACE IF NOT EXISTS ks WITH "
39 | + "replication = {'class': 'SimpleStrategy', 'replication_factor': 1 }");
40 | return session
41 | .execute("CREATE TABLE IF NOT EXISTS ks.kv (k int, v int, PRIMARY KEY (k))");
42 | }
43 | });
44 |
45 |         JavaRDD<Tuple2<Integer, Integer>> data = spark
46 | .range(1, 10)
47 | .javaRDD()
48 | .map(x -> new Tuple2<>(x.intValue(), x.intValue()));
49 |
50 | javaFunctions(data)
51 | .writerBuilder("ks", "kv", mapTupleToRow(Integer.class, Integer.class))
52 | .saveToCassandra();
53 |
54 | // Read data as RDD
55 |         JavaRDD<Tuple2<Integer, Integer>> scReadData = javaFunctions(spark.sparkContext())
56 | .cassandraTable("ks", "kv", mapRowToTuple(Integer.class, Integer.class));
57 |
58 | // Read data as DataSet (DataFrame)
59 |         Dataset<Row> dataset = spark
60 | .read()
61 | .format("org.apache.spark.sql.cassandra")
62 | .options(ImmutableMap.of("table", "kv", "keyspace", "ks"))
63 | .load();
64 |
65 | System.out.println("Data read as RDD");
66 | scReadData.collect()
67 | .forEach(System.out::println);
68 |
69 | System.out.println("Data read as DataSet (DataFrame)");
70 | dataset
71 | .javaRDD()
72 | .map(row -> new Tuple2<>(row.getInt(0), row.getInt(1)))
73 | .collect()
74 | .forEach(System.out::println);
75 |
76 | spark.stop();
77 | System.exit(0);
78 | }
79 | }
80 |
81 |
--------------------------------------------------------------------------------
/java/sbt/oss/.gitignore:
--------------------------------------------------------------------------------
1 | *.iml
2 | .idea/
3 | project/target
4 | target
5 |
--------------------------------------------------------------------------------
/java/sbt/oss/build.sbt:
--------------------------------------------------------------------------------
1 | name := "writeRead"
2 | version := "0.1"
3 |
4 | crossPaths := false
5 |
6 | autoScalaLibrary := false
7 |
8 | scalaVersion := "2.11.8"
9 |
10 | val sparkVersion = "2.2.2"
11 | val connectorVersion = "2.0.10"
12 |
13 | // Please make sure that following dependencies have versions corresponding to the ones in your cluster.
14 | // Note that spark-cassandra-connector should be provided with '--packages' flag to spark-submit command.
15 | libraryDependencies ++= Seq(
16 | "org.apache.spark" %% "spark-core" % sparkVersion % "provided",
17 | "org.apache.spark" %% "spark-sql" % sparkVersion % "provided",
18 | "org.apache.spark" %% "spark-hive" % sparkVersion % "provided",
19 | "com.datastax.spark" %% "spark-cassandra-connector" % connectorVersion % "provided"
20 | )
21 |
22 | //Your dependencies
23 | //libraryDependencies += "org.apache.commons" % "commons-math3" % "3.6.1"
24 | //libraryDependencies += "org.apache.commons" % "commons-csv" % "1.0"
25 |
26 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false)
27 | //assemblyShadeRules in assembly := Seq(
28 | // ShadeRule.rename("org.apache.commons.csv.**" -> "shaded.org.apache.commons.csv.@1").inAll
29 | //)
30 |
--------------------------------------------------------------------------------
/java/sbt/oss/project/assembly.sbt:
--------------------------------------------------------------------------------
1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5")
2 |
--------------------------------------------------------------------------------
/java/sbt/oss/src/main:
--------------------------------------------------------------------------------
1 | ../../../sbt/dse/src/main/
--------------------------------------------------------------------------------
/scala/gradle/dse/.gitignore:
--------------------------------------------------------------------------------
1 | build
2 | .idea
3 |
--------------------------------------------------------------------------------
/scala/gradle/dse/build.gradle:
--------------------------------------------------------------------------------
1 | plugins {
2 | id "scala"
3 | id "com.github.johnrengelman.shadow" version "1.2.3"
4 | }
5 |
6 | group 'com.datastax.spark.example'
7 | version '0.1'
8 |
9 | repositories {
10 | mavenLocal() // for testing
11 | mavenCentral()
12 | maven {
13 | url "https://repo.datastax.com/public-repos/"
14 | }
15 | }
16 |
17 | // The assembly configuration will cause jar to be included in assembled fat-jar
18 | configurations {
19 | assembly
20 | compile.extendsFrom assembly
21 | }
22 |
23 | // The provided configuration behaves the same as the sbt "provided" keyword which will cause jars to be
24 | // excluded from assembled fat-jar
25 | configurations {
26 | provided
27 | compile.extendsFrom provided
28 | testCompile.exclude group: 'org.slf4j', module: 'log4j-over-slf4j'
29 | }
30 |
31 | def dseVersion = "6.8.35"
32 |
33 | // Please make sure that following DSE version matches your DSE cluster version.
34 | dependencies {
35 | provided("com.datastax.dse:dse-spark-dependencies:$dseVersion")
36 | // assembly "org.apache.commons:commons-math3:3.6.1"
37 | // assembly "org.apache.commons:commons-csv:1.0"
38 | }
39 |
40 | shadowJar {
41 | configurations = [project.configurations.assembly]
42 | }
43 |
44 | //shadowJar {
45 | // relocate 'org.apache.commons.csv', 'shaded.org.apache.commons.csv'
46 | //}
47 |
--------------------------------------------------------------------------------
/scala/gradle/dse/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataStax-Examples/SparkBuildExamples/554374f755d1f1c381cdcf8ba5e287d7c3204a28/scala/gradle/dse/gradle/wrapper/gradle-wrapper.jar
--------------------------------------------------------------------------------
/scala/gradle/dse/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | distributionBase=GRADLE_USER_HOME
2 | distributionPath=wrapper/dists
3 | zipStoreBase=GRADLE_USER_HOME
4 | zipStorePath=wrapper/dists
5 | distributionUrl=https\://services.gradle.org/distributions/gradle-4.9-bin.zip
6 |
--------------------------------------------------------------------------------
/scala/gradle/dse/gradlew:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 |
3 | ##############################################################################
4 | ##
5 | ## Gradle start up script for UN*X
6 | ##
7 | ##############################################################################
8 |
9 | # Attempt to set APP_HOME
10 | # Resolve links: $0 may be a link
11 | PRG="$0"
12 | # Need this for relative symlinks.
13 | while [ -h "$PRG" ] ; do
14 | ls=`ls -ld "$PRG"`
15 | link=`expr "$ls" : '.*-> \(.*\)$'`
16 | if expr "$link" : '/.*' > /dev/null; then
17 | PRG="$link"
18 | else
19 | PRG=`dirname "$PRG"`"/$link"
20 | fi
21 | done
22 | SAVED="`pwd`"
23 | cd "`dirname \"$PRG\"`/" >/dev/null
24 | APP_HOME="`pwd -P`"
25 | cd "$SAVED" >/dev/null
26 |
27 | APP_NAME="Gradle"
28 | APP_BASE_NAME=`basename "$0"`
29 |
30 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
31 | DEFAULT_JVM_OPTS=""
32 |
33 | # Use the maximum available, or set MAX_FD != -1 to use that value.
34 | MAX_FD="maximum"
35 |
36 | warn () {
37 | echo "$*"
38 | }
39 |
40 | die () {
41 | echo
42 | echo "$*"
43 | echo
44 | exit 1
45 | }
46 |
47 | # OS specific support (must be 'true' or 'false').
48 | cygwin=false
49 | msys=false
50 | darwin=false
51 | nonstop=false
52 | case "`uname`" in
53 | CYGWIN* )
54 | cygwin=true
55 | ;;
56 | Darwin* )
57 | darwin=true
58 | ;;
59 | MINGW* )
60 | msys=true
61 | ;;
62 | NONSTOP* )
63 | nonstop=true
64 | ;;
65 | esac
66 |
67 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
68 |
69 | # Determine the Java command to use to start the JVM.
70 | if [ -n "$JAVA_HOME" ] ; then
71 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
72 | # IBM's JDK on AIX uses strange locations for the executables
73 | JAVACMD="$JAVA_HOME/jre/sh/java"
74 | else
75 | JAVACMD="$JAVA_HOME/bin/java"
76 | fi
77 | if [ ! -x "$JAVACMD" ] ; then
78 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
79 |
80 | Please set the JAVA_HOME variable in your environment to match the
81 | location of your Java installation."
82 | fi
83 | else
84 | JAVACMD="java"
85 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
86 |
87 | Please set the JAVA_HOME variable in your environment to match the
88 | location of your Java installation."
89 | fi
90 |
91 | # Increase the maximum file descriptors if we can.
92 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
93 | MAX_FD_LIMIT=`ulimit -H -n`
94 | if [ $? -eq 0 ] ; then
95 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
96 | MAX_FD="$MAX_FD_LIMIT"
97 | fi
98 | ulimit -n $MAX_FD
99 | if [ $? -ne 0 ] ; then
100 | warn "Could not set maximum file descriptor limit: $MAX_FD"
101 | fi
102 | else
103 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
104 | fi
105 | fi
106 |
107 | # For Darwin, add options to specify how the application appears in the dock
108 | if $darwin; then
109 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
110 | fi
111 |
112 | # For Cygwin, switch paths to Windows format before running java
113 | if $cygwin ; then
114 | APP_HOME=`cygpath --path --mixed "$APP_HOME"`
115 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
116 | JAVACMD=`cygpath --unix "$JAVACMD"`
117 |
118 | # We build the pattern for arguments to be converted via cygpath
119 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
120 | SEP=""
121 | for dir in $ROOTDIRSRAW ; do
122 | ROOTDIRS="$ROOTDIRS$SEP$dir"
123 | SEP="|"
124 | done
125 | OURCYGPATTERN="(^($ROOTDIRS))"
126 | # Add a user-defined pattern to the cygpath arguments
127 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then
128 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
129 | fi
130 | # Now convert the arguments - kludge to limit ourselves to /bin/sh
131 | i=0
132 | for arg in "$@" ; do
133 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
134 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
135 |
136 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
137 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
138 | else
139 | eval `echo args$i`="\"$arg\""
140 | fi
141 | i=$((i+1))
142 | done
143 | case $i in
144 | (0) set -- ;;
145 | (1) set -- "$args0" ;;
146 | (2) set -- "$args0" "$args1" ;;
147 | (3) set -- "$args0" "$args1" "$args2" ;;
148 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
149 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
150 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
151 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
152 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
153 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
154 | esac
155 | fi
156 |
157 | # Escape application args
158 | save () {
159 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
160 | echo " "
161 | }
162 | APP_ARGS=$(save "$@")
163 |
164 | # Collect all arguments for the java command, following the shell quoting and substitution rules
165 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
166 |
167 | # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
168 | if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
169 | cd "$(dirname "$0")"
170 | fi
171 |
172 | exec "$JAVACMD" "$@"
173 |
--------------------------------------------------------------------------------
/scala/gradle/dse/gradlew.bat:
--------------------------------------------------------------------------------
1 | @if "%DEBUG%" == "" @echo off
2 | @rem ##########################################################################
3 | @rem
4 | @rem Gradle startup script for Windows
5 | @rem
6 | @rem ##########################################################################
7 |
8 | @rem Set local scope for the variables with windows NT shell
9 | if "%OS%"=="Windows_NT" setlocal
10 |
11 | set DIRNAME=%~dp0
12 | if "%DIRNAME%" == "" set DIRNAME=.
13 | set APP_BASE_NAME=%~n0
14 | set APP_HOME=%DIRNAME%
15 |
16 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
17 | set DEFAULT_JVM_OPTS=
18 |
19 | @rem Find java.exe
20 | if defined JAVA_HOME goto findJavaFromJavaHome
21 |
22 | set JAVA_EXE=java.exe
23 | %JAVA_EXE% -version >NUL 2>&1
24 | if "%ERRORLEVEL%" == "0" goto init
25 |
26 | echo.
27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
28 | echo.
29 | echo Please set the JAVA_HOME variable in your environment to match the
30 | echo location of your Java installation.
31 |
32 | goto fail
33 |
34 | :findJavaFromJavaHome
35 | set JAVA_HOME=%JAVA_HOME:"=%
36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
37 |
38 | if exist "%JAVA_EXE%" goto init
39 |
40 | echo.
41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
42 | echo.
43 | echo Please set the JAVA_HOME variable in your environment to match the
44 | echo location of your Java installation.
45 |
46 | goto fail
47 |
48 | :init
49 | @rem Get command-line arguments, handling Windows variants
50 |
51 | if not "%OS%" == "Windows_NT" goto win9xME_args
52 |
53 | :win9xME_args
54 | @rem Slurp the command line arguments.
55 | set CMD_LINE_ARGS=
56 | set _SKIP=2
57 |
58 | :win9xME_args_slurp
59 | if "x%~1" == "x" goto execute
60 |
61 | set CMD_LINE_ARGS=%*
62 |
63 | :execute
64 | @rem Setup the command line
65 |
66 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
67 |
68 | @rem Execute Gradle
69 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
70 |
71 | :end
72 | @rem End local scope for the variables with windows NT shell
73 | if "%ERRORLEVEL%"=="0" goto mainEnd
74 |
75 | :fail
76 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
77 | rem the _cmd.exe /c_ return code!
78 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
79 | exit /b 1
80 |
81 | :mainEnd
82 | if "%OS%"=="Windows_NT" endlocal
83 |
84 | :omega
85 |
--------------------------------------------------------------------------------
/scala/gradle/dse/settings.gradle:
--------------------------------------------------------------------------------
1 | rootProject.name = 'writeRead'
--------------------------------------------------------------------------------
/scala/gradle/dse/src/main:
--------------------------------------------------------------------------------
1 | ../../../sbt/dse/src/main/
--------------------------------------------------------------------------------
/scala/gradle/dse/src/test:
--------------------------------------------------------------------------------
1 | ../../../sbt/dse/src/test
--------------------------------------------------------------------------------
/scala/gradle/oss/.gitignore:
--------------------------------------------------------------------------------
1 | build
2 | .idea
3 |
--------------------------------------------------------------------------------
/scala/gradle/oss/build.gradle:
--------------------------------------------------------------------------------
1 | plugins {
2 | id "scala"
3 | id "com.github.johnrengelman.shadow" version "1.2.3"
4 | }
5 |
6 | group 'com.datastax.spark.example'
7 | version '0.1'
8 |
9 | repositories {
10 | mavenCentral()
11 | }
12 |
13 | def sparkVersion = "2.2.2"
14 | def connectorVersion = "2.0.10"
15 | def scalaVersion = "2.11"
16 |
17 | // The assembly configuration causes its jars to be included in the assembled fat jar
18 | configurations {
19 | assembly
20 | compile.extendsFrom assembly
21 | }
22 |
23 | // The provided configuration behaves the same as the sbt "provided" keyword, which causes jars to be
24 | // excluded from the assembled fat jar
25 | configurations {
26 | provided
27 | compile.extendsFrom provided
28 | }
29 |
30 | // Please make sure that the following dependencies have versions corresponding to the ones in your cluster.
31 | // Note that spark-cassandra-connector should be supplied to the spark-submit command with the '--packages' flag.
32 | dependencies {
33 | provided "org.apache.spark:spark-core_$scalaVersion:$sparkVersion"
34 | provided "org.apache.spark:spark-sql_$scalaVersion:$sparkVersion"
35 | provided "org.apache.spark:spark-hive_$scalaVersion:$sparkVersion"
36 | provided "com.datastax.spark:spark-cassandra-connector_$scalaVersion:$connectorVersion"
37 | // assembly "org.apache.commons:commons-math3:3.6.1"
38 | // assembly "org.apache.commons:commons-csv:1.0"
39 | }
40 |
41 | shadowJar {
42 | configurations = [project.configurations.assembly]
43 | }
44 |
45 | //shadowJar {
46 | // relocate 'org.apache.commons.csv', 'shaded.org.apache.commons.csv'
47 | //}
--------------------------------------------------------------------------------
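For reference, a minimal sketch (not part of the repository) of how the OSS fat jar built by this file might be run. The jar name assumes the shadowJar defaults for this project (rootProject.name 'writeRead', version '0.1'); the connector is passed with the '--packages' flag as the comment above suggests, and the host and versions are placeholders to adjust for your cluster.

    # build the fat jar, then submit it to a Spark cluster backed by Cassandra
    ./gradlew shadowJar
    spark-submit \
      --class com.datastax.spark.example.WriteRead \
      --packages com.datastax.spark:spark-cassandra-connector_2.11:2.0.10 \
      --conf spark.cassandra.connection.host=127.0.0.1 \
      build/libs/writeRead-0.1-all.jar
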
/scala/gradle/oss/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataStax-Examples/SparkBuildExamples/554374f755d1f1c381cdcf8ba5e287d7c3204a28/scala/gradle/oss/gradle/wrapper/gradle-wrapper.jar
--------------------------------------------------------------------------------
/scala/gradle/oss/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | distributionBase=GRADLE_USER_HOME
2 | distributionPath=wrapper/dists
3 | zipStoreBase=GRADLE_USER_HOME
4 | zipStorePath=wrapper/dists
5 | distributionUrl=https\://services.gradle.org/distributions/gradle-4.9-bin.zip
6 |
--------------------------------------------------------------------------------
/scala/gradle/oss/gradlew:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 |
3 | ##############################################################################
4 | ##
5 | ## Gradle start up script for UN*X
6 | ##
7 | ##############################################################################
8 |
9 | # Attempt to set APP_HOME
10 | # Resolve links: $0 may be a link
11 | PRG="$0"
12 | # Need this for relative symlinks.
13 | while [ -h "$PRG" ] ; do
14 | ls=`ls -ld "$PRG"`
15 | link=`expr "$ls" : '.*-> \(.*\)$'`
16 | if expr "$link" : '/.*' > /dev/null; then
17 | PRG="$link"
18 | else
19 | PRG=`dirname "$PRG"`"/$link"
20 | fi
21 | done
22 | SAVED="`pwd`"
23 | cd "`dirname \"$PRG\"`/" >/dev/null
24 | APP_HOME="`pwd -P`"
25 | cd "$SAVED" >/dev/null
26 |
27 | APP_NAME="Gradle"
28 | APP_BASE_NAME=`basename "$0"`
29 |
30 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
31 | DEFAULT_JVM_OPTS=""
32 |
33 | # Use the maximum available, or set MAX_FD != -1 to use that value.
34 | MAX_FD="maximum"
35 |
36 | warn () {
37 | echo "$*"
38 | }
39 |
40 | die () {
41 | echo
42 | echo "$*"
43 | echo
44 | exit 1
45 | }
46 |
47 | # OS specific support (must be 'true' or 'false').
48 | cygwin=false
49 | msys=false
50 | darwin=false
51 | nonstop=false
52 | case "`uname`" in
53 | CYGWIN* )
54 | cygwin=true
55 | ;;
56 | Darwin* )
57 | darwin=true
58 | ;;
59 | MINGW* )
60 | msys=true
61 | ;;
62 | NONSTOP* )
63 | nonstop=true
64 | ;;
65 | esac
66 |
67 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
68 |
69 | # Determine the Java command to use to start the JVM.
70 | if [ -n "$JAVA_HOME" ] ; then
71 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
72 | # IBM's JDK on AIX uses strange locations for the executables
73 | JAVACMD="$JAVA_HOME/jre/sh/java"
74 | else
75 | JAVACMD="$JAVA_HOME/bin/java"
76 | fi
77 | if [ ! -x "$JAVACMD" ] ; then
78 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
79 |
80 | Please set the JAVA_HOME variable in your environment to match the
81 | location of your Java installation."
82 | fi
83 | else
84 | JAVACMD="java"
85 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
86 |
87 | Please set the JAVA_HOME variable in your environment to match the
88 | location of your Java installation."
89 | fi
90 |
91 | # Increase the maximum file descriptors if we can.
92 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
93 | MAX_FD_LIMIT=`ulimit -H -n`
94 | if [ $? -eq 0 ] ; then
95 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
96 | MAX_FD="$MAX_FD_LIMIT"
97 | fi
98 | ulimit -n $MAX_FD
99 | if [ $? -ne 0 ] ; then
100 | warn "Could not set maximum file descriptor limit: $MAX_FD"
101 | fi
102 | else
103 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
104 | fi
105 | fi
106 |
107 | # For Darwin, add options to specify how the application appears in the dock
108 | if $darwin; then
109 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
110 | fi
111 |
112 | # For Cygwin, switch paths to Windows format before running java
113 | if $cygwin ; then
114 | APP_HOME=`cygpath --path --mixed "$APP_HOME"`
115 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
116 | JAVACMD=`cygpath --unix "$JAVACMD"`
117 |
118 | # We build the pattern for arguments to be converted via cygpath
119 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
120 | SEP=""
121 | for dir in $ROOTDIRSRAW ; do
122 | ROOTDIRS="$ROOTDIRS$SEP$dir"
123 | SEP="|"
124 | done
125 | OURCYGPATTERN="(^($ROOTDIRS))"
126 | # Add a user-defined pattern to the cygpath arguments
127 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then
128 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
129 | fi
130 | # Now convert the arguments - kludge to limit ourselves to /bin/sh
131 | i=0
132 | for arg in "$@" ; do
133 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
134 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
135 |
136 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
137 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
138 | else
139 | eval `echo args$i`="\"$arg\""
140 | fi
141 | i=$((i+1))
142 | done
143 | case $i in
144 | (0) set -- ;;
145 | (1) set -- "$args0" ;;
146 | (2) set -- "$args0" "$args1" ;;
147 | (3) set -- "$args0" "$args1" "$args2" ;;
148 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
149 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
150 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
151 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
152 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
153 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
154 | esac
155 | fi
156 |
157 | # Escape application args
158 | save () {
159 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
160 | echo " "
161 | }
162 | APP_ARGS=$(save "$@")
163 |
164 | # Collect all arguments for the java command, following the shell quoting and substitution rules
165 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
166 |
167 | # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
168 | if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
169 | cd "$(dirname "$0")"
170 | fi
171 |
172 | exec "$JAVACMD" "$@"
173 |
--------------------------------------------------------------------------------
/scala/gradle/oss/gradlew.bat:
--------------------------------------------------------------------------------
1 | @if "%DEBUG%" == "" @echo off
2 | @rem ##########################################################################
3 | @rem
4 | @rem Gradle startup script for Windows
5 | @rem
6 | @rem ##########################################################################
7 |
8 | @rem Set local scope for the variables with windows NT shell
9 | if "%OS%"=="Windows_NT" setlocal
10 |
11 | set DIRNAME=%~dp0
12 | if "%DIRNAME%" == "" set DIRNAME=.
13 | set APP_BASE_NAME=%~n0
14 | set APP_HOME=%DIRNAME%
15 |
16 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
17 | set DEFAULT_JVM_OPTS=
18 |
19 | @rem Find java.exe
20 | if defined JAVA_HOME goto findJavaFromJavaHome
21 |
22 | set JAVA_EXE=java.exe
23 | %JAVA_EXE% -version >NUL 2>&1
24 | if "%ERRORLEVEL%" == "0" goto init
25 |
26 | echo.
27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
28 | echo.
29 | echo Please set the JAVA_HOME variable in your environment to match the
30 | echo location of your Java installation.
31 |
32 | goto fail
33 |
34 | :findJavaFromJavaHome
35 | set JAVA_HOME=%JAVA_HOME:"=%
36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
37 |
38 | if exist "%JAVA_EXE%" goto init
39 |
40 | echo.
41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
42 | echo.
43 | echo Please set the JAVA_HOME variable in your environment to match the
44 | echo location of your Java installation.
45 |
46 | goto fail
47 |
48 | :init
49 | @rem Get command-line arguments, handling Windows variants
50 |
51 | if not "%OS%" == "Windows_NT" goto win9xME_args
52 |
53 | :win9xME_args
54 | @rem Slurp the command line arguments.
55 | set CMD_LINE_ARGS=
56 | set _SKIP=2
57 |
58 | :win9xME_args_slurp
59 | if "x%~1" == "x" goto execute
60 |
61 | set CMD_LINE_ARGS=%*
62 |
63 | :execute
64 | @rem Setup the command line
65 |
66 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
67 |
68 | @rem Execute Gradle
69 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
70 |
71 | :end
72 | @rem End local scope for the variables with windows NT shell
73 | if "%ERRORLEVEL%"=="0" goto mainEnd
74 |
75 | :fail
76 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
77 | rem the _cmd.exe /c_ return code!
78 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
79 | exit /b 1
80 |
81 | :mainEnd
82 | if "%OS%"=="Windows_NT" endlocal
83 |
84 | :omega
85 |
--------------------------------------------------------------------------------
/scala/gradle/oss/settings.gradle:
--------------------------------------------------------------------------------
1 | rootProject.name = 'writeRead'
--------------------------------------------------------------------------------
/scala/gradle/oss/src/main:
--------------------------------------------------------------------------------
1 | ../../../sbt/dse/src/main/
--------------------------------------------------------------------------------
/scala/gradle/oss/src/test:
--------------------------------------------------------------------------------
1 | ../../../sbt/dse/src/test
--------------------------------------------------------------------------------
/scala/maven/dse/.gitignore:
--------------------------------------------------------------------------------
1 | *.iml
2 | .idea/
3 | target
4 |
--------------------------------------------------------------------------------
/scala/maven/dse/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | com.datastax.spark.example
6 | writeRead
7 | 0.1
8 | jar
9 |
10 |
11 | UTF-8
12 | 6.8.35
13 | 2.11.8
14 | 2.11
15 | 3.0.0
16 | 2.0.10
17 | 3.2
18 | 4.12
19 |
20 |
21 |
22 |
23 | com.datastax.dse
24 | dse-spark-dependencies
25 | ${dse.version}
26 | provided
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 | DataStax-Repo
44 | https://repo.datastax.com/public-repos/
45 |
46 |
47 |
48 |
49 |
50 |
51 | net.alchim31.maven
52 | scala-maven-plugin
53 | 3.2.2
54 |
55 |
56 | process-sources
57 |
58 | compile
59 | testCompile
60 |
61 |
62 | ${project.build.sourceDirectory}/../scala
63 |
64 |
65 |
66 |
67 |
68 | org.apache.maven.plugins
69 | maven-shade-plugin
70 | 2.4.3
71 |
72 |
73 | package
74 |
75 | shade
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
--------------------------------------------------------------------------------
/scala/maven/dse/src/main:
--------------------------------------------------------------------------------
1 | ../../../sbt/dse/src/main/
--------------------------------------------------------------------------------
/scala/maven/dse/src/test:
--------------------------------------------------------------------------------
1 | ../../../sbt/dse/src/test
--------------------------------------------------------------------------------
/scala/maven/oss/.gitignore:
--------------------------------------------------------------------------------
1 | *.iml
2 | .idea/
3 | target
4 |
--------------------------------------------------------------------------------
/scala/maven/oss/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | com.datastax.spark.example
6 | writeRead
7 | 0.1
8 | jar
9 |
10 |
11 | UTF-8
12 | 2.11.8
13 | 2.11
14 | 2.2.2
15 | 2.0.10
16 |
17 |
18 |
22 |
23 |
24 | org.scala-lang
25 | scala-library
26 | ${scala.version}
27 | provided
28 |
29 |
30 | org.apache.spark
31 | spark-core_${scala.main.version}
32 | ${spark.version}
33 | provided
34 |
35 |
36 | org.apache.spark
37 | spark-sql_${scala.main.version}
38 | ${spark.version}
39 | provided
40 |
41 |
42 | org.apache.spark
43 | spark-hive_${scala.main.version}
44 | ${spark.version}
45 | provided
46 |
47 |
48 | com.datastax.spark
49 | spark-cassandra-connector_${scala.main.version}
50 | ${connector.version}
51 | provided
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 | net.alchim31.maven
70 | scala-maven-plugin
71 | 3.2.2
72 |
73 |
74 | process-sources
75 |
76 | compile
77 | testCompile
78 |
79 |
80 | ${project.build.sourceDirectory}/../scala
81 |
82 |
83 |
84 |
85 |
86 | org.apache.maven.plugins
87 | maven-shade-plugin
88 | 2.4.3
89 |
90 |
91 | package
92 |
93 | shade
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
--------------------------------------------------------------------------------
/scala/maven/oss/src/main:
--------------------------------------------------------------------------------
1 | ../../../sbt/dse/src/main/
--------------------------------------------------------------------------------
/scala/maven/oss/src/test:
--------------------------------------------------------------------------------
1 | ../../../sbt/dse/src/test
--------------------------------------------------------------------------------
/scala/sbt/dse/.gitignore:
--------------------------------------------------------------------------------
1 | *.iml
2 | .idea/
3 | project/target
4 | target
5 |
--------------------------------------------------------------------------------
/scala/sbt/dse/build.sbt:
--------------------------------------------------------------------------------
1 | name := "writeRead"
2 | version := "0.1"
3 |
4 | scalaVersion := "2.11.8"
5 |
6 | resolvers += Resolver.mavenLocal // for testing
7 | resolvers += "DataStax Repo" at "https://repo.datastax.com/public-repos/"
8 |
9 | val dseVersion = "6.8.35"
10 |
11 | // Please make sure that the following DSE version matches your DSE cluster version.
12 | // Exclusions are solely for running integration testing
13 | // Warning: sbt 0.13.13 or greater is required due to a bug in dependency resolution
14 | libraryDependencies += "com.datastax.dse" % "dse-spark-dependencies" % dseVersion % "provided" exclude(
15 | "org.slf4j", "log4j-over-slf4j")
16 |
17 | //Your dependencies
18 | //libraryDependencies += "org.apache.commons" % "commons-math3" % "3.6.1"
19 | //libraryDependencies += "org.apache.commons" % "commons-csv" % "1.0"
20 |
21 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false)
22 | //assemblyShadeRules in assembly := Seq(
23 | // ShadeRule.rename("org.apache.commons.csv.**" -> "shaded.org.apache.commons.csv.@1").inAll
24 | //)
25 |
--------------------------------------------------------------------------------
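For reference, a minimal sketch (not part of the repository) of building and running the DSE variant. The jar path assumes the sbt-assembly default naming for this project (name "writeRead", version "0.1", Scala 2.11); dse spark-submit supplies the Spark master and connection settings automatically on a DSE Analytics node.

    # build the assembly jar, then submit it through DSE
    sbt assembly
    dse spark-submit \
      --class com.datastax.spark.example.WriteRead \
      target/scala-2.11/writeRead-assembly-0.1.jar
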
/scala/sbt/dse/project/assembly.sbt:
--------------------------------------------------------------------------------
1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5")
2 |
--------------------------------------------------------------------------------
/scala/sbt/dse/src/main/scala/com/datastax/spark/example/WriteRead.scala:
--------------------------------------------------------------------------------
1 | package com.datastax.spark.example
2 |
3 | import com.datastax.spark.connector._
4 | import com.datastax.spark.connector.cql.CassandraConnector
5 | import org.apache.spark.sql.{SaveMode, SparkSession}
6 | import org.apache.spark.sql.cassandra._
7 |
8 | // For DSE it is not necessary to set connection parameters for spark.master (since it will be done
9 | // automatically)
10 | object WriteRead extends App {
11 |
12 | val spark = SparkSession.builder
13 | .appName("Datastax Scala example")
14 | .enableHiveSupport()
15 | .getOrCreate()
16 |
17 | import spark.implicits._
18 |
19 | // Create keyspace and table
20 | CassandraConnector(spark.sparkContext).withSessionDo { session =>
21 | session.execute(
22 | """CREATE KEYSPACE IF NOT EXISTS ks WITH
23 | | replication = {'class': 'SimpleStrategy', 'replication_factor': 1 }""".stripMargin)
24 | session.execute("""CREATE TABLE IF NOT EXISTS ks.kv (k int, v int, PRIMARY KEY (k))""")
25 | }
26 |
27 | // Write some data
28 | spark.range(1, 10)
29 | .map(x => (x, x))
30 | .rdd
31 | .saveToCassandra("ks", "kv")
32 |
33 | // Read data as RDD
34 | val rdd = spark.sparkContext
35 | .cassandraTable(keyspace = "ks", table = "kv")
36 |
37 | // Read data as DataSet (DataFrame)
38 | val dataset = spark.read
39 | .cassandraFormat(keyspace = "ks", table = "kv")
40 | .load()
41 |
42 | println("Data read as RDD")
43 | rdd.collect()
44 | .foreach(println)
45 |
46 | println("Data read as DataSet (DataFrame)")
47 | dataset.collect()
48 | .foreach(println)
49 |
50 | spark.stop()
51 | sys.exit(0)
52 | }
--------------------------------------------------------------------------------
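The example above writes through the RDD API (saveToCassandra); SaveMode is imported but never used. As a point of comparison, a minimal sketch (not part of the repository) of the equivalent write through the DataFrame API, using the connector's cassandraFormat helper against the same ks.kv table:

    // Hypothetical companion snippet: write the same pairs with the DataFrame API.
    import org.apache.spark.sql.SaveMode
    import org.apache.spark.sql.cassandra._ // adds cassandraFormat to DataFrame readers/writers

    spark.range(1, 10)
      .selectExpr("cast(id as int) as k", "cast(id as int) as v")
      .write
      .cassandraFormat(table = "kv", keyspace = "ks")
      .mode(SaveMode.Append)
      .save()
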
/scala/sbt/dse/src/test/resources/cassandra-3.2.yaml.template:
--------------------------------------------------------------------------------
1 | # Cassandra storage config YAML
2 |
3 | # NOTE:
4 | # See http://wiki.apache.org/cassandra/StorageConfiguration for
5 | # full explanations of configuration directives
6 | # /NOTE
7 |
8 | # The name of the cluster. This is mainly used to prevent machines in
9 | # one logical cluster from joining another.
10 | cluster_name: 'Test Cluster'
11 |
12 | # This defines the number of tokens randomly assigned to this node on the ring
13 | # The more tokens, relative to other nodes, the larger the proportion of data
14 | # that this node will store. You probably want all nodes to have the same number
15 | # of tokens assuming they have equal hardware capability.
16 | #
17 | # If you leave this unspecified, Cassandra will use the default of 1 token for legacy compatibility,
18 | # and will use the initial_token as described below.
19 | #
20 | # Specifying initial_token will override this setting on the node's initial start;
21 | # on subsequent starts, this setting will apply even if initial token is set.
22 | #
23 | # If you already have a cluster with 1 token per node, and wish to migrate to
24 | # multiple tokens per node, see http://wiki.apache.org/cassandra/Operations
25 | num_tokens: 256
26 |
27 | # Triggers automatic allocation of num_tokens tokens for this node. The allocation
28 | # algorithm attempts to choose tokens in a way that optimizes replicated load over
29 | # the nodes in the datacenter for the replication strategy used by the specified
30 | # keyspace.
31 | #
32 | # The load assigned to each node will be close to proportional to its number of
33 | # vnodes.
34 | #
35 | # Only supported with the Murmur3Partitioner.
36 | # allocate_tokens_for_keyspace: KEYSPACE
37 |
38 | # initial_token allows you to specify tokens manually. While you can use it with
39 | # vnodes (num_tokens > 1, above) -- in which case you should provide a
40 | # comma-separated list -- it's primarily used when adding nodes to legacy clusters
41 | # that do not have vnodes enabled.
42 | # initial_token:
43 |
44 | # See http://wiki.apache.org/cassandra/HintedHandoff
45 | # May either be "true" or "false" to enable globally
46 | hinted_handoff_enabled: true
47 | # When hinted_handoff_enabled is true, a black list of data centers that will not
48 | # perform hinted handoff
49 | #hinted_handoff_disabled_datacenters:
50 | # - DC1
51 | # - DC2
52 | # this defines the maximum amount of time a dead host will have hints
53 | # generated. After it has been dead this long, new hints for it will not be
54 | # created until it has been seen alive and gone down again.
55 | max_hint_window_in_ms: 10800000 # 3 hours
56 |
57 | # Maximum throttle in KBs per second, per delivery thread. This will be
58 | # reduced proportionally to the number of nodes in the cluster. (If there
59 | # are two nodes in the cluster, each delivery thread will use the maximum
60 | # rate; if there are three, each will throttle to half of the maximum,
61 | # since we expect two nodes to be delivering hints simultaneously.)
62 | hinted_handoff_throttle_in_kb: 1024
63 |
64 | # Number of threads with which to deliver hints;
65 | # Consider increasing this number when you have multi-dc deployments, since
66 | # cross-dc handoff tends to be slower
67 | max_hints_delivery_threads: 2
68 |
69 | # Directory where Cassandra should store hints.
70 | # If not set, the default directory is $CASSANDRA_HOME/data/hints.
71 | # hints_directory: /var/lib/cassandra/hints
72 |
73 | # How often hints should be flushed from the internal buffers to disk.
74 | # Will *not* trigger fsync.
75 | hints_flush_period_in_ms: 10000
76 |
77 | # Maximum size for a single hints file, in megabytes.
78 | max_hints_file_size_in_mb: 128
79 |
80 | # Compression to apply to the hint files. If omitted, hints files
81 | # will be written uncompressed. LZ4, Snappy, and Deflate compressors
82 | # are supported.
83 | #hints_compression:
84 | # - class_name: LZ4Compressor
85 | # parameters:
86 | # -
87 |
88 | # Maximum throttle in KBs per second, total. This will be
89 | # reduced proportionally to the number of nodes in the cluster.
90 | batchlog_replay_throttle_in_kb: 1024
91 |
92 | # Authentication backend, implementing IAuthenticator; used to identify users
93 | # Out of the box, Cassandra provides org.apache.cassandra.auth.{AllowAllAuthenticator,
94 | # PasswordAuthenticator}.
95 | #
96 | # - AllowAllAuthenticator performs no checks - set it to disable authentication.
97 | # - PasswordAuthenticator relies on username/password pairs to authenticate
98 | # users. It keeps usernames and hashed passwords in system_auth.credentials table.
99 | # Please increase system_auth keyspace replication factor if you use this authenticator.
100 | # If using PasswordAuthenticator, CassandraRoleManager must also be used (see below)
101 | authenticator: AllowAllAuthenticator
102 |
103 | # Authorization backend, implementing IAuthorizer; used to limit access/provide permissions
104 | # Out of the box, Cassandra provides org.apache.cassandra.auth.{AllowAllAuthorizer,
105 | # CassandraAuthorizer}.
106 | #
107 | # - AllowAllAuthorizer allows any action to any user - set it to disable authorization.
108 | # - CassandraAuthorizer stores permissions in system_auth.permissions table. Please
109 | # increase system_auth keyspace replication factor if you use this authorizer.
110 | authorizer: AllowAllAuthorizer
111 |
112 | # Part of the Authentication & Authorization backend, implementing IRoleManager; used
113 | # to maintain grants and memberships between roles.
114 | # Out of the box, Cassandra provides org.apache.cassandra.auth.CassandraRoleManager,
115 | # which stores role information in the system_auth keyspace. Most functions of the
116 | # IRoleManager require an authenticated login, so unless the configured IAuthenticator
117 | # actually implements authentication, most of this functionality will be unavailable.
118 | #
119 | # - CassandraRoleManager stores role data in the system_auth keyspace. Please
120 | # increase system_auth keyspace replication factor if you use this role manager.
121 | role_manager: CassandraRoleManager
122 |
123 | # Validity period for roles cache (fetching permissions can be an
124 | # expensive operation depending on the authorizer). Granted roles are cached for
125 | # authenticated sessions in AuthenticatedUser and after the period specified
126 | # here, become eligible for (async) reload.
127 | # Defaults to 2000, set to 0 to disable.
128 | # Will be disabled automatically for AllowAllAuthenticator.
129 | roles_validity_in_ms: 2000
130 |
131 | # Refresh interval for roles cache (if enabled).
132 | # After this interval, cache entries become eligible for refresh. Upon next
133 | # access, an async reload is scheduled and the old value returned until it
134 | # completes. If roles_validity_in_ms is non-zero, then this must be
135 | # also.
136 | # Defaults to the same value as roles_validity_in_ms.
137 | # roles_update_interval_in_ms: 1000
138 |
139 | # Validity period for permissions cache (fetching permissions can be an
140 | # expensive operation depending on the authorizer, CassandraAuthorizer is
141 | # one example). Defaults to 2000, set to 0 to disable.
142 | # Will be disabled automatically for AllowAllAuthorizer.
143 | permissions_validity_in_ms: 2000
144 |
145 | # Refresh interval for permissions cache (if enabled).
146 | # After this interval, cache entries become eligible for refresh. Upon next
147 | # access, an async reload is scheduled and the old value returned until it
148 | # completes. If permissions_validity_in_ms is non-zero, then this must be
149 | # also.
150 | # Defaults to the same value as permissions_validity_in_ms.
151 | # permissions_update_interval_in_ms: 1000
152 |
153 | # The partitioner is responsible for distributing groups of rows (by
154 | # partition key) across nodes in the cluster. You should leave this
155 | # alone for new clusters. The partitioner can NOT be changed without
156 | # reloading all data, so when upgrading you should set this to the
157 | # same partitioner you were already using.
158 | #
159 | # Besides Murmur3Partitioner, partitioners included for backwards
160 | # compatibility include RandomPartitioner, ByteOrderedPartitioner, and
161 | # OrderPreservingPartitioner.
162 | #
163 | partitioner: org.apache.cassandra.dht.Murmur3Partitioner
164 |
165 | # Directories where Cassandra should store data on disk. Cassandra
166 | # will spread data evenly across them, subject to the granularity of
167 | # the configured compaction strategy.
168 | # If not set, the default directory is $CASSANDRA_HOME/data/data.
169 | # data_file_directories:
170 | # - /var/lib/cassandra/data
171 |
172 | # commit log. when running on magnetic HDD, this should be a
173 | # separate spindle than the data directories.
174 | # If not set, the default directory is $CASSANDRA_HOME/data/commitlog.
175 | # commitlog_directory: /var/lib/cassandra/commitlog
176 |
177 | # policy for data disk failures:
178 | # die: shut down gossip and client transports and kill the JVM for any fs errors or
179 | # single-sstable errors, so the node can be replaced.
180 | # stop_paranoid: shut down gossip and client transports even for single-sstable errors,
181 | # kill the JVM for errors during startup.
182 | # stop: shut down gossip and client transports, leaving the node effectively dead, but
183 | # can still be inspected via JMX, kill the JVM for errors during startup.
184 | # best_effort: stop using the failed disk and respond to requests based on
185 | # remaining available sstables. This means you WILL see obsolete
186 | # data at CL.ONE!
187 | # ignore: ignore fatal errors and let requests fail, as in pre-1.2 Cassandra
188 | disk_failure_policy: stop
189 |
190 | # policy for commit disk failures:
191 | # die: shut down gossip and Thrift and kill the JVM, so the node can be replaced.
192 | # stop: shut down gossip and Thrift, leaving the node effectively dead, but
193 | # can still be inspected via JMX.
194 | # stop_commit: shutdown the commit log, letting writes collect but
195 | # continuing to service reads, as in pre-2.0.5 Cassandra
196 | # ignore: ignore fatal errors and let the batches fail
197 | commit_failure_policy: stop
198 |
199 | # Maximum size of the key cache in memory.
200 | #
201 | # Each key cache hit saves 1 seek and each row cache hit saves 2 seeks at the
202 | # minimum, sometimes more. The key cache is fairly tiny for the amount of
203 | # time it saves, so it's worthwhile to use it at large numbers.
204 | # The row cache saves even more time, but must contain the entire row,
205 | # so it is extremely space-intensive. It's best to only use the
206 | # row cache if you have hot rows or static rows.
207 | #
208 | # NOTE: if you reduce the size, you may not get your hottest keys loaded on startup.
209 | #
210 | # Default value is empty to make it "auto" (min(5% of Heap (in MB), 100MB)). Set to 0 to disable key cache.
211 | key_cache_size_in_mb:
212 |
213 | # Duration in seconds after which Cassandra should
214 | # save the key cache. Caches are saved to saved_caches_directory as
215 | # specified in this configuration file.
216 | #
217 | # Saved caches greatly improve cold-start speeds, and is relatively cheap in
218 | # terms of I/O for the key cache. Row cache saving is much more expensive and
219 | # has limited use.
220 | #
221 | # Default is 14400 or 4 hours.
222 | key_cache_save_period: 14400
223 |
224 | # Number of keys from the key cache to save
225 | # Disabled by default, meaning all keys are going to be saved
226 | # key_cache_keys_to_save: 100
227 |
228 | # Row cache implementation class name.
229 | # Available implementations:
230 | # org.apache.cassandra.cache.OHCProvider Fully off-heap row cache implementation (default).
231 | # org.apache.cassandra.cache.SerializingCacheProvider This is the row cache implementation available
232 | # in previous releases of Cassandra.
233 | # row_cache_class_name: org.apache.cassandra.cache.OHCProvider
234 |
235 | # Maximum size of the row cache in memory.
236 | # Please note that OHC cache implementation requires some additional off-heap memory to manage
237 | # the map structures and some in-flight memory during operations before/after cache entries can be
238 | # accounted against the cache capacity. This overhead is usually small compared to the whole capacity.
239 | # Do not specify more memory than the system can afford in the worst usual situation and leave some
240 | # headroom for OS block level cache. Never allow your system to swap.
241 | #
242 | # Default value is 0, to disable row caching.
243 | row_cache_size_in_mb: 0
244 |
245 | # Duration in seconds after which Cassandra should save the row cache.
246 | # Caches are saved to saved_caches_directory as specified in this configuration file.
247 | #
248 | # Saved caches greatly improve cold-start speeds, and is relatively cheap in
249 | # terms of I/O for the key cache. Row cache saving is much more expensive and
250 | # has limited use.
251 | #
252 | # Default is 0 to disable saving the row cache.
253 | row_cache_save_period: 0
254 |
255 | # Number of keys from the row cache to save.
256 | # Specify 0 (which is the default), meaning all keys are going to be saved
257 | # row_cache_keys_to_save: 100
258 |
259 | # Maximum size of the counter cache in memory.
260 | #
261 | # Counter cache helps to reduce counter locks' contention for hot counter cells.
262 | # In case of RF = 1 a counter cache hit will cause Cassandra to skip the read before
263 | # write entirely. With RF > 1 a counter cache hit will still help to reduce the duration
264 | # of the lock hold, helping with hot counter cell updates, but will not allow skipping
265 | # the read entirely. Only the local (clock, count) tuple of a counter cell is kept
266 | # in memory, not the whole counter, so it's relatively cheap.
267 | #
268 | # NOTE: if you reduce the size, you may not get your hottest keys loaded on startup.
269 | #
270 | # Default value is empty to make it "auto" (min(2.5% of Heap (in MB), 50MB)). Set to 0 to disable counter cache.
271 | # NOTE: if you perform counter deletes and rely on low gcgs, you should disable the counter cache.
272 | counter_cache_size_in_mb:
273 |
274 | # Duration in seconds after which Cassandra should
275 | # save the counter cache (keys only). Caches are saved to saved_caches_directory as
276 | # specified in this configuration file.
277 | #
278 | # Default is 7200 or 2 hours.
279 | counter_cache_save_period: 7200
280 |
281 | # Number of keys from the counter cache to save
282 | # Disabled by default, meaning all keys are going to be saved
283 | # counter_cache_keys_to_save: 100
284 |
285 | # saved caches
286 | # If not set, the default directory is $CASSANDRA_HOME/data/saved_caches.
287 | # saved_caches_directory: /var/lib/cassandra/saved_caches
288 |
289 | # commitlog_sync may be either "periodic" or "batch."
290 | #
291 | # When in batch mode, Cassandra won't ack writes until the commit log
292 | # has been fsynced to disk. It will wait
293 | # commitlog_sync_batch_window_in_ms milliseconds between fsyncs.
294 | # This window should be kept short because the writer threads will
295 | # be unable to do extra work while waiting. (You may need to increase
296 | # concurrent_writes for the same reason.)
297 | #
298 | # commitlog_sync: batch
299 | # commitlog_sync_batch_window_in_ms: 2
300 | #
301 | # the other option is "periodic" where writes may be acked immediately
302 | # and the CommitLog is simply synced every commitlog_sync_period_in_ms
303 | # milliseconds.
304 | commitlog_sync: periodic
305 | commitlog_sync_period_in_ms: 10000
306 |
307 | # The size of the individual commitlog file segments. A commitlog
308 | # segment may be archived, deleted, or recycled once all the data
309 | # in it (potentially from each columnfamily in the system) has been
310 | # flushed to sstables.
311 | #
312 | # The default size is 32, which is almost always fine, but if you are
313 | # archiving commitlog segments (see commitlog_archiving.properties),
314 | # then you probably want a finer granularity of archiving; 8 or 16 MB
315 | # is reasonable.
316 | # Max mutation size is also configurable via max_mutation_size_in_kb setting in
317 | # cassandra.yaml. The default is half the size of commitlog_segment_size_in_mb * 1024.
318 | #
319 | # NOTE: If max_mutation_size_in_kb is set explicitly then commitlog_segment_size_in_mb must
320 | # be set to at least twice the size of max_mutation_size_in_kb / 1024
321 | #
322 | commitlog_segment_size_in_mb: 32
323 |
324 | # Compression to apply to the commit log. If omitted, the commit log
325 | # will be written uncompressed. LZ4, Snappy, and Deflate compressors
326 | # are supported.
327 | #commitlog_compression:
328 | # - class_name: LZ4Compressor
329 | # parameters:
330 | # -
331 |
332 | # any class that implements the SeedProvider interface and has a
333 | # constructor that takes a Map of parameters will do.
334 | seed_provider:
335 | # Addresses of hosts that are deemed contact points.
336 | # Cassandra nodes use this list of hosts to find each other and learn
337 | # the topology of the ring. You must change this if you are running
338 | # multiple nodes!
339 | - class_name: org.apache.cassandra.locator.SimpleSeedProvider
340 | parameters:
341 | # seeds is actually a comma-delimited list of addresses.
342 | # Ex: "<ip1>,<ip2>,<ip3>"
343 | - seeds: "127.0.0.1"
344 |
345 | # For workloads with more data than can fit in memory, Cassandra's
346 | # bottleneck will be reads that need to fetch data from
347 | # disk. "concurrent_reads" should be set to (16 * number_of_drives) in
348 | # order to allow the operations to enqueue low enough in the stack
349 | # that the OS and drives can reorder them. Same applies to
350 | # "concurrent_counter_writes", since counter writes read the current
351 | # values before incrementing and writing them back.
352 | #
353 | # On the other hand, since writes are almost never IO bound, the ideal
354 | # number of "concurrent_writes" is dependent on the number of cores in
355 | # your system; (8 * number_of_cores) is a good rule of thumb.
356 | concurrent_reads: 32
357 | concurrent_writes: 32
358 | concurrent_counter_writes: 32
359 |
360 | # For materialized view writes, as there is a read involved, this should
361 | # be limited by the lesser of concurrent reads and concurrent writes.
362 | concurrent_materialized_view_writes: 32
363 |
364 | # Maximum memory to use for pooling sstable buffers. Defaults to the smaller
365 | # of 1/4 of heap or 512MB. This pool is allocated off-heap, so is in addition
366 | # to the memory allocated for heap. Memory is only allocated as needed.
367 | # file_cache_size_in_mb: 512
368 |
369 | # Flag indicating whether to allocate on or off heap when the sstable buffer
370 | # pool is exhausted, that is when it has exceeded the maximum memory
371 | # file_cache_size_in_mb, beyond which it will not cache buffers but allocate on request.
372 |
373 | # buffer_pool_use_heap_if_exhausted: true
374 |
375 | # The strategy for optimizing disk read
376 | # Possible values are:
377 | # ssd (for solid state disks, the default)
378 | # spinning (for spinning disks)
379 | # disk_optimization_strategy: ssd
380 |
381 | # Total permitted memory to use for memtables. Cassandra will stop
382 | # accepting writes when the limit is exceeded until a flush completes,
383 | # and will trigger a flush based on memtable_cleanup_threshold
384 | # If omitted, Cassandra will set both to 1/4 the size of the heap.
385 | # memtable_heap_space_in_mb: 2048
386 | # memtable_offheap_space_in_mb: 2048
387 |
388 | # Ratio of occupied non-flushing memtable size to total permitted size
389 | # that will trigger a flush of the largest memtable. Larger mct will
390 | # mean larger flushes and hence less compaction, but also less concurrent
391 | # flush activity which can make it difficult to keep your disks fed
392 | # under heavy write load.
393 | #
394 | # memtable_cleanup_threshold defaults to 1 / (memtable_flush_writers + 1)
395 | # memtable_cleanup_threshold: 0.11
396 |
397 | # Specify the way Cassandra allocates and manages memtable memory.
398 | # Options are:
399 | # heap_buffers: on heap nio buffers
400 | # offheap_buffers: off heap (direct) nio buffers
401 | memtable_allocation_type: heap_buffers
402 |
403 | # Total space to use for commit logs on disk.
404 | #
405 | # If space gets above this value, Cassandra will flush every dirty CF
406 | # in the oldest segment and remove it. So a small total commitlog space
407 | # will tend to cause more flush activity on less-active columnfamilies.
408 | #
409 | # The default value is the smaller of 8192, and 1/4 of the total space
410 | # of the commitlog volume.
411 | #
412 | # commitlog_total_space_in_mb: 8192
413 |
414 | # This sets the amount of memtable flush writer threads. These will
415 | # be blocked by disk io, and each one will hold a memtable in memory
416 | # while blocked.
417 | #
418 | # memtable_flush_writers defaults to one per data_file_directory.
419 | #
420 | # If your data directories are backed by SSD, you can increase this, but
421 | # avoid having memtable_flush_writers * data_file_directories > number of cores
422 | #memtable_flush_writers: 1
423 |
424 | # A fixed memory pool size in MB for for SSTable index summaries. If left
425 | # empty, this will default to 5% of the heap size. If the memory usage of
426 | # all index summaries exceeds this limit, SSTables with low read rates will
427 | # shrink their index summaries in order to meet this limit. However, this
428 | # is a best-effort process. In extreme conditions Cassandra may need to use
429 | # more than this amount of memory.
430 | index_summary_capacity_in_mb:
431 |
432 | # How frequently index summaries should be resampled. This is done
433 | # periodically to redistribute memory from the fixed-size pool to sstables
434 | # proportional their recent read rates. Setting to -1 will disable this
435 | # process, leaving existing index summaries at their current sampling level.
436 | index_summary_resize_interval_in_minutes: 60
437 |
438 | # Whether to, when doing sequential writing, fsync() at intervals in
439 | # order to force the operating system to flush the dirty
440 | # buffers. Enable this to avoid sudden dirty buffer flushing from
441 | # impacting read latencies. Almost always a good idea on SSDs; not
442 | # necessarily on platters.
443 | trickle_fsync: false
444 | trickle_fsync_interval_in_kb: 10240
445 |
446 | # TCP port, for commands and data
447 | # For security reasons, you should not expose this port to the internet. Firewall it if needed.
448 | storage_port: 7000
449 |
450 | # SSL port, for encrypted communication. Unused unless enabled in
451 | # encryption_options
452 | # For security reasons, you should not expose this port to the internet. Firewall it if needed.
453 | ssl_storage_port: 7001
454 |
455 | # Address or interface to bind to and tell other Cassandra nodes to connect to.
456 | # You _must_ change this if you want multiple nodes to be able to communicate!
457 | #
458 | # Set listen_address OR listen_interface, not both. Interfaces must correspond
459 | # to a single address, IP aliasing is not supported.
460 | #
461 | # Leaving it blank leaves it up to InetAddress.getLocalHost(). This
462 | # will always do the Right Thing _if_ the node is properly configured
463 | # (hostname, name resolution, etc), and the Right Thing is to use the
464 | # address associated with the hostname (it might not be).
465 | #
466 | # Setting listen_address to 0.0.0.0 is always wrong.
467 | #
468 | # If you choose to specify the interface by name and the interface has an ipv4 and an ipv6 address
469 | # you can specify which should be chosen using listen_interface_prefer_ipv6. If false the first ipv4
470 | # address will be used. If true the first ipv6 address will be used. Defaults to false preferring
471 | # ipv4. If there is only one address it will be selected regardless of ipv4/ipv6.
472 | listen_address: localhost
473 | # listen_interface: eth0
474 | # listen_interface_prefer_ipv6: false
475 |
476 | # Address to broadcast to other Cassandra nodes
477 | # Leaving this blank will set it to the same value as listen_address
478 | # broadcast_address: 1.2.3.4
479 |
480 | # When using multiple physical network interfaces, set this
481 | # to true to listen on broadcast_address in addition to
482 | # the listen_address, allowing nodes to communicate in both
483 | # interfaces.
484 | # Ignore this property if the network configuration automatically
485 | # routes between the public and private networks such as EC2.
486 | # listen_on_broadcast_address: false
487 |
488 | # Internode authentication backend, implementing IInternodeAuthenticator;
489 | # used to allow/disallow connections from peer nodes.
490 | # internode_authenticator: org.apache.cassandra.auth.AllowAllInternodeAuthenticator
491 |
492 | # Whether to start the native transport server.
493 | # Please note that the address on which the native transport is bound is the
494 | # same as the rpc_address. The port however is different and specified below.
495 | start_native_transport: true
496 | # port for the CQL native transport to listen for clients on
497 | # For security reasons, you should not expose this port to the internet. Firewall it if needed.
498 | native_transport_port: 9042
499 | # Enabling native transport encryption in client_encryption_options allows you to either use
500 | # encryption for the standard port or to use a dedicated, additional port along with the unencrypted
501 | # standard native_transport_port.
502 | # Enabling client encryption and keeping native_transport_port_ssl disabled will use encryption
503 | # for native_transport_port. Setting native_transport_port_ssl to a different value
504 | # from native_transport_port will use encryption for native_transport_port_ssl while
505 | # keeping native_transport_port unencrypted.
506 | # native_transport_port_ssl: 9142
507 | # The maximum threads for handling requests when the native transport is used.
508 | # This is similar to rpc_max_threads though the default differs slightly (and
509 | # there is no native_transport_min_threads, idle threads will always be stopped
510 | # after 30 seconds).
511 | # native_transport_max_threads: 128
512 | #
513 | # The maximum size of allowed frame. Frame (requests) larger than this will
514 | # be rejected as invalid. The default is 256MB.
515 | # native_transport_max_frame_size_in_mb: 256
516 |
517 | # The maximum number of concurrent client connections.
518 | # The default is -1, which means unlimited.
519 | # native_transport_max_concurrent_connections: -1
520 |
521 | # The maximum number of concurrent client connections per source ip.
522 | # The default is -1, which means unlimited.
523 | # native_transport_max_concurrent_connections_per_ip: -1
524 |
525 | # Whether to start the thrift rpc server.
526 | start_rpc: false
527 |
528 | # The address or interface to bind the Thrift RPC service and native transport
529 | # server to.
530 | #
531 | # Set rpc_address OR rpc_interface, not both. Interfaces must correspond
532 | # to a single address, IP aliasing is not supported.
533 | #
534 | # Leaving rpc_address blank has the same effect as on listen_address
535 | # (i.e. it will be based on the configured hostname of the node).
536 | #
537 | # Note that unlike listen_address, you can specify 0.0.0.0, but you must also
538 | # set broadcast_rpc_address to a value other than 0.0.0.0.
539 | #
540 | # For security reasons, you should not expose this port to the internet. Firewall it if needed.
541 | #
542 | # If you choose to specify the interface by name and the interface has an ipv4 and an ipv6 address
543 | # you can specify which should be chosen using rpc_interface_prefer_ipv6. If false the first ipv4
544 | # address will be used. If true the first ipv6 address will be used. Defaults to false preferring
545 | # ipv4. If there is only one address it will be selected regardless of ipv4/ipv6.
546 | rpc_address: localhost
547 | # rpc_interface: eth1
548 | # rpc_interface_prefer_ipv6: false
549 |
550 | # port for Thrift to listen for clients on
551 | rpc_port: 9160
552 |
553 | # RPC address to broadcast to drivers and other Cassandra nodes. This cannot
554 | # be set to 0.0.0.0. If left blank, this will be set to the value of
555 | # rpc_address. If rpc_address is set to 0.0.0.0, broadcast_rpc_address must
556 | # be set.
557 | # broadcast_rpc_address: 1.2.3.4
558 |
559 | # enable or disable keepalive on rpc/native connections
560 | rpc_keepalive: true
561 |
562 | # Cassandra provides two out-of-the-box options for the RPC Server:
563 | #
564 | # sync -> One thread per thrift connection. For a very large number of clients, memory
565 | # will be your limiting factor. On a 64 bit JVM, 180KB is the minimum stack size
566 | # per thread, and that will correspond to your use of virtual memory (but physical memory
567 | # may be limited depending on use of stack space).
568 | #
569 | # hsha -> Stands for "half synchronous, half asynchronous." All thrift clients are handled
570 | # asynchronously using a small number of threads that does not vary with the amount
571 | # of thrift clients (and thus scales well to many clients). The rpc requests are still
572 | # synchronous (one thread per active request). If hsha is selected then it is essential
573 | # that rpc_max_threads is changed from the default value of unlimited.
574 | #
575 | # The default is sync because on Windows hsha is about 30% slower. On Linux,
576 | # sync/hsha performance is about the same, with hsha of course using less memory.
577 | #
578 | # Alternatively, can provide your own RPC server by providing the fully-qualified class name
579 | # of an o.a.c.t.TServerFactory that can create an instance of it.
580 | rpc_server_type: sync
581 |
582 | # Uncomment rpc_min|max_thread to set request pool size limits.
583 | #
584 | # Regardless of your choice of RPC server (see above), the number of maximum requests in the
585 | # RPC thread pool dictates how many concurrent requests are possible (but if you are using the sync
586 | # RPC server, it also dictates the number of clients that can be connected at all).
587 | #
588 | # The default is unlimited and thus provides no protection against clients overwhelming the server. You are
589 | # encouraged to set a maximum that makes sense for you in production, but do keep in mind that
590 | # rpc_max_threads represents the maximum number of client requests this server may execute concurrently.
591 | #
592 | # rpc_min_threads: 16
593 | # rpc_max_threads: 2048
594 |
595 | # uncomment to set socket buffer sizes on rpc connections
596 | # rpc_send_buff_size_in_bytes:
597 | # rpc_recv_buff_size_in_bytes:
598 |
599 | # Uncomment to set socket buffer size for internode communication
600 | # Note that when setting this, the buffer size is limited by net.core.wmem_max
601 | # and when not setting it, it is defined by net.ipv4.tcp_wmem
602 | # See:
603 | # /proc/sys/net/core/wmem_max
604 | # /proc/sys/net/core/rmem_max
605 | # /proc/sys/net/ipv4/tcp_wmem
606 | # /proc/sys/net/ipv4/tcp_rmem
607 | # and: man tcp
608 | # internode_send_buff_size_in_bytes:
609 | # internode_recv_buff_size_in_bytes:
610 |
611 | # Frame size for thrift (maximum message length).
612 | thrift_framed_transport_size_in_mb: 15
613 |
614 | # Set to true to have Cassandra create a hard link to each sstable
615 | # flushed or streamed locally in a backups/ subdirectory of the
616 | # keyspace data. Removing these links is the operator's
617 | # responsibility.
618 | incremental_backups: false
619 |
620 | # Whether or not to take a snapshot before each compaction. Be
621 | # careful using this option, since Cassandra won't clean up the
622 | # snapshots for you. Mostly useful if you're paranoid when there
623 | # is a data format change.
624 | snapshot_before_compaction: false
625 |
626 | # Whether or not a snapshot is taken of the data before keyspace truncation
627 | # or dropping of column families. The STRONGLY advised default of true
628 | # should be used to provide data safety. If you set this flag to false, you will
629 | # lose data on truncation or drop.
630 | auto_snapshot: true
631 |
632 | # When executing a scan, within or across a partition, we need to keep the
633 | # tombstones seen in memory so we can return them to the coordinator, which
634 | # will use them to make sure other replicas also know about the deleted rows.
635 | # With workloads that generate a lot of tombstones, this can cause performance
636 | # problems and even exhaust the server heap.
637 | # (http://www.datastax.com/dev/blog/cassandra-anti-patterns-queues-and-queue-like-datasets)
638 | # Adjust the thresholds here if you understand the dangers and want to
639 | # scan more tombstones anyway. These thresholds may also be adjusted at runtime
640 | # using the StorageService mbean.
641 | tombstone_warn_threshold: 1000
642 | tombstone_failure_threshold: 100000
643 |
644 | # Granularity of the collation index of rows within a partition.
645 | # Increase if your rows are large, or if you have a very large
646 | # number of rows per partition. The competing goals are these:
647 | # 1) a smaller granularity means more index entries are generated
648 | #   and looking up rows within the partition by collation column
649 | # is faster
650 | # 2) but, Cassandra will keep the collation index in memory for hot
651 | # rows (as part of the key cache), so a larger granularity means
652 | # you can cache more hot rows
653 | column_index_size_in_kb: 64
654 |
655 |
656 | # Log WARN on any batch size exceeding this value. 5kb per batch by default.
657 | # Caution should be taken on increasing the size of this threshold as it can lead to node instability.
658 | batch_size_warn_threshold_in_kb: 5
659 |
660 | # Fail any batch exceeding this value. 50kb (10x warn threshold) by default.
661 | batch_size_fail_threshold_in_kb: 50
662 |
663 | # Number of simultaneous compactions to allow, NOT including
664 | # validation "compactions" for anti-entropy repair. Simultaneous
665 | # compactions can help preserve read performance in a mixed read/write
666 | # workload, by mitigating the tendency of small sstables to accumulate
667 | # during a single long running compaction. The default is usually
668 | # fine and if you experience problems with compaction running too
669 | # slowly or too fast, you should look at
670 | # compaction_throughput_mb_per_sec first.
671 | #
672 | # concurrent_compactors defaults to the smaller of (number of disks,
673 | # number of cores), with a minimum of 2 and a maximum of 8.
674 | #
675 | # If your data directories are backed by SSD, you should increase this
676 | # to the number of cores.
677 | #concurrent_compactors: 1
678 |
679 | # Throttles compaction to the given total throughput across the entire
680 | # system. The faster you insert data, the faster you need to compact in
681 | # order to keep the sstable count down, but in general, setting this to
682 | # 16 to 32 times the rate you are inserting data is more than sufficient.
683 | # Setting this to 0 disables throttling. Note that this accounts for all types
684 | # of compaction, including validation compaction.
685 | compaction_throughput_mb_per_sec: 16
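# The throughput cap can also be adjusted on a live node without a restart, for
# example with the standard nodetool utility (the value 32 is illustrative):
#   nodetool setcompactionthroughput 32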
686 |
687 | # Log a warning when compacting partitions larger than this value
688 | compaction_large_partition_warning_threshold_mb: 100
689 |
690 | # When compacting, the replacement sstable(s) can be opened before they
691 | # are completely written, and used in place of the prior sstables for
692 | # any range that has been written. This helps to smoothly transfer reads
693 | # between the sstables, reducing page cache churn and keeping hot rows hot
694 | sstable_preemptive_open_interval_in_mb: 50
695 |
696 | # Throttles all outbound streaming file transfers on this node to the
697 | # given total throughput in Mbps. This is necessary because Cassandra does
698 | # mostly sequential IO when streaming data during bootstrap or repair, which
699 | # can lead to saturating the network connection and degrading rpc performance.
700 | # When unset, the default is 200 Mbps or 25 MB/s.
701 | # stream_throughput_outbound_megabits_per_sec: 200
702 |
703 | # Throttles all streaming file transfers between datacenters.
704 | # This setting allows users to throttle inter-dc stream throughput in addition
705 | # to throttling all network stream traffic as configured with
706 | # stream_throughput_outbound_megabits_per_sec
707 | # inter_dc_stream_throughput_outbound_megabits_per_sec:
708 |
709 | # How long the coordinator should wait for read operations to complete
710 | read_request_timeout_in_ms: 5000
711 | # How long the coordinator should wait for seq or index scans to complete
712 | range_request_timeout_in_ms: 10000
713 | # How long the coordinator should wait for writes to complete
714 | write_request_timeout_in_ms: 2000
715 | # How long the coordinator should wait for counter writes to complete
716 | counter_write_request_timeout_in_ms: 5000
717 | # How long a coordinator should continue to retry a CAS operation
718 | # that contends with other proposals for the same row
719 | cas_contention_timeout_in_ms: 1000
720 | # How long the coordinator should wait for truncates to complete
721 | # (This can be much longer, because unless auto_snapshot is disabled
722 | # we need to flush first so we can snapshot before removing the data.)
723 | truncate_request_timeout_in_ms: 60000
724 | # The default timeout for other, miscellaneous operations
725 | request_timeout_in_ms: 10000
726 |
727 | # Enable operation timeout information exchange between nodes to accurately
728 | # measure request timeouts. If disabled, replicas will assume that requests
729 | # were forwarded to them instantly by the coordinator, which means that
730 | # under overload conditions we will waste that much extra time processing
731 | # already-timed-out requests.
732 | #
733 | # Warning: before enabling this property make sure ntp is installed
734 | # and the times are synchronized between the nodes.
735 | cross_node_timeout: false
736 |
737 | # Enable socket timeout for streaming operation.
738 | # When a timeout occurs during streaming, streaming is retried from the start
739 | # of the current file. This _can_ involve re-streaming a significant amount of
740 | # data, so you should avoid setting the value too low.
741 | # Default value is 3600000, which means streams timeout after an hour.
742 | # streaming_socket_timeout_in_ms: 3600000
743 |
744 | # phi value that must be reached for a host to be marked down.
745 | # most users should never need to adjust this.
746 | # phi_convict_threshold: 8
747 |
748 | # endpoint_snitch -- Set this to a class that implements
749 | # IEndpointSnitch. The snitch has two functions:
750 | # - it teaches Cassandra enough about your network topology to route
751 | # requests efficiently
752 | # - it allows Cassandra to spread replicas around your cluster to avoid
753 | # correlated failures. It does this by grouping machines into
754 | # "datacenters" and "racks." Cassandra will do its best not to have
755 | # more than one replica on the same "rack" (which may not actually
756 | # be a physical location)
757 | #
758 | # IF YOU CHANGE THE SNITCH AFTER DATA IS INSERTED INTO THE CLUSTER,
759 | # YOU MUST RUN A FULL REPAIR, SINCE THE SNITCH AFFECTS WHERE REPLICAS
760 | # ARE PLACED.
761 | #
762 | # IF THE RACK A REPLICA IS PLACED IN CHANGES AFTER THE REPLICA HAS BEEN
763 | # ADDED TO A RING, THE NODE MUST BE DECOMMISSIONED AND REBOOTSTRAPPED.
764 | #
765 | # Out of the box, Cassandra provides
766 | # - SimpleSnitch:
767 | # Treats Strategy order as proximity. This can improve cache
768 | # locality when disabling read repair. Only appropriate for
769 | # single-datacenter deployments.
770 | # - GossipingPropertyFileSnitch
771 | # This should be your go-to snitch for production use. The rack
772 | # and datacenter for the local node are defined in
773 | # cassandra-rackdc.properties and propagated to other nodes via
774 | # gossip. If cassandra-topology.properties exists, it is used as a
775 | # fallback, allowing migration from the PropertyFileSnitch.
776 | # - PropertyFileSnitch:
777 | # Proximity is determined by rack and data center, which are
778 | # explicitly configured in cassandra-topology.properties.
779 | # - Ec2Snitch:
780 | # Appropriate for EC2 deployments in a single Region. Loads Region
781 | # and Availability Zone information from the EC2 API. The Region is
782 | # treated as the datacenter, and the Availability Zone as the rack.
783 | # Only private IPs are used, so this will not work across multiple
784 | # Regions.
785 | # - Ec2MultiRegionSnitch:
786 | # Uses public IPs as broadcast_address to allow cross-region
787 | # connectivity. (Thus, you should set seed addresses to the public
788 | # IP as well.) You will need to open the storage_port or
789 | # ssl_storage_port on the public IP firewall. (For intra-Region
790 | # traffic, Cassandra will switch to the private IP after
791 | # establishing a connection.)
792 | # - RackInferringSnitch:
793 | # Proximity is determined by rack and data center, which are
794 | # assumed to correspond to the 3rd and 2nd octet of each node's IP
795 | # address, respectively. Unless this happens to match your
796 | # deployment conventions, this is best used as an example of
797 | # writing a custom Snitch class and is provided in that spirit.
798 | #
799 | # You can use a custom Snitch by setting this to the full class name
800 | # of the snitch, which will be assumed to be on your classpath.
801 | endpoint_snitch: SimpleSnitch
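# If you move to GossipingPropertyFileSnitch as recommended above, the local node's
# datacenter and rack are read from cassandra-rackdc.properties; a minimal,
# illustrative version of that file (names are placeholders) looks like:
#   dc=DC1
#   rack=RAC1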
802 |
803 | # controls how often to perform the more expensive part of host score
804 | # calculation
805 | dynamic_snitch_update_interval_in_ms: 100
806 | # controls how often to reset all host scores, allowing a bad host to
807 | # possibly recover
808 | dynamic_snitch_reset_interval_in_ms: 600000
809 | # if set greater than zero and read_repair_chance is < 1.0, this will allow
810 | # 'pinning' of replicas to hosts in order to increase cache capacity.
811 | # The badness threshold will control how much worse the pinned host has to be
812 | # before the dynamic snitch will prefer other replicas over it. This is
813 | # expressed as a double which represents a percentage. Thus, a value of
814 | # 0.2 means Cassandra would continue to prefer the static snitch values
815 | # until the pinned host was 20% worse than the fastest.
816 | dynamic_snitch_badness_threshold: 0.1
817 |
818 | # request_scheduler -- Set this to a class that implements
819 | # RequestScheduler, which will schedule incoming client requests
820 | # according to the specific policy. This is useful for multi-tenancy
821 | # with a single Cassandra cluster.
822 | # NOTE: This is specifically for requests from the client and does
823 | # not affect inter node communication.
824 | # org.apache.cassandra.scheduler.NoScheduler - No scheduling takes place
825 | # org.apache.cassandra.scheduler.RoundRobinScheduler - Round robin of
826 | # client requests to a node with a separate queue for each
827 | # request_scheduler_id. The scheduler is further customized by
828 | # request_scheduler_options as described below.
829 | request_scheduler: org.apache.cassandra.scheduler.NoScheduler
830 |
831 | # Scheduler Options vary based on the type of scheduler
832 | # NoScheduler - Has no options
833 | # RoundRobin
834 | # - throttle_limit -- The throttle_limit is the number of in-flight
835 | # requests per client. Requests beyond
836 | # that limit are queued up until
837 | # running requests can complete.
838 | # The value of 80 here is twice the number of
839 | # concurrent_reads + concurrent_writes.
840 | # - default_weight -- default_weight is optional and allows for
841 | # overriding the default which is 1.
842 | # - weights -- Weights are optional and will default to 1 or the
843 | # overridden default_weight. The weight translates into how
844 | # many requests are handled during each turn of the
845 | # RoundRobin, based on the scheduler id.
846 | #
847 | # request_scheduler_options:
848 | # throttle_limit: 80
849 | # default_weight: 5
850 | # weights:
851 | # Keyspace1: 1
852 | # Keyspace2: 5
853 |
854 | # request_scheduler_id -- An identifier based on which to perform
855 | # the request scheduling. Currently the only valid option is keyspace.
856 | # request_scheduler_id: keyspace
857 |
858 | # Enable or disable inter-node encryption
859 | # Default settings are TLS v1, RSA 1024-bit keys (it is imperative that
860 | # users generate their own keys) and TLS_RSA_WITH_AES_128_CBC_SHA as the cipher
861 | # suite for authentication, key exchange and encryption of the actual data transfers.
862 | # Use the DHE/ECDHE ciphers if running in FIPS 140 compliant mode.
863 | # NOTE: No custom encryption options are enabled at the moment
864 | # The available internode options are : all, none, dc, rack
865 | #
866 | # If set to dc cassandra will encrypt the traffic between the DCs
867 | # If set to rack cassandra will encrypt the traffic between the racks
868 | #
869 | # The passwords used in these options must match the passwords used when generating
870 | # the keystore and truststore. For instructions on generating these files, see:
871 | # http://download.oracle.com/javase/6/docs/technotes/guides/security/jsse/JSSERefGuide.html#CreateKeystore
872 | #
873 | server_encryption_options:
874 | internode_encryption: none
875 | keystore: conf/.keystore
876 | keystore_password: cassandra
877 | truststore: conf/.truststore
878 | truststore_password: cassandra
879 | # More advanced defaults below:
880 | # protocol: TLS
881 | # algorithm: SunX509
882 | # store_type: JKS
883 | # cipher_suites: [TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA,TLS_DHE_RSA_WITH_AES_128_CBC_SHA,TLS_DHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA]
884 | # require_client_auth: false
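# A sketch of generating a per-node keystore with the JDK keytool (alias, dname and
# validity are placeholders; the store password must match keystore_password above):
#   keytool -genkeypair -keyalg RSA -alias node1 -validity 365 \
#     -keystore conf/.keystore -storepass cassandra \
#     -dname "CN=node1, OU=example, O=example, C=US"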
885 |
886 | # enable or disable client/server encryption.
887 | client_encryption_options:
888 | enabled: false
889 |     # If enabled and optional is set to true, both encrypted and unencrypted connections are handled.
890 | optional: false
891 | keystore: conf/.keystore
892 | keystore_password: cassandra
893 | # require_client_auth: false
894 |     # Set truststore and truststore_password if require_client_auth is true
895 | # truststore: conf/.truststore
896 | # truststore_password: cassandra
897 | # More advanced defaults below:
898 | # protocol: TLS
899 | # algorithm: SunX509
900 | # store_type: JKS
901 | # cipher_suites: [TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA,TLS_DHE_RSA_WITH_AES_128_CBC_SHA,TLS_DHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA]
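# A sketch of what two-way (mutual) client encryption would look like, reusing the
# stores referenced above (passwords and paths are the defaults from this file):
#   client_encryption_options:
#     enabled: true
#     keystore: conf/.keystore
#     keystore_password: cassandra
#     require_client_auth: true
#     truststore: conf/.truststore
#     truststore_password: cassandra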
902 |
903 | # internode_compression controls whether traffic between nodes is
904 | # compressed.
905 | # can be: all - all traffic is compressed
906 | # dc - traffic between different datacenters is compressed
907 | # none - nothing is compressed.
908 | internode_compression: all
909 |
910 | # Enable or disable tcp_nodelay for inter-dc communication.
911 | # Disabling it will result in larger (but fewer) network packets being sent,
912 | # reducing overhead from the TCP protocol itself, at the cost of increasing
913 | # latency if you block for cross-datacenter responses.
914 | inter_dc_tcp_nodelay: false
915 |
916 | # TTL for different trace types used during logging of the repair process.
917 | tracetype_query_ttl: 86400
918 | tracetype_repair_ttl: 604800
919 |
920 | # GC Pauses greater than gc_warn_threshold_in_ms will be logged at WARN level
921 | # Adjust the threshold based on your application throughput requirement
922 | # By default, Cassandra logs GC Pauses greater than 200 ms at INFO level
923 | gc_warn_threshold_in_ms: 1000
924 |
925 | # UDFs (user defined functions) are disabled by default.
926 | # As of Cassandra 3.0 there is a sandbox in place that should prevent execution of evil code.
927 | enable_user_defined_functions: false
928 |
929 | # Enables scripted UDFs (JavaScript UDFs).
930 | # Java UDFs are always enabled, if enable_user_defined_functions is true.
931 | # Enable this option to be able to use UDFs with "language javascript" or any custom JSR-223 provider.
932 | # This option has no effect, if enable_user_defined_functions is false.
933 | enable_scripted_user_defined_functions: false
934 |
935 | # The default Windows kernel timer and scheduling resolution is 15.6ms for power conservation.
936 | # Lowering this value on Windows can provide much tighter latency and better throughput, however
937 | # some virtualized environments may see a negative performance impact from changing this setting
938 | # below their system default. The sysinternals 'clockres' tool can confirm your system's default
939 | # setting.
940 | windows_timer_interval: 1
941 |
--------------------------------------------------------------------------------
/scala/sbt/dse/src/test/resources/keystore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataStax-Examples/SparkBuildExamples/554374f755d1f1c381cdcf8ba5e287d7c3204a28/scala/sbt/dse/src/test/resources/keystore
--------------------------------------------------------------------------------
/scala/sbt/dse/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | # for production, you should probably set pattern to %c instead of %l.
18 | # (%l is slower.)
19 |
20 | # output messages into a rolling log file as well as stdout
21 | log4j.rootLogger=WARN,stdout
22 |
23 | # stdout
24 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
25 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
26 | log4j.appender.stdout.layout.ConversionPattern=%5p %d{HH:mm:ss,SSS} %C (%F:%L) - %m%n
27 |
28 | # Avoid "no host ID found" when starting a fresh node
29 | log4j.logger.org.apache.cassandra.db.SystemKeyspace=ERROR
30 |
31 | # Avoid "address already in use" when starting multiple local Spark masters
32 | log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR
33 | log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR
34 |
35 | # Suppress some warnings
36 | log4j.logger.com.datastax.spark.connector=INFO
37 | log4j.logger.org.apache.cassandra=ERROR
38 | log4j.logger.com.datastax.driver.core.NettyUtil=ERROR
39 | log4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR
40 | log4j.logger.org.apache.cassandra.utils.CLibrary=ERROR
41 | log4j.logger.org.apache.cassandra.service.StartupChecks=ERROR
42 | log4j.logger.org.spark-project.jetty.server.Server=ERROR
43 | log4j.logger.org.eclipse.jetty.server.Server=ERROR
44 |
--------------------------------------------------------------------------------
/scala/sbt/dse/src/test/resources/logback.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |   <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
3 |     <encoder>
4 |       <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
5 |     </encoder>
6 |   </appender>
7 |
8 |   <root level="WARN">
9 |     <appender-ref ref="STDOUT" />
10 |   </root>
11 | </configuration>
--------------------------------------------------------------------------------
/scala/sbt/dse/src/test/resources/metrics.properties:
--------------------------------------------------------------------------------
1 | #*.sink.csv.class=org.apache.spark.metrics.sink.CsvSink
2 | #
3 | ## Polling period for CsvSink
4 | #*.sink.csv.period=1
5 | #
6 | #*.sink.csv.unit=seconds
7 | #
8 | ## Polling directory for CsvSink
9 | #*.sink.csv.directory=/tmp/spark/sink
10 |
--------------------------------------------------------------------------------
/scala/sbt/dse/src/test/resources/triggers/README.txt:
--------------------------------------------------------------------------------
1 | Place triggers to be loaded in this directory, as jar files.
2 |
--------------------------------------------------------------------------------
/scala/sbt/dse/src/test/resources/truststore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataStax-Examples/SparkBuildExamples/554374f755d1f1c381cdcf8ba5e287d7c3204a28/scala/sbt/dse/src/test/resources/truststore
--------------------------------------------------------------------------------
/scala/sbt/oss/.gitignore:
--------------------------------------------------------------------------------
1 | *.iml
2 | .idea/
3 | project/target
4 | target
5 |
--------------------------------------------------------------------------------
/scala/sbt/oss/build.sbt:
--------------------------------------------------------------------------------
1 | name := "writeRead"
2 | version := "0.1"
3 |
4 | scalaVersion := "2.11.8"
5 |
6 | val sparkVersion = "2.2.2"
7 | val connectorVersion = "2.0.10"
8 |
9 | // Please make sure that the following dependencies have versions corresponding to the ones in your cluster.
10 | // Note that spark-cassandra-connector should be provided via the '--packages' flag of the spark-submit command (see the example command below).
11 | libraryDependencies ++= Seq(
12 | "org.apache.spark" %% "spark-core" % sparkVersion % "provided",
13 | "org.apache.spark" %% "spark-sql" % sparkVersion % "provided",
14 | "org.apache.spark" %% "spark-hive" % sparkVersion % "provided",
15 | "com.datastax.spark" %% "spark-cassandra-connector" % connectorVersion % "provided"
16 | )
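// An illustrative spark-submit invocation for the versions above; the main class and
// jar path assume this project's layout and sbt-assembly's default artifact name:
//   spark-submit \
//     --packages com.datastax.spark:spark-cassandra-connector_2.11:2.0.10 \
//     --class com.datastax.spark.example.WriteRead \
//     target/scala-2.11/writeRead-assembly-0.1.jar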
17 |
18 | //Your dependencies
19 | //libraryDependencies += "org.apache.commons" % "commons-math3" % "3.6.1"
20 | //libraryDependencies += "org.apache.commons" % "commons-csv" % "1.0"
21 |
22 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false)
23 | //assemblyShadeRules in assembly := Seq(
24 | // ShadeRule.rename("org.apache.commons.csv.**" -> "shaded.org.apache.commons.csv.@1").inAll
25 | //)
26 |
--------------------------------------------------------------------------------
/scala/sbt/oss/project/assembly.sbt:
--------------------------------------------------------------------------------
1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5")
2 |
--------------------------------------------------------------------------------
/scala/sbt/oss/src/main:
--------------------------------------------------------------------------------
1 | ../../../sbt/dse/src/main/
--------------------------------------------------------------------------------
/scala/sbt/oss/src/test:
--------------------------------------------------------------------------------
1 | ../../../sbt/dse/src/test
--------------------------------------------------------------------------------