├── .gitignore
├── LICENSE
├── README.md
├── data
    ├── inputfile.txt
    └── people.txt
├── pom.xml
├── renovate.json
└── src
    ├── main
        ├── java
        │   └── com
        │   │   └── javachen
        │   │       └── spark
        │   │           └── examples
        │   │               ├── mllib
        │   │                   └── JavaALS.java
        │   │               ├── rdd
        │   │                   └── JavaWordCount.java
        │   │               └── sparksql
        │   │                   ├── JavaSparkSQLByReflection.java
        │   │                   └── JavaSparkSQLBySchema.java
        ├── python
        │   ├── PythonALS.py
        │   ├── PythonSparkSQLByReflection.py
        │   ├── PythonSparkSQLBySchema.py
        │   └── PythonWordCount.py
        └── scala
        │   ├── com
        │       └── javachen
        │       │   └── spark
        │       │       └── examples
        │       │           ├── mllib
        │       │               ├── EvaluateResult.scala
        │       │               ├── MovieLensALS.scala
        │       │               ├── MovieSimilarities.scala
        │       │               ├── ScalaLocalALS.scala
        │       │               └── ScalaMovieLensALS.scala
        │       │           ├── rdd
        │       │               ├── ActionTest.scala
        │       │               ├── Aggregate.scala
        │       │               ├── AggregateOrder.scala
        │       │               ├── Cartesian.scala
        │       │               ├── CollectAsMap.scala
        │       │               ├── FlatMap.scala
        │       │               ├── GroupByAction.scala
        │       │               ├── GroupByKey.scala
        │       │               ├── GroupWith.scala
        │       │               ├── Join.scala
        │       │               ├── Lookup.scala
        │       │               ├── MapPartitions.scala
        │       │               ├── MapValues.scala
        │       │               ├── PartitionBy.scala
        │       │               ├── Pipe.scala
        │       │               ├── ReduceByKey.scala
        │       │               ├── ScalaWordCount.scala
        │       │               └── TransformTest.scala
        │       │           └── sparksql
        │       │               ├── ScalaSparkSQLByReflection.scala
        │       │               └── ScalaSparkSQLBySchema.scala
        │   └── org
        │       └── apache
        │           └── spark
        │               └── examples
        │                   ├── BroadcastTest.scala
        │                   ├── DriverSubmissionTest.scala
        │                   ├── ExceptionHandlingTest.scala
        │                   ├── GroupByTest.scala
        │                   ├── HdfsTest.scala
        │                   ├── LocalALS.scala
        │                   ├── LocalFileLR.scala
        │                   ├── LocalKMeans.scala
        │                   ├── LocalLR.scala
        │                   ├── LocalPi.scala
        │                   ├── LogQuery.scala
        │                   ├── MultiBroadcastTest.scala
        │                   ├── SimpleSkewedGroupByTest.scala
        │                   ├── SkewedGroupByTest.scala
        │                   ├── SparkALS.scala
        │                   ├── SparkHdfsLR.scala
        │                   ├── SparkKMeans.scala
        │                   ├── SparkLR.scala
        │                   ├── SparkPageRank.scala
        │                   ├── SparkPi.scala
        │                   ├── SparkTC.scala
        │                   ├── SparkTachyonHdfsLR.scala
        │                   └── SparkTachyonPi.scala
    └── test
        └── java
            └── com
                └── javachen
                    └── spark
                        └── AppTest.java


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.iml
 2 | .idea/
 3 | target/
 4 | *.class
 5 | 
 6 | # Mobile Tools for Java (J2ME)
 7 | .mtj.tmp/
 8 | 
 9 | # Package Files #
10 | *.jar
11 | *.war
12 | *.ear
13 | 
14 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
15 | hs_err_pid*


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright {yyyy} {name of copyright owner}
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 
203 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # learning-spark
2 | 
3 | Learning to write Spark examples
4 | 
5 | # Links
6 | 
7 | - https://github.com/JerryLead/SparkLearning
8 | - https://github.com/ceteri/spark-exercises
9 | - https://github.com/databricks/reference-apps


--------------------------------------------------------------------------------
/data/inputfile.txt:
--------------------------------------------------------------------------------
1 | apple
2 | banana counter
3 | counter one two three
4 | three one
5 | five seven eight
6 | twenty one three five counter six
7 | one siz helga
8 | apple banana fiver


--------------------------------------------------------------------------------
/data/people.txt:
--------------------------------------------------------------------------------
1 | Michael, 29
2 | Andy, 30
3 | Justin, 19


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://maven.apache.org/POM/4.0.0"
  2 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  3 |     <modelVersion>4.0.0</modelVersion>
  4 |     <groupId>com.javachen.spark</groupId>
  5 |     <artifactId>learning-spark</artifactId>
  6 |     <packaging>jar</packaging>
  7 |     <version>1.0-SNAPSHOT</version>
  8 |     <name>learning-spark</name>
  9 |     <url>http://maven.apache.org</url>
 10 | 
 11 |     <properties>
 12 | 
 13 |         <!-- Maven settings -->
 14 |         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 15 |         <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
 16 |         <!-- Dependency versions -->
 17 |         <java.version>1.7</java.version>
 18 |         <scala.minor.version>2.10</scala.minor.version>
 19 |         <scala.complete.version>${scala.minor.version}.4</scala.complete.version>
 20 |         <scala.macros.version>2.1.0</scala.macros.version>
 21 | 
 22 |         <hadoop.version>2.10.2</hadoop.version>
 23 |         <hbase.version>1.7.2</hbase.version>
 24 |         <spark.version>1.6.3.2.6.5.0-292</spark.version>
 25 | 
 26 |         <slf4j.version>1.7.12</slf4j.version>
 27 | 
 28 |         <PermGen>64m</PermGen>
 29 |         <MaxPermGen>512m</MaxPermGen>
 30 |     </properties>
 31 | 
 32 | 
 33 |     <repositories>
 34 |         <repository>
 35 |             <id>spring-snapshots</id>
 36 |             <url>https://repo.spring.io/snapshot</url>
 37 |             <snapshots>
 38 |                 <enabled>true</enabled>
 39 |             </snapshots>
 40 |         </repository>
 41 |         <repository>
 42 |             <id>spring-milestones</id>
 43 |             <url>https://repo.spring.io/milestone</url>
 44 |         </repository>
 45 |         <repository>
 46 |             <id>cloudera repository</id>
 47 |             <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
 48 |         </repository>
 49 |         <repository>
 50 |             <id>Sonatype Releases</id>
 51 |             <url>https://oss.sonatype.org/content/repositories/releases/</url>
 52 |         </repository>
 53 |     </repositories>
 54 | 
 55 |     <dependencies>
 56 |         <dependency>
 57 |             <groupId>org.apache.spark</groupId>
 58 |             <artifactId>spark-core_${scala.minor.version}</artifactId>
 59 |             <version>${spark.version}</version>
 60 |             <scope>provided</scope>
 61 |             <exclusions>
 62 |                 <exclusion>
 63 |                     <groupId>org.apache.hadoop</groupId>
 64 |                     <artifactId>hadoop-client</artifactId>
 65 |                 </exclusion>
 66 |             </exclusions>
 67 |         </dependency>
 68 |         <dependency>
 69 |             <groupId>org.apache.spark</groupId>
 70 |             <artifactId>spark-streaming_${scala.minor.version}</artifactId>
 71 |             <version>${spark.version}</version>
 72 |             <exclusions>
 73 |                 <exclusion>
 74 |                     <groupId>org.apache.hadoop</groupId>
 75 |                     <artifactId>hadoop-client</artifactId>
 76 |                 </exclusion>
 77 |             </exclusions>
 78 |         </dependency>
 79 |         <dependency>
 80 |             <groupId>org.apache.spark</groupId>
 81 |             <artifactId>spark-mllib_${scala.minor.version}</artifactId>
 82 |             <version>${spark.version}</version>
 83 |             <scope>provided</scope>
 84 |             <exclusions>
 85 |                 <exclusion>
 86 |                     <groupId>org.apache.hadoop</groupId>
 87 |                     <artifactId>hadoop-client</artifactId>
 88 |                 </exclusion>
 89 |             </exclusions>
 90 |         </dependency>
 91 |         <dependency>
 92 |             <groupId>org.apache.spark</groupId>
 93 |             <artifactId>spark-sql_${scala.minor.version}</artifactId>
 94 |             <version>${spark.version}</version>
 95 |             <scope>provided</scope>
 96 |             <exclusions>
 97 |                 <exclusion>
 98 |                     <groupId>org.apache.hadoop</groupId>
 99 |                     <artifactId>hadoop-client</artifactId>
100 |                 </exclusion>
101 |                 <exclusion>
102 |                     <groupId>org.apache.hive</groupId>
103 |                     <artifactId>hive-exec</artifactId>
104 |                 </exclusion>
105 |             </exclusions>
106 |         </dependency>
107 |         <dependency>
108 |             <groupId>org.apache.spark</groupId>
109 |             <artifactId>spark-hive_${scala.minor.version}</artifactId>
110 |             <version>${spark.version}</version>
111 |             <scope>provided</scope>
112 |             <exclusions>
113 |                 <exclusion>
114 |                     <groupId>org.apache.hadoop</groupId>
115 |                     <artifactId>hadoop-client</artifactId>
116 |                 </exclusion>
117 |                 <exclusion>
118 |                     <groupId>org.apache.hive</groupId>
119 |                     <artifactId>hive-exec</artifactId>
120 |                 </exclusion>
121 |             </exclusions>
122 |         </dependency>
123 | 
124 | 
125 |         <dependency>
126 |             <groupId>org.apache.hadoop</groupId>
127 |             <artifactId>hadoop-hdfs</artifactId>
128 |             <version>${hadoop.version}</version>
129 |             <scope>provided</scope>
130 |             <exclusions>
131 |                 <exclusion>
132 |                     <groupId>javax.servlet</groupId>
133 |                     <artifactId>servlet-api</artifactId>
134 |                 </exclusion>
135 |                 <exclusion>
136 |                     <groupId>javax.servlet.jsp</groupId>
137 |                     <artifactId>jsp-api</artifactId>
138 |                 </exclusion>
139 |                 <exclusion>
140 |                     <groupId>org.mortbay.jetty</groupId>
141 |                     <artifactId>servlet-api-2.5</artifactId>
142 |                 </exclusion>
143 |                 <exclusion>
144 |                     <groupId>com.google.guava</groupId>
145 |                     <artifactId>guava</artifactId>
146 |                 </exclusion>
147 |             </exclusions>
148 |         </dependency>
149 |         <dependency>
150 |             <groupId>org.apache.hadoop</groupId>
151 |             <artifactId>hadoop-client</artifactId>
152 |             <version>${hadoop.version}</version>
153 |             <scope>provided</scope>
154 |             <exclusions>
155 |                 <exclusion>
156 |                     <groupId>javax.servlet</groupId>
157 |                     <artifactId>servlet-api</artifactId>
158 |                 </exclusion>
159 |                 <exclusion>
160 |                     <groupId>javax.servlet.jsp</groupId>
161 |                     <artifactId>jsp-api</artifactId>
162 |                 </exclusion>
163 |                 <exclusion>
164 |                     <groupId>org.mortbay.jetty</groupId>
165 |                     <artifactId>servlet-api-2.5</artifactId>
166 |                 </exclusion>
167 |                 <exclusion>
168 |                     <groupId>com.google.guava</groupId>
169 |                     <artifactId>guava</artifactId>
170 |                 </exclusion>
171 |             </exclusions>
172 |         </dependency>
173 | 
174 |         <dependency>
175 |             <groupId>org.apache.hbase</groupId>
176 |             <artifactId>hbase-client</artifactId>
177 |             <version>${hbase.version}</version>
178 |             <exclusions>
179 |                 <exclusion>
180 |                     <groupId>org.apache.hadoop</groupId>
181 |                     <artifactId>hadoop-core</artifactId>
182 |                 </exclusion>
183 |                 <exclusion>
184 |                     <groupId>com.sun.jersey</groupId>
185 |                     <artifactId>jersey-json</artifactId>
186 |                 </exclusion>
187 |                 <exclusion>
188 |                     <groupId>org.slf4j</groupId>
189 |                     <artifactId>slf4j-log4j12</artifactId>
190 |                 </exclusion>
191 |                 <exclusion>
192 |                     <groupId>org.mortbay.jetty</groupId>
193 |                     <artifactId>servlet-api-2.5</artifactId>
194 |                 </exclusion>
195 |                 <exclusion>
196 |                     <groupId>com.google.guava</groupId>
197 |                     <artifactId>guava</artifactId>
198 |                 </exclusion>
199 |             </exclusions>
200 |         </dependency>
201 |         <dependency>
202 |             <groupId>org.apache.hbase</groupId>
203 |             <artifactId>hbase-server</artifactId>
204 |             <version>${hbase.version}</version>
205 |             <exclusions>
206 |                 <exclusion>
207 |                     <groupId>org.apache.hadoop</groupId>
208 |                     <artifactId>hadoop-core</artifactId>
209 |                 </exclusion>
210 |                 <exclusion>
211 |                     <groupId>org.mortbay.jetty</groupId>
212 |                     <artifactId>jsp-2.1</artifactId>
213 |                 </exclusion>
214 |                 <exclusion>
215 |                     <groupId>org.mortbay.jetty</groupId>
216 |                     <artifactId>jsp-api-2.1</artifactId>
217 |                 </exclusion>
218 |                 <exclusion>
219 |                     <groupId>javax.servlet.jsp</groupId>
220 |                     <artifactId>jsp-api</artifactId>
221 |                 </exclusion>
222 |                 <exclusion>
223 |                     <groupId>org.mortbay.jetty</groupId>
224 |                     <artifactId>servlet-api-2.5</artifactId>
225 |                 </exclusion>
226 |                 <exclusion>
227 |                     <groupId>com.sun.jersey</groupId>
228 |                     <artifactId>jersey-json</artifactId>
229 |                 </exclusion>
230 |                 <exclusion>
231 |                     <groupId>org.codehaus.jackson</groupId>
232 |                     <artifactId>jackson-mapper-asl</artifactId>
233 |                 </exclusion>
234 |                 <exclusion>
235 |                     <groupId>org.codehaus.jackson</groupId>
236 |                     <artifactId>jackson-core-asl</artifactId>
237 |                 </exclusion>
238 |                 <exclusion>
239 |                     <groupId>org.codehaus.jackson</groupId>
240 |                     <artifactId>jackson-jaxrs</artifactId>
241 |                 </exclusion>
242 |                 <exclusion>
243 |                     <groupId>javax.servlet</groupId>
244 |                     <artifactId>servlet-api</artifactId>
245 |                 </exclusion>
246 |                 <exclusion>
247 |                     <groupId>javax.servlet.jsp</groupId>
248 |                     <artifactId>jsp-api</artifactId>
249 |                 </exclusion>
250 |                 <exclusion>
251 |                     <groupId>org.mortbay.jetty</groupId>
252 |                     <artifactId>servlet-api-2.5</artifactId>
253 |                 </exclusion>
254 |                 <exclusion>
255 |                     <groupId>com.google.guava</groupId>
256 |                     <artifactId>guava</artifactId>
257 |                 </exclusion>
258 |             </exclusions>
259 |         </dependency>
260 | 
261 |         <!-- scala -->
262 |         <dependency>
263 |             <groupId>org.scalanlp</groupId>
264 |             <artifactId>breeze_${scala.minor.version}</artifactId>
265 |             <!-- Scala binary suffix comes from scala.minor.version (currently 2.10; a 2.11 build also exists) -->
266 |             <version>0.13.2</version>
267 |             <scope>provided</scope>
268 |         </dependency>
269 |         <dependency>
270 |             <groupId>org.scala-lang</groupId>
271 |             <artifactId>scala-compiler</artifactId>
272 |             <version>${scala.complete.version}</version>
273 |             <scope>provided</scope>
274 |         </dependency>
275 |         <dependency>
276 |             <groupId>org.scala-lang</groupId>
277 |             <artifactId>scala-reflect</artifactId>
278 |             <version>${scala.complete.version}</version>
279 |             <scope>provided</scope>
280 |         </dependency>
281 |         <dependency>
282 |             <groupId>org.scala-lang</groupId>
283 |             <artifactId>jline</artifactId>
284 |             <version>${scala.complete.version}</version>
285 |             <scope>provided</scope>
286 |         </dependency>
287 |         <dependency>
288 |             <groupId>org.scala-lang</groupId>
289 |             <artifactId>scala-library</artifactId>
290 |             <version>${scala.complete.version}</version>
291 |             <scope>provided</scope>
292 |         </dependency>
293 |         <dependency>
294 |             <groupId>org.scala-lang</groupId>
295 |             <artifactId>scala-actors</artifactId>
296 |             <version>${scala.complete.version}</version>
297 |             <scope>provided</scope>
298 |         </dependency>
299 |         <dependency>
300 |             <groupId>org.scala-lang</groupId>
301 |             <artifactId>scalap</artifactId>
302 |             <version>${scala.complete.version}</version>
303 |             <scope>provided</scope>
304 |         </dependency>
305 |         <dependency>
306 |             <groupId>org.scalaj</groupId>
307 |             <artifactId>scalaj-collection_${scala.minor.version}</artifactId>
308 |             <version>1.6</version>
309 |             <scope>provided</scope>
310 |         </dependency>
311 | 
312 |         <dependency>
313 |             <groupId>com.github.scopt</groupId>
314 |             <artifactId>scopt_${scala.minor.version}</artifactId>
315 |             <version>3.7.1</version>
316 |         </dependency>
317 | 
318 |     </dependencies>
319 | 
320 |     <build>
321 |         <plugins>
322 |             <plugin>
323 |                 <groupId>org.apache.maven.plugins</groupId>
324 |                 <artifactId>maven-compiler-plugin</artifactId>
325 |                 <version>3.13.0</version>
326 |                 <configuration>
327 |                     <optimize>true</optimize>
328 |                     <showDeprecation>true</showDeprecation>
329 |                     <showWarnings>true</showWarnings>
330 |                     <source>${java.version}</source>
331 |                     <target>${java.version}</target>
332 |                     <compilerArgs>
333 |                         <compilerArg>-Xlint:all,-serial,-try</compilerArg>
334 |                     </compilerArgs>
335 |                 </configuration>
336 |             </plugin>
337 |             <plugin>
338 |                 <groupId>org.apache.maven.plugins</groupId>
339 |                 <artifactId>maven-clean-plugin</artifactId>
340 |                 <version>2.6.1</version>
341 |             </plugin>
342 |             <plugin>
343 |                 <groupId>org.apache.maven.plugins</groupId>
344 |                 <artifactId>maven-source-plugin</artifactId>
345 |                 <version>2.4</version>
346 |                 <executions>
347 |                     <execution>
348 |                         <id>attach-sources</id>
349 |                         <goals>
350 |                             <goal>jar</goal>
351 |                         </goals>
352 |                     </execution>
353 |                 </executions>
354 |             </plugin>
355 |             <plugin>
356 |                 <groupId>org.apache.maven.plugins</groupId>
357 |                 <artifactId>maven-jar-plugin</artifactId>
358 |                 <version>2.6</version>
359 |                 <configuration>
360 |                     <skipIfEmpty>true</skipIfEmpty>
361 |                 </configuration>
362 |                 <executions>
363 |                     <execution>
364 |                         <goals>
365 |                             <goal>test-jar</goal>
366 |                         </goals>
367 |                     </execution>
368 |                 </executions>
369 |             </plugin>
370 |             <plugin>
371 |                 <groupId>org.apache.maven.plugins</groupId>
372 |                 <artifactId>maven-resources-plugin</artifactId>
373 |                 <version>2.7</version>
374 |                 <configuration>
375 |                     <encoding>UTF-8</encoding>
376 |                 </configuration>
377 |             </plugin>
378 | 
379 |             <plugin>
380 |                 <groupId>org.apache.maven.plugins</groupId>
381 |                 <artifactId>maven-install-plugin</artifactId>
382 |                 <version>2.5.2</version>
383 |                 <configuration>
384 |                     <createChecksum>true</createChecksum>
385 |                 </configuration>
386 |             </plugin>
387 | 
388 |             <plugin>
389 |                 <groupId>org.apache.maven.plugins</groupId>
390 |                 <artifactId>maven-eclipse-plugin</artifactId>
391 |                 <version>2.10</version>
392 |                 <configuration>
393 |                     <downloadSources>true</downloadSources>
394 |                     <downloadJavadocs>true</downloadJavadocs>
395 |                     <wtpversion>2.0</wtpversion>
396 |                     <sourceIncludes>
397 |                         <sourceInclude>**/*.*</sourceInclude>
398 |                     </sourceIncludes>
399 |                     <additionalBuildcommands>
400 |                         <buildCommand>
401 |                             <name>org.springframework.ide.eclipse.core.springbuilder</name>
402 |                         </buildCommand>
403 |                         <buildCommand>
404 |                             <name>org.eclipse.m2e.core.maven2Builder</name>
405 |                         </buildCommand>
406 |                     </additionalBuildcommands>
407 |                     <additionalProjectnatures>
408 |                         <projectnature>org.eclipse.jdt.core.javanature</projectnature>
409 |                         <projectnature>org.springframework.ide.eclipse.core.springnature</projectnature>
410 |                         <projectnature>org.eclipse.m2e.core.maven2Nature</projectnature>
411 |                     </additionalProjectnatures>
412 |                 </configuration>
413 |             </plugin>
414 |             <plugin> <!-- Runs dependency:sources as part of every install. -->
415 |                 <groupId>org.apache.maven.plugins</groupId>
416 |                 <artifactId>maven-dependency-plugin</artifactId> <!-- NOTE(review): no <version> is declared in this plugin entry; unless one is managed outside this view, consider pinning it like the other plugins. -->
417 |                 <executions>
418 |                     <execution>
419 |                         <id>install</id>
420 |                         <phase>install</phase> <!-- the goal below is bound to the install phase -->
421 |                         <goals>
422 |                             <goal>sources</goal> <!-- resolves the source attachments of the project's dependencies -->
423 |                         </goals>
424 |                     </execution>
425 |                 </executions>
426 |             </plugin>
427 | 
428 |             <plugin> <!-- Compiles the Scala sources (and mixed Java/Scala code) and packages the scaladoc. -->
429 |                 <groupId>net.alchim31.maven</groupId>
430 |                 <artifactId>scala-maven-plugin</artifactId>
431 |                 <version>3.4.6</version>
432 |                 <executions>
433 |                     <execution>
434 |                         <id>scala-compile-first</id>
435 |                         <phase>process-resources</phase> <!-- process-resources precedes the compile phase, so Scala is compiled before maven-compiler-plugin runs -->
436 |                         <goals>
437 |                             <goal>compile</goal>
438 |                         </goals>
439 |                     </execution>
440 |                     <execution>
441 |                         <id>scala-test-compile-first</id>
442 |                         <phase>process-test-resources</phase> <!-- same ordering trick for the test sources -->
443 |                         <goals>
444 |                             <goal>testCompile</goal>
445 |                         </goals>
446 |                     </execution>
447 |                     <execution>
448 |                         <id>attach-scaladocs</id>
449 |                         <phase>verify</phase>
450 |                         <goals>
451 |                             <goal>doc-jar</goal>
452 |                         </goals>
453 |                     </execution>
454 |                 </executions>
455 |                 <configuration>
456 |                     <scalaVersion>${scala.complete.version}</scalaVersion> <!-- same property that versions the org.scala-lang dependencies above -->
457 |                     <recompileMode>incremental</recompileMode>
458 |                     <useZincServer>true</useZincServer>
459 |                     <args> <!-- flags passed to the Scala compiler -->
460 |                         <arg>-unchecked</arg>
461 |                         <arg>-deprecation</arg>
462 |                         <arg>-feature</arg>
463 |                         <arg>-language:postfixOps</arg>
464 |                     </args>
465 |                     <jvmArgs> <!-- options for the JVM that runs the Scala compiler -->
466 |                         <jvmArg>-Xms1024m</jvmArg>
467 |                         <jvmArg>-Xmx1024m</jvmArg>
468 |                         <jvmArg>-XX:PermSize=${PermGen}</jvmArg> <!-- NOTE(review): ${PermGen} and ${MaxPermGen} are defined outside this view; the permanent generation was removed in Java 8, so confirm these two flags are still accepted by the build JDK -->
469 |                         <jvmArg>-XX:MaxPermSize=${MaxPermGen}</jvmArg>
470 |                     </jvmArgs>
471 |                     <javacArgs> <!-- keeps javac on the same level as maven-compiler-plugin, which also uses ${java.version} -->
472 |                         <javacArg>-source</javacArg>
473 |                         <javacArg>${java.version}</javacArg>
474 |                         <javacArg>-target</javacArg>
475 |                         <javacArg>${java.version}</javacArg>
476 |                     </javacArgs>
477 |                     <!-- The following plugin is required to use quasiquotes in Scala 2.10
478 |                         and is used by Spark SQL for code generation. -->
479 |                     <compilerPlugins>
480 |                         <compilerPlugin>
481 |                             <groupId>org.scalamacros</groupId>
482 |                             <artifactId>paradise_${scala.complete.version}</artifactId> <!-- suffixed with the complete Scala version, unlike scopt above which uses ${scala.minor.version} -->
483 |                             <version>${scala.macros.version}</version>
484 |                         </compilerPlugin>
485 |                     </compilerPlugins>
486 |                 </configuration>
487 |             </plugin>
488 | 
489 |             <plugin>
490 |                 <groupId>org.codehaus.mojo</groupId>
491 |                 <artifactId>build-helper-maven-plugin</artifactId>
492 |                 <version>1.12</version>
493 |                 <executions>
494 |                     <execution>
495 |                         <id>add-scala-sources</id>
496 |                         <phase>generate-sources</phase>
497 |                         <goals>
498 |                             <goal>add-source</goal>
499 |                         </goals>
500 |                         <configuration>
501 |                             <sources>
502 |                                 <source>src/main/scala</source>
503 |                             </sources>
504 |                         </configuration>
505 |                     </execution>
506 |                     <execution>
507 |                         <id>add-scala-test-sources</id>
508 |                         <phase>generate-test-sources</phase>
509 |                         <goals>
510 |                             <goal>add-test-source</goal>
511 |                         </goals>
512 |                         <configuration>
513 |                             <sources>
514 |                                 <source>src/test/scala</source>
515 |                             </sources>
516 |                         </configuration>
517 |                     </execution>
518 |                 </executions>
519 |             </plugin>
520 | 
521 |             <plugin> <!-- Builds a shaded jar during the package phase. -->
522 |                 <groupId>org.apache.maven.plugins</groupId>
523 |                 <artifactId>maven-shade-plugin</artifactId>
524 |                 <version>1.7.1</version>
525 |                 <executions>
526 |                     <execution>
527 |                         <phase>package</phase>
528 |                         <goals>
529 |                             <goal>shade</goal>
530 |                         </goals>
531 |                     </execution>
532 |                 </executions>
533 |                 <configuration>
534 |                     <transformers>
535 |                         <transformer
536 |                                 implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
537 |                             <mainClass></mainClass> <!-- NOTE(review): left empty, so no entry point is named here; confirm that is intended, since the jar contains several example main classes -->
538 |                         </transformer>
539 |                     </transformers>
540 |                     <filters>
541 |                         <filter>
542 |                             <artifact>*:*</artifact> <!-- applies to every artifact merged into the jar -->
543 |                             <excludes> <!-- drop the jar signature files of the merged artifacts; presumably because they would no longer match the repackaged contents -->
544 |                                 <exclude>META-INF/*.SF</exclude>
545 |                                 <exclude>META-INF/*.DSA</exclude>
546 |                                 <exclude>META-INF/*.RSA</exclude>
547 |                             </excludes>
548 |                         </filter>
549 |                     </filters>
550 |                 </configuration>
551 |             </plugin>
552 |         </plugins>
553 | 
554 |         <pluginManagement>
555 |             <plugins>
556 |                 <plugin>
557 |                     <groupId>org.eclipse.m2e</groupId>
558 |                     <artifactId>lifecycle-mapping</artifactId>
559 |                     <version>1.0.0</version>
560 |                     <configuration>
561 |                         <lifecycleMappingMetadata>
562 |                             <pluginExecutions>
563 |                                 <pluginExecution>
564 |                                     <pluginExecutionFilter>
565 |                                         <groupId>org.apache.maven.plugins</groupId>
566 |                                         <artifactId>maven-enforcer-plugin</artifactId>
567 |                                         <versionRange>[1.0.0,)</versionRange>
568 |                                         <goals>
569 |                                             <goal>enforce</goal>
570 |                                         </goals>
571 |                                     </pluginExecutionFilter>
572 |                                     <action>
573 |                                         <execute/>
574 |                                     </action>
575 |                                 </pluginExecution>
576 |                                 <pluginExecution>
577 |                                     <pluginExecutionFilter>
578 |                                         <groupId>org.apache.maven.plugins</groupId>
579 |                                         <artifactId>maven-dependency-plugin</artifactId>
580 |                                         <versionRange>[2.4,)</versionRange>
581 |                                         <goals>
582 |                                             <goal>unpack</goal>
583 |                                             <goal>sources</goal>
584 |                                         </goals>
585 |                                     </pluginExecutionFilter>
586 |                                     <action>
587 |                                         <execute/>
588 |                                     </action>
589 |                                 </pluginExecution>
590 |                             </pluginExecutions>
591 |                         </lifecycleMappingMetadata>
592 |                     </configuration>
593 |                 </plugin>
594 |             </plugins>
595 |         </pluginManagement>
596 |     </build>
597 | </project>
598 | 


--------------------------------------------------------------------------------
/renovate.json:
--------------------------------------------------------------------------------
1 | {
2 |   "$schema": "https://docs.renovatebot.com/renovate-schema.json",
3 |   "extends": [
4 |     "config:recommended"
5 |   ]
6 | }
7 | 


--------------------------------------------------------------------------------
/src/main/java/com/javachen/spark/examples/mllib/JavaALS.java:
--------------------------------------------------------------------------------
 1 | package com.javachen.spark.examples.mllib;
 2 | 
 3 | import org.apache.spark.SparkConf;
 4 | import org.apache.spark.api.java.JavaDoubleRDD;
 5 | import org.apache.spark.api.java.JavaPairRDD;
 6 | import org.apache.spark.api.java.JavaRDD;
 7 | import org.apache.spark.api.java.JavaSparkContext;
 8 | import org.apache.spark.api.java.function.Function;
 9 | import org.apache.spark.mllib.recommendation.ALS;
10 | import org.apache.spark.mllib.recommendation.MatrixFactorizationModel;
11 | import org.apache.spark.mllib.recommendation.Rating;
12 | import scala.Tuple2;
13 | 
14 | public class JavaALS {
15 |     public static void main(String[] args) {
16 |         SparkConf conf = new SparkConf().setAppName("Java Collaborative Filtering Example");
17 |         JavaSparkContext sc = new JavaSparkContext(conf);
18 | 
19 |         // Load and parse the data
20 |         String path = "data/mllib/als/test.data";
21 |         JavaRDD<String> data = sc.textFile(path);
22 |         JavaRDD<Rating> ratings = data.map(
23 |                 new Function<String, Rating>() {
24 |                     public Rating call(String s) {
25 |                         String[] sarray = s.split(",");
26 |                         return new Rating(Integer.parseInt(sarray[0]), Integer.parseInt(sarray[1]),
27 |                                 Double.parseDouble(sarray[2]));
28 |                     }
29 |                 }
30 |         );
31 | 
32 |         // Build the recommendation model using ALS
33 |         int rank = 10;
34 |         int numIterations = 20;
35 |         MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(ratings), rank, numIterations, 0.01);
36 | 
37 |         // Evaluate the model on rating data
38 |         JavaRDD<Tuple2<Object, Object>> userProducts = ratings.map(
39 |                 new Function<Rating, Tuple2<Object, Object>>() {
40 |                     public Tuple2<Object, Object> call(Rating r) {
41 |                         return new Tuple2<Object, Object>(r.user(), r.product());
42 |                     }
43 |                 }
44 |         );
45 |         JavaPairRDD<Tuple2<Integer, Integer>, Double> predictions = JavaPairRDD.fromJavaRDD(
46 |                 model.predict(JavaRDD.toRDD(userProducts)).toJavaRDD().map(
47 |                         new Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {
48 |                             public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating r) {
49 |                                 return new Tuple2<Tuple2<Integer, Integer>, Double>(
50 |                                         new Tuple2<Integer, Integer>(r.user(), r.product()), r.rating());
51 |                             }
52 |                         }
53 |                 ));
54 |         JavaRDD<Tuple2<Double, Double>> ratesAndPreds =
55 |                 JavaPairRDD.fromJavaRDD(ratings.map(
56 |                         new Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {
57 |                             public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating r) {
58 |                                 return new Tuple2<Tuple2<Integer, Integer>, Double>(
59 |                                         new Tuple2<Integer, Integer>(r.user(), r.product()), r.rating());
60 |                             }
61 |                         }
62 |                 )).join(predictions).values();
63 | 
64 |         double MSE = JavaDoubleRDD.fromRDD(ratesAndPreds.map(
65 |                 new Function<Tuple2<Double, Double>, Object>() {
66 |                     public Object call(Tuple2<Double, Double> pair) {
67 |                         Double err = pair._1() - pair._2();
68 |                         return err * err;
69 |                     }
70 |                 }
71 |         ).rdd()).mean();
72 |         System.out.println("Mean Squared Error = " + MSE);
73 |     }
74 | }


--------------------------------------------------------------------------------
/src/main/java/com/javachen/spark/examples/rdd/JavaWordCount.java:
--------------------------------------------------------------------------------
 1 | package com.javachen.spark.examples.rdd;
 2 | 
 3 | import org.apache.spark.SparkConf;
 4 | import org.apache.spark.api.java.JavaPairRDD;
 5 | import org.apache.spark.api.java.JavaRDD;
 6 | import org.apache.spark.api.java.JavaSparkContext;
 7 | import org.apache.spark.api.java.function.FlatMapFunction;
 8 | import org.apache.spark.api.java.function.Function;
 9 | import org.apache.spark.api.java.function.Function2;
10 | import org.apache.spark.api.java.function.PairFunction;
11 | import scala.Tuple2;
12 | 
13 | import java.util.ArrayList;
14 | import java.util.Arrays;
15 | import java.util.Collection;
16 | 
17 | /**
18 |  * @author <a href="mailto:june.chan@foxmail.com">june</a>.
19 |  * @date 2015-05-06 16:20.
20 |  */
21 | public class JavaWordCount {
22 |     public static void main(String[] args) {
23 |         JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("JavaWordCount"));
24 |         final int threshold = Integer.parseInt(args[1]);
25 | 
26 |         // split each document into words
27 |         JavaRDD<String> tokenized = sc.textFile(args[0]).flatMap(
28 |                 new FlatMapFunction<String, String>() {
29 |                     @Override
30 |                     public Iterable<String> call(String s) {
31 |                         return Arrays.asList(s.split(" "));
32 |                     }
33 |                 }
34 |         );
35 | 
36 |         // count the occurrence of each word
37 |         JavaPairRDD<String, Integer> counts = tokenized.mapToPair(
38 |                 new PairFunction<String, String, Integer>() {
39 |                     @Override
40 |                     public Tuple2<String, Integer> call(String s) {
41 |                         return new Tuple2<String, Integer>(s, 1);
42 |                     }
43 |                 }
44 |         ).reduceByKey(
45 |                 new Function2<Integer, Integer, Integer>() {
46 |                     @Override
47 |                     public Integer call(Integer i1, Integer i2) {
48 |                         return i1 + i2;
49 |                     }
50 |                 }
51 |         );
52 | 
53 |         // filter out words with less than threshold occurrences
54 |         JavaPairRDD<String, Integer> filtered = counts.filter(
55 |                 new Function<Tuple2<String, Integer>, Boolean>() {
56 |                     @Override
57 |                     public Boolean call(Tuple2<String, Integer> tup) {
58 |                         return tup._2() >= threshold;
59 |                     }
60 |                 }
61 |         );
62 | 
63 |         // count characters
64 |         JavaPairRDD<Character, Integer> charCounts = filtered.flatMap(
65 |                 new FlatMapFunction<Tuple2<String, Integer>, Character>() {
66 |                     @Override
67 |                     public Iterable<Character> call(Tuple2<String, Integer> s) {
68 |                         Collection<Character> chars = new ArrayList<Character>(s._1().length());
69 |                         for (char c : s._1().toCharArray()) {
70 |                             chars.add(c);
71 |                         }
72 |                         return chars;
73 |                     }
74 |                 }
75 |         ).mapToPair(
76 |                 new PairFunction<Character, Character, Integer>() {
77 |                     @Override
78 |                     public Tuple2<Character, Integer> call(Character c) {
79 |                         return new Tuple2<Character, Integer>(c, 1);
80 |                     }
81 |                 }
82 |         ).reduceByKey(
83 |                 new Function2<Integer, Integer, Integer>() {
84 |                     @Override
85 |                     public Integer call(Integer i1, Integer i2) {
86 |                         return i1 + i2;
87 |                     }
88 |                 }
89 |         );
90 | 
91 |         System.out.println(charCounts.collect());
92 |     }
93 | }


--------------------------------------------------------------------------------
/src/main/java/com/javachen/spark/examples/sparksql/JavaSparkSQLByReflection.java:
--------------------------------------------------------------------------------
  1 | package com.javachen.spark.examples.sparksql;
  2 | 
  3 | import org.apache.spark.SparkConf;
  4 | import org.apache.spark.api.java.JavaRDD;
  5 | import org.apache.spark.api.java.JavaSparkContext;
  6 | import org.apache.spark.api.java.function.Function;
  7 | import org.apache.spark.sql.DataFrame;
  8 | import org.apache.spark.sql.Row;
  9 | import org.apache.spark.sql.SQLContext;
 10 | 
 11 | import java.io.Serializable;
 12 | import java.util.Arrays;
 13 | import java.util.List;
 14 | 
 15 | public class JavaSparkSQLByReflection {
 16 |     public static void main(String[] args) throws Exception {
 17 |         SparkConf sparkConf = new SparkConf().setAppName("JavaSparkSQLByReflection");
 18 |         JavaSparkContext ctx = new JavaSparkContext(sparkConf);
 19 |         SQLContext sqlCtx = new SQLContext(ctx);
 20 | 
 21 |         System.out.println("=== Data source: RDD ===");
 22 |         // Load a text file and convert each line to a Java Bean.
 23 |         JavaRDD<People> people = ctx.textFile("people.txt").map(
 24 |                 new Function<String, People>() {
 25 |                     @Override
 26 |                     public People call(String line) {
 27 |                         String[] parts = line.split(",");
 28 | 
 29 |                         People people = new People();
 30 |                         people.setName(parts[0]);
 31 |                         people.setAge(Integer.parseInt(parts[1].trim()));
 32 |                         return people;
 33 |                     }
 34 |                 });
 35 | 
 36 |         // Apply a schema to an RDD of Java Beans and register it as a table.
 37 |         DataFrame schemaPeople = sqlCtx.createDataFrame(people, People.class);
 38 |         schemaPeople.registerTempTable("people");
 39 | 
 40 |         // SQL can be run over RDDs that have been registered as tables.
 41 |         DataFrame teenagers = sqlCtx.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");
 42 | 
 43 |         // The results of SQL queries are DataFrames and support all the normal RDD operations.
 44 |         // The columns of a row in the result can be accessed by ordinal.
 45 |         List<String> teenagerNames = teenagers.toJavaRDD().map(new Function<Row, String>() {
 46 |             @Override
 47 |             public String call(Row row) {
 48 |                 return "Name: " + row.getString(0);
 49 |             }
 50 |         }).collect();
 51 | 
 52 |         for (String name : teenagerNames) {
 53 |             System.out.println(name);
 54 |         }
 55 | 
 56 | 
 57 |         System.out.println("=== Data source: Parquet File ===");
 58 |         // DataFrames can be saved as parquet files, maintaining the schema information.
 59 |         schemaPeople.saveAsParquetFile("people.parquet");
 60 | 
 61 |         // Read in the parquet file created above.
 62 |         // Parquet files are self-describing so the schema is preserved.
 63 |         // The result of loading a parquet file is also a DataFrame.
 64 |         DataFrame parquetFile = sqlCtx.parquetFile("people.parquet");
 65 | 
 66 |         //Parquet files can also be registered as tables and then used in SQL statements.
 67 |         parquetFile.registerTempTable("parquetFile");
 68 |         DataFrame teenagers2 =
 69 |                 sqlCtx.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19");
 70 |         teenagerNames = teenagers2.toJavaRDD().map(new Function<Row, String>() {
 71 |             @Override
 72 |             public String call(Row row) {
 73 |                 return "Name: " + row.getString(0);
 74 |             }
 75 |         }).collect();
 76 |         for (String name : teenagerNames) {
 77 |             System.out.println(name);
 78 |         }
 79 | 
 80 |         System.out.println("=== Data source: JSON Dataset ===");
 81 |         // A JSON dataset is pointed by path.
 82 |         // The path can be either a single text file or a directory storing text files.
 83 |         String path = "people.json";
 84 |         // Create a DataFrame from the file(s) pointed by path
 85 |         DataFrame peopleFromJsonFile = sqlCtx.jsonFile(path);
 86 | 
 87 |         // Because the schema of a JSON dataset is automatically inferred, to write queries,
 88 |         // it is better to take a look at what is the schema.
 89 |         peopleFromJsonFile.printSchema();
 90 |         // The schema of people is ...
 91 |         // root
 92 |         //  |-- age: IntegerType
 93 |         //  |-- name: StringType
 94 | 
 95 |         // Register this DataFrame as a table.
 96 |         peopleFromJsonFile.registerTempTable("people");
 97 | 
 98 |         // SQL statements can be run by using the sql methods provided by sqlCtx.
 99 |         DataFrame teenagers3 = sqlCtx.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");
100 | 
101 |         // The results of SQL queries are DataFrame and support all the normal RDD operations.
102 |         // The columns of a row in the result can be accessed by ordinal.
103 |         teenagerNames = teenagers3.toJavaRDD().map(new Function<Row, String>() {
104 |             @Override
105 |             public String call(Row row) {
106 |                 return "Name: " + row.getString(0);
107 |             }
108 |         }).collect();
109 |         for (String name : teenagerNames) {
110 |             System.out.println(name);
111 |         }
112 | 
113 |         // Alternatively, a DataFrame can be created for a JSON dataset represented by
114 |         // a RDD[String] storing one JSON object per string.
115 |         List<String> jsonData = Arrays.asList(
116 |                 "{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}");
117 |         JavaRDD<String> anotherPeopleRDD = ctx.parallelize(jsonData);
118 |         DataFrame peopleFromJsonRDD = sqlCtx.jsonRDD(anotherPeopleRDD.rdd());
119 | 
120 | 
121 |         // Take a look at the schema of this new DataFrame.
122 |         peopleFromJsonRDD.printSchema();
123 |         // The schema of anotherPeople is ...
124 |         // root
125 |         //  |-- address: StructType
126 |         //  |    |-- city: StringType
127 |         //  |    |-- state: StringType
128 |         //  |-- name: StringType
129 | 
130 |         peopleFromJsonRDD.registerTempTable("people2");
131 | 
132 |         DataFrame peopleWithCity = sqlCtx.sql("SELECT name, address.city FROM people2");
133 |         List<String> nameAndCity = peopleWithCity.toJavaRDD().map(new Function<Row, String>() {
134 |             @Override
135 |             public String call(Row row) {
136 |                 return "Name: " + row.getString(0) + ", City: " + row.getString(1);
137 |             }
138 |         }).collect();
139 |         for (String name : nameAndCity) {
140 |             System.out.println(name);
141 |         }
142 | 
143 |         ctx.stop();
144 |     }
145 | 
146 |     public static class People implements Serializable {
147 |         private String name;
148 |         private int age;
149 | 
150 |         public String getName() {
151 |             return name;
152 |         }
153 | 
154 |         public void setName(String name) {
155 |             this.name = name;
156 |         }
157 | 
158 |         public int getAge() {
159 |             return age;
160 |         }
161 | 
162 |         public void setAge(int age) {
163 |             this.age = age;
164 |         }
165 |     }
166 | }


--------------------------------------------------------------------------------
/src/main/java/com/javachen/spark/examples/sparksql/JavaSparkSQLBySchema.java:
--------------------------------------------------------------------------------
 1 | //package com.javachen.spark.examples.sparksql;
 2 | //
 3 | //import org.apache.spark.SparkConf;
 4 | //import org.apache.spark.api.java.JavaRDD;
 5 | //import org.apache.spark.api.java.JavaSparkContext;
 6 | //import org.apache.spark.api.java.function.Function;
 7 | //import org.apache.spark.sql.DataFrame;
 8 | //import org.apache.spark.sql.Row;
 9 | //import org.apache.spark.sql.SQLContext;
10 | //
11 | //import java.util.List;
12 | //
13 | //public class JavaSparkSQLBySchema {
14 | //    public static void main(String[] args) throws Exception {
15 | //        SparkConf sparkConf = new SparkConf().setAppName("JavaSparkSQLBySchema");
16 | //        JavaSparkContext ctx = new JavaSparkContext(sparkConf);
17 | //        SQLContext sqlContext = new SQLContext(sc);
18 | //
19 | //        // Load a text file and convert each line to a JavaBean.
20 | //        JavaRDD<String> people = sc.textFile("people.txt");
21 | //
22 | //        // The schema is encoded in a string
23 | //        String schemaString = "name age";
24 | //
25 | //        // Generate the schema based on the string of schema
26 | //        List<StructField> fields = new ArrayList<StructField>();
27 | //        for (String fieldName : schemaString.split(" ")) {
28 | //            fields.add(DataType.createStructField(fieldName, DataType.StringType, true));
29 | //        }
30 | //        StructType schema = DataType.createStructType(fields);
31 | //
32 | //        // Convert records of the RDD (people) to Rows.
33 | //        JavaRDD<Row> rowRDD = people.map(
34 | //                new Function<String, Row>() {
35 | //                    public Row call(String record) throws Exception {
36 | //                        String[] fields = record.split(",");
37 | //                        return Row.create(fields[0], fields[1].trim());
38 | //                    }
39 | //                });
40 | //
41 | //        // Apply the schema to the RDD.
42 | //        DataFrame peopleDataFrame = sqlContext.createDataFrame(rowRDD, schema);
43 | //
44 | //        // Register the DataFrame as a table.
45 | //        peopleDataFrame.registerTempTable("people");
46 | //
47 | //        // SQL can be run over RDDs that have been registered as tables.
48 | //        DataFrame results = sqlContext.sql("SELECT name FROM people");
49 | //
50 | //        // The results of SQL queries are DataFrames and support all the normal RDD operations.
51 | //        // The columns of a row in the result can be accessed by ordinal.
52 | //        List<String> names = results.map(new Function<Row, String>() {
53 | //            public String call(Row row) {
54 | //                return "Name: " + row.getString(0);
55 | //            }
56 | //        }).collect();
57 | //    }
58 | //
59 | //}


--------------------------------------------------------------------------------
/src/main/python/PythonALS.py:
--------------------------------------------------------------------------------
 1 | from pyspark.mllib.recommendation import ALS
 2 | from numpy import array
 3 | 
 4 | # Load and parse the data
 5 | data = sc.textFile("data/mllib/als/test.data")
 6 | ratings = data.map(lambda line: array([float(x) for x in line.split(',')]))
 7 | 
 8 | # Build the recommendation model using Alternating Least Squares
 9 | rank = 10
10 | numIterations = 20
11 | model = ALS.train(ratings, rank, numIterations)
12 | 
13 | # Evaluate the model on training data
14 | testdata = ratings.map(lambda p: (int(p[0]), int(p[1])))
15 | predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
16 | ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
17 | MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).reduce(lambda x, y: x + y)/ratesAndPreds.count()
18 | print("Mean Squared Error = " + str(MSE))


--------------------------------------------------------------------------------
/src/main/python/PythonSparkSQLByReflection.py:
--------------------------------------------------------------------------------
 1 | # sc is an existing SparkContext.
 2 | from pyspark.sql import SQLContext, Row
 3 | 
 4 | sqlContext = SQLContext(sc)
 5 | 
 6 | # Load a text file and convert each line to a Row.
 7 | lines = sc.textFile("people.txt")
 8 | parts = lines.map(lambda l: l.split(","))
 9 | people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))
10 | 
11 | # Infer the schema, and register the DataFrame as a table.
12 | schemaPeople = sqlContext.inferSchema(people)
13 | schemaPeople.registerTempTable("people")
14 | 
15 | # SQL can be run over DataFrames that have been registered as a table.
16 | teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
17 | 
18 | # The results of SQL queries are RDDs and support all the normal RDD operations.
19 | teenNames = teenagers.map(lambda p: "Name: " + p.name)
20 | for teenName in teenNames.collect():
21 |   print teenName


--------------------------------------------------------------------------------
/src/main/python/PythonSparkSQLBySchema.py:
--------------------------------------------------------------------------------
 1 | # Import SQLContext and data types
 2 | from pyspark.sql import *
 3 | 
 4 | # sc is an existing SparkContext.
 5 | sqlContext = SQLContext(sc)
 6 | 
 7 | # Load a text file and convert each line to a tuple.
 8 | lines = sc.textFile("people.txt")
 9 | parts = lines.map(lambda l: l.split(","))
10 | people = parts.map(lambda p: (p[0], p[1].strip()))
11 | 
12 | # The schema is encoded in a string.
13 | schemaString = "name age"
14 | 
15 | fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
16 | schema = StructType(fields)
17 | 
18 | # Apply the schema to the RDD.
19 | schemaPeople = sqlContext.createDataFrame(people, schema)
20 | 
21 | # Register the DataFrame as a table.
22 | schemaPeople.registerTempTable("people")
23 | 
24 | # SQL can be run over DataFrames that have been registered as a table.
25 | results = sqlContext.sql("SELECT name FROM people")
26 | 
27 | # The results of SQL queries are RDDs and support all the normal RDD operations.
28 | names = results.map(lambda p: "Name: " + p.name)
29 | for name in names.collect():
30 |   print name


--------------------------------------------------------------------------------
/src/main/python/PythonWordCount.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | from pyspark import SparkContext
 4 | 
 5 | if __name__ == "__main__":
 6 |     file=sys.argv[1]
 7 |     threshold=int(sys.argv[2])
 8 |     sc = SparkContext(appName="PythonWordCount")
 9 |     lines = sc.textFile(file, 1)
10 |     counts = lines.flatMap(lambda x: x.split(' ')) \
11 |                 .map(lambda x: (x, 1))  \
12 |                 .reduceByKey(lambda a, b: a + b)  \
13 |                 .filter(lambda (a, b) : b >= threshold)  \
14 |                 .flatMap(lambda (a, b): list(a))  \
15 |                 .map(lambda x: (x, 1))  \
16 |                 .reduceByKey(lambda a, b: a + b)
17 | 
18 |     print ",".join(str(t) for t in counts.collect())
19 |     sc.stop()


--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/mllib/EvaluateResult.scala:
--------------------------------------------------------------------------------
 1 | package com.javachen.grab
 2 | 
 3 | import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}
 4 | import org.apache.spark.rdd.RDD
 5 | 
 6 | /**
 7 |  * Offline quality metrics for recommendation lists, computed against a set of ratings.
 8 |  *
 9 |  * Created by <a href="mailto:junechen@163.com">june</a> on 2015-05-27 09:13.
10 |  */
11 | object EvaluateResult {
12 | 
13 |   /** Divides `num` by `den`, returning 0.0 instead of NaN or Infinity when `den` is zero. */
14 |   private def ratio(num: Double, den: Double): Double =
15 |     if (den == 0.0) 0.0 else num / den
16 | 
17 |   /**
18 |    * Coverage: number of distinct recommended products divided by the number of
19 |    * distinct products in `training`.
20 |    */
21 |   def coverage(training: RDD[Rating], userRecommends: RDD[(Int, List[Int])]): Double = {
22 |     val recommended = userRecommends.flatMap(_._2).distinct().count
23 |     val known = training.map(_.product).distinct().count
24 |     recommended.toDouble / known
25 |   }
26 | 
27 |   /**
28 |    * Average of log(1 + number of training ratings of the product) over every recommended
29 |    * product, counting a product once per list it appears in.
30 |    *
31 |    * A recommended product that never occurs in `training` counts as log(1) = 0 instead of
32 |    * failing the lookup. The result is NaN when nothing was recommended at all.
33 |    */
34 |   def popularity(training: RDD[Rating], userRecommends: RDD[(Int, List[Int])]): Double = {
35 |     // Only the number of ratings per product is needed, so reduce instead of grouping.
36 |     val itemPopularity = training.map(r => (r.product, 1)).reduceByKey(_ + _, 4).collectAsMap()
37 | 
38 |     val logs = userRecommends.flatMap(_._2).collect().map { p =>
39 |       math.log(1 + itemPopularity.getOrElse(p, 0))
40 |     }
41 |     logs.sum / logs.length
42 |   }
43 | 
44 |   /**
45 |    * Recall, precision and F1 of the recommendation lists, aggregated over all users that
46 |    * occur in both `userRecommends` and `training`.
47 |    *
48 |    * Prints "hit,testNum,recNum" and returns (recall, precision, f1). A ratio whose
49 |    * denominator is zero is reported as 0.0.
50 |    */
51 |   def recallAndPrecisionAndF1(training: RDD[Rating], userRecommends: RDD[(Int, List[Int])]): (Double, Double, Double) = {
52 |     val actualByUser = training.map(r => (r.user, r.product)).groupByKey().mapValues(_.toList)
53 | 
54 |     val (hit, testNum, recNum) = userRecommends.join(actualByUser).map { case (_, (mItems, tItems)) =>
55 |       // Precision counts recommended items that were hit over items actually recommended;
56 |       // the set makes each membership test constant time.
57 |       val actual = tItems.toSet
58 |       (mItems.count(actual.contains), tItems.length, mItems.length)
59 |     }.fold((0, 0, 0)) { (a, b) => (a._1 + b._1, a._2 + b._2, a._3 + b._3) }
60 | 
61 |     val recall = ratio(hit, testNum)
62 |     val precision = ratio(hit, recNum)
63 |     val f1 = ratio(2 * recall * precision, recall + precision)
64 | 
65 |     println(s"$hit,$testNum,$recNum")
66 |     (recall, precision, f1)
67 |   }
68 | 
69 |   /**
70 |    * Prints recall, precision and F1 of `result` against `test` and returns the F1 score.
71 |    * A hit is a rating present in both RDDs; a ratio whose denominator is zero is
72 |    * reported as 0.0.
73 |    */
74 |   def recallAndPrecision(test: RDD[Rating], result: RDD[Rating]): Double = {
75 |     val numHit: Long = result.intersection(test).count
76 |     val recall = ratio(numHit.toDouble, test.count.toDouble)
77 |     val precision = ratio(numHit.toDouble, result.count.toDouble)
78 |     val f1 = ratio(2 * recall * precision, recall + precision)
79 |     System.out.println("recall : " + recall + "\nprecision : " + precision + "\nf1 : " + f1)
80 |     f1
81 |   }
82 | }
83 | 


--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/mllib/MovieLensALS.scala:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
  3 |  * contributor license agreements.  See the NOTICE file distributed with
  4 |  * this work for additional information regarding copyright ownership.
  5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
  6 |  * (the "License"); you may not use this file except in compliance with
  7 |  * the License.  You may obtain a copy of the License at
  8 |  *
  9 |  *    http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  * Unless required by applicable law or agreed to in writing, software
 12 |  * distributed under the License is distributed on an "AS IS" BASIS,
 13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  * See the License for the specific language governing permissions and
 15 |  * limitations under the License.
 16 |  */
 17 | 
 18 | package com.javachen.spark.examples.mllib
 19 | 
 20 | import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}
 21 | import org.apache.spark.rdd.RDD
 22 | import org.apache.spark.{SparkConf, SparkContext}
 23 | import scopt.OptionParser
 24 | 
 25 | import scala.collection.mutable
 26 | 
 27 | /**
 28 |  * An example app for ALS on MovieLens data (http://grouplens.org/datasets/movielens/).
 29 |  *
 30 |  * Ratings are read as `userId::movieId::rating::timestamp` lines, a model is trained on all
 31 |  * of them, its RMSE on that same data is printed, and ten recommendations for user 1 are
 32 |  * printed. Submit the app with `spark-submit`; see the usage note built in `main`.
 33 |  */
 34 | object MovieLensALS {
 35 | 
 36 |   /** Command-line settings; every field has a default, several pointing into `data/ml-1m`. */
 37 |   case class Params(
 38 |                      input: String = "data/ml-1m/ratings.dat",
 39 |                      userDataInput: String = "data/ml-1m/personalRatings.txt",
 40 |                      kryo: Boolean = false,
 41 |                      numIterations: Int = 20,
 42 |                      lambda: Double = 1.0,
 43 |                      rank: Int = 10,
 44 |                      numUserBlocks: Int = -1,
 45 |                      numProductBlocks: Int = -1,
 46 |                      implicitPrefs: Boolean = false)
 47 | 
 48 |   /** Parses the command line into [[Params]] and runs the job; exits with status 1 on bad arguments. */
 49 |   def main(args: Array[String]): Unit = {
 50 |     val defaultParams = Params()
 51 | 
 52 |     val parser = new OptionParser[Params]("MovieLensALS") {
 53 |       head("MovieLensALS: an example app for ALS on MovieLens data.")
 54 |       opt[Int]("rank")
 55 |         .text(s"rank, default: ${defaultParams.rank}")
 56 |         .action((x, c) => c.copy(rank = x))
 57 |       opt[Int]("numIterations")
 58 |         .text(s"number of iterations, default: ${defaultParams.numIterations}")
 59 |         .action((x, c) => c.copy(numIterations = x))
 60 |       opt[Double]("lambda")
 61 |         .text(s"lambda (smoothing constant), default: ${defaultParams.lambda}")
 62 |         .action((x, c) => c.copy(lambda = x))
 63 |       opt[Unit]("kryo")
 64 |         .text("use Kryo serialization")
 65 |         .action((_, c) => c.copy(kryo = true))
 66 |       opt[Int]("numUserBlocks")
 67 |         .text(s"number of user blocks, default: ${defaultParams.numUserBlocks} (auto)")
 68 |         .action((x, c) => c.copy(numUserBlocks = x))
 69 |       opt[Int]("numProductBlocks")
 70 |         .text(s"number of product blocks, default: ${defaultParams.numProductBlocks} (auto)")
 71 |         .action((x, c) => c.copy(numProductBlocks = x))
 72 |       opt[Unit]("implicitPrefs")
 73 |         .text("use implicit preference")
 74 |         .action((_, c) => c.copy(implicitPrefs = true))
 75 |       opt[String]("userDataInput")
 76 |         .required()
 77 |         .text("input paths to user dataset")
 78 |         .action((x, c) => c.copy(userDataInput = x))
 79 |       arg[String]("<input>")
 80 |         .required()
 81 |         .text("input paths to a MovieLens dataset of ratings")
 82 |         .action((x, c) => c.copy(input = x))
 83 |       note(
 84 |         """
 85 |           |For example, the following command runs this app on a synthetic dataset:
 86 |           |
 87 |           | bin/spark-submit --class com.javachen.spark.examples.mllib.MovieLensALS \
 88 |           |  examples/target/scala-*/grab-examples-*.jar \
 89 |           |  --rank 5 --numIterations 20 --lambda 1.0 \
 90 |           |  --userDataInput data/ml-1m/personalRatings.txt \
 91 |           |  data/ml-1m/ratings.dat
 92 |         """.stripMargin)
 93 |     }
 94 | 
 95 |     parser.parse(args, defaultParams) match {
 96 |       case Some(params) => run(params)
 97 |       case None => System.exit(1)
 98 |     }
 99 |   }
100 | 
101 |   /**
102 |    * Loads the ratings, reports dataset sizes, trains and evaluates the model and prints
103 |    * recommendations. The SparkContext is stopped even when one of those steps fails.
104 |    */
105 |   def run(params: Params): Unit = {
106 |     val conf = new SparkConf().setAppName(s"MovieLensALS with $params").set("spark.executor.memory", "2g")
107 |     if (params.kryo) {
108 |       conf.registerKryoClasses(Array(classOf[mutable.BitSet], classOf[Rating]))
109 |         .set("spark.kryoserializer.buffer.mb", "8")
110 |     }
111 |     val sc = new SparkContext(conf)
112 | 
113 |     try {
114 |       val ratings = sc.textFile(params.input).map { line =>
115 |         val fields = line.split("::")
116 |         val stars = fields(2).toDouble
117 |         /*
118 |          * MovieLens ratings are on a scale of 1-5 (5: must see ... 1: awful), so a movie
119 |          * should not be recommended if its predicted rating is below 3. For implicit
120 |          * preferences the rating is shifted into a confidence score:
121 |          * 5 -> 2.5, 4 -> 1.5, 3 -> 0.5, 2 -> -0.5, 1 -> -1.5, which places unobserved
122 |          * entries (0) between "it's okay" and "fairly bad".
123 |          */
124 |         val score = if (params.implicitPrefs) stars - 2.5 else stars
125 |         // format: (timestamp % 10, Rating(userId, movieId, rating))
126 |         (fields(3).toLong % 10, Rating(fields(0).toInt, fields(1).toInt, score))
127 |       }.cache()
128 | 
129 |       val numRatings = ratings.count()
130 |       val numUsers = ratings.map(_._2.user).distinct().count()
131 |       val numMovies = ratings.map(_._2.product).distinct().count()
132 | 
133 |       println("Got " + numRatings + " ratings from " + numUsers + " users on " + numMovies + " movies.")
134 | 
135 |       val model = evaluateMode(params, ratings)
136 | 
137 |       predictMoive(params, sc, model)
138 |     } finally {
139 |       // clean up
140 |       sc.stop()
141 |     }
142 |   }
143 | 
144 |   /**
145 |    * Prints the top 10 recommendations for user 1 as consecutive "product:rating," entries.
146 |    * Nothing is persisted; `params` and `sc` are not used.
147 |    */
148 |   def predictMoive(params: Params, sc: SparkContext, model: MatrixFactorizationModel): Unit = {
149 |     // Recommend 10 products for user 1.
150 |     val recommendations = model.recommendProducts(1, 10)
151 | 
152 |     // Every entry, including the last one, ends with a comma.
153 |     println(recommendations.map(r => s"${r.product}:${r.rating},").mkString)
154 |   }
155 | 
156 |   /**
157 |    * Trains an ALS model on the ratings (keys dropped, repartitioned into 4 partitions) and
158 |    * prints the training time in seconds and the RMSE on that same training data.
159 |    */
160 |   def evaluateMode(params: Params, ratings: RDD[(Long, Rating)]): MatrixFactorizationModel = {
161 |     val training = ratings.values.repartition(4)
162 | 
163 |     // Build the model.
164 |     val start = System.currentTimeMillis()
165 |     val model = new ALS()
166 |       .setRank(params.rank)
167 |       .setIterations(params.numIterations)
168 |       .setLambda(params.lambda)
169 |       .setImplicitPrefs(params.implicitPrefs)
170 |       .setUserBlocks(params.numUserBlocks)
171 |       .setProductBlocks(params.numProductBlocks)
172 |       .run(training)
173 |     println("Train Time = " + (System.currentTimeMillis() - start) * 1.0 / 1000)
174 | 
175 |     println("RMSE = " + computeRmse(model, training))
176 | 
177 |     model
178 |   }
179 | 
180 |   /** Compute RMSE (Root Mean Squared Error) of the model's predictions for the pairs rated in `data`. */
181 |   def computeRmse(model: MatrixFactorizationModel, data: RDD[Rating]): Double = {
182 |     val usersProducts = data.map(r => (r.user, r.product))
183 | 
184 |     val predictions = model.predict(usersProducts).map(p => ((p.user, p.product), p.rating))
185 | 
186 |     // The mean does not depend on row order, so the joined pairs are left unsorted.
187 |     val ratesAndPreds = data.map(r => ((r.user, r.product), r.rating)).join(predictions)
188 | 
189 |     math.sqrt(ratesAndPreds.values.map { case (actual, predicted) =>
190 |       val err = actual - predicted
191 |       err * err
192 |     }.mean())
193 |   }
194 | }
195 | 


--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/mllib/MovieSimilarities.scala:
--------------------------------------------------------------------------------
  1 | package com.javachen.spark.examples.mllib
  2 | 
  3 | import org.apache.spark.SparkContext
  4 | 
  5 | /**
  6 |  * https://gist.github.com/MLnick/5286475
  7 |  * A port of [[http://blog.echen.me/2012/02/09/movie-recommendations-and-more-via-mapreduce-and-scalding/]]
  8 |  * to Spark.
  9 |  * Uses movie ratings data from MovieLens 100k dataset found at [[http://www.grouplens.org/node/73]]
 10 |  */
 11 | object MovieSimilarities {
 12 | 
 13 |   /**
 14 |    * Computes similarity metrics for every pair of movies rated by a common user and prints,
 15 |    * among the pairs whose first movie's name contains "Star Wars (1977)", the ten with the
 16 |    * highest regularized correlation. `args` is not used; paths and master are fixed below.
 17 |    */
 18 |   def main(args: Array[String]): Unit = {
 19 |     // Parameters to regularize correlation.
 20 |     val PRIOR_COUNT = 10
 21 |     val PRIOR_CORRELATION = 0
 22 | 
 23 |     val TRAIN_FILENAME = "data/ml-100k/ua.base"
 24 |     val MOVIES_FILENAME = "data/ml-100k/u.item"
 25 | 
 26 |     // Spark programs require a SparkContext to be initialized
 27 |     val master = "local[*]"
 28 |     val sc = new SparkContext(master, "MovieSimilarities")
 29 | 
 30 |     try {
 31 |       // get movie names keyed on id; collected for local use to map id <-> movie name
 32 |       val movieNames = sc.textFile(MOVIES_FILENAME).map { line =>
 33 |         val fields = line.split("\\|")
 34 |         (fields(0).toInt, fields(1))
 35 |       }.collectAsMap()
 36 | 
 37 |       // extract (userid, movieid, rating) from ratings data
 38 |       val ratings = sc.textFile(TRAIN_FILENAME).map { line =>
 39 |         val fields = line.split("\t")
 40 |         (fields(0).toInt, fields(1).toInt, fields(2).toInt)
 41 |       }
 42 | 
 43 |       // Attach the number of raters of each movie: (user, movie, rating, numRaters).
 44 |       // The size of a movie's group is its number of raters, so one grouping is enough.
 45 |       val ratingsWithSize = ratings.groupBy(_._2).flatMap { case (_, movieRatings) =>
 46 |         val numRaters = movieRatings.size
 47 |         movieRatings.map { case (user, movie, rating) => (user, movie, rating, numRaters) }
 48 |       }
 49 | 
 50 |       // self join on userid; keep each unordered movie pair once and drop self-pairs
 51 |       val ratingsByUser = ratingsWithSize.keyBy(_._1)
 52 |       val ratingPairs = ratingsByUser.join(ratingsByUser).filter { case (_, (a, b)) => a._2 < b._2 }
 53 | 
 54 |       // compute raw inputs to similarity metrics for each movie pair
 55 |       val vectorCalcs = ratingPairs.map {
 56 |         case (_, ((_, movie1, rating1, raters1), (_, movie2, rating2, raters2))) =>
 57 |           val stats =
 58 |             (rating1 * rating2, // rating 1 * rating 2
 59 |               rating1, // rating movie 1
 60 |               rating2, // rating movie 2
 61 |               math.pow(rating1, 2), // square of rating movie 1
 62 |               math.pow(rating2, 2), // square of rating movie 2
 63 |               raters1, // number of raters movie 1
 64 |               raters2) // number of raters movie 2
 65 |           ((movie1, movie2), stats)
 66 |       }.groupByKey().map { case (key, vals) =>
 67 |         val size = vals.size
 68 |         val dotProduct = vals.map(_._1).sum
 69 |         val ratingSum = vals.map(_._2).sum
 70 |         val rating2Sum = vals.map(_._3).sum
 71 |         val ratingSq = vals.map(_._4).sum
 72 |         val rating2Sq = vals.map(_._5).sum
 73 |         val numRaters = vals.map(_._6).max
 74 |         val numRaters2 = vals.map(_._7).max
 75 |         (key, (size, dotProduct, ratingSum, rating2Sum, ratingSq, rating2Sq, numRaters, numRaters2))
 76 |       }
 77 | 
 78 |       // compute similarity metrics for each movie pair
 79 |       val similarities = vectorCalcs.map { case (key, stats) =>
 80 |         val (size, dotProduct, ratingSum, rating2Sum, ratingNormSq, rating2NormSq, numRaters, numRaters2) = stats
 81 |         val corr = correlation(size, dotProduct, ratingSum, rating2Sum, ratingNormSq, rating2NormSq)
 82 |         val regCorr = regularizedCorrelation(size, dotProduct, ratingSum, rating2Sum,
 83 |           ratingNormSq, rating2NormSq, PRIOR_COUNT, PRIOR_CORRELATION)
 84 |         val cosSim = cosineSimilarity(dotProduct, scala.math.sqrt(ratingNormSq), scala.math.sqrt(rating2NormSq))
 85 |         val jaccard = jaccardSimilarity(size, numRaters, numRaters2)
 86 | 
 87 |         (key, (corr, regCorr, cosSim, jaccard))
 88 |       }
 89 | 
 90 |       // test a few movies out (substitute the contains call with the relevant movie name)
 91 |       val sample = similarities.filter { case ((movie1, _), _) =>
 92 |         movieNames(movie1).contains("Star Wars (1977)")
 93 |       }
 94 | 
 95 |       // collect results, excluding NaNs, best regularized correlation first
 96 |       val result = sample.map { case ((m1, m2), (corr, rcorr, cos, j)) =>
 97 |         (movieNames(m1), movieNames(m2), corr, rcorr, cos, j)
 98 |       }.collect()
 99 |         .filter(e => !e._4.isNaN)
100 |         .sortBy(e => -e._4) // descending, so take(10) yields the top 10
101 |         .take(10)
102 | 
103 |       // print the top 10 out
104 |       result.foreach(r => println(r._1 + " | " + r._2 + " | " + r._3.formatted("%2.4f") + " | " + r._4.formatted("%2.4f")
105 |         + " | " + r._5.formatted("%2.4f") + " | " + r._6.formatted("%2.4f")))
106 |     } finally {
107 |       // Release the context even when a stage fails.
108 |       sc.stop()
109 |     }
110 |   }
111 | 
112 |   // *************************
113 |   // * SIMILARITY MEASURES
114 |   // *************************
115 | 
116 |   /**
117 |    * The correlation between two vectors A, B is
118 |    * cov(A, B) / (stdDev(A) * stdDev(B))
119 |    *
120 |    * This is equivalent to
121 |    * [n * dotProduct(A, B) - sum(A) * sum(B)] /
122 |    * sqrt{ [n * norm(A)^2 - sum(A)^2] [n * norm(B)^2 - sum(B)^2] }
123 |    *
124 |    * The result is NaN or infinite when the denominator is zero.
125 |    */
126 |   def correlation(size: Double, dotProduct: Double, ratingSum: Double,
127 |                   rating2Sum: Double, ratingNormSq: Double, rating2NormSq: Double): Double = {
128 | 
129 |     val numerator = size * dotProduct - ratingSum * rating2Sum
130 |     val denominator = scala.math.sqrt(size * ratingNormSq - ratingSum * ratingSum) *
131 |       scala.math.sqrt(size * rating2NormSq - rating2Sum * rating2Sum)
132 | 
133 |     numerator / denominator
134 |   }
135 | 
136 |   /**
137 |    * Regularize correlation by adding virtual pseudocounts over a prior:
138 |    * RegularizedCorrelation = w * ActualCorrelation + (1 - w) * PriorCorrelation
139 |    * where w = # actualPairs / (# actualPairs + # virtualPairs).
140 |    */
141 |   def regularizedCorrelation(size: Double, dotProduct: Double, ratingSum: Double,
142 |                              rating2Sum: Double, ratingNormSq: Double, rating2NormSq: Double,
143 |                              virtualCount: Double, priorCorrelation: Double): Double = {
144 | 
145 |     val unregularizedCorrelation = correlation(size, dotProduct, ratingSum, rating2Sum, ratingNormSq, rating2NormSq)
146 |     val w = size / (size + virtualCount)
147 | 
148 |     w * unregularizedCorrelation + (1 - w) * priorCorrelation
149 |   }
150 | 
151 |   /**
152 |    * The cosine similarity between two vectors A, B is
153 |    * dotProduct(A, B) / (norm(A) * norm(B))
154 |    */
155 |   def cosineSimilarity(dotProduct: Double, ratingNorm: Double, rating2Norm: Double): Double = {
156 |     dotProduct / (ratingNorm * rating2Norm)
157 |   }
158 | 
159 |   /**
160 |    * The Jaccard Similarity between two sets A, B is
161 |    * |Intersection(A, B)| / |Union(A, B)|
162 |    */
163 |   def jaccardSimilarity(usersInCommon: Double, totalUsers1: Double, totalUsers2: Double): Double = {
164 |     val union = totalUsers1 + totalUsers2 - usersInCommon
165 |     usersInCommon / union
166 |   }
167 | 
168 | }


--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/mllib/ScalaLocalALS.scala:
--------------------------------------------------------------------------------
  1 | package com.javachen.grab
  2 | 
  3 | import org.apache.spark.mllib.recommendation.{ALS, Rating}
  4 | import org.apache.spark.rdd.RDD
  5 | import org.apache.spark.{SparkConf, SparkContext}
  6 | import org.jblas.DoubleMatrix
  7 | import scala.sys.process._
  8 | 
  9 | import org.apache.log4j.{Level, Logger}
 10 | 
 11 | 
 12 | /**
 13 |  * 本地模式运行
 14 |  */
 15 | object ScalaLocalALS {
 16 |   def main(args: Array[String]): Unit = {
 17 |     val sc = new SparkContext(new SparkConf().setAppName("Scala Collaborative Filtering Example"))
 18 | 
 19 |     Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
 20 |     Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
 21 |     Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
 22 | 
 23 |     // 1. 加载并解析数据
 24 |     val data = sc.textFile("data/ml-1m/ratings.dat")
 25 | 
 26 |     val ratings = data.map(_.split("::") match { case Array(user, item, rate, ts) =>
 27 |       Rating(user.toInt, item.toInt, rate.toDouble)
 28 |     }).cache()
 29 | 
 30 |     val users = ratings.map(_.user).distinct()
 31 |     val products = ratings.map(_.product).distinct()
 32 |     println("Got "+ratings.count()+" ratings from "+users.count+" users on "+products.count+" products.")
 33 |     //Got 1000209 ratings from 6040 users on 3706 products.
 34 | 
 35 |     // 2. 训练模型
 36 |     val rank = 12
 37 |     val lambda = 0.01
 38 |     val numIterations = 20
 39 |     val model = ALS.train(ratings, rank, numIterations, lambda)
 40 | 
 41 |     model.userFeatures
 42 |     model.userFeatures.count
 43 | 
 44 |     model.productFeatures
 45 |     model.productFeatures.count
 46 | 
 47 |     // 3. 计算均方差
 48 |     //从 ratings 中获得只包含用户和商品的数据集
 49 |     val usersProducts = ratings.map { case Rating(user, product, rate) =>
 50 |       (user, product)
 51 |     }
 52 | 
 53 |     usersProducts.count  //Long = 1000209
 54 | 
 55 |     //使用推荐模型对用户商品进行预测评分，得到预测评分的数据集
 56 |     var predictions = model.predict(usersProducts).map { case Rating(user, product, rate) =>
 57 |         ((user, product), rate)
 58 |     }
 59 | 
 60 |     predictions.count //Long = 1000209
 61 | 
 62 |     //将真实评分数据集与预测评分数据集进行合并
 63 |     val ratesAndPreds = ratings.map { case Rating(user, product, rate) =>
 64 |       ((user, product), rate)
 65 |     }.join(predictions)
 66 | 
 67 |     ratesAndPreds.count  //Long = 1000209
 68 | 
 69 |     val rmse= math.sqrt(ratesAndPreds.map { case ((user, product), (r1, r2)) =>
 70 |       val err = (r1 - r2)
 71 |       err * err
 72 |     }.mean())
 73 | 
 74 |     println(s"RMSE = $rmse")
 75 | 
 76 |     // 4.保存预测评分，确保只生成一个文件，并排序
 77 |     "rm -r /tmp/result".!
 78 |     ratesAndPreds.sortByKey().repartition(1).sortBy(_._1).map({
 79 |       case ((user, product), (rate, pred)) => (user + "," + product + "," + rate + "," + pred)
 80 |     }).saveAsTextFile("/tmp/result")
 81 | 
 82 |     //对预测的评分结果按用户进行分组并按评分倒排序
 83 |     predictions.map { case ((user, product), rate) =>
 84 |       (user, (product, rate))
 85 |     }.groupByKey().map{case (user_id,list)=>
 86 |       (user_id,list.toList.sortBy {case (goods_id,rate)=> - rate})
 87 |     }
 88 | 
 89 |     // 5. 对某一个用户推荐所有商品
 90 |     users.take(5) //Array[Int] = Array(384, 1084, 4904, 3702, 5618)
 91 |     val userId = users.take(1)(0) //384
 92 |     val K = 10
 93 |     val topKRecs = model.recommendProducts(userId, K)
 94 |     //topKRecs: Array[org.apache.spark.mllib.recommendation.Rating] = Array(Rating(384,1539,7.360670267591244), Rating(384,219,6.736019537477872), Rating(384,1520,6.730562698267339), Rating(384,775,6.697620546404394), Rating(384,3161,6.49555676613329), Rating(384,2711,6.445916831219404), Rating(384,2503,6.428273027496898), Rating(384,771,6.4255234943275825), Rating(384,853,6.170422982870869), Rating(384,759,6.04929517890501))
 95 | 
 96 |     println(topKRecs.mkString("\n"))
 97 | //    Rating(384,1539,7.360670267591244)
 98 | //    Rating(384,219,6.736019537477872)
 99 | //    Rating(384,1520,6.730562698267339)
100 | //    Rating(384,775,6.697620546404394)
101 | //    Rating(384,3161,6.49555676613329)
102 | //    Rating(384,2711,6.445916831219404)
103 | //    Rating(384,2503,6.428273027496898)
104 | //    Rating(384,771,6.4255234943275825)
105 | //    Rating(384,853,6.170422982870869)
106 | //    Rating(384,759,6.04929517890501)
107 | 
108 |     val productsForUser=ratings.keyBy(_.user).lookup(384)
109 | //    Seq[org.apache.spark.mllib.recommendation.Rating] = WrappedArray(Rating(384,2055,2.0), Rating(384,1197,4.0), Rating(384,593,5.0), Rating(384,599,3.0), Rating(384,673,2.0), Rating(384,3037,4.0), Rating(384,1381,2.0), Rating(384,1610,4.0), Rating(384,3074,4.0), Rating(384,204,4.0), Rating(384,3508,3.0), Rating(384,1007,3.0), Rating(384,260,4.0), Rating(384,3487,3.0), Rating(384,3494,3.0), Rating(384,1201,5.0), Rating(384,3671,5.0), Rating(384,1207,4.0), Rating(384,2947,4.0), Rating(384,2951,4.0), Rating(384,2896,2.0), Rating(384,1304,5.0))
110 | 
111 |     productsForUser.size //Int = 22
112 |     productsForUser.sortBy(-_.rating).take(10).map(rating => (rating.product, rating.rating)).foreach(println)
113 | //    (593,5.0)
114 | //    (1201,5.0)
115 | //    (3671,5.0)
116 | //    (1304,5.0)
117 | //    (1197,4.0)
118 | //    (3037,4.0)
119 | //    (1610,4.0)
120 | //    (3074,4.0)
121 | //    (204,4.0)
122 | //    (260,4.0)
123 | 
124 |     /* Compute squared error between a predicted and actual rating */
125 |     // We'll take the first rating for our example user 789
126 |     val actualRating = productsForUser.take(1)(0)
127 |     //actualRating: org.apache.spark.mllib.recommendation.Rating = Rating(384,2055,2.0)    val predictedRating = model.predict(789, actualRating.product)
128 |     val predictedRating = model.predict(384, actualRating.product)
129 |     //predictedRating: Double = 1.9426030777174637
130 | 
131 |     //找出和2055商品最相似的商品
132 |     val itemId = 2055
133 |     val itemFactor = model.productFeatures.lookup(itemId).head
134 |     //itemFactor: Array[Double] = Array(0.3660752773284912, 0.43573060631752014, -0.3421429991722107, 0.44382765889167786, -1.4875195026397705, 0.6274569630622864, -0.3264533579349518, -0.9939845204353333, -0.8710321187973022, -0.7578890323638916, -0.14621856808662415, -0.7254264950752258)
135 |     val itemVector = new DoubleMatrix(itemFactor)
136 |     //itemVector: org.jblas.DoubleMatrix = [0.366075; 0.435731; -0.342143; 0.443828; -1.487520; 0.627457; -0.326453; -0.993985; -0.871032; -0.757889; -0.146219; -0.725426]
137 | 
138 |     cosineSimilarity(itemVector, itemVector)
139 |     // res99: Double = 0.9999999999999999
140 | 
141 |     val sims = model.productFeatures.map{ case (id, factor) =>
142 |       val factorVector = new DoubleMatrix(factor)
143 |       val sim = cosineSimilarity(factorVector, itemVector)
144 |       (id, sim)
145 |     }
146 |     val sortedSims = sims.top(K)(Ordering.by[(Int, Double), Double] { case (id, similarity) => similarity })
147 |     //sortedSims: Array[(Int, Double)] = Array((2055,0.9999999999999999), (2051,0.9138311231145874), (3520,0.8739823400539756), (2190,0.8718466671129721), (2050,0.8612639515847019), (1011,0.8466911667526461), (2903,0.8455764332511272), (3121,0.8227325520485377), (3674,0.8075743004357392), (2016,0.8063817280259447))
148 |     println(sortedSims.mkString("\n"))
149 | //    (2055,0.9999999999999999)
150 | //    (2051,0.9138311231145874)
151 | //    (3520,0.8739823400539756)
152 | //    (2190,0.8718466671129721)
153 | //    (2050,0.8612639515847019)
154 | //    (1011,0.8466911667526461)
155 | //    (2903,0.8455764332511272)
156 | //    (3121,0.8227325520485377)
157 | //    (3674,0.8075743004357392)
158 | //    (2016,0.8063817280259447)
159 | 
160 |     val sortedSims2 = sims.top(K + 1)(Ordering.by[(Int, Double), Double] { case (id, similarity) => similarity })
161 |     //sortedSims2: Array[(Int, Double)] = Array((2055,0.9999999999999999), (2051,0.9138311231145874), (3520,0.8739823400539756), (2190,0.8718466671129721), (2050,0.8612639515847019), (1011,0.8466911667526461), (2903,0.8455764332511272), (3121,0.8227325520485377), (3674,0.8075743004357392), (2016,0.8063817280259447), (3672,0.8016276723120674))
162 | 
163 |     sortedSims2.slice(1, 11).map{ case (id, sim) => (id, sim) }.mkString("\n")
164 | //    (2051,0.9138311231145874)
165 | //    (3520,0.8739823400539756)
166 | //    (2190,0.8718466671129721)
167 | //    (2050,0.8612639515847019)
168 | //    (1011,0.8466911667526461)
169 | //    (2903,0.8455764332511272)
170 | //    (3121,0.8227325520485377)
171 | //    (3674,0.8075743004357392)
172 | //    (2016,0.8063817280259447)
173 | //    (3672,0.8016276723120674)
174 | 
175 |     //计算给该用户推荐的前K个商品的平均准确度MAPK
176 |     val actualProducts= productsForUser.map(_.product)
177 |     //actualProducts: Seq[Int] = ArrayBuffer(2055, 1197, 593, 599, 673, 3037, 1381, 1610, 3074, 204, 3508, 1007, 260, 3487, 3494, 1201, 3671, 1207, 2947, 2951, 2896, 1304)
178 |     val predictedProducts= topKRecs.map(_.product)
179 |     //predictedProducts:Array[Int] = Array(1539, 219, 1520, 775, 3161, 2711, 2503, 771, 853, 759)
180 |     val apk10 = avgPrecisionK(actualProducts, predictedProducts, 10)
181 |     // apk10: Double = 0.0
182 | 
183 |     users.collect.flatMap { user =>
184 |       model.recommendProducts(user, 10)
185 |     }
186 | 
187 |     //计算所有的推荐结果
188 |     val itemFactors = model.productFeatures.map { case (prodcut, factor) => factor }.collect()
189 |     val itemMatrix = new DoubleMatrix(itemFactors)
190 |     println(itemMatrix.rows, itemMatrix.columns)
191 | 
192 |     val imBroadcast = sc.broadcast(itemMatrix)
193 | 
194 |     var idxProducts=model.productFeatures.map { case (prodcut, factor) => prodcut }.zipWithIndex().map{case (prodcut, idx) => (idx,prodcut)}.collectAsMap()
195 |     val idxProductsBroadcast = sc.broadcast(idxProducts)
196 | 
197 |     val allRecs = model.userFeatures.map{ case (user, array) =>
198 |       val userVector = new DoubleMatrix(array)
199 |       val scores = imBroadcast.value.mmul(userVector)
200 |       val sortedWithId = scores.data.zipWithIndex.sortBy(-_._1)
201 |       val recommendedProducts = sortedWithId.map(_._2).map{idx=>idxProductsBroadcast.value.get(idx).get}
202 |       (user, recommendedProducts)  //recommendedIds 为索引
203 |     }
204 | 
205 |     //验证结果是否正确
206 |     allRecs.lookup(384).head.take(10)
207 |     //res50: Array[Int] = Array(1539, 219, 1520, 775, 3161, 2711, 2503, 771, 853, 759)
208 |     topKRecs.map(_.product)
209 |     //res49: Array[Int] = Array(1539, 219, 1520, 775, 3161, 2711, 2503, 771, 853, 759)
210 | 
211 | 
212 |     //得到每个用户评分过的所有商品
213 |     val userProducts = ratings.map{ case Rating(user, product, rating) => (user, product) }.groupBy(_._1)
214 | 
215 |     // finally, compute the APK for each user, and average them to find MAPK
216 |     val MAPK = allRecs.join(userProducts).map{ case (userId, (predictedProducts, actualList)) =>
217 |       val actualProducts = actualList.map{case (user, product)=>product}.toSeq
218 |       avgPrecisionK(actualProducts, predictedProducts, K)
219 |     }.reduce(_ + _) / allRecs.count
220 |     println("Mean Average Precision at K = " + MAPK)
221 | 
222 |     // MSE, RMSE and MAE
223 |     import org.apache.spark.mllib.evaluation.RegressionMetrics
224 | 
225 |     val predictedAndTrue = ratesAndPreds.map { case ((user, product), (actual, predicted)) => (actual, predicted) }
226 |     val regressionMetrics = new RegressionMetrics(predictedAndTrue)
227 |     println("Mean Squared Error = " + regressionMetrics.meanSquaredError)
228 |     println("Root Mean Squared Error = " + regressionMetrics.rootMeanSquaredError)
229 |     // Mean Squared Error = 0.08231947642632852
230 |     // Root Mean Squared Error = 0.2869137090247319
231 | 
232 |     // MAPK
233 |     import org.apache.spark.mllib.evaluation.RankingMetrics
234 |     val predictedAndTrueForRanking = allRecs.join(userProducts).map{ case (userId, (predicted, actualWithIds)) =>
235 |       val actual = actualWithIds.map(_._2)
236 |       (predicted.toArray, actual.toArray)
237 |     }
238 |     val rankingMetrics = new RankingMetrics(predictedAndTrueForRanking)
239 |     println("Mean Average Precision = " + rankingMetrics.meanAveragePrecision)
240 |     // Mean Average Precision = 0.07171412913757183
241 | 
242 |     // Compare to our implementation, using K = 2000 to approximate the overall MAP
243 |     val MAPK2000 = allRecs.join(userProducts).map{ case (userId, (predicted, actualWithIds)) =>
244 |       val actual = actualWithIds.map(_._2).toSeq
245 |       avgPrecisionK(actual, predicted, 2000)
246 |     }.reduce(_ + _) / allRecs.count
247 |     println("Mean Average Precision = " + MAPK2000)
248 | 
249 | //    recommendsByUserTopN.foreachPartition(partitionOfRecords => {
250 | //        partitionOfRecords.foreach(pair => {
251 | //          val jedis = RedisClient.pool.getResource
252 | //          jedis.set(pair._1.toString,pair._2.mkString(","))
253 | //          jedis.close()
254 | //        })
255 | //      })
256 | 
257 |   }
258 | 
259 |   /* Compute the cosine similarity between two vectors */
260 |   def cosineSimilarity(vec1: DoubleMatrix, vec2: DoubleMatrix): Double = {
261 |     vec1.dot(vec2) / (vec1.norm2() * vec2.norm2())
262 |   }
263 | 
264 |   /* Function to compute average precision given a set of actual and predicted ratings */
265 |   // Code for this function is based on: https://github.com/benhamner/Metrics
266 |   def avgPrecisionK(actual: Seq[Int], predicted: Seq[Int], k: Int): Double = {
267 |     val predK = predicted.take(k)
268 |     var score = 0.0
269 |     var numHits = 0.0
270 |     for ((p, i) <- predK.zipWithIndex) {
271 |       if (actual.contains(p)) {
272 |         numHits += 1.0
273 |         score += numHits / (i.toDouble + 1.0)
274 |       }
275 |     }
276 |     if (actual.isEmpty) {
277 |       1.0
278 |     } else {
279 |       score / scala.math.min(actual.size, k).toDouble
280 |     }
281 |   }
282 | }
283 | 


--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/mllib/ScalaMovieLensALS.scala:
--------------------------------------------------------------------------------
  1 | package com.javachen.spark.examples.mllib
  2 | 
  3 | import java.util.Random
  4 | 
  5 | import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}
  6 | import org.apache.spark.rdd._
  7 | import org.apache.spark.{SparkConf, SparkContext}
  8 | 
  9 | /**
 10 |  * see:https://github.com/mohit-shrma/RandomSamples/blob/d9f1117bc21bb09d9fa858bc6d95e08e753e6fa0/SparkScala/CollabFilter/src/main/scala/MovieLensALS.scala
 11 |  */
 12 | object ScalaMovieLensALS {
 13 | 
 14 |   def main(args: Array[String]) {
 15 | 
 16 |     //import org.apache.log4j.{Logger,Level}
 17 |     //Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
 18 |     //Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
 19 | 
 20 |     if (args.length != 2) {
 21 |       println("Usage: /path/to/spark/bin/spark-submit --driver-memory 2g --class com.javachen.spark.examples.mllib.ScalaMovieLensALS " +
 22 |         "target/scala-*/movielens-als-ssembly-*.jar movieLensHomeDir personalRatingsFile")
 23 |       sys.exit(1)
 24 |     }
 25 | 
 26 |     // set up environment
 27 |     val conf = new SparkConf().setAppName("ScalaMovieLensALS")
 28 |     val sc = new SparkContext(conf)
 29 | 
 30 |     // load ratings and movie titles
 31 |     val ratings = sc.textFile(args(0) + "/ratings.dat").map { line =>
 32 |       val fields = line.split("::")
 33 |       // format: (timestamp % 10, Rating(userId, movieId, rating))
 34 |       (fields(3).toLong % 10, Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble))
 35 |     }
 36 | 
 37 |     val movies = sc.textFile(args(0) + "/movies.dat").map { line =>
 38 |       val fields = line.split("::")
 39 |       // format: (movieId, movieName)
 40 |       (fields(0).toInt, fields(1))
 41 |     }.collect().toMap
 42 | 
 43 |     val numRatings = ratings.count()
 44 |     val numUsers = ratings.map(_._2.user).distinct().count()
 45 |     val numMovies = ratings.map(_._2.product).distinct().count()
 46 | 
 47 |     println(s"Got $numRatings ratings from $numUsers users on $numMovies movies.")
 48 | 
 49 |     //get ratings of user on top 50 popular movies
 50 |     val mostRatedMovieIds = ratings.map(_._2.product) //extract movieId
 51 |       .countByValue //count ratings per movie
 52 |       .toSeq //convert map to seq
 53 |       .sortBy(-_._2) //sort by rating count in decreasing order
 54 |       .take(50) //take 50 most rated
 55 |       .map(_._1) //get movie ids
 56 | 
 57 |     val random = new Random(0)
 58 |     val selectedMovies = mostRatedMovieIds.filter(x => random.nextDouble() < 0.2)
 59 |       .map(x => (x, movies(x)))
 60 |       .toSeq
 61 |     val myRatings = elicitateRatings(selectedMovies)
 62 |     //convert received ratings to RDD[Rating], now this can be worked in parallel
 63 |     val myRatingsRDD = sc.parallelize(myRatings)
 64 | 
 65 |     // split ratings into train (60%), validation (20%), and test (20%) based on the
 66 |     // last digit of the timestamp, add myRatings to train, and cache them
 67 | 
 68 |     val numPartitions = 4
 69 |     val training = ratings.filter(x => x._1 < 6).values.union(myRatingsRDD).repartition(numPartitions).cache()
 70 |     val validation = ratings.filter(x => x._1 >= 6 && x._1 < 8).values.repartition(numPartitions).cache()
 71 |     val test = ratings.filter(x => x._1 >= 8).values.cache()
 72 | 
 73 |     val numTraining = training.count()
 74 |     val numValidation = validation.count()
 75 |     val numTest = test.count()
 76 | 
 77 |     println(s"Training: $numTraining, validation: $numValidation, test: $numTest")
 78 | 
 79 |     // train models and evaluate them on the validation set
 80 |     val ranks = List(8, 10, 12)
 81 |     val lambdas = List(0.1, 1.0, 10.0)
 82 |     val numIterations = List(10, 20)
 83 |     var bestModel: Option[MatrixFactorizationModel] = None
 84 |     var bestValidationRmse = Double.MaxValue
 85 |     var bestRank = 0
 86 |     var bestLambda = -1.0
 87 |     var bestNumIter = -1
 88 |     for (rank <- ranks; lambda <- lambdas; numIter <- numIterations) {
 89 |       //learn model for these parameter
 90 |       val model = ALS.train(training, rank, numIter, lambda)
 91 |       val validationRmse = computeRmse(model, validation)
 92 |       println(s"RMSE (validation) = $validationRmse for the model trained with rank = $rank , lambda = $lambda , and numIter = $numIter .")
 93 |       if (validationRmse < bestValidationRmse) {
 94 |         bestModel = Some(model)
 95 |         bestValidationRmse = validationRmse
 96 |         bestRank = rank
 97 |         bestLambda = lambda
 98 |         bestNumIter = numIter
 99 |       }
100 |     }
101 | 
102 |     // evaluate the best model on the test set
103 |     val testRmse = computeRmse(bestModel.get, test)
104 |     println(s"The best model was trained with rank = $bestRank and lambda = $bestLambda , and numIter = $bestNumIter , and its RMSE on the test set is $testRmse .")
105 | 
106 |     //find best movies for the user
107 |     val myRatedMovieIds = myRatings.map(_.product).toSet
108 |     //generate candidates after taking out already rated movies
109 |     val candidates = sc.parallelize(movies.keys.filter(!myRatedMovieIds.contains(_)).toSeq)
110 |     val recommendations = bestModel.get.predict(candidates.map((0, _))).collect.sortBy(-_.rating).take(50)
111 |     var i = 1
112 |     println("Movies recommendation for you: ")
113 |     recommendations.foreach { r =>
114 |       println("%2d".format(i) + ": " + movies(r.product))
115 |       i += 1
116 |     }
117 | 
118 |     // create a naive baseline and compare it with the best model
119 |     val meanRating = training.union(validation).map(_.rating).mean
120 |     val baselineRmse = math.sqrt(test.map(x => (meanRating - x.rating) * (meanRating - x.rating)).mean)
121 |     val improvement = (baselineRmse - testRmse) / baselineRmse * 100
122 |     println("The best model improves the baseline by " + "%1.2f".format(improvement) + "%.")
123 | 
124 |     // clean up
125 |     sc.stop()
126 |   }
127 | 
128 |   /** Compute RMSE (Root Mean Squared Error). */
129 |   def computeRmse(model: MatrixFactorizationModel, data: RDD[Rating]) = {
130 |     val usersProducts = data.map { case Rating(user, product, rate) =>
131 |       (user, product)
132 |     }
133 | 
134 |     val predictions = model.predict(usersProducts).map { case Rating(user, product, rate) =>
135 |       ((user, product), rate)
136 |     }
137 | 
138 |     val ratesAndPreds = data.map { case Rating(user, product, rate) =>
139 |       ((user, product), rate)
140 |     }.join(predictions).sortByKey()
141 | 
142 |     math.sqrt(ratesAndPreds.map { case ((user, product), (r1, r2)) =>
143 |       val err = (r1 - r2)
144 |       err * err
145 |     }.mean())
146 |   }
147 | 
148 |   /** Elicitate ratings from commandline **/
149 |   def elicitateRatings(movies: Seq[(Int, String)]) = {
150 |     val prompt = "Please rate following movie (1-5(best), or 0 if not seen):"
151 |     println(prompt)
152 |     val ratings = movies.flatMap { x =>
153 | 
154 |       var rating: Option[Rating] = None
155 |       var valid = false
156 | 
157 |       while (!valid) {
158 |         print(x._2 + ": ")
159 |         try {
160 |           val r = Console.readInt
161 |           if (r < 0 || r > 5) {
162 |             println(prompt)
163 |           } else {
164 |             valid = true
165 |             if (r > 0) {
166 |               rating = Some(Rating(0, x._1, r))
167 |             }
168 |           }
169 |         } catch {
170 |           case e: Exception => println(prompt)
171 |         }
172 |       }
173 | 
174 |       rating match {
175 |         case Some(r) => Iterator(r)
176 |         case None => Iterator.empty
177 |       }
178 | 
179 |     } //end flatMap
180 | 
181 |     if (ratings.isEmpty) {
182 |       error("No rating provided")
183 |     } else {
184 |       ratings
185 |     }
186 | 
187 |   }
188 | 
189 | }
190 | 


--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/ActionTest.scala:
--------------------------------------------------------------------------------
 1 | package com.javachen.spark.examples.rdd
 2 | 
/**
 * Placeholder object in the RDD examples package; it currently declares no members.
 *
 * @author <a href="mailto:june.chan@foxmail.com">june</a>.
 * @date 2015-05-12 17:25.
 */
object ActionTest {

}
11 | 


--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/Aggregate.scala:
--------------------------------------------------------------------------------
 1 | package com.javachen.spark.examples.rdd
 2 | 
 3 | import org.apache.spark.SparkContext
 4 | 
 5 | object Aggregate {
 6 | 
 7 |   def main(args: Array[String]) {
 8 | 
 9 |     val sc = new SparkContext("local", "AggregateAction Test")
10 |     val data = Array[(String, Int)](("A1", 1), ("A2", 2),
11 |       ("B1", 3), ("B2", 4),
12 |       ("C1", 5), ("C2", 6))
13 | 
14 |     val pairs = sc.parallelize(data, 3)
15 | 
16 |     // output:
17 |     // 	(A1,1)(A2,2)
18 |     //  (B1,3)(B2,4)
19 |     //	(C1,5)(C2,6)
20 |     pairs.foreach(print)
21 | 
22 |     val result = pairs.aggregate(("", 0))((U, T) => (U._1 + T._1, U._2 + T._2), (U, T) =>
23 |       ("[" + U._1 + T._1 + "] ", U._2 + T._2))
24 | 
25 |     // output ([[[A1A2] B1B2] C1C2] ,21)
26 |     println(result)
27 |   }
28 | }


--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/AggregateOrder.scala:
--------------------------------------------------------------------------------
 1 | package com.javachen.spark.examples.rdd
 2 | 
 3 | import org.apache.spark.SparkContext
 4 | 
 5 | object AggregateOrder {
 6 | 
 7 |   def main(args: Array[String]) {
 8 | 
 9 |     val sc = new SparkContext("local", "AggregateOrder Test")
10 |     val data = List("12", "23", "345", "4567")
11 | 
12 |     val pairs = sc.parallelize(data, 2)
13 |     pairs.foreach(x => println(x.length))
14 | 
15 |     //val result = pairs.aggregate("")((x,y) => math.min(x.length, y.length).toString, (x,y) => x + y)
16 | 
17 |     val result2 = pairs.aggregate("")((x,y) => "[" + x.length + "," + y.length + "] ", (x,y) => x + y)
18 | 
19 |     result2.foreach(println)
20 |     println(result2)
21 | 
22 |   }
23 | }


--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/Cartesian.scala:
--------------------------------------------------------------------------------
 1 | package com.javachen.spark.examples.rdd
 2 | 
 3 | import org.apache.spark.SparkContext
 4 | 
 5 | object Cartesian {
 6 |   def main(args: Array[String]) {
 7 |     val sc = new SparkContext("local", "Cartesian Test")
 8 |     val data1 = Array[(String, Int)](("A1", 1), ("A2", 2),
 9 |       ("B1", 3), ("B2", 4),
10 |       ("C1", 5), ("C1", 6))
11 | 
12 |     val data2 = Array[(String, Int)](("A1", 7), ("A2", 8),
13 |       ("B1", 9), ("C1", 0))
14 |     val pairs1 = sc.parallelize(data1, 3)
15 |     val pairs2 = sc.parallelize(data2, 2)
16 | 
17 |     val resultRDD = pairs1.cartesian(pairs2)
18 | 
19 |     resultRDD.foreach(println)
20 | 
21 |     /*
22 |      * Output of task1:
23 |      * ((A1,1),(A1,7))
24 |      * ((A1,1),(A2,8))
25 |      * ((A2,2),(A1,7))
26 |      * ((A2,2),(A2,8))
27 |      * Output of task2:
28 |      * ((A1,1),(B1,9))
29 |      * ((A1,1),(C1,0))
30 |      * ((A2,2),(B1,9))
31 |      * ((A2,2),(C1,0))
32 |      * Output of task3:
33 |      * ((B1,3),(A1,7))
34 |      * ((B1,3),(A2,8))
35 |      * ((B2,4),(A1,7))
36 |      * ((B2,4),(A2,8))
37 |      * Output of task4:
38 |      * ((B1,3),(B1,9))
39 |      * ((B1,3),(C1,0))
40 |      * ((B2,4),(B1,9))
41 |      * ((B2,4),(C1,0))
42 |      * Output of task5:
43 |      * ((C1,5),(A1,7))
44 |      * ((C1,5),(A2,8))
45 |      * ((C1,6),(A1,7))
46 |      * ((C1,6),(A2,8))
47 |      * Output of task6:
48 |      * ((C1,5),(B1,9))
49 |      * ((C1,5),(C1,0))
50 |      * ((C1,6),(B1,9))
51 |      * ((C1,6),(C1,0))
52 |      */
53 | 
54 |   }
55 | }


--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/CollectAsMap.scala:
--------------------------------------------------------------------------------
 1 | package com.javachen.spark.examples.rdd
 2 | 
 3 | import org.apache.spark.SparkContext
 4 | import org.apache.spark.SparkContext._
 5 | 
 6 | object CollectAsMap {
 7 |   def main(args: Array[String]) {
 8 | 
 9 |     val sc = new SparkContext("local", "CollectAsMap Test")
10 |     val data = Array[(String, Int)](("A", 1), ("B", 2),
11 |       ("B", 3), ("C", 4),
12 |       ("C", 5), ("C", 6))
13 | 
14 |     // as same as "val pairs = sc.parallelize(data, 3)"
15 |     val pairs = sc.makeRDD(data, 3)
16 | 
17 |     val result = pairs.collectAsMap
18 | 
19 |     // output Map(A -> 1, C -> 6, B -> 3)
20 |     print(result)
21 |   }
22 | 
23 | }


--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/FlatMap.scala:
--------------------------------------------------------------------------------
 1 | package com.javachen.spark.examples.rdd
 2 | 
 3 | import org.apache.spark.SparkContext
 4 | 
 5 | object FlatMap {
 6 |   def main(args: Array[String]) {
 7 | 
 8 |     val sc = new SparkContext("local", "FlatMap Test")
 9 |     val data = Array[(String, Int)](("A", 1), ("B", 2),
10 |       ("B", 3), ("C", 4),
11 |       ("C", 5), ("C", 6)
12 |     )
13 |     val pairs = sc.makeRDD(data, 3)
14 | 
15 |     val result = pairs.flatMap(T => (T._1 + T._2))
16 | 
17 |     result.foreach(println)
18 | 
19 |   }
20 | }


--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/GroupByAction.scala:
--------------------------------------------------------------------------------
 1 | package com.javachen.spark.examples.rdd
 2 | 
 3 | import org.apache.spark.SparkContext
 4 | import org.apache.spark.RangePartitioner
 5 | 
 6 | object GroupByAction {
 7 |   def main(args: Array[String]) {
 8 | 
 9 |     val sc = new SparkContext("local", "GroupByAction Test")
10 | 
11 |     val data = Array[(String, Int)](("A1", 1), ("A2", 2),
12 |       ("B1", 6), ("A2", 4),
13 |       ("B1", 3), ("B1", 5))
14 | 
15 |     val pairs = sc.parallelize(data, 3)
16 | 
17 |     // output:
18 |     // (A1,1)
19 |     // (A2,2)
20 |     //
21 |     // (B1,6)
22 |     // (A2,4)
23 |     //
24 |     // (B1,3)
25 |     // (B1,5)
26 |     pairs.foreach(println)
27 | 
28 |     val result1 = pairs.groupBy(K => K._1)
29 |     val result2 = pairs.groupBy((K : (String, Int)) => K._1, 1)
30 |     val result3 = pairs.groupBy((K : (String, Int)) => K._1, new RangePartitioner(3, pairs))
31 | 
32 |     // output of result1:
33 |     // (A1,ArrayBuffer((A1,1)))
34 |     //
35 |     // (B1,ArrayBuffer((B1,6), (B1,3), (B1,5)))
36 |     // (A2,ArrayBuffer((A2,2), (A2,4)))
37 |     result1.foreach(println)
38 | 
39 |     // output of result2:
40 |     // (A1,ArrayBuffer((A1,1)))
41 |     // (B1,ArrayBuffer((B1,6), (B1,3), (B1,5)))
42 |     // (A2,ArrayBuffer((A2,2), (A2,4)))
43 |     result2.foreach(println)
44 | 
45 |     // output of result3:
46 |     // (A1,ArrayBuffer((A1,1)))
47 |     // (A2,ArrayBuffer((A2,2), (A2,4)))
48 |     //
49 |     // (B1,ArrayBuffer((B1,6), (B1,3), (B1,5)))
50 |     result3.foreach(println)
51 | 
52 |   }
53 | 
54 | }


--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/GroupByKey.scala:
--------------------------------------------------------------------------------
 1 | package com.javachen.spark.examples.rdd
 2 | 
 3 | import java.util.Random
 4 | 
 5 | import org.apache.spark.{SparkConf, SparkContext}
 6 | import org.apache.spark.SparkContext._
 7 | 
 8 | /**
 9 |  * Usage: GroupByTest [numMappers] [numKVPairs] [KeySize] [numReducers]
10 |  */
11 | object GroupByKey {
12 |   def main(args: Array[String]) {
13 |     val sparkConf = new SparkConf().setAppName("GroupBy Test").setMaster("local[2]")
14 |     var numMappers = 10
15 |     var numKVPairs = 100
16 |     var valSize = 100
17 |     var numReducers = 3
18 | 
19 |     val sc = new SparkContext(sparkConf)
20 | 
21 |     val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
22 |       val ranGen = new Random
23 |       var arr1 = new Array[(Int, Array[Byte])](numKVPairs)
24 |       for (i <- 0 until numKVPairs) {
25 |         val byteArr = new Array[Byte](valSize)
26 |         ranGen.nextBytes(byteArr)
27 |         arr1(i) = (ranGen.nextInt(10), byteArr)
28 |       }
29 |       arr1
30 |     }.cache
31 |     // Enforce that everything has been calculated and in cache
32 |     pairs1.count
33 | 
34 |     val result = pairs1.groupByKey(numReducers)
35 |     println(result.count)
36 |     println(result.toDebugString)
37 | 
38 |     sc.stop()
39 |   }
40 | }


--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/GroupWith.scala:
--------------------------------------------------------------------------------
 1 | package com.javachen.spark.examples.rdd
 2 | 
 3 | import org.apache.spark.SparkContext
 4 | import org.apache.spark.SparkContext._
 5 | 
 6 | object GroupWith {
 7 |   def main(args: Array[String]) {
 8 | 
 9 |     val sc = new SparkContext("local[2]", "GroupWith Test")
10 | 
11 |     val data1 = Array[(String, Int)](("A1", 1), ("A2", 2),
12 |       ("B1", 3), ("B2", 4),
13 |       ("C1", 5), ("C1", 6)
14 |     )
15 | 
16 |     val data2 = Array[(String, Int)](("A1", 7), ("A2", 8),
17 |       ("B1", 9), ("C1", 0)
18 |     )
19 |     val pairs1 = sc.parallelize(data1, 3)
20 |     val pairs2 = sc.parallelize(data2, 2)
21 | 
22 |     val result = pairs1.groupWith(pairs2)
23 |     result.foreach(println)
24 | 
25 |     // output:
26 |     // (B1,(ArrayBuffer(3),ArrayBuffer(9)))
27 |     // (A1,(ArrayBuffer(1),ArrayBuffer(7)))
28 |     // (A2,(ArrayBuffer(2),ArrayBuffer(8)))
29 |     //
30 |     // (C1,(ArrayBuffer(5, 6),ArrayBuffer(0)))
31 |     // (B2,(ArrayBuffer(4),ArrayBuffer()))
32 | 
33 | 
34 |   }
35 | }


--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/Join.scala:
--------------------------------------------------------------------------------
 1 | package com.javachen.spark.examples.rdd
 2 | 
 3 | import org.apache.spark.SparkContext
 4 | import org.apache.spark.SparkContext._
 5 | 
 6 | object Join {
 7 |   def main(args: Array[String]) {
 8 | 
 9 |     val sc = new SparkContext("local[2]", "JoinAction Test")
10 | 
11 |     val data1 = Array[(String, Int)](("A1", 1), ("A2", 2),
12 |       ("B1", 3), ("B2", 4),
13 |       ("C1", 5), ("C1", 6)
14 |     )
15 | 
16 |     val data2 = Array[(String, Int)](("A1", 7), ("A2", 8),
17 |       ("B1", 9), ("C1", 0)
18 |     )
19 |     val pairs1 = sc.parallelize(data1, 3)
20 |     val pairs2 = sc.parallelize(data2, 2)
21 | 
22 | 
23 |     val result = pairs1.join(pairs2)
24 | 
25 |     // output:
26 |     // (A1,(1,7))
27 |     // (B1,(3,9))
28 |     // (A2,(2,8))
29 |     //
30 |     // (C1,(5,0))
31 |     // (C1,(6,0))
32 |     result.foreach(println)
33 |   }
34 | 
35 | }


--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/Lookup.scala:
--------------------------------------------------------------------------------
 1 | package com.javachen.spark.examples.rdd
 2 | 
 3 | import org.apache.spark.SparkContext
 4 | import org.apache.spark.SparkContext._
 5 | 
 6 | object Lookup {
 7 |   def main(args: Array[String]) {
 8 | 
 9 |     val sc = new SparkContext("local", "LookUp Test")
10 | 
11 |     val data = Array[(String, Int)](("A", 1), ("B", 2),
12 |       ("B", 3), ("C", 4),
13 |       ("C", 5), ("C", 6))
14 | 
15 |     val pairs = sc.parallelize(data, 3)
16 | 
17 |     val finalRDD = pairs.lookup("B")
18 | 
19 |     finalRDD.foreach(println)
20 |     // output:
21 |     // 2
22 |     // 3
23 |   }
24 | }


--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/MapPartitions.scala:
--------------------------------------------------------------------------------
 1 | package com.javachen.spark.examples.rdd
 2 | 
 3 | import org.apache.spark.SparkContext
 4 | 
 5 | object MapPartitions {
 6 | 
 7 |   def main(args: Array[String]) {
 8 |     val sc = new SparkContext("local", "MapPartitionsRDD Test")
 9 |     val data = Array[(String, Int)](("A1", 1), ("A2", 2),
10 |       ("B1", 1), ("B2", 4),
11 |       ("C1", 3), ("C2", 4)
12 |     )
13 |     val pairs = sc.parallelize(data, 3)
14 | 
15 |     val finalRDD = pairs.mapPartitions(iter => iter.filter(_._2 >= 2))
16 |     // val finalRDD2 = pairs.mapPartitionsWithIndex(f, preservesPartitioning)
17 | 
18 |     finalRDD.toArray().foreach(println)
19 | 
20 |   }
21 | }


--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/MapValues.scala:
--------------------------------------------------------------------------------
 1 | package com.javachen.spark.examples.rdd
 2 | 
 3 | import org.apache.spark.SparkContext
 4 | 
 5 | object MapValues {
 6 |   def main(args: Array[String]) {
 7 | 
 8 |     val sc = new SparkContext("local", "ReduceByKeyToDriver Test")
 9 |     val data1 = Array[(String, Int)](("K", 1), ("T", 2),
10 |       ("T", 3), ("W", 4),
11 |       ("W", 5), ("W", 6)
12 |     )
13 |     val pairs = sc.parallelize(data1, 3)
14 |     //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
15 |     //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
16 |     //val result = pairs.partitionBy(new RangePartitioner(2, pairs, true))
17 |     val result = pairs.mapValues(V => 10 * V)
18 |     result.foreach(println)
19 |   }
20 | }


--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/PartitionBy.scala:
--------------------------------------------------------------------------------
 1 | package com.javachen.spark.examples.rdd
 2 | 
 3 | import org.apache.spark.{RangePartitioner,HashPartitioner, SparkContext}
 4 | 
 5 | object PartitionBy {
 6 |   def main(args: Array[String]) {
 7 | 
 8 |     val sc = new SparkContext("local", "ReduceByKeyToDriver Test")
 9 |     val data1 = Array[(String, Int)](("K", 1), ("T", 2),
10 |       ("T", 3), ("W", 4),
11 |       ("W", 5), ("W", 6)
12 |     )
13 |     val pairs = sc.parallelize(data1, 3)
14 |     //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
15 |     //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
16 |     var result = pairs.partitionBy(new RangePartitioner(2, pairs, true))
17 |     result = pairs.partitionBy(new HashPartitioner(2))
18 |     result.foreach(println)
19 |   }
20 | }


--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/Pipe.scala:
--------------------------------------------------------------------------------
 1 | package com.javachen.spark.examples.rdd
 2 | 
 3 | /**
 4 |  *
 5 |  * @author <a href="mailto:june.chan@foxmail.com">june</a>.
 6 |  * @date 2015-05-12 17:21.
 7 |  */
 8 | object Pipe {
 9 | 
10 | }
11 | 


--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/ReduceByKey.scala:
--------------------------------------------------------------------------------
 1 | package com.javachen.spark.examples.rdd
 2 | 
 3 | import org.apache.spark.SparkContext
 4 | 
 5 | object ReduceByKey {
 6 | 
 7 |   def main(args: Array[String]) {
 8 | 
 9 |     val sc = new SparkContext("local", "ReduceByKeyToDriver Test")
10 |     val data1 = Array[(String, Int)](("K", 1), ("U", 2),
11 |       ("U", 3), ("W", 4),
12 |       ("W", 5), ("W", 6))
13 |     val pairs = sc.parallelize(data1, 3)
14 |     //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
15 |     //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
16 |     val result = pairs.reduceByKey(_ + _, 2)
17 |     result.foreach(println)
18 |   }
19 | 
20 | }


--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/ScalaWordCount.scala:
--------------------------------------------------------------------------------
 1 | package com.javachen.spark.examples.rdd
 2 | 
 3 | import org.apache.spark.{SparkConf, SparkContext}
 4 | 
 5 | object ScalaWordCount {
 6 |   def main(args: Array[String]) {
 7 |     val sc = new SparkContext(new SparkConf().setAppName("ScalaWordCount"))
 8 |     val threshold = args(1).toInt
 9 | 
10 |     // split each document into words
11 |     val tokenized = sc.textFile(args(0)).flatMap(_.split(" "))
12 | 
13 |     // count the occurrence of each word
14 |     val wordCounts = tokenized.map((_, 1)).reduceByKey(_ + _)
15 | 
16 |     // filter out words with less than threshold occurrences
17 |     val filtered = wordCounts.filter(_._2 >= threshold)
18 | 
19 |     // count characters
20 |     val charCounts = filtered.flatMap(_._1.toCharArray).map((_, 1)).reduceByKey(_ + _)
21 | 
22 |     System.out.println(charCounts.collect().mkString(", "))
23 |   }
24 | }


--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/TransformTest.scala:
--------------------------------------------------------------------------------
 1 | package com.javachen.spark.examples.rdd
 2 | 
 3 | /**
 4 |  *
 5 |  * @author <a href="mailto:june.chan@foxmail.com">june</a>.
 6 |  * @date 2015-05-12 17:25.
 7 |  */
 8 | object TransformTest {
 9 | 
10 | }
11 | 


--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/sparksql/ScalaSparkSQLByReflection.scala:
--------------------------------------------------------------------------------
 1 | package com.javachen.spark.examples.sparksql
 2 | 
 3 | import org.apache.spark.{SparkConf, SparkContext}
 4 | 
 5 | object ScalaSparkSQLByReflection {
 6 | 
 7 |   // Define the schema using a case class.
 8 |   // Note: Case classes in Scala 2.10 can support only up to 22 fields. To work around this limit,
 9 |   // you can use custom classes that implement the Product interface.
10 |   case class People(name: String, age: Int)
11 | 
12 |   def main(args: Array[String]) {
13 |     val sc = new SparkContext(new SparkConf().setAppName("ScalaSparkSQL"))
14 |     val sqlContext = new org.apache.spark.sql.SQLContext(sc)
15 | 
16 |     // this is used to implicitly convert an RDD to a DataFrame.
17 |     import sqlContext.implicits._
18 | 
19 |     // Create an RDD of People objects and register it as a table.
20 |     val people = sc.textFile("people.txt").map(_.split(",")).map(p => People(p(0), p(1).trim.toInt)).toDF()
21 |     people.registerTempTable("people")
22 | 
23 |     // SQL statements can be run by using the sql methods provided by sqlContext.
24 |     val teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
25 | 
26 |     // The results of SQL queries are DataFrames and support all the normal RDD operations.
27 |     // The columns of a row in the result can be accessed by ordinal.
28 |     teenagers.map(t => "Name: " + t(0)).collect().foreach(println)
29 | 
30 |     people.saveAsParquetFile("people.parquet")
31 | 
32 |     val parquetFile = sqlContext.parquetFile("people.parquet")
33 |   }
34 | }


--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/sparksql/ScalaSparkSQLBySchema.scala:
--------------------------------------------------------------------------------
 1 | package com.javachen.spark.examples.sparksql
 2 | 
 3 | import org.apache.spark.sql.types.{StringType, StructField, StructType}
 4 | import org.apache.spark.{SparkConf, SparkContext}
 5 | 
 6 | object ScalaSparkSQLBySchema {
 7 | 
 8 |   def main(args: Array[String]) {
 9 |     val sc = new SparkContext(new SparkConf().setAppName("ScalaSparkSQL"))
10 |     val sqlContext = new org.apache.spark.sql.SQLContext(sc)
11 | 
12 |     // Create an RDD
13 |     val people = sc.textFile("people.txt")
14 | 
15 |     // The schema is encoded in a string
16 |     val schemaString = "name age"
17 | 
18 |     // Import Spark SQL data types and Row.
19 |     import org.apache.spark.sql._
20 | 
21 |     // Generate the schema based on the string of schema
22 |     val schema =
23 |       StructType(
24 |         schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, true)))
25 | 
26 |     // Convert records of the RDD (people) to Rows.
27 |     val rowRDD = people.map(_.split(",")).map(p => Row(p(0), p(1).trim))
28 | 
29 |     // Apply the schema to the RDD.
30 |     val peopleDataFrame = sqlContext.createDataFrame(rowRDD, schema)
31 | 
32 |     // Register the DataFrames as a table.
33 |     peopleDataFrame.registerTempTable("people")
34 | 
35 |     // SQL statements can be run by using the sql methods provided by sqlContext.
36 |     val results = sqlContext.sql("SELECT name FROM people")
37 | 
38 |     // The results of SQL queries are DataFrames and support all the normal RDD operations.
39 |     // The columns of a row in the result can be accessed by ordinal.
40 |     results.map(t => "Name: " + t(0)).collect().foreach(println)
41 |   }
42 | }


--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/BroadcastTest.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package org.apache.spark.examples
19 | 
20 | import org.apache.spark.{SparkConf, SparkContext}
21 | 
/**
  * Broadcasts an integer array three times and, for each round, prints the
  * array size observed by ten tasks together with the elapsed time.
  *
  * Usage: BroadcastTest [slices] [numElem] [broadcastAlgo] [blockSize]
  */
object BroadcastTest {
  def main(args: Array[String]): Unit = {

    // Positional argument lookup with a textual fallback.
    def argOrElse(position: Int, fallback: String): String =
      if (args.length > position) args(position) else fallback

    val factoryName = argOrElse(2, "Http")
    val blockSize = argOrElse(3, "4096")

    val conf = new SparkConf()
      .setAppName("Broadcast Test")
      .set("spark.broadcast.factory", s"org.apache.spark.broadcast.${factoryName}BroadcastFactory")
      .set("spark.broadcast.blockSize", blockSize)
    val sc = new SparkContext(conf)

    // Numeric arguments are parsed only after the context exists, as before.
    val slices = argOrElse(0, "2").toInt
    val num = argOrElse(1, "1000000").toInt

    val payload = Array.range(0, num)

    (0 until 3).foreach { round =>
      println("Iteration " + round)
      println("===========")
      val startedAt = System.nanoTime
      val shared = sc.broadcast(payload)
      val observedSizes = sc.parallelize(1 to 10, slices).map(_ => shared.value.size)
      // Collect the small RDD so we can print the observed sizes locally.
      observedSizes.collect().foreach(size => println(size))
      println("Iteration %d took %.0f milliseconds".format(round, (System.nanoTime - startedAt) / 1E6))
    }

    sc.stop()
  }
}
55 | 


--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/DriverSubmissionTest.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package org.apache.spark.examples
19 | 
20 | import scala.collection.JavaConversions._
21 | 
22 | import org.apache.spark.util.Utils
23 | 
/**
  * Prints the environment variables whose name contains "SPARK_TEST" and the
  * system properties whose key contains "spark.test", then prints a heartbeat
  * line once per second before exiting.
  *
  * Usage: DriverSubmissionTest <seconds-to-sleep>
  */
object DriverSubmissionTest {
  def main(args: Array[String]): Unit = {
    if (args.size < 1) {
      println("Usage: DriverSubmissionTest <seconds-to-sleep>")
      System.exit(0)
    }
    val numSecondsToSleep = args(0).toInt

    val environment = System.getenv()
    val systemProps = Utils.getSystemProperties

    println("Environment variables containing SPARK_TEST:")
    val matchingEnv = environment.filter { case (name, _) => name.contains("SPARK_TEST") }
    matchingEnv.foreach(println)

    println("System properties containing spark.test:")
    val matchingProps = systemProps.filter { case (key, _) => key.toString.contains("spark.test") }
    matchingProps.foreach(println)

    // Heartbeat: one line, then a one-second pause, for 1 until numSecondsToSleep.
    (1 until numSecondsToSleep).foreach { elapsed =>
      println(s"Alive for $elapsed out of $numSecondsToSleep seconds")
      Thread.sleep(1000)
    }
  }
}
49 | 


--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/ExceptionHandlingTest.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package org.apache.spark.examples
19 | 
20 | import org.apache.spark.{SparkConf, SparkContext}
21 | 
/**
 * Runs one element per default-parallelism slot and makes each of them throw
 * with a probability of roughly 25%, to exercise task failure handling.
 */
object ExceptionHandlingTest {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("ExceptionHandlingTest"))

    val elements = 0 until sc.defaultParallelism
    sc.parallelize(elements).foreach { _ =>
      val shouldFail = math.random > 0.75
      if (shouldFail) throw new Exception("Testing exception handling")
    }

    sc.stop()
  }
}
35 | 


--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/GroupByTest.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package org.apache.spark.examples
19 | 
20 | import java.util.Random
21 | 
22 | import org.apache.spark.{SparkConf, SparkContext}
23 | import org.apache.spark.SparkContext._
24 | 
/**
  * Generates random (Int, byte-array) pairs on several mappers, caches them,
  * and prints the number of groups produced by `groupByKey`.
  *
  * Usage: GroupByTest [numMappers] [numKVPairs] [valSize] [numReducers]
  */
object GroupByTest {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("GroupBy Test")

    // Optional positional integer argument; the default is evaluated only when needed.
    def intArg(position: Int, default: => Int): Int =
      if (args.length > position) args(position).toInt else default

    val numMappers = intArg(0, 2)
    val numKVPairs = intArg(1, 1000)
    val valSize = intArg(2, 1000)
    val numReducers = intArg(3, numMappers)

    val sc = new SparkContext(sparkConf)

    val pairs = sc.parallelize(0 until numMappers, numMappers).flatMap { _ =>
      val generator = new Random
      // Each pair: fill the value bytes first, then draw the key.
      Array.fill[(Int, Array[Byte])](numKVPairs) {
        val bytes = new Array[Byte](valSize)
        generator.nextBytes(bytes)
        (generator.nextInt(Int.MaxValue), bytes)
      }
    }.cache()
    // Enforce that everything has been calculated and in cache
    pairs.count()

    println(pairs.groupByKey(numReducers).count())

    sc.stop()
  }
}
56 | 


--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/HdfsTest.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package org.apache.spark.examples
19 | 
20 | import org.apache.spark._
21 | 
22 | 
/**
 * Reads a text file, caches the length of every line and times ten passes
 * over the cached lengths.
 */
object HdfsTest {

  /** Usage: HdfsTest [file] */
  def main(args: Array[String]): Unit = {
    if (args.length < 1) {
      System.err.println("Usage: HdfsTest <file>")
      System.exit(1)
    }
    val sc = new SparkContext(new SparkConf().setAppName("HdfsTest"))
    val lineLengths = sc.textFile(args(0)).map(_.length).cache()

    (1 to 10).foreach { iter =>
      val start = System.currentTimeMillis()
      // Touch every element; the computed value is intentionally discarded.
      lineLengths.foreach { length => length + 2 }
      val end = System.currentTimeMillis()
      println(s"Iteration $iter took ${end - start} ms")
    }

    sc.stop()
  }
}


--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/LocalALS.scala:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
  3 |  * contributor license agreements.  See the NOTICE file distributed with
  4 |  * this work for additional information regarding copyright ownership.
  5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
  6 |  * (the "License"); you may not use this file except in compliance with
  7 |  * the License.  You may obtain a copy of the License at
  8 |  *
  9 |  *    http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  * Unless required by applicable law or agreed to in writing, software
 12 |  * distributed under the License is distributed on an "AS IS" BASIS,
 13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  * See the License for the specific language governing permissions and
 15 |  * limitations under the License.
 16 |  */
 17 | 
 18 | package org.apache.spark.examples
 19 | 
 20 | import org.apache.commons.math3.linear._
 21 | 
 22 | /**
 23 |  * Alternating least squares matrix factorization.
 24 |  *
 25 |  * This is an example implementation for learning how to use Spark. For more conventional use,
 26 |  * please refer to org.apache.spark.mllib.recommendation.ALS
 27 |  */
 28 | object LocalALS {
 29 | 
 30 |   // Parameters set through command line arguments
 31 |   var M = 0 // Number of movies
 32 |   var U = 0 // Number of users
 33 |   var F = 0 // Number of features
 34 |   var ITERATIONS = 0
 35 |   val LAMBDA = 0.01 // Regularization coefficient
 36 | 
 37 |   def generateR(): RealMatrix = {
 38 |     val mh = randomMatrix(M, F)
 39 |     val uh = randomMatrix(U, F)
 40 |     mh.multiply(uh.transpose())
 41 |   }
 42 | 
 43 |   def rmse(targetR: RealMatrix, ms: Array[RealVector], us: Array[RealVector]): Double = {
 44 |     val r = new Array2DRowRealMatrix(M, U)
 45 |     for (i <- 0 until M; j <- 0 until U) {
 46 |       r.setEntry(i, j, ms(i).dotProduct(us(j)))
 47 |     }
 48 |     val diffs = r.subtract(targetR)
 49 |     var sumSqs = 0.0
 50 |     for (i <- 0 until M; j <- 0 until U) {
 51 |       val diff = diffs.getEntry(i, j)
 52 |       sumSqs += diff * diff
 53 |     }
 54 |     math.sqrt(sumSqs / (M.toDouble * U.toDouble))
 55 |   }
 56 | 
 57 |   def updateMovie(i: Int, m: RealVector, us: Array[RealVector], R: RealMatrix) : RealVector = {
 58 |     var XtX: RealMatrix = new Array2DRowRealMatrix(F, F)
 59 |     var Xty: RealVector = new ArrayRealVector(F)
 60 |     // For each user that rated the movie
 61 |     for (j <- 0 until U) {
 62 |       val u = us(j)
 63 |       // Add u * u^t to XtX
 64 |       XtX = XtX.add(u.outerProduct(u))
 65 |       // Add u * rating to Xty
 66 |       Xty = Xty.add(u.mapMultiply(R.getEntry(i, j)))
 67 |     }
 68 |     // Add regularization coefficients to diagonal terms
 69 |     for (d <- 0 until F) {
 70 |       XtX.addToEntry(d, d, LAMBDA * U)
 71 |     }
 72 |     // Solve it with Cholesky
 73 |     new CholeskyDecomposition(XtX).getSolver.solve(Xty)
 74 |   }
 75 | 
 76 |   def updateUser(j: Int, u: RealVector, ms: Array[RealVector], R: RealMatrix) : RealVector = {
 77 |     var XtX: RealMatrix = new Array2DRowRealMatrix(F, F)
 78 |     var Xty: RealVector = new ArrayRealVector(F)
 79 |     // For each movie that the user rated
 80 |     for (i <- 0 until M) {
 81 |       val m = ms(i)
 82 |       // Add m * m^t to XtX
 83 |       XtX = XtX.add(m.outerProduct(m))
 84 |       // Add m * rating to Xty
 85 |       Xty = Xty.add(m.mapMultiply(R.getEntry(i, j)))
 86 |     }
 87 |     // Add regularization coefficients to diagonal terms
 88 |     for (d <- 0 until F) {
 89 |       XtX.addToEntry(d, d, LAMBDA * M)
 90 |     }
 91 |     // Solve it with Cholesky
 92 |     new CholeskyDecomposition(XtX).getSolver.solve(Xty)
 93 |   }
 94 | 
 95 |   def showWarning() {
 96 |     System.err.println(
 97 |       """WARN: This is a naive implementation of ALS and is given as an example!
 98 |         |Please use the ALS method found in org.apache.spark.mllib.recommendation
 99 |         |for more conventional use.
100 |       """.stripMargin)
101 |   }
102 | 
103 |   def main(args: Array[String]) {
104 | 
105 |     args match {
106 |       case Array(m, u, f, iters) => {
107 |         M = m.toInt
108 |         U = u.toInt
109 |         F = f.toInt
110 |         ITERATIONS = iters.toInt
111 |       }
112 |       case _ => {
113 |         System.err.println("Usage: LocalALS <M> <U> <F> <iters>")
114 |         System.exit(1)
115 |       }
116 |     }
117 | 
118 |     showWarning()
119 | 
120 |     println(s"Running with M=$M, U=$U, F=$F, iters=$ITERATIONS")
121 | 
122 |     val R = generateR()
123 | 
124 |     // Initialize m and u randomly
125 |     var ms = Array.fill(M)(randomVector(F))
126 |     var us = Array.fill(U)(randomVector(F))
127 | 
128 |     // Iteratively update movies then users
129 |     for (iter <- 1 to ITERATIONS) {
130 |       println(s"Iteration $iter:")
131 |       ms = (0 until M).map(i => updateMovie(i, ms(i), us, R)).toArray
132 |       us = (0 until U).map(j => updateUser(j, us(j), ms, R)).toArray
133 |       println("RMSE = " + rmse(R, ms, us))
134 |       println()
135 |     }
136 |   }
137 | 
138 |   private def randomVector(n: Int): RealVector =
139 |     new ArrayRealVector(Array.fill(n)(math.random))
140 | 
141 |   private def randomMatrix(rows: Int, cols: Int): RealMatrix =
142 |     new Array2DRowRealMatrix(Array.fill(rows, cols)(math.random))
143 | 
144 | }
145 | 


--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/LocalFileLR.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package org.apache.spark.examples
19 | 
20 | import java.util.Random
21 | 
22 | import breeze.linalg.{Vector, DenseVector}
23 | 
/**
 * Logistic regression based classification, trained on points read from a local file.
 *
 * This is an example implementation for learning how to use Spark. For more conventional use,
 * please refer to either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
 * org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS based on your needs.
 *
 * Usage: LocalFileLR <file> <iters>
 */
object LocalFileLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  /**
   * Parses one space-separated line of numbers: the first number is the label,
   * the following (up to `D`) numbers are the features.
   *
   * @param line the text line to parse
   * @return the parsed point
   * @throws NumberFormatException if a token is not a valid double
   */
  def parsePoint(line: String): DataPoint = {
    val nums = line.split(' ').map(_.toDouble)
    DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0))
  }

  /** Prints a notice on stderr that this implementation is for illustration only. */
  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
        |for more conventional use.
      """.stripMargin)
  }

  /**
   * @param args `args(0)` is the path of the input file, `args(1)` the number of iterations
   */
  def main(args: Array[String]) {

    // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
    if (args.length < 2) {
      System.err.println("Usage: LocalFileLR <file> <iters>")
      System.exit(1)
    }

    showWarning()

    // Materialize the lines, then close the file handle even if reading fails.
    val source = scala.io.Source.fromFile(args(0))
    val lines = try source.getLines().toArray finally source.close()
    val points = lines.map(parsePoint _)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- points) {
        val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y
        gradient += p.x * scale
      }
      w -= gradient
    }

    println("Final w: " + w)
  }
}
76 | 


--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/LocalKMeans.scala:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
  3 |  * contributor license agreements.  See the NOTICE file distributed with
  4 |  * this work for additional information regarding copyright ownership.
  5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
  6 |  * (the "License"); you may not use this file except in compliance with
  7 |  * the License.  You may obtain a copy of the License at
  8 |  *
  9 |  *    http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  * Unless required by applicable law or agreed to in writing, software
 12 |  * distributed under the License is distributed on an "AS IS" BASIS,
 13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  * See the License for the specific language governing permissions and
 15 |  * limitations under the License.
 16 |  */
 17 | 
 18 | package org.apache.spark.examples
 19 | 
 20 | import java.util.Random
 21 | 
 22 | import scala.collection.mutable.HashMap
 23 | import scala.collection.mutable.HashSet
 24 | 
 25 | import breeze.linalg.{Vector, DenseVector, squaredDistance}
 26 | 
 27 | import org.apache.spark.SparkContext._
 28 | 
 29 | /**
 30 |  * K-means clustering.
 31 |  *
 32 |  * This is an example implementation for learning how to use Spark. For more conventional use,
 33 |  * please refer to org.apache.spark.mllib.clustering.KMeans
 34 |  */
 35 | object LocalKMeans {
 36 |   val N = 1000
 37 |   val R = 1000    // Scaling factor
 38 |   val D = 10
 39 |   val K = 10
 40 |   val convergeDist = 0.001
 41 |   val rand = new Random(42)
 42 | 
 43 |   def generateData = {
 44 |     def generatePoint(i: Int) = {
 45 |       DenseVector.fill(D){rand.nextDouble * R}
 46 |     }
 47 |     Array.tabulate(N)(generatePoint)
 48 |   }
 49 | 
 50 |   def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = {
 51 |     var index = 0
 52 |     var bestIndex = 0
 53 |     var closest = Double.PositiveInfinity
 54 | 
 55 |     for (i <- 1 to centers.size) {
 56 |       val vCurr = centers.get(i).get
 57 |       val tempDist = squaredDistance(p, vCurr)
 58 |       if (tempDist < closest) {
 59 |         closest = tempDist
 60 |         bestIndex = i
 61 |       }
 62 |     }
 63 | 
 64 |     bestIndex
 65 |   }
 66 | 
 67 |   def showWarning() {
 68 |     System.err.println(
 69 |       """WARN: This is a naive implementation of KMeans Clustering and is given as an example!
 70 |         |Please use the KMeans method found in org.apache.spark.mllib.clustering
 71 |         |for more conventional use.
 72 |       """.stripMargin)
 73 |   }
 74 | 
 75 |   def main(args: Array[String]) {
 76 | 
 77 |     showWarning()
 78 | 
 79 |     val data = generateData
 80 |     var points = new HashSet[Vector[Double]]
 81 |     var kPoints = new HashMap[Int, Vector[Double]]
 82 |     var tempDist = 1.0
 83 | 
 84 |     while (points.size < K) {
 85 |       points.add(data(rand.nextInt(N)))
 86 |     }
 87 | 
 88 |     val iter = points.iterator
 89 |     for (i <- 1 to points.size) {
 90 |       kPoints.put(i, iter.next())
 91 |     }
 92 | 
 93 |     println("Initial centers: " + kPoints)
 94 | 
 95 |     while(tempDist > convergeDist) {
 96 |       var closest = data.map (p => (closestPoint(p, kPoints), (p, 1)))
 97 | 
 98 |       var mappings = closest.groupBy[Int] (x => x._1)
 99 | 
100 |       var pointStats = mappings.map { pair =>
101 |         pair._2.reduceLeft [(Int, (Vector[Double], Int))] {
102 |           case ((id1, (x1, y1)), (id2, (x2, y2))) => (id1, (x1 + x2, y1 + y2))
103 |         }
104 |       }
105 | 
106 |       var newPoints = pointStats.map {mapping =>
107 |         (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))}
108 | 
109 |       tempDist = 0.0
110 |       for (mapping <- newPoints) {
111 |         tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2)
112 |       }
113 | 
114 |       for (newP <- newPoints) {
115 |         kPoints.put(newP._1, newP._2)
116 |       }
117 |     }
118 | 
119 |     println("Final centers: " + kPoints)
120 |   }
121 | }
122 | 


--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/LocalLR.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package org.apache.spark.examples
19 | 
20 | import java.util.Random
21 | 
22 | import breeze.linalg.{Vector, DenseVector}
23 | 
/**
 * Logistic regression based classification, trained locally on generated points.
 *
 * This is an example implementation for learning how to use Spark. For more conventional use,
 * please refer to either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
 * org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS based on your needs.
 */
object LocalLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  /**
   * Generates `N` points: even indices get label -1, odd indices label 1, and each
   * of the `D` coordinates is a Gaussian sample from `rand` shifted by `label * R`.
   */
  def generateData = {
    Array.tabulate(N) { index =>
      val label = if (index % 2 == 0) -1 else 1
      val features = DenseVector.fill(D){rand.nextGaussian + label * R}
      DataPoint(features, label)
    }
  }

  /** Prints a notice on stderr that this implementation is for illustration only. */
  def showWarning(): Unit = {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
        |for more conventional use.
      """.stripMargin)
  }

  /**
   * Runs `ITERATIONS` full-batch gradient steps from a random start and prints
   * the weights before and after.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {

    showWarning()

    val data = generateData
    // Initialize w to a random value; it is updated in place below.
    val w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    (1 to ITERATIONS).foreach { i =>
      println("On iteration " + i)
      val gradient = DenseVector.zeros[Double](D)
      data.foreach { p =>
        val exponent = -p.y * (w.dot(p.x))
        val scale = (1 / (1 + math.exp(exponent)) - 1) * p.y
        gradient +=  p.x * scale
      }
      w -= gradient
    }

    println("Final w: " + w)
  }
}
80 | 


--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/LocalPi.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package org.apache.spark.examples
19 | 
20 | import scala.math.random
21 | 
22 | import org.apache.spark._
23 | import org.apache.spark.SparkContext._
24 | 
object LocalPi {
  /**
   * Estimates Pi by sampling 100000 random points in the square [-1, 1) x [-1, 1) and
   * counting how many fall strictly inside the unit circle; prints 4 * hits / samples.
   *
   * @param args ignored
   */
  def main(args: Array[String]): Unit = {
    val samples = 100000
    val hits = (1 to samples).count { _ =>
      // x is drawn before y, one pair per sample.
      val x = random * 2 - 1
      val y = random * 2 - 1
      x*x + y*y < 1
    }
    println("Pi is roughly " + 4 * hits / samples.toDouble)
  }
}
36 | 


--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/LogQuery.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package org.apache.spark.examples
19 | 
20 | import org.apache.spark.{SparkConf, SparkContext}
21 | import org.apache.spark.SparkContext._
22 | 
/**
 * Executes a roll up-style query against Apache logs.
 *
 * Usage: LogQuery [logFile]
 *
 * When no log file is given, the two built-in sample lines in `exampleApacheLogs` are used.
 */
object LogQuery {
  // Each sample is written across several source lines and joined back into ONE log line.
  // `split('\n')` is used instead of `.lines`, which on JDK 11+ resolves to
  // java.lang.String#lines (a Java Stream with no `mkString`).
  val exampleApacheLogs: List[String] = List(
    """10.10.10.10 - "FRED" [18/Jan/2013:17:56:07 +1100] "GET http://images.com/2013/Generic.jpg
      | HTTP/1.1" 304 315 "http://referall.com/" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1;
      | GTB7.4; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648; .NET CLR
      | 3.5.21022; .NET CLR 3.0.4506.2152; .NET CLR 1.0.3705; .NET CLR 1.1.4322; .NET CLR
      | 3.5.30729; Release=ARP)" "UD-1" - "image/jpeg" "whatever" 0.350 "-" - "" 265 923 934 ""
      | 62.24.11.25 images.com 1358492167 - Whatup""".stripMargin.split('\n').mkString,
    """10.10.10.10 - "FRED" [18/Jan/2013:18:02:37 +1100] "GET http://images.com/2013/Generic.jpg
      | HTTP/1.1" 304 306 "http:/referall.com" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1;
      | GTB7.4; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648; .NET CLR
      | 3.5.21022; .NET CLR 3.0.4506.2152; .NET CLR 1.0.3705; .NET CLR 1.1.4322; .NET CLR
      | 3.5.30729; Release=ARP)" "UD-1" - "image/jpeg" "whatever" 0.352 "-" - "" 256 977 988 ""
      | 0 73.23.2.15 images.com 1358492557 - Whatup""".stripMargin.split('\n').mkString
  )

  /**
   * Groups log lines by (ip, user, query), sums request count and bytes per group, and
   * prints one line per group.
   *
   * @param args optional single argument: path of a log file to read instead of the samples
   */
  def main(args: Array[String]): Unit = {

    val sparkConf = new SparkConf().setAppName("Log Query")
    val sc = new SparkContext(sparkConf)

    val dataSet =
      if (args.length == 1) sc.textFile(args(0)) else sc.parallelize(exampleApacheLogs)
    // Capture groups, in order: ip, (unused), user, dateTime, query, status, bytes, referer, ua
    // scalastyle:off
    val apacheLogRegex =
      """^([\d.]+) (\S+) (\S+) \[([\w\d:/]+\s[+\-]\d{4})\] "(.+?)" (\d{3}) ([\d\-]+) "([^"]+)" "([^"]+)".*""".r
    // scalastyle:on
    /** Tracks the total query count and number of aggregate bytes for a particular group. */
    class Stats(val count: Int, val numBytes: Int) extends Serializable {
      def merge(other: Stats): Stats = new Stats(count + other.count, numBytes + other.numBytes)
      override def toString: String = "bytes=%s\tn=%s".format(numBytes, count)
    }

    /**
     * Returns the (ip, user, query) grouping key of a log line, or (null, null, null) when
     * the line does not match the regex or its user field is the quoted dash `"-"`.
     */
    def extractKey(line: String): (String, String, String) = {
      apacheLogRegex.findFirstIn(line) match {
        case Some(apacheLogRegex(ip, _, user, _, query, _, _, _, _)) if user != "\"-\"" =>
          (ip, user, query)
        case _ => (null, null, null)
      }
    }

    /**
     * Returns the stats contribution of one log line: a count of 1 plus the line's byte size.
     * The regex lets the size field contain '-' (e.g. a bare "-"), which is not a number;
     * such values count as 0 bytes instead of failing the job.
     */
    def extractStats(line: String): Stats = {
      apacheLogRegex.findFirstIn(line) match {
        case Some(apacheLogRegex(_, _, _, _, _, _, bytes, _, _)) =>
          new Stats(1, scala.util.Try(bytes.toInt).getOrElse(0))
        case _ => new Stats(1, 0)
      }
    }

    dataSet.map(line => (extractKey(line), extractStats(line)))
      .reduceByKey((a, b) => a.merge(b))
      .collect().foreach{
        case (key, stats) => println("%s\t%s".format(key, stats))}

    sc.stop()
  }
}
86 | 


--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/MultiBroadcastTest.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package org.apache.spark.examples
19 | 
20 | import org.apache.spark.rdd.RDD
21 | import org.apache.spark.{SparkConf, SparkContext}
22 | 
/**
  * Broadcasts two integer arrays and prints the sizes that tasks observe for them.
  *
  * Usage: MultiBroadcastTest [slices] [numElem]
  */
object MultiBroadcastTest {
  /**
    * @param args optional `slices` (default 2) and `numElem` (default 1000000)
    */
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("Multi-Broadcast Test")
    val sc = new SparkContext(conf)

    val slices = if (args.length > 0) args(0).toInt else 2
    val num = if (args.length > 1) args(1).toInt else 1000000

    // Builds an array of `length` elements where element i holds the value i.
    def identityArray(length: Int): Array[Int] = {
      val values = new Array[Int](length)
      var idx = 0
      while (idx < values.length) {
        values(idx) = idx
        idx += 1
      }
      values
    }

    val arr1 = identityArray(num)
    val arr2 = identityArray(num)

    val barr1 = sc.broadcast(arr1)
    val barr2 = sc.broadcast(arr2)
    // Each of the 10 elements reports the lengths of both broadcast arrays as seen by its task.
    val observedSizes: RDD[(Int, Int)] = sc.parallelize(1 to 10, slices).map { _ =>
      (barr1.value.length, barr2.value.length)
    }
    // Collect the small RDD so we can print the observed sizes locally.
    observedSizes.collect().foreach(sizes => println(sizes))

    sc.stop()
  }
}
56 | 


--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/SimpleSkewedGroupByTest.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package org.apache.spark.examples
19 | 
20 | import java.util.Random
21 | 
22 | import org.apache.spark.{SparkConf, SparkContext}
23 | import org.apache.spark.SparkContext._
24 | 
/**
  * Generates key/value pairs whose keys are skewed towards reducer 0, then groups them by key.
  *
  * Usage: SimpleSkewedGroupByTest [numMappers] [numKVPairs] [valSize] [numReducers] [ratio]
  */
object SimpleSkewedGroupByTest {
  /**
    * @param args optional positional arguments: numMappers (default 2), numKVPairs
    *             (default 1000), valSize (default 1000), numReducers (default numMappers)
    *             and ratio (default 5.0; may be fractional)
    */
  def main(args: Array[String]): Unit = {

    val sparkConf = new SparkConf().setAppName("SimpleSkewedGroupByTest")
    val numMappers = if (args.length > 0) args(0).toInt else 2
    val numKVPairs = if (args.length > 1) args(1).toInt else 1000
    val valSize = if (args.length > 2) args(2).toInt else 1000
    val numReducers = if (args.length > 3) args(3).toInt else numMappers
    // ratio is used as a Double below, so parse it as one; integer inputs still work.
    val ratio = if (args.length > 4) args(4).toDouble else 5.0

    val sc = new SparkContext(sparkConf)

    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random
      val result = new Array[(Int, Array[Byte])](numKVPairs)
      for (i <- 0 until numKVPairs) {
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        // Multiples of numReducers, so adding `offset` does not change key % numReducers.
        val offset = ranGen.nextInt(1000) * numReducers
        if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) {
          // give ratio times higher chance of generating key 0 (for reducer 0)
          result(i) = (offset, byteArr)
        } else {
          // generate a key for one of the other reducers
          val key = 1 + ranGen.nextInt(numReducers-1) + offset
          result(i) = (key, byteArr)
        }
      }
      result
    }.cache()
    // Enforce that everything has been calculated and in cache
    pairs1.count()

    println("RESULT: " + pairs1.groupByKey(numReducers).count())
    // Print how many keys each reducer got (for debugging)
    // println("RESULT: " + pairs1.groupByKey(numReducers)
    //                           .map{case (k,v) => (k, v.size)}
    //                           .collectAsMap)

    sc.stop()
  }
}
70 | 


--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/SkewedGroupByTest.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package org.apache.spark.examples
19 | 
20 | import java.util.Random
21 | 
22 | import org.apache.spark.{SparkConf, SparkContext}
23 | import org.apache.spark.SparkContext._
24 | 
/**
  * Generates random key/value pairs where later mappers emit more pairs than earlier ones,
  * then groups them by key.
  *
  * Usage: SkewedGroupByTest [numMappers] [numKVPairs] [valSize] [numReducers]
  */
object SkewedGroupByTest {
  /**
    * @param args optional positional arguments: numMappers (default 2), numKVPairs
    *             (default 1000), valSize (default 1000) and numReducers (default numMappers)
    */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("GroupBy Test")
    val numMappers = if (args.length > 0) args(0).toInt else 2
    val numKVPairs = if (args.length > 1) args(1).toInt else 1000
    val valSize = if (args.length > 2) args(2).toInt else 1000
    val numReducers = if (args.length > 3) args(3).toInt else numMappers

    val sc = new SparkContext(sparkConf)

    val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
      val ranGen = new Random

      // map output sizes linearly increase from the 1st to the last.
      // Kept in a local val: the closure must not reassign the driver-side setting.
      val mapperKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt

      val arr1 = new Array[(Int, Array[Byte])](mapperKVPairs)
      for (i <- 0 until mapperKVPairs) {
        val byteArr = new Array[Byte](valSize)
        ranGen.nextBytes(byteArr)
        arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
      }
      arr1
    }.cache()
    // Enforce that everything has been calculated and in cache
    pairs1.count()

    println(pairs1.groupByKey(numReducers).count())

    sc.stop()
  }
}
60 | 


--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/SparkALS.scala:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
  3 |  * contributor license agreements.  See the NOTICE file distributed with
  4 |  * this work for additional information regarding copyright ownership.
  5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
  6 |  * (the "License"); you may not use this file except in compliance with
  7 |  * the License.  You may obtain a copy of the License at
  8 |  *
  9 |  *    http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  * Unless required by applicable law or agreed to in writing, software
 12 |  * distributed under the License is distributed on an "AS IS" BASIS,
 13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  * See the License for the specific language governing permissions and
 15 |  * limitations under the License.
 16 |  */
 17 | 
 18 | package org.apache.spark.examples
 19 | 
 20 | import org.apache.commons.math3.linear._
 21 | 
 22 | import org.apache.spark._
 23 | 
 24 | /**
 25 |  * Alternating least squares matrix factorization.
 26 |  *
 27 |  * This is an example implementation for learning how to use Spark. For more conventional use,
 28 |  * please refer to org.apache.spark.mllib.recommendation.ALS
 29 |  */
 30 | object SparkALS {
 31 | 
 32 |   // Parameters set through command line arguments
 33 |   var M = 0 // Number of movies
 34 |   var U = 0 // Number of users
 35 |   var F = 0 // Number of features
 36 |   var ITERATIONS = 0
 37 |   val LAMBDA = 0.01 // Regularization coefficient
 38 | 
 39 |   def generateR(): RealMatrix = {
 40 |     val mh = randomMatrix(M, F)
 41 |     val uh = randomMatrix(U, F)
 42 |     mh.multiply(uh.transpose())
 43 |   }
 44 | 
 45 |   def rmse(targetR: RealMatrix, ms: Array[RealVector], us: Array[RealVector]): Double = {
 46 |     val r = new Array2DRowRealMatrix(M, U)
 47 |     for (i <- 0 until M; j <- 0 until U) {
 48 |       r.setEntry(i, j, ms(i).dotProduct(us(j)))
 49 |     }
 50 |     val diffs = r.subtract(targetR)
 51 |     var sumSqs = 0.0
 52 |     for (i <- 0 until M; j <- 0 until U) {
 53 |       val diff = diffs.getEntry(i, j)
 54 |       sumSqs += diff * diff
 55 |     }
 56 |     math.sqrt(sumSqs / (M.toDouble * U.toDouble))
 57 |   }
 58 | 
 59 |   def update(i: Int, m: RealVector, us: Array[RealVector], R: RealMatrix) : RealVector = {
 60 |     val U = us.size
 61 |     val F = us(0).getDimension
 62 |     var XtX: RealMatrix = new Array2DRowRealMatrix(F, F)
 63 |     var Xty: RealVector = new ArrayRealVector(F)
 64 |     // For each user that rated the movie
 65 |     for (j <- 0 until U) {
 66 |       val u = us(j)
 67 |       // Add u * u^t to XtX
 68 |       XtX = XtX.add(u.outerProduct(u))
 69 |       // Add u * rating to Xty
 70 |       Xty = Xty.add(u.mapMultiply(R.getEntry(i, j)))
 71 |     }
 72 |     // Add regularization coefs to diagonal terms
 73 |     for (d <- 0 until F) {
 74 |       XtX.addToEntry(d, d, LAMBDA * U)
 75 |     }
 76 |     // Solve it with Cholesky
 77 |     new CholeskyDecomposition(XtX).getSolver.solve(Xty)
 78 |   }
 79 | 
 80 |   def showWarning() {
 81 |     System.err.println(
 82 |       """WARN: This is a naive implementation of ALS and is given as an example!
 83 |         |Please use the ALS method found in org.apache.spark.mllib.recommendation
 84 |         |for more conventional use.
 85 |       """.stripMargin)
 86 |   }
 87 | 
 88 |   def main(args: Array[String]) {
 89 | 
 90 |     var slices = 0
 91 | 
 92 |     val options = (0 to 4).map(i => if (i < args.length) Some(args(i)) else None)
 93 | 
 94 |     options.toArray match {
 95 |       case Array(m, u, f, iters, slices_) =>
 96 |         M = m.getOrElse("100").toInt
 97 |         U = u.getOrElse("500").toInt
 98 |         F = f.getOrElse("10").toInt
 99 |         ITERATIONS = iters.getOrElse("5").toInt
100 |         slices = slices_.getOrElse("2").toInt
101 |       case _ =>
102 |         System.err.println("Usage: SparkALS [M] [U] [F] [iters] [slices]")
103 |         System.exit(1)
104 |     }
105 | 
106 |     showWarning()
107 | 
108 |     println(s"Running with M=$M, U=$U, F=$F, iters=$ITERATIONS")
109 | 
110 |     val sparkConf = new SparkConf().setAppName("SparkALS")
111 |     val sc = new SparkContext(sparkConf)
112 | 
113 |     val R = generateR()
114 | 
115 |     // Initialize m and u randomly
116 |     var ms = Array.fill(M)(randomVector(F))
117 |     var us = Array.fill(U)(randomVector(F))
118 | 
119 |     // Iteratively update movies then users
120 |     val Rc  = sc.broadcast(R)
121 |     var msb = sc.broadcast(ms)
122 |     var usb = sc.broadcast(us)
123 |     for (iter <- 1 to ITERATIONS) {
124 |       println(s"Iteration $iter:")
125 |       ms = sc.parallelize(0 until M, slices)
126 |                 .map(i => update(i, msb.value(i), usb.value, Rc.value))
127 |                 .collect()
128 |       msb = sc.broadcast(ms) // Re-broadcast ms because it was updated
129 |       us = sc.parallelize(0 until U, slices)
130 |                 .map(i => update(i, usb.value(i), msb.value, Rc.value.transpose()))
131 |                 .collect()
132 |       usb = sc.broadcast(us) // Re-broadcast us because it was updated
133 |       println("RMSE = " + rmse(R, ms, us))
134 |       println()
135 |     }
136 | 
137 |     sc.stop()
138 |   }
139 | 
140 |   private def randomVector(n: Int): RealVector =
141 |     new ArrayRealVector(Array.fill(n)(math.random))
142 | 
143 |   private def randomMatrix(rows: Int, cols: Int): RealMatrix =
144 |     new Array2DRowRealMatrix(Array.fill(rows, cols)(math.random))
145 | 
146 | }
147 | 


--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/SparkHdfsLR.scala:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
  3 |  * contributor license agreements.  See the NOTICE file distributed with
  4 |  * this work for additional information regarding copyright ownership.
  5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
  6 |  * (the "License"); you may not use this file except in compliance with
  7 |  * the License.  You may obtain a copy of the License at
  8 |  *
  9 |  *    http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  * Unless required by applicable law or agreed to in writing, software
 12 |  * distributed under the License is distributed on an "AS IS" BASIS,
 13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  * See the License for the specific language governing permissions and
 15 |  * limitations under the License.
 16 |  */
 17 | 
 18 | package org.apache.spark.examples
 19 | 
 20 | import java.util.Random
 21 | 
 22 | import scala.math.exp
 23 | 
 24 | import breeze.linalg.{Vector, DenseVector}
 25 | import org.apache.hadoop.conf.Configuration
 26 | 
 27 | import org.apache.spark._
 28 | import org.apache.spark.scheduler.InputFormatInfo
 29 | 
 30 | 
 31 | /**
 32 |  * Logistic regression based classification.
 33 |  *
 34 |  * This is an example implementation for learning how to use Spark. For more conventional use,
 35 |  * please refer to either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
 36 |  * org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS based on your needs.
 37 |  */
 38 | object SparkHdfsLR {
 39 |   val D = 10   // Numer of dimensions
 40 |   val rand = new Random(42)
 41 | 
 42 |   case class DataPoint(x: Vector[Double], y: Double)
 43 | 
 44 |   def parsePoint(line: String): DataPoint = {
 45 |     val tok = new java.util.StringTokenizer(line, " ")
 46 |     var y = tok.nextToken.toDouble
 47 |     var x = new Array[Double](D)
 48 |     var i = 0
 49 |     while (i < D) {
 50 |       x(i) = tok.nextToken.toDouble; i += 1
 51 |     }
 52 |     DataPoint(new DenseVector(x), y)
 53 |   }
 54 | 
 55 |   def showWarning() {
 56 |     System.err.println(
 57 |       """WARN: This is a naive implementation of Logistic Regression and is given as an example!
 58 |         |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
 59 |         |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
 60 |         |for more conventional use.
 61 |       """.stripMargin)
 62 |   }
 63 | 
 64 |   def main(args: Array[String]) {
 65 | 
 66 |     if (args.length < 2) {
 67 |       System.err.println("Usage: SparkHdfsLR <file> <iters>")
 68 |       System.exit(1)
 69 |     }
 70 | 
 71 |     showWarning()
 72 | 
 73 |     val sparkConf = new SparkConf().setAppName("SparkHdfsLR")
 74 |     val inputPath = args(0)
 75 |     val conf = new Configuration()
 76 |     val sc = new SparkContext(sparkConf,
 77 |       InputFormatInfo.computePreferredLocations(
 78 |         Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
 79 |       ))
 80 |     val lines = sc.textFile(inputPath)
 81 |     val points = lines.map(parsePoint _).cache()
 82 |     val ITERATIONS = args(1).toInt
 83 | 
 84 |     // Initialize w to a random value
 85 |     var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
 86 |     println("Initial w: " + w)
 87 | 
 88 |     for (i <- 1 to ITERATIONS) {
 89 |       println("On iteration " + i)
 90 |       val gradient = points.map { p =>
 91 |         p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
 92 |       }.reduce(_ + _)
 93 |       w -= gradient
 94 |     }
 95 | 
 96 |     println("Final w: " + w)
 97 |     sc.stop()
 98 |   }
 99 | }
100 | 


--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/SparkKMeans.scala:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
  3 |  * contributor license agreements.  See the NOTICE file distributed with
  4 |  * this work for additional information regarding copyright ownership.
  5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
  6 |  * (the "License"); you may not use this file except in compliance with
  7 |  * the License.  You may obtain a copy of the License at
  8 |  *
  9 |  *    http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  * Unless required by applicable law or agreed to in writing, software
 12 |  * distributed under the License is distributed on an "AS IS" BASIS,
 13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  * See the License for the specific language governing permissions and
 15 |  * limitations under the License.
 16 |  */
 17 | 
 18 | package org.apache.spark.examples
 19 | 
 20 | import breeze.linalg.{Vector, DenseVector, squaredDistance}
 21 | 
 22 | import org.apache.spark.{SparkConf, SparkContext}
 23 | import org.apache.spark.SparkContext._
 24 | 
 25 | /**
 26 |  * K-means clustering.
 27 |  *
 28 |  * This is an example implementation for learning how to use Spark. For more conventional use,
 29 |  * please refer to org.apache.spark.mllib.clustering.KMeans
 30 |  */
 31 | object SparkKMeans {
 32 | 
 33 |   def parseVector(line: String): Vector[Double] = {
 34 |     DenseVector(line.split(' ').map(_.toDouble))
 35 |   }
 36 | 
 37 |   def closestPoint(p: Vector[Double], centers: Array[Vector[Double]]): Int = {
 38 |     var bestIndex = 0
 39 |     var closest = Double.PositiveInfinity
 40 | 
 41 |     for (i <- 0 until centers.length) {
 42 |       val tempDist = squaredDistance(p, centers(i))
 43 |       if (tempDist < closest) {
 44 |         closest = tempDist
 45 |         bestIndex = i
 46 |       }
 47 |     }
 48 | 
 49 |     bestIndex
 50 |   }
 51 | 
 52 |   def showWarning() {
 53 |     System.err.println(
 54 |       """WARN: This is a naive implementation of KMeans Clustering and is given as an example!
 55 |         |Please use the KMeans method found in org.apache.spark.mllib.clustering
 56 |         |for more conventional use.
 57 |       """.stripMargin)
 58 |   }
 59 | 
 60 |   def main(args: Array[String]) {
 61 | 
 62 |     if (args.length < 3) {
 63 |       System.err.println("Usage: SparkKMeans <file> <k> <convergeDist>")
 64 |       System.exit(1)
 65 |     }
 66 | 
 67 |     showWarning()
 68 | 
 69 |     val sparkConf = new SparkConf().setAppName("SparkKMeans")
 70 |     val sc = new SparkContext(sparkConf)
 71 |     val lines = sc.textFile(args(0))
 72 |     val data = lines.map(parseVector _).cache()
 73 |     val K = args(1).toInt
 74 |     val convergeDist = args(2).toDouble
 75 | 
 76 |     val kPoints = data.takeSample(withReplacement = false, K, 42).toArray
 77 |     var tempDist = 1.0
 78 | 
 79 |     while(tempDist > convergeDist) {
 80 |       val closest = data.map (p => (closestPoint(p, kPoints), (p, 1)))
 81 | 
 82 |       val pointStats = closest.reduceByKey{case ((x1, y1), (x2, y2)) => (x1 + x2, y1 + y2)}
 83 | 
 84 |       val newPoints = pointStats.map {pair =>
 85 |         (pair._1, pair._2._1 * (1.0 / pair._2._2))}.collectAsMap()
 86 | 
 87 |       tempDist = 0.0
 88 |       for (i <- 0 until K) {
 89 |         tempDist += squaredDistance(kPoints(i), newPoints(i))
 90 |       }
 91 | 
 92 |       for (newP <- newPoints) {
 93 |         kPoints(newP._1) = newP._2
 94 |       }
 95 |       println("Finished iteration (delta = " + tempDist + ")")
 96 |     }
 97 | 
 98 |     println("Final centers:")
 99 |     kPoints.foreach(println)
100 |     sc.stop()
101 |   }
102 | }
103 | 


--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/SparkLR.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package org.apache.spark.examples
19 | 
20 | import java.util.Random
21 | 
22 | import scala.math.exp
23 | 
24 | import breeze.linalg.{Vector, DenseVector}
25 | 
26 | import org.apache.spark._
27 | 
/**
 * Logistic regression based classification on generated data, distributed with Spark.
 * Usage: SparkLR [slices]
 *
 * This is an example implementation for learning how to use Spark. For more conventional use,
 * please refer to either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
 * org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS based on your needs.
 */
object SparkLR {
  val N = 10000  // Number of data points
  val D = 10   // Number of dimensions
  val R = 0.7  // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  /** A labelled sample: feature vector `x` and label `y` (generated here as -1 or 1). */
  case class DataPoint(x: Vector[Double], y: Double)

  /**
   * Builds `N` synthetic points. Even indices get label -1 and odd indices label 1; each of
   * the `D` features is a Gaussian draw from `rand` shifted by `label * R`.
   */
  def generateData: Array[DataPoint] =
    Array.tabulate(N) { index =>
      val label = if (index % 2 == 0) -1 else 1
      val features = DenseVector.fill(D){rand.nextGaussian + label * R}
      DataPoint(features, label)
    }

  /** Prints a notice on stderr that this implementation is only an example. */
  def showWarning(): Unit = {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
        |for more conventional use.
      """.stripMargin)
  }

  /**
   * Parallelizes the generated points into `slices` partitions (default 2), caches them, and
   * applies `ITERATIONS` gradient steps to a randomly initialized weight vector.
   *
   * @param args optional single argument: number of slices
   */
  def main(args: Array[String]): Unit = {

    showWarning()

    val sparkConf = new SparkConf().setAppName("SparkLR")
    val sc = new SparkContext(sparkConf)
    val numSlices = if (args.length > 0) args(0).toInt else 2
    val points = sc.parallelize(generateData, numSlices).cache()

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println(s"Initial w: $w")

    (1 to ITERATIONS).foreach { iteration =>
      println(s"On iteration $iteration")
      val gradient = points.map { point =>
        // Per-point gradient: x * (sigmoid(y * w.x) - 1) * y
        val sigmoid = 1 / (1 + exp(-point.y * (w.dot(point.x))))
        val scaled = point.x * (sigmoid - 1)
        scaled * point.y
      }.reduce(_ + _)
      w -= gradient
    }

    println(s"Final w: $w")

    sc.stop()
  }
}
89 | 


--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/SparkPageRank.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package org.apache.spark.examples
19 | 
20 | import org.apache.spark.SparkContext._
21 | import org.apache.spark.{SparkConf, SparkContext}
22 | 
/**
 * Computes the PageRank of URLs from an input file. Input file should
 * be in format of:
 * URL         neighbor URL
 * URL         neighbor URL
 * URL         neighbor URL
 * ...
 * where URL and their neighbors are separated by space(s).
 *
 * The first argument is the input file; the second, optional argument is the
 * number of iterations (10 when omitted).
 *
 * This is an example implementation for learning how to use Spark. For more conventional use,
 * please refer to org.apache.spark.graphx.lib.PageRank
 */
object SparkPageRank {

  /** Prints a notice to stderr that this is a teaching example only. */
  def showWarning(): Unit = {
    System.err.println(
      """WARN: This is a naive implementation of PageRank and is given as an example!
        |Please use the PageRank implementation found in org.apache.spark.graphx.lib.PageRank
        |for more conventional use.
      """.stripMargin)
  }

  /**
   * Entry point.
   *
   * @param args args(0) is the input path; args(1), if present, is the iteration count
   */
  def main(args: Array[String]): Unit = {
    if (args.length < 1) {
      System.err.println("Usage: SparkPageRank <file> <iter>")
      System.exit(1)
    }

    showWarning()

    val sparkConf = new SparkConf().setAppName("PageRank")
    // Only read args(1) when a second argument was supplied; otherwise use the default of 10.
    val iters = if (args.length > 1) args(1).toInt else 10
    val ctx = new SparkContext(sparkConf)
    val lines = ctx.textFile(args(0), 1)

    // (url, neighbours) adjacency lists, de-duplicated and cached because every
    // iteration joins against them.
    val links = lines.map { s =>
      val parts = s.split("\\s+")
      (parts(0), parts(1))
    }.distinct().groupByKey().cache()

    // Every URL that has outgoing links starts with rank 1.0.
    var ranks = links.mapValues(v => 1.0)

    for (i <- 1 to iters) {
      // Each URL splits its current rank evenly among its neighbours.
      val contribs = links.join(ranks).values.flatMap { case (urls, rank) =>
        val size = urls.size
        urls.map(url => (url, rank / size))
      }
      ranks = contribs.reduceByKey(_ + _).mapValues(0.15 + 0.85 * _)
    }

    val output = ranks.collect()
    output.foreach(tup => println(tup._1 + " has rank: " + tup._2 + "."))

    ctx.stop()
  }
}
77 | 


--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/SparkPi.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package org.apache.spark.examples
19 | 
20 | import scala.math.random
21 | 
22 | import org.apache.spark._
23 | 
/**
 * Computes an approximation to pi by sampling random points in the square
 * [-1, 1) x [-1, 1) and counting how many fall inside the unit circle.
 * Usage: SparkPi [slices]
 */
object SparkPi {
  /**
   * Entry point.
   *
   * @param args optional first element: number of slices to parallelize over (default 2)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Spark Pi")
    val spark = new SparkContext(conf)
    val slices = if (args.length > 0) args(0).toInt else 2
    val n = math.min(100000L * slices, Int.MaxValue).toInt // avoid overflow
    val count = spark.parallelize(1 until n, slices).map { i =>
      val x = random * 2 - 1
      val y = random * 2 - 1
      if (x*x + y*y < 1) 1 else 0
    }.reduce(_ + _)
    // `1 until n` excludes n, so exactly n - 1 points were sampled; divide by that
    // count rather than n to avoid systematically under-estimating pi.
    println("Pi is roughly " + 4.0 * count / (n - 1))
    spark.stop()
  }
}
40 | 


--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/SparkTC.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package org.apache.spark.examples
19 | 
20 | import scala.util.Random
21 | import scala.collection.mutable
22 | 
23 | import org.apache.spark.{SparkConf, SparkContext}
24 | import org.apache.spark.SparkContext._
25 | 
/**
 * Transitive closure on a randomly generated directed graph.
 * Usage: SparkTC [slices]
 */
object SparkTC {
  val numEdges = 200
  val numVertices = 100
  val rand = new Random(42)

  /**
   * Draws random (from, to) vertex pairs until `numEdges` distinct edges have been
   * collected; pairs whose endpoints coincide are discarded.
   */
  def generateGraph = {
    val collected = mutable.Set.empty[(Int, Int)]
    while (collected.size < numEdges) {
      // Tuple elements are evaluated left to right: source first, then target.
      val candidate = (rand.nextInt(numVertices), rand.nextInt(numVertices))
      if (candidate._1 != candidate._2) collected += candidate
    }
    collected.toSeq
  }

  /**
   * Entry point.
   *
   * @param args optional first element: number of slices to parallelize over (default 2)
   */
  def main(args: Array[String]): Unit = {
    val spark = new SparkContext(new SparkConf().setAppName("SparkTC"))
    val slices = args.headOption.map(_.toInt).getOrElse(2)
    var tc = spark.parallelize(generateGraph, slices).cache()

    // Linear transitive closure: every round extends the known paths by one edge,
    // joining a path (y, z) from the closure with an edge (x, y) from the graph
    // to produce the path (x, z).

    // join() matches on keys, so keep the edges reversed as (target, source).
    val edges = tc.map { case (source, target) => (target, source) }

    // Repeat the join until a round adds no new paths.
    var previousCount = 0L
    var currentCount = tc.count()
    do {
      previousCount = currentCount
      // The join yields (y, (z, x)); project each match to the new path (x, z).
      val extended = tc.join(edges).map { case (_, (z, x)) => (x, z) }
      tc = tc.union(extended).distinct().cache()
      currentCount = tc.count()
    } while (currentCount != previousCount)

    println("TC has " + tc.count() + " edges.")
    spark.stop()
  }
}
73 | 


--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/SparkTachyonHdfsLR.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package org.apache.spark.examples
19 | 
20 | import java.util.Random
21 | 
22 | import scala.math.exp
23 | 
24 | import breeze.linalg.{Vector, DenseVector}
25 | import org.apache.hadoop.conf.Configuration
26 | 
27 | import org.apache.spark._
28 | import org.apache.spark.scheduler.InputFormatInfo
29 | import org.apache.spark.storage.StorageLevel
30 | 
31 | 
/**
 * Logistic regression based classification.
 * This example uses Tachyon to persist rdds during computation.
 * Usage: SparkTachyonHdfsLR <file> <iters>
 *
 * This is an example implementation for learning how to use Spark. For more conventional use,
 * please refer to either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
 * org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS based on your needs.
 */
object SparkTachyonHdfsLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  /** Prints a notice to stderr that this is a teaching example only. */
  def showWarning(): Unit = {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
        |for more conventional use.
      """.stripMargin)
  }

  /** A labelled sample: feature vector `x` and label `y`. */
  case class DataPoint(x: Vector[Double], y: Double)

  /**
   * Parses one space-separated input line: the first token is the label and the
   * next D tokens are the feature values.
   *
   * @param line a line holding at least D + 1 numeric tokens
   * @return the parsed data point
   */
  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    val y = tok.nextToken.toDouble
    // Array.fill evaluates its element expression D times in order, consuming D tokens.
    val x = Array.fill(D)(tok.nextToken.toDouble)
    DataPoint(new DenseVector(x), y)
  }

  /**
   * Entry point.
   *
   * @param args args(0) is the input path, args(1) is the number of iterations
   */
  def main(args: Array[String]): Unit = {
    // Both arguments are mandatory; report usage instead of failing on an array index.
    if (args.length < 2) {
      System.err.println("Usage: SparkTachyonHdfsLR <file> <iters>")
      System.exit(1)
    }

    showWarning()

    val inputPath = args(0)
    // Parse the iteration count before creating the SparkContext so that a malformed
    // value fails fast without leaving a context running.
    val iterations = args(1).toInt
    val sparkConf = new SparkConf().setAppName("SparkTachyonHdfsLR")
    val conf = new Configuration()
    val sc = new SparkContext(sparkConf,
      InputFormatInfo.computePreferredLocations(
        Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
      ))
    val lines = sc.textFile(inputPath)
    val points = lines.map(parsePoint).persist(StorageLevel.OFF_HEAP)

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to iterations) {
      println("On iteration " + i)
      // Sum the per-point gradient contributions over the whole data set.
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)
    sc.stop()
  }
}
97 | 


--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/SparkTachyonPi.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package org.apache.spark.examples
19 | 
20 | import scala.math.random
21 | 
22 | import org.apache.spark._
23 | import org.apache.spark.storage.StorageLevel
24 | 
/**
 *  Computes an approximation to pi
 *  This example uses Tachyon to persist rdds during computation.
 *  Usage: SparkTachyonPi [slices]
 */
object SparkTachyonPi {
  /**
   * Entry point.
   *
   * @param args optional first element: number of slices to parallelize over (default 2)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("SparkTachyonPi")
    val spark = new SparkContext(sparkConf)

    val slices = if (args.length > 0) args(0).toInt else 2
    // Multiply as Long and clamp to Int.MaxValue: a plain Int product silently wraps
    // around for large slice counts.
    val n = math.min(100000L * slices, Int.MaxValue).toInt

    val rdd = spark.parallelize(1 to n, slices)
    rdd.persist(StorageLevel.OFF_HEAP)
    val count = rdd.map { i =>
      val x = random * 2 - 1
      val y = random * 2 - 1
      if (x * x + y * y < 1) 1 else 0
    }.reduce(_ + _)
    // `1 to n` is inclusive, so exactly n points were sampled.
    println("Pi is roughly " + 4.0 * count / n)

    spark.stop()
  }
}
49 | 


--------------------------------------------------------------------------------
/src/test/java/com/javachen/spark/AppTest.java:
--------------------------------------------------------------------------------
 1 | package com.javachen.spark;
 2 | 
 3 | import junit.framework.Test;
 4 | import junit.framework.TestCase;
 5 | import junit.framework.TestSuite;
 6 | 
 7 | /**
 8 |  * Unit test for simple App.
 9 |  */
10 | public class AppTest
11 |         extends TestCase {
12 |     /**
13 |      * Create the test case
14 |      *
15 |      * @param testName name of the test case
16 |      */
17 |     public AppTest(String testName) {
18 |         super(testName);
19 |     }
20 | 
21 |     /**
22 |      * @return the suite of tests being tested
23 |      */
24 |     public static Test suite() {
25 |         return new TestSuite(AppTest.class);
26 |     }
27 | 
28 |     /**
29 |      * Rigourous Test :-)
30 |      */
31 |     public void testApp() {
32 |         assertTrue(true);
33 |     }
34 | }
35 | 


--------------------------------------------------------------------------------