├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   ├── main.workflow
│   └── workflows
│       ├── codacy-analysis.yml
│       └── scala.yml
├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── build.sbt
├── data
│   ├── capitals.parquet
│   │   ├── ._SUCCESS.crc
│   │   ├── ._common_metadata.crc
│   │   ├── ._metadata.crc
│   │   ├── .part-r-00000-a04af53d-dc52-452d-9807-e83325558a3e.gz.parquet.crc
│   │   ├── .part-r-00001-a04af53d-dc52-452d-9807-e83325558a3e.gz.parquet.crc
│   │   ├── _SUCCESS
│   │   ├── _common_metadata
│   │   ├── _metadata
│   │   ├── part-r-00000-a04af53d-dc52-452d-9807-e83325558a3e.gz.parquet
│   │   └── part-r-00001-a04af53d-dc52-452d-9807-e83325558a3e.gz.parquet
│   ├── countries-bbox.parquet
│   │   ├── ._SUCCESS.crc
│   │   ├── ._common_metadata.crc
│   │   ├── ._metadata.crc
│   │   ├── .part-r-00000-d0949c8a-70f4-4615-bb5e-b5010cb9490f.gz.parquet.crc
│   │   ├── .part-r-00001-d0949c8a-70f4-4615-bb5e-b5010cb9490f.gz.parquet.crc
│   │   ├── _SUCCESS
│   │   ├── _common_metadata
│   │   ├── _metadata
│   │   ├── part-r-00000-d0949c8a-70f4-4615-bb5e-b5010cb9490f.gz.parquet
│   │   └── part-r-00001-d0949c8a-70f4-4615-bb5e-b5010cb9490f.gz.parquet
│   ├── countries-poly.parquet
│   │   ├── ._SUCCESS.crc
│   │   ├── ._common_metadata.crc
│   │   ├── ._metadata.crc
│   │   ├── .part-r-00000-fbf168ec-2750-4aad-b42b-d89727782fd1.gz.parquet.crc
│   │   ├── .part-r-00001-fbf168ec-2750-4aad-b42b-d89727782fd1.gz.parquet.crc
│   │   ├── _SUCCESS
│   │   ├── _common_metadata
│   │   ├── _metadata
│   │   ├── part-r-00000-fbf168ec-2750-4aad-b42b-d89727782fd1.gz.parquet
│   │   └── part-r-00001-fbf168ec-2750-4aad-b42b-d89727782fd1.gz.parquet
│   └── world-cities-points.parquet
│       ├── ._SUCCESS.crc
│       ├── ._common_metadata.crc
│       ├── ._metadata.crc
│       ├── .part-r-00000-369d62a4-7282-4a38-933f-461a5a768c31.gz.parquet.crc
│       ├── .part-r-00001-369d62a4-7282-4a38-933f-461a5a768c31.gz.parquet.crc
│       ├── _SUCCESS
│       ├── _common_metadata
│       ├── _metadata
│       ├── part-r-00000-369d62a4-7282-4a38-933f-461a5a768c31.gz.parquet
│       └── part-r-00001-369d62a4-7282-4a38-933f-461a5a768c31.gz.parquet
├── deployToSonartype.md
├── docker-compose.yml
├── notebooks
│   ├── 2BXC9TF8J
│   │   └── note.json
│   └── 2BYXS4JRX
│       └── note.json
├── project
│   ├── build.properties
│   └── plugins.sbt
├── scalastyle-config.xml
├── scripts
│   ├── loadAlice.scala
│   ├── loadCities.scala
│   ├── loadH1BVisa.scala
│   ├── loadWords.scala
│   ├── recordLinkage
│   │   └── simpleExamples
│   │       ├── linkageFuzzyExample.scala
│   │       └── linkagePrefixExample.scala
│   └── spatial
│       ├── loadSolrSpatialData.scala
│       └── loadSwissCities.scala
├── spark-shell.sh
├── src
│   ├── main
│   │   ├── resources
│   │   │   └── reference.conf
│   │   └── scala
│   │       └── org
│   │           └── zouzias
│   │               └── spark
│   │                   └── lucenerdd
│   │                       ├── LuceneRDD.scala
│   │                       ├── LuceneRDDKryoRegistrator.scala
│   │                       ├── aggregate
│   │                       │   └── SparkFacetResultMonoid.scala
│   │                       ├── analyzers
│   │                       │   └── AnalyzerConfigurable.scala
│   │                       ├── config
│   │                       │   ├── Configurable.scala
│   │                       │   ├── LuceneRDDConfigurable.scala
│   │                       │   ├── LuceneRDDParams.scala
│   │                       │   └── ShapeLuceneRDDConfigurable.scala
│   │                       ├── facets
│   │                       │   ├── FacetedLuceneRDD.scala
│   │                       │   └── package.scala
│   │                       ├── matrices
│   │                       │   └── TermDocMatrix.scala
│   │                       ├── models
│   │                       │   ├── SparkFacetResult.scala
│   │                       │   ├── SparkScoreDoc.scala
│   │                       │   ├── TermVectorEntry.scala
│   │                       │   └── indexstats
│   │                       │       ├── FieldStatistics.scala
│   │                       │       └── IndexStatistics.scala
│   │                       ├── package.scala
│   │                       ├── partition
│   │                       │   ├── AbstractLuceneRDDPartition.scala
│   │                       │   └── LuceneRDDPartition.scala
│   │                       ├── query
│   │                       │   ├── LuceneQueryHelpers.scala
│   │                       │   └── SimilarityConfigurable.scala
│   │                       ├── response
│   │                       │   ├── FieldType.scala
│   │                       │   ├── LuceneRDDResponse.scala
│   │                       │   └── LuceneRDDResponsePartition.scala
│   │                       ├── spatial
│   │                       │   └── shape
│   │                       │       ├── ShapeLuceneRDD.scala
│   │                       │       ├── ShapeLuceneRDDKryoRegistrator.scala
│   │                       │       ├── context
│   │                       │       │   └── ContextLoader.scala
│   │                       │       ├── grids
│   │                       │       │   └── PrefixTreeLoader.scala
│   │                       │       ├── package.scala
│   │                       │       ├── partition
│   │                       │       │   ├── AbstractShapeLuceneRDDPartition.scala
│   │                       │       │   └── ShapeLuceneRDDPartition.scala
│   │                       │       └── strategies
│   │                       │           └── SpatialStrategy.scala
│   │                       ├── store
│   │                       │   ├── IndexStorable.scala
│   │                       │   └── IndexWithTaxonomyWriter.scala
│   │                       ├── testing
│   │                       │   ├── FavoriteCaseClass.scala
│   │                       │   └── Person.scala
│   │                       └── versioning
│   │                           └── Versionable.scala
│   └── test
│       ├── resources
│       │   ├── alice.txt
│       │   ├── capitals.txt
│       │   ├── cities.txt
│       │   ├── countries.geo.json
│       │   ├── countries.txt
│       │   ├── country-list.csv
│       │   ├── log4j.properties
│       │   ├── reference.conf
│       │   ├── spatial
│       │   │   └── CH.txt
│       │   └── words.txt
│       └── scala
│           └── org
│               └── zouzias
│                   └── spark
│                       └── lucenerdd
│                           ├── BlockingDedupSpec.scala
│                           ├── BlockingLinkageSpec.scala
│                           ├── LuceneDocToSparkRowpec.scala
│                           ├── LucenePrimitiveTypesSpec.scala
│                           ├── LuceneRDDCustomCaseClassImplicitsSpec.scala
│                           ├── LuceneRDDDataFrameImplicitsSpec.scala
│                           ├── LuceneRDDMoreLikeThisSpec.scala
│                           ├── LuceneRDDRecordLinkageSpec.scala
│                           ├── LuceneRDDSearchSpec.scala
│                           ├── LuceneRDDSpec.scala
│                           ├── LuceneRDDTermVectorsSpec.scala
│                           ├── LuceneRDDTuplesSpec.scala
│                           ├── analyzers
│                           │   └── AnalyzersConfigurableSpec.scala
│                           ├── facets
│                           │   ├── FacetedLuceneRDDFacetSpec.scala
│                           │   └── FacetedLuceneRDDImplicitsSpec.scala
│                           ├── query
│                           │   └── LuceneQueryHelpersSpec.scala
│                           ├── response
│                           │   └── LuceneRDDResponseSpec.scala
│                           ├── spatial
│                           │   └── shape
│                           │       ├── ShapeLuceneRDDKnnSearchSpec.scala
│                           │       ├── ShapeLuceneRDDLinkageSpec.scala
│                           │       ├── ShapeLuceneRDDSpatialSearchSpec.scala
│                           │       ├── ShapeLuceneRDDSpec.scala
│                           │       └── implicits
│                           │           └── ShapeLuceneRDDImplicitsSpec.scala
│                           └── testing
│                               └── LuceneRDDTestUtils.scala
├── startZeppelin.sh
└── version.sbt
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 |
5 | ---
6 |
7 | **Describe the bug**
8 | A clear and concise description of what the bug is.
9 |
10 | **To Reproduce**
11 | Steps to reproduce the behavior (code snippet)
12 |
13 |
14 | **Expected behavior**
15 | A clear and concise description of what you expected to happen.
16 |
17 |
18 | **Versions (please complete the following information):**
19 | - spark-lucenerdd version: [e.g. 0.3.0]
20 | - SBT version: [e.g. 1.2.3]
21 | - Spark Version: [e.g. 2.3.2]
22 | - Java version: [e.g. Java 8]
23 |
24 |
25 | **Additional context**
26 | Add any other context about the problem here.
27 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 |
5 | ---
6 |
7 | **Is your feature request related to a problem? Please describe.**
8 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
9 |
10 | **Describe the solution you'd like**
11 | A clear and concise description of what you want to happen.
12 |
13 | **Describe alternatives you've considered**
14 | A clear and concise description of any alternative solutions or features you've considered.
15 |
16 | **Additional context**
17 | Add any other context or screenshots about the feature request here.
18 |
--------------------------------------------------------------------------------
/.github/main.workflow:
--------------------------------------------------------------------------------
1 | workflow "New workflow" {
2 | on = "push"
3 | }
4 |
--------------------------------------------------------------------------------
/.github/workflows/codacy-analysis.yml:
--------------------------------------------------------------------------------
1 | # This workflow checks out code, performs a Codacy security scan
2 | # and integrates the results with the
3 | # GitHub Advanced Security code scanning feature. For more information on
4 | # the Codacy security scan action usage and parameters, see
5 | # https://github.com/codacy/codacy-analysis-cli-action.
6 | # For more information on Codacy Analysis CLI in general, see
7 | # https://github.com/codacy/codacy-analysis-cli.
8 |
9 | name: Codacy Security Scan
10 |
11 | on:
12 | push:
13 | branches: [ master, BRANCH-0.1.x, BRANCH-0.2.x, develop ]
14 | pull_request:
15 | # The branches below must be a subset of the branches above
16 | branches: [ master ]
17 | schedule:
18 | - cron: '36 23 * * 5'
19 |
20 | jobs:
21 | codacy-security-scan:
22 | name: Codacy Security Scan
23 | runs-on: ubuntu-latest
24 | steps:
25 | # Checkout the repository to the GitHub Actions runner
26 | - name: Checkout code
27 | uses: actions/checkout@v2
28 |
29 | # Execute Codacy Analysis CLI and generate a SARIF output with the security issues identified during the analysis
30 | - name: Run Codacy Analysis CLI
31 | uses: codacy/codacy-analysis-cli-action@1.1.0
32 | with:
33 | # Check https://github.com/codacy/codacy-analysis-cli#project-token to get your project token from your Codacy repository
34 | # You can also omit the token and run the tools that support default configurations
35 | project-token: ${{ secrets.CODACY_PROJECT_TOKEN }}
36 | verbose: true
37 | output: results.sarif
38 | format: sarif
39 | # Adjust severity of non-security issues
40 | gh-code-scanning-compat: true
41 | # Force 0 exit code to allow SARIF file generation
42 | # This will handover control about PR rejection to the GitHub side
43 | max-allowed-issues: 2147483647
44 |
45 | # Upload the SARIF file generated in the previous step
46 | - name: Upload SARIF results file
47 | uses: github/codeql-action/upload-sarif@v1
48 | with:
49 | sarif_file: results.sarif
50 |
--------------------------------------------------------------------------------
/.github/workflows/scala.yml:
--------------------------------------------------------------------------------
1 | name: Scala CI
2 |
3 | on: [push]
4 |
5 | jobs:
6 | build:
7 |
8 | runs-on: ubuntu-latest
9 |
10 | steps:
11 | - uses: actions/checkout@v1
12 | - name: Set up JDK 11
13 | uses: actions/setup-java@v1
14 | with:
15 | java-version: 11
16 | - name: Run tests
17 | run: sbt test
18 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | commons-csv-1.1.jar
2 | spark-csv_2.11-1.4.0.jar
3 | spark-csv_2.10-1.4.0.jar
4 | src/test/resources/h1bvisa-2014.csv
5 |
6 | NOTES.md
7 | metastore_db/
8 | .idea/
9 | *.class
10 | *.log
11 |
12 | # sbt specific
13 | .cache
14 | .history
15 | .lib/
16 | dist/*
17 | target/
18 | lib_managed/
19 | src_managed/
20 | project/boot/
21 | project/plugins/project/
22 |
23 | # Scala-IDE specific
24 | .scala_dependencies
25 | .worksheet
26 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: scala
2 | scala:
3 | - 2.12.10
4 | sudo: false
5 | dist: trusty
6 | cache:
7 | directories:
8 | - $HOME/.sbt/0.13/dependency
9 | - $HOME/.sbt/boot/scala*
10 | - $HOME/.sbt/launchers
11 | - $HOME/.ivy2/cache
12 | before_cache:
13 | - du -h -d 1 $HOME/.ivy2/cache
14 | - du -h -d 2 $HOME/.sbt/
15 | - find $HOME/.sbt -name "*.lock" -type f -delete
16 | - find $HOME/.ivy2/cache -name "ivydata-*.properties" -type f -delete
17 | matrix:
18 | include:
19 | - jdk: oraclejdk8
20 | env: LUCENERDD_ANALYZER_NAME="en" LUCENERDD_LINKER_METHOD="cartesian"
21 | - jdk: openjdk8
22 | env: LUCENERDD_ANALYZER_NAME="en" LUCENERDD_LINKER_METHOD="collectbroadcast"
23 | - jdk: openjdk8
24 | env: LUCENERDD_ANALYZER_NAME="whitespace" LUCENERDD_LINKER_METHOD="cartesian"
25 | - jdk: oraclejdk8
26 | env: LUCENERDD_ANALYZER_NAME="whitespace" LUCENERDD_LINKER_METHOD="collectbroadcast"
27 | script:
28 | - sbt ++$TRAVIS_SCALA_VERSION -Dlucenerdd.spatial.linker.method=${LUCENERDD_LINKER_METHOD} clean update test
29 | - sbt ++$TRAVIS_SCALA_VERSION scalastyle
30 | - sbt ++$TRAVIS_SCALA_VERSION assembly
31 | - travis_wait 30 sbt ++$TRAVIS_SCALA_VERSION clean coverage test coverageReport
32 | after_success:
33 | - bash <(curl -s https://codecov.io/bash)
34 |
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | name := "spark-lucenerdd"
19 | organization := "org.zouzias"
20 | scalaVersion := "2.12.19"
21 | crossScalaVersions := Seq("2.12.19")
22 | licenses := Seq("Apache-2.0" -> url("http://www.apache.org/licenses/LICENSE-2.0.html"))
23 | homepage := Some(url("https://github.com/zouzias/spark-lucenerdd"))
24 |
25 | scalacOptions ++= Seq("-deprecation",
26 | "-encoding", "UTF-8",
27 | "-feature",
28 | "-unchecked",
29 | "-Xlint",
30 | "-Yno-adapted-args",
31 | "-Ywarn-dead-code",
32 | "-Ywarn-numeric-widen",
33 | "-Ywarn-value-discard",
34 | "-language:implicitConversions")
35 |
36 | javacOptions ++= Seq("-Xlint",
37 | "-Xms512M",
38 | "-Xmx2048M",
39 | "-XX:MaxPermSize=2048M",
40 | "-XX:+CMSClassUnloadingEnabled"
41 | )
42 |
43 | // Add jcenter repo
44 | resolvers += Resolver.jcenterRepo
45 | resolvers += "Apache Repos" at "https://repository.apache.org/content/repositories/releases"
46 |
47 | releaseCrossBuild := false
48 | releasePublishArtifactsAction := PgpKeys.publishSigned.value
49 |
50 | publishMavenStyle := true
51 |
52 | sonatypeProfileName := "org.zouzias"
53 |
54 | publishTo := {
55 | val nexus = "https://oss.sonatype.org/"
56 | if (isSnapshot.value) {
57 | Some("snapshots" at nexus + "content/repositories/snapshots")
58 | }
59 | else {
60 | Some("releases" at nexus + "service/local/staging/deploy/maven2")
61 | }
62 | }
63 |
64 | Test / publishArtifact := false
65 |
66 | pomIncludeRepository := { _ => false }
67 |
68 | pomExtra :=
69 |   <scm>
70 |     <url>git@github.com:zouzias/spark-lucenerdd.git</url>
71 |     <connection>scm:git:git@github.com:zouzias/spark-lucenerdd.git</connection>
72 |   </scm>
73 |   <developers>
74 |     <developer>
75 |       <id>zouzias</id>
76 |       <name>Anastasios Zouzias</name>
77 |       <url>https://github.com/zouzias</url>
78 |     </developer>
79 |   </developers>
80 | val luceneV = "8.11.3"
81 | val sparkVersion = "3.5.6"
82 |
83 | credentials += Credentials(Path.userHome / ".sbt" / ".credentials")
84 |
85 |
86 | // scalastyle:off
87 |
88 | val scalactic = "org.scalactic" %% "scalactic" % "3.2.19"
89 | val scalatest = "org.scalatest" %% "scalatest" % "3.2.19" % "test"
90 |
91 |
92 | val joda_time = "joda-time" % "joda-time" % "2.12.7"
93 | val algebird = "com.twitter" %% "algebird-core" % "0.13.10"
94 | val joda_convert = "org.joda" % "joda-convert" % "2.2.3"
95 | val spatial4j = "org.locationtech.spatial4j" % "spatial4j" % "0.8"
96 |
97 | val typesafe_config = "com.typesafe" % "config" % "1.3.4"
98 |
99 | val lucene_facet = "org.apache.lucene" % "lucene-facet" % luceneV
100 | val lucene_analyzers = "org.apache.lucene" % "lucene-analyzers-common" % luceneV
101 | val lucene_query_parsers = "org.apache.lucene" % "lucene-queryparser" % luceneV
102 | val lucene_expressions = "org.apache.lucene" % "lucene-expressions" % luceneV
103 | val lucene_spatial_extras = "org.apache.lucene" % "lucene-spatial-extras" % luceneV
104 |
105 | val jts = "org.locationtech.jts" % "jts-core" % "1.19.0"
106 | // scalastyle:on
107 |
108 |
109 | libraryDependencies ++= Seq(
110 | algebird,
111 | lucene_facet,
112 | lucene_analyzers,
113 | lucene_expressions,
114 | lucene_query_parsers,
115 | typesafe_config,
116 | lucene_spatial_extras,
117 | spatial4j,
118 | jts,
119 | joda_time,
120 | joda_convert, // To avoid warning: Class org.joda.convert.ToString not found
121 | scalactic, // scalactic is recommended, see http://www.scalatest.org/install
122 | scalatest
123 | )
124 |
125 | libraryDependencies ++= Seq(
126 | "org.apache.spark" %% "spark-core" % sparkVersion % "provided",
127 | "org.apache.spark" %% "spark-sql" % sparkVersion % "provided",
128 | "org.apache.spark" %% "spark-mllib" % sparkVersion % "provided",
129 | "com.holdenkarau" %% "spark-testing-base" % s"3.5.1_1.5.3" % "test" intransitive(),
130 | "org.scala-lang" % "scala-library" % scalaVersion.value % "compile"
131 | )
132 |
133 | // Read version in code from build.sbt
134 | lazy val root = (project in file(".")).
135 | enablePlugins(BuildInfoPlugin).
136 | settings(
137 | buildInfoKeys := Seq[BuildInfoKey](name, version, scalaVersion, sbtVersion),
138 | // See https://github.com/sbt/sbt-buildinfo#buildinfooptionbuildtime
139 | buildInfoOptions += BuildInfoOption.BuildTime,
140 | // https://github.com/sbt/sbt-buildinfo#buildinfooptiontomap
141 | buildInfoOptions += BuildInfoOption.ToMap,
142 | buildInfoPackage := "org.zouzias.spark.lucenerdd"
143 | )
144 |
145 | lazy val compileScalastyle = taskKey[Unit]("compileScalastyle")
146 | compileScalastyle := (Compile / scalastyle).toTask("").value
147 | (Compile / compile) := ((Compile / compile) dependsOn compileScalastyle).value
148 |
149 | Test / parallelExecution := false
150 |
151 | // Skip tests during assembly
152 | assembly / test := {}
153 |
154 | // To avoid merge issues
155 | assembly / assemblyMergeStrategy := {
156 | case PathList("module-info.class", xs @ _*) => MergeStrategy.first
157 | case PathList("META-INF", "MANIFEST.MF") => MergeStrategy.discard
158 | case x =>
159 | val oldStrategy = (assembly / assemblyMergeStrategy).value
160 | oldStrategy(x)
161 | }
162 |
--------------------------------------------------------------------------------
/data/capitals.parquet/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc
--------------------------------------------------------------------------------
/data/capitals.parquet/._common_metadata.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/capitals.parquet/._common_metadata.crc
--------------------------------------------------------------------------------
/data/capitals.parquet/._metadata.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/capitals.parquet/._metadata.crc
--------------------------------------------------------------------------------
/data/capitals.parquet/.part-r-00000-a04af53d-dc52-452d-9807-e83325558a3e.gz.parquet.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/capitals.parquet/.part-r-00000-a04af53d-dc52-452d-9807-e83325558a3e.gz.parquet.crc
--------------------------------------------------------------------------------
/data/capitals.parquet/.part-r-00001-a04af53d-dc52-452d-9807-e83325558a3e.gz.parquet.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/capitals.parquet/.part-r-00001-a04af53d-dc52-452d-9807-e83325558a3e.gz.parquet.crc
--------------------------------------------------------------------------------
/data/capitals.parquet/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/capitals.parquet/_SUCCESS
--------------------------------------------------------------------------------
/data/capitals.parquet/_common_metadata:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/capitals.parquet/_common_metadata
--------------------------------------------------------------------------------
/data/capitals.parquet/_metadata:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/capitals.parquet/_metadata
--------------------------------------------------------------------------------
/data/capitals.parquet/part-r-00000-a04af53d-dc52-452d-9807-e83325558a3e.gz.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/capitals.parquet/part-r-00000-a04af53d-dc52-452d-9807-e83325558a3e.gz.parquet
--------------------------------------------------------------------------------
/data/capitals.parquet/part-r-00001-a04af53d-dc52-452d-9807-e83325558a3e.gz.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/capitals.parquet/part-r-00001-a04af53d-dc52-452d-9807-e83325558a3e.gz.parquet
--------------------------------------------------------------------------------
/data/countries-bbox.parquet/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc
--------------------------------------------------------------------------------
/data/countries-bbox.parquet/._common_metadata.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-bbox.parquet/._common_metadata.crc
--------------------------------------------------------------------------------
/data/countries-bbox.parquet/._metadata.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-bbox.parquet/._metadata.crc
--------------------------------------------------------------------------------
/data/countries-bbox.parquet/.part-r-00000-d0949c8a-70f4-4615-bb5e-b5010cb9490f.gz.parquet.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-bbox.parquet/.part-r-00000-d0949c8a-70f4-4615-bb5e-b5010cb9490f.gz.parquet.crc
--------------------------------------------------------------------------------
/data/countries-bbox.parquet/.part-r-00001-d0949c8a-70f4-4615-bb5e-b5010cb9490f.gz.parquet.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-bbox.parquet/.part-r-00001-d0949c8a-70f4-4615-bb5e-b5010cb9490f.gz.parquet.crc
--------------------------------------------------------------------------------
/data/countries-bbox.parquet/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-bbox.parquet/_SUCCESS
--------------------------------------------------------------------------------
/data/countries-bbox.parquet/_common_metadata:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-bbox.parquet/_common_metadata
--------------------------------------------------------------------------------
/data/countries-bbox.parquet/_metadata:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-bbox.parquet/_metadata
--------------------------------------------------------------------------------
/data/countries-bbox.parquet/part-r-00000-d0949c8a-70f4-4615-bb5e-b5010cb9490f.gz.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-bbox.parquet/part-r-00000-d0949c8a-70f4-4615-bb5e-b5010cb9490f.gz.parquet
--------------------------------------------------------------------------------
/data/countries-bbox.parquet/part-r-00001-d0949c8a-70f4-4615-bb5e-b5010cb9490f.gz.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-bbox.parquet/part-r-00001-d0949c8a-70f4-4615-bb5e-b5010cb9490f.gz.parquet
--------------------------------------------------------------------------------
/data/countries-poly.parquet/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc
--------------------------------------------------------------------------------
/data/countries-poly.parquet/._common_metadata.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-poly.parquet/._common_metadata.crc
--------------------------------------------------------------------------------
/data/countries-poly.parquet/._metadata.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-poly.parquet/._metadata.crc
--------------------------------------------------------------------------------
/data/countries-poly.parquet/.part-r-00000-fbf168ec-2750-4aad-b42b-d89727782fd1.gz.parquet.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-poly.parquet/.part-r-00000-fbf168ec-2750-4aad-b42b-d89727782fd1.gz.parquet.crc
--------------------------------------------------------------------------------
/data/countries-poly.parquet/.part-r-00001-fbf168ec-2750-4aad-b42b-d89727782fd1.gz.parquet.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-poly.parquet/.part-r-00001-fbf168ec-2750-4aad-b42b-d89727782fd1.gz.parquet.crc
--------------------------------------------------------------------------------
/data/countries-poly.parquet/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-poly.parquet/_SUCCESS
--------------------------------------------------------------------------------
/data/countries-poly.parquet/_common_metadata:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-poly.parquet/_common_metadata
--------------------------------------------------------------------------------
/data/countries-poly.parquet/_metadata:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-poly.parquet/_metadata
--------------------------------------------------------------------------------
/data/countries-poly.parquet/part-r-00000-fbf168ec-2750-4aad-b42b-d89727782fd1.gz.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-poly.parquet/part-r-00000-fbf168ec-2750-4aad-b42b-d89727782fd1.gz.parquet
--------------------------------------------------------------------------------
/data/countries-poly.parquet/part-r-00001-fbf168ec-2750-4aad-b42b-d89727782fd1.gz.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-poly.parquet/part-r-00001-fbf168ec-2750-4aad-b42b-d89727782fd1.gz.parquet
--------------------------------------------------------------------------------
/data/world-cities-points.parquet/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc
--------------------------------------------------------------------------------
/data/world-cities-points.parquet/._common_metadata.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/world-cities-points.parquet/._common_metadata.crc
--------------------------------------------------------------------------------
/data/world-cities-points.parquet/._metadata.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/world-cities-points.parquet/._metadata.crc
--------------------------------------------------------------------------------
/data/world-cities-points.parquet/.part-r-00000-369d62a4-7282-4a38-933f-461a5a768c31.gz.parquet.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/world-cities-points.parquet/.part-r-00000-369d62a4-7282-4a38-933f-461a5a768c31.gz.parquet.crc
--------------------------------------------------------------------------------
/data/world-cities-points.parquet/.part-r-00001-369d62a4-7282-4a38-933f-461a5a768c31.gz.parquet.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/world-cities-points.parquet/.part-r-00001-369d62a4-7282-4a38-933f-461a5a768c31.gz.parquet.crc
--------------------------------------------------------------------------------
/data/world-cities-points.parquet/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/world-cities-points.parquet/_SUCCESS
--------------------------------------------------------------------------------
/data/world-cities-points.parquet/_common_metadata:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/world-cities-points.parquet/_common_metadata
--------------------------------------------------------------------------------
/data/world-cities-points.parquet/_metadata:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/world-cities-points.parquet/_metadata
--------------------------------------------------------------------------------
/data/world-cities-points.parquet/part-r-00000-369d62a4-7282-4a38-933f-461a5a768c31.gz.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/world-cities-points.parquet/part-r-00000-369d62a4-7282-4a38-933f-461a5a768c31.gz.parquet
--------------------------------------------------------------------------------
/data/world-cities-points.parquet/part-r-00001-369d62a4-7282-4a38-933f-461a5a768c31.gz.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/world-cities-points.parquet/part-r-00001-369d62a4-7282-4a38-933f-461a5a768c31.gz.parquet
--------------------------------------------------------------------------------
/deployToSonartype.md:
--------------------------------------------------------------------------------
1 | ## Setup
2 |
3 | Add a `.credentials` file under the `~/.sbt/` folder with the following contents:
4 |
5 | ```
6 | realm=Sonatype Nexus Repository Manager
7 | host=oss.sonatype.org
8 | user=(USERNAME)
9 | password=(PASSWORD_HERE)
10 | ```
11 |
12 | ## Release signed artifacts
13 |
14 | ```bash
15 | sbt release
16 | ```
17 |
18 | Then, `git checkout` the release tag `v0.X.X` and type
19 |
20 | ```bash
21 | sbt sonatypeRelease
22 | ```
23 |
24 | This allows Sonatype to release the artifacts to Maven Central.
25 | Alternatively, browse to https://oss.sonatype.org and perform the release manually.
26 |
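27 | Note: `build.sbt` reads these credentials via
28 | `Credentials(Path.userHome / ".sbt" / ".credentials")`, so the file name and
29 | location above must match exactly.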
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | zeppelin:
2 | image: dylanmei/zeppelin
3 | environment:
4 | ZEPPELIN_PORT: 8080
5 | ZEPPELIN_JAVA_OPTS: >-
6 | -Dspark.driver.memory=1g
7 | -Dspark.executor.memory=1g
8 | MASTER: local[*]
9 | ports:
10 | - 8080:8080
11 | volumes:
12 | - ./data:/usr/zeppelin/data
13 | - ./notebooks:/usr/zeppelin/notebook
14 |
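15 | # After `docker-compose up`, Zeppelin listens on http://localhost:8080 with
16 | # ./data and ./notebooks mounted into the container.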
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version=1.10.11
2 |
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | resolvers += "bintray-spark-packages" at "https://dl.bintray.com/spark-packages/maven/"
19 |
20 | addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.12.0")
21 |
22 | addSbtPlugin("com.timushev.sbt" % "sbt-updates" % "0.6.4")
23 |
24 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.2.0")
25 |
26 | addSbtPlugin("com.github.sbt" % "sbt-release" % "1.4.0")
27 |
28 | addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0")
29 |
30 | addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.9.3")
31 |
32 | addSbtPlugin("org.scoverage" % "sbt-coveralls" % "1.3.15")
33 |
34 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.11.3")
35 |
36 | addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.2.1")
37 |
--------------------------------------------------------------------------------
/scripts/loadAlice.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | import scala.io.Source
19 | import org.zouzias.spark.lucenerdd._
20 | import org.zouzias.spark.lucenerdd.LuceneRDD
21 | val words = Source.fromFile("src/test/resources/alice.txt").getLines().map(_.trim.toLowerCase).filter(_.length > 3).toSeq
22 | val rdd = sc.parallelize(words)
23 | val luceneRDD = LuceneRDD(rdd)
24 | luceneRDD.cache
25 | luceneRDD.count
26 |
27 |
28 | luceneRDD.moreLikeThis("_1", "alice adventures wonderland", 1, 1, 20).take(20).foreach(println)
29 |
30 | import org.zouzias.spark.lucenerdd.matrices.TermDocMatrix
31 |
32 |
33 | // Construct the term-document matrix
34 | val terms = luceneRDD.termVectors("_1") // _1 is the default field name
35 | val mat = new TermDocMatrix(terms)
36 |
37 |
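38 | // Optional sanity check (a sketch): a term query on the default "_1" field
39 | // should match indexed words, e.g. "alice"; same termQuery usage as in
40 | // scripts/loadCities.scala.
41 | luceneRDD.termQuery("_1", "alice").take(5).foreach(println)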
--------------------------------------------------------------------------------
/scripts/loadCities.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | sc.setLogLevel("INFO")
19 |
20 | import scala.io.Source
21 | import org.zouzias.spark.lucenerdd.partition.LuceneRDDPartition
22 | import org.zouzias.spark.lucenerdd._
23 | import org.zouzias.spark.lucenerdd.LuceneRDD
24 |
25 | val cities = Source.fromFile("src/test/resources/cities.txt").getLines().toSeq
26 | val rdd = sc.parallelize(cities)
27 | val luceneRDD = LuceneRDD(rdd)
28 | luceneRDD.cache
29 | luceneRDD.count
30 |
31 | println("=" * 20)
32 | luceneRDD.termQuery("_1", "toronto").take(10)
33 |
34 | println("=" * 20)
35 | luceneRDD.termQuery("_1", "athens").take(10)
36 |
37 | println("=" * 20)
38 | luceneRDD.termQuery("_1", "bern").take(10)
39 |
40 | println("=" * 20)
41 | luceneRDD.termQuery("_1", "madrid").take(10)
42 |
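43 | // Beyond exact term queries, LuceneRDD also exposes prefix and fuzzy queries
44 | // (exercised in LuceneRDDSearchSpec); the exact signatures below are assumed
45 | // here for illustration.
46 | println("=" * 20)
47 | luceneRDD.prefixQuery("_1", "tor").take(10)
48 | luceneRDD.fuzzyQuery("_1", "athens", 1).take(10)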
--------------------------------------------------------------------------------
/scripts/loadH1BVisa.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | import org.zouzias.spark.lucenerdd._
19 | import org.zouzias.spark.lucenerdd.LuceneRDD
20 |
21 | val df = spark.sqlContext.read.format("com.databricks.spark.csv").option("header", "true").option("inferSchema", "true").load("src/test/resources/h1bvisa-2014.csv")
22 | val words = df.select("lca_case_employer_name", "lca_case_job_title", "lca_case_employer_city", "lca_case_employer_state", "lca_case_employer_postal_code")
23 | val luceneRDD = LuceneRDD(words.sample(true, 0.01))
24 | luceneRDD.count
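25 |
26 | // Follow-up sketch: DataFrame columns become Lucene fields named after the
27 | // columns, so employers could be searched by field; the query term below is
28 | // a hypothetical example.
29 | luceneRDD.termQuery("lca_case_employer_name", "google").take(10)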
--------------------------------------------------------------------------------
/scripts/loadWords.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | import scala.io.Source
19 | import org.zouzias.spark.lucenerdd._
20 | import org.zouzias.spark.lucenerdd.LuceneRDD
21 | val words = Source.fromFile("src/test/resources/words.txt").getLines().toSeq
22 | val rdd = sc.parallelize(words)
23 | val luceneRDD = LuceneRDD(rdd)
24 | luceneRDD.count
25 |
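26 | // Sketch of a follow-up query: "_1" is the default field name for
27 | // RDD[String] input (same termQuery usage as scripts/loadCities.scala);
28 | // the query term is an arbitrary example.
29 | luceneRDD.termQuery("_1", "hello").take(10).foreach(println)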
--------------------------------------------------------------------------------
/scripts/recordLinkage/simpleExamples/linkageFuzzyExample.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | import scala.io.Source
19 | import org.apache.spark.rdd.RDD
20 | import org.zouzias.spark.lucenerdd._
21 | import org.zouzias.spark.lucenerdd.LuceneRDD
22 | import org.zouzias.spark.lucenerdd.models.SparkScoreDoc
23 |
24 | // Step 1: Query prefixes of countries
25 | // Shooting for Greece, Germany, Spain and Italy
26 | val leftCountries = Array("gree", "germa", "spa", "ita")
27 | val leftCountriesRDD: RDD[String] = sc.parallelize(leftCountries)
28 |
29 | // Step 2: Load all country names
30 | val countries = sc.parallelize(Source.fromFile("src/test/resources/countries.txt").getLines()
31 | .map(_.toLowerCase()).toSeq)
32 | val luceneRDD = LuceneRDD(countries)
33 | luceneRDD.cache()
34 |
35 | // Step 3: Define your linkage function (fuzzy)
36 | def fuzzyLinker(country: String): String = {
37 | val Fuzziness = 2
38 | s"_1:${country}~${Fuzziness}"
39 | }
40 |
41 | // Step 4: Perform the linkage
42 | val linked: RDD[(String, Array[SparkScoreDoc])] = luceneRDD.link(leftCountriesRDD, fuzzyLinker, 10)
43 |
44 | // Step 5: View the results
45 | linked.foreach(x => println((x._1, x._2.mkString(","))))
46 |
47 | // (spa,List(SparkScoreDoc(5.1271343,84,0,Text fields:_1:[spain])))
48 | // (gree,List(SparkScoreDoc(5.1271343,86,0,Text fields:_1:[greece])))
49 | // (germa,List(SparkScoreDoc(5.127134,83,0,Text fields:_1:[germany])))
50 | // (ita,List(SparkScoreDoc(2.9601524,106,0,Text fields:_1:[italy]), SparkScoreDoc(2.9601524,102,0,Text fields:_1:[iraq]), SparkScoreDoc(2.9601524,101,0,Text fields:_1:[iran])))
51 |
52 |
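53 | // Note: Lucene caps the fuzzy-query edit distance at 2 (LevenshteinAutomata
54 | // supports at most 2 edits), so a Fuzziness value above 2 would be rejected
55 | // when the query string is parsed.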
--------------------------------------------------------------------------------
/scripts/recordLinkage/simpleExamples/linkagePrefixExample.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | import scala.io.Source
19 | import org.apache.spark.rdd.RDD
20 | import org.zouzias.spark.lucenerdd._
21 | import org.zouzias.spark.lucenerdd.LuceneRDD
22 | import org.zouzias.spark.lucenerdd.models.SparkScoreDoc
23 |
24 | // Step 1: Query prefixes of countries
25 | // Shooting for Greece, Russia, Argentina and Belgium
26 | val leftCountries = Array("gre", "ru", "ar", "be")
27 | val leftCountriesRDD: RDD[String] = sc.parallelize(leftCountries)
28 |
29 | // Step 2: Load all country names
30 | val countries = sc.parallelize(Source.fromFile("src/test/resources/countries.txt").getLines()
31 | .map(_.toLowerCase()).toSeq)
32 | val luceneRDD = LuceneRDD(countries)
33 |
34 | // Step 3: Define your linkage function (prefix)
35 | def prefixLinker(country: String): String = {
36 | s"_1:${country}*"
37 | }
38 |
39 | // Step 4: Perform the linkage
40 | val linked: RDD[(String, Array[SparkScoreDoc])] = luceneRDD.link(leftCountriesRDD, prefixLinker, 10)
41 |
42 | // Step 5: View the results
43 | linked.foreach(x => println((x._1, x._2.mkString(","))))
44 |
45 | // (gre,List(SparkScoreDoc(1.0,88,0,Text fields:_1:[grenada]), SparkScoreDoc(1.0,87,0,Text fields:_1:[greenland]), SparkScoreDoc(1.0,86,0,Text fields:_1:[greece])))
46 | // (ar,List(SparkScoreDoc(1.0,12,0,Text fields:_1:[aruba]), SparkScoreDoc(1.0,11,0,Text fields:_1:[armenia]), SparkScoreDoc(1.0,10,0,Text fields:_1:[argentina])))
47 | // (ru,List(SparkScoreDoc(1.0,55,0,Text fields:_1:[russia])))
48 | // (be,List(SparkScoreDoc(1.0,25,0,Text fields:_1:[bermuda]), SparkScoreDoc(1.0,24,0,Text fields:_1:[benin]), SparkScoreDoc(1.0,23,0,Text fields:_1:[belize]), SparkScoreDoc(1.0,22,0,Text fields:_1:[belgium]), SparkScoreDoc(1.0,21,0,Text fields:_1:[belarus])))
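49 |
50 | // The linker emits Lucene QueryParser syntax: "_1:gre*" is a prefix query on
51 | // field "_1". Any valid query string can serve as a linker, e.g. a boolean
52 | // combination such as s"_1:${country}* OR _1:${country}~1".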
--------------------------------------------------------------------------------
/scripts/spatial/loadSolrSpatialData.scala:
--------------------------------------------------------------------------------
1 |
2 |
3 | import java.io.StringReader
4 |
5 | import org.locationtech.spatial4j.context.jts.JtsSpatialContext
6 | import org.locationtech.spatial4j.io.ShapeIO
7 | import org.apache.spark.rdd.RDD
8 | import org.zouzias.spark.lucenerdd.spatial.shape._
9 | import org.zouzias.spark.lucenerdd.spatial.shape.ShapeLuceneRDD
10 | import org.zouzias.spark.lucenerdd._
11 | import org.zouzias.spark.lucenerdd.LuceneRDD
12 |
13 | import scala.reflect.ClassTag
14 |
15 | sc.setLogLevel("INFO")
16 |
17 | // Load all countries
18 | val allCountries = spark.read.parquet("data/countries-poly.parquet").select("name", "shape").map(row => (row.getString(1), row.getString(0)))
19 |
20 | // Load all cities
21 | val capitals = spark.read.parquet("data/capitals.parquet").select("name", "shape").map(row => (row.getString(1), row.getString(0)))
22 |
23 | def parseDouble(s: String): Double = try { s.toDouble } catch { case _: NumberFormatException => 0.0 }
24 |
25 | def coords(city: (String, String)): (Double, Double) = {
26 | val str = city._1
27 | val nums = str.dropWhile(x => x.compareTo('(') != 0).drop(1).dropRight(1)
28 | val coords = nums.split(" ").map(_.trim)
29 | (parseDouble(coords(0)), parseDouble(coords(1)))
30 | }
31 |
32 | val shapes = ShapeLuceneRDD(allCountries)
33 | shapes.cache
34 |
35 |
36 | val linked = shapes.linkByRadius(capitals.rdd, coords, 50, 10)
37 | linked.cache
38 |
39 | linked.map(x => (x._1, x._2.map(_.doc.textField("_1")))).foreach(println)
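40 |
41 | // linkByRadius pairs each capital point with the country shapes lying within
42 | // the given radius, keeping at most topK matches per query point; the
43 | // argument roles stated here are descriptive assumptions, see ShapeLuceneRDD
44 | // for the exact signature.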
--------------------------------------------------------------------------------
/scripts/spatial/loadSwissCities.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | import org.zouzias.spark.lucenerdd.spatial.shape._
19 | import org.zouzias.spark.lucenerdd.spatial.shape.ShapeLuceneRDD
20 | import org.zouzias.spark.lucenerdd._
21 | import org.zouzias.spark.lucenerdd.LuceneRDD
22 | val df = spark.read.format("com.databricks.spark.csv").option("header", "false").option("inferSchema", "true").option("delimiter", "\t").load("src/test/resources/spatial/CH.txt")
23 | val swissCities = df.select("_c0", "_c1", "_c5", "_c4").map(row => ((row.getDouble(2), row.getDouble(3)), row.getString(1).toLowerCase()))
24 | val shapes = ShapeLuceneRDD(swissCities.rdd)
25 | shapes.count
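26 |
27 | // Follow-up sketch: with the index built, k-nearest-neighbour search is
28 | // available (see ShapeLuceneRDDKnnSearchSpec). The (lon, lat) argument order
29 | // and exact signature are assumptions for illustration; the point below is
30 | // roughly Zurich.
31 | shapes.knnSearch((8.55, 47.37), 10).foreach(println)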
--------------------------------------------------------------------------------
/spark-shell.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | CURRENT_DIR=`pwd`
4 |
5 | # Read the version from version.sbt
6 | SPARK_LUCENERDD_VERSION=`cat version.sbt | awk '{print $5}' | xargs`
7 |
8 | # You should have downloaded this spark version under your ${HOME}
9 | SPARK_VERSION="3.2.1"
10 |
11 | echo "==============================================="
12 | echo "Loading LuceneRDD with version ${SPARK_LUCENERDD_VERSION}"
13 | echo "==============================================="
14 |
15 | echo "==============================================="
16 | echo "SPARK version: ${SPARK_VERSION}"
17 | echo "==============================================="
18 |
19 | # Assumes that spark is installed under home directory
20 | HOME_DIR=`echo ~`
21 | #export SPARK_LOCAL_IP=localhost
22 | SPARK_HOME=${HOME_DIR}/spark-${SPARK_VERSION}-bin-hadoop3.2
23 |
24 | # spark-lucenerdd assembly JAR
25 | MAIN_JAR=${CURRENT_DIR}/target/scala-2.12/spark-lucenerdd-assembly-${SPARK_LUCENERDD_VERSION}.jar
26 |
27 | # Run spark shell locally
28 | ${SPARK_HOME}/bin/spark-shell --jars "${MAIN_JAR}" \
29 | --conf "spark.executor.memory=1g" \
30 | --conf "spark.driver.memory=1g" \
31 | --conf "spark.rdd.compress=true" \
32 | --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
33 | --conf "spark.kryo.registrator=org.zouzias.spark.lucenerdd.LuceneRDDKryoRegistrator" \
34 | --conf spark.executor.extraJavaOptions="-Dlucenerdd.index.store.mode=disk" \
35 | --conf spark.driver.extraJavaOptions="-Dlucenerdd.index.store.mode=disk" \
36 | --conf "spark.kryoserializer.buffer=24mb" \
37 | --master local[*]
38 |
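39 | # Note: build the assembly JAR referenced above with `sbt assembly` first;
40 | # tests are skipped during assembly (see `assembly / test := {}` in build.sbt).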
--------------------------------------------------------------------------------
/src/main/resources/reference.conf:
--------------------------------------------------------------------------------
1 | lucenerdd {
2 |
3 | // Name of analyzer as it is under Lucene's package org.apache.lucene.analysis.XX
4 | analyzer.name = "en"
5 | analyzer.name=${?LUCENERDD_ANALYZER_NAME}
6 |
7 |
8 | // Similarity scoring for Lucene
9 | similarity.name = "bm25" // anything else will default to Lucene classic similarity
10 | similarity.name = ${?LUCENERDD_SIMILARITY_NAME}
11 |
12 | // Supported linkage methods
13 | // "collectbroadcast" : Collects the RDD that contains the queries (to be used only if query RDD
14 | // fits in spark driver's memory)
15 | //
16 | // "cartesian" : Uses cartesian product between the partitions of the queries RDD and the partitions
17 | // of LuceneRDD. Note it duplicates each partition of LuceneRDD n times where n is the number of
18 | // partitions of the queries RDD.
19 | linker.method = "collectbroadcast"
20 | linker.method = ${?LUCENERDD_LINKER_METHOD}
21 |
22 | index {
23 |
24 | // Lucene index storage
25 | // Use 'disk' to store the index in Java's temp directory
26 | // Otherwise the index will be stored in memory
27 | // Do not use memory, see http://lucene.apache.org/core/7_5_0/core/org/apache/lucene/store/RAMDirectory.html
28 | store.mode = "disk"
29 | store.mode = ${?LUCENERDD_INDEX_STORE_MODE}
30 |
31 | stringfields{
32 |
33 | // Analyze string fields by default or not
34 | // Implicit fields, like _1, _2, etc will use this option
35 | analyzed = true
36 | analyzed = ${?LUCENERDD_INDEX_STRINGFIELDS_ANALYZED}
37 |
38 | // Select a subset of string fields that you do not wish to be analyzed
39 | // Due to serialization issues this list should be set before starting a Spark Session
40 | // Moreover, all text/string fields that end with '_notanalyzed' are not analyzed
41 | not_analyzed_list = []
42 | not_analyzed_list = ${?LUCENERDD_INDEX_STRINGFIELDS_NOT_ANALYZED_LIST}
43 |
44 | // Text fields options as in org.apache.lucene.index.IndexOptions
45 | //
46 | // Other options are:
47 | // "DOCS"
48 | // "DOCS_AND_FREQS"
49 | // "DOCS_AND_FREQS_AND_POSITIONS"
50 | // "DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS"
51 | // "NONE"
52 | options = "docs_and_freqs_and_positions_and_offsets"
53 | options = ${?LUCENERDD_INDEX_STRINGFIELDS_OPTIONS}
54 |
55 | terms {
56 | // Omit terms norms
57 | omitnorms = false
58 | omitnorms = ${?LUCENERDD_INDEX_STRINGFIELDS_TERMS_OMITNORMS}
59 |
60 | // Store term positions
61 | positions = false
62 | positions = ${?LUCENERDD_INDEX_STRINGFIELDS_TERMS_POSITIONS}
63 |
64 | // Store Term vectors (set true, otherwise LuceneRDD.termVectors(fieldName) will fail)
65 | vectors = true
66 | vectors = ${?LUCENERDD_INDEX_STRINGFIELDS_TERMS_VECTORS}
67 | }
68 | }
69 | }
70 |
71 | query {
72 | // Maximum value on topK queries
73 | topk.maxvalue = 100
74 | topk.maxvalue = ${?LUCENERDD_QUERY_TOPK_MAXVALUE}
75 |
76 | // Default value of number of returned results
77 | topk.default = 10
78 | topk.default = ${?LUCENERDD_QUERY_TOPK_DEFAULT}
79 |
80 | // Default value of number of faceted results
81 | facets.number.default = 10
82 | facets.number.default = ${?LUCENERDD_QUERY_FACETS_NUMBER_DEFAULT}
83 |
84 | }
85 |
86 | // Spatial related configurations used by ShapeLuceneRDD
87 | spatial {
88 | prefixtree {
89 |
90 | // Spatial tree data structure
91 | name = "quad" // "geohash" or "quad"
92 | name = ${?LUCENE_SPATIAL_PREFIXTREE_NAME}
93 |
94 | maxlevel = 9 // 11 results in sub-meter precision for geohash
95 | maxlevel = ${?LUCENE_SPATIAL_PREFIXTREE_MAXLEVEL}
96 |
97 |
98 | maxDistErr = 5.0 // in kilometers
99 | maxDistErr = ${?LUCENE_SPATIAL_PREFIXTREE_MAXDISTERR}
100 |
101 | }
102 |
103 | // Shape format can be one of ShapeIO.GeoJSON, ShapeIO.LEGACY, ShapeIO.POLY, ShapeIO.WKT
104 | shape.io.format = "WKT"
105 | shape.io.format = ${?LUCENE_SPATIAL_SHAPE_IO_FORMAT}
106 |
107 | // Supported linkage methods
108 | // "collectbroadcast" : Collects the RDD that contains the queries (to be used only if query RDD
109 | // fits in spark driver's memory)
110 | //
111 | // "cartesian" : Uses cartesian product between the partitions of the queries RDD and the partitions
112 | // of LuceneRDD. Note it duplicates each partition of LuceneRDD n times where n is the number of
113 | // partitions of the queries RDD.
114 | linker.method = "collectbroadcast"
115 | linker.method = ${?LUCENE_SPATIAL_LINKER_METHOD}
116 | }
117 | }
118 |
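
Each ${?VAR} entry above is an optional override: when that environment variable is set, it replaces the default defined on the line before it. A minimal sketch of this behavior, assuming only Typesafe Config on the classpath (object name hypothetical):

    import com.typesafe.config.ConfigFactory

    object ConfigOverrideDemo {
      def main(args: Array[String]): Unit = {
        // Prints "bm25" by default; running with LUCENERDD_SIMILARITY_NAME=classic
        // set in the environment prints "classic" instead.
        val config = ConfigFactory.load()
        println(config.getString("lucenerdd.similarity.name"))
      }
    }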
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/LuceneRDDKryoRegistrator.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd
18 |
19 | import com.twitter.algebird.TopK
20 | import com.twitter.chill.Kryo
21 | import org.apache.spark.SparkConf
22 | import org.apache.spark.serializer.{KryoRegistrator, KryoSerializer}
23 | import org.zouzias.spark.lucenerdd.facets.FacetedLuceneRDD
24 | import org.zouzias.spark.lucenerdd.models.{SparkFacetResult, SparkScoreDoc}
25 | import org.zouzias.spark.lucenerdd.partition.LuceneRDDPartition
26 | import org.zouzias.spark.lucenerdd.response.{LuceneRDDResponse, LuceneRDDResponsePartition}
27 | import org.zouzias.spark.lucenerdd.testing.{FavoriteCaseClass, Person}
28 |
29 | class LuceneRDDKryoRegistrator extends KryoRegistrator {
30 | def registerClasses(kryo: Kryo): Unit = {
31 | kryo.register(classOf[LuceneRDD[_]])
32 | kryo.register(classOf[LuceneRDDPartition[_]])
33 | kryo.register(classOf[FacetedLuceneRDD[_]])
34 | kryo.register(classOf[Number])
35 | kryo.register(classOf[java.lang.Double])
36 | kryo.register(classOf[java.lang.Float])
37 | kryo.register(classOf[java.lang.Integer])
38 | kryo.register(classOf[java.lang.Long])
39 | kryo.register(classOf[java.lang.Short])
40 | kryo.register(classOf[scala.collection.mutable.WrappedArray.ofRef[_]])
41 | kryo.register(classOf[scala.collection.mutable.WrappedArray.ofFloat])
42 | kryo.register(classOf[scala.collection.mutable.WrappedArray.ofDouble])
43 | kryo.register(classOf[scala.collection.mutable.WrappedArray.ofInt])
44 | kryo.register(classOf[scala.collection.mutable.WrappedArray.ofLong])
45 | kryo.register(classOf[Array[String]])
46 | kryo.register(classOf[Array[Number]])
47 | kryo.register(classOf[Array[Float]])
48 | kryo.register(classOf[Array[Int]])
49 | kryo.register(classOf[Array[Long]])
50 | kryo.register(classOf[Array[Double]])
51 | kryo.register(classOf[Array[Boolean]])
52 | kryo.register(classOf[Range])
53 | kryo.register(classOf[scala.collection.immutable.Map[String, String]])
54 | kryo.register(classOf[scala.collection.immutable.Map[String, Number]])
55 | kryo.register(classOf[scala.collection.immutable.Set[_]])
56 | kryo.register(classOf[scala.collection.immutable.Map[_, _]])
57 | kryo.register(classOf[Array[scala.collection.immutable.Map[_, _]]])
58 | kryo.register(classOf[SparkFacetResult])
59 | kryo.register(classOf[SparkScoreDoc])
60 | kryo.register(classOf[LuceneRDDResponse])
61 | kryo.register(classOf[LuceneRDDResponsePartition])
62 | kryo.register(classOf[TopK[_]])
63 | kryo.register(classOf[FavoriteCaseClass]) /* For testing */
64 | kryo.register(classOf[Array[FavoriteCaseClass]]) /* For testing */
65 | kryo.register(classOf[Person]) /* For testing */
66 | kryo.register(classOf[Array[Person]]) /* For testing */
67 | ()
68 | }
69 | }
70 |
71 | /**
72 | * Decorator for LuceneRDD Kryo serialization
73 | */
74 | object LuceneRDDKryoRegistrator {
75 | def registerKryoClasses(conf: SparkConf): SparkConf = {
76 | conf.set("spark.serializer", classOf[KryoSerializer].getName)
77 | .set("spark.kryo.registrator", classOf[LuceneRDDKryoRegistrator].getName)
78 | .set("spark.kryo.registrationRequired", "false")
79 |     /* Set the above to "true" to require that all serialized classes are registered with Kryo */
80 | }
81 | }
82 |
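
A short usage sketch (app name hypothetical): thread a SparkConf through registerKryoClasses before creating the session so that LuceneRDD classes are serialized with Kryo:

    import org.apache.spark.SparkConf
    import org.apache.spark.sql.SparkSession
    import org.zouzias.spark.lucenerdd.LuceneRDDKryoRegistrator

    val conf = LuceneRDDKryoRegistrator.registerKryoClasses(
      new SparkConf().setAppName("lucenerdd-kryo-demo"))
    val spark = SparkSession.builder().config(conf).getOrCreate()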
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/aggregate/SparkFacetResultMonoid.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.aggregate
18 |
19 | import com.twitter.algebird.MapMonoid
20 | import org.zouzias.spark.lucenerdd.models.SparkFacetResult
21 |
22 | /**
23 | * Monoid used to aggregate faceted results [[SparkFacetResult]]
24 | * from the executors to the driver
25 | */
26 | object SparkFacetResultMonoid extends Serializable {
27 |
28 | private lazy val facetMonoid = new MapMonoid[String, Long]()
29 |
30 | def zero(facetName: String): SparkFacetResult = SparkFacetResult(facetName, facetMonoid.zero)
31 |
32 | def plus(l: SparkFacetResult, r: SparkFacetResult): SparkFacetResult = {
33 | require(l.facetName == r.facetName) // Check if summing same facets
34 | SparkFacetResult(l.facetName, facetMonoid.plus(l.facets, r.facets))
35 | }
36 | }
37 |
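
For illustration, a sketch of merging two partial facet counts with the monoid (facet values hypothetical):

    val left = SparkFacetResult("country", Map("CH" -> 3L, "GR" -> 1L))
    val right = SparkFacetResult("country", Map("CH" -> 2L))
    // Counts are summed per key:
    // SparkFacetResult("country", Map("CH" -> 5L, "GR" -> 1L))
    val merged = SparkFacetResultMonoid.plus(left, right)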
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/analyzers/AnalyzerConfigurable.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.analyzers
18 |
19 | import org.apache.lucene.analysis.Analyzer
20 | import org.apache.lucene.analysis.ar.ArabicAnalyzer
21 | import org.apache.lucene.analysis.bg.BulgarianAnalyzer
22 | import org.apache.lucene.analysis.br.BrazilianAnalyzer
23 | import org.apache.lucene.analysis.ca.CatalanAnalyzer
24 | import org.apache.lucene.analysis.cjk.CJKAnalyzer
25 | import org.apache.lucene.analysis.ckb.SoraniAnalyzer
26 | import org.apache.lucene.analysis.core.WhitespaceAnalyzer
27 | import org.apache.lucene.analysis.cz.CzechAnalyzer
28 | import org.apache.lucene.analysis.da.DanishAnalyzer
29 | import org.apache.lucene.analysis.de.GermanAnalyzer
30 | import org.apache.lucene.analysis.el.GreekAnalyzer
31 | import org.apache.lucene.analysis.en.EnglishAnalyzer
32 | import org.apache.lucene.analysis.es.SpanishAnalyzer
33 | import org.apache.lucene.analysis.eu.BasqueAnalyzer
34 | import org.apache.lucene.analysis.fa.PersianAnalyzer
35 | import org.apache.lucene.analysis.fi.FinnishAnalyzer
36 | import org.apache.lucene.analysis.fr.FrenchAnalyzer
37 | import org.apache.lucene.analysis.ga.IrishAnalyzer
38 | import org.apache.lucene.analysis.gl.GalicianAnalyzer
39 | import org.apache.lucene.analysis.hi.HindiAnalyzer
40 | import org.apache.lucene.analysis.hu.HungarianAnalyzer
41 | import org.apache.lucene.analysis.id.IndonesianAnalyzer
42 | import org.apache.lucene.analysis.it.ItalianAnalyzer
43 | import org.apache.lucene.analysis.lt.LithuanianAnalyzer
44 | import org.apache.lucene.analysis.lv.LatvianAnalyzer
45 | import org.apache.lucene.analysis.nl.DutchAnalyzer
46 | import org.apache.lucene.analysis.no.NorwegianAnalyzer
47 | import org.apache.lucene.analysis.pt.PortugueseAnalyzer
48 | import org.apache.lucene.analysis.ru.RussianAnalyzer
49 | import org.apache.lucene.analysis.standard.StandardAnalyzer
50 | import org.apache.lucene.analysis.tr.TurkishAnalyzer
51 | import org.zouzias.spark.lucenerdd.config.Configurable
52 | import org.apache.spark.internal.Logging
53 |
54 | /**
55 | * Lucene Analyzer loader via configuration or via class name
56 | *
57 |  * An analyzer can be loaded using a short language code, e.g.,
58 |  * en, el, de, etc., or using a class name present in the classpath, e.g.,
59 |  * 'org.apache.lucene.analysis.el.GreekAnalyzer'
60 |  *
61 |  * Custom analyzers can be loaded provided that they are present on the classpath at runtime.
62 | */
63 | trait AnalyzerConfigurable extends Configurable
64 | with Logging {
65 |
66 | private val IndexAnalyzerConfigKey = "lucenerdd.index.analyzer.name"
67 | private val QueryAnalyzerConfigKey = "lucenerdd.query.analyzer.name"
68 |
69 |   /** Get the configured analyzer name or fall back to English ("en") */
70 | protected def getOrElseEn(analyzerName: Option[String]): String = analyzerName.getOrElse("en")
71 |
72 | protected val IndexAnalyzerConfigName: Option[String] =
73 | if (Config.hasPath(IndexAnalyzerConfigKey)) {
74 | Some(Config.getString(IndexAnalyzerConfigKey))} else None
75 |
76 | protected val QueryAnalyzerConfigName: Option[String] =
77 | if (Config.hasPath(QueryAnalyzerConfigKey)) {
78 | Some(Config.getString(QueryAnalyzerConfigKey))} else None
79 |
80 | protected def getAnalyzer(analyzerName: Option[String]): Analyzer = {
81 | if (analyzerName.isDefined) {
82 | analyzerName.get match {
83 | case "whitespace" => new WhitespaceAnalyzer()
84 | case "ar" => new ArabicAnalyzer()
85 | case "bg" => new BulgarianAnalyzer()
86 | case "br" => new BrazilianAnalyzer()
87 | case "ca" => new CatalanAnalyzer()
88 | case "cjk" => new CJKAnalyzer()
89 | case "ckb" => new SoraniAnalyzer()
90 | case "cz" => new CzechAnalyzer()
91 | case "da" => new DanishAnalyzer()
92 | case "de" => new GermanAnalyzer()
93 | case "el" => new GreekAnalyzer()
94 | case "en" => new EnglishAnalyzer()
95 | case "es" => new SpanishAnalyzer()
96 | case "eu" => new BasqueAnalyzer()
97 | case "fa" => new PersianAnalyzer()
98 | case "fi" => new FinnishAnalyzer()
99 | case "fr" => new FrenchAnalyzer()
100 | case "ga" => new IrishAnalyzer()
101 | case "gl" => new GalicianAnalyzer()
102 | case "hi" => new HindiAnalyzer()
103 | case "hu" => new HungarianAnalyzer()
104 | case "id" => new IndonesianAnalyzer()
105 | case "it" => new ItalianAnalyzer()
106 | case "lt" => new LithuanianAnalyzer()
107 | case "lv" => new LatvianAnalyzer()
108 | case "nl" => new DutchAnalyzer()
109 | case "no" => new NorwegianAnalyzer()
110 | case "pt" => new PortugueseAnalyzer()
111 | case "ru" => new RussianAnalyzer()
112 | case "tr" => new TurkishAnalyzer()
113 | case otherwise: String =>
114 | try {
115 | val clazz = loadConstructor[Analyzer](otherwise)
116 | clazz
117 | }
118 | catch {
119 | case e: ClassNotFoundException =>
120 | logError(s"Class ${otherwise} was not found in classpath. Does the class exist?", e)
121 | null
122 | case e: ClassCastException =>
123 | logError(s"Class ${otherwise} could not be " +
124 | s"cast to superclass org.apache.lucene.analysis.Analyzer.", e)
125 | null
126 | case e: Throwable =>
127 | logError(s"Class ${otherwise} could not be used as Analyzer.", e)
128 | null
129 | }
130 | }
131 | }
132 | else {
133 | logInfo("Analyzer name is not defined. Default analyzer is StandardAnalyzer().")
134 | new StandardAnalyzer()
135 | }
136 | }
137 |
138 | /**
139 | * Load a Lucene [[Analyzer]] using class name
140 | *
141 | * @param className The class name of the analyzer to load
142 |    * @tparam T Subtype of [[Analyzer]] to instantiate
143 |    * @return An instance of the loaded Lucene Analyzer
144 | */
145 | private def loadConstructor[T <: Analyzer](className: String): T = {
146 | val loader = getClass.getClassLoader
147 | logInfo(s"Loading class ${className} using loader ${loader}")
148 | val loadedClass: Class[T] = loader.loadClass(className).asInstanceOf[Class[T]]
149 | val constructor = loadedClass.getConstructor()
150 | constructor.newInstance()
151 | }
152 |
153 | }
154 |
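
A resolution sketch (demo object hypothetical; getAnalyzer is protected, so it is called from a subtype):

    object AnalyzerDemo extends AnalyzerConfigurable {
      val greek = getAnalyzer(Some("el"))      // short language code
      val german = getAnalyzer(Some("org.apache.lucene.analysis.de.GermanAnalyzer"))
      val fallback = getAnalyzer(None)         // StandardAnalyzer
    }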
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/config/Configurable.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.config
18 |
19 | import com.typesafe.config.ConfigFactory
20 |
21 | /**
22 | * Load typesafe configuration
23 | */
24 | trait Configurable extends Serializable {
25 | lazy val Config = ConfigFactory.load()
26 | }
27 |
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/config/LuceneRDDConfigurable.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.config
18 |
19 | import org.apache.lucene.index.IndexOptions
20 | import scala.collection.JavaConverters._
21 |
22 | /**
23 | * Configuration for [[org.zouzias.spark.lucenerdd.LuceneRDD]]
24 | */
25 | trait LuceneRDDConfigurable extends Configurable {
26 |
27 | protected val MaxDefaultTopKValue: Int = {
28 |     if (Config.hasPath("lucenerdd.query.topk.maxvalue")) {
29 | Config.getInt("lucenerdd.query.topk.maxvalue")
30 | }
31 | else 1000
32 | }
33 |
34 | /** Default value for topK queries */
35 | protected val DefaultTopK: Int = {
36 | if (Config.hasPath("lucenerdd.query.topk.default")) {
37 | Config.getInt("lucenerdd.query.topk.default")
38 | }
39 | else 10
40 | }
41 |
42 | protected val DefaultFacetNum: Int = {
43 |     if (Config.hasPath("lucenerdd.query.facets.number.default")) {
44 |       Config.getInt("lucenerdd.query.facets.number.default")
45 | }
46 | else 10
47 | }
48 |
49 | protected val StringFieldsDefaultAnalyzed: Boolean = {
50 | if (Config.hasPath("lucenerdd.index.stringfields.analyzed")) {
51 | Config.getBoolean("lucenerdd.index.stringfields.analyzed")
52 | }
53 | else {
54 | true
55 | }
56 | }
57 |
58 | /**
59 | * List of string fields *not* to be analyzed
60 | */
61 | protected val StringFieldsListToBeNotAnalyzed: List[String] = {
62 | if (Config.hasPath("lucenerdd.index.stringfields.not_analyzed_list")) {
63 | Config.getStringList("lucenerdd.index.stringfields.not_analyzed_list")
64 | .asScala.toList
65 | }
66 | else {
67 | List.empty[String]
68 | }
69 | }
70 |
71 | protected val StringFieldsStoreTermVector: Boolean = {
72 | if (Config.hasPath("lucenerdd.index.stringfields.terms.vectors")) {
73 | Config.getBoolean("lucenerdd.index.stringfields.terms.vectors")
74 | }
75 | else true
76 | }
77 |
78 | protected val StringFieldsStoreTermPositions: Boolean = {
79 | if (Config.hasPath("lucenerdd.index.stringfields.terms.positions")) {
80 | Config.getBoolean("lucenerdd.index.stringfields.terms.positions")
81 | }
82 | else true
83 | }
84 |
85 | protected val StringFieldsOmitNorms: Boolean = {
86 | if (Config.hasPath("lucenerdd.index.stringfields.terms.omitnorms")) {
87 | Config.getBoolean("lucenerdd.index.stringfields.terms.omitnorms")
88 | }
89 | else false
90 | }
91 |
92 | protected val StringFieldsIndexOptions: IndexOptions = {
93 | if (Config.hasPath("lucenerdd.index.stringfields.options")) {
94 | val indexOptions = Config.getString("lucenerdd.index.stringfields.options")
95 |
96 | indexOptions.toLowerCase match {
97 | case "docs" => IndexOptions.DOCS
98 | case "docs_and_freqs" => IndexOptions.DOCS_AND_FREQS
99 | case "docs_and_freqs_and_positions" => IndexOptions.DOCS_AND_FREQS_AND_POSITIONS
100 | case "docs_and_freqs_and_positions_and_offsets" =>
101 | IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS
102 | case _ => IndexOptions.NONE
103 | }
104 | }
105 | else IndexOptions.DOCS_AND_FREQS_AND_POSITIONS // Default
106 | }
107 |
108 | protected val getLinkerMethod: String = {
109 |     if (Config.hasPath("lucenerdd.linker.method")) {
110 |       Config.getString("lucenerdd.linker.method")
111 | }
112 | else "collectbroadcast" // collectbroadcast by default
113 | }
114 | }
115 |
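
As a sketch, a class mixing in the trait (name hypothetical) sees the resolved values; with no configuration present, DefaultTopK falls back to 10:

    class QueryDefaults extends LuceneRDDConfigurable {
      def topK: Int = DefaultTopK  // protected, so exposed here via a method
    }
    // new QueryDefaults().topK == 10 unless lucenerdd.query.topk.default is set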
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/config/LuceneRDDParams.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.config
18 |
19 | import org.zouzias.spark.lucenerdd.analyzers.AnalyzerConfigurable
20 | import org.zouzias.spark.lucenerdd.query.SimilarityConfigurable
21 |
22 | /** Lucene analysis parameters during indexing and querying.
23 | *
24 | * @param indexAnalyzer Index analyzer name. Lucene [[Analyzer]] used during indexing
25 | * @param queryAnalyzer Query analyzer name. Lucene [[Analyzer]] used during querying
26 | * @param similarity Lucene scoring similarity, i.e., BM25 or TF-IDF
27 | * @param indexAnalyzerPerField Lucene Analyzer per field (indexing time), default empty
28 | * @param queryAnalyzerPerField Lucene Analyzer per field (query time), default empty
29 | */
30 | case class LuceneRDDParams(indexAnalyzer: String,
31 | queryAnalyzer: String,
32 | similarity: String,
33 | indexAnalyzerPerField: Map[String, String],
34 | queryAnalyzerPerField: Map[String, String]) extends Serializable
35 |
36 |
37 | object LuceneRDDParams extends AnalyzerConfigurable with SimilarityConfigurable {
38 | def apply(): LuceneRDDParams = {
39 | new LuceneRDDParams(getOrElseEn(IndexAnalyzerConfigName),
40 | getOrElseEn(QueryAnalyzerConfigName),
41 | getOrElseClassic(),
42 | Map.empty[String, String],
43 | Map.empty[String, String])
44 | }
45 | }
46 |
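
A construction sketch: the zero-argument apply reads the configuration, while explicit arguments bypass it (argument values hypothetical):

    val fromConfig = LuceneRDDParams()  // analyzers and similarity from configuration
    val explicit = LuceneRDDParams("en", "whitespace", "bm25",
      Map.empty[String, String], Map.empty[String, String])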
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/config/ShapeLuceneRDDConfigurable.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.config
18 |
19 | import org.locationtech.spatial4j.io.ShapeIO
20 |
21 | trait ShapeLuceneRDDConfigurable extends LuceneRDDConfigurable {
22 |
23 | protected val getPrefixTreeMaxLevel: Int = {
24 | if (Config.hasPath("lucenerdd.spatial.prefixtree.maxlevel")) {
25 | Config.getInt("lucenerdd.spatial.prefixtree.maxlevel")
26 | }
27 | else 11
28 | }
29 |
30 | protected val getPrefixTreeName: String = {
31 | if (Config.hasPath("lucenerdd.spatial.prefixtree.name")) {
32 | Config.getString("lucenerdd.spatial.prefixtree.name")
33 | }
34 | else "geohash" // Geohash tree by default
35 | }
36 |
37 | protected val getPrefixTreeMaxDistErr: Double = {
38 | if (Config.hasPath("lucenerdd.spatial.prefixtree.maxDistErr")) {
39 | Config.getDouble("lucenerdd.spatial.prefixtree.maxDistErr")
40 | }
41 | else 1D
42 | }
43 |
44 | protected val getLocationFieldName: String = {
45 | if (Config.hasPath("lucenerdd.spatial.location.field.name")) {
46 | Config.getString("lucenerdd.spatial.location.field.name")
47 | }
48 | else "__location__"
49 | }
50 |
51 | protected val getShapeFormat: String = {
52 | if (Config.hasPath("lucenerdd.spatial.shape.io.format")) {
53 | val format = Config.getString("lucenerdd.spatial.shape.io.format")
54 | val availableFormats = Array(ShapeIO.GeoJSON, ShapeIO.LEGACY, ShapeIO.POLY, ShapeIO.WKT)
55 | if (availableFormats.contains(format)) format else ShapeIO.WKT
56 | }
57 | else ShapeIO.WKT
58 | }
59 |
60 | protected val getShapeLinkerMethod: String = {
61 |     if (Config.hasPath("lucenerdd.spatial.linker.method")) {
62 |       Config.getString("lucenerdd.spatial.linker.method")
63 | }
64 | else "collectbroadcast" // collectbroadcast by default
65 | }
66 | }
67 |
68 |
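
A sketch of the spatial defaults this trait resolves (object name hypothetical; the members are protected, hence the wrapper):

    object SpatialDefaults extends ShapeLuceneRDDConfigurable {
      def describe(): String =
        s"tree=$getPrefixTreeName maxlevel=$getPrefixTreeMaxLevel " +
          s"format=$getShapeFormat locationField=$getLocationFieldName"
    }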
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/facets/package.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd
18 |
19 | import org.apache.lucene.document._
20 | import org.apache.lucene.facet.FacetField
21 | import org.apache.spark.sql.Row
22 |
23 | import scala.reflect.ClassTag
24 |
25 | /**
26 | * Contains implicit conversion to [[org.apache.lucene.document.Document]]
27 | * which prepares the index for faceted search as well.
28 | */
29 | package object facets {
30 |
31 | private val Stored = Field.Store.YES
32 | private val DefaultFieldName = "_1"
33 |
34 | /**
35 |    * Adds an extra field to the index with suffix [[FacetedLuceneRDD.FacetTextFieldSuffix]]
36 |    * This field is used in faceted queries
37 | *
38 | * @param doc Input document
39 | * @param fieldName Field name
40 | * @param fieldValue Field value to be indexed
41 | */
42 | private def addTextFacetField(doc: Document, fieldName: String, fieldValue: String): Unit = {
43 |     if (fieldValue.nonEmpty) { // Issues with empty strings on facets
44 | doc.add(new FacetField(s"${fieldName}${FacetedLuceneRDD.FacetTextFieldSuffix}",
45 | fieldValue))
46 | }
47 | }
48 |
49 | implicit def intToDocument(v: Int): Document = {
50 | val doc = new Document
51 | doc.add(new IntPoint(DefaultFieldName, v))
52 | addTextFacetField(doc, DefaultFieldName, v.toString)
53 | doc
54 | }
55 |
56 | implicit def longToDocument(v: Long): Document = {
57 | val doc = new Document
58 | doc.add(new LongPoint(DefaultFieldName, v))
59 | addTextFacetField(doc, DefaultFieldName, v.toString)
60 | doc
61 | }
62 |
63 | implicit def doubleToDocument(v: Double): Document = {
64 | val doc = new Document
65 | doc.add(new DoublePoint(DefaultFieldName, v))
66 | addTextFacetField(doc, DefaultFieldName, v.toString)
67 | doc
68 | }
69 |
70 | implicit def floatToDocument(v: Float): Document = {
71 | val doc = new Document
72 | doc.add(new FloatPoint(DefaultFieldName, v))
73 | addTextFacetField(doc, DefaultFieldName, v.toString)
74 | doc
75 | }
76 |
77 | implicit def stringToDocument(s: String): Document = {
78 | val doc = new Document
79 | doc.add(new TextField(DefaultFieldName, s, Stored))
80 | addTextFacetField(doc, DefaultFieldName, s)
81 | doc
82 | }
83 |
84 | private def tupleTypeToDocument[T: ClassTag](doc: Document, index: Int, s: T): Document = {
85 | typeToDocument(doc, s"_${index}", s)
86 | }
87 |
88 | def typeToDocument[T: ClassTag](doc: Document, fName: String, s: T): Document = {
89 | s match {
90 | case x: String =>
91 | doc.add(new TextField(fName, x, Stored))
92 | addTextFacetField(doc, fName, x)
93 | case x: Long =>
94 | doc.add(new LongPoint(fName, x))
95 | doc.add(new StoredField(fName, x))
96 |         doc.add(new NumericDocValuesField(s"${fName}${FacetedLuceneRDD.FacetNumericFieldSuffix}",
97 | x))
98 | case x: Int =>
99 | doc.add(new IntPoint(fName, x))
100 | doc.add(new StoredField(fName, x))
101 | doc.add(new NumericDocValuesField(s"${fName}${FacetedLuceneRDD.FacetNumericFieldSuffix}",
102 | x.toLong))
103 | case x: Float =>
104 | doc.add(new FloatPoint(fName, x))
105 | doc.add(new StoredField(fName, x))
106 | doc.add(new FloatDocValuesField(s"${fName}${FacetedLuceneRDD.FacetNumericFieldSuffix}",
107 | x))
108 | case x: Double =>
109 | doc.add(new DoublePoint(fName, x))
110 | doc.add(new StoredField(fName, x))
111 | doc.add(new DoubleDocValuesField(s"${fName}${FacetedLuceneRDD.FacetNumericFieldSuffix}",
112 | x))
113 | }
114 | doc
115 | }
116 |
117 | implicit def iterablePrimitiveToDocument[T: ClassTag](iter: Iterable[T]): Document = {
118 | val doc = new Document
119 | iter.foreach( item => tupleTypeToDocument(doc, 1, item))
120 | doc
121 | }
122 |
123 | implicit def mapToDocument[T: ClassTag](map: Map[String, T]): Document = {
124 | val doc = new Document
125 | map.foreach{ case (key, value) =>
126 | typeToDocument(doc, key, value)
127 | }
128 | doc
129 | }
130 |
131 | /**
132 | * Implicit conversion for all product types, such as case classes and Tuples
133 |    * @param s A product instance, e.g., a case class or tuple
134 |    * @tparam T Product type
135 |    * @return A Lucene Document with one field per product element
136 | */
137 | implicit def productTypeToDocument[T <: Product : ClassTag](s: T): Document = {
138 | val doc = new Document
139 |
140 | val fieldNames = s.getClass.getDeclaredFields.map(_.getName).toIterator
141 | val fieldValues = s.productIterator
142 | fieldValues.zip(fieldNames).foreach{ case (elem, fieldName) =>
143 | typeToDocument(doc, fieldName, elem)
144 | }
145 |
146 | doc
147 | }
148 |
149 | /**
150 | * Implicit conversion for Spark Row: used for DataFrame
151 |    * @param row A Spark SQL Row with a defined schema
152 |    * @return A Lucene Document with one field per Row column
153 | */
154 | implicit def sparkRowToDocument(row: Row): Document = {
155 | val doc = new Document
156 |
157 | val fieldNames = row.schema.fieldNames
158 |     fieldNames.foreach { fieldName =>
159 | val index = row.fieldIndex(fieldName)
160 | typeToDocument(doc, fieldName, row.get(index))
161 | }
162 |
163 | doc
164 | }
165 | }
166 |
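
The implicits above let common Scala types be used wherever a Lucene Document is expected; a short sketch (the City case class is hypothetical):

    import org.apache.lucene.document.Document
    import org.zouzias.spark.lucenerdd.facets._

    val fromString: Document = "hello world"             // stringToDocument
    val fromMap: Document = Map("city" -> "Bern")        // mapToDocument
    case class City(name: String, population: Int)
    val fromCaseClass: Document = City("Zurich", 400000) // productTypeToDocument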
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/matrices/TermDocMatrix.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.matrices
18 |
19 | import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}
20 | import org.apache.spark.rdd.RDD
21 | import org.zouzias.spark.lucenerdd.models.TermVectorEntry
22 |
23 | /**
24 | * Term Document Matrix of a Lucene field
25 | *
26 | * Each term (row of matrix) is uniquely assigned an index
27 | */
28 | class TermDocMatrix(triplets: RDD[TermVectorEntry]) extends Serializable {
29 |
30 | private lazy val docIdsPerShardMap = computeUniqueDocId()
31 |
32 | private lazy val indexedTerms = triplets.map(_.term).distinct().zipWithIndex().map(_.swap)
33 | private lazy val indexToTerm: Map[Long, String] = indexedTerms.collect().toMap
34 | private lazy val termToIndex: Map[String, Long] = indexToTerm.map(_.swap)
35 |
36 | private lazy val value_ = toMatrix()
37 |
38 | private lazy val nnz_ = value_.entries.count()
39 |
40 | /**
41 | * Returns a map from the matrix row indices to terms
42 | *
43 | * Using this map, you can associate the rows of the matrix with terms
44 | * @return
45 | */
46 | def rowIndexToTerm(): Map[Long, String] = indexToTerm
47 |
48 | /**
49 | * Returns a map from (documentId, partitionId) to the matrix column indices
50 | *
51 | * Using this map, you can associate the columns of the matrix to the documents
52 | * @return
53 | */
54 | def computeUniqueDocId(): Map[(String, Int), Long] = {
55 | triplets.map(_.docIdPerShard).distinct().zipWithIndex()
56 | .collect().toMap
57 | }
58 |
59 | private def toMatrix(): CoordinateMatrix = {
60 |
61 | // Broadcast termToIndex Map
62 | val termToIndexB = triplets.sparkContext.broadcast(termToIndex)
63 | val docIdsPerShardMapB = triplets.sparkContext.broadcast(docIdsPerShardMap)
64 |
65 | val entries = triplets.map { case t =>
66 | val i = termToIndexB.value(t.term)
67 | val j = docIdsPerShardMapB.value(t.docIdPerShard)
68 | MatrixEntry(i, j, t.count)
69 | }
70 |
71 | new CoordinateMatrix(entries)
72 | }
73 |
74 | /**
75 | * Returns the number of non-zero entries
76 | * @return
77 | */
78 | def nnz(): Long = {
79 | nnz_
80 | }
81 |
82 | /**
83 | * Number of rows (terms)
84 | * @return
85 | */
86 | def numRows(): Long = value_.numRows()
87 |
88 | /**
89 | * Number of columns (documents)
90 | * @return
91 | */
92 | def numCols(): Long = value_.numCols()
93 |
94 | def value(): CoordinateMatrix = value_
95 |
96 | }
97 |
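
A sketch of building the matrix from term-vector triplets, assuming an active SparkContext named sc (sample triplets hypothetical):

    import org.apache.spark.rdd.RDD
    import org.zouzias.spark.lucenerdd.models.TermVectorEntry

    val triplets: RDD[TermVectorEntry] = sc.parallelize(Seq(
      TermVectorEntry(("doc0", 0), "spark", 2L),
      TermVectorEntry(("doc0", 0), "lucene", 1L),
      TermVectorEntry(("doc1", 0), "lucene", 3L)))

    val tdm = new TermDocMatrix(triplets)
    println(s"${tdm.numRows()} x ${tdm.numCols()} matrix, nnz = ${tdm.nnz()}")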
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/models/SparkFacetResult.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.models
18 |
19 | import org.apache.lucene.facet.FacetResult
20 |
21 | case class SparkFacetResult(facetName: String, facets: Map[String, Long]) {
22 |
23 | /**
24 | * Return facet counts sorted descending
25 | * @return Sequence of (facet value, facet counts)
26 | */
27 | def sortedFacets(): Seq[(String, Long)] = {
28 | facets.toSeq.sortBy[Long](x => -x._2)
29 | }
30 | }
31 |
32 |
33 | object SparkFacetResult extends Serializable {
34 |
35 | /**
36 | * Convert [[org.apache.lucene.facet.FacetResult]]
37 | * to [[org.zouzias.spark.lucenerdd.models.SparkFacetResult]]
38 | *
39 | * @param facetName name of facet
40 | * @param facetResult input facet results
41 | * @return
42 | */
43 | def apply(facetName: String, facetResult: FacetResult): SparkFacetResult = {
44 | val facetResultOpt = Option(facetResult)
45 | facetResultOpt match {
46 | case Some(fctResult) =>
47 | val map = fctResult.labelValues
48 | .map(labelValue => (labelValue.label, labelValue.value.longValue()))
49 | .toMap[String, Long]
50 | SparkFacetResult(facetName, map)
51 | case _ => SparkFacetResult(facetName, Map.empty[String, Long])
52 | }
53 | }
54 | }
55 |
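
For illustration, sortedFacets orders the counts descending (values hypothetical):

    val result = SparkFacetResult("language", Map("java" -> 3L, "scala" -> 7L))
    result.sortedFacets() // Seq(("scala", 7L), ("java", 3L))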
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/models/SparkScoreDoc.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.models
18 |
19 | import org.apache.lucene.document.Document
20 | import org.apache.lucene.index.IndexableField
21 | import org.apache.lucene.search.{IndexSearcher, ScoreDoc}
22 | import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructField, StructType}
23 | import org.apache.spark.sql.Row
24 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
25 | import org.zouzias.spark.lucenerdd.models.SparkScoreDoc.inferNumericType
26 | import org.zouzias.spark.lucenerdd.models.SparkScoreDoc.{DocIdField, ScoreField, ShardField}
27 |
28 | import scala.collection.JavaConverters._
29 |
30 | sealed trait FieldType extends Serializable
31 | object TextType extends FieldType
32 | object IntType extends FieldType
33 | object DoubleType extends FieldType
34 | object LongType extends FieldType
35 | object FloatType extends FieldType
36 |
37 |
38 | /**
39 | * A Lucene [[Document]] extended with score, docId and shard index
40 | *
41 | * @param score Score of document
42 | * @param docId Document id
43 | * @param shardIndex Shard index
44 | * @param doc Serialized Lucene document
45 | */
46 | case class SparkScoreDoc(score: Float, docId: Int, shardIndex: Int, doc: Document) {
47 |
48 | /**
49 | * Convert to [[Row]]
50 | *
51 | * @return
52 | */
53 | def toRow(): Row = {
54 |
55 | // Convert to Spark SQL DataFrame types
56 |     val typeToValues = scala.collection.mutable.Map.empty[StructField, List[Any]]
57 |
58 | this.doc.getFields
59 | .asScala
60 | .filter(isOnlyStoredField)
61 | .foreach { field =>
62 | val fieldName = field.name()
63 |
64 |         val tp: FieldType = if (field.numericValue() != null) {
65 |           inferNumericType(field.numericValue)
66 |         }
67 |         else {
68 |           TextType // non-numeric stored fields are treated as text
69 |         }
70 |
71 | val item = tp match {
72 | case TextType => (StructField(fieldName, StringType), field.stringValue())
73 | case IntType => (StructField(fieldName, IntegerType), field.numericValue().intValue())
74 | case LongType => (StructField(fieldName,
75 | org.apache.spark.sql.types.LongType), field.numericValue().longValue())
76 | case DoubleType => (StructField(fieldName,
77 | org.apache.spark.sql.types.DoubleType), field.numericValue().doubleValue())
78 | case FloatType => (StructField(fieldName,
79 | org.apache.spark.sql.types.FloatType), field.numericValue().floatValue())
80 | case _ => (StructField(fieldName, StringType), field.stringValue())
81 | }
82 |
83 | // Append or set value
84 | val oldValue: List[Any] = typeToValues.getOrElse(item._1, List.empty)
85 | typeToValues.+=((item._1, oldValue.::(item._2)))
86 | }
87 |
88 | val arrayedTypesToValues = typeToValues.map{ case (tp, values) =>
89 |
90 | // If more than one values, wrap SQL type within ArrayType
91 | if (values.length == 1) {
92 | (tp, values.head)
93 | }
94 | else {
95 | (StructField(tp.name, ArrayType.apply(tp.dataType)), values)
96 | }
97 | }
98 |
99 |     // Additional fields of [[SparkScoreDoc]] with known types including
100 |     // - document id
101 |     // - document search score
102 | // - document shard index
103 | val extraSchemaWithValue = Seq((StructField(DocIdField, IntegerType), this.docId),
104 | (StructField(ScoreField, org.apache.spark.sql.types.FloatType), this.score),
105 | (StructField(ShardField, IntegerType), this.shardIndex))
106 |
107 | val allTogether = arrayedTypesToValues ++ extraSchemaWithValue
108 |
109 | new GenericRowWithSchema(allTogether.values.toArray, StructType(allTogether.keys.toSeq))
110 | }
111 |
112 | /**
113 | * Return fields that are stored only
114 | * @param field A field of a Lucene Document
115 | * @return
116 | */
117 | private def isOnlyStoredField(field: IndexableField): Boolean = {
118 | field.fieldType().stored()
119 | }
120 |
121 | override def toString: String = {
122 | val builder = new StringBuilder
123 | builder.append(s"[score: $score/")
124 | builder.append(s"docId: $docId/")
125 | builder.append(s"doc: $doc")
126 | builder.result()
127 | }
128 | }
129 |
130 | object SparkScoreDoc extends Serializable {
131 |
132 | val DocIdField = "__docid__"
133 | val ScoreField = "__score__"
134 | val ShardField = "__shardIndex__"
135 |
136 | def apply(indexSearcher: IndexSearcher, scoreDoc: ScoreDoc): SparkScoreDoc = {
137 | SparkScoreDoc(scoreDoc.score, scoreDoc.doc, scoreDoc.shardIndex,
138 | indexSearcher.doc(scoreDoc.doc))
139 | }
140 |
141 | def apply(indexSearcher: IndexSearcher, scoreDoc: ScoreDoc, score: Float): SparkScoreDoc = {
142 | SparkScoreDoc(score, scoreDoc.doc, scoreDoc.shardIndex, indexSearcher.doc(scoreDoc.doc))
143 | }
144 |
145 | /**
146 | * Ordering by score (descending)
147 | */
148 | def descending: Ordering[Row] = new Ordering[Row]{
149 | override def compare(x: Row, y: Row): Int = {
150 | val xScore = x.getFloat(x.fieldIndex(ScoreField))
151 | val yScore = y.getFloat(y.fieldIndex(ScoreField))
152 |       if (xScore > yScore) {
153 | -1
154 | } else if (xScore == yScore) 0 else 1
155 | }
156 | }
157 |
158 | /**
159 | * Ordering by score (ascending)
160 | */
161 | def ascending: Ordering[Row] = new Ordering[Row]{
162 | override def compare(x: Row, y: Row): Int = {
163 | val xScore = x.getFloat(x.fieldIndex(ScoreField))
164 | val yScore = y.getFloat(y.fieldIndex(ScoreField))
165 |
166 |       if (xScore < yScore) -1 else if (xScore == yScore) 0 else 1
167 | }
168 | }
169 |
170 | /**
171 | * Infers the subclass of [[Number]]
172 | * @param num A value of type [[Number]]
173 | * @return The [[FieldType]] of the input Number value
174 | */
175 | private def inferNumericType(num: Number): FieldType = {
176 | num match {
177 | case _: java.lang.Double => DoubleType
178 | case _: java.lang.Long => LongType
179 | case _: java.lang.Integer => IntType
180 | case _: java.lang.Float => FloatType
181 | case _ => TextType
182 | }
183 | }
184 | }
185 |
186 |
187 |
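
An ordering sketch, assuming a Seq[Row] named rows whose rows carry the __score__ field defined above:

    val topTen = rows.sorted(SparkScoreDoc.descending).take(10) // best scores first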
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/models/TermVectorEntry.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.models
18 |
19 | /**
20 | * A term vector entry (document id per shard, term as string, count)
21 | *
22 | * @param docIdPerShard Tuple2 containing (document id, partition id)
23 | * @param term Term text value
24 |  * @param count Number of occurrences of the term in the document
25 | */
26 | case class TermVectorEntry(docIdPerShard: (String, Int), term: String, count: Long)
27 |
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/models/indexstats/FieldStatistics.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.models.indexstats
18 |
19 | import org.apache.lucene.index.IndexReader
20 |
21 | /**
22 | * Statistics for Lucene index field
23 | */
24 | case class FieldStatistics(fieldName: String, docCount: Int, sumDocFreq: Long,
25 | totalTermFreq: Long) {
26 | override def toString(): String = {
27 | val buf = new StringBuilder()
28 | buf.append(s"fieldName: ${fieldName} / ")
29 | buf.append(s"docCount: ${docCount} / ")
30 | buf.append(s"sumDocFreq: ${sumDocFreq} / ")
31 | buf.append(s"totalTermFreq: ${totalTermFreq}\n")
32 | buf.result()
33 | }
34 | }
35 |
36 | object FieldStatistics {
37 | def apply(indexReader: IndexReader, fieldName: String): FieldStatistics = {
38 | val docCount = indexReader.getDocCount(fieldName)
39 | val sumDocFreq = indexReader.getSumDocFreq(fieldName)
40 | val totalTermFreq = indexReader.getSumTotalTermFreq(fieldName)
41 |
42 | FieldStatistics(fieldName, docCount, sumDocFreq, totalTermFreq)
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/models/indexstats/IndexStatistics.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.models.indexstats
18 |
19 | /**
20 | * Statistics for Lucene index
21 | */
22 | case class IndexStatistics(partitionId: Int,
23 | numDocs: Int,
24 | maxDocId: Int,
25 | numDeletedDocs: Int,
26 | numFields: Int,
27 | fieldsStatistics: Array[FieldStatistics]) {
28 |
29 | override def toString(): String = {
30 | val buf = new StringBuilder()
31 | buf.append(s"partitionId: ${partitionId}\n")
32 | buf.append(s"numDocs: ${numDocs}\n")
33 | buf.append(s"numDeletedDocs: ${numDeletedDocs}\n")
34 | buf.append(s"numFields: ${numFields}\n")
35 | fieldsStatistics.foreach(buf.append(_))
36 | buf.result()
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/partition/AbstractLuceneRDDPartition.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.partition
18 |
19 | import org.apache.lucene.search.{BooleanClause, Query}
20 | import org.zouzias.spark.lucenerdd.models.indexstats.IndexStatistics
21 | import org.zouzias.spark.lucenerdd.models.{SparkFacetResult, TermVectorEntry}
22 | import org.zouzias.spark.lucenerdd.response.LuceneRDDResponsePartition
23 |
24 | import scala.reflect.ClassTag
25 |
26 | /**
27 | * LuceneRDD partition.
28 | *
29 | * @tparam T the type associated with each entry in the set.
30 | */
31 | private[lucenerdd] abstract class AbstractLuceneRDDPartition[T] extends Serializable
32 | with AutoCloseable {
33 |
34 | protected implicit def kTag: ClassTag[T]
35 |
36 | def size: Long
37 |
38 | def iterator: Iterator[T]
39 |
40 | def isDefined(key: T): Boolean
41 |
42 | def fields(): Set[String]
43 |
44 | /**
45 | * Multi term query
46 | *
47 | * @param docMap Map of field names to terms
48 | * @param topK Number of documents to return
49 | * @return
50 | */
51 | def multiTermQuery(docMap: Map[String, String],
52 | topK: Int,
53 | boolClause: BooleanClause.Occur = BooleanClause.Occur.MUST)
54 | : LuceneRDDResponsePartition
55 |
56 |
57 | /**
58 | * Generic Lucene Query using QueryParser
59 | * @param searchString Lucene query string, i.e., textField:hello*
60 | * @param topK Number of documents to return
61 | * @return
62 | */
63 | def query(searchString: String, topK: Int): LuceneRDDResponsePartition
64 |
65 |
66 | /**
67 | * Lucene search using Lucene [[Query]]
68 | * @param query Lucene query, i.e., [[org.apache.lucene.search.BooleanQuery]] or
69 | * [[org.apache.lucene.search.PhraseQuery]]
70 | * @param topK Number of documents to return
71 | * @return
72 | */
73 | def query(query: Query, topK: Int): LuceneRDDResponsePartition
74 |
75 | /**
76 | * Multiple generic Lucene Queries using QueryParser
77 |    * @param searchString Lucene query strings, one per query
78 | * @param topK Number of results to return
79 | * @return
80 | */
81 | def queries(searchString: Iterable[String], topK: Int)
82 | : Iterable[(String, LuceneRDDResponsePartition)]
83 |
84 | /**
85 | * Generic Lucene faceted Query using QueryParser
86 | * @param searchString Lucene query string, i.e., textField:hello*
87 | * @param topK Number of facets to return
88 | * @return
89 | */
90 | def facetQuery(searchString: String, facetField: String, topK: Int)
91 | : SparkFacetResult
92 |
93 | /**
94 | * Term Query
95 | * @param fieldName Name of field
96 | * @param query Query text
97 | * @param topK Number of documents to return
98 | * @return
99 | */
100 | def termQuery(fieldName: String, query: String, topK: Int): LuceneRDDResponsePartition
101 |
102 | /**
103 | * Prefix Query
104 | * @param fieldName Name of field
105 | * @param query Prefix query
106 | * @param topK Number of documents to return
107 | * @return
108 | */
109 | def prefixQuery(fieldName: String, query: String, topK: Int): LuceneRDDResponsePartition
110 |
111 | /**
112 | * Fuzzy Query
113 | * @param fieldName Name of field
114 | * @param query Query text
115 | * @param maxEdits Fuzziness, edit distance
116 | * @param topK Number of documents to return
117 | * @return
118 | */
119 | def fuzzyQuery(fieldName: String, query: String,
120 | maxEdits: Int, topK: Int): LuceneRDDResponsePartition
121 |
122 | /**
123 | * PhraseQuery
124 | * @param fieldName Name of field
125 | * @param query Phrase query, i.e., "hello world"
126 | * @param topK Number of documents to return
127 | * @return
128 | */
129 | def phraseQuery(fieldName: String, query: String, topK: Int): LuceneRDDResponsePartition
130 |
131 |
132 | /**
133 | * Lucene's More Like This (MLT) functionality
134 | * @param fieldName Field name
135 | * @param query Query text
136 | * @param minTermFreq Minimum term frequency
137 | * @param minDocFreq Minimum document frequency
138 | * @param topK Number of returned documents
139 | * @return
140 | */
141 | def moreLikeThis(fieldName: String, query: String,
142 | minTermFreq: Int, minDocFreq: Int, topK: Int)
143 | : LuceneRDDResponsePartition
144 |
145 | /**
146 | * Returns term vectors for a partition
147 | *
148 | * Since each Lucene index is created per partition, docId are not unique.
149 | * The partitionIndex is used to compute "global" document id from all documents
150 | * over all partitions
151 | *
152 | * @param fieldName Field on which to compute term vectors
153 | * @param idFieldName Field name which contains unique id
154 | * @return Array of term vector entries
155 | */
156 | def termVectors(fieldName: String, idFieldName: Option[String]): Array[TermVectorEntry]
157 |
158 | /**
159 |    * Returns statistics of this partition's Lucene index.
160 | *
161 | * @param fields Set of defined fields
162 | * @return
163 | */
164 | def indexStats(fields: Set[String]): IndexStatistics
165 |
166 | /**
167 | * Restricts the entries to those satisfying a predicate
168 | * @param pred Predicate to filter on
169 | * @return
170 | */
171 | def filter(pred: T => Boolean): AbstractLuceneRDDPartition[T]
172 | }
173 |
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/query/SimilarityConfigurable.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.query
18 |
19 | import org.apache.lucene.search.similarities.{BM25Similarity, ClassicSimilarity, Similarity}
20 | import org.zouzias.spark.lucenerdd.config.Configurable
21 |
22 |
23 | /**
24 | * Lucene Similarity loader via configuration
25 | */
26 | trait SimilarityConfigurable extends Configurable {
27 |
28 | private val LuceneSimilarity = "lucenerdd.similarity.name"
29 |
30 | protected val LuceneSimilarityConfigValue: Option[String] =
31 | if (Config.hasPath(LuceneSimilarity)) {
32 | Some(Config.getString(LuceneSimilarity))} else None
33 |
34 | protected def getOrElseClassic(): String = LuceneSimilarityConfigValue.getOrElse("classic")
35 |
36 | protected def getSimilarity(similarityName: Option[String]): Similarity = {
37 | if (similarityName.isDefined) {
38 | similarityName.get match {
39 | case "bm25" => new BM25Similarity()
40 | case _ => new ClassicSimilarity()
41 | }
42 | }
43 | else {
44 | new ClassicSimilarity()
45 | }
46 | }
47 | }
48 |
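
A resolution sketch (object name hypothetical; getSimilarity is protected, hence the wrapper). Only "bm25" selects BM25; any other name falls back to classic TF-IDF:

    object SimilarityDemo extends SimilarityConfigurable {
      val bm25 = getSimilarity(Some("bm25"))      // BM25Similarity
      val classic = getSimilarity(Some("tfidf"))  // any other name => ClassicSimilarity
      val default = getSimilarity(None)           // ClassicSimilarity
    }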
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/response/FieldType.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.response
18 |
19 | sealed trait FieldType extends Serializable
20 | object TextType extends FieldType
21 | object IntType extends FieldType
22 | object DoubleType extends FieldType
23 | object LongType extends FieldType
24 | object FloatType extends FieldType
25 |
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/response/LuceneRDDResponse.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.response
18 |
19 | import com.twitter.algebird.TopKMonoid
20 | import org.apache.spark.annotation.DeveloperApi
21 | import org.apache.spark.{OneToOneDependency, Partition, TaskContext}
22 | import org.apache.spark.rdd.RDD
23 | import org.apache.spark.sql.{DataFrame, Row, SparkSession}
24 | import org.apache.spark.sql.types._
25 | import org.apache.spark.storage.StorageLevel
26 |
27 | /**
28 | * LuceneRDD response
29 | */
30 | private[lucenerdd] class LuceneRDDResponse
31 | (protected val partitionsRDD: RDD[LuceneRDDResponsePartition],
32 | protected val ordering: Ordering[Row])
33 | extends RDD[Row](partitionsRDD.context,
34 | List(new OneToOneDependency(partitionsRDD))) {
35 |
36 | setName("LuceneRDDResponse")
37 |
38 | @DeveloperApi
39 | override def compute(split: Partition, context: TaskContext)
40 | : Iterator[Row] = {
41 | firstParent[LuceneRDDResponsePartition].iterator(split, context).next().iterator()
42 | }
43 |
44 | override protected def getPartitions: Array[Partition] = partitionsRDD.partitions
45 |
46 | override protected def getPreferredLocations(s: Partition): Seq[String] =
47 | partitionsRDD.preferredLocations(s)
48 |
49 | override def cache(): this.type = {
50 | this.persist(StorageLevel.MEMORY_ONLY)
51 | }
52 |
53 | override def persist(newLevel: StorageLevel): this.type = {
54 | partitionsRDD.persist(newLevel)
55 | super.persist(newLevel)
56 | this
57 | }
58 |
59 | override def unpersist(blocking: Boolean = true): this.type = {
60 | partitionsRDD.unpersist(blocking)
61 | super.unpersist(blocking)
62 | this
63 | }
64 |
65 | /**
 66 |    * Return the top-k results in terms of Lucene score
 67 |    *
 68 |    * It uses a [[TopKMonoid]] to merge the per-partition top-k results
 69 |    * @param k Number of results to return
 70 |    * @return Array of results, of size at most k
71 | */
72 | override def take(k: Int): Array[Row] = {
73 | val monoid = new TopKMonoid[Row](k)(ordering)
74 | partitionsRDD.map(monoid.build(_))
75 | .reduce(monoid.plus).items.toArray
76 | }
77 |
78 | override def collect(): Array[Row] = {
79 | val sz = partitionsRDD.map(_.size).sum().toInt
80 | if (sz > 0) {
81 | val monoid = new TopKMonoid[Row](sz)(ordering)
82 | partitionsRDD.map(monoid.build(_))
83 | .reduce(monoid.plus).items.toArray
84 | } else {
85 | Array.empty[Row]
86 | }
87 | }
88 |
89 | /**
 90 |    * Convert LuceneRDDResponse to a Spark DataFrame (assumes a non-empty response, since the schema is inferred from the first row)
91 | * @param spark Spark Session
92 | * @return DataFrame
93 | */
94 | def toDF()(implicit spark: SparkSession): DataFrame = {
95 | val schema = this.first().schema
96 | spark.createDataFrame(this, schema)
97 | }
98 | }
99 |
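A usage sketch of the response API (assumes the primitive-string `LuceneRDD` constructor and its default `_1` field name): `take(k)` merges per-partition top-k monoids instead of collecting all rows, and `toDF()` requires an implicit SparkSession plus a non-empty response.

```scala
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.zouzias.spark.lucenerdd.LuceneRDD

implicit val spark: SparkSession = SparkSession.builder()
  .master("local[*]").appName("response-sketch").getOrCreate()

// Index a few strings and query them; the result is a LuceneRDDResponse
val rdd = LuceneRDD(spark.sparkContext.parallelize(Seq("hello", "world")))
val response = rdd.termQuery("_1", "hello")

val topDocs: Array[Row] = response.take(10) // best rows by Lucene score
val df: DataFrame = response.toDF()         // schema inferred from the first row
```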
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/response/LuceneRDDResponsePartition.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.response
18 |
19 | import org.apache.spark.sql.Row
20 | import org.zouzias.spark.lucenerdd.models.SparkScoreDoc
21 |
22 | case class LuceneRDDResponsePartition(results: Iterator[Row])
23 | extends Iterable[Row] {
24 | override def iterator(): Iterator[Row] = results
25 | }
26 |
27 | object LuceneRDDResponsePartition {
28 |
29 | def apply(sparkScoreDocs: Iterable[SparkScoreDoc]): LuceneRDDResponsePartition = {
30 | apply(sparkScoreDocs.map(_.toRow()).toIterator)
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/spatial/shape/ShapeLuceneRDDKryoRegistrator.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.spatial.shape
18 |
19 | import com.twitter.algebird.TopK
20 | import com.twitter.chill.Kryo
21 | import org.apache.spark.SparkConf
22 | import org.apache.spark.serializer.{KryoRegistrator, KryoSerializer}
23 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
24 | import org.apache.spark.sql.types._
25 | import org.zouzias.spark.lucenerdd.models.{SparkFacetResult, SparkScoreDoc}
26 | import org.zouzias.spark.lucenerdd.spatial.shape.partition.ShapeLuceneRDDPartition
27 |
28 |
29 | class ShapeLuceneRDDKryoRegistrator extends KryoRegistrator {
30 | def registerClasses(kryo: Kryo): Unit = {
31 | kryo.register(classOf[ShapeLuceneRDD[_, _]])
32 | kryo.register(classOf[ShapeLuceneRDDPartition[_, _]])
33 | kryo.register(classOf[Number])
34 | kryo.register(classOf[java.lang.Double])
35 | kryo.register(classOf[java.lang.Float])
36 | kryo.register(classOf[java.lang.Integer])
37 | kryo.register(classOf[java.lang.Long])
38 | kryo.register(classOf[java.lang.Short])
39 | kryo.register(classOf[StructType])
40 | kryo.register(classOf[StructField])
41 | kryo.register(classOf[IntegerType])
42 | kryo.register(classOf[DoubleType])
43 | kryo.register(classOf[FloatType])
44 | kryo.register(classOf[StringType])
45 | kryo.register(classOf[GenericRowWithSchema])
46 | kryo.register(classOf[Metadata])
47 | kryo.register(classOf[Object])
48 | kryo.register(classOf[Array[Object]])
49 | kryo.register(classOf[Array[Array[Byte]]])
50 | kryo.register(classOf[scala.collection.mutable.WrappedArray.ofRef[_]])
51 | kryo.register(classOf[scala.collection.mutable.WrappedArray.ofFloat])
52 | kryo.register(classOf[scala.collection.mutable.WrappedArray.ofDouble])
53 | kryo.register(classOf[scala.collection.mutable.WrappedArray.ofInt])
54 | kryo.register(classOf[scala.collection.mutable.WrappedArray.ofLong])
55 | kryo.register(classOf[Array[String]])
56 | kryo.register(classOf[Array[Number]])
57 | kryo.register(classOf[Array[Float]])
58 | kryo.register(classOf[Array[Int]])
59 | kryo.register(classOf[Array[Long]])
60 | kryo.register(classOf[Array[Double]])
61 | kryo.register(classOf[Array[Boolean]])
62 | kryo.register(classOf[Array[SparkScoreDoc]])
63 | kryo.register(classOf[Array[StructType]])
64 | kryo.register(classOf[Array[StructField]])
65 | kryo.register(classOf[Range])
66 | kryo.register(classOf[scala.collection.immutable.Map[String, String]])
67 | kryo.register(classOf[scala.collection.immutable.Map[String, Number]])
68 | kryo.register(classOf[scala.collection.immutable.Map[_, _]])
69 | kryo.register(classOf[Array[scala.collection.immutable.Map[_, _]]])
70 | kryo.register(classOf[SparkFacetResult])
71 | kryo.register(classOf[SparkScoreDoc])
72 | kryo.register(classOf[TopK[_]])
73 |
74 | ()
75 | }
76 | }
77 |
78 | /**
79 | * Decorator for [[ShapeLuceneRDD]] Kryo serialization
80 | */
81 | object ShapeLuceneRDDKryoRegistrator {
82 | def registerKryoClasses(conf: SparkConf): SparkConf = {
83 | conf.set("spark.serializer", classOf[KryoSerializer].getName)
84 | .set("spark.kryo.registrator", classOf[ShapeLuceneRDDKryoRegistrator].getName)
85 | .set("spark.kryo.registrationRequired", "false")
 86 |     /* Set the above to true so that all classes must be registered with Kryo */
87 | }
88 | }
89 |
90 |
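Usage sketch, mirroring how the test suites wire the (non-spatial) LuceneRDDKryoRegistrator: pass the SparkConf through registerKryoClasses before creating the context.

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.zouzias.spark.lucenerdd.spatial.shape.ShapeLuceneRDDKryoRegistrator

val conf = ShapeLuceneRDDKryoRegistrator.registerKryoClasses(
  new SparkConf().setMaster("local[*]").setAppName("shape-sketch"))
val sc = new SparkContext(conf) // Kryo serializer and registrator now active
```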
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/spatial/shape/context/ContextLoader.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.spatial.shape.context
18 |
19 | import java.io.{StringReader, StringWriter}
20 |
21 | import org.locationtech.spatial4j.context.jts.JtsSpatialContext
22 | import org.locationtech.spatial4j.io.{ShapeReader, ShapeWriter}
23 | import org.locationtech.spatial4j.shape.Shape
24 | import org.zouzias.spark.lucenerdd.config.ShapeLuceneRDDConfigurable
25 |
 26 | trait ContextLoader extends ShapeLuceneRDDConfigurable {
27 |
28 | protected val LocationDefaultField: String = getLocationFieldName
29 |
30 | protected lazy val shapeReader: ShapeReader = ctx.getFormats.getReader(getShapeFormat)
31 |
32 | protected lazy val shapeWriter: ShapeWriter = ctx.getFormats.getWriter(getShapeFormat)
33 |
34 | protected def shapeToString(shape: Shape): String = {
35 | val writer = new StringWriter()
36 | shapeWriter.write(writer, shape)
37 | writer.toString
38 | }
39 |
40 | protected def stringToShape(shapeAsString: String): Shape = {
41 | shapeReader.read(new StringReader(shapeAsString))
42 | }
43 |
44 | /**
45 | * The Spatial4j {@link SpatialContext} is a sort of global-ish singleton
46 | * needed by Lucene spatial. It's a facade to the rest of Spatial4j, acting
47 | * as a factory for {@link Shape}s and provides access to reading and writing
48 | * them from Strings.
49 | *
50 | * Quoting from spatial4j (https://github.com/locationtech/spatial4j#getting-started)
51 | *
52 | * "To get a SpatialContext (or just "context" for short), you could use a global singleton
53 | * SpatialContext.GEO or JtsSpatialContext.GEO which both use geodesic surface-of-sphere
54 | * calculations (when available); the JTS one principally adds Polygon support."
55 | */
56 | protected lazy val ctx: JtsSpatialContext = JtsSpatialContext.GEO // SpatialContext.GEO
57 | }
58 |
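A round-trip sketch (hypothetical object; assumes the default `shape.io.format = "WKT"` from reference.conf): shapeToString serializes via shapeWriter and stringToShape parses back via shapeReader.

```scala
import org.locationtech.spatial4j.shape.Shape
import org.zouzias.spark.lucenerdd.spatial.shape.context.ContextLoader

object ContextSketch extends ContextLoader {
  val zurich: Shape = ctx.makePoint(8.54, 47.37)
  val asText: String = shapeToString(zurich) // e.g. "POINT (8.54 47.37)"
  val parsed: Shape = stringToShape(asText)  // parses back to an equal point
}
```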
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/spatial/shape/grids/PrefixTreeLoader.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.spatial.shape.grids
18 |
19 | import org.apache.lucene.spatial.prefix.tree.{SpatialPrefixTree, SpatialPrefixTreeFactory}
20 | import org.zouzias.spark.lucenerdd.config.ShapeLuceneRDDConfigurable
21 | import org.zouzias.spark.lucenerdd.spatial.shape.context.ContextLoader
22 |
23 | import scala.collection.JavaConverters._
24 |
25 | trait PrefixTreeLoader extends ContextLoader
26 | with ShapeLuceneRDDConfigurable {
27 |
 28 |   // Maximum tree levels (e.g., 11 results in sub-meter precision for geohash)
29 | protected val maxLevels: Int = getPrefixTreeMaxLevel
30 |
 31 |   // Expecting 'geohash' or 'quad'
32 | protected val prefixTreeName: String = getPrefixTreeName
33 |
34 | // Maximum distance error (in KM)
35 | protected val prefixTreeMaxDistErr: Double = getPrefixTreeMaxDistErr
36 |
37 | // This can also be constructed from SpatialPrefixTreeFactory
38 | protected val grid: SpatialPrefixTree = SpatialPrefixTreeFactory.makeSPT(
39 | Map("prefixTree" -> prefixTreeName,
40 | "maxLevels" -> maxLevels.toString,
41 | "maxDistErr" -> prefixTreeMaxDistErr.toString).asJava,
42 | ClassLoader.getSystemClassLoader,
43 | ctx)
44 | }
45 |
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/spatial/shape/package.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.spatial
18 |
19 | import java.io.StringReader
20 |
21 | import org.locationtech.jts.geom.{Coordinate, GeometryFactory}
22 | import org.locationtech.spatial4j.shape.Shape
23 | import org.zouzias.spark.lucenerdd.spatial.shape.context.ContextLoader
24 |
25 |
 26 | package object shape extends ContextLoader {
27 |
28 | private val GeometryFactory = new GeometryFactory()
29 |
30 | implicit def convertToPoint(point: (Double, Double)): Shape = {
31 | ctx.makePoint(point._1, point._2)
32 | }
33 |
34 | /**
35 | * ***Experimental***
36 | *
 37 |    * Implicitly converts a shape from its string representation (WKT by default)
 38 |    *
 39 |    * @param shapeAsString Shape encoded as a string, e.g., "POINT (8.5 47.4)"
 40 |    * @return Parsed shape; falls back to the point (0, 0) if parsing fails
41 | */
42 | implicit def WKTToShape(shapeAsString: String): Shape = {
43 | try {
44 | shapeReader.read(new StringReader(shapeAsString))
45 | }
46 | catch {
47 | case e: Exception => ctx.makePoint(0.0, 0.0)
48 | }
49 | }
50 |
51 | implicit def rectangleToShape(rect: (Double, Double, Double, Double)): Shape = {
52 | val minX = rect._1
53 | val maxX = rect._2
54 | val minY = rect._3
55 | val maxY = rect._4
56 | ctx.makeRectangle(minX, maxX, minY, maxY)
57 | }
58 |
59 | implicit def circleToShape(circle: ((Double, Double), Double)): Shape = {
60 | val x = circle._1._1
61 | val y = circle._1._2
62 | val radius = circle._2
63 | ctx.makeCircle(x, y, radius)
64 | }
65 |
 66 |   implicit def listPolygonToShape(points: List[(Double, Double)]): Shape = {
 67 |     val coordinates = points.map(p => new Coordinate(p._1, p._2)).toArray
68 | val polygon = GeometryFactory.createPolygon(coordinates)
69 | ctx.makeShape(polygon)
70 | }
71 |
 72 |   implicit def arrayPolygonToShape(points: Array[(Double, Double)]): Shape = {
 73 |     val coordinates = points.map(p => new Coordinate(p._1, p._2))
74 | val polygon = GeometryFactory.createPolygon(coordinates)
75 | ctx.makeShape(polygon)
76 | }
77 | }
78 |
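The conversions above in a REPL-style sketch (coordinates are illustrative; x is longitude and y is latitude in the GEO context). Note that the polygon conversions delegate to JTS createPolygon, which expects a closed ring whose first and last points coincide.

```scala
import org.locationtech.spatial4j.shape.Shape
import org.zouzias.spark.lucenerdd.spatial.shape._

val point: Shape = (8.54, 47.37)                 // convertToPoint
val bbox: Shape = (8.0, 9.0, 47.0, 48.0)         // (minX, maxX, minY, maxY)
val circle: Shape = ((8.54, 47.37), 10.0)        // ((x, y), radius)
val fromWkt: Shape = "POINT (8.54 47.37)"        // WKTToShape; (0, 0) on failure
val square: Shape = List((0.0, 0.0), (0.0, 1.0), // closed ring for JTS
  (1.0, 1.0), (1.0, 0.0), (0.0, 0.0))
```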
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/spatial/shape/partition/AbstractShapeLuceneRDDPartition.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.spatial.shape.partition
18 |
19 | import org.zouzias.spark.lucenerdd.response.LuceneRDDResponsePartition
20 | import org.zouzias.spark.lucenerdd.spatial.shape.ShapeLuceneRDD.PointType
21 |
22 | import scala.reflect.ClassTag
23 |
24 | private[shape] abstract class AbstractShapeLuceneRDDPartition[K, V] extends Serializable {
25 |
26 | protected implicit def kTag: ClassTag[K]
27 | protected implicit def vTag: ClassTag[V]
28 |
29 | def size: Long
30 |
31 | def iterator: Iterator[(K, V)]
32 |
33 | def isDefined(key: K): Boolean
34 |
35 | def close(): Unit
36 |
37 | /**
38 | * Nearest neighbour search
39 | *
40 | * @param point query point
41 | * @param k number of neighbors to return
42 | * @param searchString Lucene Query string
43 | * @return
44 | */
45 | def knnSearch(point: PointType, k: Int, searchString: String): LuceneRDDResponsePartition
46 |
47 | /**
48 | * Search for points within a circle
49 | *
50 | * @param center center of circle
51 | * @param radius radius of circle in kilometers (KM)
 52 |    * @param k number of points to return
 53 |    * @param operationName spatial operation name, e.g., Intersects, IsWithin
54 | */
55 | def circleSearch(center: PointType, radius: Double, k: Int, operationName: String)
56 | : LuceneRDDResponsePartition
57 |
58 | /**
59 | * Spatial search with arbitrary shape
60 | *
61 | * @param shapeAsString Shape object represented as String
62 | * @param k Number of results to return
 63 |    * @param operationName Operation name, e.g., intersects, within, etc.
64 | * @return
65 | */
66 | def spatialSearch(shapeAsString: String, k: Int, operationName: String)
67 | : LuceneRDDResponsePartition
68 |
69 | /**
70 | * Spatial search with point
71 | *
72 | * @param point Query point
 73 |    * @param k Number of results to return
 74 |    * @param operationName Operation name, e.g., intersects, within, etc.
75 | * @return
76 | */
77 | def spatialSearch(point: PointType, k: Int, operationName: String)
78 | : LuceneRDDResponsePartition
79 |
80 | /**
81 | * Bounding box search with point and radius
82 | *
83 | * @param center given as (x, y)
84 | * @param radius distance from center in kilometers (KM)
85 | * @param k Number of results to return
 86 |    * @param operationName Operation name, e.g., intersects, within, etc.
87 | * @return
88 | */
89 | def bboxSearch(center: PointType, radius: Double, k: Int, operationName: String)
90 | : LuceneRDDResponsePartition
91 |
92 | /**
93 | * Bounding box search with lower left and upper right corners
94 | *
95 | * @param lowerLeft Lower left point
 96 |    * @param upperRight Upper right point
97 | * @param k Number of results
 98 |    * @param operationName Operation name, e.g., intersects, within, etc.
99 | * @return
100 | */
101 | def bboxSearch(lowerLeft: PointType, upperRight: PointType, k: Int, operationName: String)
102 | : LuceneRDDResponsePartition
103 |
104 | /**
105 | * Restricts the entries to those satisfying a predicate
106 | *
107 | * @param pred Predicate to filter on
108 |    * @return Partition restricted to entries satisfying the predicate
109 | */
110 | def filter(pred: (K, V) => Boolean): AbstractShapeLuceneRDDPartition[K, V]
111 | }
112 |
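A sketch of how these partition methods surface through the user-facing ShapeLuceneRDD (the SparkContext `sc` and the match-all query string "*:*" are assumptions):

```scala
import org.zouzias.spark.lucenerdd.spatial.shape._

// Each element pairs an (x, y) point with a payload value
val cities = sc.parallelize(Seq(
  ((8.54, 47.37), "Zurich"),
  ((2.35, 48.85), "Paris")))

val shapes = ShapeLuceneRDD(cities)
// Runs knnSearch on every partition and merges the per-partition results
val neighbors = shapes.knnSearch((8.5, 47.4), 3, "*:*")
```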
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/spatial/shape/strategies/SpatialStrategy.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.spatial.shape.strategies
18 |
19 | import org.apache.lucene.spatial.prefix.{PrefixTreeStrategy, RecursivePrefixTreeStrategy}
20 | import org.zouzias.spark.lucenerdd.spatial.shape.grids.PrefixTreeLoader
21 |
22 | trait SpatialStrategy extends PrefixTreeLoader {
23 |
24 | /**
25 | * The Lucene spatial {@link SpatialStrategy} encapsulates an approach to
26 | * indexing and searching shapes, and providing distance values for them.
27 | * It's a simple API to unify different approaches. You might use more than
28 | * one strategy for a shape as each strategy has its strengths and weaknesses.
29 | *
30 | * Note that these are initialized with a field name.
31 | */
32 | protected val strategy: PrefixTreeStrategy = new RecursivePrefixTreeStrategy(grid,
33 | LocationDefaultField)
34 | }
35 |
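A sketch of what the strategy is for (hypothetical object; SpatialArgs and SpatialOperation are Lucene spatial-extras classes): it turns a shape plus a spatial operation into a Lucene Query, which is what the spatial search methods construct internally.

```scala
import org.apache.lucene.search.Query
import org.apache.lucene.spatial.query.{SpatialArgs, SpatialOperation}
import org.zouzias.spark.lucenerdd.spatial.shape.strategies.SpatialStrategy

object StrategySketch extends SpatialStrategy {
  // "Documents whose indexed shape intersects a ~0.1 degree circle"
  val query: Query = strategy.makeQuery(
    new SpatialArgs(SpatialOperation.Intersects, ctx.makeCircle(8.54, 47.37, 0.1)))
}
```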
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/store/IndexStorable.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.store
18 |
19 | import java.nio.file.{Files, Path}
20 |
21 | import org.apache.lucene.facet.FacetsConfig
22 | import org.apache.lucene.store._
23 | import org.zouzias.spark.lucenerdd.config.Configurable
24 | import org.apache.spark.internal.Logging
25 |
26 | /**
27 | * Storage of a Lucene index Directory
28 | *
29 | * Currently, the following storage methods are supported:
30 | *
 31 |  * 1) "lucenerdd.index.store.mode=disk" : [[MMapDirectory]] on a temporary disk directory
32 | * 2) Otherwise, memory storage using [[RAMDirectory]]
33 | */
34 | trait IndexStorable extends Configurable
35 | with AutoCloseable
36 | with Logging {
37 |
38 | protected lazy val FacetsConfig = new FacetsConfig()
39 |
40 | private val IndexStoreKey = "lucenerdd.index.store.mode"
41 |
42 | private val tmpJavaDir = System.getProperty("java.io.tmpdir")
43 |
44 | private val indexDirName =
45 | s"indexDirectory.${System.currentTimeMillis()}.${Thread.currentThread().getId}"
46 |
47 | private val indexDir = Files.createTempDirectory(indexDirName)
48 |
49 | private val taxonomyDirName =
50 | s"taxonomyDirectory-${System.currentTimeMillis()}.${Thread.currentThread().getId}"
51 |
52 | private val taxonomyDir = Files.createTempDirectory(taxonomyDirName)
53 |
54 | protected val IndexDir = storageMode(indexDir)
55 |
56 | protected val TaxonomyDir = storageMode(taxonomyDir)
57 |
58 | /**
59 | * Select Lucene index storage implementation based on config
60 | * @param directoryPath Directory in disk to store index
 61 |    * @return Lucene Directory (MMapDirectory on disk or RAMDirectory in memory)
62 | */
63 | protected def storageMode(directoryPath: Path): Directory = {
64 | if (Config.hasPath(IndexStoreKey)) {
65 | val storageMode = Config.getString(IndexStoreKey)
66 |
67 | storageMode match {
68 | // TODO: FIX: Currently there is a single lock instance for each directory.
69 | // TODO: Implement better lock handling here
70 | case "disk" => {
71 | logInfo(s"Config parameter ${IndexStoreKey} is set to 'disk'")
 72 |           logInfo("Lucene index will be stored on disk")
73 | logInfo(s"Index disk location ${tmpJavaDir}")
74 | // directoryPath.toFile.deleteOnExit() // Delete on exit
75 | new MMapDirectory(directoryPath, new SingleInstanceLockFactory)
76 | }
77 | case ow =>
78 | logInfo(s"Config parameter ${IndexStoreKey} is set to ${ow}")
 79 |           logInfo("Lucene index will be stored in memory (default)")
80 | logInfo(
81 | """
82 | Quoting from
83 | http://lucene.apache.org/core/7_5_0/core/org/apache/
84 | lucene/store/RAMDirectory.html
85 |
86 | A memory-resident Directory implementation. Locking
87 | implementation is by default the SingleInstanceLockFactory.
88 | Warning: This class is not intended to work with huge indexes.
89 | Everything beyond several hundred megabytes will waste resources
90 | (GC cycles), because it uses an internal buffer size of 1024 bytes,
91 | producing millions of byte[1024] arrays.
92 | This class is optimized for small memory-resident indexes.
93 | It also has bad concurrency on multithreaded environments.
94 |
95 | It is recommended to materialize large indexes on disk and
96 | use MMapDirectory, which is a high-performance directory
97 | implementation working directly on the file system cache of
98 | the operating system, so copying data to Java heap
99 | space is not useful.
100 | """.stripMargin)
101 | new RAMDirectory()
102 | }
103 | }
104 | else {
105 | logInfo(s"Config parameter ${IndexStoreKey} is not set")
106 |       logInfo("Lucene index will be stored on disk")
107 | new MMapDirectory(directoryPath, new SingleInstanceLockFactory)
108 | }
109 | }
110 |
111 | override def close(): Unit = {
112 | IndexDir.close()
113 | TaxonomyDir.close()
114 | }
115 | }
116 |
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/store/IndexWithTaxonomyWriter.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.store
18 |
19 | import org.apache.lucene.analysis.Analyzer
20 | import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper
21 | import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter
22 | import org.apache.lucene.index.IndexWriterConfig.OpenMode
23 | import org.apache.lucene.index.{IndexWriter, IndexWriterConfig}
24 | import org.zouzias.spark.lucenerdd.analyzers.AnalyzerConfigurable
25 |
26 | /**
27 | * Index and Taxonomy Writer used for facet queries
28 | */
29 | trait IndexWithTaxonomyWriter extends IndexStorable
30 | with AnalyzerConfigurable {
31 |
32 | protected def indexAnalyzer(): Analyzer
33 |
34 | protected def indexPerFieldAnalyzer(): PerFieldAnalyzerWrapper
35 |
36 | protected lazy val indexWriter = new IndexWriter(IndexDir,
37 | new IndexWriterConfig(indexPerFieldAnalyzer())
38 | .setOpenMode(OpenMode.CREATE))
39 |
40 | protected lazy val taxoWriter = new DirectoryTaxonomyWriter(TaxonomyDir)
41 |
42 | protected def closeAllWriters(): Unit = {
43 | indexWriter.commit()
44 | taxoWriter.commit()
45 | taxoWriter.close()
46 | indexWriter.close()
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/testing/FavoriteCaseClass.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.testing
18 |
19 | case class FavoriteCaseClass(name: String, age: Int, myLong: Long, myFloat: Float, email: String)
20 |
21 | case class MultivalueFavoriteCaseClass(names: Array[String],
22 | age: Int,
23 | ages: List[Int],
24 | myLong: Long,
25 | myFloat: Float,
26 | email: String)
27 |
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/testing/Person.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.testing
18 |
19 | case class Person(name: String, age: Int, email: String)
20 |
21 |
--------------------------------------------------------------------------------
/src/main/scala/org/zouzias/spark/lucenerdd/versioning/Versionable.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.versioning
18 |
19 | /**
20 | * Reads version from sbt and makes version available to Spark
21 | */
22 | trait Versionable {
23 |
24 | /**
 25 |    * Return project information, i.e., version number, build time, etc.
 26 |    * @return Map of build information (key/value pairs)
27 | */
28 | def version(): Map[String, Any] = {
29 | // BuildInfo is automatically generated using sbt plugin `sbt-buildinfo`
30 | org.zouzias.spark.lucenerdd.BuildInfo.toMap
31 | }
32 | }
33 |
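A tiny sketch (hypothetical object): any class mixing in the trait exposes the sbt-buildinfo map, which typically carries keys such as name, version and sbtVersion.

```scala
import org.zouzias.spark.lucenerdd.versioning.Versionable

object VersionSketch extends Versionable

// e.g. Some("0.x.y"), depending on the build
val buildVersion: Option[Any] = VersionSketch.version().get("version")
```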
--------------------------------------------------------------------------------
/src/test/resources/capitals.txt:
--------------------------------------------------------------------------------
1 | capital
2 | Sukhumi
3 | Kabul
4 | Episkopi Cantonment
5 | Tirana
6 | Algiers
7 | Pago Pago
8 | Andorra la Vella
9 | Luanda
10 | The Valley
11 | St. John's
12 | Buenos Aires
13 | Yerevan
14 | Oranjestad
15 | Georgetown
16 | Canberra
17 | Vienna
18 | Baku
19 | Nassau
20 | Manama
21 | Dhaka
22 | Bridgetown
23 | Minsk
24 | Brussels
25 | Belmopan
26 | Porto-Novo
27 | Hamilton
28 | Thimphu
29 | Sucre
30 | La Paz
31 | Sarajevo
32 | Gaborone
33 | Brasília
34 | Road Town
35 | Bandar Seri Begawan
36 | Sofia
37 | Ouagadougou
38 | Bujumbura
39 | Phnom Penh
40 | Yaoundé
41 | Ottawa
42 | Praia
43 | George Town
44 | Bangui
45 | N'Djamena
46 | Santiago
47 | Beijing
48 | Flying Fish Cove
49 | West Island
50 | Bogotá
51 | Moroni
52 | Avarua
53 | San José
54 | Zagreb
55 | Havana
56 | Willemstad
57 | Nicosia
58 | Prague
59 | Yamoussoukro
60 | Kinshasa
61 | Copenhagen
62 | Djibouti
63 | Roseau
64 | Santo Domingo
65 | Dili
66 | Hanga Roa
67 | Quito
68 | Cairo
69 | San Salvador
70 | Malabo
71 | Asmara
72 | Tallinn
73 | Addis Ababa
74 | Stanley
75 | Tórshavn
76 | Palikir
77 | Suva
78 | Helsinki
79 | Paris
80 | Cayenne
81 | Papeete
82 | Libreville
83 | Banjul
84 | Tbilisi
85 | Berlin
86 | Accra
87 | Gibraltar
88 | Athens
89 | Nuuk
90 | St. George's
91 | Hagåtña
92 | Guatemala City
93 | St. Peter Port
94 | Conakry
95 | Bissau
96 | Georgetown
97 | Port-au-Prince
98 | Tegucigalpa
99 | Budapest
100 | Reykjavík
101 | New Delhi
102 | Jakarta
103 | Tehran
104 | Baghdad
105 | Dublin
106 | Douglas
107 | Jerusalem
108 | Rome
109 | Kingston
110 | Tokyo
111 | St. Helier
112 | Amman
113 | Astana
114 | Nairobi
115 | Tarawa
116 | Pristina
117 | Kuwait City
118 | Bishkek
119 | Vientiane
120 | Riga
121 | Beirut
122 | Maseru
123 | Monrovia
124 | Tripoli
125 | Vaduz
126 | Vilnius
127 | Luxembourg
128 | Skopje
129 | Antananarivo
130 | Lilongwe
131 | Kuala Lumpur
132 | Malé
133 | Bamako
134 | Valletta
135 | Majuro
136 | Nouakchott
137 | Port Louis
138 | Mexico City
139 | Chisinau
140 | Monaco
141 | Ulaanbaatar
142 | Podgorica
143 | Plymouth
144 | Rabat
145 | Maputo
146 | Naypyidaw
147 | Stepanakert
148 | Windhoek
149 | Yaren
150 | Kathmandu
151 | Amsterdam
152 | Nouméa
153 | Wellington
154 | Managua
155 | Niamey
156 | Abuja
157 | Alofi
158 | Kingston
159 | Pyongyang
160 | Nicosia
161 | Belfast
162 | Saipan
163 | Oslo
164 | Muscat
165 | Islamabad
166 | Ngerulmud
167 | Jerusalem
168 | Panama City
169 | Port Moresby
170 | Asunción
171 | Lima
172 | Manila
173 | Adamstown
174 | Warsaw
175 | Lisbon
176 | San Juan
177 | Doha
178 | Taipei
179 | Brazzaville
180 | Bucharest
181 | Moscow
182 | Kigali
183 | Gustavia
184 | Jamestown
185 | Basseterre
186 | Castries
187 | Marigot
188 | St. Pierre
189 | Kingstown
190 | Apia
191 | San Marino
192 | Riyadh
193 | Edinburgh
194 | Dakar
195 | Belgrade
196 | Victoria
197 | Freetown
198 | Singapore
199 | Philipsburg
200 | Bratislava
201 | Ljubljana
202 | Honiara
203 | Mogadishu
204 | Hargeisa
205 | Pretoria
206 | Grytviken
207 | Seoul
208 | Tskhinvali
209 | Juba
210 | Madrid
211 | Sri Jayawardenapura Kotte
212 | Khartoum
213 | Paramaribo
214 | Mbabane
215 | Stockholm
216 | Bern
217 | Damascus
218 | São Tomé
219 | Dushanbe
220 | Dodoma
221 | Bangkok
222 | Lomé
223 | Nukuʻalofa
224 | Tiraspol
225 | Port of Spain
226 | Edinburgh of the Seven Seas
227 | Tunis
228 | Ankara
229 | Ashgabat
230 | Cockburn Town
231 | Funafuti
232 | Kampala
233 | Kiev
234 | Abu Dhabi
235 | London
236 | Washington
237 | Charlotte Amalie
238 | Montevideo
239 | Tashkent
240 | Port Vila
241 | Vatican City
242 | Caracas
243 | Hanoi
244 | Cardiff
245 | Mata-Utu
246 | El Aaiún
247 | Sanaá
248 | Lusaka
249 | Harare
250 |
--------------------------------------------------------------------------------
/src/test/resources/countries.txt:
--------------------------------------------------------------------------------
1 | Abkhazia
2 | Afghanistan
3 | Akrotiri and Dhekelia
4 | Albania
5 | Algeria
6 | American Samoa
7 | Andorra
8 | Angola
9 | Anguilla
10 | Antigua and Barbuda
11 | Argentina
12 | Armenia
13 | Aruba
14 | Ascension Island
15 | Australia
16 | Austria
17 | Azerbaijan
18 | Bahamas
19 | Bahrain
20 | Bangladesh
21 | Barbados
22 | Belarus
23 | Belgium
24 | Belize
25 | Benin
26 | Bermuda
27 | Bhutan
28 | Bolivia
29 | Bolivia
30 | Bosnia and Herzegovina
31 | Botswana
32 | Brazil
33 | British Virgin Islands
34 | Brunei
35 | Bulgaria
36 | Burkina Faso
37 | Burundi
38 | Cambodia
39 | Cameroon
40 | Canada
41 | Cape Verde
42 | Cayman Islands
43 | Central African Republic
44 | Chad
45 | Chile
46 | China
47 | Christmas Island
48 | Cocos Islands
49 | Colombia
50 | Comoros
51 | Cook Islands
52 | Costa Rica
53 | Croatia
54 | Cuba
55 | Curaçao
56 | Cyprus
57 | Czech Republic
58 | Côte d'Ivoire
59 | Democratic Republic of the Congo
60 | Denmark
61 | Djibouti
62 | Dominica
63 | Dominican Republic
64 | East Timor
65 | Easter Island
66 | Ecuador
67 | Egypt
68 | El Salvador
69 | Equatorial Guinea
70 | Eritrea
71 | Estonia
72 | Ethiopia
73 | Falkland Islands
74 | Faroe Islands
75 | Federated States of Micronesia
76 | Fiji
77 | Finland
78 | France
79 | French Guiana
80 | French Polynesia
81 | Gabon
82 | Gambia
83 | Georgia
84 | Germany
85 | Ghana
86 | Gibraltar
87 | Greece
88 | Greenland
89 | Grenada
90 | Guam
91 | Guatemala
92 | Guernsey
93 | Guinea
94 | Guinea-Bissau
95 | Guyana
96 | Haiti
97 | Honduras
98 | Hungary
99 | Iceland
100 | India
101 | Indonesia
102 | Iran
103 | Iraq
104 | Ireland
105 | Isle of Man
106 | Israel
107 | Italy
108 | Jamaica
109 | Japan
110 | Jersey
111 | Jordan
112 | Kazakhstan
113 | Kenya
114 | Kiribati
115 | Kosovo
116 | Kuwait
117 | Kyrgyzstan
118 | Laos
119 | Latvia
120 | Lebanon
121 | Lesotho
122 | Liberia
123 | Libya
124 | Liechtenstein
125 | Lithuania
126 | Luxembourg
127 | Macedonia
128 | Madagascar
129 | Malawi
130 | Malaysia
131 | Maldives
132 | Mali
133 | Malta
134 | Marshall Islands
135 | Mauritania
136 | Mauritius
137 | Mexico
138 | Moldova
139 | Monaco
140 | Mongolia
141 | Montenegro
142 | Montserrat
143 | Morocco
144 | Mozambique
145 | Myanmar
146 | Nagorno-Karabakh Republic
147 | Namibia
148 | Nauru
149 | Nepal
150 | Netherlands
151 | New Caledonia
152 | New Zealand
153 | Nicaragua
154 | Niger
155 | Nigeria
156 | Niue
157 | Norfolk Island
158 | North Korea
159 | Northern Cyprus
160 | United Kingdom Northern Ireland
161 | Northern Mariana Islands
162 | Norway
163 | Oman
164 | Pakistan
165 | Palau
166 | Palestine
167 | Panama
168 | Papua New Guinea
169 | Paraguay
170 | Peru
171 | Philippines
172 | Pitcairn Islands
173 | Poland
174 | Portugal
175 | Puerto Rico
176 | Qatar
177 | Taiwan
178 | Republic of the Congo
179 | Romania
180 | Russia
181 | Rwanda
182 | Saint Barthélemy
183 | Saint Helena
184 | Saint Kitts and Nevis
185 | Saint Lucia
186 | Saint Martin
187 | Saint Pierre and Miquelon
188 | Saint Vincent and the Grenadines
189 | Samoa
190 | San Marino
191 | Saudi Arabia
192 | Scotland
193 | Senegal
194 | Serbia
195 | Seychelles
196 | Sierra Leone
197 | Singapore
198 | Sint Maarten
199 | Slovakia
200 | Slovenia
201 | Solomon Islands
202 | Somalia
203 | Somaliland
204 | South Africa
205 | South Georgia and the South Sandwich Islands
206 | South Korea
207 | South Ossetia
208 | South Sudan South Sudan
209 | Spain
210 | Sri Lanka
211 | Sudan
212 | Suriname
213 | Swaziland
214 | Sweden
215 | Switzerland
216 | Syria
217 | São Tomé and Príncipe
218 | Tajikistan
219 | Tanzania
220 | Thailand
221 | Togo
222 | Tonga
223 | Transnistria
224 | Trinidad and Tobago
225 | Tristan da Cunha
226 | Tunisia
227 | Turkey
228 | Turkmenistan
229 | Turks and Caicos Islands
230 | Tuvalu
231 | Uganda
232 | Ukraine
233 | United Arab Emirates
234 | United Kingdom; England
235 | United States
236 | United States Virgin Islands
237 | Uruguay
238 | Uzbekistan
239 | Vanuatu
240 | Vatican City
241 | Venezuela
242 | Vietnam
243 | Wales
244 | Wallis and Futuna
245 | Western Sahara
246 | Yemen
247 | Zambia
248 | Zimbabwe
249 |
--------------------------------------------------------------------------------
/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | # Set everything to be logged to the file core/target/unit-tests.log
19 | log4j.rootCategory=INFO, file
20 | log4j.appender.file=org.apache.log4j.FileAppender
21 | log4j.appender.file.append=false
22 | log4j.appender.file.file=target/unit-tests.log
23 | log4j.appender.file.layout=org.apache.log4j.PatternLayout
24 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %p %c{1}: %m%n
--------------------------------------------------------------------------------
/src/test/resources/reference.conf:
--------------------------------------------------------------------------------
1 | lucenerdd {
2 |
3 | // Name of analyzer as it is under Lucene's package org.apache.lucene.analysis.XX
4 | analyzer.name = "en"
5 |
  6 |   // The following options apply only when analyzer.name = "ngram"
7 | analyzer {
8 | ngram.mingram = 2
9 | ngram.maxgram = 5
10 | }
11 |
 12 |   // Similarity scoring for Lucene
13 | similarity.name = "bm25" // anything else will default to Lucene classic similarity
14 |
15 | // Supported linkage methods
16 | // "collectbroadcast" : Collects the RDD that contains the queries (to be used only if query RDD
17 | // fits in spark driver's memory)
18 | //
19 | // "cartesian" : Uses cartesian product between the partitions of the queries RDD and the partitions
20 | // of LuceneRDD. Note it duplicates each partition of LuceneRDD n times where n is the number of
21 | // partitions of the queries RDD.
22 | linker.method = "collectbroadcast"
23 |
24 | index {
25 |
26 | // Lucene index storage
27 | // Use 'disk' to store the index in Java's temp directory
28 | // Otherwise the index will be stored in memory
29 | store.mode = "disk"
30 |
 31 |     stringfields {
32 |
33 | // Analyze text fields or not
34 | analyzed = true
35 |
36 | // Text fields options as in org.apache.lucene.index.IndexOptions
37 | //
38 | // Other options are:
39 | // "DOCS"
40 | // "DOCS_AND_FREQS"
41 | // "DOCS_AND_FREQS_AND_POSITIONS"
42 | // "DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS"
43 | // "NONE"
44 | options = "docs_and_freqs_and_positions_and_offsets"
45 |
46 | // Omit terms norms
47 | terms.omitnorms = false
48 |
49 | // Store term positions
50 | terms.positions = false
51 |
52 | // Store Term vectors (set true, otherwise LuceneRDD.termVectors(fieldName) will fail)
53 | terms.vectors = true
54 | }
55 | }
56 |
57 | // Maximum value on topK queries
58 | query.topk.maxvalue = 100
59 | // Default value of number of returned results
60 | query.topk.default = 10
61 |
62 | // Default value of number of faceted results
63 | query.facets.number.default = 10
64 |
65 | // Spatial related configurations used by ShapeLuceneRDD
66 | spatial {
67 | prefixtree {
68 | name = "quad" // "geohash" or "quad"
69 | maxlevel = 9 // 11 results in sub-meter precision for geohash
70 | maxDistErr = 5.0 // in kilometers
71 | }
72 |
73 | // Shape format can be one of ShapeIO.GeoJSON, ShapeIO.LEGACY, ShapeIO.POLY, ShapeIO.WKT
74 | shape.io.format = "WKT"
75 |
76 | // Supported linkage methods
77 | // "collectbroadcast" : Collects the RDD that contains the queries (to be used only if query RDD
78 | // fits in spark driver's memory)
79 | //
80 | // "cartesian" : Uses cartesian product between the partitions of the queries RDD and the partitions
81 | // of LuceneRDD. Note it duplicates each partition of LuceneRDD n times where n is the number of
82 | // partitions of the queries RDD.
83 | linker.method = "collectbroadcast"
84 | }
85 | }
86 |
--------------------------------------------------------------------------------
/src/test/scala/org/zouzias/spark/lucenerdd/BlockingDedupSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd
18 |
19 | import com.holdenkarau.spark.testing.SharedSparkContext
20 | import org.apache.lucene.index.Term
21 | import org.apache.lucene.search.{Query, TermQuery}
22 | import org.apache.spark.SparkConf
23 | import org.apache.spark.sql.{Row, SparkSession}
24 | import org.scalatest.BeforeAndAfterEach
25 | import org.scalatest.flatspec.AnyFlatSpec
26 | import org.scalatest._
27 | import matchers.should._
28 |
29 | import org.zouzias.spark.lucenerdd.testing.Person
30 |
31 | class BlockingDedupSpec extends AnyFlatSpec
32 | with Matchers
33 | with BeforeAndAfterEach
34 | with SharedSparkContext {
35 |
36 | override val conf: SparkConf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf().
37 | setMaster("local[*]").
38 | setAppName("test").
39 | set("spark.ui.enabled", "false").
40 | set("spark.app.id", appID))
41 |
42 | "LuceneRDD.blockDedup" should "deduplicate elements on unique elements" in {
43 | val spark = SparkSession.builder().getOrCreate()
44 | import spark.implicits._
45 |
46 | val people: Array[Person] = Array("fear", "death", "water", "fire", "house")
47 | .zipWithIndex.map { case (str, index) =>
48 | val email = if (index % 2 == 0) "yes@gmail.com" else "no@gmail.com"
49 | Person(str, index, email)
50 | }
51 | val df = sc.parallelize(people).repartition(2).toDF()
52 |
53 | val linker: Row => Query = { row =>
54 | val name = row.getString(row.fieldIndex("name"))
55 | val term = new Term("name", name)
56 |
57 | new TermQuery(term)
58 | }
59 |
60 |
61 | val linked = LuceneRDD.blockDedup(df, linker, Array("email"))
62 |
 63 |     val (linkedCount, dfCount) = (linked.count(), df.count())
64 |
65 | linkedCount should equal(dfCount)
66 |
67 | // Check for correctness
68 | // Age is a unique index
69 | linked.collect().foreach { case (row, results) =>
 70 |       val (leftAge, rightAge) = (row.getInt(row.fieldIndex("age")),
 71 |         results.headOption.map(x => x.getInt(x.fieldIndex("age"))))
 72 | 
 73 |       rightAge should equal(Some(leftAge))
74 |
75 | }
76 | }
77 | }
78 |
--------------------------------------------------------------------------------
/src/test/scala/org/zouzias/spark/lucenerdd/BlockingLinkageSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd
18 |
19 | import com.holdenkarau.spark.testing.SharedSparkContext
20 | import org.apache.lucene.index.Term
21 | import org.apache.lucene.search.{Query, TermQuery}
22 | import org.apache.spark.SparkConf
23 | import org.apache.spark.sql.{Row, SparkSession}
24 | import org.scalatest.BeforeAndAfterEach
25 | import org.scalatest.flatspec.AnyFlatSpec
26 | import org.scalatest._
27 | import matchers.should._
28 | import org.zouzias.spark.lucenerdd.testing.Person
29 |
30 | class BlockingLinkageSpec extends AnyFlatSpec
31 | with Matchers
32 | with BeforeAndAfterEach
33 | with SharedSparkContext {
34 |
35 | override val conf: SparkConf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf().
36 | setMaster("local[*]").
37 | setAppName("test").
38 | set("spark.ui.enabled", "false").
39 | set("spark.app.id", appID))
40 |
 41 |   "LuceneRDD.blockEntityLinkage" should "link elements on unique elements" in {
42 | val spark = SparkSession.builder().getOrCreate()
43 | import spark.implicits._
44 |
45 | val peopleLeft: Array[Person] = Array("fear", "death", "water", "fire", "house")
46 | .zipWithIndex.map { case (str, index) =>
47 | val email = if (index % 2 == 0) "yes@gmail.com" else "no@gmail.com"
48 | Person(str, index, email)
49 | }
50 |
51 | val peopleRight: Array[Person] = Array("fear", "death", "water", "fire", "house")
52 | .zipWithIndex.map { case (str, index) =>
53 | val email = if (index % 2 == 0) "yes@gmail.com" else "no@gmail.com"
54 | Person(str, index, email)
55 | }
56 |
57 | val leftDF = sc.parallelize(peopleLeft).repartition(2).toDF()
58 | val rightDF = sc.parallelize(peopleRight).repartition(3).toDF()
59 |
60 | // Define a Lucene Term linker
61 | val linker: Row => Query = { row =>
62 | val name = row.getString(row.fieldIndex("name"))
63 | val term = new Term("name", name)
64 |
65 | new TermQuery(term)
66 | }
67 |
68 |
69 | val linked = LuceneRDD.blockEntityLinkage(leftDF, rightDF, linker,
70 | Array("email"), Array("email"))
71 |
 72 |     val (linkedCount, dfCount) = (linked.count(), leftDF.count())
73 |
74 | linkedCount should equal(dfCount)
75 |
76 | // Check for correctness
77 | // Age is a unique index
78 | linked.collect().foreach { case (row, results) =>
 79 |       val (leftAge, rightAge) = (row.getInt(row.fieldIndex("age")),
 80 |         results.headOption.map(x => x.getInt(x.fieldIndex("age"))))
 81 | 
 82 |       rightAge should equal(Some(leftAge))
83 |
84 | }
85 | }
86 | }
87 |
--------------------------------------------------------------------------------
/src/test/scala/org/zouzias/spark/lucenerdd/LuceneDocToSparkRowpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd
18 |
19 | import java.io.{Reader, StringReader}
20 |
21 | import org.apache.lucene.document.{Document, DoublePoint, Field, FloatPoint, IntPoint, LongPoint, StoredField, TextField}
22 | import org.zouzias.spark.lucenerdd.models.SparkScoreDoc
23 | import org.zouzias.spark.lucenerdd.models.SparkScoreDoc.{DocIdField, ScoreField, ShardField}
24 |
25 | import org.scalatest.BeforeAndAfterEach
26 | import org.scalatest.flatspec.AnyFlatSpec
27 | import org.scalatest._
28 | import matchers.should._
29 |
30 |
31 | import scala.collection.JavaConverters._
32 |
33 | class LuceneDocToSparkRowpec extends AnyFlatSpec
34 | with Matchers
35 | with BeforeAndAfterEach {
36 |
37 | val (score: Float, docId: Int, shardIndex: Int) = (1.0f, 1, 2)
38 | val float: Float = 20.001f
39 | val double: Double = 10.1000000001D
40 |
41 | def generateDoc(): Document = {
42 | val doc = new Document()
43 |
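// Lucene Point fields (LongPoint, IntPoint, FloatPoint, DoublePoint) are index-only,
// so each value is also added as a StoredField to make it retrievable from results.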
44 | // Add long field
45 | doc.add(new LongPoint("longField", 10))
46 | doc.add(new StoredField("longField", 10))
47 |
48 | doc.add(new FloatPoint("floatField", float))
49 | doc.add(new StoredField("floatField", float))
50 |
51 | doc.add(new IntPoint("intField", 9))
52 | doc.add(new StoredField("intField", 9))
53 |
54 | doc.add(new DoublePoint("doubleField", double))
55 | doc.add(new StoredField("doubleField", double))
56 |
57 | doc.add(new TextField("textField", "hello world", Field.Store.NO))
58 | doc.add(new StoredField("textField", "hello world"))
59 |
60 | doc
61 | }
62 |
63 | private val doc: Document = generateDoc()
64 |
65 | val sparkScoreDoc = SparkScoreDoc(score, docId, shardIndex, doc)
66 |
67 |
68 | "SparkScoreDoc.toRow" should "return correct score" in {
69 | val row = sparkScoreDoc.toRow()
70 | row.getFloat(row.fieldIndex(ScoreField)) should equal(score)
71 | }
72 |
73 | "SparkScoreDoc.toRow" should "return correct docId" in {
74 | val row = sparkScoreDoc.toRow()
75 | row.getInt(row.fieldIndex(DocIdField)) should equal(docId)
76 | }
77 |
78 | "SparkScoreDoc.toRow" should "return correct shard number" in {
79 | val row = sparkScoreDoc.toRow()
80 | row.getInt(row.fieldIndex(ShardField)) should equal(shardIndex)
81 | }
82 |
83 | "SparkScoreDoc.toRow" should "return correct number of fields" in {
84 | val row = sparkScoreDoc.toRow()
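// 8 = the five stored document fields plus the docId, score and shard metadata
// fields that toRow() is assumed to append.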
85 | row.getFields().asScala.count(_.fieldType().stored()) should equal(8)
86 | }
87 |
88 | "SparkScoreDoc.toRow" should "set correctly DoublePoint" in {
89 | val row = sparkScoreDoc.toRow()
90 | row.getDouble(row.fieldIndex("doubleField")) should equal(double)
91 | }
92 |
93 | "SparkScoreDoc.toRow" should "set correctly FloatPoint" in {
94 | val row = sparkScoreDoc.toRow()
95 | row.getFloat(row.fieldIndex("floatField")) should equal(float)
96 | }
97 | }
98 |
--------------------------------------------------------------------------------
/src/test/scala/org/zouzias/spark/lucenerdd/LucenePrimitiveTypesSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd
18 |
19 | import com.holdenkarau.spark.testing.SharedSparkContext
20 | import org.apache.spark.SparkConf
21 | import org.scalatest.BeforeAndAfterEach
22 | import org.scalatest.flatspec.AnyFlatSpec
23 | import org.scalatest._
24 | import matchers.should._
25 |
26 |
27 | class LucenePrimitiveTypesSpec extends AnyFlatSpec with Matchers
28 | with BeforeAndAfterEach
29 | with SharedSparkContext {
30 |
31 | override val conf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf().
32 | setMaster("local[*]").
33 | setAppName("test").
34 | set("spark.ui.enabled", "false").
35 | set("spark.app.id", appID))
36 |
37 | def randomString(length: Int): String = scala.util.Random.alphanumeric.take(length).mkString
38 | val array = (1 to 24).map(randomString(_))
39 |
40 | var luceneRDD: LuceneRDD[_] = _
41 |
42 | override def afterEach() {
43 | luceneRDD.close()
44 | }
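// close() releases each partition's Lucene index resources between tests.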
45 |
46 | /**
47 | "LuceneRDD" should "work with RDD[List[String]]" in {
48 | val array = Array(List("aaa", "aaa2"), List("bbb", "bbb2"),
49 | List("ccc", "ccc2"), List("ddd"), List("eee"))
50 | val rdd = sc.parallelize(array)
51 | luceneRDD = LuceneRDD(rdd)
52 | luceneRDD.count should be (array.length)
53 | }
54 | */
55 |
56 | "LuceneRDD" should "work with RDD[Array[String]]" in {
57 | val array = Array(Array("aaa", "aaa2"), Array("bbb", "bbb2"),
58 | Array("ccc", "ccc2"), Array("ddd"), Array("eee"))
59 | val rdd = sc.parallelize(array)
60 | luceneRDD = LuceneRDD(rdd)
61 | luceneRDD.count should be (array.length)
62 | }
63 |
64 | "LuceneRDD" should "work with RDD[Set[String]]" in {
65 | val array = Array(Set("aaa", "aaa2"), Set("bbb", "bbb2"),
66 | Set("ccc", "ccc2"), Set("ddd"), Set("eee"))
67 | val rdd = sc.parallelize(array)
68 | luceneRDD = LuceneRDD(rdd)
69 | luceneRDD.count should be (array.length)
70 | }
71 |
72 | "LuceneRDD" should "work with RDD[String]" in {
73 | val array = Array("aaa", "bbb", "ccc", "ddd", "eee")
74 | val rdd = sc.parallelize(array)
75 | luceneRDD = LuceneRDD(rdd)
76 | luceneRDD.count should be (array.length)
77 | }
78 |
79 | "LuceneRDD" should "work with RDD[Int]" in {
80 | val array = (1 to 22)
81 | val rdd = sc.parallelize(array)
82 | luceneRDD = LuceneRDD(rdd)
83 | luceneRDD.count should be (array.size)
84 | }
85 |
86 | "LuceneRDD" should "work with RDD[Float]" in {
87 | val array: IndexedSeq[Float] = (1 to 22).map(_.toFloat)
88 | val rdd = sc.parallelize(array)
89 | luceneRDD = LuceneRDD(rdd)
90 | luceneRDD.count should be (array.size)
91 | }
92 |
93 | "LuceneRDD" should "work with RDD[Double]" in {
94 | val array: IndexedSeq[Double] = (1 to 22).map(_.toDouble)
95 | val rdd = sc.parallelize(array)
96 | luceneRDD = LuceneRDD(rdd)
97 | luceneRDD.count should be (array.size)
98 | }
99 |
100 | "LuceneRDD" should "work with RDD[Long]" in {
101 | val array: IndexedSeq[Long] = (1 to 22).map(_.toLong)
102 | val rdd = sc.parallelize(array)
103 | luceneRDD = LuceneRDD(rdd)
104 | luceneRDD.count should equal (array.size)
105 | }
106 |
107 | "LuceneRDD" should "work with RDD[Map[String, String]]" in {
108 | val maps = List(Map( "a" -> "hello"), Map("b" -> "world"), Map("c" -> "how are you"))
109 | val rdd = sc.parallelize(maps)
110 | luceneRDD = LuceneRDD(rdd)
111 | luceneRDD.count should equal (maps.size)
112 | luceneRDD.termQuery("a", "hello").isEmpty() should equal (false)
113 | luceneRDD.prefixQuery("b", "wor").isEmpty() should equal (false)
114 | luceneRDD.prefixQuery("a", "no").isEmpty() should equal (true)
115 | }
116 |
117 | "LuceneRDD" should "work with RDD[String] and ignore null values" in {
118 | val array = Array("aaa", null, "ccc", null, "eee")
119 | val rdd = sc.parallelize(array)
120 | luceneRDD = LuceneRDD(rdd)
121 | luceneRDD.count should be (array.length)
122 | }
123 |
124 | }
--------------------------------------------------------------------------------
/src/test/scala/org/zouzias/spark/lucenerdd/LuceneRDDCustomCaseClassImplicitsSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd
18 |
19 | import com.holdenkarau.spark.testing.SharedSparkContext
20 | import org.apache.spark.SparkConf
21 | import org.zouzias.spark.lucenerdd.testing.Person
22 | import org.scalatest.BeforeAndAfterEach
23 | import org.scalatest.flatspec.AnyFlatSpec
24 | import org.scalatest._
25 | import matchers.should._
26 |
27 |
28 | class LuceneRDDCustomCaseClassImplicitsSpec extends AnyFlatSpec
29 | with Matchers
30 | with BeforeAndAfterEach
31 | with SharedSparkContext {
32 |
33 | var luceneRDD: LuceneRDD[_] = _
34 |
35 | override def afterEach() {
36 | luceneRDD.close()
37 | }
38 |
39 | override val conf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf().
40 | setMaster("local[*]").
41 | setAppName("test").
42 | set("spark.ui.enabled", "false").
43 | set("spark.app.id", appID))
44 |
45 | val elem: Array[Person] = Array("fear", "death", "water", "fire", "house")
46 | .zipWithIndex.map{ case (str, index) => Person(str, index, s"${str}@gmail.com")}
47 |
48 | "LuceneRDD(case class).count" should "handle nulls properly" in {
49 | val elemsWithNulls = Array("fear", "death", "water", "fire", "house")
50 | .zipWithIndex.map{ case (str, index) => Person(str, index, null)}
51 | val rdd = sc.parallelize(elemsWithNulls)
52 | luceneRDD = LuceneRDD(rdd)
53 | luceneRDD.count() should equal (elemsWithNulls.length)
54 | }
55 |
56 | "LuceneRDD(case class).count" should "return correct number of elements" in {
57 | val rdd = sc.parallelize(elem)
58 | luceneRDD = LuceneRDD(rdd)
59 | luceneRDD.count() should equal (elem.length)
60 | }
61 |
62 | "LuceneRDD(case class).fields" should "return all fields" in {
63 | val rdd = sc.parallelize(elem)
64 | luceneRDD = LuceneRDD(rdd)
65 |
66 | luceneRDD.fields().size should equal(3)
67 | luceneRDD.fields().contains("name") should equal(true)
68 | luceneRDD.fields().contains("age") should equal(true)
69 | luceneRDD.fields().contains("email") should equal(true)
70 | }
71 |
72 | "LuceneRDD(case class).termQuery" should "correctly search with TermQueries" in {
73 | val rdd = sc.parallelize(elem)
74 | luceneRDD = LuceneRDD(rdd)
75 |
76 | val results = luceneRDD.termQuery("name", "water")
77 | results.count() should equal(1)
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
/src/test/scala/org/zouzias/spark/lucenerdd/LuceneRDDMoreLikeThisSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd
18 |
19 | import com.holdenkarau.spark.testing.SharedSparkContext
20 | import org.apache.spark.SparkConf
21 | import scala.collection.JavaConverters._
22 | import org.scalatest.BeforeAndAfterEach
23 | import org.scalatest.flatspec.AnyFlatSpec
24 | import org.scalatest._
25 | import matchers.should._
26 |
27 |
28 | import scala.io.Source
29 |
30 | class LuceneRDDMoreLikeThisSpec extends AnyFlatSpec
31 | with Matchers
32 | with BeforeAndAfterEach
33 | with SharedSparkContext {
34 |
35 | var luceneRDD: LuceneRDD[_] = _
36 |
37 |
38 | override val conf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf().
39 | setMaster("local[*]").
40 | setAppName("test").
41 | set("spark.ui.enabled", "false").
42 | set("spark.app.id", appID))
43 |
44 | override def afterEach() {
45 | luceneRDD.close()
46 | }
47 |
48 | "LuceneRDD.moreLikeThis" should "return relevant documents" in {
49 | val words: Seq[String] = Source.fromFile("src/test/resources/alice.txt")
50 | .getLines().map(_.toLowerCase).toSeq
51 | val rdd = sc.parallelize(words)
52 | luceneRDD = LuceneRDD(rdd)
53 | val results = luceneRDD
54 | .moreLikeThis("_1", "alice adventures wonderland", 1, 1)
55 | .collect()
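// The two trailing arguments are assumed to be MoreLikeThis' minTermFreq and
// minDocFreq thresholds, both relaxed to 1 for this tiny corpus.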
56 |
57 | results.length > 0 should equal(true)
58 | val firstDoc = results.head
59 | val x = firstDoc.getString(firstDoc.fieldIndex("_1"))
60 |
61 | x.contains("alice") &&
62 | x.contains("wonderland") &&
63 | x.contains("adventures") should equal(true)
64 |
65 | val lastDoc = results.last
66 | val y = lastDoc.getString(lastDoc.fieldIndex("_1"))
67 |
68 |
69 | y.contains("alice") &&
70 | !y.contains("wonderland") &&
71 | !y.contains("adventures") should equal(true)
72 |
73 | }
74 | }
75 |
--------------------------------------------------------------------------------
/src/test/scala/org/zouzias/spark/lucenerdd/LuceneRDDSearchSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd
18 |
19 | import com.holdenkarau.spark.testing.SharedSparkContext
20 | import org.apache.spark.SparkConf
21 | import org.scalatest.BeforeAndAfterEach
22 | import org.scalatest.flatspec.AnyFlatSpec
23 | import org.scalatest._
24 | import matchers.should._
25 | import org.zouzias.spark.lucenerdd.testing.LuceneRDDTestUtils
26 |
27 | class LuceneRDDSearchSpec extends AnyFlatSpec
28 | with Matchers
29 | with BeforeAndAfterEach
30 | with LuceneRDDTestUtils
31 | with SharedSparkContext {
32 |
33 | var luceneRDD: LuceneRDD[_] = _
34 |
35 | override def Radius: Double = 0
36 |
37 | override val conf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf().
38 | setMaster("local[*]").
39 | setAppName("test").
40 | set("spark.ui.enabled", "false").
41 | set("spark.app.id", appID))
42 |
43 | override def afterEach() {
44 | luceneRDD.close()
45 | }
46 |
47 |
48 | val First = "_1"
49 |
50 | val array = List("fear", "death", " apologies", "romance", "tree", "fashion", "fascism")
51 |
52 | "LuceneRDD.query" should "use phrase query syntax" in {
53 | val words = Array("aabaa", "aaacaa", "aadaa", "aaaa", "qwerty")
54 | val rdd = sc.parallelize(words)
55 | luceneRDD = LuceneRDD(rdd)
56 | luceneRDD.query("_1:aadaa").isEmpty() should equal (false)
57 | luceneRDD.query("_1:aa*").count() should equal (4)
58 | luceneRDD.query("_1:q*").count() should equal (1)
59 | }
60 |
61 | "LuceneRDD.count" should "return correct number of elements" in {
62 | val rdd = sc.parallelize(array)
63 | luceneRDD = LuceneRDD(rdd)
64 | luceneRDD.count should equal (array.size)
65 | }
66 |
67 | "LuceneRDD.termQuery" should "correctly search with TermQueries" in {
68 | val rdd = sc.parallelize(array)
69 | luceneRDD = LuceneRDD(rdd)
70 | val results = luceneRDD.termQuery(First, array(1))
71 | results.count() should equal (1)
72 | }
73 |
74 | "LuceneRDD.prefixQuery" should "correctly search with PrefixQueries" in {
75 |
76 | val prefixes = Array("aaaabcd", "aaadcb", "aaz", "az", "qwerty")
77 | val rdd = sc.parallelize(prefixes)
78 | luceneRDD = LuceneRDD(rdd)
79 |
80 | luceneRDD.prefixQuery(First, "a").count() should equal (4)
81 | luceneRDD.prefixQuery(First, "aa").count() should equal(3)
82 | luceneRDD.prefixQuery(First, "aaa").count() should equal (2)
83 | luceneRDD.prefixQuery(First, "aaaa").count() should equal (1)
84 | }
85 |
86 | "LuceneRDD.fuzzyQuery" should "correctly search with FuzzyQuery" in {
87 | val rdd = sc.parallelize(array)
88 | luceneRDD = LuceneRDD(rdd)
89 |
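// Each misspelling below is within edit distance 1 of an indexed word,
// e.g. "fascsm" -> "fascism" and "dath" -> "death".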
90 | luceneRDD.fuzzyQuery(First, "fear", 1).count() should equal (1)
91 | luceneRDD.fuzzyQuery(First, "fascsm", 1).count() should equal(1)
92 | luceneRDD.fuzzyQuery(First, "dath", 1).count() should equal (1)
93 | luceneRDD.fuzzyQuery(First, "tree", 1).count() should equal (1)
94 | }
95 |
96 | /*
97 | "LuceneRDD.fuzzyQuery" should "correctly search for Bern in Cities dataset" in {
98 | val cities = Source.fromFile("src/test/resources/cities.txt").getLines().toSeq
99 | val rdd = sc.parallelize(cities)
100 | luceneRDD = LuceneRDD(rdd)
101 |
102 | val results = luceneRDD.fuzzyQuery(First, "Bern", 1).collect()
103 |
104 | // First result must be Bern
105 | results.headOption
106 | .forall( first => first.doc.textField(First).contains("Bern")) should equal(true)
107 |
108 | // Results must be sorted (descending)
109 | sortedDescSparkScoreDocs(results) should equal(true)
110 | }
111 | */
112 |
113 | "LuceneRDD.phraseQuery" should "correctly search with PhraseQuery" in {
114 | val phrases = Array("hello world", "the company name was", "highlight lucene")
115 | val rdd = sc.parallelize(phrases)
116 | luceneRDD = LuceneRDD(rdd)
117 |
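// The third argument is assumed to cap the number of hits returned (topK).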
118 | luceneRDD.phraseQuery(First, "company name", 10).count() should equal (1)
119 | luceneRDD.phraseQuery(First, "hello world", 10).count() should equal (1)
120 | luceneRDD.phraseQuery(First, "highlight lucene", 10).count() should equal(1)
121 | }
122 | }
--------------------------------------------------------------------------------
/src/test/scala/org/zouzias/spark/lucenerdd/LuceneRDDSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd
18 |
19 | import com.holdenkarau.spark.testing.SharedSparkContext
20 | import org.apache.spark.SparkConf
21 | import org.scalatest.flatspec.AnyFlatSpec
22 | import org.scalatest._
23 | import matchers.should._
24 |
25 | class LuceneRDDSpec extends AnyFlatSpec
26 | with Matchers
27 | with BeforeAndAfterEach
28 | with SharedSparkContext {
29 |
30 | var luceneRDD: LuceneRDD[_] = _
31 |
32 |
33 | override val conf: SparkConf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf().
34 | setMaster("local[*]").
35 | setAppName("test").
36 | set("spark.ui.enabled", "false").
37 | set("spark.app.id", appID))
38 |
39 | override def afterEach() {
40 | luceneRDD.close()
41 | }
42 |
43 | "LuceneRDD.exists(Map)" should "find elements that exist" in {
44 | val words = Array("aabaa", "aaacaa", "aadaa", "aaaa", "qwerty")
45 | val rdd = sc.parallelize(words)
46 | luceneRDD = LuceneRDD(rdd)
47 | luceneRDD.exists(Map("_1" -> "aaaa")) should equal (true)
48 | }
49 |
50 | "LuceneRDD.exists(Map)" should "not find elements that don't exist" in {
51 | val words = Array("aabaa", "aaacaa", "aadaa", "aaaa", "qwerty")
52 | val rdd = sc.parallelize(words)
53 | luceneRDD = LuceneRDD(rdd)
54 | luceneRDD.exists(Map("_1" -> "doNotExist")) should equal (false)
55 | }
56 |
57 | "LuceneRDD.exists(T)" should "find elements that exist" in {
58 | val words = Array("aabaa", "aaacaa", "aadaa", "aaaa", "qwerty")
59 | val rdd = sc.parallelize(words)
60 | val localLuceneRDD = LuceneRDD(rdd)
61 | localLuceneRDD.exists("aaaa") should equal (true)
62 | localLuceneRDD.close()
63 | }
64 |
65 | "LuceneRDD.exists(T)" should "not find elements that don't exist" in {
66 | val words = Array("aabaa", "aaacaa", "aadaa", "aaaa", "qwerty")
67 | val rdd = sc.parallelize(words)
68 | val localLuceneRDD = LuceneRDD(rdd)
69 | localLuceneRDD.exists("doNotExist") should equal (false)
70 | localLuceneRDD.close()
71 | }
72 |
73 | "LuceneRDD.count" should "count correctly the results" in {
74 | val words = Array("aabaa", "aaacaa", "aadaa", "aaaa", "qwerty")
75 | val rdd = sc.parallelize(words)
76 | val luceneRDD = LuceneRDD(rdd)
77 | luceneRDD.count should equal (5)
78 | }
79 |
80 | "LuceneRDD.count" should "count zero on empty RDD" in {
81 | val words = Array.empty[String]
82 | val rdd = sc.parallelize(words)
83 | luceneRDD = LuceneRDD(rdd)
84 | luceneRDD.count should equal (0)
85 | }
86 |
87 | "LuceneRDD.filter" should "filter correctly existing element" in {
88 | val words = Array("aabaa", "aaacaa", "aadaa", "aaaa", "qwerty")
89 | val rdd = sc.parallelize(words)
90 | val luceneRDD = LuceneRDD(rdd)
91 | luceneRDD.filter(x => x.startsWith("aaa")).count should equal (2)
92 | }
93 |
94 | "LuceneRDD.filter" should "not filter non existing elements" in {
95 | val words = Array("aabaa", "aaacaa", "aadaa", "aaaa", "qwerty")
96 | val rdd = sc.parallelize(words)
97 | val luceneRDD = LuceneRDD(rdd)
98 | luceneRDD.filter(x => x.startsWith("iDoNotExist")).count should equal (0)
99 | }
100 |
101 | "LuceneRDD.fields" should "return _1 as default field" in {
102 | val words = Array("aabaa", "aaacaa", "aadaa", "aaaa", "qwerty")
103 | val rdd = sc.parallelize(words)
104 | val luceneRDD = LuceneRDD(rdd)
105 | luceneRDD.fields().contains("_1") should equal(true)
106 | }
107 |
108 | "LuceneRDD.fields" should "correctly return field types" in {
109 | val words = Array(("a", 1.0F), ("b", 2.0F), ("c", 3.0F))
110 | val rdd = sc.parallelize(words)
111 | val luceneRDD = LuceneRDD(rdd)
112 | luceneRDD.fields().contains("_1") should equal(true)
113 | luceneRDD.fields().contains("_2") should equal(true)
114 | }
115 |
116 | "LuceneRDD.fields" should "return correct fields with RDD[Map[String, String]]" in {
117 | val maps = List(Map( "a" -> "hello"), Map("b" -> "world"), Map("c" -> "how are you"))
118 | val rdd = sc.parallelize(maps)
119 | luceneRDD = LuceneRDD(rdd)
120 | luceneRDD.fields() should equal(Set("a", "b", "c"))
121 | }
122 |
123 | "LuceneRDD.version" should "return project sbt build information" in {
124 | val map = LuceneRDD.version()
125 | map.contains("name") should equal(true)
126 | map.contains("builtAtMillis") should equal(true)
127 | map.contains("scalaVersion") should equal(true)
128 | map.contains("version") should equal(true)
129 | map.contains("sbtVersion") should equal(true)
130 | map.contains("builtAtString") should equal(true)
131 | }
132 | }
133 |
--------------------------------------------------------------------------------
/src/test/scala/org/zouzias/spark/lucenerdd/LuceneRDDTermVectorsSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd
18 |
19 | import com.holdenkarau.spark.testing.SharedSparkContext
20 | import org.apache.spark.SparkConf
21 | import org.scalatest.BeforeAndAfterEach
22 | import org.scalatest.flatspec.AnyFlatSpec
23 | import org.scalatest._
24 | import matchers.should._
25 |
26 | import org.zouzias.spark.lucenerdd.testing.LuceneRDDTestUtils
27 |
28 | class LuceneRDDTermVectorsSpec extends AnyFlatSpec
29 | with Matchers
30 | with BeforeAndAfterEach
31 | with LuceneRDDTestUtils
32 | with SharedSparkContext {
33 |
34 | var luceneRDD: LuceneRDD[_] = _
35 |
36 | override def Radius: Double = 0
37 |
38 | override val conf: SparkConf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf().
39 | setMaster("local[*]").
40 | setAppName("test").
41 | set("spark.ui.enabled", "false").
42 | set("spark.app.id", appID))
43 |
44 | override def afterEach() {
45 | luceneRDD.close()
46 | }
47 |
48 | val First = "_1"
49 |
50 | "LuceneRDD.termVectors" should "return valid terms" in {
51 |
52 | val words = Array("To smile or not to smile smile",
53 | "Don't cry because it's over, smile because it happened",
54 | "So many books, so little time",
55 | "A room without books is like a body without a soul",
56 | "If you tell the truth, you don't have to remember anything")
57 | val rdd = sc.parallelize(words)
58 |
59 | luceneRDD = LuceneRDD(rdd)
60 |
61 | val terms = luceneRDD.termVectors(First).collect()
62 |
63 | // These terms should exist
64 | terms.exists(_.term.compareToIgnoreCase("time") == 0) should equal(true)
65 | terms.exists(_.term.compareToIgnoreCase("room") == 0) should equal(true)
66 | terms.exists(_.term.compareToIgnoreCase("soul") == 0) should equal(true)
67 | terms.exists(_.term.compareToIgnoreCase("smile") == 0) should equal(true)
68 |
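// "becaus" is the stemmed form of "because" (it occurs twice in the second
// sentence), assuming an English stemming analyzer is in effect.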
69 | terms.exists(t => (t.term.compareToIgnoreCase("smile") == 0)
70 | && t.count == 3) should equal (true)
71 | terms.exists(t => (t.term.compareToIgnoreCase("becaus") == 0)
72 | && t.count == 2) should equal (true)
73 | }
74 | }
--------------------------------------------------------------------------------
/src/test/scala/org/zouzias/spark/lucenerdd/LuceneRDDTuplesSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd
18 |
19 | import com.holdenkarau.spark.testing.SharedSparkContext
20 | import org.apache.spark.SparkConf
21 | import org.scalatest.BeforeAndAfterEach
22 | import org.scalatest.flatspec.AnyFlatSpec
23 | import org.scalatest._
24 | import matchers.should._
25 |
26 |
27 | class LuceneRDDTuplesSpec extends AnyFlatSpec with Matchers with SharedSparkContext {
28 |
29 | val First = "_1"
30 | val Second = "_2"
31 |
32 | val array = List("fear", "death", " apology", "romance", "tree", "fashion", "fascism")
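// Tuple elements are indexed under the fields "_1", "_2", ..., matching the
// First/Second constants above.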
33 |
34 |
35 | override val conf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf().
36 | setMaster("local[*]").
37 | setAppName("test").
38 | set("spark.ui.enabled", "false").
39 | set("spark.app.id", appID))
40 |
41 | "LuceneRDD" should "work with Tuple2" in {
42 | val rdd = sc.parallelize(array).map(x => (x, x))
43 | val luceneRDD = LuceneRDD(rdd)
44 | luceneRDD.count should equal (array.size)
45 | }
46 |
47 | "LuceneRDD" should "work with Tuple3" in {
48 | val rdd = sc.parallelize(array).map(x => (x, x, x))
49 | val luceneRDD = LuceneRDD(rdd)
50 | val results = luceneRDD.termQuery(Second, array(1))
51 | results.count should equal (1)
52 | }
53 |
54 | "LuceneRDD" should "work with Tuple4" in {
55 | val rdd = sc.parallelize(array).map(x => (x, x, x, x))
56 | val luceneRDD = LuceneRDD(rdd)
57 | val results = luceneRDD.termQuery(Second, array(1))
58 | results.count should equal (1)
59 | }
60 |
61 | "LuceneRDD" should "work with Tuple5" in {
62 | val rdd = sc.parallelize(array).map(x => (x, x, x, x, x))
63 | val luceneRDD = LuceneRDD(rdd)
64 | val results = luceneRDD.termQuery(Second, array(1))
65 | results.count should equal (1)
66 | }
67 |
68 | "LuceneRDD" should "work with Tuple6" in {
69 | val rdd = sc.parallelize(array).map(x => (x, x, x, x, x, x))
70 | val luceneRDD = LuceneRDD(rdd)
71 | val results = luceneRDD.termQuery(Second, array(1))
72 | results.count should equal (1)
73 | }
74 |
75 | "LuceneRDD" should "work with Tuple7" in {
76 | val rdd = sc.parallelize(array).map(x => (x, x, 2.0d, 1.0d, x, 1, x))
77 | val luceneRDD = LuceneRDD(rdd)
78 | val results = luceneRDD.termQuery(First, array.head)
79 | results.count should equal (1)
80 | }
81 |
82 | "LuceneRDD" should "work with Tuple8" in {
83 | val rdd = sc.parallelize(array).map(x => (x, x, 2.0d, 1.0d, x, 1, x, 3.4))
84 | val luceneRDD = LuceneRDD(rdd)
85 | val results = luceneRDD.termQuery(First, array(1))
86 | results.count should equal (1)
87 | }
88 |
89 | "LuceneRDD" should "work with mixed types in Tuples" in {
90 | val rdd = sc.parallelize(array).map(x => (x, 1, x, 2L, x, 3.0F))
91 | val luceneRDD = LuceneRDD(rdd)
92 | val results = luceneRDD.termQuery(First, array(1))
93 | results.count should equal (1)
94 | }
95 | }
--------------------------------------------------------------------------------
/src/test/scala/org/zouzias/spark/lucenerdd/analyzers/AnalyzersConfigurableSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.analyzers
18 |
19 | import org.apache.lucene.analysis.en.EnglishAnalyzer
20 | import org.apache.lucene.analysis.el.GreekAnalyzer
21 | import org.apache.lucene.analysis.de.GermanAnalyzer
22 | import org.scalatest.BeforeAndAfterEach
23 | import org.scalatest.flatspec.AnyFlatSpec
24 | import org.scalatest._
25 | import matchers.should._
26 |
27 |
28 | class AnalyzersConfigurableSpec extends AnyFlatSpec with Matchers
29 | with BeforeAndAfterEach
30 | with AnalyzerConfigurable {
31 |
32 | "AnalyzersConfigurable.getAnalyzer" should "return english analyzer with 'en' input" in {
33 | val englishAnalyzer = getAnalyzer(Some("en"))
34 | englishAnalyzer shouldNot equal(null)
35 | englishAnalyzer.isInstanceOf[EnglishAnalyzer] should equal(true)
36 | }
37 |
38 | "AnalyzersConfigurable.getAnalyzer" should
39 | "return custom test analyzer with 'org.apache.lucene.analysis.el.GreekAnalyzer'" in {
40 | val greekAnalyzer = getAnalyzer(Some("org.apache.lucene.analysis.el.GreekAnalyzer"))
41 | greekAnalyzer shouldNot equal(null)
42 | greekAnalyzer.isInstanceOf[GreekAnalyzer] should equal(true)
43 | }
44 |
45 | "AnalyzersConfigurable.getAnalyzer" should
46 | "return custom test analyzer with 'org.apache.lucene.analysis.de.GermanAnalyzer'" in {
47 | val deutschAnalyzer = getAnalyzer(Some("org.apache.lucene.analysis.de.GermanAnalyzer"))
48 | deutschAnalyzer shouldNot equal(null)
49 | deutschAnalyzer.isInstanceOf[GermanAnalyzer] should equal(true)
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/src/test/scala/org/zouzias/spark/lucenerdd/facets/FacetedLuceneRDDImplicitsSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.facets
18 |
19 | import com.holdenkarau.spark.testing.SharedSparkContext
20 | import org.apache.spark.SparkConf
21 | import org.apache.spark.sql.SparkSession
22 | import org.zouzias.spark.lucenerdd.testing.FavoriteCaseClass
23 | import org.zouzias.spark.lucenerdd.{LuceneRDD, LuceneRDDKryoRegistrator}
24 |
25 | import org.scalatest.BeforeAndAfterEach
26 | import org.scalatest.flatspec.AnyFlatSpec
27 | import org.scalatest._
28 | import matchers.should._
29 |
30 |
31 | class FacetedLuceneRDDImplicitsSpec extends AnyFlatSpec
32 | with Matchers
33 | with BeforeAndAfterEach
34 | with SharedSparkContext {
35 |
36 | var luceneRDD: LuceneRDD[_] = _
37 |
38 |
39 | override val conf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf().
40 | setMaster("local[*]").
41 | setAppName("test").
42 | set("spark.ui.enabled", "false").
43 | set("spark.app.id", appID))
44 |
45 | override def afterEach() {
46 | luceneRDD.close()
47 | }
48 |
49 |
50 | val elem = Array("fear", "death", "water", "fire", "house")
51 | .zipWithIndex.map{ case (str, index) =>
52 | FavoriteCaseClass(str, index, 10L, 12.3F, s"${str}@gmail.com")}
53 |
54 |
55 | "FacetedLuceneRDD(case class).count" should "return correct number of elements" in {
56 | val rdd = sc.parallelize(elem)
57 | val spark = SparkSession.builder().getOrCreate()
58 | import spark.implicits._
59 | val df = rdd.toDF()
60 | luceneRDD = FacetedLuceneRDD(df)
61 | luceneRDD.count should equal (elem.size)
62 | }
63 |
64 | "FacetedLuceneRDD(case class).fields" should "return all fields" in {
65 | val rdd = sc.parallelize(elem)
66 | val spark = SparkSession.builder().getOrCreate()
67 | import spark.implicits._
68 | val df = rdd.toDF()
69 | luceneRDD = FacetedLuceneRDD(df)
70 |
71 | luceneRDD.fields().size should equal(5)
72 | luceneRDD.fields().contains("name") should equal(true)
73 | luceneRDD.fields().contains("age") should equal(true)
74 | luceneRDD.fields().contains("myLong") should equal(true)
75 | luceneRDD.fields().contains("myFloat") should equal(true)
76 | luceneRDD.fields().contains("email") should equal(true)
77 | }
78 |
79 | "FacetedLuceneRDD(case class).termQuery" should "correctly search with TermQueries" in {
80 | val rdd = sc.parallelize(elem)
81 | val spark = SparkSession.builder().getOrCreate()
82 | import spark.implicits._
83 | val df = rdd.toDF()
84 | luceneRDD = FacetedLuceneRDD(df)
85 |
86 | val results = luceneRDD.termQuery("name", "water")
87 | results.count() should equal(1)
88 | }
89 | }
90 |
--------------------------------------------------------------------------------
/src/test/scala/org/zouzias/spark/lucenerdd/query/LuceneQueryHelpersSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.query
18 |
19 | import org.apache.lucene.analysis.Analyzer
20 | import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper
21 | import org.apache.lucene.document.Field.Store
22 | import org.apache.lucene.document._
23 | import org.apache.lucene.facet.FacetField
24 | import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader
25 | import org.apache.lucene.index.DirectoryReader
26 | import org.apache.lucene.search.IndexSearcher
27 | import org.zouzias.spark.lucenerdd.facets.FacetedLuceneRDD
28 | import org.zouzias.spark.lucenerdd.store.IndexWithTaxonomyWriter
29 | import scala.collection.JavaConverters._
30 | import org.scalatest.BeforeAndAfterEach
31 | import org.scalatest.flatspec.AnyFlatSpec
32 | import org.scalatest._
33 | import matchers.should._
34 |
35 |
36 | import scala.io.Source
37 |
38 | class LuceneQueryHelpersSpec extends AnyFlatSpec
39 | with IndexWithTaxonomyWriter
40 | with Matchers
41 | with BeforeAndAfterEach {
42 |
43 | // Load countries
44 | val countries: Seq[String] = Source
45 | .fromFile("src/test/resources/countries.txt")
46 | .getLines()
47 | .map(_.toLowerCase()).toSeq
48 |
49 | val indexAnalyzerPerField: Map[String, String] =
50 | Map("name" -> "org.apache.lucene.analysis.en.EnglishAnalyzer")
51 |
52 | private val MaxFacetValue: Int = 10
53 |
54 | override def indexAnalyzer(): Analyzer = getAnalyzer(Some("en"))
55 |
56 | override def indexPerFieldAnalyzer(): PerFieldAnalyzerWrapper = {
57 | val analyzerPerField: Map[String, Analyzer] = indexAnalyzerPerField
58 | .mapValues(x => getAnalyzer(Some(x)))
59 | new PerFieldAnalyzerWrapper(indexAnalyzer(), analyzerPerField.asJava)
60 | }
61 |
62 | countries.zipWithIndex.foreach { case (elem, index) =>
63 | val doc = convertToDoc(index % MaxFacetValue, elem)
64 | indexWriter.addDocument(FacetsConfig.build(taxoWriter, doc))
65 | }
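// index % MaxFacetValue yields exactly MaxFacetValue distinct facet values,
// which the facetedTextSearch test below relies on.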
66 |
67 | indexWriter.commit()
68 | taxoWriter.close()
69 | indexWriter.close()
70 |
71 | private val indexReader = DirectoryReader.open(IndexDir)
72 | private val indexSearcher = new IndexSearcher(indexReader)
73 | private lazy val taxoReader = new DirectoryTaxonomyReader(TaxonomyDir)
74 |
75 |
76 | private lazy val TestFacetName = s"_2${FacetedLuceneRDD.FacetTextFieldSuffix}"
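// FacetedLuceneRDD stores a field's facet dimension under the field name plus
// FacetTextFieldSuffix, so TestFacetName addresses the "_2" facet dimension.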
77 |
78 | def convertToDoc(pos: Int, text: String): Document = {
79 | val doc = new Document()
80 | doc.add(new StringField("_1", text, Store.YES))
81 | doc.add(new FacetField(s"_1${FacetedLuceneRDD.FacetTextFieldSuffix}", text))
82 | doc.add(new IntPoint("_2", pos))
83 | doc.add(new StoredField("_2", pos))
84 | doc.add(new FacetField(TestFacetName, pos.toString))
85 | doc
86 | }
87 |
88 | "LuceneQueryHelpers.fields" should "return the list of fields" in {
89 | LuceneQueryHelpers.fields(indexSearcher) should equal (Set("_1", "_2"))
90 | }
91 |
92 | "LuceneQueryHelpers.totalDocs" should "return correct total document counts" in {
93 | LuceneQueryHelpers.totalDocs(indexSearcher) should equal (countries.size)
94 | }
95 |
96 | "LuceneQueryHelpers.facetedTextSearch" should "return correct facet counts" in {
97 | val facets = LuceneQueryHelpers.facetedTextSearch(indexSearcher, taxoReader,
98 | FacetsConfig, "*:*", TestFacetName, 100, indexAnalyzer())
99 |
100 | facets.facetName should equal(TestFacetName)
101 | facets.facets.size should equal(MaxFacetValue)
102 | }
103 |
104 | "LuceneQueryHelpers.termQuery" should "return correct documents" in {
105 | val greece = "greece"
106 | val topDocs = LuceneQueryHelpers
107 | .termQuery(indexSearcher, "_1", greece, 100)
108 | .map(_.toRow())
109 |
110 | topDocs.size should equal(1)
111 |
112 | topDocs.exists(d => d.getString(d.fieldIndex("_1")).
113 | toLowerCase()
114 | .contains(greece)) should equal(true)
115 | }
116 |
117 | "LuceneQueryHelpers.prefixQuery" should "return correct documents" in {
118 | val prefix = "gree"
119 | val topDocs = LuceneQueryHelpers
120 | .prefixQuery(indexSearcher, "_1", prefix, 100)
121 | .map(_.toRow())
122 |
123 | topDocs.forall(d => d.getString(d.fieldIndex("_1"))
124 | .toLowerCase()
125 | .contains(prefix)) should equal(true)
126 | }
127 |
128 | }
129 |
--------------------------------------------------------------------------------
/src/test/scala/org/zouzias/spark/lucenerdd/response/LuceneRDDResponseSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.response
18 |
19 | import com.holdenkarau.spark.testing.SharedSparkContext
20 | import org.apache.spark.SparkConf
21 | import org.apache.spark.sql.SparkSession
22 | import org.zouzias.spark.lucenerdd.{LuceneRDD, LuceneRDDKryoRegistrator}
23 | import org.zouzias.spark.lucenerdd._
24 | import org.zouzias.spark.lucenerdd.models.SparkScoreDoc
25 | import org.zouzias.spark.lucenerdd.testing.FavoriteCaseClass
26 | import org.scalatest.BeforeAndAfterEach
27 | import org.scalatest.flatspec.AnyFlatSpec
28 | import org.scalatest._
29 | import matchers.should._
30 |
31 |
32 | class LuceneRDDResponseSpec extends AnyFlatSpec with Matchers
33 | with BeforeAndAfterEach
34 | with SharedSparkContext {
35 |
36 | override val conf: SparkConf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf().
37 | setMaster("local[*]").
38 | setAppName("test").
39 | set("spark.ui.enabled", "false").
40 | set("spark.app.id", appID))
41 |
42 | def randomString(length: Int): String = scala.util.Random.alphanumeric.take(length).mkString
43 |
44 | var luceneRDD: LuceneRDD[_] = _
45 |
46 | override def afterEach() {
47 | luceneRDD.close()
48 | }
49 |
50 | "LuceneRDDResponseSpec.take(k)" should "return exactly k elements" in {
51 | val array = Array("aaa", "bbb", "ccc", "ddd", "eee")
52 | val rdd = sc.parallelize(array)
53 | luceneRDD = LuceneRDD(rdd)
54 | val result = luceneRDD.query("*:*", 10)
55 | result.take(2).length should be (2)
56 | }
57 |
58 | "LuceneRDDResponseSpec.collect()" should "return all elements" in {
59 | val array = Array("aaa", "bbb", "ccc", "ddd", "eee")
60 | val rdd = sc.parallelize(array)
61 | luceneRDD = LuceneRDD(rdd)
62 | val result = luceneRDD.query("*:*", 10)
63 | result.collect().length should be (array.length)
64 | }
65 |
66 | "LuceneRDDResponseSpec.toDF()" should "convert to DataFrame" in {
67 | implicit val sparkSession: SparkSession = SparkSession.builder().getOrCreate()
68 | val elem = Array("fear", "death", "water", "fire", "house")
69 | .zipWithIndex.map{ case (str, index) =>
70 | FavoriteCaseClass(str, index, 10L, 10e-6F, s"${str}@gmail.com")}
71 | val rdd = sc.parallelize(elem)
72 | luceneRDD = LuceneRDD(rdd)
73 | val response = luceneRDD.query("*:*", 10)
74 | val schema = response.toDF().schema
75 |
76 | schema.nonEmpty should equal(true)
77 | schema.fieldNames.contains("name") should equal(true)
78 | schema.fieldNames.contains("age") should equal(true)
79 | schema.fieldNames.contains("myLong") should equal(true)
80 | schema.fieldNames.contains("myFloat") should equal(true)
81 | schema.fieldNames.contains("email") should equal(true)
82 |
83 | schema.fields(schema.fieldIndex("name")).dataType should
84 | equal(org.apache.spark.sql.types.StringType)
85 | schema.fields(schema.fieldIndex("age")).dataType should
86 | equal(org.apache.spark.sql.types.IntegerType)
87 | schema.fields(schema.fieldIndex("myLong")).dataType should
88 | equal(org.apache.spark.sql.types.LongType)
89 | schema.fields(schema.fieldIndex("myFloat")).dataType should
90 | equal(org.apache.spark.sql.types.FloatType)
91 | schema.fields(schema.fieldIndex("email")).dataType should
92 | equal(org.apache.spark.sql.types.StringType)
93 | }
94 |
95 | "LuceneRDDResponseSpec.toDF()" should "return score,shardIndex,docId with correct types" in {
96 | implicit val sparkSession: SparkSession = SparkSession.builder().getOrCreate()
97 | val elem = Array("fear", "death", "water", "fire", "house")
98 | .zipWithIndex.map { case (str, index) =>
99 | FavoriteCaseClass(str, index, 10L, 10e-6F, s"${str}@gmail.com")
100 | }
101 | val rdd = sc.parallelize(elem)
102 | luceneRDD = LuceneRDD(rdd)
103 | val response = luceneRDD.query("*:*", 10)
104 | val schema = response.toDF().schema
105 |
106 | schema.nonEmpty should equal(true)
107 |
108 | // Extra auxiliary fields that must exist on the DataFrame
109 | schema.fieldNames.contains(SparkScoreDoc.DocIdField) should equal(true)
110 | schema.fieldNames.contains(SparkScoreDoc.ShardField) should equal(true)
111 | schema.fieldNames.contains(SparkScoreDoc.ScoreField) should equal(true)
112 |
113 |
114 | schema.fields(schema.fieldIndex(SparkScoreDoc.DocIdField)).dataType should
115 | equal(org.apache.spark.sql.types.IntegerType)
116 | schema.fields(schema.fieldIndex(SparkScoreDoc.ShardField)).dataType should
117 | equal(org.apache.spark.sql.types.IntegerType)
118 | schema.fields(schema.fieldIndex(SparkScoreDoc.ScoreField)).dataType should
119 | equal(org.apache.spark.sql.types.FloatType)
120 | }
121 |
122 |
123 | "LuceneRDDResponseSpec.collect()" should "work when no results are found" in {
124 | val array = Array("aaa", "bbb", "ccc", "ddd", "eee")
125 | val rdd = sc.parallelize(array)
126 | luceneRDD = LuceneRDD(rdd)
127 | val result = luceneRDD.query("fff", 10)
128 | result.collect().length should be (0)
129 | }
130 | }
131 |
--------------------------------------------------------------------------------
/src/test/scala/org/zouzias/spark/lucenerdd/spatial/shape/ShapeLuceneRDDKnnSearchSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.spatial.shape
18 |
19 |
20 | import com.holdenkarau.spark.testing.SharedSparkContext
21 | import org.apache.spark.SparkConf
22 | import org.zouzias.spark.lucenerdd._
23 | import org.zouzias.spark.lucenerdd.spatial.shape.context.ContextLoader
24 | import org.zouzias.spark.lucenerdd.testing.LuceneRDDTestUtils
25 | import org.zouzias.spark.lucenerdd.models.SparkScoreDoc.ScoreField
26 | import org.scalatest.BeforeAndAfterEach
27 | import org.scalatest.flatspec.AnyFlatSpec
28 | import org.scalatest._
29 | import matchers.should._
30 |
31 |
32 | class ShapeLuceneRDDKnnSearchSpec extends AnyFlatSpec
33 | with Matchers
34 | with BeforeAndAfterEach
35 | with SharedSparkContext
36 | with ContextLoader
37 | with LuceneRDDTestUtils {
38 |
39 | val k = 6
40 |
41 | val Radius: Double = 5D
42 |
43 | var pointLuceneRDD: ShapeLuceneRDD[_, _] = _
44 |
45 | override val conf: SparkConf = ShapeLuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf().
46 | setMaster("local[*]").
47 | setAppName("test").
48 | set("spark.ui.enabled", "false").
49 | set("spark.app.id", appID))
50 |
51 | override def afterEach() {
52 | pointLuceneRDD.close()
53 | }
54 |
55 | "ShapeLuceneRDD.knnSearch" should "return k-nearest neighbors (knn)" in {
56 |
57 | val rdd = sc.parallelize(cities)
58 | pointLuceneRDD = ShapeLuceneRDD(rdd)
59 |
60 | val results = pointLuceneRDD.knnSearch(Bern._1, k, "*:*").collect()
61 |
62 | results.length should equal(k)
63 | results.length should be > 0
64 |
65 | // Closest is Bern and farthest is Toronto
66 | docTextFieldEq(results.head, "_1", Bern._2) should equal(true)
67 | docTextFieldEq(results.last, "_1", Toronto._2) should equal(true)
68 |
69 | // Distances must be sorted
70 | val revertedDists = results.map(x => x.getFloat(x.fieldIndex(ScoreField))).reverse
71 | sortedDesc(revertedDists) should equal(true)
72 | }
73 |
74 | "ShapeLuceneRDD.knnSearch" should "return k-nearest neighbors (prefix search)" in {
75 |
76 | val rdd = sc.parallelize(cities)
77 | pointLuceneRDD = ShapeLuceneRDD(rdd)
78 |
79 | val results = pointLuceneRDD.knnSearch(Bern._1, k, "_1:Mil*").collect()
80 |
81 | results.length should be <= k
82 | results.length should be > 0
83 |
84 | // Closest match is Milan, due to the prefix filter
85 | docTextFieldEq(results.head, "_1", Milan._2) should equal(true)
86 |
87 | // Distances must be sorted
88 | val revertedDists = results.map(x => x.getFloat(x.fieldIndex(ScoreField))).reverse
89 | sortedDesc(revertedDists) should equal(true)
90 | }
91 |
92 | "ShapeLuceneRDD.knnSearch" should "return k-nearest neighbors (fuzzy search)" in {
93 |
94 | val rdd = sc.parallelize(cities)
95 | pointLuceneRDD = ShapeLuceneRDD(rdd)
96 |
97 | val results = pointLuceneRDD.knnSearch(Bern._1, k, "_1:Miln~1").collect()
98 |
99 | results.length should be <= k
100 | results.length should be > 0
101 |
102 | // Closest match is Milan, due to the fuzzy filter
103 | docTextFieldEq(results.head, "_1", Milan._2) should equal(true)
104 |
105 | // Distances must be sorted
106 | val revertedDists = results.map(x => x.getFloat(x.fieldIndex(ScoreField))).reverse
107 | sortedDesc(revertedDists) should equal(true)
108 | }
109 |
110 | "ShapeLuceneRDD.knnSearch" should "return k-nearest neighbors (term query)" in {
111 |
112 | val rdd = sc.parallelize(cities)
113 | pointLuceneRDD = ShapeLuceneRDD(rdd)
114 |
115 | val results = pointLuceneRDD.knnSearch(Bern._1, k, "_1:Milan").collect()
116 |
117 | results.length should be <= k
118 | results.length should be > 0
119 |
120 | // Closest is Milan (due to filtering)
121 | docTextFieldEq(results.head, "_1", Milan._2) should equal(true)
122 |
123 | // Distances must be sorted
124 | val revertedDists = results.map(x => x.getFloat(x.fieldIndex(ScoreField))).reverse
125 | sortedDesc(revertedDists) should equal(true)
126 | }
127 | }
128 |
--------------------------------------------------------------------------------
/src/test/scala/org/zouzias/spark/lucenerdd/spatial/shape/ShapeLuceneRDDLinkageSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.spatial.shape
18 |
19 | import com.holdenkarau.spark.testing.SharedSparkContext
20 | import org.apache.spark.SparkConf
21 | import org.apache.spark.sql.{Row, SparkSession}
22 | import org.zouzias.spark.lucenerdd.spatial.shape.context.ContextLoader
23 | import org.zouzias.spark.lucenerdd.testing.LuceneRDDTestUtils
24 | import org.zouzias.spark.lucenerdd.models.SparkScoreDoc.ScoreField
25 | import org.scalatest.BeforeAndAfterEach
26 | import org.scalatest.flatspec.AnyFlatSpec
27 | import org.scalatest._
28 | import matchers.should._
29 |
30 |
31 | // Required for implicit Document conversion
32 | import org.zouzias.spark.lucenerdd._
33 |
34 | case class City(name: String, x: Double, y: Double)
35 |
36 | class ShapeLuceneRDDLinkageSpec extends AnyFlatSpec
37 | with Matchers
38 | with BeforeAndAfterEach
39 | with SharedSparkContext
40 | with ContextLoader
41 | with LuceneRDDTestUtils {
42 |
43 | val k = 6
44 |
45 | val Radius: Double = 5D
46 |
47 | var pointLuceneRDD: ShapeLuceneRDD[_, _] = _
48 |
49 | override val conf = ShapeLuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf().
50 | setMaster("local[*]").
51 | setAppName("test").
52 | set("spark.ui.enabled", "false").
53 | set("spark.app.id", appID))
54 |
55 | override def afterEach() {
56 | pointLuceneRDD.close()
57 | }
58 |
59 | "ShapeLuceneRDD.linkByKnn" should "link correctly k-nearest neighbors (knn)" in {
60 |
61 | val citiesRDD = sc.parallelize(cities)
62 | pointLuceneRDD = ShapeLuceneRDD(citiesRDD)
63 | pointLuceneRDD.cache()
64 |
65 | val linker = (x: ((Double, Double), String)) => x._1
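// The linker maps each (coordinates, name) record to the point used as its knn query.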
66 |
67 | val linkage = pointLuceneRDD.linkByKnn(citiesRDD, linker, k)
68 |
69 | linkage.count() should equal(cities.length)
70 |
71 | linkage.collect().foreach{ case (city, knnResults) =>
72 |
73 | // top result should be linked with its query result
74 | val doc = knnResults.head
75 | city._2 should equal(doc.getString(doc.fieldIndex("_1")))
76 |
77 | // Must return only at most k results
78 | knnResults.length should be <= k
79 |
80 | // Distances must be sorted
81 | val revertedDists = knnResults.map(x => x.getFloat(x.fieldIndex(ScoreField))).reverse
82 | sortedDesc(revertedDists) should equal(true)
83 | }
84 | }
85 |
86 | "ShapeLuceneRDD.linkByRadius" should "link correctly countries with capitals" in {
87 |
88 | val Radius = 50.0
89 | val sparkSession = SparkSession.builder.getOrCreate()
90 | import sparkSession.implicits._
91 | val countriesRDD = sparkSession.read.parquet("data/countries-poly.parquet")
92 | .select("name", "shape")
93 | .map(row => (row.getString(1), row.getString(0)))
94 |
95 | pointLuceneRDD = ShapeLuceneRDD(countriesRDD)
96 | pointLuceneRDD.cache()
97 |
98 | val capitals = sparkSession.read.parquet("data/capitals.parquet")
99 | .select("name", "shape")
100 | .map(row => (row.getString(1), row.getString(0)))
101 |
102 | /**
103 | * Converts a WKT point string, e.g. "POINT (7.45 46.95)", to a coordinate pair
104 | * @param city pair of (WKT point string, city name)
105 | * @return the (x, y) coordinates parsed from the WKT string
106 | */
107 | def coords(city: (String, String)): (Double, Double) = {
108 | val str = city._1
109 | val nums = str.dropWhile(_ != '(').drop(1).dropRight(1)
110 | val coords = nums.split(" ").map(_.trim)
111 | (coords(0).toDouble, coords(1).toDouble)
112 | }
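// For example, coords(("POINT (7.45 46.95)", "Bern")) yields (7.45, 46.95)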
113 |
114 | val linkage = pointLuceneRDD.linkByRadius(capitals.rdd, coords, Radius).collect()
115 |
116 | linkage.length should equal(capitals.count)
117 |
118 | linkage.exists { case (cap, results) =>
119 | cap._2 == "Bern" && docTextFieldEq(results, "_1", "Switzerland") } should equal(true)
120 | linkage.exists { case (cap, results) =>
121 | cap._2 == "Berlin" && docTextFieldEq(results, "_1", "Germany") } should equal(true)
122 | linkage.exists { case (cap, results) =>
123 | cap._2 == "Ottawa" && docTextFieldEq(results, "_1", "Canada") } should equal(true)
124 | linkage.exists { case (cap, results) =>
125 | cap._2 == "Paris" && docTextFieldEq(results, "_1", "France") } should equal(true)
126 |
127 | }
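
// linkByRadius matches each capital's point against indexed country shapes lying
// within Radius of it; the assertions above spot-check four known capitals.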
128 |
129 | "ShapeLuceneRDD.linkDataFrameByKnn" should "link correctly k-nearest neighbors (knn)" in {
130 |
131 | val sparkSession = SparkSession.builder.getOrCreate()
132 | import sparkSession.implicits._
133 | val citiesRDD = sc.parallelize(cities)
134 | pointLuceneRDD = ShapeLuceneRDD(citiesRDD)
135 | pointLuceneRDD.cache()
136 |
137 | val citiesDF = citiesRDD.map(x => City(x._2, x._1._1, x._1._2)).toDF
138 | val linker = (x: Row) => (x.getDouble(1), x.getDouble(2))
139 |
140 | val linkage = pointLuceneRDD.linkDataFrameByKnn(citiesDF, linker, k)
141 |
142 | linkage.count() should equal(cities.length)
143 |
144 | linkage.collect().foreach { case (city, knnResults) =>
145 |
146 | // The query city itself must appear among its kNN results
147 | docTextFieldEq(knnResults, "_1", city.getString(0)) should equal(true)
148 |
149 | // Must return at most k results
150 | knnResults.length should be <= k
151 |
152 | // Distances must be in ascending order (nearest first)
153 | val reversedDists = knnResults.map(x => x.getFloat(x.fieldIndex(ScoreField))).reverse
154 | sortedDesc(reversedDists) should equal(true)
155 | }
156 |
157 | }
158 |
159 | }
160 |
--------------------------------------------------------------------------------
/src/test/scala/org/zouzias/spark/lucenerdd/spatial/shape/implicits/ShapeLuceneRDDImplicitsSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.spatial.shape.implicits
18 |
19 | import com.holdenkarau.spark.testing.SharedSparkContext
20 | import org.apache.spark.SparkConf
21 | import org.apache.spark.sql.SparkSession
22 | import org.zouzias.spark.lucenerdd.spatial.shape._
23 | import org.zouzias.spark.lucenerdd.testing.LuceneRDDTestUtils
24 | import org.zouzias.spark.lucenerdd._
25 | import org.zouzias.spark.lucenerdd.spatial.shape.context.ContextLoader
26 |
27 | import org.scalatest.BeforeAndAfterEach
28 | import org.scalatest.flatspec.AnyFlatSpec
29 | import org.scalatest.matchers.should.Matchers
31 |
32 |
33 | class ShapeLuceneRDDImplicitsSpec extends AnyFlatSpec
34 | with Matchers
35 | with BeforeAndAfterEach
36 | with SharedSparkContext
37 | with ContextLoader
38 | with LuceneRDDTestUtils {
39 |
40 | val Radius: Double = 5D
41 |
42 | override val conf = ShapeLuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf().
43 | setMaster("local[*]").
44 | setAppName("test").
45 | set("spark.ui.enabled", "false").
46 | set("spark.app.id", appID))
47 |
48 | "ShapeLuceneRDDImplicits" should "implicitly convert to point" in {
49 |
50 | val rdd = sc.parallelize(cities)
51 | val shapeRDD = ShapeLuceneRDD(rdd)
52 |
53 | shapeRDD.count should equal(cities.length)
54 | }
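
// The conversions in these tests come from the implicits in org.zouzias.spark.lucenerdd,
// which turn tuples such as ((Double, Double), String) into indexable point documents.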
55 |
56 | "ShapeLuceneRDDImplicits" should "implicitly convert to circle" in {
57 |
58 | val circleCities: Array[(((Double, Double), Double), String)] =
59 | cities.map(convertToCircle)
60 | val rdd = sc.parallelize(circleCities)
61 | val shapeRDD = ShapeLuceneRDD(rdd)
62 |
63 | shapeRDD.count should equal(circleCities.length)
64 | }
65 |
66 | "ShapeLuceneRDDImplicits" should "implicitly convert to rectangle" in {
67 |
68 | val rectangleCities = cities.map(convertToRectangle)
69 | val rdd = sc.parallelize(rectangleCities)
70 | val shapeRDD = ShapeLuceneRDD(rdd)
71 |
72 | shapeRDD.count should equal(rectangleCities.length)
73 | }
74 |
75 | "ShapeLuceneRDDImplicits" should "implicitly convert POINTS from WKT" in {
76 | val sparkSession = SparkSession.builder().getOrCreate()
77 | val citiesDF = sparkSession.read.parquet("data/world-cities-points.parquet")
78 | import sparkSession.implicits._
79 | val citiesRDD = citiesDF.map(row =>
80 | (row.getString(2), (row.getString(0), row.getString(1))))
81 |
82 | val total = citiesDF.count()
83 | total > 0 should equal(true)
84 |
85 | val shapeRDD = ShapeLuceneRDD(citiesRDD)
86 |
87 | shapeRDD.count > 0 should equal(true)
88 | }
89 |
90 | "ShapeLuceneRDDImplicits" should "implicitly convert BBOX from WKT" in {
91 | val sparkSession = SparkSession.builder().getOrCreate()
92 | import sparkSession.implicits._
93 | val countriesDF = sparkSession.read.parquet("data/countries-bbox.parquet")
94 | val countriesRDD = countriesDF.map(row =>
95 | (row.getString(2), (row.getString(0), row.getString(1))))
96 |
97 | val total = countriesDF.count()
98 | total > 0 should equal(true)
99 |
100 | val shapeRDD = ShapeLuceneRDD(countriesRDD)
101 |
102 | shapeRDD.count > 0 should equal(true)
103 | }
104 |
105 | "ShapeLuceneRDDImplicits" should "implicitly convert to polygon" in {
106 |
107 | val polygonCities = cities.map(convertToPolygon(_, Radius))
108 | val rdd = sc.parallelize(polygonCities)
109 | val shapeRDD = ShapeLuceneRDD(rdd)
110 |
111 | shapeRDD.count should equal(polygonCities.length)
112 | }
113 |
114 | }
115 |
--------------------------------------------------------------------------------
/src/test/scala/org/zouzias/spark/lucenerdd/testing/LuceneRDDTestUtils.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd.testing
18 |
19 | import org.apache.spark.sql.Row
20 | import org.zouzias.spark.lucenerdd.models.SparkScoreDoc
21 |
22 | trait LuceneRDDTestUtils {
23 |
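// Test cities encoded as ((longitude, latitude), name) pairs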
24 | val Bern = ( (7.45, 46.95), "Bern")
25 | val Zurich = ( (8.55, 47.366667), "Zurich")
26 | val Laussanne = ( (6.6335, 46.519833), "Laussanne")
27 | val Athens = ((23.716667, 37.966667), "Athens")
28 | val Toronto = ((-79.4, 43.7), "Toronto")
29 | val Milan = ((45.4646, 9.198), "Milan")
30 | val cities = Array(Bern, Zurich, Laussanne, Athens, Milan, Toronto)
31 |
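// Radius used when expanding a point into a circle, rectangle or polygon, in coordinate units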
32 | def Radius: Double
33 |
34 | def convertToCircle(city: ((Double, Double), String)): (((Double, Double), Double), String) = {
35 | ((city._1, Radius), city._2)
36 | }
37 |
38 | def convertToRectangle(city: ((Double, Double), String))
39 | : ((Double, Double, Double, Double), String) = {
40 | val x = city._1._1
41 | val y = city._1._2
42 |
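// Axis-aligned bounding box encoded as (minX, maxX, minY, maxY)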
43 | ((x - Radius, x + Radius, y - Radius, y + Radius), city._2)
44 | }
45 |
46 | def convertToPolygon(city: ((Double, Double), String), width: Double)
47 | : (Array[(Double, Double)], String) = {
48 | val x = city._1._1
49 | val y = city._1._2
50 |
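// Square ring of half-width `width` around (x, y); the first vertex is repeated
// at the end to close the ring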
51 | val coords = Array((x - width, y - width), (x - width, y + width),
52 | (x + width, y + width), (x + width, y - width), (x - width, y - width))
53 | (coords, city._2)
54 | }
55 |
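// True if the document's text field contains the given value (substring match)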
56 | protected def docTextFieldEq(doc: Row, fieldName: String, fieldValue: String): Boolean = {
57 | doc.getString(doc.fieldIndex(fieldName)).contains(fieldValue)
58 | }
59 |
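// True if any of the documents' text fields contains the given value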
60 | protected def docTextFieldEq(docs: Array[Row], fieldName: String, fieldValue: String)
61 | : Boolean = {
62 | docs.exists(x => x.getString(x.fieldIndex(fieldName)).contains(fieldValue))
63 | }
64 |
65 | // Check if sequence is sorted in descending order
66 | protected def sortedDesc(seq: Seq[Float]): Boolean = {
67 | if (seq.isEmpty) true else seq.zip(seq.tail).forall(x => x._1 >= x._2)
68 | }
69 |
70 | // Check if the docs are sorted by descending score
71 | protected def sortedDescSparkScoreDocs(seq: Seq[SparkScoreDoc]): Boolean = {
72 | if (seq.isEmpty) true else seq.zip(seq.tail).forall(x => x._1.score >= x._2.score)
73 | }
74 |
75 | }
76 |
--------------------------------------------------------------------------------
/startZeppelin.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #############################################################
4 | # Start Zeppelin using Docker #
5 | #############################################################
6 |
7 | echo "==========================================="
8 | echo "==========================================="
9 | echo "Browse to http://localhost:8080/"
10 | echo "==========================================="
11 | echo "==========================================="
12 |
13 | docker-compose up
14 |
--------------------------------------------------------------------------------
/version.sbt:
--------------------------------------------------------------------------------
1 | ThisBuild / versionScheme := Some("early-semver")
2 | ThisBuild / version := "0.4.1-SNAPSHOT"
3 |
--------------------------------------------------------------------------------