├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── main.workflow └── workflows │ ├── codacy-analysis.yml │ └── scala.yml ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── build.sbt ├── data ├── capitals.parquet │ ├── ._SUCCESS.crc │ ├── ._common_metadata.crc │ ├── ._metadata.crc │ ├── .part-r-00000-a04af53d-dc52-452d-9807-e83325558a3e.gz.parquet.crc │ ├── .part-r-00001-a04af53d-dc52-452d-9807-e83325558a3e.gz.parquet.crc │ ├── _SUCCESS │ ├── _common_metadata │ ├── _metadata │ ├── part-r-00000-a04af53d-dc52-452d-9807-e83325558a3e.gz.parquet │ └── part-r-00001-a04af53d-dc52-452d-9807-e83325558a3e.gz.parquet ├── countries-bbox.parquet │ ├── ._SUCCESS.crc │ ├── ._common_metadata.crc │ ├── ._metadata.crc │ ├── .part-r-00000-d0949c8a-70f4-4615-bb5e-b5010cb9490f.gz.parquet.crc │ ├── .part-r-00001-d0949c8a-70f4-4615-bb5e-b5010cb9490f.gz.parquet.crc │ ├── _SUCCESS │ ├── _common_metadata │ ├── _metadata │ ├── part-r-00000-d0949c8a-70f4-4615-bb5e-b5010cb9490f.gz.parquet │ └── part-r-00001-d0949c8a-70f4-4615-bb5e-b5010cb9490f.gz.parquet ├── countries-poly.parquet │ ├── ._SUCCESS.crc │ ├── ._common_metadata.crc │ ├── ._metadata.crc │ ├── .part-r-00000-fbf168ec-2750-4aad-b42b-d89727782fd1.gz.parquet.crc │ ├── .part-r-00001-fbf168ec-2750-4aad-b42b-d89727782fd1.gz.parquet.crc │ ├── _SUCCESS │ ├── _common_metadata │ ├── _metadata │ ├── part-r-00000-fbf168ec-2750-4aad-b42b-d89727782fd1.gz.parquet │ └── part-r-00001-fbf168ec-2750-4aad-b42b-d89727782fd1.gz.parquet └── world-cities-points.parquet │ ├── ._SUCCESS.crc │ ├── ._common_metadata.crc │ ├── ._metadata.crc │ ├── .part-r-00000-369d62a4-7282-4a38-933f-461a5a768c31.gz.parquet.crc │ ├── .part-r-00001-369d62a4-7282-4a38-933f-461a5a768c31.gz.parquet.crc │ ├── _SUCCESS │ ├── _common_metadata │ ├── _metadata │ ├── part-r-00000-369d62a4-7282-4a38-933f-461a5a768c31.gz.parquet │ └── part-r-00001-369d62a4-7282-4a38-933f-461a5a768c31.gz.parquet ├── deployToSonartype.md ├── docker-compose.yml ├── notebooks ├── 2BXC9TF8J │ └── note.json └── 2BYXS4JRX │ └── note.json ├── project ├── build.properties └── plugins.sbt ├── scalastyle-config.xml ├── scripts ├── loadAlice.scala ├── loadCities.scala ├── loadH1BVisa.scala ├── loadWords.scala ├── recordLinkage │ └── simpleExamples │ │ ├── linkageFuzzyExample.scala │ │ └── linkagePrefixExample.scala └── spatial │ ├── loadSolrSpatialData.scala │ └── loadSwissCities.scala ├── spark-shell.sh ├── src ├── main │ ├── resources │ │ └── reference.conf │ └── scala │ │ └── org │ │ └── zouzias │ │ └── spark │ │ └── lucenerdd │ │ ├── LuceneRDD.scala │ │ ├── LuceneRDDKryoRegistrator.scala │ │ ├── aggregate │ │ └── SparkFacetResultMonoid.scala │ │ ├── analyzers │ │ └── AnalyzerConfigurable.scala │ │ ├── config │ │ ├── Configurable.scala │ │ ├── LuceneRDDConfigurable.scala │ │ ├── LuceneRDDParams.scala │ │ └── ShapeLuceneRDDConfigurable.scala │ │ ├── facets │ │ ├── FacetedLuceneRDD.scala │ │ └── package.scala │ │ ├── matrices │ │ └── TermDocMatrix.scala │ │ ├── models │ │ ├── SparkFacetResult.scala │ │ ├── SparkScoreDoc.scala │ │ ├── TermVectorEntry.scala │ │ └── indexstats │ │ │ ├── FieldStatistics.scala │ │ │ └── IndexStatistics.scala │ │ ├── package.scala │ │ ├── partition │ │ ├── AbstractLuceneRDDPartition.scala │ │ └── LuceneRDDPartition.scala │ │ ├── query │ │ ├── LuceneQueryHelpers.scala │ │ └── SimilarityConfigurable.scala │ │ ├── response │ │ ├── FieldType.scala │ │ ├── LuceneRDDResponse.scala │ │ └── LuceneRDDResponsePartition.scala │ │ ├── spatial │ │ └── shape │ │ │ ├── ShapeLuceneRDD.scala │ 
│ │ ├── ShapeLuceneRDDKryoRegistrator.scala │ │ │ ├── context │ │ │ └── ContextLoader.scala │ │ │ ├── grids │ │ │ └── PrefixTreeLoader.scala │ │ │ ├── package.scala │ │ │ ├── partition │ │ │ ├── AbstractShapeLuceneRDDPartition.scala │ │ │ └── ShapeLuceneRDDPartition.scala │ │ │ └── strategies │ │ │ └── SpatialStrategy.scala │ │ ├── store │ │ ├── IndexStorable.scala │ │ └── IndexWithTaxonomyWriter.scala │ │ ├── testing │ │ ├── FavoriteCaseClass.scala │ │ └── Person.scala │ │ └── versioning │ │ └── Versionable.scala └── test │ ├── resources │ ├── alice.txt │ ├── capitals.txt │ ├── cities.txt │ ├── countries.geo.json │ ├── countries.txt │ ├── country-list.csv │ ├── log4j.properties │ ├── reference.conf │ ├── spatial │ │ └── CH.txt │ └── words.txt │ └── scala │ └── org │ └── zouzias │ └── spark │ └── lucenerdd │ ├── BlockingDedupSpec.scala │ ├── BlockingLinkageSpec.scala │ ├── LuceneDocToSparkRowpec.scala │ ├── LucenePrimitiveTypesSpec.scala │ ├── LuceneRDDCustomCaseClassImplicitsSpec.scala │ ├── LuceneRDDDataFrameImplicitsSpec.scala │ ├── LuceneRDDMoreLikeThisSpec.scala │ ├── LuceneRDDRecordLinkageSpec.scala │ ├── LuceneRDDSearchSpec.scala │ ├── LuceneRDDSpec.scala │ ├── LuceneRDDTermVectorsSpec.scala │ ├── LuceneRDDTuplesSpec.scala │ ├── analyzers │ └── AnalyzersConfigurableSpec.scala │ ├── facets │ ├── FacetedLuceneRDDFacetSpec.scala │ └── FacetedLuceneRDDImplicitsSpec.scala │ ├── query │ └── LuceneQueryHelpersSpec.scala │ ├── response │ └── LuceneRDDResponseSpec.scala │ ├── spatial │ └── shape │ │ ├── ShapeLuceneRDDKnnSearchSpec.scala │ │ ├── ShapeLuceneRDDLinkageSpec.scala │ │ ├── ShapeLuceneRDDSpatialSearchSpec.scala │ │ ├── ShapeLuceneRDDSpec.scala │ │ └── implicits │ │ └── ShapeLuceneRDDImplicitsSpec.scala │ └── testing │ └── LuceneRDDTestUtils.scala ├── startZeppelin.sh └── version.sbt /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | 5 | --- 6 | 7 | **Describe the bug** 8 | A clear and concise description of what the bug is. 9 | 10 | **To Reproduce** 11 | Steps to reproduce the behavior (code snippet) 12 | 13 | 14 | **Expected behavior** 15 | A clear and concise description of what you expected to happen. 16 | 17 | 18 | **Versions (please complete the following information):** 19 | - spark-lucenerdd version: [e.g. 0.3.0] 20 | - SBT version: [e.g. 1.2.3] 21 | - Spark Version: [e.g. 2.3.2] 22 | - Java version: [e.g. Java 8] 23 | 24 | 25 | **Additional context** 26 | Add any other context about the problem here. 27 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | 5 | --- 6 | 7 | **Is your feature request related to a problem? Please describe.** 8 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 9 | 10 | **Describe the solution you'd like** 11 | A clear and concise description of what you want to happen. 12 | 13 | **Describe alternatives you've considered** 14 | A clear and concise description of any alternative solutions or features you've considered. 15 | 16 | **Additional context** 17 | Add any other context or screenshots about the feature request here. 
18 | -------------------------------------------------------------------------------- /.github/main.workflow: -------------------------------------------------------------------------------- 1 | workflow "New workflow" { 2 | on = "push" 3 | } 4 | -------------------------------------------------------------------------------- /.github/workflows/codacy-analysis.yml: -------------------------------------------------------------------------------- 1 | # This workflow checks out code, performs a Codacy security scan 2 | # and integrates the results with the 3 | # GitHub Advanced Security code scanning feature. For more information on 4 | # the Codacy security scan action usage and parameters, see 5 | # https://github.com/codacy/codacy-analysis-cli-action. 6 | # For more information on Codacy Analysis CLI in general, see 7 | # https://github.com/codacy/codacy-analysis-cli. 8 | 9 | name: Codacy Security Scan 10 | 11 | on: 12 | push: 13 | branches: [ master, BRANCH-0.1.x, BRANCH-0.2.x, develop ] 14 | pull_request: 15 | # The branches below must be a subset of the branches above 16 | branches: [ master ] 17 | schedule: 18 | - cron: '36 23 * * 5' 19 | 20 | jobs: 21 | codacy-security-scan: 22 | name: Codacy Security Scan 23 | runs-on: ubuntu-latest 24 | steps: 25 | # Check out the repository to the GitHub Actions runner 26 | - name: Checkout code 27 | uses: actions/checkout@v2 28 | 29 | # Execute the Codacy Analysis CLI and generate a SARIF output with the security issues identified during the analysis 30 | - name: Run Codacy Analysis CLI 31 | uses: codacy/codacy-analysis-cli-action@1.1.0 32 | with: 33 | # Check https://github.com/codacy/codacy-analysis-cli#project-token to get your project token from your Codacy repository 34 | # You can also omit the token and run the tools that support default configurations 35 | project-token: ${{ secrets.CODACY_PROJECT_TOKEN }} 36 | verbose: true 37 | output: results.sarif 38 | format: sarif 39 | # Adjust severity of non-security issues 40 | gh-code-scanning-compat: true 41 | # Force a 0 exit code to allow SARIF file generation 42 | # This hands over control of PR rejection to the GitHub side 43 | max-allowed-issues: 2147483647 44 | 45 | # Upload the SARIF file generated in the previous step 46 | - name: Upload SARIF results file 47 | uses: github/codeql-action/upload-sarif@v1 48 | with: 49 | sarif_file: results.sarif 50 | -------------------------------------------------------------------------------- /.github/workflows/scala.yml: -------------------------------------------------------------------------------- 1 | name: Scala CI 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - uses: actions/checkout@v1 12 | - name: Set up JDK 11 13 | uses: actions/setup-java@v1 14 | with: 15 | java-version: 11 16 | - name: Run tests 17 | run: sbt test 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | commons-csv-1.1.jar 2 | spark-csv_2.11-1.4.0.jar 3 | spark-csv_2.10-1.4.0.jar 4 | src/test/resources/h1bvisa-2014.csv 5 | 6 | NOTES.md 7 | metastore_db/ 8 | .idea/ 9 | *.class 10 | *.log 11 | 12 | # sbt specific 13 | .cache 14 | .history 15 | .lib/ 16 | dist/* 17 | target/ 18 | lib_managed/ 19 | src_managed/ 20 | project/boot/ 21 | project/plugins/project/ 22 | 23 | # Scala-IDE specific 24 | .scala_dependencies 25 | .worksheet 26 |
-------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.12.10 4 | sudo: false 5 | dist: trusty 6 | cache: 7 | directories: 8 | - $HOME/.sbt/0.13/dependency 9 | - $HOME/.sbt/boot/scala* 10 | - $HOME/.sbt/launchers 11 | - $HOME/.ivy2/cache 12 | before_cache: 13 | - du -h -d 1 $HOME/.ivy2/cache 14 | - du -h -d 2 $HOME/.sbt/ 15 | - find $HOME/.sbt -name "*.lock" -type f -delete 16 | - find $HOME/.ivy2/cache -name "ivydata-*.properties" -type f -delete 17 | matrix: 18 | include: 19 | - jdk: oraclejdk8 20 | env: LUCENERDD_ANALYZER_NAME="en" LUCENERDD_LINKER_METHOD="cartesian" 21 | - jdk: openjdk8 22 | env: LUCENERDD_ANALYZER_NAME="en" LUCENERDD_LINKER_METHOD="collectbroadcast" 23 | - jdk: openjdk8 24 | env: LUCENERDD_ANALYZER_NAME="whitespace" LUCENERDD_LINKER_METHOD="cartesian" 25 | - jdk: oraclejdk8 26 | env: LUCENERDD_ANALYZER_NAME="whitespace" LUCENERDD_LINKER_METHOD="collectbroadcast" 27 | script: 28 | - sbt ++$TRAVIS_SCALA_VERSION -Dlucenerdd.linker.method=${LUCENERDD_LINKER_METHOD} clean update test 29 | - sbt ++$TRAVIS_SCALA_VERSION scalastyle 30 | - sbt ++$TRAVIS_SCALA_VERSION assembly 31 | - travis_wait 30 sbt ++$TRAVIS_SCALA_VERSION clean coverage test coverageReport 32 | after_success: 33 | - bash <(curl -s https://codecov.io/bash) 34 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | 18 | name := "spark-lucenerdd" 19 | organization := "org.zouzias" 20 | scalaVersion := "2.12.19" 21 | crossScalaVersions := Seq("2.12.19") 22 | licenses := Seq("Apache-2.0" -> url("http://www.apache.org/licenses/LICENSE-2.0.html")) 23 | homepage := Some(url("https://github.com/zouzias/spark-lucenerdd")) 24 | 25 | scalacOptions ++= Seq("-deprecation", 26 | "-encoding", "UTF-8", 27 | "-feature", 28 | "-unchecked", 29 | "-Xlint", 30 | "-Yno-adapted-args", 31 | "-Ywarn-dead-code", 32 | "-Ywarn-numeric-widen", 33 | "-Ywarn-value-discard", 34 | "-language:implicitConversions") 35 | 36 | javacOptions ++= Seq("-Xlint", 37 | "-Xms512M", 38 | "-Xmx2048M", 39 | "-XX:MaxPermSize=2048M", 40 | "-XX:+CMSClassUnloadingEnabled" 41 | ) 42 | 43 | // Add jcenter repo 44 | resolvers += Resolver.jcenterRepo 45 | resolvers += "Apache Repos" at "https://repository.apache.org/content/repositories/releases" 46 | 47 | releaseCrossBuild := false 48 | releasePublishArtifactsAction := PgpKeys.publishSigned.value 49 | 50 | publishMavenStyle := true 51 | 52 | sonatypeProfileName := "org.zouzias" 53 | 54 | publishTo := { 55 | val nexus = "https://oss.sonatype.org/" 56 | if (isSnapshot.value) { 57 | Some("snapshots" at nexus + "content/repositories/snapshots") 58 | } 59 | else { 60 | Some("releases" at nexus + "service/local/staging/deploy/maven2") 61 | } 62 | } 63 | 64 | Test / publishArtifact := false 65 | 66 | pomIncludeRepository := { _ => false } 67 | 68 | pomExtra := <scm> 69 | <url>git@github.com:zouzias/spark-lucenerdd.git</url> 70 | <connection>scm:git:git@github.com:zouzias/spark-lucenerdd.git</connection> 71 | </scm> 72 | <developers> 73 | <developer> 74 | <id>zouzias</id> 75 | <name>Anastasios Zouzias</name> 76 | <url>https://github.com/zouzias</url> 77 | </developer> 78 | </developers> 79 | 80 | val luceneV = "8.11.3" 81 | val sparkVersion = "3.5.6" 82 | 83 | credentials += Credentials(Path.userHome / ".sbt" / ".credentials") 84 | 85 | 86 | // scalastyle:off 87 | 88 | val scalactic = "org.scalactic" %% "scalactic" % "3.2.19" 89 | val scalatest = "org.scalatest" %% "scalatest" % "3.2.19" % "test" 90 | 91 | 92 | val joda_time = "joda-time" % "joda-time" % "2.12.7" 93 | val algebird = "com.twitter" %% "algebird-core" % "0.13.10" 94 | val joda_convert = "org.joda" % "joda-convert" % "2.2.3" 95 | val spatial4j = "org.locationtech.spatial4j" % "spatial4j" % "0.8" 96 | 97 | val typesafe_config = "com.typesafe" % "config" % "1.3.4" 98 | 99 | val lucene_facet = "org.apache.lucene" % "lucene-facet" % luceneV 100 | val lucene_analyzers = "org.apache.lucene" % "lucene-analyzers-common" % luceneV 101 | val lucene_query_parsers = "org.apache.lucene" % "lucene-queryparser" % luceneV 102 | val lucene_expressions = "org.apache.lucene" % "lucene-expressions" % luceneV 103 | val lucene_spatial_extras = "org.apache.lucene" % "lucene-spatial-extras" % luceneV 104 | 105 | val jts = "org.locationtech.jts" % "jts-core" % "1.19.0" 106 | // scalastyle:on 107 | 108 | 109 | libraryDependencies ++= Seq( 110 | algebird, 111 | lucene_facet, 112 | lucene_analyzers, 113 | lucene_expressions, 114 | lucene_query_parsers, 115 | typesafe_config, 116 | lucene_spatial_extras, 117 | spatial4j, 118 | jts, 119 | joda_time, 120 | joda_convert, // To avoid warning: Class org.joda.convert.ToString not found 121 | scalactic, // scalactic is recommended, see http://www.scalatest.org/install 122 | scalatest 123 | ) 124 | 125 | libraryDependencies ++= Seq( 126 | "org.apache.spark" %% "spark-core" % sparkVersion % "provided", 127 | "org.apache.spark" %% "spark-sql" % sparkVersion % "provided", 128 | "org.apache.spark" %% "spark-mllib" % sparkVersion % "provided", 129 |
"com.holdenkarau" %% "spark-testing-base" % s"3.5.1_1.5.3" % "test" intransitive(), 130 | "org.scala-lang" % "scala-library" % scalaVersion.value % "compile" 131 | ) 132 | 133 | // Read version in code from build.sbt 134 | lazy val root = (project in file(".")). 135 | enablePlugins(BuildInfoPlugin). 136 | settings( 137 | buildInfoKeys := Seq[BuildInfoKey](name, version, scalaVersion, sbtVersion), 138 | // See https://github.com/sbt/sbt-buildinfo#buildinfooptionbuildtime 139 | buildInfoOptions += BuildInfoOption.BuildTime, 140 | // https://github.com/sbt/sbt-buildinfo#buildinfooptiontomap 141 | buildInfoOptions += BuildInfoOption.ToMap, 142 | buildInfoPackage := "org.zouzias.spark.lucenerdd" 143 | ) 144 | 145 | lazy val compileScalastyle = taskKey[Unit]("compileScalastyle") 146 | compileScalastyle := scalastyle.in(Compile).toTask("").value 147 | (compile in Compile) := ((compile in Compile) dependsOn compileScalastyle).value 148 | 149 | Test / parallelExecution := false 150 | 151 | // Skip tests during assembly 152 | assembly / test := {} 153 | 154 | // To avoid merge issues 155 | assembly / assemblyMergeStrategy := { 156 | case PathList("module-info.class", xs @ _*) => MergeStrategy.first 157 | case PathList("META-INF", "MANIFEST.MF") => MergeStrategy.discard 158 | case x => 159 | val oldStrategy = (assembly / assemblyMergeStrategy).value 160 | oldStrategy(x) 161 | } 162 | -------------------------------------------------------------------------------- /data/capitals.parquet/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /data/capitals.parquet/._common_metadata.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/capitals.parquet/._common_metadata.crc -------------------------------------------------------------------------------- /data/capitals.parquet/._metadata.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/capitals.parquet/._metadata.crc -------------------------------------------------------------------------------- /data/capitals.parquet/.part-r-00000-a04af53d-dc52-452d-9807-e83325558a3e.gz.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/capitals.parquet/.part-r-00000-a04af53d-dc52-452d-9807-e83325558a3e.gz.parquet.crc -------------------------------------------------------------------------------- /data/capitals.parquet/.part-r-00001-a04af53d-dc52-452d-9807-e83325558a3e.gz.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/capitals.parquet/.part-r-00001-a04af53d-dc52-452d-9807-e83325558a3e.gz.parquet.crc -------------------------------------------------------------------------------- /data/capitals.parquet/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/capitals.parquet/_SUCCESS 
-------------------------------------------------------------------------------- /data/capitals.parquet/_common_metadata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/capitals.parquet/_common_metadata -------------------------------------------------------------------------------- /data/capitals.parquet/_metadata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/capitals.parquet/_metadata -------------------------------------------------------------------------------- /data/capitals.parquet/part-r-00000-a04af53d-dc52-452d-9807-e83325558a3e.gz.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/capitals.parquet/part-r-00000-a04af53d-dc52-452d-9807-e83325558a3e.gz.parquet -------------------------------------------------------------------------------- /data/capitals.parquet/part-r-00001-a04af53d-dc52-452d-9807-e83325558a3e.gz.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/capitals.parquet/part-r-00001-a04af53d-dc52-452d-9807-e83325558a3e.gz.parquet -------------------------------------------------------------------------------- /data/countries-bbox.parquet/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /data/countries-bbox.parquet/._common_metadata.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-bbox.parquet/._common_metadata.crc -------------------------------------------------------------------------------- /data/countries-bbox.parquet/._metadata.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-bbox.parquet/._metadata.crc -------------------------------------------------------------------------------- /data/countries-bbox.parquet/.part-r-00000-d0949c8a-70f4-4615-bb5e-b5010cb9490f.gz.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-bbox.parquet/.part-r-00000-d0949c8a-70f4-4615-bb5e-b5010cb9490f.gz.parquet.crc -------------------------------------------------------------------------------- /data/countries-bbox.parquet/.part-r-00001-d0949c8a-70f4-4615-bb5e-b5010cb9490f.gz.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-bbox.parquet/.part-r-00001-d0949c8a-70f4-4615-bb5e-b5010cb9490f.gz.parquet.crc -------------------------------------------------------------------------------- /data/countries-bbox.parquet/_SUCCESS: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-bbox.parquet/_SUCCESS -------------------------------------------------------------------------------- /data/countries-bbox.parquet/_common_metadata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-bbox.parquet/_common_metadata -------------------------------------------------------------------------------- /data/countries-bbox.parquet/_metadata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-bbox.parquet/_metadata -------------------------------------------------------------------------------- /data/countries-bbox.parquet/part-r-00000-d0949c8a-70f4-4615-bb5e-b5010cb9490f.gz.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-bbox.parquet/part-r-00000-d0949c8a-70f4-4615-bb5e-b5010cb9490f.gz.parquet -------------------------------------------------------------------------------- /data/countries-bbox.parquet/part-r-00001-d0949c8a-70f4-4615-bb5e-b5010cb9490f.gz.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-bbox.parquet/part-r-00001-d0949c8a-70f4-4615-bb5e-b5010cb9490f.gz.parquet -------------------------------------------------------------------------------- /data/countries-poly.parquet/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /data/countries-poly.parquet/._common_metadata.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-poly.parquet/._common_metadata.crc -------------------------------------------------------------------------------- /data/countries-poly.parquet/._metadata.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-poly.parquet/._metadata.crc -------------------------------------------------------------------------------- /data/countries-poly.parquet/.part-r-00000-fbf168ec-2750-4aad-b42b-d89727782fd1.gz.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-poly.parquet/.part-r-00000-fbf168ec-2750-4aad-b42b-d89727782fd1.gz.parquet.crc -------------------------------------------------------------------------------- /data/countries-poly.parquet/.part-r-00001-fbf168ec-2750-4aad-b42b-d89727782fd1.gz.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-poly.parquet/.part-r-00001-fbf168ec-2750-4aad-b42b-d89727782fd1.gz.parquet.crc 
-------------------------------------------------------------------------------- /data/countries-poly.parquet/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-poly.parquet/_SUCCESS -------------------------------------------------------------------------------- /data/countries-poly.parquet/_common_metadata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-poly.parquet/_common_metadata -------------------------------------------------------------------------------- /data/countries-poly.parquet/_metadata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-poly.parquet/_metadata -------------------------------------------------------------------------------- /data/countries-poly.parquet/part-r-00000-fbf168ec-2750-4aad-b42b-d89727782fd1.gz.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-poly.parquet/part-r-00000-fbf168ec-2750-4aad-b42b-d89727782fd1.gz.parquet -------------------------------------------------------------------------------- /data/countries-poly.parquet/part-r-00001-fbf168ec-2750-4aad-b42b-d89727782fd1.gz.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/countries-poly.parquet/part-r-00001-fbf168ec-2750-4aad-b42b-d89727782fd1.gz.parquet -------------------------------------------------------------------------------- /data/world-cities-points.parquet/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /data/world-cities-points.parquet/._common_metadata.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/world-cities-points.parquet/._common_metadata.crc -------------------------------------------------------------------------------- /data/world-cities-points.parquet/._metadata.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/world-cities-points.parquet/._metadata.crc -------------------------------------------------------------------------------- /data/world-cities-points.parquet/.part-r-00000-369d62a4-7282-4a38-933f-461a5a768c31.gz.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/world-cities-points.parquet/.part-r-00000-369d62a4-7282-4a38-933f-461a5a768c31.gz.parquet.crc -------------------------------------------------------------------------------- /data/world-cities-points.parquet/.part-r-00001-369d62a4-7282-4a38-933f-461a5a768c31.gz.parquet.crc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/world-cities-points.parquet/.part-r-00001-369d62a4-7282-4a38-933f-461a5a768c31.gz.parquet.crc -------------------------------------------------------------------------------- /data/world-cities-points.parquet/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/world-cities-points.parquet/_SUCCESS -------------------------------------------------------------------------------- /data/world-cities-points.parquet/_common_metadata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/world-cities-points.parquet/_common_metadata -------------------------------------------------------------------------------- /data/world-cities-points.parquet/_metadata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/world-cities-points.parquet/_metadata -------------------------------------------------------------------------------- /data/world-cities-points.parquet/part-r-00000-369d62a4-7282-4a38-933f-461a5a768c31.gz.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/world-cities-points.parquet/part-r-00000-369d62a4-7282-4a38-933f-461a5a768c31.gz.parquet -------------------------------------------------------------------------------- /data/world-cities-points.parquet/part-r-00001-369d62a4-7282-4a38-933f-461a5a768c31.gz.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zouzias/spark-lucenerdd/82fb477d0d6ba78b678e560f1a32bcdbce6babff/data/world-cities-points.parquet/part-r-00001-369d62a4-7282-4a38-933f-461a5a768c31.gz.parquet -------------------------------------------------------------------------------- /deployToSonartype.md: -------------------------------------------------------------------------------- 1 | ## Setup 2 | 3 | # Add a `.credentials` file under the `~/.sbt/` folder with the following contents 4 | 5 | ``` 6 | realm=Sonatype Nexus Repository Manager 7 | host=oss.sonatype.org 8 | user=(USERNAME) 9 | password=(PASSWORD_HERE) 10 | ``` 11 | 12 | ## Run `sbt release` to publish signed artifacts for both Scala 2.10 and 2.11 13 | 14 | ```bash 15 | sbt release 16 | ``` 17 | 18 | ## Then, check out the release tag with `git checkout v0.X.X`, and type 19 | 20 | ```bash 21 | sbt sonatypeRelease 22 | ``` 23 | 24 | ## This allows Sonatype to release the artifacts to Maven Central.
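For reference, the end-to-end flow looks roughly as follows (the tag name `v0.3.13` below is hypothetical; substitute the tag that `sbt release` actually created):

```bash
# Publish signed artifacts (interactive; prompts for the version and tags the release)
sbt release

# Check out the tag created by sbt-release (hypothetical tag name)
git checkout v0.3.13

# Promote the staged artifacts from the Sonatype staging repository to Maven Central
sbt sonatypeRelease
```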
25 | ## An alternative is to browse to https://oss.sonatype.org and do it manually 26 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | zeppelin: 2 | image: dylanmei/zeppelin 3 | environment: 4 | ZEPPELIN_PORT: 8080 5 | ZEPPELIN_JAVA_OPTS: >- 6 | -Dspark.driver.memory=1g 7 | -Dspark.executor.memory=1g 8 | MASTER: local[*] 9 | ports: 10 | - 8080:8080 11 | volumes: 12 | - ./data:/usr/zeppelin/data 13 | - ./notebooks:/usr/zeppelin/notebook 14 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.10.11 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | resolvers += "bintray-spark-packages" at "https://dl.bintray.com/spark-packages/maven/" 19 | 20 | addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.12.0") 21 | 22 | addSbtPlugin("com.timushev.sbt" % "sbt-updates" % "0.6.4") 23 | 24 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.2.0") 25 | 26 | addSbtPlugin("com.github.sbt" % "sbt-release" % "1.4.0") 27 | 28 | addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0") 29 | 30 | addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.9.3") 31 | 32 | addSbtPlugin("org.scoverage" % "sbt-coveralls" % "1.3.15") 33 | 34 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.11.3") 35 | 36 | addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.2.1") 37 | -------------------------------------------------------------------------------- /scripts/loadAlice.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | import scala.io.Source 19 | import org.zouzias.spark.lucenerdd._ 20 | import org.zouzias.spark.lucenerdd.LuceneRDD 21 | val words = Source.fromFile("src/test/resources/alice.txt").getLines().map(_.trim.toLowerCase).filter(_.length > 3).toSeq 22 | val rdd = sc.parallelize(words) 23 | val luceneRDD = LuceneRDD(rdd) 24 | luceneRDD.cache 25 | luceneRDD.count 26 | 27 | 28 | luceneRDD.moreLikeThis("_1", "alice adventures wonderland", 1, 1, 20).take(20).foreach(println) 29 | 30 | import org.zouzias.spark.lucenerdd.matrices.TermDocMatrix 31 | 32 | 33 | // Construct the term-document matrix 34 | val terms = luceneRDD.termVectors("_1") // _1 is the default field name 35 | val mat = new TermDocMatrix(terms) 36 | 37 | -------------------------------------------------------------------------------- /scripts/loadCities.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | sc.setLogLevel("INFO") 19 | 20 | import scala.io.Source 21 | import org.zouzias.spark.lucenerdd.partition.LuceneRDDPartition 22 | import org.zouzias.spark.lucenerdd._ 23 | import org.zouzias.spark.lucenerdd.LuceneRDD 24 | 25 | val cities = Source.fromFile("src/test/resources/cities.txt").getLines().toSeq 26 | val rdd = sc.parallelize(cities) 27 | val luceneRDD = LuceneRDD(rdd) 28 | luceneRDD.cache 29 | luceneRDD.count 30 | 31 | println("=" * 20) 32 | luceneRDD.termQuery("_1", "toronto").take(10) 33 | 34 | println("=" * 20) 35 | luceneRDD.termQuery("_1", "athens").take(10) 36 | 37 | println("=" * 20) 38 | luceneRDD.termQuery("_1", "bern").take(10) 39 | 40 | println("=" * 20) 41 | luceneRDD.termQuery("_1", "madrid").take(10) 42 | -------------------------------------------------------------------------------- /scripts/loadH1BVisa.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | import org.zouzias.spark.lucenerdd._ 19 | import org.zouzias.spark.lucenerdd.LuceneRDD 20 | 21 | val df = spark.sqlContext.read.format("com.databricks.spark.csv").option("header", "true").option("inferSchema", "true").load("src/test/resources/h1bvisa-2014.csv") 22 | val words = df.select("lca_case_employer_name", "lca_case_job_title", "lca_case_employer_city", "lca_case_employer_state", "lca_case_employer_postal_code") 23 | val luceneRDD = LuceneRDD(words.sample(true, 0.01)) 24 | luceneRDD.count -------------------------------------------------------------------------------- /scripts/loadWords.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | import scala.io.Source 19 | import org.zouzias.spark.lucenerdd._ 20 | import org.zouzias.spark.lucenerdd.LuceneRDD 21 | val words = Source.fromFile("src/test/resources/words.txt").getLines().toSeq 22 | val rdd = sc.parallelize(words) 23 | val luceneRDD = LuceneRDD(rdd) 24 | luceneRDD.count 25 | -------------------------------------------------------------------------------- /scripts/recordLinkage/simpleExamples/linkageFuzzyExample.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | 18 | import scala.io.Source 19 | import org.apache.spark.rdd.RDD 20 | import org.zouzias.spark.lucenerdd._ 21 | import org.zouzias.spark.lucenerdd.LuceneRDD 22 | import org.zouzias.spark.lucenerdd.models.SparkScoreDoc 23 | 24 | // Step 1: Query prefixes of countries 25 | // Shooting for Greece, Germany, Belgium and Italy 26 | val leftCountries = Array("gree", "germa", "belgi", "ita") 27 | val leftCountriesRDD: RDD[String] = sc.parallelize(leftCountries) 28 | 29 | // Step 2: Load all country names 30 | val countries = sc.parallelize(Source.fromFile("src/test/resources/countries.txt").getLines() 31 | .map(_.toLowerCase()).toSeq) 32 | val luceneRDD = LuceneRDD(countries) 33 | luceneRDD.cache() 34 | 35 | // Step 3: Define your linkage function (fuzzy query) 36 | def fuzzyLinker(country: String): String = { 37 | val Fuzziness = 2 38 | s"_1:${country}~${Fuzziness}" 39 | } 40 | 41 | // Step 4: Perform the linkage 42 | val linked: RDD[(String, Array[SparkScoreDoc])] = luceneRDD.link(leftCountriesRDD, fuzzyLinker, 10) 43 | 44 | // Step 5: View the results (sample output below; its first line is from an earlier run that queried "spa" instead of "belgi") 45 | linked.foreach(x => println((x._1, x._2.mkString(",")))) 46 | 47 | // (spa,List(SparkScoreDoc(5.1271343,84,0,Text fields:_1:[spain]))) 48 | // (gree,List(SparkScoreDoc(5.1271343,86,0,Text fields:_1:[greece]))) 49 | // (germa,List(SparkScoreDoc(5.127134,83,0,Text fields:_1:[germany]))) 50 | // (ita,List(SparkScoreDoc(2.9601524,106,0,Text fields:_1:[italy]), SparkScoreDoc(2.9601524,102,0,Text fields:_1:[iraq]), SparkScoreDoc(2.9601524,101,0,Text fields:_1:[iran])) 51 | 52 | -------------------------------------------------------------------------------- /scripts/recordLinkage/simpleExamples/linkagePrefixExample.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | 18 | import scala.io.Source 19 | import org.apache.spark.rdd.RDD 20 | import org.zouzias.spark.lucenerdd._ 21 | import org.zouzias.spark.lucenerdd.LuceneRDD 22 | import org.zouzias.spark.lucenerdd.models.SparkScoreDoc 23 | 24 | // Step 1: Query prefixes of countries 25 | // Shooting for Greece, Russia, Argentina and Belgium 26 | val leftCountries = Array("gre", "ru", "ar", "bel") 27 | val leftCountriesRDD: RDD[String] = sc.parallelize(leftCountries) 28 | 29 | // Step 2: Load all country names 30 | val countries = sc.parallelize(Source.fromFile("src/test/resources/countries.txt").getLines() 31 | .map(_.toLowerCase()).toSeq) 32 | val luceneRDD = LuceneRDD(countries) 33 | 34 | // Step 3: Define your linkage function (prefix) 35 | def prefixLinker(country: String): String = { 36 | s"_1:${country}*" 37 | } 38 | 39 | // Step 4: Perform the linkage 40 | val linked: RDD[(String, Array[SparkScoreDoc])] = luceneRDD.link(leftCountriesRDD, prefixLinker, 10) 41 | 42 | // Step 5: View the results (sample output below; its last line is from an earlier run that queried "be" instead of "bel") 43 | linked.foreach(x => println((x._1, x._2.mkString(",")))) 44 | 45 | // (gre,List(SparkScoreDoc(1.0,88,0,Text fields:_1:[grenada]), SparkScoreDoc(1.0,87,0,Text fields:_1:[greenland]), SparkScoreDoc(1.0,86,0,Text fields:_1:[greece]))) 46 | // (ar,List(SparkScoreDoc(1.0,12,0,Text fields:_1:[aruba]), SparkScoreDoc(1.0,11,0,Text fields:_1:[armenia]), SparkScoreDoc(1.0,10,0,Text fields:_1:[argentina]))) 47 | // (ru,List(SparkScoreDoc(1.0,55,0,Text fields:_1:[russia]))) 48 | // (be,List(SparkScoreDoc(1.0,25,0,Text fields:_1:[bermuda]), SparkScoreDoc(1.0,24,0,Text fields:_1:[benin]), SparkScoreDoc(1.0,23,0,Text fields:_1:[belize]), SparkScoreDoc(1.0,22,0,Text fields:_1:[belgium]), SparkScoreDoc(1.0,21,0,Text fields:_1:[belarus]))) -------------------------------------------------------------------------------- /scripts/spatial/loadSolrSpatialData.scala: -------------------------------------------------------------------------------- 1 | 2 | 3 | import java.io.StringReader 4 | 5 | import org.locationtech.spatial4j.context.jts.JtsSpatialContext 6 | import org.locationtech.spatial4j.io.ShapeIO 7 | import org.apache.spark.rdd.RDD 8 | import org.zouzias.spark.lucenerdd.spatial.shape._ 9 | import org.zouzias.spark.lucenerdd.spatial.shape.ShapeLuceneRDD 10 | import org.zouzias.spark.lucenerdd._ 11 | import org.zouzias.spark.lucenerdd.LuceneRDD 12 | 13 | import scala.reflect.ClassTag 14 | 15 | sc.setLogLevel("INFO") 16 | 17 | // Load all countries 18 | val allCountries = spark.read.parquet("data/countries-poly.parquet").select("name", "shape").map(row => (row.getString(1), row.getString(0))) 19 | 20 | // Load all capital cities 21 | val capitals = spark.read.parquet("data/capitals.parquet").select("name", "shape").map(row => (row.getString(1), row.getString(0))) 22 | 23 | def parseDouble(s: String): Double = try { s.toDouble } catch { case _: Throwable => 0.0 } 24 | 25 | def coords(city: (String, String)): (Double, Double) = { 26 | val str = city._1 27 | val nums = str.dropWhile(x => x.compareTo('(') != 0).drop(1).dropRight(1) 28 | val coords = nums.split(" ").map(_.trim) 29 | (parseDouble(coords(0)), parseDouble(coords(1))) 30 | } 31 | 32 | val shapes = ShapeLuceneRDD(allCountries) 33 | shapes.cache 34 | 35 | 36 | val linked = shapes.linkByRadius(capitals.rdd, coords, 50, 10) 37 | linked.cache 38 | 39 | linked.map(x => (x._1, x._2.map(_.doc.textField("_1")))).foreach(println) -------------------------------------------------------------------------------- /scripts/spatial/loadSwissCities.scala:
-------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | import org.zouzias.spark.lucenerdd.spatial.shape._ 19 | import org.zouzias.spark.lucenerdd.spatial.shape.ShapeLuceneRDD 20 | import org.zouzias.spark.lucenerdd._ 21 | import org.zouzias.spark.lucenerdd.LuceneRDD 22 | val df = spark.read.format("com.databricks.spark.csv").option("header", "false").option("inferSchema", "true").option("delimiter", "\t").load("src/test/resources/spatial/CH.txt") 23 | val swissCities = df.select("_c0", "_c1", "_c5", "_c4").map(row => ((row.getDouble(2), row.getDouble(3)), row.getString(1).toLowerCase())) 24 | val shapes = ShapeLuceneRDD(swissCities.rdd) 25 | shapes.count -------------------------------------------------------------------------------- /spark-shell.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CURRENT_DIR=`pwd` 4 | 5 | # Read the version from version.sbt 6 | SPARK_LUCENERDD_VERSION=`cat version.sbt | awk '{print $5}' | xargs` 7 | 8 | # You should have downloaded this spark version under your ${HOME} 9 | SPARK_VERSION="3.2.1" 10 | 11 | echo "===============================================" 12 | echo "Loading LuceneRDD with version ${SPARK_LUCENERDD_VERSION}" 13 | echo "===============================================" 14 | 15 | echo "===============================================" 16 | echo "SPARK version: ${SPARK_VERSION}" 17 | echo "===============================================" 18 | 19 | # Assumes that spark is installed under home directory 20 | HOME_DIR=`echo ~` 21 | #export SPARK_LOCAL_IP=localhost 22 | SPARK_HOME=${HOME_DIR}/spark-${SPARK_VERSION}-bin-hadoop3.2 23 | 24 | # spark-lucenerdd assembly JAR 25 | MAIN_JAR=${CURRENT_DIR}/target/scala-2.12/spark-lucenerdd-assembly-${SPARK_LUCENERDD_VERSION}.jar 26 | 27 | # Run spark shell locally 28 | ${SPARK_HOME}/bin/spark-shell --jars "${MAIN_JAR}" \ 29 | --conf "spark.executor.memory=1g" \ 30 | --conf "spark.driver.memory=1g" \ 31 | --conf "spark.rdd.compress=true" \ 32 | --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \ 33 | --conf "spark.kryo.registrator=org.zouzias.spark.lucenerdd.LuceneRDDKryoRegistrator" \ 34 | --conf spark.executor.extraJavaOptions="-Dlucenerdd.index.store.mode=disk" \ 35 | --conf spark.driver.extraJavaOptions="-Dlucenerdd.index.store.mode=disk" \ 36 | --conf "spark.kryoserializer.buffer=24mb" \ 37 | --master local[*] 38 | -------------------------------------------------------------------------------- /src/main/resources/reference.conf: -------------------------------------------------------------------------------- 1 | lucenerdd { 2 | 3 | // Name of 
analyzer as it is under Lucene's package org.apache.lucene.analysis.XX 4 | analyzer.name = "en" 5 | analyzer.name=${?LUCENERDD_ANALYZER_NAME} 6 | 7 | 8 | // Similarity scoring function for Lucene 9 | similarity.name = "bm25" // anything else will default to Lucene classic similarity 10 | similarity.name = ${?LUCENERDD_SIMILARITY_NAME} 11 | 12 | // Supported linkage methods 13 | // "collectbroadcast" : Collects the RDD that contains the queries (to be used only if the query RDD 14 | // fits in the Spark driver's memory) 15 | // 16 | // "cartesian" : Uses the cartesian product between the partitions of the queries RDD and the partitions 17 | // of LuceneRDD. Note it duplicates each partition of LuceneRDD n times, where n is the number of 18 | // partitions of the queries RDD. 19 | linker.method = "collectbroadcast" 20 | linker.method = ${?LUCENERDD_LINKER_METHOD} 21 | 22 | index { 23 | 24 | // Lucene index storage 25 | // Use 'disk' to store the index in Java's temp directory 26 | // Otherwise the index will be stored in memory 27 | // Do not use the in-memory option; RAMDirectory is deprecated, see http://lucene.apache.org/core/7_5_0/core/org/apache/lucene/store/RAMDirectory.html 28 | store.mode = "disk" 29 | store.mode = ${?LUCENERDD_INDEX_STORE_MODE} 30 | 31 | stringfields { 32 | 33 | // Whether to analyze string fields by default 34 | // Implicit fields, like _1, _2, etc. will use this option 35 | analyzed = true 36 | analyzed = ${?LUCENERDD_INDEX_STRINGFIELDS_ANALYZED} 37 | 38 | // Select a subset of string fields that you do not wish to be analyzed 39 | // Due to serialization issues this list should be set before starting a Spark Session 40 | // Moreover, all text/string fields that end with '_notanalyzed' are not analyzed 41 | not_analyzed_list = [] 42 | not_analyzed_list = ${?LUCENERDD_INDEX_STRINGFIELDS_NOT_ANALYZED_LIST} 43 | 44 | // Text field options as in org.apache.lucene.index.IndexOptions 45 | // 46 | // Other options are: 47 | // "DOCS" 48 | // "DOCS_AND_FREQS" 49 | // "DOCS_AND_FREQS_AND_POSITIONS" 50 | // "DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS" 51 | // "NONE" 52 | options = "docs_and_freqs_and_positions_and_offsets" 53 | options = ${?LUCENERDD_INDEX_STRINGFIELDS_OPTIONS} 54 | 55 | terms { 56 | // Omit term norms 57 | omitnorms = false 58 | omitnorms = ${?LUCENERDD_INDEX_STRINGFIELDS_TERMS_OMITNORMS} 59 | 60 | // Store term positions 61 | positions = false 62 | positions = ${?LUCENERDD_INDEX_STRINGFIELDS_TERMS_POSITIONS} 63 | 64 | // Store term vectors (set to true, otherwise LuceneRDD.termVectors(fieldName) will fail) 65 | vectors = true 66 | vectors = ${?LUCENERDD_INDEX_STRINGFIELDS_TERMS_VECTORS} 67 | } 68 | } 69 | } 70 | 71 | query { 72 | // Maximum value for topK queries 73 | topk.maxvalue = 100 74 | topk.maxvalue = ${?LUCENERDD_QUERY_TOPK_MAXVALUE} 75 | 76 | // Default number of returned results 77 | topk.default = 10 78 | topk.default = ${?LUCENERDD_QUERY_TOPK_DEFAULT} 79 | 80 | // Default number of faceted results 81 | facets.number.default = 10 82 | facets.number.default = ${?LUCENERDD_QUERY_FACETS_NUMBER_DEFAULT} 83 | 84 | } 85 | 86 | // Spatial-related configurations used by ShapeLuceneRDD 87 | spatial { 88 | prefixtree { 89 | 90 | // Spatial tree data structure 91 | name = "quad" // "geohash" or "quad" 92 | name = ${?LUCENE_SPATIAL_PREFIXTREE_NAME} 93 | 94 | maxlevel = 9 // 11 results in sub-meter precision for geohash 95 | maxlevel = ${?LUCENE_SPATIAL_PREFIXTREE_MAXLEVEL} 96 | 97 | 98 | maxDistErr = 5.0 // in kilometers 99 | maxDistErr = ${?LUCENE_SPATIAL_PREFIXTREE_MAXDISTERR} 100 | 101 | } 102 | 103 | //
103 | // Shape format can be one of ShapeIO.GeoJSON, ShapeIO.LEGACY, ShapeIO.POLY, ShapeIO.WKT 104 | shape.io.format = "WKT" 105 | shape.io.format = ${?LUCENE_SPATIAL_SHAPE_IO_FORMAT} 106 | 107 | // Supported linkage methods 108 | // "collectbroadcast" : Collects the RDD that contains the queries (to be used only if the query RDD 109 | // fits in the Spark driver's memory) 110 | // 111 | // "cartesian" : Uses the cartesian product between the partitions of the queries RDD and the partitions 112 | // of LuceneRDD. Note it duplicates each partition of LuceneRDD n times where n is the number of 113 | // partitions of the queries RDD. 114 | linker.method = "collectbroadcast" 115 | linker.method = ${?LUCENE_SPATIAL_LINKER_METHOD} 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/LuceneRDDKryoRegistrator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | package org.zouzias.spark.lucenerdd 18 | 19 | import com.twitter.algebird.TopK 20 | import com.twitter.chill.Kryo 21 | import org.apache.spark.SparkConf 22 | import org.apache.spark.serializer.{KryoRegistrator, KryoSerializer} 23 | import org.zouzias.spark.lucenerdd.facets.FacetedLuceneRDD 24 | import org.zouzias.spark.lucenerdd.models.{SparkFacetResult, SparkScoreDoc} 25 | import org.zouzias.spark.lucenerdd.partition.LuceneRDDPartition 26 | import org.zouzias.spark.lucenerdd.response.{LuceneRDDResponse, LuceneRDDResponsePartition} 27 | import org.zouzias.spark.lucenerdd.testing.{FavoriteCaseClass, Person} 28 | 29 | class LuceneRDDKryoRegistrator extends KryoRegistrator { 30 | def registerClasses(kryo: Kryo): Unit = { 31 | kryo.register(classOf[LuceneRDD[_]]) 32 | kryo.register(classOf[LuceneRDDPartition[_]]) 33 | kryo.register(classOf[FacetedLuceneRDD[_]]) 34 | kryo.register(classOf[Number]) 35 | kryo.register(classOf[java.lang.Double]) 36 | kryo.register(classOf[java.lang.Float]) 37 | kryo.register(classOf[java.lang.Integer]) 38 | kryo.register(classOf[java.lang.Long]) 39 | kryo.register(classOf[java.lang.Short]) 40 | kryo.register(classOf[scala.collection.mutable.WrappedArray.ofRef[_]]) 41 | kryo.register(classOf[scala.collection.mutable.WrappedArray.ofFloat]) 42 | kryo.register(classOf[scala.collection.mutable.WrappedArray.ofDouble]) 43 | kryo.register(classOf[scala.collection.mutable.WrappedArray.ofInt]) 44 | kryo.register(classOf[scala.collection.mutable.WrappedArray.ofLong]) 45 | kryo.register(classOf[Array[String]]) 46 | kryo.register(classOf[Array[Number]]) 47 | kryo.register(classOf[Array[Float]]) 48 | kryo.register(classOf[Array[Int]]) 49 | kryo.register(classOf[Array[Long]]) 50 | kryo.register(classOf[Array[Double]]) 51 | kryo.register(classOf[Array[Boolean]]) 52 | kryo.register(classOf[Range]) 53 | kryo.register(classOf[scala.collection.immutable.Map[String, String]]) 54 | kryo.register(classOf[scala.collection.immutable.Map[String, Number]]) 55 | kryo.register(classOf[scala.collection.immutable.Set[_]]) 56 | kryo.register(classOf[scala.collection.immutable.Map[_, _]]) 57 | kryo.register(classOf[Array[scala.collection.immutable.Map[_, _]]]) 58 | kryo.register(classOf[SparkFacetResult]) 59 | kryo.register(classOf[SparkScoreDoc]) 60 | kryo.register(classOf[LuceneRDDResponse]) 61 | kryo.register(classOf[LuceneRDDResponsePartition]) 62 | kryo.register(classOf[TopK[_]]) 63 | kryo.register(classOf[FavoriteCaseClass]) /* For testing */ 64 | kryo.register(classOf[Array[FavoriteCaseClass]]) /* For testing */ 65 | kryo.register(classOf[Person]) /* For testing */ 66 | kryo.register(classOf[Array[Person]]) /* For testing */ 67 | () 68 | } 69 | } 70 | 71 | /** 72 | * Helper for configuring LuceneRDD's Kryo serialization 73 | */ 74 | object LuceneRDDKryoRegistrator { 75 | def registerKryoClasses(conf: SparkConf): SparkConf = { 76 | conf.set("spark.serializer", classOf[KryoSerializer].getName) 77 | .set("spark.kryo.registrator", classOf[LuceneRDDKryoRegistrator].getName) 78 | .set("spark.kryo.registrationRequired", "false") 79 | /* Set the above to true s.t. all classes are registered with Kryo */ 80 | } 81 | }
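// A usage sketch (plain Spark wiring; SparkConf/SparkSession are Spark's own API):
//   val conf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf())
//   val spark = SparkSession.builder().config(conf).getOrCreate()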
82 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/aggregate/SparkFacetResultMonoid.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.zouzias.spark.lucenerdd.aggregate 18 | 19 | import com.twitter.algebird.MapMonoid 20 | import org.zouzias.spark.lucenerdd.models.SparkFacetResult 21 | 22 | /** 23 | * Monoid used to aggregate faceted results [[SparkFacetResult]] 24 | * from the executors to the driver 25 | */ 26 | object SparkFacetResultMonoid extends Serializable { 27 | 28 | private lazy val facetMonoid = new MapMonoid[String, Long]() 29 | 30 | def zero(facetName: String): SparkFacetResult = SparkFacetResult(facetName, facetMonoid.zero) 31 | 32 | def plus(l: SparkFacetResult, r: SparkFacetResult): SparkFacetResult = { 33 | require(l.facetName == r.facetName) // Check if summing same facets 34 | SparkFacetResult(l.facetName, facetMonoid.plus(l.facets, r.facets)) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/analyzers/AnalyzerConfigurable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | package org.zouzias.spark.lucenerdd.analyzers 18 | 19 | import org.apache.lucene.analysis.Analyzer 20 | import org.apache.lucene.analysis.ar.ArabicAnalyzer 21 | import org.apache.lucene.analysis.bg.BulgarianAnalyzer 22 | import org.apache.lucene.analysis.br.BrazilianAnalyzer 23 | import org.apache.lucene.analysis.ca.CatalanAnalyzer 24 | import org.apache.lucene.analysis.cjk.CJKAnalyzer 25 | import org.apache.lucene.analysis.ckb.SoraniAnalyzer 26 | import org.apache.lucene.analysis.core.WhitespaceAnalyzer 27 | import org.apache.lucene.analysis.cz.CzechAnalyzer 28 | import org.apache.lucene.analysis.da.DanishAnalyzer 29 | import org.apache.lucene.analysis.de.GermanAnalyzer 30 | import org.apache.lucene.analysis.el.GreekAnalyzer 31 | import org.apache.lucene.analysis.en.EnglishAnalyzer 32 | import org.apache.lucene.analysis.es.SpanishAnalyzer 33 | import org.apache.lucene.analysis.eu.BasqueAnalyzer 34 | import org.apache.lucene.analysis.fa.PersianAnalyzer 35 | import org.apache.lucene.analysis.fi.FinnishAnalyzer 36 | import org.apache.lucene.analysis.fr.FrenchAnalyzer 37 | import org.apache.lucene.analysis.ga.IrishAnalyzer 38 | import org.apache.lucene.analysis.gl.GalicianAnalyzer 39 | import org.apache.lucene.analysis.hi.HindiAnalyzer 40 | import org.apache.lucene.analysis.hu.HungarianAnalyzer 41 | import org.apache.lucene.analysis.id.IndonesianAnalyzer 42 | import org.apache.lucene.analysis.it.ItalianAnalyzer 43 | import org.apache.lucene.analysis.lt.LithuanianAnalyzer 44 | import org.apache.lucene.analysis.lv.LatvianAnalyzer 45 | import org.apache.lucene.analysis.nl.DutchAnalyzer 46 | import org.apache.lucene.analysis.no.NorwegianAnalyzer 47 | import org.apache.lucene.analysis.pt.PortugueseAnalyzer 48 | import org.apache.lucene.analysis.ru.RussianAnalyzer 49 | import org.apache.lucene.analysis.standard.StandardAnalyzer 50 | import org.apache.lucene.analysis.tr.TurkishAnalyzer 51 | import org.zouzias.spark.lucenerdd.config.Configurable 52 | import org.apache.spark.internal.Logging 53 | 54 | /** 55 | * Lucene Analyzer loader via configuration or via class name 56 | * 57 | * An analyzer can be loaded using a short language code, e.g., 58 | * en, el, de, etc., or using a class name present in the classpath, e.g., 59 | * 'org.apache.lucene.analysis.el.GreekAnalyzer' 60 | * 61 | * Custom analyzers can be loaded provided that they are present on the classpath at runtime.
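 *
 * A minimal sketch of the lookup below (from within a class mixing in this trait;
 * "de" maps to GermanAnalyzer in getAnalyzer's cases):
 * {{{
 *   val german: Analyzer = getAnalyzer(Some("de"))
 *   val custom: Analyzer = getAnalyzer(Some("org.apache.lucene.analysis.el.GreekAnalyzer"))
 * }}}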
62 | */ 63 | trait AnalyzerConfigurable extends Configurable 64 | with Logging { 65 | 66 | private val IndexAnalyzerConfigKey = "lucenerdd.index.analyzer.name" 67 | private val QueryAnalyzerConfigKey = "lucenerdd.query.analyzer.name" 68 | 69 | /** Get the configured analyzers or fallback to English */ 70 | protected def getOrElseEn(analyzerName: Option[String]): String = analyzerName.getOrElse("en") 71 | 72 | protected val IndexAnalyzerConfigName: Option[String] = 73 | if (Config.hasPath(IndexAnalyzerConfigKey)) { 74 | Some(Config.getString(IndexAnalyzerConfigKey))} else None 75 | 76 | protected val QueryAnalyzerConfigName: Option[String] = 77 | if (Config.hasPath(QueryAnalyzerConfigKey)) { 78 | Some(Config.getString(QueryAnalyzerConfigKey))} else None 79 | 80 | protected def getAnalyzer(analyzerName: Option[String]): Analyzer = { 81 | if (analyzerName.isDefined) { 82 | analyzerName.get match { 83 | case "whitespace" => new WhitespaceAnalyzer() 84 | case "ar" => new ArabicAnalyzer() 85 | case "bg" => new BulgarianAnalyzer() 86 | case "br" => new BrazilianAnalyzer() 87 | case "ca" => new CatalanAnalyzer() 88 | case "cjk" => new CJKAnalyzer() 89 | case "ckb" => new SoraniAnalyzer() 90 | case "cz" => new CzechAnalyzer() 91 | case "da" => new DanishAnalyzer() 92 | case "de" => new GermanAnalyzer() 93 | case "el" => new GreekAnalyzer() 94 | case "en" => new EnglishAnalyzer() 95 | case "es" => new SpanishAnalyzer() 96 | case "eu" => new BasqueAnalyzer() 97 | case "fa" => new PersianAnalyzer() 98 | case "fi" => new FinnishAnalyzer() 99 | case "fr" => new FrenchAnalyzer() 100 | case "ga" => new IrishAnalyzer() 101 | case "gl" => new GalicianAnalyzer() 102 | case "hi" => new HindiAnalyzer() 103 | case "hu" => new HungarianAnalyzer() 104 | case "id" => new IndonesianAnalyzer() 105 | case "it" => new ItalianAnalyzer() 106 | case "lt" => new LithuanianAnalyzer() 107 | case "lv" => new LatvianAnalyzer() 108 | case "nl" => new DutchAnalyzer() 109 | case "no" => new NorwegianAnalyzer() 110 | case "pt" => new PortugueseAnalyzer() 111 | case "ru" => new RussianAnalyzer() 112 | case "tr" => new TurkishAnalyzer() 113 | case otherwise: String => 114 | try { 115 | val clazz = loadConstructor[Analyzer](otherwise) 116 | clazz 117 | } 118 | catch { 119 | case e: ClassNotFoundException => 120 | logError(s"Class ${otherwise} was not found in classpath. Does the class exist?", e) 121 | null 122 | case e: ClassCastException => 123 | logError(s"Class ${otherwise} could not be " + 124 | s"cast to superclass org.apache.lucene.analysis.Analyzer.", e) 125 | null 126 | case e: Throwable => 127 | logError(s"Class ${otherwise} could not be used as Analyzer.", e) 128 | null 129 | } 130 | } 131 | } 132 | else { 133 | logInfo("Analyzer name is not defined. 
Default analyzer is StandardAnalyzer().") 134 | new StandardAnalyzer() 135 | } 136 | } 137 | 138 | /** 139 | * Load a Lucene [[Analyzer]] using class name 140 | * 141 | * @param className The class name of the analyzer to load 142 | * @tparam T 143 | * @return Returns a Lucene Analyzer 144 | */ 145 | private def loadConstructor[T <: Analyzer](className: String): T = { 146 | val loader = getClass.getClassLoader 147 | logInfo(s"Loading class ${className} using loader ${loader}") 148 | val loadedClass: Class[T] = loader.loadClass(className).asInstanceOf[Class[T]] 149 | val constructor = loadedClass.getConstructor() 150 | constructor.newInstance() 151 | } 152 | 153 | } 154 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/config/Configurable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.zouzias.spark.lucenerdd.config 18 | 19 | import com.typesafe.config.ConfigFactory 20 | 21 | /** 22 | * Load typesafe configuration 23 | */ 24 | trait Configurable extends Serializable { 25 | lazy val Config = ConfigFactory.load() 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/config/LuceneRDDConfigurable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.zouzias.spark.lucenerdd.config 18 | 19 | import org.apache.lucene.index.IndexOptions 20 | import scala.collection.JavaConverters._ 21 | 22 | /** 23 | * Configuration for [[org.zouzias.spark.lucenerdd.LuceneRDD]] 24 | */ 25 | trait LuceneRDDConfigurable extends Configurable { 26 | 27 | protected val MaxDefaultTopKValue: Int = { 28 | if (Config.hasPath("lucenerdd.query.topk.maxvalue")) { 29 | Config.getInt("lucenerdd.query.topk.maxvalue") 30 | } 31 | else 1000 32 | } 33 | 34 | /** Default value for topK queries */ 35 | protected val DefaultTopK: Int = { 36 | if (Config.hasPath("lucenerdd.query.topk.default")) { 37 | Config.getInt("lucenerdd.query.topk.default") 38 | } 39 | else 10 40 | } 41 | 42 | protected val DefaultFacetNum: Int = { 43 | if (Config.hasPath("lucenerdd.query.facets.number.default")) { 44 | Config.getInt("lucenerdd.query.facets.number.default") 45 | } 46 | else 10 47 | } 48 | 49 | protected val StringFieldsDefaultAnalyzed: Boolean = { 50 | if (Config.hasPath("lucenerdd.index.stringfields.analyzed")) { 51 | Config.getBoolean("lucenerdd.index.stringfields.analyzed") 52 | } 53 | else { 54 | true 55 | } 56 | } 57 | 58 | /** 59 | * List of string fields *not* to be analyzed 60 | */ 61 | protected val StringFieldsListToBeNotAnalyzed: List[String] = { 62 | if (Config.hasPath("lucenerdd.index.stringfields.not_analyzed_list")) { 63 | Config.getStringList("lucenerdd.index.stringfields.not_analyzed_list") 64 | .asScala.toList 65 | } 66 | else { 67 | List.empty[String] 68 | } 69 | } 70 | 71 | protected val StringFieldsStoreTermVector: Boolean = { 72 | if (Config.hasPath("lucenerdd.index.stringfields.terms.vectors")) { 73 | Config.getBoolean("lucenerdd.index.stringfields.terms.vectors") 74 | } 75 | else true 76 | } 77 | 78 | protected val StringFieldsStoreTermPositions: Boolean = { 79 | if (Config.hasPath("lucenerdd.index.stringfields.terms.positions")) { 80 | Config.getBoolean("lucenerdd.index.stringfields.terms.positions") 81 | } 82 | else true 83 | } 84 | 85 | protected val StringFieldsOmitNorms: Boolean = { 86 | if (Config.hasPath("lucenerdd.index.stringfields.terms.omitnorms")) { 87 | Config.getBoolean("lucenerdd.index.stringfields.terms.omitnorms") 88 | } 89 | else false 90 | } 91 | 92 | protected val StringFieldsIndexOptions: IndexOptions = { 93 | if (Config.hasPath("lucenerdd.index.stringfields.options")) { 94 | val indexOptions = Config.getString("lucenerdd.index.stringfields.options") 95 | 96 | indexOptions.toLowerCase match { 97 | case "docs" => IndexOptions.DOCS 98 | case "docs_and_freqs" => IndexOptions.DOCS_AND_FREQS 99 | case "docs_and_freqs_and_positions" => IndexOptions.DOCS_AND_FREQS_AND_POSITIONS 100 | case "docs_and_freqs_and_positions_and_offsets" => 101 | IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS 102 | case _ => IndexOptions.NONE 103 | } 104 | } 105 | else IndexOptions.DOCS_AND_FREQS_AND_POSITIONS // Default 106 | } 107 | 108 | protected val getLinkerMethod: String = { 109 | if (Config.hasPath("lucenerdd.linker.method")) { 110 | Config.getString("lucenerdd.linker.method") 111 | } 112 | else "collectbroadcast" // collectbroadcast by default 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/config/LuceneRDDParams.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements.
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.zouzias.spark.lucenerdd.config 18 | 19 | import org.zouzias.spark.lucenerdd.analyzers.AnalyzerConfigurable 20 | import org.zouzias.spark.lucenerdd.query.SimilarityConfigurable 21 | 22 | /** Lucene analysis parameters during indexing and querying. 23 | * 24 | * @param indexAnalyzer Index analyzer name. Lucene [[Analyzer]] used during indexing 25 | * @param queryAnalyzer Query analyzer name. Lucene [[Analyzer]] used during querying 26 | * @param similarity Lucene scoring similarity, i.e., BM25 or TF-IDF 27 | * @param indexAnalyzerPerField Lucene Analyzer per field (indexing time), default empty 28 | * @param queryAnalyzerPerField Lucene Analyzer per field (query time), default empty 29 | */ 30 | case class LuceneRDDParams(indexAnalyzer: String, 31 | queryAnalyzer: String, 32 | similarity: String, 33 | indexAnalyzerPerField: Map[String, String], 34 | queryAnalyzerPerField: Map[String, String]) extends Serializable 35 | 36 | 37 | object LuceneRDDParams extends AnalyzerConfigurable with SimilarityConfigurable { 38 | def apply(): LuceneRDDParams = { 39 | new LuceneRDDParams(getOrElseEn(IndexAnalyzerConfigName), 40 | getOrElseEn(QueryAnalyzerConfigName), 41 | getOrElseClassic(), 42 | Map.empty[String, String], 43 | Map.empty[String, String]) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/config/ShapeLuceneRDDConfigurable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.zouzias.spark.lucenerdd.config 18 | 19 | import org.locationtech.spatial4j.io.ShapeIO 20 | 21 | trait ShapeLuceneRDDConfigurable extends LuceneRDDConfigurable { 22 | 23 | protected val getPrefixTreeMaxLevel: Int = { 24 | if (Config.hasPath("lucenerdd.spatial.prefixtree.maxlevel")) { 25 | Config.getInt("lucenerdd.spatial.prefixtree.maxlevel") 26 | } 27 | else 11 28 | } 29 | 30 | protected val getPrefixTreeName: String = { 31 | if (Config.hasPath("lucenerdd.spatial.prefixtree.name")) { 32 | Config.getString("lucenerdd.spatial.prefixtree.name") 33 | } 34 | else "geohash" // Geohash tree by default 35 | } 36 | 37 | protected val getPrefixTreeMaxDistErr: Double = { 38 | if (Config.hasPath("lucenerdd.spatial.prefixtree.maxDistErr")) { 39 | Config.getDouble("lucenerdd.spatial.prefixtree.maxDistErr") 40 | } 41 | else 1D 42 | } 43 | 44 | protected val getLocationFieldName: String = { 45 | if (Config.hasPath("lucenerdd.spatial.location.field.name")) { 46 | Config.getString("lucenerdd.spatial.location.field.name") 47 | } 48 | else "__location__" 49 | } 50 | 51 | protected val getShapeFormat: String = { 52 | if (Config.hasPath("lucenerdd.spatial.shape.io.format")) { 53 | val format = Config.getString("lucenerdd.spatial.shape.io.format") 54 | val availableFormats = Array(ShapeIO.GeoJSON, ShapeIO.LEGACY, ShapeIO.POLY, ShapeIO.WKT) 55 | if (availableFormats.contains(format)) format else ShapeIO.WKT 56 | } 57 | else ShapeIO.WKT 58 | } 59 | 60 | protected val getShapeLinkerMethod: String = { 61 | if (Config.hasPath("lucenerdd.spatial.linker.method")) { 62 | Config.getString("lucenerdd.spatial.linker.method") 63 | } 64 | else "collectbroadcast" // collectbroadcast by default 65 | } 66 | } 67 | 68 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/facets/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.zouzias.spark.lucenerdd 18 | 19 | import org.apache.lucene.document._ 20 | import org.apache.lucene.facet.FacetField 21 | import org.apache.spark.sql.Row 22 | 23 | import scala.reflect.ClassTag 24 | 25 | /** 26 | * Contains implicit conversion to [[org.apache.lucene.document.Document]] 27 | * which prepares the index for faceted search as well.
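 *
 * A usage sketch of the implicit conversions defined below:
 * {{{
 *   import org.zouzias.spark.lucenerdd.facets._
 *   val textDoc: Document = "hello lucene"     // via stringToDocument
 *   val tupleDoc: Document = ("hello", 42L)    // via productTypeToDocument
 * }}}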
28 | */ 29 | package object facets { 30 | 31 | private val Stored = Field.Store.YES 32 | private val DefaultFieldName = "_1" 33 | 34 | /** 35 | * Adds an extra field to the index with suffix [[FacetedLuceneRDD.FacetTextFieldSuffix]]. 36 | * This field is used in faceted queries 37 | * 38 | * @param doc Input document 39 | * @param fieldName Field name 40 | * @param fieldValue Field value to be indexed 41 | */ 42 | private def addTextFacetField(doc: Document, fieldName: String, fieldValue: String): Unit = { 43 | if (fieldValue.nonEmpty) { // Issues with empty strings on facets 44 | doc.add(new FacetField(s"${fieldName}${FacetedLuceneRDD.FacetTextFieldSuffix}", 45 | fieldValue)) 46 | } 47 | } 48 | 49 | implicit def intToDocument(v: Int): Document = { 50 | val doc = new Document 51 | doc.add(new IntPoint(DefaultFieldName, v)) 52 | addTextFacetField(doc, DefaultFieldName, v.toString) 53 | doc 54 | } 55 | 56 | implicit def longToDocument(v: Long): Document = { 57 | val doc = new Document 58 | doc.add(new LongPoint(DefaultFieldName, v)) 59 | addTextFacetField(doc, DefaultFieldName, v.toString) 60 | doc 61 | } 62 | 63 | implicit def doubleToDocument(v: Double): Document = { 64 | val doc = new Document 65 | doc.add(new DoublePoint(DefaultFieldName, v)) 66 | addTextFacetField(doc, DefaultFieldName, v.toString) 67 | doc 68 | } 69 | 70 | implicit def floatToDocument(v: Float): Document = { 71 | val doc = new Document 72 | doc.add(new FloatPoint(DefaultFieldName, v)) 73 | addTextFacetField(doc, DefaultFieldName, v.toString) 74 | doc 75 | } 76 | 77 | implicit def stringToDocument(s: String): Document = { 78 | val doc = new Document 79 | doc.add(new TextField(DefaultFieldName, s, Stored)) 80 | addTextFacetField(doc, DefaultFieldName, s) 81 | doc 82 | } 83 | 84 | private def tupleTypeToDocument[T: ClassTag](doc: Document, index: Int, s: T): Document = { 85 | typeToDocument(doc, s"_${index}", s) 86 | } 87 | 88 | def typeToDocument[T: ClassTag](doc: Document, fName: String, s: T): Document = { 89 | s match { 90 | case x: String => 91 | doc.add(new TextField(fName, x, Stored)) 92 | addTextFacetField(doc, fName, x) 93 | case x: Long => 94 | doc.add(new LongPoint(fName, x)) 95 | doc.add(new StoredField(fName, x)) 96 | doc.add(new NumericDocValuesField(s"${fName}${FacetedLuceneRDD.FacetNumericFieldSuffix}", 97 | x)) 98 | case x: Int => 99 | doc.add(new IntPoint(fName, x)) 100 | doc.add(new StoredField(fName, x)) 101 | doc.add(new NumericDocValuesField(s"${fName}${FacetedLuceneRDD.FacetNumericFieldSuffix}", 102 | x.toLong)) 103 | case x: Float => 104 | doc.add(new FloatPoint(fName, x)) 105 | doc.add(new StoredField(fName, x)) 106 | doc.add(new FloatDocValuesField(s"${fName}${FacetedLuceneRDD.FacetNumericFieldSuffix}", 107 | x)) 108 | case x: Double => 109 | doc.add(new DoublePoint(fName, x)) 110 | doc.add(new StoredField(fName, x)) 111 | doc.add(new DoubleDocValuesField(s"${fName}${FacetedLuceneRDD.FacetNumericFieldSuffix}", 112 | x)) 113 | } 114 | doc 115 | } 116 | 117 | implicit def iterablePrimitiveToDocument[T: ClassTag](iter: Iterable[T]): Document = { 118 | val doc = new Document 119 | iter.foreach(item => tupleTypeToDocument(doc, 1, item)) 120 | doc 121 | } 122 | 123 | implicit def mapToDocument[T: ClassTag](map: Map[String, T]): Document = { 124 | val doc = new Document 125 | map.foreach{ case (key, value) => 126 | typeToDocument(doc, key, value) 127 | } 128 | doc 129 | } 130 | 131 | /** 132 | * Implicit conversion for all product types, such as case classes and Tuples 133 | * @param s A product instance, e.g., a tuple or a case class 134 | * @tparam T Product subtype 135
| * @return 136 | */ 137 | implicit def productTypeToDocument[T <: Product : ClassTag](s: T): Document = { 138 | val doc = new Document 139 | 140 | val fieldNames = s.getClass.getDeclaredFields.map(_.getName).toIterator 141 | val fieldValues = s.productIterator 142 | fieldValues.zip(fieldNames).foreach{ case (elem, fieldName) => 143 | typeToDocument(doc, fieldName, elem) 144 | } 145 | 146 | doc 147 | } 148 | 149 | /** 150 | * Implicit conversion for Spark Row: used for DataFrame 151 | * @param row 152 | * @return 153 | */ 154 | implicit def sparkRowToDocument(row: Row): Document = { 155 | val doc = new Document 156 | 157 | val fieldNames = row.schema.fieldNames 158 | fieldNames.foreach{ case fieldName => 159 | val index = row.fieldIndex(fieldName) 160 | typeToDocument(doc, fieldName, row.get(index)) 161 | } 162 | 163 | doc 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/matrices/TermDocMatrix.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.zouzias.spark.lucenerdd.matrices 18 | 19 | import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry} 20 | import org.apache.spark.rdd.RDD 21 | import org.zouzias.spark.lucenerdd.models.TermVectorEntry 22 | 23 | /** 24 | * Term Document Matrix of a Lucene field 25 | * 26 | * Each term (row of matrix) is uniquely assigned an index 27 | */ 28 | class TermDocMatrix(triplets: RDD[TermVectorEntry]) extends Serializable { 29 | 30 | private lazy val docIdsPerShardMap = computeUniqueDocId() 31 | 32 | private lazy val indexedTerms = triplets.map(_.term).distinct().zipWithIndex().map(_.swap) 33 | private lazy val indexToTerm: Map[Long, String] = indexedTerms.collect().toMap 34 | private lazy val termToIndex: Map[String, Long] = indexToTerm.map(_.swap) 35 | 36 | private lazy val value_ = toMatrix() 37 | 38 | private lazy val nnz_ = value_.entries.count() 39 | 40 | /** 41 | * Returns a map from the matrix row indices to terms 42 | * 43 | * Using this map, you can associate the rows of the matrix with terms 44 | * @return 45 | */ 46 | def rowIndexToTerm(): Map[Long, String] = indexToTerm 47 | 48 | /** 49 | * Returns a map from (documentId, partitionId) to the matrix column indices 50 | * 51 | * Using this map, you can associate the columns of the matrix to the documents 52 | * @return 53 | */ 54 | def computeUniqueDocId(): Map[(String, Int), Long] = { 55 | triplets.map(_.docIdPerShard).distinct().zipWithIndex() 56 | .collect().toMap 57 | } 58 | 59 | private def toMatrix(): CoordinateMatrix = { 60 | 61 | // Broadcast termToIndex Map 62 | val termToIndexB = triplets.sparkContext.broadcast(termToIndex) 63 | val docIdsPerShardMapB = triplets.sparkContext.broadcast(docIdsPerShardMap) 64 | 65 | val entries = triplets.map { case t => 66 | val i = termToIndexB.value(t.term) 67 | val j = docIdsPerShardMapB.value(t.docIdPerShard) 68 | MatrixEntry(i, j, t.count) 69 | } 70 | 71 | new CoordinateMatrix(entries) 72 | } 73 | 74 | /** 75 | * Returns the number of non-zero entries 76 | * @return 77 | */ 78 | def nnz(): Long = { 79 | nnz_ 80 | } 81 | 82 | /** 83 | * Number of rows (terms) 84 | * @return 85 | */ 86 | def numRows(): Long = value_.numRows() 87 | 88 | /** 89 | * Number of columns (documents) 90 | * @return 91 | */ 92 | def numCols(): Long = value_.numCols() 93 | 94 | def value(): CoordinateMatrix = value_ 95 | 96 | } 97 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/models/SparkFacetResult.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.zouzias.spark.lucenerdd.models 18 | 19 | import org.apache.lucene.facet.FacetResult 20 | 21 | case class SparkFacetResult(facetName: String, facets: Map[String, Long]) { 22 | 23 | /** 24 | * Return facet counts sorted descending 25 | * @return Sequence of (facet value, facet counts) 26 | */ 27 | def sortedFacets(): Seq[(String, Long)] = { 28 | facets.toSeq.sortBy[Long](x => -x._2) 29 | } 30 | } 31 | 32 | 33 | object SparkFacetResult extends Serializable { 34 | 35 | /** 36 | * Convert [[org.apache.lucene.facet.FacetResult]] 37 | * to [[org.zouzias.spark.lucenerdd.models.SparkFacetResult]] 38 | * 39 | * @param facetName name of facet 40 | * @param facetResult input facet results 41 | * @return 42 | */ 43 | def apply(facetName: String, facetResult: FacetResult): SparkFacetResult = { 44 | val facetResultOpt = Option(facetResult) 45 | facetResultOpt match { 46 | case Some(fctResult) => 47 | val map = fctResult.labelValues 48 | .map(labelValue => (labelValue.label, labelValue.value.longValue())) 49 | .toMap[String, Long] 50 | SparkFacetResult(facetName, map) 51 | case _ => SparkFacetResult(facetName, Map.empty[String, Long]) 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/models/SparkScoreDoc.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.zouzias.spark.lucenerdd.models 18 | 19 | import org.apache.lucene.document.Document 20 | import org.apache.lucene.index.IndexableField 21 | import org.apache.lucene.search.{IndexSearcher, ScoreDoc} 22 | import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructField, StructType} 23 | import org.apache.spark.sql.Row 24 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 25 | import org.zouzias.spark.lucenerdd.models.SparkScoreDoc.inferNumericType 26 | import org.zouzias.spark.lucenerdd.models.SparkScoreDoc.{DocIdField, ScoreField, ShardField} 27 | 28 | import scala.collection.JavaConverters._ 29 | 30 | sealed trait FieldType extends Serializable 31 | object TextType extends FieldType 32 | object IntType extends FieldType 33 | object DoubleType extends FieldType 34 | object LongType extends FieldType 35 | object FloatType extends FieldType 36 | 37 | 38 | /** 39 | * A Lucene [[Document]] extended with score, docId and shard index 40 | * 41 | * @param score Score of document 42 | * @param docId Document id 43 | * @param shardIndex Shard index 44 | * @param doc Serialized Lucene document 45 | */ 46 | case class SparkScoreDoc(score: Float, docId: Int, shardIndex: Int, doc: Document) { 47 | 48 | /** 49 | * Convert to [[Row]] 50 | * 51 | * @return 52 | */ 53 | def toRow(): Row = { 54 | 55 | // Convert to Spark SQL DataFrame types 56 | val typeToValues = scala.collection.mutable.Map.empty[StructField, List[Any]] 57 | 58 | this.doc.getFields 59 | .asScala 60 | .filter(isOnlyStoredField) 61 | .foreach { field => 62 | val fieldName = field.name() 63 | 64 | val tp = if (field.numericValue() != null) { 65 | inferNumericType(field.numericValue) 66 | } 67 | else if (field.numericValue() == null && field.stringValue() != null) { 68 | TextType 69 | } else TextType 70 | 71 | val item = tp match { 72 | case TextType => (StructField(fieldName, StringType), field.stringValue()) 73 | case IntType => (StructField(fieldName, IntegerType), field.numericValue().intValue()) 74 | case LongType => (StructField(fieldName, 75 | org.apache.spark.sql.types.LongType), field.numericValue().longValue()) 76 | case DoubleType => (StructField(fieldName, 77 | org.apache.spark.sql.types.DoubleType), field.numericValue().doubleValue()) 78 | case FloatType => (StructField(fieldName, 79 | org.apache.spark.sql.types.FloatType), field.numericValue().floatValue()) 80 | case _ => (StructField(fieldName, StringType), field.stringValue()) 81 | } 82 | 83 | // Append or set value 84 | val oldValue: List[Any] = typeToValues.getOrElse(item._1, List.empty) 85 | typeToValues.+=((item._1, oldValue.::(item._2))) 86 | } 87 | 88 | val arrayedTypesToValues = typeToValues.map{ case (tp, values) => 89 | 90 | // If more than one value, wrap SQL type within ArrayType 91 | if (values.length == 1) { 92 | (tp, values.head) 93 | } 94 | else { 95 | (StructField(tp.name, ArrayType.apply(tp.dataType)), values) 96 | } 97 | } 98 | 99 | // Additional fields of [[SparkScoreDoc]] with known types including 100 | // - document id 101 | // - document search score 102 | // - document shard index 103 | val extraSchemaWithValue = Seq((StructField(DocIdField, IntegerType), this.docId), 104 | (StructField(ScoreField, org.apache.spark.sql.types.FloatType), this.score), 105 | (StructField(ShardField, IntegerType), this.shardIndex)) 106 | 107 | val allTogether = arrayedTypesToValues ++ extraSchemaWithValue 108 | 109 | new GenericRowWithSchema(allTogether.values.toArray, StructType(allTogether.keys.toSeq)) 110 | } 111 |
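  // A sketch of toRow() in use (indexSearcher/scoreDoc are hypothetical Lucene handles):
  // stored fields become typed columns, and __docid__, __score__ and __shardIndex__
  // are always appended.
  //   val row = SparkScoreDoc(indexSearcher, scoreDoc).toRow()
  //   val score = row.getAs[Float](SparkScoreDoc.ScoreField)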
112 | /** 113 | * Return fields that are stored only 114 | * @param field A field of a Lucene Document 115 | * @return 116 | */ 117 | private def isOnlyStoredField(field: IndexableField): Boolean = { 118 | field.fieldType().stored() 119 | } 120 | 121 | override def toString: String = { 122 | val builder = new StringBuilder 123 | builder.append(s"[score: $score/") 124 | builder.append(s"docId: $docId/") 125 | builder.append(s"doc: $doc]") 126 | builder.result() 127 | } 128 | } 129 | 130 | object SparkScoreDoc extends Serializable { 131 | 132 | val DocIdField = "__docid__" 133 | val ScoreField = "__score__" 134 | val ShardField = "__shardIndex__" 135 | 136 | def apply(indexSearcher: IndexSearcher, scoreDoc: ScoreDoc): SparkScoreDoc = { 137 | SparkScoreDoc(scoreDoc.score, scoreDoc.doc, scoreDoc.shardIndex, 138 | indexSearcher.doc(scoreDoc.doc)) 139 | } 140 | 141 | def apply(indexSearcher: IndexSearcher, scoreDoc: ScoreDoc, score: Float): SparkScoreDoc = { 142 | SparkScoreDoc(score, scoreDoc.doc, scoreDoc.shardIndex, indexSearcher.doc(scoreDoc.doc)) 143 | } 144 | 145 | /** 146 | * Ordering by score (descending) 147 | */ 148 | def descending: Ordering[Row] = new Ordering[Row] { 149 | override def compare(x: Row, y: Row): Int = { 150 | val xScore = x.getFloat(x.fieldIndex(ScoreField)) 151 | val yScore = y.getFloat(y.fieldIndex(ScoreField)) 152 | if (xScore > yScore) { 153 | -1 154 | } else if (xScore == yScore) 0 else 1 155 | } 156 | } 157 | 158 | /** 159 | * Ordering by score (ascending) 160 | */ 161 | def ascending: Ordering[Row] = new Ordering[Row] { 162 | override def compare(x: Row, y: Row): Int = { 163 | val xScore = x.getFloat(x.fieldIndex(ScoreField)) 164 | val yScore = y.getFloat(y.fieldIndex(ScoreField)) 165 | 166 | if (xScore < yScore) -1 else if (xScore == yScore) 0 else 1 167 | } 168 | } 169 | 170 | /** 171 | * Infers the subclass of [[Number]] 172 | * @param num A value of type [[Number]] 173 | * @return The [[FieldType]] of the input Number value 174 | */ 175 | private def inferNumericType(num: Number): FieldType = { 176 | num match { 177 | case _: java.lang.Double => DoubleType 178 | case _: java.lang.Long => LongType 179 | case _: java.lang.Integer => IntType 180 | case _: java.lang.Float => FloatType 181 | case _ => TextType 182 | } 183 | } 184 | } 185 | 186 | 187 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/models/TermVectorEntry.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | package org.zouzias.spark.lucenerdd.models 18 | 19 | /** 20 | * A term vector entry (document id per shard, term as string, count) 21 | * 22 | * @param docIdPerShard Tuple2 containing (document id, partition id) 23 | * @param term Term text value 24 | * @param count Number of occurrences of the term in the document 25 | */ 26 | case class TermVectorEntry(docIdPerShard: (String, Int), term: String, count: Long) 27 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/models/indexstats/FieldStatistics.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.zouzias.spark.lucenerdd.models.indexstats 18 | 19 | import org.apache.lucene.index.IndexReader 20 | 21 | /** 22 | * Statistics for a Lucene index field 23 | */ 24 | case class FieldStatistics(fieldName: String, docCount: Int, sumDocFreq: Long, 25 | totalTermFreq: Long) { 26 | override def toString(): String = { 27 | val buf = new StringBuilder() 28 | buf.append(s"fieldName: ${fieldName} / ") 29 | buf.append(s"docCount: ${docCount} / ") 30 | buf.append(s"sumDocFreq: ${sumDocFreq} / ") 31 | buf.append(s"totalTermFreq: ${totalTermFreq}\n") 32 | buf.result() 33 | } 34 | } 35 | 36 | object FieldStatistics { 37 | def apply(indexReader: IndexReader, fieldName: String): FieldStatistics = { 38 | val docCount = indexReader.getDocCount(fieldName) 39 | val sumDocFreq = indexReader.getSumDocFreq(fieldName) 40 | val totalTermFreq = indexReader.getSumTotalTermFreq(fieldName) 41 | 42 | FieldStatistics(fieldName, docCount, sumDocFreq, totalTermFreq) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/models/indexstats/IndexStatistics.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.zouzias.spark.lucenerdd.models.indexstats 18 | 19 | /** 20 | * Statistics for Lucene index 21 | */ 22 | case class IndexStatistics(partitionId: Int, 23 | numDocs: Int, 24 | maxDocId: Int, 25 | numDeletedDocs: Int, 26 | numFields: Int, 27 | fieldsStatistics: Array[FieldStatistics]) { 28 | 29 | override def toString(): String = { 30 | val buf = new StringBuilder() 31 | buf.append(s"partitionId: ${partitionId}\n") 32 | buf.append(s"numDocs: ${numDocs}\n") 33 | buf.append(s"numDeletedDocs: ${numDeletedDocs}\n") 34 | buf.append(s"numFields: ${numFields}\n") 35 | fieldsStatistics.foreach(buf.append(_)) 36 | buf.result() 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/partition/AbstractLuceneRDDPartition.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.zouzias.spark.lucenerdd.partition 18 | 19 | import org.apache.lucene.search.{BooleanClause, Query} 20 | import org.zouzias.spark.lucenerdd.models.indexstats.IndexStatistics 21 | import org.zouzias.spark.lucenerdd.models.{SparkFacetResult, TermVectorEntry} 22 | import org.zouzias.spark.lucenerdd.response.LuceneRDDResponsePartition 23 | 24 | import scala.reflect.ClassTag 25 | 26 | /** 27 | * LuceneRDD partition. 28 | * 29 | * @tparam T the type associated with each entry in the set. 
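 *
 * A sketch of the per-partition query surface (methods declared below):
 * {{{
 *   partition.termQuery("name", "bern", topK = 10)
 *   partition.phraseQuery("text", "hello world", topK = 5)
 * }}}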
30 | */ 31 | private[lucenerdd] abstract class AbstractLuceneRDDPartition[T] extends Serializable 32 | with AutoCloseable { 33 | 34 | protected implicit def kTag: ClassTag[T] 35 | 36 | def size: Long 37 | 38 | def iterator: Iterator[T] 39 | 40 | def isDefined(key: T): Boolean 41 | 42 | def fields(): Set[String] 43 | 44 | /** 45 | * Multi term query 46 | * 47 | * @param docMap Map of field names to terms 48 | * @param topK Number of documents to return 49 | * @return 50 | */ 51 | def multiTermQuery(docMap: Map[String, String], 52 | topK: Int, 53 | boolClause: BooleanClause.Occur = BooleanClause.Occur.MUST) 54 | : LuceneRDDResponsePartition 55 | 56 | 57 | /** 58 | * Generic Lucene Query using QueryParser 59 | * @param searchString Lucene query string, i.e., textField:hello* 60 | * @param topK Number of documents to return 61 | * @return 62 | */ 63 | def query(searchString: String, topK: Int): LuceneRDDResponsePartition 64 | 65 | 66 | /** 67 | * Lucene search using Lucene [[Query]] 68 | * @param query Lucene query, i.e., [[org.apache.lucene.search.BooleanQuery]] or 69 | * [[org.apache.lucene.search.PhraseQuery]] 70 | * @param topK Number of documents to return 71 | * @return 72 | */ 73 | def query(query: Query, topK: Int): LuceneRDDResponsePartition 74 | 75 | /** 76 | * Multiple generic Lucene Queries using QueryParser 77 | * @param searchString Lucene query string 78 | * @param topK Number of results to return 79 | * @return 80 | */ 81 | def queries(searchString: Iterable[String], topK: Int) 82 | : Iterable[(String, LuceneRDDResponsePartition)] 83 | 84 | /** 85 | * Generic Lucene faceted Query using QueryParser 86 | * @param searchString Lucene query string, i.e., textField:hello* 87 | * @param topK Number of facets to return 88 | * @return 89 | */ 90 | def facetQuery(searchString: String, facetField: String, topK: Int) 91 | : SparkFacetResult 92 | 93 | /** 94 | * Term Query 95 | * @param fieldName Name of field 96 | * @param query Query text 97 | * @param topK Number of documents to return 98 | * @return 99 | */ 100 | def termQuery(fieldName: String, query: String, topK: Int): LuceneRDDResponsePartition 101 | 102 | /** 103 | * Prefix Query 104 | * @param fieldName Name of field 105 | * @param query Prefix query 106 | * @param topK Number of documents to return 107 | * @return 108 | */ 109 | def prefixQuery(fieldName: String, query: String, topK: Int): LuceneRDDResponsePartition 110 | 111 | /** 112 | * Fuzzy Query 113 | * @param fieldName Name of field 114 | * @param query Query text 115 | * @param maxEdits Fuzziness, edit distance 116 | * @param topK Number of documents to return 117 | * @return 118 | */ 119 | def fuzzyQuery(fieldName: String, query: String, 120 | maxEdits: Int, topK: Int): LuceneRDDResponsePartition 121 | 122 | /** 123 | * PhraseQuery 124 | * @param fieldName Name of field 125 | * @param query Phrase query, i.e., "hello world" 126 | * @param topK Number of documents to return 127 | * @return 128 | */ 129 | def phraseQuery(fieldName: String, query: String, topK: Int): LuceneRDDResponsePartition 130 | 131 | 132 | /** 133 | * Lucene's More Like This (MLT) functionality 134 | * @param fieldName Field name 135 | * @param query Query text 136 | * @param minTermFreq Minimum term frequency 137 | * @param minDocFreq Minimum document frequency 138 | * @param topK Number of returned documents 139 | * @return 140 | */ 141 | def moreLikeThis(fieldName: String, query: String, 142 | minTermFreq: Int, minDocFreq: Int, topK: Int) 143 | : LuceneRDDResponsePartition 144 | 145 | /** 146 | * 
Returns term vectors for a partition 147 | * 148 | * Since each Lucene index is created per partition, docIds are not unique. 149 | * The partitionIndex is used to compute a "global" document id from all documents 150 | * over all partitions 151 | * 152 | * @param fieldName Field on which to compute term vectors 153 | * @param idFieldName Field name which contains unique id 154 | * @return Array of term vector entries 155 | */ 156 | def termVectors(fieldName: String, idFieldName: Option[String]): Array[TermVectorEntry] 157 | 158 | /** 159 | * Returns statistics of this partition's index. 160 | * 161 | * @param fields Set of defined fields 162 | * @return 163 | */ 164 | def indexStats(fields: Set[String]): IndexStatistics 165 | 166 | /** 167 | * Restricts the entries to those satisfying a predicate 168 | * @param pred Predicate to filter on 169 | * @return 170 | */ 171 | def filter(pred: T => Boolean): AbstractLuceneRDDPartition[T] 172 | } 173 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/query/SimilarityConfigurable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.zouzias.spark.lucenerdd.query 18 | 19 | import org.apache.lucene.search.similarities.{BM25Similarity, ClassicSimilarity, Similarity} 20 | import org.zouzias.spark.lucenerdd.config.Configurable 21 | 22 | 23 | /** 24 | * Lucene Similarity loader via configuration 25 | */ 26 | trait SimilarityConfigurable extends Configurable { 27 | 28 | private val LuceneSimilarity = "lucenerdd.similarity.name" 29 | 30 | protected val LuceneSimilarityConfigValue: Option[String] = 31 | if (Config.hasPath(LuceneSimilarity)) { 32 | Some(Config.getString(LuceneSimilarity))} else None 33 | 34 | protected def getOrElseClassic(): String = LuceneSimilarityConfigValue.getOrElse("classic") 35 | 36 | protected def getSimilarity(similarityName: Option[String]): Similarity = { 37 | if (similarityName.isDefined) { 38 | similarityName.get match { 39 | case "bm25" => new BM25Similarity() 40 | case _ => new ClassicSimilarity() 41 | } 42 | } 43 | else { 44 | new ClassicSimilarity() 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/response/FieldType.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.zouzias.spark.lucenerdd.response 18 | 19 | sealed trait FieldType extends Serializable 20 | object TextType extends FieldType 21 | object IntType extends FieldType 22 | object DoubleType extends FieldType 23 | object LongType extends FieldType 24 | object FloatType extends FieldType 25 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/response/LuceneRDDResponse.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
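// The sealed FieldType hierarchy above permits exhaustive pattern matching.
// A sketch of mapping these response field types to Spark SQL types (the
// mapping itself is illustrative and not taken from the original sources):
import org.apache.spark.sql.{types => sql}

def toSparkSqlType(fieldType: FieldType): sql.DataType = fieldType match {
  case TextType => sql.StringType
  case IntType => sql.IntegerType
  case DoubleType => sql.DoubleType
  case LongType => sql.LongType
  case FloatType => sql.FloatType
}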
16 | */ 17 | package org.zouzias.spark.lucenerdd.response 18 | 19 | import com.twitter.algebird.TopKMonoid 20 | import org.apache.spark.annotation.DeveloperApi 21 | import org.apache.spark.{OneToOneDependency, Partition, TaskContext} 22 | import org.apache.spark.rdd.RDD 23 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 24 | import org.apache.spark.sql.types._ 25 | import org.apache.spark.storage.StorageLevel 26 | 27 | /** 28 | * LuceneRDD response 29 | */ 30 | private[lucenerdd] class LuceneRDDResponse 31 | (protected val partitionsRDD: RDD[LuceneRDDResponsePartition], 32 | protected val ordering: Ordering[Row]) 33 | extends RDD[Row](partitionsRDD.context, 34 | List(new OneToOneDependency(partitionsRDD))) { 35 | 36 | setName("LuceneRDDResponse") 37 | 38 | @DeveloperApi 39 | override def compute(split: Partition, context: TaskContext) 40 | : Iterator[Row] = { 41 | firstParent[LuceneRDDResponsePartition].iterator(split, context).next().iterator() 42 | } 43 | 44 | override protected def getPartitions: Array[Partition] = partitionsRDD.partitions 45 | 46 | override protected def getPreferredLocations(s: Partition): Seq[String] = 47 | partitionsRDD.preferredLocations(s) 48 | 49 | override def cache(): this.type = { 50 | this.persist(StorageLevel.MEMORY_ONLY) 51 | } 52 | 53 | override def persist(newLevel: StorageLevel): this.type = { 54 | partitionsRDD.persist(newLevel) 55 | super.persist(newLevel) 56 | this 57 | } 58 | 59 | override def unpersist(blocking: Boolean = true): this.type = { 60 | partitionsRDD.unpersist(blocking) 61 | super.unpersist(blocking) 62 | this 63 | } 64 | 65 | /** 66 | * Return the top-k results in terms of Lucene score 67 | * 68 | * It uses a [[TopKMonoid]] to compute the top-k 69 | * @param k Number of results to return 70 | * @return Array of results, of size at most k 71 | */ 72 | override def take(k: Int): Array[Row] = { 73 | val monoid = new TopKMonoid[Row](k)(ordering) 74 | partitionsRDD.map(monoid.build(_)) 75 | .reduce(monoid.plus).items.toArray 76 | } 77 | 78 | override def collect(): Array[Row] = { 79 | val sz = partitionsRDD.map(_.size).sum().toInt 80 | if (sz > 0) { 81 | val monoid = new TopKMonoid[Row](sz)(ordering) 82 | partitionsRDD.map(monoid.build(_)) 83 | .reduce(monoid.plus).items.toArray 84 | } else { 85 | Array.empty[Row] 86 | } 87 | } 88 | 89 | /** 90 | * Convert LuceneRDDResponse to Spark DataFrame 91 | * @param spark Spark Session 92 | * @return DataFrame 93 | */ 94 | def toDF()(implicit spark: SparkSession): DataFrame = { 95 | val schema = this.first().schema 96 | spark.createDataFrame(this, schema) 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/response/LuceneRDDResponsePartition.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License.
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.zouzias.spark.lucenerdd.response 18 | 19 | import org.apache.spark.sql.Row 20 | import org.zouzias.spark.lucenerdd.models.SparkScoreDoc 21 | 22 | case class LuceneRDDResponsePartition(results: Iterator[Row]) 23 | extends Iterable[Row] { 24 | override def iterator(): Iterator[Row] = results 25 | } 26 | 27 | object LuceneRDDResponsePartition { 28 | 29 | def apply(sparkScoreDocs: Iterable[SparkScoreDoc]): LuceneRDDResponsePartition = { 30 | apply(sparkScoreDocs.map(_.toRow()).toIterator) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/spatial/shape/ShapeLuceneRDDKryoRegistrator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
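// Usage sketch for LuceneRDDResponse.toDF() defined above. Here `response`
// stands for the result of some LuceneRDD query (an assumption for this
// example); an implicit SparkSession must be in scope:
import org.apache.spark.sql.{DataFrame, SparkSession}

implicit val spark: SparkSession =
  SparkSession.builder().master("local[*]").appName("response-to-df").getOrCreate()
// val df: DataFrame = response.toDF()
// df.printSchema()
// df.show(10)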
16 | */ 17 | package org.zouzias.spark.lucenerdd.spatial.shape 18 | 19 | import com.twitter.algebird.TopK 20 | import com.twitter.chill.Kryo 21 | import org.apache.spark.SparkConf 22 | import org.apache.spark.serializer.{KryoRegistrator, KryoSerializer} 23 | import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema 24 | import org.apache.spark.sql.types._ 25 | import org.zouzias.spark.lucenerdd.models.{SparkFacetResult, SparkScoreDoc} 26 | import org.zouzias.spark.lucenerdd.spatial.shape.partition.ShapeLuceneRDDPartition 27 | 28 | 29 | class ShapeLuceneRDDKryoRegistrator extends KryoRegistrator { 30 | def registerClasses(kryo: Kryo): Unit = { 31 | kryo.register(classOf[ShapeLuceneRDD[_, _]]) 32 | kryo.register(classOf[ShapeLuceneRDDPartition[_, _]]) 33 | kryo.register(classOf[Number]) 34 | kryo.register(classOf[java.lang.Double]) 35 | kryo.register(classOf[java.lang.Float]) 36 | kryo.register(classOf[java.lang.Integer]) 37 | kryo.register(classOf[java.lang.Long]) 38 | kryo.register(classOf[java.lang.Short]) 39 | kryo.register(classOf[StructType]) 40 | kryo.register(classOf[StructField]) 41 | kryo.register(classOf[IntegerType]) 42 | kryo.register(classOf[DoubleType]) 43 | kryo.register(classOf[FloatType]) 44 | kryo.register(classOf[StringType]) 45 | kryo.register(classOf[GenericRowWithSchema]) 46 | kryo.register(classOf[Metadata]) 47 | kryo.register(classOf[Object]) 48 | kryo.register(classOf[Array[Object]]) 49 | kryo.register(classOf[Array[Array[Byte]]]) 50 | kryo.register(classOf[scala.collection.mutable.WrappedArray.ofRef[_]]) 51 | kryo.register(classOf[scala.collection.mutable.WrappedArray.ofFloat]) 52 | kryo.register(classOf[scala.collection.mutable.WrappedArray.ofDouble]) 53 | kryo.register(classOf[scala.collection.mutable.WrappedArray.ofInt]) 54 | kryo.register(classOf[scala.collection.mutable.WrappedArray.ofLong]) 55 | kryo.register(classOf[Array[String]]) 56 | kryo.register(classOf[Array[Number]]) 57 | kryo.register(classOf[Array[Float]]) 58 | kryo.register(classOf[Array[Int]]) 59 | kryo.register(classOf[Array[Long]]) 60 | kryo.register(classOf[Array[Double]]) 61 | kryo.register(classOf[Array[Boolean]]) 62 | kryo.register(classOf[Array[SparkScoreDoc]]) 63 | kryo.register(classOf[Array[StructType]]) 64 | kryo.register(classOf[Array[StructField]]) 65 | kryo.register(classOf[Range]) 66 | kryo.register(classOf[scala.collection.immutable.Map[String, String]]) 67 | kryo.register(classOf[scala.collection.immutable.Map[String, Number]]) 68 | kryo.register(classOf[scala.collection.immutable.Map[_, _]]) 69 | kryo.register(classOf[Array[scala.collection.immutable.Map[_, _]]]) 70 | kryo.register(classOf[SparkFacetResult]) 71 | kryo.register(classOf[SparkScoreDoc]) 72 | kryo.register(classOf[TopK[_]]) 73 | 74 | () 75 | } 76 | } 77 | 78 | /** 79 | * Decorator for [[ShapeLuceneRDD]] Kryo serialization 80 | */ 81 | object ShapeLuceneRDDKryoRegistrator { 82 | def registerKryoClasses(conf: SparkConf): SparkConf = { 83 | conf.set("spark.serializer", classOf[KryoSerializer].getName) 84 | .set("spark.kryo.registrator", classOf[ShapeLuceneRDDKryoRegistrator].getName) 85 | .set("spark.kryo.registrationRequired", "false") 86 | /* Set the above to true s.t. 
all classes are registered with Kryo */ 87 | } 88 | } 89 | 90 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/spatial/shape/context/ContextLoader.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.zouzias.spark.lucenerdd.spatial.shape.context 18 | 19 | import java.io.{StringReader, StringWriter} 20 | 21 | import org.locationtech.spatial4j.context.jts.JtsSpatialContext 22 | import org.locationtech.spatial4j.io.{ShapeReader, ShapeWriter} 23 | import org.locationtech.spatial4j.shape.Shape 24 | import org.zouzias.spark.lucenerdd.config.ShapeLuceneRDDConfigurable 25 | 26 | trait ContextLoader extends ShapeLuceneRDDConfigurable{ 27 | 28 | protected val LocationDefaultField: String = getLocationFieldName 29 | 30 | protected lazy val shapeReader: ShapeReader = ctx.getFormats.getReader(getShapeFormat) 31 | 32 | protected lazy val shapeWriter: ShapeWriter = ctx.getFormats.getWriter(getShapeFormat) 33 | 34 | protected def shapeToString(shape: Shape): String = { 35 | val writer = new StringWriter() 36 | shapeWriter.write(writer, shape) 37 | writer.toString 38 | } 39 | 40 | protected def stringToShape(shapeAsString: String): Shape = { 41 | shapeReader.read(new StringReader(shapeAsString)) 42 | } 43 | 44 | /** 45 | * The Spatial4j {@link SpatialContext} is a sort of global-ish singleton 46 | * needed by Lucene spatial. It's a facade to the rest of Spatial4j, acting 47 | * as a factory for {@link Shape}s and provides access to reading and writing 48 | * them from Strings. 49 | * 50 | * Quoting from spatial4j (https://github.com/locationtech/spatial4j#getting-started) 51 | * 52 | * "To get a SpatialContext (or just "context" for short), you could use a global singleton 53 | * SpatialContext.GEO or JtsSpatialContext.GEO which both use geodesic surface-of-sphere 54 | * calculations (when available); the JTS one principally adds Polygon support." 55 | */ 56 | protected lazy val ctx: JtsSpatialContext = JtsSpatialContext.GEO // SpatialContext.GEO 57 | } 58 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/spatial/shape/grids/PrefixTreeLoader.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.zouzias.spark.lucenerdd.spatial.shape.grids 18 | 19 | import org.apache.lucene.spatial.prefix.tree.{SpatialPrefixTree, SpatialPrefixTreeFactory} 20 | import org.zouzias.spark.lucenerdd.config.ShapeLuceneRDDConfigurable 21 | import org.zouzias.spark.lucenerdd.spatial.shape.context.ContextLoader 22 | 23 | import scala.collection.JavaConverters._ 24 | 25 | trait PrefixTreeLoader extends ContextLoader 26 | with ShapeLuceneRDDConfigurable { 27 | 28 | // Maximum number of tree levels; e.g., 11 results in sub-meter precision for geohash 29 | protected val maxLevels: Int = getPrefixTreeMaxLevel 30 | 31 | // Expecting 'geohash' or 'quad' 32 | protected val prefixTreeName: String = getPrefixTreeName 33 | 34 | // Maximum distance error (in KM) 35 | protected val prefixTreeMaxDistErr: Double = getPrefixTreeMaxDistErr 36 | 37 | // Constructed via SpatialPrefixTreeFactory from the configured values 38 | protected val grid: SpatialPrefixTree = SpatialPrefixTreeFactory.makeSPT( 39 | Map("prefixTree" -> prefixTreeName, 40 | "maxLevels" -> maxLevels.toString, 41 | "maxDistErr" -> prefixTreeMaxDistErr.toString).asJava, 42 | ClassLoader.getSystemClassLoader, 43 | ctx) 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/spatial/shape/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
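// Standalone sketch of the factory call above, substituting the defaults
// found in reference.conf ("quad", maxlevel 9, maxDistErr 5.0); the names
// `geoCtx` and `quadGrid` are illustrative:
import org.apache.lucene.spatial.prefix.tree.{SpatialPrefixTree, SpatialPrefixTreeFactory}
import org.locationtech.spatial4j.context.jts.JtsSpatialContext
import scala.collection.JavaConverters._

val geoCtx = JtsSpatialContext.GEO
val quadGrid: SpatialPrefixTree = SpatialPrefixTreeFactory.makeSPT(
  Map("prefixTree" -> "quad",
    "maxLevels" -> "9",
    "maxDistErr" -> "5.0").asJava,
  ClassLoader.getSystemClassLoader,
  geoCtx)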
16 | */ 17 | package org.zouzias.spark.lucenerdd.spatial 18 | 19 | import java.io.StringReader 20 | 21 | import org.locationtech.jts.geom.{Coordinate, GeometryFactory} 22 | import org.locationtech.spatial4j.shape.Shape 23 | import org.zouzias.spark.lucenerdd.spatial.shape.context.ContextLoader 24 | 25 | 26 | package object shape extends ContextLoader { 27 | 28 | private val geometryFactory = new GeometryFactory() 29 | 30 | implicit def convertToPoint(point: (Double, Double)): Shape = { 31 | ctx.makePoint(point._1, point._2) 32 | } 33 | 34 | /** 35 | * ***Experimental*** 36 | * 37 | * Implicitly convert shape from its string representation 38 | * 39 | * @param shapeAsString String representation of the shape, e.g., WKT 40 | * @return Parsed shape; falls back to the point (0.0, 0.0) if parsing fails 41 | */ 42 | implicit def WKTToShape(shapeAsString: String): Shape = { 43 | try { 44 | shapeReader.read(new StringReader(shapeAsString)) 45 | } 46 | catch { 47 | case _: Exception => ctx.makePoint(0.0, 0.0) // fallback on parse failure 48 | } 49 | } 50 | 51 | implicit def rectangleToShape(rect: (Double, Double, Double, Double)): Shape = { 52 | val minX = rect._1 53 | val maxX = rect._2 54 | val minY = rect._3 55 | val maxY = rect._4 56 | ctx.makeRectangle(minX, maxX, minY, maxY) 57 | } 58 | 59 | implicit def circleToShape(circle: ((Double, Double), Double)): Shape = { 60 | val x = circle._1._1 61 | val y = circle._1._2 62 | val radius = circle._2 63 | ctx.makeCircle(x, y, radius) 64 | } 65 | 66 | implicit def listPolygonToShape(rect: List[(Double, Double)]): Shape = { 67 | val coordinates = rect.map(p => new Coordinate(p._1, p._2)).toArray 68 | val polygon = geometryFactory.createPolygon(coordinates) 69 | ctx.makeShape(polygon) 70 | } 71 | 72 | implicit def arrayPolygonToShape(rect: Array[(Double, Double)]): Shape = { 73 | val coordinates = rect.map(p => new Coordinate(p._1, p._2)) 74 | val polygon = geometryFactory.createPolygon(coordinates) 75 | ctx.makeShape(polygon) 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/spatial/shape/partition/AbstractShapeLuceneRDDPartition.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
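// Usage sketch for the implicit conversions above; coordinates are given as
// (x, y), i.e., (longitude, latitude), matching convertToPoint. The sample
// values are illustrative:
import org.locationtech.spatial4j.shape.Shape

val zurich: Shape = (8.5417, 47.3769) // via convertToPoint
val bbox: Shape = (8.0, 9.0, 47.0, 48.0) // (minX, maxX, minY, maxY) via rectangleToShape
val nearby: Shape = ((8.5417, 47.3769), 10.0) // (center, radius) via circleToShape
// Polygon rings must be closed (first point repeated last) for JTS:
val triangle: Shape = List((8.0, 47.0), (9.0, 47.0), (8.5, 48.0), (8.0, 47.0))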
16 | */ 17 | package org.zouzias.spark.lucenerdd.spatial.shape.partition 18 | 19 | import org.zouzias.spark.lucenerdd.response.LuceneRDDResponsePartition 20 | import org.zouzias.spark.lucenerdd.spatial.shape.ShapeLuceneRDD.PointType 21 | 22 | import scala.reflect.ClassTag 23 | 24 | private[shape] abstract class AbstractShapeLuceneRDDPartition[K, V] extends Serializable { 25 | 26 | protected implicit def kTag: ClassTag[K] 27 | protected implicit def vTag: ClassTag[V] 28 | 29 | def size: Long 30 | 31 | def iterator: Iterator[(K, V)] 32 | 33 | def isDefined(key: K): Boolean 34 | 35 | def close(): Unit 36 | 37 | /** 38 | * Nearest neighbour search 39 | * 40 | * @param point query point 41 | * @param k number of neighbors to return 42 | * @param searchString Lucene Query string 43 | * @return 44 | */ 45 | def knnSearch(point: PointType, k: Int, searchString: String): LuceneRDDResponsePartition 46 | 47 | /** 48 | * Search for points within a circle 49 | * 50 | * @param center center of circle 51 | * @param radius radius of circle in kilometers (KM) 52 | * @param k number of points to return 53 | * @return 54 | */ 55 | def circleSearch(center: PointType, radius: Double, k: Int, operationName: String) 56 | : LuceneRDDResponsePartition 57 | 58 | /** 59 | * Spatial search with arbitrary shape 60 | * 61 | * @param shapeAsString Shape object represented as String 62 | * @param k Number of results to return 63 | * @param operationName Operation name, i.e., intersect, within, etc 64 | * @return 65 | */ 66 | def spatialSearch(shapeAsString: String, k: Int, operationName: String) 67 | : LuceneRDDResponsePartition 68 | 69 | /** 70 | * Spatial search with point 71 | * 72 | * @param point Query point 73 | * @param k Number of results to return 74 | * @param operationName Operation name, i.e., intersect, within, etc 75 | * @return 76 | */ 77 | def spatialSearch(point: PointType, k: Int, operationName: String) 78 | : LuceneRDDResponsePartition 79 | 80 | /** 81 | * Bounding box search with point and radius 82 | * 83 | * @param center given as (x, y) 84 | * @param radius distance from center in kilometers (KM) 85 | * @param k Number of results to return 86 | * @param operationName Operation name, i.e., intersect, within, etc 87 | * @return 88 | */ 89 | def bboxSearch(center: PointType, radius: Double, k: Int, operationName: String) 90 | : LuceneRDDResponsePartition 91 | 92 | /** 93 | * Bounding box search with lower left and upper right corners 94 | * 95 | * @param lowerLeft Lower left point 96 | * @param upperRight Upper right point 97 | * @param k Number of results 98 | * @param operationName Operation name, i.e., intersect, within, etc 99 | * @return 100 | */ 101 | def bboxSearch(lowerLeft: PointType, upperRight: PointType, k: Int, operationName: String) 102 | : LuceneRDDResponsePartition 103 | 104 | /** 105 | * Restricts the entries to those satisfying a predicate 106 | * 107 | * @param pred Predicate to filter on 108 | * @return 109 | */ 110 | def filter(pred: (K, V) => Boolean): AbstractShapeLuceneRDDPartition[K, V] 111 | } 112 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/spatial/shape/strategies/SpatialStrategy.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.zouzias.spark.lucenerdd.spatial.shape.strategies 18 | 19 | import org.apache.lucene.spatial.prefix.{PrefixTreeStrategy, RecursivePrefixTreeStrategy} 20 | import org.zouzias.spark.lucenerdd.spatial.shape.grids.PrefixTreeLoader 21 | 22 | trait SpatialStrategy extends PrefixTreeLoader { 23 | 24 | /** 25 | * The Lucene spatial {@link SpatialStrategy} encapsulates an approach to 26 | * indexing and searching shapes, and providing distance values for them. 27 | * It's a simple API to unify different approaches. You might use more than 28 | * one strategy for a shape as each strategy has its strengths and weaknesses. 29 | *

30 | * Note that these are initialized with a field name. 31 | */ 32 | protected val strategy: PrefixTreeStrategy = new RecursivePrefixTreeStrategy(grid, 33 | LocationDefaultField) 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/store/IndexStorable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.zouzias.spark.lucenerdd.store 18 | 19 | import java.nio.file.{Files, Path} 20 | 21 | import org.apache.lucene.facet.FacetsConfig 22 | import org.apache.lucene.store._ 23 | import org.zouzias.spark.lucenerdd.config.Configurable 24 | import org.apache.spark.internal.Logging 25 | 26 | /** 27 | * Storage of a Lucene index Directory 28 | * 29 | * Currently, the following storage methods are supported: 30 | * 31 | * 1) "lucenerdd.index.store.mode=disk" : MMapStorage on temp disk 32 | * 2) Otherwise, memory storage using [[RAMDirectory]] 33 | */ 34 | trait IndexStorable extends Configurable 35 | with AutoCloseable 36 | with Logging { 37 | 38 | protected lazy val FacetsConfig = new FacetsConfig() 39 | 40 | private val IndexStoreKey = "lucenerdd.index.store.mode" 41 | 42 | private val tmpJavaDir = System.getProperty("java.io.tmpdir") 43 | 44 | private val indexDirName = 45 | s"indexDirectory.${System.currentTimeMillis()}.${Thread.currentThread().getId}" 46 | 47 | private val indexDir = Files.createTempDirectory(indexDirName) 48 | 49 | private val taxonomyDirName = 50 | s"taxonomyDirectory-${System.currentTimeMillis()}.${Thread.currentThread().getId}" 51 | 52 | private val taxonomyDir = Files.createTempDirectory(taxonomyDirName) 53 | 54 | protected val IndexDir = storageMode(indexDir) 55 | 56 | protected val TaxonomyDir = storageMode(taxonomyDir) 57 | 58 | /** 59 | * Select Lucene index storage implementation based on config 60 | * @param directoryPath Directory in disk to store index 61 | * @return 62 | */ 63 | protected def storageMode(directoryPath: Path): Directory = { 64 | if (Config.hasPath(IndexStoreKey)) { 65 | val storageMode = Config.getString(IndexStoreKey) 66 | 67 | storageMode match { 68 | // TODO: FIX: Currently there is a single lock instance for each directory. 
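// Configuration sketch for this branch point: in reference.conf /
// application.conf the key reads
//
//   lucenerdd.index.store.mode = "disk"
//
// "disk" (and, per the else branch below, an absent key) selects an
// MMapDirectory under java.io.tmpdir; any other value falls through to the
// in-memory RAMDirectory branch.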
69 | // TODO: Implement better lock handling here 70 | case "disk" => { 71 | logInfo(s"Config parameter ${IndexStoreKey} is set to 'disk'") 72 | logInfo("Lucene index will be stored on disk") 73 | logInfo(s"Index disk location ${tmpJavaDir}") 74 | // directoryPath.toFile.deleteOnExit() // Delete on exit 75 | new MMapDirectory(directoryPath, new SingleInstanceLockFactory) 76 | } 77 | case ow => 78 | logInfo(s"Config parameter ${IndexStoreKey} is set to ${ow}") 79 | logInfo("Lucene index will be stored in memory (default)") 80 | logInfo( 81 | """ 82 | Quoting from 83 | http://lucene.apache.org/core/7_5_0/core/org/apache/ 84 | lucene/store/RAMDirectory.html 85 | 86 | A memory-resident Directory implementation. Locking 87 | implementation is by default the SingleInstanceLockFactory. 88 | Warning: This class is not intended to work with huge indexes. 89 | Everything beyond several hundred megabytes will waste resources 90 | (GC cycles), because it uses an internal buffer size of 1024 bytes, 91 | producing millions of byte[1024] arrays. 92 | This class is optimized for small memory-resident indexes. 93 | It also has bad concurrency on multithreaded environments. 94 | 95 | It is recommended to materialize large indexes on disk and 96 | use MMapDirectory, which is a high-performance directory 97 | implementation working directly on the file system cache of 98 | the operating system, so copying data to Java heap 99 | space is not useful. 100 | """.stripMargin) 101 | new RAMDirectory() 102 | } 103 | } 104 | else { 105 | logInfo(s"Config parameter ${IndexStoreKey} is not set") 106 | logInfo("Lucene index will be stored on disk") 107 | new MMapDirectory(directoryPath, new SingleInstanceLockFactory) 108 | } 109 | } 110 | 111 | override def close(): Unit = { 112 | IndexDir.close() 113 | TaxonomyDir.close() 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/store/IndexWithTaxonomyWriter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | package org.zouzias.spark.lucenerdd.store 18 | 19 | import org.apache.lucene.analysis.Analyzer 20 | import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper 21 | import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter 22 | import org.apache.lucene.index.IndexWriterConfig.OpenMode 23 | import org.apache.lucene.index.{IndexWriter, IndexWriterConfig} 24 | import org.zouzias.spark.lucenerdd.analyzers.AnalyzerConfigurable 25 | 26 | /** 27 | * Index and Taxonomy Writer used for facet queries 28 | */ 29 | trait IndexWithTaxonomyWriter extends IndexStorable 30 | with AnalyzerConfigurable { 31 | 32 | protected def indexAnalyzer(): Analyzer 33 | 34 | protected def indexPerFieldAnalyzer(): PerFieldAnalyzerWrapper 35 | 36 | protected lazy val indexWriter = new IndexWriter(IndexDir, 37 | new IndexWriterConfig(indexPerFieldAnalyzer()) 38 | .setOpenMode(OpenMode.CREATE)) 39 | 40 | protected lazy val taxoWriter = new DirectoryTaxonomyWriter(TaxonomyDir) 41 | 42 | protected def closeAllWriters(): Unit = { 43 | indexWriter.commit() 44 | taxoWriter.commit() 45 | taxoWriter.close() 46 | indexWriter.close() 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/testing/FavoriteCaseClass.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.zouzias.spark.lucenerdd.testing 18 | 19 | case class FavoriteCaseClass(name: String, age: Int, myLong: Long, myFloat: Float, email: String) 20 | 21 | case class MultivalueFavoriteCaseClass(names: Array[String], 22 | age: Int, 23 | ages: List[Int], 24 | myLong: Long, 25 | myFloat: Float, 26 | email: String) 27 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/testing/Person.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
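// A minimal end-to-end sketch using the Person case class below, mirroring
// the usage in the test specs later in this dump; the SparkContext setup and
// sample data are illustrative:
import org.apache.spark.{SparkConf, SparkContext}
import org.zouzias.spark.lucenerdd._

val sc = new SparkContext(
  new SparkConf().setMaster("local[*]").setAppName("lucenerdd-example"))
val people = sc.parallelize(Seq(
  Person("Anna", 30, "anna@example.com"),
  Person("Bob", 25, "bob@example.com")))
val luceneRDD = LuceneRDD(people)
val hits = luceneRDD.termQuery("name", "anna", 10) // fieldName, query text, topK
println(hits.count())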
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.zouzias.spark.lucenerdd.testing 18 | 19 | case class Person(name: String, age: Int, email: String) 20 | 21 | -------------------------------------------------------------------------------- /src/main/scala/org/zouzias/spark/lucenerdd/versioning/Versionable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.zouzias.spark.lucenerdd.versioning 18 | 19 | /** 20 | * Reads version from sbt and makes version available to Spark 21 | */ 22 | trait Versionable { 23 | 24 | /** 25 | * Return project information, i.e., version number, build time etc 26 | * @return 27 | */ 28 | def version(): Map[String, Any] = { 29 | // BuildInfo is automatically generated using sbt plugin `sbt-buildinfo` 30 | org.zouzias.spark.lucenerdd.BuildInfo.toMap 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/test/resources/capitals.txt: -------------------------------------------------------------------------------- 1 | capital 2 | Sukhumi 3 | Kabul 4 | Episkopi Cantonment 5 | Tirana 6 | Algiers 7 | Pago Pago 8 | Andorra la Vella 9 | Luanda 10 | The Valley 11 | St. John's 12 | Buenos Aires 13 | Yerevan 14 | Oranjestad 15 | Georgetown 16 | Canberra 17 | Vienna 18 | Baku 19 | Nassau 20 | Manama 21 | Dhaka 22 | Bridgetown 23 | Minsk 24 | Brussels 25 | Belmopan 26 | Porto-Novo 27 | Hamilton 28 | Thimphu 29 | Sucre 30 | La Paz 31 | Sarajevo 32 | Gaborone 33 | Brasília 34 | Road Town 35 | Bandar Seri Begawan 36 | Sofia 37 | Ouagadougou 38 | Bujumbura 39 | Phnom Penh 40 | Yaoundé 41 | Ottawa 42 | Praia 43 | George Town 44 | Bangui 45 | N'Djamena 46 | Santiago 47 | Beijing 48 | Flying Fish Cove 49 | West Island 50 | Bogotá 51 | Moroni 52 | Avarua 53 | San José 54 | Zagreb 55 | Havana 56 | Willemstad 57 | Nicosia 58 | Prague 59 | Yamoussoukro 60 | Kinshasa 61 | Copenhagen 62 | Djibouti 63 | Roseau 64 | Santo Domingo 65 | Dili 66 | Hanga Roa 67 | Quito 68 | Cairo 69 | San Salvador 70 | Malabo 71 | Asmara 72 | Tallinn 73 | Addis Ababa 74 | Stanley 75 | Tórshavn 76 | Palikir 77 | Suva 78 | Helsinki 79 | Paris 80 | Cayenne 81 | Papeete 82 | Libreville 83 | Banjul 84 | Tbilisi 85 | Berlin 86 | Accra 87 | Gibraltar 88 | Athens 89 | Nuuk 90 | St. George's 91 | Hagåtña 92 | Guatemala City 93 | St. Peter Port 94 | Conakry 95 | Bissau 96 | Georgetown 97 | Port-au-Prince 98 | Tegucigalpa 99 | Budapest 100 | Reykjavík 101 | New Delhi 102 | Jakarta 103 | Tehran 104 | Baghdad 105 | Dublin 106 | Douglas 107 | Jerusalem 108 | Rome 109 | Kingston 110 | Tokyo 111 | St. 
Helier 112 | Amman 113 | Astana 114 | Nairobi 115 | Tarawa 116 | Pristina 117 | Kuwait City 118 | Bishkek 119 | Vientiane 120 | Riga 121 | Beirut 122 | Maseru 123 | Monrovia 124 | Tripoli 125 | Vaduz 126 | Vilnius 127 | Luxembourg 128 | Skopje 129 | Antananarivo 130 | Lilongwe 131 | Kuala Lumpur 132 | Malé 133 | Bamako 134 | Valletta 135 | Majuro 136 | Nouakchott 137 | Port Louis 138 | Mexico City 139 | Chisinau 140 | Monaco 141 | Ulaanbaatar 142 | Podgorica 143 | Plymouth 144 | Rabat 145 | Maputo 146 | Naypyidaw 147 | Stepanakert 148 | Windhoek 149 | Yaren 150 | Kathmandu 151 | Amsterdam 152 | Nouméa 153 | Wellington 154 | Managua 155 | Niamey 156 | Abuja 157 | Alofi 158 | Kingston 159 | Pyongyang 160 | Nicosia 161 | Belfast 162 | Saipan 163 | Oslo 164 | Muscat 165 | Islamabad 166 | Ngerulmud 167 | Jerusalem 168 | Panama City 169 | Port Moresby 170 | Asunción 171 | Lima 172 | Manila 173 | Adamstown 174 | Warsaw 175 | Lisbon 176 | San Juan 177 | Doha 178 | Taipei 179 | Brazzaville 180 | Bucharest 181 | Moscow 182 | Kigali 183 | Gustavia 184 | Jamestown 185 | Basseterre 186 | Castries 187 | Marigot 188 | St. Pierre 189 | Kingstown 190 | Apia 191 | San Marino 192 | Riyadh 193 | Edinburgh 194 | Dakar 195 | Belgrade 196 | Victoria 197 | Freetown 198 | Singapore 199 | Philipsburg 200 | Bratislava 201 | Ljubljana 202 | Honiara 203 | Mogadishu 204 | Hargeisa 205 | Pretoria 206 | Grytviken 207 | Seoul 208 | Tskhinvali 209 | Juba 210 | Madrid 211 | Sri Jayawardenapura Kotte 212 | Khartoum 213 | Paramaribo 214 | Mbabane 215 | Stockholm 216 | Bern 217 | Damascus 218 | São Tomé 219 | Dushanbe 220 | Dodoma 221 | Bangkok 222 | Lomé 223 | Nukuʻalofa 224 | Tiraspol 225 | Port of Spain 226 | Edinburgh of the Seven Seas 227 | Tunis 228 | Ankara 229 | Ashgabat 230 | Cockburn Town 231 | Funafuti 232 | Kampala 233 | Kiev 234 | Abu Dhabi 235 | London 236 | Washington 237 | Charlotte Amalie 238 | Montevideo 239 | Tashkent 240 | Port Vila 241 | Vatican City 242 | Caracas 243 | Hanoi 244 | Cardiff 245 | Mata-Utu 246 | El Aaiún 247 | Sanaá 248 | Lusaka 249 | Harare 250 | -------------------------------------------------------------------------------- /src/test/resources/countries.txt: -------------------------------------------------------------------------------- 1 | Abkhazia 2 | Afghanistan 3 | Akrotiri and Dhekelia 4 | Albania 5 | Algeria 6 | American Samoa 7 | Andorra 8 | Angola 9 | Anguilla 10 | Antigua and Barbuda 11 | Argentina 12 | Armenia 13 | Aruba 14 | Ascension Island 15 | Australia 16 | Austria 17 | Azerbaijan 18 | Bahamas 19 | Bahrain 20 | Bangladesh 21 | Barbados 22 | Belarus 23 | Belgium 24 | Belize 25 | Benin 26 | Bermuda 27 | Bhutan 28 | Bolivia 29 | Bolivia 30 | Bosnia and Herzegovina 31 | Botswana 32 | Brazil 33 | British Virgin Islands 34 | Brunei 35 | Bulgaria 36 | Burkina Faso 37 | Burundi 38 | Cambodia 39 | Cameroon 40 | Canada 41 | Cape Verde 42 | Cayman Islands 43 | Central African Republic 44 | Chad 45 | Chile 46 | China 47 | Christmas Island 48 | Cocos Islands 49 | Colombia 50 | Comoros 51 | Cook Islands 52 | Costa Rica 53 | Croatia 54 | Cuba 55 | Curaçao 56 | Cyprus 57 | Czech Republic 58 | Côte d'Ivoire 59 | Democratic Republic of the Congo 60 | Denmark 61 | Djibouti 62 | Dominica 63 | Dominican Republic 64 | East Timor 65 | Easter Island 66 | Ecuador 67 | Egypt 68 | El Salvador 69 | Equatorial Guinea 70 | Eritrea 71 | Estonia 72 | Ethiopia 73 | Falkland Islands 74 | Faroe Islands 75 | Federated States of Micronesia 76 | Fiji 77 | Finland 78 | France 79 | French Guiana 80 | French 
Polynesia 81 | Gabon 82 | Gambia 83 | Georgia 84 | Germany 85 | Ghana 86 | Gibraltar 87 | Greece 88 | Greenland 89 | Grenada 90 | Guam 91 | Guatemala 92 | Guernsey 93 | Guinea 94 | Guinea-Bissau 95 | Guyana 96 | Haiti 97 | Honduras 98 | Hungary 99 | Iceland 100 | India 101 | Indonesia 102 | Iran 103 | Iraq 104 | Ireland 105 | Isle of Man 106 | Israel 107 | Italy 108 | Jamaica 109 | Japan 110 | Jersey 111 | Jordan 112 | Kazakhstan 113 | Kenya 114 | Kiribati 115 | Kosovo 116 | Kuwait 117 | Kyrgyzstan 118 | Laos 119 | Latvia 120 | Lebanon 121 | Lesotho 122 | Liberia 123 | Libya 124 | Liechtenstein 125 | Lithuania 126 | Luxembourg 127 | Macedonia 128 | Madagascar 129 | Malawi 130 | Malaysia 131 | Maldives 132 | Mali 133 | Malta 134 | Marshall Islands 135 | Mauritania 136 | Mauritius 137 | Mexico 138 | Moldova 139 | Monaco 140 | Mongolia 141 | Montenegro 142 | Montserrat 143 | Morocco 144 | Mozambique 145 | Myanmar 146 | Nagorno-Karabakh Republic 147 | Namibia 148 | Nauru 149 | Nepal 150 | Netherlands 151 | New Caledonia 152 | New Zealand 153 | Nicaragua 154 | Niger 155 | Nigeria 156 | Niue 157 | Norfolk Island 158 | North Korea 159 | Northern Cyprus 160 | United Kingdom Northern Ireland 161 | Northern Mariana Islands 162 | Norway 163 | Oman 164 | Pakistan 165 | Palau 166 | Palestine 167 | Panama 168 | Papua New Guinea 169 | Paraguay 170 | Peru 171 | Philippines 172 | Pitcairn Islands 173 | Poland 174 | Portugal 175 | Puerto Rico 176 | Qatar 177 | Taiwan 178 | Republic of the Congo 179 | Romania 180 | Russia 181 | Rwanda 182 | Saint Barthélemy 183 | Saint Helena 184 | Saint Kitts and Nevis 185 | Saint Lucia 186 | Saint Martin 187 | Saint Pierre and Miquelon 188 | Saint Vincent and the Grenadines 189 | Samoa 190 | San Marino 191 | Saudi Arabia 192 | Scotland 193 | Senegal 194 | Serbia 195 | Seychelles 196 | Sierra Leone 197 | Singapore 198 | Sint Maarten 199 | Slovakia 200 | Slovenia 201 | Solomon Islands 202 | Somalia 203 | Somaliland 204 | South Africa 205 | South Georgia and the South Sandwich Islands 206 | South Korea 207 | South Ossetia 208 | South Sudan South Sudan 209 | Spain 210 | Sri Lanka 211 | Sudan 212 | Suriname 213 | Swaziland 214 | Sweden 215 | Switzerland 216 | Syria 217 | São Tomé and Príncipe 218 | Tajikistan 219 | Tanzania 220 | Thailand 221 | Togo 222 | Tonga 223 | Transnistria 224 | Trinidad and Tobago 225 | Tristan da Cunha 226 | Tunisia 227 | Turkey 228 | Turkmenistan 229 | Turks and Caicos Islands 230 | Tuvalu 231 | Uganda 232 | Ukraine 233 | United Arab Emirates 234 | United Kingdom; England 235 | United States 236 | United States Virgin Islands 237 | Uruguay 238 | Uzbekistan 239 | Vanuatu 240 | Vatican City 241 | Venezuela 242 | Vietnam 243 | Wales 244 | Wallis and Futuna 245 | Western Sahara 246 | Yemen 247 | Zambia 248 | Zimbabwe 249 | -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the file core/target/unit-tests.log 19 | log4j.rootCategory=INFO, file 20 | log4j.appender.file=org.apache.log4j.FileAppender 21 | log4j.appender.file.append=false 22 | log4j.appender.file.file=target/unit-tests.log 23 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 24 | log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %p %c{1}: %m%n -------------------------------------------------------------------------------- /src/test/resources/reference.conf: -------------------------------------------------------------------------------- 1 | lucenerdd { 2 | 3 | // Name of analyzer as it is under Lucene's package org.apache.lucene.analysis.XX 4 | analyzer.name = "en" 5 | 6 | // n-gram analyzer options; these apply when analyzer.name = "ngram" 7 | analyzer { 8 | ngram.mingram = 2 9 | ngram.maxgram = 5 10 | } 11 | 12 | // Similarity scoring for Lucene 13 | similarity.name = "bm25" // anything else will default to Lucene classic similarity 14 | 15 | // Supported linkage methods 16 | // "collectbroadcast" : Collects the RDD that contains the queries (to be used only if query RDD 17 | // fits in spark driver's memory) 18 | // 19 | // "cartesian" : Uses cartesian product between the partitions of the queries RDD and the partitions 20 | // of LuceneRDD. Note it duplicates each partition of LuceneRDD n times where n is the number of 21 | // partitions of the queries RDD.
22 | linker.method = "collectbroadcast" 23 | 24 | index { 25 | 26 | // Lucene index storage 27 | // Use 'disk' to store the index in Java's temp directory 28 | // Otherwise the index will be stored in memory 29 | store.mode = "disk" 30 | 31 | stringfields{ 32 | 33 | // Analyze text fields or not 34 | analyzed = true 35 | 36 | // Text fields options as in org.apache.lucene.index.IndexOptions 37 | // 38 | // Other options are: 39 | // "DOCS" 40 | // "DOCS_AND_FREQS" 41 | // "DOCS_AND_FREQS_AND_POSITIONS" 42 | // "DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS" 43 | // "NONE" 44 | options = "docs_and_freqs_and_positions_and_offsets" 45 | 46 | // Omit terms norms 47 | terms.omitnorms = false 48 | 49 | // Store term positions 50 | terms.positions = false 51 | 52 | // Store Term vectors (set true, otherwise LuceneRDD.termVectors(fieldName) will fail) 53 | terms.vectors = true 54 | } 55 | } 56 | 57 | // Maximum value on topK queries 58 | query.topk.maxvalue = 100 59 | // Default value of number of returned results 60 | query.topk.default = 10 61 | 62 | // Default value of number of faceted results 63 | query.facets.number.default = 10 64 | 65 | // Spatial related configurations used by ShapeLuceneRDD 66 | spatial { 67 | prefixtree { 68 | name = "quad" // "geohash" or "quad" 69 | maxlevel = 9 // 11 results in sub-meter precision for geohash 70 | maxDistErr = 5.0 // in kilometers 71 | } 72 | 73 | // Shape format can be one of ShapeIO.GeoJSON, ShapeIO.LEGACY, ShapeIO.POLY, ShapeIO.WKT 74 | shape.io.format = "WKT" 75 | 76 | // Supported linkage methods 77 | // "collectbroadcast" : Collects the RDD that contains the queries (to be used only if query RDD 78 | // fits in spark driver's memory) 79 | // 80 | // "cartesian" : Uses cartesian product between the partitions of the queries RDD and the partitions 81 | // of LuceneRDD. Note it duplicates each partition of LuceneRDD n times where n is the number of 82 | // partitions of the queries RDD. 83 | linker.method = "collectbroadcast" 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/test/scala/org/zouzias/spark/lucenerdd/BlockingDedupSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
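// The reference.conf above is standard Typesafe Config, so individual keys
// can be overridden at runtime; the ConfigFactory calls below are an
// assumption about the config backend, shown for illustration:
import com.typesafe.config.ConfigFactory

System.setProperty("lucenerdd.index.store.mode", "disk")
ConfigFactory.invalidateCaches() // make the system-property override visible
val cfg = ConfigFactory.load()
println(cfg.getString("lucenerdd.index.store.mode")) // prints "disk"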
16 | */ 17 | package org.zouzias.spark.lucenerdd 18 | 19 | import com.holdenkarau.spark.testing.SharedSparkContext 20 | import org.apache.lucene.index.Term 21 | import org.apache.lucene.search.{Query, TermQuery} 22 | import org.apache.spark.SparkConf 23 | import org.apache.spark.sql.{Row, SparkSession} 24 | import org.scalatest.BeforeAndAfterEach 25 | import org.scalatest.flatspec.AnyFlatSpec 26 | import org.scalatest._ 27 | import matchers.should._ 28 | 29 | import org.zouzias.spark.lucenerdd.testing.Person 30 | 31 | class BlockingDedupSpec extends AnyFlatSpec 32 | with Matchers 33 | with BeforeAndAfterEach 34 | with SharedSparkContext { 35 | 36 | override val conf: SparkConf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). 37 | setMaster("local[*]"). 38 | setAppName("test"). 39 | set("spark.ui.enabled", "false"). 40 | set("spark.app.id", appID)) 41 | 42 | "LuceneRDD.blockDedup" should "deduplicate elements on unique fields" in { 43 | val spark = SparkSession.builder().getOrCreate() 44 | import spark.implicits._ 45 | 46 | val people: Array[Person] = Array("fear", "death", "water", "fire", "house") 47 | .zipWithIndex.map { case (str, index) => 48 | val email = if (index % 2 == 0) "yes@gmail.com" else "no@gmail.com" 49 | Person(str, index, email) 50 | } 51 | val df = sc.parallelize(people).repartition(2).toDF() 52 | 53 | val linker: Row => Query = { row => 54 | val name = row.getString(row.fieldIndex("name")) 55 | val term = new Term("name", name) 56 | 57 | new TermQuery(term) 58 | } 59 | 60 | 61 | val linked = LuceneRDD.blockDedup(df, linker, Array("email")) 62 | 63 | val (linkedCount, dfCount) = (linked.count, df.count()) 64 | 65 | linkedCount should equal(dfCount) 66 | 67 | // Check for correctness 68 | // Age is a unique index 69 | linked.collect().foreach { case (row, results) => 70 | val (leftAge, rightAge) = (row.getInt(row.fieldIndex("age")), 71 | results.headOption.map(x => x.getInt(x.fieldIndex("age")))) 72 | 73 | rightAge should equal(Some(leftAge)) 74 | 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/test/scala/org/zouzias/spark/lucenerdd/BlockingLinkageSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | package org.zouzias.spark.lucenerdd 18 | 19 | import com.holdenkarau.spark.testing.SharedSparkContext 20 | import org.apache.lucene.index.Term 21 | import org.apache.lucene.search.{Query, TermQuery} 22 | import org.apache.spark.SparkConf 23 | import org.apache.spark.sql.{Row, SparkSession} 24 | import org.scalatest.BeforeAndAfterEach 25 | import org.scalatest.flatspec.AnyFlatSpec 26 | import org.scalatest._ 27 | import matchers.should._ 28 | import org.zouzias.spark.lucenerdd.testing.Person 29 | 30 | class BlockingLinkageSpec extends AnyFlatSpec 31 | with Matchers 32 | with BeforeAndAfterEach 33 | with SharedSparkContext { 34 | 35 | override val conf: SparkConf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). 36 | setMaster("local[*]"). 37 | setAppName("test"). 38 | set("spark.ui.enabled", "false"). 39 | set("spark.app.id", appID)) 40 | 41 | "LuceneRDD.blockEntityLinkage" should "link elements on unique fields" in { 42 | val spark = SparkSession.builder().getOrCreate() 43 | import spark.implicits._ 44 | 45 | val peopleLeft: Array[Person] = Array("fear", "death", "water", "fire", "house") 46 | .zipWithIndex.map { case (str, index) => 47 | val email = if (index % 2 == 0) "yes@gmail.com" else "no@gmail.com" 48 | Person(str, index, email) 49 | } 50 | 51 | val peopleRight: Array[Person] = Array("fear", "death", "water", "fire", "house") 52 | .zipWithIndex.map { case (str, index) => 53 | val email = if (index % 2 == 0) "yes@gmail.com" else "no@gmail.com" 54 | Person(str, index, email) 55 | } 56 | 57 | val leftDF = sc.parallelize(peopleLeft).repartition(2).toDF() 58 | val rightDF = sc.parallelize(peopleRight).repartition(3).toDF() 59 | 60 | // Define a Lucene Term linker 61 | val linker: Row => Query = { row => 62 | val name = row.getString(row.fieldIndex("name")) 63 | val term = new Term("name", name) 64 | 65 | new TermQuery(term) 66 | } 67 | 68 | 69 | val linked = LuceneRDD.blockEntityLinkage(leftDF, rightDF, linker, 70 | Array("email"), Array("email")) 71 | 72 | val (linkedCount, dfCount) = (linked.count, leftDF.count()) 73 | 74 | linkedCount should equal(dfCount) 75 | 76 | // Check for correctness 77 | // Age is a unique index 78 | linked.collect().foreach { case (row, results) => 79 | val (leftAge, rightAge) = (row.getInt(row.fieldIndex("age")), 80 | results.headOption.map(x => x.getInt(x.fieldIndex("age")))) 81 | 82 | rightAge should equal(Some(leftAge)) 83 | 84 | } 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/test/scala/org/zouzias/spark/lucenerdd/LuceneDocToSparkRowpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
16 | */ 17 | package org.zouzias.spark.lucenerdd 18 | 19 | import java.io.{Reader, StringReader} 20 | 21 | import org.apache.lucene.document.{Document, DoublePoint, Field, FloatPoint, IntPoint, LongPoint, StoredField, TextField} 22 | import org.zouzias.spark.lucenerdd.models.SparkScoreDoc 23 | import org.zouzias.spark.lucenerdd.models.SparkScoreDoc.{DocIdField, ScoreField, ShardField} 24 | 25 | import org.scalatest.BeforeAndAfterEach 26 | import org.scalatest.flatspec.AnyFlatSpec 27 | import org.scalatest._ 28 | import matchers.should._ 29 | 30 | 31 | import scala.collection.JavaConverters._ 32 | 33 | class LuceneDocToSparkRowpec extends AnyFlatSpec 34 | with Matchers 35 | with BeforeAndAfterEach { 36 | 37 | val (score: Float, docId: Int, shardIndex: Int) = (1.0f, 1, 2) 38 | val float: Float = 20.001f 39 | val double: Double = 10.1000000001D 40 | 41 | def generate_doc(): Document = { 42 | val doc = new Document() 43 | 44 | // Add long field 45 | doc.add(new LongPoint("longField", 10)) 46 | doc.add(new StoredField("longField", 10)) 47 | 48 | doc.add(new FloatPoint("floatField", float)) 49 | doc.add(new StoredField("floatField", float)) 50 | 51 | doc.add(new IntPoint("intField", 9)) 52 | doc.add(new StoredField("intField", 9)) 53 | 54 | doc.add(new DoublePoint("doubleField", double)) 55 | doc.add(new StoredField("doubleField", double)) 56 | 57 | doc.add(new TextField("textField", "hello world", Field.Store.NO)) 58 | doc.add(new StoredField("textField", "hello world")) 59 | 60 | doc 61 | } 62 | 63 | private val doc: Document = generate_doc() 64 | 65 | val sparkScoreDoc = SparkScoreDoc(score, docId, shardIndex, doc) 66 | 67 | 68 | "SparkScoreDoc.toRow" should "return correct score" in { 69 | val row = sparkScoreDoc.toRow() 70 | row.getFloat(row.fieldIndex(ScoreField)) should equal(score) 71 | } 72 | 73 | "SparkScoreDoc.toRow" should "return correct docId" in { 74 | val row = sparkScoreDoc.toRow() 75 | row.getInt(row.fieldIndex(DocIdField)) should equal(docId) 76 | } 77 | 78 | "SparkScoreDoc.toRow" should "return correct shard number" in { 79 | val row = sparkScoreDoc.toRow() 80 | row.getInt(row.fieldIndex(ShardField)) should equal(shardIndex) 81 | } 82 | 83 | "SparkScoreDoc.toRow" should "return correct number of fields" in { 84 | val row = sparkScoreDoc.toRow() 85 | row.getFields().asScala.count(_.fieldType().stored()) should equal(8) 86 | } 87 | 88 | "SparkScoreDoc.toRow" should "set correctly DoublePoint" in { 89 | val row = sparkScoreDoc.toRow() 90 | row.getDouble(row.fieldIndex("doubleField")) should equal(double) 91 | } 92 | 93 | "SparkScoreDoc.toRow" should "set correctly FloatPoint" in { 94 | val row = sparkScoreDoc.toRow() 95 | row.getFloat(row.fieldIndex("floatField")) should equal(float) 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/test/scala/org/zouzias/spark/lucenerdd/LucenePrimitiveTypesSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.zouzias.spark.lucenerdd 18 | 19 | import com.holdenkarau.spark.testing.SharedSparkContext 20 | import org.apache.spark.SparkConf 21 | import org.scalatest.BeforeAndAfterEach 22 | import org.scalatest.flatspec.AnyFlatSpec 23 | import org.scalatest._ 24 | import matchers.should._ 25 | 26 | 27 | class LucenePrimitiveTypesSpec extends AnyFlatSpec with Matchers 28 | with BeforeAndAfterEach 29 | with SharedSparkContext { 30 | 31 | override val conf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). 32 | setMaster("local[*]"). 33 | setAppName("test"). 34 | set("spark.ui.enabled", "false"). 35 | set("spark.app.id", appID)) 36 | 37 | def randomString(length: Int): String = scala.util.Random.alphanumeric.take(length).mkString 38 | val array = (1 to 24).map(randomString(_)) 39 | 40 | var luceneRDD: LuceneRDD[_] = _ 41 | 42 | override def afterEach() { 43 | luceneRDD.close() 44 | } 45 | 46 | /** 47 | "LuceneRDD" should "work with RDD[List[String]]" in { 48 | val array = Array(List("aaa", "aaa2"), List("bbb", "bbb2"), 49 | List("ccc", "ccc2"), List("ddd"), List("eee")) 50 | val rdd = sc.parallelize(array) 51 | luceneRDD = LuceneRDD(rdd) 52 | luceneRDD.count should be (array.length) 53 | } 54 | */ 55 | 56 | "LuceneRDD" should "work with RDD[Array[String]]" in { 57 | val array = Array(Array("aaa", "aaa2"), Array("bbb", "bbb2"), 58 | Array("ccc", "ccc2"), Array("ddd"), Array("eee")) 59 | val rdd = sc.parallelize(array) 60 | luceneRDD = LuceneRDD(rdd) 61 | luceneRDD.count should be (array.length) 62 | } 63 | 64 | "LuceneRDD" should "work with RDD[Set[String]]" in { 65 | val array = Array(Set("aaa", "aaa2"), Set("bbb", "bbb2"), 66 | Set("ccc", "ccc2"), Set("ddd"), Set("eee")) 67 | val rdd = sc.parallelize(array) 68 | luceneRDD = LuceneRDD(rdd) 69 | luceneRDD.count should be (array.length) 70 | } 71 | 72 | "LuceneRDD" should "work with RDD[String]" in { 73 | val array = Array("aaa", "bbb", "ccc", "ddd", "eee") 74 | val rdd = sc.parallelize(array) 75 | luceneRDD = LuceneRDD(rdd) 76 | luceneRDD.count should be (array.length) 77 | } 78 | 79 | "LuceneRDD" should "work with RDD[Int]" in { 80 | val array = (1 to 22) 81 | val rdd = sc.parallelize(array) 82 | luceneRDD = LuceneRDD(rdd) 83 | luceneRDD.count should be (array.size) 84 | } 85 | 86 | "LuceneRDD" should "work with RDD[Float]" in { 87 | val array: IndexedSeq[Float] = (1 to 22).map(_.toFloat) 88 | val rdd = sc.parallelize(array) 89 | luceneRDD = LuceneRDD(rdd) 90 | luceneRDD.count should be (array.size) 91 | } 92 | 93 | "LuceneRDD" should "work with RDD[Double]" in { 94 | val array: IndexedSeq[Double] = (1 to 22).map(_.toDouble) 95 | val rdd = sc.parallelize(array) 96 | luceneRDD = LuceneRDD(rdd) 97 | luceneRDD.count should be (array.size) 98 | } 99 | 100 | "LuceneRDD" should "work with RDD[Long]" in { 101 | val array: IndexedSeq[Long] = (1 to 22).map(_.toLong) 102 | val rdd = sc.parallelize(array) 103 | luceneRDD = LuceneRDD(rdd) 104 | luceneRDD.count should equal (array.size) 105 | } 106 | 107 | "LuceneRDD" should "work with RDD[Map[String, String]]" in { 108 | val maps = List(Map( 
"a" -> "hello"), Map("b" -> "world"), Map("c" -> "how are you")) 109 | val rdd = sc.parallelize(maps) 110 | luceneRDD = LuceneRDD(rdd) 111 | luceneRDD.count should equal (maps.size) 112 | luceneRDD.termQuery("a", "hello").isEmpty() should equal (false) 113 | luceneRDD.prefixQuery("b", "wor").isEmpty() should equal (false) 114 | luceneRDD.prefixQuery("a", "no").isEmpty() should equal (true) 115 | } 116 | 117 | "LuceneRDD" should "work with RDD[String] and ignore null values" in { 118 | val array = Array("aaa", null, "ccc", null, "eee") 119 | val rdd = sc.parallelize(array) 120 | luceneRDD = LuceneRDD(rdd) 121 | luceneRDD.count should be (array.length) 122 | } 123 | 124 | } -------------------------------------------------------------------------------- /src/test/scala/org/zouzias/spark/lucenerdd/LuceneRDDCustomCaseClassImplicitsSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.zouzias.spark.lucenerdd 18 | 19 | import com.holdenkarau.spark.testing.SharedSparkContext 20 | import org.apache.spark.SparkConf 21 | import org.zouzias.spark.lucenerdd.testing.Person 22 | import org.scalatest.BeforeAndAfterEach 23 | import org.scalatest.flatspec.AnyFlatSpec 24 | import org.scalatest._ 25 | import matchers.should._ 26 | 27 | 28 | class LuceneRDDCustomCaseClassImplicitsSpec extends AnyFlatSpec 29 | with Matchers 30 | with BeforeAndAfterEach 31 | with SharedSparkContext { 32 | 33 | var luceneRDD: LuceneRDD[_] = _ 34 | 35 | override def afterEach() { 36 | luceneRDD.close() 37 | } 38 | 39 | override val conf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). 40 | setMaster("local[*]"). 41 | setAppName("test"). 42 | set("spark.ui.enabled", "false"). 
43 | set("spark.app.id", appID)) 44 | 45 | val elem: Array[Person] = Array("fear", "death", "water", "fire", "house") 46 | .zipWithIndex.map{ case (str, index) => Person(str, index, s"${str}@gmail.com")} 47 | 48 | "LuceneRDD(case class).count" should "handle nulls properly" in { 49 | val elemsWithNulls = Array("fear", "death", "water", "fire", "house") 50 | .zipWithIndex.map{ case (str, index) => Person(str, index, null)} 51 | val rdd = sc.parallelize(elemsWithNulls) 52 | luceneRDD = LuceneRDD(rdd) 53 | luceneRDD.count() should equal (elemsWithNulls.length) 54 | } 55 | 56 | "LuceneRDD(case class).count" should "return correct number of elements" in { 57 | val rdd = sc.parallelize(elem) 58 | luceneRDD = LuceneRDD(rdd) 59 | luceneRDD.count() should equal (elem.length) 60 | } 61 | 62 | "LuceneRDD(case class).fields" should "return all fields" in { 63 | val rdd = sc.parallelize(elem) 64 | luceneRDD = LuceneRDD(rdd) 65 | 66 | luceneRDD.fields().size should equal(3) 67 | luceneRDD.fields().contains("name") should equal(true) 68 | luceneRDD.fields().contains("age") should equal(true) 69 | luceneRDD.fields().contains("email") should equal(true) 70 | } 71 | 72 | "LuceneRDD(case class).termQuery" should "correctly search with TermQueries" in { 73 | val rdd = sc.parallelize(elem) 74 | luceneRDD = LuceneRDD(rdd) 75 | 76 | val results = luceneRDD.termQuery("name", "water") 77 | results.count() should equal(1) 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/test/scala/org/zouzias/spark/lucenerdd/LuceneRDDMoreLikeThisSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.zouzias.spark.lucenerdd 18 | 19 | import com.holdenkarau.spark.testing.SharedSparkContext 20 | import org.apache.spark.SparkConf 21 | import scala.collection.JavaConverters._ 22 | import org.scalatest.BeforeAndAfterEach 23 | import org.scalatest.flatspec.AnyFlatSpec 24 | import org.scalatest._ 25 | import matchers.should._ 26 | 27 | 28 | import scala.io.Source 29 | 30 | class LuceneRDDMoreLikeThisSpec extends AnyFlatSpec 31 | with Matchers 32 | with BeforeAndAfterEach 33 | with SharedSparkContext { 34 | 35 | var luceneRDD: LuceneRDD[_] = _ 36 | 37 | 38 | override val conf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). 39 | setMaster("local[*]"). 40 | setAppName("test"). 41 | set("spark.ui.enabled", "false"). 
42 | set("spark.app.id", appID)) 43 | 44 | override def afterEach() { 45 | luceneRDD.close() 46 | } 47 | 48 | "LuceneRDD.moreLikeThis" should "return relevant documents" in { 49 | val words: Seq[String] = Source.fromFile("src/test/resources/alice.txt") 50 | .getLines().map(_.toLowerCase).toSeq 51 | val rdd = sc.parallelize(words) 52 | luceneRDD = LuceneRDD(rdd) 53 | val results = luceneRDD 54 | .moreLikeThis("_1", "alice adventures wonderland", 1, 1) 55 | .collect() 56 | 57 | results.length > 0 should equal(true) 58 | val firstDoc = results.head 59 | val x = firstDoc.getString(firstDoc.fieldIndex("_1")) 60 | 61 | x.contains("alice") && 62 | x.contains("wonderland") && 63 | x.contains("adventures") should equal(true) 64 | 65 | val lastDoc = results.last 66 | val y = lastDoc.getString(lastDoc.fieldIndex("_1")) 67 | 68 | 69 | y.contains("alice") && 70 | !y.contains("wonderland") && 71 | !y.contains("adventures") should equal(true) 72 | 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/test/scala/org/zouzias/spark/lucenerdd/LuceneRDDSearchSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.zouzias.spark.lucenerdd 18 | 19 | import com.holdenkarau.spark.testing.SharedSparkContext 20 | import org.apache.spark.SparkConf 21 | import org.scalatest.BeforeAndAfterEach 22 | import org.scalatest.flatspec.AnyFlatSpec 23 | import org.scalatest._ 24 | import matchers.should._ 25 | import org.zouzias.spark.lucenerdd.testing.LuceneRDDTestUtils 26 | 27 | class LuceneRDDSearchSpec extends AnyFlatSpec 28 | with Matchers 29 | with BeforeAndAfterEach 30 | with LuceneRDDTestUtils 31 | with SharedSparkContext { 32 | 33 | var luceneRDD: LuceneRDD[_] = _ 34 | 35 | override def Radius: Double = 0 36 | 37 | override val conf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). 38 | setMaster("local[*]"). 39 | setAppName("test"). 40 | set("spark.ui.enabled", "false"). 
41 | set("spark.app.id", appID)) 42 | 43 | override def afterEach() { 44 | luceneRDD.close() 45 | } 46 | 47 | 48 | val First = "_1" 49 | 50 | val array = List("fear", "death", " apologies", "romance", "tree", "fashion", "fascism") 51 | 52 | "LuceneRDD.query" should "use phrase query syntax" in { 53 | val words = Array("aabaa", "aaacaa", "aadaa", "aaaa", "qwerty") 54 | val rdd = sc.parallelize(words) 55 | luceneRDD = LuceneRDD(rdd) 56 | luceneRDD.query("_1:aadaa").isEmpty() should equal (false) 57 | luceneRDD.query("_1:aa*").count() should equal (4) 58 | luceneRDD.query("_1:q*").count() should equal (1) 59 | } 60 | 61 | "LuceneRDD.count" should "return correct number of elements" in { 62 | val rdd = sc.parallelize(array) 63 | luceneRDD = LuceneRDD(rdd) 64 | luceneRDD.count should equal (array.size) 65 | } 66 | 67 | "LuceneRDD.termQuery" should "correctly search with TermQueries" in { 68 | val rdd = sc.parallelize(array) 69 | luceneRDD = LuceneRDD(rdd) 70 | val results = luceneRDD.termQuery(First, array(1)) 71 | results.count() should equal (1) 72 | } 73 | 74 | "LuceneRDD.prefixQuery" should "correctly search with PrefixQueries" in { 75 | 76 | val prefices = Array("aaaabcd", "aaadcb", "aaz", "az", "qwerty") 77 | val rdd = sc.parallelize(prefices) 78 | luceneRDD = LuceneRDD(rdd) 79 | 80 | luceneRDD.prefixQuery(First, "a").count() should equal (4) 81 | luceneRDD.prefixQuery(First, "aa").count() should equal(3) 82 | luceneRDD.prefixQuery(First, "aaa").count() should equal (2) 83 | luceneRDD.prefixQuery(First, "aaaa").count() should equal (1) 84 | } 85 | 86 | "LuceneRDD.fuzzyQuery" should "correctly search with FuzzyQuery" in { 87 | val rdd = sc.parallelize(array) 88 | luceneRDD = LuceneRDD(rdd) 89 | 90 | luceneRDD.fuzzyQuery(First, "fear", 1).count() should equal (1) 91 | luceneRDD.fuzzyQuery(First, "fascsm", 1).count() should equal(1) 92 | luceneRDD.fuzzyQuery(First, "dath", 1).count() should equal (1) 93 | luceneRDD.fuzzyQuery(First, "tree", 1).count() should equal (1) 94 | } 95 | 96 | /* 97 | "LuceneRDD.fuzzyQuery" should "correctly search for Bern in Cities dataset" in { 98 | val cities = Source.fromFile("src/test/resources/cities.txt").getLines().toSeq 99 | val rdd = sc.parallelize(cities) 100 | luceneRDD = LuceneRDD(rdd) 101 | 102 | val results = luceneRDD.fuzzyQuery(First, "Bern", 1).collect() 103 | 104 | // First result must be Bern 105 | results.headOption 106 | .forall( first => first.doc.textField(First).contains("Bern")) should equal(true) 107 | 108 | // Results must be sorted (descending) 109 | sortedDescSparkScoreDocs(results) should equal(true) 110 | } 111 | */ 112 | 113 | "LuceneRDD.phraseQuery" should "correctly search with PhraseQuery" in { 114 | val phrases = Array("hello world", "the company name was", "highlight lucene") 115 | val rdd = sc.parallelize(phrases) 116 | luceneRDD = LuceneRDD(rdd) 117 | 118 | luceneRDD.phraseQuery(First, "company name", 10).count() should equal (1) 119 | luceneRDD.phraseQuery(First, "hello world", 10).count() should equal (1) 120 | luceneRDD.phraseQuery(First, "highlight lucene", 10).count() should equal(1) 121 | } 122 | } -------------------------------------------------------------------------------- /src/test/scala/org/zouzias/spark/lucenerdd/LuceneRDDSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
-------------------------------------------------------------------------------- /src/test/scala/org/zouzias/spark/lucenerdd/LuceneRDDSpec.scala: --------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd
18 |
19 | import com.holdenkarau.spark.testing.SharedSparkContext
20 | import org.apache.spark.SparkConf
21 | import org.scalatest.flatspec.AnyFlatSpec
22 | import org.scalatest._
23 | import matchers.should._
24 |
25 | class LuceneRDDSpec extends AnyFlatSpec
26 | with Matchers
27 | with BeforeAndAfterEach
28 | with SharedSparkContext {
29 |
30 | var luceneRDD: LuceneRDD[_] = _
31 |
32 |
33 | override val conf: SparkConf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf().
34 | setMaster("local[*]").
35 | setAppName("test").
36 | set("spark.ui.enabled", "false").
37 | set("spark.app.id", appID))
38 |
39 | override def afterEach() {
40 | luceneRDD.close()
41 | }
42 |
43 | "LuceneRDD.exists(Map)" should "find elements that exist" in {
44 | val words = Array("aabaa", "aaacaa", "aadaa", "aaaa", "qwerty")
45 | val rdd = sc.parallelize(words)
46 | luceneRDD = LuceneRDD(rdd)
47 | luceneRDD.exists(Map("_1" -> "aaaa")) should equal (true)
48 | }
49 |
50 | "LuceneRDD.exists(Map)" should "not find elements that don't exist" in {
51 | val words = Array("aabaa", "aaacaa", "aadaa", "aaaa", "qwerty")
52 | val rdd = sc.parallelize(words)
53 | luceneRDD = LuceneRDD(rdd)
54 | luceneRDD.exists(Map("_1" -> "doNotExist")) should equal (false)
55 | }
56 |
57 | "LuceneRDD.exists(T)" should "find elements that exist" in {
58 | val words = Array("aabaa", "aaacaa", "aadaa", "aaaa", "qwerty")
59 | val rdd = sc.parallelize(words)
60 | val localLuceneRDD = LuceneRDD(rdd)
61 | localLuceneRDD.exists("aaaa") should equal (true)
62 | localLuceneRDD.close()
63 | }
64 |
65 | "LuceneRDD.exists(T)" should "not find elements that don't exist" in {
66 | val words = Array("aabaa", "aaacaa", "aadaa", "aaaa", "qwerty")
67 | val rdd = sc.parallelize(words)
68 | val localLuceneRDD = LuceneRDD(rdd)
69 | localLuceneRDD.exists("doNotExist") should equal (false)
70 | localLuceneRDD.close()
71 | }
72 |
73 | "LuceneRDD.count" should "correctly count the results" in {
74 | val words = Array("aabaa", "aaacaa", "aadaa", "aaaa", "qwerty")
75 | val rdd = sc.parallelize(words)
76 | luceneRDD = LuceneRDD(rdd)
77 | luceneRDD.count should equal (5)
78 | }
79 |
80 | "LuceneRDD.count" should "count zero on empty RDD" in {
81 | val words = Array.empty[String]
82 | val rdd = sc.parallelize(words)
83 | luceneRDD = LuceneRDD(rdd)
84 | luceneRDD.count should equal (0)
85 | }
86 |
87 | "LuceneRDD.filter" should "correctly filter existing elements" in {
88 | val words = Array("aabaa", "aaacaa", "aadaa", "aaaa", "qwerty")
89 | val rdd = sc.parallelize(words)
90 | luceneRDD = LuceneRDD(rdd)
91 | luceneRDD.filter(x => x.startsWith("aaa")).count should equal (2)
92 | }
93 |
94 | "LuceneRDD.filter" should "not filter non-existing elements" in {
95 | val words = Array("aabaa", "aaacaa", "aadaa", "aaaa", "qwerty")
96 | val rdd = sc.parallelize(words)
97 | luceneRDD = LuceneRDD(rdd)
98 | luceneRDD.filter(x => x.startsWith("iDoNotExist")).count should equal (0)
99 | }
100 |
101 | "LuceneRDD.fields" should "return _1 as default field" in {
102 | val words = Array("aabaa", "aaacaa", "aadaa", "aaaa", "qwerty")
103 | val rdd = sc.parallelize(words)
104 | luceneRDD = LuceneRDD(rdd)
105 | luceneRDD.fields().contains("_1") should equal(true)
106 | }
107 |
108 | "LuceneRDD.fields" should "correctly return field types" in {
109 | val words = Array(("a", 1.0F), ("b", 2.0F), ("c", 3.0F))
110 | val rdd = sc.parallelize(words)
111 | luceneRDD = LuceneRDD(rdd)
112 | luceneRDD.fields().contains("_1") should equal(true)
113 | luceneRDD.fields().contains("_2") should equal(true)
114 | }
115 |
116 | "LuceneRDD.fields" should "return correct fields with RDD[Map[String, String]]" in {
117 | val maps = List(Map( "a" -> "hello"), Map("b" -> "world"), Map("c" -> "how are you"))
118 | val rdd = sc.parallelize(maps)
119 | luceneRDD = LuceneRDD(rdd)
120 | luceneRDD.fields() should equal(Set("a", "b", "c"))
121 | }
122 |
123 | "LuceneRDD.version" should "return project sbt build information" in {
124 | val map = LuceneRDD.version()
125 | map.contains("name") should equal(true)
126 | map.contains("builtAtMillis") should equal(true)
127 | map.contains("scalaVersion") should equal(true)
128 | map.contains("version") should equal(true)
129 | map.contains("sbtVersion") should equal(true)
130 | map.contains("builtAtString") should equal(true)
131 | }
132 | }
133 |
-------------------------------------------------------------------------------- /src/test/scala/org/zouzias/spark/lucenerdd/LuceneRDDTermVectorsSpec.scala: --------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.zouzias.spark.lucenerdd
18 |
19 | import com.holdenkarau.spark.testing.SharedSparkContext
20 | import org.apache.spark.SparkConf
21 | import org.scalatest.BeforeAndAfterEach
22 | import org.scalatest.flatspec.AnyFlatSpec
23 | import org.scalatest._
24 | import matchers.should._
25 |
26 | import org.zouzias.spark.lucenerdd.testing.LuceneRDDTestUtils
27 |
28 | class LuceneRDDTermVectorsSpec extends AnyFlatSpec
29 | with Matchers
30 | with BeforeAndAfterEach
31 | with LuceneRDDTestUtils
32 | with SharedSparkContext {
33 |
34 | var luceneRDD: LuceneRDD[_] = _
35 |
36 | override def Radius: Double = 0
37 |
38 | override val conf: SparkConf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf().
39 | setMaster("local[*]").
40 | setAppName("test").
41 | set("spark.ui.enabled", "false").
42 | set("spark.app.id", appID)) 43 | 44 | override def afterEach() { 45 | luceneRDD.close() 46 | } 47 | 48 | val First = "_1" 49 | 50 | "LuceneRDD.termVectors" should "return valid terms" in { 51 | 52 | val words = Array("To smile or not to smile smile", 53 | "Don't cry because it's over, smile because it happened", 54 | "So many books, so little time", 55 | "A room without books is like a body without a soul", 56 | "If you tell the truth, you don't have to remember anything") 57 | val rdd = sc.parallelize(words) 58 | 59 | luceneRDD = LuceneRDD(rdd) 60 | 61 | val terms = luceneRDD.termVectors(First).collect() 62 | 63 | // These terms should exist 64 | terms.exists(_.term.compareToIgnoreCase("time") == 0) should equal(true) 65 | terms.exists(_.term.compareToIgnoreCase("room") == 0) should equal(true) 66 | terms.exists(_.term.compareToIgnoreCase("soul") == 0) should equal(true) 67 | terms.exists(_.term.compareToIgnoreCase("smile") == 0) should equal(true) 68 | 69 | terms.exists(t => (t.term.compareToIgnoreCase("smile") == 0) 70 | && t.count == 3) should equal (true) 71 | terms.exists(t => (t.term.compareToIgnoreCase("becaus") == 0) 72 | && t.count == 2) should equal (true) 73 | } 74 | } -------------------------------------------------------------------------------- /src/test/scala/org/zouzias/spark/lucenerdd/LuceneRDDTuplesSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.zouzias.spark.lucenerdd 18 | 19 | import com.holdenkarau.spark.testing.SharedSparkContext 20 | import org.apache.spark.SparkConf 21 | import org.scalatest.BeforeAndAfterEach 22 | import org.scalatest.flatspec.AnyFlatSpec 23 | import org.scalatest._ 24 | import matchers.should._ 25 | 26 | 27 | class LuceneRDDTuplesSpec extends AnyFlatSpec with Matchers with SharedSparkContext { 28 | 29 | val First = "_1" 30 | val Second = "_2" 31 | 32 | val array = List("fear", "death", " apology", "romance", "tree", "fashion", "fascism") 33 | 34 | 35 | override val conf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). 36 | setMaster("local[*]"). 37 | setAppName("test"). 38 | set("spark.ui.enabled", "false"). 
39 | set("spark.app.id", appID)) 40 | 41 | "LuceneRDD" should "work with Tuple2" in { 42 | val rdd = sc.parallelize(array).map(x => (x, x)) 43 | val luceneRDD = LuceneRDD(rdd) 44 | luceneRDD.count should equal (array.size) 45 | } 46 | 47 | "LuceneRDD" should "work with Tuple3" in { 48 | val rdd = sc.parallelize(array).map(x => (x, x, x)) 49 | val luceneRDD = LuceneRDD(rdd) 50 | val results = luceneRDD.termQuery(Second, array(1)) 51 | results.count should equal (1) 52 | } 53 | 54 | "LuceneRDD" should "work with Tuple4" in { 55 | val rdd = sc.parallelize(array).map(x => (x, x, x, x)) 56 | val luceneRDD = LuceneRDD(rdd) 57 | val results = luceneRDD.termQuery(Second, array(1)) 58 | results.count should equal (1) 59 | } 60 | 61 | "LuceneRDD" should "work with Tuple5" in { 62 | val rdd = sc.parallelize(array).map(x => (x, x, x, x, x)) 63 | val luceneRDD = LuceneRDD(rdd) 64 | val results = luceneRDD.termQuery(Second, array(1)) 65 | results.count should equal (1) 66 | } 67 | 68 | "LuceneRDD" should "work with Tuple6" in { 69 | val rdd = sc.parallelize(array).map(x => (x, x, x, x, x, x)) 70 | val luceneRDD = LuceneRDD(rdd) 71 | val results = luceneRDD.termQuery(Second, array(1)) 72 | results.count should equal (1) 73 | } 74 | 75 | "LuceneRDD" should "work with Tuple7" in { 76 | val rdd = sc.parallelize(array).map(x => (x, x, 2.0d, 1.0d, x, 1, x)) 77 | val luceneRDD = LuceneRDD(rdd) 78 | val results = luceneRDD.termQuery(First, array.head) 79 | results.count should equal (1) 80 | } 81 | 82 | "LuceneRDD" should "work with Tuple8" in { 83 | val rdd = sc.parallelize(array).map(x => (x, x, 2.0d, 1.0d, x, 1, x, 3.4)) 84 | val luceneRDD = LuceneRDD(rdd) 85 | val results = luceneRDD.termQuery(First, array(1)) 86 | results.count should equal (1) 87 | } 88 | 89 | "LuceneRDD" should "work with mixed types in Tuples" in { 90 | val rdd = sc.parallelize(array).map(x => (x, 1, x, 2L, x, 3.0F)) 91 | val luceneRDD = LuceneRDD(rdd) 92 | val results = luceneRDD.termQuery(First, array(1)) 93 | results.count should equal (1) 94 | } 95 | } -------------------------------------------------------------------------------- /src/test/scala/org/zouzias/spark/lucenerdd/analyzers/AnalyzersConfigurableSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.zouzias.spark.lucenerdd.analyzers 18 | 19 | import org.apache.lucene.analysis.en.EnglishAnalyzer 20 | import org.apache.lucene.analysis.el.GreekAnalyzer 21 | import org.apache.lucene.analysis.de.GermanAnalyzer 22 | import org.scalatest.BeforeAndAfterEach 23 | import org.scalatest.flatspec.AnyFlatSpec 24 | import org.scalatest._ 25 | import matchers.should._ 26 | 27 | 28 | class AnalyzersConfigurableSpec extends AnyFlatSpec with Matchers 29 | with BeforeAndAfterEach 30 | with AnalyzerConfigurable { 31 | 32 | "AnalyzersConfigurable.getAnalyzer" should "return english analyzer with 'en' input" in { 33 | val englishAnalyzer = getAnalyzer(Some("en")) 34 | englishAnalyzer shouldNot equal(null) 35 | englishAnalyzer.isInstanceOf[EnglishAnalyzer] should equal(true) 36 | } 37 | 38 | "AnalyzersConfigurable.getAnalyzer" should 39 | "return custom test analyzer with 'org.apache.lucene.analysis.el.GreekAnalyzer'" in { 40 | val greekAnalyzer = getAnalyzer(Some("org.apache.lucene.analysis.el.GreekAnalyzer")) 41 | greekAnalyzer shouldNot equal(null) 42 | greekAnalyzer.isInstanceOf[GreekAnalyzer] should equal(true) 43 | } 44 | 45 | "AnalyzersConfigurable.getAnalyzer" should 46 | "return custom test analyzer with 'org.apache.lucene.analysis.de.GermanAnalyzer'" in { 47 | val deutschAnalyzer = getAnalyzer(Some("org.apache.lucene.analysis.de.GermanAnalyzer")) 48 | deutschAnalyzer shouldNot equal(null) 49 | deutschAnalyzer.isInstanceOf[GermanAnalyzer] should equal(true) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/test/scala/org/zouzias/spark/lucenerdd/facets/FacetedLuceneRDDImplicitsSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.zouzias.spark.lucenerdd.facets 18 | 19 | import com.holdenkarau.spark.testing.SharedSparkContext 20 | import org.apache.spark.SparkConf 21 | import org.apache.spark.sql.SparkSession 22 | import org.zouzias.spark.lucenerdd.testing.FavoriteCaseClass 23 | import org.zouzias.spark.lucenerdd.{LuceneRDD, LuceneRDDKryoRegistrator} 24 | 25 | import org.scalatest.BeforeAndAfterEach 26 | import org.scalatest.flatspec.AnyFlatSpec 27 | import org.scalatest._ 28 | import matchers.should._ 29 | 30 | 31 | class FacetedLuceneRDDImplicitsSpec extends AnyFlatSpec 32 | with Matchers 33 | with BeforeAndAfterEach 34 | with SharedSparkContext { 35 | 36 | var luceneRDD: LuceneRDD[_] = _ 37 | 38 | 39 | override val conf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). 40 | setMaster("local[*]"). 41 | setAppName("test"). 42 | set("spark.ui.enabled", "false"). 
43 | set("spark.app.id", appID)) 44 | 45 | override def afterEach() { 46 | luceneRDD.close() 47 | } 48 | 49 | 50 | val elem = Array("fear", "death", "water", "fire", "house") 51 | .zipWithIndex.map{ case (str, index) => 52 | FavoriteCaseClass(str, index, 10L, 12.3F, s"${str}@gmail.com")} 53 | 54 | 55 | "FacetedLuceneRDD(case class).count" should "return correct number of elements" in { 56 | val rdd = sc.parallelize(elem) 57 | val spark = SparkSession.builder().getOrCreate() 58 | import spark.implicits._ 59 | val df = rdd.toDF() 60 | luceneRDD = FacetedLuceneRDD(df) 61 | luceneRDD.count should equal (elem.size) 62 | } 63 | 64 | "FacetedLuceneRDD(case class).fields" should "return all fields" in { 65 | val rdd = sc.parallelize(elem) 66 | val spark = SparkSession.builder().getOrCreate() 67 | import spark.implicits._ 68 | val df = rdd.toDF() 69 | luceneRDD = FacetedLuceneRDD(df) 70 | 71 | luceneRDD.fields().size should equal(5) 72 | luceneRDD.fields().contains("name") should equal(true) 73 | luceneRDD.fields().contains("age") should equal(true) 74 | luceneRDD.fields().contains("myLong") should equal(true) 75 | luceneRDD.fields().contains("myFloat") should equal(true) 76 | luceneRDD.fields().contains("email") should equal(true) 77 | } 78 | 79 | "FacetedLuceneRDD(case class).termQuery" should "correctly search with TermQueries" in { 80 | val rdd = sc.parallelize(elem) 81 | val spark = SparkSession.builder().getOrCreate() 82 | import spark.implicits._ 83 | val df = rdd.toDF() 84 | luceneRDD = FacetedLuceneRDD(df) 85 | 86 | val results = luceneRDD.termQuery("name", "water") 87 | results.count() should equal(1) 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/test/scala/org/zouzias/spark/lucenerdd/query/LuceneQueryHelpersSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */
17 | package org.zouzias.spark.lucenerdd.query
18 |
19 | import org.apache.lucene.analysis.Analyzer
20 | import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper
21 | import org.apache.lucene.document.Field.Store
22 | import org.apache.lucene.document._
23 | import org.apache.lucene.facet.FacetField
24 | import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader
25 | import org.apache.lucene.index.DirectoryReader
26 | import org.apache.lucene.search.IndexSearcher
27 | import org.zouzias.spark.lucenerdd.facets.FacetedLuceneRDD
28 | import org.zouzias.spark.lucenerdd.store.IndexWithTaxonomyWriter
29 | import scala.collection.JavaConverters._
30 | import org.scalatest.BeforeAndAfterEach
31 | import org.scalatest.flatspec.AnyFlatSpec
32 | import org.scalatest._
33 | import matchers.should._
34 |
35 |
36 | import scala.io.Source
37 |
38 | class LuceneQueryHelpersSpec extends AnyFlatSpec
39 | with IndexWithTaxonomyWriter
40 | with Matchers
41 | with BeforeAndAfterEach {
42 |
43 | // Load countries
44 | val countries: Seq[String] = Source
45 | .fromFile("src/test/resources/countries.txt")
46 | .getLines()
47 | .map(_.toLowerCase()).toSeq
48 |
49 | val indexAnalyzerPerField: Map[String, String] = Map("name"
50 | -> "org.apache.lucene.analysis.en.EnglishAnalyzer")
51 |
52 | private val MaxFacetValue: Int = 10
53 |
54 | override def indexAnalyzer(): Analyzer = getAnalyzer(Some("en"))
55 |
56 | override def indexPerFieldAnalyzer(): PerFieldAnalyzerWrapper = {
57 | val analyzerPerField: Map[String, Analyzer] = indexAnalyzerPerField
58 | .mapValues(x => getAnalyzer(Some(x)))
59 | new PerFieldAnalyzerWrapper(indexAnalyzer(), analyzerPerField.asJava)
60 | }
61 |
62 | countries.zipWithIndex.foreach { case (elem, index) =>
63 | val doc = convertToDoc(index % MaxFacetValue, elem)
64 | indexWriter.addDocument(FacetsConfig.build(taxoWriter, doc))
65 | }
66 |
67 | indexWriter.commit()
68 | taxoWriter.close()
69 | indexWriter.close()
70 |
71 | private val indexReader = DirectoryReader.open(IndexDir)
72 | private val indexSearcher = new IndexSearcher(indexReader)
73 | private lazy val taxoReader = new DirectoryTaxonomyReader(TaxonomyDir)
74 |
75 |
76 | private lazy val TestFacetName = s"_2${FacetedLuceneRDD.FacetTextFieldSuffix}"
77 |
78 | def convertToDoc(pos: Int, text: String): Document = {
79 | val doc = new Document()
80 | doc.add(new StringField("_1", text, Store.YES))
81 | doc.add(new FacetField(s"_1${FacetedLuceneRDD.FacetTextFieldSuffix}", text))
82 | doc.add(new IntPoint("_2", pos))
83 | doc.add(new StoredField("_2", pos))
84 | doc.add(new FacetField(TestFacetName, pos.toString))
85 | doc
86 | }
87 |
88 | "LuceneQueryHelpers.fields" should "return the list of fields" in {
89 | LuceneQueryHelpers.fields(indexSearcher) should equal (Set("_1", "_2"))
90 | }
91 |
92 | "LuceneQueryHelpers.totalDocs" should "return correct total document counts" in {
93 | LuceneQueryHelpers.totalDocs(indexSearcher) should equal (countries.size)
94 | }
95 |
96 | "LuceneQueryHelpers.facetedTextSearch" should "return correct facet counts" in {
97 | val facets = LuceneQueryHelpers.facetedTextSearch(indexSearcher, taxoReader,
98 | FacetsConfig, "*:*", TestFacetName, 100, indexAnalyzer())
99 |
100 | facets.facetName should equal(TestFacetName)
101 | facets.facets.size should equal(MaxFacetValue)
102 | }
103 |
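The facet assertions above only check the facet name and the number of distinct values. A hedged usage sketch against the same searcher, assuming (as the assertions suggest) that the returned SparkFacetResult exposes a value-to-count map in its facets member:

    val perBucket = LuceneQueryHelpers.facetedTextSearch(indexSearcher, taxoReader,
      FacetsConfig, "*:*", TestFacetName, MaxFacetValue, indexAnalyzer())
    // Print each facet value together with its document count
    perBucket.facets.foreach { case (value, count) => println(s"$value -> $count") }

104 | "LuceneQueryHelpers.termQuery" should "return correct documents" in {
105 | val greece = "greece"
106 | val topDocs = LuceneQueryHelpers
107 | .termQuery(indexSearcher, "_1", greece, 100)
108 |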
.map(_.toRow()) 109 | 110 | topDocs.size should equal(1) 111 | 112 | topDocs.exists(d => d.getString(d.fieldIndex("_1")). 113 | toLowerCase() 114 | .contains(greece)) should equal(true) 115 | } 116 | 117 | "LuceneQueryHelpers.prefixQuery" should "return correct documents" in { 118 | val prefix = "gree" 119 | val topDocs = LuceneQueryHelpers 120 | .prefixQuery(indexSearcher, "_1", prefix, 100) 121 | .map(_.toRow()) 122 | 123 | topDocs.forall(d => d.getString(d.fieldIndex("_1")) 124 | .toLowerCase() 125 | .contains(prefix)) should equal(true) 126 | } 127 | 128 | } 129 | -------------------------------------------------------------------------------- /src/test/scala/org/zouzias/spark/lucenerdd/response/LuceneRDDResponseSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.zouzias.spark.lucenerdd.response 18 | 19 | import com.holdenkarau.spark.testing.SharedSparkContext 20 | import org.apache.spark.SparkConf 21 | import org.apache.spark.sql.SparkSession 22 | import org.zouzias.spark.lucenerdd.{LuceneRDD, LuceneRDDKryoRegistrator} 23 | import org.zouzias.spark.lucenerdd._ 24 | import org.zouzias.spark.lucenerdd.models.SparkScoreDoc 25 | import org.zouzias.spark.lucenerdd.testing.FavoriteCaseClass 26 | import org.scalatest.BeforeAndAfterEach 27 | import org.scalatest.flatspec.AnyFlatSpec 28 | import org.scalatest._ 29 | import matchers.should._ 30 | 31 | 32 | class LuceneRDDResponseSpec extends AnyFlatSpec with Matchers 33 | with BeforeAndAfterEach 34 | with SharedSparkContext { 35 | 36 | override val conf: SparkConf = LuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). 37 | setMaster("local[*]"). 38 | setAppName("test"). 39 | set("spark.ui.enabled", "false"). 
40 | set("spark.app.id", appID)) 41 | 42 | def randomString(length: Int): String = scala.util.Random.alphanumeric.take(length).mkString 43 | 44 | var luceneRDD: LuceneRDD[_] = _ 45 | 46 | override def afterEach() { 47 | luceneRDD.close() 48 | } 49 | 50 | "LuceneRDDResponseSpec.take(k)" should "return exactly k elements" in { 51 | val array = Array("aaa", "bbb", "ccc", "ddd", "eee") 52 | val rdd = sc.parallelize(array) 53 | luceneRDD = LuceneRDD(rdd) 54 | val result = luceneRDD.query("*:*", 10) 55 | result.take(2).length should be (2) 56 | } 57 | 58 | "LuceneRDDResponseSpec.collect()" should "return all elements" in { 59 | val array = Array("aaa", "bbb", "ccc", "ddd", "eee") 60 | val rdd = sc.parallelize(array) 61 | luceneRDD = LuceneRDD(rdd) 62 | val result = luceneRDD.query("*:*", 10) 63 | result.collect().length should be (array.length) 64 | } 65 | 66 | "LuceneRDDResponseSpec.toDF()" should "convert to DataFrame" in { 67 | implicit val sparkSession: SparkSession = SparkSession.builder().getOrCreate() 68 | val elem = Array("fear", "death", "water", "fire", "house") 69 | .zipWithIndex.map{ case (str, index) => 70 | FavoriteCaseClass(str, index, 10L, 10e-6F, s"${str}@gmail.com")} 71 | val rdd = sc.parallelize(elem) 72 | luceneRDD = LuceneRDD(rdd) 73 | val response = luceneRDD.query("*:*", 10) 74 | val schema = response.toDF().schema 75 | 76 | schema.nonEmpty should equal(true) 77 | schema.fieldNames.contains("name") should equal(true) 78 | schema.fieldNames.contains("age") should equal(true) 79 | schema.fieldNames.contains("myLong") should equal(true) 80 | schema.fieldNames.contains("myFloat") should equal(true) 81 | schema.fieldNames.contains("email") should equal(true) 82 | 83 | schema.fields(schema.fieldIndex("name")).dataType should 84 | equal(org.apache.spark.sql.types.StringType) 85 | schema.fields(schema.fieldIndex("age")).dataType should 86 | equal(org.apache.spark.sql.types.IntegerType) 87 | schema.fields(schema.fieldIndex("myLong")).dataType should 88 | equal(org.apache.spark.sql.types.LongType) 89 | schema.fields(schema.fieldIndex("myFloat")).dataType should 90 | equal(org.apache.spark.sql.types.FloatType) 91 | schema.fields(schema.fieldIndex("email")).dataType should 92 | equal(org.apache.spark.sql.types.StringType) 93 | } 94 | 95 | "LuceneRDDResponseSpec.toDF()" should "return score,shardIndex,docId with correct types" in { 96 | implicit val sparkSession: SparkSession = SparkSession.builder().getOrCreate() 97 | val elem = Array("fear", "death", "water", "fire", "house") 98 | .zipWithIndex.map { case (str, index) => 99 | FavoriteCaseClass(str, index, 10L, 10e-6F, s"${str}@gmail.com") 100 | } 101 | val rdd = sc.parallelize(elem) 102 | luceneRDD = LuceneRDD(rdd) 103 | val response = luceneRDD.query("*:*", 10) 104 | val schema = response.toDF().schema 105 | 106 | schema.nonEmpty should equal(true) 107 | 108 | // Extra auxiliary fields that must exist on the DataFrame 109 | schema.fieldNames.contains(SparkScoreDoc.DocIdField) should equal(true) 110 | schema.fieldNames.contains(SparkScoreDoc.ShardField) should equal(true) 111 | schema.fieldNames.contains(SparkScoreDoc.ScoreField) should equal(true) 112 | 113 | 114 | schema.fields(schema.fieldIndex(SparkScoreDoc.DocIdField)).dataType should 115 | equal(org.apache.spark.sql.types.IntegerType) 116 | schema.fields(schema.fieldIndex(SparkScoreDoc.ShardField)).dataType should 117 | equal(org.apache.spark.sql.types.IntegerType) 118 | schema.fields(schema.fieldIndex(SparkScoreDoc.ScoreField)).dataType should 119 | 
equal(org.apache.spark.sql.types.FloatType) 120 | } 121 | 122 | 123 | "LuceneRDDResponseSpec.collect()" should "work when no results are found" in { 124 | val array = Array("aaa", "bbb", "ccc", "ddd", "eee") 125 | val rdd = sc.parallelize(array) 126 | luceneRDD = LuceneRDD(rdd) 127 | val result = luceneRDD.query("fff", 10) 128 | result.collect().length should be (0) 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /src/test/scala/org/zouzias/spark/lucenerdd/spatial/shape/ShapeLuceneRDDKnnSearchSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.zouzias.spark.lucenerdd.spatial.shape 18 | 19 | 20 | import com.holdenkarau.spark.testing.SharedSparkContext 21 | import org.apache.spark.SparkConf 22 | import org.zouzias.spark.lucenerdd._ 23 | import org.zouzias.spark.lucenerdd.spatial.shape.context.ContextLoader 24 | import org.zouzias.spark.lucenerdd.testing.LuceneRDDTestUtils 25 | import org.zouzias.spark.lucenerdd.models.SparkScoreDoc.ScoreField 26 | import org.scalatest.BeforeAndAfterEach 27 | import org.scalatest.flatspec.AnyFlatSpec 28 | import org.scalatest._ 29 | import matchers.should._ 30 | 31 | 32 | class ShapeLuceneRDDKnnSearchSpec extends AnyFlatSpec 33 | with Matchers 34 | with BeforeAndAfterEach 35 | with SharedSparkContext 36 | with ContextLoader 37 | with LuceneRDDTestUtils { 38 | 39 | val k = 6 40 | 41 | val Radius: Double = 5D 42 | 43 | var pointLuceneRDD: ShapeLuceneRDD[_, _] = _ 44 | 45 | override val conf: SparkConf = ShapeLuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). 46 | setMaster("local[*]"). 47 | setAppName("test"). 48 | set("spark.ui.enabled", "false"). 
49 | set("spark.app.id", appID)) 50 | 51 | override def afterEach() { 52 | pointLuceneRDD.close() 53 | } 54 | 55 | "ShapeLuceneRDD.knnSearch" should "return k-nearest neighbors (knn)" in { 56 | 57 | val rdd = sc.parallelize(cities) 58 | pointLuceneRDD = ShapeLuceneRDD(rdd) 59 | 60 | val results = pointLuceneRDD.knnSearch(Bern._1, k, "*:*").collect() 61 | 62 | results.length should equal(k) 63 | results.length should be > 0 64 | 65 | // Closest is Bern and fartherst is Toronto 66 | docTextFieldEq(results.head, "_1", Bern._2) should equal(true) 67 | docTextFieldEq(results.last, "_1", Toronto._2) should equal(true) 68 | 69 | // Distances must be sorted 70 | val revertedDists = results.map(x => x.getFloat(x.fieldIndex(ScoreField))).reverse 71 | sortedDesc(revertedDists) should equal(true) 72 | } 73 | 74 | "ShapeLuceneRDD.knnSearch" should "return k-nearest neighbors (prefix search)" in { 75 | 76 | val rdd = sc.parallelize(cities) 77 | pointLuceneRDD = ShapeLuceneRDD(rdd) 78 | 79 | val results = pointLuceneRDD.knnSearch(Bern._1, k, "_1:Mil*").collect() 80 | 81 | results.length should be <= k 82 | results.length should be > 0 83 | 84 | // Closest is Bern and farthest is Toronto 85 | docTextFieldEq(results.head, "_1", Milan._2) should equal(true) 86 | 87 | // Distances must be sorted 88 | val revertedDists = results.map(x => x.getFloat(x.fieldIndex(ScoreField))).reverse 89 | sortedDesc(revertedDists) should equal(true) 90 | } 91 | 92 | "ShapeLuceneRDD.knnSearch" should "return k-nearest neighbors (fuzzy search)" in { 93 | 94 | val rdd = sc.parallelize(cities) 95 | pointLuceneRDD = ShapeLuceneRDD(rdd) 96 | 97 | val results = pointLuceneRDD.knnSearch(Bern._1, k, "_1:Miln~1").collect() 98 | 99 | results.length should be <= k 100 | results.length should be > 0 101 | 102 | // Closest is Bern and farthest is Toronto 103 | docTextFieldEq(results.head, "_1", Milan._2) should equal(true) 104 | 105 | // Distances must be sorted 106 | val revertedDists = results.map(x => x.getFloat(x.fieldIndex(ScoreField))).reverse 107 | sortedDesc(revertedDists) should equal(true) 108 | } 109 | 110 | "ShapeLuceneRDD.knnSearch" should "return k-nearest neighbors (term query)" in { 111 | 112 | val rdd = sc.parallelize(cities) 113 | pointLuceneRDD = ShapeLuceneRDD(rdd) 114 | 115 | val results = pointLuceneRDD.knnSearch(Bern._1, k, "_1:Milan").collect() 116 | 117 | results.length should be <= k 118 | results.length should be > 0 119 | 120 | // Closest is Milan (due to filtering) 121 | docTextFieldEq(results.head, "_1", Milan._2) should equal(true) 122 | 123 | // Distances must be sorted 124 | val revertedDists = results.map(x => x.getFloat(x.fieldIndex(ScoreField))).reverse 125 | sortedDesc(revertedDists) should equal(true) 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /src/test/scala/org/zouzias/spark/lucenerdd/spatial/shape/ShapeLuceneRDDLinkageSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.zouzias.spark.lucenerdd.spatial.shape 18 | 19 | import com.holdenkarau.spark.testing.SharedSparkContext 20 | import org.apache.spark.SparkConf 21 | import org.apache.spark.sql.{Row, SparkSession} 22 | import org.zouzias.spark.lucenerdd.spatial.shape.context.ContextLoader 23 | import org.zouzias.spark.lucenerdd.testing.LuceneRDDTestUtils 24 | import org.zouzias.spark.lucenerdd.models.SparkScoreDoc.ScoreField 25 | import org.scalatest.BeforeAndAfterEach 26 | import org.scalatest.flatspec.AnyFlatSpec 27 | import org.scalatest._ 28 | import matchers.should._ 29 | 30 | 31 | // Required for implicit Document conversion 32 | import org.zouzias.spark.lucenerdd._ 33 | 34 | case class City(name: String, x: Double, y: Double) 35 | 36 | class ShapeLuceneRDDLinkageSpec extends AnyFlatSpec 37 | with Matchers 38 | with BeforeAndAfterEach 39 | with SharedSparkContext 40 | with ContextLoader 41 | with LuceneRDDTestUtils { 42 | 43 | val k = 6 44 | 45 | val Radius: Double = 5D 46 | 47 | var pointLuceneRDD: ShapeLuceneRDD[_, _] = _ 48 | 49 | override val conf = ShapeLuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf(). 50 | setMaster("local[*]"). 51 | setAppName("test"). 52 | set("spark.ui.enabled", "false"). 53 | set("spark.app.id", appID)) 54 | 55 | override def afterEach() { 56 | pointLuceneRDD.close() 57 | } 58 | 59 | "ShapeLuceneRDD.linkByKnn" should "link correctly k-nearest neighbors (knn)" in { 60 | 61 | val citiesRDD = sc.parallelize(cities) 62 | pointLuceneRDD = ShapeLuceneRDD(citiesRDD) 63 | pointLuceneRDD.cache() 64 | 65 | val linker = (x: ((Double, Double), String)) => x._1 66 | 67 | val linkage = pointLuceneRDD.linkByKnn(citiesRDD, linker, k) 68 | 69 | linkage.count() should equal(cities.length) 70 | 71 | linkage.collect().foreach{ case (city, knnResults) => 72 | 73 | // top result should be linked with its query result 74 | val doc = knnResults.head 75 | city._2 should equal(doc.getString(doc.fieldIndex("_1"))) 76 | 77 | // Must return only at most k results 78 | knnResults.length should be <= k 79 | 80 | // Distances must be sorted 81 | val revertedDists = knnResults.map(x => x.getFloat(x.fieldIndex(ScoreField))).reverse 82 | sortedDesc(revertedDists) should equal(true) 83 | } 84 | } 85 | 86 | "ShapeLuceneRDD.linkByRadius" should "link correctly countries with capitals" in { 87 | 88 | val Radius = 50.0 89 | val sparkSession = SparkSession.builder.getOrCreate() 90 | import sparkSession.implicits._ 91 | val countriesRDD = sparkSession.read.parquet("data/countries-poly.parquet") 92 | .select("name", "shape") 93 | .map(row => (row.getString(1), row.getString(0))) 94 | 95 | pointLuceneRDD = ShapeLuceneRDD(countriesRDD) 96 | pointLuceneRDD.cache() 97 | 98 | val capitals = sparkSession.read.parquet("data/capitals.parquet") 99 | .select("name", "shape") 100 | .map(row => (row.getString(1), row.getString(0))) 101 | 102 | /** 103 | * Convert WKT Point to (Double, Double) 104 | * @param city 105 | * @return 106 | */ 107 | def coords(city: (String, String)): (Double, Double) = { 108 | val str = city._1 109 | val nums = str.dropWhile(x => 
x.compareTo('(') != 0).drop(1).dropRight(1) 110 | val coords = nums.split(" ").map(_.trim) 111 | (coords(0).toDouble, coords(1).toDouble) 112 | } 113 | 114 | val linkage = pointLuceneRDD.linkByRadius(capitals.rdd, coords, Radius).collect() 115 | 116 | linkage.length should equal(capitals.count) 117 | 118 | linkage.exists{case (cap, results) => 119 | cap._2 == "Bern" && docTextFieldEq(results, "_1", "Switzerland")} should equal(true) 120 | linkage.exists{case (cap, results) => 121 | cap._2 == "Berlin" && docTextFieldEq(results, "_1", "Germany")} should equal(true) 122 | linkage.exists{case (cap, results) => 123 | cap._2 == "Ottawa" && docTextFieldEq(results, "_1", "Canada")} should equal(true) 124 | linkage.exists{case (cap, results) => 125 | cap._2 == "Paris" && docTextFieldEq(results, "_1", "France")} should equal(true) 126 | 127 | } 128 | 129 | "ShapeLuceneRDD.linkDataFrameByKnn" should "link correctly k-nearest neighbors (knn)" in { 130 | 131 | val sparkSession = SparkSession.builder.getOrCreate() 132 | import sparkSession.implicits._ 133 | val citiesRDD = sc.parallelize(cities) 134 | pointLuceneRDD = ShapeLuceneRDD(citiesRDD) 135 | pointLuceneRDD.cache() 136 | 137 | val citiesDF = citiesRDD.map(x => City(x._2, x._1._1, x._1._2)).toDF 138 | val linker = (x: Row) => (x.getDouble(1), x.getDouble(2)) 139 | 140 | val linkage = pointLuceneRDD.linkDataFrameByKnn(citiesDF, linker, k) 141 | 142 | linkage.count() should equal(cities.length) 143 | 144 | linkage.collect().foreach { case (city, knnResults) => 145 | 146 | // top result should be linked with its query result 147 | docTextFieldEq(knnResults, "_1", city.getString(0)) should equal(true) 148 | 149 | // Must return only at most k results 150 | knnResults.length should be <= k 151 | 152 | // Distances must be sorted 153 | val revertedDists = knnResults.map(x => x.getFloat(x.fieldIndex(ScoreField))).reverse 154 | sortedDesc(revertedDists) should equal(true) 155 | } 156 | 157 | } 158 | 159 | } 160 | -------------------------------------------------------------------------------- /src/test/scala/org/zouzias/spark/lucenerdd/spatial/shape/implicits/ShapeLuceneRDDImplicitsSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
}
--------------------------------------------------------------------------------
/src/test/scala/org/zouzias/spark/lucenerdd/spatial/shape/implicits/ShapeLuceneRDDImplicitsSpec.scala:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.zouzias.spark.lucenerdd.spatial.shape.implicits

import com.holdenkarau.spark.testing.SharedSparkContext
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.zouzias.spark.lucenerdd.spatial.shape._
import org.zouzias.spark.lucenerdd.testing.LuceneRDDTestUtils
import org.zouzias.spark.lucenerdd._
import org.zouzias.spark.lucenerdd.spatial.shape.context.ContextLoader

import org.scalatest.BeforeAndAfterEach
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest._
import matchers.should._


class ShapeLuceneRDDImplicitsSpec extends AnyFlatSpec
  with Matchers
  with BeforeAndAfterEach
  with SharedSparkContext
  with ContextLoader
  with LuceneRDDTestUtils {

  val Radius: Double = 5D

  override val conf = ShapeLuceneRDDKryoRegistrator.registerKryoClasses(new SparkConf().
    setMaster("local[*]").
    setAppName("test").
    set("spark.ui.enabled", "false").
    set("spark.app.id", appID))

  "ShapeLuceneRDDImplicits" should "implicitly convert to point" in {

    val rdd = sc.parallelize(cities)
    val shapeRDD = ShapeLuceneRDD(rdd)

    shapeRDD.count should equal(cities.length)
  }

  "ShapeLuceneRDDImplicits" should "implicitly convert to circle" in {

    val circleCities: Array[(((Double, Double), Double), String)] =
      cities.map(convertToCircle)
    val rdd = sc.parallelize(circleCities)
    val shapeRDD = ShapeLuceneRDD(rdd)

    shapeRDD.count should equal(circleCities.length)
  }

  "ShapeLuceneRDDImplicits" should "implicitly convert to rectangle" in {

    val rectangleCities = cities.map(convertToRectangle)
    val rdd = sc.parallelize(rectangleCities)
    val shapeRDD = ShapeLuceneRDD(rdd)

    shapeRDD.count should equal(rectangleCities.length)
  }
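  // Worked example (illustrative): with Radius = 5, convertToRectangle maps Bern at
  // (7.45, 46.95) to the bounding box (xMin, xMax, yMin, yMax) =
  // (2.45, 12.45, 41.95, 51.95); see LuceneRDDTestUtils further below.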
  "ShapeLuceneRDDImplicits" should "implicitly convert POINTS from WKT" in {
    val sparkSession = SparkSession.builder().getOrCreate()
    import sparkSession.implicits._
    val citiesDF = sparkSession.read.parquet("data/world-cities-points.parquet")
    val citiesRDD = citiesDF.map(row =>
      (row.getString(2), (row.getString(0), row.getString(1))))

    val total = citiesDF.count()
    total > 0 should equal(true)

    val shapeRDD = ShapeLuceneRDD(citiesRDD)

    shapeRDD.count > 0 should equal(true)
  }

  "ShapeLuceneRDDImplicits" should "implicitly convert BBOX from WKT" in {
    val sparkSession = SparkSession.builder().getOrCreate()
    import sparkSession.implicits._
    val countriesDF = sparkSession.read.parquet("data/countries-bbox.parquet")
    val countriesRDD = countriesDF.map(row =>
      (row.getString(2), (row.getString(0), row.getString(1))))

    val total = countriesDF.count()
    total > 0 should equal(true)

    val shapeRDD = ShapeLuceneRDD(countriesRDD)

    shapeRDD.count > 0 should equal(true)
  }

  "ShapeLuceneRDDImplicits" should "implicitly convert to polygon" in {

    val polygonCities = cities.map(convertToPolygon(_, Radius))
    val rdd = sc.parallelize(polygonCities)
    val shapeRDD = ShapeLuceneRDD(rdd)

    shapeRDD.count should equal(polygonCities.length)
  }

}
--------------------------------------------------------------------------------
/src/test/scala/org/zouzias/spark/lucenerdd/testing/LuceneRDDTestUtils.scala:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.zouzias.spark.lucenerdd.testing

import org.apache.spark.sql.Row
import org.zouzias.spark.lucenerdd.models.SparkScoreDoc

trait LuceneRDDTestUtils {

  // City coordinates are (longitude, latitude)
  val Bern = ((7.45, 46.95), "Bern")
  val Zurich = ((8.55, 47.366667), "Zurich")
  val Lausanne = ((6.6335, 46.519833), "Lausanne")
  val Athens = ((23.716667, 37.966667), "Athens")
  val Toronto = ((-79.4, 43.7), "Toronto")
  val Milan = ((9.198, 45.4646), "Milan")
  val cities = Array(Bern, Zurich, Lausanne, Athens, Milan, Toronto)

  def Radius: Double

  def convertToCircle(city: ((Double, Double), String)): (((Double, Double), Double), String) = {
    ((city._1, Radius), city._2)
  }

  def convertToRectangle(city: ((Double, Double), String))
  : ((Double, Double, Double, Double), String) = {
    val x = city._1._1
    val y = city._1._2

    ((x - Radius, x + Radius, y - Radius, y + Radius), city._2)
  }

  def convertToPolygon(city: ((Double, Double), String), width: Double)
  : (Array[(Double, Double)], String) = {
    val x = city._1._1
    val y = city._1._2

    val coords = Array((x - width, y - width), (x - width, y + width),
      (x + width, y + width), (x + width, y - width), (x - width, y - width))
    (coords, city._2)
  }
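  // Note: despite the "Eq" suffix, the helpers below match by substring containment
  // (String#contains) rather than strict equality; e.g. a stored field value of
  // "Berne" (hypothetical) would satisfy docTextFieldEq(row, "_1", "Bern").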
"===========================================" 9 | echo "Browse to http://localhost:8080/" 10 | echo "===========================================" 11 | echo "===========================================" 12 | 13 | docker-compose up 14 | -------------------------------------------------------------------------------- /version.sbt: -------------------------------------------------------------------------------- 1 | ThisBuild / versionScheme := Some("early-semver") 2 | ThisBuild / version := "0.4.1-SNAPSHOT" 3 | --------------------------------------------------------------------------------