├── .gitignore
├── Procfile
├── README.md
├── build.sbt
├── project
│   ├── build.properties
│   └── plugins.sbt
└── src
    ├── main
    │   ├── scala
    │   │   ├── JettyLauncher.scala
    │   │   ├── demo
    │   │   │   └── MyScalatraFilter.scala
    │   │   ├── download
    │   │   │   └── ImageDownloader.scala
    │   │   └── search
    │   │       ├── documents
    │   │       │   ├── Document.scala
    │   │       │   ├── MockDocument.scala
    │   │       │   ├── NationalArchiveDocument.scala
    │   │       │   └── QueryDocument.scala
    │   │       ├── indexing
    │   │       │   ├── InvertedIndex.scala
    │   │       │   └── SearchRanker.scala
    │   │       ├── managers
    │   │       │   ├── LuceneSearchManager.scala
    │   │       │   └── SearchManager.scala
    │   │       ├── parsing
    │   │       │   ├── Parser.scala
    │   │       │   └── stopWords.txt
    │   │       └── result
    │   │           ├── Result.scala
    │   │           └── Snippet.scala
    │   └── webapp
    │       ├── WEB-INF
    │       │   ├── scalate
    │       │   │   └── layouts
    │       │   │       └── default.scaml
    │       │   └── web.xml
    │       └── static
    │           ├── css
    │           │   ├── bootstrap-responsive.css
    │           │   ├── bootstrap-responsive.min.css
    │           │   ├── bootstrap.css
    │           │   ├── bootstrap.min.css
    │           │   ├── bootstrap2.css
    │           │   ├── jqueryui.css
    │           │   ├── lightbox.css
    │           │   ├── main.css
    │           │   ├── timeline.css
    │           │   └── timeline.png
    │           └── js
    │               ├── backbone.js
    │               ├── bootstrap.js
    │               ├── bootstrap.min.js
    │               ├── custom
    │               │   ├── masonryloader.js
    │               │   ├── search.js
    │               │   └── timelineloader.js
    │               ├── jquery-1.7.2.min.js
    │               ├── jquery-ui-1.8.16.custom.min.js
    │               ├── jquery.imagesloaded.min.js
    │               ├── jquery.isotope.min.js
    │               ├── jquery.knob.js
    │               ├── jquery.lazyload.min.js
    │               ├── jquery.masonry.min.js
    │               ├── lightbox.js
    │               ├── mustache.js
    │               ├── spin.min.js
    │               ├── storyjs-embed.js
    │               ├── timeline-min.js
    │               └── underscore.js
    ├── resources
    │   ├── PhotoMetaData10000.csv
    │   ├── documents
    │   │   ├── bible
    │   │   │   ├── 1Chronicles.txt
    │   │   │   ├── 1Corinthians.txt
    │   │   │   ├── 1John.txt
    │   │   │   ├── 1Kings.txt
    │   │   │   ├── 1Peter.txt
    │   │   │   ├── 1Samuel.txt
    │   │   │   ├── 1Thessalonians.txt
    │   │   │   ├── 1Timothy.txt
    │   │   │   ├── 2Chronicles.txt
    │   │   │   ├── 2Corinthians.txt
    │   │   │   ├── 2John.txt
    │   │   │   ├── 2Kings.txt
    │   │   │   ├── 2Peter.txt
    │   │   │   ├── 2Samuel.txt
    │   │   │   ├── 2Thessalonians.txt
    │   │   │   ├── 2Timothy.txt
    │   │   │   ├── 3John.txt
    │   │   │   ├── Acts.txt
    │   │   │   ├── Amos.txt
    │   │   │   ├── Colossians.txt
    │   │   │   ├── Daniel.txt
    │   │   │   ├── Dedicatory.txt
    │   │   │   ├── Deuteronomy.txt
    │   │   │   ├── Ecclesiastes.txt
    │   │   │   ├── Ephesians.txt
    │   │   │   ├── Esther.txt
    │   │   │   ├── Exodus.txt
    │   │   │   ├── Ezekiel.txt
    │   │   │   ├── Ezra.txt
    │   │   │   ├── Galatians.txt
    │   │   │   ├── Genesis.txt
    │   │   │   ├── Habakkuk.txt
    │   │   │   ├── Haggai.txt
    │   │   │   ├── Hebrews.txt
    │   │   │   ├── Hosea.txt
    │   │   │   ├── Isaiah.txt
    │   │   │   ├── James.txt
    │   │   │   ├── Jeremiah.txt
    │   │   │   ├── Job.txt
    │   │   │   ├── Joel.txt
    │   │   │   ├── John.txt
    │   │   │   ├── Jonah.txt
    │   │   │   ├── Joshua.txt
    │   │   │   ├── Jude.txt
    │   │   │   ├── Judges.txt
    │   │   │   ├── Lamentations.txt
    │   │   │   ├── Leviticus.txt
    │   │   │   ├── Luke.txt
    │   │   │   ├── Malachi.txt
    │   │   │   ├── Mark.txt
    │   │   │   ├── Matthew.txt
    │   │   │   ├── Micah.txt
    │   │   │   ├── Nahum.txt
    │   │   │   ├── Nehemiah.txt
    │   │   │   ├── Numbers.txt
    │   │   │   ├── Obadiah.txt
    │   │   │   ├── Philemon.txt
    │   │   │   ├── Philippians.txt
    │   │   │   ├── Preface.txt
    │   │   │   ├── Preface_w_footnotes.txt
    │   │   │   ├── Proverbs.txt
    │   │   │   ├── Psalms.txt
    │   │   │   ├── Revelation.txt
    │   │   │   ├── Romans.txt
    │   │   │   ├── Ruth.txt
    │   │   │   ├── SongofSolomon.txt
    │   │   │   ├── Titus.txt
    │   │   │   ├── Zechariah.txt
    │   │   │   └── Zephaniah.txt
    │   │   └── mopp
    │   │       ├── A_01_01.txt
    │   │       ├── A_01_03.txt
    │   │       ├── A_01_04.txt
    │   │       ├── A_01_05.txt
    │   │       ├── A_02_01.txt
    │   │       ├── A_02_02.txt
    │   │       ├── A_02_03.txt
    │   │       ├── A_02_04.txt
    │   │       ├── A_02_05.txt
    │   │       ├── A_02_06.txt
    │   │       ├── A_03_01.txt
    │   │       ├── A_03_02.txt
    │   │       ├── A_03_03.txt
    │   │       ├── A_03_04.txt
    │   │       ├── A_03_05.txt
    │   │       ├── A_03_06.txt
    │   │       ├── A_03_07.txt
    │   │       ├── A_03_08.txt
    │   │       ├── A_03_09.txt
    │   │       ├── A_04_01.txt
    │   │       ├── A_06_01.txt
    │   │       ├── A_07_01.txt
    │   │       ├── A_08_01.txt
    │   │       ├── A_08_02.txt
    │   │       ├── A_08_03.txt
    │   │       ├── A_08_04.txt
    │   │       ├── A_08_05.txt
    │   │       ├── A_08_06.txt
    │   │       ├── A_08_07.txt
    │   │       ├── A_08_08.txt
    │   │       ├── A_08_09.txt
    │   │       ├── A_09_01.txt
    │   │       ├── A_09_02.txt
    │   │       ├── A_09_03.txt
    │   │       ├── A_09_04.txt
    │   │       ├── A_09_05.txt
    │   │       ├── A_09_06.txt
    │   │       ├── A_09_07.txt
    │   │       ├── A_09_08.txt
    │   │       ├── B_01_01.txt
    │   │       ├── B_02_01.txt
    │   │       ├── B_02_02.txt
    │   │       ├── B_02_03.txt
    │   │       ├── B_03_01.txt
    │   │       ├── B_03_02.txt
    │   │       ├── B_03_03.txt
    │   │       ├── B_03_05.txt
    │   │       ├── B_03_06.txt
    │   │       ├── B_03_07.txt
    │   │       ├── B_03_08.txt
    │   │       ├── B_04_01.txt
    │   │       ├── B_04_02.txt
    │   │       ├── B_04_03.txt
    │   │       ├── B_04_04.txt
    │   │       ├── B_04_05.txt
    │   │       ├── B_04_06.txt
    │   │       ├── B_04_07.txt
    │   │       ├── B_04_08.txt
    │   │       ├── B_04_10.txt
    │   │       ├── B_04_11.txt
    │   │       ├── B_05_01.txt
    │   │       ├── B_05_02.txt
    │   │       ├── B_05_03.txt
    │   │       ├── B_05_04.txt
    │   │       ├── B_05_05.txt
    │   │       ├── B_05_06.txt
    │   │       ├── B_06_01.txt
    │   │       ├── B_06_02.txt
    │   │       ├── B_06_03.txt
    │   │       ├── B_06_04.txt
    │   │       ├── B_06_05.txt
    │   │       ├── B_06_06.txt
    │   │       ├── B_06_07.txt
    │   │       ├── B_06_08.txt
    │   │       ├── B_06_09.txt
    │   │       ├── B_07_01.txt
    │   │       ├── B_07_02.txt
    │   │       ├── B_07_03.txt
    │   │       ├── B_07_04.txt
    │   │       ├── B_07_05.txt
    │   │       ├── B_07_06.txt
    │   │       ├── B_07_07.txt
    │   │       ├── B_07_08.txt
    │   │       ├── B_07_09.txt
    │   │       ├── B_07_10.txt
    │   │       ├── B_07_11.txt
    │   │       ├── B_07_12.txt
    │   │       ├── B_07_13.txt
    │   │       ├── B_08_01.txt
    │   │       ├── B_08_02.txt
    │   │       ├── B_08_03.txt
    │   │       ├── B_08_05.txt
    │   │       ├── B_08_06.txt
    │   │       ├── B_09_01.txt
    │   │       ├── B_09_02.txt
    │   │       ├── B_09_03.txt
    │   │       ├── B_09_05.txt
    │   │       ├── B_09_07.txt
    │   │       ├── B_10_01.txt
    │   │       ├── B_11_01.txt
    │   │       ├── B_11_02.txt
    │   │       ├── B_11_03.txt
    │   │       ├── B_11_04.txt
    │   │       ├── B_12_01.txt
    │   │       ├── B_12_02.txt
    │   │       ├── B_12_03.txt
    │   │       ├── B_12_04.txt
    │   │       ├── B_12_05.txt
    │   │       ├── B_12_06.txt
    │   │       ├── B_12_07.txt
    │   │       ├── B_12_08.txt
    │   │       ├── B_12_09.txt
    │   │       ├── C_01_01.txt
    │   │       ├── C_01_02.txt
    │   │       ├── C_01_03.txt
    │   │       ├── C_02_01.txt
    │   │       ├── C_03_01.txt
    │   │       ├── C_03_02.txt
    │   │       ├── C_03_03.txt
    │   │       ├── C_03_04.txt
    │   │       ├── C_03_05.txt
    │   │       ├── C_04_01.txt
    │   │       ├── C_04_02.txt
    │   │       ├── C_04_03.txt
    │   │       ├── C_04_04.txt
    │   │       ├── C_04_05.txt
    │   │       ├── C_04_06.txt
    │   │       ├── C_04_07.txt
    │   │       ├── C_05_01.txt
    │   │       ├── C_05_02.txt
    │   │       ├── C_05_03.txt
    │   │       ├── C_06_01.txt
    │   │       ├── C_06_02.txt
    │   │       ├── C_06_03.txt
    │   │       ├── C_06_04.txt
    │   │       ├── C_07_01.txt
    │   │       ├── D_01_01.txt
    │   │       ├── D_01_02.txt
    │   │       ├── D_02.txt
    │   │       ├── D_02_01.txt
    │   │       ├── D_02_02.txt
    │   │       ├── D_02_06.txt
    │   │       ├── D_02_07.txt
    │   │       ├── D_02_08.txt
    │   │       ├── D_03_01.txt
    │   │       ├── D_04_01.txt
    │   │       ├── D_04_02.txt
    │   │       ├── D_04_03.txt
    │   │       ├── D_04_04.txt
    │   │       ├── D_04_05.txt
    │   │       ├── D_05_01.txt
    │   │       ├── D_05_02.txt
    │   │       ├── D_05_03.txt
    │   │       ├── D_05_04.txt
    │   │       ├── D_05_05.txt
    │   │       ├── D_06_01.txt
    │   │       ├── D_06_02.txt
    │   │       ├── D_06_03.txt
    │   │       ├── D_06_04.txt
    │   │       ├── D_06_05.txt
    │   │       ├── D_06_06.txt
    │   │       ├── D_06_07.txt
    │   │       ├── D_07_01.txt
    │   │       ├── E_01_01.txt
    │   │       ├── E_01_02.txt
    │   │       ├── E_01_03.txt
    │   │       ├── E_01_04.txt
    │   │       ├── E_02_01.txt
    │   │       ├── E_03_01.txt
    │   │       ├── E_04_01.txt
    │   │       ├── E_04_02.txt
    │   │       ├── E_04_03.txt
    │   │       ├── E_04_04.txt
    │   │       ├── E_04_05.txt
    │   │       ├── E_05_01.txt
    │   │       ├── E_06_01.txt
    │   │       ├── E_06_02.txt
    │   │       ├── E_06_03.txt
    │   │       ├── E_06_04.txt
    │   │       ├── E_06_05.txt
    │   │       ├── E_06_06.txt
    │   │       ├── E_06_07.txt
    │   │       ├── E_06_08.txt
    │   │       ├── E_07_01.txt
    │   │       ├── E_07_02.txt
    │   │       ├── E_07_03.txt
    │   │       ├── E_07_04.txt
    │   │       ├── E_08_01.txt
    │   │       ├── E_09_01.txt
    │   │       ├── E_09_02.txt
    │   │       ├── E_09_03.txt
    │   │       ├── E_10_01.txt
    │   │       ├── E_10_02.txt
    │   │       ├── E_10_03.txt
    │   │       ├── E_10_04.txt
    │   │       ├── E_11_01.txt
    │   │       ├── E_11_02.txt
    │   │       ├── E_11_03.txt
    │   │       ├── E_11_04.txt
    │   │       ├── E_11_05.txt
    │   │       ├── E_11_06.txt
    │   │       ├── F_01_01.txt
    │   │       ├── F_01_02.txt
    │   │       ├── F_01_03.txt
    │   │       ├── F_01_04.txt
    │   │       ├── F_01_05.txt
    │   │       ├── F_01_06.txt
    │   │       ├── F_01_07.txt
    │   │       ├── F_01_08.txt
    │   │       ├── F_01_09.txt
    │   │       ├── F_01_10.txt
    │   │       ├── F_01_11.txt
    │   │       ├── F_01_12.txt
    │   │       ├── F_02_01.txt
    │   │       ├── F_02_02.txt
    │   │       ├── F_02_03.txt
    │   │       ├── F_03_01.txt
    │   │       ├── F_03_03.txt
    │   │       ├── F_03_04.txt
    │   │       ├── F_03_05.txt
    │   │       ├── F_03_06.txt
    │   │       ├── F_04_01.txt
    │   │       ├── F_04_02.txt
    │   │       ├── F_05_01.txt
    │   │       ├── F_06_01.txt
    │   │       ├── F_06_02.txt
    │   │       ├── F_06_03.txt
    │   │       ├── G_01_01.txt
    │   │       ├── G_02_01.txt
    │   │       ├── G_02_02.txt
    │   │       ├── G_02_03.txt
    │   │       ├── G_03_01.txt
    │   │       ├── G_03_02.txt
    │   │       ├── G_03_03.txt
    │   │       ├── G_03_04.txt
    │   │       ├── G_04_01.txt
    │   │       ├── G_05_01.txt
    │   │       ├── G_05_02.txt
    │   │       ├── G_05_03.txt
    │   │       ├── G_05_04.txt
    │   │       ├── G_05_05.txt
    │   │       ├── G_05_06.txt
    │   │       ├── G_05_07.txt
    │   │       ├── G_06_01.txt
    │   │       ├── G_07_01.txt
    │   │       ├── G_07_02.txt
    │   │       ├── G_07_03.txt
    │   │       ├── G_08_01.txt
    │   │       ├── G_08_02.txt
    │   │       ├── G_08_03.txt
    │   │       ├── G_08_04.txt
    │   │       ├── H_01_01.txt
    │   │       ├── H_02_01.txt
    │   │       ├── H_03_01.txt
    │   │       ├── H_03_02.txt
    │   │       ├── H_03_03.txt
    │   │       ├── H_03_04.txt
    │   │       ├── H_03_05.txt
    │   │       ├── H_03_06.txt
    │   │       ├── H_04_01.txt
    │   │       ├── H_04_02.txt
    │   │       ├── H_04_03.txt
    │   │       ├── H_04_04.txt
    │   │       ├── H_05_01.txt
    │   │       ├── H_05_02.txt
    │   │       ├── H_05_03.txt
    │   │       ├── H_06_01.txt
    │   │       ├── H_06_02.txt
    │   │       ├── H_06_03.txt
    │   │       ├── H_07_01.txt
    │   │       ├── I_01_01.txt
    │   │       ├── I_01_02.txt
    │   │       ├── I_02_01.txt
    │   │       ├── I_02_02.txt
    │   │       ├── I_02_04.txt
    │   │       ├── I_02_05.txt
    │   │       ├── I_02_06.txt
    │   │       ├── I_03_01.txt
    │   │       ├── I_03_02.txt
    │   │       ├── I_04_01.txt
    │   │       ├── I_05_01.txt
    │   │       ├── I_05_02.txt
    │   │       ├── I_06_01.txt
    │   │       ├── I_07_01.txt
    │   │       ├── I_07_02.txt
    │   │       ├── I_07_03.txt
    │   │       ├── I_08.txt
    │   │       ├── app_A_1_1_2.txt
    │   │       ├── app_B_7_9_1.txt
    │   │       ├── app_D_7_1_1.txt
    │   │       └── index.txt
    │   ├── stopWords.txt
    │   └── testDocument.txt
    └── test
        └── scala
            └── search
                ├── documents
                │   └── TestDocument.scala
                ├── indexing
                │   ├── TestSearchRanker.scala
                │   └── TestSearchRanker2.scala
                ├── managers
                │   ├── TestSearchManager.scala
                │   ├── TestSearchManagerOnLucene.scala
                │   └── TestSearchManagerOnMopp.scala
                └── parser
                    └── TestParser.scala
/.gitignore:
--------------------------------------------------------------------------------
1 | # use glob syntax.
2 | syntax: glob
3 | *.ser
4 | *.class
5 | *~
6 | *.bak
7 | #*.off
8 | *.old
9 |
10 | # eclipse conf file
11 | .settings
12 | .classpath
13 | .project
14 | .manager
15 | .scala_dependencies
16 |
17 | # idea
18 | .idea
19 | *.iml
20 |
21 | # building
22 | target
23 | build
24 | null
25 | tmp*
26 | temp*
27 | dist
28 | test-output
29 | build.log
30 |
31 | # other scm
32 | .svn
33 | .CVS
34 | .hg*
35 | *.jpg
36 |
37 | # switch to regexp syntax.
38 | # syntax: regexp
39 | # ^\.pc/
40 |
41 | # stray output not in target directory
42 | build.log
43 | .DS_Store
44 |
--------------------------------------------------------------------------------
/Procfile:
--------------------------------------------------------------------------------
1 | web: target/start
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
1 | import com.typesafe.startscript.StartScriptPlugin
2 |
3 | organization := "com.github.dbousamra"
4 |
5 | name := "search-engine-scala"
6 |
7 | version := "0.0.1"
8 |
9 | scalaVersion := "2.9.2"
10 |
11 | seq(webSettings :_*)
12 |
13 | classpathTypes ~= (_ + "orbit")
14 |
15 | seq(StartScriptPlugin.startScriptForClassesSettings: _*)
16 |
17 | libraryDependencies ++= Seq(
18 | "org.apache.lucene" % "lucene-core" % "3.6.1",
19 | "xstream" % "xstream" % "1.2.2",
20 | "net.liftweb" % "lift-json_2.9.1" % "2.4",
21 | "net.sf.opencsv" % "opencsv" % "2.0",
22 | "junit" % "junit" % "4.8.1" % "test",
23 | "org.scalatest" %% "scalatest" % "1.8" % "test",
24 | "org.scalaz" %% "scalaz-core" % "6.0.4",
25 | "org.scalatra" % "scalatra" % "2.1.1",
26 | "org.scalatra" % "scalatra-scalate" % "2.1.1",
27 | "org.scalatra" % "scalatra-specs2" % "2.1.1" % "test",
28 | "ch.qos.logback" % "logback-classic" % "1.0.6" % "runtime",
29 | "edu.mit" % "jwi" % "2.2.1",
30 | "org.eclipse.jetty" % "jetty-webapp" % "8.1.7.v20120910" % "container;test;provided",
31 | "org.eclipse.jetty.orbit" % "javax.servlet" % "3.0.0.v201112011016" % "container;provided;test" artifacts (Artifact("javax.servlet", "jar", "jar"))
32 | )
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version=0.12.0
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | resolvers += Classpaths.typesafeResolver
2 |
3 | resolvers += "Web plugin repo" at "http://siasia.github.com/maven2"
4 |
5 | libraryDependencies += "com.github.siasia" % "xsbt-web-plugin_2.9.2" % "0.12.0-0.2.11.1"
6 |
7 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.1.0")
8 |
9 | addSbtPlugin("com.typesafe.startscript" % "xsbt-start-script-plugin" % "0.5.3")
--------------------------------------------------------------------------------
/src/main/scala/JettyLauncher.scala:
--------------------------------------------------------------------------------
1 | import org.eclipse.jetty.server.Server
2 | import org.eclipse.jetty.servlet.{DefaultServlet, ServletContextHandler}
3 | import net.srirangan.MyScalatraFilter
4 | import org.eclipse.jetty.webapp.WebAppContext
5 |
6 | object JettyLauncher {
7 | def main(args: Array[String]) {
8 | val port = if(System.getenv("PORT") != null) System.getenv("PORT").toInt else 8080
9 |
10 | val server = new Server(port)
11 | val context = new WebAppContext()
12 | context setContextPath "/"
13 | context.setResourceBase("src/main/webapp")
14 | context.addServlet(classOf[MyScalatraFilter], "/*")
15 | context.addServlet(classOf[DefaultServlet], "/")
16 |
17 | server.setHandler(context)
18 |
19 | server.start
20 | server.join
21 | }
22 | }
--------------------------------------------------------------------------------
/src/main/scala/demo/MyScalatraFilter.scala:
--------------------------------------------------------------------------------
1 | package net.srirangan
2 | import org.scalatra._
3 | import java.net.URL
4 | import scalate.ScalateSupport
5 | import search.managers.SearchManager
6 | import java.io.File
7 | import net.liftweb.json.JsonDSL._
8 | import net.liftweb.json._
9 | import search.documents.NationalArchiveDocument
10 | import search.documents.NationalArchiveDocumentManager
11 | import search.managers.LuceneSearchManager
12 |
13 | class MyScalatraFilter extends ScalatraServlet with ScalateSupport {
14 |
15 | override implicit val contentType = "text/html"
16 | private val searchManager = new SearchManager[NationalArchiveDocument]()
17 | private val documentManager = new NationalArchiveDocumentManager()
18 | searchManager.addToIndex(documentManager.parse("src/resources/PhotoMetaData10000Replaced.csv"))
19 |
20 | get("/") {
21 | scaml("home")
22 | }
23 |
24 | get("/timeline") {
25 | scaml("timeline")
26 | }
27 |
28 | get("/timeline/data") {
29 | val queryString = "gold coast"
30 | val results = searchManager.query(queryString)
31 | val json = (
32 | ("timeline") ->
33 | ("headline" -> "National Archives of Australia")
34 | ~ ("text" -> "Search stuff")
35 | ~ ("type" -> "default")
36 | ~ ("date" -> results.map {
37 | p =>
38 | (
39 | ("startDate" -> p.document.year.toString)
40 | ~ ("headline" -> p.document.barcode)
41 | ~ ("text" -> p.document.description)
42 | ~ ("asset" -> ("media" -> p.document.largeImageURL)))
43 | }))
44 | pretty(render(json))
45 | }
46 |
47 | get("/search") {
48 | val queryString = params("query")
49 | val results = searchManager.query(queryString)
50 | val min = if (results.isEmpty) 0 else results.minBy(_.document.year).document.year
51 | val max = if (results.isEmpty) 0 else results.maxBy(_.document.year).document.year
52 | val json = (
53 | ("results" -> results.map (
54 | p =>
55 | (
56 | ("barcode" -> p.document.barcode)
57 | ~ ("description" -> p.document.description)
58 | ~ ("score" -> p.score)
59 | ~ ("year" -> p.document.year)
60 | ~ ("smallImageURL" -> p.document.smallImageURL)
61 | ~ ("largeImageURL" -> p.document.largeImageURL)
62 | ~ ("location" -> p.document.location))
63 | ))
64 | ~ ("resultsLength" -> results.length)
65 | ~ ("startDate" -> min)
66 | ~ ("endDate" -> max))
67 | pretty(render(json))
68 | }
69 |
70 | }
71 |
--------------------------------------------------------------------------------
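
A quick way to exercise the /search route above once the app is running locally (a sketch only; it assumes the default port 8080 from JettyLauncher.scala, and "query" is the parameter name read by params("query")):

    import java.net.URLEncoder
    import scala.io.Source

    object SearchRouteExample {
      def main(args: Array[String]): Unit = {
        val url = "http://localhost:8080/search?query=" +
          URLEncoder.encode("gold coast", "UTF-8")
        // prints the pretty-rendered JSON produced by the /search route
        println(Source.fromURL(url).mkString)
      }
    }
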
/src/main/scala/download/ImageDownloader.scala:
--------------------------------------------------------------------------------
1 | package download
2 |
3 | import search.documents.NationalArchiveDocument
4 | import search.documents.NationalArchiveDocumentManager
5 | import java.io._
6 |
7 | object ImageDownloader {
8 |
9 | def main(args: Array[String]): Unit = {
10 | val documents = new NationalArchiveDocumentManager().parse("someFile")
11 | // val documents = new NationalArchiveDocumentManager().parse("src/resources/PhotoMetaData10000.csv")
12 | val out = new java.io.FileWriter(new File("someFile2"))
13 | documents.foreach { x =>
14 | // val source = scala.io.Source.fromURL(x.smallImageURL)
15 | // println(x.barcode)
16 | // val writer = new PrintWriter(new File("images", x.barcode + ".jpg"))
17 | // writer.write(source.mkString(""))
18 | // writer.close()
19 | val lo = "images/" + x.barcode+ ".jpg"
20 | out.write(x.barcode + "," + "\"" + x.description.replaceAll("\"", "\"\"") + "\"" + "," + x.year.toString + "," + x.location + "," + lo + "," + lo + "\n")
21 |
22 | }
23 | out.close
24 | }
25 | }
--------------------------------------------------------------------------------
/src/main/scala/search/documents/Document.scala:
--------------------------------------------------------------------------------
1 | package search.documents
2 |
3 | abstract class Document(val words: List[String]) {
4 |
5 | def getWordCount(word: String) = counts.getOrElse(word, 0)
6 |
7 | private lazy val counts = words.foldLeft(collection.mutable.HashMap[String, Int]()) {
8 | (map, word) => map += word -> (map.getOrElse(word, 0) + 1)
9 | }
10 |
11 | }
--------------------------------------------------------------------------------
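
A minimal sketch of how the lazily built word-count map behaves; PlainDocument here is a hypothetical subclass used only for illustration:

    import search.documents.Document

    object DocumentExample {
      class PlainDocument(words: List[String]) extends Document(words)

      def main(args: Array[String]): Unit = {
        val doc = new PlainDocument(List("gold", "coast", "gold"))
        println(doc.getWordCount("gold"))  // 2
        println(doc.getWordCount("reef"))  // 0 (missing words fall back to 0)
      }
    }
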
/src/main/scala/search/documents/MockDocument.scala:
--------------------------------------------------------------------------------
1 | package search.documents
2 |
3 | import java.io.File
4 | import search.parsing.Parser
5 | import search.parsing.Parser._
6 |
7 | class MockDocument(val _name: Option[String], val file: Option[File], words: List[String]) extends Document(words) {
8 |
9 | val name = file match {
10 | case Some(file) => file.getName()
11 | case None => _name.getOrElse("Untitled")
12 | }
13 |
14 | override def toString = name
15 | }
16 |
17 | class MockDocumentManager {
18 |
19 | private val parser = new Parser()
20 | val removeStopWords: Boolean = true
21 |
22 | def parseFolder(folder: File): List[MockDocument] = {
23 | folder.listFiles().toList.map(f => parse(f))
24 | }
25 |
26 | def parse(file: File): MockDocument = {
27 | val words = parser.parse(file, removeStopWords)
28 | new MockDocument(Some(file.getName()), Some(file), words)
29 | }
30 |
31 | def parseText(input: String, removeStopWords: Boolean = true): MockDocument = {
32 | val words = parser.parse(input, removeStopWords)
33 | new MockDocument(None, None, words)
34 | }
35 |
36 |
37 | }
--------------------------------------------------------------------------------
/src/main/scala/search/documents/NationalArchiveDocument.scala:
--------------------------------------------------------------------------------
1 | package search.documents
2 |
3 | import au.com.bytecode.opencsv.CSVReader
4 | import scala.collection.JavaConverters._
5 | import java.io.FileReader
6 | import search.parsing.Parser
7 | import search.parsing.Parser._
8 |
9 | class NationalArchiveDocument(
10 | val barcode: String,
11 | title: List[String],
12 | val description: String,
13 | val year: Int,
14 | val location: String,
15 | val largeImageURL: String,
16 | val smallImageURL: String) extends Document(title)
17 |
18 | class NationalArchiveDocumentManager {
19 |
20 | private val parser = new Parser()
21 |
22 | val photos = "src/resources/PhotoMetaData.csv"
23 |
24 | def parse(filename: String): Seq[NationalArchiveDocument] = {
25 | val reader = new CSVReader(new FileReader(filename));
26 | // reader.readAll().asScala.tail.map(parseRow).toList
27 |
28 | val iterator = Iterator.continually(reader.readNext()).takeWhile(_ != null)
29 | iterator.toSeq.tail.map(parseRow)
30 | }
31 |
32 | def parseRow(row: Array[String]): NationalArchiveDocument = {
33 | new NationalArchiveDocument(
34 | barcode = row(0),
35 | description = row(1),
36 | title = parser.parse(row(1)),
37 | year = row(2).toInt,
38 | location = row(3),
39 | largeImageURL = row(4),
40 | smallImageURL = row(5))
41 | }
42 | }
--------------------------------------------------------------------------------
/src/main/scala/search/documents/QueryDocument.scala:
--------------------------------------------------------------------------------
1 | package search.documents
2 |
3 | import search.parsing.Parser
4 | import search.parsing.Parser._
5 |
6 | class QueryDocument(words: List[String]) extends Document(words)
7 |
8 | class QueryDocumentManager {
9 |
10 | val parser = new Parser
11 |
12 | def parseText(input: String, removeStopWords: Boolean = true): QueryDocument = {
13 | val words = parser.parse(input, removeStopWords)
14 | new QueryDocument(words)
15 | }
16 | }
--------------------------------------------------------------------------------
/src/main/scala/search/indexing/InvertedIndex.scala:
--------------------------------------------------------------------------------
1 | package search.indexing
2 |
3 | import scala.collection.mutable.LinkedHashMap
4 | import search.documents.Document
5 | import search.documents.MockDocument
6 | import search.documents.QueryDocument
7 | import scala.collection.mutable.ArrayBuffer
8 |
9 | class InvertedIndex[T <: Document] {
10 |
11 | val index = new LinkedHashMap[String, LinkedHashMap[T, Int]]
12 | val weights = new LinkedHashMap[T, Double]
13 | val names = new ArrayBuffer[T]()
14 | private var _totalDocumentsIndexed = 0
15 |
16 | def addDocumentToIndex(document: T*) = {
17 | document.foreach { d =>
18 | d.words.foreach { word =>
19 | val x = index.getOrElseUpdate(word, LinkedHashMap(d -> 0))
20 | x.put(d, x.get(d).getOrElse(0) + 1)
21 | }
22 | incrementTotalDocumentsIndexed()
23 | }
24 | for (doc <- document) {
25 | calculateVectorSpaces(doc)
26 | names += (doc)
27 | }
28 | }
29 |
30 | def calculateVectorSpaces(document: T) = {
31 | weights.put(document, vectorWeights(document))
32 | }
33 |
34 | def similarity(query: QueryDocument, document: T) = {
35 | dotProduct(query, document) / (vectorWeights(query) * weights.get(document).get)
36 | // dotProduct(query, document) / (vectorWeights(query) * vectorWeights(document))
37 | }
38 |
39 | def vectorWeights(document: Document) = {
40 | val weights = index.map { word =>
41 | math.pow(tfidf(word._1, document), 2)
42 | }
43 | math.sqrt(weights.sum)
44 | }
45 |
46 | /**
47 | * http://c2.com/cgi/wiki?DotProductInManyProgrammingLanguages
48 | */
49 | private def dp[T <% Double](as: Iterable[T], bs: Iterable[T]) = {
50 | require(as.size == bs.size)
51 | (for ((a, b) <- as zip bs) yield a * b) sum
52 | }
53 |
54 | def dotProduct(query: QueryDocument, document: T) = {
55 | val queryTfidfs = index.map(word => tfidf(word._1, query))
56 | val documentTfidfs = index.map(word => tfidf(word._1, document))
57 | dp(queryTfidfs, documentTfidfs)
58 | }
59 |
60 | def normalize(word: String, document: T) = {
61 | math.sqrt(document.words.foldLeft(0D)((accum, w) => accum + math.pow(idf(w), 2)))
62 | }
63 |
64 | def tf(word: String, document: Document) = {
65 | val count = document.getWordCount(word)
66 | if (count > 0) count
67 | else 0.0
68 | }
69 |
70 | def idf(word: String) = {
71 | val occursInAll: Double = index.get(word) match {
72 | case Some(occurrence) => occurrence.size
73 | case None => 0
74 | }
75 | val idf = 1.0 + math.log10(totalDocumentsIndexed / occursInAll)
76 | if (idf.isNaN()) 0.0 else idf
77 | }
78 |
79 | def tfidf(word: String, document: Document) = {
80 | val tfw = tf(word, document)
81 | if (tfw == 0) 0 else tfw * idf(word)
82 | }
83 |
84 | def getAllRelevantDocuments(words: List[String]): List[T] = {
85 | words.map(word => index.get(word).getOrElse(Nil).map(x => x._1).toList).flatten.distinct
86 | }
87 |
88 | def containsDocument(document: T) = names.contains(document)
89 |
90 | def incrementTotalDocumentsIndexed() = _totalDocumentsIndexed += 1
91 |
92 | def totalDocumentsIndexed = _totalDocumentsIndexed
93 |
94 | override def toString = index.mkString("\n")
95 | }
--------------------------------------------------------------------------------
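
A small sketch of how the index is typically driven, using the MockDocumentManager and QueryDocumentManager defined elsewhere in this repo (it assumes the stop-word resource is on the classpath). similarity is the cosine of the two tf-idf vectors, i.e. dotProduct(query, doc) / (|query| * |doc|):

    import search.documents.{MockDocument, MockDocumentManager, QueryDocumentManager}
    import search.indexing.InvertedIndex

    object InvertedIndexExample {
      def main(args: Array[String]): Unit = {
        val docs  = new MockDocumentManager()
        val index = new InvertedIndex[MockDocument]()

        // addDocumentToIndex is a varargs method; each call updates the term counts,
        // the total document count and the precomputed vector weights.
        index.addDocumentToIndex(
          docs.parseText("surfers paradise on the gold coast"),
          docs.parseText("the national archives of australia"))

        val query = new QueryDocumentManager().parseText("gold coast")
        index.names.foreach(doc => println(index.similarity(query, doc)))
      }
    }
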
/src/main/scala/search/indexing/SearchRanker.scala:
--------------------------------------------------------------------------------
1 | package search.indexing
2 |
3 | import search.documents.Document
4 | import search.result.Result
5 | import search.documents.QueryDocument
6 |
7 | class SearchRanker[T <: Document](val index: InvertedIndex[T]) {
8 |
9 | def query(inputQuery: QueryDocument): List[Result[T]] = {
10 | val documents = index.getAllRelevantDocuments(inputQuery.words)
11 | // println("relevant documents " + documents)
12 | documents.map(doc => query(inputQuery, doc)).sortBy(_.score).reverse
13 | }
14 |
15 | def queryer(input: String) = {
16 | List(1,2,3).reduce(_+_)
17 | }
18 |
19 | def query(query: QueryDocument, document: T): Result[T] = {
20 | val score = index.similarity(query, document)
21 | new Result[T](document, score)
22 | }
23 | }
--------------------------------------------------------------------------------
/src/main/scala/search/managers/LuceneSearchManager.scala:
--------------------------------------------------------------------------------
1 | package search.managers
2 |
3 | import scala.Array.canBuildFrom
4 | import scala.collection.mutable.LinkedHashMap
5 |
6 | import org.apache.lucene.analysis.standard.StandardAnalyzer
7 | import org.apache.lucene.document.Document
8 | import org.apache.lucene.document.Field
9 | import org.apache.lucene.index.IndexWriter
10 | import org.apache.lucene.queryParser.QueryParser
11 | import org.apache.lucene.search.IndexSearcher
12 | import org.apache.lucene.store.RAMDirectory
13 | import org.apache.lucene.util.Version
14 |
15 | import com.thoughtworks.xstream.XStream
16 |
17 | import search.documents.{Document => doc}
18 | import search.result.Result
19 |
20 | class LuceneSearchManager[T <: doc] {
21 |
22 | val analyzer = new StandardAnalyzer(Version.LUCENE_36) // match the QueryParser version used below
23 | val directory = new RAMDirectory();
24 | val writer = new IndexWriter(directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED)
25 | val mapper = new LinkedHashMap[Int, T]
26 |
27 | def addToIndex(documents: Traversable[T]) = {
28 | var id = 0
29 | documents.foreach { d =>
30 | val doc = simpleDoc(id, d)
31 | writer.addDocument(doc)
32 | mapper += id -> d
33 | id += 1
34 | }
35 | writer.commit
36 | writer.close
37 | }
38 |
39 | private def simpleDoc(id: Int, d: T) = {
40 | val doc = new Document()
41 | doc.add(new Field("content", d.words.mkString(" "), Field.Store.YES, Field.Index.ANALYZED))
42 | doc.add(new Field("id", id.toString, Field.Store.YES, Field.Index.NO))
43 | doc
44 | }
45 |
46 | def query(input: String) = {
47 | val searcher = new IndexSearcher(directory)
48 | val q = new QueryParser(Version.LUCENE_36, "content", analyzer).parse(input);
49 | val docs = searcher.search(q, 100)
50 | val xstream = new XStream()
51 |
52 | val results = docs.scoreDocs map { docId =>
53 | val d = searcher.doc(docId.doc)
54 | val backToDocument = mapper.get(d.get("id").toInt).get
55 | new Result(backToDocument, docId.score)
56 | }
57 |
58 | searcher.close
59 | results.toList
60 | }
61 | }
--------------------------------------------------------------------------------
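
The Lucene-backed manager is used much like the hand-rolled index; a sketch (documents here come from MockDocumentManager, and addToIndex should only be called once because it closes the IndexWriter):

    import search.documents.{MockDocument, MockDocumentManager}
    import search.managers.LuceneSearchManager

    object LuceneSearchExample {
      def main(args: Array[String]): Unit = {
        val docs   = new MockDocumentManager()
        val lucene = new LuceneSearchManager[MockDocument]()

        lucene.addToIndex(List(
          docs.parseText("gold coast queensland beaches"),
          docs.parseText("national archives of australia canberra")))

        // query returns List[Result[MockDocument]] ordered by Lucene's relevance score
        lucene.query("gold").foreach(println)
      }
    }
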
/src/main/scala/search/managers/SearchManager.scala:
--------------------------------------------------------------------------------
1 | package search.managers
2 |
3 | import scala.Array.canBuildFrom
4 | import search.documents.Document
5 | import search.documents.QueryDocumentManager
6 | import search.indexing.InvertedIndex
7 | import search.indexing.SearchRanker
8 | import search.parsing.Parser
9 | import search.parsing.Parser.string2Iterator
10 |
11 | class SearchManager[T <: Document] {
12 |
13 | private val _index = new InvertedIndex[T]()
14 | private val ranker = new SearchRanker[T](index)
15 | private val parser: Parser = new Parser()
16 |
17 | def addToIndex(documents: Traversable[T]): List[T] = {
18 | documents.map(addToIndex).toList
19 | }
20 |
21 | def addToIndex(document: T): T = {
22 | if (index.containsDocument(document)) {
23 | document
24 | } else {
25 | _index.addDocumentToIndex(document)
26 | document
27 | }
28 | }
29 |
30 | def query(input: String) = {
31 | val queryable = new QueryDocumentManager().parseText(input)
32 | ranker.query(queryable).filter(d => d.score > 0.0).take(100)
33 | }
34 |
35 | def queryMatch(input: String) = {
36 | val queryable = new QueryDocumentManager().parseText(input)
37 | // return every indexed term (with its postings) that starts with the raw input
38 | _index.index.filter(_._1.startsWith(input))
39 | }
40 |
41 | def index = _index
42 |
43 | }
--------------------------------------------------------------------------------
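
An end-to-end sketch of SearchManager against the bundled bible corpus (assuming the process is started from the repo root so the src/resources path from the tree above resolves):

    import java.io.File
    import search.documents.{MockDocument, MockDocumentManager}
    import search.managers.SearchManager

    object SearchManagerExample {
      def main(args: Array[String]): Unit = {
        val manager   = new SearchManager[MockDocument]()
        val documents = new MockDocumentManager().parseFolder(new File("src/resources/documents/bible"))

        manager.addToIndex(documents)

        // query filters out zero scores and caps the result list at 100 entries
        manager.query("shepherd").foreach(println)
      }
    }
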
/src/main/scala/search/parsing/Parser.scala:
--------------------------------------------------------------------------------
1 | package search.parsing
2 |
3 | import scala.io.Source
4 | import java.io.File
5 | import java.util.Locale
6 | import java.io.InputStream
7 |
8 | object Parser {
9 | implicit def file2Iterator(file: File) = {
10 | Source.fromFile(file, "latin1").getLines()
11 | }
12 | implicit def string2Iterator(input: String) = {
13 | List(input).toIterator
14 | }
15 | }
16 |
17 | class Parser {
18 |
19 | private val STOP_WORDS = "search/parsing/stopWords.txt";
20 | private val stopWords: Set[String] = parseStopWords(getClass.getClassLoader.getResourceAsStream(STOP_WORDS))
21 |
22 | def parseStopWords(stream: InputStream) = Source.fromInputStream(stream).getLines().toSet
23 |
24 | def parse(input: Iterator[String], removeStopWords: Boolean = true): List[String] = {
25 | input.map { x =>
26 | val words = getWordsFromLine(x)
27 | if (removeStopWords)
28 | filterStopWords(words)
29 | else
30 | words
31 | }.toList.flatten
32 | }
33 |
34 | private val getWordsFromLine = (line: String) => {
35 | line.split(" ")
36 | .map(_.toLowerCase())
37 | .map(word => word.filter(Character.isLetter(_)))
38 | // .filter(_.length() > 1)
39 | .toList
40 | }
41 |
42 | private val filterStopWords = (words: List[String]) => {
43 | words.filterNot(word => stopWords.contains(word))
44 | }
45 | }
--------------------------------------------------------------------------------
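
The two implicit conversions in the companion object let parse accept either a raw String or a File; a sketch (testDocument.txt is the sample file under src/resources, and the stop-word list must be on the classpath):

    import java.io.File
    import search.parsing.Parser
    import search.parsing.Parser._ // brings file2Iterator and string2Iterator into scope

    object ParserExample {
      def main(args: Array[String]): Unit = {
        val parser = new Parser()

        // a String is lifted to Iterator[String] by string2Iterator
        println(parser.parse("The Gold Coast of Australia"))                          // stop words removed
        println(parser.parse("The Gold Coast of Australia", removeStopWords = false)) // lowercased, non-letters stripped, stop words kept

        // a File is lifted to its lines by file2Iterator
        println(parser.parse(new File("src/resources/testDocument.txt")).take(10))
      }
    }
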
/src/main/scala/search/result/Result.scala:
--------------------------------------------------------------------------------
1 | package search.result
2 |
3 | import search.documents.Document
4 | import scala.collection.mutable.Map
5 |
6 | case class Result[T <: Document](document: T, var score: Double, snippet: Snippet) {
7 |
8 |
9 | def this(document: T, score: Double) = {
10 | this(document, score, new Snippet(document))
11 | }
12 |
13 | override def toString = "\n" + document + ": Score=" + score
14 |
15 | }
--------------------------------------------------------------------------------
/src/main/scala/search/result/Snippet.scala:
--------------------------------------------------------------------------------
1 | package search.result
2 |
3 | import search.documents.Document
4 | import scala.io.Source
5 |
6 | class Snippet(document: Document) {
7 |
8 | val sentences = {
9 | //Source.fromFile(document.file.get).getLines.toList.flatten
10 | }
11 |
12 | }
--------------------------------------------------------------------------------
/src/main/webapp/WEB-INF/scalate/layouts/default.scaml:
--------------------------------------------------------------------------------
1 | -@ val title: String = "National Archives of Australia Search"
2 | -@ val headline: String = title
3 | -@ val body: String
4 |
5 | !!!
6 | %html
7 | %head
8 | %title= title
9 | %link(rel="stylesheet" type="text/css" href="/static/css/bootstrap2.css")
10 | %link(rel="stylesheet" type="text/css" href="/static/css/main.css")
11 |
12 | %script(type="text/javascript" src="/static/js/jquery-1.7.2.min.js")
13 | %script(type="text/javascript" src="/static/js/bootstrap.min.js")
14 | %body
15 |
16 | %div.navbar
17 | %div.navbar-inner
18 | %a.brand{:href => "#"}
19 | National Archives Search
20 |
21 | != body
--------------------------------------------------------------------------------
/src/main/webapp/WEB-INF/web.xml:
--------------------------------------------------------------------------------
1 |
2 |