├── .idea
├── .name
├── codeStyles
│ ├── codeStyleConfig.xml
│ └── Project.xml
├── kotlinc.xml
├── vcs.xml
├── jpa-buddy.xml
├── misc.xml
├── gradle.xml
├── workspace.xml
└── uiDesigner.xml
├── gradle.properties
├── fatjar.bat
├── settings.gradle.kts
├── env.bat
├── gradle
└── wrapper
│ ├── gradle-wrapper.jar
│ └── gradle-wrapper.properties
├── application.local.props.template
├── src
└── main
│ ├── resources
│ ├── application.props
│ ├── templates
│ │ ├── status.html
│ │ ├── about.html
│ │ ├── docs.html
│ │ ├── base.html
│ │ └── home.html
│ ├── log4j.properties
│ ├── META-INF
│ │ └── services
│ │ │ ├── org.apache.lucene.codecs.Codec
│ │ │ └── org.apache.lucene.codecs.PostingsFormat
│ └── tika-config.xml
│ └── kotlin
│ └── gr
│ └── serafeim
│ ├── db.kt
│ ├── web
│ ├── util.kt
│ └── routes.kt
│ ├── conf
│ └── config.kt
│ ├── cui.kt
│ ├── server.kt
│ └── search
│ ├── tools.kt
│ └── lucene_parser.kt.kt
├── .gitignore
├── .github
└── workflows
│ └── workflow.yml
├── LICENSE
├── tika-config-ocr.xml
├── gradlew.bat
├── gradlew
└── README.md
/.idea/.name:
--------------------------------------------------------------------------------
1 | docparser
--------------------------------------------------------------------------------
/gradle.properties:
--------------------------------------------------------------------------------
1 | kotlin.code.style=official
2 |
--------------------------------------------------------------------------------
/fatjar.bat:
--------------------------------------------------------------------------------
1 | REM gradlew buildFatJar
2 | gradlew shadowJar
3 |
--------------------------------------------------------------------------------
/settings.gradle.kts:
--------------------------------------------------------------------------------
1 |
2 | rootProject.name = "docparser"
3 |
4 |
--------------------------------------------------------------------------------
/env.bat:
--------------------------------------------------------------------------------
1 | set JAVA_HOME=c:\progr\java\jdk-18.0.2
2 | set PATH=c:\progr\java\jdk-18.0.2\bin;%PATH%
--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spapas/doc-parser-searcher/HEAD/gradle/wrapper/gradle-wrapper.jar
--------------------------------------------------------------------------------
/.idea/codeStyles/codeStyleConfig.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
--------------------------------------------------------------------------------
/.idea/kotlinc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/jpa-buddy.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | distributionBase=GRADLE_USER_HOME
2 | distributionPath=wrapper/dists
3 | distributionUrl=https\://services.gradle.org/distributions/gradle-8.2-bin.zip
4 | zipStoreBase=GRADLE_USER_HOME
5 | zipStorePath=wrapper/dists
--------------------------------------------------------------------------------
/.idea/codeStyles/Project.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/application.local.props.template:
--------------------------------------------------------------------------------
1 |
2 | parser.parseDirectory=.
3 | parser.dataDirectory=.
4 | parser.interval=60
5 | parser.pageSize=10
6 | parser.analyzerClazzString=org.apache.lucene.analysis.el.GreekAnalyzer
7 | parser.parseExtensions=doc,docx,xls,xlsx,ppt,pptx,odt,fodt,ods,fods,odp,fodp,txt,html,md,rtf,pdf
8 |
9 | server.port=8080
10 | server.host=127.0.0.1
11 | server.userUsername=
12 | server.userPassword=
13 | server.adminUsername=
14 | server.adminPassword=
--------------------------------------------------------------------------------
/src/main/resources/application.props:
--------------------------------------------------------------------------------
1 |
2 |
3 | parser.parseDirectory=.
4 | parser.dataDirectory=.
5 | parser.interval=60
6 | parser.pageSize=10
7 | parser.analyzerClazzString=org.apache.lucene.analysis.el.GreekAnalyzer
8 | parser.parseExtensions=doc,docx,xls,xlsx,ppt,pptx,odt,fodt,ods,fods,odp,fodp,txt,html,md,rst,rtf,pdf
9 |
10 | server.port=8080
11 | server.host=127.0.0.1
12 | server.userUsername=
13 | server.userPassword=
14 | server.adminUsername=
15 | server.adminPassword=
--------------------------------------------------------------------------------
/src/main/resources/templates/status.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 | {% block content %}
3 |
Status
4 |
5 | Parsing: {{ parsing }}
6 | Keys Size: {{ keySize }}
7 |
8 |
9 | Config server: {{ config.server }}
10 |
11 | Config parser: {{ config.parser }}
12 |
13 |
14 |
15 |
16 |
17 | Back
18 | {% endblock %}
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/.idea/gradle.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Set root logger level to INFO and attach the FILE and CONSOLE appenders.
2 | log4j.rootLogger=INFO, FILE, CONSOLE
3 |
4 | log4j.appender.FILE=org.apache.log4j.RollingFileAppender
5 | log4j.appender.FILE.file=logs/out.log
6 | log4j.appender.FILE.MaxFileSize=5MB
7 | log4j.appender.FILE.MaxBackupIndex=5
8 | log4j.appender.FILE.layout=org.apache.log4j.PatternLayout
9 | log4j.appender.FILE.layout.ConversionPattern=[%d{ISO8601}][%-5p][%t][%c{1}] %m%n
10 |
11 | log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
12 | log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
13 | log4j.appender.CONSOLE.layout.ConversionPattern=%-5p [%t]: %m%n
14 |
15 | log4j.logger.org.apache.pdfbox.pdmodel.font=error
--------------------------------------------------------------------------------
/src/main/kotlin/gr/serafeim/db.kt:
--------------------------------------------------------------------------------
1 | package gr.serafeim
2 |
3 | import gr.serafeim.conf.ConfigHolder
4 | import org.mapdb.DBMaker
5 | import org.mapdb.Serializer
6 | import org.slf4j.LoggerFactory
7 | import java.nio.file.Paths
8 |
9 |
/**
 * Creates the MapDB maker for the `map.db` file that lives inside the
 * configured parser data directory.
 *
 * Only the maker is returned; the caller decides which options to enable
 * and when to actually open the database.
 */
fun getFileDB(): DBMaker.Maker {
    val dbFile = Paths.get(ConfigHolder.config.parser.dataDirectory, "map.db").toFile()
    return DBMaker.fileDB(dbFile)
}
15 |
/**
 * Singleton that owns the MapDB database and the "docs" hash map stored in it.
 *
 * The database is opened with transactions enabled, so writers have to call
 * `db.commit()` themselves. Keys are strings; values go through Java
 * serialization.
 */
object DBHolder {
    val logger = LoggerFactory.getLogger("DBHolder")

    val db = getFileDB()
        .transactionEnable()
        .make()

    val map = db
        .hashMap("docs")
        .keySerializer(Serializer.STRING)
        .valueSerializer(Serializer.JAVA)
        .createOrOpen()

    init {
        logger.info("DB Singleton class invoked.")
    }
}
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .gradle
2 | build/
3 | !gradle/wrapper/gradle-wrapper.jar
4 | !**/src/main/**/build/
5 | !**/src/test/**/build/
6 |
7 | ### IntelliJ IDEA ###
8 | .idea/modules.xml
9 | .idea/jarRepositories.xml
10 | .idea/compiler.xml
11 | .idea/libraries/
12 | *.iws
13 | *.iml
14 | *.ipr
15 | out/
16 | !**/src/main/**/out/
17 | !**/src/test/**/out/
18 |
19 | ### Eclipse ###
20 | .apt_generated
21 | .classpath
22 | .factorypath
23 | .project
24 | .settings
25 | .springBeans
26 | .sts4-cache
27 | bin/
28 | !**/src/main/**/bin/
29 | !**/src/test/**/bin/
30 |
31 | ### NetBeans ###
32 | /nbproject/private/
33 | /nbbuild/
34 | /dist/
35 | /nbdist/
36 | /.nb-gradle/
37 |
38 | ### VS Code ###
39 | .vscode/
40 |
41 | ### Mac OS ###
42 | .DS_Store
43 |
44 | lucene_index/*
45 | map.db
46 | map.db.wal*
47 | *.local.props
48 | logs*
49 | *.log
--------------------------------------------------------------------------------
/src/main/resources/templates/about.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 | {% block content %}
3 | About Doc Parser Searcher
4 | An application that allows you to index and quickly search all your documents.
5 |
6 | More information:
7 | https://github.com/spapas/doc-parser-searcher
8 |
9 | Main tools used
10 |
16 |
17 | Copyright (c) 2024 Serafeim Papastefanos
18 |
19 | If you find this project useful consider
20 | buying me a coffee !
21 |
22 |
23 | Back
24 | {% endblock %}
--------------------------------------------------------------------------------
/src/main/resources/META-INF/services/org.apache.lucene.codecs.Codec:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | org.apache.lucene.codecs.simpletext.SimpleTextCodec
17 | org.apache.lucene.codecs.lucene99.Lucene99Codec
--------------------------------------------------------------------------------
/.github/workflows/workflow.yml:
--------------------------------------------------------------------------------
1 | name: Java CI
2 |
3 | on:
4 | push:
5 | tags:
6 | - '*'
7 |
8 | jobs:
9 | build:
10 | runs-on: ubuntu-latest
11 |
12 | steps:
13 | - uses: actions/checkout@v3
14 | - name: Set up JDK 18
15 | uses: actions/setup-java@v3
16 | with:
17 | java-version: '18'
18 | distribution: 'adopt'
19 | architecture: x64
20 | - name: Validate Gradle wrapper
21 | uses: gradle/wrapper-validation-action@ccb4328a959376b642e027874838f60f8e596de3
22 | - name: Build with Gradle
23 | uses: gradle/gradle-build-action@749f47bda3e44aa060e82d7b3ef7e40d953bd629
24 | with:
25 | arguments: shadowJar
26 | - uses: actions/upload-artifact@v3
27 | with:
28 | name: docparser.jar
29 | path: build/libs/docparser-all.jar
30 | - uses: ncipollo/release-action@v1
31 | name: Publish Release
32 | with:
33 | token: ${{secrets.GITHUB_TOKEN}}
34 | artifacts: build/libs/docparser-all.jar
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Serafeim Papastefanos
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/src/main/kotlin/gr/serafeim/web/util.kt:
--------------------------------------------------------------------------------
1 | package gr.serafeim.web
2 |
3 | import io.ktor.http.*
4 | import io.ktor.server.request.*
5 | import org.apache.lucene.document.DateTools
6 | import java.text.SimpleDateFormat
7 | import java.util.*
8 |
9 |
/**
 * Parses a `yyyy-MM-dd` string into a [Date].
 *
 * @param s the text to parse; the empty string means "no date"
 * @return the parsed date, or `null` when [s] is empty
 * @throws java.text.ParseException when [s] is non-empty but cannot be parsed
 */
fun toDate(s: String): Date? =
    if (s.isEmpty()) null else SimpleDateFormat("yyyy-MM-dd").parse(s)
17 |
/**
 * Builds the link to the page after [p] for the current request.
 *
 * @param req   the incoming request whose URI is reused as the base of the link
 * @param p     the current page number
 * @param n     the page size
 * @param total the total number of hits
 * @return the request path with `page` set to `p + 1`, or `"#"` when the
 *         first `n * p` hits already cover [total]
 */
fun nextPage(req: ApplicationRequest, p: Int, n: Int, total: Int): String {
    if (n * p >= total) {
        return "#"
    }
    val builder = URLBuilder(req.uri)
    builder.parameters["page"] = "${p + 1}"
    return builder.build().fullPath
}
27 |
/**
 * Builds the link to the page before [p] for the current request.
 *
 * @param req the incoming request whose URI is reused as the base of the link
 * @param p   the current page number
 * @return the request path with `page` set to `p - 1`, or `"#"` when [p] is
 *         not greater than 1
 */
fun prevPage(req: ApplicationRequest, p: Int): String {
    if (p <= 1) {
        return "#"
    }
    val builder = URLBuilder(req.uri)
    builder.parameters["page"] = "${p - 1}"
    return builder.build().fullPath
}
37 |
38 |
/**
 * Converts a string in Lucene's [DateTools] format back into a [Date]
 * by delegating to [DateTools.stringToDate].
 */
fun fromDateString(s: String): Date = DateTools.stringToDate(s)
42 |
/**
 * Returns the epoch-millisecond value of [d], falling back to [default]
 * when no date is given.
 */
fun dateToMillis(d: Date?, default: Long): Long = d?.time ?: default
49 |
--------------------------------------------------------------------------------
/src/main/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat
17 | org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat
18 | org.apache.lucene.codecs.memory.DirectPostingsFormat
19 | org.apache.lucene.codecs.memory.FSTPostingsFormat
20 | org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat
21 | org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat
22 | org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat
--------------------------------------------------------------------------------
/src/main/resources/templates/docs.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 | {% block content %}
3 | List of docs
4 |
5 | Size: {{ docSize }} docs
6 |
7 |
17 |
18 |
19 |
20 | Filename
21 | OK?
22 | Mod Date
23 |
24 | {% for k in docs %}
25 |
26 | {{ k.first }}
27 | {{ k.second.first }}
28 | {{ k.second.second }}
29 |
30 |
31 | {% endfor %}
32 |
33 |
34 |
35 |
36 |
41 |
42 |
43 | Back
44 |
45 |
46 | {% endblock %}
--------------------------------------------------------------------------------
/src/main/kotlin/gr/serafeim/conf/config.kt:
--------------------------------------------------------------------------------
1 | package gr.serafeim.conf
2 |
3 | import com.sksamuel.hoplite.*
4 | import org.apache.lucene.analysis.Analyzer
5 | import org.slf4j.LoggerFactory
6 | import java.io.File
7 |
8 |
/**
 * Settings from the `parser.*` configuration keys.
 *
 * @property parseDirectory      directory that is handed to the parser
 * @property dataDirectory       directory holding `map.db` and the Lucene index
 * @property interval            parse interval passed to the search initialiser
 *                               (unit not visible here — TODO confirm)
 * @property pageSize            number of results per page
 * @property parseExtensions     file extensions to parse; configured as a
 *                               comma-separated list such as `doc,docx,pdf`
 * @property analyzerClazzString fully qualified class name of the Lucene analyzer
 * @property externalTikaConfig  optional external Tika configuration
 *                               (presumably a file path — TODO confirm)
 */
data class Parser(
    val parseDirectory: String,
    val dataDirectory: String,
    val interval: Int,
    val pageSize: Int,
    // `List` needs an explicit element type to compile.
    val parseExtensions: List<String>,
    val analyzerClazzString: String,
    val externalTikaConfig: String?
)
/**
 * Settings from the `server.*` configuration keys: the bind address plus the
 * credentials for the user and admin areas.
 */
data class Server(
    val host: String,
    val port: Int,
    val userUsername: String,
    val userPassword: String,
    val adminUsername: String,
    val adminPassword: String
)
/** Root of the application configuration: parser and server sections. */
data class Config(
    val parser: Parser,
    val server: Server
)
18 |
19 |
/**
 * Singleton that loads the application configuration and exposes it.
 *
 * [init] must be called before [config] or [getAnalyzerInstance] is used,
 * because both backing properties are `lateinit`.
 */
object ConfigHolder {
    val logger = LoggerFactory.getLogger("ConfigHolder")
    private lateinit var analyzerClazz: Class<*>
    lateinit var config: Config

    /**
     * Loads the configuration and resolves the configured analyzer class.
     *
     * @param f optional configuration file; when given it is mandatory and is
     *          registered before the bundled `/application.props` resource
     *          (presumably so its values win — confirm against Hoplite docs)
     */
    fun init(f: File?) {
        val builder = ConfigLoaderBuilder.default()
        f?.let { builder.addFileSource(it, optional = false) }

        config = builder
            .addResourceSource("/application.props")
            .build()
            .loadConfigOrThrow<Config>()

        analyzerClazz = Class.forName(config.parser.analyzerClazzString)
    }

    /**
     * Creates a fresh analyzer through the no-argument constructor of the
     * configured class.
     *
     * @throws Exception when the configured class is not an [Analyzer]
     */
    fun getAnalyzerInstance(): Analyzer {
        val instance = analyzerClazz.getDeclaredConstructor().newInstance()
        return instance as? Analyzer
            ?: throw Exception("$analyzerClazz is not an analyzer")
    }
}
--------------------------------------------------------------------------------
/tika-config-ocr.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
8 |
9 |
10 |
11 |
12 | false
13 | gray
14 | 300
15 | 4
16 | false
17 | triangle
18 |
19 |
22 | eng
23 | 2147483647
24 | 0
25 | 1
26 |
27 | false
28 | 200
29 | false
30 |
31 | c:/util/tesseract-ocr
32 | 120
33 |
34 |
35 |
36 |
37 |
--------------------------------------------------------------------------------
/src/main/resources/tika-config.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
34 |
35 |
--------------------------------------------------------------------------------
/src/main/kotlin/gr/serafeim/cui.kt:
--------------------------------------------------------------------------------
1 | package gr.serafeim
2 |
3 | import com.github.ajalt.clikt.core.CliktCommand
4 | import com.github.ajalt.clikt.core.subcommands
5 | import com.github.ajalt.clikt.parameters.arguments.argument
6 | import com.github.ajalt.clikt.parameters.options.option
7 | import com.github.ajalt.clikt.parameters.types.file
8 | import gr.serafeim.conf.ConfigHolder
9 | import gr.serafeim.search.Result
10 | import gr.serafeim.search.SearchHolder
11 | import gr.serafeim.search.getLuceneDirName
12 | import gr.serafeim.search.parse
13 | import gr.serafeim.web.SearchParams
14 | import java.io.File
15 | import java.nio.file.Files
16 | import java.nio.file.Path
17 |
/** CLI subcommand that starts the web server. */
class Server : CliktCommand() {
    override fun run() = server()
}
23 |
/**
 * CLI subcommand that runs a single search and prints the path of every hit
 * on the first page (page size 10).
 */
class Search : CliktCommand() {
    val search by argument(help = "What to search for")

    override fun run() {
        println("Search")
        val found = SearchHolder.search(SearchParams(search, 10, 1))
        if (found.total == 0) {
            println("Empty results")
        }
        found.results.forEach { hit -> println(hit.path) }
    }
}
38 |
/**
 * CLI subcommand that prints the active configuration and the number of
 * documents known to the MapDB map and to the Lucene index.
 */
class Info : CliktCommand() {
    override fun run() {
        println("Info")
        println("- Config")
        println(ConfigHolder.config)

        val mapCount = DBHolder.map.keys.size
        println("- Number of docs on map $mapCount")

        // NOTE(review): Throwable is caught on purpose, presumably because a
        // failing SearchHolder initialiser surfaces as an Error — confirm.
        try {
            val indexCount = SearchHolder.getTotalDocs()
            println("- Number of docs on index $indexCount")
        } catch (e: Throwable) {
            println("- No lucene index")
        }
    }
}
55 |
/**
 * Root CLI command. It runs before the selected subcommand and loads the
 * configuration, optionally from the file given with `-c` / `--config`.
 */
class Main : CliktCommand() {
    val configFile by option("-c", "--config", help = "Config file").file()

    override fun run() = ConfigHolder.init(configFile)
}
64 |
/** CLI subcommand that parses the configured parse directory once. */
class Parse : CliktCommand() {
    override fun run() {
        val sourceDir = ConfigHolder.config.parser.parseDirectory
        println("Parsing, from directory $sourceDir")
        parse(sourceDir)
    }
}
72 |
/**
 * CLI subcommand that wipes all indexed data: it empties and commits the
 * MapDB "docs" map, then deletes the Lucene index directory recursively.
 */
class Clear : CliktCommand() {
    override fun run() {
        val dir = ConfigHolder.config.parser.dataDirectory
        println("Clearing data from directory $dir")
        DBHolder.map.clear()
        DBHolder.db.commit()

        val luceneDirName = getLuceneDirName()
        // Nothing to delete when no index was ever created; walking a missing
        // directory would otherwise fail with NoSuchFileException.
        if (!Files.exists(luceneDirName)) {
            return
        }

        // Files.walk keeps directory handles open until the stream is closed,
        // so the stream is closed explicitly via use {}.
        Files.walk(luceneDirName).use { paths ->
            paths
                // Reverse order puts children before their parent directory.
                .sorted(Comparator.reverseOrder())
                .map(Path::toFile)
                .forEach(File::delete)
        }
    }
}
86 |
/**
 * Program entry point: wires the subcommands under [Main] and hands the raw
 * command-line arguments to Clikt.
 */
fun main(args: Array<String>) = Main().subcommands(
    Server(), Search(), Parse(), Info(), Clear()
).main(args)
--------------------------------------------------------------------------------
/gradlew.bat:
--------------------------------------------------------------------------------
1 | @rem
2 | @rem Copyright 2015 the original author or authors.
3 | @rem
4 | @rem Licensed under the Apache License, Version 2.0 (the "License");
5 | @rem you may not use this file except in compliance with the License.
6 | @rem You may obtain a copy of the License at
7 | @rem
8 | @rem https://www.apache.org/licenses/LICENSE-2.0
9 | @rem
10 | @rem Unless required by applicable law or agreed to in writing, software
11 | @rem distributed under the License is distributed on an "AS IS" BASIS,
12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | @rem See the License for the specific language governing permissions and
14 | @rem limitations under the License.
15 | @rem
16 |
17 | @if "%DEBUG%" == "" @echo off
18 | @rem ##########################################################################
19 | @rem
20 | @rem Gradle startup script for Windows
21 | @rem
22 | @rem ##########################################################################
23 |
24 | @rem Set local scope for the variables with windows NT shell
25 | if "%OS%"=="Windows_NT" setlocal
26 |
27 | set DIRNAME=%~dp0
28 | if "%DIRNAME%" == "" set DIRNAME=.
29 | set APP_BASE_NAME=%~n0
30 | set APP_HOME=%DIRNAME%
31 |
32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter.
33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
34 |
35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
37 |
38 | @rem Find java.exe
39 | if defined JAVA_HOME goto findJavaFromJavaHome
40 |
41 | set JAVA_EXE=java.exe
42 | %JAVA_EXE% -version >NUL 2>&1
43 | if "%ERRORLEVEL%" == "0" goto execute
44 |
45 | echo.
46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
47 | echo.
48 | echo Please set the JAVA_HOME variable in your environment to match the
49 | echo location of your Java installation.
50 |
51 | goto fail
52 |
53 | :findJavaFromJavaHome
54 | set JAVA_HOME=%JAVA_HOME:"=%
55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
56 |
57 | if exist "%JAVA_EXE%" goto execute
58 |
59 | echo.
60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
61 | echo.
62 | echo Please set the JAVA_HOME variable in your environment to match the
63 | echo location of your Java installation.
64 |
65 | goto fail
66 |
67 | :execute
68 | @rem Setup the command line
69 |
70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
71 |
72 |
73 | @rem Execute Gradle
74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
75 |
76 | :end
77 | @rem End local scope for the variables with windows NT shell
78 | if "%ERRORLEVEL%"=="0" goto mainEnd
79 |
80 | :fail
81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
82 | rem the _cmd.exe /c_ return code!
83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
84 | exit /b 1
85 |
86 | :mainEnd
87 | if "%OS%"=="Windows_NT" endlocal
88 |
89 | :omega
90 |
--------------------------------------------------------------------------------
/src/main/kotlin/gr/serafeim/server.kt:
--------------------------------------------------------------------------------
1 | package gr.serafeim
2 |
3 | import gr.serafeim.conf.ConfigHolder
4 | import gr.serafeim.web.*
5 | import io.ktor.server.application.*
6 | import io.ktor.server.auth.*
7 | import io.ktor.server.engine.*
8 | import io.ktor.server.jetty.*
9 | import io.ktor.server.pebble.*
10 | import io.ktor.server.routing.*
11 | import io.pebbletemplates.pebble.loader.ClasspathLoader
12 | import org.slf4j.LoggerFactory
13 |
14 |
/**
 * Global mutable application state.
 *
 * `parsing` starts as `false`; presumably the parser toggles it while a run
 * is in progress — confirm against the parser code.
 */
object StateHolder {
    var parsing: Boolean = false
}
18 |
/**
 * Starts the search initialiser and then the embedded Jetty server, blocking
 * until the server stops.
 *
 * Two HTTP Basic providers are installed: one for the user routes (index,
 * about, download) and one for the admin routes (key list, status). A group
 * of routes is only protected when both its username and password are
 * configured as non-empty; otherwise it is served without authentication.
 */
fun server() {
    val logger = LoggerFactory.getLogger("server")
    logger.info("Starting server...")
    val config = ConfigHolder.config

    val userUsername = config.server.userUsername
    val userPassword = config.server.userPassword
    val adminUsername = config.server.adminUsername
    val adminPassword = config.server.adminPassword

    // Compares secrets in constant time so the response time does not reveal
    // how many leading characters of a guess were correct.
    fun sameSecret(expected: String, supplied: String): Boolean =
        java.security.MessageDigest.isEqual(expected.toByteArray(), supplied.toByteArray())

    // Both parts are always evaluated (no short-circuit) for the same reason.
    fun credentialsMatch(
        expectedName: String,
        expectedPassword: String,
        name: String,
        password: String
    ): Boolean {
        val nameOk = sameSecret(expectedName, name)
        val passwordOk = sameSecret(expectedPassword, password)
        return nameOk && passwordOk
    }

    gr.serafeim.search.init(config.parser.parseDirectory, config.parser.interval)

    embeddedServer(Jetty, port = config.server.port, host = config.server.host, watchPaths = listOf("classes", "resources")) {
        install(Pebble) {
            loader(ClasspathLoader().apply {
                prefix = "templates"
            })
        }

        install(Authentication) {
            basic("auth-basic-user") {
                realm = "User access"
                validate { credentials ->
                    if (credentialsMatch(userUsername, userPassword, credentials.name, credentials.password)) {
                        UserIdPrincipal(credentials.name)
                    } else {
                        null
                    }
                }
            }

            basic("auth-basic-admin") {
                realm = "Admin access"
                validate { credentials ->
                    if (credentialsMatch(adminUsername, adminPassword, credentials.name, credentials.password)) {
                        UserIdPrincipal(credentials.name)
                    } else {
                        null
                    }
                }
            }
        }

        routing {
            // Each route group is declared once and mounted either behind its
            // Basic provider or directly, depending on the configuration.
            val userRoutes: Route.() -> Unit = {
                index(config.parser.pageSize)
                aboutRoute()
                downloadFile()
            }
            val adminRoutes: Route.() -> Unit = {
                listKeysRoute()
                statusRoute()
            }

            if (userUsername != "" && userPassword != "") {
                authenticate("auth-basic-user") { userRoutes(this) }
            } else {
                userRoutes(this)
            }

            if (adminUsername != "" && adminPassword != "") {
                authenticate("auth-basic-admin") { adminRoutes(this) }
            } else {
                adminRoutes(this)
            }
        }
    }.start(wait = true)
}
88 |
--------------------------------------------------------------------------------
/src/main/resources/templates/base.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Doc search
8 |
9 |
14 |
15 |
16 |
34 | {% block extra_style %}{% endblock %}
35 |
36 |
37 |
38 |
39 |
64 |
65 |
66 | {% block content %}
67 | {% endblock %}
68 |
69 |
70 |
83 |
84 |
85 |
86 |
87 |
88 |
89 | {% block extra_script %}{% endblock %}
90 |
91 |
92 |
--------------------------------------------------------------------------------
/src/main/kotlin/gr/serafeim/search/tools.kt:
--------------------------------------------------------------------------------
1 | package gr.serafeim.search
2 |
3 | import gr.serafeim.conf.ConfigHolder
4 | import gr.serafeim.web.SearchParams
5 | import gr.serafeim.web.dateToMillis
6 | import gr.serafeim.web.fromDateString
7 | import org.apache.lucene.analysis.Analyzer
8 | import org.apache.lucene.document.Document
9 | import org.apache.lucene.document.LongPoint
10 | import org.apache.lucene.index.DirectoryReader
11 | import org.apache.lucene.index.Term
12 | import org.apache.lucene.queryparser.classic.QueryParser
13 | import org.apache.lucene.search.*
14 | import org.apache.lucene.search.highlight.Highlighter
15 | import org.apache.lucene.search.highlight.QueryScorer
16 | import org.apache.lucene.search.highlight.SimpleHTMLFormatter
17 | import org.apache.lucene.search.highlight.SimpleSpanFragmenter
18 | import org.apache.lucene.store.Directory
19 | import org.apache.lucene.store.FSDirectory
20 | import org.slf4j.Logger
21 | import org.slf4j.LoggerFactory
22 | import java.nio.file.Path
23 | import java.nio.file.Paths
24 | import java.util.*
25 |
26 |
/**
 * One page of search hits plus the total number of hits reported by the
 * collector.
 */
data class Results(val results: List<Result>, val total: Int)
28 |
/**
 * A single search hit as read back from the index.
 *
 * @property id         value of the stored `id` field
 * @property text       highlighted document text (truncated by the search code)
 * @property hfragments best highlighted fragments of the text
 * @property name       value of the stored `name` field
 * @property path       all values of the stored `path` field
 * @property created    creation date of the document
 * @property modified   modification date of the document
 * @property accessed   last-access date of the document
 */
data class Result(
    val id: String,
    val text: String,
    // `List` needs an explicit element type to compile.
    val hfragments: List<String>,
    val name: String,
    val path: List<String>,
    val created: Date,
    val modified: Date,
    val accessed: Date
)
39 |
/**
 * Adds a date range filter on the point field [what] to [bqb].
 *
 * Nothing is added when both bounds are `null`. A missing lower bound becomes
 * `0` and a missing upper bound becomes [Long.MAX_VALUE].
 *
 * @param bqb      the boolean query under construction
 * @param dateFrom lower bound, or `null` for open
 * @param dateTo   upper bound, or `null` for open
 * @param what     name of the `LongPoint` field to filter on
 */
fun addDateQuery(bqb: BooleanQuery.Builder, dateFrom: Date?, dateTo: Date?, what: String) {
    if (dateFrom == null && dateTo == null) {
        return
    }
    val lowerMillis = dateToMillis(dateFrom, 0L)
    val upperMillis = dateToMillis(dateTo, Long.MAX_VALUE)
    val rangeQuery: Query = LongPoint.newRangeQuery(what, lowerMillis, upperMillis)
    bqb.add(rangeQuery, BooleanClause.Occur.FILTER)
}
48 |
/** Path of the `lucene_index` directory inside the configured data directory. */
fun getLuceneDirName(): Path =
    Paths.get(ConfigHolder.config.parser.dataDirectory, "lucene_index")
52 |
/** Opens the Lucene [Directory] located at [getLuceneDirName]. */
fun getLuceneDir(): Directory = FSDirectory.open(getLuceneDirName())
56 |
/**
 * Singleton that owns the Lucene reader/searcher and runs queries against
 * the index.
 *
 * NOTE(review): the reader is opened once when this object is initialised
 * and is never reopened in this block, so documents indexed afterwards are
 * presumably not visible until the process restarts — confirm.
 */
object SearchHolder {
    private val logger: Logger = LoggerFactory.getLogger("Search")
    private val directory: Directory = getLuceneDir()
    private val reader: DirectoryReader = DirectoryReader.open(directory)
    private val indexSearcher = IndexSearcher(reader)
    private val analyzer: Analyzer = ConfigHolder.getAnalyzerInstance()

    init {
        logger.info("Search Singleton class invoked.")
    }

    /**
     * Builds the boolean query for [sp]: the search text must match at least
     * one of the `text`, `name` (wildcard) or `name_t` fields, and the
     * optional date, path and extension criteria are applied as filters.
     */
    private fun buildQuery(sp: SearchParams): BooleanQuery {
        // https://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
        val bqb = BooleanQuery.Builder()

        val textQuery = QueryParser("text", analyzer).parse(sp.q)
        bqb.add(textQuery, BooleanClause.Occur.SHOULD)

        val nameWildcardQuery: Query = WildcardQuery(Term("name", sp.q))
        bqb.add(nameWildcardQuery, BooleanClause.Occur.SHOULD)

        val nameTextQuery = QueryParser("name_t", analyzer).parse(sp.q)
        bqb.add(nameTextQuery, BooleanClause.Occur.SHOULD)

        // Set once; it applies to all SHOULD clauses of the builder.
        bqb.setMinimumNumberShouldMatch(1)

        addDateQuery(bqb, sp.createdFrom, sp.createdTo, "created_point")
        addDateQuery(bqb, sp.modifiedFrom, sp.modifiedTo, "modified_point")
        addDateQuery(bqb, sp.accessedFrom, sp.accessedTo, "accessed_point")

        if (sp.path != null && sp.path != "") {
            val pathQuery: Query = WildcardQuery(Term("path", sp.path))
            bqb.add(pathQuery, BooleanClause.Occur.FILTER)
        }

        if (sp.ext != null && sp.ext != "") {
            val extensionQuery: Query = WildcardQuery(Term("extension", sp.ext))
            bqb.add(extensionQuery, BooleanClause.Occur.FILTER)
        }

        return bqb.build()
    }

    /**
     * Runs the search described by [sp] and returns the requested page.
     *
     * @param sp query text, paging (`p` is the 1-based page, `n` the page
     *           size) and optional filters
     * @return the hits of the requested page with highlighted text and
     *         fragments, plus the total hit count
     */
    fun search(sp: SearchParams): Results {
        val booleanQuery = buildQuery(sp)
        val collector = TopScoreDocCollector.create(99999, 100)

        indexSearcher.search(booleanQuery, collector)

        // A page number below 1 would give a negative offset; clamp it so the
        // first page is returned instead of passing it on to topDocs().
        val start = ((sp.p - 1) * sp.n).coerceAtLeast(0)
        val howmany = sp.n

        // Highlight
        // NOTE(review): the pre/post tags are an empty string and a single
        // space here — confirm these are the intended highlight markers.
        val formatter = SimpleHTMLFormatter("", " ")
        val queryScorer = QueryScorer(booleanQuery)

        // Highlights the whole text as one fragment.
        val highlighter = Highlighter(formatter, queryScorer)
        highlighter.textFragmenter = SimpleSpanFragmenter(queryScorer, Int.MAX_VALUE)
        highlighter.maxDocCharsToAnalyze = Int.MAX_VALUE

        // Produces short fragments (size 30) around the matches.
        val fragmentHighlighter = Highlighter(formatter, queryScorer)
        fragmentHighlighter.textFragmenter = SimpleSpanFragmenter(queryScorer, 30)
        fragmentHighlighter.maxDocCharsToAnalyze = Int.MAX_VALUE

        val results = collector.topDocs(start, howmany).scoreDocs.map {

            val doc: Document = indexSearcher.doc(it.doc)
            val id = doc.get("id")
            val path = doc.getValues("path").asList()
            val name = doc.get("name")
            val text = doc.get("text")

            val created = fromDateString(doc.get("created"))
            val accessed = fromDateString(doc.get("accessed"))
            val modified = fromDateString(doc.get("modified"))

            val fragments = fragmentHighlighter.getBestFragments(analyzer.tokenStream("text", text), text, 10)

            // Fall back to the plain text when nothing could be highlighted.
            val htext = highlighter.getBestFragment(analyzer, "text", text) ?: text
            Result(
                id = id,
                // Cap the text returned per hit at 10 * 1024 characters.
                text = htext.take(10 * 1024),
                hfragments = fragments.toList(),
                name = name,
                path = path,
                accessed = accessed,
                modified = modified,
                created = created
            )
        }
        return Results(results = results, total = collector.totalHits)
    }

    /**
     * Returns the number of documents in the index.
     *
     * Uses [IndexSearcher.count] so the documents are only counted and not
     * collected and scored.
     */
    fun getTotalDocs(): Long {
        return indexSearcher.count(MatchAllDocsQuery()).toLong()
    }
}
148 |
--------------------------------------------------------------------------------
/src/main/kotlin/gr/serafeim/web/routes.kt:
--------------------------------------------------------------------------------
1 | package gr.serafeim.web
2 |
3 | import gr.serafeim.*
4 | import gr.serafeim.conf.ConfigHolder
5 | import gr.serafeim.search.Result
6 | import gr.serafeim.search.SearchHolder
7 | import io.ktor.http.*
8 | import io.ktor.server.application.*
9 | import io.ktor.server.pebble.*
10 | import io.ktor.server.response.*
11 | import io.ktor.server.routing.*
12 |
13 | import java.io.File
14 | import java.util.*
15 | import java.net.URLEncoder;
16 |
/**
 * Everything a single search request carries.
 *
 * @property q query string entered by the user
 * @property n page size (number of results per page)
 * @property p page number requested
 * @property path optional filter on the document path
 * @property ext optional filter on the file extension
 * @property createdFrom optional lower bound for the creation date
 * @property createdTo optional upper bound for the creation date
 * @property modifiedFrom optional lower bound for the modification date
 * @property modifiedTo optional upper bound for the modification date
 * @property accessedFrom optional lower bound for the last-access date
 * @property accessedTo optional upper bound for the last-access date
 */
data class SearchParams(
    val q: String,
    val n: Int,
    val p: Int,
    // Optional filters; null means "not set".
    val path: String? = null,
    val ext: String? = null,
    // Optional date ranges; each bound may be given independently.
    val createdFrom: Date? = null,
    val createdTo: Date? = null,
    val modifiedFrom: Date? = null,
    val modifiedTo: Date? = null,
    val accessedFrom: Date? = null,
    val accessedTo: Date? = null
)
30 |
/**
 * Registers GET /docs, a paged listing of the entries in the document map.
 *
 * Query parameters:
 *  - `query`: optional substring the key must contain
 *  - `page`: 1-based page number (default 1)
 *  - `page_size`: entries per page (default 10)
 *
 * Non-numeric or non-positive `page` / `page_size` values fall back to the nearest
 * valid value instead of failing the request.
 */
fun Route.listKeysRoute() {
    get("/docs") {
        val docsmap = DBHolder.map
        val params = call.request.queryParameters
        val q = params["query"] ?: ""
        // toIntOrNull: a malformed number must not raise NumberFormatException;
        // coerceAtLeast(1): a negative page would make drop() below throw.
        val p = (params["page"]?.toIntOrNull() ?: 1).coerceAtLeast(1)
        val psize = (params["page_size"]?.toIntOrNull() ?: 10).coerceAtLeast(1)
        val docs = if (q != "") docsmap.filter { q in it.key } else docsmap

        call.respond(
            PebbleContent(
                "docs.html", mapOf(
                    "docs" to docs.map { Pair(it.key, it.value) }.drop(psize * (p - 1)).take(psize),
                    "docSize" to docsmap.keys.size,
                    "q" to q,
                    "page" to p,
                    // Paging runs over the filtered entries, so the next-page link
                    // is computed from their count rather than the whole map.
                    "next_page" to nextPage(call.request, p, psize, docs.size),
                    "prev_page" to prevPage(call.request, p)
                )
            )
        )
    }
}
53 |
/**
 * Registers GET /status, which renders "status.html" with the number of keys in the
 * document map, the current parsing flag and the active configuration.
 */
fun Route.statusRoute() {
    get("/status") {
        val model: Map<String, Any> = mapOf(
            "keySize" to DBHolder.map.keys.size,
            "parsing" to StateHolder.parsing,
            "config" to ConfigHolder.config
        )
        call.respond(PebbleContent("status.html", model))
    }
}
69 |
70 |
/**
 * Registers GET /about, which renders the static "about.html" template.
 * The template receives an empty model.
 */
fun Route.aboutRoute() {
    get("/about") {
        call.respond(PebbleContent("about.html", emptyMap()))
    }
}
83 |
/**
 * Registers GET /download?path=..., which streams a file as an attachment.
 *
 * Only paths that are keys of the document map are served; any other path, or a
 * listed file that is no longer present on disk, is answered with 404 Not Found.
 */
fun Route.downloadFile() {
    get("/download") {
        val path = call.request.queryParameters["path"] ?: ""
        println(path)

        val file = File(path)
        // The map acts as the whitelist of downloadable files. The isFile check
        // covers files removed from disk since they were last seen.
        if (path !in DBHolder.map.keys || !file.isFile) {
            call.respond(HttpStatusCode.NotFound)
            return@get
        }

        val cdVal = ContentDisposition.Attachment.withParameter(
            ContentDisposition.Parameters.FileName,
            // Explicit charset: the single-argument overload is deprecated and
            // depends on the platform default encoding.
            URLEncoder.encode(file.name, "UTF-8")
        )
        call.response.header(
            HttpHeaders.ContentDisposition,
            cdVal.toString()
        )
        call.respondFile(file)
    }
}
103 |
/**
 * Registers GET /, the search page.
 *
 * Query parameters: `query`, `page` (1-based, default 1), `path`, `ext` and the
 * date bounds `created-from`, `created-to`, `modified-from`, `modified-to`,
 * `accessed-from`, `accessed-to`. A search is executed only when `query` is not
 * empty. A query the parser rejects is reported on stderr and rendered as an empty
 * result list.
 *
 * @param pageSize number of results shown per page
 */
fun Route.index(pageSize: Int) {

    get("/") {
        val params = call.request.queryParameters
        val q = params["query"] ?: ""
        // Malformed or non-positive page numbers fall back to the first page
        // instead of failing the request.
        val p = (params["page"]?.toIntOrNull() ?: 1).coerceAtLeast(1)
        val path = params["path"] ?: ""
        val ext = params["ext"] ?: ""
        val createdFromStr = params["created-from"] ?: ""
        val createdToStr = params["created-to"] ?: ""
        val modifiedFromStr = params["modified-from"] ?: ""
        val modifiedToStr = params["modified-to"] ?: ""
        // The accessed bounds have their own parameters; reading the modified-*
        // ones here made the accessed filter mirror the modified filter.
        val accessedFromStr = params["accessed-from"] ?: ""
        val accessedToStr = params["accessed-to"] ?: ""

        var totalTime = 0L
        var total = 0
        var results: List<Result> = emptyList()
        if (q != "") {
            val sp = SearchParams(
                q = q,
                p = p,
                n = pageSize,
                path = path,
                ext = ext,
                createdFrom = toDate(createdFromStr),
                createdTo = toDate(createdToStr),
                modifiedFrom = toDate(modifiedFromStr),
                modifiedTo = toDate(modifiedToStr),
                accessedFrom = toDate(accessedFromStr),
                accessedTo = toDate(accessedToStr),
            )
            try {
                val startTime = System.nanoTime()
                val rt = SearchHolder.search(sp)
                results = rt.results
                total = rt.total
                totalTime = System.nanoTime() - startTime
            } catch (e: org.apache.lucene.queryparser.classic.ParseException) {
                // An invalid query is not fatal: the page is rendered without results.
                e.printStackTrace()
            }
        }

        call.respond(
            PebbleContent(
                "home.html", mapOf(
                    "results" to results,
                    "total" to total,
                    "q" to q,
                    "page" to p,
                    "totalTime" to totalTime,
                    "showingFrom" to pageSize * (p - 1) + 1,
                    // The last page may be only partially filled.
                    "showingTo" to minOf(pageSize * p, total),
                    "created_from" to createdFromStr,
                    "created_to" to createdToStr,
                    "modified_from" to modifiedFromStr,
                    "modified_to" to modifiedToStr,
                    "accessed_from" to accessedFromStr,
                    "accessed_to" to accessedToStr,
                    "path" to path,
                    "ext" to ext,
                    "next_page" to nextPage(call.request, p, pageSize, total),
                    "prev_page" to prevPage(call.request, p)
                )
            )
        )
    }
}
181 |
--------------------------------------------------------------------------------
/src/main/kotlin/gr/serafeim/search/lucene_parser.kt.kt:
--------------------------------------------------------------------------------
1 | package gr.serafeim.search
2 |
3 | import gr.serafeim.conf.ConfigHolder
4 | import gr.serafeim.DBHolder
5 | import gr.serafeim.StateHolder
6 | import kotlinx.coroutines.*
7 | import kotlinx.coroutines.sync.Semaphore
8 | import kotlinx.coroutines.sync.withPermit
9 | import org.apache.lucene.analysis.Analyzer
10 | import org.apache.lucene.backward_codecs.lucene95.Lucene95Codec
11 | import org.apache.lucene.codecs.PostingsFormat
12 | import org.apache.lucene.codecs.lucene99.Lucene99Codec
13 | import org.apache.lucene.document.*
14 | import org.apache.lucene.index.IndexReader
15 | import org.apache.lucene.index.IndexWriter
16 | import org.apache.lucene.index.IndexWriterConfig
17 | import org.apache.lucene.index.Term
18 | import org.apache.lucene.store.Directory
19 | import org.apache.lucene.store.FSDirectory
20 | import org.apache.tika.Tika
21 | import org.apache.tika.config.TikaConfig
22 | import org.mapdb.HTreeMap
23 | import org.slf4j.LoggerFactory
24 | import java.io.File
25 | import java.nio.file.Files
26 | import java.nio.file.Paths
27 | import java.nio.file.attribute.BasicFileAttributes
28 | import java.nio.file.attribute.FileTime
29 | import java.util.*
30 | import java.util.concurrent.ConcurrentHashMap
31 | import java.util.concurrent.TimeUnit
32 | import kotlin.concurrent.schedule
33 | import kotlin.io.path.absolutePathString
34 |
35 |
val logger = LoggerFactory.getLogger("LuceneParser")

/**
 * Starts the periodic indexing of [directory].
 *
 * The first run starts immediately on a timer thread named "Parser"; further runs
 * follow every [interval] minutes.
 *
 * @param directory root directory whose files are parsed and indexed
 * @param interval minutes between two indexing runs
 * @throws Exception if the "Lucene99" postings format is not available
 */
fun init(directory: String, interval: Int) {
    if (!PostingsFormat.availablePostingsFormats().contains("Lucene99")) {
        throw Exception("Lucene99 Not found!")
    }
    // The codec instance itself is not used; presumably it is created so that a
    // codec that cannot be loaded fails here rather than mid-indexing (confirm).
    Lucene99Codec()
    logger.info("Lucene parser init, directory: $directory, interval: $interval minutes")
    Timer("Parser").schedule(
        0, TimeUnit.MINUTES.toMillis(interval.toLong())
    ) {
        logger.debug("Parse START init....")
        try {
            parse(directory)
        } catch (e: Exception) {
            // An exception escaping a TimerTask terminates the timer thread and
            // cancels every later run, so a failed run is logged and the schedule
            // is kept alive.
            logger.error("Parse run failed, will retry at the next interval", e)
        }
    }
}
51 |
/**
 * Converts a file timestamp to Lucene's sortable date string, truncated to
 * minute resolution.
 */
fun toDateString(ft: FileTime): String =
    DateTools.timeToString(ft.toMillis(), DateTools.Resolution.MINUTE)
55 |
/**
 * Builds the Tika facade used to extract text from documents.
 *
 * The bundled "/tika-config.xml" classpath resource is used unless
 * `parser.externalTikaConfig` is set, in which case that value is handed to
 * TikaConfig instead. The returned instance has no limit on the extracted text
 * length.
 */
fun configureTika(): Tika {
    val externalConfig = ConfigHolder.config.parser.externalTikaConfig

    val cfg = if (externalConfig == null) {
        logger.info("Using default tika config")
        // The configuration is read inside the constructor, so the resource
        // stream can be closed as soon as it returns.
        object {}.javaClass.getResourceAsStream("/tika-config.xml").use { TikaConfig(it) }
    } else {
        logger.info("Using custom tika config")
        TikaConfig(externalConfig)
    }

    val tika = Tika(cfg)

    // Allow tika to read unlimited characters
    tika.maxStringLength = -1
    logger.debug("Will read up to ${tika.maxStringLength} length")
    return tika
}
75 |
/**
 * Creates an IndexWriter over the application's Lucene directory, configured with
 * the analyzer returned by ConfigHolder.
 *
 * Per the Lucene documentation an IndexWriter is thread safe: several threads may
 * call it concurrently. If external synchronization is ever needed, do not lock on
 * the writer itself (this may deadlock); use a separate object.
 */
fun configureIndexWriter(): IndexWriter {
    // Location of the index, as provided by getLuceneDir().
    val indexDir: Directory = getLuceneDir()

    // The analyzer turns document text into the terms that are stored in the index.
    val writerConfig = IndexWriterConfig(ConfigHolder.getAnalyzerInstance())

    return IndexWriter(indexDir, writerConfig)
}
89 |
/**
 * Extracts the text of one file and (re)indexes it when it is new or has changed.
 *
 * [map] remembers, per file path, a pair of "was it indexed" and the modification
 * time seen at that moment. A file whose recorded modification time is not older
 * than the current one is skipped. A file Tika cannot parse is recorded with
 * `false` so it is not retried until it changes.
 *
 * @param it file to process
 * @param indexWriter writer receiving the document, keyed by the "id" term (the path)
 * @param tika configured text extractor
 * @param map bookkeeping of already seen files
 */
fun parseDocument(it: File, indexWriter: IndexWriter, tika: Tika, map: HTreeMap) {
    logger.debug(it.name)
    val attrs = Files.readAttributes(Paths.get(it.path), BasicFileAttributes::class.java)
    val modified = attrs.lastModifiedTime().toMillis()

    val existingModTime: Pair? = (map[it.path] as Pair?)
    logger.debug("Existing mod time is $existingModTime and current mod time is $modified")
    print("Parsing ${it.name}\r")

    if (existingModTime != null && existingModTime.second >= modified) {
        logger.debug("Skipping the file, not changed since we last saw it")
        return
    }
    logger.debug("Need to parse and index ${it.name}")

    val content: String? = try {
        tika.parseToString(it.absoluteFile)
    } catch (e: Exception) {
        e.printStackTrace()
        logger.info("File ${it.path} cannot be parsed, skipping")
        null
    }
    if (content == null) {
        map[it.path] = Pair(false, modified)
        return
    }

    val doc = Document()

    doc.add(StringField("id", it.path, Field.Store.YES))
    doc.add(TextField("text", content, Field.Store.YES))
    doc.add(StringField("name", it.name, Field.Store.YES))
    doc.add(TextField("name_t", it.name, Field.Store.NO))
    // One "path" value per path segment, so every ancestor folder name can be matched.
    it.path.split(File.separator).forEach { segment ->
        doc.add(StringField("path", segment, Field.Store.YES))
    }
    doc.add(StringField("extension", it.extension, Field.Store.YES))

    // Stored, human-readable dates.
    doc.add(StringField("created", toDateString(attrs.creationTime()), Field.Store.YES))
    doc.add(StringField("accessed", toDateString(attrs.lastAccessTime()), Field.Store.YES))
    doc.add(StringField("modified", toDateString(attrs.lastModifiedTime()), Field.Store.YES))

    // Numeric points for the same dates.
    doc.add(LongPoint("created_point", attrs.creationTime().toMillis()))
    doc.add(LongPoint("modified_point", attrs.lastModifiedTime().toMillis()))
    doc.add(LongPoint("accessed_point", attrs.lastAccessTime().toMillis()))

    indexWriter.updateDocument(Term("id", it.path), doc)

    // Record success only after the document reached the writer; otherwise a failed
    // update would leave the file marked as indexed and it would never be retried.
    map[it.path] = Pair(true, modified)
}
139 |
/**
 * Walks [sdir], indexes every file with a configured extension and removes entries
 * for files that are no longer there.
 *
 * Files whose name starts with "~$" are ignored. At most four documents are
 * processed at the same time. `StateHolder.parsing` is true for the duration of
 * the run and is reset, and the index writer closed, even when the run fails.
 *
 * @param sdir root directory to scan
 */
fun parse(sdir: String) {
    logger.info("Parse START, extensions are ${ConfigHolder.config.parser.parseExtensions}")
    logger.info("Parse directory is ${Paths.get(sdir).absolutePathString()}")
    StateHolder.parsing = true

    try {
        val tika = configureTika()
        // use {} closes the writer on every path; a writer left open keeps the
        // index write lock and makes the next run fail.
        configureIndexWriter().use { indexWriter ->
            val uniquePaths = ConcurrentHashMap.newKeySet<String>()
            val requestSemaphore = Semaphore(4)
            // Jobs run on several threads; a plain Int counter would lose updates.
            val runningJobs = java.util.concurrent.atomic.AtomicInteger(0)

            // runBlocking returns only after every launched child has finished.
            runBlocking {
                File(sdir).walk(direction = FileWalkDirection.TOP_DOWN).forEach { file ->
                    val wanted = !file.name.startsWith("~$") &&
                        ConfigHolder.config.parser.parseExtensions.contains(file.extension.lowercase())
                    if (wanted) {
                        uniquePaths.add(file.path)

                        launch(Dispatchers.Default) {
                            requestSemaphore.withPermit {
                                logger.debug("Start job, ${runningJobs.incrementAndGet()}")
                                try {
                                    parseDocument(file, indexWriter, tika, DBHolder.map)
                                } catch (e: Exception) {
                                    // One broken file must not abort the whole run.
                                    logger.error("Failed to index ${file.path}", e)
                                } finally {
                                    logger.debug("End job, ${runningJobs.decrementAndGet()}")
                                }
                            }
                        }
                    }
                }
            }

            clearDeleted(DBHolder.map, uniquePaths, indexWriter)

            DBHolder.db.commit()

            logger.info("Docs Indexed Successfully!")
        }
    } finally {
        StateHolder.parsing = false
    }
}
188 |
/**
 * Drops every path that is recorded in [map] but was not seen during the current
 * walk: the entry is removed from the map and the document with that "id" is
 * deleted from the index.
 *
 * @param map bookkeeping of previously seen files, keyed by path
 * @param uniquePaths paths encountered during the current walk
 * @param indexWriter writer used to delete the stale documents
 */
private fun clearDeleted(
    map: HTreeMap,
    uniquePaths: ConcurrentHashMap.KeySetView,
    indexWriter: IndexWriter
) {
    val remaining = map.keys.toSet() - uniquePaths.toSet()
    logger.debug("Clear deleted, remaining ${remaining}")
    for (stalePath in remaining) {
        map.remove(stalePath)
        indexWriter.deleteDocuments(Term("id", stalePath))
    }
}
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 | {
53 | "associatedIndex": 0
54 | }
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 | 1670568498663
133 |
134 |
135 | 1670568498663
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
--------------------------------------------------------------------------------
/gradlew:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | #
4 | # Copyright © 2015-2021 the original authors.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # https://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | ##############################################################################
20 | #
21 | # Gradle start up script for POSIX generated by Gradle.
22 | #
23 | # Important for running:
24 | #
25 | # (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is
26 | # noncompliant, but you have some other compliant shell such as ksh or
27 | # bash, then to run this script, type that shell name before the whole
28 | # command line, like:
29 | #
30 | # ksh Gradle
31 | #
32 | # Busybox and similar reduced shells will NOT work, because this script
33 | # requires all of these POSIX shell features:
34 | # * functions;
35 | # * expansions «$var», «${var}», «${var:-default}», «${var+SET}»,
36 | # «${var#prefix}», «${var%suffix}», and «$( cmd )»;
37 | # * compound commands having a testable exit status, especially «case»;
38 | # * various built-in commands including «command», «set», and «ulimit».
39 | #
40 | # Important for patching:
41 | #
42 | # (2) This script targets any POSIX shell, so it avoids extensions provided
43 | # by Bash, Ksh, etc; in particular arrays are avoided.
44 | #
45 | # The "traditional" practice of packing multiple parameters into a
46 | # space-separated string is a well documented source of bugs and security
47 | # problems, so this is (mostly) avoided, by progressively accumulating
48 | # options in "$@", and eventually passing that to Java.
49 | #
50 | # Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS,
51 | # and GRADLE_OPTS) rely on word-splitting, this is performed explicitly;
52 | # see the in-line comments for details.
53 | #
54 | # There are tweaks for specific operating systems such as AIX, CygWin,
55 | # Darwin, MinGW, and NonStop.
56 | #
57 | # (3) This script is generated from the Groovy template
58 | # https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
59 | # within the Gradle project.
60 | #
61 | # You can find Gradle at https://github.com/gradle/gradle/.
62 | #
63 | ##############################################################################
64 |
65 | # Attempt to set APP_HOME
66 |
67 | # Resolve links: $0 may be a link
68 | app_path=$0
69 |
70 | # Need this for daisy-chained symlinks.
71 | while
72 | APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path
73 | [ -h "$app_path" ]
74 | do
75 | ls=$( ls -ld "$app_path" )
76 | link=${ls#*' -> '}
77 | case $link in #(
78 | /*) app_path=$link ;; #(
79 | *) app_path=$APP_HOME$link ;;
80 | esac
81 | done
82 |
83 | APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit
84 |
85 | APP_NAME="Gradle"
86 | APP_BASE_NAME=${0##*/}
87 |
88 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
89 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
90 |
91 | # Use the maximum available, or set MAX_FD != -1 to use that value.
92 | MAX_FD=maximum
93 |
# Print all arguments, joined into one line, on standard error.
warn () {
    echo "$*" >&2
}
97 |
# Print the arguments on standard error, surrounded by blank lines, and
# terminate the script with exit status 1.
die () {
    {
        echo
        echo "$*"
        echo
    } >&2
    exit 1
}
104 |
105 | # OS specific support (must be 'true' or 'false').
106 | cygwin=false
107 | msys=false
108 | darwin=false
109 | nonstop=false
110 | case "$( uname )" in #(
111 | CYGWIN* ) cygwin=true ;; #(
112 | Darwin* ) darwin=true ;; #(
113 | MSYS* | MINGW* ) msys=true ;; #(
114 | NONSTOP* ) nonstop=true ;;
115 | esac
116 |
117 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
118 |
119 |
# Determine the Java command to use to start the JVM.
# With JAVA_HOME set, the executable inside it is used and must exist;
# otherwise a `java` found on PATH is used.
if [ -n "$JAVA_HOME" ] ; then
    if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
        # IBM's JDK on AIX uses strange locations for the executables
        JAVACMD=$JAVA_HOME/jre/sh/java
    else
        JAVACMD=$JAVA_HOME/bin/java
    fi
    if [ ! -x "$JAVACMD" ] ; then
        die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME

Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
    fi
else
    JAVACMD=java
    # `command -v` is specified by POSIX; `which` is an external utility that is
    # not guaranteed to be installed.
    command -v java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.

Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
141 |
142 | # Increase the maximum file descriptors if we can.
143 | if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then
144 | case $MAX_FD in #(
145 | max*)
146 | MAX_FD=$( ulimit -H -n ) ||
147 | warn "Could not query maximum file descriptor limit"
148 | esac
149 | case $MAX_FD in #(
150 | '' | soft) :;; #(
151 | *)
152 | ulimit -n "$MAX_FD" ||
153 | warn "Could not set maximum file descriptor limit to $MAX_FD"
154 | esac
155 | fi
156 |
157 | # Collect all arguments for the java command, stacking in reverse order:
158 | # * args from the command line
159 | # * the main class name
160 | # * -classpath
161 | # * -D...appname settings
162 | # * --module-path (only if needed)
163 | # * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables.
164 |
165 | # For Cygwin or MSYS, switch paths to Windows format before running java
166 | if "$cygwin" || "$msys" ; then
167 | APP_HOME=$( cygpath --path --mixed "$APP_HOME" )
168 | CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" )
169 |
170 | JAVACMD=$( cygpath --unix "$JAVACMD" )
171 |
172 | # Now convert the arguments - kludge to limit ourselves to /bin/sh
173 | for arg do
174 | if
175 | case $arg in #(
176 | -*) false ;; # don't mess with options #(
177 | /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath
178 | [ -e "$t" ] ;; #(
179 | *) false ;;
180 | esac
181 | then
182 | arg=$( cygpath --path --ignore --mixed "$arg" )
183 | fi
184 | # Roll the args list around exactly as many times as the number of
185 | # args, so each arg winds up back in the position where it started, but
186 | # possibly modified.
187 | #
188 | # NB: a `for` loop captures its iteration list before it begins, so
189 | # changing the positional parameters here affects neither the number of
190 | # iterations, nor the values presented in `arg`.
191 | shift # remove old arg
192 | set -- "$@" "$arg" # push replacement arg
193 | done
194 | fi
195 |
196 | # Collect all arguments for the java command;
197 | # * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of
198 | # shell script including quotes and variable substitutions, so put them in
199 | # double quotes to make sure that they get re-expanded; and
200 | # * put everything else in single quotes, so that it's not re-expanded.
201 |
202 | set -- \
203 | "-Dorg.gradle.appname=$APP_BASE_NAME" \
204 | -classpath "$CLASSPATH" \
205 | org.gradle.wrapper.GradleWrapperMain \
206 | "$@"
207 |
208 | # Use "xargs" to parse quoted args.
209 | #
210 | # With -n1 it outputs one arg per line, with the quotes and backslashes removed.
211 | #
212 | # In Bash we could simply go:
213 | #
214 | # readarray ARGS < <( xargs -n1 <<<"$var" ) &&
215 | # set -- "${ARGS[@]}" "$@"
216 | #
217 | # but POSIX shell has neither arrays nor process substitution, so instead we
218 | # post-process each arg (as a line of input to sed) to backslash-escape any
219 | # character that might be a shell metacharacter, then use eval to reverse
220 | # that process (while maintaining the separation between arguments), and wrap
221 | # the whole thing up as a single "set" statement.
222 | #
223 | # This will of course break if any of these variables contains a newline or
224 | # an unmatched quote.
225 | #
226 |
227 | eval "set -- $(
228 | printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" |
229 | xargs -n1 |
230 | sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' |
231 | tr '\n' ' '
232 | )" '"$@"'
233 |
234 | exec "$JAVACMD" "$@"
235 |
--------------------------------------------------------------------------------
/.idea/uiDesigner.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | -
6 |
7 |
8 | -
9 |
10 |
11 | -
12 |
13 |
14 | -
15 |
16 |
17 | -
18 |
19 |
20 |
21 |
22 |
23 | -
24 |
25 |
26 |
27 |
28 |
29 | -
30 |
31 |
32 |
33 |
34 |
35 | -
36 |
37 |
38 |
39 |
40 |
41 | -
42 |
43 |
44 |
45 |
46 | -
47 |
48 |
49 |
50 |
51 | -
52 |
53 |
54 |
55 |
56 | -
57 |
58 |
59 |
60 |
61 | -
62 |
63 |
64 |
65 |
66 | -
67 |
68 |
69 |
70 |
71 | -
72 |
73 |
74 | -
75 |
76 |
77 |
78 |
79 | -
80 |
81 |
82 |
83 |
84 | -
85 |
86 |
87 |
88 |
89 | -
90 |
91 |
92 |
93 |
94 | -
95 |
96 |
97 |
98 |
99 | -
100 |
101 |
102 | -
103 |
104 |
105 | -
106 |
107 |
108 | -
109 |
110 |
111 | -
112 |
113 |
114 |
115 |
116 | -
117 |
118 |
119 | -
120 |
121 |
122 |
123 |
124 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # doc-parser-searcher
2 |
3 | ## A tool to help you index and search your documents.
4 |
5 | This tool combines two great Java libraries to help you index your documents and then search them very quickly:
6 |
7 | * apache lucene (https://lucene.apache.org/) for searching text
8 | * apache tika (https://tika.apache.org/) for extracting text from various types of files
9 |
10 | With this tool you can select a folder which contains all your documents.
11 | These documents will then be parsed, and you'll be able to search their contents.
12 |
13 | ## Screenshot
14 |
15 | 
16 |
17 | ## Requirements
18 |
19 | You need java 18 to run this program.
20 |
21 | ## Usage
22 |
23 | To run this download the `docparser-all.jar` from the
24 | [github releases](https://github.com/spapas/doc-parser-searcher/releases)
25 | and run it with java using something like
26 |
27 | ```
28 | java -jar docparser-all.jar
29 | ```
30 |
31 | You need to pass a parameter to the program to indicate its mode of operation:
32 |
33 | * server: Runs as a server, this is the main way to use this program. When running as a server it will first index all your documents and re-index them after a configurable interval.
34 | * search: You can pass a query to search the documents
35 | * parse: Parses/indexes your documents
36 | * info: Prints info on your document index
37 | * clear: Deletes the index so you can index everything again
38 |
39 | The main mode of operation is the `server` so you can visit the configured host/port (by default 127.0.0.1:8080) and, after your documents have been indexed, search them.
40 |
41 | ## How to search
42 |
43 | You should search using the lucene query parser syntax:
44 |
45 | https://lucene.apache.org/core/9_11_1/queryparser/org/apache/lucene/queryparser/classic/package-summary.html#package.description
46 |
47 | Quick searching tutorial:
48 |
49 | Simple: Enter a word and it will search for it using stemming rules for the configured language(i.e if you search for "apple" it will also return documents containing "apples")
50 | Phrase: If you want to search for an exact phrase, e.g. "hello, world", you need to enter it between quotes. If you enter two words without the quotes it will search for documents containing either of these words. So searching for hello, world (without quotes) will return documents containing hello and documents containing world (see boolean search for more explanation).
51 | Wildcard: You can do wildcard search: Searching for app* will return documents containing apple, applying or application. Use ? for a single letter, * for any number of characters and + for at least one character. The wildcard character cannot be at the start of your query, i.e. *ppl will not work.
52 | Boolean: You can use boolean operators like AND OR and NOT to create more complex queries. Things like (apple AND orange) OR (not strawberry) should work.
53 | Always include/exclude: You can use the + or - operators before a word (or phrase) to include or exclude documents containing it. For example +apple +orange -strawberry will return documents containing apple and orange but not strawberry.
54 | Distance: You can search by distance using the ~ operator. For example, "commit local"~3 will search for documents that have the words commit and local within a distance less than 3. That means that a document containing the phrase "commit all changes to local dev" will be returned but a document with the phrase "commit all changes to production and local dev" will not be returned.
55 | Filtering: You can use the extra search choices to filter based on the name of the folder that contains the document or its created/modified/accessed date. For example if you write appl* to the folder it will only return documents that are contained within a folder named apples or applications (this includes all ancestor folders).
56 | Combinations: You can use all the above in whatever combinations: For example +"commit local"~3 +download -conf* will search documents containing the word commit near the word local and also contain the word download but do not contain any words starting with conf
57 |
58 |
59 | ## Configuration
60 |
61 | The default configuration is this:
62 |
63 | ```
64 | parser.parseDirectory=. # starts parsing from the directory you start the program from
65 | parser.dataDirectory=. # saves index data to the directory you start the program from
66 | parser.interval=60 # re-indexes docs every 60 mins
67 | parser.pageSize=10 # result page size
68 | parser.analyzerClazzString=org.apache.lucene.analysis.el.GreekAnalyzer # use the corresponding analyzer for your language
69 | parser.parseExtensions=doc,docx,xls,xlsx,ppt,pptx,odt,fodt,ods,fods,odp,fodp,txt,html,md,rst,rtf,pdf # parse allowed extensions
70 |
71 | server.port=8080 # Which port to listen to
72 | server.host=127.0.0.1 # IP to bind to. Use 0.0.0.0 to allow remote connections
73 | server.userUsername= # Enables HTTP basic auth for users if set
74 | server.userPassword= # Enables HTTP basic auth for users if set
75 | server.adminUsername= # Enables HTTP basic auth for admins if set
76 | server.adminPassword= # Enables HTTP basic auth for admins if set
77 | ```
78 |
79 | Right now the admin doesn't have much functionality. Beyond
80 | searching, it allows connecting to:
81 |
82 | * /status to see the status of the server and
83 | * /docs to be able to see all the files that are indexed.
84 |
85 | To override the configuration you can copy over the
86 | [application.local.props.template](https://github.com/spapas/doc-parser-searcher/blob/master/application.local.props.template)
87 | to the same folder as the jar as `application.props` and edit it according to your needs.
88 |
89 | Then pass it to the app using `-c`, for example:
90 |
91 | `java -jar docparser-all.jar -c application.props server`
92 |
93 | If you use the default configuration it will parse from the directory
94 | you start the program from and keep its data in that directory.
95 |
96 | ### The analyzer
97 |
98 | Lucene uses an "analyzer" to parse your documents and your search queries. The
99 | analyzer will proeprly transform the docs for each language (case sensitivity,
100 | stemming, accents etc). By default, I'm using the analyzer for the greek
101 | language, but you should use the correct on for your own language. See
102 | here for the existing available languages:
103 |
104 | https://lucene.apache.org/core/9_7_0/analysis/common/index.html
105 |
106 | e.g. for English use `org.apache.lucene.analysis.en.EnglishAnalyzer`
107 |
108 | ### How this works
109 |
110 | When the file parser sees a file it keeps its pathname and last modified date on a (persistent)
111 | hashmap. This way the file won't need to be re-indexed if it hasn't been changed. You can observe this behavior
112 | by running `parse`: The first time it will take a lot of time to index everything but if you re-run
113 | it should be much faster (notice it will need *some* time because it has to walk all files, if there are
114 | a lot of files it will need a lot of time even if nothing has changed).
115 |
116 | The application generates a `lucene_index` directory where the search
117 | index is saved and a `map.db` (and map.db.wal.*) file to keep the
118 | persistent hashmap (it uses the [mapdb](https://mapdb.org/) library for this).
119 |
120 | The parser indexes the text, title, path and created/modified/accessed dates
121 | for each document.
122 |
123 | ## Example
124 |
125 | Indexing the docs of my organization (> 100k pdf/doc/xls etc. files) takes a couple of hours and creates an index of ~ 1 GB.
126 | Then, depending on the query, results are returned in 100 ms to 1 second.
127 |
128 | ## The OCR situation
129 |
130 | The apache tika library allows you to use [tesseract OCR](https://github.com/tesseract-ocr/tesseract)
131 | to read some files. You need to install tesseract to your server and then use a
132 | custom tika.config.xml by passing the `parser.externalTikaConfig` setting to
133 | your application.props. For example `parser.externalTikaConfig=c:\\progr\\kotlin\\doc-parser-searcher\\tika-config.xml`.
134 | One sample tika config that uses tesseract can be found [here](https://github.com/spapas/doc-parser-searcher/blob/master/tika-config-ocr.xml).
135 |
136 | *Warning*: Using the OCR is very slow. I have mainly included it to test the functionality; I don't think that this is useful
137 | in a lot of situations. Of course, you are free to use it if you need it. Finally, you need to properly set the language of the
138 | documents you'll OCR or else your results will be very bad (using ` ell`).
139 |
140 | ## Development
141 |
142 | I'm using Intellij Idea for development. You should be able to run it directly from Intellij if you wish. For deployment check the fatjar.bat file or .github/workflows/workflow.yml for how to
143 | create a "fat" jar (will be built on `build\libs\docparser-all.jar`).
144 |
145 | ### Changes to fat.jar
146 |
147 | This shouldn't be needed anymore.
148 |
149 | - org.apache.lucene.codecs.lucene94.Lucene94Codec >> fat-jar/META-INF/services/org.apache.lucene.codecs.Codec
150 | - org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat >> fat-jar/META-INF/services/org.apache.lucene.codecs.PostingsFormat
151 |
152 |
153 | ## Changelog
154 |
155 | - v1.3: Improve project docs and styling
156 | - v1.2: Update dependencies
157 | - v1.1: First public version
158 |
159 | ## About
160 | If you find this project useful, consider
161 | buying me a coffee !
162 |
--------------------------------------------------------------------------------
/src/main/resources/templates/home.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 | {% block extra_style %}
3 |
18 | {% endblock %}
19 | {% block content %}
20 | Doc search
21 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
98 |
99 |
100 | Simple: Enter a word and it will search for it using stemming rules for the configured language (e.g. if you search for "apple" it will also return documents containing "apples")
101 | Phrase: If you want to search for an exact phrase, e.g. "hello, world" you need to enter it between quotes. If you enter two words without the quotes it will search for documents containing one of these words. So searching for hello, world (without quotes) will return documents containing hello and documents containing world (see boolean search for more explanation)
102 | Wildcard: You can do wildcard search: Searching for app* will return documents containing apple, applying or application. Use ? for a single letter, * for any number of characters and + for at least one character. The wildcard character cannot be at the start of your query, i.e. *ppl will not work.
103 | Boolean: You can use boolean operators like AND OR and NOT to create more complex queries. Things like (apple AND orange) OR (not strawberry) should work.
104 | Always include/exclude: You can use the + or - operators before a word (or phrase) to include or exclude documents containing it. For example +apple +orange -strawberry will return documents containing apple and orange but not strawberry.
105 | Distance: You can search by distance using the ~ operator. For example, "commit local"~3 will search for documents that have the words commit and local on a distance less than 3. That means that a document containing the phrase "commit all changes to local dev" will be returned but a document with the phrase "commit all changes to production and local dev" will not be returned.
106 | Filtering: You can use the extra search choices to filter based on the name of the folder that contains the document or its created/modified/accessed date. For example if you write appl* to the folder it will only return documents that are contained within a folder named apples or applications (this includes all ancestor folders).
107 | Combinations: You can use all the above in whatever combinations: For example +"commit local"~3 +download -conf* will search documents containing the word commit near the word local and also contain the word download but do not contain any words starting with conf
108 |
109 |
110 |
113 |
114 |
115 |
116 | {% if q != "" %}
117 |
118 | Results
119 |
120 |
121 | Total time: {{ totalTime / 1000000 }} ms
122 |
123 | {% if results.size() > 0 %}
124 |
125 | Showing {{ showingFrom }} - {{ showingTo }} from {{ total }} results
126 |
127 |
128 |
133 |
134 | {% endif %}
135 |
136 |
137 | {% for r in results %}
138 |
139 |
140 |
141 |
151 |
152 |
153 |
Matches
154 |
155 | {% for f in r.hfragments %}
156 | {{ f|raw }}
157 | {% endfor %}
158 |
159 |
160 |
161 |
162 |
163 |
168 |
169 |
170 | {{ r.text|raw }}
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
183 |
184 |
185 |
186 | {% else %}
187 | Nothing found!
188 | {% endfor %}
189 |
190 | {% if results.size() > 0 %}
191 |
192 |
197 |
198 | {% endif %}
199 |
200 | {% endif %}
201 | {% endblock %}
--------------------------------------------------------------------------------