├── .idea ├── .name ├── codeStyles │ ├── codeStyleConfig.xml │ └── Project.xml ├── kotlinc.xml ├── vcs.xml ├── jpa-buddy.xml ├── misc.xml ├── gradle.xml ├── workspace.xml └── uiDesigner.xml ├── gradle.properties ├── fatjar.bat ├── settings.gradle.kts ├── env.bat ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── application.local.props.template ├── src └── main │ ├── resources │ ├── application.props │ ├── templates │ │ ├── status.html │ │ ├── about.html │ │ ├── docs.html │ │ ├── base.html │ │ └── home.html │ ├── log4j.properties │ ├── META-INF │ │ └── services │ │ │ ├── org.apache.lucene.codecs.Codec │ │ │ └── org.apache.lucene.codecs.PostingsFormat │ └── tika-config.xml │ └── kotlin │ └── gr │ └── serafeim │ ├── db.kt │ ├── web │ ├── util.kt │ └── routes.kt │ ├── conf │ └── config.kt │ ├── cui.kt │ ├── server.kt │ └── search │ ├── tools.kt │ └── lucene_parser.kt.kt ├── .gitignore ├── .github └── workflows │ └── workflow.yml ├── LICENSE ├── tika-config-ocr.xml ├── gradlew.bat ├── gradlew └── README.md /.idea/.name: -------------------------------------------------------------------------------- 1 | docparser -------------------------------------------------------------------------------- /gradle.properties: -------------------------------------------------------------------------------- 1 | kotlin.code.style=official 2 | -------------------------------------------------------------------------------- /fatjar.bat: -------------------------------------------------------------------------------- 1 | REM gradlew buildFatJar 2 | gradlew shadowJar 3 | -------------------------------------------------------------------------------- /settings.gradle.kts: -------------------------------------------------------------------------------- 1 | 2 | rootProject.name = "docparser" 3 | 4 | -------------------------------------------------------------------------------- /env.bat: 
-------------------------------------------------------------------------------- 1 | set JAVA_HOME=c:\progr\java\jdk-18.0.2 2 | set PATH=c:\progr\java\jdk-18.0.2\bin;%PATH% -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spapas/doc-parser-searcher/HEAD/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /.idea/codeStyles/codeStyleConfig.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | -------------------------------------------------------------------------------- /.idea/kotlinc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/jpa-buddy.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-8.2-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists -------------------------------------------------------------------------------- /.idea/codeStyles/Project.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 9 | 10 | 
-------------------------------------------------------------------------------- /application.local.props.template: -------------------------------------------------------------------------------- 1 | 2 | parser.parseDirectory=. 3 | parser.dataDirectory=. 4 | parser.interval=60 5 | parser.pageSize=10 6 | parser.analyzerClazzString=org.apache.lucene.analysis.el.GreekAnalyzer 7 | parser.parseExtensions=doc,docx,xls,xlsx,ppt,pptx,odt,fodt,ods,fods,odp,fodp,txt,html,md,rtf,pdf,txt 8 | 9 | server.port=8080 10 | server.host=127.0.0.1 11 | server.userUsername= 12 | server.userPassword= 13 | server.adminUsername= 14 | server.adminPassword= -------------------------------------------------------------------------------- /src/main/resources/application.props: -------------------------------------------------------------------------------- 1 | 2 | 3 | parser.parseDirectory=. 4 | parser.dataDirectory=. 5 | parser.interval=60 6 | parser.pageSize=10 7 | parser.analyzerClazzString=org.apache.lucene.analysis.el.GreekAnalyzer 8 | parser.parseExtensions=doc,docx,xls,xlsx,ppt,pptx,odt,fodt,ods,fods,odp,fodp,txt,html,md,rst,rtf,pdf 9 | 10 | server.port=8080 11 | server.host=127.0.0.1 12 | server.userUsername= 13 | server.userPassword= 14 | server.adminUsername= 15 | server.adminPassword= -------------------------------------------------------------------------------- /src/main/resources/templates/status.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | {% block content %} 3 |

Status

4 |

Parsing: {{ parsing }}
Keys Size: {{ keySize }}

Config server: {{ config.server }}

Config parser: {{ config.parser }}

15 |
16 | 17 | Back 18 | {% endblock %} -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 13 | -------------------------------------------------------------------------------- /.idea/gradle.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 17 | 18 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set root logger level to INFO with two appenders: FILE and CONSOLE. 2 | log4j.rootLogger=INFO, FILE, CONSOLE 3 | 4 | log4j.appender.FILE=org.apache.log4j.RollingFileAppender 5 | log4j.appender.FILE.file=logs/out.log 6 | log4j.appender.FILE.MaxFileSize=5MB 7 | log4j.appender.FILE.MaxBackupIndex=5 8 | log4j.appender.FILE.layout=org.apache.log4j.PatternLayout 9 | log4j.appender.FILE.layout.ConversionPattern=[%d{ISO8601}][%-5p][%t][%c{1}] %m%n 10 | 11 | log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender 12 | log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout 13 | log4j.appender.CONSOLE.layout.ConversionPattern=%-5p [%t]: %m%n 14 | 15 | log4j.logger.org.apache.pdfbox.pdmodel.font=error -------------------------------------------------------------------------------- /src/main/kotlin/gr/serafeim/db.kt: -------------------------------------------------------------------------------- 1 | package gr.serafeim 2 | 3 | import gr.serafeim.conf.ConfigHolder 4 | import org.mapdb.DBMaker 5 | import org.mapdb.Serializer 6 | import org.slf4j.LoggerFactory 7 | import java.nio.file.Paths 8 | 9 | 10 | fun getFileDB(): DBMaker.Maker { 11 | return DBMaker.fileDB( 12 | Paths.get(ConfigHolder.config.parser.dataDirectory, "map.db").toFile() 13 | ) 14 | } 15 | 16 | object DBHolder { 17 | val logger = 
LoggerFactory.getLogger("DBHolder") 18 | val db = getFileDB().transactionEnable().make() 19 | val map = db.hashMap("docs").keySerializer(Serializer.STRING).valueSerializer(Serializer.JAVA).createOrOpen() 20 | 21 | init { 22 | logger.info("DB Singleton class invoked.") 23 | } 24 | } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .gradle 2 | build/ 3 | !gradle/wrapper/gradle-wrapper.jar 4 | !**/src/main/**/build/ 5 | !**/src/test/**/build/ 6 | 7 | ### IntelliJ IDEA ### 8 | .idea/modules.xml 9 | .idea/jarRepositories.xml 10 | .idea/compiler.xml 11 | .idea/libraries/ 12 | *.iws 13 | *.iml 14 | *.ipr 15 | out/ 16 | !**/src/main/**/out/ 17 | !**/src/test/**/out/ 18 | 19 | ### Eclipse ### 20 | .apt_generated 21 | .classpath 22 | .factorypath 23 | .project 24 | .settings 25 | .springBeans 26 | .sts4-cache 27 | bin/ 28 | !**/src/main/**/bin/ 29 | !**/src/test/**/bin/ 30 | 31 | ### NetBeans ### 32 | /nbproject/private/ 33 | /nbbuild/ 34 | /dist/ 35 | /nbdist/ 36 | /.nb-gradle/ 37 | 38 | ### VS Code ### 39 | .vscode/ 40 | 41 | ### Mac OS ### 42 | .DS_Store 43 | 44 | lucene_index/* 45 | map.db 46 | map.db.wal* 47 | *.local.props 48 | logs* 49 | *.log -------------------------------------------------------------------------------- /src/main/resources/templates/about.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | {% block content %} 3 |

About Doc Parser Searcher

4 |

An application that allows you to index and quickly search all your documents.

5 | 6 |

More information: 7 | https://github.com/spapas/doc-parser-searcher

8 | 9 |

Main tools used

10 |

Apache Lucene: https://lucene.apache.org/
Apache Tika: https://tika.apache.org/
MapDB: https://mapdb.org/
Ktor: https://ktor.io/

16 | 17 |

Copyright (c) 2024 Serafeim Papastefanos

18 | 19 |

If you find this project useful, consider 20 | buying me a coffee! 21 |

22 | 23 | Back 24 | {% endblock %} -------------------------------------------------------------------------------- /src/main/resources/META-INF/services/org.apache.lucene.codecs.Codec: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | org.apache.lucene.codecs.simpletext.SimpleTextCodec 17 | org.apache.lucene.codecs.lucene99.Lucene99Codec -------------------------------------------------------------------------------- /.github/workflows/workflow.yml: -------------------------------------------------------------------------------- 1 | name: Java CI 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v3 14 | - name: Set up JDK 18 15 | uses: actions/setup-java@v3 16 | with: 17 | java-version: '18' 18 | distribution: 'adopt' 19 | architecture: x64 20 | - name: Validate Gradle wrapper 21 | uses: gradle/wrapper-validation-action@ccb4328a959376b642e027874838f60f8e596de3 22 | - name: Build with Gradle 23 | uses: gradle/gradle-build-action@749f47bda3e44aa060e82d7b3ef7e40d953bd629 24 | with: 25 | arguments: shadowJar 26 | - uses: actions/upload-artifact@v3 27 | with: 28 | name: docparser.jar 29 | path: build/libs/docparser-all.jar 30 | - uses: ncipollo/release-action@v1 31 | name: Publish Release 32 | with: 33 | token: ${{secrets.GITHUB_TOKEN}} 34 | artifacts: build/libs/docparser-all.jar -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Serafeim Papastefanos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the 
Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/main/kotlin/gr/serafeim/web/util.kt: -------------------------------------------------------------------------------- 1 | package gr.serafeim.web 2 | 3 | import io.ktor.http.* 4 | import io.ktor.server.request.* 5 | import org.apache.lucene.document.DateTools 6 | import java.text.SimpleDateFormat 7 | import java.util.* 8 | 9 | 10 | fun toDate(s: String): Date? { 11 | if (s!="") { 12 | val formatter = SimpleDateFormat("yyyy-MM-dd") 13 | return formatter.parse(s) 14 | } 15 | return null 16 | } 17 | 18 | fun nextPage(req: ApplicationRequest, p: Int, n: Int, total: Int): String { 19 | if (n * p < total) { 20 | var u = URLBuilder(req.uri) 21 | u.parameters["page"] = "${p + 1}" 22 | return u.build().fullPath 23 | } else { 24 | return "#" 25 | } 26 | } 27 | 28 | fun prevPage(req: ApplicationRequest, p: Int): String { 29 | if (p > 1) { 30 | var u = URLBuilder(req.uri) 31 | u.parameters["page"] = "${p - 1}" 32 | return u.build().fullPath 33 | } else { 34 | return "#" 35 | } 36 | } 37 | 38 | 39 | fun fromDateString(s: String): Date { 40 | return DateTools.stringToDate(s) 41 | } 42 | 43 | fun dateToMillis(d: Date?, default: Long): Long { 44 | if(d==null) { 45 | return default 46 | } 47 | return d.time 48 | } 49 | -------------------------------------------------------------------------------- /src/main/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat: 
-------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat 17 | org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat 18 | org.apache.lucene.codecs.memory.DirectPostingsFormat 19 | org.apache.lucene.codecs.memory.FSTPostingsFormat 20 | org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat 21 | org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat 22 | org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat -------------------------------------------------------------------------------- /src/main/resources/templates/docs.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | {% block content %} 3 |

List of docs

4 | 5 | Size: {{ docSize }} docs 6 | 7 | 17 |
18 | 19 | 20 | 21 | 22 | 23 | 24 | {% for k in docs %} 25 | 26 | 27 | 28 | 29 | 30 | 31 | {% endfor %} 32 |

Filename	OK?	Mod Date
{{ k.first }}	{{ k.second.first }}	{{ k.second.second }}

33 | 34 | 35 | 42 | 43 | Back 44 | 45 | 46 | {% endblock %} -------------------------------------------------------------------------------- /src/main/kotlin/gr/serafeim/conf/config.kt: -------------------------------------------------------------------------------- 1 | package gr.serafeim.conf 2 | 3 | import com.sksamuel.hoplite.* 4 | import org.apache.lucene.analysis.Analyzer 5 | import org.slf4j.LoggerFactory 6 | import java.io.File 7 | 8 | 9 | data class Parser(val parseDirectory: String, 10 | val dataDirectory: String, 11 | val interval: Int, 12 | val pageSize: Int, 13 | val parseExtensions: List, 14 | val analyzerClazzString: String, 15 | val externalTikaConfig: String?) 16 | data class Server(val host: String, val port: Int, val userUsername: String, val userPassword: String, val adminUsername: String, val adminPassword: String) 17 | data class Config(val parser: Parser, val server: Server) 18 | 19 | 20 | object ConfigHolder { 21 | val logger = LoggerFactory.getLogger("ConfigHolder") 22 | private lateinit var analyzerClazz: Class<*> 23 | lateinit var config: Config 24 | 25 | fun init(f: File?) 
{ 26 | 27 | val configLB = ConfigLoaderBuilder.default() 28 | 29 | if(f!=null) { 30 | // logger.info("Config with $f") 31 | configLB.addFileSource(f, optional = false) 32 | } 33 | 34 | config = configLB 35 | .addResourceSource("/application.props") 36 | .build() 37 | .loadConfigOrThrow() 38 | analyzerClazz = Class.forName(config.parser.analyzerClazzString) 39 | 40 | } 41 | 42 | fun getAnalyzerInstance(): Analyzer { 43 | val t = analyzerClazz.getDeclaredConstructor().newInstance() 44 | if(t is Analyzer) { 45 | return t 46 | } else { 47 | throw Exception("$analyzerClazz is not an analyzer") 48 | } 49 | } 50 | } -------------------------------------------------------------------------------- /tika-config-ocr.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 8 | 9 | 10 | 11 | 12 | false 13 | gray 14 | 300 15 | 4 16 | false 17 | triangle 18 | 19 | 22 | eng 23 | 2147483647 24 | 0 25 | 1 26 | 27 | false 28 | 200 29 | false 30 | 31 | c:/util/tesseract-ocr 32 | 120 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /src/main/resources/tika-config.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 34 | 35 | -------------------------------------------------------------------------------- /src/main/kotlin/gr/serafeim/cui.kt: -------------------------------------------------------------------------------- 1 | package gr.serafeim 2 | 3 | import com.github.ajalt.clikt.core.CliktCommand 4 | import com.github.ajalt.clikt.core.subcommands 5 | import com.github.ajalt.clikt.parameters.arguments.argument 6 | import com.github.ajalt.clikt.parameters.options.option 7 | import com.github.ajalt.clikt.parameters.types.file 8 | import gr.serafeim.conf.ConfigHolder 9 | import gr.serafeim.search.Result 10 | import gr.serafeim.search.SearchHolder 11 | import gr.serafeim.search.getLuceneDirName 12 | import 
gr.serafeim.search.parse 13 | import gr.serafeim.web.SearchParams 14 | import java.io.File 15 | import java.nio.file.Files 16 | import java.nio.file.Path 17 | 18 | class Server(): CliktCommand() { 19 | override fun run() { 20 | server() 21 | } 22 | } 23 | 24 | class Search(): CliktCommand() { 25 | val search by argument(help="What to search for") 26 | override fun run() { 27 | println("Search") 28 | val sh = SearchHolder.search(SearchParams(search, 10, 1)) 29 | if(sh.total == 0) { 30 | println("Empty results") 31 | } 32 | for(r: Result in sh.results) { 33 | println(r.path) 34 | 35 | } 36 | } 37 | } 38 | 39 | class Info(): CliktCommand() { 40 | override fun run() { 41 | println("Info") 42 | println("- Config") 43 | println(ConfigHolder.config) 44 | 45 | println("- Number of docs on map ${DBHolder.map.keys.size}") 46 | try { 47 | val sh = SearchHolder.getTotalDocs() 48 | println("- Number of docs on index $sh") 49 | } catch (e: Throwable) { 50 | println("- No lucene index") 51 | } 52 | 53 | } 54 | } 55 | 56 | class Main: CliktCommand() { 57 | val configFile by option("-c", "--config", help="Config file").file() 58 | 59 | override fun run() { 60 | ConfigHolder.init(configFile) 61 | 62 | } 63 | } 64 | 65 | class Parse(): CliktCommand() { 66 | override fun run() { 67 | val dir = ConfigHolder.config.parser.parseDirectory 68 | println("Parsing, from directory $dir") 69 | parse(dir) 70 | } 71 | } 72 | 73 | class Clear(): CliktCommand() { 74 | override fun run() { 75 | val dir = ConfigHolder.config.parser.dataDirectory 76 | println("Clearing data from directory $dir") 77 | DBHolder.map.clear() 78 | DBHolder.db.commit() 79 | val luceneDirName = getLuceneDirName() 80 | Files.walk(luceneDirName) 81 | .sorted(Comparator.reverseOrder()) 82 | .map(Path::toFile) 83 | .forEach(File::delete); 84 | } 85 | } 86 | 87 | fun main(args: Array) = Main().subcommands( 88 | Server(), Search(), Parse(), Info(), Clear() 89 | ).main(args) 
-------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 15 | @rem 16 | 17 | @if "%DEBUG%" == "" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%" == "" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 34 | 35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 37 | 38 | @rem Find java.exe 39 | if defined JAVA_HOME goto findJavaFromJavaHome 40 | 41 | set JAVA_EXE=java.exe 42 | %JAVA_EXE% -version >NUL 2>&1 43 | if "%ERRORLEVEL%" == "0" goto execute 44 | 45 | echo. 46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 47 | echo. 
48 | echo Please set the JAVA_HOME variable in your environment to match the 49 | echo location of your Java installation. 50 | 51 | goto fail 52 | 53 | :findJavaFromJavaHome 54 | set JAVA_HOME=%JAVA_HOME:"=% 55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 56 | 57 | if exist "%JAVA_EXE%" goto execute 58 | 59 | echo. 60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 61 | echo. 62 | echo Please set the JAVA_HOME variable in your environment to match the 63 | echo location of your Java installation. 64 | 65 | goto fail 66 | 67 | :execute 68 | @rem Setup the command line 69 | 70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 71 | 72 | 73 | @rem Execute Gradle 74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 75 | 76 | :end 77 | @rem End local scope for the variables with windows NT shell 78 | if "%ERRORLEVEL%"=="0" goto mainEnd 79 | 80 | :fail 81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 82 | rem the _cmd.exe /c_ return code! 
83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 84 | exit /b 1 85 | 86 | :mainEnd 87 | if "%OS%"=="Windows_NT" endlocal 88 | 89 | :omega 90 | -------------------------------------------------------------------------------- /src/main/kotlin/gr/serafeim/server.kt: -------------------------------------------------------------------------------- 1 | package gr.serafeim 2 | 3 | import gr.serafeim.conf.ConfigHolder 4 | import gr.serafeim.web.* 5 | import io.ktor.server.application.* 6 | import io.ktor.server.auth.* 7 | import io.ktor.server.engine.* 8 | import io.ktor.server.jetty.* 9 | import io.ktor.server.pebble.* 10 | import io.ktor.server.routing.* 11 | import io.pebbletemplates.pebble.loader.ClasspathLoader 12 | import org.slf4j.LoggerFactory 13 | 14 | 15 | object StateHolder { 16 | var parsing = false 17 | } 18 | 19 | fun server() { 20 | val logger = LoggerFactory.getLogger("server") 21 | logger.info("Starting server...") 22 | val config = ConfigHolder.config 23 | 24 | val userUsername = config.server.userUsername 25 | val userPassword = config.server.userPassword 26 | val adminUsername = config.server.adminUsername 27 | val adminPassword = config.server.adminPassword 28 | 29 | gr.serafeim.search.init(config.parser.parseDirectory, config.parser.interval) 30 | 31 | embeddedServer(Jetty, port = config.server.port, host = config.server.host, watchPaths = listOf("classes", "resources")) { 32 | install(Pebble) { 33 | loader(ClasspathLoader().apply { 34 | prefix = "templates" 35 | }) 36 | } 37 | 38 | install(Authentication) { 39 | basic("auth-basic-user") { 40 | realm = "User access" 41 | validate { credentials -> 42 | if (credentials.name == userUsername && credentials.password == userPassword) { 43 | UserIdPrincipal(credentials.name) 44 | } else { 45 | null 46 | } 47 | } 48 | } 49 | 50 | basic("auth-basic-admin") { 51 | realm = "Admin access" 52 | validate { credentials -> 53 | if (credentials.name == adminUsername && credentials.password == adminPassword) { 54 | 
UserIdPrincipal(credentials.name) 55 | } else { 56 | null 57 | } 58 | } 59 | } 60 | } 61 | 62 | routing { 63 | 64 | if (userUsername != "" && userPassword != "") { 65 | authenticate("auth-basic-user") { 66 | index(config.parser.pageSize) 67 | aboutRoute() 68 | downloadFile() 69 | } 70 | } else { 71 | index(config.parser.pageSize) 72 | aboutRoute() 73 | downloadFile() 74 | } 75 | if (adminUsername != "" && adminPassword != "") { 76 | authenticate("auth-basic-admin") { 77 | listKeysRoute() 78 | statusRoute() 79 | } 80 | } else { 81 | listKeysRoute() 82 | statusRoute() 83 | } 84 | } 85 | }.start(wait = true) 86 | 87 | } 88 | -------------------------------------------------------------------------------- /src/main/resources/templates/base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Doc search 8 | 9 | 14 | 15 | 16 | 34 | {% block extra_style %}{% endblock %} 35 | 36 | 37 | 38 |

39 |

40 | 43 | 46 | 63 |

64 |

65 |

66 | {% block content %} 67 | {% endblock %} 68 | 69 | 70 | 83 |

84 | 85 | 86 | 87 | 88 | 89 | {% block extra_script %}{% endblock %} 90 | 91 | 92 | -------------------------------------------------------------------------------- /src/main/kotlin/gr/serafeim/search/tools.kt: -------------------------------------------------------------------------------- 1 | package gr.serafeim.search 2 | 3 | import gr.serafeim.conf.ConfigHolder 4 | import gr.serafeim.web.SearchParams 5 | import gr.serafeim.web.dateToMillis 6 | import gr.serafeim.web.fromDateString 7 | import org.apache.lucene.analysis.Analyzer 8 | import org.apache.lucene.document.Document 9 | import org.apache.lucene.document.LongPoint 10 | import org.apache.lucene.index.DirectoryReader 11 | import org.apache.lucene.index.Term 12 | import org.apache.lucene.queryparser.classic.QueryParser 13 | import org.apache.lucene.search.* 14 | import org.apache.lucene.search.highlight.Highlighter 15 | import org.apache.lucene.search.highlight.QueryScorer 16 | import org.apache.lucene.search.highlight.SimpleHTMLFormatter 17 | import org.apache.lucene.search.highlight.SimpleSpanFragmenter 18 | import org.apache.lucene.store.Directory 19 | import org.apache.lucene.store.FSDirectory 20 | import org.slf4j.Logger 21 | import org.slf4j.LoggerFactory 22 | import java.nio.file.Path 23 | import java.nio.file.Paths 24 | import java.util.* 25 | 26 | 27 | data class Results(val results: List, val total: Int) 28 | 29 | data class Result( 30 | val id: String, 31 | val text: String, 32 | val hfragments: List, 33 | val name: String, 34 | val path: List, 35 | val created: Date, 36 | val modified: Date, 37 | val accessed: Date 38 | ) 39 | 40 | fun addDateQuery(bqb: BooleanQuery.Builder, dateFrom: Date?, dateTo: Date?, what: String) { 41 | if (dateFrom != null || dateTo != null) { 42 | val fromMillis = dateToMillis(dateFrom, 0L) 43 | val toMillis = dateToMillis(dateTo, Long.MAX_VALUE) 44 | val query3: Query = LongPoint.newRangeQuery(what, fromMillis, toMillis) 45 | bqb.add(query3, BooleanClause.Occur.FILTER) 
46 | } 47 | } 48 | 49 | fun getLuceneDirName(): Path { 50 | return Paths.get(ConfigHolder.config.parser.dataDirectory, "lucene_index") 51 | } 52 | 53 | fun getLuceneDir(): Directory { 54 | return FSDirectory.open(getLuceneDirName()) 55 | } 56 | 57 | object SearchHolder { 58 | private val logger: Logger = LoggerFactory.getLogger("Search") 59 | private val directory: Directory = getLuceneDir() 60 | private val reader: DirectoryReader = DirectoryReader.open(directory) 61 | private val indexSearcher = IndexSearcher(reader) 62 | private val analyzer: Analyzer = ConfigHolder.getAnalyzerInstance() 63 | 64 | init { 65 | logger.info("Search Singleton class invoked.") 66 | } 67 | fun search(sp: SearchParams): Results { 68 | 69 | // https://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser 70 | val query1 = QueryParser("text", analyzer).parse(sp.q) 71 | val bqb = BooleanQuery.Builder() 72 | bqb.add(query1, BooleanClause.Occur.SHOULD) 73 | val query2: Query = WildcardQuery(Term("name", sp.q)) 74 | bqb.add(query2, BooleanClause.Occur.SHOULD) 75 | bqb.setMinimumNumberShouldMatch(1) 76 | 77 | val query2a = QueryParser("name_t", analyzer).parse(sp.q) 78 | bqb.add(query2a, BooleanClause.Occur.SHOULD) 79 | bqb.setMinimumNumberShouldMatch(1) 80 | 81 | addDateQuery(bqb, sp.createdFrom, sp.createdTo, "created_point") 82 | addDateQuery(bqb, sp.modifiedFrom, sp.modifiedTo, "modified_point") 83 | addDateQuery(bqb, sp.accessedFrom, sp.accessedTo, "accessed_point") 84 | 85 | if (sp.path != null && sp.path != "") { 86 | val query4: Query = WildcardQuery(Term("path", sp.path)) 87 | bqb.add(query4, BooleanClause.Occur.FILTER) 88 | } 89 | 90 | if (sp.ext != null && sp.ext != "") { 91 | val query5: Query = WildcardQuery(Term("extension", sp.ext)) 92 | bqb.add(query5, BooleanClause.Occur.FILTER) 93 | } 94 | 95 | val booleanQuery = bqb.build() 96 | val collector = TopScoreDocCollector.create(99999, 100) 97 | 98 | indexSearcher.search(booleanQuery, collector) 99 | 
100 | val start = (sp.p - 1) * sp.n 101 | val howmany = sp.n 102 | 103 | // Highlight 104 | val formatter = SimpleHTMLFormatter("", ""); 105 | val queryScorer = QueryScorer(booleanQuery); 106 | val highlighter = Highlighter(formatter, queryScorer); 107 | highlighter.textFragmenter = SimpleSpanFragmenter(queryScorer, Int.MAX_VALUE) 108 | highlighter.maxDocCharsToAnalyze = Int.MAX_VALUE 109 | 110 | val fragmentHighlighter = Highlighter(formatter, queryScorer); 111 | fragmentHighlighter.textFragmenter = SimpleSpanFragmenter(queryScorer, 30) 112 | fragmentHighlighter.maxDocCharsToAnalyze = Int.MAX_VALUE 113 | 114 | val results = collector.topDocs(start, howmany).scoreDocs.map { 115 | 116 | val doc: Document = indexSearcher.doc(it.doc) 117 | val id = doc.get("id") 118 | val path = doc.getValues("path").asList() 119 | val name = doc.get("name") 120 | val text = doc.get("text") 121 | 122 | val created = fromDateString(doc.get("created")) 123 | val accessed = fromDateString(doc.get("accessed")) 124 | val modified = fromDateString(doc.get("modified")) 125 | 126 | val fragments = fragmentHighlighter.getBestFragments(analyzer.tokenStream("text", text), text, 10) 127 | 128 | val htext = highlighter.getBestFragment(analyzer, "text", text)?:text; 129 | Result( 130 | id = id, 131 | text = htext.take(10*1024), 132 | hfragments = fragments.toList(), 133 | name = name, 134 | path = path, 135 | accessed = accessed, 136 | modified = modified, 137 | created = created 138 | ) 139 | } 140 | return Results(results = results, total = collector.totalHits) 141 | } 142 | 143 | fun getTotalDocs(): Long { 144 | val q = MatchAllDocsQuery() 145 | return indexSearcher.search(q, Int.MAX_VALUE).totalHits?.value?:0L 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /src/main/kotlin/gr/serafeim/web/routes.kt: -------------------------------------------------------------------------------- 1 | package gr.serafeim.web 2 | 3 | import gr.serafeim.* 4 | 
import gr.serafeim.conf.ConfigHolder
import gr.serafeim.search.Result
import gr.serafeim.search.SearchHolder
import io.ktor.http.*
import io.ktor.server.application.*
import io.ktor.server.pebble.*
import io.ktor.server.response.*
import io.ktor.server.routing.*

import java.io.File
import java.util.*
import java.net.URLEncoder;

/**
 * Parameters of a single search request.
 *
 * @property q the query string as typed by the user
 * @property n page size (number of results per page)
 * @property p 1-based page number
 * @property path optional folder-name filter
 * @property ext optional file-extension filter
 * @property createdFrom optional lower bound for the creation date
 * @property createdTo optional upper bound for the creation date
 * @property modifiedFrom optional lower bound for the modification date
 * @property modifiedTo optional upper bound for the modification date
 * @property accessedFrom optional lower bound for the last-access date
 * @property accessedTo optional upper bound for the last-access date
 */
data class SearchParams(
    val q: String,
    val n: Int,
    val p: Int,
    val path: String? = null,
    val ext: String? = null,
    val createdFrom: Date? = null,
    val createdTo: Date? = null,
    val modifiedFrom: Date? = null,
    val modifiedTo: Date? = null,
    val accessedFrom: Date? = null,
    val accessedTo: Date? = null
)

/**
 * Reads the query parameter [name] as an integer >= 1.
 *
 * Returns [default] when the parameter is missing, is not a number, or is
 * smaller than 1, so a hand-edited URL cannot crash the handler.
 */
private fun ApplicationCall.positiveIntParam(name: String, default: Int): Int =
    request.queryParameters[name]?.toIntOrNull()?.takeIf { it >= 1 } ?: default

/**
 * Registers `GET /docs`: renders `docs.html` with one page of the entries of
 * [DBHolder.map], optionally restricted to keys containing the `query` parameter.
 *
 * Query parameters: `query` (substring filter, default empty), `page`
 * (1-based, default 1) and `page_size` (default 10).
 */
fun Route.listKeysRoute() {
    get("/docs") {
        val docsmap = DBHolder.map
        val q = call.request.queryParameters["query"] ?: ""
        val p = call.positiveIntParam("page", 1)
        val psize = call.positiveIntParam("page_size", 10)
        val docs = if (q != "") docsmap.filter { q in it.key } else docsmap

        // Computed in Long and clamped so a huge page number cannot overflow
        // into a negative offset (drop() rejects negative counts).
        val offset = ((p - 1).toLong() * psize).coerceAtMost(Int.MAX_VALUE.toLong()).toInt()
        // Only the requested page is turned into pairs, not the whole map.
        val pageEntries = docs.entries.asSequence()
            .drop(offset)
            .take(psize)
            .map { Pair(it.key, it.value) }
            .toList()

        call.respond(
            PebbleContent(
                "docs.html", mapOf(
                    "docs" to pageEntries,
                    "docSize" to docsmap.keys.size,
                    "q" to q,
                    "page" to p,
                    // Paging follows the filtered list; using the unfiltered
                    // size would offer "next" links to empty pages.
                    "next_page" to nextPage(call.request, p, psize, docs.size),
                    "prev_page" to prevPage(call.request, p)
                )
            )
        )
    }
}

/**
 * Registers `GET /status`: renders `status.html` with the number of keys in
 * [DBHolder.map], the current [StateHolder.parsing] flag and the active
 * configuration.
 */
fun Route.statusRoute() {
    get("/status") {
        val map = DBHolder.map
        call.respond(
            PebbleContent(
                "status.html", mapOf(
                    "keySize" to map.keys.size,
                    "parsing" to StateHolder.parsing,
                    "config" to ConfigHolder.config
                )
            )
        )
    }
}

/** Registers `GET /about`: renders `about.html` with an empty model. */
fun Route.aboutRoute() {
    get("/about") {
        call.respond(PebbleContent("about.html", emptyMap()))
    }
}

/**
 * Registers `GET /download?path=...`: sends the file at `path` as an attachment.
 *
 * Only paths that are keys of [DBHolder.map] are served; anything else, or a
 * recorded path that is no longer a regular file on disk, gets an explicit
 * 404 response.
 */
fun Route.downloadFile() {
    get("/download") {
        val path = call.request.queryParameters["path"] ?: ""
        val file = File(path)
        if (path !in DBHolder.map.keys || !file.isFile) {
            call.respond(HttpStatusCode.NotFound)
            return@get
        }
        val cdVal = ContentDisposition.Attachment.withParameter(
            ContentDisposition.Parameters.FileName,
            // Explicit charset: the one-argument overload is deprecated and
            // depends on the platform default encoding.
            URLEncoder.encode(file.name, "UTF-8")
        )
        call.response.header(HttpHeaders.ContentDisposition, cdVal.toString())
        call.respondFile(file)
    }
}

/**
 * Registers `GET /`: runs a search and renders `home.html`.
 *
 * Query parameters: `query`, `page` (1-based, default 1), `path`, `ext` and
 * the date filters `created-from`, `created-to`, `modified-from`,
 * `modified-to`, `accessed-from`, `accessed-to`. No search is executed when
 * `query` is empty.
 *
 * @param pageSize number of results per page
 */
fun Route.index(pageSize: Int) {

    get("/") {
        val params = call.request.queryParameters
        val q = params["query"] ?: ""
        val p = call.positiveIntParam("page", 1)
        val path = params["path"] ?: ""
        val ext = params["ext"] ?: ""
        val createdFromStr = params["created-from"] ?: ""
        val createdToStr = params["created-to"] ?: ""
        val modifiedFromStr = params["modified-from"] ?: ""
        val modifiedToStr = params["modified-to"] ?: ""
        // These used to be read from "modified-from"/"modified-to", so the
        // access-date filter silently mirrored the modification-date filter.
        // NOTE(review): parameter names follow the created-*/modified-*
        // pattern; confirm they match the input names in home.html.
        val accessedFromStr = params["accessed-from"] ?: ""
        val accessedToStr = params["accessed-to"] ?: ""

        var totalTime = 0L
        var total = 0
        var results = listOf<Result>()
        if (q != "") {
            val sp = SearchParams(
                q = q,
                p = p,
                n = pageSize,
                path = path,
                ext = ext,
                createdFrom = toDate(createdFromStr),
                createdTo = toDate(createdToStr),
                modifiedFrom = toDate(modifiedFromStr),
                modifiedTo = toDate(modifiedToStr),
                accessedFrom = toDate(accessedFromStr),
                accessedTo = toDate(accessedToStr),
            )
            try {
                val startTime = System.nanoTime()
                val rt = SearchHolder.search(sp)
                results = rt.results
                total = rt.total
                totalTime = System.nanoTime() - startTime
            } catch (e: org.apache.lucene.queryparser.classic.ParseException) {
                // A malformed query is a user error: the page is rendered
                // with no results instead of failing the request.
                e.printStackTrace()
            }
        }
        call.respond(
            PebbleContent(
                "home.html", mapOf(
                    "results" to results,
                    "total" to total,
                    "q" to q,
                    "page" to p,
                    "totalTime" to totalTime,
                    "showingFrom" to pageSize * (p - 1) + 1,
                    "showingTo" to minOf(pageSize * p, total),
                    "created_from" to createdFromStr,
                    "created_to" to createdToStr,
                    "modified_from" to modifiedFromStr,
                    "modified_to" to modifiedToStr,
                    "accessed_from" to accessedFromStr,
                    "accessed_to" to accessedToStr,
                    "path" to path,
                    "ext" to ext,
                    "next_page" to nextPage(call.request, p, pageSize, total),
                    "prev_page" to prevPage(call.request, p)
                )
            )
        )
    }
}
-------------------------------------------------------------------------------- /src/main/kotlin/gr/serafeim/search/lucene_parser.kt.kt: -------------------------------------------------------------------------------- 1 | package gr.serafeim.search 2 | 3 | import gr.serafeim.conf.ConfigHolder 4 | import gr.serafeim.DBHolder 5 | import gr.serafeim.StateHolder 6 | import kotlinx.coroutines.* 7 | import kotlinx.coroutines.sync.Semaphore 8 | import kotlinx.coroutines.sync.withPermit 9 | import org.apache.lucene.analysis.Analyzer 10 | import org.apache.lucene.backward_codecs.lucene95.Lucene95Codec 11 | import org.apache.lucene.codecs.PostingsFormat 12 | import org.apache.lucene.codecs.lucene99.Lucene99Codec 13 | import org.apache.lucene.document.* 14 | import org.apache.lucene.index.IndexReader 15 | import 
org.apache.lucene.index.IndexWriter
import org.apache.lucene.index.IndexWriterConfig
import org.apache.lucene.index.Term
import org.apache.lucene.store.Directory
import org.apache.lucene.store.FSDirectory
import org.apache.tika.Tika
import org.apache.tika.config.TikaConfig
import org.mapdb.HTreeMap
import org.slf4j.LoggerFactory
import java.io.File
import java.nio.file.Files
import java.nio.file.Paths
import java.nio.file.attribute.BasicFileAttributes
import java.nio.file.attribute.FileTime
import java.util.*
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.TimeUnit
import java.util.concurrent.atomic.AtomicInteger
import kotlin.concurrent.schedule
import kotlin.io.path.absolutePathString


val logger = LoggerFactory.getLogger("LuceneParser")

/**
 * Starts the periodic indexing of [directory].
 *
 * The first run starts immediately on a timer thread named "Parser"; further
 * runs follow every [interval] minutes.
 *
 * @param directory root directory to scan
 * @param interval minutes between two runs
 * @throws Exception if the "Lucene99" postings format is not available
 */
fun init(directory: String, interval: Int) {
    val x = PostingsFormat.availablePostingsFormats()
    if (!x.contains("Lucene99")) {
        throw Exception("Lucene99 Not found!")
    }
    // NOTE(review): the instance is not used afterwards; presumably created
    // so that a missing codec class fails here rather than later - confirm
    // or remove.
    Lucene99Codec()
    logger.info("Lucene parser init, directory: $directory, interval: $interval minutes")
    Timer("Parser").schedule(0, TimeUnit.MINUTES.toMillis(interval.toLong())) {
        logger.debug("Parse START init....")
        try {
            parse(directory)
        } catch (e: Exception) {
            // An exception escaping a TimerTask terminates the timer thread
            // and cancels every future run, so a failed run is logged and
            // retried at the next tick instead.
            logger.error("Parse run failed, will retry on the next scheduled run", e)
        }
    }
}

/** Formats [ft] as a Lucene date string with minute resolution. */
fun toDateString(ft: FileTime): String =
    DateTools.timeToString(ft.toMillis(), DateTools.Resolution.MINUTE)

/**
 * Builds the [Tika] facade used to extract text.
 *
 * Uses the file named by `parser.externalTikaConfig` when it is set, and the
 * bundled `/tika-config.xml` classpath resource otherwise. The returned
 * instance has no limit on the extracted string length.
 *
 * @throws IllegalStateException if the bundled configuration is missing
 */
fun configureTika(): Tika {
    val externalConfig = ConfigHolder.config.parser.externalTikaConfig
    val cfg = if (externalConfig == null) {
        logger.info("Using default tika config")
        val stream = object {}.javaClass.getResourceAsStream("/tika-config.xml")
            ?: throw IllegalStateException("Bundled /tika-config.xml not found on the classpath")
        // Close the resource stream once the configuration has been read.
        stream.use { TikaConfig(it) }
    } else {
        logger.info("Using custom tika config")
        TikaConfig(externalConfig)
    }

    val tika = Tika(cfg)

    // Allow tika to read unlimited characters
    tika.maxStringLength = -1
    logger.debug("Will read up to ${tika.maxStringLength} length")
    return tika
}

/**
 * Opens an [IndexWriter] on the directory returned by `getLuceneDir()`, using
 * the analyzer configured in [ConfigHolder]. The caller must close the writer.
 */
fun configureIndexWriter(): IndexWriter {
    // We open a File System directory as we want to store the index on our local file system.
    val directory: Directory = getLuceneDir()

    // The analyzer is used to perform analysis on text of documents and create the terms that will be added in the index.
    val analyzer: Analyzer = ConfigHolder.getAnalyzerInstance()
    val indexWriterConfig = IndexWriterConfig(analyzer)

    // NOTE: IndexWriter instances are completely thread safe, meaning multiple threads can call any of its methods, concurrently. If your application requires external synchronization, you should not synchronize on the IndexWriter instance as this may cause deadlock; use your own (non-Lucene) objects instead.
    return IndexWriter(directory, indexWriterConfig)
}

/**
 * Extracts the text of one file and adds or replaces its document in the index.
 *
 * The file is skipped when [map] already holds an entry for its path whose
 * recorded modification time is not older than the current one. After
 * processing, the entry is set to `Pair(parsedOk, modifiedMillis)`; files Tika
 * cannot parse are recorded with `false` so they are not retried until they
 * change.
 *
 * NOTE(review): the generic arguments of `HTreeMap` are missing in this copy
 * of the file. `String` keys follow from the usage; the value type `Any` is
 * an assumption based on the explicit cast below - confirm against the
 * declaration of `DBHolder.map`.
 *
 * @param it the file to index
 * @param indexWriter writer receiving the document
 * @param tika extractor used to obtain the text
 * @param map path -> (parsed successfully, last modified millis) bookkeeping
 */
fun parseDocument(it: File, indexWriter: IndexWriter, tika: Tika, map: HTreeMap<String, Any>) {
    logger.debug(it.name)
    val attrs = Files.readAttributes(Paths.get(it.path), BasicFileAttributes::class.java)
    val modified = attrs.lastModifiedTime().toMillis()

    @Suppress("UNCHECKED_CAST")
    val existingModTime: Pair<Boolean, Long>? = (map[it.path] as Pair<Boolean, Long>?)
    logger.debug("Existing mod time is $existingModTime and current mod time is $modified")
    print("Parsing ${it.name}\r")

    if (existingModTime != null && existingModTime.second >= modified) {
        logger.debug("Skipping the file, not changed since we last saw it")
        return
    }

    logger.debug("Need to parse and index ${it.name}")

    val content: String = try {
        tika.parseToString(it.absoluteFile)
    } catch (e: Exception) {
        e.printStackTrace()
        logger.info("File ${it.path} cannot be parsed, skipping")
        map[it.path] = Pair(false, modified)
        return
    }

    val doc = Document()

    doc.add(StringField("id", it.path, Field.Store.YES))
    doc.add(TextField("text", content, Field.Store.YES))
    doc.add(StringField("name", it.name, Field.Store.YES))
    doc.add(TextField("name_t", it.name, Field.Store.NO))
    // One "path" value per path segment.
    it.path.split(File.separator).forEach { segment ->
        doc.add(StringField("path", segment, Field.Store.YES))
    }
    doc.add(StringField("extension", it.extension, Field.Store.YES))

    doc.add(StringField("created", toDateString(attrs.creationTime()), Field.Store.YES))
    doc.add(StringField("accessed", toDateString(attrs.lastAccessTime()), Field.Store.YES))
    doc.add(StringField("modified", toDateString(attrs.lastModifiedTime()), Field.Store.YES))

    doc.add(LongPoint("created_point", attrs.creationTime().toMillis()))
    doc.add(LongPoint("modified_point", attrs.lastModifiedTime().toMillis()))
    doc.add(LongPoint("accessed_point", attrs.lastAccessTime().toMillis()))

    indexWriter.updateDocument(Term("id", it.path), doc)
    // Recorded only after the document reached the writer, so a failed
    // update is retried on the next run instead of being marked as done.
    map[it.path] = Pair(true, modified)
}

/**
 * Runs one full indexing pass over [sdir].
 *
 * Walks the tree top-down and indexes every regular file whose lower-cased
 * extension is listed in `parser.parseExtensions` and whose name does not
 * start with `~$`, at most four files at a time. Afterwards it removes
 * entries for files that were not seen in this pass, commits the bookkeeping
 * database and closes the index writer.
 *
 * [StateHolder.parsing] is `true` for the duration of the pass and is reset
 * even when the pass fails.
 *
 * NOTE(review): if [sdir] is temporarily unreachable the walk yields no files
 * and every known document is treated as deleted - confirm this is acceptable.
 */
fun parse(sdir: String) {
    logger.info("Parse START, extensions are ${ConfigHolder.config.parser.parseExtensions}")
    logger.info("Parse directory is ${Paths.get(sdir).absolutePathString()}")
    StateHolder.parsing = true

    try {
        val tika = configureTika()
        val uniquePaths = ConcurrentHashMap.newKeySet<String>()
        val requestSemaphore = Semaphore(4)
        // Jobs run on several threads, so the counter must be atomic.
        val totJobs = AtomicInteger(0)

        // use {} closes the writer even if the pass fails midway.
        configureIndexWriter().use { indexWriter ->
            // Jobs are children of runBlocking, which returns only after all
            // of them have finished.
            runBlocking {
                File(sdir).walk(direction = FileWalkDirection.TOP_DOWN).forEach { file ->
                    val wanted = file.isFile &&
                        !file.name.startsWith("~$") &&
                        ConfigHolder.config.parser.parseExtensions.contains(file.extension.lowercase())
                    if (wanted) {
                        uniquePaths.add(file.path)

                        launch(Dispatchers.Default) {
                            requestSemaphore.withPermit {
                                logger.debug("Start job, ${totJobs.incrementAndGet()}")
                                try {
                                    parseDocument(file, indexWriter, tika, DBHolder.map)
                                } catch (e: CancellationException) {
                                    throw e
                                } catch (e: Exception) {
                                    // One unreadable file must not abort the pass.
                                    logger.error("Failed to index ${file.path}", e)
                                } finally {
                                    logger.debug("End job, ${totJobs.decrementAndGet()}")
                                }
                            }
                        }
                    }
                }
            }

            clearDeleted(DBHolder.map, uniquePaths, indexWriter)

            DBHolder.db.commit()

            logger.info("Docs Indexed Successfully!")
        }
    } finally {
        StateHolder.parsing = false
    }
}

/**
 * Removes from [map] and from the index every path that is recorded in [map]
 * but is not contained in [uniquePaths] (the paths seen in the current pass).
 */
private fun clearDeleted(
    map: HTreeMap<String, Any>,
    uniquePaths: ConcurrentHashMap.KeySetView<String, Boolean>,
    indexWriter: IndexWriter
) {
    // Snapshot the keys first so entries can be removed while iterating.
    val remaining = map.map { it.key }.toSet() - uniquePaths
    logger.debug("Clear deleted, remaining ${remaining}")
    remaining.forEach { stalePath ->
        map.remove(stalePath)
        indexWriter.deleteDocuments(Term("id", stalePath))
    }
}
-------------------------------------------------------------------------------- /.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 8 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 26 | 27 | 28 |

29 |

30 |

31 |

32 |

33 | 36 |

37 |

38 |

39 |

40 |

41 | Folder name 42 | 43 |

44 |

45 | File extension 46 | 47 |

48 | 49 |

50 | Created date from 51 | 52 |

53 |

54 | Created date to 55 | 56 |

57 |

58 | Modified date from 59 | 60 |

61 |

62 | Modified date to 63 | 64 |

65 |

66 | Access date from 67 | 68 |

69 |

70 | Access date to 71 | 72 |

73 |

74 |

75 |

76 |

77 |

78 |

79 |

80 | 81 | Clear 82 | 83 | 84 | 87 | 88 |

89 | 90 | 91 |

92 |

93 |

94 | 98 |

99 |

Simple: Enter a word and it will be searched for using the stemming rules of the configured language (i.e. if you search for "apple" it will also return documents containing "apples").
Phrase: If you want to search for an exact phrase, e.g. "hello, world", you need to enter it between quotes. If you enter two words without the quotes it will search for documents containing at least one of these words. So searching for hello, world (without quotes) will return documents containing hello and documents containing world (see boolean search for more explanation).
Wildcard: You can do wildcard searches: searching for app* will return documents containing apple, applying or application. Use ? for a single letter, * for any number of characters and + for at least one character. The wildcard character cannot be at the start of your query, i.e. *ppl will not work.
Boolean: You can use boolean operators like AND, OR and NOT (written in capital letters) to create more complex queries. Things like (apple AND orange) OR (NOT strawberry) should work.
Always include/exclude: You can use the + or - operators before a word (or phrase) to include or exclude documents containing it. For example +apple +orange -strawberry will return documents containing apple and orange but not strawberry.
Distance: You can search by distance using the ~ operator. For example, "commit local"~3 will search for documents that have the words commit and local within a distance of 3 words of each other. That means that a document containing the phrase "commit all changes to local dev" will be returned but a document with the phrase "commit all changes to production and local dev" will not be returned.
Filtering: You can use the extra search choices to filter based on the name of the folder that contains the document, its file extension or its created/modified/accessed date. For example, if you enter appl* in the folder field, only documents contained within a folder named apples or applications will be returned (this includes all ancestor folders).
Combinations: You can use all of the above in any combination. For example, +"commit local"~3 +download -conf* will search for documents that contain the word commit near the word local and the word download, but do not contain any word starting with conf.

109 |

110 | 113 |

114 |

115 |

116 | {% if q != "" %} 117 | 118 |

Results

119 | 120 | 121 | Total time: {{ totalTime / 1000000 }} ms 122 | 123 | {% if results.size() > 0 %} 124 |

125 | Showing {{ showingFrom }} - {{ showingTo }} from {{ total }} results
126 |

127 | 134 | {% endif %} 135 | 136 | 137 | {% for r in results %} 138 |

139 |

140 |

141 |

142 | 143 | {{ r.id }} 144 | 148 | 149 | 150 |

151 |

152 | 153 |

Matches

154 |

{{ f|raw }}

159 | 160 | 161 |

162 |

163 |

164 | 167 |

168 |

169 |

170 | {{ r.text|raw }} 171 |

172 |

173 |

174 |

175 | 176 |

177 |

178 |

179 | Created: {{ r.created }} 180 | Modified: {{ r.modified }} 181 | Accessed: {{ r.accessed }}
182 |

183 |

184 | 185 | 186 | {% else %} 187 |

Nothing found!

188 | {% endfor %} 189 | 190 | {% if results.size() > 0 %} 191 | 198 | {% endif %} 199 | 200 | {% endif %} 201 | {% endblock %} --------------------------------------------------------------------------------