├── README.md ├── src ├── agmodule │ ├── tfidfsearch.nim │ ├── searchmanager.nim │ ├── submodule.nim │ ├── constants.nim │ ├── simplesearch.nim │ ├── utils.nim │ ├── messages.nim │ ├── tfidf.nim │ └── text.nim └── agsearch.nim ├── nim.cfg ├── agsearch.nimble └── .gitignore
/README.md:
--------------------------------------------------------------------------------
# agsearch
A Simple Search Engine for Ancient Greek

--------------------------------------------------------------------------------
/src/agmodule/tfidfsearch.nim:
--------------------------------------------------------------------------------
## smart searcher
import text
import utils
import tfidf

type
  TFIdfSearch* = object
    ## Inputs of a TF-IDF search: a list of text descriptors plus the
    ## search terms.
    ## NOTE(review): no proc in this module consumes the type yet.
    info*: seq[TextInfo] ## descriptors of the texts to search
    terms*: seq[string]  ## search terms


--------------------------------------------------------------------------------
/nim.cfg:
--------------------------------------------------------------------------------
warnings=on
checks=on
assertions=on
stackTrace=on
lineTrace=on
threads=on
styleCheck=hint
debuginfo=on
threadanalysis=on
implicitStatic=on
memTracker=on
nilseqs=off
gc=refc
parallelBuild=0
#cppCompileToNamespace=agsearch
profiler=on

--------------------------------------------------------------------------------
/src/agmodule/searchmanager.nim:
--------------------------------------------------------------------------------
## search manager for simple and smart search
import text
import simplesearch
import tfidfsearch

type
  SearcherKind* = enum
    ## Selects which search strategy the application runs.
    skBoolean, ## simple boolean search (see `simplesearch`)
    skTfIdf    ## TF-IDF scoring (see `tfidf`)

type
  SearchManager* = object
    ## Pairs the search terms with the chosen search strategy.
    terms*: seq[string]  ## search terms
    kind*: SearcherKind  ## strategy to run for `terms`


--------------------------------------------------------------------------------
/src/agmodule/submodule.nim:
--------------------------------------------------------------------------------
# This is just an example to get you started. Users of your hybrid library will
# import this file by writing ``import agmodule/submodule``. Feel free to rename or
# remove this file altogether. You may create additional modules alongside
# this file as required.

proc getWelcomeMessage*(): string = "Hello, World!"
  ## Returns the fixed greeting "Hello, World!".

--------------------------------------------------------------------------------
/src/agmodule/constants.nim:
--------------------------------------------------------------------------------
## some constants for file structure
import os

let appDir* = getAppDir().absolutePath()
  ## Absolute path of the directory that contains the running executable.

let dataDir* = joinPath(appDir, "data")
  ## `<appDir>/data`.

let rawDataDir* = joinPath(dataDir, "raw")
  ## `<appDir>/data/raw`.

let normalizedDataDir* = joinPath(dataDir, "normalized")
  ## `<appDir>/data/normalized`.

let resultsDir* = joinPath(appDir, "results")
  ## `<appDir>/results`.

let tempDir* = joinPath(appDir, "temp")
  ## `<appDir>/temp`.

let textInfoDbPath* = joinPath(dataDir, "textInfoDB.json")
  ## `<appDir>/data/textInfoDB.json`, the JSON file parsed by
  ## `utils.getTextInfoDB`.

--------------------------------------------------------------------------------
/agsearch.nimble:
--------------------------------------------------------------------------------
# Package

version       = "0.1.0"
author        = "Qm Auber"
description   = "Ancient Greek Search Engine"
license       = "MIT"
srcDir        = "src"
installExt    = @["nim"]
bin           = @["agsearch.out"]
binDir        = "bin"

backend       = "cpp"

# Dependencies

requires "nim >= 1.0.2"

task docs, "Generate documentation":
  # Reset the output directory with NimScript built-ins instead of shelling
  # out to `rm -R docs/*`: the shell command fails (and aborts the task) when
  # `docs/` is missing or empty, and it is not available on every platform.
  rmDir "docs"
  mkDir "docs"
  exec "nim doc --project --index:on --outdir:docs src/agsearch.nim"
  exec "nim buildIndex -o:docs/theindex.html docs"

--------------------------------------------------------------------------------
/src/agmodule/simplesearch.nim:
--------------------------------------------------------------------------------
## simple boolean search in text
import text
import utils

type
  SimpleSearch* = object
    ## One boolean search job: a single text descriptor and the terms to
    ## look for in that text.
    info*: TextInfo      ## descriptor of the text to search
    terms*: seq[string]  ## search terms

proc boolSearchTerms(terms: seq[string], txt: string,
    match: proc(e1: string, e2: string): bool): bool =
  ## Returns true when `match(term, txt)` holds for at least one of `terms`.
  ## Stops at the first hit; an empty `terms` yields false.
  for term in terms:
    if match(term, txt):
      return true

proc boolSearchTerms(terms: seq[string], chunks: seq[string],
    match: proc(e1: string, e2: string): bool): bool =
  ## Returns true when at least one of `terms` matches at least one of
  ## `chunks`. Stops at the first matching chunk.
  for chunk in chunks:
    if boolSearchTerms(terms, chunk, match):
      return true


proc simpleSearchTxt(
    searcher: SimpleSearch,
    match: proc(e1, e2: string): bool): bool =
  ## simple search main search function
  ##
  ## Loads the text described by `searcher.info` and reports whether any of
  ## `searcher.terms` matches any chunk of it.
  # `toText` stores an unchunked text as a single-element `chunks` seq, so
  # scanning `chunks` covers both the chunked and the unchunked case.
  let txt = toText(searcher.info)
  boolSearchTerms(searcher.terms, txt.chunks, match)

proc simpleInSearch*(searcher: SimpleSearch): bool =
  ## Boolean search using `inMatch` (`term in chunk`) as the matcher.
  simpleSearchTxt(searcher, inMatch)

proc simpleEqSearch*(searcher: SimpleSearch): bool =
  ## Boolean search using `eqMatch` (`term == chunk`) as the matcher.
  simpleSearchTxt(searcher, eqMatch)


--------------------------------------------------------------------------------
/src/agmodule/utils.nim:
--------------------------------------------------------------------------------
## utility functions
import constants

import os
import json

proc emptyTempDir*() =
  ## Removes `tempDir` with everything in it and creates it again, empty.
  removeDir(tempDir)
  createDir(tempDir)

proc destinationIn(dir, s: string): string =
  ## Path inside `dir` that keeps only the last path component of `s`.
  joinPath(dir, splitPath(s).tail)

proc moveToRawData*(s: string) =
  ## Moves file `s` into `rawDataDir`, keeping its file name.
  moveFile(s, destinationIn(rawDataDir, s))

proc moveToNormalizedData*(s: string) =
  ## Moves file `s` into `normalizedDataDir`, keeping its file name.
  moveFile(s, destinationIn(normalizedDataDir, s))

proc moveToData*(s: string) =
  ## Moves file `s` into `dataDir`, keeping its file name.
  moveFile(s, destinationIn(dataDir, s))

proc moveToTemp*(s: string) =
  ## Moves file `s` into `tempDir`, keeping its file name.
  moveFile(s, destinationIn(tempDir, s))

proc moveToResults*(s: string) =
  ## Moves file `s` into `resultsDir`, keeping its file name.
  moveFile(s, destinationIn(resultsDir, s))

proc copyToRawData*(s: string) =
  ## Copies file `s` into `rawDataDir`, keeping its file name.
  copyFile(s, destinationIn(rawDataDir, s))

proc copyToNormalizedData*(s: string) =
  ## Copies file `s` into `normalizedDataDir`, keeping its file name.
  copyFile(s, destinationIn(normalizedDataDir, s))

proc copyToData*(s: string) =
  ## Copies file `s` into `dataDir`, keeping its file name.
  copyFile(s, destinationIn(dataDir, s))

proc copyToTemp*(s: string) =
  ## Copies file `s` into `tempDir`, keeping its file name.
  copyFile(s, destinationIn(tempDir, s))

proc copyToResults*(s: string) =
  ## Copies file `s` into `resultsDir`, keeping its file name.
  copyFile(s, destinationIn(resultsDir, s))

proc getTextInfoDB*(): JsonNode =
  ## Parses the JSON file at `textInfoDbPath` and returns its root node.
  parseFile(textInfoDbPath)

proc getNodeVal*[T](n: JsonNode): T {.raises: [ValueError].} =
  ## get value of node if json node is string int float or bool
  ##
  ## `T` selects the accessor (`getStr`, `getInt`, `getFloat`, `getBool`).
  ## Any other `T` raises `ValueError` when the proc is called.
  when (T is string):
    return n.getStr()
  elif (T is int):
    return n.getInt()
  elif (T is float):
    return n.getFloat()
  elif (T is bool):
    return n.getBool()
  else:
    raise newException(ValueError, "wrong type of json node ")


proc eqMatch*[T](e1, e2: T): bool = e1 == e2
  ## True when `e1` equals `e2`.
proc inMatch*[T](e1, e2: T): bool = e1 in e2
  ## True when `e1 in e2` holds.

--------------------------------------------------------------------------------
/src/agsearch.nim:
--------------------------------------------------------------------------------
# This is just an example to get you started. A typical hybrid package
# uses this file as the main entry point of the application.
import agmodule/constants
import agmodule/utils
import agmodule/messages
import agmodule/text
import agmodule/searchmanager
import agmodule/simplesearch
import agmodule/tfidfsearch
import agmodule/tfidf

import os # paramCount
import threadpool
import strutils
import tables
import sets

const debug = true

var terms: seq[string]
var separator = ""


when isMainModule:
  printStart()
  # NOTE(review): the original indentation of this `if` body is not
  # recoverable from the flattened dump. Only the notice is kept inside it:
  # no command-line argument is parsed anywhere in this file, so the
  # interactive prompts below are the only source of search terms.
  # Confirm against the repository.
  if paramCount() == 0:
    printNoArgMessage()
  separator = handleSeparator()
  let searchTerms = gatherSearchTerms(separator)
  terms = printFilterSearchTerms(searchTerms, separator)
  let searchChoice = chooseSearchInterface()
  let infos = fetchInfos()
  let kind =
    case searchChoice
    of 1: skBoolean
    of 2: skTfIdf
    else: raise newException(ValueError, "Unknown searcher choice")
  let manager = SearchManager(terms: terms, kind: kind)

  var results: seq[TextInfo]
  var tfs: Table[string, seq[TfIdfTerm]]
  case manager.kind
  of skBoolean:
    # keep every text in which at least one term occurs
    for info in infos:
      if simpleInSearch(SimpleSearch(info: info, terms: terms)):
        results.add(info)
  of skTfIdf:
    tfs = getTfIdfTerms(toHashSet(terms), sep = " ", infos = infos)
  if debug:
    echo(terms)
    echo($(results))
    echo($(tfs))

--------------------------------------------------------------------------------
/src/agmodule/messages.nim:
--------------------------------------------------------------------------------
## simple functions for printing messages on interface
import strutils

proc printStart*() =
  ## Prints the start-up banner.
  echo("")
  echo("---------------------------------------------")
  echo("Search engine for Ancient Greek Texts Started")
  echo("---------------------------------------------")
  echo("")

proc printNoArgMessage*() =
  ## Prints the notice shown when no command-line parameter was given.
  echo(
    "You have not specified any of " &
    "the parameters through command line interface. "
  )
  echo("")

proc handleSeparator*(): string =
  ## Asks for the term separator on stdin and returns it; an empty answer
  ## selects the default `"!!!"`.
  echo("Please enter your separator. " &
      "The default one is three exclamation marks (!!!): ")
  echo("")
  result = stdin.readLine()
  if result == "":
    result = "!!!"
  echo("")

proc gatherSearchTerms*(separator: string): string =
  ## Explains the expected input format, then returns the raw line of search
  ## terms read from stdin (not yet split on `separator`).
  echo("Please enter search terms separated by " & separator)
  echo("Ex: μάχης φασὶ χρῆναι" &
      separator &
      "μάχης φασὶ χρῆναι" & separator & "ὑστεροῦμεν"
  )
  echo("")
  echo("Please do note that your terms are used as is. " &
      "No tokenization is performed")
  echo("")
  echo("Now please enter your search terms:")
  echo("")
  result = stdin.readLine()
  echo("")

proc printFilterSearchTerms*(terms, sep: string): seq[string] =
  ## Splits `terms` on `sep`, echoes every non-empty piece and returns those
  ## pieces in input order.
  echo("You have entered the following terms:")
  for term in split(terms, sep):
    if term != "":
      echo("\t"&term)
      result.add(term)

proc chooseSearchInterface*(): int {.raises: [ValueError, IOError].} =
  ## Prompts for the search interface and returns the choice: 1 (simple
  ## boolean search) or 2 (TF-IDF search).
  ##
  ## Raises `IOError` when stdin is at end of file, and `ValueError` when
  ## the answer is not an integer or is neither 1 nor 2.
  echo("")
  echo("Please choose your search interface")
  echo("Available ones are:")
  echo("")
  echo("\t1. Simple Boolean Search")
  echo("\t2. Smart TF-IDF Search")
  echo("")
  echo("Now enter your choice")
  var userChoice: string
  # `readLine` returns false at end of file; report that explicitly instead
  # of passing an empty string on to `parseInt`.
  if not readLine(stdin, userChoice):
    raise newException(IOError, "No input available for the search choice")
  # tolerate surrounding whitespace such as a trailing space or '\r'
  let choice = parseInt(userChoice.strip())
  echo(userChoice)
  case choice
  of 1, 2:
    result = choice
  else:
    raise newException(ValueError,
                       "Choice should be " & "either 1 or 2")

--------------------------------------------------------------------------------
/src/agmodule/tfidf.nim:
--------------------------------------------------------------------------------
# tf idf document
import tables
import text
import utils
import constants
import sets
import os
import strutils
import math
import algorithm

type
  TfIdfTerm* = object
    ## TF-IDF figures of one term in one document.
    termDocCount*: int    ## occurrences of `term` in the document
    invDocCount*: float   ## inverse document frequency of `term`
    tfIdf*: float         ## `termDocCount * invDocCount`
    term*: string
    docLocalPath*: string # used as index

proc `$`(t: TfIdfTerm): string =
  ## Space separated: term, term count, document path, tf-idf score.
  result = t.term & " " & $(t.termDocCount) & " " &
      t.docLocalPath & " " & $(t.tfIdf)

proc getTermsFreqFromText(txt: Text, sep: string = " "): CountTable[string] =
  ## extract terms from text
  ##
  ## Splits every chunk of `txt` on `sep` and counts the resulting tokens.
  ## Empty tokens (produced by consecutive separators) are skipped; the
  ## previous `chunkTerm != sep` filter could never reject anything because
  ## a token produced by splitting on `sep` cannot equal `sep`.
  var terms: seq[string]
  for chunk in txt.chunks:
    for chunkTerm in chunk.split(sep):
      if chunkTerm.len > 0:
        terms.add(chunkTerm)
  result = toCountTable(terms)

proc getTermCountInDocument(term: string,
                            sep: string = " ",
                            info: TextInfo): int =
  ## Number of `sep`-separated tokens equal to `term` in the text described
  ## by `info`.
  ##
  ## The unchunked branch used to compare tokens with `sep` instead of
  ## `term` and therefore always counted 0. Both cases now share one loop,
  ## because `toText` stores an unchunked text as a single chunk.
  # NOTE(review): a term that itself contains `sep` (e.g. several words)
  # can never equal a single token, so its count stays 0.
  for chunk in toText(info).chunks:
    for chunkTerm in chunk.split(sep):
      if chunkTerm == term:
        result += 1


proc getDocumentFreqForTerm(term: string,
                            infos: seq[TextInfo]): int =
  ## Number of documents in `infos` whose file content contains `term` as a
  ## substring. Each file is read from `dataDir`.
  result = 0
  for info in infos:
    let txt = readFile(joinPath(dataDir, info.localPath))
    if term in txt:
      result += 1

proc getInverseDocumentFreq(term: string, infos: seq[TextInfo]): float =
  ## `log10(len(infos) / df)`, where `df` is the number of documents that
  ## contain `term`. Returns 0.0 when no document contains the term, so the
  ## division by zero (and the `inf`/`NaN` scores it produced) is avoided.
  let count = getDocumentFreqForTerm(term, infos)
  if count == 0:
    return 0.0
  result = log10(len(infos) / count)

proc tfcomp(a, b: TfIdfTerm): int =
  ## Comparator that orders by `tfIdf`, highest score first.
  cmp(b.tfIdf, a.tfIdf)

proc getTfIdfTerm(term: string, sep: string = " ",
                  infos: seq[TextInfo]): seq[TfIdfTerm] =
  ## One `TfIdfTerm` per document in `infos` for `term`, sorted by
  ## descending tf-idf score.
  # The inverse document frequency depends only on `term` and `infos`, so it
  # is computed once instead of re-reading every file for each document.
  let idf = getInverseDocumentFreq(term, infos)
  for info in infos:
    let tcount = getTermCountInDocument(term, sep, info)
    result.add(TfIdfTerm(termDocCount: tcount,
                         invDocCount: idf,
                         term: term,
                         tfIdf: float(tcount) * idf,
                         docLocalPath: info.localPath))
  result.sort(tfcomp)

proc getTfIdfTerms*(terms: HashSet[string], sep: string = " ",
                    infos: seq[TextInfo]): Table[string, seq[TfIdfTerm]] =
  ## Maps every term in `terms` to its per-document scores as returned by
  ## `getTfIdfTerm`.
  for term in terms:
    result[term] = getTfIdfTerm(term, sep, infos)

-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # nim files 2 | nimcache/ 3 | nimblecache/ 4 | htmldocs/ 5 | docs/ 6 | tests/ 7 | bin/* 8 | 9 | # + python 10 | 11 | # Byte-compiled / optimized / DLL files 12 | __pycache__/ 13 | *.py[cod] 14 | *$py.class 15 | 16 | # C extensions 17 | *.so 18 | 19 | # Distribution / packaging 20 | .Python 21 | build/ 22 | develop-eggs/ 23 | dist/ 24 | downloads/ 25 | eggs/ 26 | .eggs/ 27 | lib/ 28 | lib64/ 29 | parts/ 30 | sdist/ 31 | var/ 32 | wheels/ 33 | share/python-wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37
| MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .nox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | *.py,cover 60 | .hypothesis/ 61 | .pytest_cache/ 62 | cover/ 63 | 64 | # Translations 65 | *.mo 66 | *.pot 67 | 68 | # Django stuff: 69 | *.log 70 | local_settings.py 71 | db.sqlite3 72 | db.sqlite3-journal 73 | 74 | # Flask stuff: 75 | instance/ 76 | .webassets-cache 77 | 78 | # Scrapy stuff: 79 | .scrapy 80 | 81 | # Sphinx documentation 82 | docs/_build/ 83 | 84 | # PyBuilder 85 | .pybuilder/ 86 | target/ 87 | 88 | # Jupyter Notebook 89 | .ipynb_checkpoints 90 | 91 | # IPython 92 | profile_default/ 93 | ipython_config.py 94 | 95 | # pyenv 96 | # For a library or package, you might want to ignore these files since the code is 97 | # intended to run in multiple environments; otherwise, check them in: 98 | # .python-version 99 | 100 | # pipenv 101 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 102 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 103 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 104 | # install all needed dependencies. 105 | #Pipfile.lock 106 | 107 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 108 | __pypackages__/ 109 | 110 | # Celery stuff 111 | celerybeat-schedule 112 | celerybeat.pid 113 | 114 | # SageMath parsed files 115 | *.sage.py 116 | 117 | # Environments 118 | .env 119 | .venv 120 | env/ 121 | venv/ 122 | ENV/ 123 | env.bak/ 124 | venv.bak/ 125 | 126 | # Spyder project settings 127 | .spyderproject 128 | .spyproject 129 | 130 | # Rope project settings 131 | .ropeproject 132 | 133 | # mkdocs documentation 134 | /site 135 | 136 | # mypy 137 | .mypy_cache/ 138 | .dmypy.json 139 | dmypy.json 140 | 141 | # Pyre type checker 142 | .pyre/ 143 | 144 | # pytype static type analyzer 145 | .pytype/ 146 | 147 | # Cython debug symbols 148 | cython_debug/ 149 | 150 | # c++ 151 | # Prerequisites 152 | *.d 153 | 154 | # Compiled Object files 155 | *.slo 156 | *.lo 157 | *.o 158 | *.obj 159 | 160 | # Precompiled Headers 161 | *.gch 162 | *.pch 163 | 164 | # Compiled Dynamic libraries 165 | *.so 166 | *.dylib 167 | *.dll 168 | 169 | # Fortran module files 170 | *.mod 171 | *.smod 172 | 173 | # Compiled Static libraries 174 | *.lai 175 | *.la 176 | *.a 177 | *.lib 178 | 179 | # Executables 180 | *.exe 181 | *.out 182 | *.app 183 | -------------------------------------------------------------------------------- /src/agmodule/text.nim: --------------------------------------------------------------------------------
## implements a simple text helpers
import json
import utils
import constants
import strutils
import system # readfile
import os

type
  TextInfo* = object
    ## Descriptor of one text, built from an entry of the text info DB.
    localPath*: string       ## file path relative to `dataDir`
    url*: string
    id*: string              ## key of the entry in the text info DB
    hasChunks*: bool         ## whether the file is split with `chunkSeparator`
    chunkSeparator*: string

type
  Text* = object
    ## Content of a text as loaded by `toText`.
    chunks*: seq[string]     ## the pieces of the text; one element if unchunked
    localPath*: string
    hasChunks*: bool


proc toText*(info: TextInfo): Text =
  ## read text from local info if hasChunks
  ## separate it using chunkSeparator
  ## else read the whole text blob into text chunks
  ##
  ## The file is read from `dataDir / info.localPath`. The result always
  ## holds at least one chunk.
  let txt = readFile(joinPath(dataDir, info.localPath))
  # An empty separator cannot delimit anything; keep the text whole in that
  # case rather than depend on how `split` treats an empty separator.
  let chunks =
    if info.hasChunks and info.chunkSeparator.len > 0:
      txt.split(info.chunkSeparator)
    else:
      @[txt]
  Text(chunks: chunks,
       localPath: info.localPath,
       hasChunks: info.hasChunks)

proc toJson(info: TextInfo): JsonNode =
  ## JSON object with one entry per field of `info`.
  return %*info

proc `$`*(info: TextInfo): string =
  ## JSON rendering of `info`.
  return $(toJson(info))


proc fromJson(node: JsonNode): TextInfo =
  ## Builds a `TextInfo` from a JSON object that has the keys `localPath`,
  ## `url`, `id`, `hasChunks` and `chunkSeparator`.
  return TextInfo(
    localPath: getStr(node["localPath"]),
    url: getStr(node["url"]),
    id: getStr(node["id"]),
    hasChunks: getBool(node["hasChunks"]),
    chunkSeparator: getStr(node["chunkSeparator"])
  )

proc makeInfoFromNode(id: string, node: JsonNode): TextInfo =
  ## Stores `id` under the key `"id"` in `node` (the node is modified) and
  ## converts the node to a `TextInfo`.
  node.add("id", newJString(id))
  return fromJson(node)

proc fetchInfos*(): seq[TextInfo] =
  ## All entries of the text info DB, each keyed entry turned into a
  ## `TextInfo` whose `id` is the entry's key.
  let db = getTextInfoDB()
  for id, node in db.pairs():
    result.add(makeInfoFromNode(id, node))

proc fetchInfoById*(id: string): TextInfo =
  ## The DB entry stored under key `id`.
  let db = getTextInfoDB()
  return makeInfoFromNode(id, db[id])

proc fetchInfoByComponents[T](
    compName: string,
    compVal: T,
    matchFn: proc(e1, e2: T): bool): seq[TextInfo] =
  ## fetch info by using a component as value
  ##
  ## Returns every DB entry for which `matchFn(compVal, value)` holds, where
  ## `value` is the entry's field `compName` read as a `T`.
  let db = getTextInfoDB()
  for id, node in db.pairs():
    let nodeVal = getNodeVal[T](node[compName])
    if matchFn(compVal, nodeVal):
      result.add(makeInfoFromNode(id, node))

proc fetchInfoByComponent[T](compName: string, compVal: T,
    matchFn: proc(e1, e2: T): bool): TextInfo =
  ## First DB entry matched by `fetchInfoByComponents`; raises `ValueError`
  ## when nothing matches.
  ##
  ## `compName` is a field name and therefore always a string; it used to be
  ## declared as `T`, which only compiled for `T = string`.
  let infos = fetchInfoByComponents[T](compName, compVal, matchFn)
  if len(infos) == 0:
    raise newException(ValueError, "value not found in db")
  result = infos[0]


proc fetchInfoByPath(path: string): TextInfo =
  ## First entry whose `localPath` equals `path`.
  return fetchInfoByComponent[string]("localPath", path, eqMatch[string])

proc fuzzyFetchInfoByPath(path: string): TextInfo =
  ## First entry for which `path in localPath` holds.
  return fetchInfoByComponent[string]("localPath", path, inMatch[string])

proc fetchInfoByUrl(url: string): TextInfo =
  ## First entry whose `url` equals `url`.
  return fetchInfoByComponent("url", url, eqMatch[string])

proc fuzzyFetchInfoByUrl(url: string): TextInfo =
  ## First entry for which `url in <entry url>` holds.
  return fetchInfoByComponent("url", url, inMatch[string])

proc fetchInfoNotChunked(): seq[TextInfo] =
  ## All entries whose `hasChunks` is false.
  return fetchInfoByComponents[bool]("hasChunks", false, eqMatch[bool])

proc fetchInfoChunked(): seq[TextInfo] =
  ## All entries whose `hasChunks` is true.
  return fetchInfoByComponents[bool]("hasChunks", true, eqMatch[bool])

proc fetchInfoByChunkSeparator(sep: string): seq[TextInfo] =
  ## All entries whose `chunkSeparator` equals `sep`.
  return fetchInfoByComponents[string]("chunkSeparator", sep, eqMatch[string])

proc fuzzyFetchInfoByChunkSeparator(sep: string): seq[TextInfo] =
  ## All entries for which `sep in chunkSeparator` holds.
  return fetchInfoByComponents[string]("chunkSeparator", sep, inMatch[string])

--------------------------------------------------------------------------------