├── .gitattributes ├── .gitignore ├── README.md ├── aips ├── __init__.py ├── data_loaders │ ├── cities.py │ ├── index_time_boosts.py │ ├── movies.py │ ├── outdoors.py │ ├── products.py │ └── reviews.py ├── environment.py ├── indexers │ └── product.py ├── search_requests.py └── spark │ ├── __init__.py │ └── dataframe.py ├── build ├── Dockerfile ├── ch5_spacy_requirements.txt ├── ipython_kernel_config.py ├── log4j.properties └── requirements.txt ├── chapters ├── ch03 │ ├── 1.vectors-and-text-similarity.ipynb │ └── 2.controlling-relevance.ipynb ├── ch04 │ ├── 1.setting-up-the-retrotech-dataset.ipynb │ └── 2.signals-boosting.ipynb ├── ch05 │ ├── 1.open-information-extraction.ipynb │ ├── 2.index-datasets.ipynb │ ├── 3.semantic-knowledge-graph.ipynb │ └── licenses │ │ └── hearst.NOTICE.txt ├── ch06 │ ├── 1.skg-classification-disambiguation.ipynb │ ├── 2.related-keywords-from-signals.ipynb │ ├── 3.spell-correction.ipynb │ ├── bonus.phrase-detection.ipynb │ └── bonus.related-terms-from-documents.ipynb ├── ch07 │ ├── 1.index-datasets.ipynb │ └── 2.semantic-search.ipynb ├── ch08 │ └── 1.signals-boosting.ipynb ├── ch09 │ ├── 1.personalization.ipynb │ └── 2.embedding-based-personalization.ipynb ├── ch10 │ ├── 1.setup-the-movie-db.ipynb │ ├── 2.judgments-and-logging.ipynb │ ├── 3.pairwise-transform.ipynb │ └── 4.train-and-evaluate-the-model.ipynb ├── ch11 │ ├── 0.setup.ipynb │ ├── 1.click-through-rate-judgments.ipynb │ ├── 2.sdbn-judgments-to-overcome-position-bias.ipynb │ ├── 3.SDBN-Confidence-Bias.ipynb │ ├── 4.train-upload-search-ltr.ipynb │ ├── a.defunct.synthesize-search-sessions.ipynb │ ├── a.synthesize-search-sessions.ipynb │ └── session_gen.py ├── ch12 │ ├── 0.setup.ipynb │ └── 1.ab-testing-to-active-learning.ipynb ├── ch13 │ ├── 1.setting-up-the-outdoors-dataset.ipynb │ ├── 2.introduction-to-transformers.ipynb │ ├── 3.natural-language-autocomplete.ipynb │ ├── 4.semantic-search.ipynb │ ├── 5.quantization.ipynb │ ├── bert-vocab.txt │ ├── ch13-tokenizer-analysis.ipynb │ ├── outdoors_golden_answers.xlsx │ └── pull_aips_dependency.py ├── ch14 │ ├── 1.question-answering-visualizer.ipynb │ ├── 2.question-answering-data-preparation.ipynb │ ├── 3.question-answering-fine-tuning.ipynb │ └── 4.question-answering-demo-application.ipynb ├── ch15 │ ├── 1.llm-exploration.ipynb │ ├── 2.multimodal-and-hybrid-search.ipynb │ ├── a.generate-movie-embeddings.ipynb │ ├── delorean-query.jpg │ └── mockedGenerativeResponses.csv └── welcome.ipynb ├── data └── retrotech │ ├── images │ ├── 021331131393.jpg │ ├── 027242755871.jpg │ ├── 027242831599.jpg │ ├── 037988909926.jpg │ ├── 037988910045.jpg │ ├── 037988910182.jpg │ ├── 037988910250.jpg │ ├── 037988910427.jpg │ ├── 048231316835.jpg │ ├── 05024545249224.jpg │ ├── 072244106916.jpg │ ├── 12505382925.jpg │ ├── 12505451713.jpg │ ├── 12505525766.jpg │ ├── 12505527456.jpg │ ├── 12505559105.jpg │ ├── 14381196320.jpg │ ├── 21331131393.jpg │ ├── 23272335397.jpg │ ├── 24543701538.jpg │ ├── 25192107191.jpg │ ├── 27108936499.jpg │ ├── 27242752436.jpg │ ├── 27242755871.jpg │ ├── 27242798236.jpg │ ├── 27242831599.jpg │ ├── 32429037763.jpg │ ├── 36172950027.jpg │ ├── 36725560390.jpg │ ├── 36725560451.jpg │ ├── 36725561977.jpg │ ├── 36725569331.jpg │ ├── 36725569454.jpg │ ├── 36725578241.jpg │ ├── 37988909926.jpg │ ├── 37988910045.jpg │ ├── 37988910182.jpg │ ├── 37988910250.jpg │ ├── 37988910427.jpg │ ├── 400032015667.jpg │ ├── 400037252074.jpg │ ├── 400037252258.jpg │ ├── 400037252432.jpg │ ├── 400037252616.jpg │ ├── 400037252890.jpg │ ├── 400037253316.jpg │ ├── 400192926087.jpg │ 
├── 45626176.jpg │ ├── 47875841406.jpg │ ├── 47875841420.jpg │ ├── 47875842328.jpg │ ├── 47875842335.jpg │ ├── 48231011396.jpg │ ├── 48231011402.jpg │ ├── 48231316835.jpg │ ├── 48231317436.jpg │ ├── 48231317498.jpg │ ├── 5024545249224.jpg │ ├── 600603105265.jpg │ ├── 600603138423.jpg │ ├── 603497664429.jpg │ ├── 610839379408.jpg │ ├── 612572171585.jpg │ ├── 635753490879.jpg │ ├── 635753493559.jpg │ ├── 635753493573.jpg │ ├── 665331101927.jpg │ ├── 694318011294.jpg │ ├── 696055169191.jpg │ ├── 708056579739.jpg │ ├── 708056579746.jpg │ ├── 708431390614.jpg │ ├── 711719842309.jpg │ ├── 716829772249.jpg │ ├── 72244106916.jpg │ ├── 722868830062.jpg │ ├── 722868840177.jpg │ ├── 74108007469.jpg │ ├── 74108056764.jpg │ ├── 74108096487.jpg │ ├── 77283045400.jpg │ ├── 783722274422.jpg │ ├── 786936817218.jpg │ ├── 793447512228.jpg │ ├── 803238004525.jpg │ ├── 821793013776.jpg │ ├── 826663114164.jpg │ ├── 826663126044.jpg │ ├── 843163089211.jpg │ ├── 843404073153.jpg │ ├── 84691170679.jpg │ ├── 84691211174.jpg │ ├── 84691226703.jpg │ ├── 84691226727.jpg │ ├── 848447000005.jpg │ ├── 848447000081.jpg │ ├── 848447000135.jpg │ ├── 856751002097.jpg │ ├── 878816004532.jpg │ ├── 883049066905.jpg │ ├── 883929085118.jpg │ ├── 883929106172.jpg │ ├── 883929154012.jpg │ ├── 884116069973.jpg │ ├── 885038021209.jpg │ ├── 885038024644.jpg │ ├── 885038024651.jpg │ ├── 885170045132.jpg │ ├── 885370315080.jpg │ ├── 885370325348.jpg │ ├── 885909300549.jpg │ ├── 885909393404.jpg │ ├── 885909394845.jpg │ ├── 885909395095.jpg │ ├── 885909457588.jpg │ ├── 885909457595.jpg │ ├── 885909457601.jpg │ ├── 885909457632.jpg │ ├── 885909471812.jpg │ ├── 885909472376.jpg │ ├── 886111271283.jpg │ ├── 886111287055.jpg │ ├── 886971404722.jpg │ ├── 886973561621.jpg │ ├── 92636260712.jpg │ ├── 93624956037.jpg │ ├── 93624995012.jpg │ ├── 97360716641.jpg │ ├── 97360722345.jpg │ ├── 97360724240.jpg │ ├── 97360810042.jpg │ ├── 97363532149.jpg │ ├── 97363560449.jpg │ ├── 97368920347.jpg │ ├── playground_tues.ipynb │ └── unavailable.jpg │ └── templates │ └── search-results.html ├── docker-compose.yml ├── engines ├── Collection.py ├── Engine.py ├── EntityExtractor.py ├── LTR.py ├── README.md ├── SemanticKnowledgeGraph.py ├── SparseSemanticSearch.py ├── opensearch │ ├── OpenSearchCollection.py │ ├── OpenSearchEngine.py │ ├── OpenSearchLTR.py │ ├── OpenSearchSparseSemanticSearch.py │ ├── build │ │ ├── engine-Dockerfile │ │ ├── log4j2.properties │ │ ├── ltr-2.14.0-os2.14.0.zip │ │ ├── opensearch-docker-entrypoint.sh │ │ ├── opensearch-onetime-setup.sh │ │ └── performance-analyzer.properties │ └── config.py └── solr │ ├── SolrCollection.py │ ├── SolrEngine.py │ ├── SolrEntityExtractor.py │ ├── SolrLTR.py │ ├── SolrSemanticKnowledgeGraph.py │ ├── SolrSparseSemanticSearch.py │ ├── __init__.py │ ├── build │ ├── Dockerfile │ ├── log4j2-config.xml │ ├── run_solr_w_ltr.sh │ └── solr.xml │ └── config.py ├── ltr ├── MART_model.py ├── __init__.py ├── clickmodels │ ├── __init__.py │ ├── cascade.py │ ├── coec.py │ ├── conversion.py │ ├── pbm.py │ ├── sdbn.py │ ├── session.py │ └── ubm.py ├── client │ ├── __init__.py │ ├── base_client.py │ ├── solr_client.py │ └── solr_parse.py ├── download.py ├── evaluate.py ├── helpers │ ├── __init__.py │ ├── butterfingers.py │ ├── convert.py │ ├── defaultlist.py │ ├── esUrlParse.py │ ├── handle_resp.py │ ├── msmarco │ │ ├── __init__.py │ │ └── evaluate.py │ ├── ranklib_result.py │ ├── solr_escape.py │ ├── tau.py │ └── timed_block.py ├── index.py ├── injectTypos.py ├── judgments.py ├── log.py ├── plots.py ├── ranklib.py ├── 
sdbn_functions.py ├── search.py └── years_as_ratings.py ├── semantic_search ├── __init__.py └── query_tree.py └── webserver ├── .vscode ├── launch.json └── settings.json ├── display ├── render_search_results.py ├── search-results-template.html └── search.html ├── is-running.png ├── managed-schema.xml └── start-webserver.py /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto 2 | *.sh text eol=lf 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | system.config 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | 132 | #Mac 133 | *.DS_Store 134 | 135 | #no-commit folders 136 | *no-commit/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AI-Powered Search 2 | 3 | Code examples for the book [_AI-Powered Search_](https://aipoweredsearch.com) by [Trey Grainger](https://www.linkedin.com/in/treygrainger/), [Doug Turnbull](https://www.linkedin.com/in/softwaredoug/), and [Max Irwin](https://www.linkedin.com/in/maxirwin/). Published by [Manning Publications](https://www.manning.com). 4 | 5 |
6 | 7 | 10 | 11 | ## Book Overview 12 | [_AI-Powered Search_](https://aipoweredsearch.com) teaches you the latest machine learning techniques to build search engines that continuously learn from your users and your content to drive more domain-aware and intelligent search. 13 | 14 | Search engine technology is rapidly evolving, with Artificial Intelligence (AI) driving much of that innovation. Crowdsourced relevance and the integration of large language models (LLMs) like GPT and other foundation models are massively accelerating the capabilities and expectations of search technology. 15 | 16 | AI-Powered Search will teach you modern, data-science-driven search techniques like: 17 | - Semantic search using dense vector embeddings from foundation models 18 | - Retrieval Augmented Generation 19 | - Question answering and summarization combining search and LLMs 20 | - Fine-tuning transformer-based LLMs 21 | - Personalized search based on user signals and vector embeddings 22 | - Collecting user behavioral signals and building signals boosting models 23 | - Semantic knowledge graphs for domain-specific learning 24 | - Implementing machine-learned ranking models (learning to rank) 25 | - Building click models to automate machine-learned ranking 26 | - Generative search, hybrid search, and the search frontier 27 | 28 | Today’s search engines are expected to be smart, understanding the nuances of natural language queries, as well as each user’s preferences and context. This book empowers you to build search engines that take advantage of user interactions and the hidden semantic relationships in your content to automatically deliver better, more relevant search experiences. 29 | 30 | ## How to run 31 | For simplicity of setup, all code is shipped in Jupyter Notebooks and packaged in Docker containers. This means that installing Docker and then pulling (or building) and running the book's Docker containers is the only necessary setup. [Appendix A](https://livebook.manning.com/book/ai-powered-search/appendix-a?origin=code-base) of the book provides full step-by-step instructions for running the code examples, but you can run the following to get up and running quickly: 32 | 33 | If you haven't already pulled the source code locally, run: 34 | ``` 35 | git clone https://github.com/treygrainger/ai-powered-search.git 36 | ``` 37 | 38 | Then, to build and start the codebase with interactive Jupyter notebooks, run: 39 | ``` 40 | cd ai-powered-search 41 | docker compose up 42 | ``` 43 | 44 | That's all it takes! Once the containers are built and running (this may take a while, especially on the first build), visit: 45 | `http://localhost:8888` to launch the [Welcome](chapters/welcome.ipynb) notebook and see a Table of Contents for all the live code examples from throughout the book. 46 | 47 | 49 | 50 | ## Supported Technologies 51 | AI-Powered Search teaches many modern search techniques leveraging machine learning approaches. While we utilize specific technologies to demonstrate concepts, most techniques are applicable to many modern search engines and vector databases. 52 | 53 | Throughout the book, all code examples are in **Python**, with **PySpark** (the Python interface to **Apache Spark**) being utilized heavily for data processing tasks. The default search engine leveraged by the book's examples is **Apache Solr**, but most examples are abstracted away from the particular search engine, and swappable implementation will be soon available for most popular search engines and vector databases. 
For more information about the search engine abstractions and custom integrations, check out the [engine documentation](engines/README.md). 54 | 55 | **See Full List**: [Supported Search Engines and Vector Databases](engines/README.md) 56 | 57 | [ *Note*: if you work for a search engine / vector database company, project, or hosting provider and want to work with us on getting your engine supported, please reach out to trey@searchkernel.com ] 58 | 59 | ## Questions and help 60 | Your purchase of _AI-Powered Search_ includes online access to Manning's [LiveBook forum](https://livebook.manning.com/forum?product=graingert). This allows you to provide comments and ask questions about any part of the book. Additionally, feel free to submit pull requests, GitHub issues, or comments on the project's official GitHub repo at https://github.com/treygrainger/ai-powered-search. 61 | 62 | ## License 63 | All code in this repository is open source under the [Apache License, Version 2.0 (ASL 2.0)](https://www.apache.org/licenses/LICENSE-2.0), unless otherwise specified. 64 | 65 | Note that when executing the code, it may pull additional dependencies that follow alternate licenses, so please be sure to inspect those licenses before using them in your projects to ensure they are suitable. The code may also pull in datasets subject to various licenses, some of which may be derived from AI models and some of which may be derived from web crawls of data subject to fair use under the copyright laws in the country of publication (the USA). Any such datasets are published "as-is", for the sole purpose of demonstrating the concepts in the book, and these datasets and their associated licenses may be subject to change over time. 66 | 67 | ## Grab a copy of the book 68 | If you don't yet have a copy, please support the authors and the publisher by purchasing a copy of [_AI-Powered Search_](http://aipoweredsearch.com). It will walk you step by step through the concepts and techniques shown in the code examples in this repository, providing the context and insights needed to help you better understand the techniques.
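
## Example: engine-agnostic usage

As noted in the "Supported Technologies" section above, the notebooks talk to the search engine through the abstractions in the `aips` and `engines` packages rather than through engine-specific APIs. The sketch below illustrates that pattern; it is a minimal, illustrative example (not a prescribed workflow) and assumes you are running inside the book's Jupyter container, so that `aips` and `engines` are importable and the Retrotech `data/retrotech/products.csv` file has already been downloaded by the chapter 4 setup notebook.

```python
from aips import get_engine, set_engine, healthcheck
from aips.data_loaders.products import load_dataframe
from aips.search_requests import product_search_request

healthcheck()       # confirm the engine container is up and responding

set_engine("solr")  # or "opensearch" -- any engine registered in aips.engine_type_map
engine = get_engine()

# Index the Retrotech products (assumes the CSV was extracted by the chapter 4 notebook)
products = engine.create_collection("products")
products.write(load_dataframe("data/retrotech/products.csv"))

# Issue an engine-agnostic keyword search and print the top results
request = product_search_request("ipod")
response = products.search(**request)
for doc in response["docs"]:
    print(doc["upc"], doc["name"])
```

Because every backend exposes the same collection interface (see `engines/Collection.py` and `engines/Engine.py`), the same notebook code runs unchanged when the `AIPS_SEARCH_ENGINE` setting is switched between supported engines.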
69 | -------------------------------------------------------------------------------- /aips/__init__.py: -------------------------------------------------------------------------------- 1 | import aips.environment as environment 2 | from engines.solr import SolrLTR, SolrSemanticKnowledgeGraph, SolrEntityExtractor, SolrSparseSemanticSearch 3 | from engines.solr.SolrEngine import SolrEngine 4 | from engines.solr.SolrCollection import SolrCollection 5 | 6 | from engines.opensearch.OpenSearchCollection import OpenSearchCollection 7 | from engines.opensearch.OpenSearchEngine import OpenSearchEngine 8 | from engines.opensearch.OpenSearchLTR import OpenSearchLTR 9 | from engines.opensearch.OpenSearchSparseSemanticSearch import OpenSearchSparseSemanticSearch 10 | 11 | import os 12 | from IPython.display import display, HTML 13 | import pandas 14 | import re 15 | 16 | engine_type_map = {"SOLR": SolrEngine(), 17 | "OPENSEARCH": OpenSearchEngine()} 18 | 19 | def get_engine(override=None): 20 | engine_name = override.upper() if override else environment.get("AIPS_SEARCH_ENGINE", "SOLR") 21 | return engine_type_map[engine_name] 22 | 23 | def set_engine(engine_name): 24 | engine_name = engine_name.upper() 25 | if engine_name not in engine_type_map: 26 | raise ValueError(f"No search engine implementation found for {engine_name}") 27 | else: 28 | environment.set("AIPS_SEARCH_ENGINE", engine_name) 29 | 30 | def get_ltr_engine(collection): 31 | ltr_engine_map = {SolrCollection: SolrLTR, 32 | OpenSearchCollection: OpenSearchLTR} 33 | return ltr_engine_map[type(collection)](collection) 34 | 35 | def get_semantic_knowledge_graph(collection): 36 | return SolrSemanticKnowledgeGraph(get_engine("solr").get_collection(collection.name)) 37 | 38 | def get_entity_extractor(collection): 39 | return SolrEntityExtractor(get_engine("solr").get_collection(collection.name)) 40 | 41 | def get_sparse_semantic_search(): 42 | SSS_map = {SolrEngine: SolrSparseSemanticSearch, 43 | OpenSearchEngine: OpenSearchSparseSemanticSearch} 44 | return SSS_map[type(get_engine())]() 45 | 46 | def healthcheck(): 47 | try: 48 | if get_engine().health_check(): 49 | print("All Systems are ready. Happy Searching!") 50 | else: 51 | print("The search engine is not in a ready state.") 52 | except: 53 | print("Error! One or more containers are not responding.\nPlease follow the instructions in Appendix A.") 54 | 55 | def num2str(number): 56 | return str(round(number,4)) #round to 4 decimal places for readibility 57 | 58 | def vec2str(vector): 59 | return "[" + ", ".join(map(num2str,vector)) + "]" 60 | 61 | def tokenize(text): 62 | return text.replace(".","").replace(",","").lower().split() 63 | 64 | def get_executing_notebook_path(): 65 | return globals().get("__vsc_ipynb_file__", #only exists during a remote vscode kernel 66 | globals().get("_dh", [None])[0]) 67 | 68 | def images_directory(): 69 | path = get_executing_notebook_path() 70 | if path: 71 | relative = os.path.relpath(os.environ.get("HOME"), path) 72 | else: 73 | relative = "../.." 74 | return f"{relative}/data/retrotech/images" 75 | 76 | def img_path_for_product(product): 77 | directory = images_directory() 78 | file = product.get("upc", "no-upc") 79 | if not os.path.exists(f"data/retrotech/images/{file}.jpg"): 80 | file = "unavailable" 81 | return f"{directory}/{file}.jpg" 82 | 83 | def remove_new_lines(data): 84 | return str(data).replace('\\n', '').replace('\\N', '') 85 | 86 | def as_html(data): 87 | return remove_new_lines(data).replace(", '", ",
'") 88 | 89 | def display_search(query, documents): 90 | doc_html = as_html(documents) 91 | display(HTML(f"Query: {query}

Results:")) 92 | display(HTML(doc_html)) 93 | 94 | def display_product_search(query, documents): 95 | rendered_html = render_search_results(query, documents) 96 | display(HTML(rendered_html)) 97 | 98 | def render_search_results(query, results): 99 | search_results_template_file = os.path.join("data/retrotech/templates/", "search-results.html") 100 | with open(search_results_template_file) as file: 101 | file_content = file.read() 102 | 103 | template_syntax = "(.*)" 104 | header_template = re.sub(template_syntax, "", file_content, flags=re.S) 105 | 106 | results_template_syntax = "(.*)" 107 | x = re.search(results_template_syntax, file_content, flags=re.S) 108 | results_template = x.group(1) 109 | 110 | separator_template_syntax = "(.*)" 111 | x = re.search(separator_template_syntax, file_content, flags=re.S) 112 | separator_template = x.group(1) 113 | 114 | rendered = header_template.replace("${QUERY}", query.replace('"', '\"')) 115 | for result in results: 116 | image_url = img_path_for_product(result) 117 | rendered += results_template.replace("${NAME}", result.get("name", "UNKNOWN")) \ 118 | .replace("${MANUFACTURER}", result.get("manufacturer", "UNKNOWN")) \ 119 | .replace("${IMAGE_URL}", image_url) 120 | 121 | rendered += separator_template 122 | return rendered 123 | 124 | def fetch_products(doc_ids): 125 | request = {"query": " ".join([str(id) for id in doc_ids]), 126 | "query_fields": ["upc"], 127 | "limit": len(doc_ids)} 128 | response = get_engine().get_collection("products").search(**request) 129 | 130 | df = pandas.DataFrame(response["docs"]) 131 | df['upc'] = df['upc'].astype('int64') 132 | df.insert(0, 'image', df.apply(lambda row: "", axis=1)) 133 | return df 134 | 135 | def render_judged(products, judged, grade_col='ctr', label=""): 136 | """ Render the computed judgments alongside the products and description data""" 137 | w_prods = judged.merge(products, left_on='doc_id', right_on='upc', how='left') 138 | 139 | style = """ 140 | """ 145 | w_prods = w_prods[[grade_col, 'upc', 'image', 'name']][:5] 146 | return HTML(style + 147 | f"

{label}" + w_prods.to_html(float_format=lambda x: '%10.4f' % x, 148 | escape=False)) 149 | 150 | #def print_s(series_data, column): 151 | ##pandas.set_option("display.width", 76) 152 | #dataframe = series_data.to_frame(name=column).sort_values(column, ascending=False) 153 | #merged = dataframe.merge(products, left_on='doc_id', right_on='upc', how='left') 154 | #print(merged.rename(columns={"upc": "doc_id"})[["doc_id", column, "name"]].set_index("doc_id")) -------------------------------------------------------------------------------- /aips/data_loaders/cities.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import StructType, StringType, IntegerType 2 | from pyspark.sql.functions import concat_ws, lit 3 | from pyspark.sql import SparkSession 4 | 5 | def load_dataframe(csv_file): 6 | print("Loading Geonames...") 7 | schema = StructType() \ 8 | .add("id",StringType(),True) \ 9 | .add("name",StringType(),True) \ 10 | .add("ascii_name_s",StringType(),True) \ 11 | .add("alternative_names_s",StringType(),True) \ 12 | .add("latitude_s",StringType(),True) \ 13 | .add("longitude_s",StringType(),True) \ 14 | .add("feature_class_s",StringType(),True) \ 15 | .add("feature_code_s",StringType(),True) \ 16 | .add("country",StringType(),True) \ 17 | .add("cc2_s",StringType(),True) \ 18 | .add("admin_area",StringType(),True) \ 19 | .add("admin_code_2_s",StringType(),True) \ 20 | .add("admin_code_3_s",StringType(),True) \ 21 | .add("admin_code_4_s",StringType(),True) \ 22 | .add("popularity",IntegerType(),True) \ 23 | .add("elevation_s",StringType(),True) \ 24 | .add("dem_s",StringType(),True) \ 25 | .add("timezone_s",StringType(),True) \ 26 | .add("modification_date_s",StringType(),True) 27 | 28 | spark = SparkSession.builder.appName("AIPS").getOrCreate() 29 | dataframe = spark.read.csv(csv_file, schema=schema, multiLine=True, escape="\\", sep="\t") \ 30 | .withColumn("type", lit("city")) \ 31 | .withColumn("location_coordinates", concat_ws(",", "latitude_s", "longitude_s")) 32 | 33 | return dataframe -------------------------------------------------------------------------------- /aips/data_loaders/index_time_boosts.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.functions import collect_list, create_map 2 | from aips.spark.dataframe import from_sql 3 | from aips.spark import create_view_from_collection 4 | 5 | def load_dataframe(boosted_products_collection, boosts_collection): 6 | assert(type(boosted_products_collection) == type(boosts_collection)) 7 | create_view_from_collection(boosts_collection, 8 | boosts_collection.name) 9 | create_view_from_collection(boosted_products_collection, 10 | boosted_products_collection.name) 11 | match boosted_products_collection.get_engine_name(): 12 | case "solr": 13 | query = f"""SELECT p.*, b.signals_boosts FROM ( 14 | SELECT doc, CONCAT_WS(',', COLLECT_LIST(CONCAT(query, '|', boost))) 15 | AS signals_boosts FROM {boosts_collection.name} GROUP BY doc 16 | ) b INNER JOIN {boosted_products_collection.name} p ON p.upc = b.doc""" 17 | boosts_dataframe = from_sql(query) 18 | case "opensearch": 19 | product_query = f"SELECT * FROM {boosted_products_collection.name}" 20 | boosts_query = f"SELECT doc, boost, REPLACE(query, '.', '') AS query FROM {boosts_collection.name}" 21 | 22 | grouped_boosts = from_sql(boosts_query).groupBy("doc") \ 23 | .agg(collect_list(create_map("query", "boost"))[0].alias("signals_boost")) \ 24 | .withColumnRenamed("doc", "upc") 
25 | 26 | boosts_dataframe = from_sql(product_query).join(grouped_boosts, "upc") 27 | case _: 28 | raise Exception(f"Index time boost not implemented for {type(boosted_products_collection)}") 29 | 30 | return boosts_dataframe -------------------------------------------------------------------------------- /aips/data_loaders/movies.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pyspark.sql import SparkSession, Row 3 | 4 | def load_dataframe(movie_file="data/tmdb.json", movie_image_ids={}): 5 | movies = [] 6 | for movieId, tmdbMovie in json.load(open(movie_file)).items(): 7 | try: 8 | releaseDate = None 9 | if "release_date" in tmdbMovie and len(tmdbMovie["release_date"]) > 0: 10 | releaseDate = tmdbMovie["release_date"] 11 | releaseYear = releaseDate[0:4] 12 | 13 | full_poster_path = "" 14 | if "poster_path" in tmdbMovie and tmdbMovie["poster_path"] is not None and len(tmdbMovie["poster_path"]) > 0: 15 | full_poster_path = "https://image.tmdb.org/t/p/w185" + tmdbMovie["poster_path"] 16 | 17 | movie = {"id": movieId, 18 | "title": tmdbMovie["title"], 19 | "overview": tmdbMovie["overview"], 20 | "tagline": tmdbMovie["tagline"], 21 | "directors": [director["name"] for director in tmdbMovie["directors"]], 22 | "cast": " ".join([castMember["name"] for castMember in tmdbMovie["cast"]]), 23 | "genres": [genre["name"] for genre in tmdbMovie["genres"]], 24 | "release_date": releaseDate, 25 | "release_year": releaseYear, 26 | "poster_file": (tmdbMovie["poster_path"] or " ")[1:], 27 | "poster_path": full_poster_path, 28 | "vote_average": float(tmdbMovie["vote_average"]) if "vote_average" in tmdbMovie else None, 29 | "vote_count": int(tmdbMovie["vote_count"]) if "vote_count" in tmdbMovie else 0} 30 | if movie["title"].lower() in movie_image_ids: 31 | joined_ids = ",".join(movie_image_ids[movie["title"].lower()]) 32 | else: 33 | joined_ids = "" 34 | movie["movie_image_ids"] = joined_ids 35 | 36 | movies.append(movie) 37 | except KeyError as k: # Ignore any movies missing these attributes 38 | continue 39 | spark = SparkSession.builder.appName("AIPS").getOrCreate() 40 | return spark.createDataFrame(Row(**m) for m in movies) -------------------------------------------------------------------------------- /aips/data_loaders/outdoors.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql.functions import coalesce, col, lit, udf 3 | from pyspark.sql.types import ArrayType, StringType 4 | import html 5 | import re 6 | 7 | from bs4 import BeautifulSoup 8 | 9 | def load_dataframe(csv_file): 10 | def split_tags(ascii_html): 11 | tags = re.compile("[\<\>]").split(html.unescape(ascii_html or "")) 12 | return [t.replace("-", " ") for t in tags if len(t)] 13 | 14 | def strip_HTML(ascii_html): 15 | text = html.unescape(ascii_html or "") 16 | text = BeautifulSoup(text, "lxml").get_text(separator=" ") 17 | return re.sub(r"\s+", " ", text.strip()) 18 | 19 | split_tags_udf = udf(split_tags, ArrayType(StringType())) 20 | strip_html_udf = udf(strip_HTML) 21 | generate_url_udf = udf(lambda id: f"https://outdoors.stackexchange.com/questions/{id}", StringType()) 22 | post_type_udf = udf(lambda type_id: "question" if type_id == 1 else "answer", StringType()) 23 | 24 | spark = SparkSession.builder.appName("AIPS").getOrCreate() 25 | dataframe = spark.read.csv(csv_file, header=True, inferSchema=True) 26 | dataframe = dataframe.filter((dataframe.post_type_id == 1) | 
(dataframe.post_type_id == 2)) 27 | dataframe = dataframe.withColumn("post_type", post_type_udf(col("post_type_id"))) 28 | dataframe = dataframe.withColumn("view_count", coalesce(col("view_count"), lit(0))) 29 | dataframe = dataframe.withColumn("body", strip_html_udf(col("body"))) 30 | dataframe = dataframe.withColumn("owner_user_id", coalesce(col("owner_user_id"), col("owner_display_name"))) 31 | dataframe = dataframe.withColumn("title", strip_html_udf(col("title"))) 32 | dataframe = dataframe.withColumn("tags", split_tags_udf(col("tags"))) 33 | dataframe = dataframe.withColumn("url", generate_url_udf(col("id"))) 34 | dataframe = dataframe.drop("post_type_id", "deletion_date", "owner_display_name", "last_editor_user_id", 35 | "last_editor_display_name", "last_edit_date", "last_activity_date", "comment_count", 36 | "favorite_count", "closed_date", "community_owned_date") 37 | return dataframe -------------------------------------------------------------------------------- /aips/data_loaders/products.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql.functions import col, udf 3 | 4 | def load_dataframe(csv_file): 5 | print("Loading Products") 6 | spark = SparkSession.builder.appName("AIPS").getOrCreate() 7 | dataframe = spark.read.csv(csv_file, header=True, inferSchema=True) 8 | dataframe = dataframe.withColumn("upc", udf(str)(col("upc"))) 9 | print("Schema: ") 10 | dataframe.printSchema() 11 | return dataframe -------------------------------------------------------------------------------- /aips/data_loaders/reviews.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql.functions import col 3 | 4 | def load_dataframe(csv_file): 5 | print("\nLoading Reviews...") 6 | spark = SparkSession.builder.appName("AIPS").getOrCreate() 7 | dataframe = spark.read.csv(csv_file, inferSchema=True, header=True, multiLine=True, escape="\"") \ 8 | .select(col("id"), col("name_t").alias("business_name"), 9 | col("name_s").alias("name"), 10 | col("city_t").alias("city"), 11 | col("state_t").alias("state"), col("text_t").alias("content"), 12 | col("categories_t").alias("categories"), col("stars_i").alias("stars_rating"), 13 | col("location_pt_s").alias("location_coordinates")) 14 | dataframe.printSchema() 15 | dataframe = dataframe.filter(dataframe.business_name != "Charlotte Center City Partners") 16 | return dataframe -------------------------------------------------------------------------------- /aips/environment.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | AIPS_NOTEBOOK_HOST = "aips-notebook" 5 | AIPS_NOTEBOOK_PORT = os.getenv("AIPS_NOTEBOOK_PORT") or "8888" 6 | 7 | AIPS_ZK_HOST = "aips-zk" 8 | AIPS_ZK_PORT = os.getenv("AIPS_ZK_PORT") or "2181" 9 | 10 | AIPS_WEBSERVER_HOST = os.getenv("AIPS_WEBSERVER_HOST") or "localhost" 11 | AIPS_WEBSERVER_PORT = os.getenv("AIPS_WEBSERVER_PORT") or "2345" 12 | WEBSERVER_URL = f"http://{AIPS_WEBSERVER_HOST}:{AIPS_WEBSERVER_PORT}" 13 | DEFAULT_CONFIG = {"AIPS_SEARCH_ENGINE": "SOLR", 14 | "PRINT_REQUESTS": False} 15 | 16 | CONFIG_FILE_PATH = os.path.abspath(os.path.join(os.path.join( 17 | os.path.dirname(__file__) , './'), 'system.config')) 18 | 19 | def write_config(config): 20 | with open(CONFIG_FILE_PATH, "w") as config_file: 21 | json.dump(config, config_file) 22 | 23 | def read_config(): 24 | with 
open(CONFIG_FILE_PATH, "r") as f: 25 | return json.load(f) 26 | 27 | def load_config(): 28 | try: 29 | config = read_config() 30 | except: 31 | write_config(DEFAULT_CONFIG) 32 | config = read_config() 33 | return config 34 | 35 | def set(key, value): 36 | config = load_config() 37 | config[key] = value 38 | with open(CONFIG_FILE_PATH, "w") as config_file: 39 | json.dump(config, config_file) 40 | 41 | def get(key, default=None): 42 | config = load_config() 43 | if default: 44 | return config.get(key, default) 45 | else: 46 | return config[key] -------------------------------------------------------------------------------- /aips/indexers/product.py: -------------------------------------------------------------------------------- 1 | from ltr.download import download, extract_tgz 2 | from git import Repo # pip install gitpython 3 | 4 | 5 | #Get datasets 6 | ![ ! -d 'retrotech' ] && git clone --depth 1 https://github.com/ai-powered-search/retrotech.git 7 | ! cd retrotech && git pull 8 | ! cd retrotech && mkdir -p '../data/retrotech/' && tar -xvf products.tgz -C '../data/retrotech/' && tar -xvf signals.tgz -C '../data/retrotech/' 9 | 10 | 11 | dataset = ["https://github.com/ai-powered-search/tmdb/raw/main/judgments.tgz", 12 | "https://github.com/ai-powered-search/tmdb/raw/main/movies.tgz"] 13 | download(dataset, dest="") 14 | extract_tgz("data/movies.tgz", "data/") # -> Holds "tmdb.json", big json dict with corpus 15 | extract_tgz("data/judgments.tgz", "data/") # -> Holds "ai_pow_search_judgments.txt", 16 | # which is our labeled judgment list 17 | Repo.clone_from("https://github.com/ai-powered-search/retrotech.git", "data/retrotech/") 18 | 19 | from aips.data_loaders.products import load_dataframe 20 | 21 | products_collection = engine.create_collection("products") 22 | products_dataframe = load_dataframe("data/retrotech/products.csv") 23 | products_collection.write(products_dataframe) 24 | 25 | signals_collection = engine.create_collection("signals") 26 | signals_collection.write(from_csv("data/retrotech/signals.csv")) -------------------------------------------------------------------------------- /aips/search_requests.py: -------------------------------------------------------------------------------- 1 | def product_search_request(query, param_overrides={}): 2 | request = {"query": query, 3 | "query_fields": ["name", "manufacturer", "long_description"], 4 | "return_fields": ["upc", "name", "manufacturer", 5 | "short_description", "score"], 6 | "limit": 5, 7 | "order_by": [("score", "desc"), ("upc", "asc")]} 8 | return request | param_overrides 9 | 10 | def search_for_boosts(query, collection, query_field="query"): 11 | boosts_request = {"query": query, 12 | "query_fields": [query_field], 13 | "return_fields": ["query", "doc", "boost"], 14 | "limit": 20, 15 | "order_by": [("boost", "desc")]} 16 | response = collection.search(**boosts_request) 17 | return response["docs"] 18 | 19 | def create_boosts_query(boost_documents): 20 | print("Boost Documents:") 21 | print(boost_documents) 22 | boosts = " ".join([f'"{b["doc"]}"^{b["boost"]}' 23 | for b in boost_documents]) 24 | print(f"\nBoost Query: \n{boosts}\n") 25 | return boosts 26 | 27 | def boosted_product_search_request(query, collection, boost_field=None): 28 | signals_documents = search_for_boosts(query, collection) 29 | signals_boosts = create_boosts_query(signals_documents) 30 | boosted_request = product_search_request(query) 31 | if boost_field: 32 | signals_boosts = (boost_field, signals_boosts) 33 | boosted_request["query_boosts"] = 
signals_boosts 34 | return boosted_request -------------------------------------------------------------------------------- /aips/spark/__init__.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | from pyspark.sql.functions import col, udf 4 | from pyspark.sql.types import StringType 5 | 6 | from aips.environment import AIPS_ZK_HOST 7 | from engines.opensearch.config import OPENSEARCH_URL 8 | 9 | def create_view_from_collection(collection, view_name, spark=None): 10 | if not spark: 11 | spark = SparkSession.builder.appName("AIPS").getOrCreate() 12 | match collection.get_engine_name(): 13 | case "solr": 14 | opts = {"zkhost": AIPS_ZK_HOST, "collection": collection.name} 15 | spark.read.format("solr").options(**opts).load().createOrReplaceTempView(view_name) 16 | case "opensearch": 17 | parse_id_udf = udf(lambda s: s["_id"], StringType()) 18 | opts = {"opensearch.nodes": OPENSEARCH_URL, 19 | "opensearch.net.ssl": "false", 20 | "opensearch.read.metadata": "true"} 21 | dataframe = spark.read.format("opensearch").options(**opts).load(collection.name) 22 | if "_metadata" in dataframe.columns: 23 | dataframe = dataframe.withColumn("id", parse_id_udf(col("_metadata"))) 24 | dataframe = dataframe.drop("_metadata") 25 | print(dataframe.columns) 26 | dataframe.createOrReplaceTempView(view_name) 27 | case _: 28 | raise NotImplementedError(type(collection)) -------------------------------------------------------------------------------- /aips/spark/dataframe.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql.functions import lit 3 | 4 | def from_csv(file, more_opts=False, log=True): 5 | if log: 6 | print(f"Loading {file}") 7 | spark = SparkSession.builder.appName("AIPS").getOrCreate() 8 | reader = spark.read.format("csv").option("header", "true").option("inferSchema", "true") 9 | if more_opts: 10 | reader = reader.option("charset", "utf-8").option("quote", "\"").option("escape", "\"").option("multiLine","true").option("delimiter", ",") 11 | dataframe = reader.load(file) 12 | if more_opts and "category" in more_opts: 13 | # We can rely on automatic generation of IDs, or we can create them ourselves. 
14 | # If we do it, comment out previous line 15 | # .withColumn("id", concat(col("category"), lit("_") col("id"))) 16 | dataframe = dataframe.withColumn("category", lit(more_opts.get("category"))).drop("id") 17 | 18 | if log: 19 | print("Schema: ") 20 | dataframe.printSchema() 21 | 22 | return dataframe 23 | 24 | def from_sql(query, spark=None): 25 | if not spark: 26 | spark = SparkSession.builder.appName("AIPS").getOrCreate() 27 | return spark.sql(query) -------------------------------------------------------------------------------- /build/Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax = docker/dockerfile:1.5 2 | FROM jupyter/pyspark-notebook:spark-3.3.1 3 | USER root 4 | 5 | # Install gcc, c++, and related dependencies needed for pip to build some python dependencies 6 | RUN sudo apt-get -y update && \ 7 | apt-get install -y --reinstall build-essential gcc cargo && \ 8 | rm -rf /var/lib/apt/lists/ 9 | 10 | # Install Spark-Solr 11 | ENV SPARK_SOLR_VERSION=4.0.2 12 | ENV SHADED_SOLR_JAR_PATH=/usr/local/spark/lib/spark-solr-${SPARK_SOLR_VERSION}-shaded.jar 13 | RUN mkdir -p /usr/local/spark/lib/ && cd /usr/local/spark/lib/ && \ 14 | wget -q https://repo1.maven.org/maven2/com/lucidworks/spark/spark-solr/${SPARK_SOLR_VERSION}/spark-solr-${SPARK_SOLR_VERSION}-shaded.jar -O $SHADED_SOLR_JAR_PATH && \ 15 | echo "c5293f10257603bcf650780afcb91ed1bb118f09feb731502c2dc7ac14ba950e586a033cb2f50e5c122c5ec442dc0d2b55f76c4f6522b555e67f4981a38bca26 *spark-solr-${SPARK_SOLR_VERSION}-shaded.jar" \ 16 | | sha512sum -c - && chmod 0777 $SHADED_SOLR_JAR_PATH 17 | 18 | # Install Spark-OpenSearch 19 | ENV SPARK_OS_VERSION=1.2.0 20 | ENV SPARK_OS_JAR=opensearch-spark-30_2.12-${SPARK_OS_VERSION}.jar 21 | ENV SPARK_OS_PATH=/usr/local/spark/lib/${SPARK_OS_JAR} 22 | RUN cd /usr/local/spark/lib/ && \ 23 | wget -q https://repo1.maven.org/maven2/org/opensearch/client/opensearch-spark-30_2.12/${SPARK_OS_VERSION}/${SPARK_OS_JAR} -O $SPARK_OS_PATH && \ 24 | echo "5b9ae056b6ac21ae009f79a3a761774c7178b995fbe035572a4f35d5738e055d02828d2ec0ff98dd063ffffe37f4c48dc9a418d71269fc560f65b33c94493f2d *${SPARK_OS_JAR}" \ 25 | | sha512sum -c - && chmod 0777 $SPARK_OS_PATH 26 | 27 | WORKDIR /home/$NB_USER 28 | 29 | # Install Python dependencies 30 | COPY build/ch5_spacy_requirements.txt /home/$NB_USER 31 | RUN conda create --name ch5-spacy python=3.10.0 -y 32 | RUN conda run --name ch5-spacy pip install -r ch5_spacy_requirements.txt 33 | RUN conda run --name ch5-spacy python -m spacy download en_core_web_sm 34 | RUN conda run --name ch5-spacy python -m ipykernel install --name ch5-spacy --display-name "[ONLY FOR CH5.1] spaCy experimental kernel" 35 | 36 | ENV BLIS_ARCH="generic" PIP_CACHE_DIR=/var/cache/pip 37 | RUN mkdir -p $PIP_CACHE_DIR 38 | COPY build/requirements.txt /home/$NB_USER 39 | RUN --mount=type=cache,target=$PIP_CACHE_DIR \ 40 | pip install -r requirements.txt && \ 41 | python -m spacy download en_core_web_sm 42 | RUN rm ch5_spacy_requirements.txt requirements.txt 43 | 44 | # Configure home directory 45 | COPY build/log4j.properties /usr/local/spark/conf/ 46 | COPY aips/ /home/$NB_USER/aips 47 | COPY chapters/ /home/$NB_USER/chapters 48 | COPY data/ /home/$NB_USER/data 49 | COPY engines/ /home/$NB_USER/engines 50 | COPY ltr/ /home/$NB_USER/ltr 51 | COPY semantic_search/ /home/$NB_USER/semantic_search 52 | COPY webserver/ /home/$NB_USER/webserver 53 | COPY build/ipython_kernel_config.py /etc/ipython/ 54 | RUN rm -rf work/ 55 | 56 | # Change to notebook user 57 | 
RUN chown -R $NB_UID:$NB_UID /home/$NB_USER 58 | RUN fix-permissions /home/$NB_USER 59 | USER $NB_USER 60 | 61 | # Spark Config 62 | ENV SPARK_OPTS="$SPARK_OPTS --driver-java-options=\"-DXlint:none -Dlog4j.logLevel=error -Dallow-access=java.nio.DirectByteBuffer -Dlog4j.logger.org.apache.spark.repl.Main=ERROR\" --spark.ui.showConsoleProgress=False" \ 63 | PYSPARK_SUBMIT_ARGS="-c spark.driver.defaultJavaOptions=\"-DXlint=none -Dlog4j.logLevel=error -Dallow-access=java.nio.DirectByteBuffer\" -c spark.ui.showConsoleProgress=False --jars $SHADED_SOLR_JAR_PATH,$SPARK_OS_PATH pyspark-shell" \ 64 | PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-*-src.zip:%PYTHONPATH% \ 65 | DOCKER_STACKS_JUPYTER_CMD=lab 66 | 67 | # If you want to edit the notebooks and have your changes persist, 68 | # uncomment the line below and restart with `docker compose up --build` 69 | #WORKDIR /tmp/notebooks 70 | 71 | # Mark all notebooks as trusted by default 72 | RUN find . -name \*.ipynb -print0|xargs -0 jupyter-trust -y 73 | 74 | # Start Jupyter Notebooks 75 | RUN jupyter labextension disable "@jupyterlab/apputils-extension:announcements" 76 | CMD start-notebook.sh --ServerApp.password='' \ 77 | --ServerApp.token='' --NotebookApp.token='' --LabApp.token='' \ 78 | --LabApp.default_url='/lab/tree/chapters/welcome.ipynb' \ 79 | --NotebookApp.allow_origin='*' --NotebookApp.ip='0.0.0.0' --ServerApp.ip=0.0.0.0 --no-browser -------------------------------------------------------------------------------- /build/ch5_spacy_requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | ipykernel 3 | spacy==3.4.4 4 | numpy==1.26.4 5 | matplotlib==3.9.2 6 | networkx==3.3 7 | https://github.com/explosion/spacy-experimental/releases/download/v0.6.1/en_coreference_web_trf-3.4.0a2-py3-none-any.whl -------------------------------------------------------------------------------- /build/ipython_kernel_config.py: -------------------------------------------------------------------------------- 1 | # Configuration file for ipython-kernel. 2 | # See 3 | 4 | # With IPython >= 6.0.0, all outputs to stdout/stderr are captured. 5 | # It is the case for subprocesses and output of compiled libraries like Spark. 6 | # Those logs now both head to notebook logs and in notebooks outputs. 7 | # Logs are particularly verbose with Spark, that is why we turn them off through this flag. 8 | # 9 | 10 | # Attempt to capture and forward low-level output, e.g. produced by Extension 11 | # libraries. 12 | # Default: True 13 | # type:ignore 14 | c.IPKernelApp.capture_fd_output = False # noqa: F821 15 | c.IPKernelApp.code_to_run = "cd /home/jovyan" -------------------------------------------------------------------------------- /build/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=ERROR, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}:> 7 | 8 | # Set the default spark-shell log level to WARN. When running the spark-shell, > 9 | # log level for this class is used to overwrite the root logger's log level, so> 10 | # the user can have different defaults for the shell and regular Spark apps. 
11 | log4j.logger.org.apache.spark.repl.Main=ERROR 12 | 13 | # Settings to quiet third party logs that are too verbose 14 | log4j.logger.org.sparkproject.jetty=ERROR 15 | log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR 16 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=ERROR 17 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=ERROR 18 | log4j.logger.org.apache.parquet=ERROR 19 | log4j.logger.parquet=ERROR 20 | 21 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent U> 22 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 23 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR 24 | 25 | # For deploying Spark ThriftServer 26 | # SPARK-34128:Suppress undesirable TTransportException warnings involved in TH> 27 | log4j.appender.console.filter.1=org.apache.log4j.varia.StringMatchFilter 28 | log4j.appender.console.filter.1.StringToMatch=Thrift error occurred during proc> 29 | log4j.appender.console.filter.1.AcceptOnMatch=false -------------------------------------------------------------------------------- /build/requirements.txt: -------------------------------------------------------------------------------- 1 | #Implicit dependencies with set versions for optimization 2 | adjusttext==0.8 3 | mdurl==0.1.2 4 | mizani==0.11.4 5 | patsy==0.5.6 6 | wcwidth==0.2.13 7 | pandas==2.2.3 8 | contourpy==1.0.7 9 | thinc==8.1.0 10 | numba==0.60.0 11 | wrapit==0.3.0 12 | h5py==3.12.1 13 | gitpython==3.1.43 14 | 15 | #Forced versions to prevent implicit dep version errors 16 | # (these sub packages are only used indirectly through normal deps) 17 | bottleneck==1.4.0 18 | numexpr==2.10.1 19 | pyarrow==17.0.0 20 | statsmodels==0.14.4 21 | networkx==3.3 22 | 23 | #Normal deps 24 | accelerate==0.34.2 25 | beautifulsoup4==4.12.3 26 | lxml==5.3.0 27 | datasets==3.0.1 28 | jupyter-console==6.6.3 29 | matplotlib==3.9.2 30 | nltk==3.9.1 31 | nmslib==2.1.1 32 | plotly==5.24.1 33 | plotnine==0.13.5 34 | openai-clip==1.0.1 35 | torchvision==0.20.1 36 | numpy==1.26.4 37 | scipy==1.14.1 38 | #scikit-learn==1.5.2 39 | scikit-learn==1.2.1 40 | spacy==3.5.0 41 | staticmap==0.5.7 42 | faiss-cpu==1.9.0 43 | sentence-transformers==3.1.1 44 | spladerunner==0.1.6 -------------------------------------------------------------------------------- /chapters/ch05/licenses/hearst.NOTICE.txt: -------------------------------------------------------------------------------- 1 | (From https://github.com/mmichelsonIF/hearst_patterns_python) 2 | -------------- 3 | 4 | Code in the notebook "1.open-information-extraction.ipynb" 5 | related the Hearst patterns is reused in whole or in part 6 | from https://github.com/mmichelsonIF/hearst_patterns_python, 7 | which is licensed under the Apache (Software) License, 8 | version 2.0 ("the License"), and is subject to the following notice: 9 | 10 | 11 | Copyright 2016-2019 mmichelsonIF (https://github.com/mmichelsonIF) 12 | 13 | Licensed under the Apache License, Version 2.0 (the "License"); 14 | you may not use this file except in compliance with the License. 15 | You may obtain a copy of the License at 16 | 17 | http://www.apache.org/licenses/LICENSE-2.0 18 | 19 | Unless required by applicable law or agreed to in writing, software 20 | distributed under the License is distributed on an "AS IS" BASIS, 21 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
22 | See the License for the specific language governing permissions and 23 | limitations under the License. -------------------------------------------------------------------------------- /chapters/ch10/1.setup-the-movie-db.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# [ Chapter 10 - Learning to Rank for Generalizable Search Relevance ]\n", 8 | "# Setup TheMovieDB Collection" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "from aips import get_engine, get_ltr_engine\n", 18 | "engine = get_engine()" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## Create Collection\n", 26 | "\n", 27 | "Create collection for http://themoviedb.org (TMDB) dataset for this book. We will just look at title, overview, and release_year fields." 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "Wiping \"tmdb\" collection\n", 40 | "Creating \"tmdb\" collection\n", 41 | "Status: Success\n", 42 | "Adding LTR QParser for tmdb collection\n", 43 | "Adding LTR Doc Transformer for tmdb collection\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "tmdb_collection = engine.create_collection(\"tmdb\")\n", 49 | "get_ltr_engine(tmdb_collection).enable_ltr()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## Download and index data\n", 57 | "\n", 58 | "Download TMDB data and index. We also download a judgment list, labeled movies as relevant/irrelevant for several movie queries" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 3, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "name": "stdout", 68 | "output_type": "stream", 69 | "text": [ 70 | "GET https://github.com/ai-powered-search/tmdb/raw/main/judgments.tgz\n", 71 | "GET https://github.com/ai-powered-search/tmdb/raw/main/movies.tgz\n", 72 | "Successfully written 65616 documents\n" 73 | ] 74 | } 75 | ], 76 | "source": [ 77 | "from ltr.download import download, extract_tgz \n", 78 | "from aips.data_loaders.movies import load_dataframe\n", 79 | "import tarfile\n", 80 | "import json\n", 81 | "\n", 82 | "dataset = [\"https://github.com/ai-powered-search/tmdb/raw/main/judgments.tgz\", \n", 83 | " \"https://github.com/ai-powered-search/tmdb/raw/main/movies.tgz\"]\n", 84 | "download(dataset, dest=\"data/\")\n", 85 | "extract_tgz(\"data/movies.tgz\", \"data/\") # -> Holds \"tmdb.json\", big json dict with corpus\n", 86 | "extract_tgz(\"data/judgments.tgz\", \"data/\") # -> Holds \"ai_pow_search_judgments.txt\", \n", 87 | " # which is our labeled judgment list\n", 88 | "\n", 89 | "movies_dataframe = load_dataframe(\"data/tmdb.json\")\n", 90 | "tmdb_collection.write(movies_dataframe)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "## Next Up, judgments and feature logging\n", 98 | "\n", 99 | "Next up we use a _judgment list_, a set of labeled relevant / irrelevant movies for search query strings. 
We then extract some features from the search engine to setup a full training set we can use to train a model.\n", 100 | "\n", 101 | "Up next: [judgments and Logging](2.judgments-and-logging.ipynb)" 102 | ] 103 | } 104 | ], 105 | "metadata": { 106 | "kernelspec": { 107 | "display_name": "Python 3 (ipykernel)", 108 | "language": "python", 109 | "name": "python3" 110 | }, 111 | "language_info": { 112 | "codemirror_mode": { 113 | "name": "ipython", 114 | "version": 3 115 | }, 116 | "file_extension": ".py", 117 | "mimetype": "text/x-python", 118 | "name": "python", 119 | "nbconvert_exporter": "python", 120 | "pygments_lexer": "ipython3", 121 | "version": "3.10.9" 122 | } 123 | }, 124 | "nbformat": 4, 125 | "nbformat_minor": 4 126 | } 127 | -------------------------------------------------------------------------------- /chapters/ch11/0.setup.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# [ Chapter 11 - Automating Learning to Rank with Click Models ]\n", 8 | "# Indexing Search Sessions Data" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "**NOTE**: This notebook depends upon the the Retrotech dataset. If you have any issues, please rerun the [Setting up the Retrotech Dataset](../ch04/1.setting-up-the-retrotech-dataset.ipyn) notebook or execute the next cell uncommented." 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import sys\n", 25 | "sys.path.append(\"..\")\n", 26 | "from ltr import download" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "Download simulated raw clickstream data" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "Already up to date.\n" 46 | ] 47 | } 48 | ], 49 | "source": [ 50 | "![ ! -d 'retrotech' ] && git clone --depth 1 https://github.com/ai-powered-search/retrotech.git\n", 51 | "! cd retrotech && git pull\n", 52 | "! 
cp retrotech/sessions/* data/" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | "Already up to date.\n", 65 | "products.csv\n", 66 | "signals.csv\n", 67 | "\"upc\",\"name\",\"manufacturer\",\"short_description\",\"long_description\"\n", 68 | "\"096009010836\",\"Fists of Bruce Lee - Dolby - DVD\", , , \n", 69 | "\"043396061965\",\"The Professional - Widescreen Uncut - DVD\", , , \n", 70 | "\"085391862024\",\"Pokemon the Movie: 2000 - DVD\", , , \n", 71 | "\"067003016025\",\"Summerbreeze - CD\",\"Nettwerk\", , \n", 72 | "\"731454813822\",\"Back for the First Time [PA] - CD\",\"Def Jam South\", , \n", 73 | "\"024543008200\",\"Big Momma's House - Widescreen - DVD\", , , \n", 74 | "\"031398751823\",\"Kids - DVD\", , , \n", 75 | "\"037628413929\",\"20 Grandes Exitos - CD\",\"Sony Discos Inc.\", , \n", 76 | "\"060768972223\",\"Power Of Trinity (Box) - CD\",\"Sanctuary Records\", , \n", 77 | "Wiping \"products\" collection\n", 78 | "Creating \"products\" collection\n", 79 | "Status: Success\n", 80 | "Loading Products\n", 81 | "Schema: \n", 82 | "root\n", 83 | " |-- upc: long (nullable = true)\n", 84 | " |-- name: string (nullable = true)\n", 85 | " |-- manufacturer: string (nullable = true)\n", 86 | " |-- short_description: string (nullable = true)\n", 87 | " |-- long_description: string (nullable = true)\n", 88 | "\n" 89 | ] 90 | } 91 | ], 92 | "source": [ 93 | "#%run chapters/ch04/1.setting-up-the-retrotech-dataset.ipynb" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "Up next: [Your First Click Model: Click Thru Rate](1.click-through-rate-judgments.ipynb)" 101 | ] 102 | } 103 | ], 104 | "metadata": { 105 | "kernelspec": { 106 | "display_name": "Python 3 (ipykernel)", 107 | "language": "python", 108 | "name": "python3" 109 | }, 110 | "language_info": { 111 | "codemirror_mode": { 112 | "name": "ipython", 113 | "version": 3 114 | }, 115 | "file_extension": ".py", 116 | "mimetype": "text/x-python", 117 | "name": "python", 118 | "nbconvert_exporter": "python", 119 | "pygments_lexer": "ipython3", 120 | "version": "3.10.9" 121 | } 122 | }, 123 | "nbformat": 4, 124 | "nbformat_minor": 2 125 | } 126 | -------------------------------------------------------------------------------- /chapters/ch13/outdoors_golden_answers.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/chapters/ch13/outdoors_golden_answers.xlsx -------------------------------------------------------------------------------- /chapters/ch13/pull_aips_dependency.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('../..') 3 | import requests 4 | 5 | def pull_dependency(file_name): 6 | print ("Pulling: \"" + file_name + "\"") 7 | with open(file_name, "wb") as file: 8 | for part in map(chr,range(ord('a'),ord('z')+1)): 9 | part_name = "part_" + str(part) 10 | response = requests.get("https://github.com/ai-powered-search/aips-build-dependencies/raw/main/" + file_name + "/" + part_name) 11 | if response.status_code == 200: 12 | print("Successfully downloaded " + part_name) 13 | file.write(response.content) 14 | elif response.status_code == 404: 15 | break 16 | else: 17 | raise Exception("Error: Status Code " + response.status_code + "\n" + 
response.text) 18 | print(file_name + " successfully pulled") 19 | 20 | if len(sys.argv) == 2: 21 | pull_dependency(sys.argv[1]) 22 | -------------------------------------------------------------------------------- /chapters/ch15/delorean-query.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/chapters/ch15/delorean-query.jpg -------------------------------------------------------------------------------- /data/retrotech/images/021331131393.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/021331131393.jpg -------------------------------------------------------------------------------- /data/retrotech/images/027242755871.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/027242755871.jpg -------------------------------------------------------------------------------- /data/retrotech/images/027242831599.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/027242831599.jpg -------------------------------------------------------------------------------- /data/retrotech/images/037988909926.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/037988909926.jpg -------------------------------------------------------------------------------- /data/retrotech/images/037988910045.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/037988910045.jpg -------------------------------------------------------------------------------- /data/retrotech/images/037988910182.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/037988910182.jpg -------------------------------------------------------------------------------- /data/retrotech/images/037988910250.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/037988910250.jpg -------------------------------------------------------------------------------- /data/retrotech/images/037988910427.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/037988910427.jpg -------------------------------------------------------------------------------- /data/retrotech/images/048231316835.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/048231316835.jpg -------------------------------------------------------------------------------- /data/retrotech/images/05024545249224.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/05024545249224.jpg -------------------------------------------------------------------------------- /data/retrotech/images/072244106916.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/072244106916.jpg -------------------------------------------------------------------------------- /data/retrotech/images/12505382925.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/12505382925.jpg -------------------------------------------------------------------------------- /data/retrotech/images/12505451713.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/12505451713.jpg -------------------------------------------------------------------------------- /data/retrotech/images/12505525766.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/12505525766.jpg -------------------------------------------------------------------------------- /data/retrotech/images/12505527456.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/12505527456.jpg -------------------------------------------------------------------------------- /data/retrotech/images/12505559105.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/12505559105.jpg -------------------------------------------------------------------------------- /data/retrotech/images/14381196320.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/14381196320.jpg -------------------------------------------------------------------------------- /data/retrotech/images/21331131393.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/21331131393.jpg -------------------------------------------------------------------------------- /data/retrotech/images/23272335397.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/23272335397.jpg -------------------------------------------------------------------------------- /data/retrotech/images/24543701538.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/24543701538.jpg -------------------------------------------------------------------------------- /data/retrotech/images/25192107191.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/25192107191.jpg -------------------------------------------------------------------------------- /data/retrotech/images/27108936499.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/27108936499.jpg -------------------------------------------------------------------------------- /data/retrotech/images/27242752436.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/27242752436.jpg -------------------------------------------------------------------------------- /data/retrotech/images/27242755871.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/27242755871.jpg -------------------------------------------------------------------------------- /data/retrotech/images/27242798236.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/27242798236.jpg -------------------------------------------------------------------------------- /data/retrotech/images/27242831599.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/27242831599.jpg -------------------------------------------------------------------------------- /data/retrotech/images/32429037763.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/32429037763.jpg -------------------------------------------------------------------------------- /data/retrotech/images/36172950027.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/36172950027.jpg -------------------------------------------------------------------------------- /data/retrotech/images/36725560390.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/36725560390.jpg -------------------------------------------------------------------------------- /data/retrotech/images/36725560451.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/36725560451.jpg -------------------------------------------------------------------------------- /data/retrotech/images/36725561977.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/36725561977.jpg -------------------------------------------------------------------------------- /data/retrotech/images/36725569331.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/36725569331.jpg -------------------------------------------------------------------------------- /data/retrotech/images/36725569454.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/36725569454.jpg -------------------------------------------------------------------------------- /data/retrotech/images/36725578241.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/36725578241.jpg -------------------------------------------------------------------------------- /data/retrotech/images/37988909926.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/37988909926.jpg -------------------------------------------------------------------------------- /data/retrotech/images/37988910045.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/37988910045.jpg -------------------------------------------------------------------------------- /data/retrotech/images/37988910182.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/37988910182.jpg -------------------------------------------------------------------------------- /data/retrotech/images/37988910250.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/37988910250.jpg -------------------------------------------------------------------------------- /data/retrotech/images/37988910427.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/37988910427.jpg -------------------------------------------------------------------------------- /data/retrotech/images/400032015667.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/400032015667.jpg -------------------------------------------------------------------------------- /data/retrotech/images/400037252074.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/400037252074.jpg -------------------------------------------------------------------------------- /data/retrotech/images/400037252258.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/400037252258.jpg -------------------------------------------------------------------------------- /data/retrotech/images/400037252432.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/400037252432.jpg -------------------------------------------------------------------------------- /data/retrotech/images/400037252616.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/400037252616.jpg -------------------------------------------------------------------------------- /data/retrotech/images/400037252890.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/400037252890.jpg -------------------------------------------------------------------------------- /data/retrotech/images/400037253316.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/400037253316.jpg -------------------------------------------------------------------------------- /data/retrotech/images/400192926087.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/400192926087.jpg -------------------------------------------------------------------------------- /data/retrotech/images/45626176.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/45626176.jpg -------------------------------------------------------------------------------- /data/retrotech/images/47875841406.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/47875841406.jpg -------------------------------------------------------------------------------- /data/retrotech/images/47875841420.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/47875841420.jpg -------------------------------------------------------------------------------- /data/retrotech/images/47875842328.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/47875842328.jpg -------------------------------------------------------------------------------- /data/retrotech/images/47875842335.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/47875842335.jpg -------------------------------------------------------------------------------- /data/retrotech/images/48231011396.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/48231011396.jpg -------------------------------------------------------------------------------- /data/retrotech/images/48231011402.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/48231011402.jpg -------------------------------------------------------------------------------- /data/retrotech/images/48231316835.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/48231316835.jpg -------------------------------------------------------------------------------- /data/retrotech/images/48231317436.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/48231317436.jpg -------------------------------------------------------------------------------- /data/retrotech/images/48231317498.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/48231317498.jpg -------------------------------------------------------------------------------- /data/retrotech/images/5024545249224.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/5024545249224.jpg -------------------------------------------------------------------------------- /data/retrotech/images/600603105265.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/600603105265.jpg -------------------------------------------------------------------------------- /data/retrotech/images/600603138423.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/600603138423.jpg -------------------------------------------------------------------------------- /data/retrotech/images/603497664429.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/603497664429.jpg -------------------------------------------------------------------------------- /data/retrotech/images/610839379408.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/610839379408.jpg -------------------------------------------------------------------------------- /data/retrotech/images/612572171585.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/612572171585.jpg -------------------------------------------------------------------------------- /data/retrotech/images/635753490879.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/635753490879.jpg -------------------------------------------------------------------------------- /data/retrotech/images/635753493559.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/635753493559.jpg -------------------------------------------------------------------------------- /data/retrotech/images/635753493573.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/635753493573.jpg -------------------------------------------------------------------------------- /data/retrotech/images/665331101927.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/665331101927.jpg -------------------------------------------------------------------------------- /data/retrotech/images/694318011294.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/694318011294.jpg -------------------------------------------------------------------------------- /data/retrotech/images/696055169191.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/696055169191.jpg -------------------------------------------------------------------------------- /data/retrotech/images/708056579739.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/708056579739.jpg -------------------------------------------------------------------------------- /data/retrotech/images/708056579746.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/708056579746.jpg -------------------------------------------------------------------------------- /data/retrotech/images/708431390614.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/708431390614.jpg -------------------------------------------------------------------------------- /data/retrotech/images/711719842309.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/711719842309.jpg -------------------------------------------------------------------------------- /data/retrotech/images/716829772249.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/716829772249.jpg -------------------------------------------------------------------------------- /data/retrotech/images/72244106916.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/72244106916.jpg -------------------------------------------------------------------------------- /data/retrotech/images/722868830062.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/722868830062.jpg -------------------------------------------------------------------------------- /data/retrotech/images/722868840177.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/722868840177.jpg -------------------------------------------------------------------------------- /data/retrotech/images/74108007469.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/74108007469.jpg -------------------------------------------------------------------------------- /data/retrotech/images/74108056764.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/74108056764.jpg -------------------------------------------------------------------------------- /data/retrotech/images/74108096487.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/74108096487.jpg -------------------------------------------------------------------------------- /data/retrotech/images/77283045400.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/77283045400.jpg -------------------------------------------------------------------------------- /data/retrotech/images/783722274422.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/783722274422.jpg -------------------------------------------------------------------------------- /data/retrotech/images/786936817218.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/786936817218.jpg -------------------------------------------------------------------------------- /data/retrotech/images/793447512228.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/793447512228.jpg -------------------------------------------------------------------------------- /data/retrotech/images/803238004525.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/803238004525.jpg -------------------------------------------------------------------------------- /data/retrotech/images/821793013776.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/821793013776.jpg -------------------------------------------------------------------------------- /data/retrotech/images/826663114164.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/826663114164.jpg -------------------------------------------------------------------------------- /data/retrotech/images/826663126044.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/826663126044.jpg -------------------------------------------------------------------------------- /data/retrotech/images/843163089211.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/843163089211.jpg -------------------------------------------------------------------------------- /data/retrotech/images/843404073153.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/843404073153.jpg -------------------------------------------------------------------------------- /data/retrotech/images/84691170679.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/84691170679.jpg -------------------------------------------------------------------------------- /data/retrotech/images/84691211174.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/84691211174.jpg -------------------------------------------------------------------------------- /data/retrotech/images/84691226703.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/84691226703.jpg -------------------------------------------------------------------------------- /data/retrotech/images/84691226727.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/84691226727.jpg -------------------------------------------------------------------------------- /data/retrotech/images/848447000005.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/848447000005.jpg -------------------------------------------------------------------------------- /data/retrotech/images/848447000081.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/848447000081.jpg -------------------------------------------------------------------------------- /data/retrotech/images/848447000135.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/848447000135.jpg -------------------------------------------------------------------------------- /data/retrotech/images/856751002097.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/856751002097.jpg -------------------------------------------------------------------------------- /data/retrotech/images/878816004532.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/878816004532.jpg -------------------------------------------------------------------------------- /data/retrotech/images/883049066905.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/883049066905.jpg -------------------------------------------------------------------------------- /data/retrotech/images/883929085118.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/883929085118.jpg -------------------------------------------------------------------------------- /data/retrotech/images/883929106172.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/883929106172.jpg -------------------------------------------------------------------------------- /data/retrotech/images/883929154012.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/883929154012.jpg -------------------------------------------------------------------------------- /data/retrotech/images/884116069973.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/884116069973.jpg -------------------------------------------------------------------------------- /data/retrotech/images/885038021209.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885038021209.jpg -------------------------------------------------------------------------------- /data/retrotech/images/885038024644.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885038024644.jpg -------------------------------------------------------------------------------- /data/retrotech/images/885038024651.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885038024651.jpg -------------------------------------------------------------------------------- /data/retrotech/images/885170045132.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885170045132.jpg -------------------------------------------------------------------------------- /data/retrotech/images/885370315080.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885370315080.jpg -------------------------------------------------------------------------------- /data/retrotech/images/885370325348.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885370325348.jpg -------------------------------------------------------------------------------- /data/retrotech/images/885909300549.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885909300549.jpg -------------------------------------------------------------------------------- /data/retrotech/images/885909393404.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885909393404.jpg -------------------------------------------------------------------------------- /data/retrotech/images/885909394845.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885909394845.jpg -------------------------------------------------------------------------------- /data/retrotech/images/885909395095.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885909395095.jpg -------------------------------------------------------------------------------- /data/retrotech/images/885909457588.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885909457588.jpg -------------------------------------------------------------------------------- /data/retrotech/images/885909457595.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885909457595.jpg -------------------------------------------------------------------------------- /data/retrotech/images/885909457601.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885909457601.jpg -------------------------------------------------------------------------------- /data/retrotech/images/885909457632.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885909457632.jpg -------------------------------------------------------------------------------- /data/retrotech/images/885909471812.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885909471812.jpg -------------------------------------------------------------------------------- /data/retrotech/images/885909472376.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885909472376.jpg -------------------------------------------------------------------------------- /data/retrotech/images/886111271283.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/886111271283.jpg -------------------------------------------------------------------------------- /data/retrotech/images/886111287055.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/886111287055.jpg -------------------------------------------------------------------------------- /data/retrotech/images/886971404722.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/886971404722.jpg -------------------------------------------------------------------------------- /data/retrotech/images/886973561621.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/886973561621.jpg -------------------------------------------------------------------------------- /data/retrotech/images/92636260712.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/92636260712.jpg -------------------------------------------------------------------------------- /data/retrotech/images/93624956037.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/93624956037.jpg -------------------------------------------------------------------------------- /data/retrotech/images/93624995012.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/93624995012.jpg -------------------------------------------------------------------------------- /data/retrotech/images/97360716641.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/97360716641.jpg -------------------------------------------------------------------------------- /data/retrotech/images/97360722345.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/97360722345.jpg -------------------------------------------------------------------------------- /data/retrotech/images/97360724240.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/97360724240.jpg -------------------------------------------------------------------------------- /data/retrotech/images/97360810042.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/97360810042.jpg -------------------------------------------------------------------------------- /data/retrotech/images/97363532149.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/97363532149.jpg -------------------------------------------------------------------------------- /data/retrotech/images/97363560449.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/97363560449.jpg -------------------------------------------------------------------------------- /data/retrotech/images/97368920347.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/97368920347.jpg -------------------------------------------------------------------------------- /data/retrotech/images/unavailable.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/unavailable.jpg -------------------------------------------------------------------------------- /data/retrotech/templates/search-results.html: -------------------------------------------------------------------------------- 1 |
[search-results.html markup was stripped during extraction and is not recoverable here; the surviving fragments show the template renders each product result with "Name: ${NAME}" and "Manufacturer: ${MANUFACTURER}" placeholders.]
-------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | notebooks: 3 | build: 4 | context: . 5 | dockerfile: build/Dockerfile 6 | container_name: aips-notebooks 7 | ports: 8 | - 7077:7077 # Spark Master 9 | - 8082:8080 # Spark Master UI - 8082 less likely to conflict 10 | - 8081:8081 # Spark Worker UI 11 | - 4041:4041 # Spark UI 12 | - 8888:8888 # Jupyter Notebook UI 13 | - 2345:2345 # Search Webserver 14 | networks: 15 | - solr-network 16 | - opensearch-network 17 | restart: unless-stopped 18 | environment: 19 | #PYSPARK_SUBMIT_ARGS: '--jars /usr/local/spark/lib/spark-solr-4.0.0-shaded.jar pyspark-shell' 20 | #NB_USER: 'aips' 21 | #NB_UID: 1010 22 | #NB_GID: 1020 23 | #CHOWN_HOME: 'yes' 24 | #CHOWN_HOME_OPTS: -R 25 | SOLR_HOST: 'aips-solr' 26 | volumes: 27 | - type: bind 28 | source: "." 29 | target: "/tmp/notebooks/" 30 | profiles: 31 | - all 32 | - "" 33 | 34 | solr: 35 | build: 36 | context: ./engines/solr/build/ 37 | dockerfile: Dockerfile 38 | container_name: aips-solr 39 | hostname: aips-solr 40 | ports: 41 | - 8983:8983 42 | environment: 43 | - ZK_HOST=aips-zk:2181 44 | - SOLR_HOST=aips-solr 45 | networks: 46 | - zk-solr 47 | - solr-network 48 | restart: unless-stopped 49 | depends_on: 50 | - zookeeper 51 | - notebooks 52 | profiles: 53 | - all 54 | - "" 55 | 56 | opensearch: 57 | build: 58 | context: ./engines/opensearch/build/ 59 | dockerfile: engine-Dockerfile 60 | container_name: opensearch-node1 61 | hostname: aips-opensearch 62 | environment: 63 | - cluster.name=opensearch-cluster 64 | - node.name=opensearch-node1 65 | - discovery.type=single-node 66 | - network.host=0.0.0.0 67 | - bootstrap.memory_lock=true # along with the memlock settings below, disables swapping 68 | - "OPENSEARCH_INITIAL_ADMIN_PASSWORD=''" 69 | - "DISABLE_SECURITY_PLUGIN=true" 70 | - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" # minimum and maximum Java heap size, recommend setting both to 50% of system RAM 71 | ulimits: 72 | memlock: 73 | soft: -1 74 | hard: -1 75 | nofile: 76 | soft: 262114 # maximum number of open files for the OpenSearch user, set to at least 65536 on modern systems 77 | hard: 262114 78 | volumes: 79 | - opensearch-data:/usr/share/opensearch/data 80 | ports: 81 | - 9200:9200 82 | - 9600:9600 # required for Performance Analyzer 83 | expose: 84 | - 9200:9200 85 | networks: 86 | - opensearch-network 87 | depends_on: 88 | - notebooks 89 | - opensearch-dashboards 90 | profiles: 91 | - all 92 | 93 | opensearch-dashboards: 94 | image: opensearchproject/opensearch-dashboards:2.14.0 95 | container_name: opensearch-dashboards 96 | ports: 97 | - 5601:5601 98 | expose: 99 | - 5601:5601 100 | environment: 101 | DISABLE_SECURITY_DASHBOARDS_PLUGIN: "true" 102 | OPENSEARCH_HOSTS: '["http://opensearch-node1:9200"]' 103 | networks: 104 | - opensearch-network 105 | profiles: 106 | - all 107 | 108 | zookeeper: 109 | image: zookeeper:3.5.8 110 | container_name: aips-zk 111 | hostname: aips-zk 112 | ports: 113 | - 2181:2128 114 | networks: 115 | - zk-solr 116 | - solr-network 117 | restart: unless-stopped 118 | profiles: 119 | - all 120 | - "" 121 | 122 | volumes: 123 | opensearch-data: 124 | 125 | networks: 126 | zk-solr: 127 | solr-network: 128 | opensearch-network: 129 | -------------------------------------------------------------------------------- /engines/Collection.py: -------------------------------------------------------------------------------- 1 | from 
abc import ABC, abstractmethod 2 | import aips.environment as env 3 | import json 4 | 5 | class Collection(ABC): 6 | def __init__(self, name): 7 | self.name = name 8 | 9 | @abstractmethod 10 | def get_engine_name(self): 11 | "Returns the name of the search engine for the collection" 12 | pass 13 | 14 | @abstractmethod 15 | def commit(self): 16 | "Force the collection to commit all uncommited data into the collection" 17 | pass 18 | 19 | @abstractmethod 20 | def write(self, dataframe): 21 | "Writes a pyspark dataframe containing documents into the collection" 22 | pass 23 | 24 | @abstractmethod 25 | def add_documents(self, docs, commit=True): 26 | "Adds a collection of documents into the collection" 27 | pass 28 | 29 | @abstractmethod 30 | def transform_request(self, **search_args): 31 | "Transforms a generic search request into a native search request" 32 | pass 33 | 34 | @abstractmethod 35 | def transform_response(self, search_response): 36 | "Transform a native search response into a generic search response" 37 | pass 38 | 39 | @abstractmethod 40 | def native_search(self, request=None): 41 | "Executes a search against the search engine given a native search request" 42 | pass 43 | 44 | @abstractmethod 45 | def spell_check(self, query, log=False): 46 | "Execute a spellcheck against the collection" 47 | pass 48 | 49 | def search(self, **search_args): 50 | """ 51 | Searches the collection 52 | :param str query: The main query for the search request 53 | :param str query_parser: The name of the query parser to use in the search 54 | :param list of str query_fields: the fields to query against 55 | :param list of str return_fields: the fields to return on each document 56 | :param list of tuple of str filters: A list of tuples (field, value) to filter the results by 57 | :param int limit: The number of results to return 58 | :param list of tuple of str order_by: A list of tuples (field, ASC/DESC) to order the results by 59 | :param str rerank_query: A query to rerank the results by 60 | :param str default_operator: Sets the default operator of the search query (AND/OR) 61 | :param str min_match: Specificies the minimum matching constraints for matching documents 62 | :param str query_boosts: A boost query to boost documents at query time 63 | :param tuple of str index_time_boosts: An index time boost 64 | :param boolean explain: Enables debugging on the request 65 | :param boolean log: Enables logging for the query 66 | :param boolean highlight: Returns results with highlight information (if supported) 67 | """ 68 | request = self.transform_request(**search_args) 69 | if "log" in search_args or env.get("PRINT_REQUESTS", False): 70 | print(json.dumps(request, indent=2)) 71 | search_response = self.native_search(request=request) 72 | if "log" in search_args: 73 | print(json.dumps(search_response, indent=2)) 74 | return self.transform_response(search_response) 75 | 76 | def hybrid_search(self, searches=[], limit=None, algorithm="rrf", algorithm_params={}): 77 | hybrid_search_results = None 78 | match algorithm: 79 | case "rrf": 80 | search_results = [self.search(**request)["docs"] 81 | for request in searches] 82 | 83 | hybrid_search_scores = reciprocal_rank_fusion(search_results, 84 | algorithm_params.get("k")) 85 | scored_docs = merge_search_results(search_results, 86 | hybrid_search_scores) 87 | return {"docs": scored_docs[:limit]} 88 | case "lexical_vector_rerank": 89 | lexical_search_request = searches[0] 90 | searches[1]["k"] = algorithm_params.get("k", 10) #TODO: should probably default 
to "limit" instead of 10 91 | lexical_search_request["rerank_query"] = searches[1] 92 | return self.search(**lexical_search_request) 93 | return hybrid_search_results 94 | 95 | def merge_search_results(search_results, scores): 96 | merged_results = {} 97 | for results in search_results: 98 | for doc in results: 99 | if doc["id"] in merged_results: 100 | merged_results[doc["id"]] = {**doc, **merged_results[doc["id"]]} 101 | else: 102 | merged_results[doc["id"]] = doc 103 | return [{**merged_results[id], "score": score} 104 | for id, score in scores.items()] 105 | 106 | 107 | def reciprocal_rank_fusion(search_results, k=None): 108 | if k is None: k = 60 109 | scores = {} 110 | for ranked_docs in search_results: 111 | for rank, doc in enumerate(ranked_docs, 1): 112 | scores[doc["id"]] = scores.get(doc["id"], 0) + (1.0 / (k + rank)) 113 | sorted_scores = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True)) 114 | return sorted_scores -------------------------------------------------------------------------------- /engines/Engine.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | class Engine(ABC): 4 | def __init__(self, name): 5 | self.name = name 6 | 7 | @abstractmethod 8 | def health_check(self): 9 | "Checks the state of the search engine returning a boolean" 10 | pass 11 | 12 | @abstractmethod 13 | def print_status(self, response): 14 | "Prints the resulting status of a search engine request" 15 | pass 16 | 17 | @abstractmethod 18 | def create_collection(self, name): 19 | "Create and initialize the schema for a collection, returns the initialized collection" 20 | pass 21 | 22 | @abstractmethod 23 | def get_collection(self, name): 24 | "Returns initialized object for a given collection" 25 | pass -------------------------------------------------------------------------------- /engines/EntityExtractor.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | class EntityExtractor(ABC): 4 | def __init__(self, collection): 5 | "The collection containing entities" 6 | self.collection = collection 7 | 8 | @abstractmethod 9 | def extract_entities(self, query): 10 | "Returns extracted entities and tag data for a given query" 11 | pass -------------------------------------------------------------------------------- /engines/LTR.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | class LTR(ABC): 4 | def __init__(self, collection): 5 | self.collection = collection 6 | 7 | @abstractmethod 8 | def enable_ltr(self, log=False): 9 | "Initializes LTR dependencies for a given collection" 10 | pass 11 | 12 | @abstractmethod 13 | def generate_feature(self, feature_name, params, feature_type): 14 | "Generates an LTR feature definition." 15 | pass 16 | 17 | @abstractmethod 18 | def generate_query_feature(self, feature_name, field_name, constant_score=False, value="(${keywords})"): 19 | "Generates an LTR query feature definition." 20 | pass 21 | 22 | @abstractmethod 23 | def generate_fuzzy_query_feature(self, feature_name, field_name): 24 | "Generates an LTR fuzzy query feature definition." 25 | pass 26 | 27 | @abstractmethod 28 | def generate_bigram_query_feature(self, feature_name, field_name): 29 | "Generates an LTR bigram query feature definition." 
30 |         pass
31 | 
32 |     @abstractmethod
33 |     def generate_field_value_feature(self, feature_name, field_name):
34 |         "Generates an LTR field value feature definition."
35 |         pass
36 | 
37 |     @abstractmethod
38 |     def generate_field_length_feature(self, feature_name, field_name):
39 |         "Generates an LTR field length feature definition."
40 |         pass
41 | 
42 |     @abstractmethod
43 |     def generate_model(self, model_name, feature_names, means, std_devs, weights):
44 |         "Generates a model definition."
45 |         pass
46 | 
47 |     @abstractmethod
48 |     def delete_feature_store(self, name, log=False):
49 |         "Deletes the feature store of the given name."
50 |         pass
51 | 
52 |     @abstractmethod
53 |     def upload_features(self, features, model_name, log=False):
54 |         "Uploads features into the engine under the given model name."
55 |         pass
56 | 
57 |     @abstractmethod
58 |     def delete_model(self, model_name, log=False):
59 |         "Deletes the model from the engine."
60 |         pass
61 | 
62 |     @abstractmethod
63 |     def upload_model(self, model, log=False):
64 |         "Uploads a model to the engine."
65 |         pass
66 | 
67 |     @abstractmethod
68 |     def upsert_model(self, model, log=False):
69 |         "Deletes and uploads a model to the engine."
70 |         pass
71 | 
72 |     @abstractmethod
73 |     def get_explore_candidate(self, query, explore_vector, feature_config, log=False):
74 |         "Generates an exploration search request with the given criteria."
75 |         pass
76 | 
77 |     @abstractmethod
78 |     def get_logged_features(self, model_name, doc_ids, options={},
79 |                             id_field="id", fields=None, log=False):
80 |         "Returns the logged feature values of the named model's features for the given documents."
81 |         pass
82 | 
83 |     @abstractmethod
84 |     def search_with_model(self, model_name, **search_args):
85 |         """Searches a collection using an uploaded model.
86 |         See engines.Collection.search() for information on parameters"""
87 |         pass
88 | 
-------------------------------------------------------------------------------- /engines/SemanticKnowledgeGraph.py: --------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | 
3 | class SemanticKnowledgeGraph(ABC):
4 |     def __init__(self, collection):
5 |         self.collection = collection
6 | 
7 |     @abstractmethod
8 |     def traverse(self, *nodes):
9 |         "Traverses the semantic knowledge graph through each requested node"
10 |         pass
11 | 
12 |     @abstractmethod
13 |     def transform_request(self, *nodes):
14 |         """
15 |         Generates a semantic knowledge graph request from a list of nodes or multi-nodes.
16 |         A node can contain the following params: `name`, `values`, `field`, `min_occurance` and `limit`.
17 |         :param str name: An optional name for the node. If not provided, a default will be assigned
18 |         :param list of str values: If values are present, this node represents a query for those values.
19 |         Otherwise, this node discovers terms from the specified field.
20 |         :param str field: The field to query against or discover values from.
21 |         :param int min_occurance: The minimum number of times a term must occur within
22 |         the knowledge base to qualify for discovery.
23 | :param int limit: The limit on number of terms to discover 24 | """ 25 | pass -------------------------------------------------------------------------------- /engines/SparseSemanticSearch.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | class SparseSemanticSearch(ABC): 3 | def __init__(self): 4 | pass 5 | 6 | @abstractmethod 7 | def location_distance(self, query, position): 8 | "A semantic function to create a location distance query. Applies a transformed query node it to the query tree." 9 | pass 10 | 11 | @abstractmethod 12 | def popularity(self, query, position): 13 | "A semantic function to create a popularity query. Applies a transformed query node it to the query tree." 14 | pass 15 | 16 | @abstractmethod 17 | def transform_query(self, query_tree): 18 | "Transforms the query tree into an engine specific query tree" 19 | pass 20 | 21 | @abstractmethod 22 | def generate_basic_query(self, query): 23 | "Creates a basic engine specific search query" 24 | pass -------------------------------------------------------------------------------- /engines/opensearch/OpenSearchEngine.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import requests 4 | 5 | from engines.Engine import Engine 6 | from engines.opensearch.config import OPENSEARCH_URL, SCHEMAS 7 | from engines.opensearch.OpenSearchCollection import OpenSearchCollection 8 | 9 | STATUS_URL = f"{OPENSEARCH_URL}/_cluster/health" 10 | 11 | class OpenSearchEngine(Engine): 12 | def __init__(self): 13 | super().__init__("OpenSearch") 14 | 15 | def health_check(self): 16 | status = requests.get(STATUS_URL).json()["status"] in ["green", "yellow"] 17 | if status: 18 | print("OpenSearch engine is online") 19 | return status 20 | 21 | def print_status(self, response): 22 | #print(json.dumps(response, indent=2)) 23 | "Prints the resulting status of a search engine request" 24 | pass 25 | 26 | def create_collection(self, name, log=False): 27 | print(f'Wiping "{name}" collection') 28 | response = requests.delete(f"{OPENSEARCH_URL}/{name}").json() 29 | 30 | print(f'Creating "{name}" collection') 31 | collection = self.get_collection(name) 32 | request = SCHEMAS[name]["schema"] if name in SCHEMAS else {} 33 | response = requests.put(f"{OPENSEARCH_URL}/{name}", json=request).json() 34 | if log: 35 | print("Schema:", json.dumps(request, indent=2)) 36 | if log: 37 | print("Status:", json.dumps(response, indent=2)) 38 | return collection 39 | 40 | def get_collection(self, name): 41 | "Returns initialized object for a given collection" 42 | id_field = SCHEMAS.get(name, {}).get("id_field", "_id") 43 | return OpenSearchCollection(name, id_field) -------------------------------------------------------------------------------- /engines/opensearch/OpenSearchSparseSemanticSearch.py: -------------------------------------------------------------------------------- 1 | from engines.SparseSemanticSearch import SparseSemanticSearch 2 | 3 | def escape_quotes(text): 4 | return text.replace('"', '\\"') 5 | 6 | class OpenSearchSparseSemanticSearch(SparseSemanticSearch): 7 | def __init__(self): 8 | pass 9 | 10 | def location_distance(self, query, position): 11 | if len(query["query_tree"]) -1 > position: 12 | next_entity = query["query_tree"][position + 1] 13 | if next_entity["type"] == "city": 14 | query["query_tree"].pop(position + 1) 15 | query["query_tree"][position] = { 16 | "type": "transformed", 17 | "syntax": "opensearch", 
18 | "query": self.create_geo_filter(next_entity["location_coordinates"], 19 | "location_coordinates", 50)} 20 | return True 21 | return False 22 | 23 | def create_geo_filter(self, coordinates, field, distance_KM): 24 | return {"geo_distance": {"distance": f"{distance_KM}km", 25 | field: {"lat": coordinates.split(",")[0], 26 | "lon": coordinates.split(",")[1]}}} 27 | 28 | def popularity(self, query, position): 29 | if len(query["query_tree"]) -1 > position: 30 | query["query_tree"][position] = { 31 | "type": "transformed", 32 | "syntax": "opensearch", 33 | "query": {"function_score": {"field_value_factor": { 34 | "field": "stars_rating", 35 | "factor": 1.5, 36 | "missing": 0}}}} 37 | return True 38 | return False 39 | 40 | def transform_query(self, query_tree): 41 | for i, item in enumerate(query_tree): 42 | match item["type"]: 43 | case "transformed": 44 | continue 45 | case "skg_enriched": 46 | enrichments = item["enrichments"] 47 | if "term_vector" in enrichments: 48 | query_string = enrichments["term_vector"] 49 | if "category" in enrichments: 50 | query_string += f' +doc_type:"{enrichments["category"]}"' 51 | transformed_query = '"' + escape_quotes(item["surface_form"]) + '"' 52 | else: 53 | continue 54 | case "color": 55 | transformed_query = f'+colors:"{item["canonical_form"]}"' 56 | case "known_item" | "event": 57 | transformed_query = f'+name:"{item["canonical_form"]}"' 58 | case "city": 59 | transformed_query = f'+city:"{item["canonical_form"]}"' 60 | case "brand": 61 | transformed_query = f'+brand:"{item["canonical_form"]}"' 62 | case _: 63 | transformed_query = '"' + escape_quotes(item["surface_form"]) + '"' 64 | query_tree[i] = {"type": "transformed", 65 | "syntax": "opensearch", 66 | "query": transformed_query} 67 | return query_tree 68 | 69 | def generate_basic_query(self, query): 70 | return '"' + escape_quotes(query) + '"' -------------------------------------------------------------------------------- /engines/opensearch/build/engine-Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright OpenSearch Contributors 2 | # SPDX-License-Identifier: Apache-2.0 3 | FROM amazonlinux:2 AS linux_stage_0 4 | 5 | ENV OS_VERSION=2.14.0 6 | ARG OPENSEARCH_HOME=/usr/share/opensearch 7 | ARG UID=1000 8 | ARG GID=1000 9 | ARG BUILDARCH 10 | 11 | RUN yum update -y && yum install -y tar gzip shadow-utils which && yum clean all 12 | RUN mkdir /tmp/build/ 13 | RUN mkdir $OPENSEARCH_HOME 14 | RUN groupadd -g $GID os_group && adduser -g $GID -u $UID -b $OPENSEARCH_HOME opensearch 15 | 16 | WORKDIR /tmp/build/ 17 | RUN pwd 18 | ENV DISTRO_CACHE_DIR=/var/cache/pip 19 | RUN mkdir -p $DISTRO_CACHE_DIR 20 | RUN set -eux && BUILDARCH=$(echo $BUILDARCH | sed 's/amd64/x64/' -) && \ 21 | OS_DISTRO_FILE=opensearch-${OS_VERSION}-linux-${BUILDARCH}.tar.gz && \ 22 | OS_DISTRO_URL=https://artifacts.opensearch.org/releases/bundle/opensearch/${OS_VERSION}/${OS_DISTRO_FILE} && \ 23 | curl -O $OS_DISTRO_URL -O $OS_DISTRO_URL.sig && \ 24 | curl https://artifacts.opensearch.org/publickeys/opensearch.pgp | gpg --import && \ 25 | gpg --verify $OS_DISTRO_FILE.sig $OS_DISTRO_FILE && \ 26 | tar --warning=no-timestamp -zxf /tmp/build/$OS_DISTRO_FILE -C $OPENSEARCH_HOME --strip-components=1 && \ 27 | install -d -m 750 -o $UID -g $GID $OPENSEARCH_HOME/data/ 28 | 29 | ########################### Stage 1 ######################## 30 | # Copy working directory to the actual release docker images 31 | FROM amazonlinux:2 32 | 33 | ENV OS_VERSION=2.14.0 34 | ARG 
OPENSEARCH_HOME=/usr/share/opensearch 35 | ARG UID=1000 36 | ARG GID=1000 37 | ARG BUILDARCH 38 | 39 | RUN yum update -y && yum install -y tar gzip shadow-utils which && yum clean all 40 | 41 | WORKDIR $OPENSEARCH_HOME 42 | COPY --from=linux_stage_0 $OPENSEARCH_HOME $OPENSEARCH_HOME 43 | RUN echo "export JAVA_HOME=$OPENSEARCH_HOME/jdk" >> /etc/profile.d/java_home.sh && \ 44 | echo "export PATH=\$PATH:\$JAVA_HOME/bin" >> /etc/profile.d/java_home.sh 45 | ENV JAVA_HOME=$OPENSEARCH_HOME/jdk 46 | ENV PATH=$PATH:$JAVA_HOME/bin:$OPENSEARCH_HOME/bin 47 | ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$OPENSEARCH_HOME/plugins/opensearch-knn/lib" 48 | 49 | USER $UID 50 | 51 | ARG DISABLE_INSTALL_DEMO_CONFIG=true 52 | ARG DISABLE_SECURITY_PLUGIN=true 53 | ARG UBI_VERSION=v0.0.12.1-os2.14.0 54 | ARG UBI_PLUGIN_FILE=opensearch-ubi-plugin-$UBI_VERSION.zip 55 | ARG UBI_PLUGIN_URL=https://github.com/o19s/opensearch-ubi/releases/download/release-$UBI_VERSION/$UBI_PLUGIN_FILE 56 | RUN bin/opensearch-plugin install $UBI_PLUGIN_URL --batch 57 | 58 | ARG LTR_PLUGIN_FILE=ltr-2.14.0-os2.14.0.zip 59 | COPY $LTR_PLUGIN_FILE . 60 | RUN bin/opensearch-plugin install file://$OPENSEARCH_HOME/$LTR_PLUGIN_FILE --batch 61 | 62 | COPY *.properties $OPENSEARCH_HOME/config/ 63 | ARG ANALYZER_DIR=$OPENSEARCH_HOME/config/opensearch-performance-analyzer/ 64 | RUN [ ! -d $ANALYZER_DIR ] || mv $OPENSEARCH_HOME/config/performance-analyzer.properties $ANALYZER_DIR 65 | COPY --chown=$UID:$UID --chmod=0770 *.sh $OPENSEARCH_HOME/ 66 | RUN chown -R $UID:$GID $OPENSEARCH_HOME/data/ 67 | RUN chmod 0770 *.sh 68 | RUN ./opensearch-onetime-setup.sh 69 | 70 | EXPOSE 9200 9300 9600 9650 71 | 72 | LABEL org.label-schema.schema-version="1.0" \ 73 | org.label-schema.name="opensearch" \ 74 | org.label-schema.version="$OS_VERSION" \ 75 | org.label-schema.url="https://opensearch.org" \ 76 | org.label-schema.vcs-url="https://github.com/opensearch" \ 77 | org.label-schema.license="Apache-2.0" \ 78 | org.label-schema.vendor="OpenSearch" 79 | 80 | ENTRYPOINT ["./opensearch-docker-entrypoint.sh"] 81 | CMD ["opensearch"] 82 | -------------------------------------------------------------------------------- /engines/opensearch/build/log4j2.properties: -------------------------------------------------------------------------------- 1 | status = error 2 | 3 | appender.console.type = Console 4 | appender.console.name = console 5 | appender.console.layout.type = PatternLayout 6 | appender.console.layout.pattern = [%d{ISO8601}][%-5p][%-25c{1.}] [%node_name]%marker %m%n 7 | 8 | rootLogger.level = info 9 | rootLogger.appenderRef.console.ref = console 10 | -------------------------------------------------------------------------------- /engines/opensearch/build/ltr-2.14.0-os2.14.0.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/engines/opensearch/build/ltr-2.14.0-os2.14.0.zip -------------------------------------------------------------------------------- /engines/opensearch/build/opensearch-docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright OpenSearch Contributors 4 | # SPDX-License-Identifier: Apache-2.0 5 | 6 | # This script specify the entrypoint startup actions for opensearch 7 | # It will start both opensearch and performance analyzer plugin cli 8 | # If either process failed, the entire docker container will be removed 9 | # in favor of 
a newly started container 10 | 11 | # Export OpenSearch Home 12 | export OPENSEARCH_HOME=/usr/share/opensearch 13 | export OPENSEARCH_PATH_CONF=$OPENSEARCH_HOME/config 14 | 15 | # The virtual file /proc/self/cgroup should list the current cgroup 16 | # membership. For each hierarchy, you can follow the cgroup path from 17 | # this file to the cgroup filesystem (usually /sys/fs/cgroup/) and 18 | # introspect the statistics for the cgroup for the given 19 | # hierarchy. Alas, Docker breaks this by mounting the container 20 | # statistics at the root while leaving the cgroup paths as the actual 21 | # paths. Therefore, OpenSearch provides a mechanism to override 22 | # reading the cgroup path from /proc/self/cgroup and instead uses the 23 | # cgroup path defined the JVM system property 24 | # opensearch.cgroups.hierarchy.override. Therefore, we set this value here so 25 | # that cgroup statistics are available for the container this process 26 | # will run in. 27 | export OPENSEARCH_JAVA_OPTS="-Dopensearch.cgroups.hierarchy.override=/ $OPENSEARCH_JAVA_OPTS" 28 | 29 | # Security Plugin 30 | function setupSecurityPlugin { 31 | SECURITY_PLUGIN="opensearch-security" 32 | 33 | if [ -d "$OPENSEARCH_HOME/plugins/$SECURITY_PLUGIN" ]; then 34 | if [ "$DISABLE_INSTALL_DEMO_CONFIG" = "true" ]; then 35 | echo "Disabling execution of install_demo_configuration.sh for OpenSearch Security Plugin" 36 | else 37 | echo "Enabling execution of install_demo_configuration.sh for OpenSearch Security Plugin" 38 | bash $OPENSEARCH_HOME/plugins/$SECURITY_PLUGIN/tools/install_demo_configuration.sh -y -i -s 39 | fi 40 | 41 | if [ "$DISABLE_SECURITY_PLUGIN" = "true" ]; then 42 | echo "Disabling OpenSearch Security Plugin" 43 | opensearch_opt="-Eplugins.security.disabled=true" 44 | opensearch_opts+=("${opensearch_opt}") 45 | else 46 | echo "Enabling OpenSearch Security Plugin" 47 | fi 48 | else 49 | echo "OpenSearch Security Plugin does not exist, disable by default" 50 | fi 51 | } 52 | 53 | # Performance Analyzer Plugin 54 | function setupPerformanceAnalyzerPlugin { 55 | PERFORMANCE_ANALYZER_PLUGIN="opensearch-performance-analyzer" 56 | if [ -d "$OPENSEARCH_HOME/plugins/$PERFORMANCE_ANALYZER_PLUGIN" ]; then 57 | if [ "$DISABLE_PERFORMANCE_ANALYZER_AGENT_CLI" = "true" ]; then 58 | echo "Disabling execution of $OPENSEARCH_HOME/bin/$PERFORMANCE_ANALYZER_PLUGIN/performance-analyzer-agent-cli for OpenSearch Performance Analyzer Plugin" 59 | else 60 | echo "Enabling execution of OPENSEARCH_HOME/bin/$PERFORMANCE_ANALYZER_PLUGIN/performance-analyzer-agent-cli for OpenSearch Performance Analyzer Plugin" 61 | $OPENSEARCH_HOME/bin/opensearch-performance-analyzer/performance-analyzer-agent-cli > $OPENSEARCH_HOME/logs/performance-analyzer.log 2>&1 & 62 | fi 63 | else 64 | echo "OpenSearch Performance Analyzer Plugin does not exist, disable by default" 65 | fi 66 | } 67 | 68 | # Start up the opensearch and performance analyzer agent processes. 69 | # When either of them halts, this script exits, or we receive a SIGTERM or SIGINT signal then we want to kill both these processes. 70 | function runOpensearch { 71 | # Files created by OpenSearch should always be group writable too 72 | umask 0002 73 | 74 | if [[ "$(id -u)" == "0" ]]; then 75 | echo "OpenSearch cannot run as root. Please start your container as another user." 76 | exit 1 77 | fi 78 | 79 | # Parse Docker env vars to customize OpenSearch 80 | # 81 | # e.g. 
Setting the env var cluster.name=testcluster 82 | # will cause OpenSearch to be invoked with -Ecluster.name=testcluster 83 | opensearch_opts=() 84 | while IFS='=' read -r envvar_key envvar_value 85 | do 86 | # OpenSearch settings need to have at least two dot separated lowercase 87 | # words, e.g. `cluster.name`, except for `processors` which we handle 88 | # specially 89 | if [[ "$envvar_key" =~ ^[a-z0-9_]+\.[a-z0-9_]+ || "$envvar_key" == "processors" ]]; then 90 | if [[ ! -z $envvar_value ]]; then 91 | opensearch_opt="-E${envvar_key}=${envvar_value}" 92 | opensearch_opts+=("${opensearch_opt}") 93 | fi 94 | fi 95 | done < <(env) 96 | 97 | setupSecurityPlugin 98 | setupPerformanceAnalyzerPlugin 99 | 100 | # Start opensearch 101 | "$@" "${opensearch_opts[@]}" 102 | 103 | } 104 | 105 | # Prepend "opensearch" command if no argument was provided or if the first 106 | # argument looks like a flag (i.e. starts with a dash). 107 | if [ $# -eq 0 ] || [ "${1:0:1}" = '-' ]; then 108 | set -- opensearch "$@" 109 | fi 110 | 111 | if [ "$1" = "opensearch" ]; then 112 | # If the first argument is opensearch, then run the setup script. 113 | runOpensearch "$@" 114 | else 115 | # Otherwise, just exec the command. 116 | exec "$@" 117 | fi 118 | -------------------------------------------------------------------------------- /engines/opensearch/build/opensearch-onetime-setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # The OpenSearch Contributors require contributions made to 6 | # this file be licensed under the Apache-2.0 license or a 7 | # compatible open source license. 8 | 9 | # This script performs one-time setup for the OpenSearch tarball distribution. 10 | # It installs a demo security config and sets up the performance analyzer 11 | 12 | export OPENSEARCH_HOME=`dirname $(realpath $0)` 13 | export OPENSEARCH_PATH_CONF=$OPENSEARCH_HOME/config 14 | cd $OPENSEARCH_HOME 15 | 16 | ##Security Plugin 17 | SECURITY_PLUGIN="opensearch-security" 18 | if [ -d "$OPENSEARCH_HOME/plugins/$SECURITY_PLUGIN" ]; then 19 | if [ "$DISABLE_INSTALL_DEMO_CONFIG" = "true" ]; then 20 | echo "Disabling execution of install_demo_configuration.sh for OpenSearch Security Plugin" 21 | else 22 | echo "Enabling execution of install_demo_configuration.sh for OpenSearch Security Plugin" 23 | bash $OPENSEARCH_HOME/plugins/$SECURITY_PLUGIN/tools/install_demo_configuration.sh -y -i -s 24 | fi 25 | 26 | if [ "$DISABLE_SECURITY_PLUGIN" = "true" ]; then 27 | echo "Disabling OpenSearch Security Plugin" 28 | sed -i '/plugins.security.disabled/d' $OPENSEARCH_PATH_CONF/opensearch.yml 29 | echo "plugins.security.disabled: true" >> $OPENSEARCH_PATH_CONF/opensearch.yml 30 | else 31 | echo "Enabling OpenSearch Security Plugin" 32 | sed -i '/plugins.security.disabled/d' $OPENSEARCH_PATH_CONF/opensearch.yml 33 | fi 34 | fi 35 | 36 | ##Perf Plugin 37 | PA_PLUGIN="opensearch-performance-analyzer" 38 | 39 | if ! 
grep -q '## OpenDistro Performance Analyzer' $OPENSEARCH_PATH_CONF/jvm.options; then 40 | CLK_TCK=`/usr/bin/getconf CLK_TCK` 41 | echo >> $OPENSEARCH_PATH_CONF/jvm.options 42 | echo '## OpenDistro Performance Analyzer' >> $OPENSEARCH_PATH_CONF/jvm.options 43 | echo "-Dclk.tck=$CLK_TCK" >> $OPENSEARCH_PATH_CONF/jvm.options 44 | echo "-Djdk.attach.allowAttachSelf=true" >> $OPENSEARCH_PATH_CONF/jvm.options 45 | echo "-Djava.security.policy=$OPENSEARCH_PATH_CONF/$PA_PLUGIN/opensearch_security.policy" >> $OPENSEARCH_PATH_CONF/jvm.options 46 | echo "--add-opens=jdk.attach/sun.tools.attach=ALL-UNNAMED" >> $OPENSEARCH_PATH_CONF/jvm.options 47 | fi 48 | 49 | 50 | -------------------------------------------------------------------------------- /engines/opensearch/build/performance-analyzer.properties: -------------------------------------------------------------------------------- 1 | # ======================== OpenSearch performance analyzer plugin config ========================= 2 | 3 | # NOTE: this is an example for Linux. Please modify the config accordingly if you are using it under other OS. 4 | 5 | # Metrics data location 6 | metrics-location = /dev/shm/performanceanalyzer/ 7 | 8 | # Metrics deletion interval (minutes) for metrics data. 9 | # Interval should be between 1 to 60. 10 | metrics-deletion-interval = 1 11 | 12 | # If set to true, the system cleans up the files behind it. So at any point, we should expect only 2 13 | # metrics-db-file-prefix-path files. If set to false, no files are cleaned up. This can be useful, if you are archiving 14 | # the files and wouldn't like for them to be cleaned up. 15 | cleanup-metrics-db-files = true 16 | 17 | # WebService exposed by App's port 18 | webservice-listener-port = 9600 19 | 20 | # Port for RPC Communication 21 | rpc-port = 9650 22 | 23 | # Metric DB File Prefix Path location 24 | metrics-db-file-prefix-path = /tmp/metricsdb_ 25 | 26 | https-enabled = false 27 | 28 | # Setup the correct path for server certificates 29 | certificate-file-path = none 30 | private-key-file-path = none 31 | #trusted-cas-file-path = none 32 | 33 | # Setup the correct path for client certificates (by default, the client will just use the server certificates) 34 | #client-certificate-file-path = specify_path 35 | #client-private-key-file-path = specify_path 36 | #client-trusted-cas-file-path = specify_path 37 | 38 | # WebService bind host; default only to local interface 39 | webservice-bind-host = 0.0.0.0 40 | 41 | # Plugin Stats Metadata file name, expected to be in the same location 42 | plugin-stats-metadata = plugin-stats-metadata 43 | 44 | # Agent Stats Metadata file name, expected to be in the same location 45 | agent-stats-metadata = agent-stats-metadata 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /engines/solr/SolrEntityExtractor.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from engines.solr.config import SOLR_URL 3 | from engines.solr.SolrCollection import SolrCollection 4 | from engines.EntityExtractor import EntityExtractor 5 | 6 | def transform_response(query, response): 7 | return {"query": query, 8 | "tags": response["tags"], 9 | "entities": response["response"]["docs"]} 10 | 11 | class SolrEntityExtractor(EntityExtractor): 12 | def __init__(self, collection): 13 | if not isinstance(collection, SolrCollection): 14 | raise TypeError("Only supports a SolrCollection") 15 | super().__init__(collection) 16 | 17 | def 
extract_entities(self, query): 18 | response = requests.post(f"{SOLR_URL}/{self.collection.name}/tag", query).json() 19 | return transform_response(query, response) -------------------------------------------------------------------------------- /engines/solr/SolrSemanticKnowledgeGraph.py: -------------------------------------------------------------------------------- 1 | from engines.SemanticKnowledgeGraph import SemanticKnowledgeGraph 2 | from engines.solr.SolrCollection import SolrCollection 3 | 4 | def generate_request_root(): 5 | return { 6 | "limit": 0, 7 | "params": { 8 | "q": "*:*", 9 | "fore": "{!${defType} v=$q}", 10 | "back": "*:*", 11 | "defType": "edismax" 12 | }, 13 | "facet": {} 14 | } 15 | 16 | def generate_facets(name=None, values=None, field=None, 17 | min_occurrences=None, limit=None, 18 | min_popularity=None, default_operator="AND"): 19 | base_facet = {"type": "query" if values else "terms", 20 | "limit": limit if limit else 10, 21 | "sort": {"relatedness": "desc" }, 22 | "facet": { 23 | "relatedness": { 24 | "type": "func", 25 | "func": "relatedness($fore,$back)"}}} 26 | if min_occurrences: 27 | base_facet["mincount"] = min_occurrences 28 | if min_popularity: 29 | base_facet["facet"]["relatedness"]["min_popularity"] = min_popularity 30 | if field: 31 | base_facet["field"] = field 32 | facets = [] 33 | if values: 34 | if min_occurrences: base_facet.pop("mincount") 35 | if not limit: base_facet.pop("limit") 36 | for i, _ in enumerate(values): 37 | facets.append(base_facet.copy()) 38 | facets[i]["query"] = "{" + f'!edismax q.op={default_operator} qf={field} v=${name}_{i}_query' + "}" 39 | else: 40 | facets = [base_facet] 41 | return facets 42 | 43 | def default_node_name(i, j): 44 | return "f" + str(i) + (f"_{j}" if j else "") 45 | 46 | def validate_skg_request_input(multi_node): 47 | if isinstance(multi_node, list): 48 | map(validate_skg_request_input, multi_node) 49 | node_names = [node["name"] for node in multi_node] 50 | if len(node_names) != len(set(node_names)): 51 | raise ValueError("Node names must be distinct on a given level.") 52 | if "field" not in multi_node: # and "values" in multi_node 53 | raise ValueError("'field' must be provided") 54 | 55 | def transform_request(*multi_nodes): 56 | """Generates a faceted Solr SKG request from a set of multi-nodes. 57 | A multi-node can be a single node or a collection of nodes. 58 | A node can contain the following params: `name`, `values`, `field`, `min_occurance` and `limit`. 59 | :param str name: An optional name of the node. If not provided a default will be assigned 60 | :param list of str value: If empty or absent, a terms facet is used. Otherwise a query facet per value is used 61 | :param str field: The field to query against or discover values from. 62 | :param int min_occurance: The mincount on the facet. 63 | :param int limit: The limit on the facet. 
64 | Each subsequent node is applied as a nested facet to all parent facets.""" 65 | map(validate_skg_request_input, multi_nodes) 66 | request = generate_request_root() 67 | parent_nodes = [request] 68 | for i, multi_node in enumerate(multi_nodes): 69 | current_facets = [] 70 | if isinstance(multi_node, dict): 71 | multi_node = [multi_node] 72 | for j, node in enumerate(multi_node): 73 | if "name" not in node: 74 | node["name"] = default_node_name(i, j) 75 | facets = generate_facets(**node) 76 | current_facets.extend(facets) 77 | for i, parent_node in enumerate(parent_nodes): 78 | for j, facet in enumerate(facets): 79 | parent_node["facet"][f'{node["name"]}_{j}'] = facet 80 | if "values" in node: 81 | for i, value in enumerate(node["values"]): 82 | request["params"][f'{node["name"]}_{i}_query'] = value 83 | parent_nodes = current_facets 84 | return request 85 | 86 | def transform_node(node, response_params): 87 | relatedness = node["relatedness"]["relatedness"] if node["count"] > 0 else 0.0 88 | value_node = {"relatedness": relatedness} 89 | sub_traversals = transform_response_facet(node, response_params) 90 | if sub_traversals: 91 | value_node["traversals"] = sub_traversals 92 | return value_node 93 | 94 | def transform_response_facet(node, response_params): 95 | ignored_keys = ["count", "relatedness", "val"] 96 | traversals = {} 97 | for full_name, data in node.items(): 98 | if full_name in ignored_keys: 99 | continue 100 | name = full_name.removesuffix("_" + full_name.split("_")[-1]) 101 | if name not in traversals: 102 | traversals[name] = {"name": name, "values": {}} 103 | if "buckets" in data: 104 | values_node = {b["val"] : transform_node(b, response_params) 105 | for b in data["buckets"]} 106 | traversals[name]["values"] = values_node 107 | else: 108 | value_name = response_params[f"{full_name}_query"] 109 | traversals[name]["values"][value_name] = transform_node(data, response_params) 110 | for k in traversals.keys(): 111 | traversals[k]["values"] = sort_by_relatedness_desc(traversals[k]["values"]) 112 | return list(traversals.values()) 113 | 114 | def sort_by_relatedness_desc(d): 115 | return {k: v for k, v in sorted(d.items(), key=lambda item: item[1]["relatedness"], reverse=True)} 116 | 117 | class SolrSemanticKnowledgeGraph(SemanticKnowledgeGraph): 118 | def __init__(self, collection): 119 | if not isinstance(collection, SolrCollection): 120 | raise TypeError("Only supports a SolrCollection") 121 | super().__init__(collection) 122 | 123 | def traverse(self, *multi_nodes): 124 | request = self.transform_request(*multi_nodes) 125 | response = self.collection.native_search(request) 126 | return {"graph": transform_response_facet(response["facets"], request["params"])} 127 | 128 | def transform_request(self, *multi_nodes): 129 | return transform_request(*multi_nodes) -------------------------------------------------------------------------------- /engines/solr/SolrSparseSemanticSearch.py: -------------------------------------------------------------------------------- 1 | from engines.SparseSemanticSearch import SparseSemanticSearch 2 | 3 | def escape_quotes(text): 4 | return text.replace('"', '\\"') 5 | 6 | class SolrSparseSemanticSearch(SparseSemanticSearch): 7 | def __init__(self): 8 | pass 9 | 10 | def location_distance(self, query, position): 11 | if len(query["query_tree"]) -1 > position: 12 | next_entity = query["query_tree"][position + 1] 13 | if next_entity["type"] == "city": 14 | query["query_tree"].pop(position + 1) 15 | query["query_tree"][position] = { 16 | "type": 
"transformed", 17 | "syntax": "solr", 18 | "query": self.create_geo_filter(next_entity['location_coordinates'], 19 | "location_coordinates", 50)} 20 | return True 21 | return False 22 | 23 | def create_geo_filter(self, coordinates, field, distance_KM): 24 | clause = f'!geofilt d={distance_KM} sfield="{field}" pt="{coordinates}"' 25 | return "+{" + clause + '}' 26 | 27 | def popularity(self, query, position): 28 | if len(query["query_tree"]) -1 > position: 29 | query["query_tree"][position] = { 30 | "type": "transformed", 31 | "syntax": "solr", 32 | "query": '+{!func v="mul(if(stars_rating,stars_rating,0),20)"}'} 33 | return True 34 | return False 35 | 36 | def transform_query(self, query_tree): 37 | for i, item in enumerate(query_tree): 38 | match item["type"]: 39 | case "transformed": 40 | continue 41 | case "skg_enriched": 42 | enrichments = item["enrichments"] 43 | if "term_vector" in enrichments: 44 | query_string = enrichments["term_vector"] 45 | if "category" in enrichments: 46 | query_string += f' +doc_type:"{enrichments["category"]}"' 47 | transformed_query = '+{!edismax v="' + escape_quotes(query_string) + '"}' 48 | else: 49 | continue 50 | case "color": 51 | transformed_query = f'+colors_s:"{item["canonical_form"]}"' 52 | case "known_item" | "event": 53 | transformed_query = f'+name_s:"{item["canonical_form"]}"' 54 | case "city": 55 | transformed_query = f'+city:"{str(item["canonical_form"])}"' 56 | case "brand": 57 | transformed_query = f'+brand_s:"{item["canonical_form"]}"' 58 | case _: 59 | transformed_query = "+{!edismax v=\"" + escape_quotes(item["surface_form"]) + "\"}" 60 | query_tree[i] = {"type": "transformed", 61 | "syntax": "solr", 62 | "query": transformed_query} 63 | return query_tree 64 | 65 | def generate_basic_query(self, query): 66 | return '+{!edismax mm=100% v="' + escape_quotes(query) + '"}' -------------------------------------------------------------------------------- /engines/solr/__init__.py: -------------------------------------------------------------------------------- 1 | from .SolrLTR import SolrLTR 2 | from .SolrSemanticKnowledgeGraph import SolrSemanticKnowledgeGraph 3 | from .SolrEntityExtractor import SolrEntityExtractor 4 | from .SolrSparseSemanticSearch import SolrSparseSemanticSearch -------------------------------------------------------------------------------- /engines/solr/build/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM solr:9.4.1 2 | 3 | USER root 4 | 5 | ADD solr.xml ./server/solr/solr.xml 6 | ADD run_solr_w_ltr.sh ./run_solr_w_ltr.sh 7 | RUN chown solr:solr run_solr_w_ltr.sh 8 | RUN sed -i -e 's/\r$//' run_solr_w_ltr.sh 9 | RUN chmod u+x run_solr_w_ltr.sh 10 | 11 | ADD log4j2-config.xml ./log4j2-config.xml 12 | 13 | USER solr 14 | 15 | ENTRYPOINT "./run_solr_w_ltr.sh" 16 | -------------------------------------------------------------------------------- /engines/solr/build/log4j2-config.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /engines/solr/build/run_solr_w_ltr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | mkdir -p /var/solr/data/ 3 | 4 | SOLR_MODULES=ltr 5 | solr-foreground -Dsolr.modules=ltr -Dsolr.ltr.enabled=true -Dlog4j2.configurationFile=/opt/solr-9.4.1/log4j2-config.xml 
-------------------------------------------------------------------------------- /engines/solr/build/solr.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | ${solr.max.booleanClauses:1024} 4 | ${solr.sharedLib:} 5 | ${solr.modules:} 6 | ${solr.allowPaths:} 7 | ${solr.allowUrls:} 8 | ${solr.hideStackTrace:false} 9 | 10 | 11 | 12 | ${host:} 13 | ${solr.port.advertise:0} 14 | ${hostContext:solr} 15 | 16 | ${genericCoreNodeNames:true} 17 | 18 | ${zkClientTimeout:30000} 19 | ${distribUpdateSoTimeout:600000} 20 | ${distribUpdateConnTimeout:60000} 21 | ${zkCredentialsProvider:org.apache.solr.common.cloud.DefaultZkCredentialsProvider} 22 | ${zkACLProvider:org.apache.solr.common.cloud.DefaultZkACLProvider} 23 | ${zkCredentialsInjector:org.apache.solr.common.cloud.DefaultZkCredentialsInjector} 24 | ${distributedClusterStateUpdates:false} 25 | ${distributedCollectionConfigSetExecution:false} 26 | ${minStateByteLenForCompression:-1} 27 | ${stateCompressor:org.apache.solr.common.util.ZLibCompressor} 28 | 29 | 30 | 31 | 33 | ${socketTimeout:600000} 34 | ${connTimeout:60000} 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 52 | 53 | 54 | QUERY_DOC_FV 55 | 56 | -------------------------------------------------------------------------------- /engines/solr/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | AIPS_SOLR_HOST = os.getenv("AIPS_SOLR_HOST") or "aips-solr" 4 | AIPS_SOLR_PORT = os.getenv("AIPS_SOLR_PORT") or "8983" 5 | SOLR_URL = f"http://{AIPS_SOLR_HOST}:{AIPS_SOLR_PORT}/solr" 6 | STATUS_URL = f"{SOLR_URL}/admin/zookeeper/status" 7 | SOLR_COLLECTIONS_URL = f"{SOLR_URL}/admin/collections" -------------------------------------------------------------------------------- /ltr/__init__.py: -------------------------------------------------------------------------------- 1 | # Make the most important pieces just available as 2 | # ie - from ltr import download 3 | from .download import download 4 | from .evaluate import evaluate, rre_table 5 | from .search import search 6 | -------------------------------------------------------------------------------- /ltr/clickmodels/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/ltr/clickmodels/__init__.py -------------------------------------------------------------------------------- /ltr/clickmodels/cascade.py: -------------------------------------------------------------------------------- 1 | from ltr.clickmodels.session import build 2 | from collections import Counter, defaultdict 3 | 4 | class Model(): 5 | def __init__(self): 6 | # Attractiveness per query-doc 7 | self.attracts = defaultdict(lambda : 0.5) 8 | 9 | def cascade_model(sessions): 10 | """ Cascading model can be solved directly: 11 | - sessions with skips count against a doc 12 | - sessions with clicks count for 13 | - stop at first click 14 | """ 15 | session_counts = Counter() 16 | click_counts = Counter() 17 | model=Model() 18 | 19 | for session in sessions: 20 | for rank, doc in enumerate(session.docs): 21 | query_doc_key = (session.query, doc.doc_id) 22 | session_counts[query_doc_key] += 1 23 | 24 | if doc.click: 25 | # Cascading model doesn't consider 26 | # clicks past the last one, so we count 27 | # this one and break out 28 | click_counts[query_doc_key] += 1 29 | break; 30 | 31 | for (query_id, doc_id), count in 
session_counts.items(): 32 | query_doc_key = (query_id, doc_id) 33 | model.attracts[query_doc_key] = click_counts[query_doc_key] / session_counts[query_doc_key] 34 | return model 35 | 36 | 37 | 38 | if __name__ == "__main__": 39 | sessions = build([ 40 | ('A', ((1, True), (2, False), (3, True), (0, False))), 41 | ('B', ((5, False), (2, True), (3, True), (0, False))), 42 | ('A', ((1, False), (2, False), (3, True), (0, False))), 43 | ('B', ((1, False), (2, False), (3, False), (9, True))), 44 | ('A', ((9, False), (2, False), (1, True), (0, True))), 45 | ('B', ((6, True), (2, False), (3, True), (1, False))), 46 | ('A', ((7, False), (4, True), (1, False), (3, False))), 47 | ('B', ((8, True), (2, False), (3, True), (1, False))), 48 | ('A', ((1, False), (4, True), (2, False), (3, False))), 49 | ('B', ((7, True), (4, False), (5, True), (1, True))), 50 | ]) 51 | cascade_model(sessions) 52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /ltr/clickmodels/coec.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | class Model(): 4 | def __init__(self): 5 | # COEC statistic 6 | self.coecs = Counter() 7 | 8 | # CTR for each query-doc pair in this session 9 | self.ctrs = {} 10 | 11 | def coec(ctr_by_rank, sessions): 12 | """ Clicks over expected clicks is a metric 13 | used for seeing what items get above or 14 | below average CTR for their rank. From paper 15 | 16 | > Personalized Click Prediction in Sponsored Search 17 | by Cheng, Cantu Paz 18 | 19 | A COEC > 1 means above average CTR for it's position 20 | A COEC < 1 means below average 21 | 22 | -ctr_by_rank is the global CTR at each rank position 23 | -sessions are an array of search session objects 24 | 25 | returned: 26 | each query-doc pair in provided sessions COEC 27 | 28 | """ 29 | clicks = Counter() 30 | weighted_impressions = Counter() 31 | 32 | for session in sessions: 33 | for rank, doc in enumerate(session.docs): 34 | weighted_impressions[(session.query, doc.doc_id)] += ctr_by_rank[rank] 35 | if doc.click: 36 | clicks[(session.query, doc.doc_id)] += 1 37 | 38 | model = Model() 39 | for query_id, doc_id in weighted_impressions: 40 | model.coecs[(query_id,doc_id)] = \ 41 | clicks[(query_id,doc_id)] / weighted_impressions[(query_id,doc_id)] 42 | 43 | return model 44 | -------------------------------------------------------------------------------- /ltr/clickmodels/conversion.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | def conv_aug_attracts(attracts, sessions, costs): 4 | """ Rescan sessions, using click-derrived attractiveness. 
5 | 6 | If theres no conversion, punish the attractiveness derrived judgment 7 | 8 | BUT we punish costly things less, and cheap things more 9 | """ 10 | satisfacts = Counter() 11 | counts = Counter() 12 | for session in sessions: 13 | for rank, doc in enumerate(session.docs): 14 | attract = attracts[(session.query, doc.doc_id)] 15 | if doc.click: 16 | if doc.conversion: 17 | # Confirms the attractiveness was real with actual relevance 18 | counts[(session.query, doc.doc_id)] += 1 19 | satisfacts[(session.query, doc.doc_id)] += attract 20 | else: 21 | # If it costs a lot, and there wasn't a conversion, 22 | # thats ok, we default to attractiveness 23 | # If it costs little, and there wasn't a conversion, 24 | # thats generally not ok, why didn't they do (easy action) 25 | counts[(session.query, doc.doc_id)] += 1 26 | satisfacts[(session.query, doc.doc_id)] += attract * costs[doc.doc_id] 27 | else: 28 | counts[(session.query, doc.doc_id)] += 1 29 | satisfacts[(session.query, doc.doc_id)] += attract * costs[doc.doc_id] 30 | 31 | for (query_id, doc_id), count in counts.items(): 32 | satisfacts[(query_id, doc_id)] = satisfacts[(query_id,doc_id)] / count 33 | 34 | return satisfacts 35 | 36 | 37 | -------------------------------------------------------------------------------- /ltr/clickmodels/pbm.py: -------------------------------------------------------------------------------- 1 | from ltr.clickmodels.session import build 2 | from collections import Counter, defaultdict 3 | from ltr.helpers.defaultlist import defaultlist 4 | 5 | 6 | class Model(): 7 | def __init__(self): 8 | # Examine prob per-rank 9 | self.ranks = defaultlist(lambda: 0.4) 10 | 11 | # Attractiveness per query-doc 12 | self.attracts = defaultdict(lambda : 0.5) 13 | 14 | 15 | 16 | def update_attractiveness(sessions, model): 17 | """ Run through the step of updating attractiveness 18 | based on session information and the current rank 19 | examine probabilities 20 | 21 | Algorithm based on Expectation Maximization derived in 22 | chapter 4 of "Click Models for Web Search" by 23 | Chulkin, Markov, de Rijke 24 | 25 | """ 26 | attractions = Counter() #Track query-doc attractiveness in this round 27 | num_sessions = Counter() #Track num sessions where query-doc appears 28 | for session in sessions: 29 | for rank, doc in enumerate(session.docs): 30 | query_doc_key = (session.query, doc.doc_id) 31 | att = 0 32 | if doc.click: 33 | # By PBM rules, if its clicked, 34 | # the user thought it was attractive 35 | att = 1 36 | else: 37 | exam = model.ranks[rank] 38 | assert exam <= 1.0 39 | doc_a = model.attracts[query_doc_key] 40 | # Not examined, but attractive / 41 | # 1 - (examined and attractive) 42 | # When not clicked: 43 | # If somehow this is currently a rank examined 44 | # a lot and this doc is historically attractive, then 45 | # we might still count it as mostly attractive 46 | # OR if the doc IS examined a lot AND its not 47 | # attractive, then we do the opposite, add 48 | # close to 0 49 | att = (((1 - exam) * doc_a) / (1 - (exam * doc_a))) 50 | 51 | # Store away a_sum and 52 | assert att <= 1.0 53 | attractions[query_doc_key] += att 54 | num_sessions[query_doc_key] += 1 55 | assert attractions[query_doc_key] <= num_sessions[query_doc_key] 56 | 57 | # Update the main query attractiveness from the attractions / num sessions 58 | for (query_id, doc_id), a_sum in attractions.items(): 59 | query_doc_key = (query_id, doc_id) 60 | att = a_sum / num_sessions[query_doc_key] 61 | assert att <= 1.0 62 | model.attracts[query_doc_key] 
= att 63 | 64 | 65 | def update_examines(sessions, model): 66 | """ Run through the step of updating position examine 67 | probabilities given current query-doc attractiveness 68 | 69 | Algorithm based on Expectation Maximization derived in 70 | chapter 4 of "Click Models for Web Search" by 71 | Chulkin, Markov, de Rijke 72 | 73 | """ 74 | new_rank_probs = defaultlist(lambda: 0) 75 | 76 | for session in sessions: 77 | for rank, doc in enumerate(session.docs): 78 | if doc.click: 79 | new_rank_probs[rank] += 1 80 | else: 81 | # attractiveness at this query/doc pair 82 | a_qd = model.attracts[(session.query, doc.doc_id)] 83 | numerator = (1 - a_qd) * model.ranks[rank] 84 | denominator = 1 - (a_qd * model.ranks[rank]) 85 | # When not clicked - was it examined? We have to guess! 86 | # - If it has seemed very attractive, we assume it 87 | # was not examined. Because who could pass up such 88 | # a yummy looking search result? (numerator) 89 | # 90 | # - If its not attractive, but this rank gets examined 91 | # a lot, the new rank prob is closer to 1 92 | # (approaches ranks[rank] / ranks[rank]) 93 | # 94 | # - If its not examined much, wont contribute much 95 | new_rank_probs[rank] += numerator / denominator 96 | for i in range(len(new_rank_probs)): 97 | model.ranks[i] = new_rank_probs[i] / len(sessions) 98 | 99 | 100 | def position_based_model(sessions, rounds=20): 101 | """ 102 | Algorithm based on Expectation Maximization derived in 103 | chapter 4 (table 4.1) of "Click Models for Web Search" by 104 | Chulkin, Markov, de Rijke 105 | 106 | Given the observed sessions 107 | Initialized: 108 | - prob a ranks is examined (`ranks`) 109 | - randomly initialized query/doc attractiveness 110 | 111 | Compute: 112 | - Probability a doc is attractive for a query 113 | """ 114 | model=Model() 115 | for i in range(0,rounds): 116 | update_attractiveness(sessions, model) 117 | update_examines(sessions, model) 118 | return model 119 | 120 | 121 | if __name__ == "__main__": 122 | sessions = build([ 123 | ('A', ((1, True), (2, False), (3, True), (0, False))), 124 | ('B', ((5, False), (2, True), (3, True), (0, False))), 125 | ('A', ((1, False), (2, False), (3, True), (0, False))), 126 | ('B', ((1, False), (2, False), (3, False), (9, True))), 127 | ('A', ((9, False), (2, False), (1, True), (0, True))), 128 | ('B', ((6, True), (2, False), (3, True), (1, False))), 129 | ('A', ((7, False), (4, True), (1, False), (3, False))), 130 | ('B', ((8, True), (2, False), (3, True), (1, False))), 131 | ('A', ((1, False), (4, True), (2, False), (3, False))), 132 | ('B', ((7, True), (4, False), (5, True), (1, True))), 133 | ]) 134 | position_based_model(sessions, rounds=100) 135 | -------------------------------------------------------------------------------- /ltr/clickmodels/sdbn.py: -------------------------------------------------------------------------------- 1 | from collections import Counter, defaultdict 2 | from ltr.clickmodels.session import build 3 | 4 | class Model(): 5 | def __init__(self): 6 | # Satisfaction per query-doc 7 | self.satisfacts = defaultdict(lambda: 0.1) 8 | 9 | # Attractiveness per query-doc 10 | self.attracts = defaultdict(lambda : 0.1) 11 | 12 | reverse_enumerate = lambda l: zip(range(len(l)-1, -1, -1), reversed(l)) 13 | 14 | 15 | def sdbn(sessions): 16 | """ Simplified Dynamic Bayesian Network is a simpler 17 | version of the much more complex Dynamic Bayesian Network 18 | that the authors say comes close to the accuracy of DBN 19 | 20 | Most importantly, it can be solved directly and simply 
without 21 | an EM learning process 22 | 23 | Features of sdbn: 24 | - Attractiveness is any click out of sessions where that document 25 | appears before the last click of the session 26 | - Satisfaction occurs when a doc is the last document clicked 27 | out of all sessions where that document is clicked 28 | 29 | """ 30 | model = Model() 31 | NO_CLICK = -1 32 | counts = Counter() 33 | clicks = Counter() 34 | last_clicks = Counter() 35 | for session in sessions: 36 | last_click = NO_CLICK 37 | for rank, doc in reverse_enumerate(session.docs): 38 | if last_click == NO_CLICK and doc.click: 39 | last_click = rank 40 | 41 | if last_click != NO_CLICK: 42 | query_doc = (session.query, doc.doc_id) 43 | counts[query_doc] += 1 44 | 45 | if doc.click: 46 | # Cascading model doesn't consider 47 | # clicks past the last one, so we count 48 | # this one and break out 49 | clicks[query_doc] += 1 50 | if rank == last_click: 51 | last_clicks[query_doc] += 1 52 | 53 | # For all meaningful sessions (where query_doc appear) 54 | # count attractiveness clicks / num sessions 55 | # count satisfacts last clicks / sessions with clicks 56 | for query_doc, count in counts.items(): 57 | model.attracts[query_doc] = clicks[query_doc] / count 58 | if query_doc in clicks: 59 | model.satisfacts[query_doc] = last_clicks[query_doc] / clicks[query_doc] 60 | return model 61 | 62 | 63 | if __name__ == "__main__": 64 | sessions = build([ 65 | ('A', ((1, True), (2, False), (3, True), (0, False))), 66 | ('B', ((5, False), (2, True), (3, True), (0, False))), 67 | ('A', ((1, False), (2, False), (3, True), (0, False))), 68 | ('B', ((1, False), (2, False), (3, False), (9, True))), 69 | ('A', ((9, False), (2, False), (1, True), (0, True))), 70 | ('B', ((6, True), (2, False), (3, True), (1, False))), 71 | ('A', ((7, False), (4, True), (1, False), (3, False))), 72 | ('B', ((8, True), (2, False), (3, True), (1, False))), 73 | ('A', ((1, False), (4, True), (2, False), (3, False))), 74 | ('B', ((7, True), (4, False), (5, True), (1, True))), 75 | ]) 76 | model = sdbn(sessions) 77 | print(model.attracts[('A', 1)]) 78 | print(model.satisfacts[('A', 1)]) 79 | print(model.attracts[('B', 1)]) 80 | print(model.satisfacts[('B', 1)]) 81 | -------------------------------------------------------------------------------- /ltr/clickmodels/session.py: -------------------------------------------------------------------------------- 1 | 2 | class Doc: 3 | def __init__(self, click, doc_id, conversion=False): 4 | self.click = click 5 | self.doc_id = doc_id 6 | self.conversion = conversion 7 | 8 | def __repr__(self): 9 | return "Doc(doc_id=%s, click=%s, conversion=%s)" % (self.doc_id, self.click, self.conversion) 10 | 11 | def __str__(self): 12 | return "(%s, %s, %s)" % (self.doc_id, self.click, self.conversion) 13 | 14 | 15 | class Session: 16 | def __init__(self, query, docs): 17 | self.query = query 18 | self.docs = docs 19 | # Check if docs are unique 20 | docset = set() 21 | for doc in docs: 22 | if doc.doc_id in docset: 23 | raise ValueError("A session may only list a doc exactly once in search results") 24 | docset.add(doc.doc_id) 25 | 26 | def __repr__(self): 27 | return "Session(query=%s, docs=%s)" % (self.query, self.docs) 28 | 29 | def __str__(self): 30 | return "(%s, (%s))" % (self.query, self.docs) 31 | 32 | 33 | def build_one(sess_tuple): 34 | """ Take a tuple where 35 | 0th item is query (a string that uniquely identifies it) 36 | 1st item is a list of docs, with clicks 37 | and optionally a conversion id or true/false 38 | 39 | 40 | ('A', 
((1, True), (2, False), (3, True), (0, False))), 41 | 42 | alternatively a value can be attached to the doc 43 | 44 | ('A', ((1, True, 0.9), (2, False, 0.8), (3, True, 1.0), (0, False))), 45 | """ 46 | query = sess_tuple[0] 47 | docs = [] 48 | for doc_tuple in sess_tuple[1]: 49 | conversion = False 50 | if len(doc_tuple) > 2: 51 | conversion = doc_tuple[2] 52 | docs.append(Doc(doc_id=doc_tuple[0], 53 | click=doc_tuple[1], 54 | conversion=conversion)) 55 | return Session(query=query, docs=docs) 56 | 57 | 58 | def build(sess_tuples): 59 | sesss = [] 60 | for sess_tup in sess_tuples: 61 | sesss.append(build_one(sess_tup)) 62 | return sesss 63 | 64 | -------------------------------------------------------------------------------- /ltr/clickmodels/ubm.py: -------------------------------------------------------------------------------- 1 | from ltr.clickmodels.session import build 2 | from collections import Counter, defaultdict 3 | 4 | class Model(): 5 | def __init__(self): 6 | # Examine prob per-rank 7 | # Rank 0 is first displayed on page 8 | # Rank -1 i 9 | self.ranks = defaultdict(lambda: 0.4) 10 | 11 | # Attractiveness per query-doc 12 | self.attracts = defaultdict(lambda : 0.5) 13 | 14 | 15 | def update_attractiveness(sessions, model): 16 | """ Run through the step of updating attractiveness 17 | based on session information and the current rank 18 | examine probabilities 19 | 20 | Algorithm based on Expectation Maximization derived in 21 | chapter 4 of "Click Models for Web Search" by 22 | Chulkin, Markov, de Rijke 23 | 24 | """ 25 | attractions = Counter() #Track query-doc attractiveness in this round 26 | num_sessions = Counter() #Track num sessions where query-doc appears 27 | for session in sessions: 28 | last_click = -1 29 | for rank, doc in enumerate(session.docs): 30 | query_doc_key = (session.query, doc.doc_id) 31 | att = 0 32 | if doc.click: 33 | 34 | last_click = rank 35 | 36 | att = 1 37 | else: 38 | exam = model.ranks[(last_click,rank)] 39 | assert exam <= 1.0 40 | doc_a = model.attracts[query_doc_key] 41 | # Not examined, but attractive / 42 | # 1 - (examined and attractive) 43 | # When not clicked: 44 | # If somehow this is currently a rank examined 45 | # a lot and this doc is historically attractive, then 46 | # we might still count it as mostly attractive 47 | # OR if the doc IS examined a lot AND its not 48 | # attractive, then we do the opposite, add 49 | # close to 0 50 | att = (((1 - exam) * doc_a) / (1 - (exam * doc_a))) 51 | 52 | # Store away a_sum and 53 | assert att <= 1.0 54 | attractions[query_doc_key] += att 55 | num_sessions[query_doc_key] += 1 56 | assert attractions[query_doc_key] <= num_sessions[query_doc_key] 57 | 58 | # Update the main query attractiveness from the attractions / num sessions 59 | for (query_id, doc_id), a_sum in attractions.items(): 60 | query_doc_key = (query_id, doc_id) 61 | att = a_sum / num_sessions[query_doc_key] 62 | assert att <= 1.0 63 | model.attracts[query_doc_key] = att 64 | 65 | 66 | def update_examines(sessions, model): 67 | """ Run through the step of updating position examine 68 | probabilities given current query-doc attractiveness 69 | 70 | Algorithm based on Expectation Maximization derived in 71 | chapter 4 of "Click Models for Web Search" by 72 | Chulkin, Markov, de Rijke 73 | 74 | """ 75 | new_rank_probs = defaultdict(lambda: 0) 76 | counts = defaultdict(lambda: 0) 77 | 78 | for session in sessions: 79 | last_click = -1 80 | for rank, doc in enumerate(session.docs): 81 | if doc.click: 82 | new_rank_probs[(last_click, 
rank)] += 1 83 | counts[(last_click, rank)] += 1 84 | if last_click == -1 and rank == 3: 85 | print(counts[(last_click,rank)]) 86 | 87 | last_click = rank 88 | else: 89 | # attractiveness at this query/doc pair 90 | a_qd = model.attracts[(session.query, doc.doc_id)] 91 | numerator = (1 - a_qd) * model.ranks[(last_click, rank)] 92 | denominator = 1 - (a_qd * model.ranks[(last_click, rank)]) 93 | # When not clicked - was it examined? We have to guess! 94 | # - If it has seemed very attractive, we assume it 95 | # was not examined. Because who could pass up such 96 | # a yummy looking search result? (numerator) 97 | # 98 | # - If its not attractive, but this rank gets examined 99 | # a lot, the new rank prob is closer to 1 100 | # (approaches ranks[rank] / ranks[rank]) 101 | # 102 | # - If its not examined much, wont contribute much 103 | new_rank_probs[(last_click, rank)] += numerator / denominator 104 | counts[(last_click, rank)] += 1 105 | if last_click == -1 and rank == 3: 106 | print(counts[(last_click,rank)]) 107 | 108 | for (last_click, click), count in counts.items(): 109 | model.ranks[(last_click, click)] = new_rank_probs[(last_click, click)] / count 110 | 111 | 112 | def user_browse_model(sessions, rounds=20): 113 | """ 114 | Algorithm based on Expectation Maximization derived in 115 | chapter 4 (table 4.1) of "Click Models for Web Search" by 116 | Chulkin, Markov, de Rijke 117 | 118 | """ 119 | model=Model() 120 | for i in range(0,rounds): 121 | update_attractiveness(sessions, model) 122 | update_examines(sessions, model) 123 | return model 124 | 125 | 126 | if __name__ == "__main__": 127 | sessions = build([ 128 | ('A', ((1, True), (2, False), (3, True), (0, False))), 129 | ('B', ((5, False), (2, True), (3, True), (0, False))), 130 | ('A', ((1, False), (2, False), (3, True), (0, False))), 131 | ('B', ((1, False), (2, False), (3, False), (9, True))), 132 | ('A', ((9, False), (2, False), (1, True), (0, True))), 133 | ('B', ((6, True), (2, False), (3, True), (1, False))), 134 | ('A', ((7, False), (4, True), (1, False), (3, False))), 135 | ('B', ((8, True), (2, False), (3, True), (1, False))), 136 | ('A', ((1, False), (4, True), (2, False), (3, False))), 137 | ('B', ((7, True), (4, False), (5, True), (1, True))), 138 | ]) 139 | user_browse_model(sessions, rounds=100) 140 | -------------------------------------------------------------------------------- /ltr/client/__init__.py: -------------------------------------------------------------------------------- 1 | from .solr_client import SolrClient 2 | -------------------------------------------------------------------------------- /ltr/client/base_client.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | ''' 4 | This project demonstrates working with LTR in Elasticsearch and Solr 5 | 6 | The goal of this class is to abstract away the server and highlight the steps 7 | required to begin working with LTR. 
This keeps the examples agnostic about 8 | which backend is being used, but the implementations of each client 9 | should be useful references to those getting started with LTR on 10 | their specific platform 11 | ''' 12 | class BaseClient(ABC): 13 | @abstractmethod 14 | def get_host(self): 15 | pass 16 | 17 | @abstractmethod 18 | def name(self): 19 | pass 20 | 21 | @abstractmethod 22 | def delete_index(self, index): 23 | pass 24 | 25 | @abstractmethod 26 | def create_index(self, index): 27 | pass 28 | 29 | @abstractmethod 30 | def index_documents(self, index, doc_src): 31 | pass 32 | 33 | @abstractmethod 34 | def reset_ltr(self, index): 35 | pass 36 | 37 | @abstractmethod 38 | def create_featureset(self, index, name, ftr_config): 39 | pass 40 | 41 | @abstractmethod 42 | def query(self, index, query): 43 | pass 44 | 45 | @abstractmethod 46 | def get_doc(self, doc_id, fields=None): 47 | pass 48 | 49 | @abstractmethod 50 | def log_query(self, index, featureset, ids, params): 51 | pass 52 | 53 | @abstractmethod 54 | def submit_model(self, featureset, index, model_name, model_payload): 55 | pass 56 | 57 | @abstractmethod 58 | def submit_ranklib_model(self, featureset, index, model_name, model_payload): 59 | pass 60 | 61 | @abstractmethod 62 | def model_query(self, index, model, model_params, query): 63 | pass 64 | 65 | @abstractmethod 66 | def feature_set(self, index, name): 67 | """ Return a mapping of name/feature ordinal 68 | and the raw (search engine specific) feature list""" 69 | pass 70 | 71 | 72 | -------------------------------------------------------------------------------- /ltr/client/solr_parse.py: -------------------------------------------------------------------------------- 1 | def every_other_zipped(lst): 2 | return zip(lst[0::2],lst[1::2]) 3 | 4 | def dictify(nl_tups): 5 | """ Return dict if all keys unique, otherwise 6 | dont modify """ 7 | as_dict = dict(nl_tups) 8 | if len(as_dict) == len(nl_tups): 9 | return as_dict 10 | return nl_tups 11 | 12 | def parse_named_list(lst): 13 | shallow_tups = [tup for tup in every_other_zipped(lst)] 14 | 15 | nl_as_tups = [] 16 | 17 | for tup in shallow_tups: 18 | if isinstance(tup[1], list): 19 | tup = (tup[0], parse_named_list(tup[1])) 20 | nl_as_tups.append(tup) 21 | return dictify(nl_as_tups) 22 | 23 | 24 | def parse_termvect_namedlist(lst, field): 25 | """ Parse the named list and perform some transformations to create consistent 26 | JSON to parse 27 | 28 | Specifically changing {"positions": ...} to {"positions": [1234,4567]} 29 | 30 | """ 31 | 32 | def listify_posns(posn_attrs): 33 | if isinstance(posn_attrs, dict): 34 | assert len(posn_attrs) == 1 35 | return [posn_attrs['position']] 36 | return [posn_attr[1] for posn_attr in posn_attrs] 37 | 38 | 39 | tv_parsed = parse_named_list(lst) 40 | for doc_id, doc_field_tv in tv_parsed.items(): 41 | for field_name, term_vects in doc_field_tv.items(): 42 | # T 43 | if field_name == field: 44 | for term, attrs in term_vects.items(): 45 | for attr_key, attr_val in attrs.items(): 46 | if attr_key == 'positions': 47 | attrs['positions'] = listify_posns(attr_val) 48 | return tv_parsed 49 | 50 | 51 | 52 | if __name__ == "__main__": 53 | solr_nl = [ 54 | "D100000", [ 55 | "uniqueKey", "D100000", 56 | "body", [ 57 | "1", [ 58 | "positions", [ 59 | "position", 92, 60 | "position", 113 61 | ]], 62 | "2", [ 63 | "positions", [ 64 | "position", 22, 65 | "position", 413 66 | ]], 67 | "boo", [ 68 | "positions", [ 69 | "position", 22, 70 | ]] 71 | ]]] 72 | print(repr(parse_termvect_namedlist(solr_nl, 
'body'))) 73 | -------------------------------------------------------------------------------- /ltr/download.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from os import path 3 | from tqdm import tqdm 4 | 5 | def download_one(uri, dest='data/', force=False, fancy=False): 6 | import os 7 | 8 | if not os.path.exists(dest): 9 | os.makedirs(dest) 10 | 11 | if not os.path.isdir(dest): 12 | raise ValueError("dest {} is not a directory".format(dest)) 13 | 14 | filename = uri[uri.rfind('/') + 1:] 15 | filepath = os.path.join(dest, filename) 16 | if path.exists(filepath): 17 | if not force: 18 | print(filepath + ' already exists') 19 | return 20 | print("exists but force=True, Downloading anyway") 21 | 22 | if not fancy: 23 | with open(filepath, 'wb') as out: 24 | print('GET {}'.format(uri)) 25 | resp = requests.get(uri, stream=True) 26 | for chunk in resp.iter_content(chunk_size=1024): 27 | if chunk: 28 | out.write(chunk) 29 | else: 30 | resp = requests.get(uri, stream=True) 31 | total = int(resp.headers.get('content-length', 0)) 32 | with open(filepath, 'wb') as file, tqdm( 33 | desc=filepath, 34 | total=total, 35 | unit='iB', 36 | unit_scale=True, 37 | unit_divisor=1024, 38 | ) as bar: 39 | for data in resp.iter_content(chunk_size=1024): 40 | size = file.write(data) 41 | bar.update(size) 42 | 43 | def extract_tgz(fname, dest='data/'): 44 | import tarfile 45 | with tarfile.open(fname, 'r:gz') as tar: 46 | tar.extractall(path=dest) 47 | 48 | 49 | def download(uris, dest='data/', force=False, fancy=False): 50 | for uri in uris: 51 | download_one(uri=uri, dest=dest, force=force, fancy=fancy) 52 | -------------------------------------------------------------------------------- /ltr/evaluate.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import plotly.graph_objs as go 5 | from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot 6 | 7 | def log_run(cmd): 8 | resp = os.popen(cmd).read() 9 | print(resp) 10 | 11 | def quiet_run(cmd): 12 | os.popen(cmd).read() 13 | 14 | def evaluate(mode): 15 | # Build the docker image 16 | if mode == 'elastic': 17 | cmd = 'docker build --no-cache -t ltr-rre rre/elastic/.' 18 | else: 19 | cmd = 'docker build --no-cache -t ltr-rre rre/solr/.' 
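# Note: log_run() echoes a command's output while quiet_run() discards it,
# so the docker build below runs quietly and only the evaluation run and
# report-copy steps print their output.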
20 | 21 | print('Building RRE image - This will take a while') 22 | quiet_run(cmd) 23 | 24 | # Remove and run a fresh docker image 25 | cmd = 'docker rm -f ltr-rre' 26 | quiet_run(cmd) 27 | 28 | cmd = 'docker run --name ltr-rre ltr-rre' 29 | print('Running evaluation') 30 | log_run(cmd) 31 | 32 | # Copy out reports 33 | cmd = 'docker cp ltr-rre:/rre/target/rre/evaluation.json data/rre-evaluation.json' 34 | log_run(cmd) 35 | 36 | cmd = 'docker cp ltr-rre:/rre/target/site/rre-report.xlsx data/rre-report.xlsx' 37 | log_run(cmd) 38 | 39 | print('RRE Evaluation complete') 40 | 41 | 42 | def rre_table(): 43 | init_notebook_mode(connected=True) 44 | 45 | with open('data/rre-evaluation.json') as src: 46 | report = json.load(src) 47 | metrics = report['metrics'] 48 | 49 | experiments = ['baseline', 'classic', 'latest'] 50 | precisions = [] 51 | recalls = [] 52 | errs = [] 53 | 54 | for exp in experiments: 55 | precisions.append(metrics['P']['versions'][exp]['value']) 56 | recalls.append(metrics['R']['versions'][exp]['value']) 57 | errs.append(metrics['ERR@30']['versions'][exp]['value']) 58 | 59 | trace = go.Table( 60 | header=dict(values=['', 'Precision', 'Recall', 'ERR'], fill = dict(color='#AAAAAA')), 61 | cells=dict(values=[ 62 | experiments, 63 | precisions, 64 | recalls, 65 | errs 66 | ]) 67 | ) 68 | 69 | data = [trace] 70 | iplot(data) 71 | 72 | -------------------------------------------------------------------------------- /ltr/helpers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/ltr/helpers/__init__.py -------------------------------------------------------------------------------- /ltr/helpers/butterfingers.py: -------------------------------------------------------------------------------- 1 | def butterfingers(text,prob=0.1,keyboard='qwerty'): 2 | import random 3 | 4 | """ taken from 5 | https://github.com/Decagon/butter-fingers/blob/master/butterfingers/butterfingers.py """ 6 | 7 | keyApprox = {} 8 | 9 | if keyboard == "qwerty": 10 | keyApprox['q'] = "qwasedzx" 11 | keyApprox['w'] = "wqesadrfcx" 12 | keyApprox['e'] = "ewrsfdqazxcvgt" 13 | keyApprox['r'] = "retdgfwsxcvgt" 14 | keyApprox['t'] = "tryfhgedcvbnju" 15 | keyApprox['y'] = "ytugjhrfvbnji" 16 | keyApprox['u'] = "uyihkjtgbnmlo" 17 | keyApprox['i'] = "iuojlkyhnmlp" 18 | keyApprox['o'] = "oipklujm" 19 | keyApprox['p'] = "plo['ik" 20 | 21 | keyApprox['a'] = "aqszwxwdce" 22 | keyApprox['s'] = "swxadrfv" 23 | keyApprox['d'] = "decsfaqgbv" 24 | keyApprox['f'] = "fdgrvwsxyhn" 25 | keyApprox['g'] = "gtbfhedcyjn" 26 | keyApprox['h'] = "hyngjfrvkim" 27 | keyApprox['j'] = "jhknugtblom" 28 | keyApprox['k'] = "kjlinyhn" 29 | keyApprox['l'] = "lokmpujn" 30 | 31 | keyApprox['z'] = "zaxsvde" 32 | keyApprox['x'] = "xzcsdbvfrewq" 33 | keyApprox['c'] = "cxvdfzswergb" 34 | keyApprox['v'] = "vcfbgxdertyn" 35 | keyApprox['b'] = "bvnghcftyun" 36 | keyApprox['n'] = "nbmhjvgtuik" 37 | keyApprox['m'] = "mnkjloik" 38 | keyApprox[' '] = " " 39 | else: 40 | print("Keyboard not supported.") 41 | 42 | probOfTypo = int(prob * 100) 43 | 44 | buttertext = "" 45 | for letter in text: 46 | lcletter = letter.lower() 47 | if not lcletter in keyApprox.keys(): 48 | newletter = lcletter 49 | else: 50 | if random.choice(range(0, 100)) <= probOfTypo: 51 | newletter = random.choice(keyApprox[lcletter]) 52 | else: 53 | newletter = lcletter 54 | # go back to original case 55 | if not lcletter == letter: 56 | newletter = 
newletter.upper() 57 | buttertext += newletter 58 | 59 | return buttertext 60 | 61 | 62 | -------------------------------------------------------------------------------- /ltr/helpers/convert.py: -------------------------------------------------------------------------------- 1 | # converts LambdaMART XML models to JSON for Solr.. 2 | 3 | import xml.etree.ElementTree as ET 4 | 5 | 6 | def convert(ensemble_xml_string, modelName, featureSet, featureMapping): 7 | modelClass = 'org.apache.solr.ltr.model.MultipleAdditiveTreesModel' 8 | 9 | model = { 10 | 'store': featureSet, 11 | 'name': modelName, 12 | 'class': modelClass, 13 | 'features': featureMapping 14 | } 15 | 16 | # Clean up header 17 | ensemble_xml_string = '\n'.join(ensemble_xml_string.split('\n')[7:]) 18 | lambdaModel = ET.fromstring(ensemble_xml_string) 19 | 20 | trees = [] 21 | for node in lambdaModel: 22 | t = { 23 | 'weight': str(node.attrib['weight']), 24 | 'root': parseSplits(node[0], featureMapping) 25 | } 26 | trees.append(t) 27 | 28 | # print(trees) 29 | model['params'] = {'trees': trees} 30 | 31 | return model 32 | 33 | def parseSplits(split, features): 34 | obj = {} 35 | for el in split: 36 | if (el.tag == 'feature'): 37 | obj['feature'] = features[(int(el.text.strip()) - 1)]['name'] 38 | elif (el.tag == 'threshold'): 39 | obj['threshold'] = str(el.text.strip()) 40 | elif (el.tag == 'split' and 'pos' in el.attrib): 41 | obj[el.attrib['pos']] = parseSplits(el, features) 42 | elif (el.tag == 'output'): 43 | obj['value'] = str(el.text.strip()) 44 | return obj 45 | -------------------------------------------------------------------------------- /ltr/helpers/defaultlist.py: -------------------------------------------------------------------------------- 1 | class DefaultList(list): 2 | """ adapted from https://stackoverflow.com/a/869901/8123""" 3 | 4 | def __init__(self, factory): 5 | self.factory = factory 6 | 7 | def __getitem__(self, index): 8 | size = len(self) 9 | if index >= size: 10 | self.extend(self.factory() for _ in range(size, index + 1)) 11 | 12 | return list.__getitem__(self, index) 13 | 14 | def __setitem__(self, index, value): 15 | size = len(self) 16 | if index >= size: 17 | self.extend(self.factory() for _ in range(size, index + 1)) 18 | 19 | list.__setitem__(self, index, value) 20 | 21 | def defaultlist(factory): 22 | return DefaultList(factory) 23 | -------------------------------------------------------------------------------- /ltr/helpers/esUrlParse.py: -------------------------------------------------------------------------------- 1 | def parseUrl(fullEsUrl): 2 | from urllib.parse import urlsplit, urlunsplit 3 | import os.path 4 | o = urlsplit(fullEsUrl) 5 | 6 | esUrl = urlunsplit([o.scheme, o.netloc, '','','']) 7 | 8 | indexAndSearchType = os.path.split(o.path) 9 | 10 | return (esUrl, indexAndSearchType[0][1:], indexAndSearchType[1]) 11 | 12 | 13 | if __name__ == "__main__": 14 | from sys import argv 15 | print(parseUrl(argv[1])) 16 | -------------------------------------------------------------------------------- /ltr/helpers/handle_resp.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def resp_msg(msg, resp, throw=True): 4 | print('{} [Status: {}]'.format(msg, resp.status_code)) 5 | if resp.status_code >= 400: 6 | print(resp.text) 7 | if throw: 8 | raise RuntimeError(resp.text) 9 | 10 | -------------------------------------------------------------------------------- /ltr/helpers/msmarco/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/ltr/helpers/msmarco/__init__.py -------------------------------------------------------------------------------- /ltr/helpers/msmarco/evaluate.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import gzip 3 | 4 | 5 | class QRel(): 6 | 7 | def __init__(self, qid, docid, keywords): 8 | self.qid=qid 9 | self.docid=docid 10 | self.keywords = keywords 11 | 12 | def eval_rr(self, doc_ranking): 13 | """ Evaluate the provided doc ranking using reciprical rank 14 | (1/rank of the expected doc) 15 | 16 | returns 0 if this qrels doc id is missing 17 | """ 18 | 19 | for rank, docid in enumerate(doc_ranking, start=1): 20 | if docid == self.docid: 21 | return 1.0 / rank 22 | return 0.0 23 | 24 | @staticmethod 25 | def read_qrels(qrels_fname='data/msmarco-doctrain-qrels.tsv.gz', 26 | queries_fname='data/msmarco-doctrain-queries.tsv.gz'): 27 | 28 | qids_to_keywords = QRel.get_keyword_lookup(queries_fname) 29 | 30 | with gzip.open(qrels_fname, 'rt') as f: 31 | reader = csv.reader(f, delimiter=' ') 32 | for row in reader: 33 | qid = row[0] 34 | keywords = None 35 | if qid in qids_to_keywords: 36 | keywords = qids_to_keywords[qid] 37 | else: 38 | print("Missing keywords for %s" % qid) 39 | yield QRel(qid=row[0], docid=row[2], keywords=keywords) 40 | 41 | @staticmethod 42 | def get_keyword_lookup(fname='data/msmarco-doctrain-queries.tsv.gz'): 43 | qids_to_keywords = {} 44 | with gzip.open(fname, 'rt') as f: 45 | reader = csv.reader(f, delimiter='\t') 46 | for row in reader: 47 | qids_to_keywords[row[0]] = row[1] 48 | return qids_to_keywords 49 | 50 | def __str__(self): 51 | return "qid:%s(%s) => doc:%s" % (self.qid, self.keywords, self.docid) 52 | 53 | 54 | if __name__ == "__main__": 55 | qrels = {} 56 | for qrel in QRel.read_qrels(): 57 | qrels[qrel.qid] = qrel 58 | 59 | print(qrels['1185869'].eval_rr(['1','1'])) 60 | 61 | -------------------------------------------------------------------------------- /ltr/helpers/ranklib_result.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | 4 | class RanklibResult: 5 | 6 | """ A result of ranklib training, either for a 7 | single training operation 8 | (where trainingLogs is just set, and has a single item) 9 | or k-folds cross validation 10 | (where the foldResults/kcv are set; with a result for 11 | each fold that is run """ 12 | 13 | def __init__(self, trainingLogs, foldResults, 14 | kcvTestAvg, kcvTrainAvg): 15 | self.trainingLogs = trainingLogs 16 | self.foldResults = foldResults 17 | self.kcvTrainAvg = kcvTrainAvg 18 | self.kcvTestAvg = kcvTestAvg 19 | 20 | class TrainingLog: 21 | 22 | def __init__(self, rounds, impacts, trainMetricName, trainMetricVal): 23 | self.impacts = impacts 24 | self.rounds = rounds 25 | self.trainMetricName = trainMetricName 26 | self.trainMetricVal = trainMetricVal 27 | 28 | 29 | def metric(self): 30 | if self.trainMetricName is not None: 31 | return self.trainMetricVal 32 | if len(self.rounds) > 0: 33 | return self.rounds[-1] 34 | else: 35 | return 0 36 | 37 | class FoldResult: 38 | def __init__(self, foldId, trainMetric, testMetric): 39 | self.foldNum = foldId 40 | self.trainMetric = trainMetric 41 | self.testMetric = testMetric 42 | 43 | impactRe = re.compile(' Feature (\d+) reduced error (.*)') 44 | roundsRe = re.compile('(\d+)\s+\| (\d+)') 
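# Illustrative examples of the RankLib output lines the two patterns above are
# written to match (formats inferred from the regexes and parse_training_log
# below, not quoted verbatim from RankLib):
#   impactRe  ->  " Feature 12 reduced error 0.00731"
#   roundsRe  ->  "100       | 0.9382"   (training round | training metric)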
45 | foldsRe = re.compile('^Fold (\d+)\s+\|(.*)\|(.*)') 46 | avgRe = re.compile('^Avg.\s+\|(.*)\|(.*)') 47 | trainMetricRe = re.compile('(.*@.*) on training data: (.*)') 48 | 49 | def parse_training_log(rawResult): 50 | """ Takes raw result from Ranklib training and 51 | gathers the feature impacts, training rounds, 52 | and any cross-validation information """ 53 | lines = rawResult.split('\n') 54 | # Fold 1 | 0.9396 | 0.8764 55 | train = False 56 | logs = [] 57 | folds = [] 58 | impacts = {} 59 | rounds = [] 60 | trainMetricName = None 61 | trainMetricVal = 0.0 62 | kcvTestAvg = kcvTrainAvg = None 63 | for line in lines: 64 | if 'Training starts...' in line: 65 | if train: 66 | log = TrainingLog(rounds=rounds, 67 | impacts=impacts, 68 | trainMetricName=trainMetricName, 69 | trainMetricVal=trainMetricVal) 70 | logs.append(log) 71 | impacts = {} 72 | rounds = [] 73 | train = True 74 | 75 | if train: 76 | m = re.match(impactRe, line) 77 | if m: 78 | ftrId = m.group(1) 79 | error = float(m.group(2)) 80 | impacts[ftrId] = error 81 | m = re.match(roundsRe, line) 82 | if m: 83 | values = line.split('|') 84 | metricTrain = float(values[1]) 85 | rounds.append(metricTrain) 86 | m = re.match(trainMetricRe, line) 87 | if m: 88 | trainMetricVal = float(m.group(2)) 89 | trainMetricName = m.group(1) 90 | 91 | m = re.match(foldsRe, line) 92 | if m: 93 | foldId = m.group(1) 94 | trainMetric = float(m.group(2)) 95 | testMetric = float(m.group(3)) 96 | folds.append(FoldResult(foldId=foldId, 97 | testMetric=testMetric, 98 | trainMetric=trainMetric)) 99 | m = re.match(avgRe, line) 100 | if m: 101 | kcvTrainAvg = float(m.group(1)) 102 | kcvTestAvg = float(m.group(2)) 103 | 104 | if train: 105 | log = TrainingLog(rounds=rounds, 106 | impacts=impacts, 107 | trainMetricName=trainMetricName, 108 | trainMetricVal=trainMetricVal) 109 | logs.append(log) 110 | 111 | return RanklibResult(trainingLogs=logs, 112 | foldResults=folds, 113 | kcvTrainAvg=kcvTrainAvg, 114 | kcvTestAvg=kcvTestAvg) 115 | 116 | -------------------------------------------------------------------------------- /ltr/helpers/solr_escape.py: -------------------------------------------------------------------------------- 1 | def esc_kw(kw): 2 | """ Take a keyword and escape all the 3 | Solr parts we want to escape!""" 4 | kw = kw.replace('\\', '\\\\') # be sure to do this first, as we inject \! 
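# Illustrative end-to-end example of the escaping performed by this function:
#   esc_kw("AC/DC (live) [disc 1]")  ->  "AC\/DC \(live\) \[disc 1\]"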
5 | kw = kw.replace('(', '\(') 6 | kw = kw.replace(')', '\)') 7 | kw = kw.replace('+', '\+') 8 | kw = kw.replace('-', '\-') 9 | kw = kw.replace(':', '\:') 10 | kw = kw.replace('/', '\/') 11 | kw = kw.replace(']', '\]') 12 | kw = kw.replace('[', '\[') 13 | kw = kw.replace('*', '\*') 14 | kw = kw.replace('?', '\?') 15 | kw = kw.replace('{', '\{') 16 | kw = kw.replace('}', '\}') 17 | kw = kw.replace('~', '\~') 18 | 19 | 20 | return kw 21 | -------------------------------------------------------------------------------- /ltr/helpers/tau.py: -------------------------------------------------------------------------------- 1 | sign = lambda a: (a>0) - (a<0) 2 | 3 | def pairs_in_order(ranking, both_ways=True): 4 | assert len(ranking) > 1 5 | for idx1, val1 in enumerate(ranking): 6 | for idx2, val2 in enumerate(ranking): 7 | if idx2 > idx1: 8 | yield val1, val2, sign(idx2-idx1) 9 | if both_ways: 10 | yield val2, val1, sign(idx1-idx2) 11 | 12 | def tau(rank1, rank2, at=4): 13 | rank1in = {} 14 | 15 | 16 | if len(rank1) < at or len(rank2) < at: 17 | raise ValueError("rankings must be larger than provided at param(%s)" % at) 18 | 19 | # Handle 1 as a special case 20 | if at == 1: 21 | if rank1[0] == rank2[0]: 22 | return 1 23 | return -1 24 | 25 | rank1 = rank1[:at]; rank2 = rank2[:at] 26 | 27 | # gather concordances/discords for rank1 28 | for val1, val2, order in pairs_in_order(rank1, both_ways=True): 29 | rank1in[(val1,val2)] = order 30 | 31 | # check rank2 32 | concords = 0 33 | discords = 0 34 | for val1, val2, order in pairs_in_order(rank2, both_ways=False): 35 | try: 36 | rank1order = rank1in[(val1,val2)] 37 | if order == rank1order: 38 | concords += 1 39 | else: 40 | discords += 1 41 | except KeyError: 42 | discords += 1 43 | 44 | return (concords - discords) / ((at * (at - 1)) / 2) 45 | 46 | def avg_tau(rank1, rank2, at=4): 47 | if len(rank1) < at or len(rank2) < at: 48 | raise ValueError("rankings must be larger than provided at param(%s)" % at) 49 | 50 | rank1 = rank1[:at]; rank2 = rank2[:at] 51 | 52 | tot = 0 53 | for i in range(1,at+1): 54 | tot += tau(rank1,rank2,at=i) 55 | return tot / (at) 56 | 57 | if __name__ == "__main__": 58 | print(tau([1,2,3,4],[4,3,2,1])) 59 | print(tau([1,2,3,4],[1,2,3,4])) 60 | print(tau([1,2,4,3],[1,2,3,4])) 61 | print(tau([5,6,7,8],[1,2,3,4])) 62 | print(tau([1,2,3,5],[1,2,3,4])) 63 | print(tau([5,3,2,1],[4,3,2,1])) 64 | l1=[1,2,4,3]; l2=[1,2,3,4]; l3=[2,1,3,4] 65 | print("avg_tau(%s,%s,at=4) %s" % (l1, l1, avg_tau(l1,l1))) 66 | print("avg_tau(%s,%s,at=4) %s" % (l1, l2, avg_tau(l1,l2))) 67 | print("avg_tau(%s,%s,at=4) %s" % (l2, l3, avg_tau(l1,l3))) 68 | print("tau(%s,%s,at=4) %s" % (l1, l2, tau(l1,l2))) 69 | print("tau(%s,%s,at=4) %s" % (l2, l3, tau(l1,l3))) 70 | 71 | -------------------------------------------------------------------------------- /ltr/helpers/timed_block.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | import sys 3 | 4 | @contextmanager 5 | def timed_block(name=None, f=sys.stdout): 6 | from time import perf_counter 7 | start = perf_counter() 8 | yield start 9 | stop = perf_counter() 10 | if name: 11 | f.write(name + " - ") 12 | f.write("Took %s\n" % (stop-start)) 13 | -------------------------------------------------------------------------------- /ltr/index.py: -------------------------------------------------------------------------------- 1 | def reindex(client, index, doc_src, indexing_workers=3, indexing_batch_size=500): 2 | """ Reload a configuration on 
disk for each search engine 3 | (Solr a configset, Elasticsearch a json file) 4 | and reindex 5 | 6 | """ 7 | from ltr.helpers.timed_block import timed_block 8 | 9 | print("Reindexing...") 10 | 11 | with timed_block(name='Indexing'): 12 | client.index_documents(index, 13 | doc_src=doc_src, 14 | batch_size=indexing_batch_size, 15 | workers=indexing_workers) 16 | 17 | print('Done') 18 | -------------------------------------------------------------------------------- /ltr/injectTypos.py: -------------------------------------------------------------------------------- 1 | try: 2 | from judgments import Judgment, judgments_from_file, judgments_to_file, judgments_by_qid 3 | from butterfingers import butterfingers 4 | except ImportError: 5 | from .judgments import Judgment, judgments_from_file, judgments_to_file, judgments_by_qid 6 | from .butterfingers import butterfingers 7 | 8 | 9 | 10 | def typoIt(judgmentInFile, judgmentOutFile, rounds=100): 11 | with open(judgmentInFile) as f: 12 | currJudgments = [judg for judg in judgments_from_file(f)] 13 | lastQid = currJudgments[-1].qid 14 | judgDict = judgments_by_qid(currJudgments) 15 | 16 | existingTypos = set() 17 | 18 | for i in range(0, rounds): 19 | 20 | for qid, judglist in judgDict.items(): 21 | keywords = judglist[0].keywords 22 | keywordsWTypo = butterfingers(keywords) 23 | 24 | if keywordsWTypo != keywords and keywordsWTypo not in existingTypos: 25 | newQid = lastQid+1 26 | print("%s => %s" % (keywords, keywordsWTypo)) 27 | lastQid += 1 28 | for judg in judglist: 29 | typoJudg = Judgment(grade=judg.grade, 30 | qid=newQid, 31 | keywords=keywordsWTypo, 32 | doc_id=judg.doc_id) 33 | currJudgments.append(typoJudg) 34 | existingTypos.add(keywordsWTypo) 35 | 36 | with open(judgmentOutFile, 'w') as f: 37 | judgments_to_file(f, judgmentsList=currJudgments) 38 | 39 | 40 | if __name__ == "__main__": 41 | typoIt(judgmentInFile='title_judgments.txt', judgmentOutFile='title_fuzzy_judgments.txt') 42 | 43 | 44 | # Clone a judgment, inject random typos 45 | -------------------------------------------------------------------------------- /ltr/log.py: -------------------------------------------------------------------------------- 1 | import re 2 | from aips import get_ltr_engine, get_engine 3 | 4 | class FeatureLogger: 5 | """ Logs LTR Features, one query at a time 6 | 7 | ...Building up a training set... 
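Typical usage (an illustrative sketch; the index and feature-set names here are invented):
    logger = FeatureLogger(index="products", feature_set="product_features")
    training_set, discarded = logger.log_for_qid(judgments_for_one_query)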
8 | """ 9 | 10 | def __init__(self, index, feature_set, drop_missing=True, id_field='id'): 11 | self.index=index 12 | self.feature_set=feature_set 13 | self.drop_missing=drop_missing 14 | self.id_field=id_field 15 | self.logged=[] 16 | 17 | def clear(self): 18 | self.logged=[] 19 | 20 | def log_for_qid(self, judgments, qid=None, keywords=None, log=False): 21 | """ Log a set of judgments associated with a single qid 22 | judgments will be modified, a training set also returned, discarding 23 | any judgments we could not log features for (because the doc was missing) 24 | """ 25 | if qid is None: 26 | qid=judgments[0].qid 27 | 28 | judgments = [j for j in judgments] 29 | doc_ids = [judgment.doc_id for judgment in judgments] 30 | unique_ids = list(set(doc_ids)) 31 | if len(doc_ids) != len(unique_ids): 32 | duplicated = set([id for id in doc_ids if doc_ids.count(id) > 1]) 33 | print(f"Duplicate docs in for query id {qid}: {duplicated}") 34 | doc_ids = unique_ids 35 | 36 | if keywords is None: 37 | keywords = judgments[len(judgments) - 1].keywords 38 | # For every batch of N docs to generate judgments for 39 | BATCH_SIZE = 500 40 | numLeft = len(doc_ids) 41 | document_features = {} 42 | for i in range(0, 1 + (len(doc_ids) // BATCH_SIZE)): 43 | 44 | numFetch = min(BATCH_SIZE, numLeft) 45 | start = i*BATCH_SIZE 46 | if start >= len(doc_ids): 47 | break 48 | ids = doc_ids[start:start+numFetch] 49 | 50 | # Sanitize (Solr has a strict syntax that can easily be tripped up) 51 | # This removes anything but alphanumeric and spaces 52 | fixed_keywords = re.sub('([^\s\w]|_)+', '', keywords) 53 | 54 | params = { 55 | "keywords": fixed_keywords, 56 | "fuzzy_keywords": ' '.join([x + '~' for x in fixed_keywords.split(' ')]), 57 | "squeezed_keywords": ''.join(fixed_keywords.split(' ')) 58 | } 59 | 60 | ids = [str(doc_id) for doc_id in ids] 61 | res = get_ltr_engine(self.index).get_logged_features(self.feature_set, ids, 62 | params, id_field=self.id_field, log=log) 63 | 64 | 65 | # Add feature back to each judgment 66 | for doc in res: 67 | doc_id = str(doc[self.id_field]) 68 | features = doc['[features]'] 69 | document_features[doc_id] = list(features.values()) 70 | numLeft -= BATCH_SIZE 71 | 72 | # Append features from search engine back to ranklib judgment list 73 | for judgment in judgments: 74 | if judgment.qid != qid: 75 | raise RuntimeError(f"Judgment qid {judgment.qid} inconsistent with logged qid {qid}") 76 | if judgment.keywords != keywords: 77 | raise RuntimeError(f"Judgment keywords {judgment.keywords} inconsistent with logged keywords {keywords}") 78 | if judgment.doc_id not in document_features: 79 | print(f"Missing doc {judgment.doc_id} with error") 80 | continue 81 | judgment.features = document_features[judgment.doc_id] 82 | 83 | # Return a paired down judgments if we are missing features for judgments 84 | training_set = [] 85 | discarded = [] 86 | for judgment in judgments: 87 | if self.drop_missing: 88 | if judgment.has_features(): 89 | training_set.append(judgment) 90 | else: 91 | discarded.append(judgment) 92 | else: 93 | training_set.append(judgment) 94 | # print("Discarded %s Keep %s" % (len(discarded), len(training_set))) 95 | self.logged.extend(training_set) 96 | return training_set, discarded 97 | -------------------------------------------------------------------------------- /ltr/plots.py: -------------------------------------------------------------------------------- 1 | # Crocodile Dundee and Rocky have nice Linear Shapes 2 | 3 | import numpy 4 | import pylab as pl 5 | import 
matplotlib as mpl 6 | from ltr.judgments import judgments_to_nparray 7 | 8 | norm = mpl.colors.Normalize(0,1.0) 9 | 10 | def plot_judgments(qids, xlabel, ylabel, judg_list, focus=None, 11 | title_prepend="Features for:"): 12 | if focus is None: 13 | focus=qids 14 | 15 | features, predictors, _ = judgments_to_nparray(judg_list) 16 | 17 | from random import shuffle 18 | from itertools import product 19 | r = list(range(0,5,1)); shuffle(r) 20 | g = list(range(0,5,1)); shuffle(g) 21 | b = list(range(0,5,1)); shuffle(b) 22 | 23 | out_of_focus_alpha=0.1 24 | in_focus_alpha=0.9 25 | 26 | if len(qids) > 3: 27 | # Make a random set of colors per query 28 | colors = [[r*0.1,g*0.1,b*0.1,out_of_focus_alpha] for r,g,b in product(r,g,b)] 29 | shuffle(colors) 30 | else: 31 | colors = ["lightgreen", "maroon"] 32 | 33 | qid_col=predictors[:,1] 34 | qid_idxs=numpy.array([]) 35 | kws = [] 36 | markers=('.', 'P') # Negative / Positive relevance markers... 37 | legend_paths=[] 38 | legend_labels=[] 39 | for idx, qid in enumerate(qids): 40 | qid_idxs=numpy.argwhere(qid_col==qid).ravel().astype(int) 41 | judgment=judg_list[qid_idxs[-1].item()] 42 | kws.append(judgment.keywords) 43 | x_qidA = features[qid_idxs] 44 | x_qidA 45 | y_qidA = predictors[qid_idxs, 0] 46 | color = colors[idx] 47 | for grade in [1,0]: 48 | this_grade=numpy.argwhere(y_qidA==grade) 49 | path = pl.scatter(x_qidA[this_grade,0], 50 | x_qidA[this_grade,1], 51 | marker=markers[grade], 52 | linewidth=1, 53 | s=80.0, 54 | facecolors=color, 55 | edgecolors=color, 56 | norm=norm) 57 | legend_paths.append(path) 58 | if grade == 0: 59 | legend_labels.append(judgment.keywords + " irrelevant movie") 60 | else: 61 | legend_labels.append(judgment.keywords + " relevant movie") 62 | 63 | 64 | 65 | pl.title(title_prepend + " {:.25}".format(", ".join(kws))) 66 | pl.xlabel(xlabel=xlabel) 67 | pl.ylabel(ylabel=ylabel) 68 | pl.legend(legend_paths, legend_labels, loc='lower center', 69 | bbox_to_anchor=[0.5,-0.5]) 70 | pl.savefig('fig.png', dpi=300, bbox_inches='tight') 71 | 72 | #plot_all(predictors) 73 | 74 | def plot_pairwise_data(features, predictors, title, 75 | graph_features=[0,1], 76 | xlabel="Delta Title BM25", 77 | ylabel="Delta Overview BM25"): 78 | legend_paths=[] 79 | for pred in [-1,1]: 80 | if pred == -1: 81 | marker = '.' 
82 | elif pred == 1: 83 | marker = '+' 84 | path = pl.scatter(features[predictors==pred, graph_features[0]], 85 | features[predictors==pred, graph_features[1]], 86 | marker=marker) 87 | legend_paths.append(path) 88 | 89 | 90 | pl.title(title) 91 | pl.xlabel(xlabel=xlabel) 92 | pl.ylabel(ylabel=ylabel) 93 | pl.legend(legend_paths, ["Irrelevant minus Relevant", "Relevant minus Irrelevant"], loc='lower center', 94 | bbox_to_anchor=[0.5,-0.5]) 95 | pl.savefig('all_relevances.png', bbox_inches='tight', dpi=600) 96 | -------------------------------------------------------------------------------- /ltr/sdbn_functions.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | import glob 3 | 4 | def all_sessions(): 5 | sessions = pandas.concat([pandas.read_csv(f, compression="gzip") 6 | for f in glob.glob("data/*_sessions.gz")]) 7 | sessions = sessions.sort_values(['query', 'sess_id', 'rank']) 8 | sessions = sessions.rename(columns={"clicked_doc_id": "doc_id"}) 9 | return sessions 10 | 11 | def get_sessions(query="", index=True): 12 | sessions = all_sessions() 13 | sessions = sessions[sessions["query"] == query] 14 | return sessions if not index else sessions.set_index("sess_id") 15 | 16 | def calculate_ctr(sessions): 17 | click_counts = sessions.groupby("doc_id")["clicked"].sum() 18 | sess_counts = sessions.groupby("doc_id")["sess_id"].nunique() 19 | ctrs = click_counts / sess_counts 20 | return ctrs.sort_values(ascending=False) 21 | 22 | def calculate_average_rank(sessions): 23 | avg_rank = sessions.groupby("doc_id")["rank"].mean() 24 | return avg_rank.sort_values(ascending=True) 25 | 26 | def caclulate_examine_probability(sessions): 27 | last_click_per_session = sessions.groupby(["clicked", "sess_id"])["rank"].max()[True] 28 | sessions["last_click_rank"] = last_click_per_session 29 | sessions["examined"] = sessions["rank"] <= sessions["last_click_rank"] 30 | return sessions 31 | 32 | def calculate_clicked_examined(sessions): 33 | sessions = caclulate_examine_probability(sessions) 34 | return sessions[sessions["examined"]] \ 35 | .groupby("doc_id")[["clicked", "examined"]].sum() 36 | 37 | def calculate_grade(sessions): 38 | sessions = calculate_clicked_examined(sessions) 39 | sessions["grade"] = sessions["clicked"] / sessions["examined"] 40 | return sessions.sort_values("grade", ascending=False) 41 | 42 | def calculate_prior(sessions, prior_grade, prior_weight): 43 | sessions = calculate_grade(sessions) 44 | sessions["prior_a"] = prior_grade * prior_weight 45 | sessions["prior_b"] = (1 - prior_grade) * prior_weight 46 | return sessions 47 | 48 | def calculate_sdbn(sessions, prior_grade=0.3, prior_weight=100): 49 | sessions = calculate_prior(sessions, prior_grade, prior_weight) 50 | sessions["posterior_a"] = (sessions["prior_a"] + 51 | sessions["clicked"]) 52 | sessions["posterior_b"] = (sessions["prior_b"] + 53 | sessions["examined"] - sessions["clicked"]) 54 | sessions["beta_grade"] = (sessions["posterior_a"] / 55 | (sessions["posterior_a"] + sessions["posterior_b"])) 56 | return sessions.sort_values("beta_grade", ascending=False) -------------------------------------------------------------------------------- /ltr/search.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | baseEsQuery = { 4 | "size": 5, 5 | "query": { 6 | "sltr": { 7 | "params": { 8 | "keywords": "", 9 | }, 10 | "model": "" 11 | } 12 | } 13 | } 14 | 15 | def esLtrQuery(keywords, modelName): 16 | import json 17 | 
baseEsQuery['query']['sltr']['params']['keywords'] = keywords 18 | baseEsQuery['query']['sltr']['model'] = modelName 19 | print("%s" % json.dumps(baseEsQuery)) 20 | return baseEsQuery 21 | 22 | # TODO: Parse params and add efi dynamically instead of adding manually to query below 23 | def solrLtrQuery(keywords, modelName): 24 | keywords = re.sub('([^\s\w]|_)+', '', keywords) 25 | fuzzy_keywords = ' '.join([x + '~' for x in keywords.split(' ')]) 26 | 27 | return { 28 | 'fl': '*,score', 29 | 'rows': 5, 30 | 'q': '{{!ltr reRankDocs=30000 model={} efi.keywords="{}" efi.fuzzy_keywords="{}"}}'.format(modelName, keywords, fuzzy_keywords) 31 | } 32 | 33 | 34 | tmdbFields = { 35 | 'title': 'title', 36 | 'display_fields': ['release_year', 'genres', 'overview'] 37 | } 38 | 39 | 40 | 41 | def search(client, keywords, modelName, index='tmdb', fields=tmdbFields): 42 | if client.name() == 'elastic': 43 | results = client.query(index, esLtrQuery(keywords, modelName)) 44 | else: 45 | results = client.query(index, solrLtrQuery(keywords, modelName)) 46 | 47 | ti = fields['title'] 48 | 49 | for result in results: 50 | print("%s " % (result[ti] if ti in result else 'N/A')) 51 | print("%s " % (result['_score'])) 52 | 53 | for df in fields['display_fields']: 54 | print("%s " % (result[df] if df in result else 'N/A')) 55 | 56 | print("---------------------------------------") 57 | -------------------------------------------------------------------------------- /ltr/years_as_ratings.py: -------------------------------------------------------------------------------- 1 | def get_classic_rating(year): 2 | if year > 2010: 3 | return 0 4 | elif year > 1990: 5 | return 1 6 | elif year > 1970: 7 | return 2 8 | elif year > 1950: 9 | return 3 10 | else: 11 | return 4 12 | 13 | def get_latest_rating(year): 14 | if year > 2010: 15 | return 4 16 | elif year > 1990: 17 | return 3 18 | elif year > 1970: 19 | return 2 20 | elif year > 1950: 21 | return 1 22 | else: 23 | return 0 24 | 25 | def synthesize(client, featureSet='release', latestTrainingSetOut='data/latest-training.txt', classicTrainingSetOut='data/classic-training.txt'): 26 | from ltr.judgments import judgments_to_file, Judgment 27 | print('Generating ratings for classic and latest model') 28 | NO_ZERO = False 29 | 30 | resp = client.log_query('tmdb', 'release', None) 31 | 32 | docs = [] 33 | for hit in resp: 34 | feature = list(hit['[features]'].values())[0] 35 | docs.append([feature]) # Treat features as ordered lists 36 | 37 | # Classic film fan 38 | judgments = [] 39 | for fv in docs: 40 | rating = get_classic_rating(fv[0]) 41 | 42 | if rating == 0 and NO_ZERO: 43 | continue 44 | 45 | judgments.append(Judgment(qid=1,doc_id=rating,grade=rating,features=fv,keywords='')) 46 | 47 | with open(classicTrainingSetOut, 'w') as out: 48 | judgments_to_file(out, judgments) 49 | 50 | judgments = [] 51 | for fv in docs: 52 | rating = get_latest_rating(fv[0]) 53 | 54 | if rating == 0 and NO_ZERO: 55 | continue 56 | 57 | judgments.append(Judgment(qid=1,doc_id=rating,grade=rating,features=fv,keywords='')) 58 | 59 | 60 | with open(latestTrainingSetOut, 'w') as out: 61 | judgments_to_file(out, judgments) 62 | 63 | print('Done') -------------------------------------------------------------------------------- /semantic_search/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('../..') 3 | from aips import get_entity_extractor, get_sparse_semantic_search 4 | from .query_tree import enrich, to_queries 5 | 6 | 
def generate_tagged_query(extracted_entities): 7 | query = extracted_entities["query"] 8 | last_end = 0 9 | tagged_query = "" 10 | for tag in extracted_entities["tags"]: 11 | next_text = query[last_end:tag["startOffset"]].strip() 12 | if len(next_text) > 0: 13 | tagged_query += " " + next_text 14 | tagged_query += " {" + tag["matchText"] + "}" 15 | last_end = tag["endOffset"] 16 | if last_end < len(query): 17 | final_text = query[last_end:len(query)].strip() 18 | if len(final_text): 19 | tagged_query += " " + final_text 20 | return tagged_query 21 | 22 | def generate_query_tree(extracted_entities): 23 | query = extracted_entities["query"] 24 | entities = {entity["id"]: entity for entity 25 | in extracted_entities["entities"]} 26 | query_tree = [] 27 | last_end = 0 28 | 29 | for tag in extracted_entities["tags"]: 30 | best_entity = entities[tag["ids"][0]] 31 | for entity_id in tag["ids"]: 32 | if (entities[entity_id]["popularity"] > 33 | best_entity["popularity"]): 34 | best_entity = entities[entity_id] 35 | 36 | next_text = query[last_end:tag["startOffset"]].strip() 37 | if next_text: 38 | query_tree.append({"type": "keyword", 39 | "surface_form": next_text, 40 | "canonical_form": next_text}) 41 | query_tree.append(best_entity) 42 | last_end = tag["endOffset"] 43 | 44 | if last_end < len(query): 45 | final_text = query[last_end:len(query)].strip() 46 | if final_text: 47 | query_tree.append({"type": "keyword", 48 | "surface_form": final_text, 49 | "canonical_form": final_text}) 50 | return query_tree 51 | 52 | def process_semantic_query(collection, entities_collection, query): 53 | extractor = get_entity_extractor(entities_collection) 54 | semantic_functions = get_sparse_semantic_search() 55 | entities = extractor.extract_entities(query) 56 | tagged_query = generate_tagged_query(entities) 57 | query_tree = generate_query_tree(entities) 58 | enriched_query = " ".join([str(q) for q in query_tree]) 59 | enriched_query_tree = enrich(collection, query_tree) 60 | transformed = semantic_functions.transform_query(enriched_query_tree) 61 | 62 | return { 63 | "tagged_query": tagged_query, 64 | "parsed_query": enriched_query, 65 | "transformed_query": to_queries(transformed)[0], 66 | "tagger_data": entities 67 | } 68 | 69 | def process_basic_query(query): 70 | semantic_functions = get_sparse_semantic_search() 71 | return {"transformed_query": semantic_functions.generate_basic_query(query)} -------------------------------------------------------------------------------- /semantic_search/query_tree.py: -------------------------------------------------------------------------------- 1 | from aips import get_semantic_knowledge_graph, get_sparse_semantic_search 2 | 3 | semantic_functions = get_sparse_semantic_search() 4 | 5 | def create_geo_filter(coordinates, field, distance_in_KM): 6 | return semantic_functions.create_geo_filter(coordinates, field, distance_in_KM) 7 | 8 | def popularity(query, position): 9 | return semantic_functions.popularity(query, position) 10 | 11 | def location_distance(query, position): 12 | return semantic_functions.location_distance(query, position) 13 | 14 | def to_queries(query_tree): 15 | return [node["query"] for node in query_tree] 16 | 17 | def process_semantic_functions(query_tree): 18 | position = 0 19 | while position < len(query_tree): 20 | node = query_tree[position] 21 | if node["type"] == "semantic_function": 22 | query = {"query_tree": query_tree} 23 | command_successful = eval(node["semantic_function"]) 24 | if not command_successful: 25 | node["type"] = 
"invalid_semantic_function" 26 | position += 1 27 | return query_tree 28 | 29 | def get_enrichments(collection, keyword, limit=4): 30 | enrichments = {} 31 | nodes_to_traverse = [{"field": "content", 32 | "values": [keyword], 33 | "default_operator": "OR"}, 34 | [{"name": "related_terms", 35 | "field": "content", 36 | "limit": limit}, 37 | {"name": "doc_type", 38 | "field": "doc_type", 39 | "limit": 1}]] 40 | skg = get_semantic_knowledge_graph(collection) 41 | traversals = skg.traverse(*nodes_to_traverse) 42 | if "traversals" not in traversals["graph"][0]["values"][keyword]: 43 | return enrichments 44 | 45 | nested_traversals = traversals["graph"][0]["values"][keyword]["traversals"] 46 | 47 | doc_types = list(filter(lambda t: t["name"] == "doc_type", 48 | nested_traversals)) 49 | if doc_types: 50 | enrichments["category"] = next(iter(doc_types[0]["values"])) 51 | 52 | related_terms = list(filter(lambda t: t["name"] == "related_terms", 53 | nested_traversals)) 54 | if related_terms: 55 | term_vector = "" 56 | for term, data in related_terms[0]["values"].items(): 57 | term_vector += f'{term}^{round(data["relatedness"], 4)} ' 58 | enrichments["term_vector"] = term_vector.strip() 59 | 60 | return enrichments 61 | 62 | def enrich(collection, query_tree): 63 | query_tree = process_semantic_functions(query_tree) 64 | for item in query_tree: 65 | if item["type"] == "keyword": 66 | enrichments = get_enrichments(collection, item["surface_form"]) 67 | if enrichments: 68 | item["type"] = "skg_enriched" 69 | item["enrichments"] = enrichments 70 | return query_tree -------------------------------------------------------------------------------- /webserver/.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Python: Current File", 9 | "type": "python", 10 | "request": "launch", 11 | "program": "${file}", 12 | "console": "integratedTerminal" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /webserver/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/usr/local/anaconda3/bin/python" 3 | } -------------------------------------------------------------------------------- /webserver/display/render_search_results.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('../..') 3 | from aips import * 4 | import os, re 5 | 6 | def render_search_results(results, keywords_to_highlight): 7 | file_path = os.path.dirname(os.path.abspath(__file__)) 8 | search_results_template_file = os.path.join(file_path, "search-results-template.html") 9 | with open(search_results_template_file) as file: 10 | file_content = file.read() 11 | 12 | template_syntax = "(.*)" 13 | header_template = re.sub(template_syntax, "", file_content, flags=re.S) 14 | 15 | results_template_syntax = "(.*)" 16 | x = re.search(results_template_syntax, file_content, flags=re.S) 17 | results_template = x.group(1) 18 | 19 | separator_template_syntax = "(.*)" 20 | x = re.search(separator_template_syntax, file_content, flags=re.S) 21 | separator_template = x.group(1) 22 | 23 | rendered = "" 24 | for result in results["docs"]: 25 | #todo: add highlighting 26 | coordinates = result["location_coordinates"].split(",") 27 | rendered += results_template.replace("${NAME}", result.get("business_name", "UNKNOWN")) \ 28 | .replace("${CITY}", result.get("city", "Anywhere") + ", " + result.get("state", "USA"))\ 29 | .replace("${IMAGE_URL}", "/map?lat=" + coordinates[0] + "&lon=" + coordinates[1]) \ 30 | .replace("${STARS}", "★" * int(result.get("stars_rating", 0))) 31 | rendered += separator_template 32 | 33 | if rendered == "": 34 | rendered = "No Results for this query." 35 | 36 | return rendered -------------------------------------------------------------------------------- /webserver/display/search-results-template.html: -------------------------------------------------------------------------------- 1 |
[search-results-template.html markup lost in extraction: only bare line numbers (1-22) and one text fragment survive. The template supplies the header, results, and separator sections consumed by render_search_results.py above, with ${NAME}, ${CITY}, ${IMAGE_URL}, and ${STARS} placeholders. Recoverable fragment:]
Name: ${NAME} | City: ${CITY} | Rating: ${STARS}
-------------------------------------------------------------------------------- /webserver/is-running.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/webserver/is-running.png -------------------------------------------------------------------------------- /webserver/start-webserver.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append('..') 4 | import http.server 5 | import io 6 | import json 7 | import threading 8 | import webbrowser 9 | 10 | import sys 11 | sys.path.append('..') 12 | import urllib.parse 13 | import json 14 | import requests 15 | 16 | from urllib.parse import parse_qs, urlparse 17 | 18 | from aips import get_engine, get_entity_extractor, get_semantic_knowledge_graph, get_sparse_semantic_search 19 | from aips.environment import AIPS_WEBSERVER_HOST, AIPS_WEBSERVER_PORT, WEBSERVER_URL 20 | from staticmap import CircleMarker, StaticMap 21 | 22 | from webserver.display.render_search_results import render_search_results 23 | from semantic_search import process_semantic_query, process_basic_query 24 | 25 | engine = get_engine() 26 | reviews_collection = engine.get_collection("reviews") 27 | entities_collection = engine.get_collection("entities") 28 | entity_extractor = get_entity_extractor(entities_collection) 29 | query_transformer = get_sparse_semantic_search() 30 | 31 | def keyword_search(text): 32 | request = {"query": text, 33 | "query_fields": ["content"]} 34 | return reviews_collection.search(**request) 35 | 36 | class SemanticSearchHandler(http.server.SimpleHTTPRequestHandler): 37 | """Semantic Search Handler (AI-Powered Search)""" 38 | 39 | def sendResponse(self, response): 40 | try: 41 | self.send_response(200) 42 | self.end_headers() 43 | self.wfile.write(bytes(json.dumps(response), 'utf-8')) 44 | except Exception as ex: 45 | self.send_error(500, ex) 46 | 47 | 48 | def sendImageResponse(self, response): 49 | try: 50 | self.send_response(200) 51 | self.end_headers() 52 | self.wfile.write(bytes(response)) 53 | except Exception as ex: 54 | self.send_error(500, ex) 55 | 56 | def do_POST(self): 57 | content_len = int(self.headers.get("Content-Length"), 0) 58 | post_body = self.rfile.read(content_len).decode('UTF-8') 59 | 60 | if (self.path.startswith("/tag_query")): 61 | self.sendResponse(entity_extractor.extract_entities(post_body)) 62 | elif self.path.startswith("/tag_places"): 63 | request = {"query": post_body, 64 | "query_fields": ["city", "state", "location_coordinates"]} 65 | response = reviews_collection.search(**request) 66 | self.sendResponse(response) 67 | elif self.path.startswith("/process_semantic_query"): 68 | self.sendResponse(process_semantic_query(reviews_collection, 69 | entities_collection, 70 | post_body)) 71 | elif self.path.startswith("/process_basic_query"): 72 | self.sendResponse(process_basic_query(post_body)) 73 | elif self.path.startswith("/run_search"): 74 | results = keyword_search(post_body) 75 | highlight_terms = post_body.split(' ') 76 | rendered_results = render_search_results(results, highlight_terms) 77 | self.sendResponse(rendered_results) 78 | 79 | def do_GET(self): 80 | if self.path.startswith("/search") or self.path.startswith("/semantic-search"): 81 | self.path = "display/search.html" 82 | http.server.SimpleHTTPRequestHandler.do_GET(self) 83 | http.server.SimpleHTTPRequestHandler.do_GET(self) 84 | elif 
self.path.startswith("/map"): 85 | qsVars = parse_qs(urlparse(self.path).query) 86 | if 'lat' in qsVars and 'lon' in qsVars: 87 | lat = float(qsVars["lat"][0]) 88 | lon = float(qsVars["lon"][0]) 89 | zoom = int(qsVars['zoom'][0]) if 'zoom' in qsVars else 10 90 | m = StaticMap(200, 200) 91 | marker_outline = CircleMarker((lon, lat), 'white', 18) 92 | marker = CircleMarker((lon, lat), '#0036FF', 12) 93 | m.add_marker(marker_outline) 94 | m.add_marker(marker) 95 | 96 | image = m.render(zoom=zoom) 97 | buf = io.BytesIO() 98 | image.save(buf, format='JPEG') 99 | self.sendImageResponse(buf.getvalue()) 100 | elif self.path.startswith("/healthcheck"): 101 | self.send_response(200) 102 | self.send_header('Access-Control-Allow-Private-Network', 'true') 103 | self.send_header('Access-Control-Allow-Origin','*') 104 | self.send_header('Content-type','image/png') 105 | self.end_headers() 106 | #Open the static file requested and send it 107 | image = open("is-running.png", 'br') 108 | self.wfile.write(image.read()) 109 | image.close() 110 | 111 | def open_browser(): 112 | """Start a browser after waiting for half a second.""" 113 | FILE = "semantic-search" 114 | def _open_browser(): 115 | if AIPS_WEBSERVER_HOST == "localhost": 116 | webbrowser.open(WEBSERVER_URL + '/%s' % FILE) 117 | thread = threading.Timer(0.5, _open_browser) 118 | thread.start() 119 | 120 | def start_server(): 121 | """Start the server.""" 122 | server_address = ("0.0.0.0", int(AIPS_WEBSERVER_PORT)) 123 | server = http.server.HTTPServer(server_address, SemanticSearchHandler) 124 | server.serve_forever() 125 | 126 | if __name__ == "__main__": 127 | open_browser() 128 | start_server() --------------------------------------------------------------------------------
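A minimal usage sketch (not a file from the repository) showing how the click-model utilities above fit together: sessions are built from (query, ((doc_id, clicked), ...)) tuples with ltr.clickmodels.session.build and passed to the UBM trainer in ltr/clickmodels/ubm.py. The query string and document ids below are invented for illustration.

from ltr.clickmodels.session import build
from ltr.clickmodels.ubm import user_browse_model

# Two synthetic sessions for the same query: doc 101 is clicked at rank 0 in the
# first session, doc 102 at rank 1 in the second, doc 103 is never clicked.
sessions = build([
    ('laptop', ((101, True), (102, False), (103, False))),
    ('laptop', ((101, False), (102, True), (103, False))),
])

model = user_browse_model(sessions, rounds=20)

# Learned attractiveness per (query, doc_id) and examination probability per
# (rank of previous click, rank) pair.
print(model.attracts[('laptop', 101)])
print(dict(model.ranks))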