├── .gitattributes
├── .gitignore
├── README.md
├── aips
├── __init__.py
├── data_loaders
│ ├── cities.py
│ ├── index_time_boosts.py
│ ├── movies.py
│ ├── outdoors.py
│ ├── products.py
│ └── reviews.py
├── environment.py
├── indexers
│ └── product.py
├── search_requests.py
└── spark
│ ├── __init__.py
│ └── dataframe.py
├── build
├── Dockerfile
├── ch5_spacy_requirements.txt
├── ipython_kernel_config.py
├── log4j.properties
└── requirements.txt
├── chapters
├── ch03
│ ├── 1.vectors-and-text-similarity.ipynb
│ └── 2.controlling-relevance.ipynb
├── ch04
│ ├── 1.setting-up-the-retrotech-dataset.ipynb
│ └── 2.signals-boosting.ipynb
├── ch05
│ ├── 1.open-information-extraction.ipynb
│ ├── 2.index-datasets.ipynb
│ ├── 3.semantic-knowledge-graph.ipynb
│ └── licenses
│ │ └── hearst.NOTICE.txt
├── ch06
│ ├── 1.skg-classification-disambiguation.ipynb
│ ├── 2.related-keywords-from-signals.ipynb
│ ├── 3.spell-correction.ipynb
│ ├── bonus.phrase-detection.ipynb
│ └── bonus.related-terms-from-documents.ipynb
├── ch07
│ ├── 1.index-datasets.ipynb
│ └── 2.semantic-search.ipynb
├── ch08
│ └── 1.signals-boosting.ipynb
├── ch09
│ ├── 1.personalization.ipynb
│ └── 2.embedding-based-personalization.ipynb
├── ch10
│ ├── 1.setup-the-movie-db.ipynb
│ ├── 2.judgments-and-logging.ipynb
│ ├── 3.pairwise-transform.ipynb
│ └── 4.train-and-evaluate-the-model.ipynb
├── ch11
│ ├── 0.setup.ipynb
│ ├── 1.click-through-rate-judgments.ipynb
│ ├── 2.sdbn-judgments-to-overcome-position-bias.ipynb
│ ├── 3.SDBN-Confidence-Bias.ipynb
│ ├── 4.train-upload-search-ltr.ipynb
│ ├── a.defunct.synthesize-search-sessions.ipynb
│ ├── a.synthesize-search-sessions.ipynb
│ └── session_gen.py
├── ch12
│ ├── 0.setup.ipynb
│ └── 1.ab-testing-to-active-learning.ipynb
├── ch13
│ ├── 1.setting-up-the-outdoors-dataset.ipynb
│ ├── 2.introduction-to-transformers.ipynb
│ ├── 3.natural-language-autocomplete.ipynb
│ ├── 4.semantic-search.ipynb
│ ├── 5.quantization.ipynb
│ ├── bert-vocab.txt
│ ├── ch13-tokenizer-analysis.ipynb
│ ├── outdoors_golden_answers.xlsx
│ └── pull_aips_dependency.py
├── ch14
│ ├── 1.question-answering-visualizer.ipynb
│ ├── 2.question-answering-data-preparation.ipynb
│ ├── 3.question-answering-fine-tuning.ipynb
│ └── 4.question-answering-demo-application.ipynb
├── ch15
│ ├── 1.llm-exploration.ipynb
│ ├── 2.multimodal-and-hybrid-search.ipynb
│ ├── a.generate-movie-embeddings.ipynb
│ ├── delorean-query.jpg
│ └── mockedGenerativeResponses.csv
└── welcome.ipynb
├── data
└── retrotech
│ ├── images
│ ├── 021331131393.jpg
│ ├── 027242755871.jpg
│ ├── 027242831599.jpg
│ ├── 037988909926.jpg
│ ├── 037988910045.jpg
│ ├── 037988910182.jpg
│ ├── 037988910250.jpg
│ ├── 037988910427.jpg
│ ├── 048231316835.jpg
│ ├── 05024545249224.jpg
│ ├── 072244106916.jpg
│ ├── 12505382925.jpg
│ ├── 12505451713.jpg
│ ├── 12505525766.jpg
│ ├── 12505527456.jpg
│ ├── 12505559105.jpg
│ ├── 14381196320.jpg
│ ├── 21331131393.jpg
│ ├── 23272335397.jpg
│ ├── 24543701538.jpg
│ ├── 25192107191.jpg
│ ├── 27108936499.jpg
│ ├── 27242752436.jpg
│ ├── 27242755871.jpg
│ ├── 27242798236.jpg
│ ├── 27242831599.jpg
│ ├── 32429037763.jpg
│ ├── 36172950027.jpg
│ ├── 36725560390.jpg
│ ├── 36725560451.jpg
│ ├── 36725561977.jpg
│ ├── 36725569331.jpg
│ ├── 36725569454.jpg
│ ├── 36725578241.jpg
│ ├── 37988909926.jpg
│ ├── 37988910045.jpg
│ ├── 37988910182.jpg
│ ├── 37988910250.jpg
│ ├── 37988910427.jpg
│ ├── 400032015667.jpg
│ ├── 400037252074.jpg
│ ├── 400037252258.jpg
│ ├── 400037252432.jpg
│ ├── 400037252616.jpg
│ ├── 400037252890.jpg
│ ├── 400037253316.jpg
│ ├── 400192926087.jpg
│ ├── 45626176.jpg
│ ├── 47875841406.jpg
│ ├── 47875841420.jpg
│ ├── 47875842328.jpg
│ ├── 47875842335.jpg
│ ├── 48231011396.jpg
│ ├── 48231011402.jpg
│ ├── 48231316835.jpg
│ ├── 48231317436.jpg
│ ├── 48231317498.jpg
│ ├── 5024545249224.jpg
│ ├── 600603105265.jpg
│ ├── 600603138423.jpg
│ ├── 603497664429.jpg
│ ├── 610839379408.jpg
│ ├── 612572171585.jpg
│ ├── 635753490879.jpg
│ ├── 635753493559.jpg
│ ├── 635753493573.jpg
│ ├── 665331101927.jpg
│ ├── 694318011294.jpg
│ ├── 696055169191.jpg
│ ├── 708056579739.jpg
│ ├── 708056579746.jpg
│ ├── 708431390614.jpg
│ ├── 711719842309.jpg
│ ├── 716829772249.jpg
│ ├── 72244106916.jpg
│ ├── 722868830062.jpg
│ ├── 722868840177.jpg
│ ├── 74108007469.jpg
│ ├── 74108056764.jpg
│ ├── 74108096487.jpg
│ ├── 77283045400.jpg
│ ├── 783722274422.jpg
│ ├── 786936817218.jpg
│ ├── 793447512228.jpg
│ ├── 803238004525.jpg
│ ├── 821793013776.jpg
│ ├── 826663114164.jpg
│ ├── 826663126044.jpg
│ ├── 843163089211.jpg
│ ├── 843404073153.jpg
│ ├── 84691170679.jpg
│ ├── 84691211174.jpg
│ ├── 84691226703.jpg
│ ├── 84691226727.jpg
│ ├── 848447000005.jpg
│ ├── 848447000081.jpg
│ ├── 848447000135.jpg
│ ├── 856751002097.jpg
│ ├── 878816004532.jpg
│ ├── 883049066905.jpg
│ ├── 883929085118.jpg
│ ├── 883929106172.jpg
│ ├── 883929154012.jpg
│ ├── 884116069973.jpg
│ ├── 885038021209.jpg
│ ├── 885038024644.jpg
│ ├── 885038024651.jpg
│ ├── 885170045132.jpg
│ ├── 885370315080.jpg
│ ├── 885370325348.jpg
│ ├── 885909300549.jpg
│ ├── 885909393404.jpg
│ ├── 885909394845.jpg
│ ├── 885909395095.jpg
│ ├── 885909457588.jpg
│ ├── 885909457595.jpg
│ ├── 885909457601.jpg
│ ├── 885909457632.jpg
│ ├── 885909471812.jpg
│ ├── 885909472376.jpg
│ ├── 886111271283.jpg
│ ├── 886111287055.jpg
│ ├── 886971404722.jpg
│ ├── 886973561621.jpg
│ ├── 92636260712.jpg
│ ├── 93624956037.jpg
│ ├── 93624995012.jpg
│ ├── 97360716641.jpg
│ ├── 97360722345.jpg
│ ├── 97360724240.jpg
│ ├── 97360810042.jpg
│ ├── 97363532149.jpg
│ ├── 97363560449.jpg
│ ├── 97368920347.jpg
│ ├── playground_tues.ipynb
│ └── unavailable.jpg
│ └── templates
│ └── search-results.html
├── docker-compose.yml
├── engines
├── Collection.py
├── Engine.py
├── EntityExtractor.py
├── LTR.py
├── README.md
├── SemanticKnowledgeGraph.py
├── SparseSemanticSearch.py
├── opensearch
│ ├── OpenSearchCollection.py
│ ├── OpenSearchEngine.py
│ ├── OpenSearchLTR.py
│ ├── OpenSearchSparseSemanticSearch.py
│ ├── build
│ │ ├── engine-Dockerfile
│ │ ├── log4j2.properties
│ │ ├── ltr-2.14.0-os2.14.0.zip
│ │ ├── opensearch-docker-entrypoint.sh
│ │ ├── opensearch-onetime-setup.sh
│ │ └── performance-analyzer.properties
│ └── config.py
└── solr
│ ├── SolrCollection.py
│ ├── SolrEngine.py
│ ├── SolrEntityExtractor.py
│ ├── SolrLTR.py
│ ├── SolrSemanticKnowledgeGraph.py
│ ├── SolrSparseSemanticSearch.py
│ ├── __init__.py
│ ├── build
│ ├── Dockerfile
│ ├── log4j2-config.xml
│ ├── run_solr_w_ltr.sh
│ └── solr.xml
│ └── config.py
├── ltr
├── MART_model.py
├── __init__.py
├── clickmodels
│ ├── __init__.py
│ ├── cascade.py
│ ├── coec.py
│ ├── conversion.py
│ ├── pbm.py
│ ├── sdbn.py
│ ├── session.py
│ └── ubm.py
├── client
│ ├── __init__.py
│ ├── base_client.py
│ ├── solr_client.py
│ └── solr_parse.py
├── download.py
├── evaluate.py
├── helpers
│ ├── __init__.py
│ ├── butterfingers.py
│ ├── convert.py
│ ├── defaultlist.py
│ ├── esUrlParse.py
│ ├── handle_resp.py
│ ├── msmarco
│ │ ├── __init__.py
│ │ └── evaluate.py
│ ├── ranklib_result.py
│ ├── solr_escape.py
│ ├── tau.py
│ └── timed_block.py
├── index.py
├── injectTypos.py
├── judgments.py
├── log.py
├── plots.py
├── ranklib.py
├── sdbn_functions.py
├── search.py
└── years_as_ratings.py
├── semantic_search
├── __init__.py
└── query_tree.py
└── webserver
├── .vscode
├── launch.json
└── settings.json
├── display
├── render_search_results.py
├── search-results-template.html
└── search.html
├── is-running.png
├── managed-schema.xml
└── start-webserver.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | * text=auto
2 | *.sh text eol=lf
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | system.config
2 |
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | pip-wheel-metadata/
25 | share/python-wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .nox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | *.py,cover
52 | .hypothesis/
53 | .pytest_cache/
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 | db.sqlite3-journal
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 |
75 | # PyBuilder
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | .python-version
87 |
88 | # pipenv
89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
92 | # install all needed dependencies.
93 | #Pipfile.lock
94 |
95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
96 | __pypackages__/
97 |
98 | # Celery stuff
99 | celerybeat-schedule
100 | celerybeat.pid
101 |
102 | # SageMath parsed files
103 | *.sage.py
104 |
105 | # Environments
106 | .env
107 | .venv
108 | env/
109 | venv/
110 | ENV/
111 | env.bak/
112 | venv.bak/
113 |
114 | # Spyder project settings
115 | .spyderproject
116 | .spyproject
117 |
118 | # Rope project settings
119 | .ropeproject
120 |
121 | # mkdocs documentation
122 | /site
123 |
124 | # mypy
125 | .mypy_cache/
126 | .dmypy.json
127 | dmypy.json
128 |
129 | # Pyre type checker
130 | .pyre/
131 |
132 | #Mac
133 | *.DS_Store
134 |
135 | #no-commit folders
136 | *no-commit/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AI-Powered Search
2 |
3 | Code examples for the book [_AI-Powered Search_](https://aipoweredsearch.com) by [Trey Grainger](https://www.linkedin.com/in/treygrainger/), [Doug Turnbull](https://www.linkedin.com/in/softwaredoug/), and [Max Irwin](https://www.linkedin.com/in/maxirwin/). Published by [Manning Publications](https://www.manning.com).
4 |
5 |
6 |
7 |
10 |
11 | ## Book Overview
12 | [_AI-Powered Search_](https://aipoweredsearch.com) teaches you the latest machine learning techniques to build search engines that continuously learn from your users and your content to drive more domain-aware and intelligent search.
13 |
14 | Search engine technology is rapidly evolving, with Artificial Intelligence (AI) driving much of that innovation. Crowdsourced relevance and the integration of large language models (LLMs) like GPT and other foundation models are massively accelerating the capabilities and expectations of search technology.
15 |
16 | AI-Powered Search will teach you modern, data-science-driven search techniques like:
17 | - Semantic search using dense vector embeddings from foundation models
18 | - Retrieval Augmented Generation
19 | - Question answering and summarization combining search and LLMs
20 | - Fine-tuning transformer-based LLMs
21 | - Personalized search based on user signals and vector embeddings
22 | - Collecting user behavioral signals and building signals boosting models
23 | - Semantic knowledge graphs for domain-specific learning
24 | - Implementing machine-learned ranking models (learning to rank)
25 | - Building click models to automate machine-learned ranking
26 | - Generative search, hybrid search, and the search frontier
27 |
28 | Today’s search engines are expected to be smart, understanding the nuances of natural language queries, as well as each user’s preferences and context. This book empowers you to build search engines that take advantage of user interactions and the hidden semantic relationships in your content to automatically deliver better, more relevant search experiences.
29 |
30 | ## How to run
31 | For simplicity of setup, all code is shipped in Jupyter Notebooks and packaged in Docker containers. This means that installing Docker and then pulling (or building) and running the book's Docker containers is the only necessary setup. [Appendix A](https://livebook.manning.com/book/ai-powered-search/appendix-a?origin=code-base) of the book provides full step-by-step instructions for running the code examples, but you can run the following to get up and running quickly:
32 |
33 | If you haven't already pulled the source code locally, run:
34 | ```
35 | git clone https://github.com/treygrainger/ai-powered-search.git
36 | ```
37 |
38 | Then, to build and start the codebase with interactive Jupyter notebooks, run:
39 | ```
40 | cd ai-powered-search
41 | docker compose up
42 | ```
43 |
44 | That's all it takes! Once the containers are built and running (this may take a while, especially on the first build), visit:
45 | `http://localhost:8888` to launch the [Welcome](chapters/welcome.ipynb) notebook and see a Table of Contents for all the live code examples from throughout the book.
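To quickly verify that the notebooks can reach the search engine, you can run a minimal health check from any notebook cell using the `healthcheck()` helper defined in this repo's `aips` package (`aips/__init__.py`):

```python
from aips import healthcheck

healthcheck()  # prints "All Systems are ready. Happy Searching!" once the engine responds
```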
46 |
47 |
49 |
50 | ## Supported Technologies
51 | AI-Powered Search teaches many modern search techniques leveraging machine learning approaches. While we utilize specific technologies to demonstrate concepts, most techniques are applicable to many modern search engines and vector databases.
52 |
53 | Throughout the book, all code examples are in **Python**, with **PySpark** (the Python interface to **Apache Spark**) being utilized heavily for data processing tasks. The default search engine leveraged by the book's examples is **Apache Solr**, but most examples are abstracted away from the particular search engine, and swappable implementations will soon be available for most popular search engines and vector databases. For more information about the search engine abstractions and custom integrations, check out the [engine documentation](engines/README.md).
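For example, here is a minimal sketch of selecting and querying an engine from a notebook, using the `get_engine`/`set_engine` helpers defined in `aips/__init__.py` (the `"opensearch"` option assumes the OpenSearch containers from `docker-compose.yml` are running):

```python
from aips import get_engine, set_engine

set_engine("opensearch")  # or "solr" (the default); names are case-insensitive
engine = get_engine()

# The same abstracted request shape works regardless of the active engine
collection = engine.get_collection("products")
response = collection.search(query="ipad",
                             query_fields=["name", "manufacturer"],
                             limit=5)
print(response["docs"])
```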
54 |
55 | **See Full List**: [Supported Search Engines and Vector Databases](engines/README.md)
56 |
57 | [ *Note*: if you work for a search engine / vector database company, project, or hosting provider and want to work with us on getting your engine supported, please reach out to trey@searchkernel.com ]
58 |
59 | ## Questions and help
60 | Your purchase of _AI-Powered Search_ includes online access to Manning's [LiveBook forum](https://livebook.manning.com/forum?product=graingert). This allows you to provide comments and ask questions about any part of the book. Additionally, feel free to submit pull requests, GitHub issues, or comments on the project's official GitHub repo at https://github.com/treygrainger/ai-powered-search.
61 |
62 | ## License
63 | All code in this repository is open source under the [Apache License, Version 2.0 (ASL 2.0)](https://www.apache.org/licenses/LICENSE-2.0), unless otherwise specified.
64 |
65 | Note that when executing the code, it may pull additional dependencies that follow alternate licenses, so please be sure to inspect those licenses before using them in your projects to ensure they are suitable. The code may also pull in datasets subject to various licenses, some of which may be derived from AI models and some of which may be derived from web crawls of data subject to fair use under the copyright laws in the country of publication (the USA). Any such datasets are published "as-is", for the sole purpose of demonstrating the concepts in the book, and these datasets and their associated licenses may be subject to change over time.
66 |
67 | ## Grab a copy of the book
68 | If you don't yet have a copy, please support the authors and the publisher by purchasing a copy of [_AI-Powered Search_](http://aipoweredsearch.com). It will walk you step by step through the concepts and techniques shown in the code examples in this repository, providing the context and insights needed to understand them fully.
69 |
--------------------------------------------------------------------------------
/aips/__init__.py:
--------------------------------------------------------------------------------
1 | import aips.environment as environment
2 | from engines.solr import SolrLTR, SolrSemanticKnowledgeGraph, SolrEntityExtractor, SolrSparseSemanticSearch
3 | from engines.solr.SolrEngine import SolrEngine
4 | from engines.solr.SolrCollection import SolrCollection
5 |
6 | from engines.opensearch.OpenSearchCollection import OpenSearchCollection
7 | from engines.opensearch.OpenSearchEngine import OpenSearchEngine
8 | from engines.opensearch.OpenSearchLTR import OpenSearchLTR
9 | from engines.opensearch.OpenSearchSparseSemanticSearch import OpenSearchSparseSemanticSearch
10 |
11 | import os
12 | from IPython.display import display, HTML
13 | import pandas
14 | import re
15 |
16 | engine_type_map = {"SOLR": SolrEngine(),
17 | "OPENSEARCH": OpenSearchEngine()}
18 |
19 | def get_engine(override=None):
20 | engine_name = override.upper() if override else environment.get("AIPS_SEARCH_ENGINE", "SOLR")
21 | return engine_type_map[engine_name]
22 |
23 | def set_engine(engine_name):
24 | engine_name = engine_name.upper()
25 | if engine_name not in engine_type_map:
26 | raise ValueError(f"No search engine implementation found for {engine_name}")
27 | else:
28 | environment.set("AIPS_SEARCH_ENGINE", engine_name)
29 |
30 | def get_ltr_engine(collection):
31 | ltr_engine_map = {SolrCollection: SolrLTR,
32 | OpenSearchCollection: OpenSearchLTR}
33 | return ltr_engine_map[type(collection)](collection)
34 |
35 | def get_semantic_knowledge_graph(collection):
36 | return SolrSemanticKnowledgeGraph(get_engine("solr").get_collection(collection.name))
37 |
38 | def get_entity_extractor(collection):
39 | return SolrEntityExtractor(get_engine("solr").get_collection(collection.name))
40 |
41 | def get_sparse_semantic_search():
42 | SSS_map = {SolrEngine: SolrSparseSemanticSearch,
43 | OpenSearchEngine: OpenSearchSparseSemanticSearch}
44 | return SSS_map[type(get_engine())]()
45 |
46 | def healthcheck():
47 | try:
48 | if get_engine().health_check():
49 | print("All Systems are ready. Happy Searching!")
50 | else:
51 | print("The search engine is not in a ready state.")
52 | except:
53 | print("Error! One or more containers are not responding.\nPlease follow the instructions in Appendix A.")
54 |
55 | def num2str(number):
56 |     return str(round(number,4)) #round to 4 decimal places for readability
57 |
58 | def vec2str(vector):
59 | return "[" + ", ".join(map(num2str,vector)) + "]"
60 |
61 | def tokenize(text):
62 | return text.replace(".","").replace(",","").lower().split()
63 |
64 | def get_executing_notebook_path():
65 | return globals().get("__vsc_ipynb_file__", #only exists during a remote vscode kernel
66 | globals().get("_dh", [None])[0])
67 |
68 | def images_directory():
69 | path = get_executing_notebook_path()
70 | if path:
71 | relative = os.path.relpath(os.environ.get("HOME"), path)
72 | else:
73 | relative = "../.."
74 | return f"{relative}/data/retrotech/images"
75 |
76 | def img_path_for_product(product):
77 | directory = images_directory()
78 | file = product.get("upc", "no-upc")
79 | if not os.path.exists(f"data/retrotech/images/{file}.jpg"):
80 | file = "unavailable"
81 | return f"{directory}/{file}.jpg"
82 |
83 | def remove_new_lines(data):
84 | return str(data).replace('\\n', '').replace('\\N', '')
85 |
86 | def as_html(data):
87 |     return remove_new_lines(data).replace(", '", ",<br/>'")  # assumed <br/> separator for HTML rendering
88 |
89 | def display_search(query, documents):
90 | doc_html = as_html(documents)
91 | display(HTML(f"Query: {query}
Results:"))
92 | display(HTML(doc_html))
93 |
94 | def display_product_search(query, documents):
95 | rendered_html = render_search_results(query, documents)
96 | display(HTML(rendered_html))
97 |
98 | def render_search_results(query, results):
99 | search_results_template_file = os.path.join("data/retrotech/templates/", "search-results.html")
100 | with open(search_results_template_file) as file:
101 | file_content = file.read()
102 |
103 | template_syntax = "(.*)"
104 | header_template = re.sub(template_syntax, "", file_content, flags=re.S)
105 |
106 | results_template_syntax = "(.*)"
107 | x = re.search(results_template_syntax, file_content, flags=re.S)
108 | results_template = x.group(1)
109 |
110 | separator_template_syntax = "(.*)"
111 | x = re.search(separator_template_syntax, file_content, flags=re.S)
112 | separator_template = x.group(1)
113 |
114 | rendered = header_template.replace("${QUERY}", query.replace('"', '\"'))
115 | for result in results:
116 | image_url = img_path_for_product(result)
117 | rendered += results_template.replace("${NAME}", result.get("name", "UNKNOWN")) \
118 | .replace("${MANUFACTURER}", result.get("manufacturer", "UNKNOWN")) \
119 | .replace("${IMAGE_URL}", image_url)
120 |
121 | rendered += separator_template
122 | return rendered
123 |
124 | def fetch_products(doc_ids):
125 | request = {"query": " ".join([str(id) for id in doc_ids]),
126 | "query_fields": ["upc"],
127 | "limit": len(doc_ids)}
128 | response = get_engine().get_collection("products").search(**request)
129 |
130 | df = pandas.DataFrame(response["docs"])
131 | df['upc'] = df['upc'].astype('int64')
132 |     df.insert(0, 'image', df.apply(lambda row: f'<img height="100" src="{img_path_for_product(row)}">', axis=1))  # assumed <img> markup for the product image
133 | return df
134 |
135 | def render_judged(products, judged, grade_col='ctr', label=""):
136 | """ Render the computed judgments alongside the products and description data"""
137 | w_prods = judged.merge(products, left_on='doc_id', right_on='upc', how='left')
138 |
139 | style = """
140 | """
145 | w_prods = w_prods[[grade_col, 'upc', 'image', 'name']][:5]
146 | return HTML(style +
147 | f"
{label}" + w_prods.to_html(float_format=lambda x: '%10.4f' % x,
148 | escape=False))
149 |
150 | #def print_s(series_data, column):
151 | ##pandas.set_option("display.width", 76)
152 | #dataframe = series_data.to_frame(name=column).sort_values(column, ascending=False)
153 | #merged = dataframe.merge(products, left_on='doc_id', right_on='upc', how='left')
154 | #print(merged.rename(columns={"upc": "doc_id"})[["doc_id", column, "name"]].set_index("doc_id"))
--------------------------------------------------------------------------------
/aips/data_loaders/cities.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql.types import StructType, StringType, IntegerType
2 | from pyspark.sql.functions import concat_ws, lit
3 | from pyspark.sql import SparkSession
4 |
5 | def load_dataframe(csv_file):
6 | print("Loading Geonames...")
7 | schema = StructType() \
8 | .add("id",StringType(),True) \
9 | .add("name",StringType(),True) \
10 | .add("ascii_name_s",StringType(),True) \
11 | .add("alternative_names_s",StringType(),True) \
12 | .add("latitude_s",StringType(),True) \
13 | .add("longitude_s",StringType(),True) \
14 | .add("feature_class_s",StringType(),True) \
15 | .add("feature_code_s",StringType(),True) \
16 | .add("country",StringType(),True) \
17 | .add("cc2_s",StringType(),True) \
18 | .add("admin_area",StringType(),True) \
19 | .add("admin_code_2_s",StringType(),True) \
20 | .add("admin_code_3_s",StringType(),True) \
21 | .add("admin_code_4_s",StringType(),True) \
22 | .add("popularity",IntegerType(),True) \
23 | .add("elevation_s",StringType(),True) \
24 | .add("dem_s",StringType(),True) \
25 | .add("timezone_s",StringType(),True) \
26 | .add("modification_date_s",StringType(),True)
27 |
28 | spark = SparkSession.builder.appName("AIPS").getOrCreate()
29 | dataframe = spark.read.csv(csv_file, schema=schema, multiLine=True, escape="\\", sep="\t") \
30 | .withColumn("type", lit("city")) \
31 | .withColumn("location_coordinates", concat_ws(",", "latitude_s", "longitude_s"))
32 |
33 | return dataframe
--------------------------------------------------------------------------------
/aips/data_loaders/index_time_boosts.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql.functions import collect_list, create_map
2 | from aips.spark.dataframe import from_sql
3 | from aips.spark import create_view_from_collection
4 |
5 | def load_dataframe(boosted_products_collection, boosts_collection):
6 | assert(type(boosted_products_collection) == type(boosts_collection))
7 | create_view_from_collection(boosts_collection,
8 | boosts_collection.name)
9 | create_view_from_collection(boosted_products_collection,
10 | boosted_products_collection.name)
11 | match boosted_products_collection.get_engine_name():
12 | case "solr":
13 | query = f"""SELECT p.*, b.signals_boosts FROM (
14 | SELECT doc, CONCAT_WS(',', COLLECT_LIST(CONCAT(query, '|', boost)))
15 | AS signals_boosts FROM {boosts_collection.name} GROUP BY doc
16 | ) b INNER JOIN {boosted_products_collection.name} p ON p.upc = b.doc"""
17 | boosts_dataframe = from_sql(query)
18 | case "opensearch":
19 | product_query = f"SELECT * FROM {boosted_products_collection.name}"
20 | boosts_query = f"SELECT doc, boost, REPLACE(query, '.', '') AS query FROM {boosts_collection.name}"
21 |
22 | grouped_boosts = from_sql(boosts_query).groupBy("doc") \
23 | .agg(collect_list(create_map("query", "boost"))[0].alias("signals_boost")) \
24 | .withColumnRenamed("doc", "upc")
25 |
26 | boosts_dataframe = from_sql(product_query).join(grouped_boosts, "upc")
27 | case _:
28 | raise Exception(f"Index time boost not implemented for {type(boosted_products_collection)}")
29 |
30 | return boosts_dataframe
--------------------------------------------------------------------------------
/aips/data_loaders/movies.py:
--------------------------------------------------------------------------------
1 | import json
2 | from pyspark.sql import SparkSession, Row
3 |
4 | def load_dataframe(movie_file="data/tmdb.json", movie_image_ids={}):
5 | movies = []
6 | for movieId, tmdbMovie in json.load(open(movie_file)).items():
7 | try:
8 | releaseDate = None
9 | if "release_date" in tmdbMovie and len(tmdbMovie["release_date"]) > 0:
10 | releaseDate = tmdbMovie["release_date"]
11 | releaseYear = releaseDate[0:4]
12 |
13 | full_poster_path = ""
14 | if "poster_path" in tmdbMovie and tmdbMovie["poster_path"] is not None and len(tmdbMovie["poster_path"]) > 0:
15 | full_poster_path = "https://image.tmdb.org/t/p/w185" + tmdbMovie["poster_path"]
16 |
17 | movie = {"id": movieId,
18 | "title": tmdbMovie["title"],
19 | "overview": tmdbMovie["overview"],
20 | "tagline": tmdbMovie["tagline"],
21 | "directors": [director["name"] for director in tmdbMovie["directors"]],
22 | "cast": " ".join([castMember["name"] for castMember in tmdbMovie["cast"]]),
23 | "genres": [genre["name"] for genre in tmdbMovie["genres"]],
24 | "release_date": releaseDate,
25 | "release_year": releaseYear,
26 | "poster_file": (tmdbMovie["poster_path"] or " ")[1:],
27 | "poster_path": full_poster_path,
28 | "vote_average": float(tmdbMovie["vote_average"]) if "vote_average" in tmdbMovie else None,
29 | "vote_count": int(tmdbMovie["vote_count"]) if "vote_count" in tmdbMovie else 0}
30 | if movie["title"].lower() in movie_image_ids:
31 | joined_ids = ",".join(movie_image_ids[movie["title"].lower()])
32 | else:
33 | joined_ids = ""
34 | movie["movie_image_ids"] = joined_ids
35 |
36 | movies.append(movie)
37 | except KeyError as k: # Ignore any movies missing these attributes
38 | continue
39 | spark = SparkSession.builder.appName("AIPS").getOrCreate()
40 | return spark.createDataFrame(Row(**m) for m in movies)
--------------------------------------------------------------------------------
/aips/data_loaders/outdoors.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession
2 | from pyspark.sql.functions import coalesce, col, lit, udf
3 | from pyspark.sql.types import ArrayType, StringType
4 | import html
5 | import re
6 |
7 | from bs4 import BeautifulSoup
8 |
9 | def load_dataframe(csv_file):
10 | def split_tags(ascii_html):
11 | tags = re.compile("[\<\>]").split(html.unescape(ascii_html or ""))
12 | return [t.replace("-", " ") for t in tags if len(t)]
13 |
14 | def strip_HTML(ascii_html):
15 | text = html.unescape(ascii_html or "")
16 | text = BeautifulSoup(text, "lxml").get_text(separator=" ")
17 | return re.sub(r"\s+", " ", text.strip())
18 |
19 | split_tags_udf = udf(split_tags, ArrayType(StringType()))
20 | strip_html_udf = udf(strip_HTML)
21 | generate_url_udf = udf(lambda id: f"https://outdoors.stackexchange.com/questions/{id}", StringType())
22 | post_type_udf = udf(lambda type_id: "question" if type_id == 1 else "answer", StringType())
23 |
24 | spark = SparkSession.builder.appName("AIPS").getOrCreate()
25 | dataframe = spark.read.csv(csv_file, header=True, inferSchema=True)
26 | dataframe = dataframe.filter((dataframe.post_type_id == 1) | (dataframe.post_type_id == 2))
27 | dataframe = dataframe.withColumn("post_type", post_type_udf(col("post_type_id")))
28 | dataframe = dataframe.withColumn("view_count", coalesce(col("view_count"), lit(0)))
29 | dataframe = dataframe.withColumn("body", strip_html_udf(col("body")))
30 | dataframe = dataframe.withColumn("owner_user_id", coalesce(col("owner_user_id"), col("owner_display_name")))
31 | dataframe = dataframe.withColumn("title", strip_html_udf(col("title")))
32 | dataframe = dataframe.withColumn("tags", split_tags_udf(col("tags")))
33 | dataframe = dataframe.withColumn("url", generate_url_udf(col("id")))
34 | dataframe = dataframe.drop("post_type_id", "deletion_date", "owner_display_name", "last_editor_user_id",
35 | "last_editor_display_name", "last_edit_date", "last_activity_date", "comment_count",
36 | "favorite_count", "closed_date", "community_owned_date")
37 | return dataframe
--------------------------------------------------------------------------------
/aips/data_loaders/products.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession
2 | from pyspark.sql.functions import col, udf
3 |
4 | def load_dataframe(csv_file):
5 | print("Loading Products")
6 | spark = SparkSession.builder.appName("AIPS").getOrCreate()
7 | dataframe = spark.read.csv(csv_file, header=True, inferSchema=True)
8 | dataframe = dataframe.withColumn("upc", udf(str)(col("upc")))
9 | print("Schema: ")
10 | dataframe.printSchema()
11 | return dataframe
--------------------------------------------------------------------------------
/aips/data_loaders/reviews.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession
2 | from pyspark.sql.functions import col
3 |
4 | def load_dataframe(csv_file):
5 | print("\nLoading Reviews...")
6 | spark = SparkSession.builder.appName("AIPS").getOrCreate()
7 | dataframe = spark.read.csv(csv_file, inferSchema=True, header=True, multiLine=True, escape="\"") \
8 | .select(col("id"), col("name_t").alias("business_name"),
9 | col("name_s").alias("name"),
10 | col("city_t").alias("city"),
11 | col("state_t").alias("state"), col("text_t").alias("content"),
12 | col("categories_t").alias("categories"), col("stars_i").alias("stars_rating"),
13 | col("location_pt_s").alias("location_coordinates"))
14 | dataframe.printSchema()
15 | dataframe = dataframe.filter(dataframe.business_name != "Charlotte Center City Partners")
16 | return dataframe
--------------------------------------------------------------------------------
/aips/environment.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 |
4 | AIPS_NOTEBOOK_HOST = "aips-notebook"
5 | AIPS_NOTEBOOK_PORT = os.getenv("AIPS_NOTEBOOK_PORT") or "8888"
6 |
7 | AIPS_ZK_HOST = "aips-zk"
8 | AIPS_ZK_PORT = os.getenv("AIPS_ZK_PORT") or "2181"
9 |
10 | AIPS_WEBSERVER_HOST = os.getenv("AIPS_WEBSERVER_HOST") or "localhost"
11 | AIPS_WEBSERVER_PORT = os.getenv("AIPS_WEBSERVER_PORT") or "2345"
12 | WEBSERVER_URL = f"http://{AIPS_WEBSERVER_HOST}:{AIPS_WEBSERVER_PORT}"
13 | DEFAULT_CONFIG = {"AIPS_SEARCH_ENGINE": "SOLR",
14 | "PRINT_REQUESTS": False}
15 |
16 | CONFIG_FILE_PATH = os.path.abspath(os.path.join(os.path.join(
17 | os.path.dirname(__file__) , './'), 'system.config'))
18 |
19 | def write_config(config):
20 | with open(CONFIG_FILE_PATH, "w") as config_file:
21 | json.dump(config, config_file)
22 |
23 | def read_config():
24 | with open(CONFIG_FILE_PATH, "r") as f:
25 | return json.load(f)
26 |
27 | def load_config():
28 | try:
29 | config = read_config()
30 | except:
31 | write_config(DEFAULT_CONFIG)
32 | config = read_config()
33 | return config
34 |
35 | def set(key, value):
36 | config = load_config()
37 | config[key] = value
38 | with open(CONFIG_FILE_PATH, "w") as config_file:
39 | json.dump(config, config_file)
40 |
41 | def get(key, default=None):
42 | config = load_config()
43 | if default:
44 | return config.get(key, default)
45 | else:
46 | return config[key]
--------------------------------------------------------------------------------
/aips/indexers/product.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from git import Repo  # pip install gitpython
 4 | 
 5 | from aips import get_engine
 6 | from aips.data_loaders.products import load_dataframe
 7 | from aips.spark.dataframe import from_csv
 8 | from ltr.download import download, extract_tgz
 9 | 
10 | # Get datasets: clone the retrotech repo and extract the products and signals archives
11 | os.system("[ ! -d 'retrotech' ] && git clone --depth 1 https://github.com/ai-powered-search/retrotech.git")
12 | os.system("cd retrotech && git pull")
13 | os.system("cd retrotech && mkdir -p '../data/retrotech/' && tar -xvf products.tgz -C '../data/retrotech/' && tar -xvf signals.tgz -C '../data/retrotech/'")
14 | 
15 | dataset = ["https://github.com/ai-powered-search/tmdb/raw/main/judgments.tgz",
16 |            "https://github.com/ai-powered-search/tmdb/raw/main/movies.tgz"]
17 | download(dataset, dest="data/")
18 | extract_tgz("data/movies.tgz", "data/")     # -> Holds "tmdb.json", big json dict with corpus
19 | extract_tgz("data/judgments.tgz", "data/")  # -> Holds "ai_pow_search_judgments.txt",
20 |                                             #    which is our labeled judgment list
21 | Repo.clone_from("https://github.com/ai-powered-search/retrotech.git", "data/retrotech/")
22 | 
23 | engine = get_engine()
24 | 
25 | products_collection = engine.create_collection("products")
26 | products_dataframe = load_dataframe("data/retrotech/products.csv")
27 | products_collection.write(products_dataframe)
28 | 
29 | signals_collection = engine.create_collection("signals")
30 | signals_collection.write(from_csv("data/retrotech/signals.csv"))
--------------------------------------------------------------------------------
/aips/search_requests.py:
--------------------------------------------------------------------------------
1 | def product_search_request(query, param_overrides={}):
2 | request = {"query": query,
3 | "query_fields": ["name", "manufacturer", "long_description"],
4 | "return_fields": ["upc", "name", "manufacturer",
5 | "short_description", "score"],
6 | "limit": 5,
7 | "order_by": [("score", "desc"), ("upc", "asc")]}
8 | return request | param_overrides
9 |
10 | def search_for_boosts(query, collection, query_field="query"):
11 | boosts_request = {"query": query,
12 | "query_fields": [query_field],
13 | "return_fields": ["query", "doc", "boost"],
14 | "limit": 20,
15 | "order_by": [("boost", "desc")]}
16 | response = collection.search(**boosts_request)
17 | return response["docs"]
18 |
19 | def create_boosts_query(boost_documents):
20 | print("Boost Documents:")
21 | print(boost_documents)
22 | boosts = " ".join([f'"{b["doc"]}"^{b["boost"]}'
23 | for b in boost_documents])
24 | print(f"\nBoost Query: \n{boosts}\n")
25 | return boosts
26 |
27 | def boosted_product_search_request(query, collection, boost_field=None):
28 | signals_documents = search_for_boosts(query, collection)
29 | signals_boosts = create_boosts_query(signals_documents)
30 | boosted_request = product_search_request(query)
31 | if boost_field:
32 | signals_boosts = (boost_field, signals_boosts)
33 | boosted_request["query_boosts"] = signals_boosts
34 | return boosted_request
--------------------------------------------------------------------------------
/aips/spark/__init__.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession
2 |
3 | from pyspark.sql.functions import col, udf
4 | from pyspark.sql.types import StringType
5 |
6 | from aips.environment import AIPS_ZK_HOST
7 | from engines.opensearch.config import OPENSEARCH_URL
8 |
9 | def create_view_from_collection(collection, view_name, spark=None):
10 | if not spark:
11 | spark = SparkSession.builder.appName("AIPS").getOrCreate()
12 | match collection.get_engine_name():
13 | case "solr":
14 | opts = {"zkhost": AIPS_ZK_HOST, "collection": collection.name}
15 | spark.read.format("solr").options(**opts).load().createOrReplaceTempView(view_name)
16 | case "opensearch":
17 | parse_id_udf = udf(lambda s: s["_id"], StringType())
18 | opts = {"opensearch.nodes": OPENSEARCH_URL,
19 | "opensearch.net.ssl": "false",
20 | "opensearch.read.metadata": "true"}
21 | dataframe = spark.read.format("opensearch").options(**opts).load(collection.name)
22 | if "_metadata" in dataframe.columns:
23 | dataframe = dataframe.withColumn("id", parse_id_udf(col("_metadata")))
24 | dataframe = dataframe.drop("_metadata")
25 | print(dataframe.columns)
26 | dataframe.createOrReplaceTempView(view_name)
27 | case _:
28 | raise NotImplementedError(type(collection))
--------------------------------------------------------------------------------
/aips/spark/dataframe.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession
2 | from pyspark.sql.functions import lit
3 |
4 | def from_csv(file, more_opts=False, log=True):
5 | if log:
6 | print(f"Loading {file}")
7 | spark = SparkSession.builder.appName("AIPS").getOrCreate()
8 | reader = spark.read.format("csv").option("header", "true").option("inferSchema", "true")
9 | if more_opts:
10 | reader = reader.option("charset", "utf-8").option("quote", "\"").option("escape", "\"").option("multiLine","true").option("delimiter", ",")
11 | dataframe = reader.load(file)
12 | if more_opts and "category" in more_opts:
13 | # We can rely on automatic generation of IDs, or we can create them ourselves.
14 | # If we do it, comment out previous line
15 | # .withColumn("id", concat(col("category"), lit("_") col("id")))
16 | dataframe = dataframe.withColumn("category", lit(more_opts.get("category"))).drop("id")
17 |
18 | if log:
19 | print("Schema: ")
20 | dataframe.printSchema()
21 |
22 | return dataframe
23 |
24 | def from_sql(query, spark=None):
25 | if not spark:
26 | spark = SparkSession.builder.appName("AIPS").getOrCreate()
27 | return spark.sql(query)
--------------------------------------------------------------------------------
/build/Dockerfile:
--------------------------------------------------------------------------------
1 | # syntax = docker/dockerfile:1.5
2 | FROM jupyter/pyspark-notebook:spark-3.3.1
3 | USER root
4 |
5 | # Install gcc, c++, and related dependencies needed for pip to build some python dependencies
6 | RUN sudo apt-get -y update && \
7 | apt-get install -y --reinstall build-essential gcc cargo && \
8 | rm -rf /var/lib/apt/lists/
9 |
10 | # Install Spark-Solr
11 | ENV SPARK_SOLR_VERSION=4.0.2
12 | ENV SHADED_SOLR_JAR_PATH=/usr/local/spark/lib/spark-solr-${SPARK_SOLR_VERSION}-shaded.jar
13 | RUN mkdir -p /usr/local/spark/lib/ && cd /usr/local/spark/lib/ && \
14 | wget -q https://repo1.maven.org/maven2/com/lucidworks/spark/spark-solr/${SPARK_SOLR_VERSION}/spark-solr-${SPARK_SOLR_VERSION}-shaded.jar -O $SHADED_SOLR_JAR_PATH && \
15 | echo "c5293f10257603bcf650780afcb91ed1bb118f09feb731502c2dc7ac14ba950e586a033cb2f50e5c122c5ec442dc0d2b55f76c4f6522b555e67f4981a38bca26 *spark-solr-${SPARK_SOLR_VERSION}-shaded.jar" \
16 | | sha512sum -c - && chmod 0777 $SHADED_SOLR_JAR_PATH
17 |
18 | # Install Spark-OpenSearch
19 | ENV SPARK_OS_VERSION=1.2.0
20 | ENV SPARK_OS_JAR=opensearch-spark-30_2.12-${SPARK_OS_VERSION}.jar
21 | ENV SPARK_OS_PATH=/usr/local/spark/lib/${SPARK_OS_JAR}
22 | RUN cd /usr/local/spark/lib/ && \
23 | wget -q https://repo1.maven.org/maven2/org/opensearch/client/opensearch-spark-30_2.12/${SPARK_OS_VERSION}/${SPARK_OS_JAR} -O $SPARK_OS_PATH && \
24 | echo "5b9ae056b6ac21ae009f79a3a761774c7178b995fbe035572a4f35d5738e055d02828d2ec0ff98dd063ffffe37f4c48dc9a418d71269fc560f65b33c94493f2d *${SPARK_OS_JAR}" \
25 | | sha512sum -c - && chmod 0777 $SPARK_OS_PATH
26 |
27 | WORKDIR /home/$NB_USER
28 |
29 | # Install Python dependencies
30 | COPY build/ch5_spacy_requirements.txt /home/$NB_USER
31 | RUN conda create --name ch5-spacy python=3.10.0 -y
32 | RUN conda run --name ch5-spacy pip install -r ch5_spacy_requirements.txt
33 | RUN conda run --name ch5-spacy python -m spacy download en_core_web_sm
34 | RUN conda run --name ch5-spacy python -m ipykernel install --name ch5-spacy --display-name "[ONLY FOR CH5.1] spaCy experimental kernel"
35 |
36 | ENV BLIS_ARCH="generic" PIP_CACHE_DIR=/var/cache/pip
37 | RUN mkdir -p $PIP_CACHE_DIR
38 | COPY build/requirements.txt /home/$NB_USER
39 | RUN --mount=type=cache,target=$PIP_CACHE_DIR \
40 | pip install -r requirements.txt && \
41 | python -m spacy download en_core_web_sm
42 | RUN rm ch5_spacy_requirements.txt requirements.txt
43 |
44 | # Configure home directory
45 | COPY build/log4j.properties /usr/local/spark/conf/
46 | COPY aips/ /home/$NB_USER/aips
47 | COPY chapters/ /home/$NB_USER/chapters
48 | COPY data/ /home/$NB_USER/data
49 | COPY engines/ /home/$NB_USER/engines
50 | COPY ltr/ /home/$NB_USER/ltr
51 | COPY semantic_search/ /home/$NB_USER/semantic_search
52 | COPY webserver/ /home/$NB_USER/webserver
53 | COPY build/ipython_kernel_config.py /etc/ipython/
54 | RUN rm -rf work/
55 |
56 | # Change to notebook user
57 | RUN chown -R $NB_UID:$NB_UID /home/$NB_USER
58 | RUN fix-permissions /home/$NB_USER
59 | USER $NB_USER
60 |
61 | # Spark Config
62 | ENV SPARK_OPTS="$SPARK_OPTS --driver-java-options=\"-DXlint:none -Dlog4j.logLevel=error -Dallow-access=java.nio.DirectByteBuffer -Dlog4j.logger.org.apache.spark.repl.Main=ERROR\" --spark.ui.showConsoleProgress=False" \
63 | PYSPARK_SUBMIT_ARGS="-c spark.driver.defaultJavaOptions=\"-DXlint=none -Dlog4j.logLevel=error -Dallow-access=java.nio.DirectByteBuffer\" -c spark.ui.showConsoleProgress=False --jars $SHADED_SOLR_JAR_PATH,$SPARK_OS_PATH pyspark-shell" \
64 |     PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-*-src.zip:$PYTHONPATH \
65 | DOCKER_STACKS_JUPYTER_CMD=lab
66 |
67 | # If you want to edit the notebooks and have your changes persist,
68 | # uncomment the line below and restart with `docker compose up --build`
69 | #WORKDIR /tmp/notebooks
70 |
71 | # Mark all notebooks as trusted by default
72 | RUN find . -name \*.ipynb -print0|xargs -0 jupyter-trust -y
73 |
74 | # Start Jupyter Notebooks
75 | RUN jupyter labextension disable "@jupyterlab/apputils-extension:announcements"
76 | CMD start-notebook.sh --ServerApp.password='' \
77 | --ServerApp.token='' --NotebookApp.token='' --LabApp.token='' \
78 | --LabApp.default_url='/lab/tree/chapters/welcome.ipynb' \
79 | --NotebookApp.allow_origin='*' --NotebookApp.ip='0.0.0.0' --ServerApp.ip=0.0.0.0 --no-browser
--------------------------------------------------------------------------------
/build/ch5_spacy_requirements.txt:
--------------------------------------------------------------------------------
1 | pandas
2 | ipykernel
3 | spacy==3.4.4
4 | numpy==1.26.4
5 | matplotlib==3.9.2
6 | networkx==3.3
7 | https://github.com/explosion/spacy-experimental/releases/download/v0.6.1/en_coreference_web_trf-3.4.0a2-py3-none-any.whl
--------------------------------------------------------------------------------
/build/ipython_kernel_config.py:
--------------------------------------------------------------------------------
1 | # Configuration file for ipython-kernel.
2 | # See
3 |
4 | # With IPython >= 6.0.0, all outputs to stdout/stderr are captured.
5 | # It is the case for subprocesses and output of compiled libraries like Spark.
6 | # Those logs now both head to notebook logs and in notebooks outputs.
7 | # Logs are particularly verbose with Spark, that is why we turn them off through this flag.
8 | #
9 |
10 | # Attempt to capture and forward low-level output, e.g. produced by Extension
11 | # libraries.
12 | # Default: True
13 | # type:ignore
14 | c.IPKernelApp.capture_fd_output = False # noqa: F821
15 | c.IPKernelApp.code_to_run = "cd /home/jovyan"
--------------------------------------------------------------------------------
/build/log4j.properties:
--------------------------------------------------------------------------------
1 | # Set everything to be logged to the console
2 | log4j.rootCategory=ERROR, console
3 | log4j.appender.console=org.apache.log4j.ConsoleAppender
4 | log4j.appender.console.target=System.err
5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
7 |
8 | # Set the default spark-shell log level to WARN. When running the spark-shell, the
9 | # log level for this class is used to overwrite the root logger's log level, so that
10 | # the user can have different defaults for the shell and regular Spark apps.
11 | log4j.logger.org.apache.spark.repl.Main=ERROR
12 |
13 | # Settings to quiet third party logs that are too verbose
14 | log4j.logger.org.sparkproject.jetty=ERROR
15 | log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR
16 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=ERROR
17 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=ERROR
18 | log4j.logger.org.apache.parquet=ERROR
19 | log4j.logger.parquet=ERROR
20 |
21 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
22 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
23 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
24 |
25 | # For deploying Spark ThriftServer
26 | # SPARK-34128:Suppress undesirable TTransportException warnings involved in THRIFT-4805
27 | log4j.appender.console.filter.1=org.apache.log4j.varia.StringMatchFilter
28 | log4j.appender.console.filter.1.StringToMatch=Thrift error occurred during processing of message
29 | log4j.appender.console.filter.1.AcceptOnMatch=false
--------------------------------------------------------------------------------
/build/requirements.txt:
--------------------------------------------------------------------------------
1 | #Implicit dependencies with set versions for optimization
2 | adjusttext==0.8
3 | mdurl==0.1.2
4 | mizani==0.11.4
5 | patsy==0.5.6
6 | wcwidth==0.2.13
7 | pandas==2.2.3
8 | contourpy==1.0.7
9 | thinc==8.1.0
10 | numba==0.60.0
11 | wrapit==0.3.0
12 | h5py==3.12.1
13 | gitpython==3.1.43
14 |
15 | #Forced versions to prevent implicit dep version errors
16 | # (these sub packages are only used indirectly through normal deps)
17 | bottleneck==1.4.0
18 | numexpr==2.10.1
19 | pyarrow==17.0.0
20 | statsmodels==0.14.4
21 | networkx==3.3
22 |
23 | #Normal deps
24 | accelerate==0.34.2
25 | beautifulsoup4==4.12.3
26 | lxml==5.3.0
27 | datasets==3.0.1
28 | jupyter-console==6.6.3
29 | matplotlib==3.9.2
30 | nltk==3.9.1
31 | nmslib==2.1.1
32 | plotly==5.24.1
33 | plotnine==0.13.5
34 | openai-clip==1.0.1
35 | torchvision==0.20.1
36 | numpy==1.26.4
37 | scipy==1.14.1
38 | #scikit-learn==1.5.2
39 | scikit-learn==1.2.1
40 | spacy==3.5.0
41 | staticmap==0.5.7
42 | faiss-cpu==1.9.0
43 | sentence-transformers==3.1.1
44 | spladerunner==0.1.6
--------------------------------------------------------------------------------
/chapters/ch05/licenses/hearst.NOTICE.txt:
--------------------------------------------------------------------------------
1 | (From https://github.com/mmichelsonIF/hearst_patterns_python)
2 | --------------
3 |
4 | Code in the notebook "1.open-information-extraction.ipynb"
5 | related to the Hearst patterns is reused in whole or in part
6 | from https://github.com/mmichelsonIF/hearst_patterns_python,
7 | which is licensed under the Apache (Software) License,
8 | version 2.0 ("the License"), and is subject to the following notice:
9 |
10 |
11 | Copyright 2016-2019 mmichelsonIF (https://github.com/mmichelsonIF)
12 |
13 | Licensed under the Apache License, Version 2.0 (the "License");
14 | you may not use this file except in compliance with the License.
15 | You may obtain a copy of the License at
16 |
17 | http://www.apache.org/licenses/LICENSE-2.0
18 |
19 | Unless required by applicable law or agreed to in writing, software
20 | distributed under the License is distributed on an "AS IS" BASIS,
21 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22 | See the License for the specific language governing permissions and
23 | limitations under the License.
--------------------------------------------------------------------------------
/chapters/ch10/1.setup-the-movie-db.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# [ Chapter 10 - Learning to Rank for Generalizable Search Relevance ]\n",
8 | "# Setup TheMovieDB Collection"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 1,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "from aips import get_engine, get_ltr_engine\n",
18 | "engine = get_engine()"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "## Create Collection\n",
26 | "\n",
27 | "Create collection for http://themoviedb.org (TMDB) dataset for this book. We will just look at title, overview, and release_year fields."
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 2,
33 | "metadata": {},
34 | "outputs": [
35 | {
36 | "name": "stdout",
37 | "output_type": "stream",
38 | "text": [
39 | "Wiping \"tmdb\" collection\n",
40 | "Creating \"tmdb\" collection\n",
41 | "Status: Success\n",
42 | "Adding LTR QParser for tmdb collection\n",
43 | "Adding LTR Doc Transformer for tmdb collection\n"
44 | ]
45 | }
46 | ],
47 | "source": [
48 | "tmdb_collection = engine.create_collection(\"tmdb\")\n",
49 | "get_ltr_engine(tmdb_collection).enable_ltr()"
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "metadata": {},
55 | "source": [
56 | "## Download and index data\n",
57 | "\n",
58 | "Download TMDB data and index. We also download a judgment list, labeled movies as relevant/irrelevant for several movie queries"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 3,
64 | "metadata": {},
65 | "outputs": [
66 | {
67 | "name": "stdout",
68 | "output_type": "stream",
69 | "text": [
70 | "GET https://github.com/ai-powered-search/tmdb/raw/main/judgments.tgz\n",
71 | "GET https://github.com/ai-powered-search/tmdb/raw/main/movies.tgz\n",
72 | "Successfully written 65616 documents\n"
73 | ]
74 | }
75 | ],
76 | "source": [
77 | "from ltr.download import download, extract_tgz \n",
78 | "from aips.data_loaders.movies import load_dataframe\n",
79 | "import tarfile\n",
80 | "import json\n",
81 | "\n",
82 | "dataset = [\"https://github.com/ai-powered-search/tmdb/raw/main/judgments.tgz\", \n",
83 | " \"https://github.com/ai-powered-search/tmdb/raw/main/movies.tgz\"]\n",
84 | "download(dataset, dest=\"data/\")\n",
85 | "extract_tgz(\"data/movies.tgz\", \"data/\") # -> Holds \"tmdb.json\", big json dict with corpus\n",
86 | "extract_tgz(\"data/judgments.tgz\", \"data/\") # -> Holds \"ai_pow_search_judgments.txt\", \n",
87 | " # which is our labeled judgment list\n",
88 | "\n",
89 | "movies_dataframe = load_dataframe(\"data/tmdb.json\")\n",
90 | "tmdb_collection.write(movies_dataframe)"
91 | ]
92 | },
93 | {
94 | "cell_type": "markdown",
95 | "metadata": {},
96 | "source": [
97 | "## Next Up, judgments and feature logging\n",
98 | "\n",
99 | "Next up we use a _judgment list_, a set of labeled relevant / irrelevant movies for search query strings. We then extract some features from the search engine to setup a full training set we can use to train a model.\n",
100 | "\n",
101 | "Up next: [judgments and Logging](2.judgments-and-logging.ipynb)"
102 | ]
103 | }
104 | ],
105 | "metadata": {
106 | "kernelspec": {
107 | "display_name": "Python 3 (ipykernel)",
108 | "language": "python",
109 | "name": "python3"
110 | },
111 | "language_info": {
112 | "codemirror_mode": {
113 | "name": "ipython",
114 | "version": 3
115 | },
116 | "file_extension": ".py",
117 | "mimetype": "text/x-python",
118 | "name": "python",
119 | "nbconvert_exporter": "python",
120 | "pygments_lexer": "ipython3",
121 | "version": "3.10.9"
122 | }
123 | },
124 | "nbformat": 4,
125 | "nbformat_minor": 4
126 | }
127 |
--------------------------------------------------------------------------------
/chapters/ch11/0.setup.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# [ Chapter 11 - Automating Learning to Rank with Click Models ]\n",
8 | "# Indexing Search Sessions Data"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "**NOTE**: This notebook depends upon the the Retrotech dataset. If you have any issues, please rerun the [Setting up the Retrotech Dataset](../ch04/1.setting-up-the-retrotech-dataset.ipyn) notebook or execute the next cell uncommented."
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 1,
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "import sys\n",
25 | "sys.path.append(\"..\")\n",
26 | "from ltr import download"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "Download simulated raw clickstream data"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 2,
39 | "metadata": {},
40 | "outputs": [
41 | {
42 | "name": "stdout",
43 | "output_type": "stream",
44 | "text": [
45 | "Already up to date.\n"
46 | ]
47 | }
48 | ],
49 | "source": [
50 | "![ ! -d 'retrotech' ] && git clone --depth 1 https://github.com/ai-powered-search/retrotech.git\n",
51 | "! cd retrotech && git pull\n",
52 | "! cp retrotech/sessions/* data/"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 3,
58 | "metadata": {},
59 | "outputs": [
60 | {
61 | "name": "stdout",
62 | "output_type": "stream",
63 | "text": [
64 | "Already up to date.\n",
65 | "products.csv\n",
66 | "signals.csv\n",
67 | "\"upc\",\"name\",\"manufacturer\",\"short_description\",\"long_description\"\n",
68 | "\"096009010836\",\"Fists of Bruce Lee - Dolby - DVD\", , , \n",
69 | "\"043396061965\",\"The Professional - Widescreen Uncut - DVD\", , , \n",
70 | "\"085391862024\",\"Pokemon the Movie: 2000 - DVD\", , , \n",
71 | "\"067003016025\",\"Summerbreeze - CD\",\"Nettwerk\", , \n",
72 | "\"731454813822\",\"Back for the First Time [PA] - CD\",\"Def Jam South\", , \n",
73 | "\"024543008200\",\"Big Momma's House - Widescreen - DVD\", , , \n",
74 | "\"031398751823\",\"Kids - DVD\", , , \n",
75 | "\"037628413929\",\"20 Grandes Exitos - CD\",\"Sony Discos Inc.\", , \n",
76 | "\"060768972223\",\"Power Of Trinity (Box) - CD\",\"Sanctuary Records\", , \n",
77 | "Wiping \"products\" collection\n",
78 | "Creating \"products\" collection\n",
79 | "Status: Success\n",
80 | "Loading Products\n",
81 | "Schema: \n",
82 | "root\n",
83 | " |-- upc: long (nullable = true)\n",
84 | " |-- name: string (nullable = true)\n",
85 | " |-- manufacturer: string (nullable = true)\n",
86 | " |-- short_description: string (nullable = true)\n",
87 | " |-- long_description: string (nullable = true)\n",
88 | "\n"
89 | ]
90 | }
91 | ],
92 | "source": [
93 | "#%run chapters/ch04/1.setting-up-the-retrotech-dataset.ipynb"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "Up next: [Your First Click Model: Click Thru Rate](1.click-through-rate-judgments.ipynb)"
101 | ]
102 | }
103 | ],
104 | "metadata": {
105 | "kernelspec": {
106 | "display_name": "Python 3 (ipykernel)",
107 | "language": "python",
108 | "name": "python3"
109 | },
110 | "language_info": {
111 | "codemirror_mode": {
112 | "name": "ipython",
113 | "version": 3
114 | },
115 | "file_extension": ".py",
116 | "mimetype": "text/x-python",
117 | "name": "python",
118 | "nbconvert_exporter": "python",
119 | "pygments_lexer": "ipython3",
120 | "version": "3.10.9"
121 | }
122 | },
123 | "nbformat": 4,
124 | "nbformat_minor": 2
125 | }
126 |
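A quick, optional sanity check after the cells above (a minimal sketch, assuming the notebook's working directory and the `data/` target used by the `cp retrotech/sessions/* data/` cell):

from pathlib import Path

# List whatever the copy cell placed under data/ (the exact file names are not enumerated in this notebook).
data_dir = Path("data")
for path in sorted(data_dir.glob("*")):
    print(f"{path.name}  ({path.stat().st_size} bytes)")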
--------------------------------------------------------------------------------
/chapters/ch13/outdoors_golden_answers.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/chapters/ch13/outdoors_golden_answers.xlsx
--------------------------------------------------------------------------------
/chapters/ch13/pull_aips_dependency.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append('../..')
3 | import requests
4 |
5 | def pull_dependency(file_name):
6 | print ("Pulling: \"" + file_name + "\"")
7 | with open(file_name, "wb") as file:
8 | for part in map(chr,range(ord('a'),ord('z')+1)):
9 | part_name = "part_" + str(part)
10 | response = requests.get("https://github.com/ai-powered-search/aips-build-dependencies/raw/main/" + file_name + "/" + part_name)
11 | if response.status_code == 200:
12 | print("Successfully downloaded " + part_name)
13 | file.write(response.content)
14 | elif response.status_code == 404:
15 | break
16 | else:
17 | raise Exception("Error: Status Code " + response.status_code + "\n" + response.text)
18 | print(file_name + " successfully pulled")
19 |
20 | if len(sys.argv) == 2:
21 | pull_dependency(sys.argv[1])
22 |
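Usage, for reference: the script reassembles one dependency from its alphabetically named chunks (`part_a`, `part_b`, ...) hosted in the aips-build-dependencies repository, stopping at the first 404. It takes the dependency's file name as its single command-line argument, or it can be imported. The file name below is purely illustrative:

# Command line:
#   python pull_aips_dependency.py <file_name>
#
# From a notebook or script (illustrative file name, not a confirmed dependency):
from pull_aips_dependency import pull_dependency
pull_dependency("example_dependency.bin")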
--------------------------------------------------------------------------------
/chapters/ch15/delorean-query.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/chapters/ch15/delorean-query.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/021331131393.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/021331131393.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/027242755871.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/027242755871.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/027242831599.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/027242831599.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/037988909926.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/037988909926.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/037988910045.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/037988910045.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/037988910182.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/037988910182.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/037988910250.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/037988910250.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/037988910427.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/037988910427.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/048231316835.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/048231316835.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/05024545249224.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/05024545249224.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/072244106916.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/072244106916.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/12505382925.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/12505382925.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/12505451713.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/12505451713.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/12505525766.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/12505525766.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/12505527456.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/12505527456.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/12505559105.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/12505559105.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/14381196320.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/14381196320.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/21331131393.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/21331131393.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/23272335397.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/23272335397.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/24543701538.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/24543701538.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/25192107191.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/25192107191.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/27108936499.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/27108936499.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/27242752436.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/27242752436.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/27242755871.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/27242755871.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/27242798236.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/27242798236.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/27242831599.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/27242831599.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/32429037763.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/32429037763.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/36172950027.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/36172950027.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/36725560390.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/36725560390.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/36725560451.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/36725560451.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/36725561977.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/36725561977.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/36725569331.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/36725569331.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/36725569454.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/36725569454.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/36725578241.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/36725578241.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/37988909926.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/37988909926.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/37988910045.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/37988910045.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/37988910182.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/37988910182.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/37988910250.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/37988910250.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/37988910427.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/37988910427.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/400032015667.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/400032015667.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/400037252074.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/400037252074.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/400037252258.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/400037252258.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/400037252432.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/400037252432.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/400037252616.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/400037252616.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/400037252890.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/400037252890.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/400037253316.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/400037253316.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/400192926087.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/400192926087.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/45626176.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/45626176.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/47875841406.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/47875841406.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/47875841420.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/47875841420.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/47875842328.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/47875842328.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/47875842335.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/47875842335.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/48231011396.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/48231011396.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/48231011402.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/48231011402.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/48231316835.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/48231316835.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/48231317436.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/48231317436.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/48231317498.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/48231317498.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/5024545249224.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/5024545249224.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/600603105265.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/600603105265.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/600603138423.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/600603138423.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/603497664429.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/603497664429.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/610839379408.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/610839379408.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/612572171585.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/612572171585.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/635753490879.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/635753490879.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/635753493559.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/635753493559.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/635753493573.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/635753493573.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/665331101927.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/665331101927.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/694318011294.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/694318011294.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/696055169191.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/696055169191.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/708056579739.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/708056579739.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/708056579746.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/708056579746.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/708431390614.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/708431390614.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/711719842309.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/711719842309.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/716829772249.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/716829772249.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/72244106916.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/72244106916.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/722868830062.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/722868830062.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/722868840177.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/722868840177.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/74108007469.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/74108007469.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/74108056764.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/74108056764.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/74108096487.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/74108096487.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/77283045400.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/77283045400.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/783722274422.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/783722274422.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/786936817218.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/786936817218.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/793447512228.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/793447512228.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/803238004525.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/803238004525.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/821793013776.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/821793013776.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/826663114164.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/826663114164.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/826663126044.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/826663126044.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/843163089211.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/843163089211.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/843404073153.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/843404073153.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/84691170679.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/84691170679.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/84691211174.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/84691211174.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/84691226703.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/84691226703.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/84691226727.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/84691226727.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/848447000005.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/848447000005.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/848447000081.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/848447000081.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/848447000135.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/848447000135.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/856751002097.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/856751002097.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/878816004532.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/878816004532.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/883049066905.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/883049066905.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/883929085118.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/883929085118.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/883929106172.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/883929106172.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/883929154012.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/883929154012.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/884116069973.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/884116069973.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/885038021209.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885038021209.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/885038024644.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885038024644.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/885038024651.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885038024651.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/885170045132.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885170045132.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/885370315080.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885370315080.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/885370325348.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885370325348.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/885909300549.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885909300549.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/885909393404.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885909393404.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/885909394845.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885909394845.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/885909395095.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885909395095.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/885909457588.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885909457588.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/885909457595.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885909457595.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/885909457601.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885909457601.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/885909457632.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885909457632.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/885909471812.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885909471812.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/885909472376.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/885909472376.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/886111271283.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/886111271283.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/886111287055.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/886111287055.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/886971404722.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/886971404722.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/886973561621.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/886973561621.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/92636260712.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/92636260712.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/93624956037.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/93624956037.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/93624995012.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/93624995012.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/97360716641.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/97360716641.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/97360722345.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/97360722345.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/97360724240.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/97360724240.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/97360810042.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/97360810042.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/97363532149.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/97363532149.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/97363560449.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/97363560449.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/97368920347.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/97368920347.jpg
--------------------------------------------------------------------------------
/data/retrotech/images/unavailable.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/data/retrotech/images/unavailable.jpg
--------------------------------------------------------------------------------
/data/retrotech/templates/search-results.html:
--------------------------------------------------------------------------------
[HTML markup not preserved in this dump; only the template's substitution placeholders survive: "Name: ${NAME}" and "Manufacturer: ${MANUFACTURER}".]
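The `${NAME}` / `${MANUFACTURER}` placeholders follow the `${...}` substitution style. As an illustration only (the repo's actual rendering code is not shown in this dump), Python's `string.Template` handles the same placeholder syntax:

from string import Template

# Fill the two placeholders recoverable from the template above.
card = Template("Name: ${NAME}\nManufacturer: ${MANUFACTURER}")
print(card.substitute(NAME="Fists of Bruce Lee - Dolby - DVD", MANUFACTURER="Unknown"))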
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | services:
2 | notebooks:
3 | build:
4 | context: .
5 | dockerfile: build/Dockerfile
6 | container_name: aips-notebooks
7 | ports:
8 | - 7077:7077 # Spark Master
9 | - 8082:8080 # Spark Master UI - 8082 less likely to conflict
10 | - 8081:8081 # Spark Worker UI
11 | - 4041:4041 # Spark UI
12 | - 8888:8888 # Jupyter Notebook UI
13 | - 2345:2345 # Search Webserver
14 | networks:
15 | - solr-network
16 | - opensearch-network
17 | restart: unless-stopped
18 | environment:
19 | #PYSPARK_SUBMIT_ARGS: '--jars /usr/local/spark/lib/spark-solr-4.0.0-shaded.jar pyspark-shell'
20 | #NB_USER: 'aips'
21 | #NB_UID: 1010
22 | #NB_GID: 1020
23 | #CHOWN_HOME: 'yes'
24 | #CHOWN_HOME_OPTS: -R
25 | SOLR_HOST: 'aips-solr'
26 | volumes:
27 | - type: bind
28 | source: "."
29 | target: "/tmp/notebooks/"
30 | profiles:
31 | - all
32 | - ""
33 |
34 | solr:
35 | build:
36 | context: ./engines/solr/build/
37 | dockerfile: Dockerfile
38 | container_name: aips-solr
39 | hostname: aips-solr
40 | ports:
41 | - 8983:8983
42 | environment:
43 | - ZK_HOST=aips-zk:2181
44 | - SOLR_HOST=aips-solr
45 | networks:
46 | - zk-solr
47 | - solr-network
48 | restart: unless-stopped
49 | depends_on:
50 | - zookeeper
51 | - notebooks
52 | profiles:
53 | - all
54 | - ""
55 |
56 | opensearch:
57 | build:
58 | context: ./engines/opensearch/build/
59 | dockerfile: engine-Dockerfile
60 | container_name: opensearch-node1
61 | hostname: aips-opensearch
62 | environment:
63 | - cluster.name=opensearch-cluster
64 | - node.name=opensearch-node1
65 | - discovery.type=single-node
66 | - network.host=0.0.0.0
67 | - bootstrap.memory_lock=true # along with the memlock settings below, disables swapping
68 | - "OPENSEARCH_INITIAL_ADMIN_PASSWORD=''"
69 | - "DISABLE_SECURITY_PLUGIN=true"
70 | - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" # minimum and maximum Java heap size, recommend setting both to 50% of system RAM
71 | ulimits:
72 | memlock:
73 | soft: -1
74 | hard: -1
75 | nofile:
76 | soft: 262144 # maximum number of open files for the OpenSearch user, set to at least 65536 on modern systems
77 | hard: 262144
78 | volumes:
79 | - opensearch-data:/usr/share/opensearch/data
80 | ports:
81 | - 9200:9200
82 | - 9600:9600 # required for Performance Analyzer
83 | expose:
84 | - 9200:9200
85 | networks:
86 | - opensearch-network
87 | depends_on:
88 | - notebooks
89 | - opensearch-dashboards
90 | profiles:
91 | - all
92 |
93 | opensearch-dashboards:
94 | image: opensearchproject/opensearch-dashboards:2.14.0
95 | container_name: opensearch-dashboards
96 | ports:
97 | - 5601:5601
98 | expose:
99 | - 5601:5601
100 | environment:
101 | DISABLE_SECURITY_DASHBOARDS_PLUGIN: "true"
102 | OPENSEARCH_HOSTS: '["http://opensearch-node1:9200"]'
103 | networks:
104 | - opensearch-network
105 | profiles:
106 | - all
107 |
108 | zookeeper:
109 | image: zookeeper:3.5.8
110 | container_name: aips-zk
111 | hostname: aips-zk
112 | ports:
113 | - 2181:2181
114 | networks:
115 | - zk-solr
116 | - solr-network
117 | restart: unless-stopped
118 | profiles:
119 | - all
120 | - ""
121 |
122 | volumes:
123 | opensearch-data:
124 |
125 | networks:
126 | zk-solr:
127 | solr-network:
128 | opensearch-network:
129 |
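Once the stack is up (`docker compose up`; the OpenSearch services require the `all` profile), the published ports above can be spot-checked from the host. A minimal sketch, assuming default localhost bindings:

import requests

# Root endpoints on the ports published in docker-compose.yml.
endpoints = {
    "Jupyter": "http://localhost:8888/",
    "Solr": "http://localhost:8983/solr/",
    "OpenSearch": "http://localhost:9200/",   # only reachable when the 'all' profile is running
}
for name, url in endpoints.items():
    try:
        print(f"{name:<11} {url} -> HTTP {requests.get(url, timeout=5).status_code}")
    except requests.RequestException as exc:
        print(f"{name:<11} {url} -> unreachable ({exc})")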
--------------------------------------------------------------------------------
/engines/Collection.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | import aips.environment as env
3 | import json
4 |
5 | class Collection(ABC):
6 | def __init__(self, name):
7 | self.name = name
8 |
9 | @abstractmethod
10 | def get_engine_name(self):
11 | "Returns the name of the search engine for the collection"
12 | pass
13 |
14 | @abstractmethod
15 | def commit(self):
16 | "Force the collection to commit all uncommited data into the collection"
17 | pass
18 |
19 | @abstractmethod
20 | def write(self, dataframe):
21 | "Writes a pyspark dataframe containing documents into the collection"
22 | pass
23 |
24 | @abstractmethod
25 | def add_documents(self, docs, commit=True):
26 | "Adds a collection of documents into the collection"
27 | pass
28 |
29 | @abstractmethod
30 | def transform_request(self, **search_args):
31 | "Transforms a generic search request into a native search request"
32 | pass
33 |
34 | @abstractmethod
35 | def transform_response(self, search_response):
36 | "Transform a native search response into a generic search response"
37 | pass
38 |
39 | @abstractmethod
40 | def native_search(self, request=None):
41 | "Executes a search against the search engine given a native search request"
42 | pass
43 |
44 | @abstractmethod
45 | def spell_check(self, query, log=False):
46 | "Execute a spellcheck against the collection"
47 | pass
48 |
49 | def search(self, **search_args):
50 | """
51 | Searches the collection
52 | :param str query: The main query for the search request
53 | :param str query_parser: The name of the query parser to use in the search
54 | :param list of str query_fields: the fields to query against
55 | :param list of str return_fields: the fields to return on each document
56 | :param list of tuple of str filters: A list of tuples (field, value) to filter the results by
57 | :param int limit: The number of results to return
58 | :param list of tuple of str order_by: A list of tuples (field, ASC/DESC) to order the results by
59 | :param str rerank_query: A query to rerank the results by
60 | :param str default_operator: Sets the default operator of the search query (AND/OR)
61 | :param str min_match: Specifies the minimum matching constraints for matching documents
62 | :param str query_boosts: A boost query to boost documents at query time
63 | :param tuple of str index_time_boosts: An index time boost
64 | :param boolean explain: Enables debugging on the request
65 | :param boolean log: Enables logging for the query
66 | :param boolean highlight: Returns results with highlight information (if supported)
67 | """
68 | request = self.transform_request(**search_args)
69 | if "log" in search_args or env.get("PRINT_REQUESTS", False):
70 | print(json.dumps(request, indent=2))
71 | search_response = self.native_search(request=request)
72 | if "log" in search_args:
73 | print(json.dumps(search_response, indent=2))
74 | return self.transform_response(search_response)
75 |
76 | def hybrid_search(self, searches=[], limit=None, algorithm="rrf", algorithm_params={}):
77 | hybrid_search_results = None
78 | match algorithm:
79 | case "rrf":
80 | search_results = [self.search(**request)["docs"]
81 | for request in searches]
82 |
83 | hybrid_search_scores = reciprocal_rank_fusion(search_results,
84 | algorithm_params.get("k"))
85 | scored_docs = merge_search_results(search_results,
86 | hybrid_search_scores)
87 | return {"docs": scored_docs[:limit]}
88 | case "lexical_vector_rerank":
89 | lexical_search_request = searches[0]
90 | searches[1]["k"] = algorithm_params.get("k", 10) #TODO: should probably default to "limit" instead of 10
91 | lexical_search_request["rerank_query"] = searches[1]
92 | return self.search(**lexical_search_request)
93 | return hybrid_search_results
94 |
95 | def merge_search_results(search_results, scores):
96 | merged_results = {}
97 | for results in search_results:
98 | for doc in results:
99 | if doc["id"] in merged_results:
100 | merged_results[doc["id"]] = {**doc, **merged_results[doc["id"]]}
101 | else:
102 | merged_results[doc["id"]] = doc
103 | return [{**merged_results[id], "score": score}
104 | for id, score in scores.items()]
105 |
106 |
107 | def reciprocal_rank_fusion(search_results, k=None):
108 | if k is None: k = 60
109 | scores = {}
110 | for ranked_docs in search_results:
111 | for rank, doc in enumerate(ranked_docs, 1):
112 | scores[doc["id"]] = scores.get(doc["id"], 0) + (1.0 / (k + rank))
113 | sorted_scores = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))
114 | return sorted_scores
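The fusion helpers above fit together as follows. A minimal usage sketch with hypothetical doc ids, assuming it runs in this module so reciprocal_rank_fusion and merge_search_results are in scope:

lexical_docs = [{"id": "doc1", "name": "a"}, {"id": "doc2", "name": "b"}]
vector_docs = [{"id": "doc2", "name": "b"}, {"id": "doc3", "name": "c"}]
scores = reciprocal_rank_fusion([lexical_docs, vector_docs], k=60)
# doc2 ranks 2nd and 1st: 1/(60+2) + 1/(60+1) ~= 0.0325, the highest fused score
# doc1 ranks 1st once: 1/(60+1) ~= 0.0164; doc3 ranks 2nd once: 1/(60+2) ~= 0.0161
fused = merge_search_results([lexical_docs, vector_docs], scores)
# fused: the merged docs with a "score" field, ordered doc2, doc1, doc3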
--------------------------------------------------------------------------------
/engines/Engine.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 |
3 | class Engine(ABC):
4 | def __init__(self, name):
5 | self.name = name
6 |
7 | @abstractmethod
8 | def health_check(self):
9 | "Checks the state of the search engine returning a boolean"
10 | pass
11 |
12 | @abstractmethod
13 | def print_status(self, response):
14 | "Prints the resulting status of a search engine request"
15 | pass
16 |
17 | @abstractmethod
18 | def create_collection(self, name):
19 | "Create and initialize the schema for a collection, returns the initialized collection"
20 | pass
21 |
22 | @abstractmethod
23 | def get_collection(self, name):
24 | "Returns initialized object for a given collection"
25 | pass
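A minimal sketch of what implementing this interface looks like; MockEngine and its dictionary stand-in for a collection are hypothetical, while a real engine (such as the OpenSearchEngine later in this repo) returns a Collection implementation:

class MockEngine(Engine):
    def __init__(self):
        super().__init__("Mock")

    def health_check(self):
        return True  # a real engine pings the server here

    def print_status(self, response):
        print(response)

    def create_collection(self, name):
        return self.get_collection(name)  # a real engine also creates the schema

    def get_collection(self, name):
        return {"name": name}  # a real engine returns a Collection object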
--------------------------------------------------------------------------------
/engines/EntityExtractor.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 |
3 | class EntityExtractor(ABC):
4 | def __init__(self, collection):
5 | "The collection containing entities"
6 | self.collection = collection
7 |
8 | @abstractmethod
9 | def extract_entities(self, query):
10 | "Returns extracted entities and tag data for a given query"
11 | pass
--------------------------------------------------------------------------------
/engines/LTR.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 |
3 | class LTR(ABC):
4 | def __init__(self, collection):
5 | self.collection = collection
6 |
7 | @abstractmethod
8 | def enable_ltr(self, log=False):
9 | "Initializes LTR dependencies for a given collection"
10 | pass
11 |
12 | @abstractmethod
13 | def generate_feature(self, feature_name, params, feature_type):
14 | "Generates an LTR feature definition."
15 | pass
16 |
17 | @abstractmethod
18 | def generate_query_feature(self, feature_name, field_name, constant_score=False, value="(${keywords})"):
19 | "Generates an LTR query feature definition."
20 | pass
21 |
22 | @abstractmethod
23 | def generate_fuzzy_query_feature(self, feature_name, field_name):
24 | "Generates an LTR fuzzy query feature definition."
25 | pass
26 |
27 | @abstractmethod
28 | def generate_bigram_query_feature(self, feature_name, field_name):
29 | "Generates an LTR bigram query feature definition."
30 | pass
31 |
32 | @abstractmethod
33 | def generate_field_value_feature(self, feature_name, field_name):
34 | "Generates an LTR field value feature definition."
35 | pass
36 |
37 | @abstractmethod
38 | def generate_field_length_feature(self, feature_name, field_name):
39 | "Generates an LTR field length feature definition."
40 | pass
41 |
42 | @abstractmethod
43 | def generate_model(self, model_name, feature_names, means, std_devs, weights):
44 | "Generate a model definition."
45 | pass
46 |
47 | @abstractmethod
48 | def delete_feature_store(self, name, log=False):
49 | "Deletes the feature store of the given name."
50 | pass
51 |
52 | @abstractmethod
53 | def upload_features(self, features, model_name, log=False):
54 | "Uploads features into the engine with a given name"
55 | pass
56 |
57 | @abstractmethod
58 | def delete_model(self, model_name, log=False):
59 | "Deletes the model from the engine."
60 | pass
61 |
62 | @abstractmethod
63 | def upload_model(self, model, log=False):
64 | "Upload a model to the engine."
65 | pass
66 |
67 | @abstractmethod
68 | def upsert_model(self, model, log=False):
69 | "Deletes and uploads a model to the engine."
70 | pass
71 |
72 | @abstractmethod
73 | def get_explore_candidate(self, query, explore_vector, feature_config, log=False):
74 | "Generates a exploration search request with the given criteria."
75 | pass
76 |
77 | @abstractmethod
78 | def get_logged_features(self, model_name, doc_ids, options={},
79 | id_field="id", fields=None, log=False):
80 | "Deletes the model from the engine."
81 | pass
82 |
83 | @abstractmethod
84 | def search_with_model(self, model_name, **search_args):
85 | """Search a collection using an uploaded model.
86 | See engines.Collection.search() for information on parameters"""
87 | pass
88 |
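A sketch of the call order this interface implies, assuming an already-created `collection` object and using SolrLTR (exported from engines/solr/__init__.py) as the concrete implementation; the feature names, model name, statistics, weights, and query are hypothetical:

from engines.solr import SolrLTR

ltr = SolrLTR(collection)  # `collection` comes from Engine.create_collection / get_collection
ltr.enable_ltr()           # initialize LTR support for the collection
features = [ltr.generate_query_feature("title_match", "title"),
            ltr.generate_field_value_feature("popularity", "popularity")]
ltr.delete_feature_store("movies_model", log=True)
ltr.upload_features(features, "movies_model")
model = ltr.generate_model("movies_model", ["title_match", "popularity"],
                           means=[0.0, 0.0], std_devs=[1.0, 1.0], weights=[0.6, 0.4])
ltr.upsert_model(model)    # delete any previous version, then upload
response = ltr.search_with_model("movies_model", query="harry potter")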
--------------------------------------------------------------------------------
/engines/SemanticKnowledgeGraph.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 |
3 | class SemanticKnowledgeGraph(ABC):
4 | def __init__(self, collection):
5 | self.collection = collection
6 |
7 | @abstractmethod
8 | def traverse(self, *nodes):
9 | "Traverses a semantic knowledge through each request node"
10 | pass
11 |
12 | @abstractmethod
13 | def transform_request(self, *nodes):
14 | """
15 | Generates a semantic knowledge graph request from a list of nodes, or multi-nodes
16 | A node can contain the following params: `name`, `values`, `field`, `min_occurrences` and `limit`.
17 | :param str name: An optional name for the node. If not provided, a default will be assigned
18 | :param list of str values: If values are present, this node represents a query per value.
19 | Otherwise, this node discovers terms from the given field.
20 | :param str field: The field to query against or discover values from.
21 | :param int min_occurrences: The minimum number of times a term must occur within
22 | the knowledge base to qualify for discovery.
23 | :param int limit: The limit on the number of terms to discover
24 | """
25 | pass
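A sketch of a traversal using the node format described above, with the SolrSemanticKnowledgeGraph later in this repo as the concrete implementation; the field, value, and limit are hypothetical:

from engines.solr import SolrSemanticKnowledgeGraph

skg = SolrSemanticKnowledgeGraph(collection)  # `collection` is an existing SolrCollection
graph = skg.traverse(
    {"field": "content", "values": ["vibranium"]},  # query node: anchor the traversal on this term
    {"field": "content", "limit": 10})              # discovery node: the 10 most related terms
# the Solr implementation returns the nested traversals under graph["graph"],
# with a relatedness score for each discovered value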
--------------------------------------------------------------------------------
/engines/SparseSemanticSearch.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | class SparseSemanticSearch(ABC):
3 | def __init__(self):
4 | pass
5 |
6 | @abstractmethod
7 | def location_distance(self, query, position):
8 | "A semantic function to create a location distance query. Applies a transformed query node it to the query tree."
9 | pass
10 |
11 | @abstractmethod
12 | def popularity(self, query, position):
13 | "A semantic function to create a popularity query. Applies a transformed query node it to the query tree."
14 | pass
15 |
16 | @abstractmethod
17 | def transform_query(self, query_tree):
18 | "Transforms the query tree into an engine specific query tree"
19 | pass
20 |
21 | @abstractmethod
22 | def generate_basic_query(self, query):
23 | "Creates a basic engine specific search query"
24 | pass
--------------------------------------------------------------------------------
/engines/opensearch/OpenSearchEngine.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import requests
4 |
5 | from engines.Engine import Engine
6 | from engines.opensearch.config import OPENSEARCH_URL, SCHEMAS
7 | from engines.opensearch.OpenSearchCollection import OpenSearchCollection
8 |
9 | STATUS_URL = f"{OPENSEARCH_URL}/_cluster/health"
10 |
11 | class OpenSearchEngine(Engine):
12 | def __init__(self):
13 | super().__init__("OpenSearch")
14 |
15 | def health_check(self):
16 | status = requests.get(STATUS_URL).json()["status"] in ["green", "yellow"]
17 | if status:
18 | print("OpenSearch engine is online")
19 | return status
20 |
21 | def print_status(self, response):
22 | "Prints the resulting status of a search engine request"
23 | #print(json.dumps(response, indent=2))
24 | pass
25 |
26 | def create_collection(self, name, log=False):
27 | print(f'Wiping "{name}" collection')
28 | response = requests.delete(f"{OPENSEARCH_URL}/{name}").json()
29 |
30 | print(f'Creating "{name}" collection')
31 | collection = self.get_collection(name)
32 | request = SCHEMAS[name]["schema"] if name in SCHEMAS else {}
33 | response = requests.put(f"{OPENSEARCH_URL}/{name}", json=request).json()
34 | if log:
35 | print("Schema:", json.dumps(request, indent=2))
36 | if log:
37 | print("Status:", json.dumps(response, indent=2))
38 | return collection
39 |
40 | def get_collection(self, name):
41 | "Returns initialized object for a given collection"
42 | id_field = SCHEMAS.get(name, {}).get("id_field", "_id")
43 | return OpenSearchCollection(name, id_field)
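A usage sketch, assuming an OpenSearch instance is reachable at OPENSEARCH_URL; the collection name is hypothetical:

from engines.opensearch.OpenSearchEngine import OpenSearchEngine

engine = OpenSearchEngine()
if engine.health_check():                                       # cluster status is green or yellow
    products = engine.create_collection("products", log=True)   # wipes and re-creates the index
    # products is an OpenSearchCollection ready for indexing and searching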
--------------------------------------------------------------------------------
/engines/opensearch/OpenSearchSparseSemanticSearch.py:
--------------------------------------------------------------------------------
1 | from engines.SparseSemanticSearch import SparseSemanticSearch
2 |
3 | def escape_quotes(text):
4 | return text.replace('"', '\\"')
5 |
6 | class OpenSearchSparseSemanticSearch(SparseSemanticSearch):
7 | def __init__(self):
8 | pass
9 |
10 | def location_distance(self, query, position):
11 | if len(query["query_tree"]) -1 > position:
12 | next_entity = query["query_tree"][position + 1]
13 | if next_entity["type"] == "city":
14 | query["query_tree"].pop(position + 1)
15 | query["query_tree"][position] = {
16 | "type": "transformed",
17 | "syntax": "opensearch",
18 | "query": self.create_geo_filter(next_entity["location_coordinates"],
19 | "location_coordinates", 50)}
20 | return True
21 | return False
22 |
23 | def create_geo_filter(self, coordinates, field, distance_KM):
24 | return {"geo_distance": {"distance": f"{distance_KM}km",
25 | field: {"lat": coordinates.split(",")[0],
26 | "lon": coordinates.split(",")[1]}}}
27 |
28 | def popularity(self, query, position):
29 | if len(query["query_tree"]) -1 > position:
30 | query["query_tree"][position] = {
31 | "type": "transformed",
32 | "syntax": "opensearch",
33 | "query": {"function_score": {"field_value_factor": {
34 | "field": "stars_rating",
35 | "factor": 1.5,
36 | "missing": 0}}}}
37 | return True
38 | return False
39 |
40 | def transform_query(self, query_tree):
41 | for i, item in enumerate(query_tree):
42 | match item["type"]:
43 | case "transformed":
44 | continue
45 | case "skg_enriched":
46 | enrichments = item["enrichments"]
47 | if "term_vector" in enrichments:
48 | query_string = enrichments["term_vector"]
49 | if "category" in enrichments:
50 | query_string += f' +doc_type:"{enrichments["category"]}"'
51 | transformed_query = '"' + escape_quotes(item["surface_form"]) + '"'
52 | else:
53 | continue
54 | case "color":
55 | transformed_query = f'+colors:"{item["canonical_form"]}"'
56 | case "known_item" | "event":
57 | transformed_query = f'+name:"{item["canonical_form"]}"'
58 | case "city":
59 | transformed_query = f'+city:"{item["canonical_form"]}"'
60 | case "brand":
61 | transformed_query = f'+brand:"{item["canonical_form"]}"'
62 | case _:
63 | transformed_query = '"' + escape_quotes(item["surface_form"]) + '"'
64 | query_tree[i] = {"type": "transformed",
65 | "syntax": "opensearch",
66 | "query": transformed_query}
67 | return query_tree
68 |
69 | def generate_basic_query(self, query):
70 | return '"' + escape_quotes(query) + '"'
--------------------------------------------------------------------------------
/engines/opensearch/build/engine-Dockerfile:
--------------------------------------------------------------------------------
1 | # Copyright OpenSearch Contributors
2 | # SPDX-License-Identifier: Apache-2.0
3 | FROM amazonlinux:2 AS linux_stage_0
4 |
5 | ENV OS_VERSION=2.14.0
6 | ARG OPENSEARCH_HOME=/usr/share/opensearch
7 | ARG UID=1000
8 | ARG GID=1000
9 | ARG BUILDARCH
10 |
11 | RUN yum update -y && yum install -y tar gzip shadow-utils which && yum clean all
12 | RUN mkdir /tmp/build/
13 | RUN mkdir $OPENSEARCH_HOME
14 | RUN groupadd -g $GID os_group && adduser -g $GID -u $UID -b $OPENSEARCH_HOME opensearch
15 |
16 | WORKDIR /tmp/build/
17 | RUN pwd
18 | ENV DISTRO_CACHE_DIR=/var/cache/pip
19 | RUN mkdir -p $DISTRO_CACHE_DIR
20 | RUN set -eux && BUILDARCH=$(echo $BUILDARCH | sed 's/amd64/x64/' -) && \
21 | OS_DISTRO_FILE=opensearch-${OS_VERSION}-linux-${BUILDARCH}.tar.gz && \
22 | OS_DISTRO_URL=https://artifacts.opensearch.org/releases/bundle/opensearch/${OS_VERSION}/${OS_DISTRO_FILE} && \
23 | curl -O $OS_DISTRO_URL -O $OS_DISTRO_URL.sig && \
24 | curl https://artifacts.opensearch.org/publickeys/opensearch.pgp | gpg --import && \
25 | gpg --verify $OS_DISTRO_FILE.sig $OS_DISTRO_FILE && \
26 | tar --warning=no-timestamp -zxf /tmp/build/$OS_DISTRO_FILE -C $OPENSEARCH_HOME --strip-components=1 && \
27 | install -d -m 750 -o $UID -g $GID $OPENSEARCH_HOME/data/
28 |
29 | ########################### Stage 1 ########################
30 | # Copy working directory to the actual release docker images
31 | FROM amazonlinux:2
32 |
33 | ENV OS_VERSION=2.14.0
34 | ARG OPENSEARCH_HOME=/usr/share/opensearch
35 | ARG UID=1000
36 | ARG GID=1000
37 | ARG BUILDARCH
38 |
39 | RUN yum update -y && yum install -y tar gzip shadow-utils which && yum clean all
40 |
41 | WORKDIR $OPENSEARCH_HOME
42 | COPY --from=linux_stage_0 $OPENSEARCH_HOME $OPENSEARCH_HOME
43 | RUN echo "export JAVA_HOME=$OPENSEARCH_HOME/jdk" >> /etc/profile.d/java_home.sh && \
44 | echo "export PATH=\$PATH:\$JAVA_HOME/bin" >> /etc/profile.d/java_home.sh
45 | ENV JAVA_HOME=$OPENSEARCH_HOME/jdk
46 | ENV PATH=$PATH:$JAVA_HOME/bin:$OPENSEARCH_HOME/bin
47 | ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$OPENSEARCH_HOME/plugins/opensearch-knn/lib"
48 |
49 | USER $UID
50 |
51 | ARG DISABLE_INSTALL_DEMO_CONFIG=true
52 | ARG DISABLE_SECURITY_PLUGIN=true
53 | ARG UBI_VERSION=v0.0.12.1-os2.14.0
54 | ARG UBI_PLUGIN_FILE=opensearch-ubi-plugin-$UBI_VERSION.zip
55 | ARG UBI_PLUGIN_URL=https://github.com/o19s/opensearch-ubi/releases/download/release-$UBI_VERSION/$UBI_PLUGIN_FILE
56 | RUN bin/opensearch-plugin install $UBI_PLUGIN_URL --batch
57 |
58 | ARG LTR_PLUGIN_FILE=ltr-2.14.0-os2.14.0.zip
59 | COPY $LTR_PLUGIN_FILE .
60 | RUN bin/opensearch-plugin install file://$OPENSEARCH_HOME/$LTR_PLUGIN_FILE --batch
61 |
62 | COPY *.properties $OPENSEARCH_HOME/config/
63 | ARG ANALYZER_DIR=$OPENSEARCH_HOME/config/opensearch-performance-analyzer/
64 | RUN [ ! -d $ANALYZER_DIR ] || mv $OPENSEARCH_HOME/config/performance-analyzer.properties $ANALYZER_DIR
65 | COPY --chown=$UID:$UID --chmod=0770 *.sh $OPENSEARCH_HOME/
66 | RUN chown -R $UID:$GID $OPENSEARCH_HOME/data/
67 | RUN chmod 0770 *.sh
68 | RUN ./opensearch-onetime-setup.sh
69 |
70 | EXPOSE 9200 9300 9600 9650
71 |
72 | LABEL org.label-schema.schema-version="1.0" \
73 | org.label-schema.name="opensearch" \
74 | org.label-schema.version="$OS_VERSION" \
75 | org.label-schema.url="https://opensearch.org" \
76 | org.label-schema.vcs-url="https://github.com/opensearch" \
77 | org.label-schema.license="Apache-2.0" \
78 | org.label-schema.vendor="OpenSearch"
79 |
80 | ENTRYPOINT ["./opensearch-docker-entrypoint.sh"]
81 | CMD ["opensearch"]
82 |
--------------------------------------------------------------------------------
/engines/opensearch/build/log4j2.properties:
--------------------------------------------------------------------------------
1 | status = error
2 |
3 | appender.console.type = Console
4 | appender.console.name = console
5 | appender.console.layout.type = PatternLayout
6 | appender.console.layout.pattern = [%d{ISO8601}][%-5p][%-25c{1.}] [%node_name]%marker %m%n
7 |
8 | rootLogger.level = info
9 | rootLogger.appenderRef.console.ref = console
10 |
--------------------------------------------------------------------------------
/engines/opensearch/build/ltr-2.14.0-os2.14.0.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/engines/opensearch/build/ltr-2.14.0-os2.14.0.zip
--------------------------------------------------------------------------------
/engines/opensearch/build/opensearch-docker-entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright OpenSearch Contributors
4 | # SPDX-License-Identifier: Apache-2.0
5 |
6 | # This script specifies the entrypoint startup actions for opensearch
7 | # It will start both opensearch and the performance analyzer plugin cli
8 | # If either process fails, the entire docker container will be removed
9 | # in favor of a newly started container
10 |
11 | # Export OpenSearch Home
12 | export OPENSEARCH_HOME=/usr/share/opensearch
13 | export OPENSEARCH_PATH_CONF=$OPENSEARCH_HOME/config
14 |
15 | # The virtual file /proc/self/cgroup should list the current cgroup
16 | # membership. For each hierarchy, you can follow the cgroup path from
17 | # this file to the cgroup filesystem (usually /sys/fs/cgroup/) and
18 | # introspect the statistics for the cgroup for the given
19 | # hierarchy. Alas, Docker breaks this by mounting the container
20 | # statistics at the root while leaving the cgroup paths as the actual
21 | # paths. Therefore, OpenSearch provides a mechanism to override
22 | # reading the cgroup path from /proc/self/cgroup and instead uses the
23 | # cgroup path defined the JVM system property
24 | # opensearch.cgroups.hierarchy.override. Therefore, we set this value here so
25 | # that cgroup statistics are available for the container this process
26 | # will run in.
27 | export OPENSEARCH_JAVA_OPTS="-Dopensearch.cgroups.hierarchy.override=/ $OPENSEARCH_JAVA_OPTS"
28 |
29 | # Security Plugin
30 | function setupSecurityPlugin {
31 | SECURITY_PLUGIN="opensearch-security"
32 |
33 | if [ -d "$OPENSEARCH_HOME/plugins/$SECURITY_PLUGIN" ]; then
34 | if [ "$DISABLE_INSTALL_DEMO_CONFIG" = "true" ]; then
35 | echo "Disabling execution of install_demo_configuration.sh for OpenSearch Security Plugin"
36 | else
37 | echo "Enabling execution of install_demo_configuration.sh for OpenSearch Security Plugin"
38 | bash $OPENSEARCH_HOME/plugins/$SECURITY_PLUGIN/tools/install_demo_configuration.sh -y -i -s
39 | fi
40 |
41 | if [ "$DISABLE_SECURITY_PLUGIN" = "true" ]; then
42 | echo "Disabling OpenSearch Security Plugin"
43 | opensearch_opt="-Eplugins.security.disabled=true"
44 | opensearch_opts+=("${opensearch_opt}")
45 | else
46 | echo "Enabling OpenSearch Security Plugin"
47 | fi
48 | else
49 | echo "OpenSearch Security Plugin does not exist, disable by default"
50 | fi
51 | }
52 |
53 | # Performance Analyzer Plugin
54 | function setupPerformanceAnalyzerPlugin {
55 | PERFORMANCE_ANALYZER_PLUGIN="opensearch-performance-analyzer"
56 | if [ -d "$OPENSEARCH_HOME/plugins/$PERFORMANCE_ANALYZER_PLUGIN" ]; then
57 | if [ "$DISABLE_PERFORMANCE_ANALYZER_AGENT_CLI" = "true" ]; then
58 | echo "Disabling execution of $OPENSEARCH_HOME/bin/$PERFORMANCE_ANALYZER_PLUGIN/performance-analyzer-agent-cli for OpenSearch Performance Analyzer Plugin"
59 | else
60 | echo "Enabling execution of OPENSEARCH_HOME/bin/$PERFORMANCE_ANALYZER_PLUGIN/performance-analyzer-agent-cli for OpenSearch Performance Analyzer Plugin"
61 | $OPENSEARCH_HOME/bin/opensearch-performance-analyzer/performance-analyzer-agent-cli > $OPENSEARCH_HOME/logs/performance-analyzer.log 2>&1 &
62 | fi
63 | else
64 | echo "OpenSearch Performance Analyzer Plugin does not exist, disable by default"
65 | fi
66 | }
67 |
68 | # Start up the opensearch and performance analyzer agent processes.
69 | # When either of them halts, this script exits, or we receive a SIGTERM or SIGINT signal then we want to kill both these processes.
70 | function runOpensearch {
71 | # Files created by OpenSearch should always be group writable too
72 | umask 0002
73 |
74 | if [[ "$(id -u)" == "0" ]]; then
75 | echo "OpenSearch cannot run as root. Please start your container as another user."
76 | exit 1
77 | fi
78 |
79 | # Parse Docker env vars to customize OpenSearch
80 | #
81 | # e.g. Setting the env var cluster.name=testcluster
82 | # will cause OpenSearch to be invoked with -Ecluster.name=testcluster
83 | opensearch_opts=()
84 | while IFS='=' read -r envvar_key envvar_value
85 | do
86 | # OpenSearch settings need to have at least two dot separated lowercase
87 | # words, e.g. `cluster.name`, except for `processors` which we handle
88 | # specially
89 | if [[ "$envvar_key" =~ ^[a-z0-9_]+\.[a-z0-9_]+ || "$envvar_key" == "processors" ]]; then
90 | if [[ ! -z $envvar_value ]]; then
91 | opensearch_opt="-E${envvar_key}=${envvar_value}"
92 | opensearch_opts+=("${opensearch_opt}")
93 | fi
94 | fi
95 | done < <(env)
96 |
97 | setupSecurityPlugin
98 | setupPerformanceAnalyzerPlugin
99 |
100 | # Start opensearch
101 | "$@" "${opensearch_opts[@]}"
102 |
103 | }
104 |
105 | # Prepend "opensearch" command if no argument was provided or if the first
106 | # argument looks like a flag (i.e. starts with a dash).
107 | if [ $# -eq 0 ] || [ "${1:0:1}" = '-' ]; then
108 | set -- opensearch "$@"
109 | fi
110 |
111 | if [ "$1" = "opensearch" ]; then
112 | # If the first argument is opensearch, then run the setup script.
113 | runOpensearch "$@"
114 | else
115 | # Otherwise, just exec the command.
116 | exec "$@"
117 | fi
118 |
--------------------------------------------------------------------------------
/engines/opensearch/build/opensearch-onetime-setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # The OpenSearch Contributors require contributions made to
6 | # this file be licensed under the Apache-2.0 license or a
7 | # compatible open source license.
8 |
9 | # This script performs one-time setup for the OpenSearch tarball distribution.
10 | # It installs a demo security config and sets up the performance analyzer
11 |
12 | export OPENSEARCH_HOME=`dirname $(realpath $0)`
13 | export OPENSEARCH_PATH_CONF=$OPENSEARCH_HOME/config
14 | cd $OPENSEARCH_HOME
15 |
16 | ##Security Plugin
17 | SECURITY_PLUGIN="opensearch-security"
18 | if [ -d "$OPENSEARCH_HOME/plugins/$SECURITY_PLUGIN" ]; then
19 | if [ "$DISABLE_INSTALL_DEMO_CONFIG" = "true" ]; then
20 | echo "Disabling execution of install_demo_configuration.sh for OpenSearch Security Plugin"
21 | else
22 | echo "Enabling execution of install_demo_configuration.sh for OpenSearch Security Plugin"
23 | bash $OPENSEARCH_HOME/plugins/$SECURITY_PLUGIN/tools/install_demo_configuration.sh -y -i -s
24 | fi
25 |
26 | if [ "$DISABLE_SECURITY_PLUGIN" = "true" ]; then
27 | echo "Disabling OpenSearch Security Plugin"
28 | sed -i '/plugins.security.disabled/d' $OPENSEARCH_PATH_CONF/opensearch.yml
29 | echo "plugins.security.disabled: true" >> $OPENSEARCH_PATH_CONF/opensearch.yml
30 | else
31 | echo "Enabling OpenSearch Security Plugin"
32 | sed -i '/plugins.security.disabled/d' $OPENSEARCH_PATH_CONF/opensearch.yml
33 | fi
34 | fi
35 |
36 | ##Perf Plugin
37 | PA_PLUGIN="opensearch-performance-analyzer"
38 |
39 | if ! grep -q '## OpenDistro Performance Analyzer' $OPENSEARCH_PATH_CONF/jvm.options; then
40 | CLK_TCK=`/usr/bin/getconf CLK_TCK`
41 | echo >> $OPENSEARCH_PATH_CONF/jvm.options
42 | echo '## OpenDistro Performance Analyzer' >> $OPENSEARCH_PATH_CONF/jvm.options
43 | echo "-Dclk.tck=$CLK_TCK" >> $OPENSEARCH_PATH_CONF/jvm.options
44 | echo "-Djdk.attach.allowAttachSelf=true" >> $OPENSEARCH_PATH_CONF/jvm.options
45 | echo "-Djava.security.policy=$OPENSEARCH_PATH_CONF/$PA_PLUGIN/opensearch_security.policy" >> $OPENSEARCH_PATH_CONF/jvm.options
46 | echo "--add-opens=jdk.attach/sun.tools.attach=ALL-UNNAMED" >> $OPENSEARCH_PATH_CONF/jvm.options
47 | fi
48 |
49 |
50 |
--------------------------------------------------------------------------------
/engines/opensearch/build/performance-analyzer.properties:
--------------------------------------------------------------------------------
1 | # ======================== OpenSearch performance analyzer plugin config =========================
2 |
3 | # NOTE: this is an example for Linux. Please modify the config accordingly if you are using it under other OS.
4 |
5 | # Metrics data location
6 | metrics-location = /dev/shm/performanceanalyzer/
7 |
8 | # Metrics deletion interval (minutes) for metrics data.
9 | # Interval should be between 1 to 60.
10 | metrics-deletion-interval = 1
11 |
12 | # If set to true, the system cleans up the files behind it. So at any point, we should expect only 2
13 | # metrics-db-file-prefix-path files. If set to false, no files are cleaned up. This can be useful, if you are archiving
14 | # the files and wouldn't like for them to be cleaned up.
15 | cleanup-metrics-db-files = true
16 |
17 | # WebService exposed by App's port
18 | webservice-listener-port = 9600
19 |
20 | # Port for RPC Communication
21 | rpc-port = 9650
22 |
23 | # Metric DB File Prefix Path location
24 | metrics-db-file-prefix-path = /tmp/metricsdb_
25 |
26 | https-enabled = false
27 |
28 | # Setup the correct path for server certificates
29 | certificate-file-path = none
30 | private-key-file-path = none
31 | #trusted-cas-file-path = none
32 |
33 | # Setup the correct path for client certificates (by default, the client will just use the server certificates)
34 | #client-certificate-file-path = specify_path
35 | #client-private-key-file-path = specify_path
36 | #client-trusted-cas-file-path = specify_path
37 |
38 | # WebService bind host; default only to local interface
39 | webservice-bind-host = 0.0.0.0
40 |
41 | # Plugin Stats Metadata file name, expected to be in the same location
42 | plugin-stats-metadata = plugin-stats-metadata
43 |
44 | # Agent Stats Metadata file name, expected to be in the same location
45 | agent-stats-metadata = agent-stats-metadata
46 |
47 |
48 |
49 |
--------------------------------------------------------------------------------
/engines/solr/SolrEntityExtractor.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from engines.solr.config import SOLR_URL
3 | from engines.solr.SolrCollection import SolrCollection
4 | from engines.EntityExtractor import EntityExtractor
5 |
6 | def transform_response(query, response):
7 | return {"query": query,
8 | "tags": response["tags"],
9 | "entities": response["response"]["docs"]}
10 |
11 | class SolrEntityExtractor(EntityExtractor):
12 | def __init__(self, collection):
13 | if not isinstance(collection, SolrCollection):
14 | raise TypeError("Only supports a SolrCollection")
15 | super().__init__(collection)
16 |
17 | def extract_entities(self, query):
18 | response = requests.post(f"{SOLR_URL}/{self.collection.name}/tag", query).json()
19 | return transform_response(query, response)
--------------------------------------------------------------------------------
/engines/solr/SolrSemanticKnowledgeGraph.py:
--------------------------------------------------------------------------------
1 | from engines.SemanticKnowledgeGraph import SemanticKnowledgeGraph
2 | from engines.solr.SolrCollection import SolrCollection
3 |
4 | def generate_request_root():
5 | return {
6 | "limit": 0,
7 | "params": {
8 | "q": "*:*",
9 | "fore": "{!${defType} v=$q}",
10 | "back": "*:*",
11 | "defType": "edismax"
12 | },
13 | "facet": {}
14 | }
15 |
16 | def generate_facets(name=None, values=None, field=None,
17 | min_occurrences=None, limit=None,
18 | min_popularity=None, default_operator="AND"):
19 | base_facet = {"type": "query" if values else "terms",
20 | "limit": limit if limit else 10,
21 | "sort": {"relatedness": "desc" },
22 | "facet": {
23 | "relatedness": {
24 | "type": "func",
25 | "func": "relatedness($fore,$back)"}}}
26 | if min_occurrences:
27 | base_facet["mincount"] = min_occurrences
28 | if min_popularity:
29 | base_facet["facet"]["relatedness"]["min_popularity"] = min_popularity
30 | if field:
31 | base_facet["field"] = field
32 | facets = []
33 | if values:
34 | if min_occurrences: base_facet.pop("mincount")
35 | if not limit: base_facet.pop("limit")
36 | for i, _ in enumerate(values):
37 | facets.append(base_facet.copy())
38 | facets[i]["query"] = "{" + f'!edismax q.op={default_operator} qf={field} v=${name}_{i}_query' + "}"
39 | else:
40 | facets = [base_facet]
41 | return facets
42 |
43 | def default_node_name(i, j):
44 | return "f" + str(i) + (f"_{j}" if j else "")
45 |
46 | def validate_skg_request_input(multi_node):
47 | if isinstance(multi_node, list):
48 | list(map(validate_skg_request_input, multi_node))  # force evaluation; a bare map() is lazy
49 | node_names = [node["name"] for node in multi_node if "name" in node]
50 | if len(node_names) != len(set(node_names)):
51 | raise ValueError("Node names must be distinct on a given level.")
52 | elif "field" not in multi_node:
53 | raise ValueError("'field' must be provided")
54 |
55 | def transform_request(*multi_nodes):
56 | """Generates a faceted Solr SKG request from a set of multi-nodes.
57 | A multi-node can be a single node or a collection of nodes.
58 | A node can contain the following params: `name`, `values`, `field`, `min_occurrences` and `limit`.
59 | :param str name: An optional name for the node. If not provided, a default will be assigned
60 | :param list of str values: If empty or absent, a terms facet is used. Otherwise a query facet per value is used
61 | :param str field: The field to query against or discover values from.
62 | :param int min_occurrences: The mincount on the facet.
63 | :param int limit: The limit on the facet.
64 | Each subsequent node is applied as a nested facet to all parent facets."""
65 | list(map(validate_skg_request_input, multi_nodes))  # force evaluation; a bare map() would skip validation
66 | request = generate_request_root()
67 | parent_nodes = [request]
68 | for i, multi_node in enumerate(multi_nodes):
69 | current_facets = []
70 | if isinstance(multi_node, dict):
71 | multi_node = [multi_node]
72 | for j, node in enumerate(multi_node):
73 | if "name" not in node:
74 | node["name"] = default_node_name(i, j)
75 | facets = generate_facets(**node)
76 | current_facets.extend(facets)
77 | for i, parent_node in enumerate(parent_nodes):
78 | for j, facet in enumerate(facets):
79 | parent_node["facet"][f'{node["name"]}_{j}'] = facet
80 | if "values" in node:
81 | for i, value in enumerate(node["values"]):
82 | request["params"][f'{node["name"]}_{i}_query'] = value
83 | parent_nodes = current_facets
84 | return request
85 |
86 | def transform_node(node, response_params):
87 | relatedness = node["relatedness"]["relatedness"] if node["count"] > 0 else 0.0
88 | value_node = {"relatedness": relatedness}
89 | sub_traversals = transform_response_facet(node, response_params)
90 | if sub_traversals:
91 | value_node["traversals"] = sub_traversals
92 | return value_node
93 |
94 | def transform_response_facet(node, response_params):
95 | ignored_keys = ["count", "relatedness", "val"]
96 | traversals = {}
97 | for full_name, data in node.items():
98 | if full_name in ignored_keys:
99 | continue
100 | name = full_name.removesuffix("_" + full_name.split("_")[-1])
101 | if name not in traversals:
102 | traversals[name] = {"name": name, "values": {}}
103 | if "buckets" in data:
104 | values_node = {b["val"] : transform_node(b, response_params)
105 | for b in data["buckets"]}
106 | traversals[name]["values"] = values_node
107 | else:
108 | value_name = response_params[f"{full_name}_query"]
109 | traversals[name]["values"][value_name] = transform_node(data, response_params)
110 | for k in traversals.keys():
111 | traversals[k]["values"] = sort_by_relatedness_desc(traversals[k]["values"])
112 | return list(traversals.values())
113 |
114 | def sort_by_relatedness_desc(d):
115 | return {k: v for k, v in sorted(d.items(), key=lambda item: item[1]["relatedness"], reverse=True)}
116 |
117 | class SolrSemanticKnowledgeGraph(SemanticKnowledgeGraph):
118 | def __init__(self, collection):
119 | if not isinstance(collection, SolrCollection):
120 | raise TypeError("Only supports a SolrCollection")
121 | super().__init__(collection)
122 |
123 | def traverse(self, *multi_nodes):
124 | request = self.transform_request(*multi_nodes)
125 | response = self.collection.native_search(request)
126 | return {"graph": transform_response_facet(response["facets"], request["params"])}
127 |
128 | def transform_request(self, *multi_nodes):
129 | return transform_request(*multi_nodes)
--------------------------------------------------------------------------------
/engines/solr/SolrSparseSemanticSearch.py:
--------------------------------------------------------------------------------
1 | from engines.SparseSemanticSearch import SparseSemanticSearch
2 |
3 | def escape_quotes(text):
4 | return text.replace('"', '\\"')
5 |
6 | class SolrSparseSemanticSearch(SparseSemanticSearch):
7 | def __init__(self):
8 | pass
9 |
10 | def location_distance(self, query, position):
11 | if len(query["query_tree"]) -1 > position:
12 | next_entity = query["query_tree"][position + 1]
13 | if next_entity["type"] == "city":
14 | query["query_tree"].pop(position + 1)
15 | query["query_tree"][position] = {
16 | "type": "transformed",
17 | "syntax": "solr",
18 | "query": self.create_geo_filter(next_entity['location_coordinates'],
19 | "location_coordinates", 50)}
20 | return True
21 | return False
22 |
23 | def create_geo_filter(self, coordinates, field, distance_KM):
24 | clause = f'!geofilt d={distance_KM} sfield="{field}" pt="{coordinates}"'
25 | return "+{" + clause + '}'
26 |
27 | def popularity(self, query, position):
28 | if len(query["query_tree"]) -1 > position:
29 | query["query_tree"][position] = {
30 | "type": "transformed",
31 | "syntax": "solr",
32 | "query": '+{!func v="mul(if(stars_rating,stars_rating,0),20)"}'}
33 | return True
34 | return False
35 |
36 | def transform_query(self, query_tree):
37 | for i, item in enumerate(query_tree):
38 | match item["type"]:
39 | case "transformed":
40 | continue
41 | case "skg_enriched":
42 | enrichments = item["enrichments"]
43 | if "term_vector" in enrichments:
44 | query_string = enrichments["term_vector"]
45 | if "category" in enrichments:
46 | query_string += f' +doc_type:"{enrichments["category"]}"'
47 | transformed_query = '+{!edismax v="' + escape_quotes(query_string) + '"}'
48 | else:
49 | continue
50 | case "color":
51 | transformed_query = f'+colors_s:"{item["canonical_form"]}"'
52 | case "known_item" | "event":
53 | transformed_query = f'+name_s:"{item["canonical_form"]}"'
54 | case "city":
55 | transformed_query = f'+city:"{str(item["canonical_form"])}"'
56 | case "brand":
57 | transformed_query = f'+brand_s:"{item["canonical_form"]}"'
58 | case _:
59 | transformed_query = "+{!edismax v=\"" + escape_quotes(item["surface_form"]) + "\"}"
60 | query_tree[i] = {"type": "transformed",
61 | "syntax": "solr",
62 | "query": transformed_query}
63 | return query_tree
64 |
65 | def generate_basic_query(self, query):
66 | return '+{!edismax mm=100% v="' + escape_quotes(query) + '"}'
--------------------------------------------------------------------------------
/engines/solr/__init__.py:
--------------------------------------------------------------------------------
1 | from .SolrLTR import SolrLTR
2 | from .SolrSemanticKnowledgeGraph import SolrSemanticKnowledgeGraph
3 | from .SolrEntityExtractor import SolrEntityExtractor
4 | from .SolrSparseSemanticSearch import SolrSparseSemanticSearch
--------------------------------------------------------------------------------
/engines/solr/build/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM solr:9.4.1
2 |
3 | USER root
4 |
5 | ADD solr.xml ./server/solr/solr.xml
6 | ADD run_solr_w_ltr.sh ./run_solr_w_ltr.sh
7 | RUN chown solr:solr run_solr_w_ltr.sh
8 | RUN sed -i -e 's/\r$//' run_solr_w_ltr.sh
9 | RUN chmod u+x run_solr_w_ltr.sh
10 |
11 | ADD log4j2-config.xml ./log4j2-config.xml
12 |
13 | USER solr
14 |
15 | ENTRYPOINT "./run_solr_w_ltr.sh"
16 |
--------------------------------------------------------------------------------
/engines/solr/build/log4j2-config.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/engines/solr/build/run_solr_w_ltr.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | mkdir -p /var/solr/data/
3 |
4 | SOLR_MODULES=ltr
5 | solr-foreground -Dsolr.modules=ltr -Dsolr.ltr.enabled=true -Dlog4j2.configurationFile=/opt/solr-9.4.1/log4j2-config.xml
--------------------------------------------------------------------------------
/engines/solr/build/solr.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | ${solr.max.booleanClauses:1024}
4 | ${solr.sharedLib:}
5 | ${solr.modules:}
6 | ${solr.allowPaths:}
7 | ${solr.allowUrls:}
8 | ${solr.hideStackTrace:false}
9 |
10 |
11 |
12 | ${host:}
13 | ${solr.port.advertise:0}
14 | ${hostContext:solr}
15 |
16 | ${genericCoreNodeNames:true}
17 |
18 | ${zkClientTimeout:30000}
19 | ${distribUpdateSoTimeout:600000}
20 | ${distribUpdateConnTimeout:60000}
21 | ${zkCredentialsProvider:org.apache.solr.common.cloud.DefaultZkCredentialsProvider}
22 | ${zkACLProvider:org.apache.solr.common.cloud.DefaultZkACLProvider}
23 | ${zkCredentialsInjector:org.apache.solr.common.cloud.DefaultZkCredentialsInjector}
24 | ${distributedClusterStateUpdates:false}
25 | ${distributedCollectionConfigSetExecution:false}
26 | ${minStateByteLenForCompression:-1}
27 | ${stateCompressor:org.apache.solr.common.util.ZLibCompressor}
28 |
29 |
30 |
31 |
33 | ${socketTimeout:600000}
34 | ${connTimeout:60000}
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
52 |
53 |
54 | QUERY_DOC_FV
55 |
56 |
--------------------------------------------------------------------------------
/engines/solr/config.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | AIPS_SOLR_HOST = os.getenv("AIPS_SOLR_HOST") or "aips-solr"
4 | AIPS_SOLR_PORT = os.getenv("AIPS_SOLR_PORT") or "8983"
5 | SOLR_URL = f"http://{AIPS_SOLR_HOST}:{AIPS_SOLR_PORT}/solr"
6 | STATUS_URL = f"{SOLR_URL}/admin/zookeeper/status"
7 | SOLR_COLLECTIONS_URL = f"{SOLR_URL}/admin/collections"
--------------------------------------------------------------------------------
/ltr/__init__.py:
--------------------------------------------------------------------------------
1 | # Make the most important pieces available directly,
2 | # e.g. - from ltr import download
3 | from .download import download
4 | from .evaluate import evaluate, rre_table
5 | from .search import search
6 |
--------------------------------------------------------------------------------
/ltr/clickmodels/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/ltr/clickmodels/__init__.py
--------------------------------------------------------------------------------
/ltr/clickmodels/cascade.py:
--------------------------------------------------------------------------------
1 | from ltr.clickmodels.session import build
2 | from collections import Counter, defaultdict
3 |
4 | class Model():
5 | def __init__(self):
6 | # Attractiveness per query-doc
7 | self.attracts = defaultdict(lambda : 0.5)
8 |
9 | def cascade_model(sessions):
10 | """ Cascading model can be solved directly:
11 | - sessions with skips count against a doc
12 | - sessions with clicks count for
13 | - stop at first click
14 | """
15 | session_counts = Counter()
16 | click_counts = Counter()
17 | model=Model()
18 |
19 | for session in sessions:
20 | for rank, doc in enumerate(session.docs):
21 | query_doc_key = (session.query, doc.doc_id)
22 | session_counts[query_doc_key] += 1
23 |
24 | if doc.click:
25 | # The cascade model doesn't consider
26 | # clicks past the first one, so we count
27 | # this one and break out
28 | click_counts[query_doc_key] += 1
29 | break
30 |
31 | for (query_id, doc_id), count in session_counts.items():
32 | query_doc_key = (query_id, doc_id)
33 | model.attracts[query_doc_key] = click_counts[query_doc_key] / session_counts[query_doc_key]
34 | return model
35 |
36 |
37 |
38 | if __name__ == "__main__":
39 | sessions = build([
40 | ('A', ((1, True), (2, False), (3, True), (0, False))),
41 | ('B', ((5, False), (2, True), (3, True), (0, False))),
42 | ('A', ((1, False), (2, False), (3, True), (0, False))),
43 | ('B', ((1, False), (2, False), (3, False), (9, True))),
44 | ('A', ((9, False), (2, False), (1, True), (0, True))),
45 | ('B', ((6, True), (2, False), (3, True), (1, False))),
46 | ('A', ((7, False), (4, True), (1, False), (3, False))),
47 | ('B', ((8, True), (2, False), (3, True), (1, False))),
48 | ('A', ((1, False), (4, True), (2, False), (3, False))),
49 | ('B', ((7, True), (4, False), (5, True), (1, True))),
50 | ])
51 | cascade_model(sessions)
52 |
53 |
54 |
55 |
56 |
57 |
--------------------------------------------------------------------------------
/ltr/clickmodels/coec.py:
--------------------------------------------------------------------------------
1 | from collections import Counter
2 |
3 | class Model():
4 | def __init__(self):
5 | # COEC statistic
6 | self.coecs = Counter()
7 |
8 | # CTR for each query-doc pair in this session
9 | self.ctrs = {}
10 |
11 | def coec(ctr_by_rank, sessions):
12 | """ Clicks over expected clicks is a metric
13 | used for seeing what items get above or
14 | below average CTR for their rank. From the paper
15 |
16 | > Personalized Click Prediction in Sponsored Search
17 | by Cheng, Cantu Paz
18 |
19 | A COEC > 1 means above average CTR for its position
20 | A COEC < 1 means below average
21 |
22 | -ctr_by_rank is the global CTR at each rank position
23 | -sessions are an array of search session objects
24 |
25 | returned:
26 | the COEC for each query-doc pair in the provided sessions
27 |
28 | """
29 | clicks = Counter()
30 | weighted_impressions = Counter()
31 |
32 | for session in sessions:
33 | for rank, doc in enumerate(session.docs):
34 | weighted_impressions[(session.query, doc.doc_id)] += ctr_by_rank[rank]
35 | if doc.click:
36 | clicks[(session.query, doc.doc_id)] += 1
37 |
38 | model = Model()
39 | for query_id, doc_id in weighted_impressions:
40 | model.coecs[(query_id,doc_id)] = \
41 | clicks[(query_id,doc_id)] / weighted_impressions[(query_id,doc_id)]
42 |
43 | return model
44 |
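A usage sketch with hypothetical per-rank CTRs and two tiny sessions, assuming it runs in this module so coec is in scope:

from ltr.clickmodels.session import build

sessions = build([
    ('A', ((1, True), (2, False), (3, True), (0, False))),
    ('A', ((1, False), (2, False), (3, True), (0, False))),
])
ctr_by_rank = [0.4, 0.2, 0.15, 0.1]  # hypothetical average CTR at each rank position
model = coec(ctr_by_rank, sessions)
# doc 1 for query 'A': 1 click over 2 * 0.4 = 0.8 expected clicks -> COEC = 1.25 (above average)
print(model.coecs[('A', 1)])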
--------------------------------------------------------------------------------
/ltr/clickmodels/conversion.py:
--------------------------------------------------------------------------------
1 | from collections import Counter
2 |
3 | def conv_aug_attracts(attracts, sessions, costs):
4 | """ Rescan sessions, using click-derrived attractiveness.
5 |
6 | If theres no conversion, punish the attractiveness derrived judgment
7 |
8 | BUT we punish costly things less, and cheap things more
9 | """
10 | satisfacts = Counter()
11 | counts = Counter()
12 | for session in sessions:
13 | for rank, doc in enumerate(session.docs):
14 | attract = attracts[(session.query, doc.doc_id)]
15 | if doc.click:
16 | if doc.conversion:
17 | # Confirms the attractiveness was real with actual relevance
18 | counts[(session.query, doc.doc_id)] += 1
19 | satisfacts[(session.query, doc.doc_id)] += attract
20 | else:
21 | # If it costs a lot, and there wasn't a conversion,
22 | # that's OK, we default to attractiveness
23 | # If it costs little, and there wasn't a conversion,
24 | # that's generally not OK: why didn't they take the (easy) action?
25 | counts[(session.query, doc.doc_id)] += 1
26 | satisfacts[(session.query, doc.doc_id)] += attract * costs[doc.doc_id]
27 | else:
28 | counts[(session.query, doc.doc_id)] += 1
29 | satisfacts[(session.query, doc.doc_id)] += attract * costs[doc.doc_id]
30 |
31 | for (query_id, doc_id), count in counts.items():
32 | satisfacts[(query_id, doc_id)] = satisfacts[(query_id,doc_id)] / count
33 |
34 | return satisfacts
35 |
36 |
37 |
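A usage sketch with hypothetical attractiveness values, a single session, and per-doc costs, assuming it runs in this module so conv_aug_attracts is in scope:

from collections import defaultdict
from ltr.clickmodels.session import build

attracts = defaultdict(lambda: 0.5)  # e.g. taken from a click model such as sdbn
sessions = build([('A', ((1, True, True), (2, True, False)))])  # doc 1 clicked and converted, doc 2 clicked only
costs = {1: 1.0, 2: 0.3}             # doc 2 is a "cheap" action, so a missing conversion hurts it more
satisfacts = conv_aug_attracts(attracts, sessions, costs)
# satisfacts[('A', 1)] == 0.5   (the conversion confirms the click-derived attractiveness)
# satisfacts[('A', 2)] == 0.15  (0.5 attractiveness * 0.3 cost penalty)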
--------------------------------------------------------------------------------
/ltr/clickmodels/pbm.py:
--------------------------------------------------------------------------------
1 | from ltr.clickmodels.session import build
2 | from collections import Counter, defaultdict
3 | from ltr.helpers.defaultlist import defaultlist
4 |
5 |
6 | class Model():
7 | def __init__(self):
8 | # Examine prob per-rank
9 | self.ranks = defaultlist(lambda: 0.4)
10 |
11 | # Attractiveness per query-doc
12 | self.attracts = defaultdict(lambda : 0.5)
13 |
14 |
15 |
16 | def update_attractiveness(sessions, model):
17 | """ Run through the step of updating attractiveness
18 | based on session information and the current rank
19 | examine probabilities
20 |
21 | Algorithm based on Expectation Maximization derived in
22 | chapter 4 of "Click Models for Web Search" by
23 | Chuklin, Markov, de Rijke
24 |
25 | """
26 | attractions = Counter() #Track query-doc attractiveness in this round
27 | num_sessions = Counter() #Track num sessions where query-doc appears
28 | for session in sessions:
29 | for rank, doc in enumerate(session.docs):
30 | query_doc_key = (session.query, doc.doc_id)
31 | att = 0
32 | if doc.click:
33 | # By PBM rules, if its clicked,
34 | # the user thought it was attractive
35 | att = 1
36 | else:
37 | exam = model.ranks[rank]
38 | assert exam <= 1.0
39 | doc_a = model.attracts[query_doc_key]
40 | # Not examined, but attractive /
41 | # 1 - (examined and attractive)
42 | # When not clicked:
43 | # If somehow this is currently a rank examined
44 | # a lot and this doc is historically attractive, then
45 | # we might still count it as mostly attractive
47 | # OR if the doc IS examined a lot AND it's not
47 | # attractive, then we do the opposite, add
48 | # close to 0
49 | att = (((1 - exam) * doc_a) / (1 - (exam * doc_a)))
50 |
51 | # Accumulate the attractiveness sum and the session count
52 | assert att <= 1.0
53 | attractions[query_doc_key] += att
54 | num_sessions[query_doc_key] += 1
55 | assert attractions[query_doc_key] <= num_sessions[query_doc_key]
56 |
57 | # Update the main query attractiveness from the attractions / num sessions
58 | for (query_id, doc_id), a_sum in attractions.items():
59 | query_doc_key = (query_id, doc_id)
60 | att = a_sum / num_sessions[query_doc_key]
61 | assert att <= 1.0
62 | model.attracts[query_doc_key] = att
63 |
64 |
65 | def update_examines(sessions, model):
66 | """ Run through the step of updating position examine
67 | probabilities given current query-doc attractiveness
68 |
69 | Algorithm based on Expectation Maximization derived in
70 | chapter 4 of "Click Models for Web Search" by
71 | Chuklin, Markov, de Rijke
72 |
73 | """
74 | new_rank_probs = defaultlist(lambda: 0)
75 |
76 | for session in sessions:
77 | for rank, doc in enumerate(session.docs):
78 | if doc.click:
79 | new_rank_probs[rank] += 1
80 | else:
81 | # attractiveness at this query/doc pair
82 | a_qd = model.attracts[(session.query, doc.doc_id)]
83 | numerator = (1 - a_qd) * model.ranks[rank]
84 | denominator = 1 - (a_qd * model.ranks[rank])
85 | # When not clicked - was it examined? We have to guess!
86 | # - If it has seemed very attractive, we assume it
87 | # was not examined. Because who could pass up such
88 | # a yummy looking search result? (numerator)
89 | #
90 | # - If it's not attractive, but this rank gets examined
91 | # a lot, the new rank prob is closer to 1
92 | # (approaches ranks[rank] / ranks[rank])
93 | #
94 | # - If it's not examined much, it won't contribute much
95 | new_rank_probs[rank] += numerator / denominator
96 | for i in range(len(new_rank_probs)):
97 | model.ranks[i] = new_rank_probs[i] / len(sessions)
98 |
99 |
100 | def position_based_model(sessions, rounds=20):
101 | """
102 | Algorithm based on Expectation Maximization derived in
103 | chapter 4 (table 4.1) of "Click Models for Web Search" by
104 | Chuklin, Markov, de Rijke
105 |
106 | Given the observed sessions
107 | Initialized:
108 | - prob a ranks is examined (`ranks`)
109 | - randomly initialized query/doc attractiveness
110 |
111 | Compute:
112 | - Probability a doc is attractive for a query
113 | """
114 | model=Model()
115 | for i in range(0,rounds):
116 | update_attractiveness(sessions, model)
117 | update_examines(sessions, model)
118 | return model
119 |
120 |
121 | if __name__ == "__main__":
122 | sessions = build([
123 | ('A', ((1, True), (2, False), (3, True), (0, False))),
124 | ('B', ((5, False), (2, True), (3, True), (0, False))),
125 | ('A', ((1, False), (2, False), (3, True), (0, False))),
126 | ('B', ((1, False), (2, False), (3, False), (9, True))),
127 | ('A', ((9, False), (2, False), (1, True), (0, True))),
128 | ('B', ((6, True), (2, False), (3, True), (1, False))),
129 | ('A', ((7, False), (4, True), (1, False), (3, False))),
130 | ('B', ((8, True), (2, False), (3, True), (1, False))),
131 | ('A', ((1, False), (4, True), (2, False), (3, False))),
132 | ('B', ((7, True), (4, False), (5, True), (1, True))),
133 | ])
134 | position_based_model(sessions, rounds=100)
135 |
--------------------------------------------------------------------------------
/ltr/clickmodels/sdbn.py:
--------------------------------------------------------------------------------
1 | from collections import Counter, defaultdict
2 | from ltr.clickmodels.session import build
3 |
4 | class Model():
5 | def __init__(self):
6 | # Satisfaction per query-doc
7 | self.satisfacts = defaultdict(lambda: 0.1)
8 |
9 | # Attractiveness per query-doc
10 | self.attracts = defaultdict(lambda : 0.1)
11 |
12 | reverse_enumerate = lambda l: zip(range(len(l)-1, -1, -1), reversed(l))
13 |
14 |
15 | def sdbn(sessions):
16 | """ Simplified Dynamic Bayesian Network is a simpler
17 | version of the much more complex Dynamic Bayesian Network
18 | that the authors say comes close to the accuracy of DBN
19 |
20 | Most importantly, it can be solved directly and simply without
21 | an EM learning process
22 |
23 | Features of sdbn:
24 | - Attractiveness is any click out of sessions where that document
25 | appears before the last click of the session
26 | - Satisfaction occurs when a doc is the last document clicked
27 | out of all sessions where that document is clicked
28 |
29 | """
30 | model = Model()
31 | NO_CLICK = -1
32 | counts = Counter()
33 | clicks = Counter()
34 | last_clicks = Counter()
35 | for session in sessions:
36 | last_click = NO_CLICK
37 | for rank, doc in reverse_enumerate(session.docs):
38 | if last_click == NO_CLICK and doc.click:
39 | last_click = rank
40 |
41 | if last_click != NO_CLICK:
42 | query_doc = (session.query, doc.doc_id)
43 | counts[query_doc] += 1
44 |
45 | if doc.click:
46 | # Count this click; docs ranked after the
47 | # session's last click were already excluded
48 | # by the last_click check above
49 | clicks[query_doc] += 1
50 | if rank == last_click:
51 | last_clicks[query_doc] += 1
52 |
53 | # For all meaningful sessions (where query_doc appear)
54 | # count attractiveness clicks / num sessions
55 | # count satisfacts last clicks / sessions with clicks
56 | for query_doc, count in counts.items():
57 | model.attracts[query_doc] = clicks[query_doc] / count
58 | if query_doc in clicks:
59 | model.satisfacts[query_doc] = last_clicks[query_doc] / clicks[query_doc]
60 | return model
61 |
62 |
63 | if __name__ == "__main__":
64 | sessions = build([
65 | ('A', ((1, True), (2, False), (3, True), (0, False))),
66 | ('B', ((5, False), (2, True), (3, True), (0, False))),
67 | ('A', ((1, False), (2, False), (3, True), (0, False))),
68 | ('B', ((1, False), (2, False), (3, False), (9, True))),
69 | ('A', ((9, False), (2, False), (1, True), (0, True))),
70 | ('B', ((6, True), (2, False), (3, True), (1, False))),
71 | ('A', ((7, False), (4, True), (1, False), (3, False))),
72 | ('B', ((8, True), (2, False), (3, True), (1, False))),
73 | ('A', ((1, False), (4, True), (2, False), (3, False))),
74 | ('B', ((7, True), (4, False), (5, True), (1, True))),
75 | ])
76 | model = sdbn(sessions)
77 | print(model.attracts[('A', 1)])
78 | print(model.satisfacts[('A', 1)])
79 | print(model.attracts[('B', 1)])
80 | print(model.satisfacts[('B', 1)])
81 |
--------------------------------------------------------------------------------
/ltr/clickmodels/session.py:
--------------------------------------------------------------------------------
1 |
2 | class Doc:
3 | def __init__(self, click, doc_id, conversion=False):
4 | self.click = click
5 | self.doc_id = doc_id
6 | self.conversion = conversion
7 |
8 | def __repr__(self):
9 | return "Doc(doc_id=%s, click=%s, conversion=%s)" % (self.doc_id, self.click, self.conversion)
10 |
11 | def __str__(self):
12 | return "(%s, %s, %s)" % (self.doc_id, self.click, self.conversion)
13 |
14 |
15 | class Session:
16 | def __init__(self, query, docs):
17 | self.query = query
18 | self.docs = docs
19 | # Check if docs are unique
20 | docset = set()
21 | for doc in docs:
22 | if doc.doc_id in docset:
23 | raise ValueError("A session may only list a doc exactly once in search results")
24 | docset.add(doc.doc_id)
25 |
26 | def __repr__(self):
27 | return "Session(query=%s, docs=%s)" % (self.query, self.docs)
28 |
29 | def __str__(self):
30 | return "(%s, (%s))" % (self.query, self.docs)
31 |
32 |
33 | def build_one(sess_tuple):
34 | """ Take a tuple where
35 | 0th item is query (a string that uniquely identifies it)
36 | 1st item is a list of docs, with clicks
37 | and optionally a conversion id or true/false
38 |
39 |
40 | ('A', ((1, True), (2, False), (3, True), (0, False))),
41 |
42 | alternatively a value can be attached to the doc
43 |
44 | ('A', ((1, True, 0.9), (2, False, 0.8), (3, True, 1.0), (0, False))),
45 | """
46 | query = sess_tuple[0]
47 | docs = []
48 | for doc_tuple in sess_tuple[1]:
49 | conversion = False
50 | if len(doc_tuple) > 2:
51 | conversion = doc_tuple[2]
52 | docs.append(Doc(doc_id=doc_tuple[0],
53 | click=doc_tuple[1],
54 | conversion=conversion))
55 | return Session(query=query, docs=docs)
56 |
57 |
58 | def build(sess_tuples):
59 | sesss = []
60 | for sess_tup in sess_tuples:
61 | sesss.append(build_one(sess_tup))
62 | return sesss
63 |
64 |
--------------------------------------------------------------------------------
/ltr/clickmodels/ubm.py:
--------------------------------------------------------------------------------
1 | from ltr.clickmodels.session import build
2 | from collections import Counter, defaultdict
3 |
4 | class Model():
5 | def __init__(self):
6 | # Examine prob keyed by (last_click, rank)
7 | # Rank 0 is first displayed on page
8 | # Rank -1 is used as last_click when no click has happened yet in the session
9 | self.ranks = defaultdict(lambda: 0.4)
10 |
11 | # Attractiveness per query-doc
12 | self.attracts = defaultdict(lambda : 0.5)
13 |
14 |
15 | def update_attractiveness(sessions, model):
16 | """ Run through the step of updating attractiveness
17 | based on session information and the current rank
18 | examine probabilities
19 |
20 | Algorithm based on Expectation Maximization derived in
21 | chapter 4 of "Click Models for Web Search" by
22 | Chuklin, Markov, de Rijke
23 |
24 | """
25 | attractions = Counter() #Track query-doc attractiveness in this round
26 | num_sessions = Counter() #Track num sessions where query-doc appears
27 | for session in sessions:
28 | last_click = -1
29 | for rank, doc in enumerate(session.docs):
30 | query_doc_key = (session.query, doc.doc_id)
31 | att = 0
32 | if doc.click:
33 |
34 | last_click = rank
35 |
36 | att = 1
37 | else:
38 | exam = model.ranks[(last_click,rank)]
39 | assert exam <= 1.0
40 | doc_a = model.attracts[query_doc_key]
41 | # Not examined, but attractive /
42 | # 1 - (examined and attractive)
43 | # When not clicked:
44 | # If somehow this is currently a rank examined
45 | # a lot and this doc is historically attractive, then
46 | # we might still count it as mostly attractive
47 | # OR if the doc IS examined a lot AND it's not
48 | # attractive, then we do the opposite, add
49 | # close to 0
50 | att = (((1 - exam) * doc_a) / (1 - (exam * doc_a)))
51 |
52 | # Accumulate the attractiveness sum and the session count
53 | assert att <= 1.0
54 | attractions[query_doc_key] += att
55 | num_sessions[query_doc_key] += 1
56 | assert attractions[query_doc_key] <= num_sessions[query_doc_key]
57 |
58 | # Update the main query attractiveness from the attractions / num sessions
59 | for (query_id, doc_id), a_sum in attractions.items():
60 | query_doc_key = (query_id, doc_id)
61 | att = a_sum / num_sessions[query_doc_key]
62 | assert att <= 1.0
63 | model.attracts[query_doc_key] = att
64 |
65 |
66 | def update_examines(sessions, model):
67 | """ Run through the step of updating position examine
68 | probabilities given current query-doc attractiveness
69 |
70 | Algorithm based on Expectation Maximization derived in
71 | chapter 4 of "Click Models for Web Search" by
72 | Chuklin, Markov, de Rijke
73 |
74 | """
75 | new_rank_probs = defaultdict(lambda: 0)
76 | counts = defaultdict(lambda: 0)
77 |
78 | for session in sessions:
79 | last_click = -1
80 | for rank, doc in enumerate(session.docs):
81 | if doc.click:
82 | new_rank_probs[(last_click, rank)] += 1
83 | counts[(last_click, rank)] += 1
84 | if last_click == -1 and rank == 3:
85 | print(counts[(last_click,rank)])
86 |
87 | last_click = rank
88 | else:
89 | # attractiveness at this query/doc pair
90 | a_qd = model.attracts[(session.query, doc.doc_id)]
91 | numerator = (1 - a_qd) * model.ranks[(last_click, rank)]
92 | denominator = 1 - (a_qd * model.ranks[(last_click, rank)])
93 | # When not clicked - was it examined? We have to guess!
94 | # - If it has seemed very attractive, we assume it
95 | # was not examined. Because who could pass up such
96 | # a yummy looking search result? (numerator)
97 | #
 98 |                 #  - If it's not attractive, but this rank gets examined
99 | # a lot, the new rank prob is closer to 1
100 | # (approaches ranks[rank] / ranks[rank])
101 | #
102 |                 #  - If it's not examined much, it won't contribute much
103 | new_rank_probs[(last_click, rank)] += numerator / denominator
104 | counts[(last_click, rank)] += 1
105 | if last_click == -1 and rank == 3:
106 | print(counts[(last_click,rank)])
107 |
108 | for (last_click, click), count in counts.items():
109 | model.ranks[(last_click, click)] = new_rank_probs[(last_click, click)] / count
110 |
111 |
112 | def user_browse_model(sessions, rounds=20):
113 | """
114 | Algorithm based on Expectation Maximization derived in
115 | chapter 4 (table 4.1) of "Click Models for Web Search" by
116 |     Chuklin, Markov, and de Rijke
117 |
118 | """
119 | model=Model()
120 | for i in range(0,rounds):
121 | update_attractiveness(sessions, model)
122 | update_examines(sessions, model)
123 | return model
124 |
125 |
126 | if __name__ == "__main__":
127 | sessions = build([
128 | ('A', ((1, True), (2, False), (3, True), (0, False))),
129 | ('B', ((5, False), (2, True), (3, True), (0, False))),
130 | ('A', ((1, False), (2, False), (3, True), (0, False))),
131 | ('B', ((1, False), (2, False), (3, False), (9, True))),
132 | ('A', ((9, False), (2, False), (1, True), (0, True))),
133 | ('B', ((6, True), (2, False), (3, True), (1, False))),
134 | ('A', ((7, False), (4, True), (1, False), (3, False))),
135 | ('B', ((8, True), (2, False), (3, True), (1, False))),
136 | ('A', ((1, False), (4, True), (2, False), (3, False))),
137 | ('B', ((7, True), (4, False), (5, True), (1, True))),
138 | ])
139 | user_browse_model(sessions, rounds=100)
140 |
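
A minimal usage sketch for the module above, assuming the ltr package is importable; it reuses the toy session format from the __main__ block and shows how the learned parameters are keyed (attractiveness by (query, doc_id), examination by (last_click_rank, rank)):

    from ltr.clickmodels.session import build
    from ltr.clickmodels.ubm import user_browse_model

    # Toy sessions in the (query, ((doc_id, clicked), ...)) format used above
    sessions = build([
        ('A', ((1, True), (2, False), (3, True), (0, False))),
        ('A', ((1, False), (2, False), (3, True), (0, False))),
    ])
    model = user_browse_model(sessions, rounds=20)

    print(model.attracts[('A', 3)])  # learned attractiveness of doc 3 for query 'A'
    print(model.ranks[(-1, 0)])      # P(examine rank 0 before any click in the session)
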
--------------------------------------------------------------------------------
/ltr/client/__init__.py:
--------------------------------------------------------------------------------
1 | from .solr_client import SolrClient
2 |
--------------------------------------------------------------------------------
/ltr/client/base_client.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 |
3 | '''
4 | This project demonstrates working with LTR in Elasticsearch and Solr
5 |
6 | The goal of this class is to abstract away the server and highlight the steps
7 | required to begin working with LTR. This keeps the examples agnostic about
8 | which backend is being used, but the implementations of each client
 9 | should be useful references for those getting started with LTR on
10 | their specific platform.
11 | '''
12 | class BaseClient(ABC):
13 | @abstractmethod
14 | def get_host(self):
15 | pass
16 |
17 | @abstractmethod
18 | def name(self):
19 | pass
20 |
21 | @abstractmethod
22 | def delete_index(self, index):
23 | pass
24 |
25 | @abstractmethod
26 | def create_index(self, index):
27 | pass
28 |
29 | @abstractmethod
30 | def index_documents(self, index, doc_src):
31 | pass
32 |
33 | @abstractmethod
34 | def reset_ltr(self, index):
35 | pass
36 |
37 | @abstractmethod
38 | def create_featureset(self, index, name, ftr_config):
39 | pass
40 |
41 | @abstractmethod
42 | def query(self, index, query):
43 | pass
44 |
45 | @abstractmethod
46 | def get_doc(self, doc_id, fields=None):
47 | pass
48 |
49 | @abstractmethod
50 | def log_query(self, index, featureset, ids, params):
51 | pass
52 |
53 | @abstractmethod
54 | def submit_model(self, featureset, index, model_name, model_payload):
55 | pass
56 |
57 | @abstractmethod
58 | def submit_ranklib_model(self, featureset, index, model_name, model_payload):
59 | pass
60 |
61 | @abstractmethod
62 | def model_query(self, index, model, model_params, query):
63 | pass
64 |
65 | @abstractmethod
66 | def feature_set(self, index, name):
67 | """ Return a mapping of name/feature ordinal
68 | and the raw (search engine specific) feature list"""
69 | pass
70 |
71 |
72 |
--------------------------------------------------------------------------------
/ltr/client/solr_parse.py:
--------------------------------------------------------------------------------
1 | def every_other_zipped(lst):
2 | return zip(lst[0::2],lst[1::2])
3 |
4 | def dictify(nl_tups):
5 | """ Return dict if all keys unique, otherwise
 6 |         don't modify """
7 | as_dict = dict(nl_tups)
8 | if len(as_dict) == len(nl_tups):
9 | return as_dict
10 | return nl_tups
11 |
12 | def parse_named_list(lst):
13 | shallow_tups = [tup for tup in every_other_zipped(lst)]
14 |
15 | nl_as_tups = []
16 |
17 | for tup in shallow_tups:
18 | if isinstance(tup[1], list):
19 | tup = (tup[0], parse_named_list(tup[1]))
20 | nl_as_tups.append(tup)
21 | return dictify(nl_as_tups)
22 |
23 |
24 | def parse_termvect_namedlist(lst, field):
25 | """ Parse the named list and perform some transformations to create consistent
26 | JSON to parse
27 |
28 | Specifically changing {"positions": ...} to {"positions": [1234,4567]}
29 |
30 | """
31 |
32 | def listify_posns(posn_attrs):
33 | if isinstance(posn_attrs, dict):
34 | assert len(posn_attrs) == 1
35 | return [posn_attrs['position']]
36 | return [posn_attr[1] for posn_attr in posn_attrs]
37 |
38 |
39 | tv_parsed = parse_named_list(lst)
40 | for doc_id, doc_field_tv in tv_parsed.items():
41 | for field_name, term_vects in doc_field_tv.items():
42 |             # Transform positions only for the requested field
43 | if field_name == field:
44 | for term, attrs in term_vects.items():
45 | for attr_key, attr_val in attrs.items():
46 | if attr_key == 'positions':
47 | attrs['positions'] = listify_posns(attr_val)
48 | return tv_parsed
49 |
50 |
51 |
52 | if __name__ == "__main__":
53 | solr_nl = [
54 | "D100000", [
55 | "uniqueKey", "D100000",
56 | "body", [
57 | "1", [
58 | "positions", [
59 | "position", 92,
60 | "position", 113
61 | ]],
62 | "2", [
63 | "positions", [
64 | "position", 22,
65 | "position", 413
66 | ]],
67 | "boo", [
68 | "positions", [
69 | "position", 22,
70 | ]]
71 | ]]]
72 | print(repr(parse_termvect_namedlist(solr_nl, 'body')))
73 |
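
A quick check of the parser above (assuming the ltr package and its client dependencies import cleanly); note that a single position and a list of positions are both normalized to a plain list:

    from ltr.client.solr_parse import parse_termvect_namedlist

    nl = ["D100000", ["uniqueKey", "D100000",
                      "body", ["1", ["positions", ["position", 92, "position", 113]],
                               "boo", ["positions", ["position", 22]]]]]
    parsed = parse_termvect_namedlist(nl, "body")
    assert parsed["D100000"]["body"]["1"]["positions"] == [92, 113]
    assert parsed["D100000"]["body"]["boo"]["positions"] == [22]
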
--------------------------------------------------------------------------------
/ltr/download.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from os import path
3 | from tqdm import tqdm
4 |
5 | def download_one(uri, dest='data/', force=False, fancy=False):
6 | import os
7 |
8 | if not os.path.exists(dest):
9 | os.makedirs(dest)
10 |
11 | if not os.path.isdir(dest):
12 | raise ValueError("dest {} is not a directory".format(dest))
13 |
14 | filename = uri[uri.rfind('/') + 1:]
15 | filepath = os.path.join(dest, filename)
16 | if path.exists(filepath):
17 | if not force:
18 | print(filepath + ' already exists')
19 | return
20 |         print("exists, but force=True; downloading anyway")
21 |
22 | if not fancy:
23 | with open(filepath, 'wb') as out:
24 | print('GET {}'.format(uri))
25 | resp = requests.get(uri, stream=True)
26 | for chunk in resp.iter_content(chunk_size=1024):
27 | if chunk:
28 | out.write(chunk)
29 | else:
30 | resp = requests.get(uri, stream=True)
31 | total = int(resp.headers.get('content-length', 0))
32 | with open(filepath, 'wb') as file, tqdm(
33 | desc=filepath,
34 | total=total,
35 | unit='iB',
36 | unit_scale=True,
37 | unit_divisor=1024,
38 | ) as bar:
39 | for data in resp.iter_content(chunk_size=1024):
40 | size = file.write(data)
41 | bar.update(size)
42 |
43 | def extract_tgz(fname, dest='data/'):
44 | import tarfile
45 | with tarfile.open(fname, 'r:gz') as tar:
46 | tar.extractall(path=dest)
47 |
48 |
49 | def download(uris, dest='data/', force=False, fancy=False):
50 | for uri in uris:
51 | download_one(uri=uri, dest=dest, force=force, fancy=fancy)
52 |
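
A small sketch of how these helpers are typically combined; the archive URI below is a placeholder, not a real dataset location:

    from ltr.download import download, extract_tgz

    uris = ["http://example.com/corpus.tgz"]        # placeholder URI
    download(uris, dest="data/", fancy=True)        # fancy=True shows a tqdm progress bar
    extract_tgz("data/corpus.tgz", dest="data/")
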
--------------------------------------------------------------------------------
/ltr/evaluate.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 |
4 | import plotly.graph_objs as go
5 | from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
6 |
7 | def log_run(cmd):
8 | resp = os.popen(cmd).read()
9 | print(resp)
10 |
11 | def quiet_run(cmd):
12 | os.popen(cmd).read()
13 |
14 | def evaluate(mode):
15 | # Build the docker image
16 | if mode == 'elastic':
17 | cmd = 'docker build --no-cache -t ltr-rre rre/elastic/.'
18 | else:
19 | cmd = 'docker build --no-cache -t ltr-rre rre/solr/.'
20 |
21 | print('Building RRE image - This will take a while')
22 | quiet_run(cmd)
23 |
24 | # Remove and run a fresh docker image
25 | cmd = 'docker rm -f ltr-rre'
26 | quiet_run(cmd)
27 |
28 | cmd = 'docker run --name ltr-rre ltr-rre'
29 | print('Running evaluation')
30 | log_run(cmd)
31 |
32 | # Copy out reports
33 | cmd = 'docker cp ltr-rre:/rre/target/rre/evaluation.json data/rre-evaluation.json'
34 | log_run(cmd)
35 |
36 | cmd = 'docker cp ltr-rre:/rre/target/site/rre-report.xlsx data/rre-report.xlsx'
37 | log_run(cmd)
38 |
39 | print('RRE Evaluation complete')
40 |
41 |
42 | def rre_table():
43 | init_notebook_mode(connected=True)
44 |
45 | with open('data/rre-evaluation.json') as src:
46 | report = json.load(src)
47 | metrics = report['metrics']
48 |
49 | experiments = ['baseline', 'classic', 'latest']
50 | precisions = []
51 | recalls = []
52 | errs = []
53 |
54 | for exp in experiments:
55 | precisions.append(metrics['P']['versions'][exp]['value'])
56 | recalls.append(metrics['R']['versions'][exp]['value'])
57 | errs.append(metrics['ERR@30']['versions'][exp]['value'])
58 |
59 | trace = go.Table(
60 | header=dict(values=['', 'Precision', 'Recall', 'ERR'], fill = dict(color='#AAAAAA')),
61 | cells=dict(values=[
62 | experiments,
63 | precisions,
64 | recalls,
65 | errs
66 | ])
67 | )
68 |
69 | data = [trace]
70 | iplot(data)
71 |
72 |
--------------------------------------------------------------------------------
/ltr/helpers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/ltr/helpers/__init__.py
--------------------------------------------------------------------------------
/ltr/helpers/butterfingers.py:
--------------------------------------------------------------------------------
1 | def butterfingers(text,prob=0.1,keyboard='qwerty'):
2 | import random
3 |
4 | """ taken from
5 | https://github.com/Decagon/butter-fingers/blob/master/butterfingers/butterfingers.py """
6 |
7 | keyApprox = {}
8 |
9 | if keyboard == "qwerty":
10 | keyApprox['q'] = "qwasedzx"
11 | keyApprox['w'] = "wqesadrfcx"
12 | keyApprox['e'] = "ewrsfdqazxcvgt"
13 | keyApprox['r'] = "retdgfwsxcvgt"
14 | keyApprox['t'] = "tryfhgedcvbnju"
15 | keyApprox['y'] = "ytugjhrfvbnji"
16 | keyApprox['u'] = "uyihkjtgbnmlo"
17 | keyApprox['i'] = "iuojlkyhnmlp"
18 | keyApprox['o'] = "oipklujm"
19 | keyApprox['p'] = "plo['ik"
20 |
21 | keyApprox['a'] = "aqszwxwdce"
22 | keyApprox['s'] = "swxadrfv"
23 | keyApprox['d'] = "decsfaqgbv"
24 | keyApprox['f'] = "fdgrvwsxyhn"
25 | keyApprox['g'] = "gtbfhedcyjn"
26 | keyApprox['h'] = "hyngjfrvkim"
27 | keyApprox['j'] = "jhknugtblom"
28 | keyApprox['k'] = "kjlinyhn"
29 | keyApprox['l'] = "lokmpujn"
30 |
31 | keyApprox['z'] = "zaxsvde"
32 | keyApprox['x'] = "xzcsdbvfrewq"
33 | keyApprox['c'] = "cxvdfzswergb"
34 | keyApprox['v'] = "vcfbgxdertyn"
35 | keyApprox['b'] = "bvnghcftyun"
36 | keyApprox['n'] = "nbmhjvgtuik"
37 | keyApprox['m'] = "mnkjloik"
38 | keyApprox[' '] = " "
39 | else:
40 | print("Keyboard not supported.")
41 |
42 | probOfTypo = int(prob * 100)
43 |
44 | buttertext = ""
45 | for letter in text:
46 | lcletter = letter.lower()
47 | if not lcletter in keyApprox.keys():
48 | newletter = lcletter
49 | else:
50 | if random.choice(range(0, 100)) <= probOfTypo:
51 | newletter = random.choice(keyApprox[lcletter])
52 | else:
53 | newletter = lcletter
54 | # go back to original case
55 | if not lcletter == letter:
56 | newletter = newletter.upper()
57 | buttertext += newletter
58 |
59 | return buttertext
60 |
61 |
62 |
--------------------------------------------------------------------------------
/ltr/helpers/convert.py:
--------------------------------------------------------------------------------
 1 | # Converts LambdaMART XML models to JSON for Solr.
2 |
3 | import xml.etree.ElementTree as ET
4 |
5 |
6 | def convert(ensemble_xml_string, modelName, featureSet, featureMapping):
7 | modelClass = 'org.apache.solr.ltr.model.MultipleAdditiveTreesModel'
8 |
9 | model = {
10 | 'store': featureSet,
11 | 'name': modelName,
12 | 'class': modelClass,
13 | 'features': featureMapping
14 | }
15 |
16 | # Clean up header
17 | ensemble_xml_string = '\n'.join(ensemble_xml_string.split('\n')[7:])
18 | lambdaModel = ET.fromstring(ensemble_xml_string)
19 |
20 | trees = []
21 | for node in lambdaModel:
22 | t = {
23 | 'weight': str(node.attrib['weight']),
24 | 'root': parseSplits(node[0], featureMapping)
25 | }
26 | trees.append(t)
27 |
28 | # print(trees)
29 | model['params'] = {'trees': trees}
30 |
31 | return model
32 |
33 | def parseSplits(split, features):
34 | obj = {}
35 | for el in split:
36 | if (el.tag == 'feature'):
37 | obj['feature'] = features[(int(el.text.strip()) - 1)]['name']
38 | elif (el.tag == 'threshold'):
39 | obj['threshold'] = str(el.text.strip())
40 | elif (el.tag == 'split' and 'pos' in el.attrib):
41 | obj[el.attrib['pos']] = parseSplits(el, features)
42 | elif (el.tag == 'output'):
43 | obj['value'] = str(el.text.strip())
44 | return obj
45 |
--------------------------------------------------------------------------------
/ltr/helpers/defaultlist.py:
--------------------------------------------------------------------------------
1 | class DefaultList(list):
2 | """ adapted from https://stackoverflow.com/a/869901/8123"""
3 |
4 | def __init__(self, factory):
5 | self.factory = factory
6 |
7 | def __getitem__(self, index):
8 | size = len(self)
9 | if index >= size:
10 | self.extend(self.factory() for _ in range(size, index + 1))
11 |
12 | return list.__getitem__(self, index)
13 |
14 | def __setitem__(self, index, value):
15 | size = len(self)
16 | if index >= size:
17 | self.extend(self.factory() for _ in range(size, index + 1))
18 |
19 | list.__setitem__(self, index, value)
20 |
21 | def defaultlist(factory):
22 | return DefaultList(factory)
23 |
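
For reference, a tiny example of the auto-growing behavior (a sketch, not part of the module):

    from ltr.helpers.defaultlist import defaultlist

    counts = defaultlist(int)   # missing slots are filled with int() == 0
    counts[5] += 1              # the list silently grows to length 6
    print(counts)               # [0, 0, 0, 0, 0, 1]
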
--------------------------------------------------------------------------------
/ltr/helpers/esUrlParse.py:
--------------------------------------------------------------------------------
1 | def parseUrl(fullEsUrl):
2 | from urllib.parse import urlsplit, urlunsplit
3 | import os.path
4 | o = urlsplit(fullEsUrl)
5 |
6 | esUrl = urlunsplit([o.scheme, o.netloc, '','',''])
7 |
8 | indexAndSearchType = os.path.split(o.path)
9 |
10 | return (esUrl, indexAndSearchType[0][1:], indexAndSearchType[1])
11 |
12 |
13 | if __name__ == "__main__":
14 | from sys import argv
15 | print(parseUrl(argv[1]))
16 |
--------------------------------------------------------------------------------
/ltr/helpers/handle_resp.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | def resp_msg(msg, resp, throw=True):
4 | print('{} [Status: {}]'.format(msg, resp.status_code))
5 | if resp.status_code >= 400:
6 | print(resp.text)
7 | if throw:
8 | raise RuntimeError(resp.text)
9 |
10 |
--------------------------------------------------------------------------------
/ltr/helpers/msmarco/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/ltr/helpers/msmarco/__init__.py
--------------------------------------------------------------------------------
/ltr/helpers/msmarco/evaluate.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import gzip
3 |
4 |
5 | class QRel():
6 |
7 | def __init__(self, qid, docid, keywords):
8 | self.qid=qid
9 | self.docid=docid
10 | self.keywords = keywords
11 |
12 | def eval_rr(self, doc_ranking):
13 |         """ Evaluate the provided doc ranking using reciprocal rank
14 | (1/rank of the expected doc)
15 |
16 |             returns 0 if this qrel's doc id is missing
17 | """
18 |
19 | for rank, docid in enumerate(doc_ranking, start=1):
20 | if docid == self.docid:
21 | return 1.0 / rank
22 | return 0.0
23 |
24 | @staticmethod
25 | def read_qrels(qrels_fname='data/msmarco-doctrain-qrels.tsv.gz',
26 | queries_fname='data/msmarco-doctrain-queries.tsv.gz'):
27 |
28 | qids_to_keywords = QRel.get_keyword_lookup(queries_fname)
29 |
30 | with gzip.open(qrels_fname, 'rt') as f:
31 | reader = csv.reader(f, delimiter=' ')
32 | for row in reader:
33 | qid = row[0]
34 | keywords = None
35 | if qid in qids_to_keywords:
36 | keywords = qids_to_keywords[qid]
37 | else:
38 | print("Missing keywords for %s" % qid)
39 | yield QRel(qid=row[0], docid=row[2], keywords=keywords)
40 |
41 | @staticmethod
42 | def get_keyword_lookup(fname='data/msmarco-doctrain-queries.tsv.gz'):
43 | qids_to_keywords = {}
44 | with gzip.open(fname, 'rt') as f:
45 | reader = csv.reader(f, delimiter='\t')
46 | for row in reader:
47 | qids_to_keywords[row[0]] = row[1]
48 | return qids_to_keywords
49 |
50 | def __str__(self):
51 | return "qid:%s(%s) => doc:%s" % (self.qid, self.keywords, self.docid)
52 |
53 |
54 | if __name__ == "__main__":
55 | qrels = {}
56 | for qrel in QRel.read_qrels():
57 | qrels[qrel.qid] = qrel
58 |
59 | print(qrels['1185869'].eval_rr(['1','1']))
60 |
61 |
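
A sketch of aggregating these per-query reciprocal ranks into MRR. It assumes the MS MARCO qrels/queries files referenced above exist under data/, and rankings_by_qid is a hypothetical dict of {qid: [docid, ...]} produced by your own search system:

    from ltr.helpers.msmarco.evaluate import QRel

    def mean_reciprocal_rank(rankings_by_qid):
        # Look up each qid's expected doc and score the provided ranking
        qrels = {qrel.qid: qrel for qrel in QRel.read_qrels()}
        scores = [qrels[qid].eval_rr(ranking)
                  for qid, ranking in rankings_by_qid.items() if qid in qrels]
        return sum(scores) / len(scores) if scores else 0.0
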
--------------------------------------------------------------------------------
/ltr/helpers/ranklib_result.py:
--------------------------------------------------------------------------------
1 |
2 | import re
3 |
4 | class RanklibResult:
5 |
 6 |     """ A result of Ranklib training, either for a
 7 |         single training operation
 8 |         (where trainingLogs is set and has a single item)
 9 |         or k-fold cross validation
10 |         (where foldResults/kcvTestAvg/kcvTrainAvg are set, with
11 |         a result for each fold that is run) """
12 |
13 | def __init__(self, trainingLogs, foldResults,
14 | kcvTestAvg, kcvTrainAvg):
15 | self.trainingLogs = trainingLogs
16 | self.foldResults = foldResults
17 | self.kcvTrainAvg = kcvTrainAvg
18 | self.kcvTestAvg = kcvTestAvg
19 |
20 | class TrainingLog:
21 |
22 | def __init__(self, rounds, impacts, trainMetricName, trainMetricVal):
23 | self.impacts = impacts
24 | self.rounds = rounds
25 | self.trainMetricName = trainMetricName
26 | self.trainMetricVal = trainMetricVal
27 |
28 |
29 | def metric(self):
30 | if self.trainMetricName is not None:
31 | return self.trainMetricVal
32 | if len(self.rounds) > 0:
33 | return self.rounds[-1]
34 | else:
35 | return 0
36 |
37 | class FoldResult:
38 | def __init__(self, foldId, trainMetric, testMetric):
39 | self.foldNum = foldId
40 | self.trainMetric = trainMetric
41 | self.testMetric = testMetric
42 |
43 | impactRe = re.compile(r' Feature (\d+) reduced error (.*)')
44 | roundsRe = re.compile(r'(\d+)\s+\| (\d+)')
45 | foldsRe = re.compile(r'^Fold (\d+)\s+\|(.*)\|(.*)')
46 | avgRe = re.compile(r'^Avg.\s+\|(.*)\|(.*)')
47 | trainMetricRe = re.compile(r'(.*@.*) on training data: (.*)')
48 |
49 | def parse_training_log(rawResult):
50 | """ Takes raw result from Ranklib training and
51 | gathers the feature impacts, training rounds,
52 | and any cross-validation information """
53 | lines = rawResult.split('\n')
54 | # Fold 1 | 0.9396 | 0.8764
55 | train = False
56 | logs = []
57 | folds = []
58 | impacts = {}
59 | rounds = []
60 | trainMetricName = None
61 | trainMetricVal = 0.0
62 | kcvTestAvg = kcvTrainAvg = None
63 | for line in lines:
64 | if 'Training starts...' in line:
65 | if train:
66 | log = TrainingLog(rounds=rounds,
67 | impacts=impacts,
68 | trainMetricName=trainMetricName,
69 | trainMetricVal=trainMetricVal)
70 | logs.append(log)
71 | impacts = {}
72 | rounds = []
73 | train = True
74 |
75 | if train:
76 | m = re.match(impactRe, line)
77 | if m:
78 | ftrId = m.group(1)
79 | error = float(m.group(2))
80 | impacts[ftrId] = error
81 | m = re.match(roundsRe, line)
82 | if m:
83 | values = line.split('|')
84 | metricTrain = float(values[1])
85 | rounds.append(metricTrain)
86 | m = re.match(trainMetricRe, line)
87 | if m:
88 | trainMetricVal = float(m.group(2))
89 | trainMetricName = m.group(1)
90 |
91 | m = re.match(foldsRe, line)
92 | if m:
93 | foldId = m.group(1)
94 | trainMetric = float(m.group(2))
95 | testMetric = float(m.group(3))
96 | folds.append(FoldResult(foldId=foldId,
97 | testMetric=testMetric,
98 | trainMetric=trainMetric))
99 | m = re.match(avgRe, line)
100 | if m:
101 | kcvTrainAvg = float(m.group(1))
102 | kcvTestAvg = float(m.group(2))
103 |
104 | if train:
105 | log = TrainingLog(rounds=rounds,
106 | impacts=impacts,
107 | trainMetricName=trainMetricName,
108 | trainMetricVal=trainMetricVal)
109 | logs.append(log)
110 |
111 | return RanklibResult(trainingLogs=logs,
112 | foldResults=folds,
113 | kcvTrainAvg=kcvTrainAvg,
114 | kcvTestAvg=kcvTestAvg)
115 |
116 |
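
A hedged sketch of feeding a captured Ranklib console log through the parser above (the log file path is hypothetical):

    from ltr.helpers.ranklib_result import parse_training_log

    with open("data/ranklib_training.log") as f:    # hypothetical captured Ranklib output
        result = parse_training_log(f.read())

    for log in result.trainingLogs:
        print(log.metric())                         # final training metric for that run
        print(log.impacts)                          # {feature id: error reduced}
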
--------------------------------------------------------------------------------
/ltr/helpers/solr_escape.py:
--------------------------------------------------------------------------------
1 | def esc_kw(kw):
2 | """ Take a keyword and escape all the
3 | Solr parts we want to escape!"""
4 | kw = kw.replace('\\', '\\\\') # be sure to do this first, as we inject \!
 5 |     kw = kw.replace('(', r'\(')
 6 |     kw = kw.replace(')', r'\)')
 7 |     kw = kw.replace('+', r'\+')
 8 |     kw = kw.replace('-', r'\-')
 9 |     kw = kw.replace(':', r'\:')
10 |     kw = kw.replace('/', r'\/')
11 |     kw = kw.replace(']', r'\]')
12 |     kw = kw.replace('[', r'\[')
13 |     kw = kw.replace('*', r'\*')
14 |     kw = kw.replace('?', r'\?')
15 |     kw = kw.replace('{', r'\{')
16 |     kw = kw.replace('}', r'\}')
17 |     kw = kw.replace('~', r'\~')
18 |
19 |
20 | return kw
21 |
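
For example (a sketch):

    from ltr.helpers.solr_escape import esc_kw

    print(esc_kw("AC/DC (live)"))   # -> AC\/DC \(live\)
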
--------------------------------------------------------------------------------
/ltr/helpers/tau.py:
--------------------------------------------------------------------------------
1 | sign = lambda a: (a>0) - (a<0)
2 |
3 | def pairs_in_order(ranking, both_ways=True):
4 | assert len(ranking) > 1
5 | for idx1, val1 in enumerate(ranking):
6 | for idx2, val2 in enumerate(ranking):
7 | if idx2 > idx1:
8 | yield val1, val2, sign(idx2-idx1)
9 | if both_ways:
10 | yield val2, val1, sign(idx1-idx2)
11 |
12 | def tau(rank1, rank2, at=4):
13 | rank1in = {}
14 |
15 |
16 | if len(rank1) < at or len(rank2) < at:
17 | raise ValueError("rankings must be larger than provided at param(%s)" % at)
18 |
19 | # Handle 1 as a special case
20 | if at == 1:
21 | if rank1[0] == rank2[0]:
22 | return 1
23 | return -1
24 |
25 | rank1 = rank1[:at]; rank2 = rank2[:at]
26 |
27 | # gather concordances/discords for rank1
28 | for val1, val2, order in pairs_in_order(rank1, both_ways=True):
29 | rank1in[(val1,val2)] = order
30 |
31 | # check rank2
32 | concords = 0
33 | discords = 0
34 | for val1, val2, order in pairs_in_order(rank2, both_ways=False):
35 | try:
36 | rank1order = rank1in[(val1,val2)]
37 | if order == rank1order:
38 | concords += 1
39 | else:
40 | discords += 1
41 | except KeyError:
42 | discords += 1
43 |
44 | return (concords - discords) / ((at * (at - 1)) / 2)
45 |
46 | def avg_tau(rank1, rank2, at=4):
47 | if len(rank1) < at or len(rank2) < at:
48 | raise ValueError("rankings must be larger than provided at param(%s)" % at)
49 |
50 | rank1 = rank1[:at]; rank2 = rank2[:at]
51 |
52 | tot = 0
53 | for i in range(1,at+1):
54 | tot += tau(rank1,rank2,at=i)
55 | return tot / (at)
56 |
57 | if __name__ == "__main__":
58 | print(tau([1,2,3,4],[4,3,2,1]))
59 | print(tau([1,2,3,4],[1,2,3,4]))
60 | print(tau([1,2,4,3],[1,2,3,4]))
61 | print(tau([5,6,7,8],[1,2,3,4]))
62 | print(tau([1,2,3,5],[1,2,3,4]))
63 | print(tau([5,3,2,1],[4,3,2,1]))
64 | l1=[1,2,4,3]; l2=[1,2,3,4]; l3=[2,1,3,4]
65 | print("avg_tau(%s,%s,at=4) %s" % (l1, l1, avg_tau(l1,l1)))
66 | print("avg_tau(%s,%s,at=4) %s" % (l1, l2, avg_tau(l1,l2)))
65 |     print("avg_tau(%s,%s,at=4) %s" % (l1, l3, avg_tau(l1,l3)))
68 | print("tau(%s,%s,at=4) %s" % (l1, l2, tau(l1,l2)))
69 |     print("tau(%s,%s,at=4) %s" % (l1, l3, tau(l1,l3)))
70 |
71 |
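
As a worked example of the normalization: tau([1,2,4,3], [1,2,3,4], at=4) compares the 6 ordered pairs of the two truncated lists; five pairs agree and one (the 3/4 swap) disagrees, so tau = (5 - 1) / 6 ≈ 0.67. A quick sanity check:

    from ltr.helpers.tau import tau

    assert abs(tau([1, 2, 4, 3], [1, 2, 3, 4], at=4) - (5 - 1) / 6) < 1e-9
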
--------------------------------------------------------------------------------
/ltr/helpers/timed_block.py:
--------------------------------------------------------------------------------
1 | from contextlib import contextmanager
2 | import sys
3 |
4 | @contextmanager
5 | def timed_block(name=None, f=sys.stdout):
6 | from time import perf_counter
7 | start = perf_counter()
8 | yield start
9 | stop = perf_counter()
10 | if name:
11 | f.write(name + " - ")
12 | f.write("Took %s\n" % (stop-start))
13 |
--------------------------------------------------------------------------------
/ltr/index.py:
--------------------------------------------------------------------------------
1 | def reindex(client, index, doc_src, indexing_workers=3, indexing_batch_size=500):
2 | """ Reload a configuration on disk for each search engine
3 | (Solr a configset, Elasticsearch a json file)
4 | and reindex
5 |
6 | """
7 | from ltr.helpers.timed_block import timed_block
8 |
9 | print("Reindexing...")
10 |
11 | with timed_block(name='Indexing'):
12 | client.index_documents(index,
13 | doc_src=doc_src,
14 | batch_size=indexing_batch_size,
15 | workers=indexing_workers)
16 |
17 | print('Done')
18 |
--------------------------------------------------------------------------------
/ltr/injectTypos.py:
--------------------------------------------------------------------------------
1 | try:
2 | from judgments import Judgment, judgments_from_file, judgments_to_file, judgments_by_qid
3 | from butterfingers import butterfingers
4 | except ImportError:
5 | from .judgments import Judgment, judgments_from_file, judgments_to_file, judgments_by_qid
6 | from .butterfingers import butterfingers
7 |
8 |
9 |
10 | def typoIt(judgmentInFile, judgmentOutFile, rounds=100):
11 | with open(judgmentInFile) as f:
12 | currJudgments = [judg for judg in judgments_from_file(f)]
13 | lastQid = currJudgments[-1].qid
14 | judgDict = judgments_by_qid(currJudgments)
15 |
16 | existingTypos = set()
17 |
18 | for i in range(0, rounds):
19 |
20 | for qid, judglist in judgDict.items():
21 | keywords = judglist[0].keywords
22 | keywordsWTypo = butterfingers(keywords)
23 |
24 | if keywordsWTypo != keywords and keywordsWTypo not in existingTypos:
25 | newQid = lastQid+1
26 | print("%s => %s" % (keywords, keywordsWTypo))
27 | lastQid += 1
28 | for judg in judglist:
29 | typoJudg = Judgment(grade=judg.grade,
30 | qid=newQid,
31 | keywords=keywordsWTypo,
32 | doc_id=judg.doc_id)
33 | currJudgments.append(typoJudg)
34 | existingTypos.add(keywordsWTypo)
35 |
36 | with open(judgmentOutFile, 'w') as f:
37 | judgments_to_file(f, judgmentsList=currJudgments)
38 |
39 |
40 | if __name__ == "__main__":
41 | typoIt(judgmentInFile='title_judgments.txt', judgmentOutFile='title_fuzzy_judgments.txt')
42 |
43 |
44 | # Clone a judgment, inject random typos
45 |
--------------------------------------------------------------------------------
/ltr/log.py:
--------------------------------------------------------------------------------
1 | import re
2 | from aips import get_ltr_engine, get_engine
3 |
4 | class FeatureLogger:
5 | """ Logs LTR Features, one query at a time
6 |
7 | ...Building up a training set...
8 | """
9 |
10 | def __init__(self, index, feature_set, drop_missing=True, id_field='id'):
11 | self.index=index
12 | self.feature_set=feature_set
13 | self.drop_missing=drop_missing
14 | self.id_field=id_field
15 | self.logged=[]
16 |
17 | def clear(self):
18 | self.logged=[]
19 |
20 | def log_for_qid(self, judgments, qid=None, keywords=None, log=False):
21 | """ Log a set of judgments associated with a single qid
22 | judgments will be modified, a training set also returned, discarding
23 | any judgments we could not log features for (because the doc was missing)
24 | """
25 | if qid is None:
26 | qid=judgments[0].qid
27 |
28 | judgments = [j for j in judgments]
29 | doc_ids = [judgment.doc_id for judgment in judgments]
30 | unique_ids = list(set(doc_ids))
31 | if len(doc_ids) != len(unique_ids):
32 | duplicated = set([id for id in doc_ids if doc_ids.count(id) > 1])
33 |             print(f"Duplicate docs for query id {qid}: {duplicated}")
34 | doc_ids = unique_ids
35 |
36 | if keywords is None:
37 | keywords = judgments[len(judgments) - 1].keywords
38 |         # Fetch logged features for the docs in batches of BATCH_SIZE
39 | BATCH_SIZE = 500
40 | numLeft = len(doc_ids)
41 | document_features = {}
42 | for i in range(0, 1 + (len(doc_ids) // BATCH_SIZE)):
43 |
44 | numFetch = min(BATCH_SIZE, numLeft)
45 | start = i*BATCH_SIZE
46 | if start >= len(doc_ids):
47 | break
48 | ids = doc_ids[start:start+numFetch]
49 |
50 | # Sanitize (Solr has a strict syntax that can easily be tripped up)
51 | # This removes anything but alphanumeric and spaces
52 |             fixed_keywords = re.sub(r'([^\s\w]|_)+', '', keywords)
53 |
54 | params = {
55 | "keywords": fixed_keywords,
56 | "fuzzy_keywords": ' '.join([x + '~' for x in fixed_keywords.split(' ')]),
57 | "squeezed_keywords": ''.join(fixed_keywords.split(' '))
58 | }
59 |
60 | ids = [str(doc_id) for doc_id in ids]
61 | res = get_ltr_engine(self.index).get_logged_features(self.feature_set, ids,
62 | params, id_field=self.id_field, log=log)
63 |
64 |
65 | # Add feature back to each judgment
66 | for doc in res:
67 | doc_id = str(doc[self.id_field])
68 | features = doc['[features]']
69 | document_features[doc_id] = list(features.values())
70 | numLeft -= BATCH_SIZE
71 |
72 | # Append features from search engine back to ranklib judgment list
73 | for judgment in judgments:
74 | if judgment.qid != qid:
75 | raise RuntimeError(f"Judgment qid {judgment.qid} inconsistent with logged qid {qid}")
76 | if judgment.keywords != keywords:
77 | raise RuntimeError(f"Judgment keywords {judgment.keywords} inconsistent with logged keywords {keywords}")
78 | if judgment.doc_id not in document_features:
79 |                 print(f"Missing features for doc {judgment.doc_id}; skipping")
80 | continue
81 | judgment.features = document_features[judgment.doc_id]
82 |
83 |         # Return a pared-down judgment list if we are missing features for some judgments
84 | training_set = []
85 | discarded = []
86 | for judgment in judgments:
87 | if self.drop_missing:
88 | if judgment.has_features():
89 | training_set.append(judgment)
90 | else:
91 | discarded.append(judgment)
92 | else:
93 | training_set.append(judgment)
94 | # print("Discarded %s Keep %s" % (len(discarded), len(training_set)))
95 | self.logged.extend(training_set)
96 | return training_set, discarded
97 |
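
A sketch of how FeatureLogger is usually driven, one query id at a time. The judgment helpers come from ltr/judgments.py (not shown in this listing), and the index, feature-set, and file names are placeholders:

    from ltr.log import FeatureLogger
    from ltr.judgments import judgments_from_file, judgments_by_qid

    logger = FeatureLogger(index="tmdb", feature_set="movies")   # placeholder names
    with open("data/title_judgments.txt") as f:                  # hypothetical judgment file
        by_qid = judgments_by_qid(list(judgments_from_file(f)))

    for qid, judgments in by_qid.items():
        training, discarded = logger.log_for_qid(judgments, qid=qid)

    print(f"Logged features for {len(logger.logged)} judgments")
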
--------------------------------------------------------------------------------
/ltr/plots.py:
--------------------------------------------------------------------------------
1 | # Crocodile Dundee and Rocky have nice Linear Shapes
2 |
3 | import numpy
4 | import pylab as pl
5 | import matplotlib as mpl
6 | from ltr.judgments import judgments_to_nparray
7 |
8 | norm = mpl.colors.Normalize(0,1.0)
9 |
10 | def plot_judgments(qids, xlabel, ylabel, judg_list, focus=None,
11 | title_prepend="Features for:"):
12 | if focus is None:
13 | focus=qids
14 |
15 | features, predictors, _ = judgments_to_nparray(judg_list)
16 |
17 | from random import shuffle
18 | from itertools import product
19 | r = list(range(0,5,1)); shuffle(r)
20 | g = list(range(0,5,1)); shuffle(g)
21 | b = list(range(0,5,1)); shuffle(b)
22 |
23 | out_of_focus_alpha=0.1
24 | in_focus_alpha=0.9
25 |
26 | if len(qids) > 3:
27 | # Make a random set of colors per query
28 | colors = [[r*0.1,g*0.1,b*0.1,out_of_focus_alpha] for r,g,b in product(r,g,b)]
29 | shuffle(colors)
30 | else:
31 | colors = ["lightgreen", "maroon"]
32 |
33 | qid_col=predictors[:,1]
34 | qid_idxs=numpy.array([])
35 | kws = []
36 | markers=('.', 'P') # Negative / Positive relevance markers...
37 | legend_paths=[]
38 | legend_labels=[]
39 | for idx, qid in enumerate(qids):
40 | qid_idxs=numpy.argwhere(qid_col==qid).ravel().astype(int)
41 | judgment=judg_list[qid_idxs[-1].item()]
42 | kws.append(judgment.keywords)
43 | x_qidA = features[qid_idxs]
44 | x_qidA
45 | y_qidA = predictors[qid_idxs, 0]
46 | color = colors[idx]
47 | for grade in [1,0]:
48 | this_grade=numpy.argwhere(y_qidA==grade)
49 | path = pl.scatter(x_qidA[this_grade,0],
50 | x_qidA[this_grade,1],
51 | marker=markers[grade],
52 | linewidth=1,
53 | s=80.0,
54 | facecolors=color,
55 | edgecolors=color,
56 | norm=norm)
57 | legend_paths.append(path)
58 | if grade == 0:
59 | legend_labels.append(judgment.keywords + " irrelevant movie")
60 | else:
61 | legend_labels.append(judgment.keywords + " relevant movie")
62 |
63 |
64 |
65 | pl.title(title_prepend + " {:.25}".format(", ".join(kws)))
66 | pl.xlabel(xlabel=xlabel)
67 | pl.ylabel(ylabel=ylabel)
68 | pl.legend(legend_paths, legend_labels, loc='lower center',
69 | bbox_to_anchor=[0.5,-0.5])
70 | pl.savefig('fig.png', dpi=300, bbox_inches='tight')
71 |
72 | #plot_all(predictors)
73 |
74 | def plot_pairwise_data(features, predictors, title,
75 | graph_features=[0,1],
76 | xlabel="Delta Title BM25",
77 | ylabel="Delta Overview BM25"):
78 | legend_paths=[]
79 | for pred in [-1,1]:
80 | if pred == -1:
81 | marker = '.'
82 | elif pred == 1:
83 | marker = '+'
84 | path = pl.scatter(features[predictors==pred, graph_features[0]],
85 | features[predictors==pred, graph_features[1]],
86 | marker=marker)
87 | legend_paths.append(path)
88 |
89 |
90 | pl.title(title)
91 | pl.xlabel(xlabel=xlabel)
92 | pl.ylabel(ylabel=ylabel)
93 | pl.legend(legend_paths, ["Irrelevant minus Relevant", "Relevant minus Irrelevant"], loc='lower center',
94 | bbox_to_anchor=[0.5,-0.5])
95 | pl.savefig('all_relevances.png', bbox_inches='tight', dpi=600)
96 |
--------------------------------------------------------------------------------
/ltr/sdbn_functions.py:
--------------------------------------------------------------------------------
1 | import pandas
2 | import glob
3 |
4 | def all_sessions():
5 | sessions = pandas.concat([pandas.read_csv(f, compression="gzip")
6 | for f in glob.glob("data/*_sessions.gz")])
7 | sessions = sessions.sort_values(['query', 'sess_id', 'rank'])
8 | sessions = sessions.rename(columns={"clicked_doc_id": "doc_id"})
9 | return sessions
10 |
11 | def get_sessions(query="", index=True):
12 | sessions = all_sessions()
13 | sessions = sessions[sessions["query"] == query]
14 | return sessions if not index else sessions.set_index("sess_id")
15 |
16 | def calculate_ctr(sessions):
17 | click_counts = sessions.groupby("doc_id")["clicked"].sum()
18 | sess_counts = sessions.groupby("doc_id")["sess_id"].nunique()
19 | ctrs = click_counts / sess_counts
20 | return ctrs.sort_values(ascending=False)
21 |
22 | def calculate_average_rank(sessions):
23 | avg_rank = sessions.groupby("doc_id")["rank"].mean()
24 | return avg_rank.sort_values(ascending=True)
25 |
26 | def caclulate_examine_probability(sessions):
27 | last_click_per_session = sessions.groupby(["clicked", "sess_id"])["rank"].max()[True]
28 | sessions["last_click_rank"] = last_click_per_session
29 | sessions["examined"] = sessions["rank"] <= sessions["last_click_rank"]
30 | return sessions
31 |
32 | def calculate_clicked_examined(sessions):
33 | sessions = caclulate_examine_probability(sessions)
34 | return sessions[sessions["examined"]] \
35 | .groupby("doc_id")[["clicked", "examined"]].sum()
36 |
37 | def calculate_grade(sessions):
38 | sessions = calculate_clicked_examined(sessions)
39 | sessions["grade"] = sessions["clicked"] / sessions["examined"]
40 | return sessions.sort_values("grade", ascending=False)
41 |
42 | def calculate_prior(sessions, prior_grade, prior_weight):
43 | sessions = calculate_grade(sessions)
44 | sessions["prior_a"] = prior_grade * prior_weight
45 | sessions["prior_b"] = (1 - prior_grade) * prior_weight
46 | return sessions
47 |
48 | def calculate_sdbn(sessions, prior_grade=0.3, prior_weight=100):
49 | sessions = calculate_prior(sessions, prior_grade, prior_weight)
50 | sessions["posterior_a"] = (sessions["prior_a"] +
51 | sessions["clicked"])
52 | sessions["posterior_b"] = (sessions["prior_b"] +
53 | sessions["examined"] - sessions["clicked"])
54 | sessions["beta_grade"] = (sessions["posterior_a"] /
55 | (sessions["posterior_a"] + sessions["posterior_b"]))
56 | return sessions.sort_values("beta_grade", ascending=False)
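
A sketch of the full pipeline above for one query, assuming the gzipped data/*_sessions.gz files exist and that "dryer" is a query string present in them (the query string is only a placeholder):

    from ltr.sdbn_functions import get_sessions, calculate_sdbn

    sessions = get_sessions(query="dryer")      # placeholder query string
    graded = calculate_sdbn(sessions, prior_grade=0.3, prior_weight=100)
    print(graded[["clicked", "examined", "grade", "beta_grade"]].head())
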
--------------------------------------------------------------------------------
/ltr/search.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | baseEsQuery = {
4 | "size": 5,
5 | "query": {
6 | "sltr": {
7 | "params": {
8 | "keywords": "",
9 | },
10 | "model": ""
11 | }
12 | }
13 | }
14 |
15 | def esLtrQuery(keywords, modelName):
16 | import json
17 | baseEsQuery['query']['sltr']['params']['keywords'] = keywords
18 | baseEsQuery['query']['sltr']['model'] = modelName
19 | print("%s" % json.dumps(baseEsQuery))
20 | return baseEsQuery
21 |
22 | # TODO: Parse params and add efi dynamically instead of adding manually to query below
23 | def solrLtrQuery(keywords, modelName):
24 |     keywords = re.sub(r'([^\s\w]|_)+', '', keywords)
25 | fuzzy_keywords = ' '.join([x + '~' for x in keywords.split(' ')])
26 |
27 | return {
28 | 'fl': '*,score',
29 | 'rows': 5,
30 | 'q': '{{!ltr reRankDocs=30000 model={} efi.keywords="{}" efi.fuzzy_keywords="{}"}}'.format(modelName, keywords, fuzzy_keywords)
31 | }
32 |
33 |
34 | tmdbFields = {
35 | 'title': 'title',
36 | 'display_fields': ['release_year', 'genres', 'overview']
37 | }
38 |
39 |
40 |
41 | def search(client, keywords, modelName, index='tmdb', fields=tmdbFields):
42 | if client.name() == 'elastic':
43 | results = client.query(index, esLtrQuery(keywords, modelName))
44 | else:
45 | results = client.query(index, solrLtrQuery(keywords, modelName))
46 |
47 | ti = fields['title']
48 |
49 | for result in results:
50 | print("%s " % (result[ti] if ti in result else 'N/A'))
51 | print("%s " % (result['_score']))
52 |
53 | for df in fields['display_fields']:
54 | print("%s " % (result[df] if df in result else 'N/A'))
55 |
56 | print("---------------------------------------")
57 |
--------------------------------------------------------------------------------
/ltr/years_as_ratings.py:
--------------------------------------------------------------------------------
1 | def get_classic_rating(year):
2 | if year > 2010:
3 | return 0
4 | elif year > 1990:
5 | return 1
6 | elif year > 1970:
7 | return 2
8 | elif year > 1950:
9 | return 3
10 | else:
11 | return 4
12 |
13 | def get_latest_rating(year):
14 | if year > 2010:
15 | return 4
16 | elif year > 1990:
17 | return 3
18 | elif year > 1970:
19 | return 2
20 | elif year > 1950:
21 | return 1
22 | else:
23 | return 0
24 |
25 | def synthesize(client, featureSet='release', latestTrainingSetOut='data/latest-training.txt', classicTrainingSetOut='data/classic-training.txt'):
26 | from ltr.judgments import judgments_to_file, Judgment
27 | print('Generating ratings for classic and latest model')
28 | NO_ZERO = False
29 |
30 |     resp = client.log_query('tmdb', featureSet, None)
31 |
32 | docs = []
33 | for hit in resp:
34 | feature = list(hit['[features]'].values())[0]
35 | docs.append([feature]) # Treat features as ordered lists
36 |
37 | # Classic film fan
38 | judgments = []
39 | for fv in docs:
40 | rating = get_classic_rating(fv[0])
41 |
42 | if rating == 0 and NO_ZERO:
43 | continue
44 |
45 | judgments.append(Judgment(qid=1,doc_id=rating,grade=rating,features=fv,keywords=''))
46 |
47 | with open(classicTrainingSetOut, 'w') as out:
48 | judgments_to_file(out, judgments)
49 |
50 | judgments = []
51 | for fv in docs:
52 | rating = get_latest_rating(fv[0])
53 |
54 | if rating == 0 and NO_ZERO:
55 | continue
56 |
57 | judgments.append(Judgment(qid=1,doc_id=rating,grade=rating,features=fv,keywords=''))
58 |
59 |
60 | with open(latestTrainingSetOut, 'w') as out:
61 | judgments_to_file(out, judgments)
62 |
63 | print('Done')
--------------------------------------------------------------------------------
/semantic_search/__init__.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append('../..')
3 | from aips import get_entity_extractor, get_sparse_semantic_search
4 | from .query_tree import enrich, to_queries
5 |
6 | def generate_tagged_query(extracted_entities):
7 | query = extracted_entities["query"]
8 | last_end = 0
9 | tagged_query = ""
10 | for tag in extracted_entities["tags"]:
11 | next_text = query[last_end:tag["startOffset"]].strip()
12 | if len(next_text) > 0:
13 | tagged_query += " " + next_text
14 | tagged_query += " {" + tag["matchText"] + "}"
15 | last_end = tag["endOffset"]
16 | if last_end < len(query):
17 | final_text = query[last_end:len(query)].strip()
18 | if len(final_text):
19 | tagged_query += " " + final_text
20 | return tagged_query
21 |
22 | def generate_query_tree(extracted_entities):
23 | query = extracted_entities["query"]
24 | entities = {entity["id"]: entity for entity
25 | in extracted_entities["entities"]}
26 | query_tree = []
27 | last_end = 0
28 |
29 | for tag in extracted_entities["tags"]:
30 | best_entity = entities[tag["ids"][0]]
31 | for entity_id in tag["ids"]:
32 | if (entities[entity_id]["popularity"] >
33 | best_entity["popularity"]):
34 | best_entity = entities[entity_id]
35 |
36 | next_text = query[last_end:tag["startOffset"]].strip()
37 | if next_text:
38 | query_tree.append({"type": "keyword",
39 | "surface_form": next_text,
40 | "canonical_form": next_text})
41 | query_tree.append(best_entity)
42 | last_end = tag["endOffset"]
43 |
44 | if last_end < len(query):
45 | final_text = query[last_end:len(query)].strip()
46 | if final_text:
47 | query_tree.append({"type": "keyword",
48 | "surface_form": final_text,
49 | "canonical_form": final_text})
50 | return query_tree
51 |
52 | def process_semantic_query(collection, entities_collection, query):
53 | extractor = get_entity_extractor(entities_collection)
54 | semantic_functions = get_sparse_semantic_search()
55 | entities = extractor.extract_entities(query)
56 | tagged_query = generate_tagged_query(entities)
57 | query_tree = generate_query_tree(entities)
58 | enriched_query = " ".join([str(q) for q in query_tree])
59 | enriched_query_tree = enrich(collection, query_tree)
60 | transformed = semantic_functions.transform_query(enriched_query_tree)
61 |
62 | return {
63 | "tagged_query": tagged_query,
64 | "parsed_query": enriched_query,
65 | "transformed_query": to_queries(transformed)[0],
66 | "tagger_data": entities
67 | }
68 |
69 | def process_basic_query(query):
70 | semantic_functions = get_sparse_semantic_search()
71 | return {"transformed_query": semantic_functions.generate_basic_query(query)}
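
To make the offset arithmetic in generate_tagged_query concrete, here is a hypothetical tagger payload containing only the keys the function reads (importing the package assumes the aips backend it wires up at module load is reachable):

    from semantic_search import generate_tagged_query

    extracted = {"query": "restaurants near chicago",
                 "tags": [{"startOffset": 17, "endOffset": 24, "matchText": "chicago"}]}
    print(generate_tagged_query(extracted))   # -> " restaurants near {chicago}" (leading space)
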
--------------------------------------------------------------------------------
/semantic_search/query_tree.py:
--------------------------------------------------------------------------------
1 | from aips import get_semantic_knowledge_graph, get_sparse_semantic_search
2 |
3 | semantic_functions = get_sparse_semantic_search()
4 |
5 | def create_geo_filter(coordinates, field, distance_in_KM):
6 | return semantic_functions.create_geo_filter(coordinates, field, distance_in_KM)
7 |
8 | def popularity(query, position):
9 | return semantic_functions.popularity(query, position)
10 |
11 | def location_distance(query, position):
12 | return semantic_functions.location_distance(query, position)
13 |
14 | def to_queries(query_tree):
15 | return [node["query"] for node in query_tree]
16 |
17 | def process_semantic_functions(query_tree):
18 | position = 0
19 | while position < len(query_tree):
20 | node = query_tree[position]
21 | if node["type"] == "semantic_function":
22 | query = {"query_tree": query_tree}
23 | command_successful = eval(node["semantic_function"])
24 | if not command_successful:
25 | node["type"] = "invalid_semantic_function"
26 | position += 1
27 | return query_tree
28 |
29 | def get_enrichments(collection, keyword, limit=4):
30 | enrichments = {}
31 | nodes_to_traverse = [{"field": "content",
32 | "values": [keyword],
33 | "default_operator": "OR"},
34 | [{"name": "related_terms",
35 | "field": "content",
36 | "limit": limit},
37 | {"name": "doc_type",
38 | "field": "doc_type",
39 | "limit": 1}]]
40 | skg = get_semantic_knowledge_graph(collection)
41 | traversals = skg.traverse(*nodes_to_traverse)
42 | if "traversals" not in traversals["graph"][0]["values"][keyword]:
43 | return enrichments
44 |
45 | nested_traversals = traversals["graph"][0]["values"][keyword]["traversals"]
46 |
47 | doc_types = list(filter(lambda t: t["name"] == "doc_type",
48 | nested_traversals))
49 | if doc_types:
50 | enrichments["category"] = next(iter(doc_types[0]["values"]))
51 |
52 | related_terms = list(filter(lambda t: t["name"] == "related_terms",
53 | nested_traversals))
54 | if related_terms:
55 | term_vector = ""
56 | for term, data in related_terms[0]["values"].items():
57 | term_vector += f'{term}^{round(data["relatedness"], 4)} '
58 | enrichments["term_vector"] = term_vector.strip()
59 |
60 | return enrichments
61 |
62 | def enrich(collection, query_tree):
63 | query_tree = process_semantic_functions(query_tree)
64 | for item in query_tree:
65 | if item["type"] == "keyword":
66 | enrichments = get_enrichments(collection, item["surface_form"])
67 | if enrichments:
68 | item["type"] = "skg_enriched"
69 | item["enrichments"] = enrichments
70 | return query_tree
--------------------------------------------------------------------------------
/webserver/.vscode/launch.json:
--------------------------------------------------------------------------------
1 | {
2 | // Use IntelliSense to learn about possible attributes.
3 | // Hover to view descriptions of existing attributes.
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5 | "version": "0.2.0",
6 | "configurations": [
7 | {
8 | "name": "Python: Current File",
9 | "type": "python",
10 | "request": "launch",
11 | "program": "${file}",
12 | "console": "integratedTerminal"
13 | }
14 | ]
15 | }
--------------------------------------------------------------------------------
/webserver/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "python.pythonPath": "/usr/local/anaconda3/bin/python"
3 | }
--------------------------------------------------------------------------------
/webserver/display/render_search_results.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append('../..')
3 | from aips import *
4 | import os, re
5 |
6 | def render_search_results(results, keywords_to_highlight):
7 | file_path = os.path.dirname(os.path.abspath(__file__))
8 | search_results_template_file = os.path.join(file_path, "search-results-template.html")
9 | with open(search_results_template_file) as file:
10 | file_content = file.read()
11 |
12 | template_syntax = "(.*)"
13 | header_template = re.sub(template_syntax, "", file_content, flags=re.S)
14 |
15 | results_template_syntax = "(.*)"
16 | x = re.search(results_template_syntax, file_content, flags=re.S)
17 | results_template = x.group(1)
18 |
19 | separator_template_syntax = "(.*)"
20 | x = re.search(separator_template_syntax, file_content, flags=re.S)
21 | separator_template = x.group(1)
22 |
23 | rendered = ""
24 | for result in results["docs"]:
25 | #todo: add highlighting
26 | coordinates = result["location_coordinates"].split(",")
27 | rendered += results_template.replace("${NAME}", result.get("business_name", "UNKNOWN")) \
28 | .replace("${CITY}", result.get("city", "Anywhere") + ", " + result.get("state", "USA"))\
29 | .replace("${IMAGE_URL}", "/map?lat=" + coordinates[0] + "&lon=" + coordinates[1]) \
30 | .replace("${STARS}", "★" * int(result.get("stars_rating", 0)))
31 | rendered += separator_template
32 |
33 | if rendered == "":
34 | rendered = "No Results for this query."
35 |
36 | return rendered
--------------------------------------------------------------------------------
/webserver/display/search-results-template.html:
--------------------------------------------------------------------------------
[HTML template markup not preserved in this listing; visible text content: "Name: ${NAME} | City: ${CITY} | Rating: ${STARS}"]
--------------------------------------------------------------------------------
/webserver/is-running.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treygrainger/ai-powered-search/b7b716960798eb891ff4610fce61a2537bac98e7/webserver/is-running.png
--------------------------------------------------------------------------------
/webserver/start-webserver.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | sys.path.append('..')
4 | import http.server
5 | import io
6 | import json
7 | import threading
8 | import webbrowser
9 |
10 |
11 |
12 | import urllib.parse
13 |
14 | import requests
15 |
16 | from urllib.parse import parse_qs, urlparse
17 |
18 | from aips import get_engine, get_entity_extractor, get_semantic_knowledge_graph, get_sparse_semantic_search
19 | from aips.environment import AIPS_WEBSERVER_HOST, AIPS_WEBSERVER_PORT, WEBSERVER_URL
20 | from staticmap import CircleMarker, StaticMap
21 |
22 | from webserver.display.render_search_results import render_search_results
23 | from semantic_search import process_semantic_query, process_basic_query
24 |
25 | engine = get_engine()
26 | reviews_collection = engine.get_collection("reviews")
27 | entities_collection = engine.get_collection("entities")
28 | entity_extractor = get_entity_extractor(entities_collection)
29 | query_transformer = get_sparse_semantic_search()
30 |
31 | def keyword_search(text):
32 | request = {"query": text,
33 | "query_fields": ["content"]}
34 | return reviews_collection.search(**request)
35 |
36 | class SemanticSearchHandler(http.server.SimpleHTTPRequestHandler):
37 | """Semantic Search Handler (AI-Powered Search)"""
38 |
39 | def sendResponse(self, response):
40 | try:
41 | self.send_response(200)
42 | self.end_headers()
43 | self.wfile.write(bytes(json.dumps(response), 'utf-8'))
44 | except Exception as ex:
45 |             self.send_error(500, str(ex))
46 |
47 |
48 | def sendImageResponse(self, response):
49 | try:
50 | self.send_response(200)
51 | self.end_headers()
52 | self.wfile.write(bytes(response))
53 | except Exception as ex:
54 |             self.send_error(500, str(ex))
55 |
56 | def do_POST(self):
57 |         content_len = int(self.headers.get("Content-Length", 0))
58 | post_body = self.rfile.read(content_len).decode('UTF-8')
59 |
60 | if (self.path.startswith("/tag_query")):
61 | self.sendResponse(entity_extractor.extract_entities(post_body))
62 | elif self.path.startswith("/tag_places"):
63 | request = {"query": post_body,
64 | "query_fields": ["city", "state", "location_coordinates"]}
65 | response = reviews_collection.search(**request)
66 | self.sendResponse(response)
67 | elif self.path.startswith("/process_semantic_query"):
68 | self.sendResponse(process_semantic_query(reviews_collection,
69 | entities_collection,
70 | post_body))
71 | elif self.path.startswith("/process_basic_query"):
72 | self.sendResponse(process_basic_query(post_body))
73 | elif self.path.startswith("/run_search"):
74 | results = keyword_search(post_body)
75 | highlight_terms = post_body.split(' ')
76 | rendered_results = render_search_results(results, highlight_terms)
77 | self.sendResponse(rendered_results)
78 |
79 | def do_GET(self):
80 | if self.path.startswith("/search") or self.path.startswith("/semantic-search"):
81 | self.path = "display/search.html"
82 | http.server.SimpleHTTPRequestHandler.do_GET(self)
83 |
84 | elif self.path.startswith("/map"):
85 | qsVars = parse_qs(urlparse(self.path).query)
86 | if 'lat' in qsVars and 'lon' in qsVars:
87 | lat = float(qsVars["lat"][0])
88 | lon = float(qsVars["lon"][0])
89 | zoom = int(qsVars['zoom'][0]) if 'zoom' in qsVars else 10
90 | m = StaticMap(200, 200)
91 | marker_outline = CircleMarker((lon, lat), 'white', 18)
92 | marker = CircleMarker((lon, lat), '#0036FF', 12)
93 | m.add_marker(marker_outline)
94 | m.add_marker(marker)
95 |
96 | image = m.render(zoom=zoom)
97 | buf = io.BytesIO()
98 | image.save(buf, format='JPEG')
99 | self.sendImageResponse(buf.getvalue())
100 | elif self.path.startswith("/healthcheck"):
101 | self.send_response(200)
102 | self.send_header('Access-Control-Allow-Private-Network', 'true')
103 | self.send_header('Access-Control-Allow-Origin','*')
104 | self.send_header('Content-type','image/png')
105 | self.end_headers()
106 | #Open the static file requested and send it
107 | image = open("is-running.png", 'br')
108 | self.wfile.write(image.read())
109 | image.close()
110 |
111 | def open_browser():
112 | """Start a browser after waiting for half a second."""
113 | FILE = "semantic-search"
114 | def _open_browser():
115 | if AIPS_WEBSERVER_HOST == "localhost":
116 | webbrowser.open(WEBSERVER_URL + '/%s' % FILE)
117 | thread = threading.Timer(0.5, _open_browser)
118 | thread.start()
119 |
120 | def start_server():
121 | """Start the server."""
122 | server_address = ("0.0.0.0", int(AIPS_WEBSERVER_PORT))
123 | server = http.server.HTTPServer(server_address, SemanticSearchHandler)
124 | server.serve_forever()
125 |
126 | if __name__ == "__main__":
127 | open_browser()
128 | start_server()
--------------------------------------------------------------------------------