├── ltr ├── helpers │ ├── __init__.py │ ├── msmarco │ │ ├── __init__.py │ │ └── evaluate.py │ ├── handle_resp.py │ ├── esUrlParse.py │ ├── solr_escape.py │ ├── defaultlist.py │ ├── convert.py │ ├── butterfingers.py │ ├── tau.py │ └── movies.py ├── clickmodels │ ├── __init__.py │ ├── coec.py │ ├── conversion.py │ ├── cascade.py │ ├── session.py │ └── sdbn.py ├── client │ ├── __init__.py │ ├── base_client.py │ └── solr_parse.py ├── __init__.py ├── p9_plots.py ├── index.py ├── download.py ├── release_date_plot.py ├── injectTypos.py ├── years_as_ratings.py ├── search.py ├── evaluate.py ├── date_genre_judgments.py └── log.py ├── rre ├── solr │ ├── .gitignore │ ├── .dockerignore │ ├── src │ │ └── etc │ │ │ ├── templates │ │ │ ├── baseline │ │ │ │ └── query.json │ │ │ ├── classic │ │ │ │ └── query.json │ │ │ ├── latest │ │ │ │ └── query.json │ │ │ └── README.md │ │ │ ├── configuration_sets │ │ │ ├── classic │ │ │ │ └── solr-settings.json │ │ │ ├── latest │ │ │ │ └── solr-settings.json │ │ │ ├── baseline │ │ │ │ └── solr-settings.json │ │ │ └── README.md │ │ │ └── ratings │ │ │ ├── README.md │ │ │ └── ratings.json │ └── Dockerfile ├── elastic │ ├── .dockerignore │ ├── .gitignore │ ├── src │ │ └── etc │ │ │ ├── configuration_sets │ │ │ ├── classic │ │ │ │ └── index-settings.json │ │ │ ├── latest │ │ │ │ └── index-settings.json │ │ │ ├── baseline │ │ │ │ └── index-settings.json │ │ │ └── README.md │ │ │ ├── templates │ │ │ ├── baseline │ │ │ │ └── query.json │ │ │ ├── classic │ │ │ │ └── query.json │ │ │ ├── latest │ │ │ │ └── query.json │ │ │ └── README.md │ │ │ └── ratings │ │ │ └── ratings.json │ └── Dockerfile └── README.md ├── notebooks ├── solr │ ├── tmdb │ │ ├── solr_config │ │ │ └── conf │ │ │ │ ├── names.txt │ │ │ │ ├── name_synonyms.txt │ │ │ │ ├── synonyms_genres.txt │ │ │ │ ├── synonyms_directed.txt │ │ │ │ ├── lang │ │ │ │ ├── contractions_ga.txt │ │ │ │ ├── hyphenations_ga.txt │ │ │ │ ├── contractions_ca.txt │ │ │ │ ├── stemdict_nl.txt │ │ │ │ ├── 
contractions_fr.txt │ │ │ │ ├── contractions_it.txt │ │ │ │ ├── stopwords_hy.txt │ │ │ │ ├── stopwords_el.txt │ │ │ │ ├── stopwords_ga.txt │ │ │ │ ├── stopwords_eu.txt │ │ │ │ ├── userdict_ja.txt │ │ │ │ ├── stopwords_en.txt │ │ │ │ ├── stopwords_th.txt │ │ │ │ ├── stopwords_ar.txt │ │ │ │ ├── stopwords_gl.txt │ │ │ │ ├── stopwords_cz.txt │ │ │ │ ├── stopwords_ja.txt │ │ │ │ ├── stopwords_lv.txt │ │ │ │ ├── stopwords_bg.txt │ │ │ │ ├── stopwords_ca.txt │ │ │ │ ├── stopwords_tr.txt │ │ │ │ ├── stopwords_ro.txt │ │ │ │ ├── stopwords_hu.txt │ │ │ │ ├── stopwords_hi.txt │ │ │ │ ├── stopwords_fi.txt │ │ │ │ ├── stopwords_fa.txt │ │ │ │ └── stopwords_da.txt │ │ │ │ ├── synonyms_multiterm.txt │ │ │ │ ├── synonyms_bidirect.txt │ │ │ │ ├── taxonomy_parent.txt │ │ │ │ ├── params.json │ │ │ │ ├── taxonomy.txt │ │ │ │ ├── idioms.txt │ │ │ │ ├── stopwords.txt │ │ │ │ ├── protwords.txt │ │ │ │ ├── synonyms.txt │ │ │ │ └── elevate.xml │ │ ├── ltr.py │ │ └── evaluation (Solr).ipynb │ ├── .docker │ │ └── solr_home │ │ │ ├── tmdb │ │ │ └── conf │ │ │ │ ├── names.txt │ │ │ │ ├── name_synonyms.txt │ │ │ │ ├── synonyms_genres.txt │ │ │ │ ├── synonyms_directed.txt │ │ │ │ ├── lang │ │ │ │ ├── hyphenations_ga.txt │ │ │ │ ├── contractions_ga.txt │ │ │ │ ├── contractions_ca.txt │ │ │ │ ├── stemdict_nl.txt │ │ │ │ ├── contractions_fr.txt │ │ │ │ ├── contractions_it.txt │ │ │ │ ├── stopwords_hy.txt │ │ │ │ ├── stopwords_el.txt │ │ │ │ ├── stopwords_ga.txt │ │ │ │ ├── stopwords_eu.txt │ │ │ │ ├── userdict_ja.txt │ │ │ │ ├── stopwords_en.txt │ │ │ │ ├── stopwords_th.txt │ │ │ │ ├── stopwords_ar.txt │ │ │ │ ├── stopwords_gl.txt │ │ │ │ ├── stopwords_cz.txt │ │ │ │ ├── stopwords_ja.txt │ │ │ │ ├── stopwords_lv.txt │ │ │ │ ├── stopwords_bg.txt │ │ │ │ ├── stopwords_ca.txt │ │ │ │ ├── stopwords_tr.txt │ │ │ │ ├── stopwords_ro.txt │ │ │ │ ├── stopwords_hu.txt │ │ │ │ ├── stopwords_hi.txt │ │ │ │ ├── stopwords_fi.txt │ │ │ │ ├── stopwords_fa.txt │ │ │ │ └── stopwords_da.txt │ │ │ │ ├── 
synonyms_multiterm.txt │ │ │ │ ├── synonyms_bidirect.txt │ │ │ │ ├── taxonomy_parent.txt │ │ │ │ ├── params.json │ │ │ │ ├── taxonomy.txt │ │ │ │ ├── idioms.txt │ │ │ │ ├── stopwords.txt │ │ │ │ ├── protwords.txt │ │ │ │ ├── synonyms.txt │ │ │ │ └── elevate.xml │ │ │ ├── zoo.cfg │ │ │ └── solr.xml │ ├── docker-compose.yml │ ├── Dockerfile │ └── msmarco │ │ ├── solr_config │ │ └── conf │ │ │ ├── params.json │ │ │ └── elevate.xml │ │ └── ltr.py ├── elasticsearch │ ├── tmdb │ │ ├── fmap.txt │ │ ├── ltr.py │ │ ├── Dataframes.ipynb │ │ └── evaluation.ipynb │ ├── .docker │ │ ├── kb-docker │ │ │ └── Dockerfile │ │ └── es-docker │ │ │ ├── elasticsearch.sh │ │ │ ├── Dockerfile │ │ │ └── elasticsearch.yml │ ├── README.md │ ├── docker-compose.yml │ └── osc-blog │ │ ├── ltr.py │ │ └── blog_settings.json ├── ltr.py └── exercises │ └── ltr.py ├── utils ├── utils.py ├── rateFuzzySearch.json.jinja ├── rateSearch.json.jinja └── train_to_csv.py ├── .dockerignore ├── clean-notebooks.sh ├── .gitignore ├── tests ├── fail.py ├── pass.py ├── test_prep.py ├── nb_test_config.py ├── run_most_nbs.py ├── runner.py └── notebook_test_case.py ├── Dockerfile ├── docker └── README.md ├── docker-compose.yml ├── requirements.txt └── README.md /ltr/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ltr/clickmodels/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ltr/helpers/msmarco/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rre/solr/.gitignore: -------------------------------------------------------------------------------- 1 | target/* 2 | 
-------------------------------------------------------------------------------- /rre/elastic/.dockerignore: -------------------------------------------------------------------------------- 1 | target/* 2 | -------------------------------------------------------------------------------- /rre/elastic/.gitignore: -------------------------------------------------------------------------------- 1 | target/* 2 | -------------------------------------------------------------------------------- /rre/solr/.dockerignore: -------------------------------------------------------------------------------- 1 | target/* 2 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/names.txt: -------------------------------------------------------------------------------- 1 | luke_skywalker -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/names.txt: -------------------------------------------------------------------------------- 1 | luke_skywalker -------------------------------------------------------------------------------- /notebooks/elasticsearch/tmdb/fmap.txt: -------------------------------------------------------------------------------- 1 | 0 release_year q 2 | 1 features0 q 3 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/name_synonyms.txt: -------------------------------------------------------------------------------- 1 | sky walker, skywalker -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/name_synonyms.txt: -------------------------------------------------------------------------------- 1 | sky walker, skywalker -------------------------------------------------------------------------------- /utils/utils.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.append(os.getcwd()) 5 | -------------------------------------------------------------------------------- /rre/solr/src/etc/templates/baseline/query.json: -------------------------------------------------------------------------------- 1 | { 2 | "q": "title:($query)" 3 | } 4 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/synonyms_genres.txt: -------------------------------------------------------------------------------- 1 | scifi,science fiction,science fiction movie -------------------------------------------------------------------------------- /ltr/client/__init__.py: -------------------------------------------------------------------------------- 1 | from .elastic_client import ElasticClient 2 | from .solr_client import SolrClient 3 | -------------------------------------------------------------------------------- /notebooks/elasticsearch/.docker/kb-docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.elastic.co/kibana/kibana:7.12.1 2 | 3 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/synonyms_genres.txt: -------------------------------------------------------------------------------- 1 | scifi,science fiction,science fiction movie -------------------------------------------------------------------------------- /rre/solr/src/etc/templates/classic/query.json: -------------------------------------------------------------------------------- 1 | { 2 | "q": "title:($query)", 3 | "rq": "{!ltr model=classic}" 4 | } 5 | -------------------------------------------------------------------------------- /rre/solr/src/etc/templates/latest/query.json: -------------------------------------------------------------------------------- 1 | { 
2 | "q": "title:($query)", 3 | "rq": "{!ltr model=latest}" 4 | } 5 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | data/ 2 | venv/ 3 | venv2/ 4 | .git/ 5 | .cache/ 6 | .trash/ 7 | **/venv* 8 | **/data/ 9 | **/__pycache__/ 10 | -------------------------------------------------------------------------------- /rre/elastic/src/etc/configuration_sets/classic/index-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "hostUrls": [ "http://host.docker.internal:9200" ], 3 | "index": "tmdb" 4 | } 5 | -------------------------------------------------------------------------------- /rre/elastic/src/etc/configuration_sets/latest/index-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "hostUrls": [ "http://host.docker.internal:9200" ], 3 | "index": "tmdb" 4 | } 5 | -------------------------------------------------------------------------------- /rre/elastic/src/etc/configuration_sets/baseline/index-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "hostUrls": [ "http://host.docker.internal:9200" ], 3 | "index": "tmdb" 4 | } 5 | -------------------------------------------------------------------------------- /clean-notebooks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Removes all output and metadata from notebooks 4 | find notebooks -type f -name "*.ipynb" -print0 | xargs -0 nbstripout 5 | -------------------------------------------------------------------------------- /rre/solr/src/etc/configuration_sets/classic/solr-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "baseUrls": [ "http://host.docker.internal:8983/solr" ], 3 | "collectionName": "tmdb" 4 | } 5 | 
-------------------------------------------------------------------------------- /rre/solr/src/etc/configuration_sets/latest/solr-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "baseUrls": [ "http://host.docker.internal:8983/solr" ], 3 | "collectionName": "tmdb" 4 | } 5 | -------------------------------------------------------------------------------- /notebooks/elasticsearch/.docker/es-docker/elasticsearch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | docker run -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" elasticsearch-tlre 4 | -------------------------------------------------------------------------------- /rre/solr/src/etc/configuration_sets/baseline/solr-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "baseUrls": [ "http://host.docker.internal:8983/solr" ], 3 | "collectionName": "tmdb" 4 | } 5 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/synonyms_directed.txt: -------------------------------------------------------------------------------- 1 | wife => wife, bride 2 | spouse => spouse, husband, wife, partner 3 | tunes => cartoons, toons, songs 4 | cartoon => toons, tunes -------------------------------------------------------------------------------- /rre/elastic/src/etc/templates/baseline/query.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "match": { 4 | "title": { 5 | "query": "$query" 6 | } 7 | } 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/synonyms_directed.txt: -------------------------------------------------------------------------------- 1 | wife => wife, bride 2 | spouse => spouse, husband, wife, partner 3 | tunes => cartoons, 
toons, songs 4 | cartoon => toons, tunes -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | **/data 3 | venv/* 4 | **/.ipynb_checkpoints 5 | tests/last_run.ipynb 6 | 7 | *.pyc 8 | .vscode 9 | .cache 10 | features.txt 11 | .trash 12 | .DS_store 13 | notify.sh 14 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/contractions_ga.txt: -------------------------------------------------------------------------------- 1 | # Set of Irish contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | d 4 | m 5 | b 6 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/hyphenations_ga.txt: -------------------------------------------------------------------------------- 1 | # Set of Irish hyphenations for StopFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | h 4 | n 5 | t 6 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/hyphenations_ga.txt: -------------------------------------------------------------------------------- 1 | # Set of Irish hyphenations for StopFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | h 4 | n 5 | t 6 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/synonyms_multiterm.txt: -------------------------------------------------------------------------------- 1 | # Here are some multi term synonym to 2 | # see what happens at query time 3 | 4 | looney tunes, cartoons 5 | science fiction, sci fi, sci-fi, scifi -------------------------------------------------------------------------------- 
/tests/fail.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | class Fail(unittest.TestCase): 4 | 5 | def test_that_fails(self): 6 | assert 1 == 0 7 | 8 | if __name__ == "__main__": 9 | unittest.main() 10 | -------------------------------------------------------------------------------- /tests/pass.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | class Pass(unittest.TestCase): 4 | 5 | def test_that_passes(self): 6 | assert 1 == 1 7 | 8 | if __name__ == "__main__": 9 | unittest.main() 10 | -------------------------------------------------------------------------------- /ltr/__init__.py: -------------------------------------------------------------------------------- 1 | # Make the most important pieces just available as 2 | # ie - from ltr import download 3 | from .download import download 4 | from .evaluate import evaluate, rre_table 5 | from .search import search 6 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/contractions_ga.txt: -------------------------------------------------------------------------------- 1 | # Set of Irish contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | d 4 | m 5 | b 6 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/synonyms_multiterm.txt: -------------------------------------------------------------------------------- 1 | # Here are some multi term synonym to 2 | # see what happens at query time 3 | 4 | looney tunes, cartoons 5 | science fiction, sci fi, sci-fi, scifi -------------------------------------------------------------------------------- /rre/solr/src/etc/ratings/README.md: -------------------------------------------------------------------------------- 1 | Under the ratings 
folder you should have at least 1 ratings file. 2 | A ratings file is connected with a dataset and contains a set of queries that compose the evaluation execution. -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/contractions_ca.txt: -------------------------------------------------------------------------------- 1 | # Set of Catalan contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | d 4 | l 5 | m 6 | n 7 | s 8 | t 9 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/contractions_ca.txt: -------------------------------------------------------------------------------- 1 | # Set of Catalan contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | d 4 | l 5 | m 6 | n 7 | s 8 | t 9 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stemdict_nl.txt: -------------------------------------------------------------------------------- 1 | # Set of overrides for the dutch stemmer 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | fiets fiets 4 | bromfiets bromfiets 5 | ei eier 6 | kind kinder 7 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stemdict_nl.txt: -------------------------------------------------------------------------------- 1 | # Set of overrides for the dutch stemmer 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | fiets fiets 4 | bromfiets bromfiets 5 | ei eier 6 | kind kinder 7 | -------------------------------------------------------------------------------- /ltr/helpers/handle_resp.py: -------------------------------------------------------------------------------- 1 
| 2 | 3 | def resp_msg(msg, resp, throw=True): 4 | print('{} [Status: {}]'.format(msg, resp.status_code)) 5 | if resp.status_code >= 400: 6 | print(resp.text) 7 | if throw: 8 | raise RuntimeError(resp.text) 9 | 10 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/contractions_fr.txt: -------------------------------------------------------------------------------- 1 | # Set of French contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | l 4 | m 5 | t 6 | qu 7 | n 8 | s 9 | j 10 | d 11 | c 12 | jusqu 13 | quoiqu 14 | lorsqu 15 | puisqu 16 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/contractions_fr.txt: -------------------------------------------------------------------------------- 1 | # Set of French contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | l 4 | m 5 | t 6 | qu 7 | n 8 | s 9 | j 10 | d 11 | c 12 | jusqu 13 | quoiqu 14 | lorsqu 15 | puisqu 16 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/synonyms_bidirect.txt: -------------------------------------------------------------------------------- 1 | # Often people erroneously equate linguistic synonyms 2 | # with Solr synonyms. Here the bidirectional nature 3 | # of the synonyms creates problems where the more specific 4 | # term is not prioritized 5 | wife,bride 6 | wife,spouse 7 | toons,tunes,cartoon -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/synonyms_bidirect.txt: -------------------------------------------------------------------------------- 1 | # Often people erroneously equate linguistic synonyms 2 | # with Solr synonyms. 
Here the bidirectional nature 3 | # of the synonyms creates problems where the more specific 4 | # term is not prioritized 5 | wife,bride 6 | wife,spouse 7 | toons,tunes,cartoon -------------------------------------------------------------------------------- /notebooks/elasticsearch/README.md: -------------------------------------------------------------------------------- 1 | This folder contains some Elasticsearch configuration and a Dockerfile to expedite setting up Elasticsearch with LTR. 2 | 3 | ## Docker 4 | Run `docker-compose up` to create a image running Elasticsearch with LTR 5 | 6 | After the instance is running, load up the "hello-ltr (ES)" notebook. 7 | -------------------------------------------------------------------------------- /notebooks/solr/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | solr: 4 | build: . 5 | expose: 6 | - "8983" 7 | ports: 8 | - "8983:8983" 9 | volumes: 10 | - data:/var/solr 11 | environment: 12 | SERVER_HOST: "0.0.0.0" 13 | mem_limit: 4096m 14 | mem_reservation: 4096m 15 | volumes: 16 | data: 17 | -------------------------------------------------------------------------------- /utils/rateFuzzySearch.json.jinja: -------------------------------------------------------------------------------- 1 | { 2 | "from": 0, 3 | "size": 7, 4 | "query": { 5 | "bool": { 6 | "should": [ 7 | {"match": { 8 | "title": { 9 | "query": "{{ keywords }}", 10 | "fuzziness": "AUTO"} 11 | }} 12 | ] 13 | } 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /notebooks/solr/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM solr:8.5.1 2 | 3 | USER root 4 | 5 | ADD tmdb/solr_config /var/solr/data/configsets/tmdb 6 | RUN chown solr:solr /var/solr/data/configsets/tmdb 7 | 8 | ADD msmarco/solr_config /var/solr/data/configsets/msmarco 9 | RUN chown solr:solr 
/var/solr/data/configsets/msmarco 10 | 11 | USER solr 12 | 13 | CMD ["solr-foreground", "-Dsolr.ltr.enabled=true"] 14 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/contractions_it.txt: -------------------------------------------------------------------------------- 1 | # Set of Italian contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | c 4 | l 5 | all 6 | dall 7 | dell 8 | nell 9 | sull 10 | coll 11 | pell 12 | gl 13 | agl 14 | dagl 15 | degl 16 | negl 17 | sugl 18 | un 19 | m 20 | t 21 | s 22 | v 23 | d 24 | -------------------------------------------------------------------------------- /rre/elastic/src/etc/configuration_sets/README.md: -------------------------------------------------------------------------------- 1 | This folder contains one subfolder for each configuration version. 2 | Each version folder should contain the index settings associated with such version: 3 | 4 | - `hostUrls`: an array of URLs where the Elasticsearch instance for this 5 | version can be accessed. 6 | - `index`: the name of the index holding the data being used to search. 
7 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/contractions_it.txt: -------------------------------------------------------------------------------- 1 | # Set of Italian contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | c 4 | l 5 | all 6 | dall 7 | dell 8 | nell 9 | sull 10 | coll 11 | pell 12 | gl 13 | agl 14 | dagl 15 | degl 16 | negl 17 | sugl 18 | un 19 | m 20 | t 21 | s 22 | v 23 | d 24 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/taxonomy_parent.txt: -------------------------------------------------------------------------------- 1 | # Capture how the *user* structures information 2 | #looneytunes, looney tunes => looney_tunes 3 | #bugs bunny => bug_bunny, looney_tunes 4 | #mickey mouse => mickey_mouse, disney 5 | #minnie mouse => minnie_mouse, disney 6 | #donald duck => donald_duck, disney 7 | #yogi bear => yogi_bear, disney 8 | 9 | wife => wife, spouse 10 | bride => bride, spouse -------------------------------------------------------------------------------- /utils/rateSearch.json.jinja: -------------------------------------------------------------------------------- 1 | { 2 | "from": 0, 3 | "size": 5, 4 | "query": { 5 | "bool": { 6 | "should": [ 7 | {"match": { 8 | "text_all": "{{ keywords }}" 9 | }}, 10 | { 11 | "match_phrase": { 12 | "title": { 13 | "query": "{{ keywords }}", 14 | "boost": 1000 15 | } 16 | } 17 | }] 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/taxonomy_parent.txt: -------------------------------------------------------------------------------- 1 | # Capture how the *user* structures information 2 | #looneytunes, looney tunes => looney_tunes 3 | #bugs bunny => bug_bunny, looney_tunes 4 | #mickey mouse => 
mickey_mouse, disney 5 | #minnie mouse => minnie_mouse, disney 6 | #donald duck => donald_duck, disney 7 | #yogi bear => yogi_bear, disney 8 | 9 | wife => wife, spouse 10 | bride => bride, spouse -------------------------------------------------------------------------------- /rre/solr/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM maven:3.6.0-jdk-8 2 | 3 | # Clone the RRE repo 4 | RUN git clone https://github.com/SeaseLtd/rated-ranking-evaluator 5 | WORKDIR rated-ranking-evaluator 6 | 7 | # Build RRE 8 | RUN mvn clean install 9 | 10 | # Bring over the RRE config 11 | WORKDIR / 12 | COPY . rre 13 | WORKDIR rre 14 | 15 | # By default, run an RRE evaluation if no other command is specified 16 | CMD mvn clean install 17 | -------------------------------------------------------------------------------- /tests/test_prep.py: -------------------------------------------------------------------------------- 1 | from ltr.client.solr_client import SolrClient 2 | client = SolrClient() 3 | 4 | from ltr import download 5 | from ltr.index import rebuild 6 | from ltr.helpers.movies import indexable_movies 7 | 8 | corpus='http://es-learn-to-rank.labs.o19s.com/tmdb.json' 9 | download([corpus], dest='data/'); 10 | 11 | movies=indexable_movies(movies='data/tmdb.json') 12 | rebuild(client, index='tmdb', doc_src=movies) -------------------------------------------------------------------------------- /rre/elastic/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM maven:3.6.0-jdk-8 2 | 3 | # Clone the RRE repo 4 | RUN git clone https://github.com/SeaseLtd/rated-ranking-evaluator 5 | WORKDIR rated-ranking-evaluator 6 | 7 | # Build RRE 8 | RUN mvn clean install 9 | 10 | # Bring over the RRE config 11 | WORKDIR / 12 | COPY . 
rre 13 | WORKDIR rre 14 | 15 | # By default, run an RRE evaluation if no other command is specified 16 | CMD mvn clean install 17 | -------------------------------------------------------------------------------- /rre/elastic/src/etc/templates/classic/query.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "bool": { 4 | "should": [ 5 | { 6 | "sltr": { 7 | "featureset": "release", 8 | "model": "classic", 9 | "params": {} 10 | } 11 | } 12 | ], 13 | "filter": [ 14 | {"match": {"title": "$query"}} 15 | ] 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /rre/elastic/src/etc/templates/latest/query.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "bool": { 4 | "should": [ 5 | { 6 | "sltr": { 7 | "featureset": "release", 8 | "model": "latest", 9 | "params": {} 10 | } 11 | } 12 | ], 13 | "filter": [ 14 | {"match": {"title": "$query"}} 15 | ] 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /notebooks/elasticsearch/.docker/es-docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.elastic.co/elasticsearch/elasticsearch:7.12.1 2 | 3 | RUN bin/elasticsearch-plugin install -b https://github.com/o19s/elasticsearch-learning-to-rank/releases/download/v1.5.5-es7.12.1/ltr-plugin-v1.5.5-es7.12.1.zip 4 | COPY --chown=elasticsearch:elasticsearch elasticsearch.yml /usr/share/elasticsearch/config/ 5 | RUN cat /usr/share/elasticsearch/config/elasticsearch.yml 6 | 7 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/params.json: -------------------------------------------------------------------------------- 1 | {"params":{ 2 | "query":{ 3 | "defType":"edismax", 4 | "q.alt":"*:*", 5 | "rows":"10", 6 | "fl":"*,score", 7 | 
"":{"v":0} 8 | }, 9 | "facets":{ 10 | "facet":"on", 11 | "facet.mincount": "1", 12 | "":{"v":0} 13 | }, 14 | "velocity":{ 15 | "wt": "velocity", 16 | "v.template":"browse", 17 | "v.layout": "layout", 18 | "":{"v":0} 19 | } 20 | }} -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/params.json: -------------------------------------------------------------------------------- 1 | {"params":{ 2 | "query":{ 3 | "defType":"edismax", 4 | "q.alt":"*:*", 5 | "rows":"10", 6 | "fl":"*,score", 7 | "":{"v":0} 8 | }, 9 | "facets":{ 10 | "facet":"on", 11 | "facet.mincount": "1", 12 | "":{"v":0} 13 | }, 14 | "velocity":{ 15 | "wt": "velocity", 16 | "v.template":"browse", 17 | "v.layout": "layout", 18 | "":{"v":0} 19 | } 20 | }} -------------------------------------------------------------------------------- /notebooks/solr/msmarco/solr_config/conf/params.json: -------------------------------------------------------------------------------- 1 | {"params":{ 2 | "query":{ 3 | "defType":"edismax", 4 | "q.alt":"*:*", 5 | "rows":"10", 6 | "fl":"*,score", 7 | "":{"v":0} 8 | }, 9 | "facets":{ 10 | "facet":"on", 11 | "facet.mincount": "1", 12 | "":{"v":0} 13 | }, 14 | "velocity":{ 15 | "wt": "velocity", 16 | "v.template":"browse", 17 | "v.layout": "layout", 18 | "":{"v":0} 19 | } 20 | }} -------------------------------------------------------------------------------- /ltr/helpers/esUrlParse.py: -------------------------------------------------------------------------------- 1 | def parseUrl(fullEsUrl): 2 | from urllib.parse import urlsplit, urlunsplit 3 | import os.path 4 | o = urlsplit(fullEsUrl) 5 | 6 | esUrl = urlunsplit([o.scheme, o.netloc, '','','']) 7 | 8 | indexAndSearchType = os.path.split(o.path) 9 | 10 | return (esUrl, indexAndSearchType[0][1:], indexAndSearchType[1]) 11 | 12 | 13 | if __name__ == "__main__": 14 | from sys import argv 15 | print(parseUrl(argv[1])) 16 | 
-------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/taxonomy.txt: -------------------------------------------------------------------------------- 1 | # Capture how the *user* structures information 2 | #looneytunes, looney tunes => looney_tunes, cartoons 3 | #bugs bunny => bug_bunny, looney_tunes, cartoons 4 | #mickey mouse => mickey_mouse, disney, cartoons 5 | #minnie mouse => minnie_mouse, disney, cartoons 6 | #donald duck => donald_duck, disney, cartoons 7 | #yogi bear => yogi_bear, disney, cartoons 8 | 9 | wife => wife, spouse 10 | bride => bride, spouse 11 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/taxonomy.txt: -------------------------------------------------------------------------------- 1 | # Capture how the *user* structures information 2 | #looneytunes, looney tunes => looney_tunes, cartoons 3 | #bugs bunny => bug_bunny, looney_tunes, cartoons 4 | #mickey mouse => mickey_mouse, disney, cartoons 5 | #minnie mouse => minnie_mouse, disney, cartoons 6 | #donald duck => donald_duck, disney, cartoons 7 | #yogi bear => yogi_bear, disney, cartoons 8 | 9 | wife => wife, spouse 10 | bride => bride, spouse 11 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_hy.txt: -------------------------------------------------------------------------------- 1 | # example set of Armenian stopwords. 
2 | այդ 3 | այլ 4 | այն 5 | այս 6 | դու 7 | դուք 8 | եմ 9 | են 10 | ենք 11 | ես 12 | եք 13 | է 14 | էի 15 | էին 16 | էինք 17 | էիր 18 | էիք 19 | էր 20 | ըստ 21 | թ 22 | ի 23 | ին 24 | իսկ 25 | իր 26 | կամ 27 | համար 28 | հետ 29 | հետո 30 | մենք 31 | մեջ 32 | մի 33 | ն 34 | նա 35 | նաև 36 | նրա 37 | նրանք 38 | որ 39 | որը 40 | որոնք 41 | որպես 42 | ու 43 | ում 44 | պիտի 45 | վրա 46 | և 47 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_hy.txt: -------------------------------------------------------------------------------- 1 | # example set of Armenian stopwords. 2 | այդ 3 | այլ 4 | այն 5 | այս 6 | դու 7 | դուք 8 | եմ 9 | են 10 | ենք 11 | ես 12 | եք 13 | է 14 | էի 15 | էին 16 | էինք 17 | էիր 18 | էիք 19 | էր 20 | ըստ 21 | թ 22 | ի 23 | ին 24 | իսկ 25 | իր 26 | կամ 27 | համար 28 | հետ 29 | հետո 30 | մենք 31 | մեջ 32 | մի 33 | ն 34 | նա 35 | նաև 36 | նրա 37 | նրանք 38 | որ 39 | որը 40 | որոնք 41 | որպես 42 | ու 43 | ում 44 | պիտի 45 | վրա 46 | և 47 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/idioms.txt: -------------------------------------------------------------------------------- 1 | # Idioms is a synonyms file that captures idiomatic phrases as single units 2 | 3 | # LHS is all representations encountered in query or document 4 | looneytunes, looney tunes, looney toons => 12345 5 | sci fi, scifi, science fiction => 56789 6 | 7 | #looneytunes, looney tunes => looney_tunes 8 | #bugs bunny => bug_bunny 9 | #mickey mouse => mickey_mouse 10 | #minnie mouse => minnie_mouse 11 | #donald duck => donald_duck 12 | #yogi bear => yogi_bear -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/idioms.txt: -------------------------------------------------------------------------------- 1 | # Idioms is a synonyms file that captures idiomatic 
phrases as single units 2 | 3 | # LHS is all representations encountered in query or document 4 | looneytunes, looney tunes, looney toons => 12345 5 | sci fi, scifi, science fiction => 56789 6 | 7 | #looneytunes, looney tunes => looney_tunes 8 | #bugs bunny => bug_bunny 9 | #mickey mouse => mickey_mouse 10 | #minnie mouse => minnie_mouse 11 | #donald duck => donald_duck 12 | #yogi bear => yogi_bear -------------------------------------------------------------------------------- /rre/solr/src/etc/templates/README.md: -------------------------------------------------------------------------------- 1 | This folder will contain the query templates associated with the evaluation suite. 2 | A template is a JSON file containing a JSON object with name->value(s) pairs corresponding to query parameters. 3 | Although it is completely ok to have statically-defined values here, usually you will be using placeholders. 4 | 5 | ```javascript 6 | { 7 | "q": "$query", 8 | "fq": "language:$lang" 9 | } 10 | ``` 11 | The placeholders values will be defined within the ratings file, specifically in the queries definitions. -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7-stretch 2 | 3 | # Install openjdk 4 | RUN apt-get update && \ 5 | apt-get install -y openjdk-8-jdk graphviz && \ 6 | apt-get clean; 7 | 8 | # Setup a user 9 | RUN useradd -ms /bin/bash ltr 10 | WORKDIR /home/ltr 11 | 12 | # Make current directory accesible 13 | ADD . 
class NotebookTestConfig:
    """Collects the notebooks found in one directory, separating the setup notebook.

    After construction:
      setup     -- full path of 'setup.ipynb' if present, else None
      notebooks -- full paths of every other .ipynb file in the directory
    """

    SETUP_NB = 'setup.ipynb'

    def __init__(self, path):
        self.notebooks = []
        self.setup = None
        for entry in os.listdir(path):
            full_path = os.path.join(path, entry)
            # Only regular files with a notebook extension are considered;
            # subdirectories and other files are ignored.
            if not (os.path.isfile(full_path) and entry.endswith('.ipynb')):
                continue
            if entry == NotebookTestConfig.SETUP_NB:
                self.setup = full_path
            else:
                self.notebooks.append(full_path)
def plot_grades(dat):
    """Bar chart of relevance grades, faceted by query keywords.

    dat: DataFrame-like object with 'grade' and 'keywords' columns.
    Returns a plotnine ggplot object that callers can render or save.
    """
    import plotnine as p9

    # NOTE: parentheses, not braces — the original `{ ... }` built a
    # one-element *set* around the plot, so callers got a set instead of
    # the ggplot object itself.
    p = (
        p9.ggplot(dat, p9.aes('grade'))
        + p9.geom_bar()
        + p9.facet_wrap('keywords')
    )

    return p

def plot_features(dat):
    """Jittered scatter of feature value vs. grade, one facet per feature_id.

    dat: DataFrame-like object with 'grade', 'features', 'keywords' and
         'feature_id' columns.
    Returns a plotnine ggplot object.
    """
    import plotnine as p9

    p = (
        p9.ggplot(dat, p9.aes('grade', 'features', color='keywords'))
        + p9.geom_jitter(alpha=.5)
        + p9.facet_wrap('feature_id', scales='free_y', labeller='label_both')
        + p9.labs(y='Feature values', x='Relevance grade')
    )

    return p
14 | __file__ = {'sys': sys, 'importlib': importlib} 15 | 16 | del importlib 17 | del os 18 | del sys 19 | 20 | __file__['importlib'].reload(__file__['sys'].modules[__name__]) 21 | -------------------------------------------------------------------------------- /notebooks/exercises/ltr.py: -------------------------------------------------------------------------------- 1 | # Import a module with the same name from a different directory. 2 | # 3 | # Adapted from 4 | # https://mg.readthedocs.io/importing-local-python-modules-from-jupyter-notebooks/sys-path-in-helper-module/path-helper.html 5 | 6 | import importlib 7 | import os 8 | import sys 9 | 10 | sys.path.insert(0, os.path.abspath('../../')) 11 | 12 | # Temporarily hijack __file__ to avoid adding names at module scope; 13 | # __file__ will be overwritten again during the reload() call. 14 | __file__ = {'sys': sys, 'importlib': importlib} 15 | 16 | del importlib 17 | del os 18 | del sys 19 | 20 | __file__['importlib'].reload(__file__['sys'].modules[__name__]) 21 | -------------------------------------------------------------------------------- /rre/README.md: -------------------------------------------------------------------------------- 1 | rre 2 | 3 | This folder contains some basic RRE demonstrations for running evaluations against your LTR models. 4 | 5 | Navigate to `solr` or `elastic` depending on which you are using and do the following: 6 | 7 | ## Getting Started 8 | - Build the docker image: `docker build -t ltr-rre .` 9 | - Run an evaluation: `docker run --name ltr-rre ltr-rre` 10 | - Copy the report to your host: `docker cp ltr-rre:/rre/target/site/rre-report.xlsx .` 11 | 12 | Alternatively, you can run thru the `evaluation` notebooks in Jupyter to run these steps for you. 
13 | 14 | __Note:__ Older versions of Docker for Linux may have issues accessing localhost on the host machine 15 | -------------------------------------------------------------------------------- /notebooks/solr/msmarco/ltr.py: -------------------------------------------------------------------------------- 1 | # Import a module with the same name from a different directory. 2 | # 3 | # Adapted from 4 | # https://mg.readthedocs.io/importing-local-python-modules-from-jupyter-notebooks/sys-path-in-helper-module/path-helper.html 5 | 6 | import importlib 7 | import os 8 | import sys 9 | 10 | sys.path.insert(0, os.path.abspath('../../../')) 11 | 12 | # Temporarily hijack __file__ to avoid adding names at module scope; 13 | # __file__ will be overwritten again during the reload() call. 14 | __file__ = {'sys': sys, 'importlib': importlib} 15 | 16 | del importlib 17 | del os 18 | del sys 19 | 20 | __file__['importlib'].reload(__file__['sys'].modules[__name__]) 21 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/ltr.py: -------------------------------------------------------------------------------- 1 | # Import a module with the same name from a different directory. 2 | # 3 | # Adapted from 4 | # https://mg.readthedocs.io/importing-local-python-modules-from-jupyter-notebooks/sys-path-in-helper-module/path-helper.html 5 | 6 | import importlib 7 | import os 8 | import sys 9 | 10 | sys.path.insert(0, os.path.abspath('../../../')) 11 | 12 | # Temporarily hijack __file__ to avoid adding names at module scope; 13 | # __file__ will be overwritten again during the reload() call. 
14 | __file__ = {'sys': sys, 'importlib': importlib} 15 | 16 | del importlib 17 | del os 18 | del sys 19 | 20 | __file__['importlib'].reload(__file__['sys'].modules[__name__]) 21 | -------------------------------------------------------------------------------- /notebooks/elasticsearch/tmdb/ltr.py: -------------------------------------------------------------------------------- 1 | # Import a module with the same name from a different directory. 2 | # 3 | # Adapted from 4 | # https://mg.readthedocs.io/importing-local-python-modules-from-jupyter-notebooks/sys-path-in-helper-module/path-helper.html 5 | 6 | import importlib 7 | import os 8 | import sys 9 | 10 | sys.path.insert(0, os.path.abspath('../../../')) 11 | 12 | # Temporarily hijack __file__ to avoid adding names at module scope; 13 | # __file__ will be overwritten again during the reload() call. 14 | __file__ = {'sys': sys, 'importlib': importlib} 15 | 16 | del importlib 17 | del os 18 | del sys 19 | 20 | __file__['importlib'].reload(__file__['sys'].modules[__name__]) 21 | -------------------------------------------------------------------------------- /notebooks/elasticsearch/osc-blog/ltr.py: -------------------------------------------------------------------------------- 1 | # Import a module with the same name from a different directory. 2 | # 3 | # Adapted from 4 | # https://mg.readthedocs.io/importing-local-python-modules-from-jupyter-notebooks/sys-path-in-helper-module/path-helper.html 5 | 6 | import importlib 7 | import os 8 | import sys 9 | 10 | sys.path.insert(0, os.path.abspath('../../../')) 11 | 12 | # Temporarily hijack __file__ to avoid adding names at module scope; 13 | # __file__ will be overwritten again during the reload() call. 
def esc_kw(kw):
    """Escape all Solr/Lucene query-syntax special characters in a keyword.

    Backslash is escaped first, because the loop below injects new
    backslashes that must not themselves be re-escaped.

    (Rewrite note: the original used non-raw literals such as '\\(' written
    as '\(' — invalid escape sequences that raise SyntaxWarning on modern
    CPython — across 13 copy-pasted replace() calls.)
    """
    kw = kw.replace('\\', '\\\\')  # must happen before any other escaping
    # Characters with special meaning in the Lucene/Solr query syntax.
    for special in '()+-:/][*?{}~':
        kw = kw.replace(special, '\\' + special)
    return kw
def rebuild(client, index, doc_src, force = False):
    """Recreate a search index and reindex its documents.

    Reload a configuration on disk for each search engine
    (Solr a configset, Elasticsearch a json file) and reindex.

    client: search-engine client exposing check_index_exists / delete_index /
        create_index / index_documents.
    index: name of the index (Solr collection / ES index) to (re)build.
    doc_src: iterable of documents to index.
    force: when True, an existing index is deleted and rebuilt; when False,
        an existing index is left untouched and a message is printed.

    Returns None in all cases.
    """
    # (Rewrite note: the original duplicated the create_index/index_documents
    # pair in two branches; a guard clause removes the duplication without
    # changing behavior.)
    if client.check_index_exists(index):
        if not force:
            print("Index {} already exists. Use `force = True` to delete and recreate".format(index))
            return None
        # Drop the stale index before recreating it.
        client.delete_index(index)

    client.create_index(index)
    client.index_documents(index, doc_src=doc_src)
4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_el.txt: -------------------------------------------------------------------------------- 1 | # Lucene Greek Stopwords list 2 | # Note: by default this file is used after GreekLowerCaseFilter, 3 | # so when modifying this file use 'σ' instead of 'ς' 4 | ο 5 | η 6 | το 7 | οι 8 | τα 9 | του 10 | τησ 11 | των 12 | τον 13 | την 14 | και 15 | κι 16 | κ 17 | ειμαι 18 | εισαι 19 | ειναι 20 | ειμαστε 21 | ειστε 22 | στο 23 | στον 24 | στη 25 | στην 26 | μα 27 | αλλα 28 | απο 29 | για 30 | προσ 31 | με 32 | σε 33 | ωσ 34 | παρα 35 | αντι 36 | κατα 37 | μετα 38 | θα 39 | να 40 | δε 41 | δεν 42 | μη 43 | μην 44 | επι 45 | ενω 46 | εαν 47 | αν 48 | τοτε 49 | που 50 | πωσ 51 | ποιοσ 52 | ποια 53 | ποιο 54 | ποιοι 55 | ποιεσ 56 | ποιων 57 | ποιουσ 58 | αυτοσ 59 | αυτη 60 | αυτο 61 | αυτοι 62 | αυτων 63 | αυτουσ 64 | αυτεσ 65 | αυτα 66 | εκεινοσ 67 | εκεινη 68 | εκεινο 69 | εκεινοι 70 | εκεινεσ 71 | εκεινα 72 | εκεινων 73 | εκεινουσ 74 | οπωσ 75 | ομωσ 76 | ισωσ 77 | οσο 78 | οτι 79 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_el.txt: -------------------------------------------------------------------------------- 1 | # Lucene Greek Stopwords 
list 2 | # Note: by default this file is used after GreekLowerCaseFilter, 3 | # so when modifying this file use 'σ' instead of 'ς' 4 | ο 5 | η 6 | το 7 | οι 8 | τα 9 | του 10 | τησ 11 | των 12 | τον 13 | την 14 | και 15 | κι 16 | κ 17 | ειμαι 18 | εισαι 19 | ειναι 20 | ειμαστε 21 | ειστε 22 | στο 23 | στον 24 | στη 25 | στην 26 | μα 27 | αλλα 28 | απο 29 | για 30 | προσ 31 | με 32 | σε 33 | ωσ 34 | παρα 35 | αντι 36 | κατα 37 | μετα 38 | θα 39 | να 40 | δε 41 | δεν 42 | μη 43 | μην 44 | επι 45 | ενω 46 | εαν 47 | αν 48 | τοτε 49 | που 50 | πωσ 51 | ποιοσ 52 | ποια 53 | ποιο 54 | ποιοι 55 | ποιεσ 56 | ποιων 57 | ποιουσ 58 | αυτοσ 59 | αυτη 60 | αυτο 61 | αυτοι 62 | αυτων 63 | αυτουσ 64 | αυτεσ 65 | αυτα 66 | εκεινοσ 67 | εκεινη 68 | εκεινο 69 | εκεινοι 70 | εκεινεσ 71 | εκεινα 72 | εκεινων 73 | εκεινουσ 74 | οπωσ 75 | ομωσ 76 | ισωσ 77 | οσο 78 | οτι 79 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/protwords.txt: -------------------------------------------------------------------------------- 1 | # The ASF licenses this file to You under the Apache License, Version 2.0 2 | # (the "License"); you may not use this file except in compliance with 3 | # the License. You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | #----------------------------------------------------------------------- 14 | # Use a protected word file to protect against the stemmer reducing two 15 | # unrelated words to the same base word. 
16 | 17 | # Some non-words that normally won't be encountered, 18 | # just to test that they won't be stemmed. 19 | dontstems 20 | zwhacky 21 | 22 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/protwords.txt: -------------------------------------------------------------------------------- 1 | # The ASF licenses this file to You under the Apache License, Version 2.0 2 | # (the "License"); you may not use this file except in compliance with 3 | # the License. You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | #----------------------------------------------------------------------- 14 | # Use a protected word file to protect against the stemmer reducing two 15 | # unrelated words to the same base word. 16 | 17 | # Some non-words that normally won't be encountered, 18 | # just to test that they won't be stemmed. 
import requests
from os import path

def download_one(uri, dest='data/', force=False):
    """Download a single file from `uri` into directory `dest`.

    The local filename is taken from the last path segment of `uri`.
    If the file already exists it is skipped unless force=True.

    Raises ValueError if `dest` exists but is not a directory.
    """
    import os

    # Create the destination directory on first use.
    if not os.path.exists(dest):
        os.makedirs(dest)

    if not os.path.isdir(dest):
        raise ValueError("dest {} is not a directory".format(dest))

    # Local filename = everything after the last '/' of the URI.
    filename = uri[uri.rfind('/') + 1:]
    filepath = os.path.join(dest, filename)
    if path.exists(filepath):
        if not force:
            print(filepath + ' already exists')
            return
        print("exists but force=True, Downloading anyway")

    # Stream the response in 1 KiB chunks to avoid loading large files
    # entirely into memory; empty keep-alive chunks are skipped.
    with open(filepath, 'wb') as out:
        print('GET {}'.format(uri))
        resp = requests.get(uri, stream=True)
        for chunk in resp.iter_content(chunk_size=1024):
            if chunk:
                out.write(chunk)

def download(uris, dest='data/', force=False):
    """Download each URI in `uris` into `dest` (see download_one)."""
    for uri in uris:
        download_one(uri=uri, dest=dest, force=force)
5 | container_name: hello-ltr-notebook 6 | ports: 7 | - 8888:8888 8 | environment: 9 | - LTR_DOCKER=yes 10 | links: 11 | - elastic 12 | - solr 13 | 14 | elastic: 15 | build: 16 | context: ./notebooks/elasticsearch/.docker/es-docker/ 17 | dockerfile: Dockerfile 18 | container_name: hello-ltr-elastic 19 | ports: 20 | - 9200:9200 21 | 22 | kibana: 23 | build: 24 | context: ./notebooks/elasticsearch/.docker/kb-docker/ 25 | dockerfile: Dockerfile 26 | container_name: hello-ltr-kibana 27 | expose: 28 | - "5601" 29 | ports: 30 | - "5601:5601" 31 | environment: 32 | ELASTICSEARCH_HOSTS: "http://hello-ltr-elastic:9200" 33 | ELASTICSEARCH_URL: "http://hello-ltr-elastic:9200" 34 | SERVER_HOST: "0.0.0.0" 35 | 36 | solr: 37 | build: 38 | context: ./notebooks/solr/ 39 | dockerfile: Dockerfile 40 | container_name: hello-ltr-solr 41 | ports: 42 | - 8983:8983 43 | 44 | -------------------------------------------------------------------------------- /rre/elastic/src/etc/templates/README.md: -------------------------------------------------------------------------------- 1 | This folder will contain the query templates associated with the evaluation suite. 2 | The query shape in Elasticsearch is already a JSON file so each template should be a valid Elasticsearch query 3 | with all needed placeholders (that will be defined within the ratings file). 
4 | 5 | ```javascript 6 | { 7 | "size": 0, 8 | "query": { 9 | "bool": { 10 | "must": [ 11 | { 12 | "multi_match": { 13 | "query": "$query", 14 | "fields": [ 15 | "some_searchable_field_1^1.75", 16 | "some_other_searchable_field" 17 | ], 18 | "minimum_should_match": "3<-45% 6<-95%" 19 | } 20 | } 21 | ] 22 | } 23 | }, 24 | "aggs": { 25 | "headings": { 26 | "terms": { 27 | "field": "title_sugg", 28 | "order": { "max_score": "desc" } 29 | }, 30 | "aggs": { 31 | "max_score": { 32 | "max": { 33 | "script": { 34 | "lang": "painless", 35 | "inline": "_score" 36 | } 37 | } 38 | } 39 | } 40 | } 41 | } 42 | } 43 | ``` -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_ga.txt: -------------------------------------------------------------------------------- 1 | 2 | a 3 | ach 4 | ag 5 | agus 6 | an 7 | aon 8 | ar 9 | arna 10 | as 11 | b' 12 | ba 13 | beirt 14 | bhúr 15 | caoga 16 | ceathair 17 | ceathrar 18 | chomh 19 | chtó 20 | chuig 21 | chun 22 | cois 23 | céad 24 | cúig 25 | cúigear 26 | d' 27 | daichead 28 | dar 29 | de 30 | deich 31 | deichniúr 32 | den 33 | dhá 34 | do 35 | don 36 | dtí 37 | dá 38 | dár 39 | dó 40 | faoi 41 | faoin 42 | faoina 43 | faoinár 44 | fara 45 | fiche 46 | gach 47 | gan 48 | go 49 | gur 50 | haon 51 | hocht 52 | i 53 | iad 54 | idir 55 | in 56 | ina 57 | ins 58 | inár 59 | is 60 | le 61 | leis 62 | lena 63 | lenár 64 | m' 65 | mar 66 | mo 67 | mé 68 | na 69 | nach 70 | naoi 71 | naonúr 72 | ná 73 | ní 74 | níor 75 | nó 76 | nócha 77 | ocht 78 | ochtar 79 | os 80 | roimh 81 | sa 82 | seacht 83 | seachtar 84 | seachtó 85 | seasca 86 | seisear 87 | siad 88 | sibh 89 | sinn 90 | sna 91 | sé 92 | sí 93 | tar 94 | thar 95 | thú 96 | triúr 97 | trí 98 | trína 99 | trínár 100 | tríocha 101 | tú 102 | um 103 | ár 104 | é 105 | éis 106 | í 107 | ó 108 | ón 109 | óna 110 | ónár 111 | -------------------------------------------------------------------------------- 
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_ga.txt: -------------------------------------------------------------------------------- 1 | 2 | a 3 | ach 4 | ag 5 | agus 6 | an 7 | aon 8 | ar 9 | arna 10 | as 11 | b' 12 | ba 13 | beirt 14 | bhúr 15 | caoga 16 | ceathair 17 | ceathrar 18 | chomh 19 | chtó 20 | chuig 21 | chun 22 | cois 23 | céad 24 | cúig 25 | cúigear 26 | d' 27 | daichead 28 | dar 29 | de 30 | deich 31 | deichniúr 32 | den 33 | dhá 34 | do 35 | don 36 | dtí 37 | dá 38 | dár 39 | dó 40 | faoi 41 | faoin 42 | faoina 43 | faoinár 44 | fara 45 | fiche 46 | gach 47 | gan 48 | go 49 | gur 50 | haon 51 | hocht 52 | i 53 | iad 54 | idir 55 | in 56 | ina 57 | ins 58 | inár 59 | is 60 | le 61 | leis 62 | lena 63 | lenár 64 | m' 65 | mar 66 | mo 67 | mé 68 | na 69 | nach 70 | naoi 71 | naonúr 72 | ná 73 | ní 74 | níor 75 | nó 76 | nócha 77 | ocht 78 | ochtar 79 | os 80 | roimh 81 | sa 82 | seacht 83 | seachtar 84 | seachtó 85 | seasca 86 | seisear 87 | siad 88 | sibh 89 | sinn 90 | sna 91 | sé 92 | sí 93 | tar 94 | thar 95 | thú 96 | triúr 97 | trí 98 | trína 99 | trínár 100 | tríocha 101 | tú 102 | um 103 | ár 104 | é 105 | éis 106 | í 107 | ó 108 | ón 109 | óna 110 | ónár 111 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/synonyms.txt: -------------------------------------------------------------------------------- 1 | # The ASF licenses this file to You under the Apache License, Version 2.0 2 | # (the "License"); you may not use this file except in compliance with 3 | # the License. You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | #----------------------------------------------------------------------- 14 | #some test synonym mappings unlikely to appear in real input text 15 | 16 | # Demonstrating bidirectional synonyms 17 | #wife,bride 18 | #wife,spouse 19 | #toons,tunes,cartoon 20 | 21 | # Demonstrating => syntax 22 | # wife => wife, bride 23 | # spouse => spouse, husband, wife, partner 24 | # tunes => cartoons, toons, songs 25 | # cartoon => toons, tunes 26 | 27 | # Demonstrating multi phrase 28 | #looney tunes, cartoons 29 | #science fiction, sci fi, sci-fi, scifi 30 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/synonyms.txt: -------------------------------------------------------------------------------- 1 | # The ASF licenses this file to You under the Apache License, Version 2.0 2 | # (the "License"); you may not use this file except in compliance with 3 | # the License. You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
12 | 13 | #----------------------------------------------------------------------- 14 | #some test synonym mappings unlikely to appear in real input text 15 | 16 | # Demonstrating bidirectional synonyms 17 | #wife,bride 18 | #wife,spouse 19 | #toons,tunes,cartoon 20 | 21 | # Demonstrating => syntax 22 | # wife => wife, bride 23 | # spouse => spouse, husband, wife, partner 24 | # tunes => cartoons, toons, songs 25 | # cartoon => toons, tunes 26 | 27 | # Demonstrating multi phrase 28 | #looney tunes, cartoons 29 | #science fiction, sci fi, sci-fi, scifi 30 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_eu.txt: -------------------------------------------------------------------------------- 1 | # example set of basque stopwords 2 | al 3 | anitz 4 | arabera 5 | asko 6 | baina 7 | bat 8 | batean 9 | batek 10 | bati 11 | batzuei 12 | batzuek 13 | batzuetan 14 | batzuk 15 | bera 16 | beraiek 17 | berau 18 | berauek 19 | bere 20 | berori 21 | beroriek 22 | beste 23 | bezala 24 | da 25 | dago 26 | dira 27 | ditu 28 | du 29 | dute 30 | edo 31 | egin 32 | ere 33 | eta 34 | eurak 35 | ez 36 | gainera 37 | gu 38 | gutxi 39 | guzti 40 | haiei 41 | haiek 42 | haietan 43 | hainbeste 44 | hala 45 | han 46 | handik 47 | hango 48 | hara 49 | hari 50 | hark 51 | hartan 52 | hau 53 | hauei 54 | hauek 55 | hauetan 56 | hemen 57 | hemendik 58 | hemengo 59 | hi 60 | hona 61 | honek 62 | honela 63 | honetan 64 | honi 65 | hor 66 | hori 67 | horiei 68 | horiek 69 | horietan 70 | horko 71 | horra 72 | horrek 73 | horrela 74 | horretan 75 | horri 76 | hortik 77 | hura 78 | izan 79 | ni 80 | noiz 81 | nola 82 | non 83 | nondik 84 | nongo 85 | nor 86 | nora 87 | ze 88 | zein 89 | zen 90 | zenbait 91 | zenbat 92 | zer 93 | zergatik 94 | ziren 95 | zituen 96 | zu 97 | zuek 98 | zuen 99 | zuten 100 | -------------------------------------------------------------------------------- 
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_eu.txt: -------------------------------------------------------------------------------- 1 | # example set of basque stopwords 2 | al 3 | anitz 4 | arabera 5 | asko 6 | baina 7 | bat 8 | batean 9 | batek 10 | bati 11 | batzuei 12 | batzuek 13 | batzuetan 14 | batzuk 15 | bera 16 | beraiek 17 | berau 18 | berauek 19 | bere 20 | berori 21 | beroriek 22 | beste 23 | bezala 24 | da 25 | dago 26 | dira 27 | ditu 28 | du 29 | dute 30 | edo 31 | egin 32 | ere 33 | eta 34 | eurak 35 | ez 36 | gainera 37 | gu 38 | gutxi 39 | guzti 40 | haiei 41 | haiek 42 | haietan 43 | hainbeste 44 | hala 45 | han 46 | handik 47 | hango 48 | hara 49 | hari 50 | hark 51 | hartan 52 | hau 53 | hauei 54 | hauek 55 | hauetan 56 | hemen 57 | hemendik 58 | hemengo 59 | hi 60 | hona 61 | honek 62 | honela 63 | honetan 64 | honi 65 | hor 66 | hori 67 | horiei 68 | horiek 69 | horietan 70 | horko 71 | horra 72 | horrek 73 | horrela 74 | horretan 75 | horri 76 | hortik 77 | hura 78 | izan 79 | ni 80 | noiz 81 | nola 82 | non 83 | nondik 84 | nongo 85 | nor 86 | nora 87 | ze 88 | zein 89 | zen 90 | zenbait 91 | zenbat 92 | zer 93 | zergatik 94 | ziren 95 | zituen 96 | zu 97 | zuek 98 | zuen 99 | zuten 100 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/userdict_ja.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This is a sample user dictionary for Kuromoji (JapaneseTokenizer) 3 | # 4 | # Add entries to this file in order to override the statistical model in terms 5 | # of segmentation, readings and part-of-speech tags. Notice that entries do 6 | # not have weights since they are always used when found. This is by-design 7 | # in order to maximize ease-of-use. 8 | # 9 | # Entries are defined using the following CSV format: 10 | # , ... , ... 
, 11 | # 12 | # Notice that a single half-width space separates tokens and readings, and 13 | # that the number tokens and readings must match exactly. 14 | # 15 | # Also notice that multiple entries with the same is undefined. 16 | # 17 | # Whitespace only lines are ignored. Comments are not allowed on entry lines. 18 | # 19 | 20 | # Custom segmentation for kanji compounds 21 | 日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞 22 | 関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞 23 | 24 | # Custom segmentation for compound katakana 25 | トートバッグ,トート バッグ,トート バッグ,かずカナ名詞 26 | ショルダーバッグ,ショルダー バッグ,ショルダー バッグ,かずカナ名詞 27 | 28 | # Custom reading for former sumo wrestler 29 | 朝青龍,朝青龍,アサショウリュウ,カスタム人名 30 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/userdict_ja.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This is a sample user dictionary for Kuromoji (JapaneseTokenizer) 3 | # 4 | # Add entries to this file in order to override the statistical model in terms 5 | # of segmentation, readings and part-of-speech tags. Notice that entries do 6 | # not have weights since they are always used when found. This is by-design 7 | # in order to maximize ease-of-use. 8 | # 9 | # Entries are defined using the following CSV format: 10 | # , ... , ... , 11 | # 12 | # Notice that a single half-width space separates tokens and readings, and 13 | # that the number tokens and readings must match exactly. 14 | # 15 | # Also notice that multiple entries with the same is undefined. 16 | # 17 | # Whitespace only lines are ignored. Comments are not allowed on entry lines. 
18 | # 19 | 20 | # Custom segmentation for kanji compounds 21 | 日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞 22 | 関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞 23 | 24 | # Custom segmentation for compound katakana 25 | トートバッグ,トート バッグ,トート バッグ,かずカナ名詞 26 | ショルダーバッグ,ショルダー バッグ,ショルダー バッグ,かずカナ名詞 27 | 28 | # Custom reading for former sumo wrestler 29 | 朝青龍,朝青龍,アサショウリュウ,カスタム人名 30 | -------------------------------------------------------------------------------- /rre/solr/src/etc/configuration_sets/README.md: -------------------------------------------------------------------------------- 1 | This folder contains one subfolder for each configuration version. 2 | Each version folder should contain a solr-settings.json file with details of 3 | how to connect to the appropriate Solr core. 4 | 5 | This is an example: 6 | 7 | * configuration_sets 8 | * v1.0 9 | * solr-settings.json 10 | * v1.1 11 | * solr-settings.json 12 | 13 | The solr-settings.json files may have the following properties: 14 | 15 | - `baseUrls`: an array of Solr base URLs (eg. `[ "http://localhost:8983/solr", "http://localhost:7574/solr" ]`). 16 | - `collectionName` [**REQUIRED**]: the name of the collection or core being evaluated. 17 | - `zkHosts`: an array of Zookeeper hosts (eg. `[ "zk1:2181", "zk2:2181" ]`). 18 | - `zkChroot`: the path to the root Zookeeper node containing Solr data, if running in a Chroot environment (eg. `"/solr"`). 19 | Optional. 20 | - `connectionTimeoutMillis`: the number of milliseconds to wait for a connection to be made to Solr. Optional. 21 | - `socketTimeoutMillis`: the number of milliseconds to allow for a response from Solr. Optional. 22 | 23 | **Either** the baseUrls **or** the zkHosts property must contain values. If both are empty, 24 | the configuration will fail to load. 
-------------------------------------------------------------------------------- /rre/elastic/src/etc/ratings/ratings.json: -------------------------------------------------------------------------------- 1 | { 2 | "index": "tmdb", 3 | "id_field": "id", 4 | "topics": [ 5 | { 6 | "description": "LTR Example Evaluation", 7 | "queries": [ 8 | { 9 | "template": "query.json", 10 | "placeholders": { 11 | "$query": "batman" 12 | } 13 | } 14 | ], 15 | "relevant_documents": { 16 | "4": [ 17 | "40662", 18 | "45162", 19 | "69735", 20 | "123025", 21 | "142061", 22 | "177271", 23 | "209112", 24 | "242643", 25 | "251519", 26 | "321528", 27 | "324849", 28 | "366924", 29 | "382322" 30 | ], 31 | "3": [ 32 | "272", 33 | "13851", 34 | "14919", 35 | "16234", 36 | "20077", 37 | "21683", 38 | "22855", 39 | "29751" 40 | ], 41 | "2": [ 42 | "268", 43 | "364", 44 | "414", 45 | "415", 46 | "15805", 47 | "17074" 48 | ], 49 | "1": [ 50 | "2661", 51 | "93560", 52 | "125249" 53 | ] 54 | } 55 | } 56 | ] 57 | } 58 | -------------------------------------------------------------------------------- /rre/solr/src/etc/ratings/ratings.json: -------------------------------------------------------------------------------- 1 | { 2 | "index": "tmdb", 3 | "id_field": "id", 4 | "topics": [ 5 | { 6 | "description": "LTR Example Evaluation", 7 | "queries": [ 8 | { 9 | "template": "query.json", 10 | "placeholders": { 11 | "$query": "batman" 12 | } 13 | } 14 | ], 15 | "relevant_documents": { 16 | "4": [ 17 | "40662", 18 | "45162", 19 | "69735", 20 | "123025", 21 | "142061", 22 | "177271", 23 | "209112", 24 | "242643", 25 | "251519", 26 | "321528", 27 | "324849", 28 | "366924", 29 | "382322" 30 | ], 31 | "3": [ 32 | "272", 33 | "13851", 34 | "14919", 35 | "16234", 36 | "20077", 37 | "21683", 38 | "22855", 39 | "29751" 40 | ], 41 | "2": [ 42 | "268", 43 | "364", 44 | "414", 45 | "415", 46 | "15805", 47 | "17074" 48 | ], 49 | "1": [ 50 | "2661", 51 | "93560", 52 | "125249" 53 | ] 54 | } 55 | } 56 | ] 57 | } 58 | 
# Notebook test runner, adapted from
# https://www.blog.pythonlibrary.org/2018/10/16/testing-jupyter-notebooks/
import nbformat
import os

from nbconvert.preprocessors import ExecutePreprocessor


def hours(hours):
    """Convert a number of hours to seconds.

    Bug fix: the original computed ``hours * 60 * 60`` but never returned
    it, so it returned ``None`` — and ``timeout=hours(6)`` below therefore
    passed ``None`` (interpreted as "no timeout") instead of six hours.
    """
    return hours * 60 * 60


def run_notebook(notebook_path, timeout=hours(6), save_nb_path=None):
    """Execute a notebook end-to-end and collect its error outputs.

    Parameters
    ----------
    notebook_path : str
        Path to the ``.ipynb`` file to execute.
    timeout : int
        Per-cell timeout in seconds (default: six hours).
    save_nb_path : str, optional
        If given, the executed notebook (with outputs) is written here.

    Returns
    -------
    tuple
        ``(nb, errors)`` — the executed notebook object and a list of
        error outputs found in its cells (empty when the run was clean).
    """
    # Cells execute with the notebook's own directory as working dir
    dirname = os.path.dirname(notebook_path)

    with open(notebook_path) as f:
        nb = nbformat.read(f, as_version=4)

    proc = ExecutePreprocessor(timeout=timeout, kernel_name='python3')
    # Keep executing after a cell errors so we can report every failure,
    # not just the first one.
    proc.allow_errors = True

    proc.preprocess(nb, {'metadata': {'path': dirname}})

    if save_nb_path:
        with open(save_nb_path, mode='wt') as f:
            nbformat.write(nb, f)

    errors = []
    for cell in nb.cells:
        if 'outputs' in cell:
            for output in cell['outputs']:
                if output.output_type == 'error':
                    errors.append(output)

    return nb, errors


if __name__ == '__main__':
    nb, errors = run_notebook('Testing.ipynb')
    print(errors)
import utils
from ltr.judgments import judgments_from_file
from ltr.client import ElasticClient
import csv


def train_to_csv(client, feature_set, in_filename, out_filename):
    """Export a judgment file, with its logged feature values, to CSV.

    Columns are ``keywords, qid, grade`` followed by one column per
    feature in the feature set.

    Bug fix: ``in_filename`` was previously ignored — the judgments were
    always read from the hard-coded ``data/title_judgments_train.txt``,
    so the CLI's first argument had no effect.

    Parameters
    ----------
    client : search engine client exposing ``feature_set``
    feature_set : str
        Name of the feature set stored in the engine.
    in_filename : str
        Path to the judgments file to read.
    out_filename : str
        Path of the CSV file to write.
    """
    features = client.feature_set(name=feature_set, index='tmdb')[0]
    fieldnames = ['keywords', 'qid', 'grade']
    fieldnames.extend([feature['name'] for feature in features])
    # newline='' per the csv module docs — avoids blank rows on Windows
    with open(out_filename, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        judgments = judgments_from_file(filename=in_filename)
        for judgment in judgments:
            # Every judgment must carry one value per feature column
            if len(judgment.features) != len(fieldnames) - 3:
                raise ValueError(
                    "judgment qid=%s has %d features, expected %d"
                    % (judgment.qid, len(judgment.features), len(fieldnames) - 3))
            record = {}
            record[fieldnames[0]] = judgment.keywords
            record[fieldnames[1]] = judgment.qid
            record[fieldnames[2]] = judgment.grade
            for idx, field in enumerate(fieldnames[3:]):
                record[field] = judgment.features[idx]

            writer.writerow(record)

if __name__ == "__main__":
    from sys import argv
    client = ElasticClient()
    train_to_csv(client=client, in_filename=argv[1],
                 feature_set=argv[2], out_filename=argv[3])
15 | 16 | # a couple of test stopwords to test that the words are really being 17 | # configured from this file: 18 | stopworda 19 | stopwordb 20 | 21 | # Standard english stop words taken from Lucene's StopAnalyzer 22 | a 23 | an 24 | and 25 | are 26 | as 27 | at 28 | be 29 | but 30 | by 31 | for 32 | if 33 | in 34 | into 35 | is 36 | it 37 | no 38 | not 39 | of 40 | on 41 | or 42 | such 43 | that 44 | the 45 | their 46 | then 47 | there 48 | these 49 | they 50 | this 51 | to 52 | was 53 | will 54 | with 55 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_en.txt: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | # a couple of test stopwords to test that the words are really being 17 | # configured from this file: 18 | stopworda 19 | stopwordb 20 | 21 | # Standard english stop words taken from Lucene's StopAnalyzer 22 | a 23 | an 24 | and 25 | are 26 | as 27 | at 28 | be 29 | but 30 | by 31 | for 32 | if 33 | in 34 | into 35 | is 36 | it 37 | no 38 | not 39 | of 40 | on 41 | or 42 | such 43 | that 44 | the 45 | their 46 | then 47 | there 48 | these 49 | they 50 | this 51 | to 52 | was 53 | will 54 | with 55 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_th.txt: -------------------------------------------------------------------------------- 1 | # Thai stopwords from: 2 | # "Opinion Detection in Thai Political News Columns 3 | # Based on Subjectivity Analysis" 4 | # Khampol Sukhum, Supot Nitsuwat, and Choochart Haruechaiyasak 5 | ไว้ 6 | ไม่ 7 | ไป 8 | ได้ 9 | ให้ 10 | ใน 11 | โดย 12 | แห่ง 13 | แล้ว 14 | และ 15 | แรก 16 | แบบ 17 | แต่ 18 | เอง 19 | เห็น 20 | เลย 21 | เริ่ม 22 | เรา 23 | เมื่อ 24 | เพื่อ 25 | เพราะ 26 | เป็นการ 27 | เป็น 28 | เปิดเผย 29 | เปิด 30 | เนื่องจาก 31 | เดียวกัน 32 | เดียว 33 | เช่น 34 | เฉพาะ 35 | เคย 36 | เข้า 37 | เขา 38 | อีก 39 | อาจ 40 | อะไร 41 | ออก 42 | อย่าง 43 | อยู่ 44 | อยาก 45 | หาก 46 | หลาย 47 | หลังจาก 48 | หลัง 49 | หรือ 50 | หนึ่ง 51 | ส่วน 52 | ส่ง 53 | สุด 54 | สําหรับ 55 | ว่า 56 | วัน 57 | ลง 58 | ร่วม 59 | ราย 60 | รับ 61 | ระหว่าง 62 | รวม 63 | ยัง 64 | มี 65 | มาก 66 | มา 67 | พร้อม 68 | พบ 69 | ผ่าน 70 | ผล 71 | บาง 72 | น่า 73 | นี้ 74 | นํา 75 | นั้น 76 | นัก 77 | นอกจาก 78 | ทุก 79 | ที่สุด 80 | ที่ 81 | ทําให้ 82 | ทํา 83 | ทาง 84 | ทั้งนี้ 85 | ทั้ง 86 | ถ้า 87 | ถูก 88 | ถึง 89 | ต้อง 90 | ต่างๆ 91 | ต่าง 92 | ต่อ 93 | ตาม 94 | ตั้งแต่ 95 | ตั้ง 96 | ด้าน 97 | ด้วย 98 | ดัง 99 | ซึ่ง 100 | ช่วง 101 | จึง 102 | จาก 103 | จัด 104 | จะ 105 | คือ 106 | ความ 107 | ครั้ง 108 | คง 109 | ขึ้น 110 | ของ 111 | ขอ 112 | ขณะ 113 | 
# converts LambdaMART XML models to JSON for Solr..

import xml.etree.ElementTree as ET


def convert(ensemble_xml_string, modelName, featureSet, featureMapping):
    """Translate a RankLib LambdaMART ensemble (XML) into the JSON model
    document understood by Solr's LTR plugin.

    Parameters
    ----------
    ensemble_xml_string : str
        Raw RankLib model output: a 7-line textual header followed by the
        ``<ensemble>`` XML element.
    modelName : str
        Name to store the model under in Solr.
    featureSet : str
        Name of the Solr feature store the model references.
    featureMapping : list[dict]
        Ordered feature descriptors; position N-1 corresponds to RankLib
        feature id N.

    Returns
    -------
    dict
        The Solr MultipleAdditiveTreesModel JSON structure.
    """
    # Strip the 7-line RankLib header that precedes the XML payload
    xml_payload = '\n'.join(ensemble_xml_string.split('\n')[7:])
    ensemble = ET.fromstring(xml_payload)

    trees = [
        {
            'weight': str(tree_node.attrib['weight']),
            'root': parseSplits(tree_node[0], featureMapping),
        }
        for tree_node in ensemble
    ]

    return {
        'store': featureSet,
        'name': modelName,
        'class': 'org.apache.solr.ltr.model.MultipleAdditiveTreesModel',
        'features': featureMapping,
        'params': {'trees': trees},
    }

def parseSplits(split, features):
    """Recursively convert one RankLib ``<split>`` element into the nested
    dict form (feature/threshold/left/right/value) Solr expects."""
    node = {}
    for child in split:
        tag = child.tag
        if tag == 'feature':
            # RankLib feature ids are 1-based indexes into the feature list
            node['feature'] = features[int(child.text.strip()) - 1]['name']
        elif tag == 'threshold':
            node['threshold'] = str(child.text.strip())
        elif tag == 'split' and 'pos' in child.attrib:
            # 'pos' is 'left' or 'right'; recurse into the child subtree
            node[child.attrib['pos']] = parseSplits(child, features)
        elif tag == 'output':
            node['value'] = str(child.text.strip())
    return node
from collections import Counter

class Model():
    """Container for the statistics produced by `coec`."""
    def __init__(self):
        # COEC statistic per (query, doc_id) pair
        self.coecs = Counter()

        # CTR for each query-doc pair in this session
        self.ctrs = {}

def coec(ctr_by_rank, sessions):
    """ Clicks over expected clicks: flags items whose CTR is above or
    below average for the rank they were shown at.

    From paper

    > Personalized Click Prediction in Sponsored Search
    by Cheng, Cantu Paz

    A COEC > 1 means above average CTR for it's position
    A COEC < 1 means below average

    -ctr_by_rank is the global CTR at each rank position
    -sessions are an array of search session objects

    returned:
    each query-doc pair in provided sessions COEC

    """
    clicks = Counter()
    expected = Counter()

    # Accumulate actual clicks and rank-weighted impressions per pair
    for session in sessions:
        query = session.query
        for position, doc in enumerate(session.docs):
            key = (query, doc.doc_id)
            expected[key] += ctr_by_rank[position]
            if doc.click:
                clicks[key] += 1

    model = Model()
    for key in expected:
        # ratio of observed clicks to clicks expected at these positions
        model.coecs[key] = clicks[key] / expected[key]

    return model
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

def search(client, user_query, model_name):
    """Run the named LTR model over results matching `user_query` in the
    title field, via either the Elasticsearch or Solr client.

    Returns the client's ranked result list for the tmdb index.
    """
    if client.name() == 'elastic':
        engine_query = {
            "bool": {
                "must": {"match_all": {} },
                # filter restricts candidates without contributing to score
                "filter": {
                    "match": {"title": user_query}
                }
            }
        }
    else:
        # ^0 zeroes out the base query's score so only the model ranks
        engine_query = 'title:('+ user_query + ')^0'
    return client.model_query('tmdb', model_name, {}, engine_query)

def plot(client, query, models = ('classic', 'latest')):
    """Plot release year vs. rank position for each model's results.

    Generalized: the original hard-coded exactly two traces even though
    `models` is a sequence — now one trace is drawn per model, so any
    number of models can be compared. The default was also changed from a
    mutable list to a tuple (same values, safer default).

    Parameters
    ----------
    client : search client passed through to `search`
    query : str
        User query to rank.
    models : sequence of str
        LTR model names to compare (default: classic vs. latest).
    """
    init_notebook_mode(connected=True)

    model_data = [search(client, query, model) for model in models]

    # x axis is simply the rank position of each result
    x_axis = list(range(len(model_data[0])))

    traces = [
        go.Scatter(
            x = x_axis,
            y = [hit['release_year'] for hit in results],
            mode = "lines",
            name = model_name,
            text = [f'{hit["title"]} ({hit["score"]})' for hit in results]
        )
        for model_name, results in zip(models, model_data)
    ]

    fig = go.Figure(data=traces)
    iplot(fig)
testpath==0.4.2 69 | threadpoolctl==2.0.0 70 | tornado==6.0.1 71 | tqdm==4.43.0 72 | traitlets==4.3.2 73 | urllib3==1.24.1 74 | wcwidth==0.1.7 75 | webencodings==0.5.1 76 | widgetsnbextension==3.4.2 77 | xgboost==1.4.2 78 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/elevate.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 26 | 27 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/elevate.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 26 | 27 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /notebooks/solr/msmarco/solr_config/conf/elevate.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 26 | 27 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_ar.txt: -------------------------------------------------------------------------------- 1 | # This file was created by Jacques Savoy and is distributed under the BSD license. 2 | # See http://members.unine.ch/jacques.savoy/clef/index.html. 
3 | # Also see http://www.opensource.org/licenses/bsd-license.html 4 | # Cleaned on October 11, 2009 (not normalized, so use before normalization) 5 | # This means that when modifying this list, you might need to add some 6 | # redundant entries, for example containing forms with both أ and ا 7 | من 8 | ومن 9 | منها 10 | منه 11 | في 12 | وفي 13 | فيها 14 | فيه 15 | و 16 | ف 17 | ثم 18 | او 19 | أو 20 | ب 21 | بها 22 | به 23 | ا 24 | أ 25 | اى 26 | اي 27 | أي 28 | أى 29 | لا 30 | ولا 31 | الا 32 | ألا 33 | إلا 34 | لكن 35 | ما 36 | وما 37 | كما 38 | فما 39 | عن 40 | مع 41 | اذا 42 | إذا 43 | ان 44 | أن 45 | إن 46 | انها 47 | أنها 48 | إنها 49 | انه 50 | أنه 51 | إنه 52 | بان 53 | بأن 54 | فان 55 | فأن 56 | وان 57 | وأن 58 | وإن 59 | التى 60 | التي 61 | الذى 62 | الذي 63 | الذين 64 | الى 65 | الي 66 | إلى 67 | إلي 68 | على 69 | عليها 70 | عليه 71 | اما 72 | أما 73 | إما 74 | ايضا 75 | أيضا 76 | كل 77 | وكل 78 | لم 79 | ولم 80 | لن 81 | ولن 82 | هى 83 | هي 84 | هو 85 | وهى 86 | وهي 87 | وهو 88 | فهى 89 | فهي 90 | فهو 91 | انت 92 | أنت 93 | لك 94 | لها 95 | له 96 | هذه 97 | هذا 98 | تلك 99 | ذلك 100 | هناك 101 | كانت 102 | كان 103 | يكون 104 | تكون 105 | وكانت 106 | وكان 107 | غير 108 | بعض 109 | قد 110 | نحو 111 | بين 112 | بينما 113 | منذ 114 | ضمن 115 | حيث 116 | الان 117 | الآن 118 | خلال 119 | بعد 120 | قبل 121 | حتى 122 | عند 123 | عندما 124 | لدى 125 | جميع 126 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_ar.txt: -------------------------------------------------------------------------------- 1 | # This file was created by Jacques Savoy and is distributed under the BSD license. 2 | # See http://members.unine.ch/jacques.savoy/clef/index.html. 
from collections import Counter

def conv_aug_attracts(attracts, sessions, costs):
    """ Rescan sessions, using click-derrived attractiveness.

    If theres no conversion, punish the attractiveness derrived judgment

    BUT we punish costly things less, and cheap things more
    """
    satisfacts = Counter()
    counts = Counter()

    for session in sessions:
        for doc in session.docs:
            key = (session.query, doc.doc_id)
            attract = attracts[key]
            counts[key] += 1
            if doc.click and doc.conversion:
                # A conversion confirms the attractiveness was real
                # with actual relevance
                satisfacts[key] += attract
            else:
                # No conversion (whether clicked or not):
                # If it costs a lot, thats ok, we default to attractiveness
                # If it costs little, thats generally not ok — why didn't
                # they do (easy action)? So scale attractiveness by cost.
                satisfacts[key] += attract * costs[doc.doc_id]

    # Average the accumulated satisfaction over impressions
    for key, count in counts.items():
        satisfacts[key] = satisfacts[key] / count

    return satisfacts
import unittest
from nb_test_config import NotebookTestConfig
import runner

class NotebooksTestCase(unittest.TestCase):
    """Notebook smoke-test base class.

    Executes every notebook found under the directories returned by
    test_paths() (filtered by nbs_to_run() and ignored_nbs()) and fails
    if any notebook cell produced an error output. Subclasses override
    the three selector methods to choose which notebooks run.
    """

    # Each executed notebook (with outputs) is saved here, overwriting
    # the previous one — useful for inspecting the last failure.
    SAVE_NB_PATH='tests/last_run.ipynb'

    def test_paths(self):
        # Directories containing notebooks to execute; subclasses override.
        # NOTE(review): the test_ prefix means unittest also collects this
        # method as a (trivially passing) test — confirm that is intended.
        return []

    def ignored_nbs(self):
        # Notebook paths to skip even if selected; subclasses override.
        return []

    def nbs_to_run(self):
        # Default allow-list containing every notebook; subclasses may
        # return a real collection to restrict which notebooks execute.
        class IncludeAll:
            def __contains__(self, _):
                return True
        return IncludeAll()

    def test_for_no_errors(self):
        """ Run all nbs in directories at test_paths()
        also included in nbs_to_run(),
        excepting those in ignored_nbs()
        - assert there are no errors
        """
        for nb_path in self.test_paths():

            nb_cfg = NotebookTestConfig(path=nb_path)
            print("EXECUTING NBS IN DIRECTORY: " + nb_path)
            # Run the directory's setup notebook first (if configured);
            # a setup failure aborts before any other notebook runs.
            if nb_cfg.setup:
                print("Setting up ... " + nb_path)
                nb, errors = runner.run_notebook(nb_cfg.setup, save_nb_path=NotebooksTestCase.SAVE_NB_PATH)
                print(errors)
                assert len(errors) == 0
            for nb in nb_cfg.notebooks:
                if nb in self.nbs_to_run():
                    if nb in self.ignored_nbs():
                        print("Ignored " + nb)
                    else:
                        print("Running... " + nb)
                        nb, errors = runner.run_notebook(nb, save_nb_path=NotebooksTestCase.SAVE_NB_PATH)
                        print(errors)
                        assert len(errors) == 0
| nunha 110 | nuns 111 | nunhas 112 | o 113 | os 114 | ou 115 | ó 116 | ós 117 | para 118 | pero 119 | pode 120 | pois 121 | pola 122 | polas 123 | polo 124 | polos 125 | por 126 | que 127 | se 128 | senón 129 | ser 130 | seu 131 | seus 132 | sexa 133 | sido 134 | sobre 135 | súa 136 | súas 137 | tamén 138 | tan 139 | te 140 | ten 141 | teñen 142 | teño 143 | ter 144 | teu 145 | teus 146 | ti 147 | tido 148 | tiña 149 | tiven 150 | túa 151 | túas 152 | un 153 | unha 154 | unhas 155 | uns 156 | vos 157 | vosa 158 | vosas 159 | voso 160 | vosos 161 | vós 162 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_gl.txt: -------------------------------------------------------------------------------- 1 | # galican stopwords 2 | a 3 | aínda 4 | alí 5 | aquel 6 | aquela 7 | aquelas 8 | aqueles 9 | aquilo 10 | aquí 11 | ao 12 | aos 13 | as 14 | así 15 | á 16 | ben 17 | cando 18 | che 19 | co 20 | coa 21 | comigo 22 | con 23 | connosco 24 | contigo 25 | convosco 26 | coas 27 | cos 28 | cun 29 | cuns 30 | cunha 31 | cunhas 32 | da 33 | dalgunha 34 | dalgunhas 35 | dalgún 36 | dalgúns 37 | das 38 | de 39 | del 40 | dela 41 | delas 42 | deles 43 | desde 44 | deste 45 | do 46 | dos 47 | dun 48 | duns 49 | dunha 50 | dunhas 51 | e 52 | el 53 | ela 54 | elas 55 | eles 56 | en 57 | era 58 | eran 59 | esa 60 | esas 61 | ese 62 | eses 63 | esta 64 | estar 65 | estaba 66 | está 67 | están 68 | este 69 | estes 70 | estiven 71 | estou 72 | eu 73 | é 74 | facer 75 | foi 76 | foron 77 | fun 78 | había 79 | hai 80 | iso 81 | isto 82 | la 83 | las 84 | lle 85 | lles 86 | lo 87 | los 88 | mais 89 | me 90 | meu 91 | meus 92 | min 93 | miña 94 | miñas 95 | moi 96 | na 97 | nas 98 | neste 99 | nin 100 | no 101 | non 102 | nos 103 | nosa 104 | nosas 105 | noso 106 | nosos 107 | nós 108 | nun 109 | nunha 110 | nuns 111 | nunhas 112 | o 113 | os 114 | ou 115 | ó 116 | ós 117 | para 118 | pero 119 | pode 120 | 
def typoIt(judgmentInFile, judgmentOutFile, rounds=100):
    """Clone judgments under fresh query ids with random keyboard typos.

    Reads judgments from judgmentInFile and, over up to `rounds` passes,
    generates a misspelled variant of each query's keywords.  Every novel
    misspelling gets a full copy of that query's judgment list under a newly
    allocated qid.  The combined judgments are written to judgmentOutFile.
    """
    with open(judgmentInFile) as src:
        allJudgments = list(judgments_from_file(src))
    nextQid = allJudgments[-1].qid
    byQid = judgments_by_qid(allJudgments)

    # Misspellings already emitted; avoids duplicate typo queries.
    seenTypos = set()

    for _ in range(rounds):
        for group in byQid.values():
            origKeywords = group[0].keywords
            typoed = butterfingers(origKeywords)

            # Skip no-op "typos" and variants generated in an earlier round.
            if typoed == origKeywords or typoed in seenTypos:
                continue

            nextQid += 1
            print("%s => %s" % (origKeywords, typoed))
            allJudgments.extend(
                Judgment(grade=judg.grade,
                         qid=nextQid,
                         keywords=typoed,
                         docId=judg.docId)
                for judg in group)
            seenTypos.add(typoed)

    with open(judgmentOutFile, 'w') as dest:
        judgments_to_file(dest, judgmentsList=allJudgments)
170 | jež 171 | jakož 172 | načež 173 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_cz.txt: -------------------------------------------------------------------------------- 1 | a 2 | s 3 | k 4 | o 5 | i 6 | u 7 | v 8 | z 9 | dnes 10 | cz 11 | tímto 12 | budeš 13 | budem 14 | byli 15 | jseš 16 | můj 17 | svým 18 | ta 19 | tomto 20 | tohle 21 | tuto 22 | tyto 23 | jej 24 | zda 25 | proč 26 | máte 27 | tato 28 | kam 29 | tohoto 30 | kdo 31 | kteří 32 | mi 33 | nám 34 | tom 35 | tomuto 36 | mít 37 | nic 38 | proto 39 | kterou 40 | byla 41 | toho 42 | protože 43 | asi 44 | ho 45 | naši 46 | napište 47 | re 48 | což 49 | tím 50 | takže 51 | svých 52 | její 53 | svými 54 | jste 55 | aj 56 | tu 57 | tedy 58 | teto 59 | bylo 60 | kde 61 | ke 62 | pravé 63 | ji 64 | nad 65 | nejsou 66 | či 67 | pod 68 | téma 69 | mezi 70 | přes 71 | ty 72 | pak 73 | vám 74 | ani 75 | když 76 | však 77 | neg 78 | jsem 79 | tento 80 | článku 81 | články 82 | aby 83 | jsme 84 | před 85 | pta 86 | jejich 87 | byl 88 | ještě 89 | až 90 | bez 91 | také 92 | pouze 93 | první 94 | vaše 95 | která 96 | nás 97 | nový 98 | tipy 99 | pokud 100 | může 101 | strana 102 | jeho 103 | své 104 | jiné 105 | zprávy 106 | nové 107 | není 108 | vás 109 | jen 110 | podle 111 | zde 112 | už 113 | být 114 | více 115 | bude 116 | již 117 | než 118 | který 119 | by 120 | které 121 | co 122 | nebo 123 | ten 124 | tak 125 | má 126 | při 127 | od 128 | po 129 | jsou 130 | jak 131 | další 132 | ale 133 | si 134 | se 135 | ve 136 | to 137 | jako 138 | za 139 | zpět 140 | ze 141 | do 142 | pro 143 | je 144 | na 145 | atd 146 | atp 147 | jakmile 148 | přičemž 149 | já 150 | on 151 | ona 152 | ono 153 | oni 154 | ony 155 | my 156 | vy 157 | jí 158 | ji 159 | mě 160 | mne 161 | jemu 162 | tomu 163 | těm 164 | těmu 165 | němu 166 | němuž 167 | jehož 168 | jíž 169 | jelikož 170 | jež 171 | jakož 172 | načež 173 | 
-------------------------------------------------------------------------------- /ltr/years_as_ratings.py: -------------------------------------------------------------------------------- 1 | def get_classic_rating(year): 2 | if year > 2010: 3 | return 0 4 | elif year > 1990: 5 | return 1 6 | elif year > 1970: 7 | return 2 8 | elif year > 1950: 9 | return 3 10 | else: 11 | return 4 12 | 13 | def get_latest_rating(year): 14 | if year > 2010: 15 | return 4 16 | elif year > 1990: 17 | return 3 18 | elif year > 1970: 19 | return 2 20 | elif year > 1950: 21 | return 1 22 | else: 23 | return 0 24 | 25 | def synthesize( 26 | client, 27 | featureSet='release', 28 | latestTrainingSetOut='data/latest-training.txt', 29 | classicTrainingSetOut='data/classic-training.txt' 30 | ): 31 | from ltr.judgments import judgments_to_file, Judgment 32 | NO_ZERO = False 33 | 34 | resp = client.log_query('tmdb', 'release', None) 35 | 36 | # A classic film fan 37 | judgments = [] 38 | print("Generating 'classic' biased judgments:") 39 | for hit in resp: 40 | rating = get_classic_rating(hit['ltr_features'][0]) 41 | 42 | if rating == 0 and NO_ZERO: 43 | continue 44 | 45 | judgments.append(Judgment(qid=1,docId=hit['id'],grade=rating,features=hit['ltr_features'],keywords='')) 46 | 47 | 48 | with open(classicTrainingSetOut, 'w') as out: 49 | judgments_to_file(out, judgments) 50 | 51 | # A current film fan 52 | judgments = [] 53 | print("Generating 'recent' biased judgments:") 54 | for hit in resp: 55 | rating = get_latest_rating(hit['ltr_features'][0]) 56 | 57 | if rating == 0 and NO_ZERO: 58 | continue 59 | 60 | judgments.append(Judgment(qid=1,docId=hit['id'],grade=rating,features=hit['ltr_features'],keywords='')) 61 | 62 | 63 | with open(latestTrainingSetOut, 'w') as out: 64 | judgments_to_file(out, judgments) 65 | 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /ltr/search.py: 
-------------------------------------------------------------------------------- 1 | import re 2 | 3 | baseEsQuery = { 4 | "size": 5, 5 | "query": { 6 | "sltr": { 7 | "params": { 8 | "keywords": "", 9 | }, 10 | "model": "" 11 | } 12 | } 13 | } 14 | 15 | def esLtrQuery(keywords, modelName): 16 | import json 17 | baseEsQuery['query']['sltr']['params']['keywords'] = keywords 18 | baseEsQuery['query']['sltr']['params']['keywordsList'] = [keywords] # Needed by TSQ for now 19 | baseEsQuery['query']['sltr']['model'] = modelName 20 | print("%s" % json.dumps(baseEsQuery)) 21 | return baseEsQuery 22 | 23 | # TODO: Parse params and add efi dynamically instead of adding manually to query below 24 | def solrLtrQuery(keywords, modelName): 25 | keywords = re.sub('([^\s\w]|_)+', '', keywords) 26 | fuzzy_keywords = ' '.join([x + '~' for x in keywords.split(' ')]) 27 | 28 | return { 29 | 'fl': '*,score', 30 | 'rows': 5, 31 | 'q': '{{!ltr reRankDocs=30000 model={} efi.keywords="{}" efi.fuzzy_keywords="{}"}}'.format(modelName, keywords, fuzzy_keywords) 32 | } 33 | 34 | 35 | tmdbFields = { 36 | 'title': 'title', 37 | 'display_fields': ['release_year', 'genres', 'overview'] 38 | } 39 | 40 | 41 | 42 | def search(client, keywords, modelName, index='tmdb', fields=tmdbFields): 43 | if client.name() == 'elastic': 44 | results = client.query(index, esLtrQuery(keywords, modelName)) 45 | else: 46 | q = solrLtrQuery(keywords, modelName) 47 | print(q) 48 | results = client.query(index, q) 49 | 50 | ti = fields['title'] 51 | 52 | for result in results: 53 | print("%s " % (result[ti] if ti in result else 'N/A')) 54 | print("%s " % (result['_score'])) 55 | 56 | for df in fields['display_fields']: 57 | print("%s " % (result[df] if df in result else 'N/A')) 58 | 59 | print("---------------------------------------") 60 | -------------------------------------------------------------------------------- /ltr/helpers/msmarco/evaluate.py: 
-------------------------------------------------------------------------------- 1 | import csv 2 | import gzip 3 | 4 | 5 | class QRel(): 6 | 7 | def __init__(self, qid, docid, keywords): 8 | self.qid=qid 9 | self.docid=docid 10 | self.keywords = keywords 11 | 12 | def eval_rr(self, doc_ranking): 13 | """ Evaluate the provided doc ranking using reciprical rank 14 | (1/rank of the expected doc) 15 | 16 | returns 0 if this qrels doc id is missing 17 | """ 18 | 19 | for rank, docid in enumerate(doc_ranking, start=1): 20 | if docid == self.docid: 21 | return 1.0 / rank 22 | return 0.0 23 | 24 | @staticmethod 25 | def read_qrels(qrels_fname='data/msmarco-doctrain-qrels.tsv.gz', 26 | queries_fname='data/msmarco-doctrain-queries.tsv.gz'): 27 | 28 | qids_to_keywords = QRel.get_keyword_lookup(queries_fname) 29 | 30 | with gzip.open(qrels_fname, 'rt') as f: 31 | reader = csv.reader(f, delimiter=' ') 32 | for row in reader: 33 | qid = row[0] 34 | keywords = None 35 | if qid in qids_to_keywords: 36 | keywords = qids_to_keywords[qid] 37 | else: 38 | print("Missing keywords for %s" % qid) 39 | yield QRel(qid=row[0], docid=row[2], keywords=keywords) 40 | 41 | @staticmethod 42 | def get_keyword_lookup(fname='data/msmarco-doctrain-queries.tsv.gz'): 43 | qids_to_keywords = {} 44 | with gzip.open(fname, 'rt') as f: 45 | reader = csv.reader(f, delimiter='\t') 46 | for row in reader: 47 | qids_to_keywords[row[0]] = row[1] 48 | return qids_to_keywords 49 | 50 | def __str__(self): 51 | return "qid:%s(%s) => doc:%s" % (self.qid, self.keywords, self.docid) 52 | 53 | 54 | if __name__ == "__main__": 55 | qrels = {} 56 | for qrel in QRel.read_qrels(): 57 | qrels[qrel.qid] = qrel 58 | 59 | print(qrels['1185869'].eval_rr(['1','1'])) 60 | 61 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_ja.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file defines a 
stopword set for Japanese. 3 | # 4 | # This set is made up of hand-picked frequent terms from segmented Japanese Wikipedia. 5 | # Punctuation characters and frequent kanji have mostly been left out. See LUCENE-3745 6 | # for frequency lists, etc. that can be useful for making your own set (if desired) 7 | # 8 | # Note that there is an overlap between these stopwords and the terms stopped when used 9 | # in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note 10 | # that comments are not allowed on the same line as stopwords. 11 | # 12 | # Also note that stopping is done in a case-insensitive manner. Change your StopFilter 13 | # configuration if you need case-sensitive stopping. Lastly, note that stopping is done 14 | # using the same character width as the entries in this file. Since this StopFilter is 15 | # normally done after a CJKWidthFilter in your chain, you would usually want your romaji 16 | # entries to be in half-width and your kana entries to be in full-width. 
class Model():
    """Holds per-(query, doc) attractiveness estimates; pairs never observed
    default to 0.5."""
    def __init__(self):
        # Attractiveness per query-doc
        self.attracts = defaultdict(lambda: 0.5)


def cascade_model(sessions):
    """ Cascading model can be solved directly:
        - sessions with skips count against a doc
        - sessions with clicks count for
        - stop at first click
    """
    examined = Counter()   # times a (query, doc) was seen before/at first click
    clicked = Counter()    # times a (query, doc) received the first click
    model = Model()

    for sess in sessions:
        for result in sess.docs:
            key = (sess.query, result.doc_id)
            examined[key] += 1
            if result.click:
                # Only the first click counts in the cascade model; results
                # past it are treated as never examined.
                clicked[key] += 1
                break

    for key, times_seen in examined.items():
        model.attracts[key] = clicked[key] / times_seen
    return model
class Doc:
    """A single result in a search session: doc id, whether it was clicked,
    and an optional conversion marker/value."""

    def __init__(self, click, doc_id, conversion=False):
        self.click = click
        self.doc_id = doc_id
        self.conversion = conversion

    def __repr__(self):
        return "Doc(doc_id=%s, click=%s, conversion=%s)" % (self.doc_id, self.click, self.conversion)

    def __str__(self):
        return "(%s, %s, %s)" % (self.doc_id, self.click, self.conversion)


class Session:
    """One query plus its ranked result docs; each doc id may appear once."""

    def __init__(self, query, docs):
        self.query = query
        self.docs = docs
        # Reject result lists that mention the same doc twice
        seen = set()
        for result in docs:
            if result.doc_id in seen:
                raise ValueError("A session may only list a doc exactly once in search results")
            seen.add(result.doc_id)

    def __repr__(self):
        return "Session(query=%s, docs=%s)" % (self.query, self.docs)

    def __str__(self):
        return "(%s, (%s))" % (self.query, self.docs)


def build_one(sess_tuple):
    """ Take a tuple where
         0th item is query (a string that uniquely identifies it)
         1st item is a list of docs, with clicks
           and optionally a conversion id or true/false


        ('A', ((1, True), (2, False), (3, True), (0, False))),

        alternatively a value can be attached to the doc

        ('A', ((1, True, 0.9), (2, False, 0.8), (3, True, 1.0), (0, False))),
    """
    query, doc_tuples = sess_tuple[0], sess_tuple[1]
    docs = [Doc(doc_id=tup[0],
                click=tup[1],
                # Third element, when present, carries the conversion.
                conversion=tup[2] if len(tup) > 2 else False)
            for tup in doc_tuples]
    return Session(query=query, docs=docs)


def build(sess_tuples):
    """Build a list of Sessions from a list of session tuples."""
    return [build_one(tup) for tup in sess_tuples]
This keeps the examples agnostic about 8 | which backend is being used, but the implementations of each client 9 | should be useful references to those getting started with LTR on 10 | their specific platform 11 | ''' 12 | class BaseClient(ABC): 13 | @abstractmethod 14 | def get_host(self): 15 | pass 16 | 17 | @abstractmethod 18 | def name(self): 19 | pass 20 | 21 | @abstractmethod 22 | def delete_index(self, index): 23 | pass 24 | 25 | @abstractmethod 26 | def create_index(self, index): 27 | pass 28 | 29 | @abstractmethod 30 | def index_documents(self, index, doc_src): 31 | pass 32 | 33 | @abstractmethod 34 | def reset_ltr(self, index): 35 | pass 36 | 37 | @abstractmethod 38 | def create_featureset(self, index, name, ftr_config): 39 | pass 40 | 41 | @abstractmethod 42 | def get_feature_name(self, config, ftr_idx): 43 | pass 44 | 45 | @abstractmethod 46 | def query(self, index, query): 47 | pass 48 | 49 | @abstractmethod 50 | def get_doc(self, doc_id): 51 | pass 52 | 53 | @abstractmethod 54 | def log_query(self, index, featureset, ids, params): 55 | pass 56 | 57 | @abstractmethod 58 | def submit_model(self, featureset, index, model_name, model_payload): 59 | pass 60 | 61 | @abstractmethod 62 | def submit_ranklib_model(self, featureset, index, model_name, model_payload): 63 | pass 64 | 65 | @abstractmethod 66 | def model_query(self, index, model, model_params, query): 67 | pass 68 | 69 | @abstractmethod 70 | def feature_set(self, index, name): 71 | """ Return a mapping of name/feature ordinal 72 | and the raw (search engine specific) feature list""" 73 | pass 74 | 75 | 76 | -------------------------------------------------------------------------------- /ltr/evaluate.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import plotly.graph_objs as go 5 | from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot 6 | 7 | def log_run(cmd): 8 | resp = os.popen(cmd).read() 9 | 
print(resp) 10 | 11 | def quiet_run(cmd): 12 | os.popen(cmd).read() 13 | 14 | def evaluate(mode): 15 | # Build the docker image 16 | if mode == 'elastic': 17 | cmd = 'docker build --no-cache -t ltr-rre rre/elastic/.' 18 | else: 19 | cmd = 'docker build --no-cache -t ltr-rre rre/solr/.' 20 | 21 | print('Building RRE image - This will take a while') 22 | quiet_run(cmd) 23 | 24 | # Remove and run a fresh docker image 25 | cmd = 'docker rm -f ltr-rre' 26 | quiet_run(cmd) 27 | 28 | cmd = 'docker run --name ltr-rre ltr-rre' 29 | print('Running evaluation') 30 | log_run(cmd) 31 | 32 | # Copy out reports 33 | cmd = 'docker cp ltr-rre:/rre/target/rre/evaluation.json data/rre-evaluation.json' 34 | log_run(cmd) 35 | 36 | cmd = 'docker cp ltr-rre:/rre/target/site/rre-report.xlsx data/rre-report.xlsx' 37 | log_run(cmd) 38 | 39 | print('RRE Evaluation complete') 40 | 41 | 42 | def rre_table(): 43 | init_notebook_mode(connected=True) 44 | 45 | with open('data/rre-evaluation.json') as src: 46 | report = json.load(src) 47 | metrics = report['metrics'] 48 | 49 | experiments = ['baseline', 'classic', 'latest'] 50 | precisions = [] 51 | recalls = [] 52 | errs = [] 53 | 54 | for exp in experiments: 55 | precisions.append(metrics['P']['versions'][exp]['value']) 56 | recalls.append(metrics['R']['versions'][exp]['value']) 57 | errs.append(metrics['ERR@30']['versions'][exp]['value']) 58 | 59 | trace = go.Table( 60 | header=dict(values=['', 'Precision', 'Recall', 'ERR'], fill = dict(color='#AAAAAA')), 61 | cells=dict(values=[ 62 | experiments, 63 | precisions, 64 | recalls, 65 | errs 66 | ]) 67 | ) 68 | 69 | data = [trace] 70 | iplot(data) 71 | 72 | -------------------------------------------------------------------------------- /ltr/helpers/butterfingers.py: -------------------------------------------------------------------------------- 1 | def butterfingers(text,prob=0.1,keyboard='qwerty'): 2 | import random 3 | 4 | """ taken from 5 | 
https://github.com/Decagon/butter-fingers/blob/master/butterfingers/butterfingers.py """ 6 | 7 | keyApprox = {} 8 | 9 | if keyboard == "qwerty": 10 | keyApprox['q'] = "qwasedzx" 11 | keyApprox['w'] = "wqesadrfcx" 12 | keyApprox['e'] = "ewrsfdqazxcvgt" 13 | keyApprox['r'] = "retdgfwsxcvgt" 14 | keyApprox['t'] = "tryfhgedcvbnju" 15 | keyApprox['y'] = "ytugjhrfvbnji" 16 | keyApprox['u'] = "uyihkjtgbnmlo" 17 | keyApprox['i'] = "iuojlkyhnmlp" 18 | keyApprox['o'] = "oipklujm" 19 | keyApprox['p'] = "plo['ik" 20 | 21 | keyApprox['a'] = "aqszwxwdce" 22 | keyApprox['s'] = "swxadrfv" 23 | keyApprox['d'] = "decsfaqgbv" 24 | keyApprox['f'] = "fdgrvwsxyhn" 25 | keyApprox['g'] = "gtbfhedcyjn" 26 | keyApprox['h'] = "hyngjfrvkim" 27 | keyApprox['j'] = "jhknugtblom" 28 | keyApprox['k'] = "kjlinyhn" 29 | keyApprox['l'] = "lokmpujn" 30 | 31 | keyApprox['z'] = "zaxsvde" 32 | keyApprox['x'] = "xzcsdbvfrewq" 33 | keyApprox['c'] = "cxvdfzswergb" 34 | keyApprox['v'] = "vcfbgxdertyn" 35 | keyApprox['b'] = "bvnghcftyun" 36 | keyApprox['n'] = "nbmhjvgtuik" 37 | keyApprox['m'] = "mnkjloik" 38 | keyApprox[' '] = " " 39 | else: 40 | print("Keyboard not supported.") 41 | 42 | probOfTypo = int(prob * 100) 43 | 44 | buttertext = "" 45 | for letter in text: 46 | lcletter = letter.lower() 47 | if not lcletter in keyApprox.keys(): 48 | newletter = lcletter 49 | else: 50 | if random.choice(range(0, 100)) <= probOfTypo: 51 | newletter = random.choice(keyApprox[lcletter]) 52 | else: 53 | newletter = lcletter 54 | # go back to original case 55 | if not lcletter == letter: 56 | newletter = newletter.upper() 57 | buttertext += newletter 58 | 59 | return buttertext 60 | 61 | 62 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_lv.txt: -------------------------------------------------------------------------------- 1 | # Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins 2 | # the original list 
of over 800 forms was refined: 3 | # pronouns, adverbs, interjections were removed 4 | # 5 | # prepositions 6 | aiz 7 | ap 8 | ar 9 | apakš 10 | ārpus 11 | augšpus 12 | bez 13 | caur 14 | dēļ 15 | gar 16 | iekš 17 | iz 18 | kopš 19 | labad 20 | lejpus 21 | līdz 22 | no 23 | otrpus 24 | pa 25 | par 26 | pār 27 | pēc 28 | pie 29 | pirms 30 | pret 31 | priekš 32 | starp 33 | šaipus 34 | uz 35 | viņpus 36 | virs 37 | virspus 38 | zem 39 | apakšpus 40 | # Conjunctions 41 | un 42 | bet 43 | jo 44 | ja 45 | ka 46 | lai 47 | tomēr 48 | tikko 49 | turpretī 50 | arī 51 | kaut 52 | gan 53 | tādēļ 54 | tā 55 | ne 56 | tikvien 57 | vien 58 | kā 59 | ir 60 | te 61 | vai 62 | kamēr 63 | # Particles 64 | ar 65 | diezin 66 | droši 67 | diemžēl 68 | nebūt 69 | ik 70 | it 71 | taču 72 | nu 73 | pat 74 | tiklab 75 | iekšpus 76 | nedz 77 | tik 78 | nevis 79 | turpretim 80 | jeb 81 | iekam 82 | iekām 83 | iekāms 84 | kolīdz 85 | līdzko 86 | tiklīdz 87 | jebšu 88 | tālab 89 | tāpēc 90 | nekā 91 | itin 92 | jā 93 | jau 94 | jel 95 | nē 96 | nezin 97 | tad 98 | tikai 99 | vis 100 | tak 101 | iekams 102 | vien 103 | # modal verbs 104 | būt 105 | biju 106 | biji 107 | bija 108 | bijām 109 | bijāt 110 | esmu 111 | esi 112 | esam 113 | esat 114 | būšu 115 | būsi 116 | būs 117 | būsim 118 | būsiet 119 | tikt 120 | tiku 121 | tiki 122 | tika 123 | tikām 124 | tikāt 125 | tieku 126 | tiec 127 | tiek 128 | tiekam 129 | tiekat 130 | tikšu 131 | tiks 132 | tiksim 133 | tiksiet 134 | tapt 135 | tapi 136 | tapāt 137 | topat 138 | tapšu 139 | tapsi 140 | taps 141 | tapsim 142 | tapsiet 143 | kļūt 144 | kļuvu 145 | kļuvi 146 | kļuva 147 | kļuvām 148 | kļuvāt 149 | kļūstu 150 | kļūsti 151 | kļūst 152 | kļūstam 153 | kļūstat 154 | kļūšu 155 | kļūsi 156 | kļūs 157 | kļūsim 158 | kļūsiet 159 | # verbs 160 | varēt 161 | varēju 162 | varējām 163 | varēšu 164 | varēsim 165 | var 166 | varēji 167 | varējāt 168 | varēsi 169 | varēsiet 170 | varat 171 | varēja 172 | varēs 173 | 
-------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_lv.txt: -------------------------------------------------------------------------------- 1 | # Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins 2 | # the original list of over 800 forms was refined: 3 | # pronouns, adverbs, interjections were removed 4 | # 5 | # prepositions 6 | aiz 7 | ap 8 | ar 9 | apakš 10 | ārpus 11 | augšpus 12 | bez 13 | caur 14 | dēļ 15 | gar 16 | iekš 17 | iz 18 | kopš 19 | labad 20 | lejpus 21 | līdz 22 | no 23 | otrpus 24 | pa 25 | par 26 | pār 27 | pēc 28 | pie 29 | pirms 30 | pret 31 | priekš 32 | starp 33 | šaipus 34 | uz 35 | viņpus 36 | virs 37 | virspus 38 | zem 39 | apakšpus 40 | # Conjunctions 41 | un 42 | bet 43 | jo 44 | ja 45 | ka 46 | lai 47 | tomēr 48 | tikko 49 | turpretī 50 | arī 51 | kaut 52 | gan 53 | tādēļ 54 | tā 55 | ne 56 | tikvien 57 | vien 58 | kā 59 | ir 60 | te 61 | vai 62 | kamēr 63 | # Particles 64 | ar 65 | diezin 66 | droši 67 | diemžēl 68 | nebūt 69 | ik 70 | it 71 | taču 72 | nu 73 | pat 74 | tiklab 75 | iekšpus 76 | nedz 77 | tik 78 | nevis 79 | turpretim 80 | jeb 81 | iekam 82 | iekām 83 | iekāms 84 | kolīdz 85 | līdzko 86 | tiklīdz 87 | jebšu 88 | tālab 89 | tāpēc 90 | nekā 91 | itin 92 | jā 93 | jau 94 | jel 95 | nē 96 | nezin 97 | tad 98 | tikai 99 | vis 100 | tak 101 | iekams 102 | vien 103 | # modal verbs 104 | būt 105 | biju 106 | biji 107 | bija 108 | bijām 109 | bijāt 110 | esmu 111 | esi 112 | esam 113 | esat 114 | būšu 115 | būsi 116 | būs 117 | būsim 118 | būsiet 119 | tikt 120 | tiku 121 | tiki 122 | tika 123 | tikām 124 | tikāt 125 | tieku 126 | tiec 127 | tiek 128 | tiekam 129 | tiekat 130 | tikšu 131 | tiks 132 | tiksim 133 | tiksiet 134 | tapt 135 | tapi 136 | tapāt 137 | topat 138 | tapšu 139 | tapsi 140 | taps 141 | tapsim 142 | tapsiet 143 | kļūt 144 | kļuvu 145 | kļuvi 146 | kļuva 147 | kļuvām 148 | kļuvāt 149 | kļūstu 
def every_other_zipped(lst):
    """Pair consecutive elements: [k1, v1, k2, v2, ...] -> (k1, v1), (k2, v2), ...

    Note: a trailing odd element (a key with no value) is silently
    dropped by zip.
    """
    return zip(lst[0::2], lst[1::2])

def dictify(nl_tups):
    """Return a dict built from (key, value) tuples if all keys are
    unique, otherwise return the tuples unmodified (a dict would
    silently drop the duplicated keys)."""
    as_dict = dict(nl_tups)
    if len(as_dict) == len(nl_tups):
        return as_dict
    return nl_tups

def parse_named_list(lst):
    """Recursively convert a Solr "named list" (a flat
    [key, value, key, value, ...] list) into a dict, or a list of
    (key, value) tuples when keys repeat."""
    nl_as_tups = []

    # Directly unpack each (key, value) pair; no need to materialize
    # the generator into an intermediate list first.
    for key, value in every_other_zipped(lst):
        if isinstance(value, list):
            value = parse_named_list(value)
        nl_as_tups.append((key, value))
    return dictify(nl_as_tups)


def parse_termvect_namedlist(lst, field):
    """ Parse the named list and perform some transformations to create consistent
    JSON to parse

    Specifically changing {"positions": ...} to {"positions": [1234,4567]}

    """

    def listify_posns(posn_attrs):
        # A term with a single position parses to {'position': n};
        # multiple positions parse to [('position', n1), ('position', n2), ...]
        # because the key repeats. Normalize both to a plain list of ints.
        if isinstance(posn_attrs, dict):
            assert len(posn_attrs) == 1
            return [posn_attrs['position']]
        return [posn_attr[1] for posn_attr in posn_attrs]


    tv_parsed = parse_named_list(lst)
    for doc_id, doc_field_tv in tv_parsed.items():
        for field_name, term_vects in doc_field_tv.items():
            # Only rewrite positions for the requested field
            if field_name == field:
                for term, attrs in term_vects.items():
                    for attr_key, attr_val in attrs.items():
                        if attr_key == 'positions':
                            attrs['positions'] = listify_posns(attr_val)
    return tv_parsed



if __name__ == "__main__":
    solr_nl = [
        "D100000", [
            "uniqueKey", "D100000",
            "body", [
                "1", [
                    "positions", [
                        "position", 92,
                        "position", 113
                    ]],
                "2", [
                    "positions", [
                        "position", 22,
                        "position", 413
                    ]],
                "boo", [
                    "positions", [
                        "position", 22,
                    ]]
            ]]]
    print(repr(parse_termvect_namedlist(solr_nl, 'body')))
освен 125 | особено 126 | от 127 | отгоре 128 | отново 129 | още 130 | пак 131 | по 132 | повече 133 | повечето 134 | под 135 | поне 136 | поради 137 | после 138 | почти 139 | прави 140 | пред 141 | преди 142 | през 143 | при 144 | пък 145 | първо 146 | с 147 | са 148 | само 149 | се 150 | сега 151 | си 152 | скоро 153 | след 154 | сме 155 | според 156 | сред 157 | срещу 158 | сте 159 | съм 160 | със 161 | също 162 | т 163 | тази 164 | така 165 | такива 166 | такъв 167 | там 168 | твой 169 | те 170 | тези 171 | ти 172 | тн 173 | то 174 | това 175 | тогава 176 | този 177 | той 178 | толкова 179 | точно 180 | трябва 181 | тук 182 | тъй 183 | тя 184 | тях 185 | у 186 | харесва 187 | ч 188 | че 189 | често 190 | чрез 191 | ще 192 | щом 193 | я 194 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_bg.txt: -------------------------------------------------------------------------------- 1 | # This file was created by Jacques Savoy and is distributed under the BSD license. 2 | # See http://members.unine.ch/jacques.savoy/clef/index.html. 
3 | # Also see http://www.opensource.org/licenses/bsd-license.html 4 | а 5 | аз 6 | ако 7 | ала 8 | бе 9 | без 10 | беше 11 | би 12 | бил 13 | била 14 | били 15 | било 16 | близо 17 | бъдат 18 | бъде 19 | бяха 20 | в 21 | вас 22 | ваш 23 | ваша 24 | вероятно 25 | вече 26 | взема 27 | ви 28 | вие 29 | винаги 30 | все 31 | всеки 32 | всички 33 | всичко 34 | всяка 35 | във 36 | въпреки 37 | върху 38 | г 39 | ги 40 | главно 41 | го 42 | д 43 | да 44 | дали 45 | до 46 | докато 47 | докога 48 | дори 49 | досега 50 | доста 51 | е 52 | едва 53 | един 54 | ето 55 | за 56 | зад 57 | заедно 58 | заради 59 | засега 60 | затова 61 | защо 62 | защото 63 | и 64 | из 65 | или 66 | им 67 | има 68 | имат 69 | иска 70 | й 71 | каза 72 | как 73 | каква 74 | какво 75 | както 76 | какъв 77 | като 78 | кога 79 | когато 80 | което 81 | които 82 | кой 83 | който 84 | колко 85 | която 86 | къде 87 | където 88 | към 89 | ли 90 | м 91 | ме 92 | между 93 | мен 94 | ми 95 | мнозина 96 | мога 97 | могат 98 | може 99 | моля 100 | момента 101 | му 102 | н 103 | на 104 | над 105 | назад 106 | най 107 | направи 108 | напред 109 | например 110 | нас 111 | не 112 | него 113 | нея 114 | ни 115 | ние 116 | никой 117 | нито 118 | но 119 | някои 120 | някой 121 | няма 122 | обаче 123 | около 124 | освен 125 | особено 126 | от 127 | отгоре 128 | отново 129 | още 130 | пак 131 | по 132 | повече 133 | повечето 134 | под 135 | поне 136 | поради 137 | после 138 | почти 139 | прави 140 | пред 141 | преди 142 | през 143 | при 144 | пък 145 | първо 146 | с 147 | са 148 | само 149 | се 150 | сега 151 | си 152 | скоро 153 | след 154 | сме 155 | според 156 | сред 157 | срещу 158 | сте 159 | съм 160 | със 161 | също 162 | т 163 | тази 164 | така 165 | такива 166 | такъв 167 | там 168 | твой 169 | те 170 | тези 171 | ти 172 | тн 173 | то 174 | това 175 | тогава 176 | този 177 | той 178 | толкова 179 | точно 180 | трябва 181 | тук 182 | тъй 183 | тя 184 | тях 185 | у 186 | харесва 187 | ч 188 | че 189 | често 190 | 
sign = lambda a: (a>0) - (a<0)

def pairs_in_order(ranking, both_ways=True):
    """Yield (val1, val2, order) for every pair of items in `ranking`.

    `order` is +1 when val1 precedes val2, -1 otherwise. When
    `both_ways` is True each pair is also yielded reversed, with the
    opposite sign.
    """
    assert len(ranking) > 1
    for idx1, val1 in enumerate(ranking):
        for idx2, val2 in enumerate(ranking):
            if idx2 > idx1:
                yield val1, val2, sign(idx2-idx1)
                if both_ways:
                    yield val2, val1, sign(idx1-idx2)

def tau(rank1, rank2, at=4):
    """Kendall's tau between the top-`at` items of two rankings.

    Pairs of rank2 items absent from rank1's top-`at` count as
    discordant, so fully disjoint rankings score -1.

    Raises ValueError when either ranking is shorter than `at`.
    """
    rank1in = {}


    if len(rank1) < at or len(rank2) < at:
        raise ValueError("rankings must be larger than provided at param(%s)" % at)

    # Handle 1 as a special case: the single top item either matches or not
    if at == 1:
        if rank1[0] == rank2[0]:
            return 1
        return -1

    rank1 = rank1[:at]; rank2 = rank2[:at]

    # gather concordances/discords for rank1
    for val1, val2, order in pairs_in_order(rank1, both_ways=True):
        rank1in[(val1,val2)] = order

    # check rank2
    concords = 0
    discords = 0
    for val1, val2, order in pairs_in_order(rank2, both_ways=False):
        try:
            rank1order = rank1in[(val1,val2)]
            if order == rank1order:
                concords += 1
            else:
                discords += 1
        except KeyError:
            # Pair not present in rank1's top-`at`: treat as discordant
            discords += 1

    # Normalize by the number of unordered pairs: at*(at-1)/2
    return (concords - discords) / ((at * (at - 1)) / 2)

def avg_tau(rank1, rank2, at=4):
    """Average of tau at cutoffs 1..`at`; weights agreement near the
    top of the rankings more heavily than plain tau."""
    if len(rank1) < at or len(rank2) < at:
        raise ValueError("rankings must be larger than provided at param(%s)" % at)

    rank1 = rank1[:at]; rank2 = rank2[:at]

    tot = 0
    for i in range(1,at+1):
        tot += tau(rank1,rank2,at=i)
    return tot / (at)

if __name__ == "__main__":
    print(tau([1,2,3,4],[4,3,2,1]))
    print(tau([1,2,3,4],[1,2,3,4]))
    print(tau([1,2,4,3],[1,2,3,4]))
    print(tau([5,6,7,8],[1,2,3,4]))
    print(tau([1,2,3,5],[1,2,3,4]))
    print(tau([5,3,2,1],[4,3,2,1]))
    l1=[1,2,4,3]; l2=[1,2,3,4]; l3=[2,1,3,4]
    print("avg_tau(%s,%s,at=4) %s" % (l1, l1, avg_tau(l1,l1)))
    print("avg_tau(%s,%s,at=4) %s" % (l1, l2, avg_tau(l1,l2)))
    # bugfix: labels now match the arguments actually passed (was l2, l3)
    print("avg_tau(%s,%s,at=4) %s" % (l1, l3, avg_tau(l1,l3)))
    print("tau(%s,%s,at=4) %s" % (l1, l2, tau(l1,l2)))
    # bugfix: labels now match the arguments actually passed (was l2, l3)
    print("tau(%s,%s,at=4) %s" % (l1, l3, tau(l1,l3)))
e 59 | eh 60 | el 61 | els 62 | em 63 | en 64 | encara 65 | ens 66 | entre 67 | érem 68 | eren 69 | éreu 70 | es 71 | és 72 | esta 73 | està 74 | estàvem 75 | estaven 76 | estàveu 77 | esteu 78 | et 79 | etc 80 | ets 81 | fins 82 | fora 83 | gairebé 84 | ha 85 | han 86 | has 87 | havia 88 | he 89 | hem 90 | heu 91 | hi 92 | ho 93 | i 94 | igual 95 | iguals 96 | ja 97 | l'hi 98 | la 99 | les 100 | li 101 | li'n 102 | llavors 103 | m'he 104 | ma 105 | mal 106 | malgrat 107 | mateix 108 | mateixa 109 | mateixes 110 | mateixos 111 | me 112 | mentre 113 | més 114 | meu 115 | meus 116 | meva 117 | meves 118 | molt 119 | molta 120 | moltes 121 | molts 122 | mon 123 | mons 124 | n'he 125 | n'hi 126 | ne 127 | ni 128 | no 129 | nogensmenys 130 | només 131 | nosaltres 132 | nostra 133 | nostre 134 | nostres 135 | o 136 | oh 137 | oi 138 | on 139 | pas 140 | pel 141 | pels 142 | per 143 | però 144 | perquè 145 | poc 146 | poca 147 | pocs 148 | poques 149 | potser 150 | propi 151 | qual 152 | quals 153 | quan 154 | quant 155 | que 156 | què 157 | quelcom 158 | qui 159 | quin 160 | quina 161 | quines 162 | quins 163 | s'ha 164 | s'han 165 | sa 166 | semblant 167 | semblants 168 | ses 169 | seu 170 | seus 171 | seva 172 | seva 173 | seves 174 | si 175 | sobre 176 | sobretot 177 | sóc 178 | solament 179 | sols 180 | son 181 | són 182 | sons 183 | sota 184 | sou 185 | t'ha 186 | t'han 187 | t'he 188 | ta 189 | tal 190 | també 191 | tampoc 192 | tan 193 | tant 194 | tanta 195 | tantes 196 | teu 197 | teus 198 | teva 199 | teves 200 | ton 201 | tons 202 | tot 203 | tota 204 | totes 205 | tots 206 | un 207 | una 208 | unes 209 | uns 210 | us 211 | va 212 | vaig 213 | vam 214 | van 215 | vas 216 | veu 217 | vosaltres 218 | vostra 219 | vostre 220 | vostres 221 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_ca.txt: 
-------------------------------------------------------------------------------- 1 | # Catalan stopwords from http://github.com/vcl/cue.language (Apache 2 Licensed) 2 | a 3 | abans 4 | ací 5 | ah 6 | així 7 | això 8 | al 9 | als 10 | aleshores 11 | algun 12 | alguna 13 | algunes 14 | alguns 15 | alhora 16 | allà 17 | allí 18 | allò 19 | altra 20 | altre 21 | altres 22 | amb 23 | ambdós 24 | ambdues 25 | apa 26 | aquell 27 | aquella 28 | aquelles 29 | aquells 30 | aquest 31 | aquesta 32 | aquestes 33 | aquests 34 | aquí 35 | baix 36 | cada 37 | cadascú 38 | cadascuna 39 | cadascunes 40 | cadascuns 41 | com 42 | contra 43 | d'un 44 | d'una 45 | d'unes 46 | d'uns 47 | dalt 48 | de 49 | del 50 | dels 51 | des 52 | després 53 | dins 54 | dintre 55 | donat 56 | doncs 57 | durant 58 | e 59 | eh 60 | el 61 | els 62 | em 63 | en 64 | encara 65 | ens 66 | entre 67 | érem 68 | eren 69 | éreu 70 | es 71 | és 72 | esta 73 | està 74 | estàvem 75 | estaven 76 | estàveu 77 | esteu 78 | et 79 | etc 80 | ets 81 | fins 82 | fora 83 | gairebé 84 | ha 85 | han 86 | has 87 | havia 88 | he 89 | hem 90 | heu 91 | hi 92 | ho 93 | i 94 | igual 95 | iguals 96 | ja 97 | l'hi 98 | la 99 | les 100 | li 101 | li'n 102 | llavors 103 | m'he 104 | ma 105 | mal 106 | malgrat 107 | mateix 108 | mateixa 109 | mateixes 110 | mateixos 111 | me 112 | mentre 113 | més 114 | meu 115 | meus 116 | meva 117 | meves 118 | molt 119 | molta 120 | moltes 121 | molts 122 | mon 123 | mons 124 | n'he 125 | n'hi 126 | ne 127 | ni 128 | no 129 | nogensmenys 130 | només 131 | nosaltres 132 | nostra 133 | nostre 134 | nostres 135 | o 136 | oh 137 | oi 138 | on 139 | pas 140 | pel 141 | pels 142 | per 143 | però 144 | perquè 145 | poc 146 | poca 147 | pocs 148 | poques 149 | potser 150 | propi 151 | qual 152 | quals 153 | quan 154 | quant 155 | que 156 | què 157 | quelcom 158 | qui 159 | quin 160 | quina 161 | quines 162 | quins 163 | s'ha 164 | s'han 165 | sa 166 | semblant 167 | semblants 168 | ses 169 | seu 170 | seus 
171 | seva 172 | seva 173 | seves 174 | si 175 | sobre 176 | sobretot 177 | sóc 178 | solament 179 | sols 180 | son 181 | són 182 | sons 183 | sota 184 | sou 185 | t'ha 186 | t'han 187 | t'he 188 | ta 189 | tal 190 | també 191 | tampoc 192 | tan 193 | tant 194 | tanta 195 | tantes 196 | teu 197 | teus 198 | teva 199 | teves 200 | ton 201 | tons 202 | tot 203 | tota 204 | totes 205 | tots 206 | un 207 | una 208 | unes 209 | uns 210 | us 211 | va 212 | vaig 213 | vam 214 | van 215 | vas 216 | veu 217 | vosaltres 218 | vostra 219 | vostre 220 | vostres 221 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_tr.txt: -------------------------------------------------------------------------------- 1 | # Turkish stopwords from LUCENE-559 2 | # merged with the list from "Information Retrieval on Turkish Texts" 3 | # (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf) 4 | acaba 5 | altmış 6 | altı 7 | ama 8 | ancak 9 | arada 10 | aslında 11 | ayrıca 12 | bana 13 | bazı 14 | belki 15 | ben 16 | benden 17 | beni 18 | benim 19 | beri 20 | beş 21 | bile 22 | bin 23 | bir 24 | birçok 25 | biri 26 | birkaç 27 | birkez 28 | birşey 29 | birşeyi 30 | biz 31 | bize 32 | bizden 33 | bizi 34 | bizim 35 | böyle 36 | böylece 37 | bu 38 | buna 39 | bunda 40 | bundan 41 | bunlar 42 | bunları 43 | bunların 44 | bunu 45 | bunun 46 | burada 47 | çok 48 | çünkü 49 | da 50 | daha 51 | dahi 52 | de 53 | defa 54 | değil 55 | diğer 56 | diye 57 | doksan 58 | dokuz 59 | dolayı 60 | dolayısıyla 61 | dört 62 | edecek 63 | eden 64 | ederek 65 | edilecek 66 | ediliyor 67 | edilmesi 68 | ediyor 69 | eğer 70 | elli 71 | en 72 | etmesi 73 | etti 74 | ettiği 75 | ettiğini 76 | gibi 77 | göre 78 | halen 79 | hangi 80 | hatta 81 | hem 82 | henüz 83 | hep 84 | hepsi 85 | her 86 | herhangi 87 | herkesin 88 | hiç 89 | hiçbir 90 | için 91 | iki 92 | ile 93 | ilgili 94 | ise 95 | işte 96 | itibaren 97 | itibariyle 98 
| kadar 99 | karşın 100 | katrilyon 101 | kendi 102 | kendilerine 103 | kendini 104 | kendisi 105 | kendisine 106 | kendisini 107 | kez 108 | ki 109 | kim 110 | kimden 111 | kime 112 | kimi 113 | kimse 114 | kırk 115 | milyar 116 | milyon 117 | mu 118 | mü 119 | mı 120 | nasıl 121 | ne 122 | neden 123 | nedenle 124 | nerde 125 | nerede 126 | nereye 127 | niye 128 | niçin 129 | o 130 | olan 131 | olarak 132 | oldu 133 | olduğu 134 | olduğunu 135 | olduklarını 136 | olmadı 137 | olmadığı 138 | olmak 139 | olması 140 | olmayan 141 | olmaz 142 | olsa 143 | olsun 144 | olup 145 | olur 146 | olursa 147 | oluyor 148 | on 149 | ona 150 | ondan 151 | onlar 152 | onlardan 153 | onları 154 | onların 155 | onu 156 | onun 157 | otuz 158 | oysa 159 | öyle 160 | pek 161 | rağmen 162 | sadece 163 | sanki 164 | sekiz 165 | seksen 166 | sen 167 | senden 168 | seni 169 | senin 170 | siz 171 | sizden 172 | sizi 173 | sizin 174 | şey 175 | şeyden 176 | şeyi 177 | şeyler 178 | şöyle 179 | şu 180 | şuna 181 | şunda 182 | şundan 183 | şunları 184 | şunu 185 | tarafından 186 | trilyon 187 | tüm 188 | üç 189 | üzere 190 | var 191 | vardı 192 | ve 193 | veya 194 | ya 195 | yani 196 | yapacak 197 | yapılan 198 | yapılması 199 | yapıyor 200 | yapmak 201 | yaptı 202 | yaptığı 203 | yaptığını 204 | yaptıkları 205 | yedi 206 | yerine 207 | yetmiş 208 | yine 209 | yirmi 210 | yoksa 211 | yüz 212 | zaten 213 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_tr.txt: -------------------------------------------------------------------------------- 1 | # Turkish stopwords from LUCENE-559 2 | # merged with the list from "Information Retrieval on Turkish Texts" 3 | # (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf) 4 | acaba 5 | altmış 6 | altı 7 | ama 8 | ancak 9 | arada 10 | aslında 11 | ayrıca 12 | bana 13 | bazı 14 | belki 15 | ben 16 | benden 17 | beni 18 | benim 19 | beri 20 | beş 21 | bile 
22 | bin 23 | bir 24 | birçok 25 | biri 26 | birkaç 27 | birkez 28 | birşey 29 | birşeyi 30 | biz 31 | bize 32 | bizden 33 | bizi 34 | bizim 35 | böyle 36 | böylece 37 | bu 38 | buna 39 | bunda 40 | bundan 41 | bunlar 42 | bunları 43 | bunların 44 | bunu 45 | bunun 46 | burada 47 | çok 48 | çünkü 49 | da 50 | daha 51 | dahi 52 | de 53 | defa 54 | değil 55 | diğer 56 | diye 57 | doksan 58 | dokuz 59 | dolayı 60 | dolayısıyla 61 | dört 62 | edecek 63 | eden 64 | ederek 65 | edilecek 66 | ediliyor 67 | edilmesi 68 | ediyor 69 | eğer 70 | elli 71 | en 72 | etmesi 73 | etti 74 | ettiği 75 | ettiğini 76 | gibi 77 | göre 78 | halen 79 | hangi 80 | hatta 81 | hem 82 | henüz 83 | hep 84 | hepsi 85 | her 86 | herhangi 87 | herkesin 88 | hiç 89 | hiçbir 90 | için 91 | iki 92 | ile 93 | ilgili 94 | ise 95 | işte 96 | itibaren 97 | itibariyle 98 | kadar 99 | karşın 100 | katrilyon 101 | kendi 102 | kendilerine 103 | kendini 104 | kendisi 105 | kendisine 106 | kendisini 107 | kez 108 | ki 109 | kim 110 | kimden 111 | kime 112 | kimi 113 | kimse 114 | kırk 115 | milyar 116 | milyon 117 | mu 118 | mü 119 | mı 120 | nasıl 121 | ne 122 | neden 123 | nedenle 124 | nerde 125 | nerede 126 | nereye 127 | niye 128 | niçin 129 | o 130 | olan 131 | olarak 132 | oldu 133 | olduğu 134 | olduğunu 135 | olduklarını 136 | olmadı 137 | olmadığı 138 | olmak 139 | olması 140 | olmayan 141 | olmaz 142 | olsa 143 | olsun 144 | olup 145 | olur 146 | olursa 147 | oluyor 148 | on 149 | ona 150 | ondan 151 | onlar 152 | onlardan 153 | onları 154 | onların 155 | onu 156 | onun 157 | otuz 158 | oysa 159 | öyle 160 | pek 161 | rağmen 162 | sadece 163 | sanki 164 | sekiz 165 | seksen 166 | sen 167 | senden 168 | seni 169 | senin 170 | siz 171 | sizden 172 | sizi 173 | sizin 174 | şey 175 | şeyden 176 | şeyi 177 | şeyler 178 | şöyle 179 | şu 180 | şuna 181 | şunda 182 | şundan 183 | şunları 184 | şunu 185 | tarafından 186 | trilyon 187 | tüm 188 | üç 189 | üzere 190 | var 191 | vardı 192 | ve 193 | veya 194 
import json
from tqdm import tqdm

class Memoize:
    """Cache a function's results keyed by its positional arguments.

    Adapted from
    https://stackoverflow.com/questions/1988804/what-is-memoization-and-how-can-i-use-it-in-python"""
    def __init__(self, f):
        self.f = f
        self.memo = {}
    def __call__(self, *args):
        if args not in self.memo:
            self.memo[args] = self.f(*args)
        #Warning: You may wish to do a deepcopy here if returning objects
        return self.memo[args]

@Memoize
def load_movies(json_path):
    """Load the TMDB movie dump (a dict keyed by tmdb id); cached per path."""
    # Context manager closes the handle (json.load(open(...)) leaked it).
    with open(json_path) as f:
        return json.load(f)

def get_movie(tmdb_id, movies='data/tmdb.json'):
    """Return the raw TMDB movie dict for `tmdb_id` (int or str)."""
    movies = load_movies(movies)
    tmdb_id=str(tmdb_id)
    return movies[tmdb_id]

def noop(src_movie, base_doc):
    """Default `enrich` hook for indexable_movies: index the doc unchanged."""
    return base_doc


def indexable_movies(enrich=noop, movies='data/tmdb.json'):
    """ Generates TMDB movies, similar to how ES Bulk indexing
    uses a generator to generate bulk index/update actions

    enrich(src_movie, base_doc) may add or transform fields on each
    emitted doc. Movies missing a required attribute (KeyError) are
    skipped."""
    movies = load_movies(movies)
    for movieId, tmdbMovie in tqdm(movies.items(),total=len(movies)):
        try:
            releaseDate = None
            # bugfix: releaseYear was unbound (UnboundLocalError, not caught
            # by the KeyError handler) whenever release_date was absent.
            releaseYear = None
            if 'release_date' in tmdbMovie and len(tmdbMovie['release_date']) > 0:
                releaseDate = tmdbMovie['release_date']
                releaseYear = releaseDate[0:4]

            full_poster_path = ''
            if 'poster_path' in tmdbMovie and tmdbMovie['poster_path'] is not None and len(tmdbMovie['poster_path']) > 0:
                full_poster_path = 'https://image.tmdb.org/t/p/w185' + tmdbMovie['poster_path']

            base_doc = {'id': movieId,
                        'title': tmdbMovie['title'],
                        'overview': tmdbMovie['overview'],
                        'tagline': tmdbMovie['tagline'],
                        'directors': [director['name'] for director in tmdbMovie['directors']],
                        'cast': " ".join([castMember['name'] for castMember in tmdbMovie['cast']]),
                        'genres': [genre['name'] for genre in tmdbMovie['genres']],
                        'release_date': releaseDate,
                        'release_year': releaseYear,
                        'poster_path': full_poster_path,
                        'vote_average': float(tmdbMovie['vote_average']) if 'vote_average' in tmdbMovie else None,
                        'vote_count': int(tmdbMovie['vote_count']) if 'vote_count' in tmdbMovie else 0,
                        }
            yield enrich(tmdbMovie, base_doc)
        except KeyError: # Ignore any movies missing these attributes
            continue
3 | # Also see http://www.opensource.org/licenses/bsd-license.html 4 | acea 5 | aceasta 6 | această 7 | aceea 8 | acei 9 | aceia 10 | acel 11 | acela 12 | acele 13 | acelea 14 | acest 15 | acesta 16 | aceste 17 | acestea 18 | aceşti 19 | aceştia 20 | acolo 21 | acum 22 | ai 23 | aia 24 | aibă 25 | aici 26 | al 27 | ăla 28 | ale 29 | alea 30 | ălea 31 | altceva 32 | altcineva 33 | am 34 | ar 35 | are 36 | aş 37 | aşadar 38 | asemenea 39 | asta 40 | ăsta 41 | astăzi 42 | astea 43 | ăstea 44 | ăştia 45 | asupra 46 | aţi 47 | au 48 | avea 49 | avem 50 | aveţi 51 | azi 52 | bine 53 | bucur 54 | bună 55 | ca 56 | că 57 | căci 58 | când 59 | care 60 | cărei 61 | căror 62 | cărui 63 | cât 64 | câte 65 | câţi 66 | către 67 | câtva 68 | ce 69 | cel 70 | ceva 71 | chiar 72 | cînd 73 | cine 74 | cineva 75 | cît 76 | cîte 77 | cîţi 78 | cîtva 79 | contra 80 | cu 81 | cum 82 | cumva 83 | curând 84 | curînd 85 | da 86 | dă 87 | dacă 88 | dar 89 | datorită 90 | de 91 | deci 92 | deja 93 | deoarece 94 | departe 95 | deşi 96 | din 97 | dinaintea 98 | dintr 99 | dintre 100 | drept 101 | după 102 | ea 103 | ei 104 | el 105 | ele 106 | eram 107 | este 108 | eşti 109 | eu 110 | face 111 | fără 112 | fi 113 | fie 114 | fiecare 115 | fii 116 | fim 117 | fiţi 118 | iar 119 | ieri 120 | îi 121 | îl 122 | îmi 123 | împotriva 124 | în 125 | înainte 126 | înaintea 127 | încât 128 | încît 129 | încotro 130 | între 131 | întrucât 132 | întrucît 133 | îţi 134 | la 135 | lângă 136 | le 137 | li 138 | lîngă 139 | lor 140 | lui 141 | mă 142 | mâine 143 | mea 144 | mei 145 | mele 146 | mereu 147 | meu 148 | mi 149 | mine 150 | mult 151 | multă 152 | mulţi 153 | ne 154 | nicăieri 155 | nici 156 | nimeni 157 | nişte 158 | noastră 159 | noastre 160 | noi 161 | noştri 162 | nostru 163 | nu 164 | ori 165 | oricând 166 | oricare 167 | oricât 168 | orice 169 | oricînd 170 | oricine 171 | oricît 172 | oricum 173 | oriunde 174 | până 175 | pe 176 | pentru 177 | peste 178 | pînă 179 | poate 180 | pot 181 | 
prea 182 | prima 183 | primul 184 | prin 185 | printr 186 | sa 187 | să 188 | săi 189 | sale 190 | sau 191 | său 192 | se 193 | şi 194 | sînt 195 | sîntem 196 | sînteţi 197 | spre 198 | sub 199 | sunt 200 | suntem 201 | sunteţi 202 | ta 203 | tăi 204 | tale 205 | tău 206 | te 207 | ţi 208 | ţie 209 | tine 210 | toată 211 | toate 212 | tot 213 | toţi 214 | totuşi 215 | tu 216 | un 217 | una 218 | unde 219 | undeva 220 | unei 221 | unele 222 | uneori 223 | unor 224 | vă 225 | vi 226 | voastră 227 | voastre 228 | voi 229 | voştri 230 | vostru 231 | vouă 232 | vreo 233 | vreun 234 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_ro.txt: -------------------------------------------------------------------------------- 1 | # This file was created by Jacques Savoy and is distributed under the BSD license. 2 | # See http://members.unine.ch/jacques.savoy/clef/index.html. 3 | # Also see http://www.opensource.org/licenses/bsd-license.html 4 | acea 5 | aceasta 6 | această 7 | aceea 8 | acei 9 | aceia 10 | acel 11 | acela 12 | acele 13 | acelea 14 | acest 15 | acesta 16 | aceste 17 | acestea 18 | aceşti 19 | aceştia 20 | acolo 21 | acum 22 | ai 23 | aia 24 | aibă 25 | aici 26 | al 27 | ăla 28 | ale 29 | alea 30 | ălea 31 | altceva 32 | altcineva 33 | am 34 | ar 35 | are 36 | aş 37 | aşadar 38 | asemenea 39 | asta 40 | ăsta 41 | astăzi 42 | astea 43 | ăstea 44 | ăştia 45 | asupra 46 | aţi 47 | au 48 | avea 49 | avem 50 | aveţi 51 | azi 52 | bine 53 | bucur 54 | bună 55 | ca 56 | că 57 | căci 58 | când 59 | care 60 | cărei 61 | căror 62 | cărui 63 | cât 64 | câte 65 | câţi 66 | către 67 | câtva 68 | ce 69 | cel 70 | ceva 71 | chiar 72 | cînd 73 | cine 74 | cineva 75 | cît 76 | cîte 77 | cîţi 78 | cîtva 79 | contra 80 | cu 81 | cum 82 | cumva 83 | curând 84 | curînd 85 | da 86 | dă 87 | dacă 88 | dar 89 | datorită 90 | de 91 | deci 92 | deja 93 | deoarece 94 | departe 95 | deşi 96 | din 97 
| dinaintea 98 | dintr 99 | dintre 100 | drept 101 | după 102 | ea 103 | ei 104 | el 105 | ele 106 | eram 107 | este 108 | eşti 109 | eu 110 | face 111 | fără 112 | fi 113 | fie 114 | fiecare 115 | fii 116 | fim 117 | fiţi 118 | iar 119 | ieri 120 | îi 121 | îl 122 | îmi 123 | împotriva 124 | în 125 | înainte 126 | înaintea 127 | încât 128 | încît 129 | încotro 130 | între 131 | întrucât 132 | întrucît 133 | îţi 134 | la 135 | lângă 136 | le 137 | li 138 | lîngă 139 | lor 140 | lui 141 | mă 142 | mâine 143 | mea 144 | mei 145 | mele 146 | mereu 147 | meu 148 | mi 149 | mine 150 | mult 151 | multă 152 | mulţi 153 | ne 154 | nicăieri 155 | nici 156 | nimeni 157 | nişte 158 | noastră 159 | noastre 160 | noi 161 | noştri 162 | nostru 163 | nu 164 | ori 165 | oricând 166 | oricare 167 | oricât 168 | orice 169 | oricînd 170 | oricine 171 | oricît 172 | oricum 173 | oriunde 174 | până 175 | pe 176 | pentru 177 | peste 178 | pînă 179 | poate 180 | pot 181 | prea 182 | prima 183 | primul 184 | prin 185 | printr 186 | sa 187 | să 188 | săi 189 | sale 190 | sau 191 | său 192 | se 193 | şi 194 | sînt 195 | sîntem 196 | sînteţi 197 | spre 198 | sub 199 | sunt 200 | suntem 201 | sunteţi 202 | ta 203 | tăi 204 | tale 205 | tău 206 | te 207 | ţi 208 | ţie 209 | tine 210 | toată 211 | toate 212 | tot 213 | toţi 214 | totuşi 215 | tu 216 | un 217 | una 218 | unde 219 | undeva 220 | unei 221 | unele 222 | uneori 223 | unor 224 | vă 225 | vi 226 | voastră 227 | voastre 228 | voi 229 | voştri 230 | vostru 231 | vouă 232 | vreo 233 | vreun 234 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_hu.txt: -------------------------------------------------------------------------------- 1 | | From svn.tartarus.org/snowball/trunk/website/algorithms/hungarian/stop.txt 2 | | This file is distributed under the BSD License. 
3 | | See http://snowball.tartarus.org/license.php 4 | | Also see http://www.opensource.org/licenses/bsd-license.html 5 | | - Encoding was converted to UTF-8. 6 | | - This notice was added. 7 | | 8 | | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" 9 | 10 | | Hungarian stop word list 11 | | prepared by Anna Tordai 12 | 13 | a 14 | ahogy 15 | ahol 16 | aki 17 | akik 18 | akkor 19 | alatt 20 | által 21 | általában 22 | amely 23 | amelyek 24 | amelyekben 25 | amelyeket 26 | amelyet 27 | amelynek 28 | ami 29 | amit 30 | amolyan 31 | amíg 32 | amikor 33 | át 34 | abban 35 | ahhoz 36 | annak 37 | arra 38 | arról 39 | az 40 | azok 41 | azon 42 | azt 43 | azzal 44 | azért 45 | aztán 46 | azután 47 | azonban 48 | bár 49 | be 50 | belül 51 | benne 52 | cikk 53 | cikkek 54 | cikkeket 55 | csak 56 | de 57 | e 58 | eddig 59 | egész 60 | egy 61 | egyes 62 | egyetlen 63 | egyéb 64 | egyik 65 | egyre 66 | ekkor 67 | el 68 | elég 69 | ellen 70 | elő 71 | először 72 | előtt 73 | első 74 | én 75 | éppen 76 | ebben 77 | ehhez 78 | emilyen 79 | ennek 80 | erre 81 | ez 82 | ezt 83 | ezek 84 | ezen 85 | ezzel 86 | ezért 87 | és 88 | fel 89 | felé 90 | hanem 91 | hiszen 92 | hogy 93 | hogyan 94 | igen 95 | így 96 | illetve 97 | ill. 
98 | ill 99 | ilyen 100 | ilyenkor 101 | ison 102 | ismét 103 | itt 104 | jó 105 | jól 106 | jobban 107 | kell 108 | kellett 109 | keresztül 110 | keressünk 111 | ki 112 | kívül 113 | között 114 | közül 115 | legalább 116 | lehet 117 | lehetett 118 | legyen 119 | lenne 120 | lenni 121 | lesz 122 | lett 123 | maga 124 | magát 125 | majd 126 | majd 127 | már 128 | más 129 | másik 130 | meg 131 | még 132 | mellett 133 | mert 134 | mely 135 | melyek 136 | mi 137 | mit 138 | míg 139 | miért 140 | milyen 141 | mikor 142 | minden 143 | mindent 144 | mindenki 145 | mindig 146 | mint 147 | mintha 148 | mivel 149 | most 150 | nagy 151 | nagyobb 152 | nagyon 153 | ne 154 | néha 155 | nekem 156 | neki 157 | nem 158 | néhány 159 | nélkül 160 | nincs 161 | olyan 162 | ott 163 | össze 164 | ő 165 | ők 166 | őket 167 | pedig 168 | persze 169 | rá 170 | s 171 | saját 172 | sem 173 | semmi 174 | sok 175 | sokat 176 | sokkal 177 | számára 178 | szemben 179 | szerint 180 | szinte 181 | talán 182 | tehát 183 | teljes 184 | tovább 185 | továbbá 186 | több 187 | úgy 188 | ugyanis 189 | új 190 | újabb 191 | újra 192 | után 193 | utána 194 | utolsó 195 | vagy 196 | vagyis 197 | valaki 198 | valami 199 | valamint 200 | való 201 | vagyok 202 | van 203 | vannak 204 | volt 205 | voltam 206 | voltak 207 | voltunk 208 | vissza 209 | vele 210 | viszont 211 | volna 212 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_hu.txt: -------------------------------------------------------------------------------- 1 | | From svn.tartarus.org/snowball/trunk/website/algorithms/hungarian/stop.txt 2 | | This file is distributed under the BSD License. 3 | | See http://snowball.tartarus.org/license.php 4 | | Also see http://www.opensource.org/licenses/bsd-license.html 5 | | - Encoding was converted to UTF-8. 6 | | - This notice was added. 
7 | | 8 | | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" 9 | 10 | | Hungarian stop word list 11 | | prepared by Anna Tordai 12 | 13 | a 14 | ahogy 15 | ahol 16 | aki 17 | akik 18 | akkor 19 | alatt 20 | által 21 | általában 22 | amely 23 | amelyek 24 | amelyekben 25 | amelyeket 26 | amelyet 27 | amelynek 28 | ami 29 | amit 30 | amolyan 31 | amíg 32 | amikor 33 | át 34 | abban 35 | ahhoz 36 | annak 37 | arra 38 | arról 39 | az 40 | azok 41 | azon 42 | azt 43 | azzal 44 | azért 45 | aztán 46 | azután 47 | azonban 48 | bár 49 | be 50 | belül 51 | benne 52 | cikk 53 | cikkek 54 | cikkeket 55 | csak 56 | de 57 | e 58 | eddig 59 | egész 60 | egy 61 | egyes 62 | egyetlen 63 | egyéb 64 | egyik 65 | egyre 66 | ekkor 67 | el 68 | elég 69 | ellen 70 | elő 71 | először 72 | előtt 73 | első 74 | én 75 | éppen 76 | ebben 77 | ehhez 78 | emilyen 79 | ennek 80 | erre 81 | ez 82 | ezt 83 | ezek 84 | ezen 85 | ezzel 86 | ezért 87 | és 88 | fel 89 | felé 90 | hanem 91 | hiszen 92 | hogy 93 | hogyan 94 | igen 95 | így 96 | illetve 97 | ill. 
98 | ill 99 | ilyen 100 | ilyenkor 101 | ison 102 | ismét 103 | itt 104 | jó 105 | jól 106 | jobban 107 | kell 108 | kellett 109 | keresztül 110 | keressünk 111 | ki 112 | kívül 113 | között 114 | közül 115 | legalább 116 | lehet 117 | lehetett 118 | legyen 119 | lenne 120 | lenni 121 | lesz 122 | lett 123 | maga 124 | magát 125 | majd 126 | majd 127 | már 128 | más 129 | másik 130 | meg 131 | még 132 | mellett 133 | mert 134 | mely 135 | melyek 136 | mi 137 | mit 138 | míg 139 | miért 140 | milyen 141 | mikor 142 | minden 143 | mindent 144 | mindenki 145 | mindig 146 | mint 147 | mintha 148 | mivel 149 | most 150 | nagy 151 | nagyobb 152 | nagyon 153 | ne 154 | néha 155 | nekem 156 | neki 157 | nem 158 | néhány 159 | nélkül 160 | nincs 161 | olyan 162 | ott 163 | össze 164 | ő 165 | ők 166 | őket 167 | pedig 168 | persze 169 | rá 170 | s 171 | saját 172 | sem 173 | semmi 174 | sok 175 | sokat 176 | sokkal 177 | számára 178 | szemben 179 | szerint 180 | szinte 181 | talán 182 | tehát 183 | teljes 184 | tovább 185 | továbbá 186 | több 187 | úgy 188 | ugyanis 189 | új 190 | újabb 191 | újra 192 | után 193 | utána 194 | utolsó 195 | vagy 196 | vagyis 197 | valaki 198 | valami 199 | valamint 200 | való 201 | vagyok 202 | van 203 | vannak 204 | volt 205 | voltam 206 | voltak 207 | voltunk 208 | vissza 209 | vele 210 | viszont 211 | volna 212 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_hi.txt: -------------------------------------------------------------------------------- 1 | # Also see http://www.opensource.org/licenses/bsd-license.html 2 | # See http://members.unine.ch/jacques.savoy/clef/index.html. 3 | # This file was created by Jacques Savoy and is distributed under the BSD license. 
4 | # Note: by default this file also contains forms normalized by HindiNormalizer 5 | # for spelling variation (see section below), such that it can be used whether or 6 | # not you enable that feature. When adding additional entries to this list, 7 | # please add the normalized form as well. 8 | अंदर 9 | अत 10 | अपना 11 | अपनी 12 | अपने 13 | अभी 14 | आदि 15 | आप 16 | इत्यादि 17 | इन 18 | इनका 19 | इन्हीं 20 | इन्हें 21 | इन्हों 22 | इस 23 | इसका 24 | इसकी 25 | इसके 26 | इसमें 27 | इसी 28 | इसे 29 | उन 30 | उनका 31 | उनकी 32 | उनके 33 | उनको 34 | उन्हीं 35 | उन्हें 36 | उन्हों 37 | उस 38 | उसके 39 | उसी 40 | उसे 41 | एक 42 | एवं 43 | एस 44 | ऐसे 45 | और 46 | कई 47 | कर 48 | करता 49 | करते 50 | करना 51 | करने 52 | करें 53 | कहते 54 | कहा 55 | का 56 | काफ़ी 57 | कि 58 | कितना 59 | किन्हें 60 | किन्हों 61 | किया 62 | किर 63 | किस 64 | किसी 65 | किसे 66 | की 67 | कुछ 68 | कुल 69 | के 70 | को 71 | कोई 72 | कौन 73 | कौनसा 74 | गया 75 | घर 76 | जब 77 | जहाँ 78 | जा 79 | जितना 80 | जिन 81 | जिन्हें 82 | जिन्हों 83 | जिस 84 | जिसे 85 | जीधर 86 | जैसा 87 | जैसे 88 | जो 89 | तक 90 | तब 91 | तरह 92 | तिन 93 | तिन्हें 94 | तिन्हों 95 | तिस 96 | तिसे 97 | तो 98 | था 99 | थी 100 | थे 101 | दबारा 102 | दिया 103 | दुसरा 104 | दूसरे 105 | दो 106 | द्वारा 107 | न 108 | नहीं 109 | ना 110 | निहायत 111 | नीचे 112 | ने 113 | पर 114 | पर 115 | पहले 116 | पूरा 117 | पे 118 | फिर 119 | बनी 120 | बही 121 | बहुत 122 | बाद 123 | बाला 124 | बिलकुल 125 | भी 126 | भीतर 127 | मगर 128 | मानो 129 | मे 130 | में 131 | यदि 132 | यह 133 | यहाँ 134 | यही 135 | या 136 | यिह 137 | ये 138 | रखें 139 | रहा 140 | रहे 141 | ऱ्वासा 142 | लिए 143 | लिये 144 | लेकिन 145 | व 146 | वर्ग 147 | वह 148 | वह 149 | वहाँ 150 | वहीं 151 | वाले 152 | वुह 153 | वे 154 | वग़ैरह 155 | संग 156 | सकता 157 | सकते 158 | सबसे 159 | सभी 160 | साथ 161 | साबुत 162 | साभ 163 | सारा 164 | से 165 | सो 166 | ही 167 | हुआ 168 | हुई 169 | हुए 170 | है 171 | हैं 172 | हो 173 | होता 174 | होती 175 | होते 176 | होना 177 | होने 178 | # 
additional normalized forms of the above 179 | अपनि 180 | जेसे 181 | होति 182 | सभि 183 | तिंहों 184 | इंहों 185 | दवारा 186 | इसि 187 | किंहें 188 | थि 189 | उंहों 190 | ओर 191 | जिंहें 192 | वहिं 193 | अभि 194 | बनि 195 | हि 196 | उंहिं 197 | उंहें 198 | हें 199 | वगेरह 200 | एसे 201 | रवासा 202 | कोन 203 | निचे 204 | काफि 205 | उसि 206 | पुरा 207 | भितर 208 | हे 209 | बहि 210 | वहां 211 | कोइ 212 | यहां 213 | जिंहों 214 | तिंहें 215 | किसि 216 | कइ 217 | यहि 218 | इंहिं 219 | जिधर 220 | इंहें 221 | अदि 222 | इतयादि 223 | हुइ 224 | कोनसा 225 | इसकि 226 | दुसरे 227 | जहां 228 | अप 229 | किंहों 230 | उनकि 231 | भि 232 | वरग 233 | हुअ 234 | जेसा 235 | नहिं 236 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_hi.txt: -------------------------------------------------------------------------------- 1 | # Also see http://www.opensource.org/licenses/bsd-license.html 2 | # See http://members.unine.ch/jacques.savoy/clef/index.html. 3 | # This file was created by Jacques Savoy and is distributed under the BSD license. 4 | # Note: by default this file also contains forms normalized by HindiNormalizer 5 | # for spelling variation (see section below), such that it can be used whether or 6 | # not you enable that feature. When adding additional entries to this list, 7 | # please add the normalized form as well. 
8 | अंदर 9 | अत 10 | अपना 11 | अपनी 12 | अपने 13 | अभी 14 | आदि 15 | आप 16 | इत्यादि 17 | इन 18 | इनका 19 | इन्हीं 20 | इन्हें 21 | इन्हों 22 | इस 23 | इसका 24 | इसकी 25 | इसके 26 | इसमें 27 | इसी 28 | इसे 29 | उन 30 | उनका 31 | उनकी 32 | उनके 33 | उनको 34 | उन्हीं 35 | उन्हें 36 | उन्हों 37 | उस 38 | उसके 39 | उसी 40 | उसे 41 | एक 42 | एवं 43 | एस 44 | ऐसे 45 | और 46 | कई 47 | कर 48 | करता 49 | करते 50 | करना 51 | करने 52 | करें 53 | कहते 54 | कहा 55 | का 56 | काफ़ी 57 | कि 58 | कितना 59 | किन्हें 60 | किन्हों 61 | किया 62 | किर 63 | किस 64 | किसी 65 | किसे 66 | की 67 | कुछ 68 | कुल 69 | के 70 | को 71 | कोई 72 | कौन 73 | कौनसा 74 | गया 75 | घर 76 | जब 77 | जहाँ 78 | जा 79 | जितना 80 | जिन 81 | जिन्हें 82 | जिन्हों 83 | जिस 84 | जिसे 85 | जीधर 86 | जैसा 87 | जैसे 88 | जो 89 | तक 90 | तब 91 | तरह 92 | तिन 93 | तिन्हें 94 | तिन्हों 95 | तिस 96 | तिसे 97 | तो 98 | था 99 | थी 100 | थे 101 | दबारा 102 | दिया 103 | दुसरा 104 | दूसरे 105 | दो 106 | द्वारा 107 | न 108 | नहीं 109 | ना 110 | निहायत 111 | नीचे 112 | ने 113 | पर 114 | पर 115 | पहले 116 | पूरा 117 | पे 118 | फिर 119 | बनी 120 | बही 121 | बहुत 122 | बाद 123 | बाला 124 | बिलकुल 125 | भी 126 | भीतर 127 | मगर 128 | मानो 129 | मे 130 | में 131 | यदि 132 | यह 133 | यहाँ 134 | यही 135 | या 136 | यिह 137 | ये 138 | रखें 139 | रहा 140 | रहे 141 | ऱ्वासा 142 | लिए 143 | लिये 144 | लेकिन 145 | व 146 | वर्ग 147 | वह 148 | वह 149 | वहाँ 150 | वहीं 151 | वाले 152 | वुह 153 | वे 154 | वग़ैरह 155 | संग 156 | सकता 157 | सकते 158 | सबसे 159 | सभी 160 | साथ 161 | साबुत 162 | साभ 163 | सारा 164 | से 165 | सो 166 | ही 167 | हुआ 168 | हुई 169 | हुए 170 | है 171 | हैं 172 | हो 173 | होता 174 | होती 175 | होते 176 | होना 177 | होने 178 | # additional normalized forms of the above 179 | अपनि 180 | जेसे 181 | होति 182 | सभि 183 | तिंहों 184 | इंहों 185 | दवारा 186 | इसि 187 | किंहें 188 | थि 189 | उंहों 190 | ओर 191 | जिंहें 192 | वहिं 193 | अभि 194 | बनि 195 | हि 196 | उंहिं 197 | उंहें 198 | हें 199 | वगेरह 200 | एसे 201 | रवासा 202 | 
कोन 203 | निचे 204 | काफि 205 | उसि 206 | पुरा 207 | भितर 208 | हे 209 | बहि 210 | वहां 211 | कोइ 212 | यहां 213 | जिंहों 214 | तिंहें 215 | किसि 216 | कइ 217 | यहि 218 | इंहिं 219 | जिधर 220 | इंहें 221 | अदि 222 | इतयादि 223 | हुइ 224 | कोनसा 225 | इसकि 226 | दुसरे 227 | जहां 228 | अप 229 | किंहों 230 | उनकि 231 | भि 232 | वरग 233 | हुअ 234 | जेसा 235 | नहिं 236 | -------------------------------------------------------------------------------- /notebooks/elasticsearch/osc-blog/blog_settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "mappings": { 3 | "_source": { 4 | "enabled": true 5 | }, 6 | "properties": { 7 | "post_id": { 8 | "type": "long", 9 | "store": true 10 | }, 11 | "post_date": { 12 | "type": "date", 13 | "store": true 14 | }, 15 | "es_update_date": { 16 | "type": "date", 17 | "store": true 18 | }, 19 | "url": { 20 | "type": "text", 21 | "store": true 22 | }, 23 | "title": { 24 | "type": "text", 25 | "store": true, 26 | "analyzer": "content_analyzer", 27 | "fields": { 28 | "bigrams": { 29 | "type": "text", 30 | "analyzer": "content_bigrams" 31 | } 32 | } 33 | }, 34 | "author": { 35 | "type": "text", 36 | "store": true, 37 | "analyzer": "standard" 38 | }, 39 | "content": { 40 | "type": "text", 41 | "store": true, 42 | "analyzer": "content_analyzer", 43 | "fields": { 44 | "bigrams": { 45 | "type": "text", 46 | "analyzer": "content_bigrams" 47 | } 48 | } 49 | }, 50 | "excerpt": { 51 | "type": "text", 52 | "store": true, 53 | "analyzer": "content_analyzer" 54 | }, 55 | "categories": { 56 | "type": "text", 57 | "store": true, 58 | "analyzer": "content_analyzer" 59 | } 60 | } 61 | }, 62 | "settings": { 63 | "number_of_shards": 1, 64 | "number_of_replicas": 1, 65 | "analysis": { 66 | "filter": { 67 | "english_stemmer": { 68 | "type": "stemmer", 69 | "language": "english" 70 | }, 71 | "english_possessive_stemmer": { 72 | "type": "stemmer", 73 | "language": "possessive_english" 74 | }, 75 | "bigram": { 76 | 
"type": "shingle", 77 | "max_shingle_size": 2, 78 | "output_unigrams": false 79 | } 80 | }, 81 | "analyzer": { 82 | "content_analyzer": { 83 | "type": "custom", 84 | "char_filter": [ 85 | "html_strip" 86 | ], 87 | "filter": [ 88 | "english_possessive_stemmer", 89 | "lowercase", 90 | "english_stemmer" 91 | ], 92 | "tokenizer": "standard" 93 | }, 94 | "content_bigrams": { 95 | "type": "custom", 96 | "char_filter": [ 97 | "html_strip" 98 | ], 99 | "filter": [ 100 | "english_possessive_stemmer", 101 | "lowercase", 102 | "english_stemmer", 103 | "bigram" 104 | ], 105 | "tokenizer": "standard" 106 | } 107 | } 108 | } 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hello LTR :) 2 | 3 | The overall goal of this project is to demonstrate all of the steps required to work with LTR in Elasticsearch or Solr. There's two modes of running. Just running and editing notebooks in a docker container. Or local development (also requiring docker to run the search engine). 4 | 5 | ## No fuss setup: You just want to play with LTR 6 | 7 | Follow these steps if you're just playing around & are OK with possibly losing some work (all notebooks exist just in the docker container) 8 | 9 | With docker & docker-compose simply run 10 | 11 | ``` 12 | docker-compose up 13 | ``` 14 | 15 | at the root dir and go to town! 16 | 17 | This will run jupyter and all search engines in Docker containers. Check that each is up at the default ports: 18 | 19 | - Solr: [localhost:8983](localhost:8983) 20 | - Elasticsearch: [localhost:9200](localhost:9200) 21 | - Kibana: [localhost:5601](localhost:5601) 22 | - Jupyter: [localhost:8888](localhost:8888) 23 | 24 | ## You want to build your own LTR notebooks 25 | 26 | Follow these steps if you want to do more serious work with the notebooks. 
For example, if you want to build a demo with your work's data or something you want to preserve later. 27 | 28 | ### Run your search engine with Docker 29 | 30 | You probably just want to work with one search engine. So whichever one you're working with, launch that search engine in Docker. 31 | 32 | #### Running Solr w/ LTR 33 | 34 | Setup Solr with docker compose to work with just Solr examples: 35 | 36 | ``` 37 | cd notebooks/solr 38 | docker-compose up 39 | ``` 40 | 41 | #### Running Elasticsearch w/ LTR 42 | 43 | Setup Elasticsearch with docker compose to work with just Elasticsearch examples: 44 | 45 | ``` 46 | cd notebooks/elasticsearch 47 | docker-compose up 48 | ``` 49 | 50 | ### Run Jupyter locally w/ Python 3 and all prereqs 51 | 52 | #### Setup Python requirements 53 | 54 | - Ensure Python 3 is installed on your system 55 | - Create a virtual environment: `python3 -m venv venv` 56 | - Start the virtual environment: `source venv/bin/activate` 57 | - Check install tooling is up to date `python -m pip install -U pip wheel setuptools` 58 | - Install the requirements `pip install -r requirements.txt` 59 | 60 | __Note:__ The above commands should be run from the root folder of the project. 61 | 62 | #### Start Jupyter notebook and confirm operation 63 | 64 | - Run `jupyter notebook` 65 | - Browse to notebooks/{search\_engine}/{collection} 66 | - Open either the "hello-ltr (Solr)" or "hello-ltr (ES)" as appropriate and ensure you get a graph at the last cell 67 | 68 | ## Tests 69 | 70 | ### Automatically run everything... 71 | 72 | To run a full suite of tests, such as to verify a PR, you can simply run 73 | 74 | ./tests/test.sh 75 | 76 | Optionally with containers rebuilt 77 | 78 | ./tests/test.sh --rebuild-containers 79 | 80 | Failing tests will have their output in `tests/last_run.ipynb` 81 | 82 | ### While developing... 
83 | 84 | For more informal development: 85 | 86 | - Startup the Solr and ES Docker containers 87 | - Do your development 88 | - Run the command as needed: 89 | `python tests/run_most_nbs.py` 90 | - Tests fail if notebooks return any errors 91 | - The failing notebook will be stored at `tests/last_run.ipynb` 92 | -------------------------------------------------------------------------------- /notebooks/elasticsearch/tmdb/Dataframes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Dataframes\n", 8 | "\n", 9 | "Data frames are the central object of most data science workflows. This notebook shows some helper function that can assist you in creating them from judgements. The older non-dataframe way of passing data is in most of the example notebooks, so use this code anywhere you see that pattern.\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import ltr.judgments as judge" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "classic_training_set = [j for j in judge.judgments_from_file(open('data/classic-training.txt'))]" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "classic_df = judge.judgments_to_dataframe(classic_training_set)\n", 37 | "classic_df" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "### Plotting\n", 45 | "\n", 46 | "Is one of the main reasons dataframes are easier to work with. 
There are two helper functions to show the distribtion of grade (`plot_grades`) and relationship between features and grades (plot_features).\n", 47 | "\n", 48 | "You are encouraged to use whatever python plotting library you are most comformtable with, we have `matplotlib` and `plotnine` installed in the Docker image." 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "import ltr.p9_plots as plots\n", 58 | "plots.plot_grades(classic_df)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "classic_df_long = judge.judgments_dataframe_to_long(classic_df)\n", 68 | "classic_df_long" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "plots.plot_features(classic_df_long)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "plots.plot_features" 87 | ] 88 | } 89 | ], 90 | "metadata": { 91 | "kernelspec": { 92 | "display_name": "Python 3", 93 | "language": "python", 94 | "name": "python3" 95 | }, 96 | "language_info": { 97 | "codemirror_mode": { 98 | "name": "ipython", 99 | "version": 3 100 | }, 101 | "file_extension": ".py", 102 | "mimetype": "text/x-python", 103 | "name": "python", 104 | "nbconvert_exporter": "python", 105 | "pygments_lexer": "ipython3", 106 | "version": "3.8.2" 107 | } 108 | }, 109 | "nbformat": 4, 110 | "nbformat_minor": 2 111 | } 112 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_fi.txt: -------------------------------------------------------------------------------- 1 | | From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt 2 | | This file is distributed under the BSD License. 
3 | | See http://snowball.tartarus.org/license.php 4 | | Also see http://www.opensource.org/licenses/bsd-license.html 5 | | - Encoding was converted to UTF-8. 6 | | - This notice was added. 7 | | 8 | | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" 9 | 10 | | forms of BE 11 | 12 | olla 13 | olen 14 | olet 15 | on 16 | olemme 17 | olette 18 | ovat 19 | ole | negative form 20 | 21 | oli 22 | olisi 23 | olisit 24 | olisin 25 | olisimme 26 | olisitte 27 | olisivat 28 | olit 29 | olin 30 | olimme 31 | olitte 32 | olivat 33 | ollut 34 | olleet 35 | 36 | en | negation 37 | et 38 | ei 39 | emme 40 | ette 41 | eivät 42 | 43 | |Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans 44 | minä minun minut minua minussa minusta minuun minulla minulta minulle | I 45 | sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you 46 | hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she 47 | me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we 48 | te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you 49 | he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they 50 | 51 | tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this 52 | tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that 53 | se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it 54 | nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these 55 | nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those 56 | ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they 57 | 58 | kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who 59 | ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl) 60 | mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what 61 | mitkä | (pl) 62 | 63 | joka jonka 
jota jossa josta johon jolla jolta jolle jona joksi | who which 64 | jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl) 65 | 66 | | conjunctions 67 | 68 | että | that 69 | ja | and 70 | jos | if 71 | koska | because 72 | kuin | than 73 | mutta | but 74 | niin | so 75 | sekä | and 76 | sillä | for 77 | tai | or 78 | vaan | but 79 | vai | or 80 | vaikka | although 81 | 82 | 83 | | prepositions 84 | 85 | kanssa | with 86 | mukaan | according to 87 | noin | about 88 | poikki | across 89 | yli | over, across 90 | 91 | | other 92 | 93 | kun | when 94 | niin | so 95 | nyt | now 96 | itse | self 97 | 98 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_fi.txt: -------------------------------------------------------------------------------- 1 | | From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt 2 | | This file is distributed under the BSD License. 3 | | See http://snowball.tartarus.org/license.php 4 | | Also see http://www.opensource.org/licenses/bsd-license.html 5 | | - Encoding was converted to UTF-8. 6 | | - This notice was added. 
7 | | 8 | | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" 9 | 10 | | forms of BE 11 | 12 | olla 13 | olen 14 | olet 15 | on 16 | olemme 17 | olette 18 | ovat 19 | ole | negative form 20 | 21 | oli 22 | olisi 23 | olisit 24 | olisin 25 | olisimme 26 | olisitte 27 | olisivat 28 | olit 29 | olin 30 | olimme 31 | olitte 32 | olivat 33 | ollut 34 | olleet 35 | 36 | en | negation 37 | et 38 | ei 39 | emme 40 | ette 41 | eivät 42 | 43 | |Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans 44 | minä minun minut minua minussa minusta minuun minulla minulta minulle | I 45 | sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you 46 | hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she 47 | me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we 48 | te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you 49 | he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they 50 | 51 | tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this 52 | tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that 53 | se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it 54 | nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these 55 | nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those 56 | ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they 57 | 58 | kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who 59 | ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl) 60 | mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what 61 | mitkä | (pl) 62 | 63 | joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which 64 | jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl) 65 | 66 | | conjunctions 67 | 68 | että | 
from collections import Counter, defaultdict


class Model:
    """Learned parameters of the SDBN click model.

    Both distributions are keyed by (query, doc_id) tuples and fall back
    to a 0.1 prior for pairs that were never observed.
    """

    def __init__(self):
        # P(user satisfied | clicked) per query-doc pair
        self.satisfacts = defaultdict(lambda: 0.1)

        # P(user attracted enough to click | examined) per query-doc pair
        self.attracts = defaultdict(lambda: 0.1)


def reverse_enumerate(l):
    """Yield (index, item) pairs walking *l* from last to first."""
    return zip(range(len(l) - 1, -1, -1), reversed(l))


def sdbn(sessions):
    """Simplified Dynamic Bayesian Network click model.

    A simpler version of the much more complex Dynamic Bayesian Network
    that the authors say comes close to the accuracy of DBN. Most
    importantly, it can be solved directly and simply, without an EM
    learning process.

    Features of sdbn:
    - Attractiveness is any click, out of sessions where that document
      appears at or before the last click of the session
    - Satisfaction occurs when a doc is the last document clicked,
      out of all sessions where that document is clicked

    :param sessions: iterable of session objects, each with a ``query``
        attribute and a ``docs`` list whose items expose ``doc_id`` and
        ``click``
    :return: a :class:`Model` with per-(query, doc_id) estimates
    """
    model = Model()
    NO_CLICK = -1
    counts = Counter()       # sessions where doc appeared at/before last click
    clicks = Counter()       # clicks on the doc within those sessions
    last_clicks = Counter()  # sessions where the doc was the final click
    for session in sessions:
        last_click = NO_CLICK
        for rank, doc in reverse_enumerate(session.docs):
            # Scanning bottom-up, remember the last clicked rank; docs
            # below it never enter the counts (assumed not examined).
            if last_click == NO_CLICK and doc.click:
                last_click = rank

            if last_click != NO_CLICK:
                query_doc = (session.query, doc.doc_id)
                counts[query_doc] += 1

                if doc.click:
                    clicks[query_doc] += 1
                    if rank == last_click:
                        last_clicks[query_doc] += 1

    # For all meaningful sessions (where query_doc appears):
    #   attractiveness = clicks / num sessions examined
    #   satisfaction   = last clicks / sessions with clicks
    for query_doc, count in counts.items():
        model.attracts[query_doc] = clicks[query_doc] / count
        if query_doc in clicks:
            model.satisfacts[query_doc] = last_clicks[query_doc] / clicks[query_doc]
    return model


if __name__ == "__main__":
    # Demo-only dependency: imported here so this module can be imported
    # on its own without pulling in the rest of the ltr package.
    from ltr.clickmodels.session import build

    sessions = build([
        ('A', ((1, True), (2, False), (3, True), (0, False))),
        ('B', ((5, False), (2, True), (3, True), (0, False))),
        ('A', ((1, False), (2, False), (3, True), (0, False))),
        ('B', ((1, False), (2, False), (3, False), (9, True))),
        ('A', ((9, False), (2, False), (1, True), (0, True))),
        ('B', ((6, True), (2, False), (3, True), (1, False))),
        ('A', ((7, False), (4, True), (1, False), (3, False))),
        ('B', ((8, True), (2, False), (3, True), (1, False))),
        ('A', ((1, False), (4, True), (2, False), (3, False))),
        ('B', ((7, True), (4, False), (5, True), (1, True))),
    ])
    model = sdbn(sessions)
    print(model.attracts[('A', 1)])
    print(model.satisfacts[('A', 1)])
    print(model.attracts[('B', 1)])
    print(model.satisfacts[('B', 1)])
from .judgments import Judgment, judgments_to_file
from tqdm import tqdm

# qids for the two genres this synthetic judgment set covers;
# anything else maps to qid 0 ("not tracked")
_GENRE_QIDS = {"Science Fiction": 1, "Drama": 2}


def genreQid(genre):
    """Map a genre name to its query id (0 means the genre is not tracked)."""
    return _GENRE_QIDS.get(genre, 0)


def genreGrade(movie):
    """Grade a movie 0-4 for a simple training set, as if we were
    searching for its genre.

    Newer science fiction is considered better; older drama is
    considered better. Movies without a release year, or whose primary
    genre is neither of the two, get grade 0.
    """
    if 'release_year' in movie and movie['release_year'] is not None:
        releaseYear = int(movie['release_year'])
    else:
        return 0

    genre = movie['genres'][0]
    if genre == "Science Fiction":
        # Newer is better: walk the cutoffs from best grade down
        for grade, cutoff in ((4, 2015), (3, 2010), (2, 2000), (1, 1990)):
            if releaseYear > cutoff:
                return grade
        return 0

    if genre == "Drama":
        # Older is better: recent releases grade lowest
        for grade, cutoff in ((0, 1990), (1, 1970), (2, 1950), (3, 1930)):
            if releaseYear > cutoff:
                return grade
        return 4

    return 0


def synthesize(client, judgmentsOutFile='genre_by_date_judgments.txt', autoNegate=False):
    """Generate synthetic judgments for scifi & drama movies.

    Queries the ``tmdb`` index through *client*, grades every movie via
    :func:`genreGrade`, writes the judgments to *judgmentsOutFile*, and
    returns them as a list.

    :param client: search client; ``client.name()`` selects the query DSL
    :param judgmentsOutFile: path the judgment list is written to
    :param autoNegate: when True, also emit a grade-0 judgment for the
        opposite genre's qid for each graded movie
    """
    print('Generating judgments for scifi & drama movies')

    # Each engine speaks its own match-all query dialect
    if client.name() == 'elastic':
        params = {
            "query": {
                "match_all": {}
            },
            "size": 10000,
            "sort": [{"_id": "asc"}]
        }
    else:
        params = {
            "q": "*:*",
            "rows": 10000,
            "sort": "id ASC",
            "wt": 'json'
        }

    resp = client.query('tmdb', params)

    # Build judgments for each film
    judgments = []
    for movie in tqdm(resp):
        if 'genres' not in movie or len(movie['genres']) == 0:
            continue
        genre = movie['genres'][0]
        qid = genreQid(genre)
        if qid == 0:
            continue

        judgments.append(Judgment(qid=qid,
                                  grade=genreGrade(movie),
                                  docId=movie['id'],
                                  keywords=genre))

        if autoNegate:
            # This movie is good for its genre, but a bad result for the
            # opposite genre (genre is one of the two tracked ones here)
            negGenre = "Drama" if genre == "Science Fiction" else "Science Fiction"
            judgments.append(Judgment(qid=genreQid(negGenre),
                                      grade=0,
                                      docId=movie['id'],
                                      keywords=negGenre))

    with open(judgmentsOutFile, 'w') as f:
        judgments_to_file(f, judgmentsList=judgments)

    return judgments
| "keywords": keywords, 52 | "fuzzy_keywords": ' '.join([x + '~' for x in keywords.split(' ')]), 53 | "keywordsList": [keywords] # Needed by TSQ for the time being 54 | } 55 | 56 | res = self.client.log_query(self.index, self.feature_set, ids, params) 57 | 58 | # Add feature back to each judgment 59 | for doc in res: 60 | docId = str(doc['id']) 61 | features = doc['ltr_features'] 62 | featuresPerDoc[docId] = features 63 | numLeft -= BATCH_SIZE 64 | 65 | # Append features from search engine back to ranklib judgment list 66 | for judgment in judgments: 67 | try: 68 | features = featuresPerDoc[judgment.docId] # If KeyError, then we have a judgment but no movie in index 69 | judgment.features = features 70 | except KeyError: 71 | pass 72 | # print("Missing doc %s" % judgment.docId) 73 | 74 | # Return a paired down judgments if we are missing features for judgments 75 | training_set = [] 76 | discarded = [] 77 | for judgment in judgments: 78 | if self.drop_missing: 79 | if judgment.has_features(): 80 | training_set.append(judgment) 81 | else: 82 | discarded.append(judgment) 83 | else: 84 | training_set.append(judgment) 85 | # print("Discarded %s Keep %s" % (len(discarded), len(training_set))) 86 | self.logged.extend(training_set) 87 | return training_set, discarded 88 | -------------------------------------------------------------------------------- /notebooks/elasticsearch/.docker/es-docker/elasticsearch.yml: -------------------------------------------------------------------------------- 1 | # ======================== Elasticsearch Configuration ========================= 2 | # 3 | # NOTE: Elasticsearch comes with reasonable defaults for most settings. 4 | # Before you set out to tweak and tune the configuration, make sure you 5 | # understand what are you trying to accomplish and the consequences. 6 | # 7 | # The primary way of configuring a node is via this file. This template lists 8 | # the most important settings you may want to configure for a production cluster. 
9 | # 10 | # Please consult the documentation for further information on configuration options: 11 | # https://www.elastic.co/guide/en/elasticsearch/reference/index.html 12 | # 13 | # ---------------------------------- Cluster ----------------------------------- 14 | # 15 | # Use a descriptive name for your cluster: 16 | # 17 | #cluster.name: my-application 18 | # 19 | # ------------------------------------ Node ------------------------------------ 20 | # 21 | # Use a descriptive name for the node: 22 | # 23 | #node.name: node-1 24 | # 25 | # Add custom attributes to the node: 26 | # 27 | #node.attr.rack: r1 28 | # 29 | # ----------------------------------- Paths ------------------------------------ 30 | # 31 | # Path to directory where to store the data (separate multiple locations by comma): 32 | # 33 | #path.data: /path/to/data 34 | # 35 | # Path to log files: 36 | # 37 | #path.logs: /path/to/logs 38 | # 39 | # ----------------------------------- Memory ----------------------------------- 40 | # 41 | # Lock the memory on startup: 42 | # 43 | #bootstrap.memory_lock: true 44 | # 45 | # Make sure that the heap size is set to about half the memory available 46 | # on the system and that the owner of the process is allowed to use this 47 | # limit. 48 | # 49 | # Elasticsearch performs poorly when the system is swapping the memory. 50 | # 51 | # ---------------------------------- Network ----------------------------------- 52 | # 53 | # Set the bind address to a specific IP (IPv4 or IPv6): 54 | # 55 | #network.host: 192.168.0.1 56 | # 57 | # Set a custom port for HTTP: 58 | # 59 | #http.port: 9200 60 | # 61 | # For more information, consult the network module documentation. 
62 | # 63 | # --------------------------------- Discovery ---------------------------------- 64 | # 65 | # Pass an initial list of hosts to perform discovery when new node is started: 66 | # The default list of hosts is ["127.0.0.1", "[::1]"] 67 | # 68 | #discovery.zen.ping.unicast.hosts: ["host1", "host2"] 69 | # 70 | # Prevent the "split brain" by configuring the majority of nodes (total number of master-eligible nodes / 2 + 1): 71 | # 72 | #discovery.zen.minimum_master_nodes: 73 | # 74 | # For more information, consult the zen discovery module documentation. 75 | # 76 | # ---------------------------------- Gateway ----------------------------------- 77 | # 78 | # Block initial recovery after a full cluster restart until N nodes are started: 79 | # 80 | #gateway.recover_after_nodes: 3 81 | # 82 | # For more information, consult the gateway module documentation. 83 | # 84 | # ---------------------------------- Various ----------------------------------- 85 | # 86 | # Require explicit names when deleting indices: 87 | # 88 | #action.destructive_requires_name: true 89 | #http.cors.allow-origin: "/https?:\\/\\/(.*?\\.)?(quepid\\.com|splainer\\.io)/" 90 | http.cors.allow-origin: "/http?:.*/" 91 | #http.cors.allow-origin: /http?://localhost(:[0-9]+)?/ 92 | http.cors.enabled: true 93 | indices.query.bool.max_clause_count: 10240 94 | network.host: 0.0.0.0 95 | 96 | discovery.type: single-node 97 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_fa.txt: -------------------------------------------------------------------------------- 1 | # This file was created by Jacques Savoy and is distributed under the BSD license. 2 | # See http://members.unine.ch/jacques.savoy/clef/index.html. 
3 | # Also see http://www.opensource.org/licenses/bsd-license.html 4 | # Note: by default this file is used after normalization, so when adding entries 5 | # to this file, use the arabic 'ي' instead of 'ی' 6 | انان 7 | نداشته 8 | سراسر 9 | خياه 10 | ايشان 11 | وي 12 | تاكنون 13 | بيشتري 14 | دوم 15 | پس 16 | ناشي 17 | وگو 18 | يا 19 | داشتند 20 | سپس 21 | هنگام 22 | هرگز 23 | پنج 24 | نشان 25 | امسال 26 | ديگر 27 | گروهي 28 | شدند 29 | چطور 30 | ده 31 | و 32 | دو 33 | نخستين 34 | ولي 35 | چرا 36 | چه 37 | وسط 38 | ه 39 | كدام 40 | قابل 41 | يك 42 | رفت 43 | هفت 44 | همچنين 45 | در 46 | هزار 47 | بله 48 | بلي 49 | شايد 50 | اما 51 | شناسي 52 | گرفته 53 | دهد 54 | داشته 55 | دانست 56 | داشتن 57 | خواهيم 58 | ميليارد 59 | وقتيكه 60 | امد 61 | خواهد 62 | جز 63 | اورده 64 | شده 65 | بلكه 66 | خدمات 67 | شدن 68 | برخي 69 | نبود 70 | بسياري 71 | جلوگيري 72 | حق 73 | كردند 74 | نوعي 75 | بعري 76 | نكرده 77 | نظير 78 | نبايد 79 | بوده 80 | بودن 81 | داد 82 | اورد 83 | هست 84 | جايي 85 | شود 86 | دنبال 87 | داده 88 | بايد 89 | سابق 90 | هيچ 91 | همان 92 | انجا 93 | كمتر 94 | كجاست 95 | گردد 96 | كسي 97 | تر 98 | مردم 99 | تان 100 | دادن 101 | بودند 102 | سري 103 | جدا 104 | ندارند 105 | مگر 106 | يكديگر 107 | دارد 108 | دهند 109 | بنابراين 110 | هنگامي 111 | سمت 112 | جا 113 | انچه 114 | خود 115 | دادند 116 | زياد 117 | دارند 118 | اثر 119 | بدون 120 | بهترين 121 | بيشتر 122 | البته 123 | به 124 | براساس 125 | بيرون 126 | كرد 127 | بعضي 128 | گرفت 129 | توي 130 | اي 131 | ميليون 132 | او 133 | جريان 134 | تول 135 | بر 136 | مانند 137 | برابر 138 | باشيم 139 | مدتي 140 | گويند 141 | اكنون 142 | تا 143 | تنها 144 | جديد 145 | چند 146 | بي 147 | نشده 148 | كردن 149 | كردم 150 | گويد 151 | كرده 152 | كنيم 153 | نمي 154 | نزد 155 | روي 156 | قصد 157 | فقط 158 | بالاي 159 | ديگران 160 | اين 161 | ديروز 162 | توسط 163 | سوم 164 | ايم 165 | دانند 166 | سوي 167 | استفاده 168 | شما 169 | كنار 170 | داريم 171 | ساخته 172 | طور 173 | امده 174 | رفته 175 | نخست 176 | بيست 177 | نزديك 178 
| طي 179 | كنيد 180 | از 181 | انها 182 | تمامي 183 | داشت 184 | يكي 185 | طريق 186 | اش 187 | چيست 188 | روب 189 | نمايد 190 | گفت 191 | چندين 192 | چيزي 193 | تواند 194 | ام 195 | ايا 196 | با 197 | ان 198 | ايد 199 | ترين 200 | اينكه 201 | ديگري 202 | راه 203 | هايي 204 | بروز 205 | همچنان 206 | پاعين 207 | كس 208 | حدود 209 | مختلف 210 | مقابل 211 | چيز 212 | گيرد 213 | ندارد 214 | ضد 215 | همچون 216 | سازي 217 | شان 218 | مورد 219 | باره 220 | مرسي 221 | خويش 222 | برخوردار 223 | چون 224 | خارج 225 | شش 226 | هنوز 227 | تحت 228 | ضمن 229 | هستيم 230 | گفته 231 | فكر 232 | بسيار 233 | پيش 234 | براي 235 | روزهاي 236 | انكه 237 | نخواهد 238 | بالا 239 | كل 240 | وقتي 241 | كي 242 | چنين 243 | كه 244 | گيري 245 | نيست 246 | است 247 | كجا 248 | كند 249 | نيز 250 | يابد 251 | بندي 252 | حتي 253 | توانند 254 | عقب 255 | خواست 256 | كنند 257 | بين 258 | تمام 259 | همه 260 | ما 261 | باشند 262 | مثل 263 | شد 264 | اري 265 | باشد 266 | اره 267 | طبق 268 | بعد 269 | اگر 270 | صورت 271 | غير 272 | جاي 273 | بيش 274 | ريزي 275 | اند 276 | زيرا 277 | چگونه 278 | بار 279 | لطفا 280 | مي 281 | درباره 282 | من 283 | ديده 284 | همين 285 | گذاري 286 | برداري 287 | علت 288 | گذاشته 289 | هم 290 | فوق 291 | نه 292 | ها 293 | شوند 294 | اباد 295 | همواره 296 | هر 297 | اول 298 | خواهند 299 | چهار 300 | نام 301 | امروز 302 | مان 303 | هاي 304 | قبل 305 | كنم 306 | سعي 307 | تازه 308 | را 309 | هستند 310 | زير 311 | جلوي 312 | عنوان 313 | بود 314 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_fa.txt: -------------------------------------------------------------------------------- 1 | # This file was created by Jacques Savoy and is distributed under the BSD license. 2 | # See http://members.unine.ch/jacques.savoy/clef/index.html. 
3 | # Also see http://www.opensource.org/licenses/bsd-license.html 4 | # Note: by default this file is used after normalization, so when adding entries 5 | # to this file, use the arabic 'ي' instead of 'ی' 6 | انان 7 | نداشته 8 | سراسر 9 | خياه 10 | ايشان 11 | وي 12 | تاكنون 13 | بيشتري 14 | دوم 15 | پس 16 | ناشي 17 | وگو 18 | يا 19 | داشتند 20 | سپس 21 | هنگام 22 | هرگز 23 | پنج 24 | نشان 25 | امسال 26 | ديگر 27 | گروهي 28 | شدند 29 | چطور 30 | ده 31 | و 32 | دو 33 | نخستين 34 | ولي 35 | چرا 36 | چه 37 | وسط 38 | ه 39 | كدام 40 | قابل 41 | يك 42 | رفت 43 | هفت 44 | همچنين 45 | در 46 | هزار 47 | بله 48 | بلي 49 | شايد 50 | اما 51 | شناسي 52 | گرفته 53 | دهد 54 | داشته 55 | دانست 56 | داشتن 57 | خواهيم 58 | ميليارد 59 | وقتيكه 60 | امد 61 | خواهد 62 | جز 63 | اورده 64 | شده 65 | بلكه 66 | خدمات 67 | شدن 68 | برخي 69 | نبود 70 | بسياري 71 | جلوگيري 72 | حق 73 | كردند 74 | نوعي 75 | بعري 76 | نكرده 77 | نظير 78 | نبايد 79 | بوده 80 | بودن 81 | داد 82 | اورد 83 | هست 84 | جايي 85 | شود 86 | دنبال 87 | داده 88 | بايد 89 | سابق 90 | هيچ 91 | همان 92 | انجا 93 | كمتر 94 | كجاست 95 | گردد 96 | كسي 97 | تر 98 | مردم 99 | تان 100 | دادن 101 | بودند 102 | سري 103 | جدا 104 | ندارند 105 | مگر 106 | يكديگر 107 | دارد 108 | دهند 109 | بنابراين 110 | هنگامي 111 | سمت 112 | جا 113 | انچه 114 | خود 115 | دادند 116 | زياد 117 | دارند 118 | اثر 119 | بدون 120 | بهترين 121 | بيشتر 122 | البته 123 | به 124 | براساس 125 | بيرون 126 | كرد 127 | بعضي 128 | گرفت 129 | توي 130 | اي 131 | ميليون 132 | او 133 | جريان 134 | تول 135 | بر 136 | مانند 137 | برابر 138 | باشيم 139 | مدتي 140 | گويند 141 | اكنون 142 | تا 143 | تنها 144 | جديد 145 | چند 146 | بي 147 | نشده 148 | كردن 149 | كردم 150 | گويد 151 | كرده 152 | كنيم 153 | نمي 154 | نزد 155 | روي 156 | قصد 157 | فقط 158 | بالاي 159 | ديگران 160 | اين 161 | ديروز 162 | توسط 163 | سوم 164 | ايم 165 | دانند 166 | سوي 167 | استفاده 168 | شما 169 | كنار 170 | داريم 171 | ساخته 172 | طور 173 | امده 174 | رفته 175 | نخست 176 | بيست 177 | نزديك 178 
| طي 179 | كنيد 180 | از 181 | انها 182 | تمامي 183 | داشت 184 | يكي 185 | طريق 186 | اش 187 | چيست 188 | روب 189 | نمايد 190 | گفت 191 | چندين 192 | چيزي 193 | تواند 194 | ام 195 | ايا 196 | با 197 | ان 198 | ايد 199 | ترين 200 | اينكه 201 | ديگري 202 | راه 203 | هايي 204 | بروز 205 | همچنان 206 | پاعين 207 | كس 208 | حدود 209 | مختلف 210 | مقابل 211 | چيز 212 | گيرد 213 | ندارد 214 | ضد 215 | همچون 216 | سازي 217 | شان 218 | مورد 219 | باره 220 | مرسي 221 | خويش 222 | برخوردار 223 | چون 224 | خارج 225 | شش 226 | هنوز 227 | تحت 228 | ضمن 229 | هستيم 230 | گفته 231 | فكر 232 | بسيار 233 | پيش 234 | براي 235 | روزهاي 236 | انكه 237 | نخواهد 238 | بالا 239 | كل 240 | وقتي 241 | كي 242 | چنين 243 | كه 244 | گيري 245 | نيست 246 | است 247 | كجا 248 | كند 249 | نيز 250 | يابد 251 | بندي 252 | حتي 253 | توانند 254 | عقب 255 | خواست 256 | كنند 257 | بين 258 | تمام 259 | همه 260 | ما 261 | باشند 262 | مثل 263 | شد 264 | اري 265 | باشد 266 | اره 267 | طبق 268 | بعد 269 | اگر 270 | صورت 271 | غير 272 | جاي 273 | بيش 274 | ريزي 275 | اند 276 | زيرا 277 | چگونه 278 | بار 279 | لطفا 280 | مي 281 | درباره 282 | من 283 | ديده 284 | همين 285 | گذاري 286 | برداري 287 | علت 288 | گذاشته 289 | هم 290 | فوق 291 | نه 292 | ها 293 | شوند 294 | اباد 295 | همواره 296 | هر 297 | اول 298 | خواهند 299 | چهار 300 | نام 301 | امروز 302 | مان 303 | هاي 304 | قبل 305 | كنم 306 | سعي 307 | تازه 308 | را 309 | هستند 310 | زير 311 | جلوي 312 | عنوان 313 | بود 314 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/evaluation (Solr).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# evaluate (Solr Edition)\n", 8 | "\n", 9 | "**Note:** This lab requires hello-ltr be run first. You must have the TMDB data indexed and LTR models configured before proceeding." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "### RRE\n", 17 | "This lab makes use of the rated-ranking-evaluator [project](https://github.com/SeaseLtd/rated-ranking-evaluator) to carry out evaluations on our models from the hello-ltr lab.\n", 18 | "\n", 19 | "An RRE configuration requires the following:\n", 20 | "\n", 21 | "- configuration_sets\n", 22 | " - This tells RRE about the Solr/Elastic instance to use for each evaluation\n", 23 | "- corpora (Not required for this setup)\n", 24 | " - RRE supports indexing a snapshot of data for evaluations. For this lab we'll be using the data indexed previously.\n", 25 | "- ratings\n", 26 | " - This folder houses json files with queries and ratings to be evaluated\n", 27 | "- templates\n", 28 | " - The queries to be run by each configuration set\n", 29 | "- pom.xml\n", 30 | " - Maven project configuration, here you can configure what metrics are calculated by the evalauation and format of the report.\n", 31 | " \n", 32 | "Take a look at the rre folder in the hello-ltr to get a better idea of the project layout and structure." 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "### Ratings and Evaluation\n", 40 | "To get started with RRE we first need some ratings. For this example we're going to use a query for \"batman\" and we're going to say that newer films are better than older ones. 
We will setup 3 different configuration sets in RRE:\n", 41 | "\n", 42 | "- baseline (No LTR applied)\n", 43 | "- classic (Rescore with the `classic` LTR model)\n", 44 | "- latest (Rescore with the `latest` LTR model)\n", 45 | "\n", 46 | "The snippet below will kick off an evaluation in RRE" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "from ltr import evaluate\n", 56 | "evaluate('solr')" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "### Looking at the Results\n", 64 | "In this example we have rating data for every result in the Batman query and we're not adjusting matching so `Precision` and `Recall` are the expected value of 1. However, since we've altered the sorting of results with LTR we can see a lift in `ERR` as our higher rated documents are coming up closer to the top of the results." 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "from ltr import rre_table\n", 74 | "rre_table()" 75 | ] 76 | } 77 | ], 78 | "metadata": { 79 | "kernelspec": { 80 | "display_name": "Python 3", 81 | "language": "python", 82 | "name": "python3" 83 | }, 84 | "language_info": { 85 | "codemirror_mode": { 86 | "name": "ipython", 87 | "version": 3 88 | }, 89 | "file_extension": ".py", 90 | "mimetype": "text/x-python", 91 | "name": "python", 92 | "nbconvert_exporter": "python", 93 | "pygments_lexer": "ipython3", 94 | "version": "3.7.6" 95 | } 96 | }, 97 | "nbformat": 4, 98 | "nbformat_minor": 2 99 | } 100 | -------------------------------------------------------------------------------- /notebooks/elasticsearch/tmdb/evaluation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# evaluate (Elastic Edition)\n", 8 | 
"\n", 9 | "**Note:** This lab requires hello-ltr be run first. You must have the TMDB data indexed and LTR models configured before proceeding." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "### RRE\n", 17 | "This lab makes use of the rated-ranking-evaluator [project](https://github.com/SeaseLtd/rated-ranking-evaluator) to carry out evaluations on our models from the hello-ltr lab.\n", 18 | "\n", 19 | "An RRE configuration requires the following:\n", 20 | "\n", 21 | "- configuration_sets\n", 22 | " - This tells RRE about the Solr/Elastic instance to use for each evaluation\n", 23 | "- corpora (Not required for this setup)\n", 24 | " - RRE supports indexing a snapshot of data for evaluations. For this lab we'll be using the data indexed previously.\n", 25 | "- ratings\n", 26 | " - This folder houses json files with queries and ratings to be evaluated\n", 27 | "- templates\n", 28 | " - The queries to be run by each configuration set\n", 29 | "- pom.xml\n", 30 | " - Maven project configuration, here you can configure what metrics are calculated by the evalauation and format of the report.\n", 31 | " \n", 32 | "Take a look at the rre folder in the hello-ltr to get a better idea of the project layout and structure." 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "### Ratings and Evaluation\n", 40 | "To get started with RRE we first need some ratings. For this example we're going to use a query for \"batman\" and we're going to say that newer films are better than older ones. 
We will setup 3 different configuration sets in RRE:\n", 41 | "\n", 42 | "- baseline (No LTR applied)\n", 43 | "- classic (Rescore with the `classic` LTR model)\n", 44 | "- latest (Rescore with the `latest` LTR model)\n", 45 | "\n", 46 | "The snippet below will kick off an evaluation in RRE" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "from ltr import evaluate\n", 56 | "evaluate('elastic')" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "### Looking at the Results\n", 64 | "In this example we have rating data for every result in the Batman query and we're not adjusting matching so `Precision` and `Recall` are the expected value of 1. However, since we've altered the sorting of results with LTR we can see a lift in `ERR` as our higher rated documents are coming up closer to the top of the results." 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "from ltr import rre_table\n", 74 | "rre_table()" 75 | ] 76 | } 77 | ], 78 | "metadata": { 79 | "kernelspec": { 80 | "display_name": "Python 3", 81 | "language": "python", 82 | "name": "python3" 83 | }, 84 | "language_info": { 85 | "codemirror_mode": { 86 | "name": "ipython", 87 | "version": 3 88 | }, 89 | "file_extension": ".py", 90 | "mimetype": "text/x-python", 91 | "name": "python", 92 | "nbconvert_exporter": "python", 93 | "pygments_lexer": "ipython3", 94 | "version": "3.7.6" 95 | } 96 | }, 97 | "nbformat": 4, 98 | "nbformat_minor": 2 99 | } 100 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_da.txt: -------------------------------------------------------------------------------- 1 | | From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt 2 | | This file is distributed under the BSD 
License. 3 | | See http://snowball.tartarus.org/license.php 4 | | Also see http://www.opensource.org/licenses/bsd-license.html 5 | | - Encoding was converted to UTF-8. 6 | | - This notice was added. 7 | | 8 | | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" 9 | 10 | | A Danish stop word list. Comments begin with vertical bar. Each stop 11 | | word is at the start of a line. 12 | 13 | | This is a ranked list (commonest to rarest) of stopwords derived from 14 | | a large text sample. 15 | 16 | 17 | og | and 18 | i | in 19 | jeg | I 20 | det | that (dem. pronoun)/it (pers. pronoun) 21 | at | that (in front of a sentence)/to (with infinitive) 22 | en | a/an 23 | den | it (pers. pronoun)/that (dem. pronoun) 24 | til | to/at/for/until/against/by/of/into, more 25 | er | present tense of "to be" 26 | som | who, as 27 | på | on/upon/in/on/at/to/after/of/with/for, on 28 | de | they 29 | med | with/by/in, along 30 | han | he 31 | af | of/by/from/off/for/in/with/on, off 32 | for | at/for/to/from/by/of/ago, in front/before, because 33 | ikke | not 34 | der | who/which, there/those 35 | var | past tense of "to be" 36 | mig | me/myself 37 | sig | oneself/himself/herself/itself/themselves 38 | men | but 39 | et | a/an/one, one (number), someone/somebody/one 40 | har | present tense of "to have" 41 | om | round/about/for/in/a, about/around/down, if 42 | vi | we 43 | min | my 44 | havde | past tense of "to have" 45 | ham | him 46 | hun | she 47 | nu | now 48 | over | over/above/across/by/beyond/past/on/about, over/past 49 | da | then, when/as/since 50 | fra | from/off/since, off, since 51 | du | you 52 | ud | out 53 | sin | his/her/its/one's 54 | dem | them 55 | os | us/ourselves 56 | op | up 57 | man | you/one 58 | hans | his 59 | hvor | where 60 | eller | or 61 | hvad | what 62 | skal | must/shall etc. 63 | selv | myself/youself/herself/ourselves etc., even 64 | her | here 65 | alle | all/everyone/everybody etc. 
66 | vil | will (verb) 67 | blev | past tense of "to stay/to remain/to get/to become" 68 | kunne | could 69 | ind | in 70 | når | when 71 | være | present tense of "to be" 72 | dog | however/yet/after all 73 | noget | something 74 | ville | would 75 | jo | you know/you see (adv), yes 76 | deres | their/theirs 77 | efter | after/behind/according to/for/by/from, later/afterwards 78 | ned | down 79 | skulle | should 80 | denne | this 81 | end | than 82 | dette | this 83 | mit | my/mine 84 | også | also 85 | under | under/beneath/below/during, below/underneath 86 | have | have 87 | dig | you 88 | anden | other 89 | hende | her 90 | mine | my 91 | alt | everything 92 | meget | much/very, plenty of 93 | sit | his, her, its, one's 94 | sine | his, her, its, one's 95 | vor | our 96 | mod | against 97 | disse | these 98 | hvis | if 99 | din | your/yours 100 | nogle | some 101 | hos | by/at 102 | blive | be/become 103 | mange | many 104 | ad | by/through 105 | bliver | present tense of "to be/to become" 106 | hendes | her/hers 107 | været | be 108 | thi | for (conj) 109 | jer | you 110 | sådan | such, like this/like that 111 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_da.txt: -------------------------------------------------------------------------------- 1 | | From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt 2 | | This file is distributed under the BSD License. 3 | | See http://snowball.tartarus.org/license.php 4 | | Also see http://www.opensource.org/licenses/bsd-license.html 5 | | - Encoding was converted to UTF-8. 6 | | - This notice was added. 7 | | 8 | | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" 9 | 10 | | A Danish stop word list. Comments begin with vertical bar. Each stop 11 | | word is at the start of a line. 
12 | 13 | | This is a ranked list (commonest to rarest) of stopwords derived from 14 | | a large text sample. 15 | 16 | 17 | og | and 18 | i | in 19 | jeg | I 20 | det | that (dem. pronoun)/it (pers. pronoun) 21 | at | that (in front of a sentence)/to (with infinitive) 22 | en | a/an 23 | den | it (pers. pronoun)/that (dem. pronoun) 24 | til | to/at/for/until/against/by/of/into, more 25 | er | present tense of "to be" 26 | som | who, as 27 | på | on/upon/in/on/at/to/after/of/with/for, on 28 | de | they 29 | med | with/by/in, along 30 | han | he 31 | af | of/by/from/off/for/in/with/on, off 32 | for | at/for/to/from/by/of/ago, in front/before, because 33 | ikke | not 34 | der | who/which, there/those 35 | var | past tense of "to be" 36 | mig | me/myself 37 | sig | oneself/himself/herself/itself/themselves 38 | men | but 39 | et | a/an/one, one (number), someone/somebody/one 40 | har | present tense of "to have" 41 | om | round/about/for/in/a, about/around/down, if 42 | vi | we 43 | min | my 44 | havde | past tense of "to have" 45 | ham | him 46 | hun | she 47 | nu | now 48 | over | over/above/across/by/beyond/past/on/about, over/past 49 | da | then, when/as/since 50 | fra | from/off/since, off, since 51 | du | you 52 | ud | out 53 | sin | his/her/its/one's 54 | dem | them 55 | os | us/ourselves 56 | op | up 57 | man | you/one 58 | hans | his 59 | hvor | where 60 | eller | or 61 | hvad | what 62 | skal | must/shall etc. 63 | selv | myself/youself/herself/ourselves etc., even 64 | her | here 65 | alle | all/everyone/everybody etc. 
66 | vil | will (verb) 67 | blev | past tense of "to stay/to remain/to get/to become" 68 | kunne | could 69 | ind | in 70 | når | when 71 | være | present tense of "to be" 72 | dog | however/yet/after all 73 | noget | something 74 | ville | would 75 | jo | you know/you see (adv), yes 76 | deres | their/theirs 77 | efter | after/behind/according to/for/by/from, later/afterwards 78 | ned | down 79 | skulle | should 80 | denne | this 81 | end | than 82 | dette | this 83 | mit | my/mine 84 | også | also 85 | under | under/beneath/below/during, below/underneath 86 | have | have 87 | dig | you 88 | anden | other 89 | hende | her 90 | mine | my 91 | alt | everything 92 | meget | much/very, plenty of 93 | sit | his, her, its, one's 94 | sine | his, her, its, one's 95 | vor | our 96 | mod | against 97 | disse | these 98 | hvis | if 99 | din | your/yours 100 | nogle | some 101 | hos | by/at 102 | blive | be/become 103 | mange | many 104 | ad | by/through 105 | bliver | present tense of "to be/to become" 106 | hendes | her/hers 107 | været | be 108 | thi | for (conj) 109 | jer | you 110 | sådan | such, like this/like that 111 | --------------------------------------------------------------------------------