├── ltr ├── helpers │ ├── __init__.py │ ├── msmarco │ │ ├── __init__.py │ │ └── evaluate.py │ ├── handle_resp.py │ ├── esUrlParse.py │ ├── solr_escape.py │ ├── defaultlist.py │ ├── convert.py │ ├── butterfingers.py │ ├── tau.py │ └── movies.py ├── clickmodels │ ├── __init__.py │ ├── coec.py │ ├── conversion.py │ ├── cascade.py │ ├── session.py │ └── sdbn.py ├── client │ ├── __init__.py │ ├── base_client.py │ └── solr_parse.py ├── __init__.py ├── p9_plots.py ├── index.py ├── download.py ├── release_date_plot.py ├── injectTypos.py ├── years_as_ratings.py ├── search.py ├── evaluate.py ├── date_genre_judgments.py └── log.py ├── rre ├── solr │ ├── .gitignore │ ├── .dockerignore │ ├── src │ │ └── etc │ │ │ ├── templates │ │ │ ├── baseline │ │ │ │ └── query.json │ │ │ ├── classic │ │ │ │ └── query.json │ │ │ ├── latest │ │ │ │ └── query.json │ │ │ └── README.md │ │ │ ├── configuration_sets │ │ │ ├── classic │ │ │ │ └── solr-settings.json │ │ │ ├── latest │ │ │ │ └── solr-settings.json │ │ │ ├── baseline │ │ │ │ └── solr-settings.json │ │ │ └── README.md │ │ │ └── ratings │ │ │ ├── README.md │ │ │ └── ratings.json │ └── Dockerfile ├── elastic │ ├── .dockerignore │ ├── .gitignore │ ├── src │ │ └── etc │ │ │ ├── configuration_sets │ │ │ ├── classic │ │ │ │ └── index-settings.json │ │ │ ├── latest │ │ │ │ └── index-settings.json │ │ │ ├── baseline │ │ │ │ └── index-settings.json │ │ │ └── README.md │ │ │ ├── templates │ │ │ ├── baseline │ │ │ │ └── query.json │ │ │ ├── classic │ │ │ │ └── query.json │ │ │ ├── latest │ │ │ │ └── query.json │ │ │ └── README.md │ │ │ └── ratings │ │ │ └── ratings.json │ └── Dockerfile └── README.md ├── notebooks ├── solr │ ├── tmdb │ │ ├── solr_config │ │ │ └── conf │ │ │ │ ├── names.txt │ │ │ │ ├── name_synonyms.txt │ │ │ │ ├── synonyms_genres.txt │ │ │ │ ├── synonyms_directed.txt │ │ │ │ ├── lang │ │ │ │ ├── contractions_ga.txt │ │ │ │ ├── hyphenations_ga.txt │ │ │ │ ├── contractions_ca.txt │ │ │ │ ├── stemdict_nl.txt │ │ │ │ ├── 
contractions_fr.txt │ │ │ │ ├── contractions_it.txt │ │ │ │ ├── stopwords_hy.txt │ │ │ │ ├── stopwords_el.txt │ │ │ │ ├── stopwords_ga.txt │ │ │ │ ├── stopwords_eu.txt │ │ │ │ ├── userdict_ja.txt │ │ │ │ ├── stopwords_en.txt │ │ │ │ ├── stopwords_th.txt │ │ │ │ ├── stopwords_ar.txt │ │ │ │ ├── stopwords_gl.txt │ │ │ │ ├── stopwords_cz.txt │ │ │ │ ├── stopwords_ja.txt │ │ │ │ ├── stopwords_lv.txt │ │ │ │ ├── stopwords_bg.txt │ │ │ │ ├── stopwords_ca.txt │ │ │ │ ├── stopwords_tr.txt │ │ │ │ ├── stopwords_ro.txt │ │ │ │ ├── stopwords_hu.txt │ │ │ │ ├── stopwords_hi.txt │ │ │ │ ├── stopwords_fi.txt │ │ │ │ ├── stopwords_fa.txt │ │ │ │ └── stopwords_da.txt │ │ │ │ ├── synonyms_multiterm.txt │ │ │ │ ├── synonyms_bidirect.txt │ │ │ │ ├── taxonomy_parent.txt │ │ │ │ ├── params.json │ │ │ │ ├── taxonomy.txt │ │ │ │ ├── idioms.txt │ │ │ │ ├── stopwords.txt │ │ │ │ ├── protwords.txt │ │ │ │ ├── synonyms.txt │ │ │ │ └── elevate.xml │ │ ├── ltr.py │ │ └── evaluation (Solr).ipynb │ ├── .docker │ │ └── solr_home │ │ │ ├── tmdb │ │ │ └── conf │ │ │ │ ├── names.txt │ │ │ │ ├── name_synonyms.txt │ │ │ │ ├── synonyms_genres.txt │ │ │ │ ├── synonyms_directed.txt │ │ │ │ ├── lang │ │ │ │ ├── hyphenations_ga.txt │ │ │ │ ├── contractions_ga.txt │ │ │ │ ├── contractions_ca.txt │ │ │ │ ├── stemdict_nl.txt │ │ │ │ ├── contractions_fr.txt │ │ │ │ ├── contractions_it.txt │ │ │ │ ├── stopwords_hy.txt │ │ │ │ ├── stopwords_el.txt │ │ │ │ ├── stopwords_ga.txt │ │ │ │ ├── stopwords_eu.txt │ │ │ │ ├── userdict_ja.txt │ │ │ │ ├── stopwords_en.txt │ │ │ │ ├── stopwords_th.txt │ │ │ │ ├── stopwords_ar.txt │ │ │ │ ├── stopwords_gl.txt │ │ │ │ ├── stopwords_cz.txt │ │ │ │ ├── stopwords_ja.txt │ │ │ │ ├── stopwords_lv.txt │ │ │ │ ├── stopwords_bg.txt │ │ │ │ ├── stopwords_ca.txt │ │ │ │ ├── stopwords_tr.txt │ │ │ │ ├── stopwords_ro.txt │ │ │ │ ├── stopwords_hu.txt │ │ │ │ ├── stopwords_hi.txt │ │ │ │ ├── stopwords_fi.txt │ │ │ │ ├── stopwords_fa.txt │ │ │ │ └── stopwords_da.txt │ │ │ │ ├── 
synonyms_multiterm.txt │ │ │ │ ├── synonyms_bidirect.txt │ │ │ │ ├── taxonomy_parent.txt │ │ │ │ ├── params.json │ │ │ │ ├── taxonomy.txt │ │ │ │ ├── idioms.txt │ │ │ │ ├── stopwords.txt │ │ │ │ ├── protwords.txt │ │ │ │ ├── synonyms.txt │ │ │ │ └── elevate.xml │ │ │ ├── zoo.cfg │ │ │ └── solr.xml │ ├── docker-compose.yml │ ├── Dockerfile │ └── msmarco │ │ ├── solr_config │ │ └── conf │ │ │ ├── params.json │ │ │ └── elevate.xml │ │ └── ltr.py ├── elasticsearch │ ├── tmdb │ │ ├── fmap.txt │ │ ├── ltr.py │ │ ├── Dataframes.ipynb │ │ └── evaluation.ipynb │ ├── .docker │ │ ├── kb-docker │ │ │ └── Dockerfile │ │ └── es-docker │ │ │ ├── elasticsearch.sh │ │ │ ├── Dockerfile │ │ │ └── elasticsearch.yml │ ├── README.md │ ├── docker-compose.yml │ └── osc-blog │ │ ├── ltr.py │ │ └── blog_settings.json ├── ltr.py └── exercises │ └── ltr.py ├── utils ├── utils.py ├── rateFuzzySearch.json.jinja ├── rateSearch.json.jinja └── train_to_csv.py ├── .dockerignore ├── clean-notebooks.sh ├── .gitignore ├── tests ├── fail.py ├── pass.py ├── test_prep.py ├── nb_test_config.py ├── run_most_nbs.py ├── runner.py └── notebook_test_case.py ├── Dockerfile ├── docker └── README.md ├── docker-compose.yml ├── requirements.txt └── README.md /ltr/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ltr/clickmodels/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ltr/helpers/msmarco/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rre/solr/.gitignore: -------------------------------------------------------------------------------- 1 | target/* 2 | 
-------------------------------------------------------------------------------- /rre/elastic/.dockerignore: -------------------------------------------------------------------------------- 1 | target/* 2 | -------------------------------------------------------------------------------- /rre/elastic/.gitignore: -------------------------------------------------------------------------------- 1 | target/* 2 | -------------------------------------------------------------------------------- /rre/solr/.dockerignore: -------------------------------------------------------------------------------- 1 | target/* 2 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/names.txt: -------------------------------------------------------------------------------- 1 | luke_skywalker -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/names.txt: -------------------------------------------------------------------------------- 1 | luke_skywalker -------------------------------------------------------------------------------- /notebooks/elasticsearch/tmdb/fmap.txt: -------------------------------------------------------------------------------- 1 | 0 release_year q 2 | 1 features0 q 3 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/name_synonyms.txt: -------------------------------------------------------------------------------- 1 | sky walker, skywalker -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/name_synonyms.txt: -------------------------------------------------------------------------------- 1 | sky walker, skywalker -------------------------------------------------------------------------------- /utils/utils.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.append(os.getcwd()) 5 | -------------------------------------------------------------------------------- /rre/solr/src/etc/templates/baseline/query.json: -------------------------------------------------------------------------------- 1 | { 2 | "q": "title:($query)" 3 | } 4 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/synonyms_genres.txt: -------------------------------------------------------------------------------- 1 | scifi,science fiction,science fiction movie -------------------------------------------------------------------------------- /ltr/client/__init__.py: -------------------------------------------------------------------------------- 1 | from .elastic_client import ElasticClient 2 | from .solr_client import SolrClient 3 | -------------------------------------------------------------------------------- /notebooks/elasticsearch/.docker/kb-docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.elastic.co/kibana/kibana:7.12.1 2 | 3 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/synonyms_genres.txt: -------------------------------------------------------------------------------- 1 | scifi,science fiction,science fiction movie -------------------------------------------------------------------------------- /rre/solr/src/etc/templates/classic/query.json: -------------------------------------------------------------------------------- 1 | { 2 | "q": "title:($query)", 3 | "rq": "{!ltr model=classic}" 4 | } 5 | -------------------------------------------------------------------------------- /rre/solr/src/etc/templates/latest/query.json: -------------------------------------------------------------------------------- 1 | { 
2 | "q": "title:($query)", 3 | "rq": "{!ltr model=latest}" 4 | } 5 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | data/ 2 | venv/ 3 | venv2/ 4 | .git/ 5 | .cache/ 6 | .trash/ 7 | **/venv* 8 | **/data/ 9 | **/__pycache__/ 10 | -------------------------------------------------------------------------------- /rre/elastic/src/etc/configuration_sets/classic/index-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "hostUrls": [ "http://host.docker.internal:9200" ], 3 | "index": "tmdb" 4 | } 5 | -------------------------------------------------------------------------------- /rre/elastic/src/etc/configuration_sets/latest/index-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "hostUrls": [ "http://host.docker.internal:9200" ], 3 | "index": "tmdb" 4 | } 5 | -------------------------------------------------------------------------------- /rre/elastic/src/etc/configuration_sets/baseline/index-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "hostUrls": [ "http://host.docker.internal:9200" ], 3 | "index": "tmdb" 4 | } 5 | -------------------------------------------------------------------------------- /clean-notebooks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Removes all output and metadata from notebooks 4 | find notebooks -type f -name "*.ipynb" -print0 | xargs -0 nbstripout 5 | -------------------------------------------------------------------------------- /rre/solr/src/etc/configuration_sets/classic/solr-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "baseUrls": [ "http://host.docker.internal:8983/solr" ], 3 | "collectionName": "tmdb" 4 | } 5 | 
-------------------------------------------------------------------------------- /rre/solr/src/etc/configuration_sets/latest/solr-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "baseUrls": [ "http://host.docker.internal:8983/solr" ], 3 | "collectionName": "tmdb" 4 | } 5 | -------------------------------------------------------------------------------- /notebooks/elasticsearch/.docker/es-docker/elasticsearch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | docker run -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" elasticsearch-tlre 4 | -------------------------------------------------------------------------------- /rre/solr/src/etc/configuration_sets/baseline/solr-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "baseUrls": [ "http://host.docker.internal:8983/solr" ], 3 | "collectionName": "tmdb" 4 | } 5 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/synonyms_directed.txt: -------------------------------------------------------------------------------- 1 | wife => wife, bride 2 | spouse => spouse, husband, wife, partner 3 | tunes => cartoons, toons, songs 4 | cartoon => toons, tunes -------------------------------------------------------------------------------- /rre/elastic/src/etc/templates/baseline/query.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "match": { 4 | "title": { 5 | "query": "$query" 6 | } 7 | } 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/synonyms_directed.txt: -------------------------------------------------------------------------------- 1 | wife => wife, bride 2 | spouse => spouse, husband, wife, partner 3 | tunes => cartoons, 
toons, songs 4 | cartoon => toons, tunes -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | **/data 3 | venv/* 4 | **/.ipynb_checkpoints 5 | tests/last_run.ipynb 6 | 7 | *.pyc 8 | .vscode 9 | .cache 10 | features.txt 11 | .trash 12 | .DS_store 13 | notify.sh 14 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/contractions_ga.txt: -------------------------------------------------------------------------------- 1 | # Set of Irish contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | d 4 | m 5 | b 6 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/hyphenations_ga.txt: -------------------------------------------------------------------------------- 1 | # Set of Irish hyphenations for StopFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | h 4 | n 5 | t 6 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/hyphenations_ga.txt: -------------------------------------------------------------------------------- 1 | # Set of Irish hyphenations for StopFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | h 4 | n 5 | t 6 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/synonyms_multiterm.txt: -------------------------------------------------------------------------------- 1 | # Here are some multi term synonym to 2 | # see what happens at query time 3 | 4 | looney tunes, cartoons 5 | science fiction, sci fi, sci-fi, scifi -------------------------------------------------------------------------------- 
/tests/fail.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | class Fail(unittest.TestCase): 4 | 5 | def test_that_fails(self): 6 | assert 1 == 0 7 | 8 | if __name__ == "__main__": 9 | unittest.main() 10 | -------------------------------------------------------------------------------- /tests/pass.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | class Pass(unittest.TestCase): 4 | 5 | def test_that_passes(self): 6 | assert 1 == 1 7 | 8 | if __name__ == "__main__": 9 | unittest.main() 10 | -------------------------------------------------------------------------------- /ltr/__init__.py: -------------------------------------------------------------------------------- 1 | # Make the most important pieces just available as 2 | # ie - from ltr import download 3 | from .download import download 4 | from .evaluate import evaluate, rre_table 5 | from .search import search 6 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/contractions_ga.txt: -------------------------------------------------------------------------------- 1 | # Set of Irish contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | d 4 | m 5 | b 6 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/synonyms_multiterm.txt: -------------------------------------------------------------------------------- 1 | # Here are some multi term synonym to 2 | # see what happens at query time 3 | 4 | looney tunes, cartoons 5 | science fiction, sci fi, sci-fi, scifi -------------------------------------------------------------------------------- /rre/solr/src/etc/ratings/README.md: -------------------------------------------------------------------------------- 1 | Under the ratings 
folder you should have at least 1 ratings file. 2 | A ratings file is connected with a dataset and contains a set of queries that compose the evaluation execution. -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/contractions_ca.txt: -------------------------------------------------------------------------------- 1 | # Set of Catalan contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | d 4 | l 5 | m 6 | n 7 | s 8 | t 9 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/contractions_ca.txt: -------------------------------------------------------------------------------- 1 | # Set of Catalan contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | d 4 | l 5 | m 6 | n 7 | s 8 | t 9 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stemdict_nl.txt: -------------------------------------------------------------------------------- 1 | # Set of overrides for the dutch stemmer 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | fiets fiets 4 | bromfiets bromfiets 5 | ei eier 6 | kind kinder 7 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stemdict_nl.txt: -------------------------------------------------------------------------------- 1 | # Set of overrides for the dutch stemmer 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | fiets fiets 4 | bromfiets bromfiets 5 | ei eier 6 | kind kinder 7 | -------------------------------------------------------------------------------- /ltr/helpers/handle_resp.py: -------------------------------------------------------------------------------- 1 
| 2 | 3 | def resp_msg(msg, resp, throw=True): 4 | print('{} [Status: {}]'.format(msg, resp.status_code)) 5 | if resp.status_code >= 400: 6 | print(resp.text) 7 | if throw: 8 | raise RuntimeError(resp.text) 9 | 10 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/contractions_fr.txt: -------------------------------------------------------------------------------- 1 | # Set of French contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | l 4 | m 5 | t 6 | qu 7 | n 8 | s 9 | j 10 | d 11 | c 12 | jusqu 13 | quoiqu 14 | lorsqu 15 | puisqu 16 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/contractions_fr.txt: -------------------------------------------------------------------------------- 1 | # Set of French contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | l 4 | m 5 | t 6 | qu 7 | n 8 | s 9 | j 10 | d 11 | c 12 | jusqu 13 | quoiqu 14 | lorsqu 15 | puisqu 16 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/synonyms_bidirect.txt: -------------------------------------------------------------------------------- 1 | # Often people erroneously equate linguistic synonyms 2 | # with Solr synonyms. Here the bidirectional nature 3 | # of the synonyms creates problems where the more specific 4 | # term is not prioritized 5 | wife,bride 6 | wife,spouse 7 | toons,tunes,cartoon -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/synonyms_bidirect.txt: -------------------------------------------------------------------------------- 1 | # Often people erroneously equate linguistic synonyms 2 | # with Solr synonyms. 
Here the bidirectional nature 3 | # of the synonyms creates problems where the more specific 4 | # term is not prioritized 5 | wife,bride 6 | wife,spouse 7 | toons,tunes,cartoon -------------------------------------------------------------------------------- /notebooks/elasticsearch/README.md: -------------------------------------------------------------------------------- 1 | This folder contains some Elasticsearch configuration and a Dockerfile to expedite setting up Elasticsearch with LTR. 2 | 3 | ## Docker 4 | Run `docker-compose up` to create a image running Elasticsearch with LTR 5 | 6 | After the instance is running, load up the "hello-ltr (ES)" notebook. 7 | -------------------------------------------------------------------------------- /notebooks/solr/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | solr: 4 | build: . 5 | expose: 6 | - "8983" 7 | ports: 8 | - "8983:8983" 9 | volumes: 10 | - data:/var/solr 11 | environment: 12 | SERVER_HOST: "0.0.0.0" 13 | mem_limit: 4096m 14 | mem_reservation: 4096m 15 | volumes: 16 | data: 17 | -------------------------------------------------------------------------------- /utils/rateFuzzySearch.json.jinja: -------------------------------------------------------------------------------- 1 | { 2 | "from": 0, 3 | "size": 7, 4 | "query": { 5 | "bool": { 6 | "should": [ 7 | {"match": { 8 | "title": { 9 | "query": "{{ keywords }}", 10 | "fuzziness": "AUTO"} 11 | }} 12 | ] 13 | } 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /notebooks/solr/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM solr:8.5.1 2 | 3 | USER root 4 | 5 | ADD tmdb/solr_config /var/solr/data/configsets/tmdb 6 | RUN chown solr:solr /var/solr/data/configsets/tmdb 7 | 8 | ADD msmarco/solr_config /var/solr/data/configsets/msmarco 9 | RUN chown solr:solr 
/var/solr/data/configsets/msmarco 10 | 11 | USER solr 12 | 13 | CMD ["solr-foreground", "-Dsolr.ltr.enabled=true"] 14 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/contractions_it.txt: -------------------------------------------------------------------------------- 1 | # Set of Italian contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | c 4 | l 5 | all 6 | dall 7 | dell 8 | nell 9 | sull 10 | coll 11 | pell 12 | gl 13 | agl 14 | dagl 15 | degl 16 | negl 17 | sugl 18 | un 19 | m 20 | t 21 | s 22 | v 23 | d 24 | -------------------------------------------------------------------------------- /rre/elastic/src/etc/configuration_sets/README.md: -------------------------------------------------------------------------------- 1 | This folder contains one subfolder for each configuration version. 2 | Each version folder should contain the index settings associated with such version: 3 | 4 | - `hostUrls`: an array of URLs where the Elasticsearch instance for this 5 | version can be accessed. 6 | - `index`: the name of the index holding the data being used to search. 
7 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/contractions_it.txt: -------------------------------------------------------------------------------- 1 | # Set of Italian contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | c 4 | l 5 | all 6 | dall 7 | dell 8 | nell 9 | sull 10 | coll 11 | pell 12 | gl 13 | agl 14 | dagl 15 | degl 16 | negl 17 | sugl 18 | un 19 | m 20 | t 21 | s 22 | v 23 | d 24 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/taxonomy_parent.txt: -------------------------------------------------------------------------------- 1 | # Capture how the *user* structures information 2 | #looneytunes, looney tunes => looney_tunes 3 | #bugs bunny => bug_bunny, looney_tunes 4 | #mickey mouse => mickey_mouse, disney 5 | #minnie mouse => minnie_mouse, disney 6 | #donald duck => donald_duck, disney 7 | #yogi bear => yogi_bear, disney 8 | 9 | wife => wife, spouse 10 | bride => bride, spouse -------------------------------------------------------------------------------- /utils/rateSearch.json.jinja: -------------------------------------------------------------------------------- 1 | { 2 | "from": 0, 3 | "size": 5, 4 | "query": { 5 | "bool": { 6 | "should": [ 7 | {"match": { 8 | "text_all": "{{ keywords }}" 9 | }}, 10 | { 11 | "match_phrase": { 12 | "title": { 13 | "query": "{{ keywords }}", 14 | "boost": 1000 15 | } 16 | } 17 | }] 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/taxonomy_parent.txt: -------------------------------------------------------------------------------- 1 | # Capture how the *user* structures information 2 | #looneytunes, looney tunes => looney_tunes 3 | #bugs bunny => bug_bunny, looney_tunes 4 | #mickey mouse => 
mickey_mouse, disney 5 | #minnie mouse => minnie_mouse, disney 6 | #donald duck => donald_duck, disney 7 | #yogi bear => yogi_bear, disney 8 | 9 | wife => wife, spouse 10 | bride => bride, spouse -------------------------------------------------------------------------------- /rre/solr/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM maven:3.6.0-jdk-8 2 | 3 | # Clone the RRE repo 4 | RUN git clone https://github.com/SeaseLtd/rated-ranking-evaluator 5 | WORKDIR rated-ranking-evaluator 6 | 7 | # Build RRE 8 | RUN mvn clean install 9 | 10 | # Bring over the RRE config 11 | WORKDIR / 12 | COPY . rre 13 | WORKDIR rre 14 | 15 | # By default, run an RRE evaluation if no other command is specified 16 | CMD mvn clean install 17 | -------------------------------------------------------------------------------- /tests/test_prep.py: -------------------------------------------------------------------------------- 1 | from ltr.client.solr_client import SolrClient 2 | client = SolrClient() 3 | 4 | from ltr import download 5 | from ltr.index import rebuild 6 | from ltr.helpers.movies import indexable_movies 7 | 8 | corpus='http://es-learn-to-rank.labs.o19s.com/tmdb.json' 9 | download([corpus], dest='data/'); 10 | 11 | movies=indexable_movies(movies='data/tmdb.json') 12 | rebuild(client, index='tmdb', doc_src=movies) -------------------------------------------------------------------------------- /rre/elastic/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM maven:3.6.0-jdk-8 2 | 3 | # Clone the RRE repo 4 | RUN git clone https://github.com/SeaseLtd/rated-ranking-evaluator 5 | WORKDIR rated-ranking-evaluator 6 | 7 | # Build RRE 8 | RUN mvn clean install 9 | 10 | # Bring over the RRE config 11 | WORKDIR / 12 | COPY . 
rre 13 | WORKDIR rre 14 | 15 | # By default, run an RRE evaluation if no other command is specified 16 | CMD mvn clean install 17 | -------------------------------------------------------------------------------- /rre/elastic/src/etc/templates/classic/query.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "bool": { 4 | "should": [ 5 | { 6 | "sltr": { 7 | "featureset": "release", 8 | "model": "classic", 9 | "params": {} 10 | } 11 | } 12 | ], 13 | "filter": [ 14 | {"match": {"title": "$query"}} 15 | ] 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /rre/elastic/src/etc/templates/latest/query.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "bool": { 4 | "should": [ 5 | { 6 | "sltr": { 7 | "featureset": "release", 8 | "model": "latest", 9 | "params": {} 10 | } 11 | } 12 | ], 13 | "filter": [ 14 | {"match": {"title": "$query"}} 15 | ] 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /notebooks/elasticsearch/.docker/es-docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.elastic.co/elasticsearch/elasticsearch:7.12.1 2 | 3 | RUN bin/elasticsearch-plugin install -b https://github.com/o19s/elasticsearch-learning-to-rank/releases/download/v1.5.5-es7.12.1/ltr-plugin-v1.5.5-es7.12.1.zip 4 | COPY --chown=elasticsearch:elasticsearch elasticsearch.yml /usr/share/elasticsearch/config/ 5 | RUN cat /usr/share/elasticsearch/config/elasticsearch.yml 6 | 7 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/params.json: -------------------------------------------------------------------------------- 1 | {"params":{ 2 | "query":{ 3 | "defType":"edismax", 4 | "q.alt":"*:*", 5 | "rows":"10", 6 | "fl":"*,score", 7 | 
"":{"v":0} 8 | }, 9 | "facets":{ 10 | "facet":"on", 11 | "facet.mincount": "1", 12 | "":{"v":0} 13 | }, 14 | "velocity":{ 15 | "wt": "velocity", 16 | "v.template":"browse", 17 | "v.layout": "layout", 18 | "":{"v":0} 19 | } 20 | }} -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/params.json: -------------------------------------------------------------------------------- 1 | {"params":{ 2 | "query":{ 3 | "defType":"edismax", 4 | "q.alt":"*:*", 5 | "rows":"10", 6 | "fl":"*,score", 7 | "":{"v":0} 8 | }, 9 | "facets":{ 10 | "facet":"on", 11 | "facet.mincount": "1", 12 | "":{"v":0} 13 | }, 14 | "velocity":{ 15 | "wt": "velocity", 16 | "v.template":"browse", 17 | "v.layout": "layout", 18 | "":{"v":0} 19 | } 20 | }} -------------------------------------------------------------------------------- /notebooks/solr/msmarco/solr_config/conf/params.json: -------------------------------------------------------------------------------- 1 | {"params":{ 2 | "query":{ 3 | "defType":"edismax", 4 | "q.alt":"*:*", 5 | "rows":"10", 6 | "fl":"*,score", 7 | "":{"v":0} 8 | }, 9 | "facets":{ 10 | "facet":"on", 11 | "facet.mincount": "1", 12 | "":{"v":0} 13 | }, 14 | "velocity":{ 15 | "wt": "velocity", 16 | "v.template":"browse", 17 | "v.layout": "layout", 18 | "":{"v":0} 19 | } 20 | }} -------------------------------------------------------------------------------- /ltr/helpers/esUrlParse.py: -------------------------------------------------------------------------------- 1 | def parseUrl(fullEsUrl): 2 | from urllib.parse import urlsplit, urlunsplit 3 | import os.path 4 | o = urlsplit(fullEsUrl) 5 | 6 | esUrl = urlunsplit([o.scheme, o.netloc, '','','']) 7 | 8 | indexAndSearchType = os.path.split(o.path) 9 | 10 | return (esUrl, indexAndSearchType[0][1:], indexAndSearchType[1]) 11 | 12 | 13 | if __name__ == "__main__": 14 | from sys import argv 15 | print(parseUrl(argv[1])) 16 | 
-------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/taxonomy.txt: -------------------------------------------------------------------------------- 1 | # Capture how the *user* structures information 2 | #looneytunes, looney tunes => looney_tunes, cartoons 3 | #bugs bunny => bug_bunny, looney_tunes, cartoons 4 | #mickey mouse => mickey_mouse, disney, cartoons 5 | #minnie mouse => minnie_mouse, disney, cartoons 6 | #donald duck => donald_duck, disney, cartoons 7 | #yogi bear => yogi_bear, disney, cartoons 8 | 9 | wife => wife, spouse 10 | bride => bride, spouse 11 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/taxonomy.txt: -------------------------------------------------------------------------------- 1 | # Capture how the *user* structures information 2 | #looneytunes, looney tunes => looney_tunes, cartoons 3 | #bugs bunny => bug_bunny, looney_tunes, cartoons 4 | #mickey mouse => mickey_mouse, disney, cartoons 5 | #minnie mouse => minnie_mouse, disney, cartoons 6 | #donald duck => donald_duck, disney, cartoons 7 | #yogi bear => yogi_bear, disney, cartoons 8 | 9 | wife => wife, spouse 10 | bride => bride, spouse 11 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_hy.txt: -------------------------------------------------------------------------------- 1 | # example set of Armenian stopwords. 
2 | այդ 3 | այլ 4 | այն 5 | այս 6 | դու 7 | դուք 8 | եմ 9 | են 10 | ենք 11 | ես 12 | եք 13 | է 14 | էի 15 | էին 16 | էինք 17 | էիր 18 | էիք 19 | էր 20 | ըստ 21 | թ 22 | ի 23 | ին 24 | իսկ 25 | իր 26 | կամ 27 | համար 28 | հետ 29 | հետո 30 | մենք 31 | մեջ 32 | մի 33 | ն 34 | նա 35 | նաև 36 | նրա 37 | նրանք 38 | որ 39 | որը 40 | որոնք 41 | որպես 42 | ու 43 | ում 44 | պիտի 45 | վրա 46 | և 47 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_hy.txt: -------------------------------------------------------------------------------- 1 | # example set of Armenian stopwords. 2 | այդ 3 | այլ 4 | այն 5 | այս 6 | դու 7 | դուք 8 | եմ 9 | են 10 | ենք 11 | ես 12 | եք 13 | է 14 | էի 15 | էին 16 | էինք 17 | էիր 18 | էիք 19 | էր 20 | ըստ 21 | թ 22 | ի 23 | ին 24 | իսկ 25 | իր 26 | կամ 27 | համար 28 | հետ 29 | հետո 30 | մենք 31 | մեջ 32 | մի 33 | ն 34 | նա 35 | նաև 36 | նրա 37 | նրանք 38 | որ 39 | որը 40 | որոնք 41 | որպես 42 | ու 43 | ում 44 | պիտի 45 | վրա 46 | և 47 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/idioms.txt: -------------------------------------------------------------------------------- 1 | # Idioms is a synonyms file that captures idiomatic phrases as single units 2 | 3 | # LHS is all representations encountered in query or document 4 | looneytunes, looney tunes, looney toons => 12345 5 | sci fi, scifi, science fiction => 56789 6 | 7 | #looneytunes, looney tunes => looney_tunes 8 | #bugs bunny => bug_bunny 9 | #mickey mouse => mickey_mouse 10 | #minnie mouse => minnie_mouse 11 | #donald duck => donald_duck 12 | #yogi bear => yogi_bear -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/idioms.txt: -------------------------------------------------------------------------------- 1 | # Idioms is a synonyms file that captures idiomatic 
phrases as single units 2 | 3 | # LHS is all representations encountered in query or document 4 | looneytunes, looney tunes, looney toons => 12345 5 | sci fi, scifi, science fiction => 56789 6 | 7 | #looneytunes, looney tunes => looney_tunes 8 | #bugs bunny => bug_bunny 9 | #mickey mouse => mickey_mouse 10 | #minnie mouse => minnie_mouse 11 | #donald duck => donald_duck 12 | #yogi bear => yogi_bear -------------------------------------------------------------------------------- /rre/solr/src/etc/templates/README.md: -------------------------------------------------------------------------------- 1 | This folder will contain the query templates associated with the evaluation suite. 2 | A template is a JSON file containing a JSON object with name->value(s) pairs corresponding to query parameters. 3 | Although it is completely ok to have statically-defined values here, usually you will be using placeholders. 4 | 5 | ```javascript 6 | { 7 | "q": "$query", 8 | "fq": "language:$lang" 9 | } 10 | ``` 11 | The placeholders values will be defined within the ratings file, specifically in the queries definitions. -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7-stretch 2 | 3 | # Install openjdk 4 | RUN apt-get update && \ 5 | apt-get install -y openjdk-8-jdk graphviz && \ 6 | apt-get clean; 7 | 8 | # Setup a user 9 | RUN useradd -ms /bin/bash ltr 10 | WORKDIR /home/ltr 11 | 12 | # Make current directory accesible 13 | ADD . 
class NotebookTestConfig:
    """Collects the notebooks found in one directory, separating the setup notebook.

    After construction:
      setup     -- full path of 'setup.ipynb' if present, else None
      notebooks -- full paths of every other .ipynb file in the directory
    """

    SETUP_NB = 'setup.ipynb'

    def __init__(self, path):
        self.notebooks = []
        self.setup = None
        for entry in os.listdir(path):
            full_path = os.path.join(path, entry)
            # Only regular files with a notebook extension are considered;
            # subdirectories and other files are ignored.
            if not (os.path.isfile(full_path) and entry.endswith('.ipynb')):
                continue
            if entry == NotebookTestConfig.SETUP_NB:
                self.setup = full_path
            else:
                self.notebooks.append(full_path)
def plot_grades(dat):
    """Bar chart of relevance grades, faceted by query keywords.

    dat: DataFrame-like object with 'grade' and 'keywords' columns.
    Returns a plotnine ggplot object that callers can render or save.
    """
    import plotnine as p9

    # NOTE: parentheses, not braces — the original `{ ... }` built a
    # one-element *set* around the plot, so callers got a set instead of
    # the ggplot object itself.
    p = (
        p9.ggplot(dat, p9.aes('grade'))
        + p9.geom_bar()
        + p9.facet_wrap('keywords')
    )

    return p

def plot_features(dat):
    """Jittered scatter of feature value vs. grade, one facet per feature_id.

    dat: DataFrame-like object with 'grade', 'features', 'keywords' and
         'feature_id' columns.
    Returns a plotnine ggplot object.
    """
    import plotnine as p9

    p = (
        p9.ggplot(dat, p9.aes('grade', 'features', color='keywords'))
        + p9.geom_jitter(alpha=.5)
        + p9.facet_wrap('feature_id', scales='free_y', labeller='label_both')
        + p9.labs(y='Feature values', x='Relevance grade')
    )

    return p
14 | __file__ = {'sys': sys, 'importlib': importlib} 15 | 16 | del importlib 17 | del os 18 | del sys 19 | 20 | __file__['importlib'].reload(__file__['sys'].modules[__name__]) 21 | -------------------------------------------------------------------------------- /notebooks/exercises/ltr.py: -------------------------------------------------------------------------------- 1 | # Import a module with the same name from a different directory. 2 | # 3 | # Adapted from 4 | # https://mg.readthedocs.io/importing-local-python-modules-from-jupyter-notebooks/sys-path-in-helper-module/path-helper.html 5 | 6 | import importlib 7 | import os 8 | import sys 9 | 10 | sys.path.insert(0, os.path.abspath('../../')) 11 | 12 | # Temporarily hijack __file__ to avoid adding names at module scope; 13 | # __file__ will be overwritten again during the reload() call. 14 | __file__ = {'sys': sys, 'importlib': importlib} 15 | 16 | del importlib 17 | del os 18 | del sys 19 | 20 | __file__['importlib'].reload(__file__['sys'].modules[__name__]) 21 | -------------------------------------------------------------------------------- /rre/README.md: -------------------------------------------------------------------------------- 1 | rre 2 | 3 | This folder contains some basic RRE demonstrations for running evaluations against your LTR models. 4 | 5 | Navigate to `solr` or `elastic` depending on which you are using and do the following: 6 | 7 | ## Getting Started 8 | - Build the docker image: `docker build -t ltr-rre .` 9 | - Run an evaluation: `docker run --name ltr-rre ltr-rre` 10 | - Copy the report to your host: `docker cp ltr-rre:/rre/target/site/rre-report.xlsx .` 11 | 12 | Alternatively, you can run thru the `evaluation` notebooks in Jupyter to run these steps for you. 
13 | 14 | __Note:__ Older versions of Docker for Linux may have issues accessing localhost on the host machine 15 | -------------------------------------------------------------------------------- /notebooks/solr/msmarco/ltr.py: -------------------------------------------------------------------------------- 1 | # Import a module with the same name from a different directory. 2 | # 3 | # Adapted from 4 | # https://mg.readthedocs.io/importing-local-python-modules-from-jupyter-notebooks/sys-path-in-helper-module/path-helper.html 5 | 6 | import importlib 7 | import os 8 | import sys 9 | 10 | sys.path.insert(0, os.path.abspath('../../../')) 11 | 12 | # Temporarily hijack __file__ to avoid adding names at module scope; 13 | # __file__ will be overwritten again during the reload() call. 14 | __file__ = {'sys': sys, 'importlib': importlib} 15 | 16 | del importlib 17 | del os 18 | del sys 19 | 20 | __file__['importlib'].reload(__file__['sys'].modules[__name__]) 21 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/ltr.py: -------------------------------------------------------------------------------- 1 | # Import a module with the same name from a different directory. 2 | # 3 | # Adapted from 4 | # https://mg.readthedocs.io/importing-local-python-modules-from-jupyter-notebooks/sys-path-in-helper-module/path-helper.html 5 | 6 | import importlib 7 | import os 8 | import sys 9 | 10 | sys.path.insert(0, os.path.abspath('../../../')) 11 | 12 | # Temporarily hijack __file__ to avoid adding names at module scope; 13 | # __file__ will be overwritten again during the reload() call. 
14 | __file__ = {'sys': sys, 'importlib': importlib} 15 | 16 | del importlib 17 | del os 18 | del sys 19 | 20 | __file__['importlib'].reload(__file__['sys'].modules[__name__]) 21 | -------------------------------------------------------------------------------- /notebooks/elasticsearch/tmdb/ltr.py: -------------------------------------------------------------------------------- 1 | # Import a module with the same name from a different directory. 2 | # 3 | # Adapted from 4 | # https://mg.readthedocs.io/importing-local-python-modules-from-jupyter-notebooks/sys-path-in-helper-module/path-helper.html 5 | 6 | import importlib 7 | import os 8 | import sys 9 | 10 | sys.path.insert(0, os.path.abspath('../../../')) 11 | 12 | # Temporarily hijack __file__ to avoid adding names at module scope; 13 | # __file__ will be overwritten again during the reload() call. 14 | __file__ = {'sys': sys, 'importlib': importlib} 15 | 16 | del importlib 17 | del os 18 | del sys 19 | 20 | __file__['importlib'].reload(__file__['sys'].modules[__name__]) 21 | -------------------------------------------------------------------------------- /notebooks/elasticsearch/osc-blog/ltr.py: -------------------------------------------------------------------------------- 1 | # Import a module with the same name from a different directory. 2 | # 3 | # Adapted from 4 | # https://mg.readthedocs.io/importing-local-python-modules-from-jupyter-notebooks/sys-path-in-helper-module/path-helper.html 5 | 6 | import importlib 7 | import os 8 | import sys 9 | 10 | sys.path.insert(0, os.path.abspath('../../../')) 11 | 12 | # Temporarily hijack __file__ to avoid adding names at module scope; 13 | # __file__ will be overwritten again during the reload() call. 
def esc_kw(kw):
    """Escape all Solr/Lucene query-syntax special characters in a keyword.

    Backslash is escaped first, because the loop below injects new
    backslashes that must not themselves be re-escaped.

    (Rewrite note: the original used non-raw literals such as '\\(' written
    as '\(' — invalid escape sequences that raise SyntaxWarning on modern
    CPython — across 13 copy-pasted replace() calls.)
    """
    kw = kw.replace('\\', '\\\\')  # must happen before any other escaping
    # Characters with special meaning in the Lucene/Solr query syntax.
    for special in '()+-:/][*?{}~':
        kw = kw.replace(special, '\\' + special)
    return kw
def rebuild(client, index, doc_src, force = False):
    """Recreate a search index and reindex its documents.

    Reload a configuration on disk for each search engine
    (Solr a configset, Elasticsearch a json file) and reindex.

    client: search-engine client exposing check_index_exists / delete_index /
        create_index / index_documents.
    index: name of the index (Solr collection / ES index) to (re)build.
    doc_src: iterable of documents to index.
    force: when True, an existing index is deleted and rebuilt; when False,
        an existing index is left untouched and a message is printed.

    Returns None in all cases.
    """
    # (Rewrite note: the original duplicated the create_index/index_documents
    # pair in two branches; a guard clause removes the duplication without
    # changing behavior.)
    if client.check_index_exists(index):
        if not force:
            print("Index {} already exists. Use `force = True` to delete and recreate".format(index))
            return None
        # Drop the stale index before recreating it.
        client.delete_index(index)

    client.create_index(index)
    client.index_documents(index, doc_src=doc_src)
4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_el.txt: -------------------------------------------------------------------------------- 1 | # Lucene Greek Stopwords list 2 | # Note: by default this file is used after GreekLowerCaseFilter, 3 | # so when modifying this file use 'σ' instead of 'ς' 4 | ο 5 | η 6 | το 7 | οι 8 | τα 9 | του 10 | τησ 11 | των 12 | τον 13 | την 14 | και 15 | κι 16 | κ 17 | ειμαι 18 | εισαι 19 | ειναι 20 | ειμαστε 21 | ειστε 22 | στο 23 | στον 24 | στη 25 | στην 26 | μα 27 | αλλα 28 | απο 29 | για 30 | προσ 31 | με 32 | σε 33 | ωσ 34 | παρα 35 | αντι 36 | κατα 37 | μετα 38 | θα 39 | να 40 | δε 41 | δεν 42 | μη 43 | μην 44 | επι 45 | ενω 46 | εαν 47 | αν 48 | τοτε 49 | που 50 | πωσ 51 | ποιοσ 52 | ποια 53 | ποιο 54 | ποιοι 55 | ποιεσ 56 | ποιων 57 | ποιουσ 58 | αυτοσ 59 | αυτη 60 | αυτο 61 | αυτοι 62 | αυτων 63 | αυτουσ 64 | αυτεσ 65 | αυτα 66 | εκεινοσ 67 | εκεινη 68 | εκεινο 69 | εκεινοι 70 | εκεινεσ 71 | εκεινα 72 | εκεινων 73 | εκεινουσ 74 | οπωσ 75 | ομωσ 76 | ισωσ 77 | οσο 78 | οτι 79 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_el.txt: -------------------------------------------------------------------------------- 1 | # Lucene Greek Stopwords 
list 2 | # Note: by default this file is used after GreekLowerCaseFilter, 3 | # so when modifying this file use 'σ' instead of 'ς' 4 | ο 5 | η 6 | το 7 | οι 8 | τα 9 | του 10 | τησ 11 | των 12 | τον 13 | την 14 | και 15 | κι 16 | κ 17 | ειμαι 18 | εισαι 19 | ειναι 20 | ειμαστε 21 | ειστε 22 | στο 23 | στον 24 | στη 25 | στην 26 | μα 27 | αλλα 28 | απο 29 | για 30 | προσ 31 | με 32 | σε 33 | ωσ 34 | παρα 35 | αντι 36 | κατα 37 | μετα 38 | θα 39 | να 40 | δε 41 | δεν 42 | μη 43 | μην 44 | επι 45 | ενω 46 | εαν 47 | αν 48 | τοτε 49 | που 50 | πωσ 51 | ποιοσ 52 | ποια 53 | ποιο 54 | ποιοι 55 | ποιεσ 56 | ποιων 57 | ποιουσ 58 | αυτοσ 59 | αυτη 60 | αυτο 61 | αυτοι 62 | αυτων 63 | αυτουσ 64 | αυτεσ 65 | αυτα 66 | εκεινοσ 67 | εκεινη 68 | εκεινο 69 | εκεινοι 70 | εκεινεσ 71 | εκεινα 72 | εκεινων 73 | εκεινουσ 74 | οπωσ 75 | ομωσ 76 | ισωσ 77 | οσο 78 | οτι 79 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/protwords.txt: -------------------------------------------------------------------------------- 1 | # The ASF licenses this file to You under the Apache License, Version 2.0 2 | # (the "License"); you may not use this file except in compliance with 3 | # the License. You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | #----------------------------------------------------------------------- 14 | # Use a protected word file to protect against the stemmer reducing two 15 | # unrelated words to the same base word. 
16 | 17 | # Some non-words that normally won't be encountered, 18 | # just to test that they won't be stemmed. 19 | dontstems 20 | zwhacky 21 | 22 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/protwords.txt: -------------------------------------------------------------------------------- 1 | # The ASF licenses this file to You under the Apache License, Version 2.0 2 | # (the "License"); you may not use this file except in compliance with 3 | # the License. You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | #----------------------------------------------------------------------- 14 | # Use a protected word file to protect against the stemmer reducing two 15 | # unrelated words to the same base word. 16 | 17 | # Some non-words that normally won't be encountered, 18 | # just to test that they won't be stemmed. 
import requests
from os import path

def download_one(uri, dest='data/', force=False):
    """Download a single file from `uri` into directory `dest`.

    The local filename is taken from the last path segment of `uri`.
    If the file already exists it is skipped unless force=True.

    Raises ValueError if `dest` exists but is not a directory.
    """
    import os

    # Create the destination directory on first use.
    if not os.path.exists(dest):
        os.makedirs(dest)

    if not os.path.isdir(dest):
        raise ValueError("dest {} is not a directory".format(dest))

    # Local filename = everything after the last '/' of the URI.
    filename = uri[uri.rfind('/') + 1:]
    filepath = os.path.join(dest, filename)
    if path.exists(filepath):
        if not force:
            print(filepath + ' already exists')
            return
        print("exists but force=True, Downloading anyway")

    # Stream the response in 1 KiB chunks to avoid loading large files
    # entirely into memory; empty keep-alive chunks are skipped.
    with open(filepath, 'wb') as out:
        print('GET {}'.format(uri))
        resp = requests.get(uri, stream=True)
        for chunk in resp.iter_content(chunk_size=1024):
            if chunk:
                out.write(chunk)

def download(uris, dest='data/', force=False):
    """Download each URI in `uris` into `dest` (see download_one)."""
    for uri in uris:
        download_one(uri=uri, dest=dest, force=force)
5 | container_name: hello-ltr-notebook 6 | ports: 7 | - 8888:8888 8 | environment: 9 | - LTR_DOCKER=yes 10 | links: 11 | - elastic 12 | - solr 13 | 14 | elastic: 15 | build: 16 | context: ./notebooks/elasticsearch/.docker/es-docker/ 17 | dockerfile: Dockerfile 18 | container_name: hello-ltr-elastic 19 | ports: 20 | - 9200:9200 21 | 22 | kibana: 23 | build: 24 | context: ./notebooks/elasticsearch/.docker/kb-docker/ 25 | dockerfile: Dockerfile 26 | container_name: hello-ltr-kibana 27 | expose: 28 | - "5601" 29 | ports: 30 | - "5601:5601" 31 | environment: 32 | ELASTICSEARCH_HOSTS: "http://hello-ltr-elastic:9200" 33 | ELASTICSEARCH_URL: "http://hello-ltr-elastic:9200" 34 | SERVER_HOST: "0.0.0.0" 35 | 36 | solr: 37 | build: 38 | context: ./notebooks/solr/ 39 | dockerfile: Dockerfile 40 | container_name: hello-ltr-solr 41 | ports: 42 | - 8983:8983 43 | 44 | -------------------------------------------------------------------------------- /rre/elastic/src/etc/templates/README.md: -------------------------------------------------------------------------------- 1 | This folder will contain the query templates associated with the evaluation suite. 2 | The query shape in Elasticsearch is already a JSON file so each template should be a valid Elasticsearch query 3 | with all needed placeholders (that will be defined within the ratings file). 
4 | 5 | ```javascript 6 | { 7 | "size": 0, 8 | "query": { 9 | "bool": { 10 | "must": [ 11 | { 12 | "multi_match": { 13 | "query": "$query", 14 | "fields": [ 15 | "some_searchable_field_1^1.75", 16 | "some_other_searchable_field" 17 | ], 18 | "minimum_should_match": "3<-45% 6<-95%" 19 | } 20 | } 21 | ] 22 | } 23 | }, 24 | "aggs": { 25 | "headings": { 26 | "terms": { 27 | "field": "title_sugg", 28 | "order": { "max_score": "desc" } 29 | }, 30 | "aggs": { 31 | "max_score": { 32 | "max": { 33 | "script": { 34 | "lang": "painless", 35 | "inline": "_score" 36 | } 37 | } 38 | } 39 | } 40 | } 41 | } 42 | } 43 | ``` -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_ga.txt: -------------------------------------------------------------------------------- 1 | 2 | a 3 | ach 4 | ag 5 | agus 6 | an 7 | aon 8 | ar 9 | arna 10 | as 11 | b' 12 | ba 13 | beirt 14 | bhúr 15 | caoga 16 | ceathair 17 | ceathrar 18 | chomh 19 | chtó 20 | chuig 21 | chun 22 | cois 23 | céad 24 | cúig 25 | cúigear 26 | d' 27 | daichead 28 | dar 29 | de 30 | deich 31 | deichniúr 32 | den 33 | dhá 34 | do 35 | don 36 | dtí 37 | dá 38 | dár 39 | dó 40 | faoi 41 | faoin 42 | faoina 43 | faoinár 44 | fara 45 | fiche 46 | gach 47 | gan 48 | go 49 | gur 50 | haon 51 | hocht 52 | i 53 | iad 54 | idir 55 | in 56 | ina 57 | ins 58 | inár 59 | is 60 | le 61 | leis 62 | lena 63 | lenár 64 | m' 65 | mar 66 | mo 67 | mé 68 | na 69 | nach 70 | naoi 71 | naonúr 72 | ná 73 | ní 74 | níor 75 | nó 76 | nócha 77 | ocht 78 | ochtar 79 | os 80 | roimh 81 | sa 82 | seacht 83 | seachtar 84 | seachtó 85 | seasca 86 | seisear 87 | siad 88 | sibh 89 | sinn 90 | sna 91 | sé 92 | sí 93 | tar 94 | thar 95 | thú 96 | triúr 97 | trí 98 | trína 99 | trínár 100 | tríocha 101 | tú 102 | um 103 | ár 104 | é 105 | éis 106 | í 107 | ó 108 | ón 109 | óna 110 | ónár 111 | -------------------------------------------------------------------------------- 
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_ga.txt: -------------------------------------------------------------------------------- 1 | 2 | a 3 | ach 4 | ag 5 | agus 6 | an 7 | aon 8 | ar 9 | arna 10 | as 11 | b' 12 | ba 13 | beirt 14 | bhúr 15 | caoga 16 | ceathair 17 | ceathrar 18 | chomh 19 | chtó 20 | chuig 21 | chun 22 | cois 23 | céad 24 | cúig 25 | cúigear 26 | d' 27 | daichead 28 | dar 29 | de 30 | deich 31 | deichniúr 32 | den 33 | dhá 34 | do 35 | don 36 | dtí 37 | dá 38 | dár 39 | dó 40 | faoi 41 | faoin 42 | faoina 43 | faoinár 44 | fara 45 | fiche 46 | gach 47 | gan 48 | go 49 | gur 50 | haon 51 | hocht 52 | i 53 | iad 54 | idir 55 | in 56 | ina 57 | ins 58 | inár 59 | is 60 | le 61 | leis 62 | lena 63 | lenár 64 | m' 65 | mar 66 | mo 67 | mé 68 | na 69 | nach 70 | naoi 71 | naonúr 72 | ná 73 | ní 74 | níor 75 | nó 76 | nócha 77 | ocht 78 | ochtar 79 | os 80 | roimh 81 | sa 82 | seacht 83 | seachtar 84 | seachtó 85 | seasca 86 | seisear 87 | siad 88 | sibh 89 | sinn 90 | sna 91 | sé 92 | sí 93 | tar 94 | thar 95 | thú 96 | triúr 97 | trí 98 | trína 99 | trínár 100 | tríocha 101 | tú 102 | um 103 | ár 104 | é 105 | éis 106 | í 107 | ó 108 | ón 109 | óna 110 | ónár 111 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/synonyms.txt: -------------------------------------------------------------------------------- 1 | # The ASF licenses this file to You under the Apache License, Version 2.0 2 | # (the "License"); you may not use this file except in compliance with 3 | # the License. You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | #----------------------------------------------------------------------- 14 | #some test synonym mappings unlikely to appear in real input text 15 | 16 | # Demonstrating bidirectional synonyms 17 | #wife,bride 18 | #wife,spouse 19 | #toons,tunes,cartoon 20 | 21 | # Demonstrating => syntax 22 | # wife => wife, bride 23 | # spouse => spouse, husband, wife, partner 24 | # tunes => cartoons, toons, songs 25 | # cartoon => toons, tunes 26 | 27 | # Demonstrating multi phrase 28 | #looney tunes, cartoons 29 | #science fiction, sci fi, sci-fi, scifi 30 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/synonyms.txt: -------------------------------------------------------------------------------- 1 | # The ASF licenses this file to You under the Apache License, Version 2.0 2 | # (the "License"); you may not use this file except in compliance with 3 | # the License. You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
12 | 13 | #----------------------------------------------------------------------- 14 | #some test synonym mappings unlikely to appear in real input text 15 | 16 | # Demonstrating bidirectional synonyms 17 | #wife,bride 18 | #wife,spouse 19 | #toons,tunes,cartoon 20 | 21 | # Demonstrating => syntax 22 | # wife => wife, bride 23 | # spouse => spouse, husband, wife, partner 24 | # tunes => cartoons, toons, songs 25 | # cartoon => toons, tunes 26 | 27 | # Demonstrating multi phrase 28 | #looney tunes, cartoons 29 | #science fiction, sci fi, sci-fi, scifi 30 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_eu.txt: -------------------------------------------------------------------------------- 1 | # example set of basque stopwords 2 | al 3 | anitz 4 | arabera 5 | asko 6 | baina 7 | bat 8 | batean 9 | batek 10 | bati 11 | batzuei 12 | batzuek 13 | batzuetan 14 | batzuk 15 | bera 16 | beraiek 17 | berau 18 | berauek 19 | bere 20 | berori 21 | beroriek 22 | beste 23 | bezala 24 | da 25 | dago 26 | dira 27 | ditu 28 | du 29 | dute 30 | edo 31 | egin 32 | ere 33 | eta 34 | eurak 35 | ez 36 | gainera 37 | gu 38 | gutxi 39 | guzti 40 | haiei 41 | haiek 42 | haietan 43 | hainbeste 44 | hala 45 | han 46 | handik 47 | hango 48 | hara 49 | hari 50 | hark 51 | hartan 52 | hau 53 | hauei 54 | hauek 55 | hauetan 56 | hemen 57 | hemendik 58 | hemengo 59 | hi 60 | hona 61 | honek 62 | honela 63 | honetan 64 | honi 65 | hor 66 | hori 67 | horiei 68 | horiek 69 | horietan 70 | horko 71 | horra 72 | horrek 73 | horrela 74 | horretan 75 | horri 76 | hortik 77 | hura 78 | izan 79 | ni 80 | noiz 81 | nola 82 | non 83 | nondik 84 | nongo 85 | nor 86 | nora 87 | ze 88 | zein 89 | zen 90 | zenbait 91 | zenbat 92 | zer 93 | zergatik 94 | ziren 95 | zituen 96 | zu 97 | zuek 98 | zuen 99 | zuten 100 | -------------------------------------------------------------------------------- 
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_eu.txt: -------------------------------------------------------------------------------- 1 | # example set of basque stopwords 2 | al 3 | anitz 4 | arabera 5 | asko 6 | baina 7 | bat 8 | batean 9 | batek 10 | bati 11 | batzuei 12 | batzuek 13 | batzuetan 14 | batzuk 15 | bera 16 | beraiek 17 | berau 18 | berauek 19 | bere 20 | berori 21 | beroriek 22 | beste 23 | bezala 24 | da 25 | dago 26 | dira 27 | ditu 28 | du 29 | dute 30 | edo 31 | egin 32 | ere 33 | eta 34 | eurak 35 | ez 36 | gainera 37 | gu 38 | gutxi 39 | guzti 40 | haiei 41 | haiek 42 | haietan 43 | hainbeste 44 | hala 45 | han 46 | handik 47 | hango 48 | hara 49 | hari 50 | hark 51 | hartan 52 | hau 53 | hauei 54 | hauek 55 | hauetan 56 | hemen 57 | hemendik 58 | hemengo 59 | hi 60 | hona 61 | honek 62 | honela 63 | honetan 64 | honi 65 | hor 66 | hori 67 | horiei 68 | horiek 69 | horietan 70 | horko 71 | horra 72 | horrek 73 | horrela 74 | horretan 75 | horri 76 | hortik 77 | hura 78 | izan 79 | ni 80 | noiz 81 | nola 82 | non 83 | nondik 84 | nongo 85 | nor 86 | nora 87 | ze 88 | zein 89 | zen 90 | zenbait 91 | zenbat 92 | zer 93 | zergatik 94 | ziren 95 | zituen 96 | zu 97 | zuek 98 | zuen 99 | zuten 100 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/userdict_ja.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This is a sample user dictionary for Kuromoji (JapaneseTokenizer) 3 | # 4 | # Add entries to this file in order to override the statistical model in terms 5 | # of segmentation, readings and part-of-speech tags. Notice that entries do 6 | # not have weights since they are always used when found. This is by-design 7 | # in order to maximize ease-of-use. 8 | # 9 | # Entries are defined using the following CSV format: 10 | # , ... , ... 
, 11 | # 12 | # Notice that a single half-width space separates tokens and readings, and 13 | # that the number tokens and readings must match exactly. 14 | # 15 | # Also notice that multiple entries with the same is undefined. 16 | # 17 | # Whitespace only lines are ignored. Comments are not allowed on entry lines. 18 | # 19 | 20 | # Custom segmentation for kanji compounds 21 | 日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞 22 | 関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞 23 | 24 | # Custom segmentation for compound katakana 25 | トートバッグ,トート バッグ,トート バッグ,かずカナ名詞 26 | ショルダーバッグ,ショルダー バッグ,ショルダー バッグ,かずカナ名詞 27 | 28 | # Custom reading for former sumo wrestler 29 | 朝青龍,朝青龍,アサショウリュウ,カスタム人名 30 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/userdict_ja.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This is a sample user dictionary for Kuromoji (JapaneseTokenizer) 3 | # 4 | # Add entries to this file in order to override the statistical model in terms 5 | # of segmentation, readings and part-of-speech tags. Notice that entries do 6 | # not have weights since they are always used when found. This is by-design 7 | # in order to maximize ease-of-use. 8 | # 9 | # Entries are defined using the following CSV format: 10 | # , ... , ... , 11 | # 12 | # Notice that a single half-width space separates tokens and readings, and 13 | # that the number tokens and readings must match exactly. 14 | # 15 | # Also notice that multiple entries with the same is undefined. 16 | # 17 | # Whitespace only lines are ignored. Comments are not allowed on entry lines. 
18 | # 19 | 20 | # Custom segmentation for kanji compounds 21 | 日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞 22 | 関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞 23 | 24 | # Custom segmentation for compound katakana 25 | トートバッグ,トート バッグ,トート バッグ,かずカナ名詞 26 | ショルダーバッグ,ショルダー バッグ,ショルダー バッグ,かずカナ名詞 27 | 28 | # Custom reading for former sumo wrestler 29 | 朝青龍,朝青龍,アサショウリュウ,カスタム人名 30 | -------------------------------------------------------------------------------- /rre/solr/src/etc/configuration_sets/README.md: -------------------------------------------------------------------------------- 1 | This folder contains one subfolder for each configuration version. 2 | Each version folder should contain a solr-settings.json file with details of 3 | how to connect to the appropriate Solr core. 4 | 5 | This is an example: 6 | 7 | * configuration_sets 8 | * v1.0 9 | * solr-settings.json 10 | * v1.1 11 | * solr-settings.json 12 | 13 | The solr-settings.json files may have the following properties: 14 | 15 | - `baseUrls`: an array of Solr base URLs (eg. `[ "http://localhost:8983/solr", "http://localhost:7574/solr" ]`). 16 | - `collectionName` [**REQUIRED**]: the name of the collection or core being evaluated. 17 | - `zkHosts`: an array of Zookeeper hosts (eg. `[ "zk1:2181", "zk2:2181" ]`). 18 | - `zkChroot`: the path to the root Zookeeper node containing Solr data, if running in a Chroot environment (eg. `"/solr"`). 19 | Optional. 20 | - `connectionTimeoutMillis`: the number of milliseconds to wait for a connection to be made to Solr. Optional. 21 | - `socketTimeoutMillis`: the number of milliseconds to allow for a response from Solr. Optional. 22 | 23 | **Either** the baseUrls **or** the zkHosts property must contain values. If both are empty, 24 | the configuration will fail to load. 
-------------------------------------------------------------------------------- /rre/elastic/src/etc/ratings/ratings.json: -------------------------------------------------------------------------------- 1 | { 2 | "index": "tmdb", 3 | "id_field": "id", 4 | "topics": [ 5 | { 6 | "description": "LTR Example Evaluation", 7 | "queries": [ 8 | { 9 | "template": "query.json", 10 | "placeholders": { 11 | "$query": "batman" 12 | } 13 | } 14 | ], 15 | "relevant_documents": { 16 | "4": [ 17 | "40662", 18 | "45162", 19 | "69735", 20 | "123025", 21 | "142061", 22 | "177271", 23 | "209112", 24 | "242643", 25 | "251519", 26 | "321528", 27 | "324849", 28 | "366924", 29 | "382322" 30 | ], 31 | "3": [ 32 | "272", 33 | "13851", 34 | "14919", 35 | "16234", 36 | "20077", 37 | "21683", 38 | "22855", 39 | "29751" 40 | ], 41 | "2": [ 42 | "268", 43 | "364", 44 | "414", 45 | "415", 46 | "15805", 47 | "17074" 48 | ], 49 | "1": [ 50 | "2661", 51 | "93560", 52 | "125249" 53 | ] 54 | } 55 | } 56 | ] 57 | } 58 | -------------------------------------------------------------------------------- /rre/solr/src/etc/ratings/ratings.json: -------------------------------------------------------------------------------- 1 | { 2 | "index": "tmdb", 3 | "id_field": "id", 4 | "topics": [ 5 | { 6 | "description": "LTR Example Evaluation", 7 | "queries": [ 8 | { 9 | "template": "query.json", 10 | "placeholders": { 11 | "$query": "batman" 12 | } 13 | } 14 | ], 15 | "relevant_documents": { 16 | "4": [ 17 | "40662", 18 | "45162", 19 | "69735", 20 | "123025", 21 | "142061", 22 | "177271", 23 | "209112", 24 | "242643", 25 | "251519", 26 | "321528", 27 | "324849", 28 | "366924", 29 | "382322" 30 | ], 31 | "3": [ 32 | "272", 33 | "13851", 34 | "14919", 35 | "16234", 36 | "20077", 37 | "21683", 38 | "22855", 39 | "29751" 40 | ], 41 | "2": [ 42 | "268", 43 | "364", 44 | "414", 45 | "415", 46 | "15805", 47 | "17074" 48 | ], 49 | "1": [ 50 | "2661", 51 | "93560", 52 | "125249" 53 | ] 54 | } 55 | } 56 | ] 57 | } 58 | 
# Notebook test runner, adapted from
# https://www.blog.pythonlibrary.org/2018/10/16/testing-jupyter-notebooks/
import nbformat
import os

from nbconvert.preprocessors import ExecutePreprocessor


def hours(hours):
    """Convert a number of hours to seconds.

    Bug fix: the original computed ``hours * 60 * 60`` but never returned
    it, so it returned ``None`` — and ``timeout=hours(6)`` below therefore
    passed ``None`` (interpreted as "no timeout") instead of six hours.
    """
    return hours * 60 * 60


def run_notebook(notebook_path, timeout=hours(6), save_nb_path=None):
    """Execute a notebook end-to-end and collect its error outputs.

    Parameters
    ----------
    notebook_path : str
        Path to the ``.ipynb`` file to execute.
    timeout : int
        Per-cell timeout in seconds (default: six hours).
    save_nb_path : str, optional
        If given, the executed notebook (with outputs) is written here.

    Returns
    -------
    tuple
        ``(nb, errors)`` — the executed notebook object and a list of
        error outputs found in its cells (empty when the run was clean).
    """
    # Cells execute with the notebook's own directory as working dir
    dirname = os.path.dirname(notebook_path)

    with open(notebook_path) as f:
        nb = nbformat.read(f, as_version=4)

    proc = ExecutePreprocessor(timeout=timeout, kernel_name='python3')
    # Keep executing after a cell errors so we can report every failure,
    # not just the first one.
    proc.allow_errors = True

    proc.preprocess(nb, {'metadata': {'path': dirname}})

    if save_nb_path:
        with open(save_nb_path, mode='wt') as f:
            nbformat.write(nb, f)

    errors = []
    for cell in nb.cells:
        if 'outputs' in cell:
            for output in cell['outputs']:
                if output.output_type == 'error':
                    errors.append(output)

    return nb, errors


if __name__ == '__main__':
    nb, errors = run_notebook('Testing.ipynb')
    print(errors)
import utils
from ltr.judgments import judgments_from_file
from ltr.client import ElasticClient
import csv


def train_to_csv(client, feature_set, in_filename, out_filename):
    """Export a judgment file, with its logged feature values, to CSV.

    Columns are ``keywords, qid, grade`` followed by one column per
    feature in the feature set.

    Bug fix: ``in_filename`` was previously ignored — the judgments were
    always read from the hard-coded ``data/title_judgments_train.txt``,
    so the CLI's first argument had no effect.

    Parameters
    ----------
    client : search engine client exposing ``feature_set``
    feature_set : str
        Name of the feature set stored in the engine.
    in_filename : str
        Path to the judgments file to read.
    out_filename : str
        Path of the CSV file to write.
    """
    features = client.feature_set(name=feature_set, index='tmdb')[0]
    fieldnames = ['keywords', 'qid', 'grade']
    fieldnames.extend([feature['name'] for feature in features])
    # newline='' per the csv module docs — avoids blank rows on Windows
    with open(out_filename, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        judgments = judgments_from_file(filename=in_filename)
        for judgment in judgments:
            # Every judgment must carry one value per feature column
            if len(judgment.features) != len(fieldnames) - 3:
                raise ValueError(
                    "judgment qid=%s has %d features, expected %d"
                    % (judgment.qid, len(judgment.features), len(fieldnames) - 3))
            record = {}
            record[fieldnames[0]] = judgment.keywords
            record[fieldnames[1]] = judgment.qid
            record[fieldnames[2]] = judgment.grade
            for idx, field in enumerate(fieldnames[3:]):
                record[field] = judgment.features[idx]

            writer.writerow(record)

if __name__ == "__main__":
    from sys import argv
    client = ElasticClient()
    train_to_csv(client=client, in_filename=argv[1],
                 feature_set=argv[2], out_filename=argv[3])
15 | 16 | # a couple of test stopwords to test that the words are really being 17 | # configured from this file: 18 | stopworda 19 | stopwordb 20 | 21 | # Standard english stop words taken from Lucene's StopAnalyzer 22 | a 23 | an 24 | and 25 | are 26 | as 27 | at 28 | be 29 | but 30 | by 31 | for 32 | if 33 | in 34 | into 35 | is 36 | it 37 | no 38 | not 39 | of 40 | on 41 | or 42 | such 43 | that 44 | the 45 | their 46 | then 47 | there 48 | these 49 | they 50 | this 51 | to 52 | was 53 | will 54 | with 55 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_en.txt: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | # a couple of test stopwords to test that the words are really being 17 | # configured from this file: 18 | stopworda 19 | stopwordb 20 | 21 | # Standard english stop words taken from Lucene's StopAnalyzer 22 | a 23 | an 24 | and 25 | are 26 | as 27 | at 28 | be 29 | but 30 | by 31 | for 32 | if 33 | in 34 | into 35 | is 36 | it 37 | no 38 | not 39 | of 40 | on 41 | or 42 | such 43 | that 44 | the 45 | their 46 | then 47 | there 48 | these 49 | they 50 | this 51 | to 52 | was 53 | will 54 | with 55 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_th.txt: -------------------------------------------------------------------------------- 1 | # Thai stopwords from: 2 | # "Opinion Detection in Thai Political News Columns 3 | # Based on Subjectivity Analysis" 4 | # Khampol Sukhum, Supot Nitsuwat, and Choochart Haruechaiyasak 5 | ไว้ 6 | ไม่ 7 | ไป 8 | ได้ 9 | ให้ 10 | ใน 11 | โดย 12 | แห่ง 13 | แล้ว 14 | และ 15 | แรก 16 | แบบ 17 | แต่ 18 | เอง 19 | เห็น 20 | เลย 21 | เริ่ม 22 | เรา 23 | เมื่อ 24 | เพื่อ 25 | เพราะ 26 | เป็นการ 27 | เป็น 28 | เปิดเผย 29 | เปิด 30 | เนื่องจาก 31 | เดียวกัน 32 | เดียว 33 | เช่น 34 | เฉพาะ 35 | เคย 36 | เข้า 37 | เขา 38 | อีก 39 | อาจ 40 | อะไร 41 | ออก 42 | อย่าง 43 | อยู่ 44 | อยาก 45 | หาก 46 | หลาย 47 | หลังจาก 48 | หลัง 49 | หรือ 50 | หนึ่ง 51 | ส่วน 52 | ส่ง 53 | สุด 54 | สําหรับ 55 | ว่า 56 | วัน 57 | ลง 58 | ร่วม 59 | ราย 60 | รับ 61 | ระหว่าง 62 | รวม 63 | ยัง 64 | มี 65 | มาก 66 | มา 67 | พร้อม 68 | พบ 69 | ผ่าน 70 | ผล 71 | บาง 72 | น่า 73 | นี้ 74 | นํา 75 | นั้น 76 | นัก 77 | นอกจาก 78 | ทุก 79 | ที่สุด 80 | ที่ 81 | ทําให้ 82 | ทํา 83 | ทาง 84 | ทั้งนี้ 85 | ทั้ง 86 | ถ้า 87 | ถูก 88 | ถึง 89 | ต้อง 90 | ต่างๆ 91 | ต่าง 92 | ต่อ 93 | ตาม 94 | ตั้งแต่ 95 | ตั้ง 96 | ด้าน 97 | ด้วย 98 | ดัง 99 | ซึ่ง 100 | ช่วง 101 | จึง 102 | จาก 103 | จัด 104 | จะ 105 | คือ 106 | ความ 107 | ครั้ง 108 | คง 109 | ขึ้น 110 | ของ 111 | ขอ 112 | ขณะ 113 | 
# converts LambdaMART XML models to JSON for Solr..

import xml.etree.ElementTree as ET


def convert(ensemble_xml_string, modelName, featureSet, featureMapping):
    """Translate a RankLib LambdaMART ensemble (XML) into the JSON model
    document understood by Solr's LTR plugin.

    Parameters
    ----------
    ensemble_xml_string : str
        Raw RankLib model output: a 7-line textual header followed by the
        ``<ensemble>`` XML element.
    modelName : str
        Name to store the model under in Solr.
    featureSet : str
        Name of the Solr feature store the model references.
    featureMapping : list[dict]
        Ordered feature descriptors; position N-1 corresponds to RankLib
        feature id N.

    Returns
    -------
    dict
        The Solr MultipleAdditiveTreesModel JSON structure.
    """
    # Strip the 7-line RankLib header that precedes the XML payload
    xml_payload = '\n'.join(ensemble_xml_string.split('\n')[7:])
    ensemble = ET.fromstring(xml_payload)

    trees = [
        {
            'weight': str(tree_node.attrib['weight']),
            'root': parseSplits(tree_node[0], featureMapping),
        }
        for tree_node in ensemble
    ]

    return {
        'store': featureSet,
        'name': modelName,
        'class': 'org.apache.solr.ltr.model.MultipleAdditiveTreesModel',
        'features': featureMapping,
        'params': {'trees': trees},
    }

def parseSplits(split, features):
    """Recursively convert one RankLib ``<split>`` element into the nested
    dict form (feature/threshold/left/right/value) Solr expects."""
    node = {}
    for child in split:
        tag = child.tag
        if tag == 'feature':
            # RankLib feature ids are 1-based indexes into the feature list
            node['feature'] = features[int(child.text.strip()) - 1]['name']
        elif tag == 'threshold':
            node['threshold'] = str(child.text.strip())
        elif tag == 'split' and 'pos' in child.attrib:
            # 'pos' is 'left' or 'right'; recurse into the child subtree
            node[child.attrib['pos']] = parseSplits(child, features)
        elif tag == 'output':
            node['value'] = str(child.text.strip())
    return node
from collections import Counter

class Model():
    """Container for the statistics produced by `coec`."""
    def __init__(self):
        # COEC statistic per (query, doc_id) pair
        self.coecs = Counter()

        # CTR for each query-doc pair in this session
        self.ctrs = {}

def coec(ctr_by_rank, sessions):
    """ Clicks over expected clicks: flags items whose CTR is above or
    below average for the rank they were shown at.

    From paper

    > Personalized Click Prediction in Sponsored Search
    by Cheng, Cantu Paz

    A COEC > 1 means above average CTR for it's position
    A COEC < 1 means below average

    -ctr_by_rank is the global CTR at each rank position
    -sessions are an array of search session objects

    returned:
    each query-doc pair in provided sessions COEC

    """
    clicks = Counter()
    expected = Counter()

    # Accumulate actual clicks and rank-weighted impressions per pair
    for session in sessions:
        query = session.query
        for position, doc in enumerate(session.docs):
            key = (query, doc.doc_id)
            expected[key] += ctr_by_rank[position]
            if doc.click:
                clicks[key] += 1

    model = Model()
    for key in expected:
        # ratio of observed clicks to clicks expected at these positions
        model.coecs[key] = clicks[key] / expected[key]

    return model
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

def search(client, user_query, model_name):
    """Run the named LTR model over results matching `user_query` in the
    title field, via either the Elasticsearch or Solr client.

    Returns the client's ranked result list for the tmdb index.
    """
    if client.name() == 'elastic':
        engine_query = {
            "bool": {
                "must": {"match_all": {} },
                # filter restricts candidates without contributing to score
                "filter": {
                    "match": {"title": user_query}
                }
            }
        }
    else:
        # ^0 zeroes out the base query's score so only the model ranks
        engine_query = 'title:('+ user_query + ')^0'
    return client.model_query('tmdb', model_name, {}, engine_query)

def plot(client, query, models = ('classic', 'latest')):
    """Plot release year vs. rank position for each model's results.

    Generalized: the original hard-coded exactly two traces even though
    `models` is a sequence — now one trace is drawn per model, so any
    number of models can be compared. The default was also changed from a
    mutable list to a tuple (same values, safer default).

    Parameters
    ----------
    client : search client passed through to `search`
    query : str
        User query to rank.
    models : sequence of str
        LTR model names to compare (default: classic vs. latest).
    """
    init_notebook_mode(connected=True)

    model_data = [search(client, query, model) for model in models]

    # x axis is simply the rank position of each result
    x_axis = list(range(len(model_data[0])))

    traces = [
        go.Scatter(
            x = x_axis,
            y = [hit['release_year'] for hit in results],
            mode = "lines",
            name = model_name,
            text = [f'{hit["title"]} ({hit["score"]})' for hit in results]
        )
        for model_name, results in zip(models, model_data)
    ]

    fig = go.Figure(data=traces)
    iplot(fig)
testpath==0.4.2 69 | threadpoolctl==2.0.0 70 | tornado==6.0.1 71 | tqdm==4.43.0 72 | traitlets==4.3.2 73 | urllib3==1.24.1 74 | wcwidth==0.1.7 75 | webencodings==0.5.1 76 | widgetsnbextension==3.4.2 77 | xgboost==1.4.2 78 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/elevate.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 26 | 27 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/elevate.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 26 | 27 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /notebooks/solr/msmarco/solr_config/conf/elevate.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 26 | 27 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_ar.txt: -------------------------------------------------------------------------------- 1 | # This file was created by Jacques Savoy and is distributed under the BSD license. 2 | # See http://members.unine.ch/jacques.savoy/clef/index.html. 
3 | # Also see http://www.opensource.org/licenses/bsd-license.html 4 | # Cleaned on October 11, 2009 (not normalized, so use before normalization) 5 | # This means that when modifying this list, you might need to add some 6 | # redundant entries, for example containing forms with both أ and ا 7 | من 8 | ومن 9 | منها 10 | منه 11 | في 12 | وفي 13 | فيها 14 | فيه 15 | و 16 | ف 17 | ثم 18 | او 19 | أو 20 | ب 21 | بها 22 | به 23 | ا 24 | أ 25 | اى 26 | اي 27 | أي 28 | أى 29 | لا 30 | ولا 31 | الا 32 | ألا 33 | إلا 34 | لكن 35 | ما 36 | وما 37 | كما 38 | فما 39 | عن 40 | مع 41 | اذا 42 | إذا 43 | ان 44 | أن 45 | إن 46 | انها 47 | أنها 48 | إنها 49 | انه 50 | أنه 51 | إنه 52 | بان 53 | بأن 54 | فان 55 | فأن 56 | وان 57 | وأن 58 | وإن 59 | التى 60 | التي 61 | الذى 62 | الذي 63 | الذين 64 | الى 65 | الي 66 | إلى 67 | إلي 68 | على 69 | عليها 70 | عليه 71 | اما 72 | أما 73 | إما 74 | ايضا 75 | أيضا 76 | كل 77 | وكل 78 | لم 79 | ولم 80 | لن 81 | ولن 82 | هى 83 | هي 84 | هو 85 | وهى 86 | وهي 87 | وهو 88 | فهى 89 | فهي 90 | فهو 91 | انت 92 | أنت 93 | لك 94 | لها 95 | له 96 | هذه 97 | هذا 98 | تلك 99 | ذلك 100 | هناك 101 | كانت 102 | كان 103 | يكون 104 | تكون 105 | وكانت 106 | وكان 107 | غير 108 | بعض 109 | قد 110 | نحو 111 | بين 112 | بينما 113 | منذ 114 | ضمن 115 | حيث 116 | الان 117 | الآن 118 | خلال 119 | بعد 120 | قبل 121 | حتى 122 | عند 123 | عندما 124 | لدى 125 | جميع 126 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_ar.txt: -------------------------------------------------------------------------------- 1 | # This file was created by Jacques Savoy and is distributed under the BSD license. 2 | # See http://members.unine.ch/jacques.savoy/clef/index.html. 
from collections import Counter

def conv_aug_attracts(attracts, sessions, costs):
    """ Rescan sessions, using click-derrived attractiveness.

    If theres no conversion, punish the attractiveness derrived judgment

    BUT we punish costly things less, and cheap things more
    """
    satisfacts = Counter()
    counts = Counter()

    for session in sessions:
        for doc in session.docs:
            key = (session.query, doc.doc_id)
            attract = attracts[key]
            counts[key] += 1
            if doc.click and doc.conversion:
                # A conversion confirms the attractiveness was real
                # with actual relevance
                satisfacts[key] += attract
            else:
                # No conversion (whether clicked or not):
                # If it costs a lot, thats ok, we default to attractiveness
                # If it costs little, thats generally not ok — why didn't
                # they do (easy action)? So scale attractiveness by cost.
                satisfacts[key] += attract * costs[doc.doc_id]

    # Average the accumulated satisfaction over impressions
    for key, count in counts.items():
        satisfacts[key] = satisfacts[key] / count

    return satisfacts
import unittest
from nb_test_config import NotebookTestConfig
import runner

class NotebooksTestCase(unittest.TestCase):
    """Notebook smoke-test base class.

    Executes every notebook found under the directories returned by
    test_paths() (filtered by nbs_to_run() and ignored_nbs()) and fails
    if any notebook cell produced an error output. Subclasses override
    the three selector methods to choose which notebooks run.
    """

    # Each executed notebook (with outputs) is saved here, overwriting
    # the previous one — useful for inspecting the last failure.
    SAVE_NB_PATH='tests/last_run.ipynb'

    def test_paths(self):
        # Directories containing notebooks to execute; subclasses override.
        # NOTE(review): the test_ prefix means unittest also collects this
        # method as a (trivially passing) test — confirm that is intended.
        return []

    def ignored_nbs(self):
        # Notebook paths to skip even if selected; subclasses override.
        return []

    def nbs_to_run(self):
        # Default allow-list containing every notebook; subclasses may
        # return a real collection to restrict which notebooks execute.
        class IncludeAll:
            def __contains__(self, _):
                return True
        return IncludeAll()

    def test_for_no_errors(self):
        """ Run all nbs in directories at test_paths()
        also included in nbs_to_run(),
        excepting those in ignored_nbs()
        - assert there are no errors
        """
        for nb_path in self.test_paths():

            nb_cfg = NotebookTestConfig(path=nb_path)
            print("EXECUTING NBS IN DIRECTORY: " + nb_path)
            # Run the directory's setup notebook first (if configured);
            # a setup failure aborts before any other notebook runs.
            if nb_cfg.setup:
                print("Setting up ... " + nb_path)
                nb, errors = runner.run_notebook(nb_cfg.setup, save_nb_path=NotebooksTestCase.SAVE_NB_PATH)
                print(errors)
                assert len(errors) == 0
            for nb in nb_cfg.notebooks:
                if nb in self.nbs_to_run():
                    if nb in self.ignored_nbs():
                        print("Ignored " + nb)
                    else:
                        print("Running... " + nb)
                        nb, errors = runner.run_notebook(nb, save_nb_path=NotebooksTestCase.SAVE_NB_PATH)
                        print(errors)
                        assert len(errors) == 0
| nunha 110 | nuns 111 | nunhas 112 | o 113 | os 114 | ou 115 | ó 116 | ós 117 | para 118 | pero 119 | pode 120 | pois 121 | pola 122 | polas 123 | polo 124 | polos 125 | por 126 | que 127 | se 128 | senón 129 | ser 130 | seu 131 | seus 132 | sexa 133 | sido 134 | sobre 135 | súa 136 | súas 137 | tamén 138 | tan 139 | te 140 | ten 141 | teñen 142 | teño 143 | ter 144 | teu 145 | teus 146 | ti 147 | tido 148 | tiña 149 | tiven 150 | túa 151 | túas 152 | un 153 | unha 154 | unhas 155 | uns 156 | vos 157 | vosa 158 | vosas 159 | voso 160 | vosos 161 | vós 162 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_gl.txt: -------------------------------------------------------------------------------- 1 | # galican stopwords 2 | a 3 | aínda 4 | alí 5 | aquel 6 | aquela 7 | aquelas 8 | aqueles 9 | aquilo 10 | aquí 11 | ao 12 | aos 13 | as 14 | así 15 | á 16 | ben 17 | cando 18 | che 19 | co 20 | coa 21 | comigo 22 | con 23 | connosco 24 | contigo 25 | convosco 26 | coas 27 | cos 28 | cun 29 | cuns 30 | cunha 31 | cunhas 32 | da 33 | dalgunha 34 | dalgunhas 35 | dalgún 36 | dalgúns 37 | das 38 | de 39 | del 40 | dela 41 | delas 42 | deles 43 | desde 44 | deste 45 | do 46 | dos 47 | dun 48 | duns 49 | dunha 50 | dunhas 51 | e 52 | el 53 | ela 54 | elas 55 | eles 56 | en 57 | era 58 | eran 59 | esa 60 | esas 61 | ese 62 | eses 63 | esta 64 | estar 65 | estaba 66 | está 67 | están 68 | este 69 | estes 70 | estiven 71 | estou 72 | eu 73 | é 74 | facer 75 | foi 76 | foron 77 | fun 78 | había 79 | hai 80 | iso 81 | isto 82 | la 83 | las 84 | lle 85 | lles 86 | lo 87 | los 88 | mais 89 | me 90 | meu 91 | meus 92 | min 93 | miña 94 | miñas 95 | moi 96 | na 97 | nas 98 | neste 99 | nin 100 | no 101 | non 102 | nos 103 | nosa 104 | nosas 105 | noso 106 | nosos 107 | nós 108 | nun 109 | nunha 110 | nuns 111 | nunhas 112 | o 113 | os 114 | ou 115 | ó 116 | ós 117 | para 118 | pero 119 | pode 120 | 
def typoIt(judgmentInFile, judgmentOutFile, rounds=100):
    """Clone judgments under fresh query ids with random keyboard typos.

    Reads judgments from judgmentInFile and, over up to `rounds` passes,
    generates a misspelled variant of each query's keywords.  Every novel
    misspelling gets a full copy of that query's judgment list under a newly
    allocated qid.  The combined judgments are written to judgmentOutFile.
    """
    with open(judgmentInFile) as src:
        allJudgments = list(judgments_from_file(src))
    nextQid = allJudgments[-1].qid
    byQid = judgments_by_qid(allJudgments)

    # Misspellings already emitted; avoids duplicate typo queries.
    seenTypos = set()

    for _ in range(rounds):
        for group in byQid.values():
            origKeywords = group[0].keywords
            typoed = butterfingers(origKeywords)

            # Skip no-op "typos" and variants generated in an earlier round.
            if typoed == origKeywords or typoed in seenTypos:
                continue

            nextQid += 1
            print("%s => %s" % (origKeywords, typoed))
            allJudgments.extend(
                Judgment(grade=judg.grade,
                         qid=nextQid,
                         keywords=typoed,
                         docId=judg.docId)
                for judg in group)
            seenTypos.add(typoed)

    with open(judgmentOutFile, 'w') as dest:
        judgments_to_file(dest, judgmentsList=allJudgments)
170 | jež 171 | jakož 172 | načež 173 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_cz.txt: -------------------------------------------------------------------------------- 1 | a 2 | s 3 | k 4 | o 5 | i 6 | u 7 | v 8 | z 9 | dnes 10 | cz 11 | tímto 12 | budeš 13 | budem 14 | byli 15 | jseš 16 | můj 17 | svým 18 | ta 19 | tomto 20 | tohle 21 | tuto 22 | tyto 23 | jej 24 | zda 25 | proč 26 | máte 27 | tato 28 | kam 29 | tohoto 30 | kdo 31 | kteří 32 | mi 33 | nám 34 | tom 35 | tomuto 36 | mít 37 | nic 38 | proto 39 | kterou 40 | byla 41 | toho 42 | protože 43 | asi 44 | ho 45 | naši 46 | napište 47 | re 48 | což 49 | tím 50 | takže 51 | svých 52 | její 53 | svými 54 | jste 55 | aj 56 | tu 57 | tedy 58 | teto 59 | bylo 60 | kde 61 | ke 62 | pravé 63 | ji 64 | nad 65 | nejsou 66 | či 67 | pod 68 | téma 69 | mezi 70 | přes 71 | ty 72 | pak 73 | vám 74 | ani 75 | když 76 | však 77 | neg 78 | jsem 79 | tento 80 | článku 81 | články 82 | aby 83 | jsme 84 | před 85 | pta 86 | jejich 87 | byl 88 | ještě 89 | až 90 | bez 91 | také 92 | pouze 93 | první 94 | vaše 95 | která 96 | nás 97 | nový 98 | tipy 99 | pokud 100 | může 101 | strana 102 | jeho 103 | své 104 | jiné 105 | zprávy 106 | nové 107 | není 108 | vás 109 | jen 110 | podle 111 | zde 112 | už 113 | být 114 | více 115 | bude 116 | již 117 | než 118 | který 119 | by 120 | které 121 | co 122 | nebo 123 | ten 124 | tak 125 | má 126 | při 127 | od 128 | po 129 | jsou 130 | jak 131 | další 132 | ale 133 | si 134 | se 135 | ve 136 | to 137 | jako 138 | za 139 | zpět 140 | ze 141 | do 142 | pro 143 | je 144 | na 145 | atd 146 | atp 147 | jakmile 148 | přičemž 149 | já 150 | on 151 | ona 152 | ono 153 | oni 154 | ony 155 | my 156 | vy 157 | jí 158 | ji 159 | mě 160 | mne 161 | jemu 162 | tomu 163 | těm 164 | těmu 165 | němu 166 | němuž 167 | jehož 168 | jíž 169 | jelikož 170 | jež 171 | jakož 172 | načež 173 | 
-------------------------------------------------------------------------------- /ltr/years_as_ratings.py: -------------------------------------------------------------------------------- 1 | def get_classic_rating(year): 2 | if year > 2010: 3 | return 0 4 | elif year > 1990: 5 | return 1 6 | elif year > 1970: 7 | return 2 8 | elif year > 1950: 9 | return 3 10 | else: 11 | return 4 12 | 13 | def get_latest_rating(year): 14 | if year > 2010: 15 | return 4 16 | elif year > 1990: 17 | return 3 18 | elif year > 1970: 19 | return 2 20 | elif year > 1950: 21 | return 1 22 | else: 23 | return 0 24 | 25 | def synthesize( 26 | client, 27 | featureSet='release', 28 | latestTrainingSetOut='data/latest-training.txt', 29 | classicTrainingSetOut='data/classic-training.txt' 30 | ): 31 | from ltr.judgments import judgments_to_file, Judgment 32 | NO_ZERO = False 33 | 34 | resp = client.log_query('tmdb', 'release', None) 35 | 36 | # A classic film fan 37 | judgments = [] 38 | print("Generating 'classic' biased judgments:") 39 | for hit in resp: 40 | rating = get_classic_rating(hit['ltr_features'][0]) 41 | 42 | if rating == 0 and NO_ZERO: 43 | continue 44 | 45 | judgments.append(Judgment(qid=1,docId=hit['id'],grade=rating,features=hit['ltr_features'],keywords='')) 46 | 47 | 48 | with open(classicTrainingSetOut, 'w') as out: 49 | judgments_to_file(out, judgments) 50 | 51 | # A current film fan 52 | judgments = [] 53 | print("Generating 'recent' biased judgments:") 54 | for hit in resp: 55 | rating = get_latest_rating(hit['ltr_features'][0]) 56 | 57 | if rating == 0 and NO_ZERO: 58 | continue 59 | 60 | judgments.append(Judgment(qid=1,docId=hit['id'],grade=rating,features=hit['ltr_features'],keywords='')) 61 | 62 | 63 | with open(latestTrainingSetOut, 'w') as out: 64 | judgments_to_file(out, judgments) 65 | 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /ltr/search.py: 
-------------------------------------------------------------------------------- 1 | import re 2 | 3 | baseEsQuery = { 4 | "size": 5, 5 | "query": { 6 | "sltr": { 7 | "params": { 8 | "keywords": "", 9 | }, 10 | "model": "" 11 | } 12 | } 13 | } 14 | 15 | def esLtrQuery(keywords, modelName): 16 | import json 17 | baseEsQuery['query']['sltr']['params']['keywords'] = keywords 18 | baseEsQuery['query']['sltr']['params']['keywordsList'] = [keywords] # Needed by TSQ for now 19 | baseEsQuery['query']['sltr']['model'] = modelName 20 | print("%s" % json.dumps(baseEsQuery)) 21 | return baseEsQuery 22 | 23 | # TODO: Parse params and add efi dynamically instead of adding manually to query below 24 | def solrLtrQuery(keywords, modelName): 25 | keywords = re.sub('([^\s\w]|_)+', '', keywords) 26 | fuzzy_keywords = ' '.join([x + '~' for x in keywords.split(' ')]) 27 | 28 | return { 29 | 'fl': '*,score', 30 | 'rows': 5, 31 | 'q': '{{!ltr reRankDocs=30000 model={} efi.keywords="{}" efi.fuzzy_keywords="{}"}}'.format(modelName, keywords, fuzzy_keywords) 32 | } 33 | 34 | 35 | tmdbFields = { 36 | 'title': 'title', 37 | 'display_fields': ['release_year', 'genres', 'overview'] 38 | } 39 | 40 | 41 | 42 | def search(client, keywords, modelName, index='tmdb', fields=tmdbFields): 43 | if client.name() == 'elastic': 44 | results = client.query(index, esLtrQuery(keywords, modelName)) 45 | else: 46 | q = solrLtrQuery(keywords, modelName) 47 | print(q) 48 | results = client.query(index, q) 49 | 50 | ti = fields['title'] 51 | 52 | for result in results: 53 | print("%s " % (result[ti] if ti in result else 'N/A')) 54 | print("%s " % (result['_score'])) 55 | 56 | for df in fields['display_fields']: 57 | print("%s " % (result[df] if df in result else 'N/A')) 58 | 59 | print("---------------------------------------") 60 | -------------------------------------------------------------------------------- /ltr/helpers/msmarco/evaluate.py: 
-------------------------------------------------------------------------------- 1 | import csv 2 | import gzip 3 | 4 | 5 | class QRel(): 6 | 7 | def __init__(self, qid, docid, keywords): 8 | self.qid=qid 9 | self.docid=docid 10 | self.keywords = keywords 11 | 12 | def eval_rr(self, doc_ranking): 13 | """ Evaluate the provided doc ranking using reciprical rank 14 | (1/rank of the expected doc) 15 | 16 | returns 0 if this qrels doc id is missing 17 | """ 18 | 19 | for rank, docid in enumerate(doc_ranking, start=1): 20 | if docid == self.docid: 21 | return 1.0 / rank 22 | return 0.0 23 | 24 | @staticmethod 25 | def read_qrels(qrels_fname='data/msmarco-doctrain-qrels.tsv.gz', 26 | queries_fname='data/msmarco-doctrain-queries.tsv.gz'): 27 | 28 | qids_to_keywords = QRel.get_keyword_lookup(queries_fname) 29 | 30 | with gzip.open(qrels_fname, 'rt') as f: 31 | reader = csv.reader(f, delimiter=' ') 32 | for row in reader: 33 | qid = row[0] 34 | keywords = None 35 | if qid in qids_to_keywords: 36 | keywords = qids_to_keywords[qid] 37 | else: 38 | print("Missing keywords for %s" % qid) 39 | yield QRel(qid=row[0], docid=row[2], keywords=keywords) 40 | 41 | @staticmethod 42 | def get_keyword_lookup(fname='data/msmarco-doctrain-queries.tsv.gz'): 43 | qids_to_keywords = {} 44 | with gzip.open(fname, 'rt') as f: 45 | reader = csv.reader(f, delimiter='\t') 46 | for row in reader: 47 | qids_to_keywords[row[0]] = row[1] 48 | return qids_to_keywords 49 | 50 | def __str__(self): 51 | return "qid:%s(%s) => doc:%s" % (self.qid, self.keywords, self.docid) 52 | 53 | 54 | if __name__ == "__main__": 55 | qrels = {} 56 | for qrel in QRel.read_qrels(): 57 | qrels[qrel.qid] = qrel 58 | 59 | print(qrels['1185869'].eval_rr(['1','1'])) 60 | 61 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_ja.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file defines a 
stopword set for Japanese. 3 | # 4 | # This set is made up of hand-picked frequent terms from segmented Japanese Wikipedia. 5 | # Punctuation characters and frequent kanji have mostly been left out. See LUCENE-3745 6 | # for frequency lists, etc. that can be useful for making your own set (if desired) 7 | # 8 | # Note that there is an overlap between these stopwords and the terms stopped when used 9 | # in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note 10 | # that comments are not allowed on the same line as stopwords. 11 | # 12 | # Also note that stopping is done in a case-insensitive manner. Change your StopFilter 13 | # configuration if you need case-sensitive stopping. Lastly, note that stopping is done 14 | # using the same character width as the entries in this file. Since this StopFilter is 15 | # normally done after a CJKWidthFilter in your chain, you would usually want your romaji 16 | # entries to be in half-width and your kana entries to be in full-width. 
class Model():
    """Holds per-(query, doc) attractiveness estimates; pairs never observed
    default to 0.5."""
    def __init__(self):
        # Attractiveness per query-doc
        self.attracts = defaultdict(lambda: 0.5)


def cascade_model(sessions):
    """ Cascading model can be solved directly:
        - sessions with skips count against a doc
        - sessions with clicks count for
        - stop at first click
    """
    examined = Counter()   # times a (query, doc) was seen before/at first click
    clicked = Counter()    # times a (query, doc) received the first click
    model = Model()

    for sess in sessions:
        for result in sess.docs:
            key = (sess.query, result.doc_id)
            examined[key] += 1
            if result.click:
                # Only the first click counts in the cascade model; results
                # past it are treated as never examined.
                clicked[key] += 1
                break

    for key, times_seen in examined.items():
        model.attracts[key] = clicked[key] / times_seen
    return model
class Doc:
    """A single result in a search session: doc id, whether it was clicked,
    and an optional conversion marker/value."""

    def __init__(self, click, doc_id, conversion=False):
        self.click = click
        self.doc_id = doc_id
        self.conversion = conversion

    def __repr__(self):
        return "Doc(doc_id=%s, click=%s, conversion=%s)" % (self.doc_id, self.click, self.conversion)

    def __str__(self):
        return "(%s, %s, %s)" % (self.doc_id, self.click, self.conversion)


class Session:
    """One query plus its ranked result docs; each doc id may appear once."""

    def __init__(self, query, docs):
        self.query = query
        self.docs = docs
        # Reject result lists that mention the same doc twice
        seen = set()
        for result in docs:
            if result.doc_id in seen:
                raise ValueError("A session may only list a doc exactly once in search results")
            seen.add(result.doc_id)

    def __repr__(self):
        return "Session(query=%s, docs=%s)" % (self.query, self.docs)

    def __str__(self):
        return "(%s, (%s))" % (self.query, self.docs)


def build_one(sess_tuple):
    """ Take a tuple where
         0th item is query (a string that uniquely identifies it)
         1st item is a list of docs, with clicks
           and optionally a conversion id or true/false


        ('A', ((1, True), (2, False), (3, True), (0, False))),

        alternatively a value can be attached to the doc

        ('A', ((1, True, 0.9), (2, False, 0.8), (3, True, 1.0), (0, False))),
    """
    query, doc_tuples = sess_tuple[0], sess_tuple[1]
    docs = [Doc(doc_id=tup[0],
                click=tup[1],
                # Third element, when present, carries the conversion.
                conversion=tup[2] if len(tup) > 2 else False)
            for tup in doc_tuples]
    return Session(query=query, docs=docs)


def build(sess_tuples):
    """Build a list of Sessions from a list of session tuples."""
    return [build_one(tup) for tup in sess_tuples]
This keeps the examples agnostic about 8 | which backend is being used, but the implementations of each client 9 | should be useful references to those getting started with LTR on 10 | their specific platform 11 | ''' 12 | class BaseClient(ABC): 13 | @abstractmethod 14 | def get_host(self): 15 | pass 16 | 17 | @abstractmethod 18 | def name(self): 19 | pass 20 | 21 | @abstractmethod 22 | def delete_index(self, index): 23 | pass 24 | 25 | @abstractmethod 26 | def create_index(self, index): 27 | pass 28 | 29 | @abstractmethod 30 | def index_documents(self, index, doc_src): 31 | pass 32 | 33 | @abstractmethod 34 | def reset_ltr(self, index): 35 | pass 36 | 37 | @abstractmethod 38 | def create_featureset(self, index, name, ftr_config): 39 | pass 40 | 41 | @abstractmethod 42 | def get_feature_name(self, config, ftr_idx): 43 | pass 44 | 45 | @abstractmethod 46 | def query(self, index, query): 47 | pass 48 | 49 | @abstractmethod 50 | def get_doc(self, doc_id): 51 | pass 52 | 53 | @abstractmethod 54 | def log_query(self, index, featureset, ids, params): 55 | pass 56 | 57 | @abstractmethod 58 | def submit_model(self, featureset, index, model_name, model_payload): 59 | pass 60 | 61 | @abstractmethod 62 | def submit_ranklib_model(self, featureset, index, model_name, model_payload): 63 | pass 64 | 65 | @abstractmethod 66 | def model_query(self, index, model, model_params, query): 67 | pass 68 | 69 | @abstractmethod 70 | def feature_set(self, index, name): 71 | """ Return a mapping of name/feature ordinal 72 | and the raw (search engine specific) feature list""" 73 | pass 74 | 75 | 76 | -------------------------------------------------------------------------------- /ltr/evaluate.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import plotly.graph_objs as go 5 | from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot 6 | 7 | def log_run(cmd): 8 | resp = os.popen(cmd).read() 9 | 
print(resp) 10 | 11 | def quiet_run(cmd): 12 | os.popen(cmd).read() 13 | 14 | def evaluate(mode): 15 | # Build the docker image 16 | if mode == 'elastic': 17 | cmd = 'docker build --no-cache -t ltr-rre rre/elastic/.' 18 | else: 19 | cmd = 'docker build --no-cache -t ltr-rre rre/solr/.' 20 | 21 | print('Building RRE image - This will take a while') 22 | quiet_run(cmd) 23 | 24 | # Remove and run a fresh docker image 25 | cmd = 'docker rm -f ltr-rre' 26 | quiet_run(cmd) 27 | 28 | cmd = 'docker run --name ltr-rre ltr-rre' 29 | print('Running evaluation') 30 | log_run(cmd) 31 | 32 | # Copy out reports 33 | cmd = 'docker cp ltr-rre:/rre/target/rre/evaluation.json data/rre-evaluation.json' 34 | log_run(cmd) 35 | 36 | cmd = 'docker cp ltr-rre:/rre/target/site/rre-report.xlsx data/rre-report.xlsx' 37 | log_run(cmd) 38 | 39 | print('RRE Evaluation complete') 40 | 41 | 42 | def rre_table(): 43 | init_notebook_mode(connected=True) 44 | 45 | with open('data/rre-evaluation.json') as src: 46 | report = json.load(src) 47 | metrics = report['metrics'] 48 | 49 | experiments = ['baseline', 'classic', 'latest'] 50 | precisions = [] 51 | recalls = [] 52 | errs = [] 53 | 54 | for exp in experiments: 55 | precisions.append(metrics['P']['versions'][exp]['value']) 56 | recalls.append(metrics['R']['versions'][exp]['value']) 57 | errs.append(metrics['ERR@30']['versions'][exp]['value']) 58 | 59 | trace = go.Table( 60 | header=dict(values=['', 'Precision', 'Recall', 'ERR'], fill = dict(color='#AAAAAA')), 61 | cells=dict(values=[ 62 | experiments, 63 | precisions, 64 | recalls, 65 | errs 66 | ]) 67 | ) 68 | 69 | data = [trace] 70 | iplot(data) 71 | 72 | -------------------------------------------------------------------------------- /ltr/helpers/butterfingers.py: -------------------------------------------------------------------------------- 1 | def butterfingers(text,prob=0.1,keyboard='qwerty'): 2 | import random 3 | 4 | """ taken from 5 | 
https://github.com/Decagon/butter-fingers/blob/master/butterfingers/butterfingers.py """ 6 | 7 | keyApprox = {} 8 | 9 | if keyboard == "qwerty": 10 | keyApprox['q'] = "qwasedzx" 11 | keyApprox['w'] = "wqesadrfcx" 12 | keyApprox['e'] = "ewrsfdqazxcvgt" 13 | keyApprox['r'] = "retdgfwsxcvgt" 14 | keyApprox['t'] = "tryfhgedcvbnju" 15 | keyApprox['y'] = "ytugjhrfvbnji" 16 | keyApprox['u'] = "uyihkjtgbnmlo" 17 | keyApprox['i'] = "iuojlkyhnmlp" 18 | keyApprox['o'] = "oipklujm" 19 | keyApprox['p'] = "plo['ik" 20 | 21 | keyApprox['a'] = "aqszwxwdce" 22 | keyApprox['s'] = "swxadrfv" 23 | keyApprox['d'] = "decsfaqgbv" 24 | keyApprox['f'] = "fdgrvwsxyhn" 25 | keyApprox['g'] = "gtbfhedcyjn" 26 | keyApprox['h'] = "hyngjfrvkim" 27 | keyApprox['j'] = "jhknugtblom" 28 | keyApprox['k'] = "kjlinyhn" 29 | keyApprox['l'] = "lokmpujn" 30 | 31 | keyApprox['z'] = "zaxsvde" 32 | keyApprox['x'] = "xzcsdbvfrewq" 33 | keyApprox['c'] = "cxvdfzswergb" 34 | keyApprox['v'] = "vcfbgxdertyn" 35 | keyApprox['b'] = "bvnghcftyun" 36 | keyApprox['n'] = "nbmhjvgtuik" 37 | keyApprox['m'] = "mnkjloik" 38 | keyApprox[' '] = " " 39 | else: 40 | print("Keyboard not supported.") 41 | 42 | probOfTypo = int(prob * 100) 43 | 44 | buttertext = "" 45 | for letter in text: 46 | lcletter = letter.lower() 47 | if not lcletter in keyApprox.keys(): 48 | newletter = lcletter 49 | else: 50 | if random.choice(range(0, 100)) <= probOfTypo: 51 | newletter = random.choice(keyApprox[lcletter]) 52 | else: 53 | newletter = lcletter 54 | # go back to original case 55 | if not lcletter == letter: 56 | newletter = newletter.upper() 57 | buttertext += newletter 58 | 59 | return buttertext 60 | 61 | 62 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_lv.txt: -------------------------------------------------------------------------------- 1 | # Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins 2 | # the original list 
of over 800 forms was refined: 3 | # pronouns, adverbs, interjections were removed 4 | # 5 | # prepositions 6 | aiz 7 | ap 8 | ar 9 | apakš 10 | ārpus 11 | augšpus 12 | bez 13 | caur 14 | dēļ 15 | gar 16 | iekš 17 | iz 18 | kopš 19 | labad 20 | lejpus 21 | līdz 22 | no 23 | otrpus 24 | pa 25 | par 26 | pār 27 | pēc 28 | pie 29 | pirms 30 | pret 31 | priekš 32 | starp 33 | šaipus 34 | uz 35 | viņpus 36 | virs 37 | virspus 38 | zem 39 | apakšpus 40 | # Conjunctions 41 | un 42 | bet 43 | jo 44 | ja 45 | ka 46 | lai 47 | tomēr 48 | tikko 49 | turpretī 50 | arī 51 | kaut 52 | gan 53 | tādēļ 54 | tā 55 | ne 56 | tikvien 57 | vien 58 | kā 59 | ir 60 | te 61 | vai 62 | kamēr 63 | # Particles 64 | ar 65 | diezin 66 | droši 67 | diemžēl 68 | nebūt 69 | ik 70 | it 71 | taču 72 | nu 73 | pat 74 | tiklab 75 | iekšpus 76 | nedz 77 | tik 78 | nevis 79 | turpretim 80 | jeb 81 | iekam 82 | iekām 83 | iekāms 84 | kolīdz 85 | līdzko 86 | tiklīdz 87 | jebšu 88 | tālab 89 | tāpēc 90 | nekā 91 | itin 92 | jā 93 | jau 94 | jel 95 | nē 96 | nezin 97 | tad 98 | tikai 99 | vis 100 | tak 101 | iekams 102 | vien 103 | # modal verbs 104 | būt 105 | biju 106 | biji 107 | bija 108 | bijām 109 | bijāt 110 | esmu 111 | esi 112 | esam 113 | esat 114 | būšu 115 | būsi 116 | būs 117 | būsim 118 | būsiet 119 | tikt 120 | tiku 121 | tiki 122 | tika 123 | tikām 124 | tikāt 125 | tieku 126 | tiec 127 | tiek 128 | tiekam 129 | tiekat 130 | tikšu 131 | tiks 132 | tiksim 133 | tiksiet 134 | tapt 135 | tapi 136 | tapāt 137 | topat 138 | tapšu 139 | tapsi 140 | taps 141 | tapsim 142 | tapsiet 143 | kļūt 144 | kļuvu 145 | kļuvi 146 | kļuva 147 | kļuvām 148 | kļuvāt 149 | kļūstu 150 | kļūsti 151 | kļūst 152 | kļūstam 153 | kļūstat 154 | kļūšu 155 | kļūsi 156 | kļūs 157 | kļūsim 158 | kļūsiet 159 | # verbs 160 | varēt 161 | varēju 162 | varējām 163 | varēšu 164 | varēsim 165 | var 166 | varēji 167 | varējāt 168 | varēsi 169 | varēsiet 170 | varat 171 | varēja 172 | varēs 173 | 
-------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_lv.txt: -------------------------------------------------------------------------------- 1 | # Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins 2 | # the original list of over 800 forms was refined: 3 | # pronouns, adverbs, interjections were removed 4 | # 5 | # prepositions 6 | aiz 7 | ap 8 | ar 9 | apakš 10 | ārpus 11 | augšpus 12 | bez 13 | caur 14 | dēļ 15 | gar 16 | iekš 17 | iz 18 | kopš 19 | labad 20 | lejpus 21 | līdz 22 | no 23 | otrpus 24 | pa 25 | par 26 | pār 27 | pēc 28 | pie 29 | pirms 30 | pret 31 | priekš 32 | starp 33 | šaipus 34 | uz 35 | viņpus 36 | virs 37 | virspus 38 | zem 39 | apakšpus 40 | # Conjunctions 41 | un 42 | bet 43 | jo 44 | ja 45 | ka 46 | lai 47 | tomēr 48 | tikko 49 | turpretī 50 | arī 51 | kaut 52 | gan 53 | tādēļ 54 | tā 55 | ne 56 | tikvien 57 | vien 58 | kā 59 | ir 60 | te 61 | vai 62 | kamēr 63 | # Particles 64 | ar 65 | diezin 66 | droši 67 | diemžēl 68 | nebūt 69 | ik 70 | it 71 | taču 72 | nu 73 | pat 74 | tiklab 75 | iekšpus 76 | nedz 77 | tik 78 | nevis 79 | turpretim 80 | jeb 81 | iekam 82 | iekām 83 | iekāms 84 | kolīdz 85 | līdzko 86 | tiklīdz 87 | jebšu 88 | tālab 89 | tāpēc 90 | nekā 91 | itin 92 | jā 93 | jau 94 | jel 95 | nē 96 | nezin 97 | tad 98 | tikai 99 | vis 100 | tak 101 | iekams 102 | vien 103 | # modal verbs 104 | būt 105 | biju 106 | biji 107 | bija 108 | bijām 109 | bijāt 110 | esmu 111 | esi 112 | esam 113 | esat 114 | būšu 115 | būsi 116 | būs 117 | būsim 118 | būsiet 119 | tikt 120 | tiku 121 | tiki 122 | tika 123 | tikām 124 | tikāt 125 | tieku 126 | tiec 127 | tiek 128 | tiekam 129 | tiekat 130 | tikšu 131 | tiks 132 | tiksim 133 | tiksiet 134 | tapt 135 | tapi 136 | tapāt 137 | topat 138 | tapšu 139 | tapsi 140 | taps 141 | tapsim 142 | tapsiet 143 | kļūt 144 | kļuvu 145 | kļuvi 146 | kļuva 147 | kļuvām 148 | kļuvāt 149 | kļūstu 
def every_other_zipped(lst):
    """Pair consecutive elements: [k1, v1, k2, v2, ...] -> (k1, v1), (k2, v2), ...

    Note: a trailing odd element (a key with no value) is silently
    dropped by zip.
    """
    return zip(lst[0::2], lst[1::2])

def dictify(nl_tups):
    """Return a dict built from (key, value) tuples if all keys are
    unique, otherwise return the tuples unmodified (a dict would
    silently drop the duplicated keys)."""
    as_dict = dict(nl_tups)
    if len(as_dict) == len(nl_tups):
        return as_dict
    return nl_tups

def parse_named_list(lst):
    """Recursively convert a Solr "named list" (a flat
    [key, value, key, value, ...] list) into a dict, or a list of
    (key, value) tuples when keys repeat."""
    nl_as_tups = []

    # Directly unpack each (key, value) pair; no need to materialize
    # the generator into an intermediate list first.
    for key, value in every_other_zipped(lst):
        if isinstance(value, list):
            value = parse_named_list(value)
        nl_as_tups.append((key, value))
    return dictify(nl_as_tups)


def parse_termvect_namedlist(lst, field):
    """ Parse the named list and perform some transformations to create consistent
    JSON to parse

    Specifically changing {"positions": ...} to {"positions": [1234,4567]}

    """

    def listify_posns(posn_attrs):
        # A term with a single position parses to {'position': n};
        # multiple positions parse to [('position', n1), ('position', n2), ...]
        # because the key repeats. Normalize both to a plain list of ints.
        if isinstance(posn_attrs, dict):
            assert len(posn_attrs) == 1
            return [posn_attrs['position']]
        return [posn_attr[1] for posn_attr in posn_attrs]


    tv_parsed = parse_named_list(lst)
    for doc_id, doc_field_tv in tv_parsed.items():
        for field_name, term_vects in doc_field_tv.items():
            # Only rewrite positions for the requested field
            if field_name == field:
                for term, attrs in term_vects.items():
                    for attr_key, attr_val in attrs.items():
                        if attr_key == 'positions':
                            attrs['positions'] = listify_posns(attr_val)
    return tv_parsed



if __name__ == "__main__":
    solr_nl = [
        "D100000", [
            "uniqueKey", "D100000",
            "body", [
                "1", [
                    "positions", [
                        "position", 92,
                        "position", 113
                    ]],
                "2", [
                    "positions", [
                        "position", 22,
                        "position", 413
                    ]],
                "boo", [
                    "positions", [
                        "position", 22,
                    ]]
            ]]]
    print(repr(parse_termvect_namedlist(solr_nl, 'body')))
освен 125 | особено 126 | от 127 | отгоре 128 | отново 129 | още 130 | пак 131 | по 132 | повече 133 | повечето 134 | под 135 | поне 136 | поради 137 | после 138 | почти 139 | прави 140 | пред 141 | преди 142 | през 143 | при 144 | пък 145 | първо 146 | с 147 | са 148 | само 149 | се 150 | сега 151 | си 152 | скоро 153 | след 154 | сме 155 | според 156 | сред 157 | срещу 158 | сте 159 | съм 160 | със 161 | също 162 | т 163 | тази 164 | така 165 | такива 166 | такъв 167 | там 168 | твой 169 | те 170 | тези 171 | ти 172 | тн 173 | то 174 | това 175 | тогава 176 | този 177 | той 178 | толкова 179 | точно 180 | трябва 181 | тук 182 | тъй 183 | тя 184 | тях 185 | у 186 | харесва 187 | ч 188 | че 189 | често 190 | чрез 191 | ще 192 | щом 193 | я 194 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_bg.txt: -------------------------------------------------------------------------------- 1 | # This file was created by Jacques Savoy and is distributed under the BSD license. 2 | # See http://members.unine.ch/jacques.savoy/clef/index.html. 
3 | # Also see http://www.opensource.org/licenses/bsd-license.html 4 | а 5 | аз 6 | ако 7 | ала 8 | бе 9 | без 10 | беше 11 | би 12 | бил 13 | била 14 | били 15 | било 16 | близо 17 | бъдат 18 | бъде 19 | бяха 20 | в 21 | вас 22 | ваш 23 | ваша 24 | вероятно 25 | вече 26 | взема 27 | ви 28 | вие 29 | винаги 30 | все 31 | всеки 32 | всички 33 | всичко 34 | всяка 35 | във 36 | въпреки 37 | върху 38 | г 39 | ги 40 | главно 41 | го 42 | д 43 | да 44 | дали 45 | до 46 | докато 47 | докога 48 | дори 49 | досега 50 | доста 51 | е 52 | едва 53 | един 54 | ето 55 | за 56 | зад 57 | заедно 58 | заради 59 | засега 60 | затова 61 | защо 62 | защото 63 | и 64 | из 65 | или 66 | им 67 | има 68 | имат 69 | иска 70 | й 71 | каза 72 | как 73 | каква 74 | какво 75 | както 76 | какъв 77 | като 78 | кога 79 | когато 80 | което 81 | които 82 | кой 83 | който 84 | колко 85 | която 86 | къде 87 | където 88 | към 89 | ли 90 | м 91 | ме 92 | между 93 | мен 94 | ми 95 | мнозина 96 | мога 97 | могат 98 | може 99 | моля 100 | момента 101 | му 102 | н 103 | на 104 | над 105 | назад 106 | най 107 | направи 108 | напред 109 | например 110 | нас 111 | не 112 | него 113 | нея 114 | ни 115 | ние 116 | никой 117 | нито 118 | но 119 | някои 120 | някой 121 | няма 122 | обаче 123 | около 124 | освен 125 | особено 126 | от 127 | отгоре 128 | отново 129 | още 130 | пак 131 | по 132 | повече 133 | повечето 134 | под 135 | поне 136 | поради 137 | после 138 | почти 139 | прави 140 | пред 141 | преди 142 | през 143 | при 144 | пък 145 | първо 146 | с 147 | са 148 | само 149 | се 150 | сега 151 | си 152 | скоро 153 | след 154 | сме 155 | според 156 | сред 157 | срещу 158 | сте 159 | съм 160 | със 161 | също 162 | т 163 | тази 164 | така 165 | такива 166 | такъв 167 | там 168 | твой 169 | те 170 | тези 171 | ти 172 | тн 173 | то 174 | това 175 | тогава 176 | този 177 | той 178 | толкова 179 | точно 180 | трябва 181 | тук 182 | тъй 183 | тя 184 | тях 185 | у 186 | харесва 187 | ч 188 | че 189 | често 190 | 
sign = lambda a: (a>0) - (a<0)

def pairs_in_order(ranking, both_ways=True):
    """Yield (val1, val2, order) for every pair of items in `ranking`.

    `order` is +1 when val1 precedes val2, -1 otherwise. When
    `both_ways` is True each pair is also yielded reversed, with the
    opposite sign.
    """
    assert len(ranking) > 1
    for idx1, val1 in enumerate(ranking):
        for idx2, val2 in enumerate(ranking):
            if idx2 > idx1:
                yield val1, val2, sign(idx2-idx1)
                if both_ways:
                    yield val2, val1, sign(idx1-idx2)

def tau(rank1, rank2, at=4):
    """Kendall's tau between the top-`at` items of two rankings.

    Pairs of rank2 items absent from rank1's top-`at` count as
    discordant, so fully disjoint rankings score -1.

    Raises ValueError when either ranking is shorter than `at`.
    """
    rank1in = {}


    if len(rank1) < at or len(rank2) < at:
        raise ValueError("rankings must be larger than provided at param(%s)" % at)

    # Handle 1 as a special case: the single top item either matches or not
    if at == 1:
        if rank1[0] == rank2[0]:
            return 1
        return -1

    rank1 = rank1[:at]; rank2 = rank2[:at]

    # gather concordances/discords for rank1
    for val1, val2, order in pairs_in_order(rank1, both_ways=True):
        rank1in[(val1,val2)] = order

    # check rank2
    concords = 0
    discords = 0
    for val1, val2, order in pairs_in_order(rank2, both_ways=False):
        try:
            rank1order = rank1in[(val1,val2)]
            if order == rank1order:
                concords += 1
            else:
                discords += 1
        except KeyError:
            # Pair not present in rank1's top-`at`: treat as discordant
            discords += 1

    # Normalize by the number of unordered pairs: at*(at-1)/2
    return (concords - discords) / ((at * (at - 1)) / 2)

def avg_tau(rank1, rank2, at=4):
    """Average of tau at cutoffs 1..`at`; weights agreement near the
    top of the rankings more heavily than plain tau."""
    if len(rank1) < at or len(rank2) < at:
        raise ValueError("rankings must be larger than provided at param(%s)" % at)

    rank1 = rank1[:at]; rank2 = rank2[:at]

    tot = 0
    for i in range(1,at+1):
        tot += tau(rank1,rank2,at=i)
    return tot / (at)

if __name__ == "__main__":
    print(tau([1,2,3,4],[4,3,2,1]))
    print(tau([1,2,3,4],[1,2,3,4]))
    print(tau([1,2,4,3],[1,2,3,4]))
    print(tau([5,6,7,8],[1,2,3,4]))
    print(tau([1,2,3,5],[1,2,3,4]))
    print(tau([5,3,2,1],[4,3,2,1]))
    l1=[1,2,4,3]; l2=[1,2,3,4]; l3=[2,1,3,4]
    print("avg_tau(%s,%s,at=4) %s" % (l1, l1, avg_tau(l1,l1)))
    print("avg_tau(%s,%s,at=4) %s" % (l1, l2, avg_tau(l1,l2)))
    # bugfix: labels now match the arguments actually passed (was l2, l3)
    print("avg_tau(%s,%s,at=4) %s" % (l1, l3, avg_tau(l1,l3)))
    print("tau(%s,%s,at=4) %s" % (l1, l2, tau(l1,l2)))
    # bugfix: labels now match the arguments actually passed (was l2, l3)
    print("tau(%s,%s,at=4) %s" % (l1, l3, tau(l1,l3)))
e 59 | eh 60 | el 61 | els 62 | em 63 | en 64 | encara 65 | ens 66 | entre 67 | érem 68 | eren 69 | éreu 70 | es 71 | és 72 | esta 73 | està 74 | estàvem 75 | estaven 76 | estàveu 77 | esteu 78 | et 79 | etc 80 | ets 81 | fins 82 | fora 83 | gairebé 84 | ha 85 | han 86 | has 87 | havia 88 | he 89 | hem 90 | heu 91 | hi 92 | ho 93 | i 94 | igual 95 | iguals 96 | ja 97 | l'hi 98 | la 99 | les 100 | li 101 | li'n 102 | llavors 103 | m'he 104 | ma 105 | mal 106 | malgrat 107 | mateix 108 | mateixa 109 | mateixes 110 | mateixos 111 | me 112 | mentre 113 | més 114 | meu 115 | meus 116 | meva 117 | meves 118 | molt 119 | molta 120 | moltes 121 | molts 122 | mon 123 | mons 124 | n'he 125 | n'hi 126 | ne 127 | ni 128 | no 129 | nogensmenys 130 | només 131 | nosaltres 132 | nostra 133 | nostre 134 | nostres 135 | o 136 | oh 137 | oi 138 | on 139 | pas 140 | pel 141 | pels 142 | per 143 | però 144 | perquè 145 | poc 146 | poca 147 | pocs 148 | poques 149 | potser 150 | propi 151 | qual 152 | quals 153 | quan 154 | quant 155 | que 156 | què 157 | quelcom 158 | qui 159 | quin 160 | quina 161 | quines 162 | quins 163 | s'ha 164 | s'han 165 | sa 166 | semblant 167 | semblants 168 | ses 169 | seu 170 | seus 171 | seva 172 | seva 173 | seves 174 | si 175 | sobre 176 | sobretot 177 | sóc 178 | solament 179 | sols 180 | son 181 | són 182 | sons 183 | sota 184 | sou 185 | t'ha 186 | t'han 187 | t'he 188 | ta 189 | tal 190 | també 191 | tampoc 192 | tan 193 | tant 194 | tanta 195 | tantes 196 | teu 197 | teus 198 | teva 199 | teves 200 | ton 201 | tons 202 | tot 203 | tota 204 | totes 205 | tots 206 | un 207 | una 208 | unes 209 | uns 210 | us 211 | va 212 | vaig 213 | vam 214 | van 215 | vas 216 | veu 217 | vosaltres 218 | vostra 219 | vostre 220 | vostres 221 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_ca.txt: 
-------------------------------------------------------------------------------- 1 | # Catalan stopwords from http://github.com/vcl/cue.language (Apache 2 Licensed) 2 | a 3 | abans 4 | ací 5 | ah 6 | així 7 | això 8 | al 9 | als 10 | aleshores 11 | algun 12 | alguna 13 | algunes 14 | alguns 15 | alhora 16 | allà 17 | allí 18 | allò 19 | altra 20 | altre 21 | altres 22 | amb 23 | ambdós 24 | ambdues 25 | apa 26 | aquell 27 | aquella 28 | aquelles 29 | aquells 30 | aquest 31 | aquesta 32 | aquestes 33 | aquests 34 | aquí 35 | baix 36 | cada 37 | cadascú 38 | cadascuna 39 | cadascunes 40 | cadascuns 41 | com 42 | contra 43 | d'un 44 | d'una 45 | d'unes 46 | d'uns 47 | dalt 48 | de 49 | del 50 | dels 51 | des 52 | després 53 | dins 54 | dintre 55 | donat 56 | doncs 57 | durant 58 | e 59 | eh 60 | el 61 | els 62 | em 63 | en 64 | encara 65 | ens 66 | entre 67 | érem 68 | eren 69 | éreu 70 | es 71 | és 72 | esta 73 | està 74 | estàvem 75 | estaven 76 | estàveu 77 | esteu 78 | et 79 | etc 80 | ets 81 | fins 82 | fora 83 | gairebé 84 | ha 85 | han 86 | has 87 | havia 88 | he 89 | hem 90 | heu 91 | hi 92 | ho 93 | i 94 | igual 95 | iguals 96 | ja 97 | l'hi 98 | la 99 | les 100 | li 101 | li'n 102 | llavors 103 | m'he 104 | ma 105 | mal 106 | malgrat 107 | mateix 108 | mateixa 109 | mateixes 110 | mateixos 111 | me 112 | mentre 113 | més 114 | meu 115 | meus 116 | meva 117 | meves 118 | molt 119 | molta 120 | moltes 121 | molts 122 | mon 123 | mons 124 | n'he 125 | n'hi 126 | ne 127 | ni 128 | no 129 | nogensmenys 130 | només 131 | nosaltres 132 | nostra 133 | nostre 134 | nostres 135 | o 136 | oh 137 | oi 138 | on 139 | pas 140 | pel 141 | pels 142 | per 143 | però 144 | perquè 145 | poc 146 | poca 147 | pocs 148 | poques 149 | potser 150 | propi 151 | qual 152 | quals 153 | quan 154 | quant 155 | que 156 | què 157 | quelcom 158 | qui 159 | quin 160 | quina 161 | quines 162 | quins 163 | s'ha 164 | s'han 165 | sa 166 | semblant 167 | semblants 168 | ses 169 | seu 170 | seus 
171 | seva 172 | seva 173 | seves 174 | si 175 | sobre 176 | sobretot 177 | sóc 178 | solament 179 | sols 180 | son 181 | són 182 | sons 183 | sota 184 | sou 185 | t'ha 186 | t'han 187 | t'he 188 | ta 189 | tal 190 | també 191 | tampoc 192 | tan 193 | tant 194 | tanta 195 | tantes 196 | teu 197 | teus 198 | teva 199 | teves 200 | ton 201 | tons 202 | tot 203 | tota 204 | totes 205 | tots 206 | un 207 | una 208 | unes 209 | uns 210 | us 211 | va 212 | vaig 213 | vam 214 | van 215 | vas 216 | veu 217 | vosaltres 218 | vostra 219 | vostre 220 | vostres 221 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_tr.txt: -------------------------------------------------------------------------------- 1 | # Turkish stopwords from LUCENE-559 2 | # merged with the list from "Information Retrieval on Turkish Texts" 3 | # (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf) 4 | acaba 5 | altmış 6 | altı 7 | ama 8 | ancak 9 | arada 10 | aslında 11 | ayrıca 12 | bana 13 | bazı 14 | belki 15 | ben 16 | benden 17 | beni 18 | benim 19 | beri 20 | beş 21 | bile 22 | bin 23 | bir 24 | birçok 25 | biri 26 | birkaç 27 | birkez 28 | birşey 29 | birşeyi 30 | biz 31 | bize 32 | bizden 33 | bizi 34 | bizim 35 | böyle 36 | böylece 37 | bu 38 | buna 39 | bunda 40 | bundan 41 | bunlar 42 | bunları 43 | bunların 44 | bunu 45 | bunun 46 | burada 47 | çok 48 | çünkü 49 | da 50 | daha 51 | dahi 52 | de 53 | defa 54 | değil 55 | diğer 56 | diye 57 | doksan 58 | dokuz 59 | dolayı 60 | dolayısıyla 61 | dört 62 | edecek 63 | eden 64 | ederek 65 | edilecek 66 | ediliyor 67 | edilmesi 68 | ediyor 69 | eğer 70 | elli 71 | en 72 | etmesi 73 | etti 74 | ettiği 75 | ettiğini 76 | gibi 77 | göre 78 | halen 79 | hangi 80 | hatta 81 | hem 82 | henüz 83 | hep 84 | hepsi 85 | her 86 | herhangi 87 | herkesin 88 | hiç 89 | hiçbir 90 | için 91 | iki 92 | ile 93 | ilgili 94 | ise 95 | işte 96 | itibaren 97 | itibariyle 98 
| kadar 99 | karşın 100 | katrilyon 101 | kendi 102 | kendilerine 103 | kendini 104 | kendisi 105 | kendisine 106 | kendisini 107 | kez 108 | ki 109 | kim 110 | kimden 111 | kime 112 | kimi 113 | kimse 114 | kırk 115 | milyar 116 | milyon 117 | mu 118 | mü 119 | mı 120 | nasıl 121 | ne 122 | neden 123 | nedenle 124 | nerde 125 | nerede 126 | nereye 127 | niye 128 | niçin 129 | o 130 | olan 131 | olarak 132 | oldu 133 | olduğu 134 | olduğunu 135 | olduklarını 136 | olmadı 137 | olmadığı 138 | olmak 139 | olması 140 | olmayan 141 | olmaz 142 | olsa 143 | olsun 144 | olup 145 | olur 146 | olursa 147 | oluyor 148 | on 149 | ona 150 | ondan 151 | onlar 152 | onlardan 153 | onları 154 | onların 155 | onu 156 | onun 157 | otuz 158 | oysa 159 | öyle 160 | pek 161 | rağmen 162 | sadece 163 | sanki 164 | sekiz 165 | seksen 166 | sen 167 | senden 168 | seni 169 | senin 170 | siz 171 | sizden 172 | sizi 173 | sizin 174 | şey 175 | şeyden 176 | şeyi 177 | şeyler 178 | şöyle 179 | şu 180 | şuna 181 | şunda 182 | şundan 183 | şunları 184 | şunu 185 | tarafından 186 | trilyon 187 | tüm 188 | üç 189 | üzere 190 | var 191 | vardı 192 | ve 193 | veya 194 | ya 195 | yani 196 | yapacak 197 | yapılan 198 | yapılması 199 | yapıyor 200 | yapmak 201 | yaptı 202 | yaptığı 203 | yaptığını 204 | yaptıkları 205 | yedi 206 | yerine 207 | yetmiş 208 | yine 209 | yirmi 210 | yoksa 211 | yüz 212 | zaten 213 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_tr.txt: -------------------------------------------------------------------------------- 1 | # Turkish stopwords from LUCENE-559 2 | # merged with the list from "Information Retrieval on Turkish Texts" 3 | # (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf) 4 | acaba 5 | altmış 6 | altı 7 | ama 8 | ancak 9 | arada 10 | aslında 11 | ayrıca 12 | bana 13 | bazı 14 | belki 15 | ben 16 | benden 17 | beni 18 | benim 19 | beri 20 | beş 21 | bile 
22 | bin 23 | bir 24 | birçok 25 | biri 26 | birkaç 27 | birkez 28 | birşey 29 | birşeyi 30 | biz 31 | bize 32 | bizden 33 | bizi 34 | bizim 35 | böyle 36 | böylece 37 | bu 38 | buna 39 | bunda 40 | bundan 41 | bunlar 42 | bunları 43 | bunların 44 | bunu 45 | bunun 46 | burada 47 | çok 48 | çünkü 49 | da 50 | daha 51 | dahi 52 | de 53 | defa 54 | değil 55 | diğer 56 | diye 57 | doksan 58 | dokuz 59 | dolayı 60 | dolayısıyla 61 | dört 62 | edecek 63 | eden 64 | ederek 65 | edilecek 66 | ediliyor 67 | edilmesi 68 | ediyor 69 | eğer 70 | elli 71 | en 72 | etmesi 73 | etti 74 | ettiği 75 | ettiğini 76 | gibi 77 | göre 78 | halen 79 | hangi 80 | hatta 81 | hem 82 | henüz 83 | hep 84 | hepsi 85 | her 86 | herhangi 87 | herkesin 88 | hiç 89 | hiçbir 90 | için 91 | iki 92 | ile 93 | ilgili 94 | ise 95 | işte 96 | itibaren 97 | itibariyle 98 | kadar 99 | karşın 100 | katrilyon 101 | kendi 102 | kendilerine 103 | kendini 104 | kendisi 105 | kendisine 106 | kendisini 107 | kez 108 | ki 109 | kim 110 | kimden 111 | kime 112 | kimi 113 | kimse 114 | kırk 115 | milyar 116 | milyon 117 | mu 118 | mü 119 | mı 120 | nasıl 121 | ne 122 | neden 123 | nedenle 124 | nerde 125 | nerede 126 | nereye 127 | niye 128 | niçin 129 | o 130 | olan 131 | olarak 132 | oldu 133 | olduğu 134 | olduğunu 135 | olduklarını 136 | olmadı 137 | olmadığı 138 | olmak 139 | olması 140 | olmayan 141 | olmaz 142 | olsa 143 | olsun 144 | olup 145 | olur 146 | olursa 147 | oluyor 148 | on 149 | ona 150 | ondan 151 | onlar 152 | onlardan 153 | onları 154 | onların 155 | onu 156 | onun 157 | otuz 158 | oysa 159 | öyle 160 | pek 161 | rağmen 162 | sadece 163 | sanki 164 | sekiz 165 | seksen 166 | sen 167 | senden 168 | seni 169 | senin 170 | siz 171 | sizden 172 | sizi 173 | sizin 174 | şey 175 | şeyden 176 | şeyi 177 | şeyler 178 | şöyle 179 | şu 180 | şuna 181 | şunda 182 | şundan 183 | şunları 184 | şunu 185 | tarafından 186 | trilyon 187 | tüm 188 | üç 189 | üzere 190 | var 191 | vardı 192 | ve 193 | veya 194 
import json
from tqdm import tqdm

class Memoize:
    """Cache a function's results keyed by its positional arguments.

    Adapted from
    https://stackoverflow.com/questions/1988804/what-is-memoization-and-how-can-i-use-it-in-python"""
    def __init__(self, f):
        self.f = f
        self.memo = {}
    def __call__(self, *args):
        if args not in self.memo:
            self.memo[args] = self.f(*args)
        #Warning: You may wish to do a deepcopy here if returning objects
        return self.memo[args]

@Memoize
def load_movies(json_path):
    """Load the TMDB movie dump (a dict keyed by tmdb id); cached per path."""
    # Context manager closes the handle (json.load(open(...)) leaked it).
    with open(json_path) as f:
        return json.load(f)

def get_movie(tmdb_id, movies='data/tmdb.json'):
    """Return the raw TMDB movie dict for `tmdb_id` (int or str)."""
    movies = load_movies(movies)
    tmdb_id=str(tmdb_id)
    return movies[tmdb_id]

def noop(src_movie, base_doc):
    """Default `enrich` hook for indexable_movies: index the doc unchanged."""
    return base_doc


def indexable_movies(enrich=noop, movies='data/tmdb.json'):
    """ Generates TMDB movies, similar to how ES Bulk indexing
    uses a generator to generate bulk index/update actions

    enrich(src_movie, base_doc) may add or transform fields on each
    emitted doc. Movies missing a required attribute (KeyError) are
    skipped."""
    movies = load_movies(movies)
    for movieId, tmdbMovie in tqdm(movies.items(),total=len(movies)):
        try:
            releaseDate = None
            # bugfix: releaseYear was unbound (UnboundLocalError, not caught
            # by the KeyError handler) whenever release_date was absent.
            releaseYear = None
            if 'release_date' in tmdbMovie and len(tmdbMovie['release_date']) > 0:
                releaseDate = tmdbMovie['release_date']
                releaseYear = releaseDate[0:4]

            full_poster_path = ''
            if 'poster_path' in tmdbMovie and tmdbMovie['poster_path'] is not None and len(tmdbMovie['poster_path']) > 0:
                full_poster_path = 'https://image.tmdb.org/t/p/w185' + tmdbMovie['poster_path']

            base_doc = {'id': movieId,
                        'title': tmdbMovie['title'],
                        'overview': tmdbMovie['overview'],
                        'tagline': tmdbMovie['tagline'],
                        'directors': [director['name'] for director in tmdbMovie['directors']],
                        'cast': " ".join([castMember['name'] for castMember in tmdbMovie['cast']]),
                        'genres': [genre['name'] for genre in tmdbMovie['genres']],
                        'release_date': releaseDate,
                        'release_year': releaseYear,
                        'poster_path': full_poster_path,
                        'vote_average': float(tmdbMovie['vote_average']) if 'vote_average' in tmdbMovie else None,
                        'vote_count': int(tmdbMovie['vote_count']) if 'vote_count' in tmdbMovie else 0,
                        }
            yield enrich(tmdbMovie, base_doc)
        except KeyError: # Ignore any movies missing these attributes
            continue
3 | # Also see http://www.opensource.org/licenses/bsd-license.html 4 | acea 5 | aceasta 6 | această 7 | aceea 8 | acei 9 | aceia 10 | acel 11 | acela 12 | acele 13 | acelea 14 | acest 15 | acesta 16 | aceste 17 | acestea 18 | aceşti 19 | aceştia 20 | acolo 21 | acum 22 | ai 23 | aia 24 | aibă 25 | aici 26 | al 27 | ăla 28 | ale 29 | alea 30 | ălea 31 | altceva 32 | altcineva 33 | am 34 | ar 35 | are 36 | aş 37 | aşadar 38 | asemenea 39 | asta 40 | ăsta 41 | astăzi 42 | astea 43 | ăstea 44 | ăştia 45 | asupra 46 | aţi 47 | au 48 | avea 49 | avem 50 | aveţi 51 | azi 52 | bine 53 | bucur 54 | bună 55 | ca 56 | că 57 | căci 58 | când 59 | care 60 | cărei 61 | căror 62 | cărui 63 | cât 64 | câte 65 | câţi 66 | către 67 | câtva 68 | ce 69 | cel 70 | ceva 71 | chiar 72 | cînd 73 | cine 74 | cineva 75 | cît 76 | cîte 77 | cîţi 78 | cîtva 79 | contra 80 | cu 81 | cum 82 | cumva 83 | curând 84 | curînd 85 | da 86 | dă 87 | dacă 88 | dar 89 | datorită 90 | de 91 | deci 92 | deja 93 | deoarece 94 | departe 95 | deşi 96 | din 97 | dinaintea 98 | dintr 99 | dintre 100 | drept 101 | după 102 | ea 103 | ei 104 | el 105 | ele 106 | eram 107 | este 108 | eşti 109 | eu 110 | face 111 | fără 112 | fi 113 | fie 114 | fiecare 115 | fii 116 | fim 117 | fiţi 118 | iar 119 | ieri 120 | îi 121 | îl 122 | îmi 123 | împotriva 124 | în 125 | înainte 126 | înaintea 127 | încât 128 | încît 129 | încotro 130 | între 131 | întrucât 132 | întrucît 133 | îţi 134 | la 135 | lângă 136 | le 137 | li 138 | lîngă 139 | lor 140 | lui 141 | mă 142 | mâine 143 | mea 144 | mei 145 | mele 146 | mereu 147 | meu 148 | mi 149 | mine 150 | mult 151 | multă 152 | mulţi 153 | ne 154 | nicăieri 155 | nici 156 | nimeni 157 | nişte 158 | noastră 159 | noastre 160 | noi 161 | noştri 162 | nostru 163 | nu 164 | ori 165 | oricând 166 | oricare 167 | oricât 168 | orice 169 | oricînd 170 | oricine 171 | oricît 172 | oricum 173 | oriunde 174 | până 175 | pe 176 | pentru 177 | peste 178 | pînă 179 | poate 180 | pot 181 | 
prea 182 | prima 183 | primul 184 | prin 185 | printr 186 | sa 187 | să 188 | săi 189 | sale 190 | sau 191 | său 192 | se 193 | şi 194 | sînt 195 | sîntem 196 | sînteţi 197 | spre 198 | sub 199 | sunt 200 | suntem 201 | sunteţi 202 | ta 203 | tăi 204 | tale 205 | tău 206 | te 207 | ţi 208 | ţie 209 | tine 210 | toată 211 | toate 212 | tot 213 | toţi 214 | totuşi 215 | tu 216 | un 217 | una 218 | unde 219 | undeva 220 | unei 221 | unele 222 | uneori 223 | unor 224 | vă 225 | vi 226 | voastră 227 | voastre 228 | voi 229 | voştri 230 | vostru 231 | vouă 232 | vreo 233 | vreun 234 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_ro.txt: -------------------------------------------------------------------------------- 1 | # This file was created by Jacques Savoy and is distributed under the BSD license. 2 | # See http://members.unine.ch/jacques.savoy/clef/index.html. 3 | # Also see http://www.opensource.org/licenses/bsd-license.html 4 | acea 5 | aceasta 6 | această 7 | aceea 8 | acei 9 | aceia 10 | acel 11 | acela 12 | acele 13 | acelea 14 | acest 15 | acesta 16 | aceste 17 | acestea 18 | aceşti 19 | aceştia 20 | acolo 21 | acum 22 | ai 23 | aia 24 | aibă 25 | aici 26 | al 27 | ăla 28 | ale 29 | alea 30 | ălea 31 | altceva 32 | altcineva 33 | am 34 | ar 35 | are 36 | aş 37 | aşadar 38 | asemenea 39 | asta 40 | ăsta 41 | astăzi 42 | astea 43 | ăstea 44 | ăştia 45 | asupra 46 | aţi 47 | au 48 | avea 49 | avem 50 | aveţi 51 | azi 52 | bine 53 | bucur 54 | bună 55 | ca 56 | că 57 | căci 58 | când 59 | care 60 | cărei 61 | căror 62 | cărui 63 | cât 64 | câte 65 | câţi 66 | către 67 | câtva 68 | ce 69 | cel 70 | ceva 71 | chiar 72 | cînd 73 | cine 74 | cineva 75 | cît 76 | cîte 77 | cîţi 78 | cîtva 79 | contra 80 | cu 81 | cum 82 | cumva 83 | curând 84 | curînd 85 | da 86 | dă 87 | dacă 88 | dar 89 | datorită 90 | de 91 | deci 92 | deja 93 | deoarece 94 | departe 95 | deşi 96 | din 97 
| dinaintea 98 | dintr 99 | dintre 100 | drept 101 | după 102 | ea 103 | ei 104 | el 105 | ele 106 | eram 107 | este 108 | eşti 109 | eu 110 | face 111 | fără 112 | fi 113 | fie 114 | fiecare 115 | fii 116 | fim 117 | fiţi 118 | iar 119 | ieri 120 | îi 121 | îl 122 | îmi 123 | împotriva 124 | în 125 | înainte 126 | înaintea 127 | încât 128 | încît 129 | încotro 130 | între 131 | întrucât 132 | întrucît 133 | îţi 134 | la 135 | lângă 136 | le 137 | li 138 | lîngă 139 | lor 140 | lui 141 | mă 142 | mâine 143 | mea 144 | mei 145 | mele 146 | mereu 147 | meu 148 | mi 149 | mine 150 | mult 151 | multă 152 | mulţi 153 | ne 154 | nicăieri 155 | nici 156 | nimeni 157 | nişte 158 | noastră 159 | noastre 160 | noi 161 | noştri 162 | nostru 163 | nu 164 | ori 165 | oricând 166 | oricare 167 | oricât 168 | orice 169 | oricînd 170 | oricine 171 | oricît 172 | oricum 173 | oriunde 174 | până 175 | pe 176 | pentru 177 | peste 178 | pînă 179 | poate 180 | pot 181 | prea 182 | prima 183 | primul 184 | prin 185 | printr 186 | sa 187 | să 188 | săi 189 | sale 190 | sau 191 | său 192 | se 193 | şi 194 | sînt 195 | sîntem 196 | sînteţi 197 | spre 198 | sub 199 | sunt 200 | suntem 201 | sunteţi 202 | ta 203 | tăi 204 | tale 205 | tău 206 | te 207 | ţi 208 | ţie 209 | tine 210 | toată 211 | toate 212 | tot 213 | toţi 214 | totuşi 215 | tu 216 | un 217 | una 218 | unde 219 | undeva 220 | unei 221 | unele 222 | uneori 223 | unor 224 | vă 225 | vi 226 | voastră 227 | voastre 228 | voi 229 | voştri 230 | vostru 231 | vouă 232 | vreo 233 | vreun 234 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_hu.txt: -------------------------------------------------------------------------------- 1 | | From svn.tartarus.org/snowball/trunk/website/algorithms/hungarian/stop.txt 2 | | This file is distributed under the BSD License. 
3 | | See http://snowball.tartarus.org/license.php 4 | | Also see http://www.opensource.org/licenses/bsd-license.html 5 | | - Encoding was converted to UTF-8. 6 | | - This notice was added. 7 | | 8 | | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" 9 | 10 | | Hungarian stop word list 11 | | prepared by Anna Tordai 12 | 13 | a 14 | ahogy 15 | ahol 16 | aki 17 | akik 18 | akkor 19 | alatt 20 | által 21 | általában 22 | amely 23 | amelyek 24 | amelyekben 25 | amelyeket 26 | amelyet 27 | amelynek 28 | ami 29 | amit 30 | amolyan 31 | amíg 32 | amikor 33 | át 34 | abban 35 | ahhoz 36 | annak 37 | arra 38 | arról 39 | az 40 | azok 41 | azon 42 | azt 43 | azzal 44 | azért 45 | aztán 46 | azután 47 | azonban 48 | bár 49 | be 50 | belül 51 | benne 52 | cikk 53 | cikkek 54 | cikkeket 55 | csak 56 | de 57 | e 58 | eddig 59 | egész 60 | egy 61 | egyes 62 | egyetlen 63 | egyéb 64 | egyik 65 | egyre 66 | ekkor 67 | el 68 | elég 69 | ellen 70 | elő 71 | először 72 | előtt 73 | első 74 | én 75 | éppen 76 | ebben 77 | ehhez 78 | emilyen 79 | ennek 80 | erre 81 | ez 82 | ezt 83 | ezek 84 | ezen 85 | ezzel 86 | ezért 87 | és 88 | fel 89 | felé 90 | hanem 91 | hiszen 92 | hogy 93 | hogyan 94 | igen 95 | így 96 | illetve 97 | ill. 
98 | ill 99 | ilyen 100 | ilyenkor 101 | ison 102 | ismét 103 | itt 104 | jó 105 | jól 106 | jobban 107 | kell 108 | kellett 109 | keresztül 110 | keressünk 111 | ki 112 | kívül 113 | között 114 | közül 115 | legalább 116 | lehet 117 | lehetett 118 | legyen 119 | lenne 120 | lenni 121 | lesz 122 | lett 123 | maga 124 | magát 125 | majd 126 | majd 127 | már 128 | más 129 | másik 130 | meg 131 | még 132 | mellett 133 | mert 134 | mely 135 | melyek 136 | mi 137 | mit 138 | míg 139 | miért 140 | milyen 141 | mikor 142 | minden 143 | mindent 144 | mindenki 145 | mindig 146 | mint 147 | mintha 148 | mivel 149 | most 150 | nagy 151 | nagyobb 152 | nagyon 153 | ne 154 | néha 155 | nekem 156 | neki 157 | nem 158 | néhány 159 | nélkül 160 | nincs 161 | olyan 162 | ott 163 | össze 164 | ő 165 | ők 166 | őket 167 | pedig 168 | persze 169 | rá 170 | s 171 | saját 172 | sem 173 | semmi 174 | sok 175 | sokat 176 | sokkal 177 | számára 178 | szemben 179 | szerint 180 | szinte 181 | talán 182 | tehát 183 | teljes 184 | tovább 185 | továbbá 186 | több 187 | úgy 188 | ugyanis 189 | új 190 | újabb 191 | újra 192 | után 193 | utána 194 | utolsó 195 | vagy 196 | vagyis 197 | valaki 198 | valami 199 | valamint 200 | való 201 | vagyok 202 | van 203 | vannak 204 | volt 205 | voltam 206 | voltak 207 | voltunk 208 | vissza 209 | vele 210 | viszont 211 | volna 212 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_hu.txt: -------------------------------------------------------------------------------- 1 | | From svn.tartarus.org/snowball/trunk/website/algorithms/hungarian/stop.txt 2 | | This file is distributed under the BSD License. 3 | | See http://snowball.tartarus.org/license.php 4 | | Also see http://www.opensource.org/licenses/bsd-license.html 5 | | - Encoding was converted to UTF-8. 6 | | - This notice was added. 
7 | | 8 | | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" 9 | 10 | | Hungarian stop word list 11 | | prepared by Anna Tordai 12 | 13 | a 14 | ahogy 15 | ahol 16 | aki 17 | akik 18 | akkor 19 | alatt 20 | által 21 | általában 22 | amely 23 | amelyek 24 | amelyekben 25 | amelyeket 26 | amelyet 27 | amelynek 28 | ami 29 | amit 30 | amolyan 31 | amíg 32 | amikor 33 | át 34 | abban 35 | ahhoz 36 | annak 37 | arra 38 | arról 39 | az 40 | azok 41 | azon 42 | azt 43 | azzal 44 | azért 45 | aztán 46 | azután 47 | azonban 48 | bár 49 | be 50 | belül 51 | benne 52 | cikk 53 | cikkek 54 | cikkeket 55 | csak 56 | de 57 | e 58 | eddig 59 | egész 60 | egy 61 | egyes 62 | egyetlen 63 | egyéb 64 | egyik 65 | egyre 66 | ekkor 67 | el 68 | elég 69 | ellen 70 | elő 71 | először 72 | előtt 73 | első 74 | én 75 | éppen 76 | ebben 77 | ehhez 78 | emilyen 79 | ennek 80 | erre 81 | ez 82 | ezt 83 | ezek 84 | ezen 85 | ezzel 86 | ezért 87 | és 88 | fel 89 | felé 90 | hanem 91 | hiszen 92 | hogy 93 | hogyan 94 | igen 95 | így 96 | illetve 97 | ill. 
98 | ill 99 | ilyen 100 | ilyenkor 101 | ison 102 | ismét 103 | itt 104 | jó 105 | jól 106 | jobban 107 | kell 108 | kellett 109 | keresztül 110 | keressünk 111 | ki 112 | kívül 113 | között 114 | közül 115 | legalább 116 | lehet 117 | lehetett 118 | legyen 119 | lenne 120 | lenni 121 | lesz 122 | lett 123 | maga 124 | magát 125 | majd 126 | majd 127 | már 128 | más 129 | másik 130 | meg 131 | még 132 | mellett 133 | mert 134 | mely 135 | melyek 136 | mi 137 | mit 138 | míg 139 | miért 140 | milyen 141 | mikor 142 | minden 143 | mindent 144 | mindenki 145 | mindig 146 | mint 147 | mintha 148 | mivel 149 | most 150 | nagy 151 | nagyobb 152 | nagyon 153 | ne 154 | néha 155 | nekem 156 | neki 157 | nem 158 | néhány 159 | nélkül 160 | nincs 161 | olyan 162 | ott 163 | össze 164 | ő 165 | ők 166 | őket 167 | pedig 168 | persze 169 | rá 170 | s 171 | saját 172 | sem 173 | semmi 174 | sok 175 | sokat 176 | sokkal 177 | számára 178 | szemben 179 | szerint 180 | szinte 181 | talán 182 | tehát 183 | teljes 184 | tovább 185 | továbbá 186 | több 187 | úgy 188 | ugyanis 189 | új 190 | újabb 191 | újra 192 | után 193 | utána 194 | utolsó 195 | vagy 196 | vagyis 197 | valaki 198 | valami 199 | valamint 200 | való 201 | vagyok 202 | van 203 | vannak 204 | volt 205 | voltam 206 | voltak 207 | voltunk 208 | vissza 209 | vele 210 | viszont 211 | volna 212 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_hi.txt: -------------------------------------------------------------------------------- 1 | # Also see http://www.opensource.org/licenses/bsd-license.html 2 | # See http://members.unine.ch/jacques.savoy/clef/index.html. 3 | # This file was created by Jacques Savoy and is distributed under the BSD license. 
4 | # Note: by default this file also contains forms normalized by HindiNormalizer 5 | # for spelling variation (see section below), such that it can be used whether or 6 | # not you enable that feature. When adding additional entries to this list, 7 | # please add the normalized form as well. 8 | अंदर 9 | अत 10 | अपना 11 | अपनी 12 | अपने 13 | अभी 14 | आदि 15 | आप 16 | इत्यादि 17 | इन 18 | इनका 19 | इन्हीं 20 | इन्हें 21 | इन्हों 22 | इस 23 | इसका 24 | इसकी 25 | इसके 26 | इसमें 27 | इसी 28 | इसे 29 | उन 30 | उनका 31 | उनकी 32 | उनके 33 | उनको 34 | उन्हीं 35 | उन्हें 36 | उन्हों 37 | उस 38 | उसके 39 | उसी 40 | उसे 41 | एक 42 | एवं 43 | एस 44 | ऐसे 45 | और 46 | कई 47 | कर 48 | करता 49 | करते 50 | करना 51 | करने 52 | करें 53 | कहते 54 | कहा 55 | का 56 | काफ़ी 57 | कि 58 | कितना 59 | किन्हें 60 | किन्हों 61 | किया 62 | किर 63 | किस 64 | किसी 65 | किसे 66 | की 67 | कुछ 68 | कुल 69 | के 70 | को 71 | कोई 72 | कौन 73 | कौनसा 74 | गया 75 | घर 76 | जब 77 | जहाँ 78 | जा 79 | जितना 80 | जिन 81 | जिन्हें 82 | जिन्हों 83 | जिस 84 | जिसे 85 | जीधर 86 | जैसा 87 | जैसे 88 | जो 89 | तक 90 | तब 91 | तरह 92 | तिन 93 | तिन्हें 94 | तिन्हों 95 | तिस 96 | तिसे 97 | तो 98 | था 99 | थी 100 | थे 101 | दबारा 102 | दिया 103 | दुसरा 104 | दूसरे 105 | दो 106 | द्वारा 107 | न 108 | नहीं 109 | ना 110 | निहायत 111 | नीचे 112 | ने 113 | पर 114 | पर 115 | पहले 116 | पूरा 117 | पे 118 | फिर 119 | बनी 120 | बही 121 | बहुत 122 | बाद 123 | बाला 124 | बिलकुल 125 | भी 126 | भीतर 127 | मगर 128 | मानो 129 | मे 130 | में 131 | यदि 132 | यह 133 | यहाँ 134 | यही 135 | या 136 | यिह 137 | ये 138 | रखें 139 | रहा 140 | रहे 141 | ऱ्वासा 142 | लिए 143 | लिये 144 | लेकिन 145 | व 146 | वर्ग 147 | वह 148 | वह 149 | वहाँ 150 | वहीं 151 | वाले 152 | वुह 153 | वे 154 | वग़ैरह 155 | संग 156 | सकता 157 | सकते 158 | सबसे 159 | सभी 160 | साथ 161 | साबुत 162 | साभ 163 | सारा 164 | से 165 | सो 166 | ही 167 | हुआ 168 | हुई 169 | हुए 170 | है 171 | हैं 172 | हो 173 | होता 174 | होती 175 | होते 176 | होना 177 | होने 178 | # 
additional normalized forms of the above 179 | अपनि 180 | जेसे 181 | होति 182 | सभि 183 | तिंहों 184 | इंहों 185 | दवारा 186 | इसि 187 | किंहें 188 | थि 189 | उंहों 190 | ओर 191 | जिंहें 192 | वहिं 193 | अभि 194 | बनि 195 | हि 196 | उंहिं 197 | उंहें 198 | हें 199 | वगेरह 200 | एसे 201 | रवासा 202 | कोन 203 | निचे 204 | काफि 205 | उसि 206 | पुरा 207 | भितर 208 | हे 209 | बहि 210 | वहां 211 | कोइ 212 | यहां 213 | जिंहों 214 | तिंहें 215 | किसि 216 | कइ 217 | यहि 218 | इंहिं 219 | जिधर 220 | इंहें 221 | अदि 222 | इतयादि 223 | हुइ 224 | कोनसा 225 | इसकि 226 | दुसरे 227 | जहां 228 | अप 229 | किंहों 230 | उनकि 231 | भि 232 | वरग 233 | हुअ 234 | जेसा 235 | नहिं 236 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_hi.txt: -------------------------------------------------------------------------------- 1 | # Also see http://www.opensource.org/licenses/bsd-license.html 2 | # See http://members.unine.ch/jacques.savoy/clef/index.html. 3 | # This file was created by Jacques Savoy and is distributed under the BSD license. 4 | # Note: by default this file also contains forms normalized by HindiNormalizer 5 | # for spelling variation (see section below), such that it can be used whether or 6 | # not you enable that feature. When adding additional entries to this list, 7 | # please add the normalized form as well. 
8 | अंदर 9 | अत 10 | अपना 11 | अपनी 12 | अपने 13 | अभी 14 | आदि 15 | आप 16 | इत्यादि 17 | इन 18 | इनका 19 | इन्हीं 20 | इन्हें 21 | इन्हों 22 | इस 23 | इसका 24 | इसकी 25 | इसके 26 | इसमें 27 | इसी 28 | इसे 29 | उन 30 | उनका 31 | उनकी 32 | उनके 33 | उनको 34 | उन्हीं 35 | उन्हें 36 | उन्हों 37 | उस 38 | उसके 39 | उसी 40 | उसे 41 | एक 42 | एवं 43 | एस 44 | ऐसे 45 | और 46 | कई 47 | कर 48 | करता 49 | करते 50 | करना 51 | करने 52 | करें 53 | कहते 54 | कहा 55 | का 56 | काफ़ी 57 | कि 58 | कितना 59 | किन्हें 60 | किन्हों 61 | किया 62 | किर 63 | किस 64 | किसी 65 | किसे 66 | की 67 | कुछ 68 | कुल 69 | के 70 | को 71 | कोई 72 | कौन 73 | कौनसा 74 | गया 75 | घर 76 | जब 77 | जहाँ 78 | जा 79 | जितना 80 | जिन 81 | जिन्हें 82 | जिन्हों 83 | जिस 84 | जिसे 85 | जीधर 86 | जैसा 87 | जैसे 88 | जो 89 | तक 90 | तब 91 | तरह 92 | तिन 93 | तिन्हें 94 | तिन्हों 95 | तिस 96 | तिसे 97 | तो 98 | था 99 | थी 100 | थे 101 | दबारा 102 | दिया 103 | दुसरा 104 | दूसरे 105 | दो 106 | द्वारा 107 | न 108 | नहीं 109 | ना 110 | निहायत 111 | नीचे 112 | ने 113 | पर 114 | पर 115 | पहले 116 | पूरा 117 | पे 118 | फिर 119 | बनी 120 | बही 121 | बहुत 122 | बाद 123 | बाला 124 | बिलकुल 125 | भी 126 | भीतर 127 | मगर 128 | मानो 129 | मे 130 | में 131 | यदि 132 | यह 133 | यहाँ 134 | यही 135 | या 136 | यिह 137 | ये 138 | रखें 139 | रहा 140 | रहे 141 | ऱ्वासा 142 | लिए 143 | लिये 144 | लेकिन 145 | व 146 | वर्ग 147 | वह 148 | वह 149 | वहाँ 150 | वहीं 151 | वाले 152 | वुह 153 | वे 154 | वग़ैरह 155 | संग 156 | सकता 157 | सकते 158 | सबसे 159 | सभी 160 | साथ 161 | साबुत 162 | साभ 163 | सारा 164 | से 165 | सो 166 | ही 167 | हुआ 168 | हुई 169 | हुए 170 | है 171 | हैं 172 | हो 173 | होता 174 | होती 175 | होते 176 | होना 177 | होने 178 | # additional normalized forms of the above 179 | अपनि 180 | जेसे 181 | होति 182 | सभि 183 | तिंहों 184 | इंहों 185 | दवारा 186 | इसि 187 | किंहें 188 | थि 189 | उंहों 190 | ओर 191 | जिंहें 192 | वहिं 193 | अभि 194 | बनि 195 | हि 196 | उंहिं 197 | उंहें 198 | हें 199 | वगेरह 200 | एसे 201 | रवासा 202 | 
कोन 203 | निचे 204 | काफि 205 | उसि 206 | पुरा 207 | भितर 208 | हे 209 | बहि 210 | वहां 211 | कोइ 212 | यहां 213 | जिंहों 214 | तिंहें 215 | किसि 216 | कइ 217 | यहि 218 | इंहिं 219 | जिधर 220 | इंहें 221 | अदि 222 | इतयादि 223 | हुइ 224 | कोनसा 225 | इसकि 226 | दुसरे 227 | जहां 228 | अप 229 | किंहों 230 | उनकि 231 | भि 232 | वरग 233 | हुअ 234 | जेसा 235 | नहिं 236 | -------------------------------------------------------------------------------- /notebooks/elasticsearch/osc-blog/blog_settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "mappings": { 3 | "_source": { 4 | "enabled": true 5 | }, 6 | "properties": { 7 | "post_id": { 8 | "type": "long", 9 | "store": true 10 | }, 11 | "post_date": { 12 | "type": "date", 13 | "store": true 14 | }, 15 | "es_update_date": { 16 | "type": "date", 17 | "store": true 18 | }, 19 | "url": { 20 | "type": "text", 21 | "store": true 22 | }, 23 | "title": { 24 | "type": "text", 25 | "store": true, 26 | "analyzer": "content_analyzer", 27 | "fields": { 28 | "bigrams": { 29 | "type": "text", 30 | "analyzer": "content_bigrams" 31 | } 32 | } 33 | }, 34 | "author": { 35 | "type": "text", 36 | "store": true, 37 | "analyzer": "standard" 38 | }, 39 | "content": { 40 | "type": "text", 41 | "store": true, 42 | "analyzer": "content_analyzer", 43 | "fields": { 44 | "bigrams": { 45 | "type": "text", 46 | "analyzer": "content_bigrams" 47 | } 48 | } 49 | }, 50 | "excerpt": { 51 | "type": "text", 52 | "store": true, 53 | "analyzer": "content_analyzer" 54 | }, 55 | "categories": { 56 | "type": "text", 57 | "store": true, 58 | "analyzer": "content_analyzer" 59 | } 60 | } 61 | }, 62 | "settings": { 63 | "number_of_shards": 1, 64 | "number_of_replicas": 1, 65 | "analysis": { 66 | "filter": { 67 | "english_stemmer": { 68 | "type": "stemmer", 69 | "language": "english" 70 | }, 71 | "english_possessive_stemmer": { 72 | "type": "stemmer", 73 | "language": "possessive_english" 74 | }, 75 | "bigram": { 76 | 
"type": "shingle", 77 | "max_shingle_size": 2, 78 | "output_unigrams": false 79 | } 80 | }, 81 | "analyzer": { 82 | "content_analyzer": { 83 | "type": "custom", 84 | "char_filter": [ 85 | "html_strip" 86 | ], 87 | "filter": [ 88 | "english_possessive_stemmer", 89 | "lowercase", 90 | "english_stemmer" 91 | ], 92 | "tokenizer": "standard" 93 | }, 94 | "content_bigrams": { 95 | "type": "custom", 96 | "char_filter": [ 97 | "html_strip" 98 | ], 99 | "filter": [ 100 | "english_possessive_stemmer", 101 | "lowercase", 102 | "english_stemmer", 103 | "bigram" 104 | ], 105 | "tokenizer": "standard" 106 | } 107 | } 108 | } 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hello LTR :) 2 | 3 | The overall goal of this project is to demonstrate all of the steps required to work with LTR in Elasticsearch or Solr. There's two modes of running. Just running and editing notebooks in a docker container. Or local development (also requiring docker to run the search engine). 4 | 5 | ## No fuss setup: You just want to play with LTR 6 | 7 | Follow these steps if you're just playing around & are OK with possibly losing some work (all notebooks exist just in the docker container) 8 | 9 | With docker & docker-compose simply run 10 | 11 | ``` 12 | docker-compose up 13 | ``` 14 | 15 | at the root dir and go to town! 16 | 17 | This will run jupyter and all search engines in Docker containers. Check that each is up at the default ports: 18 | 19 | - Solr: [localhost:8983](localhost:8983) 20 | - Elasticsearch: [localhost:9200](localhost:9200) 21 | - Kibana: [localhost:5601](localhost:5601) 22 | - Jupyter: [localhost:8888](localhost:8888) 23 | 24 | ## You want to build your own LTR notebooks 25 | 26 | Follow these steps if you want to do more serious work with the notebooks. 
For example, if you want to build a demo with your work's data or something you want to preserve later. 27 | 28 | ### Run your search engine with Docker 29 | 30 | You probably just want to work with one search engine. So whichever one you're working with, launch that search engine in Docker. 31 | 32 | #### Running Solr w/ LTR 33 | 34 | Setup Solr with docker compose to work with just Solr examples: 35 | 36 | ``` 37 | cd notebooks/solr 38 | docker-compose up 39 | ``` 40 | 41 | #### Running Elasticsearch w/ LTR 42 | 43 | Setup Elasticsearch with docker compose to work with just Elasticsearch examples: 44 | 45 | ``` 46 | cd notebooks/elasticsearch 47 | docker-compose up 48 | ``` 49 | 50 | ### Run Jupyter locally w/ Python 3 and all prereqs 51 | 52 | #### Setup Python requirements 53 | 54 | - Ensure Python 3 is installed on your system 55 | - Create a virtual environment: `python3 -m venv venv` 56 | - Start the virtual environment: `source venv/bin/activate` 57 | - Check install tooling is up to date `python -m pip install -U pip wheel setuptools` 58 | - Install the requirements `pip install -r requirements.txt` 59 | 60 | __Note:__ The above commands should be run from the root folder of the project. 61 | 62 | #### Start Jupyter notebook and confirm operation 63 | 64 | - Run `jupyter notebook` 65 | - Browse to notebooks/{search\_engine}/{collection} 66 | - Open either the "hello-ltr (Solr)" or "hello-ltr (ES)" as appropriate and ensure you get a graph at the last cell 67 | 68 | ## Tests 69 | 70 | ### Automatically run everything... 71 | 72 | To run a full suite of tests, such as to verify a PR, you can simply run 73 | 74 | ./tests/test.sh 75 | 76 | Optionally with containers rebuilt 77 | 78 | ./tests/test.sh --rebuild-containers 79 | 80 | Failing tests will have their output in `tests/last_run.ipynb` 81 | 82 | ### While developing... 
83 | 84 | For more informal development: 85 | 86 | - Startup the Solr and ES Docker containers 87 | - Do your development 88 | - Run the command as needed: 89 | `python tests/run_most_nbs.py` 90 | - Tests fail if notebooks return any errors 91 | - The failing notebook will be stored at `tests/last_run.ipynb` 92 | -------------------------------------------------------------------------------- /notebooks/elasticsearch/tmdb/Dataframes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Dataframes\n", 8 | "\n", 9 | "Data frames are the central object of most data science workflows. This notebook shows some helper function that can assist you in creating them from judgements. The older non-dataframe way of passing data is in most of the example notebooks, so use this code anywhere you see that pattern.\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import ltr.judgments as judge" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "classic_training_set = [j for j in judge.judgments_from_file(open('data/classic-training.txt'))]" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "classic_df = judge.judgments_to_dataframe(classic_training_set)\n", 37 | "classic_df" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "### Plotting\n", 45 | "\n", 46 | "Is one of the main reasons dataframes are easier to work with. 
There are two helper functions to show the distribtion of grade (`plot_grades`) and relationship between features and grades (plot_features).\n", 47 | "\n", 48 | "You are encouraged to use whatever python plotting library you are most comformtable with, we have `matplotlib` and `plotnine` installed in the Docker image." 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "import ltr.p9_plots as plots\n", 58 | "plots.plot_grades(classic_df)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "classic_df_long = judge.judgments_dataframe_to_long(classic_df)\n", 68 | "classic_df_long" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "plots.plot_features(classic_df_long)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "plots.plot_features" 87 | ] 88 | } 89 | ], 90 | "metadata": { 91 | "kernelspec": { 92 | "display_name": "Python 3", 93 | "language": "python", 94 | "name": "python3" 95 | }, 96 | "language_info": { 97 | "codemirror_mode": { 98 | "name": "ipython", 99 | "version": 3 100 | }, 101 | "file_extension": ".py", 102 | "mimetype": "text/x-python", 103 | "name": "python", 104 | "nbconvert_exporter": "python", 105 | "pygments_lexer": "ipython3", 106 | "version": "3.8.2" 107 | } 108 | }, 109 | "nbformat": 4, 110 | "nbformat_minor": 2 111 | } 112 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_fi.txt: -------------------------------------------------------------------------------- 1 | | From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt 2 | | This file is distributed under the BSD License. 
3 | | See http://snowball.tartarus.org/license.php 4 | | Also see http://www.opensource.org/licenses/bsd-license.html 5 | | - Encoding was converted to UTF-8. 6 | | - This notice was added. 7 | | 8 | | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" 9 | 10 | | forms of BE 11 | 12 | olla 13 | olen 14 | olet 15 | on 16 | olemme 17 | olette 18 | ovat 19 | ole | negative form 20 | 21 | oli 22 | olisi 23 | olisit 24 | olisin 25 | olisimme 26 | olisitte 27 | olisivat 28 | olit 29 | olin 30 | olimme 31 | olitte 32 | olivat 33 | ollut 34 | olleet 35 | 36 | en | negation 37 | et 38 | ei 39 | emme 40 | ette 41 | eivät 42 | 43 | |Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans 44 | minä minun minut minua minussa minusta minuun minulla minulta minulle | I 45 | sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you 46 | hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she 47 | me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we 48 | te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you 49 | he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they 50 | 51 | tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this 52 | tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that 53 | se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it 54 | nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these 55 | nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those 56 | ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they 57 | 58 | kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who 59 | ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl) 60 | mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what 61 | mitkä | (pl) 62 | 63 | joka jonka 
jota jossa josta johon jolla jolta jolle jona joksi | who which 64 | jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl) 65 | 66 | | conjunctions 67 | 68 | että | that 69 | ja | and 70 | jos | if 71 | koska | because 72 | kuin | than 73 | mutta | but 74 | niin | so 75 | sekä | and 76 | sillä | for 77 | tai | or 78 | vaan | but 79 | vai | or 80 | vaikka | although 81 | 82 | 83 | | prepositions 84 | 85 | kanssa | with 86 | mukaan | according to 87 | noin | about 88 | poikki | across 89 | yli | over, across 90 | 91 | | other 92 | 93 | kun | when 94 | niin | so 95 | nyt | now 96 | itse | self 97 | 98 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_fi.txt: -------------------------------------------------------------------------------- 1 | | From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt 2 | | This file is distributed under the BSD License. 3 | | See http://snowball.tartarus.org/license.php 4 | | Also see http://www.opensource.org/licenses/bsd-license.html 5 | | - Encoding was converted to UTF-8. 6 | | - This notice was added. 
7 | | 8 | | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" 9 | 10 | | forms of BE 11 | 12 | olla 13 | olen 14 | olet 15 | on 16 | olemme 17 | olette 18 | ovat 19 | ole | negative form 20 | 21 | oli 22 | olisi 23 | olisit 24 | olisin 25 | olisimme 26 | olisitte 27 | olisivat 28 | olit 29 | olin 30 | olimme 31 | olitte 32 | olivat 33 | ollut 34 | olleet 35 | 36 | en | negation 37 | et 38 | ei 39 | emme 40 | ette 41 | eivät 42 | 43 | |Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans 44 | minä minun minut minua minussa minusta minuun minulla minulta minulle | I 45 | sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you 46 | hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she 47 | me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we 48 | te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you 49 | he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they 50 | 51 | tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this 52 | tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that 53 | se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it 54 | nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these 55 | nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those 56 | ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they 57 | 58 | kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who 59 | ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl) 60 | mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what 61 | mitkä | (pl) 62 | 63 | joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which 64 | jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl) 65 | 66 | | conjunctions 67 | 68 | että | 
from collections import Counter, defaultdict


class Model:
    """Learned parameters of the SDBN click model.

    Both distributions are keyed by (query, doc_id) tuples and fall back
    to a 0.1 prior for pairs that were never observed.
    """

    def __init__(self):
        # P(user satisfied | clicked) per query-doc pair
        self.satisfacts = defaultdict(lambda: 0.1)

        # P(user attracted enough to click | examined) per query-doc pair
        self.attracts = defaultdict(lambda: 0.1)


def reverse_enumerate(l):
    """Yield (index, item) pairs walking *l* from last to first."""
    return zip(range(len(l) - 1, -1, -1), reversed(l))


def sdbn(sessions):
    """Simplified Dynamic Bayesian Network click model.

    A simpler version of the much more complex Dynamic Bayesian Network
    that the authors say comes close to the accuracy of DBN. Most
    importantly, it can be solved directly and simply, without an EM
    learning process.

    Features of sdbn:
    - Attractiveness is any click, out of sessions where that document
      appears at or before the last click of the session
    - Satisfaction occurs when a doc is the last document clicked,
      out of all sessions where that document is clicked

    :param sessions: iterable of session objects, each with a ``query``
        attribute and a ``docs`` list whose items expose ``doc_id`` and
        ``click``
    :return: a :class:`Model` with per-(query, doc_id) estimates
    """
    model = Model()
    NO_CLICK = -1
    counts = Counter()       # sessions where doc appeared at/before last click
    clicks = Counter()       # clicks on the doc within those sessions
    last_clicks = Counter()  # sessions where the doc was the final click
    for session in sessions:
        last_click = NO_CLICK
        for rank, doc in reverse_enumerate(session.docs):
            # Scanning bottom-up, remember the last clicked rank; docs
            # below it never enter the counts (assumed not examined).
            if last_click == NO_CLICK and doc.click:
                last_click = rank

            if last_click != NO_CLICK:
                query_doc = (session.query, doc.doc_id)
                counts[query_doc] += 1

                if doc.click:
                    clicks[query_doc] += 1
                    if rank == last_click:
                        last_clicks[query_doc] += 1

    # For all meaningful sessions (where query_doc appears):
    #   attractiveness = clicks / num sessions examined
    #   satisfaction   = last clicks / sessions with clicks
    for query_doc, count in counts.items():
        model.attracts[query_doc] = clicks[query_doc] / count
        if query_doc in clicks:
            model.satisfacts[query_doc] = last_clicks[query_doc] / clicks[query_doc]
    return model


if __name__ == "__main__":
    # Demo-only dependency: imported here so this module can be imported
    # on its own without pulling in the rest of the ltr package.
    from ltr.clickmodels.session import build

    sessions = build([
        ('A', ((1, True), (2, False), (3, True), (0, False))),
        ('B', ((5, False), (2, True), (3, True), (0, False))),
        ('A', ((1, False), (2, False), (3, True), (0, False))),
        ('B', ((1, False), (2, False), (3, False), (9, True))),
        ('A', ((9, False), (2, False), (1, True), (0, True))),
        ('B', ((6, True), (2, False), (3, True), (1, False))),
        ('A', ((7, False), (4, True), (1, False), (3, False))),
        ('B', ((8, True), (2, False), (3, True), (1, False))),
        ('A', ((1, False), (4, True), (2, False), (3, False))),
        ('B', ((7, True), (4, False), (5, True), (1, True))),
    ])
    model = sdbn(sessions)
    print(model.attracts[('A', 1)])
    print(model.satisfacts[('A', 1)])
    print(model.attracts[('B', 1)])
    print(model.satisfacts[('B', 1)])
from .judgments import Judgment, judgments_to_file
from tqdm import tqdm

# qids for the two genres this synthetic judgment set covers;
# anything else maps to qid 0 ("not tracked")
_GENRE_QIDS = {"Science Fiction": 1, "Drama": 2}


def genreQid(genre):
    """Map a genre name to its query id (0 means the genre is not tracked)."""
    return _GENRE_QIDS.get(genre, 0)


def genreGrade(movie):
    """Grade a movie 0-4 for a simple training set, as if we were
    searching for its genre.

    Newer science fiction is considered better; older drama is
    considered better. Movies without a release year, or whose primary
    genre is neither of the two, get grade 0.
    """
    if 'release_year' in movie and movie['release_year'] is not None:
        releaseYear = int(movie['release_year'])
    else:
        return 0

    genre = movie['genres'][0]
    if genre == "Science Fiction":
        # Newer is better: walk the cutoffs from best grade down
        for grade, cutoff in ((4, 2015), (3, 2010), (2, 2000), (1, 1990)):
            if releaseYear > cutoff:
                return grade
        return 0

    if genre == "Drama":
        # Older is better: recent releases grade lowest
        for grade, cutoff in ((0, 1990), (1, 1970), (2, 1950), (3, 1930)):
            if releaseYear > cutoff:
                return grade
        return 4

    return 0


def synthesize(client, judgmentsOutFile='genre_by_date_judgments.txt', autoNegate=False):
    """Generate synthetic judgments for scifi & drama movies.

    Queries the ``tmdb`` index through *client*, grades every movie via
    :func:`genreGrade`, writes the judgments to *judgmentsOutFile*, and
    returns them as a list.

    :param client: search client; ``client.name()`` selects the query DSL
    :param judgmentsOutFile: path the judgment list is written to
    :param autoNegate: when True, also emit a grade-0 judgment for the
        opposite genre's qid for each graded movie
    """
    print('Generating judgments for scifi & drama movies')

    # Each engine speaks its own match-all query dialect
    if client.name() == 'elastic':
        params = {
            "query": {
                "match_all": {}
            },
            "size": 10000,
            "sort": [{"_id": "asc"}]
        }
    else:
        params = {
            "q": "*:*",
            "rows": 10000,
            "sort": "id ASC",
            "wt": 'json'
        }

    resp = client.query('tmdb', params)

    # Build judgments for each film
    judgments = []
    for movie in tqdm(resp):
        if 'genres' not in movie or len(movie['genres']) == 0:
            continue
        genre = movie['genres'][0]
        qid = genreQid(genre)
        if qid == 0:
            continue

        judgments.append(Judgment(qid=qid,
                                  grade=genreGrade(movie),
                                  docId=movie['id'],
                                  keywords=genre))

        if autoNegate:
            # This movie is good for its genre, but a bad result for the
            # opposite genre (genre is one of the two tracked ones here)
            negGenre = "Drama" if genre == "Science Fiction" else "Science Fiction"
            judgments.append(Judgment(qid=genreQid(negGenre),
                                      grade=0,
                                      docId=movie['id'],
                                      keywords=negGenre))

    with open(judgmentsOutFile, 'w') as f:
        judgments_to_file(f, judgmentsList=judgments)

    return judgments
| "keywords": keywords, 52 | "fuzzy_keywords": ' '.join([x + '~' for x in keywords.split(' ')]), 53 | "keywordsList": [keywords] # Needed by TSQ for the time being 54 | } 55 | 56 | res = self.client.log_query(self.index, self.feature_set, ids, params) 57 | 58 | # Add feature back to each judgment 59 | for doc in res: 60 | docId = str(doc['id']) 61 | features = doc['ltr_features'] 62 | featuresPerDoc[docId] = features 63 | numLeft -= BATCH_SIZE 64 | 65 | # Append features from search engine back to ranklib judgment list 66 | for judgment in judgments: 67 | try: 68 | features = featuresPerDoc[judgment.docId] # If KeyError, then we have a judgment but no movie in index 69 | judgment.features = features 70 | except KeyError: 71 | pass 72 | # print("Missing doc %s" % judgment.docId) 73 | 74 | # Return a paired down judgments if we are missing features for judgments 75 | training_set = [] 76 | discarded = [] 77 | for judgment in judgments: 78 | if self.drop_missing: 79 | if judgment.has_features(): 80 | training_set.append(judgment) 81 | else: 82 | discarded.append(judgment) 83 | else: 84 | training_set.append(judgment) 85 | # print("Discarded %s Keep %s" % (len(discarded), len(training_set))) 86 | self.logged.extend(training_set) 87 | return training_set, discarded 88 | -------------------------------------------------------------------------------- /notebooks/elasticsearch/.docker/es-docker/elasticsearch.yml: -------------------------------------------------------------------------------- 1 | # ======================== Elasticsearch Configuration ========================= 2 | # 3 | # NOTE: Elasticsearch comes with reasonable defaults for most settings. 4 | # Before you set out to tweak and tune the configuration, make sure you 5 | # understand what are you trying to accomplish and the consequences. 6 | # 7 | # The primary way of configuring a node is via this file. This template lists 8 | # the most important settings you may want to configure for a production cluster. 
9 | # 10 | # Please consult the documentation for further information on configuration options: 11 | # https://www.elastic.co/guide/en/elasticsearch/reference/index.html 12 | # 13 | # ---------------------------------- Cluster ----------------------------------- 14 | # 15 | # Use a descriptive name for your cluster: 16 | # 17 | #cluster.name: my-application 18 | # 19 | # ------------------------------------ Node ------------------------------------ 20 | # 21 | # Use a descriptive name for the node: 22 | # 23 | #node.name: node-1 24 | # 25 | # Add custom attributes to the node: 26 | # 27 | #node.attr.rack: r1 28 | # 29 | # ----------------------------------- Paths ------------------------------------ 30 | # 31 | # Path to directory where to store the data (separate multiple locations by comma): 32 | # 33 | #path.data: /path/to/data 34 | # 35 | # Path to log files: 36 | # 37 | #path.logs: /path/to/logs 38 | # 39 | # ----------------------------------- Memory ----------------------------------- 40 | # 41 | # Lock the memory on startup: 42 | # 43 | #bootstrap.memory_lock: true 44 | # 45 | # Make sure that the heap size is set to about half the memory available 46 | # on the system and that the owner of the process is allowed to use this 47 | # limit. 48 | # 49 | # Elasticsearch performs poorly when the system is swapping the memory. 50 | # 51 | # ---------------------------------- Network ----------------------------------- 52 | # 53 | # Set the bind address to a specific IP (IPv4 or IPv6): 54 | # 55 | #network.host: 192.168.0.1 56 | # 57 | # Set a custom port for HTTP: 58 | # 59 | #http.port: 9200 60 | # 61 | # For more information, consult the network module documentation. 
62 | # 63 | # --------------------------------- Discovery ---------------------------------- 64 | # 65 | # Pass an initial list of hosts to perform discovery when new node is started: 66 | # The default list of hosts is ["127.0.0.1", "[::1]"] 67 | # 68 | #discovery.zen.ping.unicast.hosts: ["host1", "host2"] 69 | # 70 | # Prevent the "split brain" by configuring the majority of nodes (total number of master-eligible nodes / 2 + 1): 71 | # 72 | #discovery.zen.minimum_master_nodes: 73 | # 74 | # For more information, consult the zen discovery module documentation. 75 | # 76 | # ---------------------------------- Gateway ----------------------------------- 77 | # 78 | # Block initial recovery after a full cluster restart until N nodes are started: 79 | # 80 | #gateway.recover_after_nodes: 3 81 | # 82 | # For more information, consult the gateway module documentation. 83 | # 84 | # ---------------------------------- Various ----------------------------------- 85 | # 86 | # Require explicit names when deleting indices: 87 | # 88 | #action.destructive_requires_name: true 89 | #http.cors.allow-origin: "/https?:\\/\\/(.*?\\.)?(quepid\\.com|splainer\\.io)/" 90 | http.cors.allow-origin: "/http?:.*/" 91 | #http.cors.allow-origin: /http?://localhost(:[0-9]+)?/ 92 | http.cors.enabled: true 93 | indices.query.bool.max_clause_count: 10240 94 | network.host: 0.0.0.0 95 | 96 | discovery.type: single-node 97 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_fa.txt: -------------------------------------------------------------------------------- 1 | # This file was created by Jacques Savoy and is distributed under the BSD license. 2 | # See http://members.unine.ch/jacques.savoy/clef/index.html. 
3 | # Also see http://www.opensource.org/licenses/bsd-license.html 4 | # Note: by default this file is used after normalization, so when adding entries 5 | # to this file, use the arabic 'ي' instead of 'ی' 6 | انان 7 | نداشته 8 | سراسر 9 | خياه 10 | ايشان 11 | وي 12 | تاكنون 13 | بيشتري 14 | دوم 15 | پس 16 | ناشي 17 | وگو 18 | يا 19 | داشتند 20 | سپس 21 | هنگام 22 | هرگز 23 | پنج 24 | نشان 25 | امسال 26 | ديگر 27 | گروهي 28 | شدند 29 | چطور 30 | ده 31 | و 32 | دو 33 | نخستين 34 | ولي 35 | چرا 36 | چه 37 | وسط 38 | ه 39 | كدام 40 | قابل 41 | يك 42 | رفت 43 | هفت 44 | همچنين 45 | در 46 | هزار 47 | بله 48 | بلي 49 | شايد 50 | اما 51 | شناسي 52 | گرفته 53 | دهد 54 | داشته 55 | دانست 56 | داشتن 57 | خواهيم 58 | ميليارد 59 | وقتيكه 60 | امد 61 | خواهد 62 | جز 63 | اورده 64 | شده 65 | بلكه 66 | خدمات 67 | شدن 68 | برخي 69 | نبود 70 | بسياري 71 | جلوگيري 72 | حق 73 | كردند 74 | نوعي 75 | بعري 76 | نكرده 77 | نظير 78 | نبايد 79 | بوده 80 | بودن 81 | داد 82 | اورد 83 | هست 84 | جايي 85 | شود 86 | دنبال 87 | داده 88 | بايد 89 | سابق 90 | هيچ 91 | همان 92 | انجا 93 | كمتر 94 | كجاست 95 | گردد 96 | كسي 97 | تر 98 | مردم 99 | تان 100 | دادن 101 | بودند 102 | سري 103 | جدا 104 | ندارند 105 | مگر 106 | يكديگر 107 | دارد 108 | دهند 109 | بنابراين 110 | هنگامي 111 | سمت 112 | جا 113 | انچه 114 | خود 115 | دادند 116 | زياد 117 | دارند 118 | اثر 119 | بدون 120 | بهترين 121 | بيشتر 122 | البته 123 | به 124 | براساس 125 | بيرون 126 | كرد 127 | بعضي 128 | گرفت 129 | توي 130 | اي 131 | ميليون 132 | او 133 | جريان 134 | تول 135 | بر 136 | مانند 137 | برابر 138 | باشيم 139 | مدتي 140 | گويند 141 | اكنون 142 | تا 143 | تنها 144 | جديد 145 | چند 146 | بي 147 | نشده 148 | كردن 149 | كردم 150 | گويد 151 | كرده 152 | كنيم 153 | نمي 154 | نزد 155 | روي 156 | قصد 157 | فقط 158 | بالاي 159 | ديگران 160 | اين 161 | ديروز 162 | توسط 163 | سوم 164 | ايم 165 | دانند 166 | سوي 167 | استفاده 168 | شما 169 | كنار 170 | داريم 171 | ساخته 172 | طور 173 | امده 174 | رفته 175 | نخست 176 | بيست 177 | نزديك 178 
| طي 179 | كنيد 180 | از 181 | انها 182 | تمامي 183 | داشت 184 | يكي 185 | طريق 186 | اش 187 | چيست 188 | روب 189 | نمايد 190 | گفت 191 | چندين 192 | چيزي 193 | تواند 194 | ام 195 | ايا 196 | با 197 | ان 198 | ايد 199 | ترين 200 | اينكه 201 | ديگري 202 | راه 203 | هايي 204 | بروز 205 | همچنان 206 | پاعين 207 | كس 208 | حدود 209 | مختلف 210 | مقابل 211 | چيز 212 | گيرد 213 | ندارد 214 | ضد 215 | همچون 216 | سازي 217 | شان 218 | مورد 219 | باره 220 | مرسي 221 | خويش 222 | برخوردار 223 | چون 224 | خارج 225 | شش 226 | هنوز 227 | تحت 228 | ضمن 229 | هستيم 230 | گفته 231 | فكر 232 | بسيار 233 | پيش 234 | براي 235 | روزهاي 236 | انكه 237 | نخواهد 238 | بالا 239 | كل 240 | وقتي 241 | كي 242 | چنين 243 | كه 244 | گيري 245 | نيست 246 | است 247 | كجا 248 | كند 249 | نيز 250 | يابد 251 | بندي 252 | حتي 253 | توانند 254 | عقب 255 | خواست 256 | كنند 257 | بين 258 | تمام 259 | همه 260 | ما 261 | باشند 262 | مثل 263 | شد 264 | اري 265 | باشد 266 | اره 267 | طبق 268 | بعد 269 | اگر 270 | صورت 271 | غير 272 | جاي 273 | بيش 274 | ريزي 275 | اند 276 | زيرا 277 | چگونه 278 | بار 279 | لطفا 280 | مي 281 | درباره 282 | من 283 | ديده 284 | همين 285 | گذاري 286 | برداري 287 | علت 288 | گذاشته 289 | هم 290 | فوق 291 | نه 292 | ها 293 | شوند 294 | اباد 295 | همواره 296 | هر 297 | اول 298 | خواهند 299 | چهار 300 | نام 301 | امروز 302 | مان 303 | هاي 304 | قبل 305 | كنم 306 | سعي 307 | تازه 308 | را 309 | هستند 310 | زير 311 | جلوي 312 | عنوان 313 | بود 314 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_fa.txt: -------------------------------------------------------------------------------- 1 | # This file was created by Jacques Savoy and is distributed under the BSD license. 2 | # See http://members.unine.ch/jacques.savoy/clef/index.html. 
3 | # Also see http://www.opensource.org/licenses/bsd-license.html 4 | # Note: by default this file is used after normalization, so when adding entries 5 | # to this file, use the arabic 'ي' instead of 'ی' 6 | انان 7 | نداشته 8 | سراسر 9 | خياه 10 | ايشان 11 | وي 12 | تاكنون 13 | بيشتري 14 | دوم 15 | پس 16 | ناشي 17 | وگو 18 | يا 19 | داشتند 20 | سپس 21 | هنگام 22 | هرگز 23 | پنج 24 | نشان 25 | امسال 26 | ديگر 27 | گروهي 28 | شدند 29 | چطور 30 | ده 31 | و 32 | دو 33 | نخستين 34 | ولي 35 | چرا 36 | چه 37 | وسط 38 | ه 39 | كدام 40 | قابل 41 | يك 42 | رفت 43 | هفت 44 | همچنين 45 | در 46 | هزار 47 | بله 48 | بلي 49 | شايد 50 | اما 51 | شناسي 52 | گرفته 53 | دهد 54 | داشته 55 | دانست 56 | داشتن 57 | خواهيم 58 | ميليارد 59 | وقتيكه 60 | امد 61 | خواهد 62 | جز 63 | اورده 64 | شده 65 | بلكه 66 | خدمات 67 | شدن 68 | برخي 69 | نبود 70 | بسياري 71 | جلوگيري 72 | حق 73 | كردند 74 | نوعي 75 | بعري 76 | نكرده 77 | نظير 78 | نبايد 79 | بوده 80 | بودن 81 | داد 82 | اورد 83 | هست 84 | جايي 85 | شود 86 | دنبال 87 | داده 88 | بايد 89 | سابق 90 | هيچ 91 | همان 92 | انجا 93 | كمتر 94 | كجاست 95 | گردد 96 | كسي 97 | تر 98 | مردم 99 | تان 100 | دادن 101 | بودند 102 | سري 103 | جدا 104 | ندارند 105 | مگر 106 | يكديگر 107 | دارد 108 | دهند 109 | بنابراين 110 | هنگامي 111 | سمت 112 | جا 113 | انچه 114 | خود 115 | دادند 116 | زياد 117 | دارند 118 | اثر 119 | بدون 120 | بهترين 121 | بيشتر 122 | البته 123 | به 124 | براساس 125 | بيرون 126 | كرد 127 | بعضي 128 | گرفت 129 | توي 130 | اي 131 | ميليون 132 | او 133 | جريان 134 | تول 135 | بر 136 | مانند 137 | برابر 138 | باشيم 139 | مدتي 140 | گويند 141 | اكنون 142 | تا 143 | تنها 144 | جديد 145 | چند 146 | بي 147 | نشده 148 | كردن 149 | كردم 150 | گويد 151 | كرده 152 | كنيم 153 | نمي 154 | نزد 155 | روي 156 | قصد 157 | فقط 158 | بالاي 159 | ديگران 160 | اين 161 | ديروز 162 | توسط 163 | سوم 164 | ايم 165 | دانند 166 | سوي 167 | استفاده 168 | شما 169 | كنار 170 | داريم 171 | ساخته 172 | طور 173 | امده 174 | رفته 175 | نخست 176 | بيست 177 | نزديك 178 
| طي 179 | كنيد 180 | از 181 | انها 182 | تمامي 183 | داشت 184 | يكي 185 | طريق 186 | اش 187 | چيست 188 | روب 189 | نمايد 190 | گفت 191 | چندين 192 | چيزي 193 | تواند 194 | ام 195 | ايا 196 | با 197 | ان 198 | ايد 199 | ترين 200 | اينكه 201 | ديگري 202 | راه 203 | هايي 204 | بروز 205 | همچنان 206 | پاعين 207 | كس 208 | حدود 209 | مختلف 210 | مقابل 211 | چيز 212 | گيرد 213 | ندارد 214 | ضد 215 | همچون 216 | سازي 217 | شان 218 | مورد 219 | باره 220 | مرسي 221 | خويش 222 | برخوردار 223 | چون 224 | خارج 225 | شش 226 | هنوز 227 | تحت 228 | ضمن 229 | هستيم 230 | گفته 231 | فكر 232 | بسيار 233 | پيش 234 | براي 235 | روزهاي 236 | انكه 237 | نخواهد 238 | بالا 239 | كل 240 | وقتي 241 | كي 242 | چنين 243 | كه 244 | گيري 245 | نيست 246 | است 247 | كجا 248 | كند 249 | نيز 250 | يابد 251 | بندي 252 | حتي 253 | توانند 254 | عقب 255 | خواست 256 | كنند 257 | بين 258 | تمام 259 | همه 260 | ما 261 | باشند 262 | مثل 263 | شد 264 | اري 265 | باشد 266 | اره 267 | طبق 268 | بعد 269 | اگر 270 | صورت 271 | غير 272 | جاي 273 | بيش 274 | ريزي 275 | اند 276 | زيرا 277 | چگونه 278 | بار 279 | لطفا 280 | مي 281 | درباره 282 | من 283 | ديده 284 | همين 285 | گذاري 286 | برداري 287 | علت 288 | گذاشته 289 | هم 290 | فوق 291 | نه 292 | ها 293 | شوند 294 | اباد 295 | همواره 296 | هر 297 | اول 298 | خواهند 299 | چهار 300 | نام 301 | امروز 302 | مان 303 | هاي 304 | قبل 305 | كنم 306 | سعي 307 | تازه 308 | را 309 | هستند 310 | زير 311 | جلوي 312 | عنوان 313 | بود 314 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/evaluation (Solr).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# evaluate (Solr Edition)\n", 8 | "\n", 9 | "**Note:** This lab requires hello-ltr be run first. You must have the TMDB data indexed and LTR models configured before proceeding." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "### RRE\n", 17 | "This lab makes use of the rated-ranking-evaluator [project](https://github.com/SeaseLtd/rated-ranking-evaluator) to carry out evaluations on our models from the hello-ltr lab.\n", 18 | "\n", 19 | "An RRE configuration requires the following:\n", 20 | "\n", 21 | "- configuration_sets\n", 22 | " - This tells RRE about the Solr/Elastic instance to use for each evaluation\n", 23 | "- corpora (Not required for this setup)\n", 24 | " - RRE supports indexing a snapshot of data for evaluations. For this lab we'll be using the data indexed previously.\n", 25 | "- ratings\n", 26 | " - This folder houses json files with queries and ratings to be evaluated\n", 27 | "- templates\n", 28 | " - The queries to be run by each configuration set\n", 29 | "- pom.xml\n", 30 | " - Maven project configuration, here you can configure what metrics are calculated by the evalauation and format of the report.\n", 31 | " \n", 32 | "Take a look at the rre folder in the hello-ltr to get a better idea of the project layout and structure." 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "### Ratings and Evaluation\n", 40 | "To get started with RRE we first need some ratings. For this example we're going to use a query for \"batman\" and we're going to say that newer films are better than older ones. 
We will setup 3 different configuration sets in RRE:\n", 41 | "\n", 42 | "- baseline (No LTR applied)\n", 43 | "- classic (Rescore with the `classic` LTR model)\n", 44 | "- latest (Rescore with the `latest` LTR model)\n", 45 | "\n", 46 | "The snippet below will kick off an evaluation in RRE" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "from ltr import evaluate\n", 56 | "evaluate('solr')" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "### Looking at the Results\n", 64 | "In this example we have rating data for every result in the Batman query and we're not adjusting matching so `Precision` and `Recall` are the expected value of 1. However, since we've altered the sorting of results with LTR we can see a lift in `ERR` as our higher rated documents are coming up closer to the top of the results." 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "from ltr import rre_table\n", 74 | "rre_table()" 75 | ] 76 | } 77 | ], 78 | "metadata": { 79 | "kernelspec": { 80 | "display_name": "Python 3", 81 | "language": "python", 82 | "name": "python3" 83 | }, 84 | "language_info": { 85 | "codemirror_mode": { 86 | "name": "ipython", 87 | "version": 3 88 | }, 89 | "file_extension": ".py", 90 | "mimetype": "text/x-python", 91 | "name": "python", 92 | "nbconvert_exporter": "python", 93 | "pygments_lexer": "ipython3", 94 | "version": "3.7.6" 95 | } 96 | }, 97 | "nbformat": 4, 98 | "nbformat_minor": 2 99 | } 100 | -------------------------------------------------------------------------------- /notebooks/elasticsearch/tmdb/evaluation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# evaluate (Elastic Edition)\n", 8 | 
"\n", 9 | "**Note:** This lab requires hello-ltr be run first. You must have the TMDB data indexed and LTR models configured before proceeding." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "### RRE\n", 17 | "This lab makes use of the rated-ranking-evaluator [project](https://github.com/SeaseLtd/rated-ranking-evaluator) to carry out evaluations on our models from the hello-ltr lab.\n", 18 | "\n", 19 | "An RRE configuration requires the following:\n", 20 | "\n", 21 | "- configuration_sets\n", 22 | " - This tells RRE about the Solr/Elastic instance to use for each evaluation\n", 23 | "- corpora (Not required for this setup)\n", 24 | " - RRE supports indexing a snapshot of data for evaluations. For this lab we'll be using the data indexed previously.\n", 25 | "- ratings\n", 26 | " - This folder houses json files with queries and ratings to be evaluated\n", 27 | "- templates\n", 28 | " - The queries to be run by each configuration set\n", 29 | "- pom.xml\n", 30 | " - Maven project configuration, here you can configure what metrics are calculated by the evalauation and format of the report.\n", 31 | " \n", 32 | "Take a look at the rre folder in the hello-ltr to get a better idea of the project layout and structure." 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "### Ratings and Evaluation\n", 40 | "To get started with RRE we first need some ratings. For this example we're going to use a query for \"batman\" and we're going to say that newer films are better than older ones. 
We will setup 3 different configuration sets in RRE:\n", 41 | "\n", 42 | "- baseline (No LTR applied)\n", 43 | "- classic (Rescore with the `classic` LTR model)\n", 44 | "- latest (Rescore with the `latest` LTR model)\n", 45 | "\n", 46 | "The snippet below will kick off an evaluation in RRE" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "from ltr import evaluate\n", 56 | "evaluate('elastic')" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "### Looking at the Results\n", 64 | "In this example we have rating data for every result in the Batman query and we're not adjusting matching so `Precision` and `Recall` are the expected value of 1. However, since we've altered the sorting of results with LTR we can see a lift in `ERR` as our higher rated documents are coming up closer to the top of the results." 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "from ltr import rre_table\n", 74 | "rre_table()" 75 | ] 76 | } 77 | ], 78 | "metadata": { 79 | "kernelspec": { 80 | "display_name": "Python 3", 81 | "language": "python", 82 | "name": "python3" 83 | }, 84 | "language_info": { 85 | "codemirror_mode": { 86 | "name": "ipython", 87 | "version": 3 88 | }, 89 | "file_extension": ".py", 90 | "mimetype": "text/x-python", 91 | "name": "python", 92 | "nbconvert_exporter": "python", 93 | "pygments_lexer": "ipython3", 94 | "version": "3.7.6" 95 | } 96 | }, 97 | "nbformat": 4, 98 | "nbformat_minor": 2 99 | } 100 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_da.txt: -------------------------------------------------------------------------------- 1 | | From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt 2 | | This file is distributed under the BSD 
License. 3 | | See http://snowball.tartarus.org/license.php 4 | | Also see http://www.opensource.org/licenses/bsd-license.html 5 | | - Encoding was converted to UTF-8. 6 | | - This notice was added. 7 | | 8 | | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" 9 | 10 | | A Danish stop word list. Comments begin with vertical bar. Each stop 11 | | word is at the start of a line. 12 | 13 | | This is a ranked list (commonest to rarest) of stopwords derived from 14 | | a large text sample. 15 | 16 | 17 | og | and 18 | i | in 19 | jeg | I 20 | det | that (dem. pronoun)/it (pers. pronoun) 21 | at | that (in front of a sentence)/to (with infinitive) 22 | en | a/an 23 | den | it (pers. pronoun)/that (dem. pronoun) 24 | til | to/at/for/until/against/by/of/into, more 25 | er | present tense of "to be" 26 | som | who, as 27 | på | on/upon/in/on/at/to/after/of/with/for, on 28 | de | they 29 | med | with/by/in, along 30 | han | he 31 | af | of/by/from/off/for/in/with/on, off 32 | for | at/for/to/from/by/of/ago, in front/before, because 33 | ikke | not 34 | der | who/which, there/those 35 | var | past tense of "to be" 36 | mig | me/myself 37 | sig | oneself/himself/herself/itself/themselves 38 | men | but 39 | et | a/an/one, one (number), someone/somebody/one 40 | har | present tense of "to have" 41 | om | round/about/for/in/a, about/around/down, if 42 | vi | we 43 | min | my 44 | havde | past tense of "to have" 45 | ham | him 46 | hun | she 47 | nu | now 48 | over | over/above/across/by/beyond/past/on/about, over/past 49 | da | then, when/as/since 50 | fra | from/off/since, off, since 51 | du | you 52 | ud | out 53 | sin | his/her/its/one's 54 | dem | them 55 | os | us/ourselves 56 | op | up 57 | man | you/one 58 | hans | his 59 | hvor | where 60 | eller | or 61 | hvad | what 62 | skal | must/shall etc. 63 | selv | myself/youself/herself/ourselves etc., even 64 | her | here 65 | alle | all/everyone/everybody etc. 
66 | vil | will (verb) 67 | blev | past tense of "to stay/to remain/to get/to become" 68 | kunne | could 69 | ind | in 70 | når | when 71 | være | present tense of "to be" 72 | dog | however/yet/after all 73 | noget | something 74 | ville | would 75 | jo | you know/you see (adv), yes 76 | deres | their/theirs 77 | efter | after/behind/according to/for/by/from, later/afterwards 78 | ned | down 79 | skulle | should 80 | denne | this 81 | end | than 82 | dette | this 83 | mit | my/mine 84 | også | also 85 | under | under/beneath/below/during, below/underneath 86 | have | have 87 | dig | you 88 | anden | other 89 | hende | her 90 | mine | my 91 | alt | everything 92 | meget | much/very, plenty of 93 | sit | his, her, its, one's 94 | sine | his, her, its, one's 95 | vor | our 96 | mod | against 97 | disse | these 98 | hvis | if 99 | din | your/yours 100 | nogle | some 101 | hos | by/at 102 | blive | be/become 103 | mange | many 104 | ad | by/through 105 | bliver | present tense of "to be/to become" 106 | hendes | her/hers 107 | været | be 108 | thi | for (conj) 109 | jer | you 110 | sådan | such, like this/like that 111 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_da.txt: -------------------------------------------------------------------------------- 1 | | From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt 2 | | This file is distributed under the BSD License. 3 | | See http://snowball.tartarus.org/license.php 4 | | Also see http://www.opensource.org/licenses/bsd-license.html 5 | | - Encoding was converted to UTF-8. 6 | | - This notice was added. 7 | | 8 | | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" 9 | 10 | | A Danish stop word list. Comments begin with vertical bar. Each stop 11 | | word is at the start of a line. 
12 | 13 | | This is a ranked list (commonest to rarest) of stopwords derived from 14 | | a large text sample. 15 | 16 | 17 | og | and 18 | i | in 19 | jeg | I 20 | det | that (dem. pronoun)/it (pers. pronoun) 21 | at | that (in front of a sentence)/to (with infinitive) 22 | en | a/an 23 | den | it (pers. pronoun)/that (dem. pronoun) 24 | til | to/at/for/until/against/by/of/into, more 25 | er | present tense of "to be" 26 | som | who, as 27 | på | on/upon/in/on/at/to/after/of/with/for, on 28 | de | they 29 | med | with/by/in, along 30 | han | he 31 | af | of/by/from/off/for/in/with/on, off 32 | for | at/for/to/from/by/of/ago, in front/before, because 33 | ikke | not 34 | der | who/which, there/those 35 | var | past tense of "to be" 36 | mig | me/myself 37 | sig | oneself/himself/herself/itself/themselves 38 | men | but 39 | et | a/an/one, one (number), someone/somebody/one 40 | har | present tense of "to have" 41 | om | round/about/for/in/a, about/around/down, if 42 | vi | we 43 | min | my 44 | havde | past tense of "to have" 45 | ham | him 46 | hun | she 47 | nu | now 48 | over | over/above/across/by/beyond/past/on/about, over/past 49 | da | then, when/as/since 50 | fra | from/off/since, off, since 51 | du | you 52 | ud | out 53 | sin | his/her/its/one's 54 | dem | them 55 | os | us/ourselves 56 | op | up 57 | man | you/one 58 | hans | his 59 | hvor | where 60 | eller | or 61 | hvad | what 62 | skal | must/shall etc. 63 | selv | myself/youself/herself/ourselves etc., even 64 | her | here 65 | alle | all/everyone/everybody etc. 
66 | vil | will (verb) 67 | blev | past tense of "to stay/to remain/to get/to become" 68 | kunne | could 69 | ind | in 70 | når | when 71 | være | present tense of "to be" 72 | dog | however/yet/after all 73 | noget | something 74 | ville | would 75 | jo | you know/you see (adv), yes 76 | deres | their/theirs 77 | efter | after/behind/according to/for/by/from, later/afterwards 78 | ned | down 79 | skulle | should 80 | denne | this 81 | end | than 82 | dette | this 83 | mit | my/mine 84 | også | also 85 | under | under/beneath/below/during, below/underneath 86 | have | have 87 | dig | you 88 | anden | other 89 | hende | her 90 | mine | my 91 | alt | everything 92 | meget | much/very, plenty of 93 | sit | his, her, its, one's 94 | sine | his, her, its, one's 95 | vor | our 96 | mod | against 97 | disse | these 98 | hvis | if 99 | din | your/yours 100 | nogle | some 101 | hos | by/at 102 | blive | be/become 103 | mange | many 104 | ad | by/through 105 | bliver | present tense of "to be/to become" 106 | hendes | her/hers 107 | været | be 108 | thi | for (conj) 109 | jer | you 110 | sådan | such, like this/like that 111 | --------------------------------------------------------------------------------