├── .dockerignore
├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── clean-notebooks.sh
├── docker-compose.yml
├── docker
└── README.md
├── ltr
├── MART_model.py
├── __init__.py
├── clickmodels
│ ├── __init__.py
│ ├── cascade.py
│ ├── coec.py
│ ├── conversion.py
│ ├── pbm.py
│ ├── sdbn.py
│ ├── session.py
│ └── ubm.py
├── client
│ ├── __init__.py
│ ├── base_client.py
│ ├── elastic_client.py
│ ├── opensearch_client.py
│ ├── solr_client.py
│ └── solr_parse.py
├── date_genre_judgments.py
├── download.py
├── evaluate.py
├── helpers
│ ├── __init__.py
│ ├── butterfingers.py
│ ├── convert.py
│ ├── defaultlist.py
│ ├── esUrlParse.py
│ ├── handle_resp.py
│ ├── movies.py
│ ├── msmarco
│ │ ├── __init__.py
│ │ └── evaluate.py
│ ├── ranklib_result.py
│ ├── solr_escape.py
│ └── tau.py
├── index.py
├── injectTypos.py
├── judgments.py
├── log.py
├── p9_plots.py
├── ranklib.py
├── release_date_plot.py
├── search.py
└── years_as_ratings.py
├── notebooks
├── click models.ipynb
├── conversion-augmented-click-models.ipynb
├── elasticsearch
│ ├── .docker
│ │ ├── es-docker
│ │ │ ├── Dockerfile
│ │ │ ├── elasticsearch.sh
│ │ │ └── elasticsearch.yml
│ │ └── kb-docker
│ │ │ ├── Dockerfile
│ │ │ └── kibana.yml
│ ├── README.md
│ ├── docker-compose.yml
│ ├── osc-blog
│ │ ├── blog_settings.json
│ │ ├── ltr.py
│ │ └── osc-blog.ipynb
│ └── tmdb
│ │ ├── Dataframes.ipynb
│ │ ├── XGBoost.ipynb
│ │ ├── bayesian-optimization.ipynb
│ │ ├── es-ltr-basics-project.ipynb
│ │ ├── evaluation.ipynb
│ │ ├── fmap.txt
│ │ ├── gonna need a bigger bot (ES).ipynb
│ │ ├── hello-ltr (ES).ipynb
│ │ ├── lambda-mart-in-python.ipynb
│ │ ├── ltr.py
│ │ ├── netfix movies-random-forests.ipynb
│ │ ├── netfix movies.ipynb
│ │ ├── raw-es-commands.ipynb
│ │ ├── sandbox.ipynb
│ │ ├── tale-of-two-queries (ES).ipynb
│ │ ├── term-stat-query.ipynb
│ │ └── tmdb_settings.json
├── exercises
│ ├── Beta distribution for regularizing CTRs.ipynb
│ ├── Feature Sets and Feature Logs.ipynb
│ ├── Have fun with Hyperparameters .ipynb
│ ├── Models and More Models.ipynb
│ ├── click_log.csv
│ ├── data
│ │ ├── tates_model.txt
│ │ ├── title_features.csv
│ │ └── title_judgments.txt
│ └── ltr.py
├── ltr.py
├── opensearch
│ ├── .docker
│ │ ├── opensearch-docker
│ │ │ ├── Dockerfile
│ │ │ ├── opensearch.sh
│ │ │ └── opensearch.yml
│ │ └── osd-docker
│ │ │ ├── Dockerfile
│ │ │ └── opensearch_dashboards.yml
│ ├── README.md
│ ├── docker-compose.yml
│ ├── osc-blog
│ │ ├── blog_settings.json
│ │ ├── ltr.py
│ │ └── osc-blog.ipynb
│ └── tmdb
│ │ ├── Dataframes.ipynb
│ │ ├── XGBoost.ipynb
│ │ ├── bayesian-optimization.ipynb
│ │ ├── evaluation.ipynb
│ │ ├── fmap.txt
│ │ ├── gonna need a bigger bot (OpenSearch).ipynb
│ │ ├── hello-ltr (OpenSearch).ipynb
│ │ ├── lambda-mart-in-python.ipynb
│ │ ├── ltr.py
│ │ ├── netfix movies-random-forests.ipynb
│ │ ├── netfix movies.ipynb
│ │ ├── opensearch-ltr-basics-project.ipynb
│ │ ├── raw-opensearch-commands.ipynb
│ │ ├── sandbox.ipynb
│ │ ├── tale-of-two-queries (OpenSearch).ipynb
│ │ ├── term-stat-query.ipynb
│ │ └── tmdb_settings.json
└── solr
│ ├── .docker
│ └── solr_home
│ │ ├── solr.xml
│ │ ├── tmdb
│ │ └── conf
│ │ │ ├── currency.xml
│ │ │ ├── elevate.xml
│ │ │ ├── idioms.txt
│ │ │ ├── lang
│ │ │ ├── contractions_ca.txt
│ │ │ ├── contractions_fr.txt
│ │ │ ├── contractions_ga.txt
│ │ │ ├── contractions_it.txt
│ │ │ ├── hyphenations_ga.txt
│ │ │ ├── stemdict_nl.txt
│ │ │ ├── stoptags_ja.txt
│ │ │ ├── stopwords_ar.txt
│ │ │ ├── stopwords_bg.txt
│ │ │ ├── stopwords_ca.txt
│ │ │ ├── stopwords_cz.txt
│ │ │ ├── stopwords_da.txt
│ │ │ ├── stopwords_de.txt
│ │ │ ├── stopwords_el.txt
│ │ │ ├── stopwords_en.txt
│ │ │ ├── stopwords_es.txt
│ │ │ ├── stopwords_eu.txt
│ │ │ ├── stopwords_fa.txt
│ │ │ ├── stopwords_fi.txt
│ │ │ ├── stopwords_fr.txt
│ │ │ ├── stopwords_ga.txt
│ │ │ ├── stopwords_gl.txt
│ │ │ ├── stopwords_hi.txt
│ │ │ ├── stopwords_hu.txt
│ │ │ ├── stopwords_hy.txt
│ │ │ ├── stopwords_id.txt
│ │ │ ├── stopwords_it.txt
│ │ │ ├── stopwords_ja.txt
│ │ │ ├── stopwords_lv.txt
│ │ │ ├── stopwords_nl.txt
│ │ │ ├── stopwords_no.txt
│ │ │ ├── stopwords_pt.txt
│ │ │ ├── stopwords_ro.txt
│ │ │ ├── stopwords_ru.txt
│ │ │ ├── stopwords_sv.txt
│ │ │ ├── stopwords_th.txt
│ │ │ ├── stopwords_tr.txt
│ │ │ └── userdict_ja.txt
│ │ │ ├── name_synonyms.txt
│ │ │ ├── names.txt
│ │ │ ├── params.json
│ │ │ ├── protwords.txt
│ │ │ ├── schema.xml
│ │ │ ├── solrconfig.xml
│ │ │ ├── stopwords.txt
│ │ │ ├── synonyms.txt
│ │ │ ├── synonyms_bidirect.txt
│ │ │ ├── synonyms_directed.txt
│ │ │ ├── synonyms_genres.txt
│ │ │ ├── synonyms_multiterm.txt
│ │ │ ├── taxonomy.txt
│ │ │ └── taxonomy_parent.txt
│ │ └── zoo.cfg
│ ├── Dockerfile
│ ├── docker-compose.yml
│ ├── msmarco
│ ├── ltr.py
│ ├── msmarco.ipynb
│ └── solr_config
│ │ └── conf
│ │ ├── elevate.xml
│ │ ├── misspell.txt
│ │ ├── params.json
│ │ ├── plural_misstems.txt
│ │ ├── schema.xml
│ │ └── solrconfig.xml
│ └── tmdb
│ ├── ai-powered-search-ch-10.ipynb
│ ├── ai-powered-search.ipynb
│ ├── evaluation (Solr).ipynb
│ ├── gonna need a bigger bot (Solr).ipynb
│ ├── hello-ltr (Solr).ipynb
│ ├── ltr.py
│ ├── netfix movies(Solr).ipynb
│ ├── raw-solr-commands.ipynb
│ ├── solr_config
│ └── conf
│ │ ├── currency.xml
│ │ ├── elevate.xml
│ │ ├── idioms.txt
│ │ ├── lang
│ │ ├── contractions_ca.txt
│ │ ├── contractions_fr.txt
│ │ ├── contractions_ga.txt
│ │ ├── contractions_it.txt
│ │ ├── hyphenations_ga.txt
│ │ ├── stemdict_nl.txt
│ │ ├── stoptags_ja.txt
│ │ ├── stopwords_ar.txt
│ │ ├── stopwords_bg.txt
│ │ ├── stopwords_ca.txt
│ │ ├── stopwords_cz.txt
│ │ ├── stopwords_da.txt
│ │ ├── stopwords_de.txt
│ │ ├── stopwords_el.txt
│ │ ├── stopwords_en.txt
│ │ ├── stopwords_es.txt
│ │ ├── stopwords_eu.txt
│ │ ├── stopwords_fa.txt
│ │ ├── stopwords_fi.txt
│ │ ├── stopwords_fr.txt
│ │ ├── stopwords_ga.txt
│ │ ├── stopwords_gl.txt
│ │ ├── stopwords_hi.txt
│ │ ├── stopwords_hu.txt
│ │ ├── stopwords_hy.txt
│ │ ├── stopwords_id.txt
│ │ ├── stopwords_it.txt
│ │ ├── stopwords_ja.txt
│ │ ├── stopwords_lv.txt
│ │ ├── stopwords_nl.txt
│ │ ├── stopwords_no.txt
│ │ ├── stopwords_pt.txt
│ │ ├── stopwords_ro.txt
│ │ ├── stopwords_ru.txt
│ │ ├── stopwords_sv.txt
│ │ ├── stopwords_th.txt
│ │ ├── stopwords_tr.txt
│ │ └── userdict_ja.txt
│ │ ├── name_synonyms.txt
│ │ ├── names.txt
│ │ ├── params.json
│ │ ├── protwords.txt
│ │ ├── schema.xml
│ │ ├── solrconfig.xml
│ │ ├── stopwords.txt
│ │ ├── synonyms.txt
│ │ ├── synonyms_bidirect.txt
│ │ ├── synonyms_directed.txt
│ │ ├── synonyms_genres.txt
│ │ ├── synonyms_multiterm.txt
│ │ ├── taxonomy.txt
│ │ └── taxonomy_parent.txt
│ ├── svmrank.ipynb
│ └── tale-of-two-queries (Solr).ipynb
├── requirements.txt
├── rre
├── README.md
├── elastic
│ ├── .dockerignore
│ ├── .gitignore
│ ├── Dockerfile
│ ├── pom.xml
│ └── src
│ │ └── etc
│ │ ├── configuration_sets
│ │ ├── README.md
│ │ ├── baseline
│ │ │ └── index-settings.json
│ │ ├── classic
│ │ │ └── index-settings.json
│ │ └── latest
│ │ │ └── index-settings.json
│ │ ├── ratings
│ │ └── ratings.json
│ │ └── templates
│ │ ├── README.md
│ │ ├── baseline
│ │ └── query.json
│ │ ├── classic
│ │ └── query.json
│ │ └── latest
│ │ └── query.json
├── opensearch
│ ├── .dockerignore
│ ├── .gitignore
│ ├── Dockerfile
│ ├── pom.xml
│ └── src
│ │ └── etc
│ │ ├── configuration_sets
│ │ ├── README.md
│ │ ├── baseline
│ │ │ └── index-settings.json
│ │ ├── classic
│ │ │ └── index-settings.json
│ │ └── latest
│ │ │ └── index-settings.json
│ │ ├── ratings
│ │ └── ratings.json
│ │ └── templates
│ │ ├── README.md
│ │ ├── baseline
│ │ └── query.json
│ │ ├── classic
│ │ └── query.json
│ │ └── latest
│ │ └── query.json
└── solr
│ ├── .dockerignore
│ ├── .gitignore
│ ├── Dockerfile
│ ├── pom.xml
│ └── src
│ └── etc
│ ├── configuration_sets
│ ├── README.md
│ ├── baseline
│ │ └── solr-settings.json
│ ├── classic
│ │ └── solr-settings.json
│ └── latest
│ │ └── solr-settings.json
│ ├── ratings
│ ├── README.md
│ └── ratings.json
│ └── templates
│ ├── README.md
│ ├── baseline
│ └── query.json
│ ├── classic
│ └── query.json
│ └── latest
│ └── query.json
├── tests
├── fail.py
├── nb_test_config.py
├── notebook_test_case.py
├── pass.py
├── run_most_nbs.py
├── runner.py
├── test.sh
├── test_judg_list.py
└── test_prep.py
└── utils
├── rate.py
├── rateFuzzySearch.json.jinja
├── rateSearch.json.jinja
├── train_to_csv.py
└── utils.py
/.dockerignore:
--------------------------------------------------------------------------------
1 | #data/
2 | venv/
3 | venv2/
4 | .git/
5 | .cache/
6 | .trash/
7 | **/venv*
8 | #**/data/
9 | **/__pycache__/
10 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | data/
2 | **/data
3 | venv/*
4 | **/.ipynb_checkpoints
5 | tests/last_run.ipynb
6 |
7 | *.pyc
8 | .vscode
9 | .cache
10 | features.txt
11 | .trash
12 | .DS_store
13 | notify.sh
14 |
15 | .idea/
16 | *.iml
17 | tests_venv/*
18 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.9.17-slim-bookworm
2 |
3 | # Get openjdk
4 | ENV JAVA_HOME=/opt/java/openjdk
5 | COPY --from=eclipse-temurin:11 $JAVA_HOME $JAVA_HOME
6 | ENV PATH="${JAVA_HOME}/bin:${PATH}"
7 |
8 | # Install graphviz
9 | RUN apt-get update && \
10 | apt-get install -y graphviz && \
11 | apt-get clean;
12 |
13 | # Setup a user
14 | RUN useradd -ms /bin/bash ltr
15 | WORKDIR /home/ltr
16 |
17 | # Make current directory accesible
18 | ADD . /home/ltr/hello-ltr
19 |
20 | # Install requirements
21 | RUN chown -R ltr.ltr hello-ltr
22 | WORKDIR /home/ltr/hello-ltr
23 |
24 | RUN /usr/local/bin/python -m pip install --upgrade pip
25 | RUN pip install -r requirements.txt
26 | USER ltr
27 |
28 | CMD jupyter notebook --ip=0.0.0.0 --no-browser --NotebookApp.token=''
29 |
--------------------------------------------------------------------------------
/clean-notebooks.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Removes all output and metadata from notebooks
4 | find notebooks -type f -name "*.ipynb" -print0 | xargs -0 nbstripout
5 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | services:
2 | notebooks:
3 | build: .
4 | container_name: hello-ltr-notebook
5 | ports:
6 | - 8888:8888
7 | environment:
8 | - LTR_DOCKER=yes
9 | links:
10 | - elastic
11 | - solr
12 |
13 | elastic:
14 | build:
15 | context: ./notebooks/elasticsearch/.docker/es-docker/
16 | dockerfile: Dockerfile
17 | container_name: hello-ltr-elastic
18 | environment:
19 | - xpack.security.enabled=false
20 | - xpack.security.enrollment.enabled=false
21 | ports:
22 | - 9200:9200
23 |
24 | kibana:
25 | build:
26 | context: ./notebooks/elasticsearch/.docker/kb-docker/
27 | dockerfile: Dockerfile
28 | container_name: hello-ltr-kibana
29 | expose:
30 | - "5601"
31 | ports:
32 | - "5601:5601"
33 | environment:
34 | ELASTICSEARCH_HOSTS: "http://hello-ltr-elastic:9200"
35 | ELASTICSEARCH_URL: "http://hello-ltr-elastic:9200"
36 | SERVER_HOST: "0.0.0.0"
37 |
38 | opensearch-node1:
39 | build:
40 | context: ./notebooks/opensearch/.docker/opensearch-docker/
41 | dockerfile: Dockerfile
42 | container_name: hello-ltr-opensearch
43 | ports:
44 | - "9201:9201"
45 | environment:
46 | - "SERVER_HOST=0.0.0.0"
47 | - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m"
48 | - "DISABLE_INSTALL_DEMO_CONFIG=true"
49 | - "DISABLE_SECURITY_PLUGIN=true"
50 | - "discovery.type=single-node"
51 |
52 | osd-dashboards:
53 | build:
54 | context: ./notebooks/opensearch/.docker/osd-docker/
55 | dockerfile: Dockerfile
56 | container_name: hello-ltr-osd
57 | expose:
58 | - "5602"
59 | ports:
60 | - "5602:5602"
61 | environment:
62 | OPENSEARCH_HOSTS: "http://hello-ltr-opensearch:9201"
63 | OPENSEARCH_URL: "http://hello-ltr-opensearch:9201"
64 | SERVER_HOST: "0.0.0.0"
65 | DISABLE_SECURITY_DASHBOARDS_PLUGIN: true
66 |
67 |
68 | solr:
69 | build:
70 | context: ./notebooks/solr/
71 | dockerfile: Dockerfile
72 | container_name: hello-ltr-solr
73 | ports:
74 | - 8983:8983
75 |
76 |
--------------------------------------------------------------------------------
/docker/README.md:
--------------------------------------------------------------------------------
1 | If you have issues getting jupyter or the JDK running on the host machine, you can use the files here to setup a docker environment with everything in one place.
2 |
3 | ## Requirements
4 |
5 | - Docker with docker-compose
6 | - Ports 8888/8983/9200/9201 must be available on your host machine
7 |
8 | ## Setup
9 |
10 | Run `docker-compose up -d`
11 |
12 | The above command will build all images necessary for the project and run the following services:
13 |
14 | - Jupyter available at localhost:8888
15 | - Solr available at localhost:8983
16 | - Elasticsearch available at localhost:9200
17 | - OpenSearch available at localhost:9201
18 |
19 | ## Cleanup
20 |
21 | - To shut things down and return later run `docker-compose stop`
22 | - To get rid of everything run `docker-compose down`
23 |
--------------------------------------------------------------------------------
/ltr/__init__.py:
--------------------------------------------------------------------------------
1 | # Make the most important pieces just available as
2 | # ie - from ltr import download
3 | from .download import download
4 | from .evaluate import evaluate, rre_table
5 | from .search import search
6 |
--------------------------------------------------------------------------------
/ltr/clickmodels/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/o19s/hello-ltr/58264c292c7805c24aebb5d33abd6fd0b75eedaa/ltr/clickmodels/__init__.py
--------------------------------------------------------------------------------
/ltr/clickmodels/cascade.py:
--------------------------------------------------------------------------------
1 | from ltr.clickmodels.session import build
2 | from collections import Counter, defaultdict
3 |
class Model():
    """Solved parameters of the cascade click model."""
    def __init__(self):
        # Attractiveness per (query, doc_id); unseen pairs get a 0.5 prior
        self.attracts = defaultdict(lambda: 0.5)

def cascade_model(sessions):
    """ Cascading model can be solved directly:
        - sessions with skips count against a doc
        - sessions with clicks count for
        - stop at first click

        Returns a Model whose attracts maps (query, doc_id) to the
        click rate over sessions where that doc was examined.
    """
    session_counts = Counter()  # times each (query, doc) was examined
    click_counts = Counter()    # times each (query, doc) was clicked
    model = Model()

    for session in sessions:
        for doc in session.docs:
            query_doc_key = (session.query, doc.doc_id)
            session_counts[query_doc_key] += 1

            if doc.click:
                # Cascading model doesn't consider clicks past the
                # first one, so count it and stop examining this session
                click_counts[query_doc_key] += 1
                break

    # Attractiveness = clicks / examinations for each observed pair
    for query_doc_key, count in session_counts.items():
        model.attracts[query_doc_key] = click_counts[query_doc_key] / count
    return model
35 |
36 |
37 |
if __name__ == "__main__":
    # Smoke test: solve the cascade model over a tiny hand-built click log
    raw_log = [
        ('A', ((1, True), (2, False), (3, True), (0, False))),
        ('B', ((5, False), (2, True), (3, True), (0, False))),
        ('A', ((1, False), (2, False), (3, True), (0, False))),
        ('B', ((1, False), (2, False), (3, False), (9, True))),
        ('A', ((9, False), (2, False), (1, True), (0, True))),
        ('B', ((6, True), (2, False), (3, True), (1, False))),
        ('A', ((7, False), (4, True), (1, False), (3, False))),
        ('B', ((8, True), (2, False), (3, True), (1, False))),
        ('A', ((1, False), (4, True), (2, False), (3, False))),
        ('B', ((7, True), (4, False), (5, True), (1, True))),
    ]
    cascade_model(build(raw_log))
52 |
53 |
54 |
55 |
56 |
57 |
--------------------------------------------------------------------------------
/ltr/clickmodels/coec.py:
--------------------------------------------------------------------------------
1 | from collections import Counter
2 |
class Model():
    """Holds the computed COEC statistics."""
    def __init__(self):
        # COEC statistic per (query, doc_id)
        self.coecs = Counter()

        # CTR for each query-doc pair in this session
        self.ctrs = {}

def coec(ctr_by_rank, sessions):
    """ Clicks Over Expected Clicks: a metric for spotting items whose
    CTR is above or below average for the rank they were shown at.
    From the paper:

    > Personalized Click Prediction in Sponsored Search
    by Cheng, Cantu Paz

    A COEC > 1 means above-average CTR for its position;
    a COEC < 1 means below average.

    - ctr_by_rank: the global CTR at each rank position
    - sessions: an array of search session objects

    Returns a Model with the COEC of each query-doc pair seen
    in the provided sessions.
    """
    clicks = Counter()
    weighted_impressions = Counter()

    for session in sessions:
        for rank, doc in enumerate(session.docs):
            pair = (session.query, doc.doc_id)
            # Each impression is weighted by the global CTR at its rank
            weighted_impressions[pair] += ctr_by_rank[rank]
            if doc.click:
                clicks[pair] += 1

    model = Model()
    for pair, expected_clicks in weighted_impressions.items():
        model.coecs[pair] = clicks[pair] / expected_clicks

    return model
44 |
--------------------------------------------------------------------------------
/ltr/clickmodels/conversion.py:
--------------------------------------------------------------------------------
1 | from collections import Counter
2 |
def conv_aug_attracts(attracts, sessions, costs):
    """ Rescan sessions, blending click-derived attractiveness with conversions.

    A click that converts confirms the attractiveness with actual relevance.
    A click without a conversion (or no click at all) is discounted by the
    action's cost: costly actions are punished less (it's OK not to convert),
    cheap actions more (why didn't they do the easy action?).
    """
    satisfacts = Counter()
    counts = Counter()
    for session in sessions:
        for doc in session.docs:
            pair = (session.query, doc.doc_id)
            attract = attracts[pair]
            counts[pair] += 1
            if doc.click and doc.conversion:
                # Conversion confirms the attractiveness was real
                satisfacts[pair] += attract
            else:
                # No conversion: discount the attractiveness-derived
                # judgment by how costly the action would have been
                satisfacts[pair] += attract * costs[doc.doc_id]

    # Average the accumulated satisfaction per query-doc pair
    for pair, count in counts.items():
        satisfacts[pair] = satisfacts[pair] / count

    return satisfacts
35 |
36 |
37 |
--------------------------------------------------------------------------------
/ltr/clickmodels/sdbn.py:
--------------------------------------------------------------------------------
1 | from collections import Counter, defaultdict
2 | from ltr.clickmodels.session import build
3 |
class Model():
    """Solved SDBN parameters."""
    def __init__(self):
        # Satisfaction per query-doc
        self.satisfacts = defaultdict(lambda: 0.1)

        # Attractiveness per query-doc
        self.attracts = defaultdict(lambda: 0.1)

def reverse_enumerate(l):
    """Like enumerate(), but walks the sequence back-to-front."""
    return zip(range(len(l) - 1, -1, -1), reversed(l))


def sdbn(sessions):
    """ Simplified Dynamic Bayesian Network: a directly-solvable
    approximation that the authors say comes close to the accuracy
    of the much more complex Dynamic Bayesian Network, without
    needing an EM learning process.

    Features of sdbn:
    - Attractiveness: any click, out of sessions where the document
      appears before the last click of the session
    - Satisfaction: the doc being the last document clicked, out of
      all sessions where that document is clicked
    """
    model = Model()
    counts = Counter()
    clicks = Counter()
    last_clicks = Counter()

    for session in sessions:
        # Locate the rank of the session's final click; sessions
        # without any click contribute nothing
        last_click = None
        for rank, doc in reverse_enumerate(session.docs):
            if doc.click:
                last_click = rank
                break
        if last_click is None:
            continue

        # Only docs at or before the last click count as examined
        for rank, doc in enumerate(session.docs[:last_click + 1]):
            pair = (session.query, doc.doc_id)
            counts[pair] += 1
            if doc.click:
                clicks[pair] += 1
                if rank == last_click:
                    last_clicks[pair] += 1

    # attractiveness = clicks / examined sessions
    # satisfaction   = last clicks / clicked sessions
    for pair, count in counts.items():
        model.attracts[pair] = clicks[pair] / count
        if pair in clicks:
            model.satisfacts[pair] = last_clicks[pair] / clicks[pair]
    return model
61 |
62 |
if __name__ == "__main__":
    # Smoke test: solve SDBN over a tiny hand-built click log
    raw_log = [
        ('A', ((1, True), (2, False), (3, True), (0, False))),
        ('B', ((5, False), (2, True), (3, True), (0, False))),
        ('A', ((1, False), (2, False), (3, True), (0, False))),
        ('B', ((1, False), (2, False), (3, False), (9, True))),
        ('A', ((9, False), (2, False), (1, True), (0, True))),
        ('B', ((6, True), (2, False), (3, True), (1, False))),
        ('A', ((7, False), (4, True), (1, False), (3, False))),
        ('B', ((8, True), (2, False), (3, True), (1, False))),
        ('A', ((1, False), (4, True), (2, False), (3, False))),
        ('B', ((7, True), (4, False), (5, True), (1, True))),
    ]
    model = sdbn(build(raw_log))
    print(model.attracts[('A', 1)])
    print(model.satisfacts[('A', 1)])
    print(model.attracts[('B', 1)])
    print(model.satisfacts[('B', 1)])
81 |
--------------------------------------------------------------------------------
/ltr/clickmodels/session.py:
--------------------------------------------------------------------------------
1 |
class Doc:
    """One result shown in a session, with its click/conversion state."""

    def __init__(self, click, doc_id, conversion=False):
        self.click = click
        self.doc_id = doc_id
        self.conversion = conversion

    def __repr__(self):
        return f"Doc(doc_id={self.doc_id}, click={self.click}, conversion={self.conversion})"

    def __str__(self):
        return f"({self.doc_id}, {self.click}, {self.conversion})"


class Session:
    """A single search: the query plus the docs shown (each at most once)."""

    def __init__(self, query, docs):
        self.query = query
        self.docs = docs
        # Reject duplicate docs within one result list
        seen = set()
        for doc in docs:
            if doc.doc_id in seen:
                raise ValueError("A session may only list a doc exactly once in search results")
            seen.add(doc.doc_id)

    def __repr__(self):
        return f"Session(query={self.query}, docs={self.docs})"

    def __str__(self):
        return f"({self.query}, ({self.docs}))"


def build_one(sess_tuple):
    """ Build a Session from a tuple where
        the 0th item is the query (a string that uniquely identifies it)
        and the 1st item is a list of (doc_id, clicked) docs:

        ('A', ((1, True), (2, False), (3, True), (0, False))),

        Each doc tuple may carry an optional third conversion value:

        ('A', ((1, True, 0.9), (2, False, 0.8), (3, True, 1.0), (0, False))),
    """
    query, doc_tuples = sess_tuple[0], sess_tuple[1]
    docs = []
    for doc_tuple in doc_tuples:
        conversion = doc_tuple[2] if len(doc_tuple) > 2 else False
        docs.append(Doc(doc_id=doc_tuple[0],
                        click=doc_tuple[1],
                        conversion=conversion))
    return Session(query=query, docs=docs)


def build(sess_tuples):
    """Build a list of Sessions from a list of session tuples."""
    return [build_one(tup) for tup in sess_tuples]
63 |
64 |
--------------------------------------------------------------------------------
/ltr/client/__init__.py:
--------------------------------------------------------------------------------
1 | from .elastic_client import ElasticClient
2 | from .solr_client import SolrClient
3 | from .opensearch_client import OpenSearchClient
--------------------------------------------------------------------------------
/ltr/client/base_client.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 |
3 | '''
4 | This project demonstrates working with LTR in Elasticsearch and Solr
5 |
6 | The goal of this class is to abstract away the server and highlight the steps
7 | required to begin working with LTR. This keeps the examples agnostic about
8 | which backend is being used, but the implementations of each client
9 | should be useful references to those getting started with LTR on
10 | their specific platform
11 | '''
class BaseClient(ABC):
    """Abstract interface a search-engine backend client implements.

    Keeps the LTR examples agnostic about which backend (Elasticsearch,
    OpenSearch or Solr) is in use; the concrete subclasses serve as
    references for each platform.
    """

    @abstractmethod
    def get_host(self):
        """Return the host this client talks to."""

    @abstractmethod
    def name(self):
        """Return the backend's short name."""

    @abstractmethod
    def delete_index(self, index):
        """Delete the named index."""

    @abstractmethod
    def create_index(self, index):
        """Create the named index."""

    @abstractmethod
    def index_documents(self, index, doc_src):
        """Index the documents from doc_src into index."""

    @abstractmethod
    def reset_ltr(self, index):
        """Reset LTR state for index."""

    @abstractmethod
    def create_featureset(self, index, name, ftr_config):
        """Create a named featureset from ftr_config."""

    @abstractmethod
    def get_feature_name(self, config, ftr_idx):
        """Return the name of feature ftr_idx within config."""

    @abstractmethod
    def query(self, index, query):
        """Run a query against index."""

    @abstractmethod
    def get_doc(self, doc_id):
        """Fetch a document by id."""

    @abstractmethod
    def log_query(self, index, featureset, ids, params):
        """Log feature values for the given featureset."""

    @abstractmethod
    def submit_model(self, featureset, index, model_name, model_payload):
        """Upload a model payload under model_name."""

    @abstractmethod
    def submit_ranklib_model(self, featureset, index, model_name, model_payload):
        """Upload a RankLib model payload under model_name."""

    @abstractmethod
    def model_query(self, index, model, model_params, query):
        """Run a query scored with the named model."""

    @abstractmethod
    def feature_set(self, index, name):
        """ Return a mapping of name/feature ordinal
        and the raw (search engine specific) feature list"""
74 |
75 |
76 |
--------------------------------------------------------------------------------
/ltr/client/solr_parse.py:
--------------------------------------------------------------------------------
def every_other_zipped(lst):
    """Pair adjacent elements: [k1, v1, k2, v2, ...] -> (k1, v1), (k2, v2), ..."""
    return zip(lst[0::2], lst[1::2])


def dictify(nl_tups):
    """Convert key/value tuples to a dict when every key is unique;
    otherwise return the tuples untouched."""
    as_dict = dict(nl_tups)
    return as_dict if len(as_dict) == len(nl_tups) else nl_tups


def parse_named_list(lst):
    """Recursively parse a Solr named list (a flat [key, value, key, value, ...]
    list) into a dict, or key/value tuples where keys repeat."""
    parsed = []
    for key, value in every_other_zipped(lst):
        if isinstance(value, list):
            value = parse_named_list(value)
        parsed.append((key, value))
    return dictify(parsed)


def parse_termvect_namedlist(lst, field):
    """ Parse the named list and perform some transformations to create consistent
    JSON to parse

    Specifically changing {"positions": ...} to {"positions": [1234,4567]}
    """

    def listify_posns(posn_attrs):
        # Either a single {"position": N} dict or repeated
        # ("position", N) tuples; normalize both to a flat list
        if isinstance(posn_attrs, dict):
            assert len(posn_attrs) == 1
            return [posn_attrs['position']]
        return [posn_attr[1] for posn_attr in posn_attrs]

    tv_parsed = parse_named_list(lst)
    for doc_field_tv in tv_parsed.values():
        for field_name, term_vects in doc_field_tv.items():
            if field_name != field:
                continue
            for attrs in term_vects.values():
                for attr_key, attr_val in attrs.items():
                    if attr_key == 'positions':
                        attrs['positions'] = listify_posns(attr_val)
    return tv_parsed
49 |
50 |
51 |
if __name__ == "__main__":
    # Exercise the parser on a hand-built term-vector named list
    sample_nl = [
        "D100000",
        ["uniqueKey", "D100000",
         "body", ["1", ["positions", ["position", 92, "position", 113]],
                  "2", ["positions", ["position", 22, "position", 413]],
                  "boo", ["positions", ["position", 22]]]],
    ]
    print(repr(parse_termvect_namedlist(sample_nl, 'body')))
73 |
--------------------------------------------------------------------------------
/ltr/date_genre_judgments.py:
--------------------------------------------------------------------------------
1 | from .judgments import Judgment, judgments_to_file
2 | from tqdm import tqdm
3 |
def genreQid(genre):
    """Map a genre name to its judgment query id (0 = untracked genre)."""
    qid_by_genre = {
        "Science Fiction": 1,
        "Drama": 2,
    }
    return qid_by_genre.get(genre, 0)
11 |
12 |
def genreGrade(movie):
    """ Grade a movie as if we were searching for its genre.

    Newer science fiction is considered better;
    older drama is considered better.

    Returns a 0-4 relevance grade (0 when the release year is unknown).
    """
    if 'release_year' not in movie or movie['release_year'] is None:
        return 0
    year = int(movie['release_year'])

    genre = movie['genres'][0]
    if genre == "Science Fiction":
        # More recent release -> higher grade
        for grade, after in ((4, 2015), (3, 2010), (2, 2000), (1, 1990)):
            if year > after:
                return grade
        return 0

    if genre == "Drama":
        # Older release -> higher grade
        for grade, after in ((0, 1990), (1, 1970), (2, 1950), (3, 1930)):
            if year > after:
                return grade
        return 4

    return 0
49 |
50 |
def synthesize(client, judgmentsOutFile='genre_by_date_judgments.txt', autoNegate=False):
    """Generate synthetic genre/date judgments from the tmdb index.

    Queries every movie, grades scifi and drama titles via genreGrade,
    and (optionally, with autoNegate) also emits a grade-0 judgment for
    the opposite genre. Judgments are written to judgmentsOutFile and
    returned as a list.
    """
    print('Generating judgments for scifi & drama movies')

    # Engine-specific "fetch everything" query
    if client.name() in ['elastic', 'opensearch']:
        params = {
            "query": {
                "match_all": {}
            },
            "size": 10000,
            "sort": [{"_id": "asc"}]
        }
    else:
        params = {
            "q": "*:*",
            "rows": 10000,
            "sort": "id ASC",
            "wt": 'json'
        }

    resp = client.query('tmdb', params)

    # A good result for one genre is a bad result for the opposite one
    opposite = {"Science Fiction": "Drama", "Drama": "Science Fiction"}

    judgments = []
    for movie in tqdm(resp):
        if 'genres' not in movie or len(movie['genres']) == 0:
            continue
        genre = movie['genres'][0]
        qid = genreQid(genre)
        if qid == 0:
            # Genre we don't build judgments for
            continue

        judgments.append(Judgment(qid=qid,
                                  grade=genreGrade(movie),
                                  docId=movie['id'],
                                  keywords=genre))

        negGenre = opposite.get(genre)
        if autoNegate and negGenre is not None:
            judgments.append(Judgment(qid=genreQid(negGenre),
                                      grade=0,
                                      docId=movie['id'],
                                      keywords=negGenre))

    with open(judgmentsOutFile, 'w') as f:
        judgments_to_file(f, judgmentsList=judgments)

    return judgments
106 |
--------------------------------------------------------------------------------
/ltr/download.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from os import path
3 |
def download_one(uri, dest='data/', force=False):
    """Download *uri* into the directory *dest*.

    Skips the download when the target file already exists, unless
    force=True. Raises ValueError if dest exists but is not a directory,
    and requests.HTTPError if the server answers with an error status.
    """
    import os

    if not os.path.exists(dest):
        os.makedirs(dest)

    if not os.path.isdir(dest):
        raise ValueError("dest {} is not a directory".format(dest))

    filename = uri[uri.rfind('/') + 1:]
    filepath = os.path.join(dest, filename)
    if path.exists(filepath):
        if not force:
            print(filepath + ' already exists')
            return
        print("exists but force=True, Downloading anyway")

    print('GET {}'.format(uri))
    resp = requests.get(uri, stream=True)
    # Fail fast instead of saving an HTML error page as the data file
    resp.raise_for_status()

    # Only open (and truncate) the target once the request succeeded,
    # so a failed download can't leave behind an empty file that would
    # block a later retry via the path.exists() check above
    with open(filepath, 'wb') as out:
        for chunk in resp.iter_content(chunk_size=1024):
            if chunk:
                out.write(chunk)
27 |
def download(uris, dest='data/', force=False):
    """Fetch every URI in *uris* into *dest* (see download_one)."""
    for location in uris:
        download_one(uri=location, dest=dest, force=force)
31 |
--------------------------------------------------------------------------------
/ltr/evaluate.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 |
4 | import plotly.graph_objs as go
5 | from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
6 |
def log_run(cmd):
    """Run *cmd* in a shell and echo its captured stdout."""
    output = os.popen(cmd).read()
    print(output)
10 |
def quiet_run(cmd):
    """Run *cmd* in a shell, discarding its output."""
    _ = os.popen(cmd).read()
13 |
def evaluate(mode):
    """Build and run the RRE evaluation docker image for *mode*, then
    copy the generated reports into data/.

    *mode* selects which rre/<mode> Dockerfile to build; anything other
    than elastic/solr/opensearch raises ValueError."""
    acceptable_modes = ['elastic', 'solr', 'opensearch']
    if mode not in acceptable_modes:
        raise ValueError(f"{mode} is not a supported value for mode. must be one of {acceptable_modes}")

    # Build the docker image
    print('Building RRE image - This will take a while')
    quiet_run(f'docker build --no-cache -t ltr-rre rre/{mode}/.')

    # Remove and run a fresh docker image
    quiet_run('docker rm -f ltr-rre')

    print('Running evaluation')
    log_run('docker run --name ltr-rre ltr-rre')

    # Copy out reports
    log_run('docker cp ltr-rre:/rre/target/rre/evaluation.json data/rre-evaluation.json')
    log_run('docker cp ltr-rre:/rre/target/site/rre-report.xlsx data/rre-report.xlsx')

    print('RRE Evaluation complete')
41 |
42 |
def rre_table():
    """Render a plotly table comparing P, R and ERR@30 across the
    baseline/classic/latest RRE experiments recorded in
    data/rre-evaluation.json."""
    init_notebook_mode(connected=True)

    with open('data/rre-evaluation.json') as src:
        report = json.load(src)
        metrics = report['metrics']

    experiments = ['baseline', 'classic', 'latest']
    precisions = [metrics['P']['versions'][e]['value'] for e in experiments]
    recalls = [metrics['R']['versions'][e]['value'] for e in experiments]
    errs = [metrics['ERR@30']['versions'][e]['value'] for e in experiments]

    table = go.Table(
        header=dict(values=['', 'Precision', 'Recall', 'ERR'], fill = dict(color='#AAAAAA')),
        cells=dict(values=[experiments, precisions, recalls, errs])
    )

    iplot([table])
72 |
73 |
--------------------------------------------------------------------------------
/ltr/helpers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/o19s/hello-ltr/58264c292c7805c24aebb5d33abd6fd0b75eedaa/ltr/helpers/__init__.py
--------------------------------------------------------------------------------
/ltr/helpers/butterfingers.py:
--------------------------------------------------------------------------------
def butterfingers(text,prob=0.1,keyboard='qwerty'):
    """Return *text* with random typos injected.

    Each character with a known keyboard neighbourhood is replaced by a
    randomly chosen adjacent key with probability *prob* (truncated to
    a whole percent).  Case is preserved; characters without a
    neighbourhood (digits, punctuation) pass through unchanged.

    Adapted from
    https://github.com/Decagon/butter-fingers/blob/master/butterfingers/butterfingers.py
    """
    import random

    keyApprox = {}

    if keyboard == "qwerty":
        keyApprox['q'] = "qwasedzx"
        keyApprox['w'] = "wqesadrfcx"
        keyApprox['e'] = "ewrsfdqazxcvgt"
        keyApprox['r'] = "retdgfwsxcvgt"
        keyApprox['t'] = "tryfhgedcvbnju"
        keyApprox['y'] = "ytugjhrfvbnji"
        keyApprox['u'] = "uyihkjtgbnmlo"
        keyApprox['i'] = "iuojlkyhnmlp"
        keyApprox['o'] = "oipklujm"
        keyApprox['p'] = "plo['ik"

        keyApprox['a'] = "aqszwxwdce"
        keyApprox['s'] = "swxadrfv"
        keyApprox['d'] = "decsfaqgbv"
        keyApprox['f'] = "fdgrvwsxyhn"
        keyApprox['g'] = "gtbfhedcyjn"
        keyApprox['h'] = "hyngjfrvkim"
        keyApprox['j'] = "jhknugtblom"
        keyApprox['k'] = "kjlinyhn"
        keyApprox['l'] = "lokmpujn"

        keyApprox['z'] = "zaxsvde"
        keyApprox['x'] = "xzcsdbvfrewq"
        keyApprox['c'] = "cxvdfzswergb"
        keyApprox['v'] = "vcfbgxdertyn"
        keyApprox['b'] = "bvnghcftyun"
        keyApprox['n'] = "nbmhjvgtuik"
        keyApprox['m'] = "mnkjloik"
        keyApprox[' '] = " "
    else:
        # Unknown layout: warn and continue; with no neighbourhoods
        # defined, every character passes through unchanged.
        print("Keyboard not supported.")

    probOfTypo = int(prob * 100)

    buttertext = ""
    for letter in text:
        lcletter = letter.lower()
        if lcletter not in keyApprox:
            newletter = lcletter
        else:
            # BUG FIX: was `<= probOfTypo`, which made the effective
            # typo rate (probOfTypo + 1)% and injected typos ~1% of the
            # time even when prob=0.  `<` yields exactly probOfTypo%.
            if random.choice(range(0, 100)) < probOfTypo:
                newletter = random.choice(keyApprox[lcletter])
            else:
                newletter = lcletter
        # go back to original case
        if not lcletter == letter:
            newletter = newletter.upper()
        buttertext += newletter

    return buttertext
60 |
61 |
62 |
--------------------------------------------------------------------------------
/ltr/helpers/convert.py:
--------------------------------------------------------------------------------
1 | # converts LambdaMART XML models to JSON for Solr..
2 |
3 | import xml.etree.ElementTree as ET
4 |
5 |
def convert(ensemble_xml_string, modelName, featureSet, featureMapping):
    """Translate a RankLib LambdaMART ensemble (XML string) into the
    JSON model format expected by Solr's LTR plugin.

    featureMapping is the ordered feature list used to resolve
    RankLib's 1-based feature ordinals to feature names."""
    model = {
        'store': featureSet,
        'name': modelName,
        'class': 'org.apache.solr.ltr.model.MultipleAdditiveTreesModel',
        'features': featureMapping,
    }

    # Drop the 7-line RankLib header so the remainder parses as XML
    xml_body = '\n'.join(ensemble_xml_string.split('\n')[7:])
    ensemble = ET.fromstring(xml_body)

    model['params'] = {
        'trees': [
            {
                'weight': str(tree.attrib['weight']),
                'root': parseSplits(tree[0], featureMapping),
            }
            for tree in ensemble
        ]
    }

    return model
32 |
def parseSplits(split, features):
    """Recursively convert a RankLib <split> element into the nested
    dict structure Solr's MultipleAdditiveTreesModel expects.

    RankLib feature ordinals are 1-based; they are resolved to names
    via the 0-based *features* list."""
    node = {}
    for child in split:
        tag = child.tag
        if tag == 'feature':
            node['feature'] = features[int(child.text.strip()) - 1]['name']
        elif tag == 'threshold':
            node['threshold'] = str(child.text.strip())
        elif tag == 'output':
            node['value'] = str(child.text.strip())
        elif tag == 'split' and 'pos' in child.attrib:
            node[child.attrib['pos']] = parseSplits(child, features)
    return node
45 |
--------------------------------------------------------------------------------
/ltr/helpers/defaultlist.py:
--------------------------------------------------------------------------------
class DefaultList(list):
    """A list that grows on demand, filling the gap with factory().

    adapted from https://stackoverflow.com/a/869901/8123
    """

    def __init__(self, factory):
        # Callable invoked once per auto-created slot
        self.factory = factory

    def _grow_to(self, index):
        """Pad with factory-produced values until *index* is addressable."""
        missing = index + 1 - len(self)
        if missing > 0:
            self.extend(self.factory() for _ in range(missing))

    def __getitem__(self, index):
        self._grow_to(index)
        return list.__getitem__(self, index)

    def __setitem__(self, index, value):
        self._grow_to(index)
        list.__setitem__(self, index, value)
20 |
def defaultlist(factory):
    """Convenience constructor, mirroring collections.defaultdict."""
    grown = DefaultList(factory)
    return grown
23 |
--------------------------------------------------------------------------------
/ltr/helpers/esUrlParse.py:
--------------------------------------------------------------------------------
def parseUrl(fullEsUrl):
    """Split a full ES search URL into (base_url, index, search_type).

    e.g. http://host:9200/tmdb/_search ->
         ('http://host:9200', 'tmdb', '_search')
    """
    from urllib.parse import urlsplit, urlunsplit
    import os.path

    parts = urlsplit(fullEsUrl)
    base = urlunsplit([parts.scheme, parts.netloc, '', '', ''])
    index_path, search_type = os.path.split(parts.path)
    # index_path carries a leading '/', strip it off
    return (base, index_path[1:], search_type)
11 |
12 |
# Smoke test: print the parsed tuple for a URL given on the command line
if __name__ == "__main__":
    from sys import argv
    print(parseUrl(argv[1]))
16 |
--------------------------------------------------------------------------------
/ltr/helpers/handle_resp.py:
--------------------------------------------------------------------------------
def resp_msg(msg, resp, throw=True, ignore=()):
    """Print *msg* alongside the HTTP status of *resp*; raise on errors.

    Parameters:
        msg: label printed with the status code
        resp: response object exposing .status_code and .text
        throw: raise RuntimeError(resp.text) on statuses >= 400
        ignore: statuses >= 400 to treat as success

    BUG FIX: the default for ``ignore`` was a mutable list, the classic
    shared-mutable-default pitfall; a tuple keeps `in` semantics without
    the hazard.
    """
    rsc = resp.status_code
    print('{} [Status: {}]'.format(msg, rsc))
    if rsc >= 400 and rsc not in ignore:
        if throw:
            raise RuntimeError(resp.text)
7 |
8 |
--------------------------------------------------------------------------------
/ltr/helpers/movies.py:
--------------------------------------------------------------------------------
1 | import json
2 | from tqdm import tqdm
3 |
class Memoize:
    """Decorator caching a function's results, keyed by positional args.

    Adapted from
    https://stackoverflow.com/questions/1988804/what-is-memoization-and-how-can-i-use-it-in-python
    """

    def __init__(self, f):
        self.f = f
        self.memo = {}

    def __call__(self, *args):
        if args not in self.memo:
            self.memo[args] = self.f(*args)
        # Warning: You may wish to do a deepcopy here if returning objects
        return self.memo[args]
15 |
@Memoize
def load_movies(json_path):
    """Parse the TMDB movie dump at *json_path* (cached per path).

    BUG FIX: the original ``json.load(open(json_path))`` leaked the
    file handle; a context manager closes it deterministically.
    """
    with open(json_path) as src:
        return json.load(src)
19 |
def get_movie(tmdb_id, movies='data/tmdb.json'):
    """Look up a single movie dict by its TMDB id (str or int)."""
    catalog = load_movies(movies)
    return catalog[str(tmdb_id)]
24 |
def noop(src_movie, base_doc):
    """Default enrichment hook: returns *base_doc* untouched."""
    return base_doc
27 |
28 |
def indexable_movies(enrich=noop, movies='data/tmdb.json'):
    """Generate indexable doc dicts for each movie in the TMDB dump,
    similar to how ES bulk indexing consumes a generator of actions.

    Each base doc is passed through *enrich(src_movie, base_doc)* so
    callers can add fields.  Movies missing required attributes
    (title, overview, ...) are skipped via the KeyError handler.
    """
    movies = load_movies(movies)
    for movieId, tmdbMovie in tqdm(movies.items(), total=len(movies)):
        try:
            # BUG FIX: releaseYear was only assigned inside the `if`, so
            # a movie without a release_date either raised NameError
            # (first iteration, uncaught by the KeyError handler) or
            # silently reused the previous movie's year.
            releaseDate = None
            releaseYear = None
            if 'release_date' in tmdbMovie and len(tmdbMovie['release_date']) > 0:
                releaseDate = tmdbMovie['release_date']
                releaseYear = releaseDate[0:4]

            full_poster_path = ''
            if 'poster_path' in tmdbMovie and tmdbMovie['poster_path'] is not None and len(tmdbMovie['poster_path']) > 0:
                full_poster_path = 'https://image.tmdb.org/t/p/w185' + tmdbMovie['poster_path']

            base_doc = {'id': movieId,
                        'title': tmdbMovie['title'],
                        'overview': tmdbMovie['overview'],
                        'tagline': tmdbMovie['tagline'],
                        'directors': [director['name'] for director in tmdbMovie['directors']],
                        'cast': " ".join([castMember['name'] for castMember in tmdbMovie['cast']]),
                        'genres': [genre['name'] for genre in tmdbMovie['genres']],
                        'release_date': releaseDate,
                        'release_year': releaseYear,
                        'poster_path': full_poster_path,
                        'vote_average': float(tmdbMovie['vote_average']) if 'vote_average' in tmdbMovie else None,
                        'vote_count': int(tmdbMovie['vote_count']) if 'vote_count' in tmdbMovie else 0,
                        }
            yield enrich(tmdbMovie, base_doc)
        except KeyError:  # Ignore any movies missing these attributes
            continue
62 |
--------------------------------------------------------------------------------
/ltr/helpers/msmarco/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/o19s/hello-ltr/58264c292c7805c24aebb5d33abd6fd0b75eedaa/ltr/helpers/msmarco/__init__.py
--------------------------------------------------------------------------------
/ltr/helpers/msmarco/evaluate.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import gzip
3 |
4 |
class QRel():
    """A single MSMARCO relevance judgment: a query id, its relevant
    doc id, and the query's keywords."""

    def __init__(self, qid, docid, keywords):
        self.qid = qid
        self.docid = docid
        self.keywords = keywords

    def eval_rr(self, doc_ranking):
        """ Evaluate the provided doc ranking using reciprical rank
        (1/rank of the expected doc)

        returns 0 if this qrels doc id is missing
        """
        hits = (pos for pos, did in enumerate(doc_ranking, start=1)
                if did == self.docid)
        first_hit = next(hits, None)
        return 1.0 / first_hit if first_hit is not None else 0.0

    @staticmethod
    def read_qrels(qrels_fname='data/msmarco-doctrain-qrels.tsv.gz',
                   queries_fname='data/msmarco-doctrain-queries.tsv.gz'):
        """Stream QRel objects from the gzipped qrels file, joining in
        each query's keywords from the queries file."""
        qids_to_keywords = QRel.get_keyword_lookup(queries_fname)

        with gzip.open(qrels_fname, 'rt') as f:
            for row in csv.reader(f, delimiter=' '):
                qid = row[0]
                keywords = qids_to_keywords.get(qid)
                if keywords is None:
                    print("Missing keywords for %s" % qid)
                yield QRel(qid=qid, docid=row[2], keywords=keywords)

    @staticmethod
    def get_keyword_lookup(fname='data/msmarco-doctrain-queries.tsv.gz'):
        """Build a query-id -> query-text map from the gzipped TSV."""
        with gzip.open(fname, 'rt') as f:
            return {row[0]: row[1] for row in csv.reader(f, delimiter='\t')}

    def __str__(self):
        return "qid:%s(%s) => doc:%s" % (self.qid, self.keywords, self.docid)
52 |
53 |
# Smoke test: load the training qrels (expects the data files on disk)
# and compute reciprocal rank for one known qid against a dummy ranking.
if __name__ == "__main__":
    qrels = {}
    for qrel in QRel.read_qrels():
        qrels[qrel.qid] = qrel

    print(qrels['1185869'].eval_rr(['1','1']))
60 |
61 |
--------------------------------------------------------------------------------
/ltr/helpers/solr_escape.py:
--------------------------------------------------------------------------------
def esc_kw(kw):
    """ Take a keyword and escape all the
    Solr parts we want to escape!

    Backslash is doubled first so the escapes injected below are not
    themselves re-escaped.
    """
    kw = kw.replace('\\', '\\\\')  # be sure to do this first, as we inject \!
    # BUG FIX: these were written as '\(' etc. — invalid escape
    # sequences that Python 3 flags (SyntaxWarning, an error in future
    # versions).  '\\' + ch produces the identical two-character result
    # explicitly, and the loop replaces the 13 chained .replace() calls.
    for special in '()+-:/][*?{}~':
        kw = kw.replace(special, '\\' + special)
    return kw
21 |
--------------------------------------------------------------------------------
/ltr/helpers/tau.py:
--------------------------------------------------------------------------------
def sign(a):
    """Return the sign of *a*: 1 if positive, -1 if negative, 0 if zero.

    Replaces a lambda bound to a name (PEP 8 recommends `def` for that).
    """
    return (a > 0) - (a < 0)
2 |
def pairs_in_order(ranking, both_ways=True):
    """Yield (a, b, order) for every pair where a precedes b in
    *ranking*; order is +1 for the forward pair.  With both_ways, the
    mirrored pair is also yielded with order -1."""
    assert len(ranking) > 1
    for lead, earlier in enumerate(ranking):
        for later in ranking[lead + 1:]:
            # later always sits after earlier, so the orders are fixed
            yield earlier, later, 1
            if both_ways:
                yield later, earlier, -1
11 |
def tau(rank1, rank2, at=4):
    """Kendall's tau between the top-*at* items of two rankings.

    Pairs appearing in rank2 whose elements were never seen together in
    rank1 count as discordant.  Result lies in [-1, 1]."""
    if len(rank1) < at or len(rank2) < at:
        raise ValueError("rankings must be larger than provided at param(%s)" % at)

    # Handle 1 as a special case: only the top item can agree
    if at == 1:
        return 1 if rank1[0] == rank2[0] else -1

    rank1 = rank1[:at]
    rank2 = rank2[:at]

    # Record the relative order of every pair in rank1 (both directions)
    orientation = {(a, b): order
                   for a, b, order in pairs_in_order(rank1, both_ways=True)}

    # Score rank2's forward pairs against rank1's recorded orientation;
    # a pair absent from rank1 scores as discordant.
    concords = 0
    discords = 0
    for a, b, order in pairs_in_order(rank2, both_ways=False):
        if orientation.get((a, b)) == order:
            concords += 1
        else:
            discords += 1

    return (concords - discords) / ((at * (at - 1)) / 2)
45 |
def avg_tau(rank1, rank2, at=4):
    """Mean of tau@1..tau@at — weights agreement near the top of the
    rankings more heavily than a single tau@at."""
    if len(rank1) < at or len(rank2) < at:
        raise ValueError("rankings must be larger than provided at param(%s)" % at)

    rank1 = rank1[:at]
    rank2 = rank2[:at]

    taus = [tau(rank1, rank2, at=depth) for depth in range(1, at + 1)]
    return sum(taus) / (at)
56 |
if __name__ == "__main__":
    # Ad-hoc demonstration of tau/avg_tau on small rankings
    print(tau([1,2,3,4],[4,3,2,1]))
    print(tau([1,2,3,4],[1,2,3,4]))
    print(tau([1,2,4,3],[1,2,3,4]))
    print(tau([5,6,7,8],[1,2,3,4]))
    print(tau([1,2,3,5],[1,2,3,4]))
    print(tau([5,3,2,1],[4,3,2,1]))
    l1=[1,2,4,3]; l2=[1,2,3,4]; l3=[2,1,3,4]
    print("avg_tau(%s,%s,at=4) %s" % (l1, l1, avg_tau(l1,l1)))
    print("avg_tau(%s,%s,at=4) %s" % (l1, l2, avg_tau(l1,l2)))
    # BUG FIX: the two lines below printed labels for (l2, l3) but
    # actually computed (l1, l3); the calls now match the labels.
    print("avg_tau(%s,%s,at=4) %s" % (l2, l3, avg_tau(l2,l3)))
    print("tau(%s,%s,at=4) %s" % (l1, l2, tau(l1,l2)))
    print("tau(%s,%s,at=4) %s" % (l2, l3, tau(l2,l3)))
70 |
71 |
--------------------------------------------------------------------------------
/ltr/index.py:
--------------------------------------------------------------------------------
1 | from ltr.helpers.movies import indexable_movies, noop
2 |
def rebuild(client, index, doc_src, force = False):
    """ Reload a configuration on disk for each search engine
    (Solr a configset, Elasticsearch a json file)
    and reindex
    """
    exists = client.check_index_exists(index)

    if exists and not force:
        print("Index {} already exists. Use `force = True` to delete and recreate".format(index))
        return None

    if exists:
        client.delete_index(index)
    client.create_index(index)
    client.index_documents(index, doc_src=doc_src)
20 |
--------------------------------------------------------------------------------
/ltr/injectTypos.py:
--------------------------------------------------------------------------------
1 | try:
2 | from judgments import Judgment, judgments_from_file, judgments_to_file, judgments_by_qid
3 | from butterfingers import butterfingers
4 | except ImportError:
5 | from .judgments import Judgment, judgments_from_file, judgments_to_file, judgments_by_qid
6 | from .butterfingers import butterfingers
7 |
8 |
9 |
def typoIt(judgmentInFile, judgmentOutFile, rounds=100):
    """Augment a judgment file with typo'd copies of each query.

    Each round attempts one butterfingers typo per query; every novel
    typo gets a brand-new qid carrying the same graded docs as the
    original query.  Results are written to *judgmentOutFile*."""
    with open(judgmentInFile) as f:
        currJudgments = list(judgments_from_file(f))
    lastQid = currJudgments[-1].qid
    judgDict = judgments_by_qid(currJudgments)

    existingTypos = set()

    for _ in range(rounds):
        for qid, judglist in judgDict.items():
            keywords = judglist[0].keywords
            keywordsWTypo = butterfingers(keywords)

            # Skip no-op typos and ones we've already generated
            if keywordsWTypo == keywords or keywordsWTypo in existingTypos:
                continue

            lastQid += 1
            newQid = lastQid
            print("%s => %s" % (keywords, keywordsWTypo))
            currJudgments.extend(
                Judgment(grade=judg.grade,
                         qid=newQid,
                         keywords=keywordsWTypo,
                         docId=judg.docId)
                for judg in judglist)
            existingTypos.add(keywordsWTypo)

    with open(judgmentOutFile, 'w') as f:
        judgments_to_file(f, judgmentsList=currJudgments)
38 |
39 |
# CLI entry point: inject typos into the title judgments file
if __name__ == "__main__":
    typoIt(judgmentInFile='title_judgments.txt', judgmentOutFile='title_fuzzy_judgments.txt')
42 |
43 |
44 | # Clone a judgment, inject random typos
45 |
--------------------------------------------------------------------------------
/ltr/log.py:
--------------------------------------------------------------------------------
1 | import re
2 |
class FeatureLogger:
    """ Logs LTR Features, one query at a time

    ...Building up a training set...
    """

    def __init__(self, client, index, feature_set, drop_missing=True):
        self.client = client            # search engine client (ES/Solr/OpenSearch)
        self.index = index              # index/collection to log against
        self.feature_set = feature_set  # LTR feature set / store name
        self.drop_missing = drop_missing  # drop judgments whose doc wasn't found
        self.logged = []                # training set accumulated across calls

    def clear(self):
        """Forget everything logged so far."""
        self.logged = []

    def log_for_qid(self, qid, judgments, keywords):
        """ Log a set of judgments associated with a single qid
        judgments will be modified, a training set also returned, discarding
        any judgments we could not log features for (because the doc was missing)
        """
        featuresPerDoc = {}
        judgments = [j for j in judgments]
        docIds = [judgment.docId for judgment in judgments]

        # (An O(n^2) duplicate-docId check that only held a `pass` was
        # removed here; re-add logging if duplicate diagnosis is needed.)

        # Fetch features in batches of N docs per log query
        BATCH_SIZE = 500
        numLeft = len(docIds)
        for i in range(0, 1 + (len(docIds) // BATCH_SIZE)):

            numFetch = min(BATCH_SIZE, numLeft)
            start = i*BATCH_SIZE
            if start >= len(docIds):
                break
            ids = docIds[start:start+numFetch]

            # Sanitize (Solr has a strict syntax that can easily be tripped up)
            # This removes anything but alphanumeric and spaces
            # BUG FIX: raw string — '\s'/'\w' in a plain literal are
            # invalid escape sequences in Python 3.
            keywords = re.sub(r'([^\s\w]|_)+', '', keywords)

            params = {
                "keywords": keywords,
                "fuzzy_keywords": ' '.join([x + '~' for x in keywords.split(' ')]),
                "keywordsList": [keywords]  # Needed by TSQ for the time being
            }

            res = self.client.log_query(self.index, self.feature_set, ids, params)

            # Add feature back to each judgment
            for doc in res:
                docId = str(doc['id'])
                features = doc['ltr_features']
                featuresPerDoc[docId] = features
            numLeft -= BATCH_SIZE

        # Append features from search engine back to ranklib judgment list
        for judgment in judgments:
            try:
                # NOTE(review): featuresPerDoc is keyed by str(doc['id']);
                # this lookup assumes judgment.docId is also a string —
                # confirm callers never pass ints.
                features = featuresPerDoc[judgment.docId]  # KeyError => judgment but no doc in index
                judgment.features = features
            except KeyError:
                pass

        # Return a pared-down judgment list if features are missing
        training_set = []
        discarded = []
        for judgment in judgments:
            if self.drop_missing and not judgment.has_features():
                discarded.append(judgment)
            else:
                training_set.append(judgment)
        self.logged.extend(training_set)
        return training_set, discarded
88 |
--------------------------------------------------------------------------------
/ltr/p9_plots.py:
--------------------------------------------------------------------------------
def plot_grades(dat):
    """Bar chart of judgment grade counts, faceted by query keywords.

    BUG FIX: the plot expression was wrapped in `{ }`, which built a
    one-element *set* containing the plot rather than the plot itself;
    parentheses group the `+` chain as intended.
    """
    import plotnine as p9

    p = (
        p9.ggplot(dat, p9.aes('grade')) +
        p9.geom_bar() +
        p9.facet_wrap('keywords')
    )

    return p
11 |
def plot_features(dat):
    """Jittered scatter of feature values vs. relevance grade, colored
    by query keywords and faceted per feature id.

    BUG FIX: the plot expression was wrapped in `{ }`, which built a
    one-element *set* containing the plot rather than the plot itself;
    parentheses group the `+` chain as intended.
    """
    import plotnine as p9

    p = (
        p9.ggplot(dat, p9.aes('grade', 'features', color = 'keywords')) +
        p9.geom_jitter(alpha = .5) +
        p9.facet_wrap('feature_id', scales = 'free_y', labeller = 'label_both') +
        p9.labs(y='Feature values', x='Relevance grade')
    )

    return p
--------------------------------------------------------------------------------
/ltr/release_date_plot.py:
--------------------------------------------------------------------------------
1 | import plotly.graph_objs as go
2 | from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
3 |
def search(client, user_query, model_name):
    """Run *model_name* over docs whose title matches *user_query*,
    returning the client's scored results for the tmdb index."""
    if client.name() in ['elastic', 'opensearch']:
        # match_all scored by the model; the title match only filters
        title_filter = {"match": {"title": user_query}}
        engine_query = {"bool": {"must": {"match_all": {}}, "filter": title_filter}}
    else:
        # Solr: zero-boosted title clause acts as a pure filter
        engine_query = 'title:('+ user_query + ')^0'
    return client.model_query('tmdb', model_name, {}, engine_query)
17 |
def plot(client, query, models = ['classic', 'latest']):
    """Overlay release-year-by-rank line charts for two LTR models on
    the same query, visualizing their recency bias.

    NOTE: this function shadows plotly.offline.plot imported at module
    level; the name is kept for interface compatibility."""
    init_notebook_mode(connected=True)

    modelData = [search(client, query, model) for model in models]

    ranks = list(range(len(modelData[0])))

    traces = []
    for which in (0, 1):
        hits = modelData[which]
        traces.append(go.Scatter(
            x = ranks,
            y = [int(hit['release_year']) for hit in hits],
            mode = "lines",
            name = models[which],
            text = [f'{hit["title"]} ({hit["score"]})' for hit in hits]
        ))

    fig = go.Figure(data=traces)
    iplot(fig)
50 |
--------------------------------------------------------------------------------
/ltr/search.py:
--------------------------------------------------------------------------------
1 | import re
2 |
# Template for the ES/OpenSearch LTR `sltr` query used by esLtrQuery();
# the keywords and model name are filled in before each search.
baseEsQuery = {
    "size": 5,
    "query": {
        "sltr": {
            "params": {
                "keywords": "",
            },
            "model": ""
        }
    }
}
14 |
def esLtrQuery(keywords, modelName):
    """Build an ES/OpenSearch LTR `sltr` query scoring *keywords* with
    *modelName*; the assembled query is echoed as JSON for debugging.

    BUG FIX: this previously mutated and returned the shared
    module-level `baseEsQuery`, so every caller aliased the same dict
    and a later call silently rewrote earlier query objects.  A fresh
    dict is now built per call.
    """
    import json
    query = {
        "size": 5,
        "query": {
            "sltr": {
                "params": {
                    "keywords": keywords,
                    "keywordsList": [keywords]  # Needed by TSQ for now
                },
                "model": modelName
            }
        }
    }
    print("%s" % json.dumps(query))
    return query
22 |
23 | # TODO: Parse params and add efi dynamically instead of adding manually to query below
def solrLtrQuery(keywords, modelName):
    """Build a Solr LTR re-rank query for *keywords* using *modelName*.

    Keywords are stripped to alphanumerics and spaces (Solr's query
    parser is strict) and a `~`-suffixed fuzzy variant is passed via
    the fuzzy_keywords efi.
    """
    # BUG FIX: raw string — '\s'/'\w' in a plain literal are invalid
    # escape sequences in Python 3.
    keywords = re.sub(r'([^\s\w]|_)+', '', keywords)
    fuzzy_keywords = ' '.join([x + '~' for x in keywords.split(' ')])

    return {
        'fl': '*,score',
        'rows': 5,
        'q': '{{!ltr reRankDocs=30000 model={} efi.keywords="{}" efi.fuzzy_keywords="{}"}}'.format(modelName, keywords, fuzzy_keywords)
    }
33 |
34 |
# Default display configuration for search(): which result field holds
# the title, and which extra fields to print per result.
tmdbFields = {
    'title': 'title',
    'display_fields': ['release_year', 'genres', 'overview']
}
39 |
40 |
41 |
def search(client, keywords, modelName, index='tmdb', fields=tmdbFields):
    """Run an LTR model query against *index* and pretty-print results.

    Dispatches to the ES- or Solr-flavored query builder based on the
    client, then prints each result's title, score and the configured
    display fields ('N/A' when a field is absent).
    """
    if client.name() == 'elastic' or client.name() == 'opensearch':
        results = client.query(index, esLtrQuery(keywords, modelName))
    else:
        q = solrLtrQuery(keywords, modelName)
        print(q)
        results = client.query(index, q)

    ti = fields['title']

    for result in results:
        print("%s " % (result[ti] if ti in result else 'N/A'))
        # BUG FIX: guard '_score' like every other printed field —
        # results lacking it previously raised KeyError here.
        print("%s " % (result['_score'] if '_score' in result else 'N/A'))

        for df in fields['display_fields']:
            print("%s " % (result[df] if df in result else 'N/A'))

        print("---------------------------------------")
--------------------------------------------------------------------------------
/ltr/years_as_ratings.py:
--------------------------------------------------------------------------------
def get_classic_rating(year):
    """Grade a release year for a classic-film fan: older is better
    (4 = 1950 and earlier, 0 = after 2010)."""
    brackets = [(2010, 0), (1990, 1), (1970, 2), (1950, 3)]
    for cutoff, grade in brackets:
        if year > cutoff:
            return grade
    return 4
12 |
def get_latest_rating(year):
    """Grade a release year for a recency fan: newer is better
    (4 = after 2010, 0 = 1950 and earlier)."""
    brackets = [(2010, 4), (1990, 3), (1970, 2), (1950, 1)]
    for cutoff, grade in brackets:
        if year > cutoff:
            return grade
    return 0
24 |
def _rated_judgments(resp, rate, no_zero):
    """Grade each logged hit's release-year feature with *rate*,
    producing qid=1 Judgments; zero grades are skipped when no_zero."""
    from ltr.judgments import Judgment
    judgments = []
    for hit in resp:
        rating = rate(hit['ltr_features'][0])
        if rating == 0 and no_zero:
            continue
        judgments.append(Judgment(qid=1, docId=hit['id'], grade=rating,
                                  features=hit['ltr_features'], keywords=''))
    return judgments


def synthesize(
        client,
        featureSet='release',
        latestTrainingSetOut='data/latest-training.txt',
        classicTrainingSetOut='data/classic-training.txt'
    ):
    """Create two synthetic training sets from the release-year
    feature: one biased toward classic films, one toward recent films.

    Logs the feature for every tmdb doc via *client*, grades each hit
    twice (classic vs latest bias) and writes RankLib-style training
    files to the configured paths.

    The duplicated grade-and-write loops were factored into
    _rated_judgments, and the previously-ignored *featureSet* parameter
    is now actually passed to log_query (its default matches the old
    hard-coded 'release').
    """
    from ltr.judgments import judgments_to_file
    NO_ZERO = False

    resp = client.log_query('tmdb', featureSet, None)

    # A classic film fan
    print("Generating 'classic' biased judgments:")
    with open(classicTrainingSetOut, 'w') as out:
        judgments_to_file(out, _rated_judgments(resp, get_classic_rating, NO_ZERO))

    # A current film fan
    print("Generating 'recent' biased judgments:")
    with open(latestTrainingSetOut, 'w') as out:
        judgments_to_file(out, _rated_judgments(resp, get_latest_rating, NO_ZERO))
65 |
66 |
67 |
68 |
69 |
70 |
--------------------------------------------------------------------------------
/notebooks/elasticsearch/.docker/es-docker/Dockerfile:
--------------------------------------------------------------------------------
# Elasticsearch 8.8.2 with the o19s Learning-to-Rank plugin baked in
FROM docker.elastic.co/elasticsearch/elasticsearch:8.8.2

# Install the matching LTR plugin release (plugin version must track
# the exact Elasticsearch version in the base image tag above)
RUN bin/elasticsearch-plugin install --batch \
    "https://github.com/o19s/elasticsearch-learning-to-rank/releases/download/v1.5.8-es8.8.2/ltr-plugin-v1.5.8-es8.8.2.zip"

# Overlay our config; cat it so the build log records what was applied
COPY --chown=elasticsearch:elasticsearch elasticsearch.yml /usr/share/elasticsearch/config/
RUN cat /usr/share/elasticsearch/config/elasticsearch.yml
8 |
--------------------------------------------------------------------------------
/notebooks/elasticsearch/.docker/es-docker/elasticsearch.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | docker run -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" elasticsearch-tlre
4 |
--------------------------------------------------------------------------------
/notebooks/elasticsearch/.docker/kb-docker/Dockerfile:
--------------------------------------------------------------------------------
# Kibana pinned to the same version as the es-docker Elasticsearch image
FROM docker.elastic.co/kibana/kibana:8.8.2
2 |
--------------------------------------------------------------------------------
/notebooks/elasticsearch/README.md:
--------------------------------------------------------------------------------
1 | This folder contains some Elasticsearch configuration and a Dockerfile to expedite setting up Elasticsearch with LTR.
2 |
3 | ## Docker
Run `docker-compose up` to build the image and start a container running Elasticsearch with LTR
5 |
6 | After the instance is running, load up the "hello-ltr (ES)" notebook.
7 |
--------------------------------------------------------------------------------
/notebooks/elasticsearch/docker-compose.yml:
--------------------------------------------------------------------------------
# Dev stack: Kibana on 5601 and a single Elasticsearch node on 9200,
# with index data persisted in a named volume.
services:
  kibana:
    # Built from the local Kibana Dockerfile
    build: ./.docker/kb-docker/.
    expose:
      - "5601"
    ports:
      - "5601:5601"
    environment:
      SERVER_HOST: "0.0.0.0"
  elasticsearch:
    # Built from the local ES + LTR-plugin Dockerfile
    build: ./.docker/es-docker/.
    ports:
      - "9200:9200"
    expose:
      - "9200"
    environment:
      SERVER_NAME: "elasticsearch"
    volumes:
      # Persist index data across container restarts
      - tlre-es-data:/usr/share/elasticsearch/data

volumes:
  tlre-es-data:
    driver: local
24 |
--------------------------------------------------------------------------------
/notebooks/elasticsearch/osc-blog/blog_settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "mappings": {
3 | "_source": {
4 | "enabled": true
5 | },
6 | "properties": {
7 | "post_id": {
8 | "type": "long",
9 | "store": true
10 | },
11 | "post_date": {
12 | "type": "date",
13 | "store": true
14 | },
15 | "es_update_date": {
16 | "type": "date",
17 | "store": true
18 | },
19 | "url": {
20 | "type": "text",
21 | "store": true
22 | },
23 | "title": {
24 | "type": "text",
25 | "store": true,
26 | "analyzer": "content_analyzer",
27 | "fields": {
28 | "bigrams": {
29 | "type": "text",
30 | "analyzer": "content_bigrams"
31 | }
32 | }
33 | },
34 | "author": {
35 | "type": "text",
36 | "store": true,
37 | "analyzer": "standard"
38 | },
39 | "content": {
40 | "type": "text",
41 | "store": true,
42 | "analyzer": "content_analyzer",
43 | "fields": {
44 | "bigrams": {
45 | "type": "text",
46 | "analyzer": "content_bigrams"
47 | }
48 | }
49 | },
50 | "excerpt": {
51 | "type": "text",
52 | "store": true,
53 | "analyzer": "content_analyzer"
54 | },
55 | "categories": {
56 | "type": "text",
57 | "store": true,
58 | "analyzer": "content_analyzer"
59 | }
60 | }
61 | },
62 | "settings": {
63 | "number_of_shards": 1,
64 | "number_of_replicas": 1,
65 | "analysis": {
66 | "filter": {
67 | "english_stemmer": {
68 | "type": "stemmer",
69 | "language": "english"
70 | },
71 | "english_possessive_stemmer": {
72 | "type": "stemmer",
73 | "language": "possessive_english"
74 | },
75 | "bigram": {
76 | "type": "shingle",
77 | "max_shingle_size": 2,
78 | "output_unigrams": false
79 | }
80 | },
81 | "analyzer": {
82 | "content_analyzer": {
83 | "type": "custom",
84 | "char_filter": [
85 | "html_strip"
86 | ],
87 | "filter": [
88 | "english_possessive_stemmer",
89 | "lowercase",
90 | "english_stemmer"
91 | ],
92 | "tokenizer": "standard"
93 | },
94 | "content_bigrams": {
95 | "type": "custom",
96 | "char_filter": [
97 | "html_strip"
98 | ],
99 | "filter": [
100 | "english_possessive_stemmer",
101 | "lowercase",
102 | "english_stemmer",
103 | "bigram"
104 | ],
105 | "tokenizer": "standard"
106 | }
107 | }
108 | }
109 | }
110 | }
111 |
--------------------------------------------------------------------------------
/notebooks/elasticsearch/osc-blog/ltr.py:
--------------------------------------------------------------------------------
1 | # Import the real 'ltr' package from the repo root in place of this same-named stub.
2 | #
3 | # Adapted from
4 | # https://mg.readthedocs.io/importing-local-python-modules-from-jupyter-notebooks/sys-path-in-helper-module/path-helper.html
5 |
6 | import importlib
7 | import os
8 | import sys
9 |
10 | sys.path.insert(0, os.path.abspath('../../../'))
11 |
12 | # Stash the needed modules inside __file__ so this stub adds no extra names
13 | # at module scope; the import machinery resets __file__ during the reload() below.
14 | __file__ = {'sys': sys, 'importlib': importlib}
15 |
16 | del importlib
17 | del os
18 | del sys
19 |
20 | __file__['importlib'].reload(__file__['sys'].modules[__name__])
21 |
--------------------------------------------------------------------------------
/notebooks/elasticsearch/tmdb/Dataframes.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Dataframes\n",
8 | "\n",
9 | "Data frames are the central object of most data science workflows. This notebook shows some helper functions that can assist you in creating them from judgments. The older non-dataframe way of passing data is in most of the example notebooks, so use this code anywhere you see that pattern.\n"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import ltr.judgments as judge"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "classic_training_set = [j for j in judge.judgments_from_file(open('data/classic-training.txt'))]"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "classic_df = judge.judgments_to_dataframe(classic_training_set)\n",
37 | "classic_df"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "### Plotting\n",
45 | "\n",
46 | "Is one of the main reasons dataframes are easier to work with. There are two helper functions to show the distribution of grades (`plot_grades`) and the relationship between features and grades (`plot_features`).\n",
47 | "\n",
48 | "You are encouraged to use whatever python plotting library you are most comfortable with; we have `matplotlib` and `plotnine` installed in the Docker image."
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": null,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "import ltr.p9_plots as plots\n",
58 | "plots.plot_grades(classic_df)"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "classic_df_long = judge.judgments_dataframe_to_long(classic_df)\n",
68 | "classic_df_long"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": null,
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "plots.plot_features(classic_df_long)"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": null,
83 | "metadata": {},
84 | "outputs": [],
85 | "source": [
86 | "plots.plot_features"
87 | ]
88 | }
89 | ],
90 | "metadata": {
91 | "kernelspec": {
92 | "display_name": "Python 3",
93 | "language": "python",
94 | "name": "python3"
95 | },
96 | "language_info": {
97 | "codemirror_mode": {
98 | "name": "ipython",
99 | "version": 3
100 | },
101 | "file_extension": ".py",
102 | "mimetype": "text/x-python",
103 | "name": "python",
104 | "nbconvert_exporter": "python",
105 | "pygments_lexer": "ipython3",
106 | "version": "3.8.2"
107 | }
108 | },
109 | "nbformat": 4,
110 | "nbformat_minor": 2
111 | }
112 |
--------------------------------------------------------------------------------
/notebooks/elasticsearch/tmdb/fmap.txt:
--------------------------------------------------------------------------------
1 | 0 release_year q
2 | 1 features0 q
3 |
--------------------------------------------------------------------------------
/notebooks/elasticsearch/tmdb/ltr.py:
--------------------------------------------------------------------------------
1 | # Import the real 'ltr' package from the repo root in place of this same-named stub.
2 | #
3 | # Adapted from
4 | # https://mg.readthedocs.io/importing-local-python-modules-from-jupyter-notebooks/sys-path-in-helper-module/path-helper.html
5 |
6 | import importlib
7 | import os
8 | import sys
9 |
10 | sys.path.insert(0, os.path.abspath('../../../'))
11 |
12 | # Stash the needed modules inside __file__ so this stub adds no extra names
13 | # at module scope; the import machinery resets __file__ during the reload() below.
14 | __file__ = {'sys': sys, 'importlib': importlib}
15 |
16 | del importlib
17 | del os
18 | del sys
19 |
20 | __file__['importlib'].reload(__file__['sys'].modules[__name__])
21 |
--------------------------------------------------------------------------------
/notebooks/exercises/data/tates_model.txt:
--------------------------------------------------------------------------------
1 | ## Linear Regression
2 | ## Lambda = 1.0E-10
3 | 0:0.06442486267839354 1:0.06442486267839354 2:1.7298168616882517 3:0.06437886168753176
--------------------------------------------------------------------------------
/notebooks/exercises/ltr.py:
--------------------------------------------------------------------------------
1 | # Import the real 'ltr' package from the repo root in place of this same-named stub.
2 | #
3 | # Adapted from
4 | # https://mg.readthedocs.io/importing-local-python-modules-from-jupyter-notebooks/sys-path-in-helper-module/path-helper.html
5 |
6 | import importlib
7 | import os
8 | import sys
9 |
10 | sys.path.insert(0, os.path.abspath('../../'))
11 |
12 | # Stash the needed modules inside __file__ so this stub adds no extra names
13 | # at module scope; the import machinery resets __file__ during the reload() below.
14 | __file__ = {'sys': sys, 'importlib': importlib}
15 |
16 | del importlib
17 | del os
18 | del sys
19 |
20 | __file__['importlib'].reload(__file__['sys'].modules[__name__])
21 |
--------------------------------------------------------------------------------
/notebooks/ltr.py:
--------------------------------------------------------------------------------
1 | # Import the real 'ltr' package from the repo root in place of this same-named stub.
2 | #
3 | # Adapted from
4 | # https://mg.readthedocs.io/importing-local-python-modules-from-jupyter-notebooks/sys-path-in-helper-module/path-helper.html
5 |
6 | import importlib
7 | import os
8 | import sys
9 |
10 | sys.path.insert(0, os.path.abspath('../'))
11 |
12 | # Stash the needed modules inside __file__ so this stub adds no extra names
13 | # at module scope; the import machinery resets __file__ during the reload() below.
14 | __file__ = {'sys': sys, 'importlib': importlib}
15 |
16 | del importlib
17 | del os
18 | del sys
19 |
20 | __file__['importlib'].reload(__file__['sys'].modules[__name__])
21 |
--------------------------------------------------------------------------------
/notebooks/opensearch/.docker/opensearch-docker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM opensearchproject/opensearch:2.5.0
2 |
3 |
4 | RUN bin/opensearch-plugin install --batch \
5 | "https://www.github.com/opensearch-project/opensearch-learning-to-rank-base/releases/download/release-v2.1.0/ltr-plugin-v2.1.0.zip"
6 |
7 | COPY --chown=opensearch:opensearch opensearch.yml /usr/share/opensearch/config/
8 | RUN cat /usr/share/opensearch/config/opensearch.yml
9 |
--------------------------------------------------------------------------------
/notebooks/opensearch/.docker/opensearch-docker/opensearch.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | docker run -p 9301:9301 -p 9400:9400 -e "discovery.type=single-node" opensearch-tlre
4 |
--------------------------------------------------------------------------------
/notebooks/opensearch/.docker/opensearch-docker/opensearch.yml:
--------------------------------------------------------------------------------
1 | ---
2 | http.cors.allow-origin: "/http?:.*/"
3 | #http.cors.allow-origin: /http?://localhost(:[0-9]+)?/
4 | http.cors.enabled: true
5 | indices.query.bool.max_clause_count: 10240
6 | network.host: 0.0.0.0
7 | http.port: 9201
8 | discovery.type: single-node
9 |
10 | # cluster.name: docker-cluster
11 |
12 | ## # minimum_master_nodes need to be explicitly set when bound on a public IP
13 | ## # set to 1 to allow single node clusters
14 | ## discovery.zen.minimum_master_nodes: 1
15 | #
16 | ## Setting network.host to a non-loopback address enables the annoying bootstrap checks. "Single-node" mode disables them again.
17 | ## discovery.type: single-node
18 | #
19 | #http.host: 0.0.0.0
20 | #http.port: 9201
21 | #http.cors.allow-origin: "*"
22 | ##http.cors.allow-origin: "/http?:.*/"
23 | #http.cors.enabled: true
24 | #http.cors.allow-headers: X-Requested-With,X-Auth-Token,Content-Type,Content-Length,Authorization
25 | #http.cors.allow-credentials: true
26 | #
27 | #
28 | #
29 | ##http.cors.allow-origin: "/http?:.*/"
30 | ##http.cors.allow-origin: /http?://localhost(:[0-9]+)?/
31 | ##http.cors.enabled: true
32 | ##indices.query.bool.max_clause_count: 10240
33 | ##network.host: 0.0.0.0
34 | #
35 | #discovery.type: single-node
36 |
37 |
--------------------------------------------------------------------------------
/notebooks/opensearch/.docker/osd-docker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM opensearchproject/opensearch-dashboards:2.5.0
2 |
3 | COPY --chown=opensearch-dashboards:opensearch-dashboards opensearch_dashboards.yml /usr/share/opensearch-dashboards/config/
4 | RUN cat /usr/share/opensearch-dashboards/config/opensearch_dashboards.yml
--------------------------------------------------------------------------------
/notebooks/opensearch/.docker/osd-docker/opensearch_dashboards.yml:
--------------------------------------------------------------------------------
1 | # OSD is served by a back end server. This setting specifies the port to use.
2 | server.port: 5602
3 | server.host: 0.0.0.0
4 |
--------------------------------------------------------------------------------
/notebooks/opensearch/README.md:
--------------------------------------------------------------------------------
1 | This folder contains some OpenSearch configuration and a Dockerfile to expedite setting up OpenSearch with LTR.
2 |
3 | ## Docker
4 | Run `docker-compose up` to create an image running OpenSearch with LTR
5 |
6 | After the instance is running, load up the "hello-ltr (OpenSearch)" notebook.
7 |
--------------------------------------------------------------------------------
/notebooks/opensearch/docker-compose.yml:
--------------------------------------------------------------------------------
1 | services:
2 | opensearch-node1:
3 | #image: opensearch-custom-plugin
4 | #opensearchproject/opensearch:2.6.0
5 | # image: opensearchproject/opensearch:1.3.9
6 | #image: opensearchproject/opensearch:2.8.0
7 | build: ./.docker/opensearch-docker/.
8 | container_name: opensearch-node1
9 | environment:
10 | - cluster.name=opensearch-cluster
11 | - node.name=opensearch-node1
12 | - bootstrap.memory_lock=true # along with the memlock settings below, disables swapping
13 | - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" # minimum and maximum Java heap size, recommend setting both to 50% of system RAM
14 | - "DISABLE_INSTALL_DEMO_CONFIG=true" # disables execution of install_demo_configuration.sh bundled with security plugin, which installs demo certificates and security configurations to OpenSearch
15 | - "DISABLE_SECURITY_PLUGIN=true" # disables security plugin entirely in OpenSearch by setting plugins.security.disabled: true in opensearch.yml
16 | - "discovery.type=single-node" # disables bootstrap checks that are enabled when network.host is set to a non-loopback address
17 | ulimits:
18 | memlock:
19 | soft: -1
20 | hard: -1
21 | nofile:
22 | soft: 65536 # maximum number of open files for the OpenSearch user, set to at least 65536 on modern systems
23 | hard: 65536
24 | volumes:
25 | - opensearch-data1:/usr/share/opensearch/data
26 | ports:
27 | - "9201:9201"
28 | - "9600:9600" # required for Performance Analyzer
29 | networks:
30 | - opensearch-net
31 |
32 | opensearch-dashboards:
33 | # image: opensearchproject/opensearch-dashboards:1.3.9
34 | # image: opensearchproject/opensearch-dashboards:2.5.0
35 | build: ./.docker/osd-docker/.
36 | container_name: opensearch-dashboards
37 | ports:
38 | - "5602:5602"
39 | environment:
40 | - 'OPENSEARCH_HOSTS=["http://opensearch-node1:9201"]'
41 | - "DISABLE_SECURITY_DASHBOARDS_PLUGIN=true" # disables security dashboards plugin in OpenSearch Dashboards
42 | networks:
43 | - opensearch-net
44 |
45 | volumes:
46 | opensearch-data1:
47 |
48 | networks:
49 | opensearch-net:
50 |
51 |
52 |
--------------------------------------------------------------------------------
/notebooks/opensearch/osc-blog/blog_settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "mappings": {
3 | "_source": {
4 | "enabled": true
5 | },
6 | "properties": {
7 | "post_id": {
8 | "type": "long",
9 | "store": true
10 | },
11 | "post_date": {
12 | "type": "date",
13 | "store": true
14 | },
15 | "es_update_date": {
16 | "type": "date",
17 | "store": true
18 | },
19 | "url": {
20 | "type": "text",
21 | "store": true
22 | },
23 | "title": {
24 | "type": "text",
25 | "store": true,
26 | "analyzer": "content_analyzer",
27 | "fields": {
28 | "bigrams": {
29 | "type": "text",
30 | "analyzer": "content_bigrams"
31 | }
32 | }
33 | },
34 | "author": {
35 | "type": "text",
36 | "store": true,
37 | "analyzer": "standard"
38 | },
39 | "content": {
40 | "type": "text",
41 | "store": true,
42 | "analyzer": "content_analyzer",
43 | "fields": {
44 | "bigrams": {
45 | "type": "text",
46 | "analyzer": "content_bigrams"
47 | }
48 | }
49 | },
50 | "excerpt": {
51 | "type": "text",
52 | "store": true,
53 | "analyzer": "content_analyzer"
54 | },
55 | "categories": {
56 | "type": "text",
57 | "store": true,
58 | "analyzer": "content_analyzer"
59 | }
60 | }
61 | },
62 | "settings": {
63 | "number_of_shards": 1,
64 | "number_of_replicas": 1,
65 | "analysis": {
66 | "filter": {
67 | "english_stemmer": {
68 | "type": "stemmer",
69 | "language": "english"
70 | },
71 | "english_possessive_stemmer": {
72 | "type": "stemmer",
73 | "language": "possessive_english"
74 | },
75 | "bigram": {
76 | "type": "shingle",
77 | "max_shingle_size": 2,
78 | "output_unigrams": false
79 | }
80 | },
81 | "analyzer": {
82 | "content_analyzer": {
83 | "type": "custom",
84 | "char_filter": [
85 | "html_strip"
86 | ],
87 | "filter": [
88 | "english_possessive_stemmer",
89 | "lowercase",
90 | "english_stemmer"
91 | ],
92 | "tokenizer": "standard"
93 | },
94 | "content_bigrams": {
95 | "type": "custom",
96 | "char_filter": [
97 | "html_strip"
98 | ],
99 | "filter": [
100 | "english_possessive_stemmer",
101 | "lowercase",
102 | "english_stemmer",
103 | "bigram"
104 | ],
105 | "tokenizer": "standard"
106 | }
107 | }
108 | }
109 | }
110 | }
111 |
--------------------------------------------------------------------------------
/notebooks/opensearch/osc-blog/ltr.py:
--------------------------------------------------------------------------------
1 | # Import the real 'ltr' package from the repo root in place of this same-named stub.
2 | #
3 | # Adapted from
4 | # https://mg.readthedocs.io/importing-local-python-modules-from-jupyter-notebooks/sys-path-in-helper-module/path-helper.html
5 |
6 | import importlib
7 | import os
8 | import sys
9 |
10 | sys.path.insert(0, os.path.abspath('../../../'))
11 |
12 | # Stash the needed modules inside __file__ so this stub adds no extra names
13 | # at module scope; the import machinery resets __file__ during the reload() below.
14 | __file__ = {'sys': sys, 'importlib': importlib}
15 |
16 | del importlib
17 | del os
18 | del sys
19 |
20 | __file__['importlib'].reload(__file__['sys'].modules[__name__])
21 |
--------------------------------------------------------------------------------
/notebooks/opensearch/tmdb/fmap.txt:
--------------------------------------------------------------------------------
1 | 0 release_year q
2 | 1 features0 q
3 |
--------------------------------------------------------------------------------
/notebooks/opensearch/tmdb/ltr.py:
--------------------------------------------------------------------------------
1 | # Import the real 'ltr' package from the repo root in place of this same-named stub.
2 | #
3 | # Adapted from
4 | # https://mg.readthedocs.io/importing-local-python-modules-from-jupyter-notebooks/sys-path-in-helper-module/path-helper.html
5 |
6 | import importlib
7 | import os
8 | import sys
9 |
10 | sys.path.insert(0, os.path.abspath('../../../'))
11 |
12 | # Stash the needed modules inside __file__ so this stub adds no extra names
13 | # at module scope; the import machinery resets __file__ during the reload() below.
14 | __file__ = {'sys': sys, 'importlib': importlib}
15 |
16 | del importlib
17 | del os
18 | del sys
19 |
20 | __file__['importlib'].reload(__file__['sys'].modules[__name__])
21 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/solr.xml:
--------------------------------------------------------------------------------
1 |
2 |
18 |
19 |
28 |
29 |
30 |
31 |
32 |
33 | ${host:}
34 | ${jetty.port:8983}
35 | ${hostContext:solr}
36 |
37 | ${genericCoreNodeNames:true}
38 |
39 | ${zkClientTimeout:30000}
40 | ${distribUpdateSoTimeout:600000}
41 | ${distribUpdateConnTimeout:60000}
42 | ${zkCredentialsProvider:org.apache.solr.common.cloud.DefaultZkCredentialsProvider}
43 | ${zkACLProvider:org.apache.solr.common.cloud.DefaultZkACLProvider}
44 |
45 |
46 |
47 |
49 | ${socketTimeout:600000}
50 | ${connTimeout:60000}
51 |
52 |
53 |
54 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/elevate.xml:
--------------------------------------------------------------------------------
1 |
2 |
18 |
19 |
26 |
27 |
41 |
42 |
43 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/idioms.txt:
--------------------------------------------------------------------------------
1 | # Idioms is a synonyms file that captures idiomatic phrases as single units
2 |
3 | # LHS is all representations encountered in query or document
4 | looneytunes, looney tunes, looney toons => 12345
5 | sci fi, scifi, science fiction => 56789
6 |
7 | #looneytunes, looney tunes => looney_tunes
8 | #bugs bunny => bug_bunny
9 | #mickey mouse => mickey_mouse
10 | #minnie mouse => minnie_mouse
11 | #donald duck => donald_duck
12 | #yogi bear => yogi_bear
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/contractions_ca.txt:
--------------------------------------------------------------------------------
1 | # Set of Catalan contractions for ElisionFilter
2 | # TODO: load this as a resource from the analyzer and sync it in build.xml
3 | d
4 | l
5 | m
6 | n
7 | s
8 | t
9 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/contractions_fr.txt:
--------------------------------------------------------------------------------
1 | # Set of French contractions for ElisionFilter
2 | # TODO: load this as a resource from the analyzer and sync it in build.xml
3 | l
4 | m
5 | t
6 | qu
7 | n
8 | s
9 | j
10 | d
11 | c
12 | jusqu
13 | quoiqu
14 | lorsqu
15 | puisqu
16 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/contractions_ga.txt:
--------------------------------------------------------------------------------
1 | # Set of Irish contractions for ElisionFilter
2 | # TODO: load this as a resource from the analyzer and sync it in build.xml
3 | d
4 | m
5 | b
6 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/contractions_it.txt:
--------------------------------------------------------------------------------
1 | # Set of Italian contractions for ElisionFilter
2 | # TODO: load this as a resource from the analyzer and sync it in build.xml
3 | c
4 | l
5 | all
6 | dall
7 | dell
8 | nell
9 | sull
10 | coll
11 | pell
12 | gl
13 | agl
14 | dagl
15 | degl
16 | negl
17 | sugl
18 | un
19 | m
20 | t
21 | s
22 | v
23 | d
24 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/hyphenations_ga.txt:
--------------------------------------------------------------------------------
1 | # Set of Irish hyphenations for StopFilter
2 | # TODO: load this as a resource from the analyzer and sync it in build.xml
3 | h
4 | n
5 | t
6 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/stemdict_nl.txt:
--------------------------------------------------------------------------------
1 | # Set of overrides for the dutch stemmer
2 | # TODO: load this as a resource from the analyzer and sync it in build.xml
3 | fiets fiets
4 | bromfiets bromfiets
5 | ei eier
6 | kind kinder
7 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_ar.txt:
--------------------------------------------------------------------------------
1 | # This file was created by Jacques Savoy and is distributed under the BSD license.
2 | # See http://members.unine.ch/jacques.savoy/clef/index.html.
3 | # Also see http://www.opensource.org/licenses/bsd-license.html
4 | # Cleaned on October 11, 2009 (not normalized, so use before normalization)
5 | # This means that when modifying this list, you might need to add some
6 | # redundant entries, for example containing forms with both أ and ا
7 | من
8 | ومن
9 | منها
10 | منه
11 | في
12 | وفي
13 | فيها
14 | فيه
15 | و
16 | ف
17 | ثم
18 | او
19 | أو
20 | ب
21 | بها
22 | به
23 | ا
24 | أ
25 | اى
26 | اي
27 | أي
28 | أى
29 | لا
30 | ولا
31 | الا
32 | ألا
33 | إلا
34 | لكن
35 | ما
36 | وما
37 | كما
38 | فما
39 | عن
40 | مع
41 | اذا
42 | إذا
43 | ان
44 | أن
45 | إن
46 | انها
47 | أنها
48 | إنها
49 | انه
50 | أنه
51 | إنه
52 | بان
53 | بأن
54 | فان
55 | فأن
56 | وان
57 | وأن
58 | وإن
59 | التى
60 | التي
61 | الذى
62 | الذي
63 | الذين
64 | الى
65 | الي
66 | إلى
67 | إلي
68 | على
69 | عليها
70 | عليه
71 | اما
72 | أما
73 | إما
74 | ايضا
75 | أيضا
76 | كل
77 | وكل
78 | لم
79 | ولم
80 | لن
81 | ولن
82 | هى
83 | هي
84 | هو
85 | وهى
86 | وهي
87 | وهو
88 | فهى
89 | فهي
90 | فهو
91 | انت
92 | أنت
93 | لك
94 | لها
95 | له
96 | هذه
97 | هذا
98 | تلك
99 | ذلك
100 | هناك
101 | كانت
102 | كان
103 | يكون
104 | تكون
105 | وكانت
106 | وكان
107 | غير
108 | بعض
109 | قد
110 | نحو
111 | بين
112 | بينما
113 | منذ
114 | ضمن
115 | حيث
116 | الان
117 | الآن
118 | خلال
119 | بعد
120 | قبل
121 | حتى
122 | عند
123 | عندما
124 | لدى
125 | جميع
126 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_bg.txt:
--------------------------------------------------------------------------------
1 | # This file was created by Jacques Savoy and is distributed under the BSD license.
2 | # See http://members.unine.ch/jacques.savoy/clef/index.html.
3 | # Also see http://www.opensource.org/licenses/bsd-license.html
4 | а
5 | аз
6 | ако
7 | ала
8 | бе
9 | без
10 | беше
11 | би
12 | бил
13 | била
14 | били
15 | било
16 | близо
17 | бъдат
18 | бъде
19 | бяха
20 | в
21 | вас
22 | ваш
23 | ваша
24 | вероятно
25 | вече
26 | взема
27 | ви
28 | вие
29 | винаги
30 | все
31 | всеки
32 | всички
33 | всичко
34 | всяка
35 | във
36 | въпреки
37 | върху
38 | г
39 | ги
40 | главно
41 | го
42 | д
43 | да
44 | дали
45 | до
46 | докато
47 | докога
48 | дори
49 | досега
50 | доста
51 | е
52 | едва
53 | един
54 | ето
55 | за
56 | зад
57 | заедно
58 | заради
59 | засега
60 | затова
61 | защо
62 | защото
63 | и
64 | из
65 | или
66 | им
67 | има
68 | имат
69 | иска
70 | й
71 | каза
72 | как
73 | каква
74 | какво
75 | както
76 | какъв
77 | като
78 | кога
79 | когато
80 | което
81 | които
82 | кой
83 | който
84 | колко
85 | която
86 | къде
87 | където
88 | към
89 | ли
90 | м
91 | ме
92 | между
93 | мен
94 | ми
95 | мнозина
96 | мога
97 | могат
98 | може
99 | моля
100 | момента
101 | му
102 | н
103 | на
104 | над
105 | назад
106 | най
107 | направи
108 | напред
109 | например
110 | нас
111 | не
112 | него
113 | нея
114 | ни
115 | ние
116 | никой
117 | нито
118 | но
119 | някои
120 | някой
121 | няма
122 | обаче
123 | около
124 | освен
125 | особено
126 | от
127 | отгоре
128 | отново
129 | още
130 | пак
131 | по
132 | повече
133 | повечето
134 | под
135 | поне
136 | поради
137 | после
138 | почти
139 | прави
140 | пред
141 | преди
142 | през
143 | при
144 | пък
145 | първо
146 | с
147 | са
148 | само
149 | се
150 | сега
151 | си
152 | скоро
153 | след
154 | сме
155 | според
156 | сред
157 | срещу
158 | сте
159 | съм
160 | със
161 | също
162 | т
163 | тази
164 | така
165 | такива
166 | такъв
167 | там
168 | твой
169 | те
170 | тези
171 | ти
172 | тн
173 | то
174 | това
175 | тогава
176 | този
177 | той
178 | толкова
179 | точно
180 | трябва
181 | тук
182 | тъй
183 | тя
184 | тях
185 | у
186 | харесва
187 | ч
188 | че
189 | често
190 | чрез
191 | ще
192 | щом
193 | я
194 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_ca.txt:
--------------------------------------------------------------------------------
1 | # Catalan stopwords from http://github.com/vcl/cue.language (Apache 2 Licensed)
2 | a
3 | abans
4 | ací
5 | ah
6 | així
7 | això
8 | al
9 | als
10 | aleshores
11 | algun
12 | alguna
13 | algunes
14 | alguns
15 | alhora
16 | allà
17 | allí
18 | allò
19 | altra
20 | altre
21 | altres
22 | amb
23 | ambdós
24 | ambdues
25 | apa
26 | aquell
27 | aquella
28 | aquelles
29 | aquells
30 | aquest
31 | aquesta
32 | aquestes
33 | aquests
34 | aquí
35 | baix
36 | cada
37 | cadascú
38 | cadascuna
39 | cadascunes
40 | cadascuns
41 | com
42 | contra
43 | d'un
44 | d'una
45 | d'unes
46 | d'uns
47 | dalt
48 | de
49 | del
50 | dels
51 | des
52 | després
53 | dins
54 | dintre
55 | donat
56 | doncs
57 | durant
58 | e
59 | eh
60 | el
61 | els
62 | em
63 | en
64 | encara
65 | ens
66 | entre
67 | érem
68 | eren
69 | éreu
70 | es
71 | és
72 | esta
73 | està
74 | estàvem
75 | estaven
76 | estàveu
77 | esteu
78 | et
79 | etc
80 | ets
81 | fins
82 | fora
83 | gairebé
84 | ha
85 | han
86 | has
87 | havia
88 | he
89 | hem
90 | heu
91 | hi
92 | ho
93 | i
94 | igual
95 | iguals
96 | ja
97 | l'hi
98 | la
99 | les
100 | li
101 | li'n
102 | llavors
103 | m'he
104 | ma
105 | mal
106 | malgrat
107 | mateix
108 | mateixa
109 | mateixes
110 | mateixos
111 | me
112 | mentre
113 | més
114 | meu
115 | meus
116 | meva
117 | meves
118 | molt
119 | molta
120 | moltes
121 | molts
122 | mon
123 | mons
124 | n'he
125 | n'hi
126 | ne
127 | ni
128 | no
129 | nogensmenys
130 | només
131 | nosaltres
132 | nostra
133 | nostre
134 | nostres
135 | o
136 | oh
137 | oi
138 | on
139 | pas
140 | pel
141 | pels
142 | per
143 | però
144 | perquè
145 | poc
146 | poca
147 | pocs
148 | poques
149 | potser
150 | propi
151 | qual
152 | quals
153 | quan
154 | quant
155 | que
156 | què
157 | quelcom
158 | qui
159 | quin
160 | quina
161 | quines
162 | quins
163 | s'ha
164 | s'han
165 | sa
166 | semblant
167 | semblants
168 | ses
169 | seu
170 | seus
171 | seva
172 | seva
173 | seves
174 | si
175 | sobre
176 | sobretot
177 | sóc
178 | solament
179 | sols
180 | son
181 | són
182 | sons
183 | sota
184 | sou
185 | t'ha
186 | t'han
187 | t'he
188 | ta
189 | tal
190 | també
191 | tampoc
192 | tan
193 | tant
194 | tanta
195 | tantes
196 | teu
197 | teus
198 | teva
199 | teves
200 | ton
201 | tons
202 | tot
203 | tota
204 | totes
205 | tots
206 | un
207 | una
208 | unes
209 | uns
210 | us
211 | va
212 | vaig
213 | vam
214 | van
215 | vas
216 | veu
217 | vosaltres
218 | vostra
219 | vostre
220 | vostres
221 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_cz.txt:
--------------------------------------------------------------------------------
1 | a
2 | s
3 | k
4 | o
5 | i
6 | u
7 | v
8 | z
9 | dnes
10 | cz
11 | tímto
12 | budeš
13 | budem
14 | byli
15 | jseš
16 | můj
17 | svým
18 | ta
19 | tomto
20 | tohle
21 | tuto
22 | tyto
23 | jej
24 | zda
25 | proč
26 | máte
27 | tato
28 | kam
29 | tohoto
30 | kdo
31 | kteří
32 | mi
33 | nám
34 | tom
35 | tomuto
36 | mít
37 | nic
38 | proto
39 | kterou
40 | byla
41 | toho
42 | protože
43 | asi
44 | ho
45 | naši
46 | napište
47 | re
48 | což
49 | tím
50 | takže
51 | svých
52 | její
53 | svými
54 | jste
55 | aj
56 | tu
57 | tedy
58 | teto
59 | bylo
60 | kde
61 | ke
62 | pravé
63 | ji
64 | nad
65 | nejsou
66 | či
67 | pod
68 | téma
69 | mezi
70 | přes
71 | ty
72 | pak
73 | vám
74 | ani
75 | když
76 | však
77 | neg
78 | jsem
79 | tento
80 | článku
81 | články
82 | aby
83 | jsme
84 | před
85 | pta
86 | jejich
87 | byl
88 | ještě
89 | až
90 | bez
91 | také
92 | pouze
93 | první
94 | vaše
95 | která
96 | nás
97 | nový
98 | tipy
99 | pokud
100 | může
101 | strana
102 | jeho
103 | své
104 | jiné
105 | zprávy
106 | nové
107 | není
108 | vás
109 | jen
110 | podle
111 | zde
112 | už
113 | být
114 | více
115 | bude
116 | již
117 | než
118 | který
119 | by
120 | které
121 | co
122 | nebo
123 | ten
124 | tak
125 | má
126 | při
127 | od
128 | po
129 | jsou
130 | jak
131 | další
132 | ale
133 | si
134 | se
135 | ve
136 | to
137 | jako
138 | za
139 | zpět
140 | ze
141 | do
142 | pro
143 | je
144 | na
145 | atd
146 | atp
147 | jakmile
148 | přičemž
149 | já
150 | on
151 | ona
152 | ono
153 | oni
154 | ony
155 | my
156 | vy
157 | jí
158 | ji
159 | mě
160 | mne
161 | jemu
162 | tomu
163 | těm
164 | těmu
165 | němu
166 | němuž
167 | jehož
168 | jíž
169 | jelikož
170 | jež
171 | jakož
172 | načež
173 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_el.txt:
--------------------------------------------------------------------------------
1 | # Lucene Greek Stopwords list
2 | # Note: by default this file is used after GreekLowerCaseFilter,
3 | # so when modifying this file use 'σ' instead of 'ς'
4 | ο
5 | η
6 | το
7 | οι
8 | τα
9 | του
10 | τησ
11 | των
12 | τον
13 | την
14 | και
15 | κι
16 | κ
17 | ειμαι
18 | εισαι
19 | ειναι
20 | ειμαστε
21 | ειστε
22 | στο
23 | στον
24 | στη
25 | στην
26 | μα
27 | αλλα
28 | απο
29 | για
30 | προσ
31 | με
32 | σε
33 | ωσ
34 | παρα
35 | αντι
36 | κατα
37 | μετα
38 | θα
39 | να
40 | δε
41 | δεν
42 | μη
43 | μην
44 | επι
45 | ενω
46 | εαν
47 | αν
48 | τοτε
49 | που
50 | πωσ
51 | ποιοσ
52 | ποια
53 | ποιο
54 | ποιοι
55 | ποιεσ
56 | ποιων
57 | ποιουσ
58 | αυτοσ
59 | αυτη
60 | αυτο
61 | αυτοι
62 | αυτων
63 | αυτουσ
64 | αυτεσ
65 | αυτα
66 | εκεινοσ
67 | εκεινη
68 | εκεινο
69 | εκεινοι
70 | εκεινεσ
71 | εκεινα
72 | εκεινων
73 | εκεινουσ
74 | οπωσ
75 | ομωσ
76 | ισωσ
77 | οσο
78 | οτι
79 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_en.txt:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | # a couple of test stopwords to test that the words are really being
17 | # configured from this file:
18 | stopworda
19 | stopwordb
20 |
21 | # Standard english stop words taken from Lucene's StopAnalyzer
22 | a
23 | an
24 | and
25 | are
26 | as
27 | at
28 | be
29 | but
30 | by
31 | for
32 | if
33 | in
34 | into
35 | is
36 | it
37 | no
38 | not
39 | of
40 | on
41 | or
42 | such
43 | that
44 | the
45 | their
46 | then
47 | there
48 | these
49 | they
50 | this
51 | to
52 | was
53 | will
54 | with
55 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_eu.txt:
--------------------------------------------------------------------------------
1 | # example set of basque stopwords
2 | al
3 | anitz
4 | arabera
5 | asko
6 | baina
7 | bat
8 | batean
9 | batek
10 | bati
11 | batzuei
12 | batzuek
13 | batzuetan
14 | batzuk
15 | bera
16 | beraiek
17 | berau
18 | berauek
19 | bere
20 | berori
21 | beroriek
22 | beste
23 | bezala
24 | da
25 | dago
26 | dira
27 | ditu
28 | du
29 | dute
30 | edo
31 | egin
32 | ere
33 | eta
34 | eurak
35 | ez
36 | gainera
37 | gu
38 | gutxi
39 | guzti
40 | haiei
41 | haiek
42 | haietan
43 | hainbeste
44 | hala
45 | han
46 | handik
47 | hango
48 | hara
49 | hari
50 | hark
51 | hartan
52 | hau
53 | hauei
54 | hauek
55 | hauetan
56 | hemen
57 | hemendik
58 | hemengo
59 | hi
60 | hona
61 | honek
62 | honela
63 | honetan
64 | honi
65 | hor
66 | hori
67 | horiei
68 | horiek
69 | horietan
70 | horko
71 | horra
72 | horrek
73 | horrela
74 | horretan
75 | horri
76 | hortik
77 | hura
78 | izan
79 | ni
80 | noiz
81 | nola
82 | non
83 | nondik
84 | nongo
85 | nor
86 | nora
87 | ze
88 | zein
89 | zen
90 | zenbait
91 | zenbat
92 | zer
93 | zergatik
94 | ziren
95 | zituen
96 | zu
97 | zuek
98 | zuen
99 | zuten
100 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_fa.txt:
--------------------------------------------------------------------------------
1 | # This file was created by Jacques Savoy and is distributed under the BSD license.
2 | # See http://members.unine.ch/jacques.savoy/clef/index.html.
3 | # Also see http://www.opensource.org/licenses/bsd-license.html
4 | # Note: by default this file is used after normalization, so when adding entries
5 | # to this file, use the arabic 'ي' instead of 'ی'
6 | انان
7 | نداشته
8 | سراسر
9 | خياه
10 | ايشان
11 | وي
12 | تاكنون
13 | بيشتري
14 | دوم
15 | پس
16 | ناشي
17 | وگو
18 | يا
19 | داشتند
20 | سپس
21 | هنگام
22 | هرگز
23 | پنج
24 | نشان
25 | امسال
26 | ديگر
27 | گروهي
28 | شدند
29 | چطور
30 | ده
31 | و
32 | دو
33 | نخستين
34 | ولي
35 | چرا
36 | چه
37 | وسط
38 | ه
39 | كدام
40 | قابل
41 | يك
42 | رفت
43 | هفت
44 | همچنين
45 | در
46 | هزار
47 | بله
48 | بلي
49 | شايد
50 | اما
51 | شناسي
52 | گرفته
53 | دهد
54 | داشته
55 | دانست
56 | داشتن
57 | خواهيم
58 | ميليارد
59 | وقتيكه
60 | امد
61 | خواهد
62 | جز
63 | اورده
64 | شده
65 | بلكه
66 | خدمات
67 | شدن
68 | برخي
69 | نبود
70 | بسياري
71 | جلوگيري
72 | حق
73 | كردند
74 | نوعي
75 | بعري
76 | نكرده
77 | نظير
78 | نبايد
79 | بوده
80 | بودن
81 | داد
82 | اورد
83 | هست
84 | جايي
85 | شود
86 | دنبال
87 | داده
88 | بايد
89 | سابق
90 | هيچ
91 | همان
92 | انجا
93 | كمتر
94 | كجاست
95 | گردد
96 | كسي
97 | تر
98 | مردم
99 | تان
100 | دادن
101 | بودند
102 | سري
103 | جدا
104 | ندارند
105 | مگر
106 | يكديگر
107 | دارد
108 | دهند
109 | بنابراين
110 | هنگامي
111 | سمت
112 | جا
113 | انچه
114 | خود
115 | دادند
116 | زياد
117 | دارند
118 | اثر
119 | بدون
120 | بهترين
121 | بيشتر
122 | البته
123 | به
124 | براساس
125 | بيرون
126 | كرد
127 | بعضي
128 | گرفت
129 | توي
130 | اي
131 | ميليون
132 | او
133 | جريان
134 | تول
135 | بر
136 | مانند
137 | برابر
138 | باشيم
139 | مدتي
140 | گويند
141 | اكنون
142 | تا
143 | تنها
144 | جديد
145 | چند
146 | بي
147 | نشده
148 | كردن
149 | كردم
150 | گويد
151 | كرده
152 | كنيم
153 | نمي
154 | نزد
155 | روي
156 | قصد
157 | فقط
158 | بالاي
159 | ديگران
160 | اين
161 | ديروز
162 | توسط
163 | سوم
164 | ايم
165 | دانند
166 | سوي
167 | استفاده
168 | شما
169 | كنار
170 | داريم
171 | ساخته
172 | طور
173 | امده
174 | رفته
175 | نخست
176 | بيست
177 | نزديك
178 | طي
179 | كنيد
180 | از
181 | انها
182 | تمامي
183 | داشت
184 | يكي
185 | طريق
186 | اش
187 | چيست
188 | روب
189 | نمايد
190 | گفت
191 | چندين
192 | چيزي
193 | تواند
194 | ام
195 | ايا
196 | با
197 | ان
198 | ايد
199 | ترين
200 | اينكه
201 | ديگري
202 | راه
203 | هايي
204 | بروز
205 | همچنان
206 | پاعين
207 | كس
208 | حدود
209 | مختلف
210 | مقابل
211 | چيز
212 | گيرد
213 | ندارد
214 | ضد
215 | همچون
216 | سازي
217 | شان
218 | مورد
219 | باره
220 | مرسي
221 | خويش
222 | برخوردار
223 | چون
224 | خارج
225 | شش
226 | هنوز
227 | تحت
228 | ضمن
229 | هستيم
230 | گفته
231 | فكر
232 | بسيار
233 | پيش
234 | براي
235 | روزهاي
236 | انكه
237 | نخواهد
238 | بالا
239 | كل
240 | وقتي
241 | كي
242 | چنين
243 | كه
244 | گيري
245 | نيست
246 | است
247 | كجا
248 | كند
249 | نيز
250 | يابد
251 | بندي
252 | حتي
253 | توانند
254 | عقب
255 | خواست
256 | كنند
257 | بين
258 | تمام
259 | همه
260 | ما
261 | باشند
262 | مثل
263 | شد
264 | اري
265 | باشد
266 | اره
267 | طبق
268 | بعد
269 | اگر
270 | صورت
271 | غير
272 | جاي
273 | بيش
274 | ريزي
275 | اند
276 | زيرا
277 | چگونه
278 | بار
279 | لطفا
280 | مي
281 | درباره
282 | من
283 | ديده
284 | همين
285 | گذاري
286 | برداري
287 | علت
288 | گذاشته
289 | هم
290 | فوق
291 | نه
292 | ها
293 | شوند
294 | اباد
295 | همواره
296 | هر
297 | اول
298 | خواهند
299 | چهار
300 | نام
301 | امروز
302 | مان
303 | هاي
304 | قبل
305 | كنم
306 | سعي
307 | تازه
308 | را
309 | هستند
310 | زير
311 | جلوي
312 | عنوان
313 | بود
314 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_fi.txt:
--------------------------------------------------------------------------------
1 | | From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt
2 | | This file is distributed under the BSD License.
3 | | See http://snowball.tartarus.org/license.php
4 | | Also see http://www.opensource.org/licenses/bsd-license.html
5 | | - Encoding was converted to UTF-8.
6 | | - This notice was added.
7 | |
8 | | NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
9 |
10 | | forms of BE
11 |
12 | olla
13 | olen
14 | olet
15 | on
16 | olemme
17 | olette
18 | ovat
19 | ole | negative form
20 |
21 | oli
22 | olisi
23 | olisit
24 | olisin
25 | olisimme
26 | olisitte
27 | olisivat
28 | olit
29 | olin
30 | olimme
31 | olitte
32 | olivat
33 | ollut
34 | olleet
35 |
36 | en | negation
37 | et
38 | ei
39 | emme
40 | ette
41 | eivät
42 |
43 | |Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans
44 | minä minun minut minua minussa minusta minuun minulla minulta minulle | I
45 | sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you
46 | hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she
47 | me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we
48 | te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you
49 | he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they
50 |
51 | tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this
52 | tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that
53 | se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it
54 | nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these
55 | nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those
56 | ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they
57 |
58 | kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who
59 | ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl)
60 | mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what
61 | mitkä | (pl)
62 |
63 | joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which
64 | jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl)
65 |
66 | | conjunctions
67 |
68 | että | that
69 | ja | and
70 | jos | if
71 | koska | because
72 | kuin | than
73 | mutta | but
74 | niin | so
75 | sekä | and
76 | sillä | for
77 | tai | or
78 | vaan | but
79 | vai | or
80 | vaikka | although
81 |
82 |
83 | | prepositions
84 |
85 | kanssa | with
86 | mukaan | according to
87 | noin | about
88 | poikki | across
89 | yli | over, across
90 |
91 | | other
92 |
93 | kun | when
94 | niin | so
95 | nyt | now
96 | itse | self
97 |
98 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_ga.txt:
--------------------------------------------------------------------------------
1 |
2 | a
3 | ach
4 | ag
5 | agus
6 | an
7 | aon
8 | ar
9 | arna
10 | as
11 | b'
12 | ba
13 | beirt
14 | bhúr
15 | caoga
16 | ceathair
17 | ceathrar
18 | chomh
19 | chtó
20 | chuig
21 | chun
22 | cois
23 | céad
24 | cúig
25 | cúigear
26 | d'
27 | daichead
28 | dar
29 | de
30 | deich
31 | deichniúr
32 | den
33 | dhá
34 | do
35 | don
36 | dtí
37 | dá
38 | dár
39 | dó
40 | faoi
41 | faoin
42 | faoina
43 | faoinár
44 | fara
45 | fiche
46 | gach
47 | gan
48 | go
49 | gur
50 | haon
51 | hocht
52 | i
53 | iad
54 | idir
55 | in
56 | ina
57 | ins
58 | inár
59 | is
60 | le
61 | leis
62 | lena
63 | lenár
64 | m'
65 | mar
66 | mo
67 | mé
68 | na
69 | nach
70 | naoi
71 | naonúr
72 | ná
73 | ní
74 | níor
75 | nó
76 | nócha
77 | ocht
78 | ochtar
79 | os
80 | roimh
81 | sa
82 | seacht
83 | seachtar
84 | seachtó
85 | seasca
86 | seisear
87 | siad
88 | sibh
89 | sinn
90 | sna
91 | sé
92 | sí
93 | tar
94 | thar
95 | thú
96 | triúr
97 | trí
98 | trína
99 | trínár
100 | tríocha
101 | tú
102 | um
103 | ár
104 | é
105 | éis
106 | í
107 | ó
108 | ón
109 | óna
110 | ónár
111 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_gl.txt:
--------------------------------------------------------------------------------
1 | # galician stopwords
2 | a
3 | aínda
4 | alí
5 | aquel
6 | aquela
7 | aquelas
8 | aqueles
9 | aquilo
10 | aquí
11 | ao
12 | aos
13 | as
14 | así
15 | á
16 | ben
17 | cando
18 | che
19 | co
20 | coa
21 | comigo
22 | con
23 | connosco
24 | contigo
25 | convosco
26 | coas
27 | cos
28 | cun
29 | cuns
30 | cunha
31 | cunhas
32 | da
33 | dalgunha
34 | dalgunhas
35 | dalgún
36 | dalgúns
37 | das
38 | de
39 | del
40 | dela
41 | delas
42 | deles
43 | desde
44 | deste
45 | do
46 | dos
47 | dun
48 | duns
49 | dunha
50 | dunhas
51 | e
52 | el
53 | ela
54 | elas
55 | eles
56 | en
57 | era
58 | eran
59 | esa
60 | esas
61 | ese
62 | eses
63 | esta
64 | estar
65 | estaba
66 | está
67 | están
68 | este
69 | estes
70 | estiven
71 | estou
72 | eu
73 | é
74 | facer
75 | foi
76 | foron
77 | fun
78 | había
79 | hai
80 | iso
81 | isto
82 | la
83 | las
84 | lle
85 | lles
86 | lo
87 | los
88 | mais
89 | me
90 | meu
91 | meus
92 | min
93 | miña
94 | miñas
95 | moi
96 | na
97 | nas
98 | neste
99 | nin
100 | no
101 | non
102 | nos
103 | nosa
104 | nosas
105 | noso
106 | nosos
107 | nós
108 | nun
109 | nunha
110 | nuns
111 | nunhas
112 | o
113 | os
114 | ou
115 | ó
116 | ós
117 | para
118 | pero
119 | pode
120 | pois
121 | pola
122 | polas
123 | polo
124 | polos
125 | por
126 | que
127 | se
128 | senón
129 | ser
130 | seu
131 | seus
132 | sexa
133 | sido
134 | sobre
135 | súa
136 | súas
137 | tamén
138 | tan
139 | te
140 | ten
141 | teñen
142 | teño
143 | ter
144 | teu
145 | teus
146 | ti
147 | tido
148 | tiña
149 | tiven
150 | túa
151 | túas
152 | un
153 | unha
154 | unhas
155 | uns
156 | vos
157 | vosa
158 | vosas
159 | voso
160 | vosos
161 | vós
162 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_hi.txt:
--------------------------------------------------------------------------------
1 | # Also see http://www.opensource.org/licenses/bsd-license.html
2 | # See http://members.unine.ch/jacques.savoy/clef/index.html.
3 | # This file was created by Jacques Savoy and is distributed under the BSD license.
4 | # Note: by default this file also contains forms normalized by HindiNormalizer
5 | # for spelling variation (see section below), such that it can be used whether or
6 | # not you enable that feature. When adding additional entries to this list,
7 | # please add the normalized form as well.
8 | अंदर
9 | अत
10 | अपना
11 | अपनी
12 | अपने
13 | अभी
14 | आदि
15 | आप
16 | इत्यादि
17 | इन
18 | इनका
19 | इन्हीं
20 | इन्हें
21 | इन्हों
22 | इस
23 | इसका
24 | इसकी
25 | इसके
26 | इसमें
27 | इसी
28 | इसे
29 | उन
30 | उनका
31 | उनकी
32 | उनके
33 | उनको
34 | उन्हीं
35 | उन्हें
36 | उन्हों
37 | उस
38 | उसके
39 | उसी
40 | उसे
41 | एक
42 | एवं
43 | एस
44 | ऐसे
45 | और
46 | कई
47 | कर
48 | करता
49 | करते
50 | करना
51 | करने
52 | करें
53 | कहते
54 | कहा
55 | का
56 | काफ़ी
57 | कि
58 | कितना
59 | किन्हें
60 | किन्हों
61 | किया
62 | किर
63 | किस
64 | किसी
65 | किसे
66 | की
67 | कुछ
68 | कुल
69 | के
70 | को
71 | कोई
72 | कौन
73 | कौनसा
74 | गया
75 | घर
76 | जब
77 | जहाँ
78 | जा
79 | जितना
80 | जिन
81 | जिन्हें
82 | जिन्हों
83 | जिस
84 | जिसे
85 | जीधर
86 | जैसा
87 | जैसे
88 | जो
89 | तक
90 | तब
91 | तरह
92 | तिन
93 | तिन्हें
94 | तिन्हों
95 | तिस
96 | तिसे
97 | तो
98 | था
99 | थी
100 | थे
101 | दबारा
102 | दिया
103 | दुसरा
104 | दूसरे
105 | दो
106 | द्वारा
107 | न
108 | नहीं
109 | ना
110 | निहायत
111 | नीचे
112 | ने
113 | पर
114 | पर
115 | पहले
116 | पूरा
117 | पे
118 | फिर
119 | बनी
120 | बही
121 | बहुत
122 | बाद
123 | बाला
124 | बिलकुल
125 | भी
126 | भीतर
127 | मगर
128 | मानो
129 | मे
130 | में
131 | यदि
132 | यह
133 | यहाँ
134 | यही
135 | या
136 | यिह
137 | ये
138 | रखें
139 | रहा
140 | रहे
141 | ऱ्वासा
142 | लिए
143 | लिये
144 | लेकिन
145 | व
146 | वर्ग
147 | वह
148 | वह
149 | वहाँ
150 | वहीं
151 | वाले
152 | वुह
153 | वे
154 | वग़ैरह
155 | संग
156 | सकता
157 | सकते
158 | सबसे
159 | सभी
160 | साथ
161 | साबुत
162 | साभ
163 | सारा
164 | से
165 | सो
166 | ही
167 | हुआ
168 | हुई
169 | हुए
170 | है
171 | हैं
172 | हो
173 | होता
174 | होती
175 | होते
176 | होना
177 | होने
178 | # additional normalized forms of the above
179 | अपनि
180 | जेसे
181 | होति
182 | सभि
183 | तिंहों
184 | इंहों
185 | दवारा
186 | इसि
187 | किंहें
188 | थि
189 | उंहों
190 | ओर
191 | जिंहें
192 | वहिं
193 | अभि
194 | बनि
195 | हि
196 | उंहिं
197 | उंहें
198 | हें
199 | वगेरह
200 | एसे
201 | रवासा
202 | कोन
203 | निचे
204 | काफि
205 | उसि
206 | पुरा
207 | भितर
208 | हे
209 | बहि
210 | वहां
211 | कोइ
212 | यहां
213 | जिंहों
214 | तिंहें
215 | किसि
216 | कइ
217 | यहि
218 | इंहिं
219 | जिधर
220 | इंहें
221 | अदि
222 | इतयादि
223 | हुइ
224 | कोनसा
225 | इसकि
226 | दुसरे
227 | जहां
228 | अप
229 | किंहों
230 | उनकि
231 | भि
232 | वरग
233 | हुअ
234 | जेसा
235 | नहिं
236 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_hu.txt:
--------------------------------------------------------------------------------
1 | | From svn.tartarus.org/snowball/trunk/website/algorithms/hungarian/stop.txt
2 | | This file is distributed under the BSD License.
3 | | See http://snowball.tartarus.org/license.php
4 | | Also see http://www.opensource.org/licenses/bsd-license.html
5 | | - Encoding was converted to UTF-8.
6 | | - This notice was added.
7 | |
8 | | NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
9 |
10 | | Hungarian stop word list
11 | | prepared by Anna Tordai
12 |
13 | a
14 | ahogy
15 | ahol
16 | aki
17 | akik
18 | akkor
19 | alatt
20 | által
21 | általában
22 | amely
23 | amelyek
24 | amelyekben
25 | amelyeket
26 | amelyet
27 | amelynek
28 | ami
29 | amit
30 | amolyan
31 | amíg
32 | amikor
33 | át
34 | abban
35 | ahhoz
36 | annak
37 | arra
38 | arról
39 | az
40 | azok
41 | azon
42 | azt
43 | azzal
44 | azért
45 | aztán
46 | azután
47 | azonban
48 | bár
49 | be
50 | belül
51 | benne
52 | cikk
53 | cikkek
54 | cikkeket
55 | csak
56 | de
57 | e
58 | eddig
59 | egész
60 | egy
61 | egyes
62 | egyetlen
63 | egyéb
64 | egyik
65 | egyre
66 | ekkor
67 | el
68 | elég
69 | ellen
70 | elő
71 | először
72 | előtt
73 | első
74 | én
75 | éppen
76 | ebben
77 | ehhez
78 | emilyen
79 | ennek
80 | erre
81 | ez
82 | ezt
83 | ezek
84 | ezen
85 | ezzel
86 | ezért
87 | és
88 | fel
89 | felé
90 | hanem
91 | hiszen
92 | hogy
93 | hogyan
94 | igen
95 | így
96 | illetve
97 | ill.
98 | ill
99 | ilyen
100 | ilyenkor
101 | ison
102 | ismét
103 | itt
104 | jó
105 | jól
106 | jobban
107 | kell
108 | kellett
109 | keresztül
110 | keressünk
111 | ki
112 | kívül
113 | között
114 | közül
115 | legalább
116 | lehet
117 | lehetett
118 | legyen
119 | lenne
120 | lenni
121 | lesz
122 | lett
123 | maga
124 | magát
125 | majd
126 | majd
127 | már
128 | más
129 | másik
130 | meg
131 | még
132 | mellett
133 | mert
134 | mely
135 | melyek
136 | mi
137 | mit
138 | míg
139 | miért
140 | milyen
141 | mikor
142 | minden
143 | mindent
144 | mindenki
145 | mindig
146 | mint
147 | mintha
148 | mivel
149 | most
150 | nagy
151 | nagyobb
152 | nagyon
153 | ne
154 | néha
155 | nekem
156 | neki
157 | nem
158 | néhány
159 | nélkül
160 | nincs
161 | olyan
162 | ott
163 | össze
164 | ő
165 | ők
166 | őket
167 | pedig
168 | persze
169 | rá
170 | s
171 | saját
172 | sem
173 | semmi
174 | sok
175 | sokat
176 | sokkal
177 | számára
178 | szemben
179 | szerint
180 | szinte
181 | talán
182 | tehát
183 | teljes
184 | tovább
185 | továbbá
186 | több
187 | úgy
188 | ugyanis
189 | új
190 | újabb
191 | újra
192 | után
193 | utána
194 | utolsó
195 | vagy
196 | vagyis
197 | valaki
198 | valami
199 | valamint
200 | való
201 | vagyok
202 | van
203 | vannak
204 | volt
205 | voltam
206 | voltak
207 | voltunk
208 | vissza
209 | vele
210 | viszont
211 | volna
212 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_hy.txt:
--------------------------------------------------------------------------------
1 | # example set of Armenian stopwords.
2 | այդ
3 | այլ
4 | այն
5 | այս
6 | դու
7 | դուք
8 | եմ
9 | են
10 | ենք
11 | ես
12 | եք
13 | է
14 | էի
15 | էին
16 | էինք
17 | էիր
18 | էիք
19 | էր
20 | ըստ
21 | թ
22 | ի
23 | ին
24 | իսկ
25 | իր
26 | կամ
27 | համար
28 | հետ
29 | հետո
30 | մենք
31 | մեջ
32 | մի
33 | ն
34 | նա
35 | նաև
36 | նրա
37 | նրանք
38 | որ
39 | որը
40 | որոնք
41 | որպես
42 | ու
43 | ում
44 | պիտի
45 | վրա
46 | և
47 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_ja.txt:
--------------------------------------------------------------------------------
1 | #
2 | # This file defines a stopword set for Japanese.
3 | #
4 | # This set is made up of hand-picked frequent terms from segmented Japanese Wikipedia.
5 | # Punctuation characters and frequent kanji have mostly been left out. See LUCENE-3745
6 | # for frequency lists, etc. that can be useful for making your own set (if desired)
7 | #
8 | # Note that there is an overlap between these stopwords and the terms stopped when used
9 | # in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note
10 | # that comments are not allowed on the same line as stopwords.
11 | #
12 | # Also note that stopping is done in a case-insensitive manner. Change your StopFilter
13 | # configuration if you need case-sensitive stopping. Lastly, note that stopping is done
14 | # using the same character width as the entries in this file. Since this StopFilter is
15 | # normally done after a CJKWidthFilter in your chain, you would usually want your romaji
16 | # entries to be in half-width and your kana entries to be in full-width.
17 | #
18 | の
19 | に
20 | は
21 | を
22 | た
23 | が
24 | で
25 | て
26 | と
27 | し
28 | れ
29 | さ
30 | ある
31 | いる
32 | も
33 | する
34 | から
35 | な
36 | こと
37 | として
38 | い
39 | や
40 | れる
41 | など
42 | なっ
43 | ない
44 | この
45 | ため
46 | その
47 | あっ
48 | よう
49 | また
50 | もの
51 | という
52 | あり
53 | まで
54 | られ
55 | なる
56 | へ
57 | か
58 | だ
59 | これ
60 | によって
61 | により
62 | おり
63 | より
64 | による
65 | ず
66 | なり
67 | られる
68 | において
69 | ば
70 | なかっ
71 | なく
72 | しかし
73 | について
74 | せ
75 | だっ
76 | その後
77 | できる
78 | それ
79 | う
80 | ので
81 | なお
82 | のみ
83 | でき
84 | き
85 | つ
86 | における
87 | および
88 | いう
89 | さらに
90 | でも
91 | ら
92 | たり
93 | その他
94 | に関する
95 | たち
96 | ます
97 | ん
98 | なら
99 | に対して
100 | 特に
101 | せる
102 | 及び
103 | これら
104 | とき
105 | では
106 | にて
107 | ほか
108 | ながら
109 | うち
110 | そして
111 | とともに
112 | ただし
113 | かつて
114 | それぞれ
115 | または
116 | お
117 | ほど
118 | ものの
119 | に対する
120 | ほとんど
121 | と共に
122 | といった
123 | です
124 | とも
125 | ところ
126 | ここ
127 | ##### End of file
128 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_lv.txt:
--------------------------------------------------------------------------------
1 | # Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins
2 | # the original list of over 800 forms was refined:
3 | # pronouns, adverbs, interjections were removed
4 | #
5 | # prepositions
6 | aiz
7 | ap
8 | ar
9 | apakš
10 | ārpus
11 | augšpus
12 | bez
13 | caur
14 | dēļ
15 | gar
16 | iekš
17 | iz
18 | kopš
19 | labad
20 | lejpus
21 | līdz
22 | no
23 | otrpus
24 | pa
25 | par
26 | pār
27 | pēc
28 | pie
29 | pirms
30 | pret
31 | priekš
32 | starp
33 | šaipus
34 | uz
35 | viņpus
36 | virs
37 | virspus
38 | zem
39 | apakšpus
40 | # Conjunctions
41 | un
42 | bet
43 | jo
44 | ja
45 | ka
46 | lai
47 | tomēr
48 | tikko
49 | turpretī
50 | arī
51 | kaut
52 | gan
53 | tādēļ
54 | tā
55 | ne
56 | tikvien
57 | vien
58 | kā
59 | ir
60 | te
61 | vai
62 | kamēr
63 | # Particles
64 | ar
65 | diezin
66 | droši
67 | diemžēl
68 | nebūt
69 | ik
70 | it
71 | taču
72 | nu
73 | pat
74 | tiklab
75 | iekšpus
76 | nedz
77 | tik
78 | nevis
79 | turpretim
80 | jeb
81 | iekam
82 | iekām
83 | iekāms
84 | kolīdz
85 | līdzko
86 | tiklīdz
87 | jebšu
88 | tālab
89 | tāpēc
90 | nekā
91 | itin
92 | jā
93 | jau
94 | jel
95 | nē
96 | nezin
97 | tad
98 | tikai
99 | vis
100 | tak
101 | iekams
102 | vien
103 | # modal verbs
104 | būt
105 | biju
106 | biji
107 | bija
108 | bijām
109 | bijāt
110 | esmu
111 | esi
112 | esam
113 | esat
114 | būšu
115 | būsi
116 | būs
117 | būsim
118 | būsiet
119 | tikt
120 | tiku
121 | tiki
122 | tika
123 | tikām
124 | tikāt
125 | tieku
126 | tiec
127 | tiek
128 | tiekam
129 | tiekat
130 | tikšu
131 | tiks
132 | tiksim
133 | tiksiet
134 | tapt
135 | tapi
136 | tapāt
137 | topat
138 | tapšu
139 | tapsi
140 | taps
141 | tapsim
142 | tapsiet
143 | kļūt
144 | kļuvu
145 | kļuvi
146 | kļuva
147 | kļuvām
148 | kļuvāt
149 | kļūstu
150 | kļūsti
151 | kļūst
152 | kļūstam
153 | kļūstat
154 | kļūšu
155 | kļūsi
156 | kļūs
157 | kļūsim
158 | kļūsiet
159 | # verbs
160 | varēt
161 | varēju
162 | varējām
163 | varēšu
164 | varēsim
165 | var
166 | varēji
167 | varējāt
168 | varēsi
169 | varēsiet
170 | varat
171 | varēja
172 | varēs
173 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_ro.txt:
--------------------------------------------------------------------------------
1 | # This file was created by Jacques Savoy and is distributed under the BSD license.
2 | # See http://members.unine.ch/jacques.savoy/clef/index.html.
3 | # Also see http://www.opensource.org/licenses/bsd-license.html
4 | acea
5 | aceasta
6 | această
7 | aceea
8 | acei
9 | aceia
10 | acel
11 | acela
12 | acele
13 | acelea
14 | acest
15 | acesta
16 | aceste
17 | acestea
18 | aceşti
19 | aceştia
20 | acolo
21 | acum
22 | ai
23 | aia
24 | aibă
25 | aici
26 | al
27 | ăla
28 | ale
29 | alea
30 | ălea
31 | altceva
32 | altcineva
33 | am
34 | ar
35 | are
36 | aş
37 | aşadar
38 | asemenea
39 | asta
40 | ăsta
41 | astăzi
42 | astea
43 | ăstea
44 | ăştia
45 | asupra
46 | aţi
47 | au
48 | avea
49 | avem
50 | aveţi
51 | azi
52 | bine
53 | bucur
54 | bună
55 | ca
56 | că
57 | căci
58 | când
59 | care
60 | cărei
61 | căror
62 | cărui
63 | cât
64 | câte
65 | câţi
66 | către
67 | câtva
68 | ce
69 | cel
70 | ceva
71 | chiar
72 | cînd
73 | cine
74 | cineva
75 | cît
76 | cîte
77 | cîţi
78 | cîtva
79 | contra
80 | cu
81 | cum
82 | cumva
83 | curând
84 | curînd
85 | da
86 | dă
87 | dacă
88 | dar
89 | datorită
90 | de
91 | deci
92 | deja
93 | deoarece
94 | departe
95 | deşi
96 | din
97 | dinaintea
98 | dintr
99 | dintre
100 | drept
101 | după
102 | ea
103 | ei
104 | el
105 | ele
106 | eram
107 | este
108 | eşti
109 | eu
110 | face
111 | fără
112 | fi
113 | fie
114 | fiecare
115 | fii
116 | fim
117 | fiţi
118 | iar
119 | ieri
120 | îi
121 | îl
122 | îmi
123 | împotriva
124 | în
125 | înainte
126 | înaintea
127 | încât
128 | încît
129 | încotro
130 | între
131 | întrucât
132 | întrucît
133 | îţi
134 | la
135 | lângă
136 | le
137 | li
138 | lîngă
139 | lor
140 | lui
141 | mă
142 | mâine
143 | mea
144 | mei
145 | mele
146 | mereu
147 | meu
148 | mi
149 | mine
150 | mult
151 | multă
152 | mulţi
153 | ne
154 | nicăieri
155 | nici
156 | nimeni
157 | nişte
158 | noastră
159 | noastre
160 | noi
161 | noştri
162 | nostru
163 | nu
164 | ori
165 | oricând
166 | oricare
167 | oricât
168 | orice
169 | oricînd
170 | oricine
171 | oricît
172 | oricum
173 | oriunde
174 | până
175 | pe
176 | pentru
177 | peste
178 | pînă
179 | poate
180 | pot
181 | prea
182 | prima
183 | primul
184 | prin
185 | printr
186 | sa
187 | să
188 | săi
189 | sale
190 | sau
191 | său
192 | se
193 | şi
194 | sînt
195 | sîntem
196 | sînteţi
197 | spre
198 | sub
199 | sunt
200 | suntem
201 | sunteţi
202 | ta
203 | tăi
204 | tale
205 | tău
206 | te
207 | ţi
208 | ţie
209 | tine
210 | toată
211 | toate
212 | tot
213 | toţi
214 | totuşi
215 | tu
216 | un
217 | una
218 | unde
219 | undeva
220 | unei
221 | unele
222 | uneori
223 | unor
224 | vă
225 | vi
226 | voastră
227 | voastre
228 | voi
229 | voştri
230 | vostru
231 | vouă
232 | vreo
233 | vreun
234 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_th.txt:
--------------------------------------------------------------------------------
1 | # Thai stopwords from:
2 | # "Opinion Detection in Thai Political News Columns
3 | # Based on Subjectivity Analysis"
4 | # Khampol Sukhum, Supot Nitsuwat, and Choochart Haruechaiyasak
5 | ไว้
6 | ไม่
7 | ไป
8 | ได้
9 | ให้
10 | ใน
11 | โดย
12 | แห่ง
13 | แล้ว
14 | และ
15 | แรก
16 | แบบ
17 | แต่
18 | เอง
19 | เห็น
20 | เลย
21 | เริ่ม
22 | เรา
23 | เมื่อ
24 | เพื่อ
25 | เพราะ
26 | เป็นการ
27 | เป็น
28 | เปิดเผย
29 | เปิด
30 | เนื่องจาก
31 | เดียวกัน
32 | เดียว
33 | เช่น
34 | เฉพาะ
35 | เคย
36 | เข้า
37 | เขา
38 | อีก
39 | อาจ
40 | อะไร
41 | ออก
42 | อย่าง
43 | อยู่
44 | อยาก
45 | หาก
46 | หลาย
47 | หลังจาก
48 | หลัง
49 | หรือ
50 | หนึ่ง
51 | ส่วน
52 | ส่ง
53 | สุด
54 | สําหรับ
55 | ว่า
56 | วัน
57 | ลง
58 | ร่วม
59 | ราย
60 | รับ
61 | ระหว่าง
62 | รวม
63 | ยัง
64 | มี
65 | มาก
66 | มา
67 | พร้อม
68 | พบ
69 | ผ่าน
70 | ผล
71 | บาง
72 | น่า
73 | นี้
74 | นํา
75 | นั้น
76 | นัก
77 | นอกจาก
78 | ทุก
79 | ที่สุด
80 | ที่
81 | ทําให้
82 | ทํา
83 | ทาง
84 | ทั้งนี้
85 | ทั้ง
86 | ถ้า
87 | ถูก
88 | ถึง
89 | ต้อง
90 | ต่างๆ
91 | ต่าง
92 | ต่อ
93 | ตาม
94 | ตั้งแต่
95 | ตั้ง
96 | ด้าน
97 | ด้วย
98 | ดัง
99 | ซึ่ง
100 | ช่วง
101 | จึง
102 | จาก
103 | จัด
104 | จะ
105 | คือ
106 | ความ
107 | ครั้ง
108 | คง
109 | ขึ้น
110 | ของ
111 | ขอ
112 | ขณะ
113 | ก่อน
114 | ก็
115 | การ
116 | กับ
117 | กัน
118 | กว่า
119 | กล่าว
120 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_tr.txt:
--------------------------------------------------------------------------------
1 | # Turkish stopwords from LUCENE-559
2 | # merged with the list from "Information Retrieval on Turkish Texts"
3 | # (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf)
4 | acaba
5 | altmış
6 | altı
7 | ama
8 | ancak
9 | arada
10 | aslında
11 | ayrıca
12 | bana
13 | bazı
14 | belki
15 | ben
16 | benden
17 | beni
18 | benim
19 | beri
20 | beş
21 | bile
22 | bin
23 | bir
24 | birçok
25 | biri
26 | birkaç
27 | birkez
28 | birşey
29 | birşeyi
30 | biz
31 | bize
32 | bizden
33 | bizi
34 | bizim
35 | böyle
36 | böylece
37 | bu
38 | buna
39 | bunda
40 | bundan
41 | bunlar
42 | bunları
43 | bunların
44 | bunu
45 | bunun
46 | burada
47 | çok
48 | çünkü
49 | da
50 | daha
51 | dahi
52 | de
53 | defa
54 | değil
55 | diğer
56 | diye
57 | doksan
58 | dokuz
59 | dolayı
60 | dolayısıyla
61 | dört
62 | edecek
63 | eden
64 | ederek
65 | edilecek
66 | ediliyor
67 | edilmesi
68 | ediyor
69 | eğer
70 | elli
71 | en
72 | etmesi
73 | etti
74 | ettiği
75 | ettiğini
76 | gibi
77 | göre
78 | halen
79 | hangi
80 | hatta
81 | hem
82 | henüz
83 | hep
84 | hepsi
85 | her
86 | herhangi
87 | herkesin
88 | hiç
89 | hiçbir
90 | için
91 | iki
92 | ile
93 | ilgili
94 | ise
95 | işte
96 | itibaren
97 | itibariyle
98 | kadar
99 | karşın
100 | katrilyon
101 | kendi
102 | kendilerine
103 | kendini
104 | kendisi
105 | kendisine
106 | kendisini
107 | kez
108 | ki
109 | kim
110 | kimden
111 | kime
112 | kimi
113 | kimse
114 | kırk
115 | milyar
116 | milyon
117 | mu
118 | mü
119 | mı
120 | nasıl
121 | ne
122 | neden
123 | nedenle
124 | nerde
125 | nerede
126 | nereye
127 | niye
128 | niçin
129 | o
130 | olan
131 | olarak
132 | oldu
133 | olduğu
134 | olduğunu
135 | olduklarını
136 | olmadı
137 | olmadığı
138 | olmak
139 | olması
140 | olmayan
141 | olmaz
142 | olsa
143 | olsun
144 | olup
145 | olur
146 | olursa
147 | oluyor
148 | on
149 | ona
150 | ondan
151 | onlar
152 | onlardan
153 | onları
154 | onların
155 | onu
156 | onun
157 | otuz
158 | oysa
159 | öyle
160 | pek
161 | rağmen
162 | sadece
163 | sanki
164 | sekiz
165 | seksen
166 | sen
167 | senden
168 | seni
169 | senin
170 | siz
171 | sizden
172 | sizi
173 | sizin
174 | şey
175 | şeyden
176 | şeyi
177 | şeyler
178 | şöyle
179 | şu
180 | şuna
181 | şunda
182 | şundan
183 | şunları
184 | şunu
185 | tarafından
186 | trilyon
187 | tüm
188 | üç
189 | üzere
190 | var
191 | vardı
192 | ve
193 | veya
194 | ya
195 | yani
196 | yapacak
197 | yapılan
198 | yapılması
199 | yapıyor
200 | yapmak
201 | yaptı
202 | yaptığı
203 | yaptığını
204 | yaptıkları
205 | yedi
206 | yerine
207 | yetmiş
208 | yine
209 | yirmi
210 | yoksa
211 | yüz
212 | zaten
213 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/userdict_ja.txt:
--------------------------------------------------------------------------------
1 | #
2 | # This is a sample user dictionary for Kuromoji (JapaneseTokenizer)
3 | #
4 | # Add entries to this file in order to override the statistical model in terms
5 | # of segmentation, readings and part-of-speech tags. Notice that entries do
6 | # not have weights since they are always used when found. This is by-design
7 | # in order to maximize ease-of-use.
8 | #
9 | # Entries are defined using the following CSV format:
10 | # <text>,<token 1> ... <token n>,<reading 1> ... <reading n>,<part-of-speech tag>
11 | #
12 | # Notice that a single half-width space separates tokens and readings, and
13 | # that the number tokens and readings must match exactly.
14 | #
15 | # Also notice that multiple entries with the same <text> is undefined.
16 | #
17 | # Whitespace only lines are ignored. Comments are not allowed on entry lines.
18 | #
19 |
20 | # Custom segmentation for kanji compounds
21 | 日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞
22 | 関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞
23 |
24 | # Custom segmentation for compound katakana
25 | トートバッグ,トート バッグ,トート バッグ,かずカナ名詞
26 | ショルダーバッグ,ショルダー バッグ,ショルダー バッグ,かずカナ名詞
27 |
28 | # Custom reading for former sumo wrestler
29 | 朝青龍,朝青龍,アサショウリュウ,カスタム人名
30 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/name_synonyms.txt:
--------------------------------------------------------------------------------
1 | sky walker, skywalker
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/names.txt:
--------------------------------------------------------------------------------
1 | luke_skywalker
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/params.json:
--------------------------------------------------------------------------------
1 | {"params":{
2 | "query":{
3 | "defType":"edismax",
4 | "q.alt":"*:*",
5 | "rows":"10",
6 | "fl":"*,score",
7 | "":{"v":0}
8 | },
9 | "facets":{
10 | "facet":"on",
11 | "facet.mincount": "1",
12 | "":{"v":0}
13 | },
14 | "velocity":{
15 | "wt": "velocity",
16 | "v.template":"browse",
17 | "v.layout": "layout",
18 | "":{"v":0}
19 | }
20 | }}
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/protwords.txt:
--------------------------------------------------------------------------------
1 | # The ASF licenses this file to You under the Apache License, Version 2.0
2 | # (the "License"); you may not use this file except in compliance with
3 | # the License. You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | #-----------------------------------------------------------------------
14 | # Use a protected word file to protect against the stemmer reducing two
15 | # unrelated words to the same base word.
16 |
17 | # Some non-words that normally won't be encountered,
18 | # just to test that they won't be stemmed.
19 | dontstems
20 | zwhacky
21 |
22 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/stopwords.txt:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/synonyms.txt:
--------------------------------------------------------------------------------
1 | # The ASF licenses this file to You under the Apache License, Version 2.0
2 | # (the "License"); you may not use this file except in compliance with
3 | # the License. You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | #-----------------------------------------------------------------------
14 | #some test synonym mappings unlikely to appear in real input text
15 |
16 | # Demonstrating bidirectional synonyms
17 | #wife,bride
18 | #wife,spouse
19 | #toons,tunes,cartoon
20 |
21 | # Demonstrating => syntax
22 | # wife => wife, bride
23 | # spouse => spouse, husband, wife, partner
24 | # tunes => cartoons, toons, songs
25 | # cartoon => toons, tunes
26 |
27 | # Demonstrating multi phrase
28 | #looney tunes, cartoons
29 | #science fiction, sci fi, sci-fi, scifi
30 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/synonyms_bidirect.txt:
--------------------------------------------------------------------------------
1 | # Often people erroneously equate linguistic synonyms
2 | # with Solr synonyms. Here the bidirectional nature
3 | # of the synonyms creates problems where the more specific
4 | # term is not prioritized
5 | wife,bride
6 | wife,spouse
7 | toons,tunes,cartoon
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/synonyms_directed.txt:
--------------------------------------------------------------------------------
1 | wife => wife, bride
2 | spouse => spouse, husband, wife, partner
3 | tunes => cartoons, toons, songs
4 | cartoon => toons, tunes
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/synonyms_genres.txt:
--------------------------------------------------------------------------------
1 | scifi,science fiction,science fiction movie
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/synonyms_multiterm.txt:
--------------------------------------------------------------------------------
1 | # Here are some multi term synonym to
2 | # see what happens at query time
3 |
4 | looney tunes, cartoons
5 | science fiction, sci fi, sci-fi, scifi
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/taxonomy.txt:
--------------------------------------------------------------------------------
1 | # Capture how the *user* structures information
2 | #looneytunes, looney tunes => looney_tunes, cartoons
3 | #bugs bunny => bug_bunny, looney_tunes, cartoons
4 | #mickey mouse => mickey_mouse, disney, cartoons
5 | #minnie mouse => minnie_mouse, disney, cartoons
6 | #donald duck => donald_duck, disney, cartoons
7 | #yogi bear => yogi_bear, disney, cartoons
8 |
9 | wife => wife, spouse
10 | bride => bride, spouse
11 |
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/tmdb/conf/taxonomy_parent.txt:
--------------------------------------------------------------------------------
1 | # Capture how the *user* structures information
2 | #looneytunes, looney tunes => looney_tunes
3 | #bugs bunny => bug_bunny, looney_tunes
4 | #mickey mouse => mickey_mouse, disney
5 | #minnie mouse => minnie_mouse, disney
6 | #donald duck => donald_duck, disney
7 | #yogi bear => yogi_bear, disney
8 |
9 | wife => wife, spouse
10 | bride => bride, spouse
--------------------------------------------------------------------------------
/notebooks/solr/.docker/solr_home/zoo.cfg:
--------------------------------------------------------------------------------
1 | # The number of milliseconds of each tick
2 | tickTime=2000
3 | # The number of ticks that the initial
4 | # synchronization phase can take
5 | initLimit=10
6 | # The number of ticks that can pass between
7 | # sending a request and getting an acknowledgement
8 | syncLimit=5
9 |
10 | # the directory where the snapshot is stored.
11 | # dataDir=/opt/zookeeper/data
12 | # NOTE: Solr defaults the dataDir to /zoo_data
13 |
14 | # the port at which the clients will connect
15 | # clientPort=2181
16 | # NOTE: Solr sets this based on zkRun / zkHost params
17 |
18 |
--------------------------------------------------------------------------------
/notebooks/solr/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM solr:8.11.1
2 |
3 | USER root
4 |
5 | ADD tmdb/solr_config /var/solr/data/configsets/tmdb
6 | RUN chown solr:solr /var/solr/data/configsets/tmdb
7 |
8 | ADD msmarco/solr_config /var/solr/data/configsets/msmarco
9 | RUN chown solr:solr /var/solr/data/configsets/msmarco
10 |
11 | USER solr
12 |
13 | CMD ["solr-foreground", "-Dsolr.ltr.enabled=true"]
14 |
--------------------------------------------------------------------------------
/notebooks/solr/docker-compose.yml:
--------------------------------------------------------------------------------
1 | services:
2 | solr:
3 | build: .
4 | expose:
5 | - "8983"
6 | ports:
7 | - "8983:8983"
8 | volumes:
9 | - data:/var/solr
10 | environment:
11 | SERVER_HOST: "0.0.0.0"
12 | mem_limit: 4096m
13 | mem_reservation: 4096m
14 | volumes:
15 | data:
16 |
--------------------------------------------------------------------------------
/notebooks/solr/msmarco/ltr.py:
--------------------------------------------------------------------------------
1 | # Import a module with the same name from a different directory.
2 | #
3 | # Adapted from
4 | # https://mg.readthedocs.io/importing-local-python-modules-from-jupyter-notebooks/sys-path-in-helper-module/path-helper.html
5 |
6 | import importlib
7 | import os
8 | import sys
9 |
10 | sys.path.insert(0, os.path.abspath('../../../'))
11 |
12 | # Temporarily hijack __file__ to avoid adding names at module scope;
13 | # __file__ will be overwritten again during the reload() call.
14 | __file__ = {'sys': sys, 'importlib': importlib}
15 |
16 | del importlib
17 | del os
18 | del sys
19 |
20 | __file__['importlib'].reload(__file__['sys'].modules[__name__])
21 |
--------------------------------------------------------------------------------
/notebooks/solr/msmarco/solr_config/conf/elevate.xml:
--------------------------------------------------------------------------------
1 |
2 |
18 |
19 |
26 |
27 |
41 |
42 |
43 |
--------------------------------------------------------------------------------
/notebooks/solr/msmarco/solr_config/conf/params.json:
--------------------------------------------------------------------------------
1 | {"params":{
2 | "query":{
3 | "defType":"edismax",
4 | "q.alt":"*:*",
5 | "rows":"10",
6 | "fl":"*,score",
7 | "":{"v":0}
8 | },
9 | "facets":{
10 | "facet":"on",
11 | "facet.mincount": "1",
12 | "":{"v":0}
13 | },
14 | "velocity":{
15 | "wt": "velocity",
16 | "v.template":"browse",
17 | "v.layout": "layout",
18 | "":{"v":0}
19 | }
20 | }}
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/evaluation (Solr).ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# evaluate (Solr Edition)\n",
8 | "\n",
9 | "**Note:** This lab requires hello-ltr be run first. You must have the TMDB data indexed and LTR models configured before proceeding."
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "### RRE\n",
17 | "This lab makes use of the rated-ranking-evaluator [project](https://github.com/SeaseLtd/rated-ranking-evaluator) to carry out evaluations on our models from the hello-ltr lab.\n",
18 | "\n",
19 | "An RRE configuration requires the following:\n",
20 | "\n",
21 | "- configuration_sets\n",
22 | " - This tells RRE about the Solr/Elastic instance to use for each evaluation\n",
23 | "- corpora (Not required for this setup)\n",
24 | " - RRE supports indexing a snapshot of data for evaluations. For this lab we'll be using the data indexed previously.\n",
25 | "- ratings\n",
26 | " - This folder houses json files with queries and ratings to be evaluated\n",
27 | "- templates\n",
28 | " - The queries to be run by each configuration set\n",
29 | "- pom.xml\n",
30 |     "    - Maven project configuration, here you can configure what metrics are calculated by the evaluation and format of the report.\n",
31 | " \n",
32 | "Take a look at the rre folder in the hello-ltr to get a better idea of the project layout and structure."
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | "### Ratings and Evaluation\n",
40 | "To get started with RRE we first need some ratings. For this example we're going to use a query for \"batman\" and we're going to say that newer films are better than older ones. We will setup 3 different configuration sets in RRE:\n",
41 | "\n",
42 | "- baseline (No LTR applied)\n",
43 | "- classic (Rescore with the `classic` LTR model)\n",
44 | "- latest (Rescore with the `latest` LTR model)\n",
45 | "\n",
46 | "The snippet below will kick off an evaluation in RRE"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "from ltr import evaluate\n",
56 | "evaluate('solr')"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | "### Looking at the Results\n",
64 | "In this example we have rating data for every result in the Batman query and we're not adjusting matching so `Precision` and `Recall` are the expected value of 1. However, since we've altered the sorting of results with LTR we can see a lift in `ERR` as our higher rated documents are coming up closer to the top of the results."
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "from ltr import rre_table\n",
74 | "rre_table()"
75 | ]
76 | }
77 | ],
78 | "metadata": {
79 | "kernelspec": {
80 | "display_name": "Python 3",
81 | "language": "python",
82 | "name": "python3"
83 | },
84 | "language_info": {
85 | "codemirror_mode": {
86 | "name": "ipython",
87 | "version": 3
88 | },
89 | "file_extension": ".py",
90 | "mimetype": "text/x-python",
91 | "name": "python",
92 | "nbconvert_exporter": "python",
93 | "pygments_lexer": "ipython3",
94 | "version": "3.7.6"
95 | }
96 | },
97 | "nbformat": 4,
98 | "nbformat_minor": 2
99 | }
100 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/ltr.py:
--------------------------------------------------------------------------------
1 | # Import a module with the same name from a different directory.
2 | #
3 | # Adapted from
4 | # https://mg.readthedocs.io/importing-local-python-modules-from-jupyter-notebooks/sys-path-in-helper-module/path-helper.html
5 |
6 | import importlib
7 | import os
8 | import sys
9 |
10 | sys.path.insert(0, os.path.abspath('../../../'))
11 |
12 | # Temporarily hijack __file__ to avoid adding names at module scope;
13 | # __file__ will be overwritten again during the reload() call.
14 | __file__ = {'sys': sys, 'importlib': importlib}
15 |
16 | del importlib
17 | del os
18 | del sys
19 |
20 | __file__['importlib'].reload(__file__['sys'].modules[__name__])
21 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/elevate.xml:
--------------------------------------------------------------------------------
1 |
2 |
18 |
19 |
26 |
27 |
41 |
42 |
43 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/idioms.txt:
--------------------------------------------------------------------------------
1 | # Idioms is a synonyms file that captures idiomatic phrases as single units
2 |
3 | # LHS is all representations encountered in query or document
4 | looneytunes, looney tunes, looney toons => 12345
5 | sci fi, scifi, science fiction => 56789
6 |
7 | #looneytunes, looney tunes => looney_tunes
8 | #bugs bunny => bug_bunny
9 | #mickey mouse => mickey_mouse
10 | #minnie mouse => minnie_mouse
11 | #donald duck => donald_duck
12 | #yogi bear => yogi_bear
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/lang/contractions_ca.txt:
--------------------------------------------------------------------------------
1 | # Set of Catalan contractions for ElisionFilter
2 | # TODO: load this as a resource from the analyzer and sync it in build.xml
3 | d
4 | l
5 | m
6 | n
7 | s
8 | t
9 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/lang/contractions_fr.txt:
--------------------------------------------------------------------------------
1 | # Set of French contractions for ElisionFilter
2 | # TODO: load this as a resource from the analyzer and sync it in build.xml
3 | l
4 | m
5 | t
6 | qu
7 | n
8 | s
9 | j
10 | d
11 | c
12 | jusqu
13 | quoiqu
14 | lorsqu
15 | puisqu
16 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/lang/contractions_ga.txt:
--------------------------------------------------------------------------------
1 | # Set of Irish contractions for ElisionFilter
2 | # TODO: load this as a resource from the analyzer and sync it in build.xml
3 | d
4 | m
5 | b
6 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/lang/contractions_it.txt:
--------------------------------------------------------------------------------
1 | # Set of Italian contractions for ElisionFilter
2 | # TODO: load this as a resource from the analyzer and sync it in build.xml
3 | c
4 | l
5 | all
6 | dall
7 | dell
8 | nell
9 | sull
10 | coll
11 | pell
12 | gl
13 | agl
14 | dagl
15 | degl
16 | negl
17 | sugl
18 | un
19 | m
20 | t
21 | s
22 | v
23 | d
24 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/lang/hyphenations_ga.txt:
--------------------------------------------------------------------------------
1 | # Set of Irish hyphenations for StopFilter
2 | # TODO: load this as a resource from the analyzer and sync it in build.xml
3 | h
4 | n
5 | t
6 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/lang/stemdict_nl.txt:
--------------------------------------------------------------------------------
1 | # Set of overrides for the dutch stemmer
2 | # TODO: load this as a resource from the analyzer and sync it in build.xml
3 | fiets fiets
4 | bromfiets bromfiets
5 | ei eier
6 | kind kinder
7 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/lang/stopwords_ar.txt:
--------------------------------------------------------------------------------
1 | # This file was created by Jacques Savoy and is distributed under the BSD license.
2 | # See http://members.unine.ch/jacques.savoy/clef/index.html.
3 | # Also see http://www.opensource.org/licenses/bsd-license.html
4 | # Cleaned on October 11, 2009 (not normalized, so use before normalization)
5 | # This means that when modifying this list, you might need to add some
6 | # redundant entries, for example containing forms with both أ and ا
7 | من
8 | ومن
9 | منها
10 | منه
11 | في
12 | وفي
13 | فيها
14 | فيه
15 | و
16 | ف
17 | ثم
18 | او
19 | أو
20 | ب
21 | بها
22 | به
23 | ا
24 | أ
25 | اى
26 | اي
27 | أي
28 | أى
29 | لا
30 | ولا
31 | الا
32 | ألا
33 | إلا
34 | لكن
35 | ما
36 | وما
37 | كما
38 | فما
39 | عن
40 | مع
41 | اذا
42 | إذا
43 | ان
44 | أن
45 | إن
46 | انها
47 | أنها
48 | إنها
49 | انه
50 | أنه
51 | إنه
52 | بان
53 | بأن
54 | فان
55 | فأن
56 | وان
57 | وأن
58 | وإن
59 | التى
60 | التي
61 | الذى
62 | الذي
63 | الذين
64 | الى
65 | الي
66 | إلى
67 | إلي
68 | على
69 | عليها
70 | عليه
71 | اما
72 | أما
73 | إما
74 | ايضا
75 | أيضا
76 | كل
77 | وكل
78 | لم
79 | ولم
80 | لن
81 | ولن
82 | هى
83 | هي
84 | هو
85 | وهى
86 | وهي
87 | وهو
88 | فهى
89 | فهي
90 | فهو
91 | انت
92 | أنت
93 | لك
94 | لها
95 | له
96 | هذه
97 | هذا
98 | تلك
99 | ذلك
100 | هناك
101 | كانت
102 | كان
103 | يكون
104 | تكون
105 | وكانت
106 | وكان
107 | غير
108 | بعض
109 | قد
110 | نحو
111 | بين
112 | بينما
113 | منذ
114 | ضمن
115 | حيث
116 | الان
117 | الآن
118 | خلال
119 | بعد
120 | قبل
121 | حتى
122 | عند
123 | عندما
124 | لدى
125 | جميع
126 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/lang/stopwords_bg.txt:
--------------------------------------------------------------------------------
1 | # This file was created by Jacques Savoy and is distributed under the BSD license.
2 | # See http://members.unine.ch/jacques.savoy/clef/index.html.
3 | # Also see http://www.opensource.org/licenses/bsd-license.html
4 | а
5 | аз
6 | ако
7 | ала
8 | бе
9 | без
10 | беше
11 | би
12 | бил
13 | била
14 | били
15 | било
16 | близо
17 | бъдат
18 | бъде
19 | бяха
20 | в
21 | вас
22 | ваш
23 | ваша
24 | вероятно
25 | вече
26 | взема
27 | ви
28 | вие
29 | винаги
30 | все
31 | всеки
32 | всички
33 | всичко
34 | всяка
35 | във
36 | въпреки
37 | върху
38 | г
39 | ги
40 | главно
41 | го
42 | д
43 | да
44 | дали
45 | до
46 | докато
47 | докога
48 | дори
49 | досега
50 | доста
51 | е
52 | едва
53 | един
54 | ето
55 | за
56 | зад
57 | заедно
58 | заради
59 | засега
60 | затова
61 | защо
62 | защото
63 | и
64 | из
65 | или
66 | им
67 | има
68 | имат
69 | иска
70 | й
71 | каза
72 | как
73 | каква
74 | какво
75 | както
76 | какъв
77 | като
78 | кога
79 | когато
80 | което
81 | които
82 | кой
83 | който
84 | колко
85 | която
86 | къде
87 | където
88 | към
89 | ли
90 | м
91 | ме
92 | между
93 | мен
94 | ми
95 | мнозина
96 | мога
97 | могат
98 | може
99 | моля
100 | момента
101 | му
102 | н
103 | на
104 | над
105 | назад
106 | най
107 | направи
108 | напред
109 | например
110 | нас
111 | не
112 | него
113 | нея
114 | ни
115 | ние
116 | никой
117 | нито
118 | но
119 | някои
120 | някой
121 | няма
122 | обаче
123 | около
124 | освен
125 | особено
126 | от
127 | отгоре
128 | отново
129 | още
130 | пак
131 | по
132 | повече
133 | повечето
134 | под
135 | поне
136 | поради
137 | после
138 | почти
139 | прави
140 | пред
141 | преди
142 | през
143 | при
144 | пък
145 | първо
146 | с
147 | са
148 | само
149 | се
150 | сега
151 | си
152 | скоро
153 | след
154 | сме
155 | според
156 | сред
157 | срещу
158 | сте
159 | съм
160 | със
161 | също
162 | т
163 | тази
164 | така
165 | такива
166 | такъв
167 | там
168 | твой
169 | те
170 | тези
171 | ти
172 | тн
173 | то
174 | това
175 | тогава
176 | този
177 | той
178 | толкова
179 | точно
180 | трябва
181 | тук
182 | тъй
183 | тя
184 | тях
185 | у
186 | харесва
187 | ч
188 | че
189 | често
190 | чрез
191 | ще
192 | щом
193 | я
194 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/lang/stopwords_ca.txt:
--------------------------------------------------------------------------------
1 | # Catalan stopwords from http://github.com/vcl/cue.language (Apache 2 Licensed)
2 | a
3 | abans
4 | ací
5 | ah
6 | així
7 | això
8 | al
9 | als
10 | aleshores
11 | algun
12 | alguna
13 | algunes
14 | alguns
15 | alhora
16 | allà
17 | allí
18 | allò
19 | altra
20 | altre
21 | altres
22 | amb
23 | ambdós
24 | ambdues
25 | apa
26 | aquell
27 | aquella
28 | aquelles
29 | aquells
30 | aquest
31 | aquesta
32 | aquestes
33 | aquests
34 | aquí
35 | baix
36 | cada
37 | cadascú
38 | cadascuna
39 | cadascunes
40 | cadascuns
41 | com
42 | contra
43 | d'un
44 | d'una
45 | d'unes
46 | d'uns
47 | dalt
48 | de
49 | del
50 | dels
51 | des
52 | després
53 | dins
54 | dintre
55 | donat
56 | doncs
57 | durant
58 | e
59 | eh
60 | el
61 | els
62 | em
63 | en
64 | encara
65 | ens
66 | entre
67 | érem
68 | eren
69 | éreu
70 | es
71 | és
72 | esta
73 | està
74 | estàvem
75 | estaven
76 | estàveu
77 | esteu
78 | et
79 | etc
80 | ets
81 | fins
82 | fora
83 | gairebé
84 | ha
85 | han
86 | has
87 | havia
88 | he
89 | hem
90 | heu
91 | hi
92 | ho
93 | i
94 | igual
95 | iguals
96 | ja
97 | l'hi
98 | la
99 | les
100 | li
101 | li'n
102 | llavors
103 | m'he
104 | ma
105 | mal
106 | malgrat
107 | mateix
108 | mateixa
109 | mateixes
110 | mateixos
111 | me
112 | mentre
113 | més
114 | meu
115 | meus
116 | meva
117 | meves
118 | molt
119 | molta
120 | moltes
121 | molts
122 | mon
123 | mons
124 | n'he
125 | n'hi
126 | ne
127 | ni
128 | no
129 | nogensmenys
130 | només
131 | nosaltres
132 | nostra
133 | nostre
134 | nostres
135 | o
136 | oh
137 | oi
138 | on
139 | pas
140 | pel
141 | pels
142 | per
143 | però
144 | perquè
145 | poc
146 | poca
147 | pocs
148 | poques
149 | potser
150 | propi
151 | qual
152 | quals
153 | quan
154 | quant
155 | que
156 | què
157 | quelcom
158 | qui
159 | quin
160 | quina
161 | quines
162 | quins
163 | s'ha
164 | s'han
165 | sa
166 | semblant
167 | semblants
168 | ses
169 | seu
170 | seus
171 | seva
172 | seva
173 | seves
174 | si
175 | sobre
176 | sobretot
177 | sóc
178 | solament
179 | sols
180 | son
181 | són
182 | sons
183 | sota
184 | sou
185 | t'ha
186 | t'han
187 | t'he
188 | ta
189 | tal
190 | també
191 | tampoc
192 | tan
193 | tant
194 | tanta
195 | tantes
196 | teu
197 | teus
198 | teva
199 | teves
200 | ton
201 | tons
202 | tot
203 | tota
204 | totes
205 | tots
206 | un
207 | una
208 | unes
209 | uns
210 | us
211 | va
212 | vaig
213 | vam
214 | van
215 | vas
216 | veu
217 | vosaltres
218 | vostra
219 | vostre
220 | vostres
221 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/lang/stopwords_cz.txt:
--------------------------------------------------------------------------------
1 | a
2 | s
3 | k
4 | o
5 | i
6 | u
7 | v
8 | z
9 | dnes
10 | cz
11 | tímto
12 | budeš
13 | budem
14 | byli
15 | jseš
16 | můj
17 | svým
18 | ta
19 | tomto
20 | tohle
21 | tuto
22 | tyto
23 | jej
24 | zda
25 | proč
26 | máte
27 | tato
28 | kam
29 | tohoto
30 | kdo
31 | kteří
32 | mi
33 | nám
34 | tom
35 | tomuto
36 | mít
37 | nic
38 | proto
39 | kterou
40 | byla
41 | toho
42 | protože
43 | asi
44 | ho
45 | naši
46 | napište
47 | re
48 | což
49 | tím
50 | takže
51 | svých
52 | její
53 | svými
54 | jste
55 | aj
56 | tu
57 | tedy
58 | teto
59 | bylo
60 | kde
61 | ke
62 | pravé
63 | ji
64 | nad
65 | nejsou
66 | či
67 | pod
68 | téma
69 | mezi
70 | přes
71 | ty
72 | pak
73 | vám
74 | ani
75 | když
76 | však
77 | neg
78 | jsem
79 | tento
80 | článku
81 | články
82 | aby
83 | jsme
84 | před
85 | pta
86 | jejich
87 | byl
88 | ještě
89 | až
90 | bez
91 | také
92 | pouze
93 | první
94 | vaše
95 | která
96 | nás
97 | nový
98 | tipy
99 | pokud
100 | může
101 | strana
102 | jeho
103 | své
104 | jiné
105 | zprávy
106 | nové
107 | není
108 | vás
109 | jen
110 | podle
111 | zde
112 | už
113 | být
114 | více
115 | bude
116 | již
117 | než
118 | který
119 | by
120 | které
121 | co
122 | nebo
123 | ten
124 | tak
125 | má
126 | při
127 | od
128 | po
129 | jsou
130 | jak
131 | další
132 | ale
133 | si
134 | se
135 | ve
136 | to
137 | jako
138 | za
139 | zpět
140 | ze
141 | do
142 | pro
143 | je
144 | na
145 | atd
146 | atp
147 | jakmile
148 | přičemž
149 | já
150 | on
151 | ona
152 | ono
153 | oni
154 | ony
155 | my
156 | vy
157 | jí
158 | ji
159 | mě
160 | mne
161 | jemu
162 | tomu
163 | těm
164 | těmu
165 | němu
166 | němuž
167 | jehož
168 | jíž
169 | jelikož
170 | jež
171 | jakož
172 | načež
173 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/lang/stopwords_el.txt:
--------------------------------------------------------------------------------
1 | # Lucene Greek Stopwords list
2 | # Note: by default this file is used after GreekLowerCaseFilter,
3 | # so when modifying this file use 'σ' instead of 'ς'
4 | ο
5 | η
6 | το
7 | οι
8 | τα
9 | του
10 | τησ
11 | των
12 | τον
13 | την
14 | και
15 | κι
16 | κ
17 | ειμαι
18 | εισαι
19 | ειναι
20 | ειμαστε
21 | ειστε
22 | στο
23 | στον
24 | στη
25 | στην
26 | μα
27 | αλλα
28 | απο
29 | για
30 | προσ
31 | με
32 | σε
33 | ωσ
34 | παρα
35 | αντι
36 | κατα
37 | μετα
38 | θα
39 | να
40 | δε
41 | δεν
42 | μη
43 | μην
44 | επι
45 | ενω
46 | εαν
47 | αν
48 | τοτε
49 | που
50 | πωσ
51 | ποιοσ
52 | ποια
53 | ποιο
54 | ποιοι
55 | ποιεσ
56 | ποιων
57 | ποιουσ
58 | αυτοσ
59 | αυτη
60 | αυτο
61 | αυτοι
62 | αυτων
63 | αυτουσ
64 | αυτεσ
65 | αυτα
66 | εκεινοσ
67 | εκεινη
68 | εκεινο
69 | εκεινοι
70 | εκεινεσ
71 | εκεινα
72 | εκεινων
73 | εκεινουσ
74 | οπωσ
75 | ομωσ
76 | ισωσ
77 | οσο
78 | οτι
79 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/lang/stopwords_en.txt:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | # a couple of test stopwords to test that the words are really being
17 | # configured from this file:
18 | stopworda
19 | stopwordb
20 |
21 | # Standard english stop words taken from Lucene's StopAnalyzer
22 | a
23 | an
24 | and
25 | are
26 | as
27 | at
28 | be
29 | but
30 | by
31 | for
32 | if
33 | in
34 | into
35 | is
36 | it
37 | no
38 | not
39 | of
40 | on
41 | or
42 | such
43 | that
44 | the
45 | their
46 | then
47 | there
48 | these
49 | they
50 | this
51 | to
52 | was
53 | will
54 | with
55 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/lang/stopwords_eu.txt:
--------------------------------------------------------------------------------
1 | # example set of basque stopwords
2 | al
3 | anitz
4 | arabera
5 | asko
6 | baina
7 | bat
8 | batean
9 | batek
10 | bati
11 | batzuei
12 | batzuek
13 | batzuetan
14 | batzuk
15 | bera
16 | beraiek
17 | berau
18 | berauek
19 | bere
20 | berori
21 | beroriek
22 | beste
23 | bezala
24 | da
25 | dago
26 | dira
27 | ditu
28 | du
29 | dute
30 | edo
31 | egin
32 | ere
33 | eta
34 | eurak
35 | ez
36 | gainera
37 | gu
38 | gutxi
39 | guzti
40 | haiei
41 | haiek
42 | haietan
43 | hainbeste
44 | hala
45 | han
46 | handik
47 | hango
48 | hara
49 | hari
50 | hark
51 | hartan
52 | hau
53 | hauei
54 | hauek
55 | hauetan
56 | hemen
57 | hemendik
58 | hemengo
59 | hi
60 | hona
61 | honek
62 | honela
63 | honetan
64 | honi
65 | hor
66 | hori
67 | horiei
68 | horiek
69 | horietan
70 | horko
71 | horra
72 | horrek
73 | horrela
74 | horretan
75 | horri
76 | hortik
77 | hura
78 | izan
79 | ni
80 | noiz
81 | nola
82 | non
83 | nondik
84 | nongo
85 | nor
86 | nora
87 | ze
88 | zein
89 | zen
90 | zenbait
91 | zenbat
92 | zer
93 | zergatik
94 | ziren
95 | zituen
96 | zu
97 | zuek
98 | zuen
99 | zuten
100 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/lang/stopwords_fa.txt:
--------------------------------------------------------------------------------
1 | # This file was created by Jacques Savoy and is distributed under the BSD license.
2 | # See http://members.unine.ch/jacques.savoy/clef/index.html.
3 | # Also see http://www.opensource.org/licenses/bsd-license.html
4 | # Note: by default this file is used after normalization, so when adding entries
5 | # to this file, use the arabic 'ي' instead of 'ی'
6 | انان
7 | نداشته
8 | سراسر
9 | خياه
10 | ايشان
11 | وي
12 | تاكنون
13 | بيشتري
14 | دوم
15 | پس
16 | ناشي
17 | وگو
18 | يا
19 | داشتند
20 | سپس
21 | هنگام
22 | هرگز
23 | پنج
24 | نشان
25 | امسال
26 | ديگر
27 | گروهي
28 | شدند
29 | چطور
30 | ده
31 | و
32 | دو
33 | نخستين
34 | ولي
35 | چرا
36 | چه
37 | وسط
38 | ه
39 | كدام
40 | قابل
41 | يك
42 | رفت
43 | هفت
44 | همچنين
45 | در
46 | هزار
47 | بله
48 | بلي
49 | شايد
50 | اما
51 | شناسي
52 | گرفته
53 | دهد
54 | داشته
55 | دانست
56 | داشتن
57 | خواهيم
58 | ميليارد
59 | وقتيكه
60 | امد
61 | خواهد
62 | جز
63 | اورده
64 | شده
65 | بلكه
66 | خدمات
67 | شدن
68 | برخي
69 | نبود
70 | بسياري
71 | جلوگيري
72 | حق
73 | كردند
74 | نوعي
75 | بعري
76 | نكرده
77 | نظير
78 | نبايد
79 | بوده
80 | بودن
81 | داد
82 | اورد
83 | هست
84 | جايي
85 | شود
86 | دنبال
87 | داده
88 | بايد
89 | سابق
90 | هيچ
91 | همان
92 | انجا
93 | كمتر
94 | كجاست
95 | گردد
96 | كسي
97 | تر
98 | مردم
99 | تان
100 | دادن
101 | بودند
102 | سري
103 | جدا
104 | ندارند
105 | مگر
106 | يكديگر
107 | دارد
108 | دهند
109 | بنابراين
110 | هنگامي
111 | سمت
112 | جا
113 | انچه
114 | خود
115 | دادند
116 | زياد
117 | دارند
118 | اثر
119 | بدون
120 | بهترين
121 | بيشتر
122 | البته
123 | به
124 | براساس
125 | بيرون
126 | كرد
127 | بعضي
128 | گرفت
129 | توي
130 | اي
131 | ميليون
132 | او
133 | جريان
134 | تول
135 | بر
136 | مانند
137 | برابر
138 | باشيم
139 | مدتي
140 | گويند
141 | اكنون
142 | تا
143 | تنها
144 | جديد
145 | چند
146 | بي
147 | نشده
148 | كردن
149 | كردم
150 | گويد
151 | كرده
152 | كنيم
153 | نمي
154 | نزد
155 | روي
156 | قصد
157 | فقط
158 | بالاي
159 | ديگران
160 | اين
161 | ديروز
162 | توسط
163 | سوم
164 | ايم
165 | دانند
166 | سوي
167 | استفاده
168 | شما
169 | كنار
170 | داريم
171 | ساخته
172 | طور
173 | امده
174 | رفته
175 | نخست
176 | بيست
177 | نزديك
178 | طي
179 | كنيد
180 | از
181 | انها
182 | تمامي
183 | داشت
184 | يكي
185 | طريق
186 | اش
187 | چيست
188 | روب
189 | نمايد
190 | گفت
191 | چندين
192 | چيزي
193 | تواند
194 | ام
195 | ايا
196 | با
197 | ان
198 | ايد
199 | ترين
200 | اينكه
201 | ديگري
202 | راه
203 | هايي
204 | بروز
205 | همچنان
206 | پاعين
207 | كس
208 | حدود
209 | مختلف
210 | مقابل
211 | چيز
212 | گيرد
213 | ندارد
214 | ضد
215 | همچون
216 | سازي
217 | شان
218 | مورد
219 | باره
220 | مرسي
221 | خويش
222 | برخوردار
223 | چون
224 | خارج
225 | شش
226 | هنوز
227 | تحت
228 | ضمن
229 | هستيم
230 | گفته
231 | فكر
232 | بسيار
233 | پيش
234 | براي
235 | روزهاي
236 | انكه
237 | نخواهد
238 | بالا
239 | كل
240 | وقتي
241 | كي
242 | چنين
243 | كه
244 | گيري
245 | نيست
246 | است
247 | كجا
248 | كند
249 | نيز
250 | يابد
251 | بندي
252 | حتي
253 | توانند
254 | عقب
255 | خواست
256 | كنند
257 | بين
258 | تمام
259 | همه
260 | ما
261 | باشند
262 | مثل
263 | شد
264 | اري
265 | باشد
266 | اره
267 | طبق
268 | بعد
269 | اگر
270 | صورت
271 | غير
272 | جاي
273 | بيش
274 | ريزي
275 | اند
276 | زيرا
277 | چگونه
278 | بار
279 | لطفا
280 | مي
281 | درباره
282 | من
283 | ديده
284 | همين
285 | گذاري
286 | برداري
287 | علت
288 | گذاشته
289 | هم
290 | فوق
291 | نه
292 | ها
293 | شوند
294 | اباد
295 | همواره
296 | هر
297 | اول
298 | خواهند
299 | چهار
300 | نام
301 | امروز
302 | مان
303 | هاي
304 | قبل
305 | كنم
306 | سعي
307 | تازه
308 | را
309 | هستند
310 | زير
311 | جلوي
312 | عنوان
313 | بود
314 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/lang/stopwords_fi.txt:
--------------------------------------------------------------------------------
1 | | From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt
2 | | This file is distributed under the BSD License.
3 | | See http://snowball.tartarus.org/license.php
4 | | Also see http://www.opensource.org/licenses/bsd-license.html
5 | | - Encoding was converted to UTF-8.
6 | | - This notice was added.
7 | |
8 | | NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
9 |
10 | | forms of BE
11 |
12 | olla
13 | olen
14 | olet
15 | on
16 | olemme
17 | olette
18 | ovat
19 | ole | negative form
20 |
21 | oli
22 | olisi
23 | olisit
24 | olisin
25 | olisimme
26 | olisitte
27 | olisivat
28 | olit
29 | olin
30 | olimme
31 | olitte
32 | olivat
33 | ollut
34 | olleet
35 |
36 | en | negation
37 | et
38 | ei
39 | emme
40 | ette
41 | eivät
42 |
43 | |Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans
44 | minä minun minut minua minussa minusta minuun minulla minulta minulle | I
45 | sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you
46 | hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she
47 | me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we
48 | te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you
49 | he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they
50 |
51 | tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this
52 | tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that
53 | se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it
54 | nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these
55 | nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those
56 | ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they
57 |
58 | kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who
59 | ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl)
60 | mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what
61 | mitkä | (pl)
62 |
63 | joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which
64 | jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl)
65 |
66 | | conjunctions
67 |
68 | että | that
69 | ja | and
70 | jos | if
71 | koska | because
72 | kuin | than
73 | mutta | but
74 | niin | so
75 | sekä | and
76 | sillä | for
77 | tai | or
78 | vaan | but
79 | vai | or
80 | vaikka | although
81 |
82 |
83 | | prepositions
84 |
85 | kanssa | with
86 | mukaan | according to
87 | noin | about
88 | poikki | across
89 | yli | over, across
90 |
91 | | other
92 |
93 | kun | when
94 | niin | so
95 | nyt | now
96 | itse | self
97 |
98 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/lang/stopwords_ga.txt:
--------------------------------------------------------------------------------
1 |
2 | a
3 | ach
4 | ag
5 | agus
6 | an
7 | aon
8 | ar
9 | arna
10 | as
11 | b'
12 | ba
13 | beirt
14 | bhúr
15 | caoga
16 | ceathair
17 | ceathrar
18 | chomh
19 | chtó
20 | chuig
21 | chun
22 | cois
23 | céad
24 | cúig
25 | cúigear
26 | d'
27 | daichead
28 | dar
29 | de
30 | deich
31 | deichniúr
32 | den
33 | dhá
34 | do
35 | don
36 | dtí
37 | dá
38 | dár
39 | dó
40 | faoi
41 | faoin
42 | faoina
43 | faoinár
44 | fara
45 | fiche
46 | gach
47 | gan
48 | go
49 | gur
50 | haon
51 | hocht
52 | i
53 | iad
54 | idir
55 | in
56 | ina
57 | ins
58 | inár
59 | is
60 | le
61 | leis
62 | lena
63 | lenár
64 | m'
65 | mar
66 | mo
67 | mé
68 | na
69 | nach
70 | naoi
71 | naonúr
72 | ná
73 | ní
74 | níor
75 | nó
76 | nócha
77 | ocht
78 | ochtar
79 | os
80 | roimh
81 | sa
82 | seacht
83 | seachtar
84 | seachtó
85 | seasca
86 | seisear
87 | siad
88 | sibh
89 | sinn
90 | sna
91 | sé
92 | sí
93 | tar
94 | thar
95 | thú
96 | triúr
97 | trí
98 | trína
99 | trínár
100 | tríocha
101 | tú
102 | um
103 | ár
104 | é
105 | éis
106 | í
107 | ó
108 | ón
109 | óna
110 | ónár
111 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/lang/stopwords_gl.txt:
--------------------------------------------------------------------------------
1 | # Galician stopwords
2 | a
3 | aínda
4 | alí
5 | aquel
6 | aquela
7 | aquelas
8 | aqueles
9 | aquilo
10 | aquí
11 | ao
12 | aos
13 | as
14 | así
15 | á
16 | ben
17 | cando
18 | che
19 | co
20 | coa
21 | comigo
22 | con
23 | connosco
24 | contigo
25 | convosco
26 | coas
27 | cos
28 | cun
29 | cuns
30 | cunha
31 | cunhas
32 | da
33 | dalgunha
34 | dalgunhas
35 | dalgún
36 | dalgúns
37 | das
38 | de
39 | del
40 | dela
41 | delas
42 | deles
43 | desde
44 | deste
45 | do
46 | dos
47 | dun
48 | duns
49 | dunha
50 | dunhas
51 | e
52 | el
53 | ela
54 | elas
55 | eles
56 | en
57 | era
58 | eran
59 | esa
60 | esas
61 | ese
62 | eses
63 | esta
64 | estar
65 | estaba
66 | está
67 | están
68 | este
69 | estes
70 | estiven
71 | estou
72 | eu
73 | é
74 | facer
75 | foi
76 | foron
77 | fun
78 | había
79 | hai
80 | iso
81 | isto
82 | la
83 | las
84 | lle
85 | lles
86 | lo
87 | los
88 | mais
89 | me
90 | meu
91 | meus
92 | min
93 | miña
94 | miñas
95 | moi
96 | na
97 | nas
98 | neste
99 | nin
100 | no
101 | non
102 | nos
103 | nosa
104 | nosas
105 | noso
106 | nosos
107 | nós
108 | nun
109 | nunha
110 | nuns
111 | nunhas
112 | o
113 | os
114 | ou
115 | ó
116 | ós
117 | para
118 | pero
119 | pode
120 | pois
121 | pola
122 | polas
123 | polo
124 | polos
125 | por
126 | que
127 | se
128 | senón
129 | ser
130 | seu
131 | seus
132 | sexa
133 | sido
134 | sobre
135 | súa
136 | súas
137 | tamén
138 | tan
139 | te
140 | ten
141 | teñen
142 | teño
143 | ter
144 | teu
145 | teus
146 | ti
147 | tido
148 | tiña
149 | tiven
150 | túa
151 | túas
152 | un
153 | unha
154 | unhas
155 | uns
156 | vos
157 | vosa
158 | vosas
159 | voso
160 | vosos
161 | vós
162 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/lang/stopwords_hi.txt:
--------------------------------------------------------------------------------
1 | # Also see http://www.opensource.org/licenses/bsd-license.html
2 | # See http://members.unine.ch/jacques.savoy/clef/index.html.
3 | # This file was created by Jacques Savoy and is distributed under the BSD license.
4 | # Note: by default this file also contains forms normalized by HindiNormalizer
5 | # for spelling variation (see section below), such that it can be used whether or
6 | # not you enable that feature. When adding additional entries to this list,
7 | # please add the normalized form as well.
8 | अंदर
9 | अत
10 | अपना
11 | अपनी
12 | अपने
13 | अभी
14 | आदि
15 | आप
16 | इत्यादि
17 | इन
18 | इनका
19 | इन्हीं
20 | इन्हें
21 | इन्हों
22 | इस
23 | इसका
24 | इसकी
25 | इसके
26 | इसमें
27 | इसी
28 | इसे
29 | उन
30 | उनका
31 | उनकी
32 | उनके
33 | उनको
34 | उन्हीं
35 | उन्हें
36 | उन्हों
37 | उस
38 | उसके
39 | उसी
40 | उसे
41 | एक
42 | एवं
43 | एस
44 | ऐसे
45 | और
46 | कई
47 | कर
48 | करता
49 | करते
50 | करना
51 | करने
52 | करें
53 | कहते
54 | कहा
55 | का
56 | काफ़ी
57 | कि
58 | कितना
59 | किन्हें
60 | किन्हों
61 | किया
62 | किर
63 | किस
64 | किसी
65 | किसे
66 | की
67 | कुछ
68 | कुल
69 | के
70 | को
71 | कोई
72 | कौन
73 | कौनसा
74 | गया
75 | घर
76 | जब
77 | जहाँ
78 | जा
79 | जितना
80 | जिन
81 | जिन्हें
82 | जिन्हों
83 | जिस
84 | जिसे
85 | जीधर
86 | जैसा
87 | जैसे
88 | जो
89 | तक
90 | तब
91 | तरह
92 | तिन
93 | तिन्हें
94 | तिन्हों
95 | तिस
96 | तिसे
97 | तो
98 | था
99 | थी
100 | थे
101 | दबारा
102 | दिया
103 | दुसरा
104 | दूसरे
105 | दो
106 | द्वारा
107 | न
108 | नहीं
109 | ना
110 | निहायत
111 | नीचे
112 | ने
113 | पर
114 | पर
115 | पहले
116 | पूरा
117 | पे
118 | फिर
119 | बनी
120 | बही
121 | बहुत
122 | बाद
123 | बाला
124 | बिलकुल
125 | भी
126 | भीतर
127 | मगर
128 | मानो
129 | मे
130 | में
131 | यदि
132 | यह
133 | यहाँ
134 | यही
135 | या
136 | यिह
137 | ये
138 | रखें
139 | रहा
140 | रहे
141 | ऱ्वासा
142 | लिए
143 | लिये
144 | लेकिन
145 | व
146 | वर्ग
147 | वह
148 | वह
149 | वहाँ
150 | वहीं
151 | वाले
152 | वुह
153 | वे
154 | वग़ैरह
155 | संग
156 | सकता
157 | सकते
158 | सबसे
159 | सभी
160 | साथ
161 | साबुत
162 | साभ
163 | सारा
164 | से
165 | सो
166 | ही
167 | हुआ
168 | हुई
169 | हुए
170 | है
171 | हैं
172 | हो
173 | होता
174 | होती
175 | होते
176 | होना
177 | होने
178 | # additional normalized forms of the above
179 | अपनि
180 | जेसे
181 | होति
182 | सभि
183 | तिंहों
184 | इंहों
185 | दवारा
186 | इसि
187 | किंहें
188 | थि
189 | उंहों
190 | ओर
191 | जिंहें
192 | वहिं
193 | अभि
194 | बनि
195 | हि
196 | उंहिं
197 | उंहें
198 | हें
199 | वगेरह
200 | एसे
201 | रवासा
202 | कोन
203 | निचे
204 | काफि
205 | उसि
206 | पुरा
207 | भितर
208 | हे
209 | बहि
210 | वहां
211 | कोइ
212 | यहां
213 | जिंहों
214 | तिंहें
215 | किसि
216 | कइ
217 | यहि
218 | इंहिं
219 | जिधर
220 | इंहें
221 | अदि
222 | इतयादि
223 | हुइ
224 | कोनसा
225 | इसकि
226 | दुसरे
227 | जहां
228 | अप
229 | किंहों
230 | उनकि
231 | भि
232 | वरग
233 | हुअ
234 | जेसा
235 | नहिं
236 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/lang/stopwords_hu.txt:
--------------------------------------------------------------------------------
1 | | From svn.tartarus.org/snowball/trunk/website/algorithms/hungarian/stop.txt
2 | | This file is distributed under the BSD License.
3 | | See http://snowball.tartarus.org/license.php
4 | | Also see http://www.opensource.org/licenses/bsd-license.html
5 | | - Encoding was converted to UTF-8.
6 | | - This notice was added.
7 | |
8 | | NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
9 |
10 | | Hungarian stop word list
11 | | prepared by Anna Tordai
12 |
13 | a
14 | ahogy
15 | ahol
16 | aki
17 | akik
18 | akkor
19 | alatt
20 | által
21 | általában
22 | amely
23 | amelyek
24 | amelyekben
25 | amelyeket
26 | amelyet
27 | amelynek
28 | ami
29 | amit
30 | amolyan
31 | amíg
32 | amikor
33 | át
34 | abban
35 | ahhoz
36 | annak
37 | arra
38 | arról
39 | az
40 | azok
41 | azon
42 | azt
43 | azzal
44 | azért
45 | aztán
46 | azután
47 | azonban
48 | bár
49 | be
50 | belül
51 | benne
52 | cikk
53 | cikkek
54 | cikkeket
55 | csak
56 | de
57 | e
58 | eddig
59 | egész
60 | egy
61 | egyes
62 | egyetlen
63 | egyéb
64 | egyik
65 | egyre
66 | ekkor
67 | el
68 | elég
69 | ellen
70 | elő
71 | először
72 | előtt
73 | első
74 | én
75 | éppen
76 | ebben
77 | ehhez
78 | emilyen
79 | ennek
80 | erre
81 | ez
82 | ezt
83 | ezek
84 | ezen
85 | ezzel
86 | ezért
87 | és
88 | fel
89 | felé
90 | hanem
91 | hiszen
92 | hogy
93 | hogyan
94 | igen
95 | így
96 | illetve
97 | ill.
98 | ill
99 | ilyen
100 | ilyenkor
101 | ison
102 | ismét
103 | itt
104 | jó
105 | jól
106 | jobban
107 | kell
108 | kellett
109 | keresztül
110 | keressünk
111 | ki
112 | kívül
113 | között
114 | közül
115 | legalább
116 | lehet
117 | lehetett
118 | legyen
119 | lenne
120 | lenni
121 | lesz
122 | lett
123 | maga
124 | magát
125 | majd
126 | majd
127 | már
128 | más
129 | másik
130 | meg
131 | még
132 | mellett
133 | mert
134 | mely
135 | melyek
136 | mi
137 | mit
138 | míg
139 | miért
140 | milyen
141 | mikor
142 | minden
143 | mindent
144 | mindenki
145 | mindig
146 | mint
147 | mintha
148 | mivel
149 | most
150 | nagy
151 | nagyobb
152 | nagyon
153 | ne
154 | néha
155 | nekem
156 | neki
157 | nem
158 | néhány
159 | nélkül
160 | nincs
161 | olyan
162 | ott
163 | össze
164 | ő
165 | ők
166 | őket
167 | pedig
168 | persze
169 | rá
170 | s
171 | saját
172 | sem
173 | semmi
174 | sok
175 | sokat
176 | sokkal
177 | számára
178 | szemben
179 | szerint
180 | szinte
181 | talán
182 | tehát
183 | teljes
184 | tovább
185 | továbbá
186 | több
187 | úgy
188 | ugyanis
189 | új
190 | újabb
191 | újra
192 | után
193 | utána
194 | utolsó
195 | vagy
196 | vagyis
197 | valaki
198 | valami
199 | valamint
200 | való
201 | vagyok
202 | van
203 | vannak
204 | volt
205 | voltam
206 | voltak
207 | voltunk
208 | vissza
209 | vele
210 | viszont
211 | volna
212 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/lang/stopwords_hy.txt:
--------------------------------------------------------------------------------
1 | # example set of Armenian stopwords.
2 | այդ
3 | այլ
4 | այն
5 | այս
6 | դու
7 | դուք
8 | եմ
9 | են
10 | ենք
11 | ես
12 | եք
13 | է
14 | էի
15 | էին
16 | էինք
17 | էիր
18 | էիք
19 | էր
20 | ըստ
21 | թ
22 | ի
23 | ին
24 | իսկ
25 | իր
26 | կամ
27 | համար
28 | հետ
29 | հետո
30 | մենք
31 | մեջ
32 | մի
33 | ն
34 | նա
35 | նաև
36 | նրա
37 | նրանք
38 | որ
39 | որը
40 | որոնք
41 | որպես
42 | ու
43 | ում
44 | պիտի
45 | վրա
46 | և
47 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/lang/stopwords_ja.txt:
--------------------------------------------------------------------------------
1 | #
2 | # This file defines a stopword set for Japanese.
3 | #
4 | # This set is made up of hand-picked frequent terms from segmented Japanese Wikipedia.
5 | # Punctuation characters and frequent kanji have mostly been left out. See LUCENE-3745
6 | # for frequency lists, etc. that can be useful for making your own set (if desired)
7 | #
8 | # Note that there is an overlap between these stopwords and the terms stopped when used
9 | # in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note
10 | # that comments are not allowed on the same line as stopwords.
11 | #
12 | # Also note that stopping is done in a case-insensitive manner. Change your StopFilter
13 | # configuration if you need case-sensitive stopping. Lastly, note that stopping is done
14 | # using the same character width as the entries in this file. Since this StopFilter is
15 | # normally done after a CJKWidthFilter in your chain, you would usually want your romaji
16 | # entries to be in half-width and your kana entries to be in full-width.
17 | #
18 | の
19 | に
20 | は
21 | を
22 | た
23 | が
24 | で
25 | て
26 | と
27 | し
28 | れ
29 | さ
30 | ある
31 | いる
32 | も
33 | する
34 | から
35 | な
36 | こと
37 | として
38 | い
39 | や
40 | れる
41 | など
42 | なっ
43 | ない
44 | この
45 | ため
46 | その
47 | あっ
48 | よう
49 | また
50 | もの
51 | という
52 | あり
53 | まで
54 | られ
55 | なる
56 | へ
57 | か
58 | だ
59 | これ
60 | によって
61 | により
62 | おり
63 | より
64 | による
65 | ず
66 | なり
67 | られる
68 | において
69 | ば
70 | なかっ
71 | なく
72 | しかし
73 | について
74 | せ
75 | だっ
76 | その後
77 | できる
78 | それ
79 | う
80 | ので
81 | なお
82 | のみ
83 | でき
84 | き
85 | つ
86 | における
87 | および
88 | いう
89 | さらに
90 | でも
91 | ら
92 | たり
93 | その他
94 | に関する
95 | たち
96 | ます
97 | ん
98 | なら
99 | に対して
100 | 特に
101 | せる
102 | 及び
103 | これら
104 | とき
105 | では
106 | にて
107 | ほか
108 | ながら
109 | うち
110 | そして
111 | とともに
112 | ただし
113 | かつて
114 | それぞれ
115 | または
116 | お
117 | ほど
118 | ものの
119 | に対する
120 | ほとんど
121 | と共に
122 | といった
123 | です
124 | とも
125 | ところ
126 | ここ
127 | ##### End of file
128 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/lang/stopwords_lv.txt:
--------------------------------------------------------------------------------
1 | # Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins
2 | # the original list of over 800 forms was refined:
3 | # pronouns, adverbs, interjections were removed
4 | #
5 | # prepositions
6 | aiz
7 | ap
8 | ar
9 | apakš
10 | ārpus
11 | augšpus
12 | bez
13 | caur
14 | dēļ
15 | gar
16 | iekš
17 | iz
18 | kopš
19 | labad
20 | lejpus
21 | līdz
22 | no
23 | otrpus
24 | pa
25 | par
26 | pār
27 | pēc
28 | pie
29 | pirms
30 | pret
31 | priekš
32 | starp
33 | šaipus
34 | uz
35 | viņpus
36 | virs
37 | virspus
38 | zem
39 | apakšpus
40 | # Conjunctions
41 | un
42 | bet
43 | jo
44 | ja
45 | ka
46 | lai
47 | tomēr
48 | tikko
49 | turpretī
50 | arī
51 | kaut
52 | gan
53 | tādēļ
54 | tā
55 | ne
56 | tikvien
57 | vien
58 | kā
59 | ir
60 | te
61 | vai
62 | kamēr
63 | # Particles
64 | ar
65 | diezin
66 | droši
67 | diemžēl
68 | nebūt
69 | ik
70 | it
71 | taču
72 | nu
73 | pat
74 | tiklab
75 | iekšpus
76 | nedz
77 | tik
78 | nevis
79 | turpretim
80 | jeb
81 | iekam
82 | iekām
83 | iekāms
84 | kolīdz
85 | līdzko
86 | tiklīdz
87 | jebšu
88 | tālab
89 | tāpēc
90 | nekā
91 | itin
92 | jā
93 | jau
94 | jel
95 | nē
96 | nezin
97 | tad
98 | tikai
99 | vis
100 | tak
101 | iekams
102 | vien
103 | # modal verbs
104 | būt
105 | biju
106 | biji
107 | bija
108 | bijām
109 | bijāt
110 | esmu
111 | esi
112 | esam
113 | esat
114 | būšu
115 | būsi
116 | būs
117 | būsim
118 | būsiet
119 | tikt
120 | tiku
121 | tiki
122 | tika
123 | tikām
124 | tikāt
125 | tieku
126 | tiec
127 | tiek
128 | tiekam
129 | tiekat
130 | tikšu
131 | tiks
132 | tiksim
133 | tiksiet
134 | tapt
135 | tapi
136 | tapāt
137 | topat
138 | tapšu
139 | tapsi
140 | taps
141 | tapsim
142 | tapsiet
143 | kļūt
144 | kļuvu
145 | kļuvi
146 | kļuva
147 | kļuvām
148 | kļuvāt
149 | kļūstu
150 | kļūsti
151 | kļūst
152 | kļūstam
153 | kļūstat
154 | kļūšu
155 | kļūsi
156 | kļūs
157 | kļūsim
158 | kļūsiet
159 | # verbs
160 | varēt
161 | varēju
162 | varējām
163 | varēšu
164 | varēsim
165 | var
166 | varēji
167 | varējāt
168 | varēsi
169 | varēsiet
170 | varat
171 | varēja
172 | varēs
173 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/lang/stopwords_ro.txt:
--------------------------------------------------------------------------------
1 | # This file was created by Jacques Savoy and is distributed under the BSD license.
2 | # See http://members.unine.ch/jacques.savoy/clef/index.html.
3 | # Also see http://www.opensource.org/licenses/bsd-license.html
4 | acea
5 | aceasta
6 | această
7 | aceea
8 | acei
9 | aceia
10 | acel
11 | acela
12 | acele
13 | acelea
14 | acest
15 | acesta
16 | aceste
17 | acestea
18 | aceşti
19 | aceştia
20 | acolo
21 | acum
22 | ai
23 | aia
24 | aibă
25 | aici
26 | al
27 | ăla
28 | ale
29 | alea
30 | ălea
31 | altceva
32 | altcineva
33 | am
34 | ar
35 | are
36 | aş
37 | aşadar
38 | asemenea
39 | asta
40 | ăsta
41 | astăzi
42 | astea
43 | ăstea
44 | ăştia
45 | asupra
46 | aţi
47 | au
48 | avea
49 | avem
50 | aveţi
51 | azi
52 | bine
53 | bucur
54 | bună
55 | ca
56 | că
57 | căci
58 | când
59 | care
60 | cărei
61 | căror
62 | cărui
63 | cât
64 | câte
65 | câţi
66 | către
67 | câtva
68 | ce
69 | cel
70 | ceva
71 | chiar
72 | cînd
73 | cine
74 | cineva
75 | cît
76 | cîte
77 | cîţi
78 | cîtva
79 | contra
80 | cu
81 | cum
82 | cumva
83 | curând
84 | curînd
85 | da
86 | dă
87 | dacă
88 | dar
89 | datorită
90 | de
91 | deci
92 | deja
93 | deoarece
94 | departe
95 | deşi
96 | din
97 | dinaintea
98 | dintr
99 | dintre
100 | drept
101 | după
102 | ea
103 | ei
104 | el
105 | ele
106 | eram
107 | este
108 | eşti
109 | eu
110 | face
111 | fără
112 | fi
113 | fie
114 | fiecare
115 | fii
116 | fim
117 | fiţi
118 | iar
119 | ieri
120 | îi
121 | îl
122 | îmi
123 | împotriva
124 | în
125 | înainte
126 | înaintea
127 | încât
128 | încît
129 | încotro
130 | între
131 | întrucât
132 | întrucît
133 | îţi
134 | la
135 | lângă
136 | le
137 | li
138 | lîngă
139 | lor
140 | lui
141 | mă
142 | mâine
143 | mea
144 | mei
145 | mele
146 | mereu
147 | meu
148 | mi
149 | mine
150 | mult
151 | multă
152 | mulţi
153 | ne
154 | nicăieri
155 | nici
156 | nimeni
157 | nişte
158 | noastră
159 | noastre
160 | noi
161 | noştri
162 | nostru
163 | nu
164 | ori
165 | oricând
166 | oricare
167 | oricât
168 | orice
169 | oricînd
170 | oricine
171 | oricît
172 | oricum
173 | oriunde
174 | până
175 | pe
176 | pentru
177 | peste
178 | pînă
179 | poate
180 | pot
181 | prea
182 | prima
183 | primul
184 | prin
185 | printr
186 | sa
187 | să
188 | săi
189 | sale
190 | sau
191 | său
192 | se
193 | şi
194 | sînt
195 | sîntem
196 | sînteţi
197 | spre
198 | sub
199 | sunt
200 | suntem
201 | sunteţi
202 | ta
203 | tăi
204 | tale
205 | tău
206 | te
207 | ţi
208 | ţie
209 | tine
210 | toată
211 | toate
212 | tot
213 | toţi
214 | totuşi
215 | tu
216 | un
217 | una
218 | unde
219 | undeva
220 | unei
221 | unele
222 | uneori
223 | unor
224 | vă
225 | vi
226 | voastră
227 | voastre
228 | voi
229 | voştri
230 | vostru
231 | vouă
232 | vreo
233 | vreun
234 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/lang/stopwords_th.txt:
--------------------------------------------------------------------------------
1 | # Thai stopwords from:
2 | # "Opinion Detection in Thai Political News Columns
3 | # Based on Subjectivity Analysis"
4 | # Khampol Sukhum, Supot Nitsuwat, and Choochart Haruechaiyasak
5 | ไว้
6 | ไม่
7 | ไป
8 | ได้
9 | ให้
10 | ใน
11 | โดย
12 | แห่ง
13 | แล้ว
14 | และ
15 | แรก
16 | แบบ
17 | แต่
18 | เอง
19 | เห็น
20 | เลย
21 | เริ่ม
22 | เรา
23 | เมื่อ
24 | เพื่อ
25 | เพราะ
26 | เป็นการ
27 | เป็น
28 | เปิดเผย
29 | เปิด
30 | เนื่องจาก
31 | เดียวกัน
32 | เดียว
33 | เช่น
34 | เฉพาะ
35 | เคย
36 | เข้า
37 | เขา
38 | อีก
39 | อาจ
40 | อะไร
41 | ออก
42 | อย่าง
43 | อยู่
44 | อยาก
45 | หาก
46 | หลาย
47 | หลังจาก
48 | หลัง
49 | หรือ
50 | หนึ่ง
51 | ส่วน
52 | ส่ง
53 | สุด
54 | สําหรับ
55 | ว่า
56 | วัน
57 | ลง
58 | ร่วม
59 | ราย
60 | รับ
61 | ระหว่าง
62 | รวม
63 | ยัง
64 | มี
65 | มาก
66 | มา
67 | พร้อม
68 | พบ
69 | ผ่าน
70 | ผล
71 | บาง
72 | น่า
73 | นี้
74 | นํา
75 | นั้น
76 | นัก
77 | นอกจาก
78 | ทุก
79 | ที่สุด
80 | ที่
81 | ทําให้
82 | ทํา
83 | ทาง
84 | ทั้งนี้
85 | ทั้ง
86 | ถ้า
87 | ถูก
88 | ถึง
89 | ต้อง
90 | ต่างๆ
91 | ต่าง
92 | ต่อ
93 | ตาม
94 | ตั้งแต่
95 | ตั้ง
96 | ด้าน
97 | ด้วย
98 | ดัง
99 | ซึ่ง
100 | ช่วง
101 | จึง
102 | จาก
103 | จัด
104 | จะ
105 | คือ
106 | ความ
107 | ครั้ง
108 | คง
109 | ขึ้น
110 | ของ
111 | ขอ
112 | ขณะ
113 | ก่อน
114 | ก็
115 | การ
116 | กับ
117 | กัน
118 | กว่า
119 | กล่าว
120 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/lang/stopwords_tr.txt:
--------------------------------------------------------------------------------
1 | # Turkish stopwords from LUCENE-559
2 | # merged with the list from "Information Retrieval on Turkish Texts"
3 | # (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf)
4 | acaba
5 | altmış
6 | altı
7 | ama
8 | ancak
9 | arada
10 | aslında
11 | ayrıca
12 | bana
13 | bazı
14 | belki
15 | ben
16 | benden
17 | beni
18 | benim
19 | beri
20 | beş
21 | bile
22 | bin
23 | bir
24 | birçok
25 | biri
26 | birkaç
27 | birkez
28 | birşey
29 | birşeyi
30 | biz
31 | bize
32 | bizden
33 | bizi
34 | bizim
35 | böyle
36 | böylece
37 | bu
38 | buna
39 | bunda
40 | bundan
41 | bunlar
42 | bunları
43 | bunların
44 | bunu
45 | bunun
46 | burada
47 | çok
48 | çünkü
49 | da
50 | daha
51 | dahi
52 | de
53 | defa
54 | değil
55 | diğer
56 | diye
57 | doksan
58 | dokuz
59 | dolayı
60 | dolayısıyla
61 | dört
62 | edecek
63 | eden
64 | ederek
65 | edilecek
66 | ediliyor
67 | edilmesi
68 | ediyor
69 | eğer
70 | elli
71 | en
72 | etmesi
73 | etti
74 | ettiği
75 | ettiğini
76 | gibi
77 | göre
78 | halen
79 | hangi
80 | hatta
81 | hem
82 | henüz
83 | hep
84 | hepsi
85 | her
86 | herhangi
87 | herkesin
88 | hiç
89 | hiçbir
90 | için
91 | iki
92 | ile
93 | ilgili
94 | ise
95 | işte
96 | itibaren
97 | itibariyle
98 | kadar
99 | karşın
100 | katrilyon
101 | kendi
102 | kendilerine
103 | kendini
104 | kendisi
105 | kendisine
106 | kendisini
107 | kez
108 | ki
109 | kim
110 | kimden
111 | kime
112 | kimi
113 | kimse
114 | kırk
115 | milyar
116 | milyon
117 | mu
118 | mü
119 | mı
120 | nasıl
121 | ne
122 | neden
123 | nedenle
124 | nerde
125 | nerede
126 | nereye
127 | niye
128 | niçin
129 | o
130 | olan
131 | olarak
132 | oldu
133 | olduğu
134 | olduğunu
135 | olduklarını
136 | olmadı
137 | olmadığı
138 | olmak
139 | olması
140 | olmayan
141 | olmaz
142 | olsa
143 | olsun
144 | olup
145 | olur
146 | olursa
147 | oluyor
148 | on
149 | ona
150 | ondan
151 | onlar
152 | onlardan
153 | onları
154 | onların
155 | onu
156 | onun
157 | otuz
158 | oysa
159 | öyle
160 | pek
161 | rağmen
162 | sadece
163 | sanki
164 | sekiz
165 | seksen
166 | sen
167 | senden
168 | seni
169 | senin
170 | siz
171 | sizden
172 | sizi
173 | sizin
174 | şey
175 | şeyden
176 | şeyi
177 | şeyler
178 | şöyle
179 | şu
180 | şuna
181 | şunda
182 | şundan
183 | şunları
184 | şunu
185 | tarafından
186 | trilyon
187 | tüm
188 | üç
189 | üzere
190 | var
191 | vardı
192 | ve
193 | veya
194 | ya
195 | yani
196 | yapacak
197 | yapılan
198 | yapılması
199 | yapıyor
200 | yapmak
201 | yaptı
202 | yaptığı
203 | yaptığını
204 | yaptıkları
205 | yedi
206 | yerine
207 | yetmiş
208 | yine
209 | yirmi
210 | yoksa
211 | yüz
212 | zaten
213 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/lang/userdict_ja.txt:
--------------------------------------------------------------------------------
1 | #
2 | # This is a sample user dictionary for Kuromoji (JapaneseTokenizer)
3 | #
4 | # Add entries to this file in order to override the statistical model in terms
5 | # of segmentation, readings and part-of-speech tags. Notice that entries do
6 | # not have weights since they are always used when found. This is by-design
7 | # in order to maximize ease-of-use.
8 | #
9 | # Entries are defined using the following CSV format:
10 | # <text>,<token 1> ... <token n>,<reading 1> ... <reading n>,<part-of-speech tag>
11 | #
12 | # Notice that a single half-width space separates tokens and readings, and
13 | # that the number of tokens and readings must match exactly.
14 | #
15 | # Also notice that multiple entries with the same <text> is undefined.
16 | #
17 | # Whitespace only lines are ignored. Comments are not allowed on entry lines.
18 | #
19 |
20 | # Custom segmentation for kanji compounds
21 | 日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞
22 | 関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞
23 |
24 | # Custom segmentation for compound katakana
25 | トートバッグ,トート バッグ,トート バッグ,かずカナ名詞
26 | ショルダーバッグ,ショルダー バッグ,ショルダー バッグ,かずカナ名詞
27 |
28 | # Custom reading for former sumo wrestler
29 | 朝青龍,朝青龍,アサショウリュウ,カスタム人名
30 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/name_synonyms.txt:
--------------------------------------------------------------------------------
1 | sky walker, skywalker
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/names.txt:
--------------------------------------------------------------------------------
1 | luke_skywalker
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/params.json:
--------------------------------------------------------------------------------
1 | {"params":{
2 | "query":{
3 | "defType":"edismax",
4 | "q.alt":"*:*",
5 | "rows":"10",
6 | "fl":"*,score",
7 | "":{"v":0}
8 | },
9 | "facets":{
10 | "facet":"on",
11 | "facet.mincount": "1",
12 | "":{"v":0}
13 | },
14 | "velocity":{
15 | "wt": "velocity",
16 | "v.template":"browse",
17 | "v.layout": "layout",
18 | "":{"v":0}
19 | }
20 | }}
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/protwords.txt:
--------------------------------------------------------------------------------
1 | # The ASF licenses this file to You under the Apache License, Version 2.0
2 | # (the "License"); you may not use this file except in compliance with
3 | # the License. You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | #-----------------------------------------------------------------------
14 | # Use a protected word file to protect against the stemmer reducing two
15 | # unrelated words to the same base word.
16 |
17 | # Some non-words that normally won't be encountered,
18 | # just to test that they won't be stemmed.
19 | dontstems
20 | zwhacky
21 |
22 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/stopwords.txt:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/synonyms.txt:
--------------------------------------------------------------------------------
1 | # The ASF licenses this file to You under the Apache License, Version 2.0
2 | # (the "License"); you may not use this file except in compliance with
3 | # the License. You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | #-----------------------------------------------------------------------
14 | #some test synonym mappings unlikely to appear in real input text
15 |
16 | # Demonstrating bidirectional synonyms
17 | #wife,bride
18 | #wife,spouse
19 | #toons,tunes,cartoon
20 |
21 | # Demonstrating => syntax
22 | # wife => wife, bride
23 | # spouse => spouse, husband, wife, partner
24 | # tunes => cartoons, toons, songs
25 | # cartoon => toons, tunes
26 |
27 | # Demonstrating multi phrase
28 | #looney tunes, cartoons
29 | #science fiction, sci fi, sci-fi, scifi
30 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/synonyms_bidirect.txt:
--------------------------------------------------------------------------------
1 | # Often people erroneously equate linguistic synonyms
2 | # with Solr synonyms. Here the bidirectional nature
3 | # of the synonyms creates problems where the more specific
4 | # term is not prioritized
5 | wife,bride
6 | wife,spouse
7 | toons,tunes,cartoon
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/synonyms_directed.txt:
--------------------------------------------------------------------------------
1 | wife => wife, bride
2 | spouse => spouse, husband, wife, partner
3 | tunes => cartoons, toons, songs
4 | cartoon => toons, tunes
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/synonyms_genres.txt:
--------------------------------------------------------------------------------
1 | scifi,science fiction,science fiction movie
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/synonyms_multiterm.txt:
--------------------------------------------------------------------------------
1 | # Here are some multi term synonym to
2 | # see what happens at query time
3 |
4 | looney tunes, cartoons
5 | science fiction, sci fi, sci-fi, scifi
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/taxonomy.txt:
--------------------------------------------------------------------------------
1 | # Capture how the *user* structures information
2 | #looneytunes, looney tunes => looney_tunes, cartoons
3 | #bugs bunny => bug_bunny, looney_tunes, cartoons
4 | #mickey mouse => mickey_mouse, disney, cartoons
5 | #minnie mouse => minnie_mouse, disney, cartoons
6 | #donald duck => donald_duck, disney, cartoons
7 | #yogi bear => yogi_bear, disney, cartoons
8 |
9 | wife => wife, spouse
10 | bride => bride, spouse
11 |
--------------------------------------------------------------------------------
/notebooks/solr/tmdb/solr_config/conf/taxonomy_parent.txt:
--------------------------------------------------------------------------------
1 | # Capture how the *user* structures information
2 | #looneytunes, looney tunes => looney_tunes
3 | #bugs bunny => bug_bunny, looney_tunes
4 | #mickey mouse => mickey_mouse, disney
5 | #minnie mouse => minnie_mouse, disney
6 | #donald duck => donald_duck, disney
7 | #yogi bear => yogi_bear, disney
8 |
9 | wife => wife, spouse
10 | bride => bride, spouse
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 |
2 | alembic==1.7.6
3 | appnope==0.1.2
4 | attrs==21.4.0
5 | backcall==0.2.0
6 | bleach==4.1.0
7 | certifi==2022.12.07
8 | chardet==4.0.0
9 | cycler==0.11.0
10 | Cython
11 | decorator==4.3.2
12 | defusedxml==0.5.0
13 | elasticsearch==7.16.2
14 | entrypoints==0.3
15 | fuzzywuzzy==0.18.0
16 | graphviz==0.19.1
17 | idna==3.3
18 | ipykernel==6.4.2
19 | ipython==7.31.1
20 | ipython-genutils==0.2.0
21 | ipywidgets==7.6.5
22 | jedi==0.18.1
23 | Jinja2==3.0.3
24 | joblib==1.1.1
25 | jsonschema==4.4.0
26 | jupyter
27 | jupyter-client==7.4.4
28 | jupyter-console==6.4.0
29 | jupyter-core==4.12
30 | kiwisolver==1.3.2
31 | Mako==1.1.6
32 | MarkupSafe==2.0.1
33 | matplotlib==3.7.2
34 | mistune
35 | mizani==0.9.2
36 | nbconvert==6.5.0
37 | nbformat==5.3.0
38 | nbgrader
39 | nbstripout==0.5.0
40 | notebook==6.4.8
41 | numpy==1.23.5
42 | opensearch-py==2.2.0
43 | pandas==2.0.3
44 | pandocfilters==1.5.0
45 | parso==0.8.3
46 | pexpect==4.8.0
47 | pickleshare==0.7.5
48 | plotly==5.5.0
49 | plotnine==0.12.2
50 | prometheus-client==0.13.1
51 | prompt-toolkit==3.0.26
52 | ptyprocess==0.7.0
53 | Pygments==2.11.2
54 | pyparsing==3.0.7
55 | pyrsistent==0.18.1
56 | python-dateutil==2.8.2
57 | python-editor==1.0.4
58 | pytz==2021.3
59 | pyzmq==25.1.1
60 | qtconsole==5.2.2
61 | requests==2.27.1
62 | retrying==1.3.3
63 | scikit-learn==1.3.0
64 | scipy==1.10.1
65 | seaborn==0.11.2
66 | Send2Trash==1.8.0
67 | six==1.16.0
68 | #sklearn==1.3.0
69 | SQLAlchemy==1.3.24
70 | terminado==0.13.1
71 | testpath==0.5.0
72 | threadpoolctl==3.1.0
73 | tornado==6.2
74 | tqdm==4.62.3
75 | traitlets==5.9.0
76 | urllib3==1.26.8
77 | wcwidth==0.2.5
78 | webencodings==0.5.1
79 | widgetsnbextension==3.5.2
80 | xgboost==1.7.6
81 |
--------------------------------------------------------------------------------
/rre/README.md:
--------------------------------------------------------------------------------
1 | rre
2 |
3 | This folder contains some basic RRE demonstrations for running evaluations against your LTR models.
4 |
5 | Navigate to `solr` or `elastic` depending on which you are using and do the following:
6 |
7 | ## Getting Started
8 | - Build the docker image: `docker build -t ltr-rre .`
9 | - Run an evaluation: `docker run --name ltr-rre ltr-rre`
10 | - Copy the report to your host: `docker cp ltr-rre:/rre/target/site/rre-report.xlsx .`
11 |
12 | Alternatively, you can run through the `evaluation` notebooks in Jupyter to have these steps run for you.
13 |
14 | __Note:__ Older versions of Docker for Linux may have issues accessing localhost on the host machine.
15 |
--------------------------------------------------------------------------------
/rre/elastic/.dockerignore:
--------------------------------------------------------------------------------
1 | target/*
2 |
--------------------------------------------------------------------------------
/rre/elastic/.gitignore:
--------------------------------------------------------------------------------
1 | target/*
2 |
--------------------------------------------------------------------------------
/rre/elastic/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM maven:3.6.0-jdk-8
2 |
3 | # Clone the RRE repo
4 | RUN git clone https://github.com/SeaseLtd/rated-ranking-evaluator
5 | WORKDIR rated-ranking-evaluator
6 |
7 | # Build RRE
8 | RUN mvn clean install
9 |
10 | # Bring over the RRE config
11 | WORKDIR /
12 | COPY . rre
13 | WORKDIR rre
14 |
15 | # By default, run an RRE evaluation if no other command is specified
16 | CMD mvn clean install
17 |
--------------------------------------------------------------------------------
/rre/elastic/src/etc/configuration_sets/README.md:
--------------------------------------------------------------------------------
1 | This folder contains one subfolder for each configuration version.
2 | Each version folder should contain the index settings associated with such version:
3 |
4 | - `hostUrls`: an array of URLs where the Elasticsearch instance for this
5 | version can be accessed.
6 | - `index`: the name of the index holding the data being used to search.
7 |
--------------------------------------------------------------------------------
/rre/elastic/src/etc/configuration_sets/baseline/index-settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "hostUrls": [ "http://host.docker.internal:9200" ],
3 | "index": "tmdb"
4 | }
5 |
--------------------------------------------------------------------------------
/rre/elastic/src/etc/configuration_sets/classic/index-settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "hostUrls": [ "http://host.docker.internal:9200" ],
3 | "index": "tmdb"
4 | }
5 |
--------------------------------------------------------------------------------
/rre/elastic/src/etc/configuration_sets/latest/index-settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "hostUrls": [ "http://host.docker.internal:9200" ],
3 | "index": "tmdb"
4 | }
5 |
--------------------------------------------------------------------------------
/rre/elastic/src/etc/ratings/ratings.json:
--------------------------------------------------------------------------------
1 | {
2 | "index": "tmdb",
3 | "id_field": "id",
4 | "topics": [
5 | {
6 | "description": "LTR Example Evaluation",
7 | "queries": [
8 | {
9 | "template": "query.json",
10 | "placeholders": {
11 | "$query": "batman"
12 | }
13 | }
14 | ],
15 | "relevant_documents": {
16 | "4": [
17 | "40662",
18 | "45162",
19 | "69735",
20 | "123025",
21 | "142061",
22 | "177271",
23 | "209112",
24 | "242643",
25 | "251519",
26 | "321528",
27 | "324849",
28 | "366924",
29 | "382322"
30 | ],
31 | "3": [
32 | "272",
33 | "13851",
34 | "14919",
35 | "16234",
36 | "20077",
37 | "21683",
38 | "22855",
39 | "29751"
40 | ],
41 | "2": [
42 | "268",
43 | "364",
44 | "414",
45 | "415",
46 | "15805",
47 | "17074"
48 | ],
49 | "1": [
50 | "2661",
51 | "93560",
52 | "125249"
53 | ]
54 | }
55 | }
56 | ]
57 | }
58 |
--------------------------------------------------------------------------------
/rre/elastic/src/etc/templates/README.md:
--------------------------------------------------------------------------------
1 | This folder will contain the query templates associated with the evaluation suite.
2 | The query shape in Elasticsearch is already a JSON file so each template should be a valid Elasticsearch query
3 | with all needed placeholders (that will be defined within the ratings file).
4 |
5 | ```javascript
6 | {
7 | "size": 0,
8 | "query": {
9 | "bool": {
10 | "must": [
11 | {
12 | "multi_match": {
13 | "query": "$query",
14 | "fields": [
15 | "some_searchable_field_1^1.75",
16 | "some_other_searchable_field"
17 | ],
18 | "minimum_should_match": "3<-45% 6<-95%"
19 | }
20 | }
21 | ]
22 | }
23 | },
24 | "aggs": {
25 | "headings": {
26 | "terms": {
27 | "field": "title_sugg",
28 | "order": { "max_score": "desc" }
29 | },
30 | "aggs": {
31 | "max_score": {
32 | "max": {
33 | "script": {
34 | "lang": "painless",
35 | "inline": "_score"
36 | }
37 | }
38 | }
39 | }
40 | }
41 | }
42 | }
43 | ```
--------------------------------------------------------------------------------
/rre/elastic/src/etc/templates/baseline/query.json:
--------------------------------------------------------------------------------
1 | {
2 | "query": {
3 | "match": {
4 | "title": {
5 | "query": "$query"
6 | }
7 | }
8 | }
9 | }
10 |
--------------------------------------------------------------------------------
/rre/elastic/src/etc/templates/classic/query.json:
--------------------------------------------------------------------------------
1 | {
2 | "query": {
3 | "bool": {
4 | "should": [
5 | {
6 | "sltr": {
7 | "featureset": "release",
8 | "model": "classic",
9 | "params": {}
10 | }
11 | }
12 | ],
13 | "filter": [
14 | {"match": {"title": "$query"}}
15 | ]
16 | }
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/rre/elastic/src/etc/templates/latest/query.json:
--------------------------------------------------------------------------------
1 | {
2 | "query": {
3 | "bool": {
4 | "should": [
5 | {
6 | "sltr": {
7 | "featureset": "release",
8 | "model": "latest",
9 | "params": {}
10 | }
11 | }
12 | ],
13 | "filter": [
14 | {"match": {"title": "$query"}}
15 | ]
16 | }
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/rre/opensearch/.dockerignore:
--------------------------------------------------------------------------------
1 | target/*
2 |
--------------------------------------------------------------------------------
/rre/opensearch/.gitignore:
--------------------------------------------------------------------------------
1 | target/*
2 |
--------------------------------------------------------------------------------
/rre/opensearch/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM maven:3.6.0-jdk-8
2 |
3 | # Clone the RRE repo
4 | RUN git clone https://github.com/SeaseLtd/rated-ranking-evaluator
5 | WORKDIR rated-ranking-evaluator
6 |
7 | # Build RRE
8 | RUN mvn clean install
9 |
10 | # Bring over the RRE config
11 | WORKDIR /
12 | COPY . rre
13 | WORKDIR rre
14 |
15 | # By default, run an RRE evaluation if no other command is specified
16 | CMD mvn clean install
17 |
--------------------------------------------------------------------------------
/rre/opensearch/src/etc/configuration_sets/README.md:
--------------------------------------------------------------------------------
1 | This folder contains one subfolder for each configuration version.
2 | Each version folder should contain the index settings associated with such version:
3 |
4 | - `hostUrls`: an array of URLs where the OpenSearch instance for this
5 | version can be accessed.
6 | - `index`: the name of the index holding the data being used to search.
7 |
--------------------------------------------------------------------------------
/rre/opensearch/src/etc/configuration_sets/baseline/index-settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "hostUrls": [ "http://host.docker.internal:9200" ],
3 | "index": "tmdb"
4 | }
5 |
--------------------------------------------------------------------------------
/rre/opensearch/src/etc/configuration_sets/classic/index-settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "hostUrls": [ "http://host.docker.internal:9200" ],
3 | "index": "tmdb"
4 | }
5 |
--------------------------------------------------------------------------------
/rre/opensearch/src/etc/configuration_sets/latest/index-settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "hostUrls": [ "http://host.docker.internal:9200" ],
3 | "index": "tmdb"
4 | }
5 |
--------------------------------------------------------------------------------
/rre/opensearch/src/etc/ratings/ratings.json:
--------------------------------------------------------------------------------
1 | {
2 | "index": "tmdb",
3 | "id_field": "id",
4 | "topics": [
5 | {
6 | "description": "LTR Example Evaluation",
7 | "queries": [
8 | {
9 | "template": "query.json",
10 | "placeholders": {
11 | "$query": "batman"
12 | }
13 | }
14 | ],
15 | "relevant_documents": {
16 | "4": [
17 | "40662",
18 | "45162",
19 | "69735",
20 | "123025",
21 | "142061",
22 | "177271",
23 | "209112",
24 | "242643",
25 | "251519",
26 | "321528",
27 | "324849",
28 | "366924",
29 | "382322"
30 | ],
31 | "3": [
32 | "272",
33 | "13851",
34 | "14919",
35 | "16234",
36 | "20077",
37 | "21683",
38 | "22855",
39 | "29751"
40 | ],
41 | "2": [
42 | "268",
43 | "364",
44 | "414",
45 | "415",
46 | "15805",
47 | "17074"
48 | ],
49 | "1": [
50 | "2661",
51 | "93560",
52 | "125249"
53 | ]
54 | }
55 | }
56 | ]
57 | }
58 |
--------------------------------------------------------------------------------
/rre/opensearch/src/etc/templates/README.md:
--------------------------------------------------------------------------------
1 | This folder will contain the query templates associated with the evaluation suite.
2 | The query shape in OpenSearch is already a JSON file so each template should be a valid OpenSearch query
3 | with all needed placeholders (that will be defined within the ratings file).
4 |
5 | ```javascript
6 | {
7 | "size": 0,
8 | "query": {
9 | "bool": {
10 | "must": [
11 | {
12 | "multi_match": {
13 | "query": "$query",
14 | "fields": [
15 | "some_searchable_field_1^1.75",
16 | "some_other_searchable_field"
17 | ],
18 | "minimum_should_match": "3<-45% 6<-95%"
19 | }
20 | }
21 | ]
22 | }
23 | },
24 | "aggs": {
25 | "headings": {
26 | "terms": {
27 | "field": "title_sugg",
28 | "order": { "max_score": "desc" }
29 | },
30 | "aggs": {
31 | "max_score": {
32 | "max": {
33 | "script": {
34 | "lang": "painless",
35 | "inline": "_score"
36 | }
37 | }
38 | }
39 | }
40 | }
41 | }
42 | }
43 | ```
--------------------------------------------------------------------------------
/rre/opensearch/src/etc/templates/baseline/query.json:
--------------------------------------------------------------------------------
1 | {
2 | "query": {
3 | "match": {
4 | "title": {
5 | "query": "$query"
6 | }
7 | }
8 | }
9 | }
10 |
--------------------------------------------------------------------------------
/rre/opensearch/src/etc/templates/classic/query.json:
--------------------------------------------------------------------------------
1 | {
2 | "query": {
3 | "bool": {
4 | "should": [
5 | {
6 | "sltr": {
7 | "featureset": "release",
8 | "model": "classic",
9 | "params": {}
10 | }
11 | }
12 | ],
13 | "filter": [
14 | {"match": {"title": "$query"}}
15 | ]
16 | }
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/rre/opensearch/src/etc/templates/latest/query.json:
--------------------------------------------------------------------------------
1 | {
2 | "query": {
3 | "bool": {
4 | "should": [
5 | {
6 | "sltr": {
7 | "featureset": "release",
8 | "model": "latest",
9 | "params": {}
10 | }
11 | }
12 | ],
13 | "filter": [
14 | {"match": {"title": "$query"}}
15 | ]
16 | }
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/rre/solr/.dockerignore:
--------------------------------------------------------------------------------
1 | target/*
2 |
--------------------------------------------------------------------------------
/rre/solr/.gitignore:
--------------------------------------------------------------------------------
1 | target/*
2 |
--------------------------------------------------------------------------------
/rre/solr/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM maven:3.6.0-jdk-8
2 |
3 | # Clone the RRE repo
4 | RUN git clone https://github.com/SeaseLtd/rated-ranking-evaluator
5 | WORKDIR rated-ranking-evaluator
6 |
7 | # Build RRE
8 | RUN mvn clean install
9 |
10 | # Bring over the RRE config
11 | WORKDIR /
12 | COPY . rre
13 | WORKDIR rre
14 |
15 | # By default, run an RRE evaluation if no other command is specified
16 | CMD mvn clean install
17 |
--------------------------------------------------------------------------------
/rre/solr/src/etc/configuration_sets/README.md:
--------------------------------------------------------------------------------
1 | This folder contains one subfolder for each configuration version.
2 | Each version folder should contain a solr-settings.json file with details of
3 | how to connect to the appropriate Solr core.
4 |
5 | This is an example:
6 |
7 | * configuration_sets
8 | * v1.0
9 | * solr-settings.json
10 | * v1.1
11 | * solr-settings.json
12 |
13 | The solr-settings.json files may have the following properties:
14 |
15 | - `baseUrls`: an array of Solr base URLs (eg. `[ "http://localhost:8983/solr", "http://localhost:7574/solr" ]`).
16 | - `collectionName` [**REQUIRED**]: the name of the collection or core being evaluated.
17 | - `zkHosts`: an array of Zookeeper hosts (eg. `[ "zk1:2181", "zk2:2181" ]`).
18 | - `zkChroot`: the path to the root Zookeeper node containing Solr data, if running in a Chroot environment (eg. `"/solr"`).
19 | Optional.
20 | - `connectionTimeoutMillis`: the number of milliseconds to wait for a connection to be made to Solr. Optional.
21 | - `socketTimeoutMillis`: the number of milliseconds to allow for a response from Solr. Optional.
22 |
23 | **Either** the baseUrls **or** the zkHosts property must contain values. If both are empty,
24 | the configuration will fail to load.
--------------------------------------------------------------------------------
/rre/solr/src/etc/configuration_sets/baseline/solr-settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "baseUrls": [ "http://host.docker.internal:8983/solr" ],
3 | "collectionName": "tmdb"
4 | }
5 |
--------------------------------------------------------------------------------
/rre/solr/src/etc/configuration_sets/classic/solr-settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "baseUrls": [ "http://host.docker.internal:8983/solr" ],
3 | "collectionName": "tmdb"
4 | }
5 |
--------------------------------------------------------------------------------
/rre/solr/src/etc/configuration_sets/latest/solr-settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "baseUrls": [ "http://host.docker.internal:8983/solr" ],
3 | "collectionName": "tmdb"
4 | }
5 |
--------------------------------------------------------------------------------
/rre/solr/src/etc/ratings/README.md:
--------------------------------------------------------------------------------
1 | Under the ratings folder you should have at least 1 ratings file.
2 | A ratings file is connected with a dataset and contains a set of queries that compose the evaluation execution.
--------------------------------------------------------------------------------
/rre/solr/src/etc/ratings/ratings.json:
--------------------------------------------------------------------------------
1 | {
2 | "index": "tmdb",
3 | "id_field": "id",
4 | "topics": [
5 | {
6 | "description": "LTR Example Evaluation",
7 | "queries": [
8 | {
9 | "template": "query.json",
10 | "placeholders": {
11 | "$query": "batman"
12 | }
13 | }
14 | ],
15 | "relevant_documents": {
16 | "4": [
17 | "40662",
18 | "45162",
19 | "69735",
20 | "123025",
21 | "142061",
22 | "177271",
23 | "209112",
24 | "242643",
25 | "251519",
26 | "321528",
27 | "324849",
28 | "366924",
29 | "382322"
30 | ],
31 | "3": [
32 | "272",
33 | "13851",
34 | "14919",
35 | "16234",
36 | "20077",
37 | "21683",
38 | "22855",
39 | "29751"
40 | ],
41 | "2": [
42 | "268",
43 | "364",
44 | "414",
45 | "415",
46 | "15805",
47 | "17074"
48 | ],
49 | "1": [
50 | "2661",
51 | "93560",
52 | "125249"
53 | ]
54 | }
55 | }
56 | ]
57 | }
58 |
--------------------------------------------------------------------------------
/rre/solr/src/etc/templates/README.md:
--------------------------------------------------------------------------------
1 | This folder will contain the query templates associated with the evaluation suite.
2 | A template is a JSON file containing a JSON object with name->value(s) pairs corresponding to query parameters.
3 | Although it is completely ok to have statically-defined values here, usually you will be using placeholders.
4 |
5 | ```javascript
6 | {
7 | "q": "$query",
8 | "fq": "language:$lang"
9 | }
10 | ```
11 | The placeholders values will be defined within the ratings file, specifically in the queries definitions.
--------------------------------------------------------------------------------
/rre/solr/src/etc/templates/baseline/query.json:
--------------------------------------------------------------------------------
1 | {
2 | "q": "title:($query)"
3 | }
4 |
--------------------------------------------------------------------------------
/rre/solr/src/etc/templates/classic/query.json:
--------------------------------------------------------------------------------
1 | {
2 | "q": "title:($query)",
3 | "rq": "{!ltr model=classic}"
4 | }
5 |
--------------------------------------------------------------------------------
/rre/solr/src/etc/templates/latest/query.json:
--------------------------------------------------------------------------------
1 | {
2 | "q": "title:($query)",
3 | "rq": "{!ltr model=latest}"
4 | }
5 |
--------------------------------------------------------------------------------
/tests/fail.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
class Fail(unittest.TestCase):
    """A test case that always fails.

    Used to confirm the CI/test pipeline actually surfaces failures.
    """

    def test_that_fails(self):
        # Deliberately false assertion.
        assert 1 == 0
7 |
if __name__ == "__main__":
    # Run this module's test directly from the command line.
    unittest.main()
10 |
--------------------------------------------------------------------------------
/tests/nb_test_config.py:
--------------------------------------------------------------------------------
1 | import os
2 |
class NotebookTestConfig:
    """Discover the notebooks in a directory, separating the setup notebook
    (if any) from the notebooks to be tested."""

    # A notebook with this exact name is treated as the suite's setup step.
    SETUP_NB = 'setup.ipynb'

    def __init__(self, path):
        self.notebooks = []
        self.setup = None
        for entry in os.listdir(path):
            candidate = os.path.join(path, entry)
            # Only plain .ipynb files count; skip directories and other files.
            if not (os.path.isfile(candidate) and entry.endswith('.ipynb')):
                continue
            if entry == NotebookTestConfig.SETUP_NB:
                self.setup = candidate
            else:
                self.notebooks.append(candidate)
17 |
18 |
--------------------------------------------------------------------------------
/tests/notebook_test_case.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from nb_test_config import NotebookTestConfig
3 | import runner
4 |
class NotebooksTestCase(unittest.TestCase):
    """Base test case that executes every notebook found under test_paths()
    and asserts none of them produce error outputs.

    Subclasses customize the run via test_paths(), ignored_nbs() and
    nbs_to_run().
    """

    # Each executed notebook is also saved here for post-mortem inspection.
    SAVE_NB_PATH='tests/last_run.ipynb'

    def test_paths(self):
        """Directories to scan for notebooks; subclasses override."""
        return []

    def ignored_nbs(self):
        """Notebook paths to skip entirely; subclasses override."""
        return []

    def nbs_to_run(self):
        """Container deciding which notebooks run; the default admits all."""
        class _AdmitEverything:
            def __contains__(self, _):
                return True
        return _AdmitEverything()

    def test_for_no_errors(self):
        """ Run all nbs in directories at test_paths()
        also included in nbs_to_run(),
        excepting those in ignored_nbs()
        - assert there are no errors
        """
        for nb_path in self.test_paths():
            nb_cfg = NotebookTestConfig(path=nb_path)
            print("EXECUTING NBS IN DIRECTORY: " + nb_path)

            # Run the directory's setup notebook first, if there is one.
            if nb_cfg.setup:
                print("Setting up ... " + nb_path)
                nb, errors = runner.run_notebook(nb_cfg.setup, save_nb_path=NotebooksTestCase.SAVE_NB_PATH)
                print(errors)
                assert len(errors) == 0

            for nb in nb_cfg.notebooks:
                if nb not in self.nbs_to_run():
                    continue
                if nb in self.ignored_nbs():
                    print("Ignored " + nb)
                    continue
                print("Running... " + nb)
                nb, errors = runner.run_notebook(nb, save_nb_path=NotebooksTestCase.SAVE_NB_PATH)
                print(errors)
                assert len(errors) == 0
45 |
46 |
--------------------------------------------------------------------------------
/tests/pass.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
class Pass(unittest.TestCase):
    """A test case that always passes.

    Sanity check that the CI/test pipeline can report success.
    """

    def test_that_passes(self):
        # Trivially true assertion.
        assert 1 == 1
7 |
if __name__ == "__main__":
    # Run this module's test directly from the command line.
    unittest.main()
10 |
--------------------------------------------------------------------------------
/tests/run_most_nbs.py:
--------------------------------------------------------------------------------
1 | from notebook_test_case import NotebooksTestCase
2 | import unittest
3 |
class RunMostNotebooksTestCase(NotebooksTestCase):
    """Execute every notebook in the standard directories, skipping a few
    known long-running or environment-dependent ones."""

    # Directories scanned for notebooks to execute.
    TEST_PATHS = [
        './notebooks/',
        './notebooks/solr/tmdb',
        './notebooks/elasticsearch/tmdb',
        './notebooks/elasticsearch/osc-blog',
        './notebooks/opensearch/tmdb',
        './notebooks/opensearch/osc-blog',
    ]

    # Notebooks excluded from the run.
    IGNORED_NBS = [
        './notebooks/solr/tmdb/evaluation (Solr).ipynb',
        './notebooks/elasticsearch/tmdb/XGBoost.ipynb',
        './notebooks/elasticsearch/tmdb/evaluation.ipynb',
        './notebooks/opensearch/tmdb/XGBoost.ipynb',
        './notebooks/opensearch/tmdb/evaluation.ipynb',
    ]

    def test_paths(self):
        """Directories whose notebooks should be executed."""
        return RunMostNotebooksTestCase.TEST_PATHS

    def ignored_nbs(self):
        """Notebook paths that are skipped."""
        return RunMostNotebooksTestCase.IGNORED_NBS
25 |
26 |
27 |
if __name__ == "__main__":
    # Run the full notebook suite directly from the command line.
    unittest.main()
30 |
--------------------------------------------------------------------------------
/tests/runner.py:
--------------------------------------------------------------------------------
1 | # Notebook test runner, adapted from
2 | # https://www.blog.pythonlibrary.org/2018/10/16/testing-jupyter-notebooks/
3 | import nbformat
4 | import os
5 |
6 | from nbconvert.preprocessors import ExecutePreprocessor
7 |
def hours(hours):
    """Convert a number of hours to seconds.

    Bug fix: the original computed the value but never returned it, so
    callers (e.g. run_notebook's default timeout of hours(6)) silently
    received None.
    """
    return hours * 60 * 60
11 |
def run_notebook(notebook_path, timeout=hours(6), save_nb_path=None):
    """Execute a notebook top-to-bottom and collect any error outputs.

    Args:
        notebook_path: path to the .ipynb file to execute
        timeout: per-cell timeout (seconds) passed to ExecutePreprocessor
        save_nb_path: if set, the executed notebook is written here for
            later inspection

    Returns:
        (nb, errors): the executed notebook object and a list of 'error'
        outputs gathered from all cells (empty when the run was clean).
    """
    # Fix vs. original: dropped the unused nb_name local.
    dirname = os.path.dirname(notebook_path)

    with open(notebook_path) as f:
        nb = nbformat.read(f, as_version=4)

    proc = ExecutePreprocessor(timeout=timeout, kernel_name='python3')
    # Keep executing after a cell errors so every failure is collected.
    proc.allow_errors = True

    # Execute with the notebook's own directory as working dir so relative
    # paths inside the notebook resolve.
    proc.preprocess(nb, {'metadata': {'path': dirname}})

    if save_nb_path:
        with open(save_nb_path, mode='wt') as f:
            nbformat.write(nb, f)

    # Gather every 'error' output across all cells.
    errors = [output
              for cell in nb.cells if 'outputs' in cell
              for output in cell['outputs'] if output.output_type == 'error']

    return nb, errors
36 |
if __name__ == '__main__':
    # Ad-hoc smoke test: execute a local notebook and print any errors.
    nb, errors = run_notebook('Testing.ipynb')
    print(errors)
40 |
--------------------------------------------------------------------------------
/tests/test_prep.py:
--------------------------------------------------------------------------------
# One-off prep script: downloads the TMDB corpus and indexes it into Solr
# so the notebook tests have data to run against.
from ltr.client.solr_client import SolrClient
client = SolrClient()

from ltr import download
from ltr.index import rebuild
from ltr.helpers.movies import indexable_movies

# Source corpus for the hello-ltr demos, downloaded into data/.
corpus='http://es-learn-to-rank.labs.o19s.com/tmdb.json'
download([corpus], dest='data/');

# Stream movie docs out of the downloaded JSON and rebuild the 'tmdb' index.
movies=indexable_movies(movies='data/tmdb.json')
rebuild(client, index='tmdb', doc_src=movies)
--------------------------------------------------------------------------------
/utils/rateFuzzySearch.json.jinja:
--------------------------------------------------------------------------------
1 | {
2 | "from": 0,
3 | "size": 7,
4 | "query": {
5 | "bool": {
6 | "should": [
7 | {"match": {
8 | "title": {
9 | "query": "{{ keywords }}",
10 | "fuzziness": "AUTO"}
11 | }}
12 | ]
13 | }
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/utils/rateSearch.json.jinja:
--------------------------------------------------------------------------------
1 | {
2 | "from": 0,
3 | "size": 5,
4 | "query": {
5 | "bool": {
6 | "should": [
7 | {"match": {
8 | "text_all": "{{ keywords }}"
9 | }},
10 | {
11 | "match_phrase": {
12 | "title": {
13 | "query": "{{ keywords }}",
14 | "boost": 1000
15 | }
16 | }
17 | }]
18 | }
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/utils/train_to_csv.py:
--------------------------------------------------------------------------------
1 | import utils
2 | from ltr.judgments import judgments_from_file
3 | from ltr.client import ElasticClient
4 | import csv
5 |
6 |
def train_to_csv(client, feature_set, in_filename, out_filename):
    """Flatten a judgment (training) file into a CSV, one feature per column.

    :param client: search client used to fetch the stored feature set
    :param feature_set: name of the feature set; its feature names become
        the CSV columns after keywords/qid/grade
    :param in_filename: path of the judgment file to read
    :param out_filename: path of the CSV file to write
    """
    features = client.feature_set(name=feature_set, index='tmdb')[0]
    fieldnames = ['keywords', 'qid', 'grade']
    fieldnames.extend(feature['name'] for feature in features)
    # newline='' per the csv module docs — avoids blank rows on Windows.
    with open(out_filename, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # Fix: read from the in_filename argument; the original ignored it
        # and hardcoded 'data/title_judgments_train.txt'.
        judgments = judgments_from_file(filename=in_filename)
        for judgment in judgments:
            # Every logged feature must map onto a CSV column
            # (the first 3 columns are keywords/qid/grade).
            assert len(judgment.features) == len(fieldnames) - 3
            record = {fieldnames[0]: judgment.keywords,
                      fieldnames[1]: judgment.qid,
                      fieldnames[2]: judgment.grade}
            for idx, field in enumerate(fieldnames[3:]):
                record[field] = judgment.features[idx]

            writer.writerow(record)
26 |
if __name__ == "__main__":
    # Usage: python train_to_csv.py <in_filename> <feature_set> <out_filename>
    from sys import argv
    client = ElasticClient()
    train_to_csv(client=client, in_filename=argv[1],
                 feature_set=argv[2], out_filename=argv[3])
32 |
33 |
34 |
35 |
--------------------------------------------------------------------------------
/utils/utils.py:
--------------------------------------------------------------------------------
import os
import sys

# Make the current working directory importable — presumably these scripts
# are run from the repo root so that the sibling `ltr` package resolves.
sys.path.append(os.getcwd())
5 |
--------------------------------------------------------------------------------