├── .dockerignore ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── clean-notebooks.sh ├── docker-compose.yml ├── docker └── README.md ├── ltr ├── MART_model.py ├── __init__.py ├── clickmodels │ ├── __init__.py │ ├── cascade.py │ ├── coec.py │ ├── conversion.py │ ├── pbm.py │ ├── sdbn.py │ ├── session.py │ └── ubm.py ├── client │ ├── __init__.py │ ├── base_client.py │ ├── elastic_client.py │ ├── opensearch_client.py │ ├── solr_client.py │ └── solr_parse.py ├── date_genre_judgments.py ├── download.py ├── evaluate.py ├── helpers │ ├── __init__.py │ ├── butterfingers.py │ ├── convert.py │ ├── defaultlist.py │ ├── esUrlParse.py │ ├── handle_resp.py │ ├── movies.py │ ├── msmarco │ │ ├── __init__.py │ │ └── evaluate.py │ ├── ranklib_result.py │ ├── solr_escape.py │ └── tau.py ├── index.py ├── injectTypos.py ├── judgments.py ├── log.py ├── p9_plots.py ├── ranklib.py ├── release_date_plot.py ├── search.py └── years_as_ratings.py ├── notebooks ├── click models.ipynb ├── conversion-augmented-click-models.ipynb ├── elasticsearch │ ├── .docker │ │ ├── es-docker │ │ │ ├── Dockerfile │ │ │ ├── elasticsearch.sh │ │ │ └── elasticsearch.yml │ │ └── kb-docker │ │ │ ├── Dockerfile │ │ │ └── kibana.yml │ ├── README.md │ ├── docker-compose.yml │ ├── osc-blog │ │ ├── blog_settings.json │ │ ├── ltr.py │ │ └── osc-blog.ipynb │ └── tmdb │ │ ├── Dataframes.ipynb │ │ ├── XGBoost.ipynb │ │ ├── bayesian-optimization.ipynb │ │ ├── es-ltr-basics-project.ipynb │ │ ├── evaluation.ipynb │ │ ├── fmap.txt │ │ ├── gonna need a bigger bot (ES).ipynb │ │ ├── hello-ltr (ES).ipynb │ │ ├── lambda-mart-in-python.ipynb │ │ ├── ltr.py │ │ ├── netfix movies-random-forests.ipynb │ │ ├── netfix movies.ipynb │ │ ├── raw-es-commands.ipynb │ │ ├── sandbox.ipynb │ │ ├── tale-of-two-queries (ES).ipynb │ │ ├── term-stat-query.ipynb │ │ └── tmdb_settings.json ├── exercises │ ├── Beta distribution for regularizing CTRs.ipynb │ ├── Feature Sets and Feature Logs.ipynb │ ├── Have fun with Hyperparameters .ipynb │ 
├── Models and More Models.ipynb │ ├── click_log.csv │ ├── data │ │ ├── tates_model.txt │ │ ├── title_features.csv │ │ └── title_judgments.txt │ └── ltr.py ├── ltr.py ├── opensearch │ ├── .docker │ │ ├── opensearch-docker │ │ │ ├── Dockerfile │ │ │ ├── opensearch.sh │ │ │ └── opensearch.yml │ │ └── osd-docker │ │ │ ├── Dockerfile │ │ │ └── opensearch_dashboards.yml │ ├── README.md │ ├── docker-compose.yml │ ├── osc-blog │ │ ├── blog_settings.json │ │ ├── ltr.py │ │ └── osc-blog.ipynb │ └── tmdb │ │ ├── Dataframes.ipynb │ │ ├── XGBoost.ipynb │ │ ├── bayesian-optimization.ipynb │ │ ├── evaluation.ipynb │ │ ├── fmap.txt │ │ ├── gonna need a bigger bot (OpenSearch).ipynb │ │ ├── hello-ltr (OpenSearch).ipynb │ │ ├── lambda-mart-in-python.ipynb │ │ ├── ltr.py │ │ ├── netfix movies-random-forests.ipynb │ │ ├── netfix movies.ipynb │ │ ├── opensearch-ltr-basics-project.ipynb │ │ ├── raw-opensearch-commands.ipynb │ │ ├── sandbox.ipynb │ │ ├── tale-of-two-queries (OpenSearch).ipynb │ │ ├── term-stat-query.ipynb │ │ └── tmdb_settings.json └── solr │ ├── .docker │ └── solr_home │ │ ├── solr.xml │ │ ├── tmdb │ │ └── conf │ │ │ ├── currency.xml │ │ │ ├── elevate.xml │ │ │ ├── idioms.txt │ │ │ ├── lang │ │ │ ├── contractions_ca.txt │ │ │ ├── contractions_fr.txt │ │ │ ├── contractions_ga.txt │ │ │ ├── contractions_it.txt │ │ │ ├── hyphenations_ga.txt │ │ │ ├── stemdict_nl.txt │ │ │ ├── stoptags_ja.txt │ │ │ ├── stopwords_ar.txt │ │ │ ├── stopwords_bg.txt │ │ │ ├── stopwords_ca.txt │ │ │ ├── stopwords_cz.txt │ │ │ ├── stopwords_da.txt │ │ │ ├── stopwords_de.txt │ │ │ ├── stopwords_el.txt │ │ │ ├── stopwords_en.txt │ │ │ ├── stopwords_es.txt │ │ │ ├── stopwords_eu.txt │ │ │ ├── stopwords_fa.txt │ │ │ ├── stopwords_fi.txt │ │ │ ├── stopwords_fr.txt │ │ │ ├── stopwords_ga.txt │ │ │ ├── stopwords_gl.txt │ │ │ ├── stopwords_hi.txt │ │ │ ├── stopwords_hu.txt │ │ │ ├── stopwords_hy.txt │ │ │ ├── stopwords_id.txt │ │ │ ├── stopwords_it.txt │ │ │ ├── stopwords_ja.txt │ │ │ ├── 
stopwords_lv.txt │ │ │ ├── stopwords_nl.txt │ │ │ ├── stopwords_no.txt │ │ │ ├── stopwords_pt.txt │ │ │ ├── stopwords_ro.txt │ │ │ ├── stopwords_ru.txt │ │ │ ├── stopwords_sv.txt │ │ │ ├── stopwords_th.txt │ │ │ ├── stopwords_tr.txt │ │ │ └── userdict_ja.txt │ │ │ ├── name_synonyms.txt │ │ │ ├── names.txt │ │ │ ├── params.json │ │ │ ├── protwords.txt │ │ │ ├── schema.xml │ │ │ ├── solrconfig.xml │ │ │ ├── stopwords.txt │ │ │ ├── synonyms.txt │ │ │ ├── synonyms_bidirect.txt │ │ │ ├── synonyms_directed.txt │ │ │ ├── synonyms_genres.txt │ │ │ ├── synonyms_multiterm.txt │ │ │ ├── taxonomy.txt │ │ │ └── taxonomy_parent.txt │ │ └── zoo.cfg │ ├── Dockerfile │ ├── docker-compose.yml │ ├── msmarco │ ├── ltr.py │ ├── msmarco.ipynb │ └── solr_config │ │ └── conf │ │ ├── elevate.xml │ │ ├── misspell.txt │ │ ├── params.json │ │ ├── plural_misstems.txt │ │ ├── schema.xml │ │ └── solrconfig.xml │ └── tmdb │ ├── ai-powered-search-ch-10.ipynb │ ├── ai-powered-search.ipynb │ ├── evaluation (Solr).ipynb │ ├── gonna need a bigger bot (Solr).ipynb │ ├── hello-ltr (Solr).ipynb │ ├── ltr.py │ ├── netfix movies(Solr).ipynb │ ├── raw-solr-commands.ipynb │ ├── solr_config │ └── conf │ │ ├── currency.xml │ │ ├── elevate.xml │ │ ├── idioms.txt │ │ ├── lang │ │ ├── contractions_ca.txt │ │ ├── contractions_fr.txt │ │ ├── contractions_ga.txt │ │ ├── contractions_it.txt │ │ ├── hyphenations_ga.txt │ │ ├── stemdict_nl.txt │ │ ├── stoptags_ja.txt │ │ ├── stopwords_ar.txt │ │ ├── stopwords_bg.txt │ │ ├── stopwords_ca.txt │ │ ├── stopwords_cz.txt │ │ ├── stopwords_da.txt │ │ ├── stopwords_de.txt │ │ ├── stopwords_el.txt │ │ ├── stopwords_en.txt │ │ ├── stopwords_es.txt │ │ ├── stopwords_eu.txt │ │ ├── stopwords_fa.txt │ │ ├── stopwords_fi.txt │ │ ├── stopwords_fr.txt │ │ ├── stopwords_ga.txt │ │ ├── stopwords_gl.txt │ │ ├── stopwords_hi.txt │ │ ├── stopwords_hu.txt │ │ ├── stopwords_hy.txt │ │ ├── stopwords_id.txt │ │ ├── stopwords_it.txt │ │ ├── stopwords_ja.txt │ │ ├── stopwords_lv.txt │ │ ├── 
stopwords_nl.txt │ │ ├── stopwords_no.txt │ │ ├── stopwords_pt.txt │ │ ├── stopwords_ro.txt │ │ ├── stopwords_ru.txt │ │ ├── stopwords_sv.txt │ │ ├── stopwords_th.txt │ │ ├── stopwords_tr.txt │ │ └── userdict_ja.txt │ │ ├── name_synonyms.txt │ │ ├── names.txt │ │ ├── params.json │ │ ├── protwords.txt │ │ ├── schema.xml │ │ ├── solrconfig.xml │ │ ├── stopwords.txt │ │ ├── synonyms.txt │ │ ├── synonyms_bidirect.txt │ │ ├── synonyms_directed.txt │ │ ├── synonyms_genres.txt │ │ ├── synonyms_multiterm.txt │ │ ├── taxonomy.txt │ │ └── taxonomy_parent.txt │ ├── svmrank.ipynb │ └── tale-of-two-queries (Solr).ipynb ├── requirements.txt ├── rre ├── README.md ├── elastic │ ├── .dockerignore │ ├── .gitignore │ ├── Dockerfile │ ├── pom.xml │ └── src │ │ └── etc │ │ ├── configuration_sets │ │ ├── README.md │ │ ├── baseline │ │ │ └── index-settings.json │ │ ├── classic │ │ │ └── index-settings.json │ │ └── latest │ │ │ └── index-settings.json │ │ ├── ratings │ │ └── ratings.json │ │ └── templates │ │ ├── README.md │ │ ├── baseline │ │ └── query.json │ │ ├── classic │ │ └── query.json │ │ └── latest │ │ └── query.json ├── opensearch │ ├── .dockerignore │ ├── .gitignore │ ├── Dockerfile │ ├── pom.xml │ └── src │ │ └── etc │ │ ├── configuration_sets │ │ ├── README.md │ │ ├── baseline │ │ │ └── index-settings.json │ │ ├── classic │ │ │ └── index-settings.json │ │ └── latest │ │ │ └── index-settings.json │ │ ├── ratings │ │ └── ratings.json │ │ └── templates │ │ ├── README.md │ │ ├── baseline │ │ └── query.json │ │ ├── classic │ │ └── query.json │ │ └── latest │ │ └── query.json └── solr │ ├── .dockerignore │ ├── .gitignore │ ├── Dockerfile │ ├── pom.xml │ └── src │ └── etc │ ├── configuration_sets │ ├── README.md │ ├── baseline │ │ └── solr-settings.json │ ├── classic │ │ └── solr-settings.json │ └── latest │ │ └── solr-settings.json │ ├── ratings │ ├── README.md │ └── ratings.json │ └── templates │ ├── README.md │ ├── baseline │ └── query.json │ ├── classic │ └── query.json │ └── 
latest │ └── query.json ├── tests ├── fail.py ├── nb_test_config.py ├── notebook_test_case.py ├── pass.py ├── run_most_nbs.py ├── runner.py ├── test.sh ├── test_judg_list.py └── test_prep.py └── utils ├── rate.py ├── rateFuzzySearch.json.jinja ├── rateSearch.json.jinja ├── train_to_csv.py └── utils.py /.dockerignore: -------------------------------------------------------------------------------- 1 | #data/ 2 | venv/ 3 | venv2/ 4 | .git/ 5 | .cache/ 6 | .trash/ 7 | **/venv* 8 | #**/data/ 9 | **/__pycache__/ 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | **/data 3 | venv/* 4 | **/.ipynb_checkpoints 5 | tests/last_run.ipynb 6 | 7 | *.pyc 8 | .vscode 9 | .cache 10 | features.txt 11 | .trash 12 | .DS_store 13 | notify.sh 14 | 15 | .idea/ 16 | *.iml 17 | tests_venv/* 18 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.17-slim-bookworm 2 | 3 | # Get openjdk 4 | ENV JAVA_HOME=/opt/java/openjdk 5 | COPY --from=eclipse-temurin:11 $JAVA_HOME $JAVA_HOME 6 | ENV PATH="${JAVA_HOME}/bin:${PATH}" 7 | 8 | # Install graphviz 9 | RUN apt-get update && \ 10 | apt-get install -y graphviz && \ 11 | apt-get clean; 12 | 13 | # Setup a user 14 | RUN useradd -ms /bin/bash ltr 15 | WORKDIR /home/ltr 16 | 17 | # Make current directory accesible 18 | ADD . 
/home/ltr/hello-ltr 19 | 20 | # Install requirements 21 | RUN chown -R ltr.ltr hello-ltr 22 | WORKDIR /home/ltr/hello-ltr 23 | 24 | RUN /usr/local/bin/python -m pip install --upgrade pip 25 | RUN pip install -r requirements.txt 26 | USER ltr 27 | 28 | CMD jupyter notebook --ip=0.0.0.0 --no-browser --NotebookApp.token='' 29 | -------------------------------------------------------------------------------- /clean-notebooks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Removes all output and metadata from notebooks 4 | find notebooks -type f -name "*.ipynb" -print0 | xargs -0 nbstripout 5 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | notebooks: 3 | build: . 4 | container_name: hello-ltr-notebook 5 | ports: 6 | - 8888:8888 7 | environment: 8 | - LTR_DOCKER=yes 9 | links: 10 | - elastic 11 | - solr 12 | 13 | elastic: 14 | build: 15 | context: ./notebooks/elasticsearch/.docker/es-docker/ 16 | dockerfile: Dockerfile 17 | container_name: hello-ltr-elastic 18 | environment: 19 | - xpack.security.enabled=false 20 | - xpack.security.enrollment.enabled=false 21 | ports: 22 | - 9200:9200 23 | 24 | kibana: 25 | build: 26 | context: ./notebooks/elasticsearch/.docker/kb-docker/ 27 | dockerfile: Dockerfile 28 | container_name: hello-ltr-kibana 29 | expose: 30 | - "5601" 31 | ports: 32 | - "5601:5601" 33 | environment: 34 | ELASTICSEARCH_HOSTS: "http://hello-ltr-elastic:9200" 35 | ELASTICSEARCH_URL: "http://hello-ltr-elastic:9200" 36 | SERVER_HOST: "0.0.0.0" 37 | 38 | opensearch-node1: 39 | build: 40 | context: ./notebooks/opensearch/.docker/opensearch-docker/ 41 | dockerfile: Dockerfile 42 | container_name: hello-ltr-opensearch 43 | ports: 44 | - "9201:9201" 45 | environment: 46 | - "SERVER_HOST=0.0.0.0" 47 | - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" 48 | - 
"DISABLE_INSTALL_DEMO_CONFIG=true" 49 | - "DISABLE_SECURITY_PLUGIN=true" 50 | - "discovery.type=single-node" 51 | 52 | osd-dashboards: 53 | build: 54 | context: ./notebooks/opensearch/.docker/osd-docker/ 55 | dockerfile: Dockerfile 56 | container_name: hello-ltr-osd 57 | expose: 58 | - "5602" 59 | ports: 60 | - "5602:5602" 61 | environment: 62 | OPENSEARCH_HOSTS: "http://hello-ltr-opensearch:9201" 63 | OPENSEARCH_URL: "http://hello-ltr-opensearch:9201" 64 | SERVER_HOST: "0.0.0.0" 65 | DISABLE_SECURITY_DASHBOARDS_PLUGIN: true 66 | 67 | 68 | solr: 69 | build: 70 | context: ./notebooks/solr/ 71 | dockerfile: Dockerfile 72 | container_name: hello-ltr-solr 73 | ports: 74 | - 8983:8983 75 | 76 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | If you have issues getting jupyter or the JDK running on the host machine, you can use the files here to setup a docker environment with everything in one place. 
from collections import Counter, defaultdict


class Model():
    def __init__(self):
        # Attractiveness per query-doc, defaulting to 0.5 for unseen pairs
        self.attracts = defaultdict(lambda: 0.5)


def cascade_model(sessions):
    """Estimate per query-doc attractiveness with the cascade click model.

    The cascade model assumes a searcher scans results top-down and
    stops at the first click, so it can be solved directly:

    - every doc shown at or above the first click counts as an impression
    - the clicked doc additionally counts as a click
    - docs ranked below the first click were never examined, so they
      are not counted at all

    Returns a Model whose ``attracts[(query, doc_id)]`` is the ratio of
    clicks to examined impressions.
    """
    session_counts = Counter()
    click_counts = Counter()
    model = Model()

    for session in sessions:
        for doc in session.docs:
            query_doc_key = (session.query, doc.doc_id)
            session_counts[query_doc_key] += 1

            if doc.click:
                # The cascade model doesn't consider clicks past the
                # first one, so count this click and stop scanning.
                click_counts[query_doc_key] += 1
                break

    for query_doc_key, count in session_counts.items():
        model.attracts[query_doc_key] = click_counts[query_doc_key] / count
    return model


if __name__ == "__main__":
    # Demo-only dependency: imported lazily so importing this module
    # doesn't pull in the session helpers.
    from ltr.clickmodels.session import build

    sessions = build([
        ('A', ((1, True), (2, False), (3, True), (0, False))),
        ('B', ((5, False), (2, True), (3, True), (0, False))),
        ('A', ((1, False), (2, False), (3, True), (0, False))),
        ('B', ((1, False), (2, False), (3, False), (9, True))),
        ('A', ((9, False), (2, False), (1, True), (0, True))),
        ('B', ((6, True), (2, False), (3, True), (1, False))),
        ('A', ((7, False), (4, True), (1, False), (3, False))),
        ('B', ((8, True), (2, False), (3, True), (1, False))),
        ('A', ((1, False), (4, True), (2, False), (3, False))),
        ('B', ((7, True), (4, False), (5, True), (1, True))),
    ])
    cascade_model(sessions)
from collections import Counter


class Model():
    def __init__(self):
        # Clicks-over-expected-clicks statistic per query-doc pair
        self.coecs = Counter()

        # CTR for each query-doc pair in this session
        self.ctrs = {}


def coec(ctr_by_rank, sessions):
    """Compute clicks over expected clicks (COEC) per query-doc pair.

    COEC measures whether an item gets above or below average CTR for
    the rank position it was shown at.  From the paper:

    > Personalized Click Prediction in Sponsored Search
      by Cheng, Cantu Paz

    A COEC > 1 means above-average CTR for its position;
    a COEC < 1 means below average.

    - ctr_by_rank is the global CTR at each rank position
    - sessions are an array of search session objects

    Returns a Model holding the COEC of every query-doc pair seen in
    the provided sessions.
    """
    click_counts = Counter()
    expected_clicks = Counter()

    for session in sessions:
        query = session.query
        for rank, doc in enumerate(session.docs):
            pair = (query, doc.doc_id)
            expected_clicks[pair] += ctr_by_rank[rank]
            if doc.click:
                click_counts[pair] += 1

    model = Model()
    for pair in expected_clicks:
        model.coecs[pair] = click_counts[pair] / expected_clicks[pair]

    return model
from collections import Counter


def conv_aug_attracts(attracts, sessions, costs):
    """Re-scan sessions, blending click-derived attractiveness with conversions.

    A click that also converts confirms the attractiveness with actual
    relevance, so the full attractiveness is credited.  Without a
    conversion the attractiveness-derived judgment is discounted by the
    action's cost: costly actions are punished less (not converting is
    forgivable), cheap actions more (why didn't they take the easy action?).

    Returns a Counter of mean satisfaction per (query, doc_id) pair.
    """
    satisfacts = Counter()
    counts = Counter()

    for session in sessions:
        for doc in session.docs:
            pair = (session.query, doc.doc_id)
            attract = attracts[pair]
            counts[pair] += 1
            if doc.click and doc.conversion:
                # The conversion confirms the attractiveness was real
                satisfacts[pair] += attract
            else:
                # No conversion (whether clicked or not): discount the
                # attractiveness in proportion to how cheap the action was
                satisfacts[pair] += attract * costs[doc.doc_id]

    for pair, count in counts.items():
        satisfacts[pair] = satisfacts[pair] / count

    return satisfacts
from collections import Counter, defaultdict


class Model():
    def __init__(self):
        # Satisfaction per query-doc, defaulting to 0.1 for unseen pairs
        self.satisfacts = defaultdict(lambda: 0.1)

        # Attractiveness per query-doc, defaulting to 0.1 for unseen pairs
        self.attracts = defaultdict(lambda: 0.1)


def reverse_enumerate(l):
    """Yield (rank, item) pairs from the last element back to the first."""
    return zip(range(len(l) - 1, -1, -1), reversed(l))


def sdbn(sessions):
    """Fit a Simplified Dynamic Bayesian Network (SDBN) click model.

    SDBN is a simpler version of the much more complex Dynamic Bayesian
    Network that the authors say comes close to the accuracy of DBN.
    Most importantly, it can be solved directly and simply without an
    EM learning process.

    Features of SDBN:
    - Attractiveness is any click out of the sessions where the
      document appears at or above the last click of the session
    - Satisfaction occurs when a doc is the last document clicked,
      out of all sessions where that document is clicked

    Returns a Model with ``attracts`` and ``satisfacts`` keyed by
    (query, doc_id); unseen pairs fall back to the 0.1 default.
    """
    model = Model()
    NO_CLICK = -1
    counts = Counter()
    clicks = Counter()
    last_clicks = Counter()
    for session in sessions:
        last_click = NO_CLICK
        # Scan bottom-up so the session's last click is found first;
        # docs ranked below it are assumed never examined and skipped.
        for rank, doc in reverse_enumerate(session.docs):
            if last_click == NO_CLICK and doc.click:
                last_click = rank

            if last_click != NO_CLICK:
                query_doc = (session.query, doc.doc_id)
                counts[query_doc] += 1

                if doc.click:
                    clicks[query_doc] += 1
                    if rank == last_click:
                        last_clicks[query_doc] += 1

    # For all meaningful sessions (where query_doc appears examined):
    #   attractiveness = clicks / examined sessions
    #   satisfaction   = last clicks / sessions with clicks
    for query_doc, count in counts.items():
        model.attracts[query_doc] = clicks[query_doc] / count
        if query_doc in clicks:
            model.satisfacts[query_doc] = last_clicks[query_doc] / clicks[query_doc]
    return model


if __name__ == "__main__":
    # Demo-only dependency: imported lazily so importing this module
    # doesn't pull in the session helpers.
    from ltr.clickmodels.session import build

    sessions = build([
        ('A', ((1, True), (2, False), (3, True), (0, False))),
        ('B', ((5, False), (2, True), (3, True), (0, False))),
        ('A', ((1, False), (2, False), (3, True), (0, False))),
        ('B', ((1, False), (2, False), (3, False), (9, True))),
        ('A', ((9, False), (2, False), (1, True), (0, True))),
        ('B', ((6, True), (2, False), (3, True), (1, False))),
        ('A', ((7, False), (4, True), (1, False), (3, False))),
        ('B', ((8, True), (2, False), (3, True), (1, False))),
        ('A', ((1, False), (4, True), (2, False), (3, False))),
        ('B', ((7, True), (4, False), (5, True), (1, True))),
    ])
    model = sdbn(sessions)
    print(model.attracts[('A', 1)])
    print(model.satisfacts[('A', 1)])
    print(model.attracts[('B', 1)])
    print(model.satisfacts[('B', 1)])
class Doc:
    """One result in a search session: its id, click, and conversion state."""

    def __init__(self, click, doc_id, conversion=False):
        self.click = click
        self.doc_id = doc_id
        self.conversion = conversion

    def __repr__(self):
        return f"Doc(doc_id={self.doc_id}, click={self.click}, conversion={self.conversion})"

    def __str__(self):
        return f"({self.doc_id}, {self.click}, {self.conversion})"


class Session:
    """A query plus the ordered docs shown for it; doc ids must be unique."""

    def __init__(self, query, docs):
        self.query = query
        self.docs = docs
        # Reject result lists that show the same doc more than once
        seen = set()
        for doc in docs:
            if doc.doc_id in seen:
                raise ValueError("A session may only list a doc exactly once in search results")
            seen.add(doc.doc_id)

    def __repr__(self):
        return f"Session(query={self.query}, docs={self.docs})"

    def __str__(self):
        return f"({self.query}, ({self.docs}))"


def build_one(sess_tuple):
    """Build a Session from one (query, doc_tuples) pair.

    The 0th item is the query (a string that uniquely identifies it);
    the 1st is a list of (doc_id, click) tuples:

        ('A', ((1, True), (2, False), (3, True), (0, False)))

    A third element per doc optionally carries a conversion id/flag:

        ('A', ((1, True, 0.9), (2, False, 0.8), (3, True, 1.0), (0, False)))
    """
    query, doc_tuples = sess_tuple[0], sess_tuple[1]
    docs = []
    for tup in doc_tuples:
        conversion = tup[2] if len(tup) > 2 else False
        docs.append(Doc(doc_id=tup[0], click=tup[1], conversion=conversion))
    return Session(query=query, docs=docs)


def build(sess_tuples):
    """Build a list of Sessions from a list of session tuples."""
    return [build_one(t) for t in sess_tuples]
from abc import ABC, abstractmethod

'''
This project demonstrates working with LTR in Elasticsearch and Solr

The goal of this class is to abstract away the server and highlight the steps
required to begin working with LTR. This keeps the examples agnostic about
which backend is being used, but the implementations of each client
should be useful references to those getting started with LTR on
their specific platform
'''
class BaseClient(ABC):
    """Abstract interface every search-engine client must implement.

    Concrete implementations (Elastic/OpenSearch/Solr clients) provide the
    engine-specific behavior; method semantics below are the interface
    contract as inferred from names and call sites — verify against the
    concrete clients for engine-specific details.
    """

    @abstractmethod
    def get_host(self):
        """Return the host (and port) this client talks to."""
        pass

    @abstractmethod
    def name(self):
        """Return a short identifier for the backend (e.g. 'elastic', 'solr')."""
        pass

    @abstractmethod
    def delete_index(self, index):
        """Delete the named index."""
        pass

    @abstractmethod
    def create_index(self, index):
        """Create the named index (with engine-specific settings)."""
        pass

    @abstractmethod
    def index_documents(self, index, doc_src):
        """Index the documents yielded by doc_src into the named index."""
        pass

    @abstractmethod
    def reset_ltr(self, index):
        """Clear any existing LTR feature stores/models for the index."""
        pass

    @abstractmethod
    def create_featureset(self, index, name, ftr_config):
        """Register a named LTR feature set from ftr_config on the index."""
        pass

    @abstractmethod
    def get_feature_name(self, config, ftr_idx):
        """Return the name of feature ftr_idx within a feature-set config."""
        pass

    @abstractmethod
    def query(self, index, query):
        """Run a search against the index and return the matching docs."""
        pass

    @abstractmethod
    def get_doc(self, doc_id):
        """Fetch a single document by id."""
        pass

    @abstractmethod
    def log_query(self, index, featureset, ids, params):
        """Log feature values of the featureset for the given doc ids/params."""
        pass

    @abstractmethod
    def submit_model(self, featureset, index, model_name, model_payload):
        """Upload a trained model (engine-native payload) under model_name."""
        pass

    @abstractmethod
    def submit_ranklib_model(self, featureset, index, model_name, model_payload):
        """Upload a RankLib-format model under model_name."""
        pass

    @abstractmethod
    def model_query(self, index, model, model_params, query):
        """Run a search reranked by the named LTR model."""
        pass

    @abstractmethod
    def feature_set(self, index, name):
        """ Return a mapping of name/feature ordinal
        and the raw (search engine specific) feature list"""
        pass
def every_other_zipped(lst):
    """Pair consecutive (key, value) entries of a flat Solr named list."""
    return zip(lst[0::2], lst[1::2])


def dictify(nl_tups):
    """Return a dict when every key is unique; otherwise keep the tuple list."""
    as_dict = dict(nl_tups)
    if len(as_dict) == len(nl_tups):
        return as_dict
    return nl_tups


def parse_named_list(lst):
    """Recursively convert a Solr named list into dicts (or tuple lists)."""
    parsed = []
    for key, value in every_other_zipped(lst):
        if isinstance(value, list):
            value = parse_named_list(value)
        parsed.append((key, value))
    return dictify(parsed)


def parse_termvect_namedlist(lst, field):
    """Parse a term-vector named list into consistent JSON-like structures.

    Specifically changes {"positions": ...} entries of the requested
    field into {"positions": [1234, 4567]} position lists.
    """

    def listify_posns(posn_attrs):
        # A single position parses into a one-entry dict; several parse
        # into (key, value) tuples because the "position" key repeats.
        if isinstance(posn_attrs, dict):
            assert len(posn_attrs) == 1
            return [posn_attrs['position']]
        return [pair[1] for pair in posn_attrs]

    tv_parsed = parse_named_list(lst)
    for doc_id, doc_field_tv in tv_parsed.items():
        for field_name, term_vects in doc_field_tv.items():
            if field_name != field:
                continue
            for term, attrs in term_vects.items():
                for attr_key, attr_val in attrs.items():
                    if attr_key == 'positions':
                        attrs['positions'] = listify_posns(attr_val)
    return tv_parsed


if __name__ == "__main__":
    solr_nl = [
        "D100000", [
            "uniqueKey", "D100000",
            "body", [
                "1", [
                    "positions", [
                        "position", 92,
                        "position", 113
                    ]],
                "2", [
                    "positions", [
                        "position", 22,
                        "position", 413
                    ]],
                "boo", [
                    "positions", [
                        "position", 22,
                    ]]
            ]]]
    print(repr(parse_termvect_namedlist(solr_nl, 'body')))
from .judgments import Judgment, judgments_to_file
from tqdm import tqdm

# Genre -> synthetic query-id mapping; 0 means "not a genre we train on"
_GENRE_QIDS = {"Science Fiction": 1, "Drama": 2}

# Each genre's opposite, used when auto-negating
_OPPOSITE_GENRE = {"Science Fiction": "Drama", "Drama": "Science Fiction"}


def genreQid(genre):
    """Map a genre name to its synthetic query id (0 if not of interest)."""
    return _GENRE_QIDS.get(genre, 0)


def genreGrade(movie):
    """ Create a simple training set, as if we were
    searching for a genre.

    Newer science fiction is considered better
    Older drama is considered better

    """
    release_year = movie.get('release_year')
    if release_year is None:
        return 0
    releaseYear = int(release_year)

    genre = movie['genres'][0]
    if genre == "Science Fiction":
        # Newer is better
        for cutoff, grade in ((2015, 4), (2010, 3), (2000, 2), (1990, 1)):
            if releaseYear > cutoff:
                return grade
        return 0

    if genre == "Drama":
        # Older is better
        for cutoff, grade in ((1990, 0), (1970, 1), (1950, 2), (1930, 3)):
            if releaseYear > cutoff:
                return grade
        return 4

    return 0


def synthesize(client, judgmentsOutFile='genre_by_date_judgments.txt', autoNegate=False):
    """Generate synthetic genre/date judgments for every scifi & drama movie.

    Queries the whole tmdb index through the given client, grades each
    movie with genreGrade, optionally emits a zero-grade judgment for the
    opposite genre (autoNegate), writes the list to judgmentsOutFile and
    returns it.
    """
    print('Generating judgments for scifi & drama movies')

    if client.name() in ['elastic', 'opensearch']:
        params = {
            "query": {
                "match_all": {}
            },
            "size": 10000,
            "sort": [{"_id": "asc"}]
        }
    else:
        params = {
            "q": "*:*",
            "rows": 10000,
            "sort": "id ASC",
            "wt": 'json'
        }

    resp = client.query('tmdb', params)

    # Build judgments for each film
    judgments = []
    for movie in tqdm(resp):
        if 'genres' in movie and len(movie['genres']) > 0:
            genre = movie['genres'][0]
            qid = genreQid(genre)
            if qid == 0:
                continue
            judgments.append(Judgment(qid=qid,
                                      grade=genreGrade(movie),
                                      docId=movie['id'],
                                      keywords=genre))

            # This movie is good for its genre, but
            # a bad result for the opposite genre
            negGenre = _OPPOSITE_GENRE.get(genre)
            if autoNegate and negGenre is not None:
                judgments.append(Judgment(qid=genreQid(negGenre),
                                          grade=0,
                                          docId=movie['id'],
                                          keywords=negGenre))

    with open(judgmentsOutFile, 'w') as f:
        judgments_to_file(f, judgmentsList=judgments)

    return judgments
import requests
from os import path

def download_one(uri, dest='data/', force=False):
    """Download a single URI into the dest directory.

    The target filename is taken from the last path segment of the URI.
    An existing file is left untouched unless force=True.

    Raises:
        ValueError: dest exists but is not a directory.
        requests.HTTPError: the server answered with an error status.
    """
    import os

    os.makedirs(dest, exist_ok=True)

    if not os.path.isdir(dest):
        raise ValueError("dest {} is not a directory".format(dest))

    filename = uri[uri.rfind('/') + 1:]
    filepath = os.path.join(dest, filename)
    if path.exists(filepath):
        if not force:
            print(filepath + ' already exists')
            return
        print("exists but force=True, Downloading anyway")

    print('GET {}'.format(uri))
    resp = requests.get(uri, stream=True)
    # Fail before opening the destination file, so an HTTP error page is
    # never written to disk as if it were the payload.
    resp.raise_for_status()
    with open(filepath, 'wb') as out:
        for chunk in resp.iter_content(chunk_size=1024):
            if chunk:
                out.write(chunk)

def download(uris, dest='data/', force=False):
    """Download each URI in uris into dest (see download_one)."""
    for uri in uris:
        download_one(uri=uri, dest=dest, force=force)
def rre_table():
    """Render the RRE evaluation report (data/rre-evaluation.json) as a
    plotly table comparing Precision, Recall, and ERR@30 across the
    baseline/classic/latest experiments."""
    init_notebook_mode(connected=True)

    with open('data/rre-evaluation.json') as src:
        metrics = json.load(src)['metrics']

    experiments = ['baseline', 'classic', 'latest']

    # Column of values per RRE metric, in experiment order
    columns = {'P': [], 'R': [], 'ERR@30': []}
    for exp in experiments:
        for metric_name, column in columns.items():
            column.append(metrics[metric_name]['versions'][exp]['value'])

    trace = go.Table(
        header=dict(values=['', 'Precision', 'Recall', 'ERR'],
                    fill=dict(color='#AAAAAA')),
        cells=dict(values=[
            experiments,
            columns['P'],
            columns['R'],
            columns['ERR@30'],
        ])
    )

    iplot([trace])
def butterfingers(text, prob=0.1, keyboard='qwerty'):
    """Return `text` with random adjacent-key typos injected.

    Each character that exists on the keyboard map may, with roughly
    probability `prob`, be replaced by one of its physical neighbours.
    Case is preserved. Characters not on the map pass through unchanged.

    Adapted from
    https://github.com/Decagon/butter-fingers/blob/master/butterfingers/butterfingers.py
    """
    import random

    neighbors = {}
    if keyboard == "qwerty":
        neighbors = {
            'q': "qwasedzx", 'w': "wqesadrfcx", 'e': "ewrsfdqazxcvgt",
            'r': "retdgfwsxcvgt", 't': "tryfhgedcvbnju", 'y': "ytugjhrfvbnji",
            'u': "uyihkjtgbnmlo", 'i': "iuojlkyhnmlp", 'o': "oipklujm",
            'p': "plo['ik",
            'a': "aqszwxwdce", 's': "swxadrfv", 'd': "decsfaqgbv",
            'f': "fdgrvwsxyhn", 'g': "gtbfhedcyjn", 'h': "hyngjfrvkim",
            'j': "jhknugtblom", 'k': "kjlinyhn", 'l': "lokmpujn",
            'z': "zaxsvde", 'x': "xzcsdbvfrewq", 'c': "cxvdfzswergb",
            'v': "vcfbgxdertyn", 'b': "bvnghcftyun", 'n': "nbmhjvgtuik",
            'm': "mnkjloik", ' ': " ",
        }
    else:
        print("Keyboard not supported.")

    typo_threshold = int(prob * 100)

    out_chars = []
    for ch in text:
        lower = ch.lower()
        if lower not in neighbors:
            replacement = lower
        elif random.choice(range(0, 100)) <= typo_threshold:
            replacement = random.choice(neighbors[lower])
        else:
            replacement = lower
        # restore the original case
        if lower != ch:
            replacement = replacement.upper()
        out_chars.append(replacement)

    return "".join(out_chars)
def convert(ensemble_xml_string, modelName, featureSet, featureMapping):
    """Convert a RankLib LambdaMART ensemble (XML string) into the JSON
    model format expected by the Solr LTR plugin.

    featureMapping is an ordered list of {'name': ...} dicts used to map
    RankLib's 1-based feature ordinals back to feature names.
    """
    model = {
        'store': featureSet,
        'name': modelName,
        'class': 'org.apache.solr.ltr.model.MultipleAdditiveTreesModel',
        'features': featureMapping,
    }

    # Drop the 7-line RankLib header so the remainder parses as XML
    xml_body = '\n'.join(ensemble_xml_string.split('\n')[7:])
    ensemble = ET.fromstring(xml_body)

    trees = [
        {
            'weight': str(tree.attrib['weight']),
            'root': parseSplits(tree[0], featureMapping),
        }
        for tree in ensemble
    ]

    model['params'] = {'trees': trees}
    return model


def parseSplits(split, features):
    """Recursively translate one <split> XML element into the nested
    tree-node dict format used by the Solr LTR plugin."""
    node = {}
    for child in split:
        tag = child.tag
        if tag == 'feature':
            # RankLib feature ordinals are 1-based; map back to the name
            node['feature'] = features[int(child.text.strip()) - 1]['name']
        elif tag == 'threshold':
            node['threshold'] = str(child.text.strip())
        elif tag == 'split' and 'pos' in child.attrib:
            node[child.attrib['pos']] = parseSplits(child, features)
        elif tag == 'output':
            node['value'] = str(child.text.strip())
    return node
def parseUrl(fullEsUrl):
    """Split a full Elasticsearch search URL into its three parts.

    Returns a tuple (base_url, index_name, search_handler), e.g.
    'http://host:9200/tmdb/_search' -> ('http://host:9200', 'tmdb', '_search').
    """
    from urllib.parse import urlsplit, urlunsplit
    import os.path

    parts = urlsplit(fullEsUrl)
    base_url = urlunsplit((parts.scheme, parts.netloc, '', '', ''))

    # path is '/<index>/<handler>'; split off the handler, then strip
    # the leading slash from the index component
    index_part, handler = os.path.split(parts.path)
    return (base_url, index_part[1:], handler)
def noop(src_movie, base_doc):
    """Default enrichment hook: index the base document unchanged."""
    return base_doc


def indexable_movies(enrich=noop, movies='data/tmdb.json'):
    """ Generates TMDB movies, similar to how ES Bulk indexing
    uses a generator to generate bulk index/update actions

    Movies missing a required attribute (KeyError) are skipped.
    `enrich(src_movie, base_doc)` may add fields before yielding.
    """
    movies = load_movies(movies)
    for movieId, tmdbMovie in tqdm(movies.items(), total=len(movies)):
        try:
            # BUGFIX: reset both per movie. Previously releaseYear was only
            # assigned inside the if-branch, so a movie without a
            # release_date reused the previous movie's year (or raised
            # UnboundLocalError for the first such movie).
            releaseDate = None
            releaseYear = None
            if 'release_date' in tmdbMovie and len(tmdbMovie['release_date']) > 0:
                releaseDate = tmdbMovie['release_date']
                releaseYear = releaseDate[0:4]

            full_poster_path = ''
            if 'poster_path' in tmdbMovie and tmdbMovie['poster_path'] is not None and len(tmdbMovie['poster_path']) > 0:
                full_poster_path = 'https://image.tmdb.org/t/p/w185' + tmdbMovie['poster_path']

            base_doc = {'id': movieId,
                        'title': tmdbMovie['title'],
                        'overview': tmdbMovie['overview'],
                        'tagline': tmdbMovie['tagline'],
                        'directors': [director['name'] for director in tmdbMovie['directors']],
                        'cast': " ".join([castMember['name'] for castMember in tmdbMovie['cast']]),
                        'genres': [genre['name'] for genre in tmdbMovie['genres']],
                        'release_date': releaseDate,
                        'release_year': releaseYear,
                        'poster_path': full_poster_path,
                        'vote_average': float(tmdbMovie['vote_average']) if 'vote_average' in tmdbMovie else None,
                        'vote_count': int(tmdbMovie['vote_count']) if 'vote_count' in tmdbMovie else 0,
                        }
            yield enrich(tmdbMovie, base_doc)
        except KeyError:  # Ignore any movies missing these attributes
            continue
class QRel():
    """A single MSMARCO relevance judgment: query id -> relevant doc id,
    with the query's keyword text attached when available."""

    def __init__(self, qid, docid, keywords):
        self.qid = qid
        self.docid = docid
        self.keywords = keywords

    def eval_rr(self, doc_ranking):
        """Reciprocal rank (1/rank) of this qrel's doc in `doc_ranking`.

        Returns 0.0 when the expected doc id does not appear.
        """
        rank = 0
        for candidate in doc_ranking:
            rank += 1
            if candidate == self.docid:
                return 1.0 / rank
        return 0.0

    @staticmethod
    def read_qrels(qrels_fname='data/msmarco-doctrain-qrels.tsv.gz',
                   queries_fname='data/msmarco-doctrain-queries.tsv.gz'):
        """Yield QRel objects by joining the gzipped qrels file with the
        query-keywords lookup; keywords is None when the qid is unknown."""
        qids_to_keywords = QRel.get_keyword_lookup(queries_fname)

        with gzip.open(qrels_fname, 'rt') as f:
            for row in csv.reader(f, delimiter=' '):
                qid = row[0]
                keywords = qids_to_keywords.get(qid)
                if keywords is None:
                    print("Missing keywords for %s" % qid)
                yield QRel(qid=qid, docid=row[2], keywords=keywords)

    @staticmethod
    def get_keyword_lookup(fname='data/msmarco-doctrain-queries.tsv.gz'):
        """Build a dict mapping query id -> keyword text from the
        gzipped tab-separated queries file."""
        lookup = {}
        with gzip.open(fname, 'rt') as f:
            for row in csv.reader(f, delimiter='\t'):
                lookup[row[0]] = row[1]
        return lookup

    def __str__(self):
        return "qid:%s(%s) => doc:%s" % (self.qid, self.keywords, self.docid)
def esc_kw(kw):
    """Take a keyword and escape all the Solr query-parser special
    characters we care about.

    The backslash itself must be escaped FIRST, otherwise the escape
    characters we inject here would get doubled again.

    NOTE: the original wrote each replacement as '\\(' etc. in non-raw
    string literals — invalid escape sequences that raise SyntaxWarning
    on modern Python. This version escapes explicitly.
    """
    kw = kw.replace('\\', '\\\\')  # be sure to do this first, as we inject \!
    # Everything Solr's classic query parser treats as syntax
    for special in '()+-:/][*?{}~':
        kw = kw.replace(special, '\\' + special)
    return kw
def avg_tau(rank1, rank2, at=4):
    """Average Kendall's tau over prefix depths 1..at of the two rankings.

    Weights agreement at the top of the rankings more heavily than a
    single tau at full depth would.

    Raises ValueError if either ranking is shorter than `at`.
    """
    if len(rank1) < at or len(rank2) < at:
        raise ValueError("rankings must be larger than provided at param(%s)" % at)

    rank1, rank2 = rank1[:at], rank2[:at]

    return sum(tau(rank1, rank2, at=depth) for depth in range(1, at + 1)) / at
def typoIt(judgmentInFile, judgmentOutFile, rounds=100):
    """Augment a judgment file with typo'd copies of each query.

    For `rounds` passes over every qid, generate a butterfingers typo of
    the query keywords; each previously-unseen typo gets a fresh qid
    carrying copies of the original qid's judgments. The combined list
    is written to `judgmentOutFile`.
    """
    with open(judgmentInFile) as f:
        currJudgments = [j for j in judgments_from_file(f)]

    lastQid = currJudgments[-1].qid
    judgDict = judgments_by_qid(currJudgments)
    seenTypos = set()

    for _ in range(rounds):
        for qid, judglist in judgDict.items():
            keywords = judglist[0].keywords
            typoed = butterfingers(keywords)

            # Only keep typos that actually differ and aren't repeats
            if typoed == keywords or typoed in seenTypos:
                continue

            lastQid += 1
            print("%s => %s" % (keywords, typoed))
            currJudgments.extend(
                Judgment(grade=j.grade,
                         qid=lastQid,
                         keywords=typoed,
                         docId=j.docId)
                for j in judglist
            )
            seenTypos.add(typoed)

    with open(judgmentOutFile, 'w') as f:
        judgments_to_file(f, judgmentsList=currJudgments)
class FeatureLogger:
    """Logs LTR features for judged documents, one query at a time,
    accumulating a training set in `self.logged`."""

    def __init__(self, client, index, feature_set, drop_missing=True):
        self.client = client
        self.index = index
        self.feature_set = feature_set
        self.drop_missing = drop_missing
        self.logged = []

    def clear(self):
        """Discard everything logged so far."""
        self.logged = []

    def log_for_qid(self, qid, judgments, keywords):
        """Log features for every judgment under one qid.

        `judgments` entries are modified in place (features attached);
        returns (training_set, discarded) where `discarded` holds the
        judgments we could not log features for (doc missing from the
        index) when drop_missing is set.
        """
        judgments = list(judgments)
        docIds = [j.docId for j in judgments]

        # Informational only: duplicate docs within a qid are tolerated
        for docId in docIds:
            if docIds.count(docId) > 1:
                pass

        featuresPerDoc = {}
        BATCH_SIZE = 500
        numLeft = len(docIds)
        # Fetch features in batches of up to BATCH_SIZE docs
        for batch_num in range(0, 1 + (len(docIds) // BATCH_SIZE)):
            start = batch_num * BATCH_SIZE
            if start >= len(docIds):
                break
            ids = docIds[start:start + min(BATCH_SIZE, numLeft)]

            # Solr has a strict query syntax that's easily tripped up:
            # strip everything but alphanumerics and whitespace
            keywords = re.sub(r'([^\s\w]|_)+', '', keywords)

            params = {
                "keywords": keywords,
                "fuzzy_keywords": ' '.join([tok + '~' for tok in keywords.split(' ')]),
                "keywordsList": [keywords],  # Needed by TSQ for the time being
            }

            res = self.client.log_query(self.index, self.feature_set, ids, params)

            for doc in res:
                featuresPerDoc[str(doc['id'])] = doc['ltr_features']
            numLeft -= BATCH_SIZE

        # Attach features back onto each judgment; a missing key means
        # we have a judgment but no matching doc in the index
        for judgment in judgments:
            if judgment.docId in featuresPerDoc:
                judgment.features = featuresPerDoc[judgment.docId]

        # Pare down to judgments we actually have features for
        training_set = []
        discarded = []
        for judgment in judgments:
            if self.drop_missing and not judgment.has_features():
                discarded.append(judgment)
            else:
                training_set.append(judgment)

        self.logged.extend(training_set)
        return training_set, discarded
{"match_all": {} }, 9 | "filter": { 10 | "match": {"title": user_query} 11 | } 12 | } 13 | } 14 | else: 15 | engine_query = 'title:('+ user_query + ')^0' 16 | return client.model_query('tmdb', model_name, {}, engine_query) 17 | 18 | def plot(client, query, models = ['classic', 'latest']): 19 | init_notebook_mode(connected=True) 20 | 21 | modelData = [] 22 | 23 | for model in models: 24 | modelData.append(search(client, query, model)) 25 | 26 | xAxes = [] 27 | for i in range(len(modelData[0])): 28 | xAxes.append(i) 29 | 30 | trace0 = go.Scatter( 31 | x = xAxes, 32 | y = [int(x['release_year']) for x in modelData[0]], 33 | mode = "lines", 34 | name = models[0], 35 | text = [f'{x["title"]} ({x["score"]})' for x in modelData[0]] 36 | ) 37 | 38 | trace1 = go.Scatter( 39 | x = xAxes, 40 | y = [int(x['release_year']) for x in modelData[1]], 41 | mode = "lines", 42 | name = models[1], 43 | text = [f'{x["title"]} ({x["score"]})' for x in modelData[1]] 44 | ) 45 | 46 | 47 | data = [trace0, trace1] 48 | fig = go.Figure(data=data) 49 | iplot(fig) 50 | -------------------------------------------------------------------------------- /ltr/search.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | baseEsQuery = { 4 | "size": 5, 5 | "query": { 6 | "sltr": { 7 | "params": { 8 | "keywords": "", 9 | }, 10 | "model": "" 11 | } 12 | } 13 | } 14 | 15 | def esLtrQuery(keywords, modelName): 16 | import json 17 | baseEsQuery['query']['sltr']['params']['keywords'] = keywords 18 | baseEsQuery['query']['sltr']['params']['keywordsList'] = [keywords] # Needed by TSQ for now 19 | baseEsQuery['query']['sltr']['model'] = modelName 20 | print("%s" % json.dumps(baseEsQuery)) 21 | return baseEsQuery 22 | 23 | # TODO: Parse params and add efi dynamically instead of adding manually to query below 24 | def solrLtrQuery(keywords, modelName): 25 | keywords = re.sub('([^\s\w]|_)+', '', keywords) 26 | fuzzy_keywords = ' '.join([x + '~' for x in 
# Year thresholds shared by both rating biases; tier i means
# "strictly newer than _DECADE_CUTOFFS[i]".
_DECADE_CUTOFFS = (2010, 1990, 1970, 1950)


def get_classic_rating(year):
    """Grade a release year for a classic-film fan: 4 for the oldest
    tier (1950 or earlier), down to 0 for films after 2010."""
    for tier, cutoff in enumerate(_DECADE_CUTOFFS):
        if year > cutoff:
            return tier
    return 4


def get_latest_rating(year):
    """Grade a release year for a recent-film fan: 4 for films after
    2010, down to 0 for 1950 or earlier (mirror of classic)."""
    for tier, cutoff in enumerate(_DECADE_CUTOFFS):
        if year > cutoff:
            return 4 - tier
    return 0
print("Generating 'classic' biased judgments:") 39 | for hit in resp: 40 | rating = get_classic_rating(hit['ltr_features'][0]) 41 | 42 | if rating == 0 and NO_ZERO: 43 | continue 44 | 45 | judgments.append(Judgment(qid=1,docId=hit['id'],grade=rating,features=hit['ltr_features'],keywords='')) 46 | 47 | 48 | with open(classicTrainingSetOut, 'w') as out: 49 | judgments_to_file(out, judgments) 50 | 51 | # A current film fan 52 | judgments = [] 53 | print("Generating 'recent' biased judgments:") 54 | for hit in resp: 55 | rating = get_latest_rating(hit['ltr_features'][0]) 56 | 57 | if rating == 0 and NO_ZERO: 58 | continue 59 | 60 | judgments.append(Judgment(qid=1,docId=hit['id'],grade=rating,features=hit['ltr_features'],keywords='')) 61 | 62 | 63 | with open(latestTrainingSetOut, 'w') as out: 64 | judgments_to_file(out, judgments) 65 | 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /notebooks/elasticsearch/.docker/es-docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.elastic.co/elasticsearch/elasticsearch:8.8.2 2 | 3 | RUN bin/elasticsearch-plugin install --batch \ 4 | "https://github.com/o19s/elasticsearch-learning-to-rank/releases/download/v1.5.8-es8.8.2/ltr-plugin-v1.5.8-es8.8.2.zip" 5 | 6 | COPY --chown=elasticsearch:elasticsearch elasticsearch.yml /usr/share/elasticsearch/config/ 7 | RUN cat /usr/share/elasticsearch/config/elasticsearch.yml 8 | -------------------------------------------------------------------------------- /notebooks/elasticsearch/.docker/es-docker/elasticsearch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | docker run -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" elasticsearch-tlre 4 | -------------------------------------------------------------------------------- /notebooks/elasticsearch/.docker/kb-docker/Dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM docker.elastic.co/kibana/kibana:8.8.2 2 | -------------------------------------------------------------------------------- /notebooks/elasticsearch/README.md: -------------------------------------------------------------------------------- 1 | This folder contains some Elasticsearch configuration and a Dockerfile to expedite setting up Elasticsearch with LTR. 2 | 3 | ## Docker 4 | Run `docker-compose up` to create a image running Elasticsearch with LTR 5 | 6 | After the instance is running, load up the "hello-ltr (ES)" notebook. 7 | -------------------------------------------------------------------------------- /notebooks/elasticsearch/docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | kibana: 3 | build: ./.docker/kb-docker/. 4 | expose: 5 | - "5601" 6 | ports: 7 | - "5601:5601" 8 | environment: 9 | SERVER_HOST: "0.0.0.0" 10 | elasticsearch: 11 | build: ./.docker/es-docker/. 
12 | ports: 13 | - "9200:9200" 14 | expose: 15 | - "9200" 16 | environment: 17 | SERVER_NAME: "elasticsearch" 18 | volumes: 19 | - tlre-es-data:/usr/share/elasticsearch/data 20 | 21 | volumes: 22 | tlre-es-data: 23 | driver: local 24 | -------------------------------------------------------------------------------- /notebooks/elasticsearch/osc-blog/blog_settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "mappings": { 3 | "_source": { 4 | "enabled": true 5 | }, 6 | "properties": { 7 | "post_id": { 8 | "type": "long", 9 | "store": true 10 | }, 11 | "post_date": { 12 | "type": "date", 13 | "store": true 14 | }, 15 | "es_update_date": { 16 | "type": "date", 17 | "store": true 18 | }, 19 | "url": { 20 | "type": "text", 21 | "store": true 22 | }, 23 | "title": { 24 | "type": "text", 25 | "store": true, 26 | "analyzer": "content_analyzer", 27 | "fields": { 28 | "bigrams": { 29 | "type": "text", 30 | "analyzer": "content_bigrams" 31 | } 32 | } 33 | }, 34 | "author": { 35 | "type": "text", 36 | "store": true, 37 | "analyzer": "standard" 38 | }, 39 | "content": { 40 | "type": "text", 41 | "store": true, 42 | "analyzer": "content_analyzer", 43 | "fields": { 44 | "bigrams": { 45 | "type": "text", 46 | "analyzer": "content_bigrams" 47 | } 48 | } 49 | }, 50 | "excerpt": { 51 | "type": "text", 52 | "store": true, 53 | "analyzer": "content_analyzer" 54 | }, 55 | "categories": { 56 | "type": "text", 57 | "store": true, 58 | "analyzer": "content_analyzer" 59 | } 60 | } 61 | }, 62 | "settings": { 63 | "number_of_shards": 1, 64 | "number_of_replicas": 1, 65 | "analysis": { 66 | "filter": { 67 | "english_stemmer": { 68 | "type": "stemmer", 69 | "language": "english" 70 | }, 71 | "english_possessive_stemmer": { 72 | "type": "stemmer", 73 | "language": "possessive_english" 74 | }, 75 | "bigram": { 76 | "type": "shingle", 77 | "max_shingle_size": 2, 78 | "output_unigrams": false 79 | } 80 | }, 81 | "analyzer": { 82 | 
"content_analyzer": { 83 | "type": "custom", 84 | "char_filter": [ 85 | "html_strip" 86 | ], 87 | "filter": [ 88 | "english_possessive_stemmer", 89 | "lowercase", 90 | "english_stemmer" 91 | ], 92 | "tokenizer": "standard" 93 | }, 94 | "content_bigrams": { 95 | "type": "custom", 96 | "char_filter": [ 97 | "html_strip" 98 | ], 99 | "filter": [ 100 | "english_possessive_stemmer", 101 | "lowercase", 102 | "english_stemmer", 103 | "bigram" 104 | ], 105 | "tokenizer": "standard" 106 | } 107 | } 108 | } 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /notebooks/elasticsearch/osc-blog/ltr.py: -------------------------------------------------------------------------------- 1 | # Import a module with the same name from a different directory. 2 | # 3 | # Adapted from 4 | # https://mg.readthedocs.io/importing-local-python-modules-from-jupyter-notebooks/sys-path-in-helper-module/path-helper.html 5 | 6 | import importlib 7 | import os 8 | import sys 9 | 10 | sys.path.insert(0, os.path.abspath('../../../')) 11 | 12 | # Temporarily hijack __file__ to avoid adding names at module scope; 13 | # __file__ will be overwritten again during the reload() call. 14 | __file__ = {'sys': sys, 'importlib': importlib} 15 | 16 | del importlib 17 | del os 18 | del sys 19 | 20 | __file__['importlib'].reload(__file__['sys'].modules[__name__]) 21 | -------------------------------------------------------------------------------- /notebooks/elasticsearch/tmdb/Dataframes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Dataframes\n", 8 | "\n", 9 | "Data frames are the central object of most data science workflows. This notebook shows some helper function that can assist you in creating them from judgements. 
The older non-dataframe way of passing data is in most of the example notebooks, so use this code anywhere you see that pattern.\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import ltr.judgments as judge" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "classic_training_set = [j for j in judge.judgments_from_file(open('data/classic-training.txt'))]" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "classic_df = judge.judgments_to_dataframe(classic_training_set)\n", 37 | "classic_df" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "### Plotting\n", 45 | "\n", 46 | "Is one of the main reasons dataframes are easier to work with. There are two helper functions to show the distribution of grades (`plot_grades`) and the relationship between features and grades (`plot_features`).\n", 47 | "\n", 48 | "You are encouraged to use whatever python plotting library you are most comfortable with; we have `matplotlib` and `plotnine` installed in the Docker image."
49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "import ltr.p9_plots as plots\n", 58 | "plots.plot_grades(classic_df)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "classic_df_long = judge.judgments_dataframe_to_long(classic_df)\n", 68 | "classic_df_long" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "plots.plot_features(classic_df_long)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "plots.plot_features" 87 | ] 88 | } 89 | ], 90 | "metadata": { 91 | "kernelspec": { 92 | "display_name": "Python 3", 93 | "language": "python", 94 | "name": "python3" 95 | }, 96 | "language_info": { 97 | "codemirror_mode": { 98 | "name": "ipython", 99 | "version": 3 100 | }, 101 | "file_extension": ".py", 102 | "mimetype": "text/x-python", 103 | "name": "python", 104 | "nbconvert_exporter": "python", 105 | "pygments_lexer": "ipython3", 106 | "version": "3.8.2" 107 | } 108 | }, 109 | "nbformat": 4, 110 | "nbformat_minor": 2 111 | } 112 | -------------------------------------------------------------------------------- /notebooks/elasticsearch/tmdb/fmap.txt: -------------------------------------------------------------------------------- 1 | 0 release_year q 2 | 1 features0 q 3 | -------------------------------------------------------------------------------- /notebooks/elasticsearch/tmdb/ltr.py: -------------------------------------------------------------------------------- 1 | # Import a module with the same name from a different directory. 
2 | # 3 | # Adapted from 4 | # https://mg.readthedocs.io/importing-local-python-modules-from-jupyter-notebooks/sys-path-in-helper-module/path-helper.html 5 | 6 | import importlib 7 | import os 8 | import sys 9 | 10 | sys.path.insert(0, os.path.abspath('../../../')) 11 | 12 | # Temporarily hijack __file__ to avoid adding names at module scope; 13 | # __file__ will be overwritten again during the reload() call. 14 | __file__ = {'sys': sys, 'importlib': importlib} 15 | 16 | del importlib 17 | del os 18 | del sys 19 | 20 | __file__['importlib'].reload(__file__['sys'].modules[__name__]) 21 | -------------------------------------------------------------------------------- /notebooks/exercises/data/tates_model.txt: -------------------------------------------------------------------------------- 1 | ## Linear Regression 2 | ## Lambda = 1.0E-10 3 | 0:0.06442486267839354 1:0.06442486267839354 2:1.7298168616882517 3:0.06437886168753176 -------------------------------------------------------------------------------- /notebooks/exercises/ltr.py: -------------------------------------------------------------------------------- 1 | # Import a module with the same name from a different directory. 2 | # 3 | # Adapted from 4 | # https://mg.readthedocs.io/importing-local-python-modules-from-jupyter-notebooks/sys-path-in-helper-module/path-helper.html 5 | 6 | import importlib 7 | import os 8 | import sys 9 | 10 | sys.path.insert(0, os.path.abspath('../../')) 11 | 12 | # Temporarily hijack __file__ to avoid adding names at module scope; 13 | # __file__ will be overwritten again during the reload() call. 
14 | __file__ = {'sys': sys, 'importlib': importlib} 15 | 16 | del importlib 17 | del os 18 | del sys 19 | 20 | __file__['importlib'].reload(__file__['sys'].modules[__name__]) 21 | -------------------------------------------------------------------------------- /notebooks/ltr.py: -------------------------------------------------------------------------------- 1 | # Import a module with the same name from a different directory. 2 | # 3 | # Adapted from 4 | # https://mg.readthedocs.io/importing-local-python-modules-from-jupyter-notebooks/sys-path-in-helper-module/path-helper.html 5 | 6 | import importlib 7 | import os 8 | import sys 9 | 10 | sys.path.insert(0, os.path.abspath('../')) 11 | 12 | # Temporarily hijack __file__ to avoid adding names at module scope; 13 | # __file__ will be overwritten again during the reload() call. 14 | __file__ = {'sys': sys, 'importlib': importlib} 15 | 16 | del importlib 17 | del os 18 | del sys 19 | 20 | __file__['importlib'].reload(__file__['sys'].modules[__name__]) 21 | -------------------------------------------------------------------------------- /notebooks/opensearch/.docker/opensearch-docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM opensearchproject/opensearch:2.5.0 2 | 3 | 4 | RUN bin/opensearch-plugin install --batch \ 5 | "https://www.github.com/opensearch-project/opensearch-learning-to-rank-base/releases/download/release-v2.1.0/ltr-plugin-v2.1.0.zip" 6 | 7 | COPY --chown=opensearch:opensearch opensearch.yml /usr/share/opensearch/config/ 8 | RUN cat /usr/share/opensearch/config/opensearch.yml 9 | -------------------------------------------------------------------------------- /notebooks/opensearch/.docker/opensearch-docker/opensearch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | docker run -p 9301:9301 -p 9400:9400 -e "discovery.type=single-node" opensearch-tlre 4 | 
-------------------------------------------------------------------------------- /notebooks/opensearch/.docker/opensearch-docker/opensearch.yml: -------------------------------------------------------------------------------- 1 | --- 2 | http.cors.allow-origin: "/http?:.*/" 3 | #http.cors.allow-origin: /http?://localhost(:[0-9]+)?/ 4 | http.cors.enabled: true 5 | indices.query.bool.max_clause_count: 10240 6 | network.host: 0.0.0.0 7 | http.port: 9201 8 | discovery.type: single-node 9 | 10 | # cluster.name: docker-cluster 11 | 12 | ## # minimum_master_nodes need to be explicitly set when bound on a public IP 13 | ## # set to 1 to allow single node clusters 14 | ## discovery.zen.minimum_master_nodes: 1 15 | # 16 | ## Setting network.host to a non-loopback address enables the annoying bootstrap checks. "Single-node" mode disables them again. 17 | ## discovery.type: single-node 18 | # 19 | #http.host: 0.0.0.0 20 | #http.port: 9201 21 | #http.cors.allow-origin: "*" 22 | ##http.cors.allow-origin: "/http?:.*/" 23 | #http.cors.enabled: true 24 | #http.cors.allow-headers: X-Requested-With,X-Auth-Token,Content-Type,Content-Length,Authorization 25 | #http.cors.allow-credentials: true 26 | # 27 | # 28 | # 29 | ##http.cors.allow-origin: "/http?:.*/" 30 | ##http.cors.allow-origin: /http?://localhost(:[0-9]+)?/ 31 | ##http.cors.enabled: true 32 | ##indices.query.bool.max_clause_count: 10240 33 | ##network.host: 0.0.0.0 34 | # 35 | #discovery.type: single-node 36 | 37 | -------------------------------------------------------------------------------- /notebooks/opensearch/.docker/osd-docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM opensearchproject/opensearch-dashboards:2.5.0 2 | 3 | COPY --chown=opensearch-dashboards:opensearch-dashboards opensearch_dashboards.yml /usr/share/opensearch-dashboards/config/ 4 | RUN cat /usr/share/opensearch-dashboards/config/opensearch_dashboards.yml 
-------------------------------------------------------------------------------- /notebooks/opensearch/.docker/osd-docker/opensearch_dashboards.yml: -------------------------------------------------------------------------------- 1 | # OSD is served by a back end server. This setting specifies the port to use. 2 | server.port: 5602 3 | server.host: 0.0.0.0 4 | -------------------------------------------------------------------------------- /notebooks/opensearch/README.md: -------------------------------------------------------------------------------- 1 | This folder contains some OpenSearch configuration and a Dockerfile to expedite setting up OpenSearch with LTR. 2 | 3 | ## Docker 4 | Run `docker-compose up` to build and start a container running OpenSearch with LTR. 5 | 6 | After the instance is running, load up the "hello-ltr (OpenSearch)" notebook. 7 | -------------------------------------------------------------------------------- /notebooks/opensearch/docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | opensearch-node1: 3 | #image: opensearch-custom-plugin 4 | #opensearchproject/opensearch:2.6.0 5 | # image: opensearchproject/opensearch:1.3.9 6 | #image: opensearchproject/opensearch:2.8.0 7 | build: ./.docker/opensearch-docker/.
8 | container_name: opensearch-node1 9 | environment: 10 | - cluster.name=opensearch-cluster 11 | - node.name=opensearch-node1 12 | - bootstrap.memory_lock=true # along with the memlock settings below, disables swapping 13 | - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" # minimum and maximum Java heap size, recommend setting both to 50% of system RAM 14 | - "DISABLE_INSTALL_DEMO_CONFIG=true" # disables execution of install_demo_configuration.sh bundled with security plugin, which installs demo certificates and security configurations to OpenSearch 15 | - "DISABLE_SECURITY_PLUGIN=true" # disables security plugin entirely in OpenSearch by setting plugins.security.disabled: true in opensearch.yml 16 | - "discovery.type=single-node" # disables bootstrap checks that are enabled when network.host is set to a non-loopback address 17 | ulimits: 18 | memlock: 19 | soft: -1 20 | hard: -1 21 | nofile: 22 | soft: 65536 # maximum number of open files for the OpenSearch user, set to at least 65536 on modern systems 23 | hard: 65536 24 | volumes: 25 | - opensearch-data1:/usr/share/opensearch/data 26 | ports: 27 | - "9201:9201" 28 | - "9600:9600" # required for Performance Analyzer 29 | networks: 30 | - opensearch-net 31 | 32 | opensearch-dashboards: 33 | # image: opensearchproject/opensearch-dashboards:1.3.9 34 | # image: opensearchproject/opensearch-dashboards:2.5.0 35 | build: ./.docker/osd-docker/. 
36 | container_name: opensearch-dashboards 37 | ports: 38 | - "5602:5602" 39 | environment: 40 | - 'OPENSEARCH_HOSTS=["http://opensearch-node1:9201"]' 41 | - "DISABLE_SECURITY_DASHBOARDS_PLUGIN=true" # disables security dashboards plugin in OpenSearch Dashboards 42 | networks: 43 | - opensearch-net 44 | 45 | volumes: 46 | opensearch-data1: 47 | 48 | networks: 49 | opensearch-net: 50 | 51 | 52 | -------------------------------------------------------------------------------- /notebooks/opensearch/osc-blog/blog_settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "mappings": { 3 | "_source": { 4 | "enabled": true 5 | }, 6 | "properties": { 7 | "post_id": { 8 | "type": "long", 9 | "store": true 10 | }, 11 | "post_date": { 12 | "type": "date", 13 | "store": true 14 | }, 15 | "es_update_date": { 16 | "type": "date", 17 | "store": true 18 | }, 19 | "url": { 20 | "type": "text", 21 | "store": true 22 | }, 23 | "title": { 24 | "type": "text", 25 | "store": true, 26 | "analyzer": "content_analyzer", 27 | "fields": { 28 | "bigrams": { 29 | "type": "text", 30 | "analyzer": "content_bigrams" 31 | } 32 | } 33 | }, 34 | "author": { 35 | "type": "text", 36 | "store": true, 37 | "analyzer": "standard" 38 | }, 39 | "content": { 40 | "type": "text", 41 | "store": true, 42 | "analyzer": "content_analyzer", 43 | "fields": { 44 | "bigrams": { 45 | "type": "text", 46 | "analyzer": "content_bigrams" 47 | } 48 | } 49 | }, 50 | "excerpt": { 51 | "type": "text", 52 | "store": true, 53 | "analyzer": "content_analyzer" 54 | }, 55 | "categories": { 56 | "type": "text", 57 | "store": true, 58 | "analyzer": "content_analyzer" 59 | } 60 | } 61 | }, 62 | "settings": { 63 | "number_of_shards": 1, 64 | "number_of_replicas": 1, 65 | "analysis": { 66 | "filter": { 67 | "english_stemmer": { 68 | "type": "stemmer", 69 | "language": "english" 70 | }, 71 | "english_possessive_stemmer": { 72 | "type": "stemmer", 73 | "language": "possessive_english" 74 | 
}, 75 | "bigram": { 76 | "type": "shingle", 77 | "max_shingle_size": 2, 78 | "output_unigrams": false 79 | } 80 | }, 81 | "analyzer": { 82 | "content_analyzer": { 83 | "type": "custom", 84 | "char_filter": [ 85 | "html_strip" 86 | ], 87 | "filter": [ 88 | "english_possessive_stemmer", 89 | "lowercase", 90 | "english_stemmer" 91 | ], 92 | "tokenizer": "standard" 93 | }, 94 | "content_bigrams": { 95 | "type": "custom", 96 | "char_filter": [ 97 | "html_strip" 98 | ], 99 | "filter": [ 100 | "english_possessive_stemmer", 101 | "lowercase", 102 | "english_stemmer", 103 | "bigram" 104 | ], 105 | "tokenizer": "standard" 106 | } 107 | } 108 | } 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /notebooks/opensearch/osc-blog/ltr.py: -------------------------------------------------------------------------------- 1 | # Import a module with the same name from a different directory. 2 | # 3 | # Adapted from 4 | # https://mg.readthedocs.io/importing-local-python-modules-from-jupyter-notebooks/sys-path-in-helper-module/path-helper.html 5 | 6 | import importlib 7 | import os 8 | import sys 9 | 10 | sys.path.insert(0, os.path.abspath('../../../')) 11 | 12 | # Temporarily hijack __file__ to avoid adding names at module scope; 13 | # __file__ will be overwritten again during the reload() call. 
14 | __file__ = {'sys': sys, 'importlib': importlib} 15 | 16 | del importlib 17 | del os 18 | del sys 19 | 20 | __file__['importlib'].reload(__file__['sys'].modules[__name__]) 21 | -------------------------------------------------------------------------------- /notebooks/opensearch/tmdb/fmap.txt: -------------------------------------------------------------------------------- 1 | 0 release_year q 2 | 1 features0 q 3 | -------------------------------------------------------------------------------- /notebooks/opensearch/tmdb/ltr.py: -------------------------------------------------------------------------------- 1 | # Import a module with the same name from a different directory. 2 | # 3 | # Adapted from 4 | # https://mg.readthedocs.io/importing-local-python-modules-from-jupyter-notebooks/sys-path-in-helper-module/path-helper.html 5 | 6 | import importlib 7 | import os 8 | import sys 9 | 10 | sys.path.insert(0, os.path.abspath('../../../')) 11 | 12 | # Temporarily hijack __file__ to avoid adding names at module scope; 13 | # __file__ will be overwritten again during the reload() call. 
14 | __file__ = {'sys': sys, 'importlib': importlib} 15 | 16 | del importlib 17 | del os 18 | del sys 19 | 20 | __file__['importlib'].reload(__file__['sys'].modules[__name__]) 21 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/solr.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 28 | 29 | 30 | 31 | 32 | 33 | ${host:} 34 | ${jetty.port:8983} 35 | ${hostContext:solr} 36 | 37 | ${genericCoreNodeNames:true} 38 | 39 | ${zkClientTimeout:30000} 40 | ${distribUpdateSoTimeout:600000} 41 | ${distribUpdateConnTimeout:60000} 42 | ${zkCredentialsProvider:org.apache.solr.common.cloud.DefaultZkCredentialsProvider} 43 | ${zkACLProvider:org.apache.solr.common.cloud.DefaultZkACLProvider} 44 | 45 | 46 | 47 | 49 | ${socketTimeout:600000} 50 | ${connTimeout:60000} 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/elevate.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 26 | 27 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/idioms.txt: -------------------------------------------------------------------------------- 1 | # Idioms is a synonyms file that captures idiomatic phrases as single units 2 | 3 | # LHS is all representations encountered in query or document 4 | looneytunes, looney tunes, looney toons => 12345 5 | sci fi, scifi, science fiction => 56789 6 | 7 | #looneytunes, looney tunes => looney_tunes 8 | #bugs bunny => bug_bunny 9 | #mickey mouse => mickey_mouse 10 | #minnie mouse => minnie_mouse 11 | #donald duck => donald_duck 12 | #yogi bear => yogi_bear -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/contractions_ca.txt: 
-------------------------------------------------------------------------------- 1 | # Set of Catalan contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | d 4 | l 5 | m 6 | n 7 | s 8 | t 9 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/contractions_fr.txt: -------------------------------------------------------------------------------- 1 | # Set of French contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | l 4 | m 5 | t 6 | qu 7 | n 8 | s 9 | j 10 | d 11 | c 12 | jusqu 13 | quoiqu 14 | lorsqu 15 | puisqu 16 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/contractions_ga.txt: -------------------------------------------------------------------------------- 1 | # Set of Irish contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | d 4 | m 5 | b 6 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/contractions_it.txt: -------------------------------------------------------------------------------- 1 | # Set of Italian contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | c 4 | l 5 | all 6 | dall 7 | dell 8 | nell 9 | sull 10 | coll 11 | pell 12 | gl 13 | agl 14 | dagl 15 | degl 16 | negl 17 | sugl 18 | un 19 | m 20 | t 21 | s 22 | v 23 | d 24 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/hyphenations_ga.txt: -------------------------------------------------------------------------------- 1 | # Set of Irish hyphenations for StopFilter 2 | # TODO: load this as a resource from the analyzer and sync it in 
build.xml 3 | h 4 | n 5 | t 6 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stemdict_nl.txt: -------------------------------------------------------------------------------- 1 | # Set of overrides for the dutch stemmer 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | fiets fiets 4 | bromfiets bromfiets 5 | ei eier 6 | kind kinder 7 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_ar.txt: -------------------------------------------------------------------------------- 1 | # This file was created by Jacques Savoy and is distributed under the BSD license. 2 | # See http://members.unine.ch/jacques.savoy/clef/index.html. 3 | # Also see http://www.opensource.org/licenses/bsd-license.html 4 | # Cleaned on October 11, 2009 (not normalized, so use before normalization) 5 | # This means that when modifying this list, you might need to add some 6 | # redundant entries, for example containing forms with both أ and ا 7 | من 8 | ومن 9 | منها 10 | منه 11 | في 12 | وفي 13 | فيها 14 | فيه 15 | و 16 | ف 17 | ثم 18 | او 19 | أو 20 | ب 21 | بها 22 | به 23 | ا 24 | أ 25 | اى 26 | اي 27 | أي 28 | أى 29 | لا 30 | ولا 31 | الا 32 | ألا 33 | إلا 34 | لكن 35 | ما 36 | وما 37 | كما 38 | فما 39 | عن 40 | مع 41 | اذا 42 | إذا 43 | ان 44 | أن 45 | إن 46 | انها 47 | أنها 48 | إنها 49 | انه 50 | أنه 51 | إنه 52 | بان 53 | بأن 54 | فان 55 | فأن 56 | وان 57 | وأن 58 | وإن 59 | التى 60 | التي 61 | الذى 62 | الذي 63 | الذين 64 | الى 65 | الي 66 | إلى 67 | إلي 68 | على 69 | عليها 70 | عليه 71 | اما 72 | أما 73 | إما 74 | ايضا 75 | أيضا 76 | كل 77 | وكل 78 | لم 79 | ولم 80 | لن 81 | ولن 82 | هى 83 | هي 84 | هو 85 | وهى 86 | وهي 87 | وهو 88 | فهى 89 | فهي 90 | فهو 91 | انت 92 | أنت 93 | لك 94 | لها 95 | له 96 | هذه 97 | هذا 98 | تلك 99 | ذلك 100 | هناك 101 | كانت 102 | كان 103 | يكون 104 | تكون 
105 | وكانت 106 | وكان 107 | غير 108 | بعض 109 | قد 110 | نحو 111 | بين 112 | بينما 113 | منذ 114 | ضمن 115 | حيث 116 | الان 117 | الآن 118 | خلال 119 | بعد 120 | قبل 121 | حتى 122 | عند 123 | عندما 124 | لدى 125 | جميع 126 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_bg.txt: -------------------------------------------------------------------------------- 1 | # This file was created by Jacques Savoy and is distributed under the BSD license. 2 | # See http://members.unine.ch/jacques.savoy/clef/index.html. 3 | # Also see http://www.opensource.org/licenses/bsd-license.html 4 | а 5 | аз 6 | ако 7 | ала 8 | бе 9 | без 10 | беше 11 | би 12 | бил 13 | била 14 | били 15 | било 16 | близо 17 | бъдат 18 | бъде 19 | бяха 20 | в 21 | вас 22 | ваш 23 | ваша 24 | вероятно 25 | вече 26 | взема 27 | ви 28 | вие 29 | винаги 30 | все 31 | всеки 32 | всички 33 | всичко 34 | всяка 35 | във 36 | въпреки 37 | върху 38 | г 39 | ги 40 | главно 41 | го 42 | д 43 | да 44 | дали 45 | до 46 | докато 47 | докога 48 | дори 49 | досега 50 | доста 51 | е 52 | едва 53 | един 54 | ето 55 | за 56 | зад 57 | заедно 58 | заради 59 | засега 60 | затова 61 | защо 62 | защото 63 | и 64 | из 65 | или 66 | им 67 | има 68 | имат 69 | иска 70 | й 71 | каза 72 | как 73 | каква 74 | какво 75 | както 76 | какъв 77 | като 78 | кога 79 | когато 80 | което 81 | които 82 | кой 83 | който 84 | колко 85 | която 86 | къде 87 | където 88 | към 89 | ли 90 | м 91 | ме 92 | между 93 | мен 94 | ми 95 | мнозина 96 | мога 97 | могат 98 | може 99 | моля 100 | момента 101 | му 102 | н 103 | на 104 | над 105 | назад 106 | най 107 | направи 108 | напред 109 | например 110 | нас 111 | не 112 | него 113 | нея 114 | ни 115 | ние 116 | никой 117 | нито 118 | но 119 | някои 120 | някой 121 | няма 122 | обаче 123 | около 124 | освен 125 | особено 126 | от 127 | отгоре 128 | отново 129 | още 130 | пак 131 | по 132 | повече 133 | повечето 
134 | под 135 | поне 136 | поради 137 | после 138 | почти 139 | прави 140 | пред 141 | преди 142 | през 143 | при 144 | пък 145 | първо 146 | с 147 | са 148 | само 149 | се 150 | сега 151 | си 152 | скоро 153 | след 154 | сме 155 | според 156 | сред 157 | срещу 158 | сте 159 | съм 160 | със 161 | също 162 | т 163 | тази 164 | така 165 | такива 166 | такъв 167 | там 168 | твой 169 | те 170 | тези 171 | ти 172 | тн 173 | то 174 | това 175 | тогава 176 | този 177 | той 178 | толкова 179 | точно 180 | трябва 181 | тук 182 | тъй 183 | тя 184 | тях 185 | у 186 | харесва 187 | ч 188 | че 189 | често 190 | чрез 191 | ще 192 | щом 193 | я 194 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_ca.txt: -------------------------------------------------------------------------------- 1 | # Catalan stopwords from http://github.com/vcl/cue.language (Apache 2 Licensed) 2 | a 3 | abans 4 | ací 5 | ah 6 | així 7 | això 8 | al 9 | als 10 | aleshores 11 | algun 12 | alguna 13 | algunes 14 | alguns 15 | alhora 16 | allà 17 | allí 18 | allò 19 | altra 20 | altre 21 | altres 22 | amb 23 | ambdós 24 | ambdues 25 | apa 26 | aquell 27 | aquella 28 | aquelles 29 | aquells 30 | aquest 31 | aquesta 32 | aquestes 33 | aquests 34 | aquí 35 | baix 36 | cada 37 | cadascú 38 | cadascuna 39 | cadascunes 40 | cadascuns 41 | com 42 | contra 43 | d'un 44 | d'una 45 | d'unes 46 | d'uns 47 | dalt 48 | de 49 | del 50 | dels 51 | des 52 | després 53 | dins 54 | dintre 55 | donat 56 | doncs 57 | durant 58 | e 59 | eh 60 | el 61 | els 62 | em 63 | en 64 | encara 65 | ens 66 | entre 67 | érem 68 | eren 69 | éreu 70 | es 71 | és 72 | esta 73 | està 74 | estàvem 75 | estaven 76 | estàveu 77 | esteu 78 | et 79 | etc 80 | ets 81 | fins 82 | fora 83 | gairebé 84 | ha 85 | han 86 | has 87 | havia 88 | he 89 | hem 90 | heu 91 | hi 92 | ho 93 | i 94 | igual 95 | iguals 96 | ja 97 | l'hi 98 | la 99 | les 100 | li 101 | li'n 102 
| llavors 103 | m'he 104 | ma 105 | mal 106 | malgrat 107 | mateix 108 | mateixa 109 | mateixes 110 | mateixos 111 | me 112 | mentre 113 | més 114 | meu 115 | meus 116 | meva 117 | meves 118 | molt 119 | molta 120 | moltes 121 | molts 122 | mon 123 | mons 124 | n'he 125 | n'hi 126 | ne 127 | ni 128 | no 129 | nogensmenys 130 | només 131 | nosaltres 132 | nostra 133 | nostre 134 | nostres 135 | o 136 | oh 137 | oi 138 | on 139 | pas 140 | pel 141 | pels 142 | per 143 | però 144 | perquè 145 | poc 146 | poca 147 | pocs 148 | poques 149 | potser 150 | propi 151 | qual 152 | quals 153 | quan 154 | quant 155 | que 156 | què 157 | quelcom 158 | qui 159 | quin 160 | quina 161 | quines 162 | quins 163 | s'ha 164 | s'han 165 | sa 166 | semblant 167 | semblants 168 | ses 169 | seu 170 | seus 171 | seva 172 | seva 173 | seves 174 | si 175 | sobre 176 | sobretot 177 | sóc 178 | solament 179 | sols 180 | son 181 | són 182 | sons 183 | sota 184 | sou 185 | t'ha 186 | t'han 187 | t'he 188 | ta 189 | tal 190 | també 191 | tampoc 192 | tan 193 | tant 194 | tanta 195 | tantes 196 | teu 197 | teus 198 | teva 199 | teves 200 | ton 201 | tons 202 | tot 203 | tota 204 | totes 205 | tots 206 | un 207 | una 208 | unes 209 | uns 210 | us 211 | va 212 | vaig 213 | vam 214 | van 215 | vas 216 | veu 217 | vosaltres 218 | vostra 219 | vostre 220 | vostres 221 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_cz.txt: -------------------------------------------------------------------------------- 1 | a 2 | s 3 | k 4 | o 5 | i 6 | u 7 | v 8 | z 9 | dnes 10 | cz 11 | tímto 12 | budeš 13 | budem 14 | byli 15 | jseš 16 | můj 17 | svým 18 | ta 19 | tomto 20 | tohle 21 | tuto 22 | tyto 23 | jej 24 | zda 25 | proč 26 | máte 27 | tato 28 | kam 29 | tohoto 30 | kdo 31 | kteří 32 | mi 33 | nám 34 | tom 35 | tomuto 36 | mít 37 | nic 38 | proto 39 | kterou 40 | byla 41 | toho 42 | protože 43 | asi 44 | ho 45 | naši 
46 | napište 47 | re 48 | což 49 | tím 50 | takže 51 | svých 52 | její 53 | svými 54 | jste 55 | aj 56 | tu 57 | tedy 58 | teto 59 | bylo 60 | kde 61 | ke 62 | pravé 63 | ji 64 | nad 65 | nejsou 66 | či 67 | pod 68 | téma 69 | mezi 70 | přes 71 | ty 72 | pak 73 | vám 74 | ani 75 | když 76 | však 77 | neg 78 | jsem 79 | tento 80 | článku 81 | články 82 | aby 83 | jsme 84 | před 85 | pta 86 | jejich 87 | byl 88 | ještě 89 | až 90 | bez 91 | také 92 | pouze 93 | první 94 | vaše 95 | která 96 | nás 97 | nový 98 | tipy 99 | pokud 100 | může 101 | strana 102 | jeho 103 | své 104 | jiné 105 | zprávy 106 | nové 107 | není 108 | vás 109 | jen 110 | podle 111 | zde 112 | už 113 | být 114 | více 115 | bude 116 | již 117 | než 118 | který 119 | by 120 | které 121 | co 122 | nebo 123 | ten 124 | tak 125 | má 126 | při 127 | od 128 | po 129 | jsou 130 | jak 131 | další 132 | ale 133 | si 134 | se 135 | ve 136 | to 137 | jako 138 | za 139 | zpět 140 | ze 141 | do 142 | pro 143 | je 144 | na 145 | atd 146 | atp 147 | jakmile 148 | přičemž 149 | já 150 | on 151 | ona 152 | ono 153 | oni 154 | ony 155 | my 156 | vy 157 | jí 158 | ji 159 | mě 160 | mne 161 | jemu 162 | tomu 163 | těm 164 | těmu 165 | němu 166 | němuž 167 | jehož 168 | jíž 169 | jelikož 170 | jež 171 | jakož 172 | načež 173 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_el.txt: -------------------------------------------------------------------------------- 1 | # Lucene Greek Stopwords list 2 | # Note: by default this file is used after GreekLowerCaseFilter, 3 | # so when modifying this file use 'σ' instead of 'ς' 4 | ο 5 | η 6 | το 7 | οι 8 | τα 9 | του 10 | τησ 11 | των 12 | τον 13 | την 14 | και 15 | κι 16 | κ 17 | ειμαι 18 | εισαι 19 | ειναι 20 | ειμαστε 21 | ειστε 22 | στο 23 | στον 24 | στη 25 | στην 26 | μα 27 | αλλα 28 | απο 29 | για 30 | προσ 31 | με 32 | σε 33 | ωσ 34 | παρα 35 | αντι 36 | κατα 37 | μετα 38 | θα 39 
| να 40 | δε 41 | δεν 42 | μη 43 | μην 44 | επι 45 | ενω 46 | εαν 47 | αν 48 | τοτε 49 | που 50 | πωσ 51 | ποιοσ 52 | ποια 53 | ποιο 54 | ποιοι 55 | ποιεσ 56 | ποιων 57 | ποιουσ 58 | αυτοσ 59 | αυτη 60 | αυτο 61 | αυτοι 62 | αυτων 63 | αυτουσ 64 | αυτεσ 65 | αυτα 66 | εκεινοσ 67 | εκεινη 68 | εκεινο 69 | εκεινοι 70 | εκεινεσ 71 | εκεινα 72 | εκεινων 73 | εκεινουσ 74 | οπωσ 75 | ομωσ 76 | ισωσ 77 | οσο 78 | οτι 79 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_en.txt: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | # a couple of test stopwords to test that the words are really being 17 | # configured from this file: 18 | stopworda 19 | stopwordb 20 | 21 | # Standard english stop words taken from Lucene's StopAnalyzer 22 | a 23 | an 24 | and 25 | are 26 | as 27 | at 28 | be 29 | but 30 | by 31 | for 32 | if 33 | in 34 | into 35 | is 36 | it 37 | no 38 | not 39 | of 40 | on 41 | or 42 | such 43 | that 44 | the 45 | their 46 | then 47 | there 48 | these 49 | they 50 | this 51 | to 52 | was 53 | will 54 | with 55 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_eu.txt: -------------------------------------------------------------------------------- 1 | # example set of basque stopwords 2 | al 3 | anitz 4 | arabera 5 | asko 6 | baina 7 | bat 8 | batean 9 | batek 10 | bati 11 | batzuei 12 | batzuek 13 | batzuetan 14 | batzuk 15 | bera 16 | beraiek 17 | berau 18 | berauek 19 | bere 20 | berori 21 | beroriek 22 | beste 23 | bezala 24 | da 25 | dago 26 | dira 27 | ditu 28 | du 29 | dute 30 | edo 31 | egin 32 | ere 33 | eta 34 | eurak 35 | ez 36 | gainera 37 | gu 38 | gutxi 39 | guzti 40 | haiei 41 | haiek 42 | haietan 43 | hainbeste 44 | hala 45 | han 46 | handik 47 | hango 48 | hara 49 | hari 50 | hark 51 | hartan 52 | hau 53 | hauei 54 | hauek 55 | hauetan 56 | hemen 57 | hemendik 58 | hemengo 59 | hi 60 | hona 61 | honek 62 | honela 63 | honetan 64 | honi 65 | hor 66 | hori 67 | horiei 68 | horiek 69 | horietan 70 | horko 71 | horra 72 | horrek 73 | horrela 74 | horretan 75 | horri 76 | hortik 77 | hura 78 | izan 79 | ni 80 | noiz 81 | nola 82 | non 83 | nondik 84 | nongo 85 | nor 86 | nora 87 | ze 88 | zein 89 | zen 90 | zenbait 91 | zenbat 92 | zer 93 | zergatik 94 | ziren 95 | zituen 96 | zu 97 | zuek 98 | zuen 99 | zuten 100 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_fa.txt: 
-------------------------------------------------------------------------------- 1 | # This file was created by Jacques Savoy and is distributed under the BSD license. 2 | # See http://members.unine.ch/jacques.savoy/clef/index.html. 3 | # Also see http://www.opensource.org/licenses/bsd-license.html 4 | # Note: by default this file is used after normalization, so when adding entries 5 | # to this file, use the arabic 'ي' instead of 'ی' 6 | انان 7 | نداشته 8 | سراسر 9 | خياه 10 | ايشان 11 | وي 12 | تاكنون 13 | بيشتري 14 | دوم 15 | پس 16 | ناشي 17 | وگو 18 | يا 19 | داشتند 20 | سپس 21 | هنگام 22 | هرگز 23 | پنج 24 | نشان 25 | امسال 26 | ديگر 27 | گروهي 28 | شدند 29 | چطور 30 | ده 31 | و 32 | دو 33 | نخستين 34 | ولي 35 | چرا 36 | چه 37 | وسط 38 | ه 39 | كدام 40 | قابل 41 | يك 42 | رفت 43 | هفت 44 | همچنين 45 | در 46 | هزار 47 | بله 48 | بلي 49 | شايد 50 | اما 51 | شناسي 52 | گرفته 53 | دهد 54 | داشته 55 | دانست 56 | داشتن 57 | خواهيم 58 | ميليارد 59 | وقتيكه 60 | امد 61 | خواهد 62 | جز 63 | اورده 64 | شده 65 | بلكه 66 | خدمات 67 | شدن 68 | برخي 69 | نبود 70 | بسياري 71 | جلوگيري 72 | حق 73 | كردند 74 | نوعي 75 | بعري 76 | نكرده 77 | نظير 78 | نبايد 79 | بوده 80 | بودن 81 | داد 82 | اورد 83 | هست 84 | جايي 85 | شود 86 | دنبال 87 | داده 88 | بايد 89 | سابق 90 | هيچ 91 | همان 92 | انجا 93 | كمتر 94 | كجاست 95 | گردد 96 | كسي 97 | تر 98 | مردم 99 | تان 100 | دادن 101 | بودند 102 | سري 103 | جدا 104 | ندارند 105 | مگر 106 | يكديگر 107 | دارد 108 | دهند 109 | بنابراين 110 | هنگامي 111 | سمت 112 | جا 113 | انچه 114 | خود 115 | دادند 116 | زياد 117 | دارند 118 | اثر 119 | بدون 120 | بهترين 121 | بيشتر 122 | البته 123 | به 124 | براساس 125 | بيرون 126 | كرد 127 | بعضي 128 | گرفت 129 | توي 130 | اي 131 | ميليون 132 | او 133 | جريان 134 | تول 135 | بر 136 | مانند 137 | برابر 138 | باشيم 139 | مدتي 140 | گويند 141 | اكنون 142 | تا 143 | تنها 144 | جديد 145 | چند 146 | بي 147 | نشده 148 | كردن 149 | كردم 150 | گويد 151 | كرده 152 | كنيم 153 | نمي 154 | نزد 155 | روي 156 | قصد 157 | 
فقط 158 | بالاي 159 | ديگران 160 | اين 161 | ديروز 162 | توسط 163 | سوم 164 | ايم 165 | دانند 166 | سوي 167 | استفاده 168 | شما 169 | كنار 170 | داريم 171 | ساخته 172 | طور 173 | امده 174 | رفته 175 | نخست 176 | بيست 177 | نزديك 178 | طي 179 | كنيد 180 | از 181 | انها 182 | تمامي 183 | داشت 184 | يكي 185 | طريق 186 | اش 187 | چيست 188 | روب 189 | نمايد 190 | گفت 191 | چندين 192 | چيزي 193 | تواند 194 | ام 195 | ايا 196 | با 197 | ان 198 | ايد 199 | ترين 200 | اينكه 201 | ديگري 202 | راه 203 | هايي 204 | بروز 205 | همچنان 206 | پاعين 207 | كس 208 | حدود 209 | مختلف 210 | مقابل 211 | چيز 212 | گيرد 213 | ندارد 214 | ضد 215 | همچون 216 | سازي 217 | شان 218 | مورد 219 | باره 220 | مرسي 221 | خويش 222 | برخوردار 223 | چون 224 | خارج 225 | شش 226 | هنوز 227 | تحت 228 | ضمن 229 | هستيم 230 | گفته 231 | فكر 232 | بسيار 233 | پيش 234 | براي 235 | روزهاي 236 | انكه 237 | نخواهد 238 | بالا 239 | كل 240 | وقتي 241 | كي 242 | چنين 243 | كه 244 | گيري 245 | نيست 246 | است 247 | كجا 248 | كند 249 | نيز 250 | يابد 251 | بندي 252 | حتي 253 | توانند 254 | عقب 255 | خواست 256 | كنند 257 | بين 258 | تمام 259 | همه 260 | ما 261 | باشند 262 | مثل 263 | شد 264 | اري 265 | باشد 266 | اره 267 | طبق 268 | بعد 269 | اگر 270 | صورت 271 | غير 272 | جاي 273 | بيش 274 | ريزي 275 | اند 276 | زيرا 277 | چگونه 278 | بار 279 | لطفا 280 | مي 281 | درباره 282 | من 283 | ديده 284 | همين 285 | گذاري 286 | برداري 287 | علت 288 | گذاشته 289 | هم 290 | فوق 291 | نه 292 | ها 293 | شوند 294 | اباد 295 | همواره 296 | هر 297 | اول 298 | خواهند 299 | چهار 300 | نام 301 | امروز 302 | مان 303 | هاي 304 | قبل 305 | كنم 306 | سعي 307 | تازه 308 | را 309 | هستند 310 | زير 311 | جلوي 312 | عنوان 313 | بود 314 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_fi.txt: -------------------------------------------------------------------------------- 1 | | From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt 2 | 
| This file is distributed under the BSD License. 3 | | See http://snowball.tartarus.org/license.php 4 | | Also see http://www.opensource.org/licenses/bsd-license.html 5 | | - Encoding was converted to UTF-8. 6 | | - This notice was added. 7 | | 8 | | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" 9 | 10 | | forms of BE 11 | 12 | olla 13 | olen 14 | olet 15 | on 16 | olemme 17 | olette 18 | ovat 19 | ole | negative form 20 | 21 | oli 22 | olisi 23 | olisit 24 | olisin 25 | olisimme 26 | olisitte 27 | olisivat 28 | olit 29 | olin 30 | olimme 31 | olitte 32 | olivat 33 | ollut 34 | olleet 35 | 36 | en | negation 37 | et 38 | ei 39 | emme 40 | ette 41 | eivät 42 | 43 | |Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans 44 | minä minun minut minua minussa minusta minuun minulla minulta minulle | I 45 | sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you 46 | hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she 47 | me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we 48 | te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you 49 | he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they 50 | 51 | tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this 52 | tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that 53 | se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it 54 | nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these 55 | nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those 56 | ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they 57 | 58 | kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who 59 | ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl) 60 | mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | 
which what 61 | mitkä | (pl) 62 | 63 | joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which 64 | jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl) 65 | 66 | | conjunctions 67 | 68 | että | that 69 | ja | and 70 | jos | if 71 | koska | because 72 | kuin | than 73 | mutta | but 74 | niin | so 75 | sekä | and 76 | sillä | for 77 | tai | or 78 | vaan | but 79 | vai | or 80 | vaikka | although 81 | 82 | 83 | | prepositions 84 | 85 | kanssa | with 86 | mukaan | according to 87 | noin | about 88 | poikki | across 89 | yli | over, across 90 | 91 | | other 92 | 93 | kun | when 94 | niin | so 95 | nyt | now 96 | itse | self 97 | 98 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_ga.txt: -------------------------------------------------------------------------------- 1 | 2 | a 3 | ach 4 | ag 5 | agus 6 | an 7 | aon 8 | ar 9 | arna 10 | as 11 | b' 12 | ba 13 | beirt 14 | bhúr 15 | caoga 16 | ceathair 17 | ceathrar 18 | chomh 19 | chtó 20 | chuig 21 | chun 22 | cois 23 | céad 24 | cúig 25 | cúigear 26 | d' 27 | daichead 28 | dar 29 | de 30 | deich 31 | deichniúr 32 | den 33 | dhá 34 | do 35 | don 36 | dtí 37 | dá 38 | dár 39 | dó 40 | faoi 41 | faoin 42 | faoina 43 | faoinár 44 | fara 45 | fiche 46 | gach 47 | gan 48 | go 49 | gur 50 | haon 51 | hocht 52 | i 53 | iad 54 | idir 55 | in 56 | ina 57 | ins 58 | inár 59 | is 60 | le 61 | leis 62 | lena 63 | lenár 64 | m' 65 | mar 66 | mo 67 | mé 68 | na 69 | nach 70 | naoi 71 | naonúr 72 | ná 73 | ní 74 | níor 75 | nó 76 | nócha 77 | ocht 78 | ochtar 79 | os 80 | roimh 81 | sa 82 | seacht 83 | seachtar 84 | seachtó 85 | seasca 86 | seisear 87 | siad 88 | sibh 89 | sinn 90 | sna 91 | sé 92 | sí 93 | tar 94 | thar 95 | thú 96 | triúr 97 | trí 98 | trína 99 | trínár 100 | tríocha 101 | tú 102 | um 103 | ár 104 | é 105 | éis 106 | í 107 | ó 108 | ón 109 | óna 110 | ónár 111 | 
-------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_gl.txt: -------------------------------------------------------------------------------- 1 | # galican stopwords 2 | a 3 | aínda 4 | alí 5 | aquel 6 | aquela 7 | aquelas 8 | aqueles 9 | aquilo 10 | aquí 11 | ao 12 | aos 13 | as 14 | así 15 | á 16 | ben 17 | cando 18 | che 19 | co 20 | coa 21 | comigo 22 | con 23 | connosco 24 | contigo 25 | convosco 26 | coas 27 | cos 28 | cun 29 | cuns 30 | cunha 31 | cunhas 32 | da 33 | dalgunha 34 | dalgunhas 35 | dalgún 36 | dalgúns 37 | das 38 | de 39 | del 40 | dela 41 | delas 42 | deles 43 | desde 44 | deste 45 | do 46 | dos 47 | dun 48 | duns 49 | dunha 50 | dunhas 51 | e 52 | el 53 | ela 54 | elas 55 | eles 56 | en 57 | era 58 | eran 59 | esa 60 | esas 61 | ese 62 | eses 63 | esta 64 | estar 65 | estaba 66 | está 67 | están 68 | este 69 | estes 70 | estiven 71 | estou 72 | eu 73 | é 74 | facer 75 | foi 76 | foron 77 | fun 78 | había 79 | hai 80 | iso 81 | isto 82 | la 83 | las 84 | lle 85 | lles 86 | lo 87 | los 88 | mais 89 | me 90 | meu 91 | meus 92 | min 93 | miña 94 | miñas 95 | moi 96 | na 97 | nas 98 | neste 99 | nin 100 | no 101 | non 102 | nos 103 | nosa 104 | nosas 105 | noso 106 | nosos 107 | nós 108 | nun 109 | nunha 110 | nuns 111 | nunhas 112 | o 113 | os 114 | ou 115 | ó 116 | ós 117 | para 118 | pero 119 | pode 120 | pois 121 | pola 122 | polas 123 | polo 124 | polos 125 | por 126 | que 127 | se 128 | senón 129 | ser 130 | seu 131 | seus 132 | sexa 133 | sido 134 | sobre 135 | súa 136 | súas 137 | tamén 138 | tan 139 | te 140 | ten 141 | teñen 142 | teño 143 | ter 144 | teu 145 | teus 146 | ti 147 | tido 148 | tiña 149 | tiven 150 | túa 151 | túas 152 | un 153 | unha 154 | unhas 155 | uns 156 | vos 157 | vosa 158 | vosas 159 | voso 160 | vosos 161 | vós 162 | -------------------------------------------------------------------------------- 
/notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_hi.txt: -------------------------------------------------------------------------------- 1 | # Also see http://www.opensource.org/licenses/bsd-license.html 2 | # See http://members.unine.ch/jacques.savoy/clef/index.html. 3 | # This file was created by Jacques Savoy and is distributed under the BSD license. 4 | # Note: by default this file also contains forms normalized by HindiNormalizer 5 | # for spelling variation (see section below), such that it can be used whether or 6 | # not you enable that feature. When adding additional entries to this list, 7 | # please add the normalized form as well. 8 | अंदर 9 | अत 10 | अपना 11 | अपनी 12 | अपने 13 | अभी 14 | आदि 15 | आप 16 | इत्यादि 17 | इन 18 | इनका 19 | इन्हीं 20 | इन्हें 21 | इन्हों 22 | इस 23 | इसका 24 | इसकी 25 | इसके 26 | इसमें 27 | इसी 28 | इसे 29 | उन 30 | उनका 31 | उनकी 32 | उनके 33 | उनको 34 | उन्हीं 35 | उन्हें 36 | उन्हों 37 | उस 38 | उसके 39 | उसी 40 | उसे 41 | एक 42 | एवं 43 | एस 44 | ऐसे 45 | और 46 | कई 47 | कर 48 | करता 49 | करते 50 | करना 51 | करने 52 | करें 53 | कहते 54 | कहा 55 | का 56 | काफ़ी 57 | कि 58 | कितना 59 | किन्हें 60 | किन्हों 61 | किया 62 | किर 63 | किस 64 | किसी 65 | किसे 66 | की 67 | कुछ 68 | कुल 69 | के 70 | को 71 | कोई 72 | कौन 73 | कौनसा 74 | गया 75 | घर 76 | जब 77 | जहाँ 78 | जा 79 | जितना 80 | जिन 81 | जिन्हें 82 | जिन्हों 83 | जिस 84 | जिसे 85 | जीधर 86 | जैसा 87 | जैसे 88 | जो 89 | तक 90 | तब 91 | तरह 92 | तिन 93 | तिन्हें 94 | तिन्हों 95 | तिस 96 | तिसे 97 | तो 98 | था 99 | थी 100 | थे 101 | दबारा 102 | दिया 103 | दुसरा 104 | दूसरे 105 | दो 106 | द्वारा 107 | न 108 | नहीं 109 | ना 110 | निहायत 111 | नीचे 112 | ने 113 | पर 114 | पर 115 | पहले 116 | पूरा 117 | पे 118 | फिर 119 | बनी 120 | बही 121 | बहुत 122 | बाद 123 | बाला 124 | बिलकुल 125 | भी 126 | भीतर 127 | मगर 128 | मानो 129 | मे 130 | में 131 | यदि 132 | यह 133 | यहाँ 134 | यही 135 | या 136 | यिह 137 | ये 138 | रखें 139 | रहा 140 | रहे 141 | ऱ्वासा 142 | लिए 143 | 
लिये 144 | लेकिन 145 | व 146 | वर्ग 147 | वह 148 | वह 149 | वहाँ 150 | वहीं 151 | वाले 152 | वुह 153 | वे 154 | वग़ैरह 155 | संग 156 | सकता 157 | सकते 158 | सबसे 159 | सभी 160 | साथ 161 | साबुत 162 | साभ 163 | सारा 164 | से 165 | सो 166 | ही 167 | हुआ 168 | हुई 169 | हुए 170 | है 171 | हैं 172 | हो 173 | होता 174 | होती 175 | होते 176 | होना 177 | होने 178 | # additional normalized forms of the above 179 | अपनि 180 | जेसे 181 | होति 182 | सभि 183 | तिंहों 184 | इंहों 185 | दवारा 186 | इसि 187 | किंहें 188 | थि 189 | उंहों 190 | ओर 191 | जिंहें 192 | वहिं 193 | अभि 194 | बनि 195 | हि 196 | उंहिं 197 | उंहें 198 | हें 199 | वगेरह 200 | एसे 201 | रवासा 202 | कोन 203 | निचे 204 | काफि 205 | उसि 206 | पुरा 207 | भितर 208 | हे 209 | बहि 210 | वहां 211 | कोइ 212 | यहां 213 | जिंहों 214 | तिंहें 215 | किसि 216 | कइ 217 | यहि 218 | इंहिं 219 | जिधर 220 | इंहें 221 | अदि 222 | इतयादि 223 | हुइ 224 | कोनसा 225 | इसकि 226 | दुसरे 227 | जहां 228 | अप 229 | किंहों 230 | उनकि 231 | भि 232 | वरग 233 | हुअ 234 | जेसा 235 | नहिं 236 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_hu.txt: -------------------------------------------------------------------------------- 1 | | From svn.tartarus.org/snowball/trunk/website/algorithms/hungarian/stop.txt 2 | | This file is distributed under the BSD License. 3 | | See http://snowball.tartarus.org/license.php 4 | | Also see http://www.opensource.org/licenses/bsd-license.html 5 | | - Encoding was converted to UTF-8. 6 | | - This notice was added. 
7 | | 8 | | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" 9 | 10 | | Hungarian stop word list 11 | | prepared by Anna Tordai 12 | 13 | a 14 | ahogy 15 | ahol 16 | aki 17 | akik 18 | akkor 19 | alatt 20 | által 21 | általában 22 | amely 23 | amelyek 24 | amelyekben 25 | amelyeket 26 | amelyet 27 | amelynek 28 | ami 29 | amit 30 | amolyan 31 | amíg 32 | amikor 33 | át 34 | abban 35 | ahhoz 36 | annak 37 | arra 38 | arról 39 | az 40 | azok 41 | azon 42 | azt 43 | azzal 44 | azért 45 | aztán 46 | azután 47 | azonban 48 | bár 49 | be 50 | belül 51 | benne 52 | cikk 53 | cikkek 54 | cikkeket 55 | csak 56 | de 57 | e 58 | eddig 59 | egész 60 | egy 61 | egyes 62 | egyetlen 63 | egyéb 64 | egyik 65 | egyre 66 | ekkor 67 | el 68 | elég 69 | ellen 70 | elő 71 | először 72 | előtt 73 | első 74 | én 75 | éppen 76 | ebben 77 | ehhez 78 | emilyen 79 | ennek 80 | erre 81 | ez 82 | ezt 83 | ezek 84 | ezen 85 | ezzel 86 | ezért 87 | és 88 | fel 89 | felé 90 | hanem 91 | hiszen 92 | hogy 93 | hogyan 94 | igen 95 | így 96 | illetve 97 | ill. 
98 | ill 99 | ilyen 100 | ilyenkor 101 | ison 102 | ismét 103 | itt 104 | jó 105 | jól 106 | jobban 107 | kell 108 | kellett 109 | keresztül 110 | keressünk 111 | ki 112 | kívül 113 | között 114 | közül 115 | legalább 116 | lehet 117 | lehetett 118 | legyen 119 | lenne 120 | lenni 121 | lesz 122 | lett 123 | maga 124 | magát 125 | majd 126 | majd 127 | már 128 | más 129 | másik 130 | meg 131 | még 132 | mellett 133 | mert 134 | mely 135 | melyek 136 | mi 137 | mit 138 | míg 139 | miért 140 | milyen 141 | mikor 142 | minden 143 | mindent 144 | mindenki 145 | mindig 146 | mint 147 | mintha 148 | mivel 149 | most 150 | nagy 151 | nagyobb 152 | nagyon 153 | ne 154 | néha 155 | nekem 156 | neki 157 | nem 158 | néhány 159 | nélkül 160 | nincs 161 | olyan 162 | ott 163 | össze 164 | ő 165 | ők 166 | őket 167 | pedig 168 | persze 169 | rá 170 | s 171 | saját 172 | sem 173 | semmi 174 | sok 175 | sokat 176 | sokkal 177 | számára 178 | szemben 179 | szerint 180 | szinte 181 | talán 182 | tehát 183 | teljes 184 | tovább 185 | továbbá 186 | több 187 | úgy 188 | ugyanis 189 | új 190 | újabb 191 | újra 192 | után 193 | utána 194 | utolsó 195 | vagy 196 | vagyis 197 | valaki 198 | valami 199 | valamint 200 | való 201 | vagyok 202 | van 203 | vannak 204 | volt 205 | voltam 206 | voltak 207 | voltunk 208 | vissza 209 | vele 210 | viszont 211 | volna 212 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_hy.txt: -------------------------------------------------------------------------------- 1 | # example set of Armenian stopwords. 
2 | այդ 3 | այլ 4 | այն 5 | այս 6 | դու 7 | դուք 8 | եմ 9 | են 10 | ենք 11 | ես 12 | եք 13 | է 14 | էի 15 | էին 16 | էինք 17 | էիր 18 | էիք 19 | էր 20 | ըստ 21 | թ 22 | ի 23 | ին 24 | իսկ 25 | իր 26 | կամ 27 | համար 28 | հետ 29 | հետո 30 | մենք 31 | մեջ 32 | մի 33 | ն 34 | նա 35 | նաև 36 | նրա 37 | նրանք 38 | որ 39 | որը 40 | որոնք 41 | որպես 42 | ու 43 | ում 44 | պիտի 45 | վրա 46 | և 47 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_ja.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file defines a stopword set for Japanese. 3 | # 4 | # This set is made up of hand-picked frequent terms from segmented Japanese Wikipedia. 5 | # Punctuation characters and frequent kanji have mostly been left out. See LUCENE-3745 6 | # for frequency lists, etc. that can be useful for making your own set (if desired) 7 | # 8 | # Note that there is an overlap between these stopwords and the terms stopped when used 9 | # in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note 10 | # that comments are not allowed on the same line as stopwords. 11 | # 12 | # Also note that stopping is done in a case-insensitive manner. Change your StopFilter 13 | # configuration if you need case-sensitive stopping. Lastly, note that stopping is done 14 | # using the same character width as the entries in this file. Since this StopFilter is 15 | # normally done after a CJKWidthFilter in your chain, you would usually want your romaji 16 | # entries to be in half-width and your kana entries to be in full-width. 
17 | # 18 | の 19 | に 20 | は 21 | を 22 | た 23 | が 24 | で 25 | て 26 | と 27 | し 28 | れ 29 | さ 30 | ある 31 | いる 32 | も 33 | する 34 | から 35 | な 36 | こと 37 | として 38 | い 39 | や 40 | れる 41 | など 42 | なっ 43 | ない 44 | この 45 | ため 46 | その 47 | あっ 48 | よう 49 | また 50 | もの 51 | という 52 | あり 53 | まで 54 | られ 55 | なる 56 | へ 57 | か 58 | だ 59 | これ 60 | によって 61 | により 62 | おり 63 | より 64 | による 65 | ず 66 | なり 67 | られる 68 | において 69 | ば 70 | なかっ 71 | なく 72 | しかし 73 | について 74 | せ 75 | だっ 76 | その後 77 | できる 78 | それ 79 | う 80 | ので 81 | なお 82 | のみ 83 | でき 84 | き 85 | つ 86 | における 87 | および 88 | いう 89 | さらに 90 | でも 91 | ら 92 | たり 93 | その他 94 | に関する 95 | たち 96 | ます 97 | ん 98 | なら 99 | に対して 100 | 特に 101 | せる 102 | 及び 103 | これら 104 | とき 105 | では 106 | にて 107 | ほか 108 | ながら 109 | うち 110 | そして 111 | とともに 112 | ただし 113 | かつて 114 | それぞれ 115 | または 116 | お 117 | ほど 118 | ものの 119 | に対する 120 | ほとんど 121 | と共に 122 | といった 123 | です 124 | とも 125 | ところ 126 | ここ 127 | ##### End of file 128 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_lv.txt: -------------------------------------------------------------------------------- 1 | # Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins 2 | # the original list of over 800 forms was refined: 3 | # pronouns, adverbs, interjections were removed 4 | # 5 | # prepositions 6 | aiz 7 | ap 8 | ar 9 | apakš 10 | ārpus 11 | augšpus 12 | bez 13 | caur 14 | dēļ 15 | gar 16 | iekš 17 | iz 18 | kopš 19 | labad 20 | lejpus 21 | līdz 22 | no 23 | otrpus 24 | pa 25 | par 26 | pār 27 | pēc 28 | pie 29 | pirms 30 | pret 31 | priekš 32 | starp 33 | šaipus 34 | uz 35 | viņpus 36 | virs 37 | virspus 38 | zem 39 | apakšpus 40 | # Conjunctions 41 | un 42 | bet 43 | jo 44 | ja 45 | ka 46 | lai 47 | tomēr 48 | tikko 49 | turpretī 50 | arī 51 | kaut 52 | gan 53 | tādēļ 54 | tā 55 | ne 56 | tikvien 57 | vien 58 | kā 59 | ir 60 | te 61 | vai 62 | kamēr 63 | # Particles 64 | ar 65 | 
diezin 66 | droši 67 | diemžēl 68 | nebūt 69 | ik 70 | it 71 | taču 72 | nu 73 | pat 74 | tiklab 75 | iekšpus 76 | nedz 77 | tik 78 | nevis 79 | turpretim 80 | jeb 81 | iekam 82 | iekām 83 | iekāms 84 | kolīdz 85 | līdzko 86 | tiklīdz 87 | jebšu 88 | tālab 89 | tāpēc 90 | nekā 91 | itin 92 | jā 93 | jau 94 | jel 95 | nē 96 | nezin 97 | tad 98 | tikai 99 | vis 100 | tak 101 | iekams 102 | vien 103 | # modal verbs 104 | būt 105 | biju 106 | biji 107 | bija 108 | bijām 109 | bijāt 110 | esmu 111 | esi 112 | esam 113 | esat 114 | būšu 115 | būsi 116 | būs 117 | būsim 118 | būsiet 119 | tikt 120 | tiku 121 | tiki 122 | tika 123 | tikām 124 | tikāt 125 | tieku 126 | tiec 127 | tiek 128 | tiekam 129 | tiekat 130 | tikšu 131 | tiks 132 | tiksim 133 | tiksiet 134 | tapt 135 | tapi 136 | tapāt 137 | topat 138 | tapšu 139 | tapsi 140 | taps 141 | tapsim 142 | tapsiet 143 | kļūt 144 | kļuvu 145 | kļuvi 146 | kļuva 147 | kļuvām 148 | kļuvāt 149 | kļūstu 150 | kļūsti 151 | kļūst 152 | kļūstam 153 | kļūstat 154 | kļūšu 155 | kļūsi 156 | kļūs 157 | kļūsim 158 | kļūsiet 159 | # verbs 160 | varēt 161 | varēju 162 | varējām 163 | varēšu 164 | varēsim 165 | var 166 | varēji 167 | varējāt 168 | varēsi 169 | varēsiet 170 | varat 171 | varēja 172 | varēs 173 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_ro.txt: -------------------------------------------------------------------------------- 1 | # This file was created by Jacques Savoy and is distributed under the BSD license. 2 | # See http://members.unine.ch/jacques.savoy/clef/index.html. 
3 | # Also see http://www.opensource.org/licenses/bsd-license.html 4 | acea 5 | aceasta 6 | această 7 | aceea 8 | acei 9 | aceia 10 | acel 11 | acela 12 | acele 13 | acelea 14 | acest 15 | acesta 16 | aceste 17 | acestea 18 | aceşti 19 | aceştia 20 | acolo 21 | acum 22 | ai 23 | aia 24 | aibă 25 | aici 26 | al 27 | ăla 28 | ale 29 | alea 30 | ălea 31 | altceva 32 | altcineva 33 | am 34 | ar 35 | are 36 | aş 37 | aşadar 38 | asemenea 39 | asta 40 | ăsta 41 | astăzi 42 | astea 43 | ăstea 44 | ăştia 45 | asupra 46 | aţi 47 | au 48 | avea 49 | avem 50 | aveţi 51 | azi 52 | bine 53 | bucur 54 | bună 55 | ca 56 | că 57 | căci 58 | când 59 | care 60 | cărei 61 | căror 62 | cărui 63 | cât 64 | câte 65 | câţi 66 | către 67 | câtva 68 | ce 69 | cel 70 | ceva 71 | chiar 72 | cînd 73 | cine 74 | cineva 75 | cît 76 | cîte 77 | cîţi 78 | cîtva 79 | contra 80 | cu 81 | cum 82 | cumva 83 | curând 84 | curînd 85 | da 86 | dă 87 | dacă 88 | dar 89 | datorită 90 | de 91 | deci 92 | deja 93 | deoarece 94 | departe 95 | deşi 96 | din 97 | dinaintea 98 | dintr 99 | dintre 100 | drept 101 | după 102 | ea 103 | ei 104 | el 105 | ele 106 | eram 107 | este 108 | eşti 109 | eu 110 | face 111 | fără 112 | fi 113 | fie 114 | fiecare 115 | fii 116 | fim 117 | fiţi 118 | iar 119 | ieri 120 | îi 121 | îl 122 | îmi 123 | împotriva 124 | în 125 | înainte 126 | înaintea 127 | încât 128 | încît 129 | încotro 130 | între 131 | întrucât 132 | întrucît 133 | îţi 134 | la 135 | lângă 136 | le 137 | li 138 | lîngă 139 | lor 140 | lui 141 | mă 142 | mâine 143 | mea 144 | mei 145 | mele 146 | mereu 147 | meu 148 | mi 149 | mine 150 | mult 151 | multă 152 | mulţi 153 | ne 154 | nicăieri 155 | nici 156 | nimeni 157 | nişte 158 | noastră 159 | noastre 160 | noi 161 | noştri 162 | nostru 163 | nu 164 | ori 165 | oricând 166 | oricare 167 | oricât 168 | orice 169 | oricînd 170 | oricine 171 | oricît 172 | oricum 173 | oriunde 174 | până 175 | pe 176 | pentru 177 | peste 178 | pînă 179 | poate 180 | pot 181 | 
prea 182 | prima 183 | primul 184 | prin 185 | printr 186 | sa 187 | să 188 | săi 189 | sale 190 | sau 191 | său 192 | se 193 | şi 194 | sînt 195 | sîntem 196 | sînteţi 197 | spre 198 | sub 199 | sunt 200 | suntem 201 | sunteţi 202 | ta 203 | tăi 204 | tale 205 | tău 206 | te 207 | ţi 208 | ţie 209 | tine 210 | toată 211 | toate 212 | tot 213 | toţi 214 | totuşi 215 | tu 216 | un 217 | una 218 | unde 219 | undeva 220 | unei 221 | unele 222 | uneori 223 | unor 224 | vă 225 | vi 226 | voastră 227 | voastre 228 | voi 229 | voştri 230 | vostru 231 | vouă 232 | vreo 233 | vreun 234 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_th.txt: -------------------------------------------------------------------------------- 1 | # Thai stopwords from: 2 | # "Opinion Detection in Thai Political News Columns 3 | # Based on Subjectivity Analysis" 4 | # Khampol Sukhum, Supot Nitsuwat, and Choochart Haruechaiyasak 5 | ไว้ 6 | ไม่ 7 | ไป 8 | ได้ 9 | ให้ 10 | ใน 11 | โดย 12 | แห่ง 13 | แล้ว 14 | และ 15 | แรก 16 | แบบ 17 | แต่ 18 | เอง 19 | เห็น 20 | เลย 21 | เริ่ม 22 | เรา 23 | เมื่อ 24 | เพื่อ 25 | เพราะ 26 | เป็นการ 27 | เป็น 28 | เปิดเผย 29 | เปิด 30 | เนื่องจาก 31 | เดียวกัน 32 | เดียว 33 | เช่น 34 | เฉพาะ 35 | เคย 36 | เข้า 37 | เขา 38 | อีก 39 | อาจ 40 | อะไร 41 | ออก 42 | อย่าง 43 | อยู่ 44 | อยาก 45 | หาก 46 | หลาย 47 | หลังจาก 48 | หลัง 49 | หรือ 50 | หนึ่ง 51 | ส่วน 52 | ส่ง 53 | สุด 54 | สําหรับ 55 | ว่า 56 | วัน 57 | ลง 58 | ร่วม 59 | ราย 60 | รับ 61 | ระหว่าง 62 | รวม 63 | ยัง 64 | มี 65 | มาก 66 | มา 67 | พร้อม 68 | พบ 69 | ผ่าน 70 | ผล 71 | บาง 72 | น่า 73 | นี้ 74 | นํา 75 | นั้น 76 | นัก 77 | นอกจาก 78 | ทุก 79 | ที่สุด 80 | ที่ 81 | ทําให้ 82 | ทํา 83 | ทาง 84 | ทั้งนี้ 85 | ทั้ง 86 | ถ้า 87 | ถูก 88 | ถึง 89 | ต้อง 90 | ต่างๆ 91 | ต่าง 92 | ต่อ 93 | ตาม 94 | ตั้งแต่ 95 | ตั้ง 96 | ด้าน 97 | ด้วย 98 | ดัง 99 | ซึ่ง 100 | ช่วง 101 | จึง 102 | จาก 103 | จัด 104 | จะ 105 | คือ 
106 | ความ 107 | ครั้ง 108 | คง 109 | ขึ้น 110 | ของ 111 | ขอ 112 | ขณะ 113 | ก่อน 114 | ก็ 115 | การ 116 | กับ 117 | กัน 118 | กว่า 119 | กล่าว 120 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/stopwords_tr.txt: -------------------------------------------------------------------------------- 1 | # Turkish stopwords from LUCENE-559 2 | # merged with the list from "Information Retrieval on Turkish Texts" 3 | # (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf) 4 | acaba 5 | altmış 6 | altı 7 | ama 8 | ancak 9 | arada 10 | aslında 11 | ayrıca 12 | bana 13 | bazı 14 | belki 15 | ben 16 | benden 17 | beni 18 | benim 19 | beri 20 | beş 21 | bile 22 | bin 23 | bir 24 | birçok 25 | biri 26 | birkaç 27 | birkez 28 | birşey 29 | birşeyi 30 | biz 31 | bize 32 | bizden 33 | bizi 34 | bizim 35 | böyle 36 | böylece 37 | bu 38 | buna 39 | bunda 40 | bundan 41 | bunlar 42 | bunları 43 | bunların 44 | bunu 45 | bunun 46 | burada 47 | çok 48 | çünkü 49 | da 50 | daha 51 | dahi 52 | de 53 | defa 54 | değil 55 | diğer 56 | diye 57 | doksan 58 | dokuz 59 | dolayı 60 | dolayısıyla 61 | dört 62 | edecek 63 | eden 64 | ederek 65 | edilecek 66 | ediliyor 67 | edilmesi 68 | ediyor 69 | eğer 70 | elli 71 | en 72 | etmesi 73 | etti 74 | ettiği 75 | ettiğini 76 | gibi 77 | göre 78 | halen 79 | hangi 80 | hatta 81 | hem 82 | henüz 83 | hep 84 | hepsi 85 | her 86 | herhangi 87 | herkesin 88 | hiç 89 | hiçbir 90 | için 91 | iki 92 | ile 93 | ilgili 94 | ise 95 | işte 96 | itibaren 97 | itibariyle 98 | kadar 99 | karşın 100 | katrilyon 101 | kendi 102 | kendilerine 103 | kendini 104 | kendisi 105 | kendisine 106 | kendisini 107 | kez 108 | ki 109 | kim 110 | kimden 111 | kime 112 | kimi 113 | kimse 114 | kırk 115 | milyar 116 | milyon 117 | mu 118 | mü 119 | mı 120 | nasıl 121 | ne 122 | neden 123 | nedenle 124 | nerde 125 | nerede 126 | nereye 127 | niye 128 | niçin 129 | o 130 | olan 131 | olarak 
132 | oldu 133 | olduğu 134 | olduğunu 135 | olduklarını 136 | olmadı 137 | olmadığı 138 | olmak 139 | olması 140 | olmayan 141 | olmaz 142 | olsa 143 | olsun 144 | olup 145 | olur 146 | olursa 147 | oluyor 148 | on 149 | ona 150 | ondan 151 | onlar 152 | onlardan 153 | onları 154 | onların 155 | onu 156 | onun 157 | otuz 158 | oysa 159 | öyle 160 | pek 161 | rağmen 162 | sadece 163 | sanki 164 | sekiz 165 | seksen 166 | sen 167 | senden 168 | seni 169 | senin 170 | siz 171 | sizden 172 | sizi 173 | sizin 174 | şey 175 | şeyden 176 | şeyi 177 | şeyler 178 | şöyle 179 | şu 180 | şuna 181 | şunda 182 | şundan 183 | şunları 184 | şunu 185 | tarafından 186 | trilyon 187 | tüm 188 | üç 189 | üzere 190 | var 191 | vardı 192 | ve 193 | veya 194 | ya 195 | yani 196 | yapacak 197 | yapılan 198 | yapılması 199 | yapıyor 200 | yapmak 201 | yaptı 202 | yaptığı 203 | yaptığını 204 | yaptıkları 205 | yedi 206 | yerine 207 | yetmiş 208 | yine 209 | yirmi 210 | yoksa 211 | yüz 212 | zaten 213 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/lang/userdict_ja.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This is a sample user dictionary for Kuromoji (JapaneseTokenizer) 3 | # 4 | # Add entries to this file in order to override the statistical model in terms 5 | # of segmentation, readings and part-of-speech tags. Notice that entries do 6 | # not have weights since they are always used when found. This is by-design 7 | # in order to maximize ease-of-use. 8 | # 9 | # Entries are defined using the following CSV format: 10 | # , ... , ... , 11 | # 12 | # Notice that a single half-width space separates tokens and readings, and 13 | # that the number tokens and readings must match exactly. 14 | # 15 | # Also notice that multiple entries with the same is undefined. 16 | # 17 | # Whitespace only lines are ignored. Comments are not allowed on entry lines. 
18 | # 19 | 20 | # Custom segmentation for kanji compounds 21 | 日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞 22 | 関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞 23 | 24 | # Custom segmentation for compound katakana 25 | トートバッグ,トート バッグ,トート バッグ,かずカナ名詞 26 | ショルダーバッグ,ショルダー バッグ,ショルダー バッグ,かずカナ名詞 27 | 28 | # Custom reading for former sumo wrestler 29 | 朝青龍,朝青龍,アサショウリュウ,カスタム人名 30 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/name_synonyms.txt: -------------------------------------------------------------------------------- 1 | sky walker, skywalker -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/names.txt: -------------------------------------------------------------------------------- 1 | luke_skywalker -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/params.json: -------------------------------------------------------------------------------- 1 | {"params":{ 2 | "query":{ 3 | "defType":"edismax", 4 | "q.alt":"*:*", 5 | "rows":"10", 6 | "fl":"*,score", 7 | "":{"v":0} 8 | }, 9 | "facets":{ 10 | "facet":"on", 11 | "facet.mincount": "1", 12 | "":{"v":0} 13 | }, 14 | "velocity":{ 15 | "wt": "velocity", 16 | "v.template":"browse", 17 | "v.layout": "layout", 18 | "":{"v":0} 19 | } 20 | }} -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/protwords.txt: -------------------------------------------------------------------------------- 1 | # The ASF licenses this file to You under the Apache License, Version 2.0 2 | # (the "License"); you may not use this file except in compliance with 3 | # the License. 
You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | #----------------------------------------------------------------------- 14 | # Use a protected word file to protect against the stemmer reducing two 15 | # unrelated words to the same base word. 16 | 17 | # Some non-words that normally won't be encountered, 18 | # just to test that they won't be stemmed. 19 | dontstems 20 | zwhacky 21 | 22 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/stopwords.txt: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/synonyms.txt: -------------------------------------------------------------------------------- 1 | # The ASF licenses this file to You under the Apache License, Version 2.0 2 | # (the "License"); you may not use this file except in compliance with 3 | # the License. You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | #----------------------------------------------------------------------- 14 | #some test synonym mappings unlikely to appear in real input text 15 | 16 | # Demonstrating bidirectional synonyms 17 | #wife,bride 18 | #wife,spouse 19 | #toons,tunes,cartoon 20 | 21 | # Demonstrating => syntax 22 | # wife => wife, bride 23 | # spouse => spouse, husband, wife, partner 24 | # tunes => cartoons, toons, songs 25 | # cartoon => toons, tunes 26 | 27 | # Demonstrating multi phrase 28 | #looney tunes, cartoons 29 | #science fiction, sci fi, sci-fi, scifi 30 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/synonyms_bidirect.txt: -------------------------------------------------------------------------------- 1 | # Often people erroneously equate linguistic synonyms 2 | # with Solr synonyms. 
Here the bidirectional nature 3 | # of the synonyms creates problems where the more specific 4 | # term is not prioritized 5 | wife,bride 6 | wife,spouse 7 | toons,tunes,cartoon -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/synonyms_directed.txt: -------------------------------------------------------------------------------- 1 | wife => wife, bride 2 | spouse => spouse, husband, wife, partner 3 | tunes => cartoons, toons, songs 4 | cartoon => toons, tunes -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/synonyms_genres.txt: -------------------------------------------------------------------------------- 1 | scifi,science fiction,science fiction movie -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/synonyms_multiterm.txt: -------------------------------------------------------------------------------- 1 | # Here are some multi term synonym to 2 | # see what happens at query time 3 | 4 | looney tunes, cartoons 5 | science fiction, sci fi, sci-fi, scifi -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/taxonomy.txt: -------------------------------------------------------------------------------- 1 | # Capture how the *user* structures information 2 | #looneytunes, looney tunes => looney_tunes, cartoons 3 | #bugs bunny => bug_bunny, looney_tunes, cartoons 4 | #mickey mouse => mickey_mouse, disney, cartoons 5 | #minnie mouse => minnie_mouse, disney, cartoons 6 | #donald duck => donald_duck, disney, cartoons 7 | #yogi bear => yogi_bear, disney, cartoons 8 | 9 | wife => wife, spouse 10 | bride => bride, spouse 11 | -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/tmdb/conf/taxonomy_parent.txt: 
-------------------------------------------------------------------------------- 1 | # Capture how the *user* structures information 2 | #looneytunes, looney tunes => looney_tunes 3 | #bugs bunny => bug_bunny, looney_tunes 4 | #mickey mouse => mickey_mouse, disney 5 | #minnie mouse => minnie_mouse, disney 6 | #donald duck => donald_duck, disney 7 | #yogi bear => yogi_bear, disney 8 | 9 | wife => wife, spouse 10 | bride => bride, spouse -------------------------------------------------------------------------------- /notebooks/solr/.docker/solr_home/zoo.cfg: -------------------------------------------------------------------------------- 1 | # The number of milliseconds of each tick 2 | tickTime=2000 3 | # The number of ticks that the initial 4 | # synchronization phase can take 5 | initLimit=10 6 | # The number of ticks that can pass between 7 | # sending a request and getting an acknowledgement 8 | syncLimit=5 9 | 10 | # the directory where the snapshot is stored. 11 | # dataDir=/opt/zookeeper/data 12 | # NOTE: Solr defaults the dataDir to /zoo_data 13 | 14 | # the port at which the clients will connect 15 | # clientPort=2181 16 | # NOTE: Solr sets this based on zkRun / zkHost params 17 | 18 | -------------------------------------------------------------------------------- /notebooks/solr/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM solr:8.11.1 2 | 3 | USER root 4 | 5 | ADD tmdb/solr_config /var/solr/data/configsets/tmdb 6 | RUN chown solr:solr /var/solr/data/configsets/tmdb 7 | 8 | ADD msmarco/solr_config /var/solr/data/configsets/msmarco 9 | RUN chown solr:solr /var/solr/data/configsets/msmarco 10 | 11 | USER solr 12 | 13 | CMD ["solr-foreground", "-Dsolr.ltr.enabled=true"] 14 | -------------------------------------------------------------------------------- /notebooks/solr/docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | solr: 3 | 
build: . 4 | expose: 5 | - "8983" 6 | ports: 7 | - "8983:8983" 8 | volumes: 9 | - data:/var/solr 10 | environment: 11 | SERVER_HOST: "0.0.0.0" 12 | mem_limit: 4096m 13 | mem_reservation: 4096m 14 | volumes: 15 | data: 16 | -------------------------------------------------------------------------------- /notebooks/solr/msmarco/ltr.py: -------------------------------------------------------------------------------- 1 | # Import a module with the same name from a different directory. 2 | # 3 | # Adapted from 4 | # https://mg.readthedocs.io/importing-local-python-modules-from-jupyter-notebooks/sys-path-in-helper-module/path-helper.html 5 | 6 | import importlib 7 | import os 8 | import sys 9 | 10 | sys.path.insert(0, os.path.abspath('../../../')) 11 | 12 | # Temporarily hijack __file__ to avoid adding names at module scope; 13 | # __file__ will be overwritten again during the reload() call. 14 | __file__ = {'sys': sys, 'importlib': importlib} 15 | 16 | del importlib 17 | del os 18 | del sys 19 | 20 | __file__['importlib'].reload(__file__['sys'].modules[__name__]) 21 | -------------------------------------------------------------------------------- /notebooks/solr/msmarco/solr_config/conf/elevate.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 26 | 27 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /notebooks/solr/msmarco/solr_config/conf/params.json: -------------------------------------------------------------------------------- 1 | {"params":{ 2 | "query":{ 3 | "defType":"edismax", 4 | "q.alt":"*:*", 5 | "rows":"10", 6 | "fl":"*,score", 7 | "":{"v":0} 8 | }, 9 | "facets":{ 10 | "facet":"on", 11 | "facet.mincount": "1", 12 | "":{"v":0} 13 | }, 14 | "velocity":{ 15 | "wt": "velocity", 16 | "v.template":"browse", 17 | "v.layout": "layout", 18 | "":{"v":0} 19 | } 20 | }} -------------------------------------------------------------------------------- 
/notebooks/solr/tmdb/evaluation (Solr).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# evaluate (Solr Edition)\n", 8 | "\n", 9 | "**Note:** This lab requires hello-ltr be run first. You must have the TMDB data indexed and LTR models configured before proceeding." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "### RRE\n", 17 | "This lab makes use of the rated-ranking-evaluator [project](https://github.com/SeaseLtd/rated-ranking-evaluator) to carry out evaluations on our models from the hello-ltr lab.\n", 18 | "\n", 19 | "An RRE configuration requires the following:\n", 20 | "\n", 21 | "- configuration_sets\n", 22 | " - This tells RRE about the Solr/Elastic instance to use for each evaluation\n", 23 | "- corpora (Not required for this setup)\n", 24 | " - RRE supports indexing a snapshot of data for evaluations. For this lab we'll be using the data indexed previously.\n", 25 | "- ratings\n", 26 | " - This folder houses json files with queries and ratings to be evaluated\n", 27 | "- templates\n", 28 | " - The queries to be run by each configuration set\n", 29 | "- pom.xml\n", 30 | " - Maven project configuration, here you can configure what metrics are calculated by the evalauation and format of the report.\n", 31 | " \n", 32 | "Take a look at the rre folder in the hello-ltr to get a better idea of the project layout and structure." 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "### Ratings and Evaluation\n", 40 | "To get started with RRE we first need some ratings. For this example we're going to use a query for \"batman\" and we're going to say that newer films are better than older ones. 
We will setup 3 different configuration sets in RRE:\n", 41 | "\n", 42 | "- baseline (No LTR applied)\n", 43 | "- classic (Rescore with the `classic` LTR model)\n", 44 | "- latest (Rescore with the `latest` LTR model)\n", 45 | "\n", 46 | "The snippet below will kick off an evaluation in RRE" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "from ltr import evaluate\n", 56 | "evaluate('solr')" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "### Looking at the Results\n", 64 | "In this example we have rating data for every result in the Batman query and we're not adjusting matching so `Precision` and `Recall` are the expected value of 1. However, since we've altered the sorting of results with LTR we can see a lift in `ERR` as our higher rated documents are coming up closer to the top of the results." 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "from ltr import rre_table\n", 74 | "rre_table()" 75 | ] 76 | } 77 | ], 78 | "metadata": { 79 | "kernelspec": { 80 | "display_name": "Python 3", 81 | "language": "python", 82 | "name": "python3" 83 | }, 84 | "language_info": { 85 | "codemirror_mode": { 86 | "name": "ipython", 87 | "version": 3 88 | }, 89 | "file_extension": ".py", 90 | "mimetype": "text/x-python", 91 | "name": "python", 92 | "nbconvert_exporter": "python", 93 | "pygments_lexer": "ipython3", 94 | "version": "3.7.6" 95 | } 96 | }, 97 | "nbformat": 4, 98 | "nbformat_minor": 2 99 | } 100 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/ltr.py: -------------------------------------------------------------------------------- 1 | # Import a module with the same name from a different directory. 
2 | # 3 | # Adapted from 4 | # https://mg.readthedocs.io/importing-local-python-modules-from-jupyter-notebooks/sys-path-in-helper-module/path-helper.html 5 | 6 | import importlib 7 | import os 8 | import sys 9 | 10 | sys.path.insert(0, os.path.abspath('../../../')) 11 | 12 | # Temporarily hijack __file__ to avoid adding names at module scope; 13 | # __file__ will be overwritten again during the reload() call. 14 | __file__ = {'sys': sys, 'importlib': importlib} 15 | 16 | del importlib 17 | del os 18 | del sys 19 | 20 | __file__['importlib'].reload(__file__['sys'].modules[__name__]) 21 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/elevate.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 26 | 27 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/idioms.txt: -------------------------------------------------------------------------------- 1 | # Idioms is a synonyms file that captures idiomatic phrases as single units 2 | 3 | # LHS is all representations encountered in query or document 4 | looneytunes, looney tunes, looney toons => 12345 5 | sci fi, scifi, science fiction => 56789 6 | 7 | #looneytunes, looney tunes => looney_tunes 8 | #bugs bunny => bug_bunny 9 | #mickey mouse => mickey_mouse 10 | #minnie mouse => minnie_mouse 11 | #donald duck => donald_duck 12 | #yogi bear => yogi_bear -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/contractions_ca.txt: -------------------------------------------------------------------------------- 1 | # Set of Catalan contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | d 4 | l 5 | m 6 | n 7 | s 8 | t 9 | -------------------------------------------------------------------------------- 
/notebooks/solr/tmdb/solr_config/conf/lang/contractions_fr.txt: -------------------------------------------------------------------------------- 1 | # Set of French contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | l 4 | m 5 | t 6 | qu 7 | n 8 | s 9 | j 10 | d 11 | c 12 | jusqu 13 | quoiqu 14 | lorsqu 15 | puisqu 16 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/contractions_ga.txt: -------------------------------------------------------------------------------- 1 | # Set of Irish contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | d 4 | m 5 | b 6 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/contractions_it.txt: -------------------------------------------------------------------------------- 1 | # Set of Italian contractions for ElisionFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | c 4 | l 5 | all 6 | dall 7 | dell 8 | nell 9 | sull 10 | coll 11 | pell 12 | gl 13 | agl 14 | dagl 15 | degl 16 | negl 17 | sugl 18 | un 19 | m 20 | t 21 | s 22 | v 23 | d 24 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/hyphenations_ga.txt: -------------------------------------------------------------------------------- 1 | # Set of Irish hyphenations for StopFilter 2 | # TODO: load this as a resource from the analyzer and sync it in build.xml 3 | h 4 | n 5 | t 6 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stemdict_nl.txt: -------------------------------------------------------------------------------- 1 | # Set of overrides for the dutch stemmer 2 | # TODO: load this as a resource from the analyzer and 
sync it in build.xml 3 | fiets fiets 4 | bromfiets bromfiets 5 | ei eier 6 | kind kinder 7 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_ar.txt: -------------------------------------------------------------------------------- 1 | # This file was created by Jacques Savoy and is distributed under the BSD license. 2 | # See http://members.unine.ch/jacques.savoy/clef/index.html. 3 | # Also see http://www.opensource.org/licenses/bsd-license.html 4 | # Cleaned on October 11, 2009 (not normalized, so use before normalization) 5 | # This means that when modifying this list, you might need to add some 6 | # redundant entries, for example containing forms with both أ and ا 7 | من 8 | ومن 9 | منها 10 | منه 11 | في 12 | وفي 13 | فيها 14 | فيه 15 | و 16 | ف 17 | ثم 18 | او 19 | أو 20 | ب 21 | بها 22 | به 23 | ا 24 | أ 25 | اى 26 | اي 27 | أي 28 | أى 29 | لا 30 | ولا 31 | الا 32 | ألا 33 | إلا 34 | لكن 35 | ما 36 | وما 37 | كما 38 | فما 39 | عن 40 | مع 41 | اذا 42 | إذا 43 | ان 44 | أن 45 | إن 46 | انها 47 | أنها 48 | إنها 49 | انه 50 | أنه 51 | إنه 52 | بان 53 | بأن 54 | فان 55 | فأن 56 | وان 57 | وأن 58 | وإن 59 | التى 60 | التي 61 | الذى 62 | الذي 63 | الذين 64 | الى 65 | الي 66 | إلى 67 | إلي 68 | على 69 | عليها 70 | عليه 71 | اما 72 | أما 73 | إما 74 | ايضا 75 | أيضا 76 | كل 77 | وكل 78 | لم 79 | ولم 80 | لن 81 | ولن 82 | هى 83 | هي 84 | هو 85 | وهى 86 | وهي 87 | وهو 88 | فهى 89 | فهي 90 | فهو 91 | انت 92 | أنت 93 | لك 94 | لها 95 | له 96 | هذه 97 | هذا 98 | تلك 99 | ذلك 100 | هناك 101 | كانت 102 | كان 103 | يكون 104 | تكون 105 | وكانت 106 | وكان 107 | غير 108 | بعض 109 | قد 110 | نحو 111 | بين 112 | بينما 113 | منذ 114 | ضمن 115 | حيث 116 | الان 117 | الآن 118 | خلال 119 | بعد 120 | قبل 121 | حتى 122 | عند 123 | عندما 124 | لدى 125 | جميع 126 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_bg.txt: 
-------------------------------------------------------------------------------- 1 | # This file was created by Jacques Savoy and is distributed under the BSD license. 2 | # See http://members.unine.ch/jacques.savoy/clef/index.html. 3 | # Also see http://www.opensource.org/licenses/bsd-license.html 4 | а 5 | аз 6 | ако 7 | ала 8 | бе 9 | без 10 | беше 11 | би 12 | бил 13 | била 14 | били 15 | било 16 | близо 17 | бъдат 18 | бъде 19 | бяха 20 | в 21 | вас 22 | ваш 23 | ваша 24 | вероятно 25 | вече 26 | взема 27 | ви 28 | вие 29 | винаги 30 | все 31 | всеки 32 | всички 33 | всичко 34 | всяка 35 | във 36 | въпреки 37 | върху 38 | г 39 | ги 40 | главно 41 | го 42 | д 43 | да 44 | дали 45 | до 46 | докато 47 | докога 48 | дори 49 | досега 50 | доста 51 | е 52 | едва 53 | един 54 | ето 55 | за 56 | зад 57 | заедно 58 | заради 59 | засега 60 | затова 61 | защо 62 | защото 63 | и 64 | из 65 | или 66 | им 67 | има 68 | имат 69 | иска 70 | й 71 | каза 72 | как 73 | каква 74 | какво 75 | както 76 | какъв 77 | като 78 | кога 79 | когато 80 | което 81 | които 82 | кой 83 | който 84 | колко 85 | която 86 | къде 87 | където 88 | към 89 | ли 90 | м 91 | ме 92 | между 93 | мен 94 | ми 95 | мнозина 96 | мога 97 | могат 98 | може 99 | моля 100 | момента 101 | му 102 | н 103 | на 104 | над 105 | назад 106 | най 107 | направи 108 | напред 109 | например 110 | нас 111 | не 112 | него 113 | нея 114 | ни 115 | ние 116 | никой 117 | нито 118 | но 119 | някои 120 | някой 121 | няма 122 | обаче 123 | около 124 | освен 125 | особено 126 | от 127 | отгоре 128 | отново 129 | още 130 | пак 131 | по 132 | повече 133 | повечето 134 | под 135 | поне 136 | поради 137 | после 138 | почти 139 | прави 140 | пред 141 | преди 142 | през 143 | при 144 | пък 145 | първо 146 | с 147 | са 148 | само 149 | се 150 | сега 151 | си 152 | скоро 153 | след 154 | сме 155 | според 156 | сред 157 | срещу 158 | сте 159 | съм 160 | със 161 | също 162 | т 163 | тази 164 | така 165 | такива 166 | такъв 167 | там 168 | 
твой 169 | те 170 | тези 171 | ти 172 | тн 173 | то 174 | това 175 | тогава 176 | този 177 | той 178 | толкова 179 | точно 180 | трябва 181 | тук 182 | тъй 183 | тя 184 | тях 185 | у 186 | харесва 187 | ч 188 | че 189 | често 190 | чрез 191 | ще 192 | щом 193 | я 194 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_ca.txt: -------------------------------------------------------------------------------- 1 | # Catalan stopwords from http://github.com/vcl/cue.language (Apache 2 Licensed) 2 | a 3 | abans 4 | ací 5 | ah 6 | així 7 | això 8 | al 9 | als 10 | aleshores 11 | algun 12 | alguna 13 | algunes 14 | alguns 15 | alhora 16 | allà 17 | allí 18 | allò 19 | altra 20 | altre 21 | altres 22 | amb 23 | ambdós 24 | ambdues 25 | apa 26 | aquell 27 | aquella 28 | aquelles 29 | aquells 30 | aquest 31 | aquesta 32 | aquestes 33 | aquests 34 | aquí 35 | baix 36 | cada 37 | cadascú 38 | cadascuna 39 | cadascunes 40 | cadascuns 41 | com 42 | contra 43 | d'un 44 | d'una 45 | d'unes 46 | d'uns 47 | dalt 48 | de 49 | del 50 | dels 51 | des 52 | després 53 | dins 54 | dintre 55 | donat 56 | doncs 57 | durant 58 | e 59 | eh 60 | el 61 | els 62 | em 63 | en 64 | encara 65 | ens 66 | entre 67 | érem 68 | eren 69 | éreu 70 | es 71 | és 72 | esta 73 | està 74 | estàvem 75 | estaven 76 | estàveu 77 | esteu 78 | et 79 | etc 80 | ets 81 | fins 82 | fora 83 | gairebé 84 | ha 85 | han 86 | has 87 | havia 88 | he 89 | hem 90 | heu 91 | hi 92 | ho 93 | i 94 | igual 95 | iguals 96 | ja 97 | l'hi 98 | la 99 | les 100 | li 101 | li'n 102 | llavors 103 | m'he 104 | ma 105 | mal 106 | malgrat 107 | mateix 108 | mateixa 109 | mateixes 110 | mateixos 111 | me 112 | mentre 113 | més 114 | meu 115 | meus 116 | meva 117 | meves 118 | molt 119 | molta 120 | moltes 121 | molts 122 | mon 123 | mons 124 | n'he 125 | n'hi 126 | ne 127 | ni 128 | no 129 | nogensmenys 130 | només 131 | nosaltres 132 | nostra 133 | nostre 134 
| nostres 135 | o 136 | oh 137 | oi 138 | on 139 | pas 140 | pel 141 | pels 142 | per 143 | però 144 | perquè 145 | poc 146 | poca 147 | pocs 148 | poques 149 | potser 150 | propi 151 | qual 152 | quals 153 | quan 154 | quant 155 | que 156 | què 157 | quelcom 158 | qui 159 | quin 160 | quina 161 | quines 162 | quins 163 | s'ha 164 | s'han 165 | sa 166 | semblant 167 | semblants 168 | ses 169 | seu 170 | seus 171 | seva 172 | seva 173 | seves 174 | si 175 | sobre 176 | sobretot 177 | sóc 178 | solament 179 | sols 180 | son 181 | són 182 | sons 183 | sota 184 | sou 185 | t'ha 186 | t'han 187 | t'he 188 | ta 189 | tal 190 | també 191 | tampoc 192 | tan 193 | tant 194 | tanta 195 | tantes 196 | teu 197 | teus 198 | teva 199 | teves 200 | ton 201 | tons 202 | tot 203 | tota 204 | totes 205 | tots 206 | un 207 | una 208 | unes 209 | uns 210 | us 211 | va 212 | vaig 213 | vam 214 | van 215 | vas 216 | veu 217 | vosaltres 218 | vostra 219 | vostre 220 | vostres 221 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_cz.txt: -------------------------------------------------------------------------------- 1 | a 2 | s 3 | k 4 | o 5 | i 6 | u 7 | v 8 | z 9 | dnes 10 | cz 11 | tímto 12 | budeš 13 | budem 14 | byli 15 | jseš 16 | můj 17 | svým 18 | ta 19 | tomto 20 | tohle 21 | tuto 22 | tyto 23 | jej 24 | zda 25 | proč 26 | máte 27 | tato 28 | kam 29 | tohoto 30 | kdo 31 | kteří 32 | mi 33 | nám 34 | tom 35 | tomuto 36 | mít 37 | nic 38 | proto 39 | kterou 40 | byla 41 | toho 42 | protože 43 | asi 44 | ho 45 | naši 46 | napište 47 | re 48 | což 49 | tím 50 | takže 51 | svých 52 | její 53 | svými 54 | jste 55 | aj 56 | tu 57 | tedy 58 | teto 59 | bylo 60 | kde 61 | ke 62 | pravé 63 | ji 64 | nad 65 | nejsou 66 | či 67 | pod 68 | téma 69 | mezi 70 | přes 71 | ty 72 | pak 73 | vám 74 | ani 75 | když 76 | však 77 | neg 78 | jsem 79 | tento 80 | článku 81 | články 82 | aby 83 | jsme 84 | před 85 | 
pta 86 | jejich 87 | byl 88 | ještě 89 | až 90 | bez 91 | také 92 | pouze 93 | první 94 | vaše 95 | která 96 | nás 97 | nový 98 | tipy 99 | pokud 100 | může 101 | strana 102 | jeho 103 | své 104 | jiné 105 | zprávy 106 | nové 107 | není 108 | vás 109 | jen 110 | podle 111 | zde 112 | už 113 | být 114 | více 115 | bude 116 | již 117 | než 118 | který 119 | by 120 | které 121 | co 122 | nebo 123 | ten 124 | tak 125 | má 126 | při 127 | od 128 | po 129 | jsou 130 | jak 131 | další 132 | ale 133 | si 134 | se 135 | ve 136 | to 137 | jako 138 | za 139 | zpět 140 | ze 141 | do 142 | pro 143 | je 144 | na 145 | atd 146 | atp 147 | jakmile 148 | přičemž 149 | já 150 | on 151 | ona 152 | ono 153 | oni 154 | ony 155 | my 156 | vy 157 | jí 158 | ji 159 | mě 160 | mne 161 | jemu 162 | tomu 163 | těm 164 | těmu 165 | němu 166 | němuž 167 | jehož 168 | jíž 169 | jelikož 170 | jež 171 | jakož 172 | načež 173 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_el.txt: -------------------------------------------------------------------------------- 1 | # Lucene Greek Stopwords list 2 | # Note: by default this file is used after GreekLowerCaseFilter, 3 | # so when modifying this file use 'σ' instead of 'ς' 4 | ο 5 | η 6 | το 7 | οι 8 | τα 9 | του 10 | τησ 11 | των 12 | τον 13 | την 14 | και 15 | κι 16 | κ 17 | ειμαι 18 | εισαι 19 | ειναι 20 | ειμαστε 21 | ειστε 22 | στο 23 | στον 24 | στη 25 | στην 26 | μα 27 | αλλα 28 | απο 29 | για 30 | προσ 31 | με 32 | σε 33 | ωσ 34 | παρα 35 | αντι 36 | κατα 37 | μετα 38 | θα 39 | να 40 | δε 41 | δεν 42 | μη 43 | μην 44 | επι 45 | ενω 46 | εαν 47 | αν 48 | τοτε 49 | που 50 | πωσ 51 | ποιοσ 52 | ποια 53 | ποιο 54 | ποιοι 55 | ποιεσ 56 | ποιων 57 | ποιουσ 58 | αυτοσ 59 | αυτη 60 | αυτο 61 | αυτοι 62 | αυτων 63 | αυτουσ 64 | αυτεσ 65 | αυτα 66 | εκεινοσ 67 | εκεινη 68 | εκεινο 69 | εκεινοι 70 | εκεινεσ 71 | εκεινα 72 | εκεινων 73 | εκεινουσ 74 | οπωσ 75 | ομωσ 76 | 
ισωσ 77 | οσο 78 | οτι 79 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_en.txt: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | # a couple of test stopwords to test that the words are really being 17 | # configured from this file: 18 | stopworda 19 | stopwordb 20 | 21 | # Standard english stop words taken from Lucene's StopAnalyzer 22 | a 23 | an 24 | and 25 | are 26 | as 27 | at 28 | be 29 | but 30 | by 31 | for 32 | if 33 | in 34 | into 35 | is 36 | it 37 | no 38 | not 39 | of 40 | on 41 | or 42 | such 43 | that 44 | the 45 | their 46 | then 47 | there 48 | these 49 | they 50 | this 51 | to 52 | was 53 | will 54 | with 55 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_eu.txt: -------------------------------------------------------------------------------- 1 | # example set of basque stopwords 2 | al 3 | anitz 4 | arabera 5 | asko 6 | baina 7 | bat 8 | batean 9 | batek 10 | bati 11 | batzuei 12 | batzuek 13 | batzuetan 14 | batzuk 15 | bera 16 | beraiek 17 | berau 18 | berauek 19 | bere 20 | berori 21 | beroriek 22 | beste 23 | bezala 24 | da 25 | dago 26 | dira 27 | ditu 28 | du 29 | dute 30 | edo 31 | egin 32 | ere 33 | eta 34 | eurak 35 | ez 36 | gainera 37 | gu 38 | gutxi 39 | guzti 40 | haiei 41 | haiek 42 | haietan 43 | hainbeste 44 | hala 45 | han 46 | handik 47 | hango 48 | hara 49 | hari 50 | hark 51 | hartan 52 | hau 53 | hauei 54 | hauek 55 | hauetan 56 | hemen 57 | hemendik 58 | hemengo 59 | hi 60 | hona 61 | honek 62 | honela 63 | honetan 64 | honi 65 | hor 66 | hori 67 | horiei 68 | horiek 69 | horietan 70 | horko 71 | horra 72 | horrek 73 | horrela 74 | horretan 75 | horri 76 | hortik 77 | hura 78 | izan 79 | ni 80 | noiz 81 | nola 82 | non 83 | nondik 84 | nongo 85 | nor 86 | nora 87 | ze 88 | zein 89 | zen 90 | zenbait 91 | zenbat 92 | zer 93 | zergatik 94 | ziren 95 | zituen 96 | zu 97 | zuek 98 | zuen 99 | zuten 100 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_fa.txt: 
-------------------------------------------------------------------------------- 1 | # This file was created by Jacques Savoy and is distributed under the BSD license. 2 | # See http://members.unine.ch/jacques.savoy/clef/index.html. 3 | # Also see http://www.opensource.org/licenses/bsd-license.html 4 | # Note: by default this file is used after normalization, so when adding entries 5 | # to this file, use the arabic 'ي' instead of 'ی' 6 | انان 7 | نداشته 8 | سراسر 9 | خياه 10 | ايشان 11 | وي 12 | تاكنون 13 | بيشتري 14 | دوم 15 | پس 16 | ناشي 17 | وگو 18 | يا 19 | داشتند 20 | سپس 21 | هنگام 22 | هرگز 23 | پنج 24 | نشان 25 | امسال 26 | ديگر 27 | گروهي 28 | شدند 29 | چطور 30 | ده 31 | و 32 | دو 33 | نخستين 34 | ولي 35 | چرا 36 | چه 37 | وسط 38 | ه 39 | كدام 40 | قابل 41 | يك 42 | رفت 43 | هفت 44 | همچنين 45 | در 46 | هزار 47 | بله 48 | بلي 49 | شايد 50 | اما 51 | شناسي 52 | گرفته 53 | دهد 54 | داشته 55 | دانست 56 | داشتن 57 | خواهيم 58 | ميليارد 59 | وقتيكه 60 | امد 61 | خواهد 62 | جز 63 | اورده 64 | شده 65 | بلكه 66 | خدمات 67 | شدن 68 | برخي 69 | نبود 70 | بسياري 71 | جلوگيري 72 | حق 73 | كردند 74 | نوعي 75 | بعري 76 | نكرده 77 | نظير 78 | نبايد 79 | بوده 80 | بودن 81 | داد 82 | اورد 83 | هست 84 | جايي 85 | شود 86 | دنبال 87 | داده 88 | بايد 89 | سابق 90 | هيچ 91 | همان 92 | انجا 93 | كمتر 94 | كجاست 95 | گردد 96 | كسي 97 | تر 98 | مردم 99 | تان 100 | دادن 101 | بودند 102 | سري 103 | جدا 104 | ندارند 105 | مگر 106 | يكديگر 107 | دارد 108 | دهند 109 | بنابراين 110 | هنگامي 111 | سمت 112 | جا 113 | انچه 114 | خود 115 | دادند 116 | زياد 117 | دارند 118 | اثر 119 | بدون 120 | بهترين 121 | بيشتر 122 | البته 123 | به 124 | براساس 125 | بيرون 126 | كرد 127 | بعضي 128 | گرفت 129 | توي 130 | اي 131 | ميليون 132 | او 133 | جريان 134 | تول 135 | بر 136 | مانند 137 | برابر 138 | باشيم 139 | مدتي 140 | گويند 141 | اكنون 142 | تا 143 | تنها 144 | جديد 145 | چند 146 | بي 147 | نشده 148 | كردن 149 | كردم 150 | گويد 151 | كرده 152 | كنيم 153 | نمي 154 | نزد 155 | روي 156 | قصد 157 | 
فقط 158 | بالاي 159 | ديگران 160 | اين 161 | ديروز 162 | توسط 163 | سوم 164 | ايم 165 | دانند 166 | سوي 167 | استفاده 168 | شما 169 | كنار 170 | داريم 171 | ساخته 172 | طور 173 | امده 174 | رفته 175 | نخست 176 | بيست 177 | نزديك 178 | طي 179 | كنيد 180 | از 181 | انها 182 | تمامي 183 | داشت 184 | يكي 185 | طريق 186 | اش 187 | چيست 188 | روب 189 | نمايد 190 | گفت 191 | چندين 192 | چيزي 193 | تواند 194 | ام 195 | ايا 196 | با 197 | ان 198 | ايد 199 | ترين 200 | اينكه 201 | ديگري 202 | راه 203 | هايي 204 | بروز 205 | همچنان 206 | پاعين 207 | كس 208 | حدود 209 | مختلف 210 | مقابل 211 | چيز 212 | گيرد 213 | ندارد 214 | ضد 215 | همچون 216 | سازي 217 | شان 218 | مورد 219 | باره 220 | مرسي 221 | خويش 222 | برخوردار 223 | چون 224 | خارج 225 | شش 226 | هنوز 227 | تحت 228 | ضمن 229 | هستيم 230 | گفته 231 | فكر 232 | بسيار 233 | پيش 234 | براي 235 | روزهاي 236 | انكه 237 | نخواهد 238 | بالا 239 | كل 240 | وقتي 241 | كي 242 | چنين 243 | كه 244 | گيري 245 | نيست 246 | است 247 | كجا 248 | كند 249 | نيز 250 | يابد 251 | بندي 252 | حتي 253 | توانند 254 | عقب 255 | خواست 256 | كنند 257 | بين 258 | تمام 259 | همه 260 | ما 261 | باشند 262 | مثل 263 | شد 264 | اري 265 | باشد 266 | اره 267 | طبق 268 | بعد 269 | اگر 270 | صورت 271 | غير 272 | جاي 273 | بيش 274 | ريزي 275 | اند 276 | زيرا 277 | چگونه 278 | بار 279 | لطفا 280 | مي 281 | درباره 282 | من 283 | ديده 284 | همين 285 | گذاري 286 | برداري 287 | علت 288 | گذاشته 289 | هم 290 | فوق 291 | نه 292 | ها 293 | شوند 294 | اباد 295 | همواره 296 | هر 297 | اول 298 | خواهند 299 | چهار 300 | نام 301 | امروز 302 | مان 303 | هاي 304 | قبل 305 | كنم 306 | سعي 307 | تازه 308 | را 309 | هستند 310 | زير 311 | جلوي 312 | عنوان 313 | بود 314 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_fi.txt: -------------------------------------------------------------------------------- 1 | | From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt 2 | | 
This file is distributed under the BSD License. 3 | | See http://snowball.tartarus.org/license.php 4 | | Also see http://www.opensource.org/licenses/bsd-license.html 5 | | - Encoding was converted to UTF-8. 6 | | - This notice was added. 7 | | 8 | | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" 9 | 10 | | forms of BE 11 | 12 | olla 13 | olen 14 | olet 15 | on 16 | olemme 17 | olette 18 | ovat 19 | ole | negative form 20 | 21 | oli 22 | olisi 23 | olisit 24 | olisin 25 | olisimme 26 | olisitte 27 | olisivat 28 | olit 29 | olin 30 | olimme 31 | olitte 32 | olivat 33 | ollut 34 | olleet 35 | 36 | en | negation 37 | et 38 | ei 39 | emme 40 | ette 41 | eivät 42 | 43 | |Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans 44 | minä minun minut minua minussa minusta minuun minulla minulta minulle | I 45 | sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you 46 | hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she 47 | me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we 48 | te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you 49 | he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they 50 | 51 | tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this 52 | tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that 53 | se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it 54 | nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these 55 | nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those 56 | ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they 57 | 58 | kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who 59 | ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl) 60 | mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | 
which what 61 | mitkä | (pl) 62 | 63 | joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which 64 | jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl) 65 | 66 | | conjunctions 67 | 68 | että | that 69 | ja | and 70 | jos | if 71 | koska | because 72 | kuin | than 73 | mutta | but 74 | niin | so 75 | sekä | and 76 | sillä | for 77 | tai | or 78 | vaan | but 79 | vai | or 80 | vaikka | although 81 | 82 | 83 | | prepositions 84 | 85 | kanssa | with 86 | mukaan | according to 87 | noin | about 88 | poikki | across 89 | yli | over, across 90 | 91 | | other 92 | 93 | kun | when 94 | niin | so 95 | nyt | now 96 | itse | self 97 | 98 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_ga.txt: -------------------------------------------------------------------------------- 1 | 2 | a 3 | ach 4 | ag 5 | agus 6 | an 7 | aon 8 | ar 9 | arna 10 | as 11 | b' 12 | ba 13 | beirt 14 | bhúr 15 | caoga 16 | ceathair 17 | ceathrar 18 | chomh 19 | chtó 20 | chuig 21 | chun 22 | cois 23 | céad 24 | cúig 25 | cúigear 26 | d' 27 | daichead 28 | dar 29 | de 30 | deich 31 | deichniúr 32 | den 33 | dhá 34 | do 35 | don 36 | dtí 37 | dá 38 | dár 39 | dó 40 | faoi 41 | faoin 42 | faoina 43 | faoinár 44 | fara 45 | fiche 46 | gach 47 | gan 48 | go 49 | gur 50 | haon 51 | hocht 52 | i 53 | iad 54 | idir 55 | in 56 | ina 57 | ins 58 | inár 59 | is 60 | le 61 | leis 62 | lena 63 | lenár 64 | m' 65 | mar 66 | mo 67 | mé 68 | na 69 | nach 70 | naoi 71 | naonúr 72 | ná 73 | ní 74 | níor 75 | nó 76 | nócha 77 | ocht 78 | ochtar 79 | os 80 | roimh 81 | sa 82 | seacht 83 | seachtar 84 | seachtó 85 | seasca 86 | seisear 87 | siad 88 | sibh 89 | sinn 90 | sna 91 | sé 92 | sí 93 | tar 94 | thar 95 | thú 96 | triúr 97 | trí 98 | trína 99 | trínár 100 | tríocha 101 | tú 102 | um 103 | ár 104 | é 105 | éis 106 | í 107 | ó 108 | ón 109 | óna 110 | ónár 111 | 
-------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_gl.txt: -------------------------------------------------------------------------------- 1 | # galican stopwords 2 | a 3 | aínda 4 | alí 5 | aquel 6 | aquela 7 | aquelas 8 | aqueles 9 | aquilo 10 | aquí 11 | ao 12 | aos 13 | as 14 | así 15 | á 16 | ben 17 | cando 18 | che 19 | co 20 | coa 21 | comigo 22 | con 23 | connosco 24 | contigo 25 | convosco 26 | coas 27 | cos 28 | cun 29 | cuns 30 | cunha 31 | cunhas 32 | da 33 | dalgunha 34 | dalgunhas 35 | dalgún 36 | dalgúns 37 | das 38 | de 39 | del 40 | dela 41 | delas 42 | deles 43 | desde 44 | deste 45 | do 46 | dos 47 | dun 48 | duns 49 | dunha 50 | dunhas 51 | e 52 | el 53 | ela 54 | elas 55 | eles 56 | en 57 | era 58 | eran 59 | esa 60 | esas 61 | ese 62 | eses 63 | esta 64 | estar 65 | estaba 66 | está 67 | están 68 | este 69 | estes 70 | estiven 71 | estou 72 | eu 73 | é 74 | facer 75 | foi 76 | foron 77 | fun 78 | había 79 | hai 80 | iso 81 | isto 82 | la 83 | las 84 | lle 85 | lles 86 | lo 87 | los 88 | mais 89 | me 90 | meu 91 | meus 92 | min 93 | miña 94 | miñas 95 | moi 96 | na 97 | nas 98 | neste 99 | nin 100 | no 101 | non 102 | nos 103 | nosa 104 | nosas 105 | noso 106 | nosos 107 | nós 108 | nun 109 | nunha 110 | nuns 111 | nunhas 112 | o 113 | os 114 | ou 115 | ó 116 | ós 117 | para 118 | pero 119 | pode 120 | pois 121 | pola 122 | polas 123 | polo 124 | polos 125 | por 126 | que 127 | se 128 | senón 129 | ser 130 | seu 131 | seus 132 | sexa 133 | sido 134 | sobre 135 | súa 136 | súas 137 | tamén 138 | tan 139 | te 140 | ten 141 | teñen 142 | teño 143 | ter 144 | teu 145 | teus 146 | ti 147 | tido 148 | tiña 149 | tiven 150 | túa 151 | túas 152 | un 153 | unha 154 | unhas 155 | uns 156 | vos 157 | vosa 158 | vosas 159 | voso 160 | vosos 161 | vós 162 | -------------------------------------------------------------------------------- 
/notebooks/solr/tmdb/solr_config/conf/lang/stopwords_hi.txt: -------------------------------------------------------------------------------- 1 | # Also see http://www.opensource.org/licenses/bsd-license.html 2 | # See http://members.unine.ch/jacques.savoy/clef/index.html. 3 | # This file was created by Jacques Savoy and is distributed under the BSD license. 4 | # Note: by default this file also contains forms normalized by HindiNormalizer 5 | # for spelling variation (see section below), such that it can be used whether or 6 | # not you enable that feature. When adding additional entries to this list, 7 | # please add the normalized form as well. 8 | अंदर 9 | अत 10 | अपना 11 | अपनी 12 | अपने 13 | अभी 14 | आदि 15 | आप 16 | इत्यादि 17 | इन 18 | इनका 19 | इन्हीं 20 | इन्हें 21 | इन्हों 22 | इस 23 | इसका 24 | इसकी 25 | इसके 26 | इसमें 27 | इसी 28 | इसे 29 | उन 30 | उनका 31 | उनकी 32 | उनके 33 | उनको 34 | उन्हीं 35 | उन्हें 36 | उन्हों 37 | उस 38 | उसके 39 | उसी 40 | उसे 41 | एक 42 | एवं 43 | एस 44 | ऐसे 45 | और 46 | कई 47 | कर 48 | करता 49 | करते 50 | करना 51 | करने 52 | करें 53 | कहते 54 | कहा 55 | का 56 | काफ़ी 57 | कि 58 | कितना 59 | किन्हें 60 | किन्हों 61 | किया 62 | किर 63 | किस 64 | किसी 65 | किसे 66 | की 67 | कुछ 68 | कुल 69 | के 70 | को 71 | कोई 72 | कौन 73 | कौनसा 74 | गया 75 | घर 76 | जब 77 | जहाँ 78 | जा 79 | जितना 80 | जिन 81 | जिन्हें 82 | जिन्हों 83 | जिस 84 | जिसे 85 | जीधर 86 | जैसा 87 | जैसे 88 | जो 89 | तक 90 | तब 91 | तरह 92 | तिन 93 | तिन्हें 94 | तिन्हों 95 | तिस 96 | तिसे 97 | तो 98 | था 99 | थी 100 | थे 101 | दबारा 102 | दिया 103 | दुसरा 104 | दूसरे 105 | दो 106 | द्वारा 107 | न 108 | नहीं 109 | ना 110 | निहायत 111 | नीचे 112 | ने 113 | पर 114 | पर 115 | पहले 116 | पूरा 117 | पे 118 | फिर 119 | बनी 120 | बही 121 | बहुत 122 | बाद 123 | बाला 124 | बिलकुल 125 | भी 126 | भीतर 127 | मगर 128 | मानो 129 | मे 130 | में 131 | यदि 132 | यह 133 | यहाँ 134 | यही 135 | या 136 | यिह 137 | ये 138 | रखें 139 | रहा 140 | रहे 141 | ऱ्वासा 142 | लिए 143 | लिये 
144 | लेकिन 145 | व 146 | वर्ग 147 | वह 148 | वह 149 | वहाँ 150 | वहीं 151 | वाले 152 | वुह 153 | वे 154 | वग़ैरह 155 | संग 156 | सकता 157 | सकते 158 | सबसे 159 | सभी 160 | साथ 161 | साबुत 162 | साभ 163 | सारा 164 | से 165 | सो 166 | ही 167 | हुआ 168 | हुई 169 | हुए 170 | है 171 | हैं 172 | हो 173 | होता 174 | होती 175 | होते 176 | होना 177 | होने 178 | # additional normalized forms of the above 179 | अपनि 180 | जेसे 181 | होति 182 | सभि 183 | तिंहों 184 | इंहों 185 | दवारा 186 | इसि 187 | किंहें 188 | थि 189 | उंहों 190 | ओर 191 | जिंहें 192 | वहिं 193 | अभि 194 | बनि 195 | हि 196 | उंहिं 197 | उंहें 198 | हें 199 | वगेरह 200 | एसे 201 | रवासा 202 | कोन 203 | निचे 204 | काफि 205 | उसि 206 | पुरा 207 | भितर 208 | हे 209 | बहि 210 | वहां 211 | कोइ 212 | यहां 213 | जिंहों 214 | तिंहें 215 | किसि 216 | कइ 217 | यहि 218 | इंहिं 219 | जिधर 220 | इंहें 221 | अदि 222 | इतयादि 223 | हुइ 224 | कोनसा 225 | इसकि 226 | दुसरे 227 | जहां 228 | अप 229 | किंहों 230 | उनकि 231 | भि 232 | वरग 233 | हुअ 234 | जेसा 235 | नहिं 236 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_hu.txt: -------------------------------------------------------------------------------- 1 | | From svn.tartarus.org/snowball/trunk/website/algorithms/hungarian/stop.txt 2 | | This file is distributed under the BSD License. 3 | | See http://snowball.tartarus.org/license.php 4 | | Also see http://www.opensource.org/licenses/bsd-license.html 5 | | - Encoding was converted to UTF-8. 6 | | - This notice was added. 
7 | | 8 | | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" 9 | 10 | | Hungarian stop word list 11 | | prepared by Anna Tordai 12 | 13 | a 14 | ahogy 15 | ahol 16 | aki 17 | akik 18 | akkor 19 | alatt 20 | által 21 | általában 22 | amely 23 | amelyek 24 | amelyekben 25 | amelyeket 26 | amelyet 27 | amelynek 28 | ami 29 | amit 30 | amolyan 31 | amíg 32 | amikor 33 | át 34 | abban 35 | ahhoz 36 | annak 37 | arra 38 | arról 39 | az 40 | azok 41 | azon 42 | azt 43 | azzal 44 | azért 45 | aztán 46 | azután 47 | azonban 48 | bár 49 | be 50 | belül 51 | benne 52 | cikk 53 | cikkek 54 | cikkeket 55 | csak 56 | de 57 | e 58 | eddig 59 | egész 60 | egy 61 | egyes 62 | egyetlen 63 | egyéb 64 | egyik 65 | egyre 66 | ekkor 67 | el 68 | elég 69 | ellen 70 | elő 71 | először 72 | előtt 73 | első 74 | én 75 | éppen 76 | ebben 77 | ehhez 78 | emilyen 79 | ennek 80 | erre 81 | ez 82 | ezt 83 | ezek 84 | ezen 85 | ezzel 86 | ezért 87 | és 88 | fel 89 | felé 90 | hanem 91 | hiszen 92 | hogy 93 | hogyan 94 | igen 95 | így 96 | illetve 97 | ill. 
98 | ill 99 | ilyen 100 | ilyenkor 101 | ison 102 | ismét 103 | itt 104 | jó 105 | jól 106 | jobban 107 | kell 108 | kellett 109 | keresztül 110 | keressünk 111 | ki 112 | kívül 113 | között 114 | közül 115 | legalább 116 | lehet 117 | lehetett 118 | legyen 119 | lenne 120 | lenni 121 | lesz 122 | lett 123 | maga 124 | magát 125 | majd 126 | majd 127 | már 128 | más 129 | másik 130 | meg 131 | még 132 | mellett 133 | mert 134 | mely 135 | melyek 136 | mi 137 | mit 138 | míg 139 | miért 140 | milyen 141 | mikor 142 | minden 143 | mindent 144 | mindenki 145 | mindig 146 | mint 147 | mintha 148 | mivel 149 | most 150 | nagy 151 | nagyobb 152 | nagyon 153 | ne 154 | néha 155 | nekem 156 | neki 157 | nem 158 | néhány 159 | nélkül 160 | nincs 161 | olyan 162 | ott 163 | össze 164 | ő 165 | ők 166 | őket 167 | pedig 168 | persze 169 | rá 170 | s 171 | saját 172 | sem 173 | semmi 174 | sok 175 | sokat 176 | sokkal 177 | számára 178 | szemben 179 | szerint 180 | szinte 181 | talán 182 | tehát 183 | teljes 184 | tovább 185 | továbbá 186 | több 187 | úgy 188 | ugyanis 189 | új 190 | újabb 191 | újra 192 | után 193 | utána 194 | utolsó 195 | vagy 196 | vagyis 197 | valaki 198 | valami 199 | valamint 200 | való 201 | vagyok 202 | van 203 | vannak 204 | volt 205 | voltam 206 | voltak 207 | voltunk 208 | vissza 209 | vele 210 | viszont 211 | volna 212 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_hy.txt: -------------------------------------------------------------------------------- 1 | # example set of Armenian stopwords. 
2 | այդ 3 | այլ 4 | այն 5 | այս 6 | դու 7 | դուք 8 | եմ 9 | են 10 | ենք 11 | ես 12 | եք 13 | է 14 | էի 15 | էին 16 | էինք 17 | էիր 18 | էիք 19 | էր 20 | ըստ 21 | թ 22 | ի 23 | ին 24 | իսկ 25 | իր 26 | կամ 27 | համար 28 | հետ 29 | հետո 30 | մենք 31 | մեջ 32 | մի 33 | ն 34 | նա 35 | նաև 36 | նրա 37 | նրանք 38 | որ 39 | որը 40 | որոնք 41 | որպես 42 | ու 43 | ում 44 | պիտի 45 | վրա 46 | և 47 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_ja.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file defines a stopword set for Japanese. 3 | # 4 | # This set is made up of hand-picked frequent terms from segmented Japanese Wikipedia. 5 | # Punctuation characters and frequent kanji have mostly been left out. See LUCENE-3745 6 | # for frequency lists, etc. that can be useful for making your own set (if desired) 7 | # 8 | # Note that there is an overlap between these stopwords and the terms stopped when used 9 | # in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note 10 | # that comments are not allowed on the same line as stopwords. 11 | # 12 | # Also note that stopping is done in a case-insensitive manner. Change your StopFilter 13 | # configuration if you need case-sensitive stopping. Lastly, note that stopping is done 14 | # using the same character width as the entries in this file. Since this StopFilter is 15 | # normally done after a CJKWidthFilter in your chain, you would usually want your romaji 16 | # entries to be in half-width and your kana entries to be in full-width. 
17 | # 18 | の 19 | に 20 | は 21 | を 22 | た 23 | が 24 | で 25 | て 26 | と 27 | し 28 | れ 29 | さ 30 | ある 31 | いる 32 | も 33 | する 34 | から 35 | な 36 | こと 37 | として 38 | い 39 | や 40 | れる 41 | など 42 | なっ 43 | ない 44 | この 45 | ため 46 | その 47 | あっ 48 | よう 49 | また 50 | もの 51 | という 52 | あり 53 | まで 54 | られ 55 | なる 56 | へ 57 | か 58 | だ 59 | これ 60 | によって 61 | により 62 | おり 63 | より 64 | による 65 | ず 66 | なり 67 | られる 68 | において 69 | ば 70 | なかっ 71 | なく 72 | しかし 73 | について 74 | せ 75 | だっ 76 | その後 77 | できる 78 | それ 79 | う 80 | ので 81 | なお 82 | のみ 83 | でき 84 | き 85 | つ 86 | における 87 | および 88 | いう 89 | さらに 90 | でも 91 | ら 92 | たり 93 | その他 94 | に関する 95 | たち 96 | ます 97 | ん 98 | なら 99 | に対して 100 | 特に 101 | せる 102 | 及び 103 | これら 104 | とき 105 | では 106 | にて 107 | ほか 108 | ながら 109 | うち 110 | そして 111 | とともに 112 | ただし 113 | かつて 114 | それぞれ 115 | または 116 | お 117 | ほど 118 | ものの 119 | に対する 120 | ほとんど 121 | と共に 122 | といった 123 | です 124 | とも 125 | ところ 126 | ここ 127 | ##### End of file 128 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_lv.txt: -------------------------------------------------------------------------------- 1 | # Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins 2 | # the original list of over 800 forms was refined: 3 | # pronouns, adverbs, interjections were removed 4 | # 5 | # prepositions 6 | aiz 7 | ap 8 | ar 9 | apakš 10 | ārpus 11 | augšpus 12 | bez 13 | caur 14 | dēļ 15 | gar 16 | iekš 17 | iz 18 | kopš 19 | labad 20 | lejpus 21 | līdz 22 | no 23 | otrpus 24 | pa 25 | par 26 | pār 27 | pēc 28 | pie 29 | pirms 30 | pret 31 | priekš 32 | starp 33 | šaipus 34 | uz 35 | viņpus 36 | virs 37 | virspus 38 | zem 39 | apakšpus 40 | # Conjunctions 41 | un 42 | bet 43 | jo 44 | ja 45 | ka 46 | lai 47 | tomēr 48 | tikko 49 | turpretī 50 | arī 51 | kaut 52 | gan 53 | tādēļ 54 | tā 55 | ne 56 | tikvien 57 | vien 58 | kā 59 | ir 60 | te 61 | vai 62 | kamēr 63 | # Particles 64 | ar 65 | diezin 66 
| droši 67 | diemžēl 68 | nebūt 69 | ik 70 | it 71 | taču 72 | nu 73 | pat 74 | tiklab 75 | iekšpus 76 | nedz 77 | tik 78 | nevis 79 | turpretim 80 | jeb 81 | iekam 82 | iekām 83 | iekāms 84 | kolīdz 85 | līdzko 86 | tiklīdz 87 | jebšu 88 | tālab 89 | tāpēc 90 | nekā 91 | itin 92 | jā 93 | jau 94 | jel 95 | nē 96 | nezin 97 | tad 98 | tikai 99 | vis 100 | tak 101 | iekams 102 | vien 103 | # modal verbs 104 | būt 105 | biju 106 | biji 107 | bija 108 | bijām 109 | bijāt 110 | esmu 111 | esi 112 | esam 113 | esat 114 | būšu 115 | būsi 116 | būs 117 | būsim 118 | būsiet 119 | tikt 120 | tiku 121 | tiki 122 | tika 123 | tikām 124 | tikāt 125 | tieku 126 | tiec 127 | tiek 128 | tiekam 129 | tiekat 130 | tikšu 131 | tiks 132 | tiksim 133 | tiksiet 134 | tapt 135 | tapi 136 | tapāt 137 | topat 138 | tapšu 139 | tapsi 140 | taps 141 | tapsim 142 | tapsiet 143 | kļūt 144 | kļuvu 145 | kļuvi 146 | kļuva 147 | kļuvām 148 | kļuvāt 149 | kļūstu 150 | kļūsti 151 | kļūst 152 | kļūstam 153 | kļūstat 154 | kļūšu 155 | kļūsi 156 | kļūs 157 | kļūsim 158 | kļūsiet 159 | # verbs 160 | varēt 161 | varēju 162 | varējām 163 | varēšu 164 | varēsim 165 | var 166 | varēji 167 | varējāt 168 | varēsi 169 | varēsiet 170 | varat 171 | varēja 172 | varēs 173 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_ro.txt: -------------------------------------------------------------------------------- 1 | # This file was created by Jacques Savoy and is distributed under the BSD license. 2 | # See http://members.unine.ch/jacques.savoy/clef/index.html. 
3 | # Also see http://www.opensource.org/licenses/bsd-license.html 4 | acea 5 | aceasta 6 | această 7 | aceea 8 | acei 9 | aceia 10 | acel 11 | acela 12 | acele 13 | acelea 14 | acest 15 | acesta 16 | aceste 17 | acestea 18 | aceşti 19 | aceştia 20 | acolo 21 | acum 22 | ai 23 | aia 24 | aibă 25 | aici 26 | al 27 | ăla 28 | ale 29 | alea 30 | ălea 31 | altceva 32 | altcineva 33 | am 34 | ar 35 | are 36 | aş 37 | aşadar 38 | asemenea 39 | asta 40 | ăsta 41 | astăzi 42 | astea 43 | ăstea 44 | ăştia 45 | asupra 46 | aţi 47 | au 48 | avea 49 | avem 50 | aveţi 51 | azi 52 | bine 53 | bucur 54 | bună 55 | ca 56 | că 57 | căci 58 | când 59 | care 60 | cărei 61 | căror 62 | cărui 63 | cât 64 | câte 65 | câţi 66 | către 67 | câtva 68 | ce 69 | cel 70 | ceva 71 | chiar 72 | cînd 73 | cine 74 | cineva 75 | cît 76 | cîte 77 | cîţi 78 | cîtva 79 | contra 80 | cu 81 | cum 82 | cumva 83 | curând 84 | curînd 85 | da 86 | dă 87 | dacă 88 | dar 89 | datorită 90 | de 91 | deci 92 | deja 93 | deoarece 94 | departe 95 | deşi 96 | din 97 | dinaintea 98 | dintr 99 | dintre 100 | drept 101 | după 102 | ea 103 | ei 104 | el 105 | ele 106 | eram 107 | este 108 | eşti 109 | eu 110 | face 111 | fără 112 | fi 113 | fie 114 | fiecare 115 | fii 116 | fim 117 | fiţi 118 | iar 119 | ieri 120 | îi 121 | îl 122 | îmi 123 | împotriva 124 | în 125 | înainte 126 | înaintea 127 | încât 128 | încît 129 | încotro 130 | între 131 | întrucât 132 | întrucît 133 | îţi 134 | la 135 | lângă 136 | le 137 | li 138 | lîngă 139 | lor 140 | lui 141 | mă 142 | mâine 143 | mea 144 | mei 145 | mele 146 | mereu 147 | meu 148 | mi 149 | mine 150 | mult 151 | multă 152 | mulţi 153 | ne 154 | nicăieri 155 | nici 156 | nimeni 157 | nişte 158 | noastră 159 | noastre 160 | noi 161 | noştri 162 | nostru 163 | nu 164 | ori 165 | oricând 166 | oricare 167 | oricât 168 | orice 169 | oricînd 170 | oricine 171 | oricît 172 | oricum 173 | oriunde 174 | până 175 | pe 176 | pentru 177 | peste 178 | pînă 179 | poate 180 | pot 181 | 
prea 182 | prima 183 | primul 184 | prin 185 | printr 186 | sa 187 | să 188 | săi 189 | sale 190 | sau 191 | său 192 | se 193 | şi 194 | sînt 195 | sîntem 196 | sînteţi 197 | spre 198 | sub 199 | sunt 200 | suntem 201 | sunteţi 202 | ta 203 | tăi 204 | tale 205 | tău 206 | te 207 | ţi 208 | ţie 209 | tine 210 | toată 211 | toate 212 | tot 213 | toţi 214 | totuşi 215 | tu 216 | un 217 | una 218 | unde 219 | undeva 220 | unei 221 | unele 222 | uneori 223 | unor 224 | vă 225 | vi 226 | voastră 227 | voastre 228 | voi 229 | voştri 230 | vostru 231 | vouă 232 | vreo 233 | vreun 234 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_th.txt: -------------------------------------------------------------------------------- 1 | # Thai stopwords from: 2 | # "Opinion Detection in Thai Political News Columns 3 | # Based on Subjectivity Analysis" 4 | # Khampol Sukhum, Supot Nitsuwat, and Choochart Haruechaiyasak 5 | ไว้ 6 | ไม่ 7 | ไป 8 | ได้ 9 | ให้ 10 | ใน 11 | โดย 12 | แห่ง 13 | แล้ว 14 | และ 15 | แรก 16 | แบบ 17 | แต่ 18 | เอง 19 | เห็น 20 | เลย 21 | เริ่ม 22 | เรา 23 | เมื่อ 24 | เพื่อ 25 | เพราะ 26 | เป็นการ 27 | เป็น 28 | เปิดเผย 29 | เปิด 30 | เนื่องจาก 31 | เดียวกัน 32 | เดียว 33 | เช่น 34 | เฉพาะ 35 | เคย 36 | เข้า 37 | เขา 38 | อีก 39 | อาจ 40 | อะไร 41 | ออก 42 | อย่าง 43 | อยู่ 44 | อยาก 45 | หาก 46 | หลาย 47 | หลังจาก 48 | หลัง 49 | หรือ 50 | หนึ่ง 51 | ส่วน 52 | ส่ง 53 | สุด 54 | สําหรับ 55 | ว่า 56 | วัน 57 | ลง 58 | ร่วม 59 | ราย 60 | รับ 61 | ระหว่าง 62 | รวม 63 | ยัง 64 | มี 65 | มาก 66 | มา 67 | พร้อม 68 | พบ 69 | ผ่าน 70 | ผล 71 | บาง 72 | น่า 73 | นี้ 74 | นํา 75 | นั้น 76 | นัก 77 | นอกจาก 78 | ทุก 79 | ที่สุด 80 | ที่ 81 | ทําให้ 82 | ทํา 83 | ทาง 84 | ทั้งนี้ 85 | ทั้ง 86 | ถ้า 87 | ถูก 88 | ถึง 89 | ต้อง 90 | ต่างๆ 91 | ต่าง 92 | ต่อ 93 | ตาม 94 | ตั้งแต่ 95 | ตั้ง 96 | ด้าน 97 | ด้วย 98 | ดัง 99 | ซึ่ง 100 | ช่วง 101 | จึง 102 | จาก 103 | จัด 104 | จะ 105 | คือ 106 | 
ความ 107 | ครั้ง 108 | คง 109 | ขึ้น 110 | ของ 111 | ขอ 112 | ขณะ 113 | ก่อน 114 | ก็ 115 | การ 116 | กับ 117 | กัน 118 | กว่า 119 | กล่าว 120 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/stopwords_tr.txt: -------------------------------------------------------------------------------- 1 | # Turkish stopwords from LUCENE-559 2 | # merged with the list from "Information Retrieval on Turkish Texts" 3 | # (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf) 4 | acaba 5 | altmış 6 | altı 7 | ama 8 | ancak 9 | arada 10 | aslında 11 | ayrıca 12 | bana 13 | bazı 14 | belki 15 | ben 16 | benden 17 | beni 18 | benim 19 | beri 20 | beş 21 | bile 22 | bin 23 | bir 24 | birçok 25 | biri 26 | birkaç 27 | birkez 28 | birşey 29 | birşeyi 30 | biz 31 | bize 32 | bizden 33 | bizi 34 | bizim 35 | böyle 36 | böylece 37 | bu 38 | buna 39 | bunda 40 | bundan 41 | bunlar 42 | bunları 43 | bunların 44 | bunu 45 | bunun 46 | burada 47 | çok 48 | çünkü 49 | da 50 | daha 51 | dahi 52 | de 53 | defa 54 | değil 55 | diğer 56 | diye 57 | doksan 58 | dokuz 59 | dolayı 60 | dolayısıyla 61 | dört 62 | edecek 63 | eden 64 | ederek 65 | edilecek 66 | ediliyor 67 | edilmesi 68 | ediyor 69 | eğer 70 | elli 71 | en 72 | etmesi 73 | etti 74 | ettiği 75 | ettiğini 76 | gibi 77 | göre 78 | halen 79 | hangi 80 | hatta 81 | hem 82 | henüz 83 | hep 84 | hepsi 85 | her 86 | herhangi 87 | herkesin 88 | hiç 89 | hiçbir 90 | için 91 | iki 92 | ile 93 | ilgili 94 | ise 95 | işte 96 | itibaren 97 | itibariyle 98 | kadar 99 | karşın 100 | katrilyon 101 | kendi 102 | kendilerine 103 | kendini 104 | kendisi 105 | kendisine 106 | kendisini 107 | kez 108 | ki 109 | kim 110 | kimden 111 | kime 112 | kimi 113 | kimse 114 | kırk 115 | milyar 116 | milyon 117 | mu 118 | mü 119 | mı 120 | nasıl 121 | ne 122 | neden 123 | nedenle 124 | nerde 125 | nerede 126 | nereye 127 | niye 128 | niçin 129 | o 130 | olan 131 | olarak 132 | oldu 
133 | olduğu 134 | olduğunu 135 | olduklarını 136 | olmadı 137 | olmadığı 138 | olmak 139 | olması 140 | olmayan 141 | olmaz 142 | olsa 143 | olsun 144 | olup 145 | olur 146 | olursa 147 | oluyor 148 | on 149 | ona 150 | ondan 151 | onlar 152 | onlardan 153 | onları 154 | onların 155 | onu 156 | onun 157 | otuz 158 | oysa 159 | öyle 160 | pek 161 | rağmen 162 | sadece 163 | sanki 164 | sekiz 165 | seksen 166 | sen 167 | senden 168 | seni 169 | senin 170 | siz 171 | sizden 172 | sizi 173 | sizin 174 | şey 175 | şeyden 176 | şeyi 177 | şeyler 178 | şöyle 179 | şu 180 | şuna 181 | şunda 182 | şundan 183 | şunları 184 | şunu 185 | tarafından 186 | trilyon 187 | tüm 188 | üç 189 | üzere 190 | var 191 | vardı 192 | ve 193 | veya 194 | ya 195 | yani 196 | yapacak 197 | yapılan 198 | yapılması 199 | yapıyor 200 | yapmak 201 | yaptı 202 | yaptığı 203 | yaptığını 204 | yaptıkları 205 | yedi 206 | yerine 207 | yetmiş 208 | yine 209 | yirmi 210 | yoksa 211 | yüz 212 | zaten 213 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/lang/userdict_ja.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This is a sample user dictionary for Kuromoji (JapaneseTokenizer) 3 | # 4 | # Add entries to this file in order to override the statistical model in terms 5 | # of segmentation, readings and part-of-speech tags. Notice that entries do 6 | # not have weights since they are always used when found. This is by-design 7 | # in order to maximize ease-of-use. 8 | # 9 | # Entries are defined using the following CSV format: 10 | # , ... , ... , 11 | # 12 | # Notice that a single half-width space separates tokens and readings, and 13 | # that the number tokens and readings must match exactly. 14 | # 15 | # Also notice that multiple entries with the same is undefined. 16 | # 17 | # Whitespace only lines are ignored. Comments are not allowed on entry lines. 
18 | # 19 | 20 | # Custom segmentation for kanji compounds 21 | 日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞 22 | 関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞 23 | 24 | # Custom segmentation for compound katakana 25 | トートバッグ,トート バッグ,トート バッグ,かずカナ名詞 26 | ショルダーバッグ,ショルダー バッグ,ショルダー バッグ,かずカナ名詞 27 | 28 | # Custom reading for former sumo wrestler 29 | 朝青龍,朝青龍,アサショウリュウ,カスタム人名 30 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/name_synonyms.txt: -------------------------------------------------------------------------------- 1 | sky walker, skywalker -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/names.txt: -------------------------------------------------------------------------------- 1 | luke_skywalker -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/params.json: -------------------------------------------------------------------------------- 1 | {"params":{ 2 | "query":{ 3 | "defType":"edismax", 4 | "q.alt":"*:*", 5 | "rows":"10", 6 | "fl":"*,score", 7 | "":{"v":0} 8 | }, 9 | "facets":{ 10 | "facet":"on", 11 | "facet.mincount": "1", 12 | "":{"v":0} 13 | }, 14 | "velocity":{ 15 | "wt": "velocity", 16 | "v.template":"browse", 17 | "v.layout": "layout", 18 | "":{"v":0} 19 | } 20 | }} -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/protwords.txt: -------------------------------------------------------------------------------- 1 | # The ASF licenses this file to You under the Apache License, Version 2.0 2 | # (the "License"); you may not use this file except in compliance with 3 | # the License. 
You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | #----------------------------------------------------------------------- 14 | # Use a protected word file to protect against the stemmer reducing two 15 | # unrelated words to the same base word. 16 | 17 | # Some non-words that normally won't be encountered, 18 | # just to test that they won't be stemmed. 19 | dontstems 20 | zwhacky 21 | 22 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/stopwords.txt: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/synonyms.txt: -------------------------------------------------------------------------------- 1 | # The ASF licenses this file to You under the Apache License, Version 2.0 2 | # (the "License"); you may not use this file except in compliance with 3 | # the License. You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | #----------------------------------------------------------------------- 14 | #some test synonym mappings unlikely to appear in real input text 15 | 16 | # Demonstrating bidirectional synonyms 17 | #wife,bride 18 | #wife,spouse 19 | #toons,tunes,cartoon 20 | 21 | # Demonstrating => syntax 22 | # wife => wife, bride 23 | # spouse => spouse, husband, wife, partner 24 | # tunes => cartoons, toons, songs 25 | # cartoon => toons, tunes 26 | 27 | # Demonstrating multi phrase 28 | #looney tunes, cartoons 29 | #science fiction, sci fi, sci-fi, scifi 30 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/synonyms_bidirect.txt: -------------------------------------------------------------------------------- 1 | # Often people erroneously equate linguistic synonyms 2 | # with Solr synonyms. 
Here the bidirectional nature 3 | # of the synonyms creates problems where the more specific 4 | # term is not prioritized 5 | wife,bride 6 | wife,spouse 7 | toons,tunes,cartoon -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/synonyms_directed.txt: -------------------------------------------------------------------------------- 1 | wife => wife, bride 2 | spouse => spouse, husband, wife, partner 3 | tunes => cartoons, toons, songs 4 | cartoon => toons, tunes -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/synonyms_genres.txt: -------------------------------------------------------------------------------- 1 | scifi,science fiction,science fiction movie -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/synonyms_multiterm.txt: -------------------------------------------------------------------------------- 1 | # Here are some multi term synonym to 2 | # see what happens at query time 3 | 4 | looney tunes, cartoons 5 | science fiction, sci fi, sci-fi, scifi -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/taxonomy.txt: -------------------------------------------------------------------------------- 1 | # Capture how the *user* structures information 2 | #looneytunes, looney tunes => looney_tunes, cartoons 3 | #bugs bunny => bug_bunny, looney_tunes, cartoons 4 | #mickey mouse => mickey_mouse, disney, cartoons 5 | #minnie mouse => minnie_mouse, disney, cartoons 6 | #donald duck => donald_duck, disney, cartoons 7 | #yogi bear => yogi_bear, disney, cartoons 8 | 9 | wife => wife, spouse 10 | bride => bride, spouse 11 | -------------------------------------------------------------------------------- /notebooks/solr/tmdb/solr_config/conf/taxonomy_parent.txt: 
-------------------------------------------------------------------------------- 1 | # Capture how the *user* structures information 2 | #looneytunes, looney tunes => looney_tunes 3 | #bugs bunny => bug_bunny, looney_tunes 4 | #mickey mouse => mickey_mouse, disney 5 | #minnie mouse => minnie_mouse, disney 6 | #donald duck => donald_duck, disney 7 | #yogi bear => yogi_bear, disney 8 | 9 | wife => wife, spouse 10 | bride => bride, spouse -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | alembic==1.7.6 3 | appnope==0.1.2 4 | attrs==21.4.0 5 | backcall==0.2.0 6 | bleach==4.1.0 7 | certifi==2022.12.07 8 | chardet==4.0.0 9 | cycler==0.11.0 10 | Cython 11 | decorator==4.3.2 12 | defusedxml==0.5.0 13 | elasticsearch==7.16.2 14 | entrypoints==0.3 15 | fuzzywuzzy==0.18.0 16 | graphviz==0.19.1 17 | idna==3.3 18 | ipykernel==6.4.2 19 | ipython==7.31.1 20 | ipython-genutils==0.2.0 21 | ipywidgets==7.6.5 22 | jedi==0.18.1 23 | Jinja2==3.0.3 24 | joblib==1.1.1 25 | jsonschema==4.4.0 26 | jupyter 27 | jupyter-client==7.4.4 28 | jupyter-console==6.4.0 29 | jupyter-core==4.12 30 | kiwisolver==1.3.2 31 | Mako==1.1.6 32 | MarkupSafe==2.0.1 33 | matplotlib==3.7.2 34 | mistune 35 | mizani==0.9.2 36 | nbconvert==6.5.0 37 | nbformat==5.3.0 38 | nbgrader 39 | nbstripout==0.5.0 40 | notebook==6.4.8 41 | numpy==1.23.5 42 | opensearch-py==2.2.0 43 | pandas==2.0.3 44 | pandocfilters==1.5.0 45 | parso==0.8.3 46 | pexpect==4.8.0 47 | pickleshare==0.7.5 48 | plotly==5.5.0 49 | plotnine==0.12.2 50 | prometheus-client==0.13.1 51 | prompt-toolkit==3.0.26 52 | ptyprocess==0.7.0 53 | Pygments==2.11.2 54 | pyparsing==3.0.7 55 | pyrsistent==0.18.1 56 | python-dateutil==2.8.2 57 | python-editor==1.0.4 58 | pytz==2021.3 59 | pyzmq==25.1.1 60 | qtconsole==5.2.2 61 | requests==2.27.1 62 | retrying==1.3.3 63 | scikit-learn==1.3.0 64 | scipy==1.10.1 65 | seaborn==0.11.2 66 
| Send2Trash==1.8.0 67 | six==1.16.0 68 | #sklearn==1.3.0 69 | SQLAlchemy==1.3.24 70 | terminado==0.13.1 71 | testpath==0.5.0 72 | threadpoolctl==3.1.0 73 | tornado==6.2 74 | tqdm==4.62.3 75 | traitlets==5.9.0 76 | urllib3==1.26.8 77 | wcwidth==0.2.5 78 | webencodings==0.5.1 79 | widgetsnbextension==3.5.2 80 | xgboost==1.7.6 81 | -------------------------------------------------------------------------------- /rre/README.md: -------------------------------------------------------------------------------- 1 | rre 2 | 3 | This folder contains some basic RRE demonstrations for running evaluations against your LTR models. 4 | 5 | Navigate to `solr` or `elastic` depending on which you are using and do the following: 6 | 7 | ## Getting Started 8 | - Build the docker image: `docker build -t ltr-rre .` 9 | - Run an evaluation: `docker run --name ltr-rre ltr-rre` 10 | - Copy the report to your host: `docker cp ltr-rre:/rre/target/site/rre-report.xlsx .` 11 | 12 | Alternatively, you can run thru the `evaluation` notebooks in Jupyter to run these steps for you. 
13 | 14 | __Note:__ Older versions of Docker for Linux may have issues accessing localhost on the host machine 15 | -------------------------------------------------------------------------------- /rre/elastic/.dockerignore: -------------------------------------------------------------------------------- 1 | target/* 2 | -------------------------------------------------------------------------------- /rre/elastic/.gitignore: -------------------------------------------------------------------------------- 1 | target/* 2 | -------------------------------------------------------------------------------- /rre/elastic/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM maven:3.6.0-jdk-8 2 | 3 | # Clone the RRE repo 4 | RUN git clone https://github.com/SeaseLtd/rated-ranking-evaluator 5 | WORKDIR rated-ranking-evaluator 6 | 7 | # Build RRE 8 | RUN mvn clean install 9 | 10 | # Bring over the RRE config 11 | WORKDIR / 12 | COPY . rre 13 | WORKDIR rre 14 | 15 | # By default, run an RRE evaluation if no other command is specified 16 | CMD mvn clean install 17 | -------------------------------------------------------------------------------- /rre/elastic/src/etc/configuration_sets/README.md: -------------------------------------------------------------------------------- 1 | This folder contains one subfolder for each configuration version. 2 | Each version folder should contain the index settings associated with such version: 3 | 4 | - `hostUrls`: an array of URLs where the Elasticsearch instance for this 5 | version can be accessed. 6 | - `index`: the name of the index holding the data being used to search. 
7 | -------------------------------------------------------------------------------- /rre/elastic/src/etc/configuration_sets/baseline/index-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "hostUrls": [ "http://host.docker.internal:9200" ], 3 | "index": "tmdb" 4 | } 5 | -------------------------------------------------------------------------------- /rre/elastic/src/etc/configuration_sets/classic/index-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "hostUrls": [ "http://host.docker.internal:9200" ], 3 | "index": "tmdb" 4 | } 5 | -------------------------------------------------------------------------------- /rre/elastic/src/etc/configuration_sets/latest/index-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "hostUrls": [ "http://host.docker.internal:9200" ], 3 | "index": "tmdb" 4 | } 5 | -------------------------------------------------------------------------------- /rre/elastic/src/etc/ratings/ratings.json: -------------------------------------------------------------------------------- 1 | { 2 | "index": "tmdb", 3 | "id_field": "id", 4 | "topics": [ 5 | { 6 | "description": "LTR Example Evaluation", 7 | "queries": [ 8 | { 9 | "template": "query.json", 10 | "placeholders": { 11 | "$query": "batman" 12 | } 13 | } 14 | ], 15 | "relevant_documents": { 16 | "4": [ 17 | "40662", 18 | "45162", 19 | "69735", 20 | "123025", 21 | "142061", 22 | "177271", 23 | "209112", 24 | "242643", 25 | "251519", 26 | "321528", 27 | "324849", 28 | "366924", 29 | "382322" 30 | ], 31 | "3": [ 32 | "272", 33 | "13851", 34 | "14919", 35 | "16234", 36 | "20077", 37 | "21683", 38 | "22855", 39 | "29751" 40 | ], 41 | "2": [ 42 | "268", 43 | "364", 44 | "414", 45 | "415", 46 | "15805", 47 | "17074" 48 | ], 49 | "1": [ 50 | "2661", 51 | "93560", 52 | "125249" 53 | ] 54 | } 55 | } 56 | ] 57 | } 58 | 
-------------------------------------------------------------------------------- /rre/elastic/src/etc/templates/README.md: -------------------------------------------------------------------------------- 1 | This folder will contain the query templates associated with the evaluation suite. 2 | The query shape in Elasticsearch is already a JSON file so each template should be a valid Elasticsearch query 3 | with all needed placeholders (that will be defined within the ratings file). 4 | 5 | ```javascript 6 | { 7 | "size": 0, 8 | "query": { 9 | "bool": { 10 | "must": [ 11 | { 12 | "multi_match": { 13 | "query": "$query", 14 | "fields": [ 15 | "some_searchable_field_1^1.75", 16 | "some_other_searchable_field" 17 | ], 18 | "minimum_should_match": "3<-45% 6<-95%" 19 | } 20 | } 21 | ] 22 | } 23 | }, 24 | "aggs": { 25 | "headings": { 26 | "terms": { 27 | "field": "title_sugg", 28 | "order": { "max_score": "desc" } 29 | }, 30 | "aggs": { 31 | "max_score": { 32 | "max": { 33 | "script": { 34 | "lang": "painless", 35 | "inline": "_score" 36 | } 37 | } 38 | } 39 | } 40 | } 41 | } 42 | } 43 | ``` -------------------------------------------------------------------------------- /rre/elastic/src/etc/templates/baseline/query.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "match": { 4 | "title": { 5 | "query": "$query" 6 | } 7 | } 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /rre/elastic/src/etc/templates/classic/query.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "bool": { 4 | "should": [ 5 | { 6 | "sltr": { 7 | "featureset": "release", 8 | "model": "classic", 9 | "params": {} 10 | } 11 | } 12 | ], 13 | "filter": [ 14 | {"match": {"title": "$query"}} 15 | ] 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- 
/rre/elastic/src/etc/templates/latest/query.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "bool": { 4 | "should": [ 5 | { 6 | "sltr": { 7 | "featureset": "release", 8 | "model": "latest", 9 | "params": {} 10 | } 11 | } 12 | ], 13 | "filter": [ 14 | {"match": {"title": "$query"}} 15 | ] 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /rre/opensearch/.dockerignore: -------------------------------------------------------------------------------- 1 | target/* 2 | -------------------------------------------------------------------------------- /rre/opensearch/.gitignore: -------------------------------------------------------------------------------- 1 | target/* 2 | -------------------------------------------------------------------------------- /rre/opensearch/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM maven:3.6.0-jdk-8 2 | 3 | # Clone the RRE repo 4 | RUN git clone https://github.com/SeaseLtd/rated-ranking-evaluator 5 | WORKDIR rated-ranking-evaluator 6 | 7 | # Build RRE 8 | RUN mvn clean install 9 | 10 | # Bring over the RRE config 11 | WORKDIR / 12 | COPY . rre 13 | WORKDIR rre 14 | 15 | # By default, run an RRE evaluation if no other command is specified 16 | CMD mvn clean install 17 | -------------------------------------------------------------------------------- /rre/opensearch/src/etc/configuration_sets/README.md: -------------------------------------------------------------------------------- 1 | This folder contains one subfolder for each configuration version. 2 | Each version folder should contain the index settings associated with such version: 3 | 4 | - `hostUrls`: an array of URLs where the Elasticsearch instance for this 5 | version can be accessed. 6 | - `index`: the name of the index holding the data being used to search. 
7 | -------------------------------------------------------------------------------- /rre/opensearch/src/etc/configuration_sets/baseline/index-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "hostUrls": [ "http://host.docker.internal:9200" ], 3 | "index": "tmdb" 4 | } 5 | -------------------------------------------------------------------------------- /rre/opensearch/src/etc/configuration_sets/classic/index-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "hostUrls": [ "http://host.docker.internal:9200" ], 3 | "index": "tmdb" 4 | } 5 | -------------------------------------------------------------------------------- /rre/opensearch/src/etc/configuration_sets/latest/index-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "hostUrls": [ "http://host.docker.internal:9200" ], 3 | "index": "tmdb" 4 | } 5 | -------------------------------------------------------------------------------- /rre/opensearch/src/etc/ratings/ratings.json: -------------------------------------------------------------------------------- 1 | { 2 | "index": "tmdb", 3 | "id_field": "id", 4 | "topics": [ 5 | { 6 | "description": "LTR Example Evaluation", 7 | "queries": [ 8 | { 9 | "template": "query.json", 10 | "placeholders": { 11 | "$query": "batman" 12 | } 13 | } 14 | ], 15 | "relevant_documents": { 16 | "4": [ 17 | "40662", 18 | "45162", 19 | "69735", 20 | "123025", 21 | "142061", 22 | "177271", 23 | "209112", 24 | "242643", 25 | "251519", 26 | "321528", 27 | "324849", 28 | "366924", 29 | "382322" 30 | ], 31 | "3": [ 32 | "272", 33 | "13851", 34 | "14919", 35 | "16234", 36 | "20077", 37 | "21683", 38 | "22855", 39 | "29751" 40 | ], 41 | "2": [ 42 | "268", 43 | "364", 44 | "414", 45 | "415", 46 | "15805", 47 | "17074" 48 | ], 49 | "1": [ 50 | "2661", 51 | "93560", 52 | "125249" 53 | ] 54 | } 55 | } 56 | ] 57 | } 58 | 
-------------------------------------------------------------------------------- /rre/opensearch/src/etc/templates/README.md: -------------------------------------------------------------------------------- 1 | This folder will contain the query templates associated with the evaluation suite. 2 | The query shape in Elasticsearch is already a JSON file so each template should be a valid Elasticsearch query 3 | with all needed placeholders (that will be defined within the ratings file). 4 | 5 | ```javascript 6 | { 7 | "size": 0, 8 | "query": { 9 | "bool": { 10 | "must": [ 11 | { 12 | "multi_match": { 13 | "query": "$query", 14 | "fields": [ 15 | "some_searchable_field_1^1.75", 16 | "some_other_searchable_field" 17 | ], 18 | "minimum_should_match": "3<-45% 6<-95%" 19 | } 20 | } 21 | ] 22 | } 23 | }, 24 | "aggs": { 25 | "headings": { 26 | "terms": { 27 | "field": "title_sugg", 28 | "order": { "max_score": "desc" } 29 | }, 30 | "aggs": { 31 | "max_score": { 32 | "max": { 33 | "script": { 34 | "lang": "painless", 35 | "inline": "_score" 36 | } 37 | } 38 | } 39 | } 40 | } 41 | } 42 | } 43 | ``` -------------------------------------------------------------------------------- /rre/opensearch/src/etc/templates/baseline/query.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "match": { 4 | "title": { 5 | "query": "$query" 6 | } 7 | } 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /rre/opensearch/src/etc/templates/classic/query.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "bool": { 4 | "should": [ 5 | { 6 | "sltr": { 7 | "featureset": "release", 8 | "model": "classic", 9 | "params": {} 10 | } 11 | } 12 | ], 13 | "filter": [ 14 | {"match": {"title": "$query"}} 15 | ] 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- 
/rre/opensearch/src/etc/templates/latest/query.json: -------------------------------------------------------------------------------- 1 | { 2 | "query": { 3 | "bool": { 4 | "should": [ 5 | { 6 | "sltr": { 7 | "featureset": "release", 8 | "model": "latest", 9 | "params": {} 10 | } 11 | } 12 | ], 13 | "filter": [ 14 | {"match": {"title": "$query"}} 15 | ] 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /rre/solr/.dockerignore: -------------------------------------------------------------------------------- 1 | target/* 2 | -------------------------------------------------------------------------------- /rre/solr/.gitignore: -------------------------------------------------------------------------------- 1 | target/* 2 | -------------------------------------------------------------------------------- /rre/solr/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM maven:3.6.0-jdk-8 2 | 3 | # Clone the RRE repo 4 | RUN git clone https://github.com/SeaseLtd/rated-ranking-evaluator 5 | WORKDIR rated-ranking-evaluator 6 | 7 | # Build RRE 8 | RUN mvn clean install 9 | 10 | # Bring over the RRE config 11 | WORKDIR / 12 | COPY . rre 13 | WORKDIR rre 14 | 15 | # By default, run an RRE evaluation if no other command is specified 16 | CMD mvn clean install 17 | -------------------------------------------------------------------------------- /rre/solr/src/etc/configuration_sets/README.md: -------------------------------------------------------------------------------- 1 | This folder contains one subfolder for each configuration version. 2 | Each version folder should contain a solr-settings.json file with details of 3 | how to connect to the appropriate Solr core. 
4 | 5 | This is an example: 6 | 7 | * configuration_sets 8 | * v1.0 9 | * solr-settings.json 10 | * v1.1 11 | * solr-settings.json 12 | 13 | The solr-settings.json files may have the following properties: 14 | 15 | - `baseUrls`: an array of Solr base URLs (eg. `[ "http://localhost:8983/solr", "http://localhost:7574/solr" ]`). 16 | - `collectionName` [**REQUIRED**]: the name of the collection or core being evaluated. 17 | - `zkHosts`: an array of Zookeeper hosts (eg. `[ "zk1:2181", "zk2:2181" ]`). 18 | - `zkChroot`: the path to the root Zookeeper node containing Solr data, if running in a Chroot environment (eg. `"/solr"`). 19 | Optional. 20 | - `connectionTimeoutMillis`: the number of milliseconds to wait for a connection to be made to Solr. Optional. 21 | - `socketTimeoutMillis`: the number of milliseconds to allow for a response from Solr. Optional. 22 | 23 | **Either** the baseUrls **or** the zkHosts property must contain values. If both are empty, 24 | the configuration will fail to load. 
-------------------------------------------------------------------------------- /rre/solr/src/etc/configuration_sets/baseline/solr-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "baseUrls": [ "http://host.docker.internal:8983/solr" ], 3 | "collectionName": "tmdb" 4 | } 5 | -------------------------------------------------------------------------------- /rre/solr/src/etc/configuration_sets/classic/solr-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "baseUrls": [ "http://host.docker.internal:8983/solr" ], 3 | "collectionName": "tmdb" 4 | } 5 | -------------------------------------------------------------------------------- /rre/solr/src/etc/configuration_sets/latest/solr-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "baseUrls": [ "http://host.docker.internal:8983/solr" ], 3 | "collectionName": "tmdb" 4 | } 5 | -------------------------------------------------------------------------------- /rre/solr/src/etc/ratings/README.md: -------------------------------------------------------------------------------- 1 | Under the ratings folder you should have at least 1 ratings file. 2 | A ratings file is connected with a dataset and contains a set of queries that compose the evaluation execution. 
-------------------------------------------------------------------------------- /rre/solr/src/etc/ratings/ratings.json: -------------------------------------------------------------------------------- 1 | { 2 | "index": "tmdb", 3 | "id_field": "id", 4 | "topics": [ 5 | { 6 | "description": "LTR Example Evaluation", 7 | "queries": [ 8 | { 9 | "template": "query.json", 10 | "placeholders": { 11 | "$query": "batman" 12 | } 13 | } 14 | ], 15 | "relevant_documents": { 16 | "4": [ 17 | "40662", 18 | "45162", 19 | "69735", 20 | "123025", 21 | "142061", 22 | "177271", 23 | "209112", 24 | "242643", 25 | "251519", 26 | "321528", 27 | "324849", 28 | "366924", 29 | "382322" 30 | ], 31 | "3": [ 32 | "272", 33 | "13851", 34 | "14919", 35 | "16234", 36 | "20077", 37 | "21683", 38 | "22855", 39 | "29751" 40 | ], 41 | "2": [ 42 | "268", 43 | "364", 44 | "414", 45 | "415", 46 | "15805", 47 | "17074" 48 | ], 49 | "1": [ 50 | "2661", 51 | "93560", 52 | "125249" 53 | ] 54 | } 55 | } 56 | ] 57 | } 58 | -------------------------------------------------------------------------------- /rre/solr/src/etc/templates/README.md: -------------------------------------------------------------------------------- 1 | This folder will contain the query templates associated with the evaluation suite. 2 | A template is a JSON file containing a JSON object with name->value(s) pairs corresponding to query parameters. 3 | Although it is completely ok to have statically-defined values here, usually you will be using placeholders. 4 | 5 | ```javascript 6 | { 7 | "q": "$query", 8 | "fq": "language:$lang" 9 | } 10 | ``` 11 | The placeholders values will be defined within the ratings file, specifically in the queries definitions. 
-------------------------------------------------------------------------------- /rre/solr/src/etc/templates/baseline/query.json: -------------------------------------------------------------------------------- 1 | { 2 | "q": "title:($query)" 3 | } 4 | -------------------------------------------------------------------------------- /rre/solr/src/etc/templates/classic/query.json: -------------------------------------------------------------------------------- 1 | { 2 | "q": "title:($query)", 3 | "rq": "{!ltr model=classic}" 4 | } 5 | -------------------------------------------------------------------------------- /rre/solr/src/etc/templates/latest/query.json: -------------------------------------------------------------------------------- 1 | { 2 | "q": "title:($query)", 3 | "rq": "{!ltr model=latest}" 4 | } 5 | -------------------------------------------------------------------------------- /tests/fail.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | class Fail(unittest.TestCase): 4 | 5 | def test_that_fails(self): 6 | assert 1 == 0 7 | 8 | if __name__ == "__main__": 9 | unittest.main() 10 | -------------------------------------------------------------------------------- /tests/nb_test_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | class NotebookTestConfig: 4 | 5 | SETUP_NB = 'setup.ipynb' 6 | 7 | def __init__(self, path): 8 | self.notebooks = [] 9 | self.setup = None 10 | for nb_path in os.listdir(path): 11 | full_nb_path = os.path.join(path,nb_path) 12 | if os.path.isfile(full_nb_path) and nb_path.endswith('.ipynb'): 13 | if nb_path == NotebookTestConfig.SETUP_NB: 14 | self.setup = full_nb_path 15 | else: 16 | self.notebooks.append(full_nb_path) 17 | 18 | -------------------------------------------------------------------------------- /tests/notebook_test_case.py: 
-------------------------------------------------------------------------------- 1 | import unittest 2 | from nb_test_config import NotebookTestConfig 3 | import runner 4 | 5 | class NotebooksTestCase(unittest.TestCase): 6 | 7 | SAVE_NB_PATH='tests/last_run.ipynb' 8 | 9 | def test_paths(self): 10 | return [] 11 | 12 | def ignored_nbs(self): 13 | return [] 14 | 15 | def nbs_to_run(self): 16 | class IncludeAll: 17 | def __contains__(self, _): 18 | return True 19 | return IncludeAll() 20 | 21 | def test_for_no_errors(self): 22 | """ Run all nbs in directories at test_paths() 23 | also included in nbs_to_run(), 24 | excepting those in ignored_nbs() 25 | - assert there are no errors 26 | """ 27 | for nb_path in self.test_paths(): 28 | 29 | nb_cfg = NotebookTestConfig(path=nb_path) 30 | print("EXECUTING NBS IN DIRECTORY: " + nb_path) 31 | if nb_cfg.setup: 32 | print("Setting up ... " + nb_path) 33 | nb, errors = runner.run_notebook(nb_cfg.setup, save_nb_path=NotebooksTestCase.SAVE_NB_PATH) 34 | print(errors) 35 | assert len(errors) == 0 36 | for nb in nb_cfg.notebooks: 37 | if nb in self.nbs_to_run(): 38 | if nb in self.ignored_nbs(): 39 | print("Ignored " + nb) 40 | else: 41 | print("Running... 
" + nb) 42 | nb, errors = runner.run_notebook(nb, save_nb_path=NotebooksTestCase.SAVE_NB_PATH) 43 | print(errors) 44 | assert len(errors) == 0 45 | 46 | -------------------------------------------------------------------------------- /tests/pass.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | class Pass(unittest.TestCase): 4 | 5 | def test_that_passes(self): 6 | assert 1 == 1 7 | 8 | if __name__ == "__main__": 9 | unittest.main() 10 | -------------------------------------------------------------------------------- /tests/run_most_nbs.py: -------------------------------------------------------------------------------- 1 | from notebook_test_case import NotebooksTestCase 2 | import unittest 3 | 4 | class RunMostNotebooksTestCase(NotebooksTestCase): 5 | 6 | TEST_PATHS = ['./notebooks/', 7 | './notebooks/solr/tmdb', 8 | './notebooks/elasticsearch/tmdb', 9 | './notebooks/elasticsearch/osc-blog', 10 | './notebooks/opensearch/tmdb', 11 | './notebooks/opensearch/osc-blog'] 12 | 13 | IGNORED_NBS = ['./notebooks/solr/tmdb/evaluation (Solr).ipynb', 14 | './notebooks/elasticsearch/tmdb/XGBoost.ipynb', 15 | './notebooks/elasticsearch/tmdb/evaluation.ipynb', 16 | './notebooks/opensearch/tmdb/XGBoost.ipynb', 17 | './notebooks/opensearch/tmdb/evaluation.ipynb'] 18 | 19 | 20 | def test_paths(self): 21 | return RunMostNotebooksTestCase.TEST_PATHS 22 | 23 | def ignored_nbs(self): 24 | return RunMostNotebooksTestCase.IGNORED_NBS 25 | 26 | 27 | 28 | if __name__ == "__main__": 29 | unittest.main() 30 | -------------------------------------------------------------------------------- /tests/runner.py: -------------------------------------------------------------------------------- 1 | # Notebook test runner, adapted from 2 | # https://www.blog.pythonlibrary.org/2018/10/16/testing-jupyter-notebooks/ 3 | import nbformat 4 | import os 5 | 6 | from nbconvert.preprocessors import ExecutePreprocessor 7 | 8 | def hours(hours): 9 | 
""" Hours as seconds """ 10 | hours * 60 * 60 11 | 12 | def run_notebook(notebook_path, timeout=hours(6), save_nb_path=None): 13 | nb_name, _ = os.path.splitext(os.path.basename(notebook_path)) 14 | dirname = os.path.dirname(notebook_path) 15 | 16 | with open(notebook_path) as f: 17 | nb = nbformat.read(f, as_version=4) 18 | 19 | proc = ExecutePreprocessor(timeout=timeout, kernel_name='python3') 20 | proc.allow_errors = True 21 | 22 | proc.preprocess(nb, {'metadata': {'path': dirname}}) 23 | 24 | if save_nb_path: 25 | with open(save_nb_path, mode='wt') as f: 26 | nbformat.write(nb, f) 27 | 28 | errors = [] 29 | for cell in nb.cells: 30 | if 'outputs' in cell: 31 | for output in cell['outputs']: 32 | if output.output_type == 'error': 33 | errors.append(output) 34 | 35 | return nb, errors 36 | 37 | if __name__ == '__main__': 38 | nb, errors = run_notebook('Testing.ipynb') 39 | print(errors) 40 | -------------------------------------------------------------------------------- /tests/test_prep.py: -------------------------------------------------------------------------------- 1 | from ltr.client.solr_client import SolrClient 2 | client = SolrClient() 3 | 4 | from ltr import download 5 | from ltr.index import rebuild 6 | from ltr.helpers.movies import indexable_movies 7 | 8 | corpus='http://es-learn-to-rank.labs.o19s.com/tmdb.json' 9 | download([corpus], dest='data/'); 10 | 11 | movies=indexable_movies(movies='data/tmdb.json') 12 | rebuild(client, index='tmdb', doc_src=movies) -------------------------------------------------------------------------------- /utils/rateFuzzySearch.json.jinja: -------------------------------------------------------------------------------- 1 | { 2 | "from": 0, 3 | "size": 7, 4 | "query": { 5 | "bool": { 6 | "should": [ 7 | {"match": { 8 | "title": { 9 | "query": "{{ keywords }}", 10 | "fuzziness": "AUTO"} 11 | }} 12 | ] 13 | } 14 | } 15 | } 16 | -------------------------------------------------------------------------------- 
/utils/rateSearch.json.jinja: -------------------------------------------------------------------------------- 1 | { 2 | "from": 0, 3 | "size": 5, 4 | "query": { 5 | "bool": { 6 | "should": [ 7 | {"match": { 8 | "text_all": "{{ keywords }}" 9 | }}, 10 | { 11 | "match_phrase": { 12 | "title": { 13 | "query": "{{ keywords }}", 14 | "boost": 1000 15 | } 16 | } 17 | }] 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /utils/train_to_csv.py: -------------------------------------------------------------------------------- 1 | import utils 2 | from ltr.judgments import judgments_from_file 3 | from ltr.client import ElasticClient 4 | import csv 5 | 6 | 7 | def train_to_csv(client, feature_set, in_filename, out_filename): 8 | features = client.feature_set(name=feature_set, index='tmdb')[0] 9 | fieldnames = ['keywords', 'qid', 'grade'] 10 | fieldnames.extend([feature['name'] for feature in features]) 11 | with open(out_filename, 'w') as csvfile: 12 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 13 | writer.writeheader() 14 | 15 | judgments = judgments_from_file(filename='data/title_judgments_train.txt') 16 | for judgment in judgments: 17 | assert len(judgment.features) == len(fieldnames) - 3 18 | record = {} 19 | record[fieldnames[0]] = judgment.keywords 20 | record[fieldnames[1]] = judgment.qid 21 | record[fieldnames[2]] = judgment.grade 22 | for idx,field in enumerate(fieldnames[3:]): 23 | record[field] = judgment.features[idx] 24 | 25 | writer.writerow(record) 26 | 27 | if __name__ == "__main__": 28 | from sys import argv 29 | client = ElasticClient() 30 | train_to_csv(client=client, in_filename=argv[1], 31 | feature_set=argv[2], out_filename=argv[3]) 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | 
sys.path.append(os.getcwd()) 5 | --------------------------------------------------------------------------------