├── __init__.py ├── config ├── stopwords │ └── english.txt ├── hunspell │ └── en_GB │ │ ├── en_GB.aff │ │ └── en_GB.dic └── analysis │ └── synonyms.txt ├── .DS_Store ├── examples_fox.json ├── examples_sid.json ├── examples_posts.json ├── .gitignore ├── examples_main.json ├── README.md ├── .ipynb_checkpoints ├── populate-checkpoint.ipynb ├── Mapping and Analysis-checkpoint.ipynb └── Searching - The Basic Tools-checkpoint.ipynb ├── index.py ├── Dealing with Human Language ├── Normalizing Tokens.ipynb ├── Identifying Words.ipynb ├── Getting Started with Languages.ipynb ├── Typoes and Mispelings.ipynb ├── Reducing Words to Their Root Form (Pt.1).ipynb ├── Reducing Words (Pt.2).ipynb └── Reducing Words (Pt.2)-Copy1.ipynb ├── populate.ipynb ├── Search in Depth ├── Proximity Matching.ipynb └── Multifield Search (Pt.2).ipynb └── Getting Started ├── Searching - The Basic Tools.ipynb └── Sorting and Relevance.ipynb /__init__.py: -------------------------------------------------------------------------------- 1 | import index -------------------------------------------------------------------------------- /config/stopwords/english.txt: -------------------------------------------------------------------------------- 1 | a 2 | the 3 | dead -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pgolding/elasticsearch/HEAD/.DS_Store -------------------------------------------------------------------------------- /config/hunspell/en_GB/en_GB.aff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pgolding/elasticsearch/HEAD/config/hunspell/en_GB/en_GB.aff -------------------------------------------------------------------------------- /config/hunspell/en_GB/en_GB.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pgolding/elasticsearch/HEAD/config/hunspell/en_GB/en_GB.dic -------------------------------------------------------------------------------- /config/analysis/synonyms.txt: -------------------------------------------------------------------------------- 1 | manager => leader,boss,person 2 | chef => chef,cook,person 3 | maid => maid,housemaid,young_lady,female,person -------------------------------------------------------------------------------- /examples_fox.json: -------------------------------------------------------------------------------- 1 | { "index": { "_id": 1 }} 2 | { "title": "The quick brown fox" } 3 | { "index": { "_id": 2 }} 4 | { "title": "The quick brown fox jumps over the lazy dog" } 5 | { "index": { "_id": 3 }} 6 | { "title": "The quick brown fox jumps over the quick dog" } 7 | { "index": { "_id": 4 }} 8 | { "title": "Brown fox brown dog" } 9 | -------------------------------------------------------------------------------- /examples_sid.json: -------------------------------------------------------------------------------- 1 | { "index": { "_id": 1 }} 2 | { "price" : 10, "productID" : "XHDK-A-1293-#fJ3" } 3 | { "index": { "_id": 2 }} 4 | { "price" : 20, "productID" : "KDKE-B-9947-#kL5" } 5 | { "index": { "_id": 3 }} 6 | { "price" : 30, "productID" : "JODL-X-1937-#pV7" } 7 | { "index": { "_id": 4 }} 8 | { "price" : 30, "productID" : "QQPX-R-3956-#aD8" } 9 | -------------------------------------------------------------------------------- /examples_posts.json: 
-------------------------------------------------------------------------------- 1 | { "index": { "_id": "1" }} 2 | { "tags" : ["search"] } 3 | { "index": { "_id": "2" }} 4 | { "tags" : ["search", "open_source"] } 5 | { "index": { "_id": "3" }} 6 | { "other_field" : "some data" } 7 | { "index": { "_id": "4" }} 8 | { "tags" : null } 9 | { "index": { "_id": "5" }} 10 | { "tags" : ["search", null] } 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | 3 | # Compiled source # 4 | ################### 5 | *.com 6 | *.class 7 | *.dll 8 | *.exe 9 | *.o 10 | *.so 11 | 12 | # Packages # 13 | ############ 14 | # it's better to unpack these files and commit the raw source 15 | # git has its own built in compression methods 16 | *.7z 17 | *.dmg 18 | *.gz 19 | *.iso 20 | *.jar 21 | *.rar 22 | *.tar 23 | *.zip 24 | 25 | # Logs and databases # 26 | ###################### 27 | *.log 28 | *.sql 29 | *.sqlite 30 | 31 | # OS generated files # 32 | ###################### 33 | .DS_Store 34 | .DS_Store? 35 | ._* 36 | .Spotlight-V100 37 | .Trashes 38 | ehthumbs.db 39 | Thumbs.db -------------------------------------------------------------------------------- /examples_main.json: -------------------------------------------------------------------------------- 1 | { "create": { "_index": "us", "_type": "user", "_id": "1" }} 2 | { "email" : "john@smith.com", "name" : "John Smith", "username" : "@john" } 3 | { "create": { "_index": "gb", "_type": "user", "_id": "2" }} 4 | { "email" : "mary@jones.com", "name" : "Mary Jones", "username" : "@mary" } 5 | { "create": { "_index": "gb", "_type": "tweet", "_id": "3" }} 6 | { "date" : "2014-09-13", "name" : "Mary Jones", "tweet" : "Elasticsearch means full text search has never been so easy", "user_id" : 2 } 7 | { "create": { "_index": "us", "_type": "tweet", "_id": "4" }} 8 | { "date" : "2014-09-14", "name" : "John Smith", "tweet" : "@mary it is not just text, it does everything", "user_id" : 1 } 9 | { "create": { "_index": "gb", "_type": "tweet", "_id": "5" }} 10 | { "date" : "2014-09-15", "name" : "Mary Jones", "tweet" : "However did I manage before Elasticsearch?", "user_id" : 2 } 11 | { "create": { "_index": "us", "_type": "tweet", "_id": "6" }} 12 | { "date" : "2014-09-16", "name" : "John Smith", "tweet" : "The Elasticsearch API is really easy to use", "user_id" : 1 } 13 | { "create": { "_index": "gb", "_type": "tweet", "_id": "7" }} 14 | { "date" : "2014-09-17", "name" : "Mary Jones", "tweet" : "The Query DSL is really powerful and flexible", "user_id" : 2 } 15 | { "create": { "_index": "us", "_type": "tweet", "_id": "8" }} 16 | { "date" : "2014-09-18", "name" : "John Smith", "user_id" : 1 } 17 | { "create": { "_index": "gb", "_type": "tweet", "_id": "9" }} 18 | { "date" : "2014-09-19", "name" : "Mary Jones", "tweet" : "Geo-location aggregations are really cool", "user_id" : 2 } 19 | { "create": { "_index": "us", "_type": "tweet", "_id": "10" }} 20 | { "date" : "2014-09-20", "name" : "John Smith", "tweet" : "Elasticsearch surely is one of the hottest new NoSQL products", "user_id" : 1 } 21 | { "create": { "_index": "gb", "_type": "tweet", "_id": "11" }} 22 | { "date" : "2014-09-21", "name" : "Mary Jones", "tweet" : "Elasticsearch is built for the cloud, easy to scale", "user_id" : 2 } 23 | { "create": { "_index": "us", "_type": "tweet", "_id": "12" }} 24 | { "date" : "2014-09-22", "name" : "John Smith", "tweet" : 
"Elasticsearch and I have left the honeymoon stage, and I still love her.", "user_id" : 1 } 25 | { "create": { "_index": "gb", "_type": "tweet", "_id": "13" }} 26 | { "date" : "2014-09-23", "name" : "Mary Jones", "tweet" : "So yes, I am an Elasticsearch fanboy", "user_id" : 2 } 27 | { "create": { "_index": "us", "_type": "tweet", "_id": "14" }} 28 | { "date" : "2014-09-24", "name" : "John Smith", "tweet" : "How many more cheesy tweets do I have to write?", "user_id" : 1 } 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Elasticsearch: The Definitive Guide (with Python examples) # 2 | 3 | #### Versions: #### 4 | 5 | ##### Services: 6 | * Kibana (5.2.2) 7 | * Elasticsearch (5.2.2) 8 | 9 | ##### Python libs: 10 | * elasticsearch (5.2.0) 11 | * elasticsearch-dsl (5.1.0) 12 | 13 | ### What is this? 14 | 15 | This is a set of Jupyter notebooks to help those who want to follow the book [Elasticsearch: The Definitive Guide](https://www.elastic.co/guide/en/elasticsearch/guide/master/index.html) using Python code in addition to the JSON API calls in the book. I have reproduced most of the example API calls, often in various ways, using the two Python libraries: 16 | 17 | * [Elasticsearch](http://elasticsearch-py.readthedocs.io/en/master/index.html) 18 | * [Elasticsearch DSL](http://elasticsearch-dsl.readthedocs.io/en/latest/index.html) 19 | 20 | My goal is to assist the reader/learner in understanding the mechanics of Elasticsearch whilst understanding the Python libs. 21 | 22 | I follow the structure of the book fairly closely (beginning with "Seaching - The Basic Tools") using identical chapter names and headings. I suggest to follow the book whilst exercising some examples in the Kibana console (or via CURL) and some in Python. 23 | 24 | In true notebook fashion, the notebooks provide an interactive documented flow and a place to play. Where useful, I insert text from the guide so as to not break the flow too much (between the book and the notebooks). 25 | 26 | Note that the examples here assume the same setup as the examples in the book, namely a virgin instance of Elasticsearch (most likely on localhost) pre-populated with the [test data](https://github.com/pgolding/elasticsearch/blob/master/examples.json). 27 | 28 | The helper script (index.py) is available to populate/delete/reset the index at various times throughout the chapters. You don't need to touch it as I included initialization at the start of each chapter: 29 | 30 | ```python 31 | import index 32 | 33 | r = index.populate() 34 | print('{} items created'.format(len(r['items']))) 35 | ``` 36 | 37 | If at any time you get stuck with the index, then just call ```index.populate()``` to delete and re-populate the index. You can also pass in a JSON object to define the settings and field mappings etc: 38 | 39 | ```python 40 | index_template = { 41 | "mappings": { 42 | "tweet" : { 43 | "properties" : { 44 | "tweet" : { 45 | "type" : "text", 46 | "analyzer": "english" 47 | }, 48 | "date" : { 49 | "type" : "date" 50 | }, 51 | "name" : { 52 | "type" : "text" 53 | }, 54 | "user_id" : { 55 | "type" : "long" 56 | } 57 | } 58 | } 59 | } 60 | } 61 | index.populate(index_template) 62 | ``` 63 | 64 | (However, I usually make these calls where needed in the notebooks.) 65 | 66 | This is a WIP and I will continue to update it with all examples and later build out more complex examples as accompanying notebooks. 
67 | 68 | Note that this is **not** a comprehensive coverage of all examples in the book. I have skipped a few examples here and there, mostly because they are repetitive or because they deal with non-English languages. 69 | 70 | Also, I have added extra examples and included supplementary test data where useful (e.g. synonyms, stopwords files etc.) . This was to add further clarity of emphasis to some of the examples or to provide settings or info overlooked by the book (but covered in the API docs). -------------------------------------------------------------------------------- /.ipynb_checkpoints/populate-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 30, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from elasticsearch import Elasticsearch\n", 12 | "from pprint import pprint\n", 13 | "\n", 14 | "es = Elasticsearch(\n", 15 | " 'localhost',\n", 16 | " # sniff before doing anything\n", 17 | " sniff_on_start=True,\n", 18 | " # refresh nodes after a node fails to respond\n", 19 | " sniff_on_connection_fail=True,\n", 20 | " # and also every 60 seconds\n", 21 | " sniffer_timeout=60\n", 22 | ")\n", 23 | "\n", 24 | "f = open('examples.json', 'r')\n", 25 | "data = f.read()\n", 26 | "\n", 27 | "response = es.bulk(body=data)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 31, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "assert response['errors'] == False\n", 39 | "# Should not produce an AssertionError" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "source": [ 48 | "For the later chapters, you may want to delete the index and re-create, including using a different index:\n" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 27, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/plain": [ 61 | "{'acknowledged': True}" 62 | ] 63 | }, 64 | "execution_count": 27, 65 | "metadata": {}, 66 | "output_type": "execute_result" 67 | } 68 | ], 69 | "source": [ 70 | "es.indices.delete(index=['gb','us'])" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 28, 76 | "metadata": { 77 | "collapsed": true 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "index_template = {\n", 82 | " \"mappings\": {\n", 83 | " \"tweet\" : {\n", 84 | " \"properties\" : {\n", 85 | " \"tweet\" : {\n", 86 | " \"type\" : \"text\",\n", 87 | " \"analyzer\": \"english\"\n", 88 | " },\n", 89 | " \"date\" : {\n", 90 | " \"type\" : \"date\"\n", 91 | " },\n", 92 | " \"name\" : {\n", 93 | " \"type\" : \"text\"\n", 94 | " },\n", 95 | " \"user_id\" : {\n", 96 | " \"type\" : \"long\"\n", 97 | " }\n", 98 | " }\n", 99 | " }\n", 100 | " }\n", 101 | "}" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 29, 107 | "metadata": { 108 | "collapsed": false 109 | }, 110 | "outputs": [ 111 | { 112 | "data": { 113 | "text/plain": [ 114 | "{'acknowledged': True, 'shards_acknowledged': True}" 115 | ] 116 | }, 117 | "execution_count": 29, 118 | "metadata": {}, 119 | "output_type": "execute_result" 120 | } 121 | ], 122 | "source": [ 123 | "es.indices.create(index='gb', body=index_template)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": { 130 | "collapsed": true 131 | }, 132 | "outputs": [], 
133 | "source": [] 134 | } 135 | ], 136 | "metadata": { 137 | "kernelspec": { 138 | "display_name": "Python 3", 139 | "language": "python", 140 | "name": "python3" 141 | }, 142 | "language_info": { 143 | "codemirror_mode": { 144 | "name": "ipython", 145 | "version": 3 146 | }, 147 | "file_extension": ".py", 148 | "mimetype": "text/x-python", 149 | "name": "python", 150 | "nbconvert_exporter": "python", 151 | "pygments_lexer": "ipython3", 152 | "version": "3.5.1" 153 | } 154 | }, 155 | "nbformat": 4, 156 | "nbformat_minor": 0 157 | } 158 | -------------------------------------------------------------------------------- /index.py: -------------------------------------------------------------------------------- 1 | from elasticsearch import Elasticsearch 2 | from pprint import pprint 3 | import time 4 | import uuid 5 | from IPython.display import display_javascript, display_html, display 6 | import json 7 | 8 | es = Elasticsearch( 9 | 'localhost', 10 | # sniff before doing anything 11 | sniff_on_start=True, 12 | # refresh nodes after a node fails to respond 13 | sniff_on_connection_fail=True, 14 | # and also every 60 seconds 15 | sniffer_timeout=60 16 | ) 17 | 18 | # Shards = 1 because of https://www.elastic.co/guide/en/elasticsearch/guide/master/relevance-is-broken.html 19 | index_template = { 20 | "settings": { "number_of_shards": 1 }, 21 | "mappings": { 22 | "tweet" : { 23 | "properties" : { 24 | "tweet" : { 25 | "type" : "text", 26 | "analyzer": "english" 27 | }, 28 | "date" : { 29 | "type" : "date" 30 | }, 31 | "name" : { 32 | "type" : "text" 33 | }, 34 | "user_id" : { 35 | "type" : "long" 36 | } 37 | } 38 | } 39 | } 40 | } 41 | 42 | # Shards = 1 because of https://www.elastic.co/guide/en/elasticsearch/guide/master/relevance-is-broken.html 43 | multi_field_index_template = { 44 | "settings": { "number_of_shards": 1 }, 45 | "mappings": { 46 | "tweet" : { 47 | "properties" : { 48 | 49 | "tweet": { 50 | "type": "string", 51 | "analyzer": "english", 52 | "fields": { 53 | "raw": { 54 | "type": "string", 55 | "index": "not_analyzed" 56 | } 57 | } 58 | }, 59 | "date" : { 60 | "type" : "date" 61 | }, 62 | "name" : { 63 | "type" : "text" 64 | }, 65 | "user_id" : { 66 | "type" : "long" 67 | } 68 | } 69 | } 70 | } 71 | } 72 | 73 | f = open('../examples_main.json', 'r') 74 | data = f.read() 75 | 76 | def load_sid_examples(settings=None, set=None): 77 | if set==1: 78 | file_to_load='../examples_sid.json' 79 | idx = 'my_store' 80 | dt = 'produces' 81 | elif set==2: 82 | file_to_load='../examples_posts.json' 83 | idx = 'my_index' 84 | dt = 'posts' 85 | elif set==3: 86 | file_to_load='../examples_fox.json' 87 | idx = 'my_index' 88 | dt = 'my_type' 89 | else: 90 | file_to_load='../examples_sid.json' 91 | idx = 'my_store' 92 | dt = 'produces' 93 | 94 | try: 95 | f = open(file_to_load, 'r') 96 | sid_data = f.read() 97 | if es.indices.exists(idx): 98 | es.indices.delete(idx) 99 | if settings: 100 | es.indices.create(index=idx, body=settings) 101 | response = es.bulk(index=idx, doc_type=dt, body=sid_data) 102 | except Exception as e: 103 | print('Error loading examples') 104 | response = e 105 | return response 106 | 107 | def reset_all(): 108 | reset() 109 | if es.indices.exists('shows'): 110 | es.indices.delete(index='shows') 111 | if es.indices.exists('email'): 112 | es.indices.delete(index='email') 113 | 114 | def create_my_index(index_name='my_index', body=None): 115 | if es.indices.exists(index_name): 116 | es.indices.delete(index_name) 117 | es.indices.create(index=index_name, body=body) 118 | 119 | 
120 | def populate(template_num=None): 121 | if es.indices.exists('gb'): 122 | es.indices.delete(index='gb') 123 | # cautious wait on index deletion - prob. not needed 124 | time.sleep(1) 125 | if es.indices.exists('us'): 126 | es.indices.delete(index='us') 127 | # cautious wait on index deletion - prob. not needed 128 | time.sleep(1) 129 | if isinstance(template_num, int): 130 | if template==1: 131 | es.indices.create(index='gb', body=index_template) 132 | response = es.bulk(body=data) 133 | elif template==2: 134 | es.indices.create(index='gb', body=multi_field_index_template) 135 | es.indices.create(index='us', body=multi_field_index_template) 136 | response = es.bulk(body=data) 137 | else: 138 | response = es.bulk(body=data) 139 | return response 140 | 141 | 142 | def populate_tweets_using_mapping(template=None): 143 | if es.indices.exists('gb'): 144 | es.indices.delete(index='gb') 145 | # cautious wait on index deletion - prob. not needed 146 | time.sleep(1) 147 | if es.indices.exists('us'): 148 | es.indices.delete(index='us') 149 | # cautious wait on index deletion - prob. not needed 150 | time.sleep(1) 151 | if isinstance(template, dict): 152 | es.indices.create(index='gb', body=template) 153 | es.indices.create(index='us', body=template) 154 | response = es.bulk(body=data) 155 | else: 156 | response = es.bulk(body=data) 157 | return response 158 | 159 | 160 | def reset(): 161 | if es.indices.exists('gb'): 162 | es.indices.delete(index='gb') 163 | time.sleep(1) 164 | if es.indices.exists('us'): 165 | es.indices.delete(index='us') 166 | time.sleep(1) 167 | 168 | # A helped class to render long JSON objects from ES with collapsible elements 169 | class RenderJSON(object): 170 | def __init__(self, json_data): 171 | if isinstance(json_data, dict): 172 | self.json_str = json.dumps(json_data) 173 | else: 174 | self.json_str = json_data 175 | self.uuid = str(uuid.uuid4()) 176 | 177 | def _ipython_display_(self): 178 | display_html('
'.format(self.uuid), raw=True) 179 | display_javascript(""" 180 | require(["https://rawgit.com/caldwell/renderjson/master/renderjson.js"], function() { 181 | document.getElementById('%s').appendChild(renderjson(%s)) 182 | }); 183 | """ % (self.uuid, self.json_str), raw=True) 184 | 185 | reset_all() -------------------------------------------------------------------------------- /Dealing with Human Language/Normalizing Tokens.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Elasticsearch: The Definitive Guide - Python\n", 8 | "\n", 9 | "Following the examples in the book, here are Python snippets that achieve the same effect.\n", 10 | "\n", 11 | "Documentation for the Python libs:\n", 12 | "\n", 13 | "Low-level API:\n", 14 | "\n", 15 | "https://elasticsearch-py.readthedocs.io/en/master/index.html\n", 16 | "\n", 17 | "Expressive DSL API (more \"Pythonic\")\n", 18 | "\n", 19 | "http://elasticsearch-dsl.readthedocs.io/en/latest/index.html\n", 20 | "\n", 21 | "Github repo for DSL API:\n", 22 | "\n", 23 | "https://github.com/elastic/elasticsearch-dsl-py\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "import sys, os\n", 35 | "sys.path.insert(1, os.path.join(sys.path[0], '..'))" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "14 items created\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "import index\n", 55 | "from elasticsearch import Elasticsearch\n", 56 | "from elasticsearch_dsl import Search, Q\n", 57 | "from pprint import pprint\n", 58 | "\n", 59 | "es = Elasticsearch(\n", 60 | " 'localhost',\n", 61 | " # sniff before doing anything\n", 62 | " sniff_on_start=True,\n", 63 | " # refresh nodes after a node fails to respond\n", 64 | " sniff_on_connection_fail=True,\n", 65 | " # and also every 60 seconds\n", 66 | " sniffer_timeout=60\n", 67 | ")\n", 68 | "\n", 69 | "r = index.populate()\n", 70 | "print('{} items created'.format(len(r['items'])))\n", 71 | "\n", 72 | "# Let's repopulate the index as we deleted 'gb' in earlier chapters:\n", 73 | "# Run the script: populate.ipynb" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": { 79 | "collapsed": true 80 | }, 81 | "source": [ 82 | "### Normalizing Tokens\n", 83 | "\n", 84 | "Breaking text into tokens is only half the job. To make those tokens more easily searchable, they need to go through a normalization process to remove insignificant differences between otherwise identical words, such as uppercase versus lowercase. Perhaps we also need to remove significant differences, to make esta, ésta, and está all searchable as the same word. Would you search for déjà vu, or just for deja vu?\n", 85 | "\n", 86 | "This is the job of the token filters, which receive a stream of tokens from the tokenizer. You can have multiple token filters, each doing its particular job. 
Each receives the new token stream as output by the token filter before it.\n", 87 | "\n", 88 | "#### In That Case\n", 89 | "\n", 90 | "The most frequently used token filter is the lowercase filter, which does exactly what you would expect; it transforms each token into its lowercase form:\n", 91 | "\n", 92 | "```\n", 93 | "GET /_analyze?tokenizer=standard&filters=lowercase\n", 94 | "The QUICK Brown FOX! \n", 95 | "```" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 7, 101 | "metadata": { 102 | "collapsed": false 103 | }, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "the,quick,brown,fox\n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "text = 'The QUICK Brown FOX!'# contains some uppercase words\n", 115 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 116 | " (tokenizer='standard', filter=['lowercase'], text=text)['tokens']]\n", 117 | "print(','.join(analyzed_text))" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "To make this automatic as part of the analysis process, we can create a custom analyzer:\n", 125 | "```\n", 126 | "PUT /my_index\n", 127 | "{\n", 128 | " \"settings\": {\n", 129 | " \"analysis\": {\n", 130 | " \"analyzer\": {\n", 131 | " \"my_lowercaser\": {\n", 132 | " \"tokenizer\": \"standard\",\n", 133 | " \"filter\": [ \"lowercase\" ]\n", 134 | " }\n", 135 | " }\n", 136 | " }\n", 137 | " }\n", 138 | "}\n", 139 | "```" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 15, 145 | "metadata": { 146 | "collapsed": false 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "# first delete the index from previous chapters, if it exists\n", 151 | "if es.indices.exists('my_index'): \n", 152 | " es.indices.delete('my_index')" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 16, 158 | "metadata": { 159 | "collapsed": false 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "#es.indices.create('my_index')\n", 164 | "from elasticsearch_dsl import analyzer, Index\n", 165 | "my_custom_analyzer = analyzer('my_lowercaser',\n", 166 | " tokenizer='standard',\n", 167 | " filter='lowercase')\n", 168 | "i = Index('my_index')\n", 169 | "i.analyzer(my_custom_analyzer)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 17, 175 | "metadata": { 176 | "collapsed": false 177 | }, 178 | "outputs": [ 179 | { 180 | "data": { 181 | "text/plain": [ 182 | "{'tokens': [{'end_offset': 3,\n", 183 | " 'position': 0,\n", 184 | " 'start_offset': 0,\n", 185 | " 'token': 'the',\n", 186 | " 'type': ''},\n", 187 | " {'end_offset': 9,\n", 188 | " 'position': 1,\n", 189 | " 'start_offset': 4,\n", 190 | " 'token': 'quick',\n", 191 | " 'type': ''},\n", 192 | " {'end_offset': 15,\n", 193 | " 'position': 2,\n", 194 | " 'start_offset': 10,\n", 195 | " 'token': 'brown',\n", 196 | " 'type': ''},\n", 197 | " {'end_offset': 19,\n", 198 | " 'position': 3,\n", 199 | " 'start_offset': 16,\n", 200 | " 'token': 'fox',\n", 201 | " 'type': ''}]}" 202 | ] 203 | }, 204 | "execution_count": 17, 205 | "metadata": {}, 206 | "output_type": "execute_result" 207 | } 208 | ], 209 | "source": [ 210 | "es.indices.analyze(index='my_index', analyzer='my_lowercaser', text=text)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "collapsed": true 218 | }, 219 | "outputs": [], 220 | "source": [] 221 | } 222 | ], 223 | "metadata": { 224 | "kernelspec": { 
225 | "display_name": "Python 3", 226 | "language": "python", 227 | "name": "python3" 228 | }, 229 | "language_info": { 230 | "codemirror_mode": { 231 | "name": "ipython", 232 | "version": 3 233 | }, 234 | "file_extension": ".py", 235 | "mimetype": "text/x-python", 236 | "name": "python", 237 | "nbconvert_exporter": "python", 238 | "pygments_lexer": "ipython3", 239 | "version": "3.5.1" 240 | } 241 | }, 242 | "nbformat": 4, 243 | "nbformat_minor": 0 244 | } 245 | -------------------------------------------------------------------------------- /populate.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Populate the Index\n", 8 | "\n", 9 | "Here are some scripts to manage the index used for most of the examples. You might need to call these various scripts in various orders depending upon the chapter. It should be obvious. In future I will try to move these to a helped script accessible to all." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 35, 15 | "metadata": { 16 | "collapsed": false 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "from elasticsearch import Elasticsearch\n", 21 | "from pprint import pprint\n", 22 | "\n", 23 | "es = Elasticsearch(\n", 24 | " 'localhost',\n", 25 | " # sniff before doing anything\n", 26 | " sniff_on_start=True,\n", 27 | " # refresh nodes after a node fails to respond\n", 28 | " sniff_on_connection_fail=True,\n", 29 | " # and also every 60 seconds\n", 30 | " sniffer_timeout=60\n", 31 | ")\n", 32 | "\n", 33 | "f = open('examples.json', 'r')\n", 34 | "data = f.read()\n", 35 | "\n", 36 | "response = es.bulk(body=data)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 36, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "assert response['errors'] == False\n", 48 | "# Should not produce an AssertionError" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": { 54 | "collapsed": true 55 | }, 56 | "source": [ 57 | "For the later chapters, you may want to delete the index and re-create, including using a different index:\n" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 33, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [ 67 | { 68 | "data": { 69 | "text/plain": [ 70 | "{'acknowledged': True}" 71 | ] 72 | }, 73 | "execution_count": 33, 74 | "metadata": {}, 75 | "output_type": "execute_result" 76 | } 77 | ], 78 | "source": [ 79 | "es.indices.delete(index=['gb','us'])" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 28, 85 | "metadata": { 86 | "collapsed": true 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "index_template = {\n", 91 | " \"mappings\": {\n", 92 | " \"tweet\" : {\n", 93 | " \"properties\" : {\n", 94 | " \"tweet\" : {\n", 95 | " \"type\" : \"text\",\n", 96 | " \"analyzer\": \"english\"\n", 97 | " },\n", 98 | " \"date\" : {\n", 99 | " \"type\" : \"date\"\n", 100 | " },\n", 101 | " \"name\" : {\n", 102 | " \"type\" : \"text\"\n", 103 | " },\n", 104 | " \"user_id\" : {\n", 105 | " \"type\" : \"long\"\n", 106 | " }\n", 107 | " }\n", 108 | " }\n", 109 | " }\n", 110 | "}" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 29, 116 | "metadata": { 117 | "collapsed": false 118 | }, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "{'acknowledged': True, 'shards_acknowledged': True}" 124 | ] 125 | }, 126 
| "execution_count": 29, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "es.indices.create(index='gb', body=index_template)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 32, 138 | "metadata": { 139 | "collapsed": false 140 | }, 141 | "outputs": [ 142 | { 143 | "data": { 144 | "text/plain": [ 145 | "{'acknowledged': True}" 146 | ] 147 | }, 148 | "execution_count": 32, 149 | "metadata": {}, 150 | "output_type": "execute_result" 151 | } 152 | ], 153 | "source": [ 154 | "es.indices.delete(index='email') # an index we create later on" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 34, 160 | "metadata": { 161 | "collapsed": false 162 | }, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/plain": [ 167 | "{'acknowledged': True, 'shards_acknowledged': True}" 168 | ] 169 | }, 170 | "execution_count": 34, 171 | "metadata": {}, 172 | "output_type": "execute_result" 173 | } 174 | ], 175 | "source": [ 176 | "multi_field_index_template = {\n", 177 | " \"mappings\": {\n", 178 | " \"tweet\" : {\n", 179 | " \"properties\" : {\n", 180 | " \n", 181 | " \"tweet\": { \n", 182 | " \"type\": \"string\",\n", 183 | " \"analyzer\": \"english\",\n", 184 | " \"fields\": {\n", 185 | " \"raw\": { \n", 186 | " \"type\": \"string\",\n", 187 | " \"index\": \"not_analyzed\"\n", 188 | " }\n", 189 | " }\n", 190 | " }, \n", 191 | " \"date\" : {\n", 192 | " \"type\" : \"date\"\n", 193 | " },\n", 194 | " \"name\" : {\n", 195 | " \"type\" : \"text\"\n", 196 | " },\n", 197 | " \"user_id\" : {\n", 198 | " \"type\" : \"long\"\n", 199 | " }\n", 200 | " }\n", 201 | " }\n", 202 | " }\n", 203 | "}\n", 204 | "es.indices.create(index='gb', body=multi_field_index_template)\n", 205 | "\n" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 37, 211 | "metadata": { 212 | "collapsed": false 213 | }, 214 | "outputs": [ 215 | { 216 | "data": { 217 | "text/plain": [ 218 | "True" 219 | ] 220 | }, 221 | "execution_count": 37, 222 | "metadata": {}, 223 | "output_type": "execute_result" 224 | } 225 | ], 226 | "source": [ 227 | "es.indices.exists('gb')" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 38, 233 | "metadata": { 234 | "collapsed": true 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | " if es.indices.exists(['gb,us']):\n", 239 | " es.indices.delete(index=['gb,us'])" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 39, 245 | "metadata": { 246 | "collapsed": false 247 | }, 248 | "outputs": [ 249 | { 250 | "data": { 251 | "text/plain": [ 252 | "False" 253 | ] 254 | }, 255 | "execution_count": 39, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | "source": [ 261 | "es.indices.exists('gb')" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 43, 267 | "metadata": { 268 | "collapsed": false 269 | }, 270 | "outputs": [ 271 | { 272 | "data": { 273 | "text/plain": [ 274 | "{'acknowledged': True, 'shards_acknowledged': True}" 275 | ] 276 | }, 277 | "execution_count": 43, 278 | "metadata": {}, 279 | "output_type": "execute_result" 280 | } 281 | ], 282 | "source": [ 283 | "es.indices.create(index='gb', body=multi_field_index_template)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": { 290 | "collapsed": true 291 | }, 292 | "outputs": [], 293 | "source": [] 294 | } 295 | ], 296 | "metadata": { 297 | "kernelspec": { 298 | "display_name": 
"Python 3", 299 | "language": "python", 300 | "name": "python3" 301 | }, 302 | "language_info": { 303 | "codemirror_mode": { 304 | "name": "ipython", 305 | "version": 3 306 | }, 307 | "file_extension": ".py", 308 | "mimetype": "text/x-python", 309 | "name": "python", 310 | "nbconvert_exporter": "python", 311 | "pygments_lexer": "ipython3", 312 | "version": "3.5.1" 313 | } 314 | }, 315 | "nbformat": 4, 316 | "nbformat_minor": 0 317 | } 318 | -------------------------------------------------------------------------------- /Search in Depth/Proximity Matching.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Elasticsearch: The Definitive Guide - Python\n", 8 | "\n", 9 | "Following the examples in the book, here are Python snippets that achieve the same effect.\n", 10 | "\n", 11 | "Documentation for the Python libs:\n", 12 | "\n", 13 | "Low-level API:\n", 14 | "\n", 15 | "https://elasticsearch-py.readthedocs.io/en/master/index.html\n", 16 | "\n", 17 | "Expressive DSL API (more \"Pythonic\")\n", 18 | "\n", 19 | "http://elasticsearch-dsl.readthedocs.io/en/latest/index.html\n", 20 | "\n", 21 | "Github repo for DSL API:\n", 22 | "\n", 23 | "https://github.com/elastic/elasticsearch-dsl-py\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "import sys, os\n", 35 | "sys.path.insert(1, os.path.join(sys.path[0], '..'))" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "import index\n", 47 | "from elasticsearch import Elasticsearch\n", 48 | "from elasticsearch_dsl import Search, Q, Index\n", 49 | "from pprint import pprint\n", 50 | "\n", 51 | "es = Elasticsearch(\n", 52 | " 'localhost',\n", 53 | " # sniff before doing anything\n", 54 | " sniff_on_start=True,\n", 55 | " # refresh nodes after a node fails to respond\n", 56 | " sniff_on_connection_fail=True,\n", 57 | " # and also every 60 seconds\n", 58 | " sniffer_timeout=60\n", 59 | ")\n", 60 | "\n", 61 | "r = index.load_sid_examples(settings={ \"settings\": { \"number_of_shards\": 1 }},set=3)\n", 62 | "#print('{} items created'.format(len(r['items'])))\n", 63 | "\n", 64 | "# Let's repopulate the index as we deleted 'gb' in earlier chapters:\n", 65 | "# Run the script: populate.ipynb" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": { 71 | "collapsed": true 72 | }, 73 | "source": [ 74 | "### Phrase Matching\n", 75 | "\n", 76 | "In the same way that the match query is the go-to query for standard full-text search, the match_phrase query is the one you should reach for when you want to find words that are near each other:\n", 77 | " \n", 78 | "\n" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 4, 84 | "metadata": { 85 | "collapsed": false 86 | }, 87 | "outputs": [ 88 | { 89 | "data": { 90 | "text/plain": [ 91 | ", , ]>" 92 | ] 93 | }, 94 | "execution_count": 4, 95 | "metadata": {}, 96 | "output_type": "execute_result" 97 | } 98 | ], 99 | "source": [ 100 | "s = Index('my_index', using=es).search()\n", 101 | "s = s.query(Q('match_phrase', title=\"quick brown fox\"))\n", 102 | "s.execute()" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "Like the `match` query, the `match_phrase` query first 
analyzes the query string to produce a list of terms. It then searches for all the terms, but keeps only documents that contain all of the search terms, in the same positions relative to each other. A query for the phrase quick fox would not match any of our documents, because no document contains the word quick immediately followed by fox.\n", 110 | "\n", 111 | "Can also be written as a `match` query with type `phrase`:" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 8, 117 | "metadata": { 118 | "collapsed": false 119 | }, 120 | "outputs": [ 121 | { 122 | "data": { 123 | "text/plain": [ 124 | ", , ]>" 125 | ] 126 | }, 127 | "execution_count": 8, 128 | "metadata": {}, 129 | "output_type": "execute_result" 130 | } 131 | ], 132 | "source": [ 133 | "s = Index('my_index', using=es).search()\n", 134 | "s = s.query(Q('match', title={\"query\": \"quick brown fox\", \"type\":\"phrase\"}))\n", 135 | "s.execute()\n" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 6, 141 | "metadata": { 142 | "collapsed": false 143 | }, 144 | "outputs": [ 145 | { 146 | "data": { 147 | "text/plain": [ 148 | ", ]>" 149 | ] 150 | }, 151 | "execution_count": 6, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "s = Index('my_index', using=es).search()\n", 158 | "s = s.query(Q('prefix', postcode=\"W1\"))\n", 159 | "s.execute()" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "#### Term Positions\n", 167 | "\n", 168 | "When a string is analyzed, the analyzer returns not only a list of terms, but also the position, or order, of each term in the original string:\n" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 9, 174 | "metadata": { 175 | "collapsed": false 176 | }, 177 | "outputs": [ 178 | { 179 | "data": { 180 | "text/plain": [ 181 | "{'tokens': [{'end_offset': 5,\n", 182 | " 'position': 0,\n", 183 | " 'start_offset': 0,\n", 184 | " 'token': 'quick',\n", 185 | " 'type': ''},\n", 186 | " {'end_offset': 11,\n", 187 | " 'position': 1,\n", 188 | " 'start_offset': 6,\n", 189 | " 'token': 'brown',\n", 190 | " 'type': ''},\n", 191 | " {'end_offset': 15,\n", 192 | " 'position': 2,\n", 193 | " 'start_offset': 12,\n", 194 | " 'token': 'fox',\n", 195 | " 'type': ''}]}" 196 | ] 197 | }, 198 | "execution_count": 9, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "es.indices.analyze(index='my_index', analyzer='standard', text='Quick brown fox')" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "Positions can be stored in the inverted index, and position-aware queries like the match_phrase query can use them to match only documents that contain all the words in exactly the order specified, with no words in-between.\n", 212 | "\n", 213 | "For a document to be considered a match for the phrase “quick brown fox”, the following must be true:\n", 214 | "\n", 215 | "* quick, brown, and fox must all appear in the field.\n", 216 | "* The position of brown must be 1 greater than the position of quick.\n", 217 | "* The position of fox must be 2 greater than the position of quick.\n", 218 | "* If any of these conditions is not met, the document is not considered a match.\n", 219 | "\n", 220 | "#### Mixing It Up\n", 221 | "\n", 222 | "Requiring exact-phrase matches may be too strict a constraint. 
Perhaps we do want documents that contain “quick brown fox” to be considered a match for the query “quick fox,” even though the positions aren’t exactly equivalent.\n" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 10, 228 | "metadata": { 229 | "collapsed": false 230 | }, 231 | "outputs": [ 232 | { 233 | "data": { 234 | "text/plain": [ 235 | ", , ]>" 236 | ] 237 | }, 238 | "execution_count": 10, 239 | "metadata": {}, 240 | "output_type": "execute_result" 241 | } 242 | ], 243 | "source": [ 244 | "# \"sloppy\"\n", 245 | "s = Index('my_index', using=es).search()\n", 246 | "s = s.query(Q('match_phrase', title={\"query\": \"quick fox\", \"slop\":1}))\n", 247 | "s.execute()" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "The `slop` parameter tells the `match_phrase` query how far apart terms are allowed to be while still considering the document a match. By how far apart we mean how many times do you need to move a term in order to make the query and document match?\n", 255 | "\n", 256 | "We’ll start with a simple example. To make the query quick fox match a document containing `quick brown fox` we need a `slop` of just 1:\n", 257 | "\n", 258 | "\n", 259 | "| | Pos 1 | Pos 2 | Pos 3 |\n", 260 | "|---------|-------|-------|-------|\n", 261 | "| Doc: | quick | brown | fox |\n", 262 | "| Query: | quick | fox | |\n", 263 | "| Slop 1: | quick | ↳ |fox |\n", 264 | "\n", 265 | "Higher slop can move the words in any direction:\n", 266 | "\n", 267 | "| | Pos 1 | |Pos 2 | | Pos 3 |\n", 268 | "|-----------|--------|--|-----------|-|---------|\n", 269 | "|Doc: | quick | |brown | | fox |\n", 270 | "|Query: | fox | | quick | | |\n", 271 | "|Slop 1: | fox/quick |↵| | | |\n", 272 | "|Slop 2: | quick | ↳ | fox | | |\n", 273 | "|Slop 3: | quick | | | ↳ | fox|" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": { 280 | "collapsed": true 281 | }, 282 | "outputs": [], 283 | "source": [] 284 | } 285 | ], 286 | "metadata": { 287 | "kernelspec": { 288 | "display_name": "Python 3", 289 | "language": "python", 290 | "name": "python3" 291 | }, 292 | "language_info": { 293 | "codemirror_mode": { 294 | "name": "ipython", 295 | "version": 3 296 | }, 297 | "file_extension": ".py", 298 | "mimetype": "text/x-python", 299 | "name": "python", 300 | "nbconvert_exporter": "python", 301 | "pygments_lexer": "ipython3", 302 | "version": "3.5.1" 303 | } 304 | }, 305 | "nbformat": 4, 306 | "nbformat_minor": 0 307 | } 308 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/Mapping and Analysis-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Elasticsearch: The Definitive Guide - Python\n", 8 | "\n", 9 | "Following the examples in the book, here are Python snippets that achieve the same effect.\n", 10 | "\n", 11 | "Documentation for the Python libs:\n", 12 | "\n", 13 | "Low-level API:\n", 14 | "\n", 15 | "https://elasticsearch-py.readthedocs.io/en/master/index.html\n", 16 | "\n", 17 | "Expressive DSL API (more \"Pythonic\")\n", 18 | "\n", 19 | "http://elasticsearch-dsl.readthedocs.io/en/latest/index.html\n", 20 | "\n", 21 | "Github repo for DSL API:\n", 22 | "\n", 23 | "https://github.com/elastic/elasticsearch-dsl-py\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | 
"metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "from elasticsearch import Elasticsearch\n", 35 | "from elasticsearch_dsl import Search, Q\n", 36 | "from pprint import pprint\n", 37 | "\n", 38 | "es = Elasticsearch(\n", 39 | " 'localhost',\n", 40 | " # sniff before doing anything\n", 41 | " sniff_on_start=True,\n", 42 | " # refresh nodes after a node fails to respond\n", 43 | " sniff_on_connection_fail=True,\n", 44 | " # and also every 60 seconds\n", 45 | " sniffer_timeout=60\n", 46 | ")" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "### Mapping and Analysis\n", 54 | "\n", 55 | "> GET /gb/_mapping/tweet" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 2, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "{'gb': {'mappings': {'tweet': {'properties': {'date': {'type': 'date'},\n", 70 | " 'name': {'fields': {'keyword': {'ignore_above': 256,\n", 71 | " 'type': 'keyword'}},\n", 72 | " 'type': 'text'},\n", 73 | " 'tweet': {'fields': {'keyword': {'ignore_above': 256,\n", 74 | " 'type': 'keyword'}},\n", 75 | " 'type': 'text'},\n", 76 | " 'user_id': {'type': 'long'}}}}}}\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "res = es.indices.get_mapping(index='gb', doc_type='tweet')\n", 82 | "pprint(res)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "These queries return the same results because the analyzer has tokenized and normalized the string ```2014-09-15``` to ```2014```, ```09```, and ```15```.\n", 90 | "\n", 91 | "> GET /_search?q=2014-09-15 # 12 results !" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 3, 97 | "metadata": { 98 | "collapsed": false 99 | }, 100 | "outputs": [ 101 | { 102 | "name": "stdout", 103 | "output_type": "stream", 104 | "text": [ 105 | "12\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "res = es.search(q='2014-09-15')\n", 111 | "print(res['hits']['total'])" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "This search is against the _all meta field and so wherever these values are found (in all tweets), a hit is registered." 
119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 4, 124 | "metadata": { 125 | "collapsed": false 126 | }, 127 | "outputs": [ 128 | { 129 | "name": "stdout", 130 | "output_type": "stream", 131 | "text": [ 132 | "Total hits:12\n", 133 | "\n" 134 | ] 135 | } 136 | ], 137 | "source": [ 138 | "s = Search(using=es) \\\n", 139 | " .query('match', _all='2014-09-15')\n", 140 | "response = s.execute()\n", 141 | "print('Total hits:{}\\n'.format(response['hits']['total']))" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "If we change the field explicitly to date, then only the one tweet (with that date) is returned:" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 5, 154 | "metadata": { 155 | "collapsed": false 156 | }, 157 | "outputs": [ 158 | { 159 | "name": "stdout", 160 | "output_type": "stream", 161 | "text": [ 162 | "Total hits:1\n", 163 | "\n" 164 | ] 165 | } 166 | ], 167 | "source": [ 168 | "s = Search(using=es) \\\n", 169 | " .query('match', date='2014-09-15')\n", 170 | "response = s.execute()\n", 171 | "print('Total hits:{}\\n'.format(response['hits']['total']))" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "#### Testing Analyzers\n", 179 | "\n", 180 | "We can test analyzers using the _analyze API:\n", 181 | "```\n", 182 | "GET /_analyze\n", 183 | "{\n", 184 | " \"analyzer\": \"standard\",\n", 185 | " \"text\": \"Text to analyze\"\n", 186 | "}\n", 187 | "```" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 6, 193 | "metadata": { 194 | "collapsed": false 195 | }, 196 | "outputs": [ 197 | { 198 | "name": "stdout", 199 | "output_type": "stream", 200 | "text": [ 201 | "{'tokens': [{'end_offset': 4,\n", 202 | " 'position': 0,\n", 203 | " 'start_offset': 0,\n", 204 | " 'token': 'text',\n", 205 | " 'type': ''},\n", 206 | " {'end_offset': 7,\n", 207 | " 'position': 1,\n", 208 | " 'start_offset': 5,\n", 209 | " 'token': 'to',\n", 210 | " 'type': ''},\n", 211 | " {'end_offset': 15,\n", 212 | " 'position': 2,\n", 213 | " 'start_offset': 8,\n", 214 | " 'token': 'analyze',\n", 215 | " 'type': ''}]}\n" 216 | ] 217 | } 218 | ], 219 | "source": [ 220 | "text = \"Text to analyze\"\n", 221 | "res = es.indices.analyze(analyzer='standard', body=text)\n", 222 | "pprint(res)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "Note that the token is the actual term that will be stored in the inverted index:" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 7, 235 | "metadata": { 236 | "collapsed": false 237 | }, 238 | "outputs": [ 239 | { 240 | "name": "stdout", 241 | "output_type": "stream", 242 | "text": [ 243 | "text\n", 244 | "to\n", 245 | "analyze\n" 246 | ] 247 | } 248 | ], 249 | "source": [ 250 | "for token in res['tokens']:\n", 251 | " print(token['token'])" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "#### Built-in Analyzers (Examples)" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 8, 264 | "metadata": { 265 | "collapsed": true 266 | }, 267 | "outputs": [], 268 | "source": [ 269 | "text = \"I want to buy an i-pad and use it to purchase some socks on e-bay\"" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 9, 275 | "metadata": { 276 | "collapsed": false 277 | }, 278 | "outputs": [ 279 | { 280 | "name": "stdout", 281 | 
"output_type": "stream", 282 | "text": [ 283 | "i,want,to,buy,an,i,pad,and,use,it,to,purchase,some,socks,on,e,bay\n" 284 | ] 285 | } 286 | ], 287 | "source": [ 288 | "#standard\n", 289 | "analyzed_text = [x['token'] for x in es.indices.analyze(analyzer='standard', body=text)['tokens']]\n", 290 | "print(','.join(analyzed_text))" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 10, 296 | "metadata": { 297 | "collapsed": false 298 | }, 299 | "outputs": [ 300 | { 301 | "name": "stdout", 302 | "output_type": "stream", 303 | "text": [ 304 | "i,want,to,buy,an,i,pad,and,use,it,to,purchase,some,socks,on,e,bay\n" 305 | ] 306 | } 307 | ], 308 | "source": [ 309 | "#simple\n", 310 | "analyzed_text = [x['token'] for x in es.indices.analyze(analyzer='simple', body=text)['tokens']]\n", 311 | "print(','.join(analyzed_text))" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 11, 317 | "metadata": { 318 | "collapsed": false 319 | }, 320 | "outputs": [ 321 | { 322 | "name": "stdout", 323 | "output_type": "stream", 324 | "text": [ 325 | "I,want,to,buy,an,i-pad,and,use,it,to,purchase,some,socks,on,e-bay\n" 326 | ] 327 | } 328 | ], 329 | "source": [ 330 | "#whitespace\n", 331 | "analyzed_text = [x['token'] for x in es.indices.analyze(analyzer='whitespace', body=text)['tokens']]\n", 332 | "print(','.join(analyzed_text))" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 12, 338 | "metadata": { 339 | "collapsed": false 340 | }, 341 | "outputs": [ 342 | { 343 | "name": "stdout", 344 | "output_type": "stream", 345 | "text": [ 346 | "i,want,bui,i,pad,us,purchas,some,sock,e,bai\n" 347 | ] 348 | } 349 | ], 350 | "source": [ 351 | "#english (language)\n", 352 | "analyzed_text = [x['token'] for x in es.indices.analyze(analyzer='english', body=text)['tokens']]\n", 353 | "print(','.join(analyzed_text))" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": { 360 | "collapsed": true 361 | }, 362 | "outputs": [], 363 | "source": [] 364 | } 365 | ], 366 | "metadata": { 367 | "kernelspec": { 368 | "display_name": "Python 3", 369 | "language": "python", 370 | "name": "python3" 371 | }, 372 | "language_info": { 373 | "codemirror_mode": { 374 | "name": "ipython", 375 | "version": 3 376 | }, 377 | "file_extension": ".py", 378 | "mimetype": "text/x-python", 379 | "name": "python", 380 | "nbconvert_exporter": "python", 381 | "pygments_lexer": "ipython3", 382 | "version": "3.5.1" 383 | } 384 | }, 385 | "nbformat": 4, 386 | "nbformat_minor": 0 387 | } 388 | -------------------------------------------------------------------------------- /Dealing with Human Language/Identifying Words.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Elasticsearch: The Definitive Guide - Python\n", 8 | "\n", 9 | "Following the examples in the book, here are Python snippets that achieve the same effect.\n", 10 | "\n", 11 | "Documentation for the Python libs:\n", 12 | "\n", 13 | "Low-level API:\n", 14 | "\n", 15 | "https://elasticsearch-py.readthedocs.io/en/master/index.html\n", 16 | "\n", 17 | "Expressive DSL API (more \"Pythonic\")\n", 18 | "\n", 19 | "http://elasticsearch-dsl.readthedocs.io/en/latest/index.html\n", 20 | "\n", 21 | "Github repo for DSL API:\n", 22 | "\n", 23 | "https://github.com/elastic/elasticsearch-dsl-py\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | 
"execution_count": 1, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "import sys, os\n", 35 | "sys.path.insert(1, os.path.join(sys.path[0], '..'))" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "14 items created\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "import index\n", 55 | "from elasticsearch import Elasticsearch\n", 56 | "from elasticsearch_dsl import Search, Q\n", 57 | "from pprint import pprint\n", 58 | "\n", 59 | "es = Elasticsearch(\n", 60 | " 'localhost',\n", 61 | " # sniff before doing anything\n", 62 | " sniff_on_start=True,\n", 63 | " # refresh nodes after a node fails to respond\n", 64 | " sniff_on_connection_fail=True,\n", 65 | " # and also every 60 seconds\n", 66 | " sniffer_timeout=60\n", 67 | ")\n", 68 | "\n", 69 | "r = index.populate()\n", 70 | "print('{} items created'.format(len(r['items'])))\n", 71 | "\n", 72 | "# Let's repopulate the index as we deleted 'gb' in earlier chapters:\n", 73 | "# Run the script: populate.ipynb" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": { 79 | "collapsed": true 80 | }, 81 | "source": [ 82 | "### Identifying Words\n", 83 | "\n", 84 | "A word in English is relatively simple to spot: words are separated by whitespace or (some) punctuation. Even in English, though, there can be controversy: is you’re one word or two? What about o’clock, cooperate, half-baked, or eyewitness?\n", 85 | "\n", 86 | "The standard analyzer is used by default for any full-text analyzed string field. If we were to reimplement the standard analyzer as a custom analyzer, it would be defined as follows:\n", 87 | "\n", 88 | "```\n", 89 | "{\n", 90 | " \"type\": \"custom\",\n", 91 | " \"tokenizer\": \"standard\",\n", 92 | " \"filter\": [ \"lowercase\", \"stop\" ]\n", 93 | "}\n", 94 | "```\n", 95 | "\n", 96 | "#### Standard Tokenizer\n", 97 | "\n", 98 | "What is interesting is the algorithm that is used to identify words. The whitespace tokenizer simply breaks on whitespace—spaces, tabs, line feeds, and so forth—and assumes that contiguous nonwhitespace characters form a single token. 
For instance:" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 3, 104 | "metadata": { 105 | "collapsed": false 106 | }, 107 | "outputs": [ 108 | { 109 | "name": "stdout", 110 | "output_type": "stream", 111 | "text": [ 112 | "You're,the,1st,runner,home!\n" 113 | ] 114 | } 115 | ], 116 | "source": [ 117 | "# Whitespace tokenizer\n", 118 | "text = \"You're the 1st runner home!\"\n", 119 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 120 | " (tokenizer='whitespace', body=text)['tokens']]\n", 121 | "print(','.join(analyzed_text))" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 4, 127 | "metadata": { 128 | "collapsed": false 129 | }, 130 | "outputs": [ 131 | { 132 | "name": "stdout", 133 | "output_type": "stream", 134 | "text": [ 135 | "You're,my,co,opted,favorite,cool_dude\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "# Standard tokenizer - uses Unicode Text Segmentation standard\n", 141 | "text = \"You're my co-opted 'favorite' cool_dude.\" # single quotes 'favorite'\n", 142 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 143 | " (tokenizer='standard', body=text)['tokens']]\n", 144 | "print(','.join(analyzed_text))" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 5, 150 | "metadata": { 151 | "collapsed": false 152 | }, 153 | "outputs": [ 154 | { 155 | "name": "stdout", 156 | "output_type": "stream", 157 | "text": [ 158 | "You're,my,co,opted,favorite,cool_dude,Pls,email,me,friend,dude.it\n" 159 | ] 160 | } 161 | ], 162 | "source": [ 163 | "# Standard tokenizer - uses Unicode Text Segmentation standard\n", 164 | "# Note that string contains an email address\n", 165 | "text = \"You're my co-opted 'favorite' cool_dude. Pls email me friend@dude.it\"\n", 166 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 167 | " (tokenizer='standard', body=text)['tokens']]\n", 168 | "print(','.join(analyzed_text))" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 30, 174 | "metadata": { 175 | "collapsed": false 176 | }, 177 | "outputs": [ 178 | { 179 | "name": "stdout", 180 | "output_type": "stream", 181 | "text": [ 182 | "You're,my,co,opted,favorite,cool_dude,Pls,email,me,friend@dude.it\n" 183 | ] 184 | } 185 | ], 186 | "source": [ 187 | "# Standard tokenizer - uses Unicode Text Segmentation standard\n", 188 | "text = \"You're my co-opted 'favorite' cool_dude. Pls email me friend@dude.it\"\n", 189 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 190 | " (tokenizer='uax_url_email', text=text)['tokens']]\n", 191 | "print(','.join(analyzed_text))" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "The standard tokenizer is a reasonable starting point for tokenizing most languages, especially Western languages. In fact, it forms the basis of most of the language-specific analyzers like the english, french, and spanish analyzers. Its support for Asian languages, however, is limited, and you should consider using the icu_tokenizer instead, which is available in the ICU plug-in.\n", 199 | "\n", 200 | "#### Tidying Up Input Text\n", 201 | "\n", 202 | "Tokenizers produce the best results when the input text is clean, valid text, where valid means that it follows the punctuation rules that the Unicode algorithm expects. Quite often, though, the text we need to process is anything but clean. 
Cleaning it up before tokenization improves the quality of the output.\n", 203 | "\n", 204 | "For example, HTML can get messy...\n", 205 | "\n", 206 | "```\n", 207 | "GET /_analyze?tokenizer=standard\n", 208 | "<p>Some déjà vu <a href=\"http://somedomain.com>\">website</a>\n", 209 | "```\n", 210 | "\n", 211 | "To use character filters such as `html_strip` as part of the analyzer, they should be added to a custom analyzer definition:\n", 212 | "\n", 213 | "```\n", 214 | "PUT /my_index\n", 215 | "{\n", 216 | " \"settings\": {\n", 217 | " \"analysis\": {\n", 218 | " \"analyzer\": {\n", 219 | " \"my_html_analyzer\": {\n", 220 | " \"tokenizer\": \"standard\",\n", 221 | " \"char_filter\": [ \"html_strip\" ]\n", 222 | " }\n", 223 | " }\n", 224 | " }\n", 225 | " }\n", 226 | "}\n", 227 | "```" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 31, 233 | "metadata": { 234 | "collapsed": false 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "text = '<p>Some déjà vu <a href=\"http://somedomain.com>\">website</a>'" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 32, 244 | "metadata": { 245 | "collapsed": false 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "from elasticsearch_dsl import analyzer, Index" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 33, 255 | "metadata": { 256 | "collapsed": false 257 | }, 258 | "outputs": [], 259 | "source": [ 260 | "my_custom_analyzer = analyzer('my_html_analyzer',\n", 261 | " tokenizer='standard',\n", 262 | " char_filter='html_strip')" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 34, 268 | "metadata": { 269 | "collapsed": true 270 | }, 271 | "outputs": [], 272 | "source": [ 273 | "i = Index('my_index')" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 35, 279 | "metadata": { 280 | "collapsed": true 281 | }, 282 | "outputs": [], 283 | "source": [ 284 | "i.analyzer(my_custom_analyzer)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 38, 290 | "metadata": { 291 | "collapsed": false 292 | }, 293 | "outputs": [ 294 | { 295 | "name": "stdout", 296 | "output_type": "stream", 297 | "text": [ 298 | "Some,déjà,vu,website\n" 299 | ] 300 | } 301 | ], 302 | "source": [ 303 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 304 | " (index='my_index', analyzer='my_html_analyzer', text=text)['tokens']]\n", 305 | "print(','.join(analyzed_text))" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "NOTE (and TO_DO): I cheated here because the above method call raised an exception that I was unable to debug (related to passing in the char_filter param). So I created the index using the above params via the Kibana developer console before making the call."
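One possible workaround (a minimal sketch, not part of the original notebook) is to create the index with the same analysis settings through the low-level client and then repeat the analyze call. The sketch assumes it is acceptable to drop and recreate `my_index`:

```
# Sketch: create my_index with the html_strip char filter via the low-level client,
# side-stepping the elasticsearch_dsl char_filter issue noted above.
settings = {
    "settings": {
        "analysis": {
            "analyzer": {
                "my_html_analyzer": {
                    "tokenizer": "standard",
                    "char_filter": ["html_strip"]
                }
            }
        }
    }
}
es.indices.delete(index='my_index', ignore=404)   # remove any leftover index (ignored if absent)
es.indices.create(index='my_index', body=settings)

# Reuse the HTML `text` string defined above
analyzed_text = [x['token'] for x in es.indices.analyze(
    index='my_index', analyzer='my_html_analyzer', text=text)['tokens']]
print(','.join(analyzed_text))   # expected: Some,déjà,vu,website
```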
313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": { 319 | "collapsed": true 320 | }, 321 | "outputs": [], 322 | "source": [] 323 | } 324 | ], 325 | "metadata": { 326 | "kernelspec": { 327 | "display_name": "Python 3", 328 | "language": "python", 329 | "name": "python3" 330 | }, 331 | "language_info": { 332 | "codemirror_mode": { 333 | "name": "ipython", 334 | "version": 3 335 | }, 336 | "file_extension": ".py", 337 | "mimetype": "text/x-python", 338 | "name": "python", 339 | "nbconvert_exporter": "python", 340 | "pygments_lexer": "ipython3", 341 | "version": "3.5.1" 342 | } 343 | }, 344 | "nbformat": 4, 345 | "nbformat_minor": 0 346 | } 347 | -------------------------------------------------------------------------------- /Search in Depth/Multifield Search (Pt.2).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Elasticsearch: The Definitive Guide - Python\n", 8 | "\n", 9 | "Following the examples in the book, here are Python snippets that achieve the same effect.\n", 10 | "\n", 11 | "Documentation for the Python libs:\n", 12 | "\n", 13 | "Low-level API:\n", 14 | "\n", 15 | "https://elasticsearch-py.readthedocs.io/en/master/index.html\n", 16 | "\n", 17 | "Expressive DSL API (more \"Pythonic\")\n", 18 | "\n", 19 | "http://elasticsearch-dsl.readthedocs.io/en/latest/index.html\n", 20 | "\n", 21 | "Github repo for DSL API:\n", 22 | "\n", 23 | "https://github.com/elastic/elasticsearch-dsl-py\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "import sys, os\n", 35 | "sys.path.insert(1, os.path.join(sys.path[0], '..'))" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 4, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "4 items created\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "import index\n", 55 | "from elasticsearch import Elasticsearch\n", 56 | "from elasticsearch_dsl import Search, Q, Index\n", 57 | "from pprint import pprint\n", 58 | "\n", 59 | "es = Elasticsearch(\n", 60 | " 'localhost',\n", 61 | " # sniff before doing anything\n", 62 | " sniff_on_start=True,\n", 63 | " # refresh nodes after a node fails to respond\n", 64 | " sniff_on_connection_fail=True,\n", 65 | " # and also every 60 seconds\n", 66 | " sniffer_timeout=60\n", 67 | ")\n", 68 | "\n", 69 | "r = index.load_sid_examples(settings={ \"settings\": { \"number_of_shards\": 1 }},set=3)\n", 70 | "print('{} items created'.format(len(r['items'])))\n", 71 | "\n", 72 | "# Let's repopulate the index as we deleted 'gb' in earlier chapters:\n", 73 | "# Run the script: populate.ipynb" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": { 79 | "collapsed": true 80 | }, 81 | "source": [ 82 | "### Multifield Search\n", 83 | "\n", 84 | "#### Cross-fields Entity Search\n", 85 | "\n", 86 | "Data often spread across many fields:\n", 87 | "\n", 88 | "`\n", 89 | "{\n", 90 | " \"street\": \"5 Poland Street\",\n", 91 | " \"city\": \"London\",\n", 92 | " \"country\": \"United Kingdom\",\n", 93 | " \"postcode\": \"W1V 3DG\"\n", 94 | "}\n", 95 | "`\n", 96 | "\n", 97 | "Here we are not concerned with multiple-query strings. 
Here we want to look at a _single_ query string like \"Poland Street W1V.\" As parts of this string appear in different fields in the doc, using `dis_max / best_fields` will not work as they attempt to find the _single_ best-matching field.\n", 98 | "\n", 99 | "#### A Naive Approach\n", 100 | "\n", 101 | "We could try this:\n", 102 | "`\n", 103 | "{\n", 104 | " \"query\": {\n", 105 | " \"bool\": {\n", 106 | " \"should\": [\n", 107 | " { \"match\": { \"street\": \"Poland Street W1V\" }},\n", 108 | " { \"match\": { \"city\": \"Poland Street W1V\" }},\n", 109 | " { \"match\": { \"country\": \"Poland Street W1V\" }},\n", 110 | " { \"match\": { \"postcode\": \"Poland Street W1V\" }}\n", 111 | " ]\n", 112 | " }\n", 113 | " }\n", 114 | "}\n", 115 | "`\n", 116 | "\n", 117 | "Which is better issued as this:\n", 118 | "`\n", 119 | "{\n", 120 | " \"query\": {\n", 121 | " \"multi_match\": {\n", 122 | " \"query\": \"Poland Street W1V\",\n", 123 | " \"type\": \"most_fields\",\n", 124 | " \"fields\": [ \"street\", \"city\", \"country\", \"postcode\" ]\n", 125 | " }\n", 126 | " }\n", 127 | "}\n", 128 | "`\n", 129 | "\n", 130 | "However:\n", 131 | "\n", 132 | "The most_fields approach to entity search has some problems that are not immediately obvious:\n", 133 | "\n", 134 | "* It is designed to find the most fields matching **any words**, rather than to find the most matching words across **all fields.**\n", 135 | "* It can’t use the `operator` or `minimum_should_match` parameters to reduce the long tail of less-relevant results.\n", 136 | "* Term frequencies are different in each field and could interfere with each other to produce badly ordered results.\n", 137 | "\n", 138 | "#### Field-Centric Queries\n", 139 | "\n", 140 | "All three of the above problems come from `most_fields` being field-centric rather than term-centric - it looks for the most matching fields, not terms! 
(Ditto `best_fields`).\n", 141 | "\n", 142 | "Let's look at why these problems exist:\n", 143 | "\n", 144 | "##### Problem 1 - Matching the same word in multiple fields\n" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 24, 150 | "metadata": { 151 | "collapsed": false 152 | }, 153 | "outputs": [ 154 | { 155 | "data": { 156 | "text/plain": [ 157 | "'(city:poland city:street city:w1v) (country:poland country:street country:w1v) (postcode:poland postcode:street postcode:w1v) (street:poland street:street street:w1v)'" 158 | ] 159 | }, 160 | "execution_count": 24, 161 | "metadata": {}, 162 | "output_type": "execute_result" 163 | } 164 | ], 165 | "source": [ 166 | "# Let's confirm how the most_fields query works by validating the query\n", 167 | "body= {\n", 168 | " \"query\": {\n", 169 | " \"multi_match\": {\n", 170 | " \"query\": \"Poland Street W1V\",\n", 171 | " \"type\": \"most_fields\",\n", 172 | " \"fields\": [ \"street\", \"city\", \"country\", \"postcode\" ]\n", 173 | " }\n", 174 | " }\n", 175 | "}\n", 176 | "es.indices.validate_query(index='my_index', body=body, explain=1)\\\n", 177 | " ['explanations'][0]['explanation']" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "You can see that a document matching just the word poland in two fields could score higher than a document matching poland and street in one field.\n", 185 | "\n", 186 | "NOTE: The validated explanation shows the query as a [query string](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html)\n", 187 | "\n", 188 | "##### Problem 2 - Trimming the long tail\n", 189 | "\n", 190 | "Perhaps we could try this:\n" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 25, 196 | "metadata": { 197 | "collapsed": false 198 | }, 199 | "outputs": [ 200 | { 201 | "data": { 202 | "text/plain": [ 203 | "'(+city:poland +city:street +city:w1v) (+country:poland +country:street +country:w1v) (+postcode:poland +postcode:street +postcode:w1v) (+street:poland +street:street +street:w1v)'" 204 | ] 205 | }, 206 | "execution_count": 25, 207 | "metadata": {}, 208 | "output_type": "execute_result" 209 | } 210 | ], 211 | "source": [ 212 | "# Adding the **and** operator\n", 213 | "body= {\n", 214 | " \"query\": {\n", 215 | " \"multi_match\": {\n", 216 | " \"query\": \"Poland Street W1V\",\n", 217 | " \"type\": \"most_fields\",\n", 218 | " \"operator\": \"and\",\n", 219 | " \"fields\": [ \"street\", \"city\", \"country\", \"postcode\" ]\n", 220 | " }\n", 221 | " }\n", 222 | "}\n", 223 | "es.indices.validate_query(index='my_index', body=body, explain=1)\\\n", 224 | " ['explanations'][0]['explanation']" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "This shows that all words must exist (+) in the same field, which is clearly wrong! 
It is unlikely that any documents would match this query.\n", 232 | "\n", 233 | "##### Problem 3 - Term Frequencies\n", 234 | "\n", 235 | "In [What Is Relevance?](https://www.elastic.co/guide/en/elasticsearch/guide/master/relevance-intro.html), we explained that the default similarity algorithm used to calculate the relevance score for each term is TF/IDF:\n", 236 | "\n", 237 | "##### Term frequency\n", 238 | ">The more often a term appears in a field in a single document, the more relevant the document.\n", 239 | "\n", 240 | "##### Inverse document frequency\n", 241 | ">The more often a term appears in a field in all documents in the index, the less relevant is that term.\n", 242 | "\n", 243 | "When searching against multiple fields, TF/IDF can introduce some surprising results.\n", 244 | "\n", 245 | "Consider searching for “Peter Smith” using `first_name` and `last_name` fields. Peter is a common first name and Smith is a common last name, so both will have low IDFs. But what if we have another person in the index whose name is Smith Williams? Smith as a first name is very uncommon and so will have a high IDF!\n", 246 | "\n", 247 | "A simple query like the following may well return Smith Williams above Peter Smith in spite of the fact that the second person is a better match than the first.\n", 248 | "\n", 249 | "`\n", 250 | "{\n", 251 | " \"query\": {\n", 252 | " \"multi_match\": {\n", 253 | " \"query\": \"Peter Smith\",\n", 254 | " \"type\": \"most_fields\",\n", 255 | " \"fields\": [ \"*_name\" ]\n", 256 | " }\n", 257 | " }\n", 258 | "}\n", 259 | "`\n", 260 | "The high IDF of smith in the first name field can overwhelm the two low IDFs of peter as a first name and smith as a last name.\n", 261 | "\n", 262 | "#### Solution\n", 263 | "\n", 264 | "These problems only exist because we are dealing with multiple fields. If we were to combine all of these fields into a single field, the problems would vanish. We could achieve this by adding a full_name field to our person document:\n", 265 | "\n", 266 | "`\n", 267 | "{\n", 268 | " \"first_name\": \"Peter\",\n", 269 | " \"last_name\": \"Smith\",\n", 270 | " \"full_name\": \"Peter Smith\"\n", 271 | "}`\n", 272 | "\n", 273 | "When querying just the full_name field:\n", 274 | "\n", 275 | "* Documents with more matching words would trump documents with the same word repeated.\n", 276 | "* The minimum_should_match and operator parameters would function as expected.\n", 277 | "* The inverse document frequencies for first and last names would be combined so it wouldn’t matter whether Smith were a first or last name anymore.\n", 278 | "\n", 279 | "While this would work, we don’t like having to store redundant data. Instead, Elasticsearch offers us two solutions—one at index time and one at search time:\n", 280 | "\n", 281 | "#### Custom `_all` Fields\n", 282 | "\n", 283 | "The [Metadata: _all Field](https://www.elastic.co/guide/en/elasticsearch/guide/master/root-object.html#all-field) stored all values from all fields as one big string. A more flexible approach is an `_all` field for the person’s name, and another custom `_all` field for the address. 
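As a rough Python sketch of this index-time approach (an addition, not from the book: it assumes a fresh index called `my_index`, keeps the book's `person`, `first_name`, `last_name` and `full_name` names, and substitutes the ES 5.x `text` type for the book's `string`), the combined field could be built and queried like this; the raw mapping it is based on follows below:

```
# Sketch: copy first_name and last_name into a single full_name field at index time.
mapping = {
    "mappings": {
        "person": {
            "properties": {
                "first_name": {"type": "text", "copy_to": "full_name"},
                "last_name":  {"type": "text", "copy_to": "full_name"},
                "full_name":  {"type": "text"}
            }
        }
    }
}
es.indices.delete(index='my_index', ignore=404)   # assumes my_index may be recreated
es.indices.create(index='my_index', body=mapping)

es.create(index='my_index', doc_type='person', id=1,
          body={"first_name": "Peter", "last_name": "Smith"})
es.create(index='my_index', doc_type='person', id=2,
          body={"first_name": "Smith", "last_name": "Williams"})
es.indices.refresh(index='my_index')

# Query the single combined field: term frequencies are blended,
# and operator / minimum_should_match behave term-centrically.
body = {
    "query": {
        "match": {
            "full_name": {
                "query": "Peter Smith",
                "operator": "and"
            }
        }
    }
}
res = es.search(index='my_index', body=body)
for hit in res['hits']['hits']:
    print(hit['_score'], hit['_source'])
```

In this sketch only the first document (Peter Smith) comes back, because Smith Williams does not contain the term peter in `full_name`; that is exactly the term-centric behaviour the `most_fields` query could not provide.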
\n", 284 | "\n", 285 | "This can be done using the `copy_to` parameter in field mappings:\n", 286 | "\n", 287 | "`PUT /my_index\n", 288 | "{\n", 289 | " \"mappings\": {\n", 290 | " \"person\": {\n", 291 | " \"properties\": {\n", 292 | " \"first_name\": {\n", 293 | " \"type\": \"string\",\n", 294 | " \"copy_to\": \"full_name\" \n", 295 | " },\n", 296 | " \"last_name\": {\n", 297 | " \"type\": \"string\",\n", 298 | " \"copy_to\": \"full_name\" \n", 299 | " },\n", 300 | " \"full_name\": {\n", 301 | " \"type\": \"string\"\n", 302 | " }\n", 303 | " }\n", 304 | " }\n", 305 | " }\n", 306 | "}\n", 307 | "`" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": {}, 313 | "source": [ 314 | "With this mapping in place, we can query the `first_name` field for first names, the `last_name` field for last name, or the `full_name` field for first and last names.\n", 315 | "\n", 316 | "**NOTE:** The copy_to setting will not work on a multi-field. If you attempt to configure your mapping this way, Elasticsearch will throw an exception.\n", 317 | "\n", 318 | "Just add the `copy_to` to the main field, **not** the multi-field:\n", 319 | "\n", 320 | "`\n", 321 | "PUT /my_index\n", 322 | "{\n", 323 | " \"mappings\": {\n", 324 | " \"person\": {\n", 325 | " \"properties\": {\n", 326 | " \"first_name\": {\n", 327 | " \"type\": \"string\",\n", 328 | " \"copy_to\": \"full_name\", \n", 329 | " \"fields\": {\n", 330 | " \"raw\": {\n", 331 | " \"type\": \"string\",\n", 332 | " \"index\": \"not_analyzed\"\n", 333 | " }\n", 334 | " }\n", 335 | " },\n", 336 | " \"full_name\": {\n", 337 | " \"type\": \"string\"\n", 338 | " }\n", 339 | " }\n", 340 | " }\n", 341 | " }\n", 342 | "}\n", 343 | "`" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": { 350 | "collapsed": true 351 | }, 352 | "outputs": [], 353 | "source": [] 354 | } 355 | ], 356 | "metadata": { 357 | "kernelspec": { 358 | "display_name": "Python 3", 359 | "language": "python", 360 | "name": "python3" 361 | }, 362 | "language_info": { 363 | "codemirror_mode": { 364 | "name": "ipython", 365 | "version": 3 366 | }, 367 | "file_extension": ".py", 368 | "mimetype": "text/x-python", 369 | "name": "python", 370 | "nbconvert_exporter": "python", 371 | "pygments_lexer": "ipython3", 372 | "version": "3.5.1" 373 | } 374 | }, 375 | "nbformat": 4, 376 | "nbformat_minor": 0 377 | } 378 | -------------------------------------------------------------------------------- /Dealing with Human Language/Getting Started with Languages.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Elasticsearch: The Definitive Guide - Python\n", 8 | "\n", 9 | "Following the examples in the book, here are Python snippets that achieve the same effect.\n", 10 | "\n", 11 | "Documentation for the Python libs:\n", 12 | "\n", 13 | "Low-level API:\n", 14 | "\n", 15 | "https://elasticsearch-py.readthedocs.io/en/master/index.html\n", 16 | "\n", 17 | "Expressive DSL API (more \"Pythonic\")\n", 18 | "\n", 19 | "http://elasticsearch-dsl.readthedocs.io/en/latest/index.html\n", 20 | "\n", 21 | "Github repo for DSL API:\n", 22 | "\n", 23 | "https://github.com/elastic/elasticsearch-dsl-py\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 9, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "import sys, os\n", 35 | "sys.path.insert(1, 
os.path.join(sys.path[0], '..'))" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 12, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "14 items created\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "import index\n", 55 | "from elasticsearch import Elasticsearch\n", 56 | "from elasticsearch_dsl import Search, Q\n", 57 | "from pprint import pprint\n", 58 | "\n", 59 | "es = Elasticsearch(\n", 60 | " 'localhost',\n", 61 | " # sniff before doing anything\n", 62 | " sniff_on_start=True,\n", 63 | " # refresh nodes after a node fails to respond\n", 64 | " sniff_on_connection_fail=True,\n", 65 | " # and also every 60 seconds\n", 66 | " sniffer_timeout=60\n", 67 | ")\n", 68 | "\n", 69 | "r = index.populate()\n", 70 | "print('{} items created'.format(len(r['items'])))\n", 71 | "\n", 72 | "# Let's repopulate the index as we deleted 'gb' in earlier chapters:\n", 73 | "# Run the script: populate.ipynb" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "### Getting Started with Languages\n", 81 | "\n", 82 | "Full-text search is a battle between precision—returning as few irrelevant documents as possible—and recall—returning as many relevant documents as possible.\n", 83 | "\n", 84 | "Many tactics can be deployed to tackle precision and recall, such as modifying words: e.g. search for \"jumping\", \"jumps\" and \"jumped\" by reducing words to their stem (root form) - \"jump\".\n", 85 | "\n", 86 | "However, the first step is to identify words using an analyzer:\n", 87 | "\n", 88 | "##### Tokenize text into individual words:\n", 89 | "\n", 90 | "```The quick brown foxes → [The, quick, brown, foxes]```\n", 91 | "\n", 92 | "##### Lowercase tokens:\n", 93 | "\n", 94 | "```The → the```\n", 95 | "\n", 96 | "##### Remove common stopwords:\n", 97 | "\n", 98 | "```[The, quick, brown, foxes] → [quick, brown, foxes]```\n", 99 | "\n", 100 | "##### Stem tokens to their root form:\n", 101 | "\n", 102 | "```foxes → fox```\n", 103 | "\n", 104 | "Each analyzer may also apply other transformations specific to its language in order to make words from that language more searchable:\n", 105 | "\n", 106 | "##### The english analyzer removes the possessive 's:\n", 107 | "\n", 108 | "```John's → john```\n", 109 | "\n", 110 | "##### The french analyzer removes elisions like l' and qu' and diacritics like ¨ or ^:\n", 111 | "\n", 112 | "```l'église → eglis```\n", 113 | "\n", 114 | "##### The german analyzer normalizes terms, replacing ä and ae with a, or ß with ss, among others:\n", 115 | "\n", 116 | "```äußerst → ausserst```\n", 117 | "\n", 118 | "### Using Language Analyzers\n", 119 | "\n", 120 | "The built-in language analyzers are available globally and don’t need to be configured before being used. 
They can be specified directly in the field mapping:\n", 121 | "\n", 122 | "```\n", 123 | "PUT /my_index\n", 124 | "{\n", 125 | " \"mappings\": {\n", 126 | " \"blog\": {\n", 127 | " \"properties\": {\n", 128 | " \"title\": {\n", 129 | " \"type\": \"string\",\n", 130 | " \"analyzer\": \"english\" \n", 131 | " }\n", 132 | " }\n", 133 | " }\n", 134 | " }\n", 135 | "}\n", 136 | "```" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 37, 142 | "metadata": { 143 | "collapsed": false 144 | }, 145 | "outputs": [ 146 | { 147 | "name": "stdout", 148 | "output_type": "stream", 149 | "text": [ 150 | "i'm,happi,about,fox\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "#english (language)\n", 156 | "text = 'I\\'m not happy about the foxes'\n", 157 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 158 | " (analyzer='english', body=text)['tokens']]\n", 159 | "print(','.join(analyzed_text))" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "We can’t tell if a document mentions one fox or many foxes; the word 'not' is a stopword and is removed, so we can’t tell whether the document is happy about foxes or not. By using the english analyzer, we have **increased recall** as we can match more loosely, but we have reduced our ability to rank documents accurately.\n", 167 | "\n", 168 | "To get the best of both worlds, we can use multifields to index the title field twice: once with the english analyzer and once with the standard analyzer:\n", 169 | "\n", 170 | "```\n", 171 | "PUT /my_index\n", 172 | "{\n", 173 | " \"mappings\": {\n", 174 | " \"blog\": {\n", 175 | " \"properties\": {\n", 176 | " \"title\": { \n", 177 | " \"type\": \"string\",\n", 178 | " \"fields\": {\n", 179 | " \"english\": { \n", 180 | " \"type\": \"string\",\n", 181 | " \"analyzer\": \"english\"\n", 182 | " }\n", 183 | " }\n", 184 | " }\n", 185 | " }\n", 186 | " }\n", 187 | " }\n", 188 | "}\n", 189 | "```" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 18, 195 | "metadata": { 196 | "collapsed": true 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "index_template = {\n", 201 | " \"mappings\": {\n", 202 | " \"blog\": {\n", 203 | " \"properties\": {\n", 204 | " \"title\": { \n", 205 | " \"type\": \"text\",\n", 206 | " \"fields\": {\n", 207 | " \"english\": { \n", 208 | " \"type\": \"text\",\n", 209 | " \"analyzer\": \"english\"\n", 210 | " }\n", 211 | " }\n", 212 | " }\n", 213 | " }\n", 214 | " }\n", 215 | " }\n", 216 | "}\n" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 19, 222 | "metadata": { 223 | "collapsed": false 224 | }, 225 | "outputs": [ 226 | { 227 | "data": { 228 | "text/plain": [ 229 | "{'acknowledged': True, 'shards_acknowledged': True}" 230 | ] 231 | }, 232 | "execution_count": 19, 233 | "metadata": {}, 234 | "output_type": "execute_result" 235 | } 236 | ], 237 | "source": [ 238 | "es.indices.create(index='my_index', body=index_template)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 21, 244 | "metadata": { 245 | "collapsed": false 246 | }, 247 | "outputs": [ 248 | { 249 | "data": { 250 | "text/plain": [ 251 | "{'_id': '1',\n", 252 | " '_index': 'my_index',\n", 253 | " '_shards': {'failed': 0, 'successful': 1, 'total': 2},\n", 254 | " '_type': 'blog',\n", 255 | " '_version': 1,\n", 256 | " 'created': True,\n", 257 | " 'result': 'created'}" 258 | ] 259 | }, 260 | "execution_count": 21, 261 | "metadata": {}, 262 | "output_type": 
"execute_result" 263 | } 264 | ], 265 | "source": [ 266 | "data = { \"title\": \"I'm happy for this fox\" }\n", 267 | "es.create(index='my_index', doc_type='blog', body=data, id=1)" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 22, 273 | "metadata": { 274 | "collapsed": false 275 | }, 276 | "outputs": [ 277 | { 278 | "data": { 279 | "text/plain": [ 280 | "{'_id': '2',\n", 281 | " '_index': 'my_index',\n", 282 | " '_shards': {'failed': 0, 'successful': 1, 'total': 2},\n", 283 | " '_type': 'blog',\n", 284 | " '_version': 1,\n", 285 | " 'created': True,\n", 286 | " 'result': 'created'}" 287 | ] 288 | }, 289 | "execution_count": 22, 290 | "metadata": {}, 291 | "output_type": "execute_result" 292 | } 293 | ], 294 | "source": [ 295 | "data = { \"title\": \"I'm not happy about my fox problem\" }\n", 296 | "es.create(index='my_index', doc_type='blog', body=data, id=2)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 29, 302 | "metadata": { 303 | "collapsed": false 304 | }, 305 | "outputs": [ 306 | { 307 | "name": "stdout", 308 | "output_type": "stream", 309 | "text": [ 310 | "I'm not happy about my fox problem\n", 311 | "I'm happy for this fox\n" 312 | ] 313 | } 314 | ], 315 | "source": [ 316 | "s = Search(using=es, index='my_index', doc_type='blog')\n", 317 | "q = Q('multi_match', type='most_fields', query='not happy foxes', fields=['title', 'title.english'])\n", 318 | "s = s.query()\n", 319 | "res = s.execute()\n", 320 | "for hit in res:\n", 321 | " print(hit.title)" 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": {}, 327 | "source": [ 328 | "Note that both hits **do not** contain the word foxes, but we got a hit on fox.\n", 329 | "\n", 330 | "Use the ```most_fields``` query type to match the same text in as many fields as possible." 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "### Configuring Lanuage Analyzers\n", 338 | "\n", 339 | "It might be useful to avoid stemming words (like \"organization\" --> organ) if you know this will sacrifice certain precision requirements (e.g. seaches for \"world health organization\"). It is possible to configure the analyzers, e.g. 
to exclude certain stop words or stems:\n", 340 | "\n", 341 | "```\n", 342 | "PUT /my_index\n", 343 | "{\n", 344 | " \"settings\": {\n", 345 | " \"analysis\": {\n", 346 | " \"analyzer\": {\n", 347 | " \"my_english\": {\n", 348 | " \"type\": \"english\",\n", 349 | " \"stem_exclusion\": [ \"organization\", \"organizations\" ], \n", 350 | " \"stopwords\": [ \n", 351 | " \"a\", \"an\", \"and\", \"are\", \"as\", \"at\", \"be\", \"but\", \"by\", \"for\",\n", 352 | " \"if\", \"in\", \"into\", \"is\", \"it\", \"of\", \"on\", \"or\", \"such\", \"that\",\n", 353 | " \"the\", \"their\", \"then\", \"there\", \"these\", \"they\", \"this\", \"to\",\n", 354 | " \"was\", \"will\", \"with\"\n", 355 | " ]\n", 356 | " }\n", 357 | " }\n", 358 | " }\n", 359 | " }\n", 360 | "}\n", 361 | "\n", 362 | "GET /my_index/_analyze?analyzer=my_english \n", 363 | "The World Health Organization does not sell organs.\n", 364 | "```\n", 365 | "\n" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 31, 371 | "metadata": { 372 | "collapsed": false 373 | }, 374 | "outputs": [ 375 | { 376 | "data": { 377 | "text/plain": [ 378 | "{'acknowledged': True, 'shards_acknowledged': True}" 379 | ] 380 | }, 381 | "execution_count": 31, 382 | "metadata": {}, 383 | "output_type": "execute_result" 384 | } 385 | ], 386 | "source": [ 387 | "es.indices.delete(index='my_index')\n", 388 | "index_template_with_exclusions = \\\n", 389 | "{\n", 390 | " \"settings\": {\n", 391 | " \"analysis\": {\n", 392 | " \"analyzer\": {\n", 393 | " \"my_english\": {\n", 394 | " \"type\": \"english\",\n", 395 | " \"stem_exclusion\": [ \"organization\", \"organizations\" ], \n", 396 | " \"stopwords\": [ \n", 397 | " \"a\", \"an\", \"and\", \"are\", \"as\", \"at\", \"be\", \"but\", \"by\", \"for\",\n", 398 | " \"if\", \"in\", \"into\", \"is\", \"it\", \"of\", \"on\", \"or\", \"such\", \"that\",\n", 399 | " \"the\", \"their\", \"then\", \"there\", \"these\", \"they\", \"this\", \"to\",\n", 400 | " \"was\", \"will\", \"with\"\n", 401 | " ]\n", 402 | " }\n", 403 | " }\n", 404 | " }\n", 405 | " }\n", 406 | "}\n", 407 | "\n", 408 | "es.indices.create(index='my_index', body=index_template_with_exclusions)" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 36, 414 | "metadata": { 415 | "collapsed": false 416 | }, 417 | "outputs": [ 418 | { 419 | "name": "stdout", 420 | "output_type": "stream", 421 | "text": [ 422 | "world,health,organization,doe,not,sell,organ\n" 423 | ] 424 | } 425 | ], 426 | "source": [ 427 | "#english (language) with exclusions - my_english\n", 428 | "text = 'The World Health Organization does not sell organs.'\n", 429 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 430 | " (index='my_index', analyzer='my_english', body=text)['tokens']]\n", 431 | "print(','.join(analyzed_text))" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": { 438 | "collapsed": true 439 | }, 440 | "outputs": [], 441 | "source": [] 442 | } 443 | ], 444 | "metadata": { 445 | "kernelspec": { 446 | "display_name": "Python 3", 447 | "language": "python", 448 | "name": "python3" 449 | }, 450 | "language_info": { 451 | "codemirror_mode": { 452 | "name": "ipython", 453 | "version": 3 454 | }, 455 | "file_extension": ".py", 456 | "mimetype": "text/x-python", 457 | "name": "python", 458 | "nbconvert_exporter": "python", 459 | "pygments_lexer": "ipython3", 460 | "version": "3.5.1" 461 | } 462 | }, 463 | "nbformat": 4, 464 | "nbformat_minor": 0 465 | } 466 | 
-------------------------------------------------------------------------------- /Dealing with Human Language/Typoes and Mispelings.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Elasticsearch: The Definitive Guide - Python\n", 8 | "\n", 9 | "Following the examples in the book, here are Python snippets that achieve the same effect.\n", 10 | "\n", 11 | "Documentation for the Python libs:\n", 12 | "\n", 13 | "Low-level API:\n", 14 | "\n", 15 | "https://elasticsearch-py.readthedocs.io/en/master/index.html\n", 16 | "\n", 17 | "Expressive DSL API (more \"Pythonic\")\n", 18 | "\n", 19 | "http://elasticsearch-dsl.readthedocs.io/en/latest/index.html\n", 20 | "\n", 21 | "Github repo for DSL API:\n", 22 | "\n", 23 | "https://github.com/elastic/elasticsearch-dsl-py\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "import sys, os\n", 35 | "sys.path.insert(1, os.path.join(sys.path[0], '..'))" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "14 items created\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "import index\n", 55 | "from elasticsearch import Elasticsearch\n", 56 | "from elasticsearch_dsl import Search, Q\n", 57 | "from pprint import pprint\n", 58 | "\n", 59 | "es = Elasticsearch(\n", 60 | " 'localhost',\n", 61 | " # sniff before doing anything\n", 62 | " sniff_on_start=True,\n", 63 | " # refresh nodes after a node fails to respond\n", 64 | " sniff_on_connection_fail=True,\n", 65 | " # and also every 60 seconds\n", 66 | " sniffer_timeout=60\n", 67 | ")\n", 68 | "\n", 69 | "r = index.populate()\n", 70 | "print('{} items created'.format(len(r['items'])))\n", 71 | "\n", 72 | "# Let's repopulate the index as we deleted 'gb' in earlier chapters:\n", 73 | "# Run the script: populate.ipynb" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": { 79 | "collapsed": true 80 | }, 81 | "source": [ 82 | "### Typoes and Mispelings\n", 83 | "\n", 84 | "full-text search that only matches exactly will probably frustrate your users. Wouldn’t you expect a search for “quick brown fox” to match a document containing “fast brown foxes,” “Johnny Walker” to match “Johnnie Walker,” or “Arnold Shcwarzenneger” to match “Arnold Schwarzenegger”?\n", 85 | "\n", 86 | "Fuzzy matching allows for query-time matching of misspelled words, while phonetic token filters at index time can be used for sounds-like matching.\n", 87 | "\n", 88 | "#### Fuzziness\n", 89 | "\n", 90 | "Fuzzy matching treats two words that are “fuzzily” similar as if they were the same word. First, we need to define what we mean by fuzziness. It is the concept of distance - e.g. Damerau-Levenshtein distance.\n", 91 | "\n", 92 | "Damerau observed that 80% of human misspellings have an edit distance of 1. In other words, 80% of misspellings could be corrected with a single edit to the original string.\n", 93 | "\n", 94 | "Elasticsearch supports a maximum edit distance, specified with the fuzziness parameter, of 2.\n", 95 | "\n", 96 | "Of course, the impact that a single edit has on a string depends on the length of the string. 
Two edits to the word hat can produce mad, so allowing two edits on a string of length 3 is overkill. The fuzziness parameter can be set to AUTO, which results in the following maximum edit distances:\n", 97 | "\n", 98 | "* 0 for strings of one or two characters\n", 99 | "* 1 for strings of three, four, or five characters\n", 100 | "* 2 for strings of more than five characters\n", 101 | "\n", 102 | "Of course, you may find that an edit distance of 2 is still overkill, and returns results that don’t appear to be related. You may get better results, and better performance, with a maximum fuzziness of 1." 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 33, 108 | "metadata": { 109 | "collapsed": false 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "data = ['Surprise me!', 'That was surprising.', 'I wasn\\'t surprised.']\n", 114 | "for i,txt in enumerate(data):\n", 115 | " body = { \"text\": \"\"}\n", 116 | " body['text'] = txt\n", 117 | " es.create(index='my_index', doc_type='my_type', id=i, body=body)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 35, 123 | "metadata": { 124 | "collapsed": false 125 | }, 126 | "outputs": [ 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "{'_shards': {'failed': 0, 'successful': 16, 'total': 16},\n", 131 | " 'hits': {'hits': [{'_id': '0',\n", 132 | " '_index': 'my_index',\n", 133 | " '_score': 0.22585157,\n", 134 | " '_source': {'text': 'Surprise me!'},\n", 135 | " '_type': 'my_type'},\n", 136 | " {'_id': '2',\n", 137 | " '_index': 'my_index',\n", 138 | " '_score': 0.1898702,\n", 139 | " '_source': {'text': \"I wasn't surprised.\"},\n", 140 | " '_type': 'my_type'}],\n", 141 | " 'max_score': 0.22585157,\n", 142 | " 'total': 2},\n", 143 | " 'timed_out': False,\n", 144 | " 'took': 5}" 145 | ] 146 | }, 147 | "execution_count": 35, 148 | "metadata": {}, 149 | "output_type": "execute_result" 150 | } 151 | ], 152 | "source": [ 153 | "body = {\n", 154 | " \"query\": {\n", 155 | " \"fuzzy\": {\n", 156 | " \"text\": \"surprize\"\n", 157 | " }\n", 158 | " }\n", 159 | "}\n", 160 | "es.search(body=body)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "The fuzzy query is a term-level query, so it doesn’t do any analysis. It takes a single term and finds all terms in the term dictionary that are within the specified fuzziness. The default fuzziness is AUTO.\n", 168 | "\n", 169 | "In our example, surprize is within an edit distance of 2 from both surprise and surprised, so documents 1 and 3 match. 
We could reduce the matches to just surprise with the following query:\n" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 36, 175 | "metadata": { 176 | "collapsed": false 177 | }, 178 | "outputs": [ 179 | { 180 | "data": { 181 | "text/plain": [ 182 | "{'_shards': {'failed': 0, 'successful': 16, 'total': 16},\n", 183 | " 'hits': {'hits': [{'_id': '0',\n", 184 | " '_index': 'my_index',\n", 185 | " '_score': 0.22585157,\n", 186 | " '_source': {'text': 'Surprise me!'},\n", 187 | " '_type': 'my_type'}],\n", 188 | " 'max_score': 0.22585157,\n", 189 | " 'total': 1},\n", 190 | " 'timed_out': False,\n", 191 | " 'took': 3}" 192 | ] 193 | }, 194 | "execution_count": 36, 195 | "metadata": {}, 196 | "output_type": "execute_result" 197 | } 198 | ], 199 | "source": [ 200 | "body = {\n", 201 | " \"query\": {\n", 202 | " \"fuzzy\": {\n", 203 | " \"text\": {\n", 204 | " \"value\": \"surprize\",\n", 205 | " \"fuzziness\": 1\n", 206 | " }\n", 207 | " }\n", 208 | " }\n", 209 | "}\n", 210 | "es.search(body=body)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "#### Improving Performance\n", 218 | "\n", 219 | "The fuzzy query works by taking the original term and building a Levenshtein automaton—like a big graph representing all the strings that are within the specified edit distance of the original string.\n", 220 | "\n", 221 | "The fuzzy query then uses the automaton to step efficiently through all of the terms in the term dictionary to see if they match. Once it has collected all of the matching terms that exist in the term dictionary, it can compute the list of matching documents.\n", 222 | "\n", 223 | "Of course, depending on the type of data stored in the index, a fuzzy query with an edit distance of 2 can match a very large number of terms and perform very badly. Two parameters can be used to limit the performance impact:\n", 224 | "\n", 225 | "##### prefix_length\n", 226 | "\n", 227 | ">The number of initial characters that will not be “fuzzified.” **Most spelling errors occur toward the end of the word, not toward the beginning.** By using a prefix_length of 3, for example, you can signficantly reduce the number of matching terms.\n", 228 | "\n", 229 | "##### max_expansions\n", 230 | "\n", 231 | ">If a fuzzy query expands to three or four fuzzy options, the new options may be meaningful. If it produces 1,000 options, they are essentially meaningless. Use max_expansions to limit the total number of options that will be produced. 
The fuzzy query will collect matching terms until it runs out of terms or reaches the max_expansions limit.\n", 232 | "\n", 233 | "#### Fuzzy Match Query\n", 234 | "\n", 235 | "The `match` query supports fuzzy matching out of the box:" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 37, 241 | "metadata": { 242 | "collapsed": false 243 | }, 244 | "outputs": [ 245 | { 246 | "data": { 247 | "text/plain": [ 248 | "{'_shards': {'failed': 0, 'successful': 16, 'total': 16},\n", 249 | " 'hits': {'hits': [{'_id': '0',\n", 250 | " '_index': 'my_index',\n", 251 | " '_score': 0.48396763,\n", 252 | " '_source': {'text': 'Surprise me!'},\n", 253 | " '_type': 'my_type'}],\n", 254 | " 'max_score': 0.48396763,\n", 255 | " 'total': 1},\n", 256 | " 'timed_out': False,\n", 257 | " 'took': 6}" 258 | ] 259 | }, 260 | "execution_count": 37, 261 | "metadata": {}, 262 | "output_type": "execute_result" 263 | } 264 | ], 265 | "source": [ 266 | "body= {\n", 267 | " \"query\": {\n", 268 | " \"match\": {\n", 269 | " \"text\": {\n", 270 | " \"query\": \"SURPRIZE ME!\",\n", 271 | " \"fuzziness\": \"AUTO\",\n", 272 | " \"operator\": \"and\"\n", 273 | " }\n", 274 | " }\n", 275 | " }\n", 276 | "}\n", 277 | "es.search(body=body)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 38, 283 | "metadata": { 284 | "collapsed": false 285 | }, 286 | "outputs": [ 287 | { 288 | "data": { 289 | "text/plain": [ 290 | "{'_shards': {'failed': 0, 'successful': 16, 'total': 16},\n", 291 | " 'hits': {'hits': [{'_id': '0',\n", 292 | " '_index': 'my_index',\n", 293 | " '_score': 0.48396763,\n", 294 | " '_source': {'text': 'Surprise me!'},\n", 295 | " '_type': 'my_type'},\n", 296 | " {'_id': '2',\n", 297 | " '_index': 'my_index',\n", 298 | " '_score': 0.1898702,\n", 299 | " '_source': {'text': \"I wasn't surprised.\"},\n", 300 | " '_type': 'my_type'}],\n", 301 | " 'max_score': 0.48396763,\n", 302 | " 'total': 2},\n", 303 | " 'timed_out': False,\n", 304 | " 'took': 7}" 305 | ] 306 | }, 307 | "execution_count": 38, 308 | "metadata": {}, 309 | "output_type": "execute_result" 310 | } 311 | ], 312 | "source": [ 313 | "body = {\n", 314 | " \"query\": {\n", 315 | " \"multi_match\": {\n", 316 | " \"fields\": [ \"text\", \"title\" ],\n", 317 | " \"query\": \"SURPRIZE ME!\",\n", 318 | " \"fuzziness\": \"AUTO\"\n", 319 | " }\n", 320 | " }\n", 321 | "}\n", 322 | "es.search(body=body)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 39, 328 | "metadata": { 329 | "collapsed": true 330 | }, 331 | "outputs": [], 332 | "source": [ 333 | "# Let's add some more data to test how fuzziness relates to relevance:\n", 334 | "data = ['The element of surprize!', 'That is surprising.', 'Inside every Kinder egg is a surprise.']\n", 335 | "for i,txt in enumerate(data):\n", 336 | " body = { \"text\": \"\"}\n", 337 | " body['text'] = txt\n", 338 | " es.create(index='my_index', doc_type='my_type', id=i+3, body=body)" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 41, 344 | "metadata": { 345 | "collapsed": false 346 | }, 347 | "outputs": [ 348 | { 349 | "data": { 350 | "text/plain": [ 351 | "{'_shards': {'failed': 0, 'successful': 16, 'total': 16},\n", 352 | " 'hits': {'hits': [{'_id': '2',\n", 353 | " '_index': 'my_index',\n", 354 | " '_score': 0.45747715,\n", 355 | " '_source': {'text': \"I wasn't surprised.\"},\n", 356 | " '_type': 'my_type'},\n", 357 | " {'_id': '3',\n", 358 | " '_index': 'my_index',\n", 359 | " '_score': 0.2876821,\n", 360 | " '_source': 
{'text': 'The element of surprize!'},\n", 361 | " '_type': 'my_type'},\n", 362 | " {'_id': '5',\n", 363 | " '_index': 'my_index',\n", 364 | " '_score': 0.2500978,\n", 365 | " '_source': {'text': 'Inside every Kinder egg is a surprise.'},\n", 366 | " '_type': 'my_type'},\n", 367 | " {'_id': '0',\n", 368 | " '_index': 'my_index',\n", 369 | " '_score': 0.22585157,\n", 370 | " '_source': {'text': 'Surprise me!'},\n", 371 | " '_type': 'my_type'}],\n", 372 | " 'max_score': 0.45747715,\n", 373 | " 'total': 4},\n", 374 | " 'timed_out': False,\n", 375 | " 'took': 8}" 376 | ] 377 | }, 378 | "execution_count": 41, 379 | "metadata": {}, 380 | "output_type": "execute_result" 381 | } 382 | ], 383 | "source": [ 384 | "body= {\n", 385 | " \"query\": {\n", 386 | " \"match\": {\n", 387 | " \"text\": {\n", 388 | " \"query\": \"SURPRIZE!\",\n", 389 | " \"fuzziness\": \"AUTO\"\n", 390 | " }\n", 391 | " }\n", 392 | " }\n", 393 | "}\n", 394 | "es.search(body=body)" 395 | ] 396 | }, 397 | { 398 | "cell_type": "markdown", 399 | "metadata": {}, 400 | "source": [ 401 | "#### Scoring Fuzziness\n", 402 | "\n", 403 | "Imagine that we have 1,000 documents containing “Schwarzenegger,” and just one document with the misspelling “Schwarzeneger.” According to the theory of term frequency/inverse document frequency, the misspelling is much more relevant than the correct spelling, because it appears in far fewer documents!\n", 404 | "\n", 405 | "\n", 406 | "Fuzzy queries alone are much less useful than they initially appear. They are better used as part of a “bigger” feature, such as the search-as-you-type completion suggester or the did-you-mean phrase suggester.\n", 407 | "\n", 408 | "#### Phonetic Matching\n", 409 | "\n", 410 | "It might be useful to match by phonetic similarity - words that sound similar (despite different spellings):\n" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 44, 416 | "metadata": { 417 | "collapsed": false 418 | }, 419 | "outputs": [], 420 | "source": [ 421 | "settings = {\n", 422 | " \"settings\": {\n", 423 | " \"analysis\": {\n", 424 | " \"filter\": {\n", 425 | " \"dbl_metaphone\": { \n", 426 | " \"type\": \"phonetic\",\n", 427 | " \"encoder\": \"double_metaphone\"\n", 428 | " }\n", 429 | " },\n", 430 | " \"analyzer\": {\n", 431 | " \"dbl_metaphone\": {\n", 432 | " \"tokenizer\": \"standard\",\n", 433 | " \"filter\": \"dbl_metaphone\" \n", 434 | " }\n", 435 | " }\n", 436 | " }\n", 437 | " }\n", 438 | "}\n" 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "metadata": {}, 444 | "source": [ 445 | "This won't work as it needs a plug-in for [Phoentic analysis](\n", 446 | "https://www.elastic.co/guide/en/elasticsearch/plugins/5.2/analysis-phonetic.html)" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "metadata": { 453 | "collapsed": true 454 | }, 455 | "outputs": [], 456 | "source": [] 457 | } 458 | ], 459 | "metadata": { 460 | "kernelspec": { 461 | "display_name": "Python 3", 462 | "language": "python", 463 | "name": "python3" 464 | }, 465 | "language_info": { 466 | "codemirror_mode": { 467 | "name": "ipython", 468 | "version": 3 469 | }, 470 | "file_extension": ".py", 471 | "mimetype": "text/x-python", 472 | "name": "python", 473 | "nbconvert_exporter": "python", 474 | "pygments_lexer": "ipython3", 475 | "version": "3.5.1" 476 | } 477 | }, 478 | "nbformat": 4, 479 | "nbformat_minor": 0 480 | } 481 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/Searching - The 
Basic Tools-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Elasticsearch: The Definitive Guide - Python\n", 8 | "\n", 9 | "Following the examples in the book, here are Python snippets that achieve the same effect.\n", 10 | "\n", 11 | "Documentation for the Python libs:\n", 12 | "\n", 13 | "Low-level API:\n", 14 | "\n", 15 | "https://elasticsearch-py.readthedocs.io/en/master/index.html\n", 16 | "\n", 17 | "Expressive DSL API (more \"Pythonic\")\n", 18 | "\n", 19 | "http://elasticsearch-dsl.readthedocs.io/en/latest/index.html\n", 20 | "\n", 21 | "Github repo for DSL API:\n", 22 | "\n", 23 | "https://github.com/elastic/elasticsearch-dsl-py\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "from elasticsearch import Elasticsearch\n", 35 | "from elasticsearch_dsl import Search, Q\n", 36 | "from pprint import pprint\n", 37 | "\n", 38 | "es = Elasticsearch(\n", 39 | " 'localhost',\n", 40 | " # sniff before doing anything\n", 41 | " sniff_on_start=True,\n", 42 | " # refresh nodes after a node fails to respond\n", 43 | " sniff_on_connection_fail=True,\n", 44 | " # and also every 60 seconds\n", 45 | " sniffer_timeout=60\n", 46 | ")" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "### Empty Search\n", 54 | "From: https://www.elastic.co/guide/en/elasticsearch/guide/master/empty-search.html\n", 55 | "\n", 56 | ">GET _search" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 2, 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "res = es.search('_all') # same as es.search()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 3, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "#from pprint import pprint\n", 79 | "#pprint(res)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 4, 85 | "metadata": { 86 | "collapsed": false 87 | }, 88 | "outputs": [ 89 | { 90 | "data": { 91 | "text/plain": [ 92 | ", , , , , , , , , ]>" 93 | ] 94 | }, 95 | "execution_count": 4, 96 | "metadata": {}, 97 | "output_type": "execute_result" 98 | } 99 | ], 100 | "source": [ 101 | "s = Search(using=es)\n", 102 | "response = s.execute()\n", 103 | "response" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "With timeout:\n", 111 | "\n", 112 | ">GET /_search?timeout=10ms" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 5, 118 | "metadata": { 119 | "collapsed": false 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "res = es.search('_all', timeout='10ms') # same as es.search(timeout='10ms')" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 6, 129 | "metadata": { 130 | "collapsed": false 131 | }, 132 | "outputs": [ 133 | { 134 | "name": "stdout", 135 | "output_type": "stream", 136 | "text": [ 137 | "\n", 138 | "\n", 139 | "\n", 140 | "\n", 141 | "\n", 142 | "\n", 143 | "\n", 144 | "\n", 145 | "\n", 146 | "\n" 147 | ] 148 | } 149 | ], 150 | "source": [ 151 | "# To see the results, we can iterate:\n", 152 | "# Elasticsearch pages the results (to 10 hits)\n", 153 | "for hit in s:\n", 154 | " print(hit)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | 
"metadata": {}, 160 | "source": [ 161 | "### Multi-index, Multitype\n", 162 | "\n", 163 | "First using the low-level API" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 7, 169 | "metadata": { 170 | "collapsed": false 171 | }, 172 | "outputs": [ 173 | { 174 | "name": "stdout", 175 | "output_type": "stream", 176 | "text": [ 177 | "14\n", 178 | "14\n" 179 | ] 180 | } 181 | ], 182 | "source": [ 183 | "#/_search\n", 184 | "#Search all types in all indices\n", 185 | "res = es.search('_all')\n", 186 | "\n", 187 | "#/gb/_search\n", 188 | "#Search all types in the gb index\n", 189 | "res = es.search(index='gb')\n", 190 | "\n", 191 | "#/gb,us/_search\n", 192 | "#Search all types in the gb and us indices\n", 193 | "res = es.search(index=['gb','us'])\n", 194 | "\n", 195 | "#/g*,u*/_search\n", 196 | "#Search all types in any indices beginning with g or beginning with u\n", 197 | "res = es.search(index=['g*','u*'])\n", 198 | "\n", 199 | "#/gb/user/_search\n", 200 | "#Search type user in the gb index\n", 201 | "res = es.search(index='gb', doc_type='user')\n", 202 | "\n", 203 | "#/gb,us/user,tweet/_search\n", 204 | "#Search types user and tweet in the gb and us indices\n", 205 | "res = es.search(index=['g*','u*'], doc_type=['user', 'tweet'])\n", 206 | "print(res['hits']['total'])\n", 207 | "\n", 208 | "#/_all/user,tweet/_search\n", 209 | "#Search types user and tweet in all indices\n", 210 | "res = es.search(doc_type=['user', 'tweet'])\n", 211 | "print(res['hits']['total'])" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "Next using the DSL, although similar for such basic searches" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 8, 224 | "metadata": { 225 | "collapsed": false 226 | }, 227 | "outputs": [ 228 | { 229 | "name": "stdout", 230 | "output_type": "stream", 231 | "text": [ 232 | "14\n", 233 | "10\n" 234 | ] 235 | } 236 | ], 237 | "source": [ 238 | "#/_search\n", 239 | "#Search all types in all indices\n", 240 | "s = Search(using=es)\n", 241 | "response = s.execute()\n", 242 | "\n", 243 | "#/gb/_search\n", 244 | "#Search all types in the gb index\n", 245 | "s = Search(using=es, index='gb')\n", 246 | "response = s.execute()\n", 247 | "\n", 248 | "#/gb,us/_search\n", 249 | "#Search all types in the gb and us indices\n", 250 | "s = Search(using=es, index=['gb','us'])\n", 251 | "response = s.execute()\n", 252 | "\n", 253 | "#/g*,u*/_search\n", 254 | "#Search all types in any indices beginning with g or beginning with u\n", 255 | "s = Search(using=es, index=['g*','u*'])\n", 256 | "response = s.execute()\n", 257 | "\n", 258 | "#/gb/user/_search\n", 259 | "#Search type user in the gb index\n", 260 | "s = Search(using=es, index=['g*','u*'], doc_type='user')\n", 261 | "response = s.execute()\n", 262 | "\n", 263 | "\n", 264 | "#/gb,us/user,tweet/_search\n", 265 | "#Search types user and tweet in the gb and us indices\n", 266 | "s = Search(using=es, index=['g*','u*'], doc_type=['user','tweet'])\n", 267 | "response = s.execute()\n", 268 | "\n", 269 | "#/_all/user,tweet/_search\n", 270 | "#Search types user and tweet in all indices\n", 271 | "s = Search(using=es, doc_type=['user','tweet'])\n", 272 | "response = s.execute()\n", 273 | "print(response['hits']['total'])\n", 274 | "print(len(res['hits']['hits']))" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "### Pagination\n", 282 | "\n", 283 | "The last search produced a hits total of 14, but 
there are only 10 documents in the array.\n", 284 | "\n", 285 | "This is due to pagination, so we need to use pointers:\n", 286 | "\n", 287 | ">GET /_search?size=5\n", 288 | "\n", 289 | ">GET /_search?size=5&from=5\n", 290 | "\n", 291 | ">GET /_search?size=5&from=10\n" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 9, 297 | "metadata": { 298 | "collapsed": false 299 | }, 300 | "outputs": [], 301 | "source": [ 302 | "# For search API:\n", 303 | "res = es.search(doc_type=['user', 'tweet'], from_=5, size=5)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 10, 309 | "metadata": { 310 | "collapsed": false 311 | }, 312 | "outputs": [ 313 | { 314 | "name": "stdout", 315 | "output_type": "stream", 316 | "text": [ 317 | "14\n", 318 | "5\n" 319 | ] 320 | } 321 | ], 322 | "source": [ 323 | "print(res['hits']['total'])\n", 324 | "print(len(res['hits']['hits']))" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "### Search Lite\n", 332 | "\n", 333 | "These initial searches all use the Lucene Query String Syntax.\n", 334 | "\n", 335 | ">GET /_all/tweet/_search?q=tweet:elasticsearch\n", 336 | "\n", 337 | "For the low-level API, we use the q parameter:" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 11, 343 | "metadata": { 344 | "collapsed": false 345 | }, 346 | "outputs": [ 347 | { 348 | "name": "stdout", 349 | "output_type": "stream", 350 | "text": [ 351 | "Total hits:7\n", 352 | "\n", 353 | "{'_id': '13',\n", 354 | " '_index': 'gb',\n", 355 | " '_score': 0.7081689,\n", 356 | " '_source': {'date': '2014-09-23',\n", 357 | " 'name': 'Mary Jones',\n", 358 | " 'tweet': 'So yes, I am an Elasticsearch fanboy',\n", 359 | " 'user_id': 2},\n", 360 | " '_type': 'tweet'}\n" 361 | ] 362 | } 363 | ], 364 | "source": [ 365 | "res = es.search(doc_type=['tweet'], q='tweet:elasticsearch')\n", 366 | "print('Total hits:{}\\n'.format(res['hits']['total']))\n", 367 | "pprint(res['hits']['hits'][0])" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": {}, 373 | "source": [ 374 | "For the DSL, the intended purpose is to avoid the query string syntax and use the query string language instead. For completeness, here is an equivalent script:" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 12, 380 | "metadata": { 381 | "collapsed": false 382 | }, 383 | "outputs": [ 384 | { 385 | "name": "stdout", 386 | "output_type": "stream", 387 | "text": [ 388 | "Total hits:7\n", 389 | "\n", 390 | "{'_id': '13', '_type': 'tweet', '_source': {'date': '2014-09...}\n" 391 | ] 392 | } 393 | ], 394 | "source": [ 395 | "s = Search(using=es, doc_type=['tweet']) \\\n", 396 | " .query('match', tweet='elasticsearch')\n", 397 | "response = s.execute()\n", 398 | "print('Total hits:{}\\n'.format(response['hits']['total']))\n", 399 | "pprint(response['hits']['hits'][0])" 400 | ] 401 | }, 402 | { 403 | "cell_type": "markdown", 404 | "metadata": {}, 405 | "source": [ 406 | "However, notice that the pprint has not given us the same JSON response as the above query string syntax result via the low-level API. This is because the Search() object returns an array of Hit objects. 
These are constructed so as to expose the individual fields as object attributes (__getattr__)" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 13, 412 | "metadata": { 413 | "collapsed": false 414 | }, 415 | "outputs": [ 416 | { 417 | "name": "stdout", 418 | "output_type": "stream", 419 | "text": [ 420 | "So yes, I am an Elasticsearch fanboy\n", 421 | "However did I manage before Elasticsearch?\n", 422 | "The Elasticsearch API is really easy to use\n", 423 | "Elasticsearch surely is one of the hottest new NoSQL products\n", 424 | "Elasticsearch means full text search has never been so easy\n", 425 | "Elasticsearch is built for the cloud, easy to scale\n", 426 | "Elasticsearch and I have left the honeymoon stage, and I still love her.\n" 427 | ] 428 | } 429 | ], 430 | "source": [ 431 | "for hit in response:\n", 432 | " print(hit.tweet)" 433 | ] 434 | }, 435 | { 436 | "cell_type": "markdown", 437 | "metadata": {}, 438 | "source": [ 439 | "### The _all field\n", 440 | "\n", 441 | "> GET /_search?q=mary" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 14, 447 | "metadata": { 448 | "collapsed": false 449 | }, 450 | "outputs": [ 451 | { 452 | "name": "stdout", 453 | "output_type": "stream", 454 | "text": [ 455 | "Total hits:8\n", 456 | "\n", 457 | "{'_id': '4',\n", 458 | " '_index': 'us',\n", 459 | " '_score': 0.6650044,\n", 460 | " '_source': {'date': '2014-09-14',\n", 461 | " 'name': 'John Smith',\n", 462 | " 'tweet': '@mary it is not just text, it does everything',\n", 463 | " 'user_id': 1},\n", 464 | " '_type': 'tweet'}\n" 465 | ] 466 | } 467 | ], 468 | "source": [ 469 | "res = es.search(q='mary')\n", 470 | "print('Total hits:{}\\n'.format(res['hits']['total']))\n", 471 | "pprint(res['hits']['hits'][0])" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": {}, 477 | "source": [ 478 | "For the DSL, we need to call the _all field explicitly" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 15, 484 | "metadata": { 485 | "collapsed": false 486 | }, 487 | "outputs": [ 488 | { 489 | "name": "stdout", 490 | "output_type": "stream", 491 | "text": [ 492 | "Total hits:8\n", 493 | "\n", 494 | "@mary it is not just text, it does everything\n" 495 | ] 496 | } 497 | ], 498 | "source": [ 499 | "s = Search(using=es) \\\n", 500 | " .query('match', _all='mary')\n", 501 | "response = s.execute()\n", 502 | "print('Total hits:{}\\n'.format(response['hits']['total']))\n", 503 | "print(response[0].tweet)" 504 | ] 505 | }, 506 | { 507 | "cell_type": "markdown", 508 | "metadata": {}, 509 | "source": [ 510 | "> +name:(mary john) +date:>2014-09-10 +(aggregations geo)" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": 16, 516 | "metadata": { 517 | "collapsed": false 518 | }, 519 | "outputs": [ 520 | { 521 | "name": "stdout", 522 | "output_type": "stream", 523 | "text": [ 524 | "Total hits:1\n", 525 | "\n", 526 | "{'_id': '9',\n", 527 | " '_index': 'gb',\n", 528 | " '_score': 2.3835227,\n", 529 | " '_source': {'date': '2014-09-19',\n", 530 | " 'name': 'Mary Jones',\n", 531 | " 'tweet': 'Geo-location aggregations are really cool',\n", 532 | " 'user_id': 2},\n", 533 | " '_type': 'tweet'}\n" 534 | ] 535 | } 536 | ], 537 | "source": [ 538 | "res = es.search(q='+name:(mary john) +date:>2014-09-10 +(aggregations geo)')\n", 539 | "print('Total hits:{}\\n'.format(res['hits']['total']))\n", 540 | "pprint(res['hits']['hits'][0])" 541 | ] 542 | } 543 | ], 544 | "metadata": { 545 | "kernelspec": { 
546 | "display_name": "Python 3", 547 | "language": "python", 548 | "name": "python3" 549 | }, 550 | "language_info": { 551 | "codemirror_mode": { 552 | "name": "ipython", 553 | "version": 3 554 | }, 555 | "file_extension": ".py", 556 | "mimetype": "text/x-python", 557 | "name": "python", 558 | "nbconvert_exporter": "python", 559 | "pygments_lexer": "ipython3", 560 | "version": "3.5.1" 561 | } 562 | }, 563 | "nbformat": 4, 564 | "nbformat_minor": 0 565 | } 566 | -------------------------------------------------------------------------------- /Getting Started/Searching - The Basic Tools.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Elasticsearch: The Definitive Guide - Python\n", 8 | "\n", 9 | "Following the examples in the book, here are Python snippets that achieve the same effect.\n", 10 | "\n", 11 | "Documentation for the Python libs:\n", 12 | "\n", 13 | "Low-level API:\n", 14 | "\n", 15 | "https://elasticsearch-py.readthedocs.io/en/master/index.html\n", 16 | "\n", 17 | "Expressive DSL API (more \"Pythonic\")\n", 18 | "\n", 19 | "http://elasticsearch-dsl.readthedocs.io/en/latest/index.html\n", 20 | "\n", 21 | "Github repo for DSL API:\n", 22 | "\n", 23 | "https://github.com/elastic/elasticsearch-dsl-py\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 36, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "from elasticsearch import Elasticsearch\n", 35 | "from elasticsearch_dsl import Search, Q\n", 36 | "from pprint import pprint\n", 37 | "\n", 38 | "es = Elasticsearch(\n", 39 | " 'localhost',\n", 40 | " # sniff before doing anything\n", 41 | " sniff_on_start=True,\n", 42 | " # refresh nodes after a node fails to respond\n", 43 | " sniff_on_connection_fail=True,\n", 44 | " # and also every 60 seconds\n", 45 | " sniffer_timeout=60\n", 46 | ")" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "### Empty Search\n", 54 | "From: https://www.elastic.co/guide/en/elasticsearch/guide/master/empty-search.html\n", 55 | "\n", 56 | ">GET _search" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 37, 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "res = es.search('_all') # same as es.search()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 38, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "#from pprint import pprint\n", 79 | "#pprint(res)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 39, 85 | "metadata": { 86 | "collapsed": false 87 | }, 88 | "outputs": [ 89 | { 90 | "data": { 91 | "text/plain": [ 92 | ", , , , , , , , , ]>" 93 | ] 94 | }, 95 | "execution_count": 39, 96 | "metadata": {}, 97 | "output_type": "execute_result" 98 | } 99 | ], 100 | "source": [ 101 | "s = Search(using=es)\n", 102 | "response = s.execute()\n", 103 | "response" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "With timeout:\n", 111 | "\n", 112 | ">GET /_search?timeout=10ms" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 40, 118 | "metadata": { 119 | "collapsed": false 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "res = es.search('_all', timeout='10ms') # same as es.search(timeout='10ms')" 124 | ] 125 | }, 126 | { 127 | 
"cell_type": "code", 128 | "execution_count": 41, 129 | "metadata": { 130 | "collapsed": false 131 | }, 132 | "outputs": [ 133 | { 134 | "name": "stdout", 135 | "output_type": "stream", 136 | "text": [ 137 | "\n", 138 | "\n", 139 | "\n", 140 | "\n", 141 | "\n", 142 | "\n", 143 | "\n", 144 | "\n", 145 | "\n", 146 | "\n" 147 | ] 148 | } 149 | ], 150 | "source": [ 151 | "# To see the results, we can iterate:\n", 152 | "# Elasticsearch pages the results (to 10 hits)\n", 153 | "for hit in s:\n", 154 | " print(hit)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "### Multi-index, Multitype\n", 162 | "\n", 163 | "First using the low-level API" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 42, 169 | "metadata": { 170 | "collapsed": false 171 | }, 172 | "outputs": [ 173 | { 174 | "name": "stdout", 175 | "output_type": "stream", 176 | "text": [ 177 | "14\n", 178 | "14\n" 179 | ] 180 | } 181 | ], 182 | "source": [ 183 | "#/_search\n", 184 | "#Search all types in all indices\n", 185 | "res = es.search('_all')\n", 186 | "\n", 187 | "#/gb/_search\n", 188 | "#Search all types in the gb index\n", 189 | "res = es.search(index='gb')\n", 190 | "\n", 191 | "#/gb,us/_search\n", 192 | "#Search all types in the gb and us indices\n", 193 | "res = es.search(index=['gb','us'])\n", 194 | "\n", 195 | "#/g*,u*/_search\n", 196 | "#Search all types in any indices beginning with g or beginning with u\n", 197 | "res = es.search(index=['g*','u*'])\n", 198 | "\n", 199 | "#/gb/user/_search\n", 200 | "#Search type user in the gb index\n", 201 | "res = es.search(index='gb', doc_type='user')\n", 202 | "\n", 203 | "#/gb,us/user,tweet/_search\n", 204 | "#Search types user and tweet in the gb and us indices\n", 205 | "res = es.search(index=['g*','u*'], doc_type=['user', 'tweet'])\n", 206 | "print(res['hits']['total'])\n", 207 | "\n", 208 | "#/_all/user,tweet/_search\n", 209 | "#Search types user and tweet in all indices\n", 210 | "res = es.search(doc_type=['user', 'tweet'])\n", 211 | "print(res['hits']['total'])" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "Next using the DSL, although similar for such basic searches" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 43, 224 | "metadata": { 225 | "collapsed": false 226 | }, 227 | "outputs": [ 228 | { 229 | "name": "stdout", 230 | "output_type": "stream", 231 | "text": [ 232 | "14\n", 233 | "10\n" 234 | ] 235 | } 236 | ], 237 | "source": [ 238 | "#/_search\n", 239 | "#Search all types in all indices\n", 240 | "s = Search(using=es)\n", 241 | "response = s.execute()\n", 242 | "\n", 243 | "#/gb/_search\n", 244 | "#Search all types in the gb index\n", 245 | "s = Search(using=es, index='gb')\n", 246 | "response = s.execute()\n", 247 | "\n", 248 | "#/gb,us/_search\n", 249 | "#Search all types in the gb and us indices\n", 250 | "s = Search(using=es, index=['gb','us'])\n", 251 | "response = s.execute()\n", 252 | "\n", 253 | "#/g*,u*/_search\n", 254 | "#Search all types in any indices beginning with g or beginning with u\n", 255 | "s = Search(using=es, index=['g*','u*'])\n", 256 | "response = s.execute()\n", 257 | "\n", 258 | "#/gb/user/_search\n", 259 | "#Search type user in the gb index\n", 260 | "s = Search(using=es, index=['g*','u*'], doc_type='user')\n", 261 | "response = s.execute()\n", 262 | "\n", 263 | "\n", 264 | "#/gb,us/user,tweet/_search\n", 265 | "#Search types user and tweet in the gb and us indices\n", 266 | 
"s = Search(using=es, index=['g*','u*'], doc_type=['user','tweet'])\n", 267 | "response = s.execute()\n", 268 | "\n", 269 | "#/_all/user,tweet/_search\n", 270 | "#Search types user and tweet in all indices\n", 271 | "s = Search(using=es, doc_type=['user','tweet'])\n", 272 | "response = s.execute()\n", 273 | "print(response.hits.total)\n", 274 | "print(len(res['hits']['hits']))" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "### Pagination\n", 282 | "\n", 283 | "The last search produced a hits total of 14, but there are only 10 documents in the array.\n", 284 | "\n", 285 | "This is due to pagination, so we need to use pointers:\n", 286 | "\n", 287 | ">GET /_search?size=5\n", 288 | "\n", 289 | ">GET /_search?size=5&from=5\n", 290 | "\n", 291 | ">GET /_search?size=5&from=10\n" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 44, 297 | "metadata": { 298 | "collapsed": false 299 | }, 300 | "outputs": [], 301 | "source": [ 302 | "# For search API:\n", 303 | "res = es.search(doc_type=['user', 'tweet'], from_=5, size=5)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 45, 309 | "metadata": { 310 | "collapsed": false 311 | }, 312 | "outputs": [ 313 | { 314 | "name": "stdout", 315 | "output_type": "stream", 316 | "text": [ 317 | "14\n", 318 | "5\n" 319 | ] 320 | } 321 | ], 322 | "source": [ 323 | "print(res['hits']['total'])\n", 324 | "print(len(res['hits']['hits']))" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "### Search Lite\n", 332 | "\n", 333 | "These initial searches all use the Lucene Query String Syntax.\n", 334 | "\n", 335 | ">GET /_all/tweet/_search?q=tweet:elasticsearch\n", 336 | "\n", 337 | "For the low-level API, we use the q parameter:" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 46, 343 | "metadata": { 344 | "collapsed": false 345 | }, 346 | "outputs": [ 347 | { 348 | "name": "stdout", 349 | "output_type": "stream", 350 | "text": [ 351 | "Total hits:7\n", 352 | "\n", 353 | "{'_id': '6',\n", 354 | " '_index': 'us',\n", 355 | " '_score': 0.6395861,\n", 356 | " '_source': {'date': '2014-09-16',\n", 357 | " 'name': 'John Smith',\n", 358 | " 'tweet': 'The Elasticsearch API is really easy to use',\n", 359 | " 'user_id': 1},\n", 360 | " '_type': 'tweet'}\n" 361 | ] 362 | } 363 | ], 364 | "source": [ 365 | "res = es.search(doc_type=['tweet'], q='tweet:elasticsearch')\n", 366 | "print('Total hits:{}\\n'.format(res['hits']['total']))\n", 367 | "pprint(res['hits']['hits'][0])" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": {}, 373 | "source": [ 374 | "For the DSL, the intended purpose is to avoid the query string syntax and use the query string language instead. 
For completeness, here is an equivalent script:" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 47, 380 | "metadata": { 381 | "collapsed": false 382 | }, 383 | "outputs": [ 384 | { 385 | "name": "stdout", 386 | "output_type": "stream", 387 | "text": [ 388 | "Total hits:7\n", 389 | "\n", 390 | "{'score': 0.6395861, 'index': 'us', 'doc_type': 'tweet', 'id...}\n" 391 | ] 392 | } 393 | ], 394 | "source": [ 395 | "s = Search(using=es, doc_type=['tweet']) \\\n", 396 | " .query('match', tweet='elasticsearch')\n", 397 | "response = s.execute()\n", 398 | "print('Total hits:{}\\n'.format(response.hits.total))\n", 399 | "pprint(response[0].meta)" 400 | ] 401 | }, 402 | { 403 | "cell_type": "markdown", 404 | "metadata": {}, 405 | "source": [ 406 | "However, notice that the pprint has not given us the same JSON response as the above query string syntax result via the low-level API. This is because the Search() object returns an array of Hit objects. These are constructed so as to expose the individual fields as object attributes (__getattr__)" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 48, 412 | "metadata": { 413 | "collapsed": false 414 | }, 415 | "outputs": [ 416 | { 417 | "name": "stdout", 418 | "output_type": "stream", 419 | "text": [ 420 | "The Elasticsearch API is really easy to use\n", 421 | "However did I manage before Elasticsearch?\n", 422 | "So yes, I am an Elasticsearch fanboy\n", 423 | "Elasticsearch is built for the cloud, easy to scale\n", 424 | "Elasticsearch surely is one of the hottest new NoSQL products\n", 425 | "Elasticsearch means full text search has never been so easy\n", 426 | "Elasticsearch and I have left the honeymoon stage, and I still love her.\n" 427 | ] 428 | } 429 | ], 430 | "source": [ 431 | "for hit in response:\n", 432 | " print(hit.tweet)" 433 | ] 434 | }, 435 | { 436 | "cell_type": "markdown", 437 | "metadata": {}, 438 | "source": [ 439 | "### The _all field\n", 440 | "\n", 441 | "> GET /_search?q=mary" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 49, 447 | "metadata": { 448 | "collapsed": false 449 | }, 450 | "outputs": [ 451 | { 452 | "name": "stdout", 453 | "output_type": "stream", 454 | "text": [ 455 | "Total hits:8\n", 456 | "\n", 457 | "{'_id': '4',\n", 458 | " '_index': 'us',\n", 459 | " '_score': 0.6650044,\n", 460 | " '_source': {'date': '2014-09-14',\n", 461 | " 'name': 'John Smith',\n", 462 | " 'tweet': '@mary it is not just text, it does everything',\n", 463 | " 'user_id': 1},\n", 464 | " '_type': 'tweet'}\n" 465 | ] 466 | } 467 | ], 468 | "source": [ 469 | "res = es.search(q='mary')\n", 470 | "print('Total hits:{}\\n'.format(res['hits']['total']))\n", 471 | "pprint(res['hits']['hits'][0])" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": {}, 477 | "source": [ 478 | "For the DSL, we need to call the _all field explicitly" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 50, 484 | "metadata": { 485 | "collapsed": false 486 | }, 487 | "outputs": [ 488 | { 489 | "name": "stdout", 490 | "output_type": "stream", 491 | "text": [ 492 | "Total hits:8\n", 493 | "\n", 494 | "@mary it is not just text, it does everything\n" 495 | ] 496 | } 497 | ], 498 | "source": [ 499 | "s = Search(using=es) \\\n", 500 | " .query('match', _all='mary')\n", 501 | "response = s.execute()\n", 502 | "print('Total hits:{}\\n'.format(response.hits.total))\n", 503 | "print(response[0].tweet)" 504 | ] 505 | }, 506 | { 507 | "cell_type": 
"markdown", 508 | "metadata": {}, 509 | "source": [ 510 | "> +name:(mary john) +date:>2014-09-10 +(aggregations geo)" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": 51, 516 | "metadata": { 517 | "collapsed": false 518 | }, 519 | "outputs": [ 520 | { 521 | "name": "stdout", 522 | "output_type": "stream", 523 | "text": [ 524 | "Total hits:1\n", 525 | "\n", 526 | "{'_id': '9',\n", 527 | " '_index': 'gb',\n", 528 | " '_score': 2.3835227,\n", 529 | " '_source': {'date': '2014-09-19',\n", 530 | " 'name': 'Mary Jones',\n", 531 | " 'tweet': 'Geo-location aggregations are really cool',\n", 532 | " 'user_id': 2},\n", 533 | " '_type': 'tweet'}\n" 534 | ] 535 | } 536 | ], 537 | "source": [ 538 | "res = es.search(q='+name:(mary john) +date:>2014-09-10 +(aggregations geo)')\n", 539 | "print('Total hits:{}\\n'.format(res['hits']['total']))\n", 540 | "pprint(res['hits']['hits'][0])" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": null, 546 | "metadata": { 547 | "collapsed": true 548 | }, 549 | "outputs": [], 550 | "source": [] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": null, 555 | "metadata": { 556 | "collapsed": true 557 | }, 558 | "outputs": [], 559 | "source": [] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": null, 564 | "metadata": { 565 | "collapsed": true 566 | }, 567 | "outputs": [], 568 | "source": [] 569 | } 570 | ], 571 | "metadata": { 572 | "kernelspec": { 573 | "display_name": "Python 3", 574 | "language": "python", 575 | "name": "python3" 576 | }, 577 | "language_info": { 578 | "codemirror_mode": { 579 | "name": "ipython", 580 | "version": 3 581 | }, 582 | "file_extension": ".py", 583 | "mimetype": "text/x-python", 584 | "name": "python", 585 | "nbconvert_exporter": "python", 586 | "pygments_lexer": "ipython3", 587 | "version": "3.5.1" 588 | } 589 | }, 590 | "nbformat": 4, 591 | "nbformat_minor": 0 592 | } 593 | -------------------------------------------------------------------------------- /Dealing with Human Language/Reducing Words to Their Root Form (Pt.1).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Elasticsearch: The Definitive Guide - Python\n", 8 | "\n", 9 | "Following the examples in the book, here are Python snippets that achieve the same effect.\n", 10 | "\n", 11 | "Documentation for the Python libs:\n", 12 | "\n", 13 | "Low-level API:\n", 14 | "\n", 15 | "https://elasticsearch-py.readthedocs.io/en/master/index.html\n", 16 | "\n", 17 | "Expressive DSL API (more \"Pythonic\")\n", 18 | "\n", 19 | "http://elasticsearch-dsl.readthedocs.io/en/latest/index.html\n", 20 | "\n", 21 | "Github repo for DSL API:\n", 22 | "\n", 23 | "https://github.com/elastic/elasticsearch-dsl-py\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "import sys, os\n", 35 | "sys.path.insert(1, os.path.join(sys.path[0], '..'))" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "14 items created\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "import index\n", 55 | "from elasticsearch import Elasticsearch\n", 56 | "from elasticsearch_dsl import Search, Q\n", 57 | "from pprint import 
pprint\n", 58 | "\n", 59 | "es = Elasticsearch(\n", 60 | " 'localhost',\n", 61 | " # sniff before doing anything\n", 62 | " sniff_on_start=True,\n", 63 | " # refresh nodes after a node fails to respond\n", 64 | " sniff_on_connection_fail=True,\n", 65 | " # and also every 60 seconds\n", 66 | " sniffer_timeout=60\n", 67 | ")\n", 68 | "\n", 69 | "r = index.populate()\n", 70 | "print('{} items created'.format(len(r['items'])))\n", 71 | "\n", 72 | "# Let's repopulate the index as we deleted 'gb' in earlier chapters:\n", 73 | "# Run the script: populate.ipynb" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": { 79 | "collapsed": true 80 | }, 81 | "source": [ 82 | "### Reducing Words to Their Root Form\n", 83 | "\n", 84 | "Most languages of the world are inflected, meaning that words can change their form to express differences in the following:\n", 85 | "\n", 86 | "* Number: fox, foxes\n", 87 | "* Tense: pay, paid, paying\n", 88 | "* Gender: waiter, waitress\n", 89 | "* Person: hear, hears\n", 90 | "* Case: I, me, my\n", 91 | "* Aspect: ate, eaten\n", 92 | "* Mood: so be it, were it so\n", 93 | "\n", 94 | "While inflection aids expressivity, it interferes with retrievability, as a single root word sense (or meaning) may be represented by many different sequences of letters\n", 95 | "\n", 96 | "Stemming attempts to remove the differences between inflected forms of a word, in order to reduce each word to its root form. For instance foxes may be reduced to the root fox.\n", 97 | "\n", 98 | "If stemming were easy, there would be only one implementation. Unfortunately, stemming is an inexact science that suffers from two issues: understemming and overstemming.\n", 99 | "\n", 100 | "**The root form of a word may not even be a real word**. The words ```jumping``` and ```jumpiness``` may both be stemmed to ```jumpi```. It doesn’t matter—as long as the same terms are produced at index time and at search time, search will just work.\n", 101 | "\n", 102 | "Understemming is the failure to reduce words with the same meaning to the same root. For example, ```jumped``` and ```jumps``` may be reduced to ```jump```, while ```jumping``` may be reduced to ```jumpi```. **Understemming reduces retrieval**; relevant documents are not returned.\n", 103 | "\n", 104 | "Overstemming is the failure to keep two words with distinct meanings separate. For instance, ```general``` and ```generate``` may both be stemmed to ```gener```. **Overstemming reduces precision**: irrelevant documents are returned when they shouldn’t be.\n", 105 | "\n", 106 | "#### Lemmatization ####\n", 107 | "\n", 108 | "A lemma is the canonical, or dictionary, form of a set of related words—the lemma of paying, paid, and pays is pay. Sometimes the morphology differs: is, was, am, and being is be.\n", 109 | "\n", 110 | "Lemmatization, like stemming, tries to group related words, but it goes one step further than stemming in that it tries to group words by their word sense, or meaning. The same word may represent two meanings—for example,wake can mean to wake up or a funeral. While lemmatization would try to distinguish these two word senses, stemming would incorrectly conflate them.\n", 111 | "\n", 112 | "Lemmatization is a much more complicated and expensive process that needs to understand the context in which words appear in order to make decisions about what they mean. 
In practice, stemming appears to be just as effective as lemmatization, but with a much lower cost.\n", 113 | "\n", 114 | "### Algorithmic Stemmers\n", 115 | "\n", 116 | "While you can use the porter_stem or kstem token filter directly, or create a language-specific Snowball stemmer with the snowball token filter, all of the algorithmic stemmers are exposed via a single unified interface: the stemmer token filter, which accepts the language parameter.\n", 117 | "\n", 118 | "For instance, perhaps you find the default stemmer used by the english analyzer to be too aggressive and you want to make it less aggressive. The first step is to look up the configuration for the english analyzer in the language analyzers documentation, which shows the following:\n", 119 | "\n", 120 | "```\n", 121 | "{\n", 122 | " \"settings\": {\n", 123 | " \"analysis\": {\n", 124 | " \"filter\": {\n", 125 | " \"english_stop\": {\n", 126 | " \"type\": \"stop\",\n", 127 | " \"stopwords\": \"_english_\"\n", 128 | " },\n", 129 | " \"english_keywords\": {\n", 130 | " \"type\": \"keyword_marker\", \n", 131 | " \"keywords\": []\n", 132 | " },\n", 133 | " \"english_stemmer\": {\n", 134 | " \"type\": \"stemmer\",\n", 135 | " \"language\": \"english\" \n", 136 | " },\n", 137 | " \"english_possessive_stemmer\": {\n", 138 | " \"type\": \"stemmer\",\n", 139 | " \"language\": \"possessive_english\" \n", 140 | " }\n", 141 | " },\n", 142 | " \"analyzer\": {\n", 143 | " \"english\": {\n", 144 | " \"tokenizer\": \"standard\",\n", 145 | " \"filter\": [\n", 146 | " \"english_possessive_stemmer\",\n", 147 | " \"lowercase\",\n", 148 | " \"english_stop\",\n", 149 | " \"english_keywords\",\n", 150 | " \"english_stemmer\"\n", 151 | " ]\n", 152 | " }\n", 153 | " }\n", 154 | " }\n", 155 | " }\n", 156 | "}\n", 157 | "```\n", 158 | "\n", 159 | "The \"lighter\" modified English token filter:" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 3, 165 | "metadata": { 166 | "collapsed": false 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "english_token_filter = {\n", 171 | " \"settings\": {\n", 172 | " \"analysis\": {\n", 173 | " \"filter\": {\n", 174 | " \"english_stop\": {\n", 175 | " \"type\": \"stop\",\n", 176 | " \"stopwords\": \"_english_\"\n", 177 | " },\n", 178 | " \"light_english_stemmer\": {\n", 179 | " \"type\": \"stemmer\",\n", 180 | " \"language\": \"light_english\" \n", 181 | " },\n", 182 | " \"english_possessive_stemmer\": {\n", 183 | " \"type\": \"stemmer\",\n", 184 | " \"language\": \"possessive_english\"\n", 185 | " }\n", 186 | " },\n", 187 | " \"analyzer\": {\n", 188 | " \"my_english\": {\n", 189 | " \"tokenizer\": \"standard\",\n", 190 | " \"filter\": [\n", 191 | " \"english_possessive_stemmer\",\n", 192 | " \"lowercase\",\n", 193 | " \"english_stop\",\n", 194 | " \"light_english_stemmer\", \n", 195 | " \"asciifolding\" \n", 196 | " ]\n", 197 | " }\n", 198 | " }\n", 199 | " }\n", 200 | " }\n", 201 | "}" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 4, 207 | "metadata": { 208 | "collapsed": false 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "index.create_my_index(body=english_token_filter)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "Now let's put some data into my_index:" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 5, 225 | "metadata": { 226 | "collapsed": false 227 | }, 228 | "outputs": [ 229 | { 230 | "data": { 231 | "text/plain": [ 232 | "{'_id': '1',\n", 233 | " 
'_index': 'my_index',\n", 234 | " '_shards': {'failed': 0, 'successful': 1, 'total': 2},\n", 235 | " '_type': 'test',\n", 236 | " '_version': 1,\n", 237 | " 'created': True,\n", 238 | " 'result': 'created'}" 239 | ] 240 | }, 241 | "execution_count": 5, 242 | "metadata": {}, 243 | "output_type": "execute_result" 244 | } 245 | ], 246 | "source": [ 247 | "text = \"You're right about jumping jack's Über generation of waiters.\"\n", 248 | "doc = {\n", 249 | " \"message\": text\n", 250 | "}\n", 251 | "es.create(index=\"my_index\", doc_type='test', body=doc, id=1)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 6, 257 | "metadata": { 258 | "collapsed": false 259 | }, 260 | "outputs": [ 261 | { 262 | "name": "stdout", 263 | "output_type": "stream", 264 | "text": [ 265 | "you'r,right,about,jump,jack,über,gener,waiter\n" 266 | ] 267 | } 268 | ], 269 | "source": [ 270 | "# test with the standard English analyzer\n", 271 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 272 | " (index='my_index', analyzer='english', text=text)['tokens']]\n", 273 | "print(','.join(analyzed_text))\n" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": {}, 279 | "source": [ 280 | "Besides the singlular from plural ('waiter(s)'), notice the following mappings:\n", 281 | "\n", 282 | "* jumping => jump\n", 283 | "* generation => gener" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 7, 289 | "metadata": { 290 | "collapsed": false 291 | }, 292 | "outputs": [ 293 | { 294 | "name": "stdout", 295 | "output_type": "stream", 296 | "text": [ 297 | "you're,right,about,jump,jack,uber,generation,waiter\n" 298 | ] 299 | } 300 | ], 301 | "source": [ 302 | "# test with the modified English analyzer - 'my_english'\n", 303 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 304 | " (index='my_index', analyzer='my_english', text=text)['tokens']]\n", 305 | "print(','.join(analyzed_text))" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "Besides the singlular from plural ('waiter(s)'), notice the different mappings:\n", 313 | "\n", 314 | "* generation => generation (same e.g. non-stemmed)\n", 315 | "* Über => uber (i.e. asciifolded)\n", 316 | "\n", 317 | "But what if we search for one of these transformed words in the docs:" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 8, 323 | "metadata": { 324 | "collapsed": false 325 | }, 326 | "outputs": [ 327 | { 328 | "data": { 329 | "text/plain": [ 330 | "" 331 | ] 332 | }, 333 | "execution_count": 8, 334 | "metadata": {}, 335 | "output_type": "execute_result" 336 | } 337 | ], 338 | "source": [ 339 | "s = Search(using=es, index=\"my_index\").query('match', message=\"jump uber\")\n", 340 | "s.execute()" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "Hmmm. No results for ```jump```. 
How come?\n", 348 | "Let's check the mapping for the field ```message```:" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": 9, 354 | "metadata": { 355 | "collapsed": false 356 | }, 357 | "outputs": [ 358 | { 359 | "data": { 360 | "text/plain": [ 361 | "{'my_index': {'mappings': {'test': {'properties': {'message': {'fields': {'keyword': {'ignore_above': 256,\n", 362 | " 'type': 'keyword'}},\n", 363 | " 'type': 'text'}}}}}}" 364 | ] 365 | }, 366 | "execution_count": 9, 367 | "metadata": {}, 368 | "output_type": "execute_result" 369 | } 370 | ], 371 | "source": [ 372 | "res = es.indices.get_mapping(index='my_index', doc_type='test')\n", 373 | "res\n", 374 | "#es.indices.get_field_mapping(index='my_index', fields='messages')" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "Our analyzer has not been mapped. Let's do it now:" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 10, 387 | "metadata": { 388 | "collapsed": false 389 | }, 390 | "outputs": [], 391 | "source": [ 392 | "english_token_filter = {\n", 393 | " \"settings\": {\n", 394 | " \"analysis\": {\n", 395 | " \"filter\": {\n", 396 | " \"english_stop\": {\n", 397 | " \"type\": \"stop\",\n", 398 | " \"stopwords\": \"_english_\"\n", 399 | " },\n", 400 | " \"light_english_stemmer\": {\n", 401 | " \"type\": \"stemmer\",\n", 402 | " \"language\": \"light_english\" \n", 403 | " },\n", 404 | " \"english_possessive_stemmer\": {\n", 405 | " \"type\": \"stemmer\",\n", 406 | " \"language\": \"possessive_english\"\n", 407 | " }\n", 408 | " },\n", 409 | " \"analyzer\": {\n", 410 | " \"my_english\": {\n", 411 | " \"tokenizer\": \"standard\",\n", 412 | " \"filter\": [\n", 413 | " \"english_possessive_stemmer\",\n", 414 | " \"lowercase\",\n", 415 | " \"english_stop\",\n", 416 | " \"light_english_stemmer\", \n", 417 | " \"asciifolding\" \n", 418 | " ]\n", 419 | " }\n", 420 | " }\n", 421 | " }\n", 422 | " },\n", 423 | " \"mappings\": {\n", 424 | " \"test\" : {\n", 425 | " \"properties\" : {\n", 426 | " \"message\" : {\n", 427 | " \"type\" : \"text\",\n", 428 | " \"analyzer\": \"my_english\"\n", 429 | " }\n", 430 | " }\n", 431 | " }\n", 432 | " }\n", 433 | "}\n", 434 | "index.create_my_index(body=english_token_filter)" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": 11, 440 | "metadata": { 441 | "collapsed": false 442 | }, 443 | "outputs": [ 444 | { 445 | "data": { 446 | "text/plain": [ 447 | "{'_id': '1',\n", 448 | " '_index': 'my_index',\n", 449 | " '_shards': {'failed': 0, 'successful': 1, 'total': 2},\n", 450 | " '_type': 'test',\n", 451 | " '_version': 1,\n", 452 | " 'created': True,\n", 453 | " 'result': 'created'}" 454 | ] 455 | }, 456 | "execution_count": 11, 457 | "metadata": {}, 458 | "output_type": "execute_result" 459 | } 460 | ], 461 | "source": [ 462 | "text = \"You're right about those jumping jacks in the Über generation of waiters.\"\n", 463 | "doc = {\n", 464 | " \"message\": text\n", 465 | "}\n", 466 | "es.create(index=\"my_index\", doc_type='test', body=doc, id=1)" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 12, 472 | "metadata": { 473 | "collapsed": false 474 | }, 475 | "outputs": [ 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "1\n", 481 | "You're right about those jumping jacks in the Über generation of waiters.\n" 482 | ] 483 | } 484 | ], 485 | "source": [ 486 | "s = Search(using=es, index=\"my_index\", 
doc_type='test').query('match', message=\"jump\")\n", 487 | "res = s.execute()\n", 488 | "print(res.hits.total)\n", 489 | "print(res[0].message)" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": 13, 495 | "metadata": { 496 | "collapsed": false 497 | }, 498 | "outputs": [ 499 | { 500 | "name": "stdout", 501 | "output_type": "stream", 502 | "text": [ 503 | "1\n", 504 | "You're right about those jumping jacks in the Über generation of waiters.\n" 505 | ] 506 | } 507 | ], 508 | "source": [ 509 | "s = Search(using=es, index=\"my_index\", doc_type='test').query('match', message=\"uber\")\n", 510 | "res = s.execute()\n", 511 | "print(res.hits.total)\n", 512 | "print(res[0].message)" 513 | ] 514 | } 515 | ], 516 | "metadata": { 517 | "kernelspec": { 518 | "display_name": "Python 3", 519 | "language": "python", 520 | "name": "python3" 521 | }, 522 | "language_info": { 523 | "codemirror_mode": { 524 | "name": "ipython", 525 | "version": 3 526 | }, 527 | "file_extension": ".py", 528 | "mimetype": "text/x-python", 529 | "name": "python", 530 | "nbconvert_exporter": "python", 531 | "pygments_lexer": "ipython3", 532 | "version": "3.5.1" 533 | } 534 | }, 535 | "nbformat": 4, 536 | "nbformat_minor": 0 537 | } 538 | -------------------------------------------------------------------------------- /Dealing with Human Language/Reducing Words (Pt.2).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Elasticsearch: The Definitive Guide - Python\n", 8 | "\n", 9 | "Following the examples in the book, here are Python snippets that achieve the same effect.\n", 10 | "\n", 11 | "Documentation for the Python libs:\n", 12 | "\n", 13 | "Low-level API:\n", 14 | "\n", 15 | "https://elasticsearch-py.readthedocs.io/en/master/index.html\n", 16 | "\n", 17 | "Expressive DSL API (more \"Pythonic\")\n", 18 | "\n", 19 | "http://elasticsearch-dsl.readthedocs.io/en/latest/index.html\n", 20 | "\n", 21 | "Github repo for DSL API:\n", 22 | "\n", 23 | "https://github.com/elastic/elasticsearch-dsl-py\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "import sys, os\n", 35 | "sys.path.insert(1, os.path.join(sys.path[0], '..'))" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "14 items created\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "import index\n", 55 | "from elasticsearch import Elasticsearch\n", 56 | "from elasticsearch_dsl import Search, Q\n", 57 | "from pprint import pprint\n", 58 | "\n", 59 | "es = Elasticsearch(\n", 60 | " 'localhost',\n", 61 | " # sniff before doing anything\n", 62 | " sniff_on_start=True,\n", 63 | " # refresh nodes after a node fails to respond\n", 64 | " sniff_on_connection_fail=True,\n", 65 | " # and also every 60 seconds\n", 66 | " sniffer_timeout=60\n", 67 | ")\n", 68 | "\n", 69 | "r = index.populate()\n", 70 | "print('{} items created'.format(len(r['items'])))\n", 71 | "\n", 72 | "# Let's repopulate the index as we deleted 'gb' in earlier chapters:\n", 73 | "# Run the script: populate.ipynb" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": { 79 | "collapsed": true 80 | }, 81 | "source": [ 82 | "### Reducing Words to Their Root 
Form\n", 83 | "\n", 84 | "#### Dictionary Stemmers\n", 85 | "\n", 86 | "Dictionary stemmers work quite differently from algorithmic stemmers. Instead of applying a standard set of rules to each word, they simply look up the word in the dictionary. Theoretically, they could produce much better results than an algorithmic stemmer. A dictionary stemmer should be able to do the following:\n", 87 | "\n", 88 | "* Return the correct root word for irregular forms such as feet and mice\n", 89 | "* Recognize the distinction between words that are similar but have different word senses—for example, organ and organization\n", 90 | "\n", 91 | "**Dictionary Stemmer** - only as good as its dictionary. Most e-dictionaries only ~10% of full dictionaries. Have to be updated etc.\n", 92 | "\n", 93 | "**Size and performance** - A dictionary stemmer needs to load all words, all prefixes, and all suffixes into memory. This can use a significant amount of RAM. Finding the right stem for a word is often considerably more complex than the equivalent process with an algorithmic stemmer.\n", 94 | "\n", 95 | "Let's explore the Hunspell dictionary \"stemmer\":\n", 96 | "\n", 97 | "```\n", 98 | "config/\n", 99 | " └ hunspell/ \n", 100 | " └ en_GB/ \n", 101 | " ├ en_GB.dic\n", 102 | " ├ en_GB.aff\n", 103 | " └ settings.yml \n", 104 | "```\n", 105 | "\n", 106 | "Note that we don't need to touch settings.yml (which override any settings in the master settings file: ```elasticsearch.yml```. Settings can be used to ignore case, which is otherwise set to false. \n", 107 | "\n", 108 | "* ```indices.analysis.hunspell.dictionary.ignore_case```\n", 109 | "\n", 110 | "(NOTE: due to my British roots, I changed the example to use the GB dictionary noting that the US version is derived from it.)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 3, 116 | "metadata": { 117 | "collapsed": false 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "settings = {\n", 122 | " \"analysis\" : {\n", 123 | " \"analyzer\" : {\n", 124 | " \"en_GB\" : {\n", 125 | " \"tokenizer\" : \"standard\",\n", 126 | " \"filter\" : [ \"lowercase\", \"en_GB\" ]\n", 127 | " }\n", 128 | " },\n", 129 | " \"filter\" : {\n", 130 | " \"en_GB\" : {\n", 131 | " \"type\" : \"hunspell\",\n", 132 | " \"locale\" : \"en_GB\"\n", 133 | " }\n", 134 | " }\n", 135 | " }\n", 136 | "}\n", 137 | "index.create_my_index(body=settings)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 4, 143 | "metadata": { 144 | "collapsed": false 145 | }, 146 | "outputs": [ 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "you'r,right,about,organ,jack,über,gener,waiter\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "# test with the standard English analyzer\n", 157 | "text = \"You're right about organizing jack's Über generation of waiters.\" \n", 158 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 159 | " (index='my_index', analyzer='english', text=text)['tokens']]\n", 160 | "print(','.join(analyzed_text))" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 5, 166 | "metadata": { 167 | "collapsed": false 168 | }, 169 | "outputs": [ 170 | { 171 | "name": "stdout", 172 | "output_type": "stream", 173 | "text": [ 174 | "you're,right,about,organize,organ,jack,über,generation,generate,genera,of,wait\n" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 180 | " (index='my_index', analyzer='en_GB', 
text=text)['tokens']]\n", 181 | "print(','.join(analyzed_text))" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "Let's see what happens with the following words that are known to be overstemmed by Porter stemmers (and later improved by the Porter2 stemmer):" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 6, 194 | "metadata": { 195 | "collapsed": false 196 | }, 197 | "outputs": [ 198 | { 199 | "name": "stdout", 200 | "output_type": "stream", 201 | "text": [ 202 | "gener,gener,gener,gener,organ,waiter\n" 203 | ] 204 | } 205 | ], 206 | "source": [ 207 | "text = \"A generically generally generously generated organized waiter.\"\n", 208 | "# English\n", 209 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 210 | " (index='my_index', analyzer='english', text=text)['tokens']]\n", 211 | "print(','.join(analyzed_text))" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 7, 217 | "metadata": { 218 | "collapsed": false 219 | }, 220 | "outputs": [ 221 | { 222 | "name": "stdout", 223 | "output_type": "stream", 224 | "text": [ 225 | "a,genera,genera,generously,generous,generate,organize,organ,wait\n" 226 | ] 227 | } 228 | ], 229 | "source": [ 230 | "# en_GB Hunspell:\n", 231 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 232 | " (index='my_index', analyzer='en_GB', text=text)['tokens']]\n", 233 | "print(','.join(analyzed_text))" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 8, 239 | "metadata": { 240 | "collapsed": true 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "english_token_filter = {\n", 245 | " \"settings\": {\n", 246 | " \"analysis\": {\n", 247 | " \"filter\": {\n", 248 | " \"english_stop\": {\n", 249 | " \"type\": \"stop\",\n", 250 | " \"stopwords\": \"_english_\"\n", 251 | " },\n", 252 | " \"light_english_stemmer\": {\n", 253 | " \"type\": \"stemmer\",\n", 254 | " \"language\": \"light_english\" \n", 255 | " },\n", 256 | " \"english_possessive_stemmer\": {\n", 257 | " \"type\": \"stemmer\",\n", 258 | " \"language\": \"possessive_english\"\n", 259 | " }\n", 260 | " },\n", 261 | " \"analyzer\": {\n", 262 | " \"my_english\": {\n", 263 | " \"tokenizer\": \"standard\",\n", 264 | " \"filter\": [\n", 265 | " \"english_possessive_stemmer\",\n", 266 | " \"lowercase\",\n", 267 | " \"english_stop\",\n", 268 | " \"light_english_stemmer\", \n", 269 | " \"asciifolding\" \n", 270 | " ]\n", 271 | " }\n", 272 | " }\n", 273 | " }\n", 274 | " }\n", 275 | "}\n", 276 | "index.create_my_index(body=english_token_filter)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 9, 282 | "metadata": { 283 | "collapsed": false 284 | }, 285 | "outputs": [ 286 | { 287 | "name": "stdout", 288 | "output_type": "stream", 289 | "text": [ 290 | "generic,generally,generous,generate,organized,waiter\n" 291 | ] 292 | } 293 | ], 294 | "source": [ 295 | "# my_english custom analyzer:\n", 296 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 297 | " (index='my_index', analyzer='my_english', text=text)['tokens']]\n", 298 | "print(','.join(analyzed_text))" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 10, 304 | "metadata": { 305 | "collapsed": false 306 | }, 307 | "outputs": [], 308 | "source": [ 309 | "porter_token_filter = {\n", 310 | " \"settings\": {\n", 311 | " \"analysis\": {\n", 312 | " \"filter\": {\n", 313 | " \"english_stop\": {\n", 314 | " \"type\": \"stop\",\n", 315 | " 
\"stopwords\": \"_english_\"\n", 316 | " },\n", 317 | " \"porter\": {\n", 318 | " \"type\": \"stemmer\",\n", 319 | " \"language\": \"porter\" \n", 320 | " },\n", 321 | " \"english_possessive_stemmer\": {\n", 322 | " \"type\": \"stemmer\",\n", 323 | " \"language\": \"possessive_english\"\n", 324 | " }\n", 325 | " },\n", 326 | " \"analyzer\": {\n", 327 | " \"my_porter_english\": {\n", 328 | " \"tokenizer\": \"standard\",\n", 329 | " \"filter\": [\n", 330 | " \"english_possessive_stemmer\",\n", 331 | " \"lowercase\",\n", 332 | " \"english_stop\",\n", 333 | " \"porter\", \n", 334 | " \"asciifolding\" \n", 335 | " ]\n", 336 | " }\n", 337 | " }\n", 338 | " }\n", 339 | " }\n", 340 | "}\n", 341 | "index.create_my_index(body=porter_token_filter)" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 11, 347 | "metadata": { 348 | "collapsed": false 349 | }, 350 | "outputs": [ 351 | { 352 | "name": "stdout", 353 | "output_type": "stream", 354 | "text": [ 355 | "gener,gener,gener,gener,organ,waiter\n" 356 | ] 357 | } 358 | ], 359 | "source": [ 360 | "# my_english custom analyzer:\n", 361 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 362 | " (index='my_index', analyzer='my_porter_english', text=text)['tokens']]\n", 363 | "print(','.join(analyzed_text))" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 12, 369 | "metadata": { 370 | "collapsed": true 371 | }, 372 | "outputs": [], 373 | "source": [ 374 | "porter2_token_filter = {\n", 375 | " \"settings\": {\n", 376 | " \"analysis\": {\n", 377 | " \"filter\": {\n", 378 | " \"english_stop\": {\n", 379 | " \"type\": \"stop\",\n", 380 | " \"stopwords\": \"_english_\"\n", 381 | " },\n", 382 | " \"porter2\": {\n", 383 | " \"type\": \"stemmer\",\n", 384 | " \"language\": \"porter2\" \n", 385 | " },\n", 386 | " \"english_possessive_stemmer\": {\n", 387 | " \"type\": \"stemmer\",\n", 388 | " \"language\": \"possessive_english\"\n", 389 | " }\n", 390 | " },\n", 391 | " \"analyzer\": {\n", 392 | " \"my_porter2_english\": {\n", 393 | " \"tokenizer\": \"standard\",\n", 394 | " \"filter\": [\n", 395 | " \"english_possessive_stemmer\",\n", 396 | " \"lowercase\",\n", 397 | " \"english_stop\",\n", 398 | " \"porter2\", \n", 399 | " \"asciifolding\" \n", 400 | " ]\n", 401 | " }\n", 402 | " }\n", 403 | " }\n", 404 | " }\n", 405 | "}\n", 406 | "index.create_my_index(body=porter2_token_filter)" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 13, 412 | "metadata": { 413 | "collapsed": false 414 | }, 415 | "outputs": [ 416 | { 417 | "name": "stdout", 418 | "output_type": "stream", 419 | "text": [ 420 | "generic,general,generous,generat,organ,waiter\n" 421 | ] 422 | } 423 | ], 424 | "source": [ 425 | "# my_english custom analyzer:\n", 426 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 427 | " (index='my_index', analyzer='my_porter2_english', text=text)['tokens']]\n", 428 | "print(','.join(analyzed_text))" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": {}, 434 | "source": [ 435 | "### Summary of Analyzer Comparison:\n", 436 | "\n", 437 | "text = \"A generically generally generously generated organized waiter.\"\n", 438 | "\n", 439 | "##### English\n", 440 | "\n", 441 | "gener,gener,gener,gener,organ,waiter\n", 442 | "\n", 443 | "##### Hunspell (en_GB) #####\n", 444 | "\n", 445 | "a,genera,genera,generously,generous,generate,organize,organ,wait\n", 446 | "\n", 447 | "##### \"My English\" (Lite stemmer)\n", 448 | "\n", 449 | 
"generic,generally,generous,generate,organized,waiter\n", 450 | "\n", 451 | "##### \"My English\" (Porter stemmer)\n", 452 | "\n", 453 | "gener,gener,gener,gener,organ,waiter\n", 454 | "\n", 455 | "##### \"My English\" (Porter2 stemmer)\n", 456 | "\n", 457 | "generic,general,generous,generat,organ,waiter\n", 458 | "\n", 459 | "\n", 460 | "### Preventing Stemming\n", 461 | "\n", 462 | "Maybe important to keep skies and skiing as distinct words rather than stemming them both down to ski (as would happen with the english analyzer).\n", 463 | "\n", 464 | "The ```keyword_marker``` and ```stemmer_override``` token filters customize the stemming process." 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 14, 470 | "metadata": { 471 | "collapsed": false 472 | }, 473 | "outputs": [], 474 | "source": [ 475 | "stem_control_settings = {\n", 476 | " \"settings\": {\n", 477 | " \"analysis\": {\n", 478 | " \"filter\": {\n", 479 | " \"no_stem\": {\n", 480 | " \"type\": \"keyword_marker\",\n", 481 | " \"keywords\": [ \"skies\" ] \n", 482 | " }\n", 483 | " },\n", 484 | " \"analyzer\": {\n", 485 | " \"my_stemmer\": {\n", 486 | " \"tokenizer\": \"standard\",\n", 487 | " \"filter\": [\n", 488 | " \"lowercase\",\n", 489 | " \"no_stem\",\n", 490 | " \"porter_stem\"\n", 491 | " ]\n", 492 | " }\n", 493 | " }\n", 494 | " }\n", 495 | " }\n", 496 | "}\n", 497 | "index.create_my_index(body=stem_control_settings)" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": 15, 503 | "metadata": { 504 | "collapsed": false 505 | }, 506 | "outputs": [ 507 | { 508 | "name": "stdout", 509 | "output_type": "stream", 510 | "text": [ 511 | "sky,skies,ski,ski\n" 512 | ] 513 | } 514 | ], 515 | "source": [ 516 | "# my_stemmer custom analyzer:\n", 517 | "text = ['sky skies skiing skis']\n", 518 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 519 | " (index='my_index', analyzer='my_stemmer', text=text)['tokens']]\n", 520 | "print(','.join(analyzed_text))" 521 | ] 522 | }, 523 | { 524 | "cell_type": "markdown", 525 | "metadata": {}, 526 | "source": [ 527 | "While the language analyzers allow us only to specify an array of words in the stem_exclusion parameter, the keyword_marker token filter also accepts a keywords_path parameter that allows us to store all of our keywords in [a file](https://www.elastic.co/guide/en/elasticsearch/guide/master/using-stopwords.html#updating-stopwords).\n", 528 | "\n", 529 | "#### Customizing Stemming\n", 530 | "\n", 531 | "Perhaps we prefer \"skies\" to be stemmed to \"sky\" instead. The ```stemmer_override``` token filter allows us to specify our own custom stemming rules. 
At the same time, we can handle some irregular forms like stemming mice to mouse and feet to foot:" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": 16, 537 | "metadata": { 538 | "collapsed": false 539 | }, 540 | "outputs": [ 541 | { 542 | "name": "stdout", 543 | "output_type": "stream", 544 | "text": [ 545 | "the,mouse,came,down,from,the,sky,and,ran,over,my,foot\n" 546 | ] 547 | } 548 | ], 549 | "source": [ 550 | "my_stemmer_override = {\n", 551 | " \"settings\": {\n", 552 | " \"analysis\": {\n", 553 | " \"filter\": {\n", 554 | " \"custom_stem\": {\n", 555 | " \"type\": \"stemmer_override\",\n", 556 | " \"rules\": [ \n", 557 | " \"skies=>sky\",\n", 558 | " \"mice=>mouse\",\n", 559 | " \"feet=>foot\"\n", 560 | " ]\n", 561 | " }\n", 562 | " },\n", 563 | " \"analyzer\": {\n", 564 | " \"my_stemmer_override\": {\n", 565 | " \"tokenizer\": \"standard\",\n", 566 | " \"filter\": [\n", 567 | " \"lowercase\",\n", 568 | " \"custom_stem\", \n", 569 | " \"porter_stem\"\n", 570 | " ]\n", 571 | " }\n", 572 | " }\n", 573 | " }\n", 574 | " }\n", 575 | "}\n", 576 | "index.create_my_index(body=my_stemmer_override)\n", 577 | "# my_stemmer_override custom analyzer:\n", 578 | "text = ['The mice came down from the skies and ran over my feet']\n", 579 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 580 | " (index='my_index', analyzer='my_stemmer_override', text=text)['tokens']]\n", 581 | "print(','.join(analyzed_text))" 582 | ] 583 | }, 584 | { 585 | "cell_type": "markdown", 586 | "metadata": {}, 587 | "source": [ 588 | "**NOTE**: The stemmer_override filter (\"custom_stem\") must be placed **before** the stemmer (here \"porter_stem\").\n", 589 | "\n", 590 | "Just as for the keyword_marker token filter, rules can be stored in a file whose location should be specified with the ```rules_path``` parameter." 
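Both filters can also read their word lists from files on disk instead of inline arrays. The sketch below only shows the shape of such a configuration; the file names are assumptions (paths are resolved relative to the Elasticsearch config directory, one entry per line):

```python
# hypothetical file-based variant of the keyword_marker / stemmer_override filters
file_based_settings = {
    "settings": {
        "analysis": {
            "filter": {
                "no_stem": {
                    "type": "keyword_marker",
                    "keywords_path": "analysis/protected_words.txt"       # e.g. skies
                },
                "custom_stem": {
                    "type": "stemmer_override",
                    "rules_path": "analysis/stemmer_override_rules.txt"   # e.g. mice=>mouse
                }
            },
            "analyzer": {
                "my_file_based_stemmer": {
                    "tokenizer": "standard",
                    # custom_stem must still come before the stemmer itself
                    "filter": ["lowercase", "no_stem", "custom_stem", "porter_stem"]
                }
            }
        }
    }
}
index.create_my_index(body=file_based_settings)
```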
591 | ] 592 | } 593 | ], 594 | "metadata": { 595 | "kernelspec": { 596 | "display_name": "Python 3", 597 | "language": "python", 598 | "name": "python3" 599 | }, 600 | "language_info": { 601 | "codemirror_mode": { 602 | "name": "ipython", 603 | "version": 3 604 | }, 605 | "file_extension": ".py", 606 | "mimetype": "text/x-python", 607 | "name": "python", 608 | "nbconvert_exporter": "python", 609 | "pygments_lexer": "ipython3", 610 | "version": "3.5.1" 611 | } 612 | }, 613 | "nbformat": 4, 614 | "nbformat_minor": 0 615 | } 616 | -------------------------------------------------------------------------------- /Dealing with Human Language/Reducing Words (Pt.2)-Copy1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Elasticsearch: The Definitive Guide - Python\n", 8 | "\n", 9 | "Following the examples in the book, here are Python snippets that achieve the same effect.\n", 10 | "\n", 11 | "Documentation for the Python libs:\n", 12 | "\n", 13 | "Low-level API:\n", 14 | "\n", 15 | "https://elasticsearch-py.readthedocs.io/en/master/index.html\n", 16 | "\n", 17 | "Expressive DSL API (more \"Pythonic\")\n", 18 | "\n", 19 | "http://elasticsearch-dsl.readthedocs.io/en/latest/index.html\n", 20 | "\n", 21 | "Github repo for DSL API:\n", 22 | "\n", 23 | "https://github.com/elastic/elasticsearch-dsl-py\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "import sys, os\n", 35 | "sys.path.insert(1, os.path.join(sys.path[0], '..'))" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "14 items created\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "import index\n", 55 | "from elasticsearch import Elasticsearch\n", 56 | "from elasticsearch_dsl import Search, Q\n", 57 | "from pprint import pprint\n", 58 | "\n", 59 | "es = Elasticsearch(\n", 60 | " 'localhost',\n", 61 | " # sniff before doing anything\n", 62 | " sniff_on_start=True,\n", 63 | " # refresh nodes after a node fails to respond\n", 64 | " sniff_on_connection_fail=True,\n", 65 | " # and also every 60 seconds\n", 66 | " sniffer_timeout=60\n", 67 | ")\n", 68 | "\n", 69 | "r = index.populate()\n", 70 | "print('{} items created'.format(len(r['items'])))\n", 71 | "\n", 72 | "# Let's repopulate the index as we deleted 'gb' in earlier chapters:\n", 73 | "# Run the script: populate.ipynb" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": { 79 | "collapsed": true 80 | }, 81 | "source": [ 82 | "### Reducing Words to Their Root Form\n", 83 | "\n", 84 | "#### Dictionary Stemmers\n", 85 | "\n", 86 | "Dictionary stemmers work quite differently from algorithmic stemmers. Instead of applying a standard set of rules to each word, they simply look up the word in the dictionary. Theoretically, they could produce much better results than an algorithmic stemmer. A dictionary stemmer should be able to do the following:\n", 87 | "\n", 88 | "* Return the correct root word for irregular forms such as feet and mice\n", 89 | "* Recognize the distinction between words that are similar but have different word senses—for example, organ and organization\n", 90 | "\n", 91 | "**Dictionary Stemmer** - only as good as its dictionary. 
Most e-dictionaries only ~10% of full dictionaries. Have to be updated etc.\n", 92 | "\n", 93 | "**Size and performance** - A dictionary stemmer needs to load all words, all prefixes, and all suffixes into memory. This can use a significant amount of RAM. Finding the right stem for a word is often considerably more complex than the equivalent process with an algorithmic stemmer.\n", 94 | "\n", 95 | "Let's explore the Hunspell dictionary \"stemmer\":\n", 96 | "\n", 97 | "```\n", 98 | "config/\n", 99 | " └ hunspell/ \n", 100 | " └ en_GB/ \n", 101 | " ├ en_GB.dic\n", 102 | " ├ en_GB.aff\n", 103 | " └ settings.yml \n", 104 | "```\n", 105 | "\n", 106 | "Note that we don't need to touch settings.yml (which override any settings in the master settings file: ```elasticsearch.yml```. Settings can be used to ignore case, which is otherwise set to false. \n", 107 | "\n", 108 | "* ```indices.analysis.hunspell.dictionary.ignore_case```\n", 109 | "\n", 110 | "(NOTE: due to my British roots, I changed the example to use the GB dictionary noting that the US version is derived from it.)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 3, 116 | "metadata": { 117 | "collapsed": false 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "settings = {\n", 122 | " \"analysis\" : {\n", 123 | " \"analyzer\" : {\n", 124 | " \"en_GB\" : {\n", 125 | " \"tokenizer\" : \"standard\",\n", 126 | " \"filter\" : [ \"lowercase\", \"en_GB\" ]\n", 127 | " }\n", 128 | " },\n", 129 | " \"filter\" : {\n", 130 | " \"en_GB\" : {\n", 131 | " \"type\" : \"hunspell\",\n", 132 | " \"locale\" : \"en_GB\"\n", 133 | " }\n", 134 | " }\n", 135 | " }\n", 136 | "}\n", 137 | "index.create_my_index(body=settings)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 4, 143 | "metadata": { 144 | "collapsed": false 145 | }, 146 | "outputs": [ 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "you'r,right,about,organ,jack,über,gener,waiter\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "# test with the standard English analyzer\n", 157 | "text = \"You're right about organizing jack's Über generation of waiters.\" \n", 158 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 159 | " (index='my_index', analyzer='english', text=text)['tokens']]\n", 160 | "print(','.join(analyzed_text))" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 5, 166 | "metadata": { 167 | "collapsed": false 168 | }, 169 | "outputs": [ 170 | { 171 | "name": "stdout", 172 | "output_type": "stream", 173 | "text": [ 174 | "you're,right,about,organize,organ,jack,über,generation,generate,genera,of,wait\n" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 180 | " (index='my_index', analyzer='en_GB', text=text)['tokens']]\n", 181 | "print(','.join(analyzed_text))" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "Let's see what happens with the following words that are known to be overstemmed by Porter stemmers (and later improved by the Porter2 stemmer):" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 6, 194 | "metadata": { 195 | "collapsed": false 196 | }, 197 | "outputs": [ 198 | { 199 | "name": "stdout", 200 | "output_type": "stream", 201 | "text": [ 202 | "gener,gener,gener,gener,organ,waiter\n" 203 | ] 204 | } 205 | ], 206 | "source": [ 207 | "text = \"A generically generally generously generated 
organized waiter.\"\n", 208 | "# English\n", 209 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 210 | " (index='my_index', analyzer='english', text=text)['tokens']]\n", 211 | "print(','.join(analyzed_text))" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 7, 217 | "metadata": { 218 | "collapsed": false 219 | }, 220 | "outputs": [ 221 | { 222 | "name": "stdout", 223 | "output_type": "stream", 224 | "text": [ 225 | "a,genera,genera,generously,generous,generate,organize,organ,wait\n" 226 | ] 227 | } 228 | ], 229 | "source": [ 230 | "# en_GB Hunspell:\n", 231 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 232 | " (index='my_index', analyzer='en_GB', text=text)['tokens']]\n", 233 | "print(','.join(analyzed_text))" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 8, 239 | "metadata": { 240 | "collapsed": true 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "english_token_filter = {\n", 245 | " \"settings\": {\n", 246 | " \"analysis\": {\n", 247 | " \"filter\": {\n", 248 | " \"english_stop\": {\n", 249 | " \"type\": \"stop\",\n", 250 | " \"stopwords\": \"_english_\"\n", 251 | " },\n", 252 | " \"light_english_stemmer\": {\n", 253 | " \"type\": \"stemmer\",\n", 254 | " \"language\": \"light_english\" \n", 255 | " },\n", 256 | " \"english_possessive_stemmer\": {\n", 257 | " \"type\": \"stemmer\",\n", 258 | " \"language\": \"possessive_english\"\n", 259 | " }\n", 260 | " },\n", 261 | " \"analyzer\": {\n", 262 | " \"my_english\": {\n", 263 | " \"tokenizer\": \"standard\",\n", 264 | " \"filter\": [\n", 265 | " \"english_possessive_stemmer\",\n", 266 | " \"lowercase\",\n", 267 | " \"english_stop\",\n", 268 | " \"light_english_stemmer\", \n", 269 | " \"asciifolding\" \n", 270 | " ]\n", 271 | " }\n", 272 | " }\n", 273 | " }\n", 274 | " }\n", 275 | "}\n", 276 | "index.create_my_index(body=english_token_filter)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 9, 282 | "metadata": { 283 | "collapsed": false 284 | }, 285 | "outputs": [ 286 | { 287 | "name": "stdout", 288 | "output_type": "stream", 289 | "text": [ 290 | "generic,generally,generous,generate,organized,waiter\n" 291 | ] 292 | } 293 | ], 294 | "source": [ 295 | "# my_english custom analyzer:\n", 296 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 297 | " (index='my_index', analyzer='my_english', text=text)['tokens']]\n", 298 | "print(','.join(analyzed_text))" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 10, 304 | "metadata": { 305 | "collapsed": false 306 | }, 307 | "outputs": [], 308 | "source": [ 309 | "porter_token_filter = {\n", 310 | " \"settings\": {\n", 311 | " \"analysis\": {\n", 312 | " \"filter\": {\n", 313 | " \"english_stop\": {\n", 314 | " \"type\": \"stop\",\n", 315 | " \"stopwords\": \"_english_\"\n", 316 | " },\n", 317 | " \"porter\": {\n", 318 | " \"type\": \"stemmer\",\n", 319 | " \"language\": \"porter\" \n", 320 | " },\n", 321 | " \"english_possessive_stemmer\": {\n", 322 | " \"type\": \"stemmer\",\n", 323 | " \"language\": \"possessive_english\"\n", 324 | " }\n", 325 | " },\n", 326 | " \"analyzer\": {\n", 327 | " \"my_porter_english\": {\n", 328 | " \"tokenizer\": \"standard\",\n", 329 | " \"filter\": [\n", 330 | " \"english_possessive_stemmer\",\n", 331 | " \"lowercase\",\n", 332 | " \"english_stop\",\n", 333 | " \"porter\", \n", 334 | " \"asciifolding\" \n", 335 | " ]\n", 336 | " }\n", 337 | " }\n", 338 | " }\n", 339 | " }\n", 340 | "}\n", 341 | 
"index.create_my_index(body=porter_token_filter)" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 11, 347 | "metadata": { 348 | "collapsed": false 349 | }, 350 | "outputs": [ 351 | { 352 | "name": "stdout", 353 | "output_type": "stream", 354 | "text": [ 355 | "gener,gener,gener,gener,organ,waiter\n" 356 | ] 357 | } 358 | ], 359 | "source": [ 360 | "# my_english custom analyzer:\n", 361 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 362 | " (index='my_index', analyzer='my_porter_english', text=text)['tokens']]\n", 363 | "print(','.join(analyzed_text))" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 12, 369 | "metadata": { 370 | "collapsed": true 371 | }, 372 | "outputs": [], 373 | "source": [ 374 | "porter2_token_filter = {\n", 375 | " \"settings\": {\n", 376 | " \"analysis\": {\n", 377 | " \"filter\": {\n", 378 | " \"english_stop\": {\n", 379 | " \"type\": \"stop\",\n", 380 | " \"stopwords\": \"_english_\"\n", 381 | " },\n", 382 | " \"porter2\": {\n", 383 | " \"type\": \"stemmer\",\n", 384 | " \"language\": \"porter2\" \n", 385 | " },\n", 386 | " \"english_possessive_stemmer\": {\n", 387 | " \"type\": \"stemmer\",\n", 388 | " \"language\": \"possessive_english\"\n", 389 | " }\n", 390 | " },\n", 391 | " \"analyzer\": {\n", 392 | " \"my_porter2_english\": {\n", 393 | " \"tokenizer\": \"standard\",\n", 394 | " \"filter\": [\n", 395 | " \"english_possessive_stemmer\",\n", 396 | " \"lowercase\",\n", 397 | " \"english_stop\",\n", 398 | " \"porter2\", \n", 399 | " \"asciifolding\" \n", 400 | " ]\n", 401 | " }\n", 402 | " }\n", 403 | " }\n", 404 | " }\n", 405 | "}\n", 406 | "index.create_my_index(body=porter2_token_filter)" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 13, 412 | "metadata": { 413 | "collapsed": false 414 | }, 415 | "outputs": [ 416 | { 417 | "name": "stdout", 418 | "output_type": "stream", 419 | "text": [ 420 | "generic,general,generous,generat,organ,waiter\n" 421 | ] 422 | } 423 | ], 424 | "source": [ 425 | "# my_english custom analyzer:\n", 426 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 427 | " (index='my_index', analyzer='my_porter2_english', text=text)['tokens']]\n", 428 | "print(','.join(analyzed_text))" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": {}, 434 | "source": [ 435 | "### Summary of Analyzer Comparison:\n", 436 | "\n", 437 | "text = \"A generically generally generously generated organized waiter.\"\n", 438 | "\n", 439 | "##### English\n", 440 | "\n", 441 | "gener,gener,gener,gener,organ,waiter\n", 442 | "\n", 443 | "##### Hunspell (en_GB) #####\n", 444 | "\n", 445 | "a,genera,genera,generously,generous,generate,organize,organ,wait\n", 446 | "\n", 447 | "##### \"My English\" (Lite stemmer)\n", 448 | "\n", 449 | "generic,generally,generous,generate,organized,waiter\n", 450 | "\n", 451 | "##### \"My English\" (Porter stemmer)\n", 452 | "\n", 453 | "gener,gener,gener,gener,organ,waiter\n", 454 | "\n", 455 | "##### \"My English\" (Porter2 stemmer)\n", 456 | "\n", 457 | "generic,general,generous,generat,organ,waiter\n", 458 | "\n", 459 | "\n", 460 | "### Preventing Stemming\n", 461 | "\n", 462 | "Maybe important to keep skies and skiing as distinct words rather than stemming them both down to ski (as would happen with the english analyzer).\n", 463 | "\n", 464 | "The ```keyword_marker``` and ```stemmer_override``` token filters customize the stemming process." 
465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 14, 470 | "metadata": { 471 | "collapsed": false 472 | }, 473 | "outputs": [], 474 | "source": [ 475 | "stem_control_settings = {\n", 476 | " \"settings\": {\n", 477 | " \"analysis\": {\n", 478 | " \"filter\": {\n", 479 | " \"no_stem\": {\n", 480 | " \"type\": \"keyword_marker\",\n", 481 | " \"keywords\": [ \"skies\" ] \n", 482 | " }\n", 483 | " },\n", 484 | " \"analyzer\": {\n", 485 | " \"my_stemmer\": {\n", 486 | " \"tokenizer\": \"standard\",\n", 487 | " \"filter\": [\n", 488 | " \"lowercase\",\n", 489 | " \"no_stem\",\n", 490 | " \"porter_stem\"\n", 491 | " ]\n", 492 | " }\n", 493 | " }\n", 494 | " }\n", 495 | " }\n", 496 | "}\n", 497 | "index.create_my_index(body=stem_control_settings)" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": 15, 503 | "metadata": { 504 | "collapsed": false 505 | }, 506 | "outputs": [ 507 | { 508 | "name": "stdout", 509 | "output_type": "stream", 510 | "text": [ 511 | "sky,skies,ski,ski\n" 512 | ] 513 | } 514 | ], 515 | "source": [ 516 | "# my_stemmer custom analyzer:\n", 517 | "text = ['sky skies skiing skis']\n", 518 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 519 | " (index='my_index', analyzer='my_stemmer', text=text)['tokens']]\n", 520 | "print(','.join(analyzed_text))" 521 | ] 522 | }, 523 | { 524 | "cell_type": "markdown", 525 | "metadata": {}, 526 | "source": [ 527 | "While the language analyzers allow us only to specify an array of words in the stem_exclusion parameter, the keyword_marker token filter also accepts a keywords_path parameter that allows us to store all of our keywords in [a file](https://www.elastic.co/guide/en/elasticsearch/guide/master/using-stopwords.html#updating-stopwords).\n", 528 | "\n", 529 | "#### Customizing Stemming\n", 530 | "\n", 531 | "Perhaps we prefer \"skies\" to be stemmed to \"sky\" instead. The ```stemmer_override``` token filter allows us to specify our own custom stemming rules. 
At the same time, we can handle some irregular forms like stemming mice to mouse and feet to foot:" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": 16, 537 | "metadata": { 538 | "collapsed": false 539 | }, 540 | "outputs": [ 541 | { 542 | "name": "stdout", 543 | "output_type": "stream", 544 | "text": [ 545 | "the,mouse,came,down,from,the,sky,and,ran,over,my,foot\n" 546 | ] 547 | } 548 | ], 549 | "source": [ 550 | "my_stemmer_override = {\n", 551 | " \"settings\": {\n", 552 | " \"analysis\": {\n", 553 | " \"filter\": {\n", 554 | " \"custom_stem\": {\n", 555 | " \"type\": \"stemmer_override\",\n", 556 | " \"rules\": [ \n", 557 | " \"skies=>sky\",\n", 558 | " \"mice=>mouse\",\n", 559 | " \"feet=>foot\"\n", 560 | " ]\n", 561 | " }\n", 562 | " },\n", 563 | " \"analyzer\": {\n", 564 | " \"my_stemmer_override\": {\n", 565 | " \"tokenizer\": \"standard\",\n", 566 | " \"filter\": [\n", 567 | " \"lowercase\",\n", 568 | " \"custom_stem\", \n", 569 | " \"porter_stem\"\n", 570 | " ]\n", 571 | " }\n", 572 | " }\n", 573 | " }\n", 574 | " }\n", 575 | "}\n", 576 | "index.create_my_index(body=my_stemmer_override)\n", 577 | "# my_stemmer_override custom analyzer:\n", 578 | "text = ['The mice came down from the skies and ran over my feet']\n", 579 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 580 | " (index='my_index', analyzer='my_stemmer_override', text=text)['tokens']]\n", 581 | "print(','.join(analyzed_text))" 582 | ] 583 | }, 584 | { 585 | "cell_type": "markdown", 586 | "metadata": {}, 587 | "source": [ 588 | "**NOTE**: The stemmer_override filter (\"custom_stem\") must be placed **before** the stemmer (here \"porter_stem\").\n", 589 | "\n", 590 | "Just as for the keyword_marker token filter, rules can be stored in a file whose location should be specified with the ```rules_path``` parameter." 
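Neither file-based variant is demonstrated in this notebook, so here is a minimal sketch of what it might look like. The file names under the node's ```config``` directory and the analyzer name are assumptions for illustration (the paths are resolved relative to the config directory):

```python
# Sketch: keyword_marker and stemmer_override driven by files rather than
# inline lists (file names and analyzer name are made up for illustration).
#   config/analysis/protected_words.txt  - one word per line, e.g. "skies"
#   config/analysis/stem_rules.txt       - one rule per line, e.g. "skies=>sky"
file_based_stemming = {
    "settings": {
        "analysis": {
            "filter": {
                "no_stem": {
                    "type": "keyword_marker",
                    "keywords_path": "analysis/protected_words.txt"
                },
                "custom_stem": {
                    "type": "stemmer_override",
                    "rules_path": "analysis/stem_rules.txt"
                }
            },
            "analyzer": {
                "my_file_based_stemmer": {
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase",
                        "no_stem",
                        "custom_stem",   # both filters must still precede the stemmer
                        "porter_stem"
                    ]
                }
            }
        }
    }
}
index.create_my_index(body=file_based_stemming)
```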
591 | ] 592 | } 593 | ], 594 | "metadata": { 595 | "kernelspec": { 596 | "display_name": "Python 3", 597 | "language": "python", 598 | "name": "python3" 599 | }, 600 | "language_info": { 601 | "codemirror_mode": { 602 | "name": "ipython", 603 | "version": 3 604 | }, 605 | "file_extension": ".py", 606 | "mimetype": "text/x-python", 607 | "name": "python", 608 | "nbconvert_exporter": "python", 609 | "pygments_lexer": "ipython3", 610 | "version": "3.5.1" 611 | } 612 | }, 613 | "nbformat": 4, 614 | "nbformat_minor": 0 615 | } 616 | -------------------------------------------------------------------------------- /Getting Started/Sorting and Relevance.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Elasticsearch: The Definitive Guide - Python\n", 8 | "\n", 9 | "Following the examples in the book, here are Python snippets that achieve the same effect.\n", 10 | "\n", 11 | "Documentation for the Python libs:\n", 12 | "\n", 13 | "Low-level API:\n", 14 | "\n", 15 | "https://elasticsearch-py.readthedocs.io/en/master/index.html\n", 16 | "\n", 17 | "Expressive DSL API (more \"Pythonic\")\n", 18 | "\n", 19 | "http://elasticsearch-dsl.readthedocs.io/en/latest/index.html\n", 20 | "\n", 21 | "Github repo for DSL API:\n", 22 | "\n", 23 | "https://github.com/elastic/elasticsearch-dsl-py\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "14 items created\n" 38 | ] 39 | } 40 | ], 41 | "source": [ 42 | "import index\n", 43 | "from elasticsearch import Elasticsearch\n", 44 | "from elasticsearch_dsl import Search, Q\n", 45 | "from pprint import pprint\n", 46 | "\n", 47 | "es = Elasticsearch(\n", 48 | " 'localhost',\n", 49 | " # sniff before doing anything\n", 50 | " sniff_on_start=True,\n", 51 | " # refresh nodes after a node fails to respond\n", 52 | " sniff_on_connection_fail=True,\n", 53 | " # and also every 60 seconds\n", 54 | " sniffer_timeout=60\n", 55 | ")\n", 56 | "\n", 57 | "r = index.populate()\n", 58 | "print('{} items created'.format(len(r['items'])))\n", 59 | "\n", 60 | "# Let's repopulate the index as we deleted 'gb' in earlier chapters:\n", 61 | "# Run the script: populate.ipynb" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": { 67 | "collapsed": true 68 | }, 69 | "source": [ 70 | "### Sorting and Relevance\n", 71 | "\n", 72 | "By default, results are returned sorted by relevance—with the most relevant docs first. Later in this chapter, we explain what we mean by relevance and how it is calculated, but let’s start by looking at the sort parameter and how to use it.\n", 73 | "\n", 74 | "Relevance isn't always meaningful e.g. 
if we are mostly filtering:\n", 75 | "\n", 76 | "```\n", 77 | "GET /_search\n", 78 | "{\n", 79 | " \"query\" : {\n", 80 | " \"bool\" : {\n", 81 | " \"filter\" : {\n", 82 | " \"term\" : {\n", 83 | " \"user_id\" : 1\n", 84 | " }\n", 85 | " }\n", 86 | " }\n", 87 | " }\n", 88 | "}\n", 89 | "```" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 2, 95 | "metadata": { 96 | "collapsed": false 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "# And a filter-only search\n", 101 | "s = Search(using=es)\n", 102 | "s = s.filter('term', user_id=1)\n", 103 | "res = s.execute()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "Docs returned in random order and will have a _score of 0" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 3, 116 | "metadata": { 117 | "collapsed": false 118 | }, 119 | "outputs": [ 120 | { 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | "Score:0.0\n", 125 | "Score:0.0\n", 126 | "Score:0.0\n", 127 | "Score:0.0\n", 128 | "Score:0.0\n", 129 | "Score:0.0\n" 130 | ] 131 | } 132 | ], 133 | "source": [ 134 | "for hit in res:\n", 135 | " print('Score:{}'.format(hit.meta.score))" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 4, 141 | "metadata": { 142 | "collapsed": false 143 | }, 144 | "outputs": [ 145 | { 146 | "data": { 147 | "text/plain": [ 148 | ", , , , , ]>" 149 | ] 150 | }, 151 | "execution_count": 4, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "res" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 5, 163 | "metadata": { 164 | "collapsed": false 165 | }, 166 | "outputs": [], 167 | "source": [ 168 | "# Or we can make sure the items have a constant non-zero score\n", 169 | "s = Search(using=es).query('constant_score', filter=Q('term', user_id=1))\n", 170 | "res = s.execute()" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 6, 176 | "metadata": { 177 | "collapsed": false 178 | }, 179 | "outputs": [ 180 | { 181 | "name": "stdout", 182 | "output_type": "stream", 183 | "text": [ 184 | "Score:1.0 with date of 2014-09-24\n", 185 | "Score:1.0 with date of 2014-09-18\n", 186 | "Score:1.0 with date of 2014-09-20\n", 187 | "Score:1.0 with date of 2014-09-22\n", 188 | "Score:1.0 with date of 2014-09-14\n", 189 | "Score:1.0 with date of 2014-09-16\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "for hit in res:\n", 195 | " print('Score:{} with date of {}'.format(hit.meta.score,hit.date))" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "### Sorting by Field Values\n", 203 | "\n", 204 | "```\n", 205 | "GET /_search\n", 206 | "{\n", 207 | " \"query\" : {\n", 208 | " \"bool\" : {\n", 209 | " \"filter\" : { \"term\" : { \"user_id\" : 1 }}\n", 210 | " }\n", 211 | " },\n", 212 | " \"sort\": { \"date\": { \"order\": \"desc\" }}\n", 213 | "}\n", 214 | "```" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 7, 220 | "metadata": { 221 | "collapsed": false 222 | }, 223 | "outputs": [ 224 | { 225 | "name": "stdout", 226 | "output_type": "stream", 227 | "text": [ 228 | "Score:None with date of 2014-09-24 and sort field:[1411516800000]\n", 229 | "Score:None with date of 2014-09-22 and sort field:[1411344000000]\n", 230 | "Score:None with date of 2014-09-20 and sort field:[1411171200000]\n", 231 | "Score:None with date of 2014-09-18 and sort 
field:[1410998400000]\n", 232 | "Score:None with date of 2014-09-16 and sort field:[1410825600000]\n", 233 | "Score:None with date of 2014-09-14 and sort field:[1410652800000]\n" 234 | ] 235 | } 236 | ], 237 | "source": [ 238 | "s = Search(using=es).query('bool', filter=Q('term', user_id=1))\n", 239 | "s = s.sort({ \"date\": { \"order\": \"desc\" }})\n", 240 | "res = s.execute()\n", 241 | "# Now is date descending order:\n", 242 | "for hit in res:\n", 243 | " print('Score:{} with date of {} and sort field:{}'\n", 244 | " .format(hit.meta.score,hit.date,hit.meta.sort))" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "Notice the score field set to None because it isn't required (due to a sort) and the addition of a \"sort\" field that was indexed internally and used to perform the sort (here in milliseconds since the epoch)." 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "### Multilevel Sorting\n", 259 | "\n", 260 | "```\n", 261 | "GET /_search\n", 262 | "{\n", 263 | " \"query\" : {\n", 264 | " \"bool\" : {\n", 265 | " \"must\": { \"match\": { \"tweet\": \"manage text search\" }},\n", 266 | " \"filter\" : { \"term\" : { \"user_id\" : 2 }}\n", 267 | " }\n", 268 | " },\n", 269 | " \"sort\": [\n", 270 | " { \"date\": { \"order\": \"desc\" }},\n", 271 | " { \"_score\": { \"order\": \"desc\" }}\n", 272 | " ]\n", 273 | "}\n", 274 | "```" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 8, 280 | "metadata": { 281 | "collapsed": true 282 | }, 283 | "outputs": [], 284 | "source": [ 285 | "s = Search(using=es).query('bool', \n", 286 | " must=Q('match', tweet='manage text search'),\n", 287 | " filter=Q('term', user_id=2))\n", 288 | "s = s.sort({ \"date\": { \"order\": \"desc\" }}, { \"_score\": { \"order\": \"desc\" }})\n", 289 | "#s = s.sort(\"date\",\"_score\") # sorted by date first\n", 290 | "res = s.execute()" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 9, 296 | "metadata": { 297 | "collapsed": false 298 | }, 299 | "outputs": [ 300 | { 301 | "name": "stdout", 302 | "output_type": "stream", 303 | "text": [ 304 | "Score:0.64433396 with date of 2014-09-15 and sort field:[1410739200000, 0.64433396]\n", 305 | "Score:1.3434829 with date of 2014-09-13 and sort field:[1410566400000, 1.3434829]\n" 306 | ] 307 | } 308 | ], 309 | "source": [ 310 | "for hit in res:\n", 311 | " print('Score:{} with date of {} and sort field:{}'\n", 312 | " .format(hit.meta.score,hit.date,hit.meta.sort))" 313 | ] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "metadata": {}, 318 | "source": [ 319 | "Order is important. Results are sorted by the first criterion first. Only results whose first sort value is identical will then be sorted by the second criterion, and so on.\n", 320 | "\n", 321 | "Multilevel sorting doesn’t have to involve the _score. You could sort by using several different fields, on geo-distance or on a custom value calculated in a script.\n", 322 | "\n", 323 | "#### Sorting on Multivalue Fields\n", 324 | "\n", 325 | "Let's say we have fields with more than one item. How do we sort on them? For numbers and dates, you can reduce a multivalue field to a single value by using the min, max, avg, or sum sort modes. 
For instance, you could sort on the earliest date in each dates field by using the following:\n", 326 | "\n", 327 | "```\n", 328 | "\"sort\": {\n", 329 | " \"dates\": {\n", 330 | " \"order\": \"asc\",\n", 331 | " \"mode\": \"min\"\n", 332 | " }\n", 333 | "}\n", 334 | "```\n", 335 | "\n", 336 | "Let's create some docs to try this." 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 10, 342 | "metadata": { 343 | "collapsed": false 344 | }, 345 | "outputs": [ 346 | { 347 | "data": { 348 | "text/plain": [ 349 | "{'_id': '2',\n", 350 | " '_index': 'shows',\n", 351 | " '_shards': {'failed': 0, 'successful': 1, 'total': 2},\n", 352 | " '_type': 'tv_series',\n", 353 | " '_version': 1,\n", 354 | " 'created': True,\n", 355 | " 'result': 'created'}" 356 | ] 357 | }, 358 | "execution_count": 10, 359 | "metadata": {}, 360 | "output_type": "execute_result" 361 | } 362 | ], 363 | "source": [ 364 | "doc1 = {\n", 365 | " 'title': 'How I Met Your Mother',\n", 366 | " 'date': '2013-01-01',\n", 367 | " 'ratings': [2,3,1,3,4,5,5,5,3,4,2]\n", 368 | "}\n", 369 | "doc2 = {\n", 370 | " 'title': 'Breaking Bad',\n", 371 | " 'date': '2013-01-01',\n", 372 | " 'ratings': [5,5,4,3,4,5,5,5,3,5,5]\n", 373 | "}\n", 374 | "es.create(index='shows', doc_type='tv_series', body=doc1, id=1)\n", 375 | "es.create(index='shows', doc_type='tv_series', body=doc2, id=2)\n" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 11, 381 | "metadata": { 382 | "collapsed": true 383 | }, 384 | "outputs": [], 385 | "source": [ 386 | "s = Search(using=es)\n", 387 | "s = s.sort({ \"ratings\": { \"order\": \"desc\", \"mode\":\"avg\" }})\n", 388 | "#s = s.sort(\"date\",\"_score\") # sorted by date first\n", 389 | "res = s.execute()" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 12, 395 | "metadata": { 396 | "collapsed": false 397 | }, 398 | "outputs": [], 399 | "source": [ 400 | "for hit in res:\n", 401 | " print(hit.title, hit.meta.sort)" 402 | ] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "metadata": {}, 407 | "source": [ 408 | "### String Sorting and Multifields\n", 409 | "\n", 410 | "Sorting on text fields is problematic because an analyzed field will consist of a bunch of tokens (post analyzer). If you really want to sort on a text field, then it's best left in an unanalyzed form. This can be done by adding a field:\n", 411 | "\n", 412 | "```\n", 413 | "\"tweet\": { \n", 414 | " \"type\": \"string\",\n", 415 | " \"analyzer\": \"english\",\n", 416 | " \"fields\": {\n", 417 | " \"raw\": { \n", 418 | " \"type\": \"string\",\n", 419 | " \"index\": \"not_analyzed\"\n", 420 | " }\n", 421 | " }\n", 422 | "}\n", 423 | "```\n", 424 | "\n", 425 | "And then sort on the raw field:\n", 426 | "\n", 427 | "```\n", 428 | "GET /_search\n", 429 | "{\n", 430 | " \"query\": {\n", 431 | " \"match\": {\n", 432 | " \"tweet\": \"elasticsearch\"\n", 433 | " }\n", 434 | " },\n", 435 | " \"sort\": \"tweet.raw\"\n", 436 | "}\n", 437 | "```\n", 438 | "\n", 439 | "First I will delete the tweet index and re-create using template 2." 
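Template 2 lives in the ```index``` helper module and is not reproduced in this notebook. Purely as an illustration, here is a sketch of the relevant part of such a mapping in the pre-5.x style used by the book snippet above (the index and type names follow the earlier examples; the real template may differ):

```python
# Sketch: a "tweet" field with an unanalyzed "raw" sub-field for sorting.
# (string/not_analyzed syntax matches the older Elasticsearch versions these
# notebooks target; names are taken from the examples above, not the template.)
tweet_mapping = {
    "mappings": {
        "tweet": {
            "properties": {
                "tweet": {
                    "type": "string",
                    "analyzer": "english",
                    "fields": {
                        "raw": {
                            "type": "string",
                            "index": "not_analyzed"
                        }
                    }
                }
            }
        }
    }
}
es.indices.delete(index='us', ignore=404)        # drop the old index first
es.indices.create(index='us', body=tweet_mapping)
```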
440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 13, 445 | "metadata": { 446 | "collapsed": false 447 | }, 448 | "outputs": [], 449 | "source": [ 450 | "r = index.populate(template=2)" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": 14, 456 | "metadata": { 457 | "collapsed": false 458 | }, 459 | "outputs": [], 460 | "source": [ 461 | "s = Search(using=es).query(Q('match', tweet='elasticsearch'))\n", 462 | "s = s.sort(\"tweet.raw\")\n", 463 | "res = s.execute()" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": 15, 469 | "metadata": { 470 | "collapsed": false 471 | }, 472 | "outputs": [], 473 | "source": [ 474 | "for hit in res:\n", 475 | " print(hit.meta.sort)" 476 | ] 477 | }, 478 | { 479 | "cell_type": "markdown", 480 | "metadata": {}, 481 | "source": [ 482 | "### What is Relevance?\n", 483 | "\n", 484 | "The relevance score of each document is represented by a positive floating-point number called the _score. The higher the _score, the more relevant the document.\n", 485 | "\n", 486 | "A query clause generates a _score for each document. How that score is calculated depends on the type of query clause. Different query clauses are used for different purposes: a fuzzy query might determine the _score by calculating how similar the spelling of the found word is to the original search term; a terms query would incorporate the percentage of terms that were found. However, what we usually mean by relevance is the algorithm that we use to calculate how similar the contents of a full-text field are to a full-text query string.\n", 487 | "\n", 488 | "The standard similarity algorithm used in Elasticsearch is known as term frequency/inverse document frequency, or [TF/IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)\n", 489 | "\n", 490 | "Understanding how the relevance was calculated can be difficult to understand, hence the availability of the explain parameter.\n", 491 | "\n", 492 | "```\n", 493 | "GET /_search?explain \n", 494 | "{\n", 495 | " \"query\" : { \"match\" : { \"tweet\" : \"honeymoon\" }}\n", 496 | "}\n", 497 | "```" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": 16, 503 | "metadata": { 504 | "collapsed": false 505 | }, 506 | "outputs": [], 507 | "source": [ 508 | "s = Search(using=es).query(Q('match', tweet='honeymoon'))\n", 509 | "s = s.extra(explain=True)\n", 510 | "res = s.execute()" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": 18, 516 | "metadata": { 517 | "collapsed": false 518 | }, 519 | "outputs": [ 520 | { 521 | "data": { 522 | "text/html": [ 523 | "
<div id=\"1873ba03-527b-44f2-b271-02475a2091f5\"></div>
" 524 | ] 525 | }, 526 | "metadata": {}, 527 | "output_type": "display_data" 528 | }, 529 | { 530 | "data": { 531 | "application/javascript": [ 532 | "\n", 533 | " require([\"https://rawgit.com/caldwell/renderjson/master/renderjson.js\"], function() {\n", 534 | " document.getElementById('1873ba03-527b-44f2-b271-02475a2091f5').appendChild(renderjson([{'_score': 0.6395861, '_type': 'tweet', '_node': 'nKjjkxx5SfWhB1vabVu5ig', '_source': {'date': '2014-09-22', 'tweet': 'Elasticsearch and I have left the honeymoon stage, and I still love her.', 'user_id': 1, 'name': 'John Smith'}, '_id': '12', '_index': 'us', '_shard': '[us][1]', '_explanation': {'details': [{'details': [{'details': [{'details': [], 'value': 1.0, 'description': 'docFreq'}, {'details': [], 'value': 2.0, 'description': 'docCount'}], 'value': 0.6931472, 'description': 'idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:'}, {'details': [{'details': [], 'value': 1.0, 'description': 'termFreq=1.0'}, {'details': [], 'value': 1.2, 'description': 'parameter k1'}, {'details': [], 'value': 0.75, 'description': 'parameter b'}, {'details': [], 'value': 8.5, 'description': 'avgFieldLength'}, {'details': [], 'value': 10.24, 'description': 'fieldLength'}], 'value': 0.9227277, 'description': 'tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:'}], 'value': 0.6395861, 'description': 'score(doc=2,freq=1.0 = termFreq=1.0\\n), product of:'}], 'value': 0.6395861, 'description': 'weight(tweet:honeymoon in 2) [PerFieldSimilarity], result of:'}}]))\n", 535 | " });\n", 536 | " " 537 | ] 538 | }, 539 | "metadata": {}, 540 | "output_type": "display_data" 541 | } 542 | ], 543 | "source": [ 544 | "index.RenderJSON(res['hits']['hits'])" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": 29, 550 | "metadata": { 551 | "collapsed": false 552 | }, 553 | "outputs": [], 554 | "source": [ 555 | "s = Search(using=es).query(Q('match', tweet='honeymoon') & Q('match', _id=12))\n", 556 | "s = s.extra(explain=True)\n", 557 | "res = s.execute()" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": 33, 563 | "metadata": { 564 | "collapsed": false 565 | }, 566 | "outputs": [ 567 | { 568 | "data": { 569 | "text/html": [ 570 | "
" 571 | ] 572 | }, 573 | "metadata": {}, 574 | "output_type": "display_data" 575 | }, 576 | { 577 | "data": { 578 | "application/javascript": [ 579 | "\n", 580 | " require([\"https://rawgit.com/caldwell/renderjson/master/renderjson.js\"], function() {\n", 581 | " document.getElementById('8748946e-0bb8-48ec-b4c1-ac03e0cd3026').appendChild(renderjson([{'_score': 1.6395861, '_type': 'tweet', '_node': 'nKjjkxx5SfWhB1vabVu5ig', '_source': {'date': '2014-09-22', 'tweet': 'Elasticsearch and I have left the honeymoon stage, and I still love her.', 'user_id': 1, 'name': 'John Smith'}, '_id': '12', '_index': 'us', '_shard': '[us][1]', '_explanation': {'details': [{'details': [{'details': [{'details': [{'details': [], 'value': 1.0, 'description': 'docFreq'}, {'details': [], 'value': 2.0, 'description': 'docCount'}], 'value': 0.6931472, 'description': 'idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:'}, {'details': [{'details': [], 'value': 1.0, 'description': 'termFreq=1.0'}, {'details': [], 'value': 1.2, 'description': 'parameter k1'}, {'details': [], 'value': 0.75, 'description': 'parameter b'}, {'details': [], 'value': 8.5, 'description': 'avgFieldLength'}, {'details': [], 'value': 10.24, 'description': 'fieldLength'}], 'value': 0.9227277, 'description': 'tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:'}], 'value': 0.6395861, 'description': 'score(doc=2,freq=1.0 = termFreq=1.0\\n), product of:'}], 'value': 0.6395861, 'description': 'weight(tweet:honeymoon in 2) [PerFieldSimilarity], result of:'}, {'details': [{'details': [], 'value': 1.0, 'description': 'boost'}, {'details': [], 'value': 1.0, 'description': 'queryNorm'}], 'value': 1.0, 'description': 'ConstantScore(_uid:tweet#12 _uid:user#12), product of:'}], 'value': 1.6395861, 'description': 'sum of:'}}]))\n", 582 | " });\n", 583 | " " 584 | ] 585 | }, 586 | "metadata": {}, 587 | "output_type": "display_data" 588 | } 589 | ], 590 | "source": [ 591 | "index.RenderJSON(res['hits']['hits'])" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": null, 597 | "metadata": { 598 | "collapsed": true 599 | }, 600 | "outputs": [], 601 | "source": [] 602 | } 603 | ], 604 | "metadata": { 605 | "kernelspec": { 606 | "display_name": "Python 3", 607 | "language": "python", 608 | "name": "python3" 609 | }, 610 | "language_info": { 611 | "codemirror_mode": { 612 | "name": "ipython", 613 | "version": 3 614 | }, 615 | "file_extension": ".py", 616 | "mimetype": "text/x-python", 617 | "name": "python", 618 | "nbconvert_exporter": "python", 619 | "pygments_lexer": "ipython3", 620 | "version": "3.5.1" 621 | } 622 | }, 623 | "nbformat": 4, 624 | "nbformat_minor": 0 625 | } 626 | --------------------------------------------------------------------------------