├── __init__.py ├── config ├── stopwords │ └── english.txt ├── hunspell │ └── en_GB │ │ ├── en_GB.aff │ │ └── en_GB.dic └── analysis │ └── synonyms.txt ├── .DS_Store ├── examples_fox.json ├── examples_sid.json ├── examples_posts.json ├── .gitignore ├── examples_main.json ├── README.md ├── .ipynb_checkpoints ├── populate-checkpoint.ipynb ├── Mapping and Analysis-checkpoint.ipynb └── Searching - The Basic Tools-checkpoint.ipynb ├── index.py ├── Dealing with Human Language ├── Normalizing Tokens.ipynb ├── Identifying Words.ipynb ├── Getting Started with Languages.ipynb ├── Typoes and Mispelings.ipynb ├── Reducing Words to Their Root Form (Pt.1).ipynb ├── Reducing Words (Pt.2).ipynb └── Reducing Words (Pt.2)-Copy1.ipynb ├── populate.ipynb ├── Search in Depth ├── Proximity Matching.ipynb └── Multifield Search (Pt.2).ipynb └── Getting Started ├── Searching - The Basic Tools.ipynb └── Sorting and Relevance.ipynb /__init__.py: -------------------------------------------------------------------------------- 1 | import index -------------------------------------------------------------------------------- /config/stopwords/english.txt: -------------------------------------------------------------------------------- 1 | a 2 | the 3 | dead -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pgolding/elasticsearch/HEAD/.DS_Store -------------------------------------------------------------------------------- /config/hunspell/en_GB/en_GB.aff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pgolding/elasticsearch/HEAD/config/hunspell/en_GB/en_GB.aff -------------------------------------------------------------------------------- /config/hunspell/en_GB/en_GB.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pgolding/elasticsearch/HEAD/config/hunspell/en_GB/en_GB.dic -------------------------------------------------------------------------------- /config/analysis/synonyms.txt: -------------------------------------------------------------------------------- 1 | manager => leader,boss,person 2 | chef => chef,cook,person 3 | maid => maid,housemaid,young_lady,female,person -------------------------------------------------------------------------------- /examples_fox.json: -------------------------------------------------------------------------------- 1 | { "index": { "_id": 1 }} 2 | { "title": "The quick brown fox" } 3 | { "index": { "_id": 2 }} 4 | { "title": "The quick brown fox jumps over the lazy dog" } 5 | { "index": { "_id": 3 }} 6 | { "title": "The quick brown fox jumps over the quick dog" } 7 | { "index": { "_id": 4 }} 8 | { "title": "Brown fox brown dog" } 9 | -------------------------------------------------------------------------------- /examples_sid.json: -------------------------------------------------------------------------------- 1 | { "index": { "_id": 1 }} 2 | { "price" : 10, "productID" : "XHDK-A-1293-#fJ3" } 3 | { "index": { "_id": 2 }} 4 | { "price" : 20, "productID" : "KDKE-B-9947-#kL5" } 5 | { "index": { "_id": 3 }} 6 | { "price" : 30, "productID" : "JODL-X-1937-#pV7" } 7 | { "index": { "_id": 4 }} 8 | { "price" : 30, "productID" : "QQPX-R-3956-#aD8" } 9 | -------------------------------------------------------------------------------- /examples_posts.json: 
-------------------------------------------------------------------------------- 1 | { "index": { "_id": "1" }} 2 | { "tags" : ["search"] } 3 | { "index": { "_id": "2" }} 4 | { "tags" : ["search", "open_source"] } 5 | { "index": { "_id": "3" }} 6 | { "other_field" : "some data" } 7 | { "index": { "_id": "4" }} 8 | { "tags" : null } 9 | { "index": { "_id": "5" }} 10 | { "tags" : ["search", null] } 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | 3 | # Compiled source # 4 | ################### 5 | *.com 6 | *.class 7 | *.dll 8 | *.exe 9 | *.o 10 | *.so 11 | 12 | # Packages # 13 | ############ 14 | # it's better to unpack these files and commit the raw source 15 | # git has its own built in compression methods 16 | *.7z 17 | *.dmg 18 | *.gz 19 | *.iso 20 | *.jar 21 | *.rar 22 | *.tar 23 | *.zip 24 | 25 | # Logs and databases # 26 | ###################### 27 | *.log 28 | *.sql 29 | *.sqlite 30 | 31 | # OS generated files # 32 | ###################### 33 | .DS_Store 34 | .DS_Store? 35 | ._* 36 | .Spotlight-V100 37 | .Trashes 38 | ehthumbs.db 39 | Thumbs.db -------------------------------------------------------------------------------- /examples_main.json: -------------------------------------------------------------------------------- 1 | { "create": { "_index": "us", "_type": "user", "_id": "1" }} 2 | { "email" : "john@smith.com", "name" : "John Smith", "username" : "@john" } 3 | { "create": { "_index": "gb", "_type": "user", "_id": "2" }} 4 | { "email" : "mary@jones.com", "name" : "Mary Jones", "username" : "@mary" } 5 | { "create": { "_index": "gb", "_type": "tweet", "_id": "3" }} 6 | { "date" : "2014-09-13", "name" : "Mary Jones", "tweet" : "Elasticsearch means full text search has never been so easy", "user_id" : 2 } 7 | { "create": { "_index": "us", "_type": "tweet", "_id": "4" }} 8 | { "date" : "2014-09-14", "name" : "John Smith", "tweet" : "@mary it is not just text, it does everything", "user_id" : 1 } 9 | { "create": { "_index": "gb", "_type": "tweet", "_id": "5" }} 10 | { "date" : "2014-09-15", "name" : "Mary Jones", "tweet" : "However did I manage before Elasticsearch?", "user_id" : 2 } 11 | { "create": { "_index": "us", "_type": "tweet", "_id": "6" }} 12 | { "date" : "2014-09-16", "name" : "John Smith", "tweet" : "The Elasticsearch API is really easy to use", "user_id" : 1 } 13 | { "create": { "_index": "gb", "_type": "tweet", "_id": "7" }} 14 | { "date" : "2014-09-17", "name" : "Mary Jones", "tweet" : "The Query DSL is really powerful and flexible", "user_id" : 2 } 15 | { "create": { "_index": "us", "_type": "tweet", "_id": "8" }} 16 | { "date" : "2014-09-18", "name" : "John Smith", "user_id" : 1 } 17 | { "create": { "_index": "gb", "_type": "tweet", "_id": "9" }} 18 | { "date" : "2014-09-19", "name" : "Mary Jones", "tweet" : "Geo-location aggregations are really cool", "user_id" : 2 } 19 | { "create": { "_index": "us", "_type": "tweet", "_id": "10" }} 20 | { "date" : "2014-09-20", "name" : "John Smith", "tweet" : "Elasticsearch surely is one of the hottest new NoSQL products", "user_id" : 1 } 21 | { "create": { "_index": "gb", "_type": "tweet", "_id": "11" }} 22 | { "date" : "2014-09-21", "name" : "Mary Jones", "tweet" : "Elasticsearch is built for the cloud, easy to scale", "user_id" : 2 } 23 | { "create": { "_index": "us", "_type": "tweet", "_id": "12" }} 24 | { "date" : "2014-09-22", "name" : "John Smith", "tweet" : 
"Elasticsearch and I have left the honeymoon stage, and I still love her.", "user_id" : 1 } 25 | { "create": { "_index": "gb", "_type": "tweet", "_id": "13" }} 26 | { "date" : "2014-09-23", "name" : "Mary Jones", "tweet" : "So yes, I am an Elasticsearch fanboy", "user_id" : 2 } 27 | { "create": { "_index": "us", "_type": "tweet", "_id": "14" }} 28 | { "date" : "2014-09-24", "name" : "John Smith", "tweet" : "How many more cheesy tweets do I have to write?", "user_id" : 1 } 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Elasticsearch: The Definitive Guide (with Python examples) # 2 | 3 | #### Versions: #### 4 | 5 | ##### Services: 6 | * Kibana (5.2.2) 7 | * Elasticsearch (5.2.2) 8 | 9 | ##### Python libs: 10 | * elasticsearch (5.2.0) 11 | * elasticsearch-dsl (5.1.0) 12 | 13 | ### What is this? 14 | 15 | This is a set of Jupyter notebooks to help those who want to follow the book [Elasticsearch: The Definitive Guide](https://www.elastic.co/guide/en/elasticsearch/guide/master/index.html) using Python code in addition to the JSON API calls in the book. I have reproduced most of the example API calls, often in various ways, using the two Python libraries: 16 | 17 | * [Elasticsearch](http://elasticsearch-py.readthedocs.io/en/master/index.html) 18 | * [Elasticsearch DSL](http://elasticsearch-dsl.readthedocs.io/en/latest/index.html) 19 | 20 | My goal is to assist the reader/learner in understanding the mechanics of Elasticsearch whilst understanding the Python libs. 21 | 22 | I follow the structure of the book fairly closely (beginning with "Seaching - The Basic Tools") using identical chapter names and headings. I suggest to follow the book whilst exercising some examples in the Kibana console (or via CURL) and some in Python. 23 | 24 | In true notebook fashion, the notebooks provide an interactive documented flow and a place to play. Where useful, I insert text from the guide so as to not break the flow too much (between the book and the notebooks). 25 | 26 | Note that the examples here assume the same setup as the examples in the book, namely a virgin instance of Elasticsearch (most likely on localhost) pre-populated with the [test data](https://github.com/pgolding/elasticsearch/blob/master/examples.json). 27 | 28 | The helper script (index.py) is available to populate/delete/reset the index at various times throughout the chapters. You don't need to touch it as I included initialization at the start of each chapter: 29 | 30 | ```python 31 | import index 32 | 33 | r = index.populate() 34 | print('{} items created'.format(len(r['items']))) 35 | ``` 36 | 37 | If at any time you get stuck with the index, then just call ```index.populate()``` to delete and re-populate the index. You can also pass in a JSON object to define the settings and field mappings etc: 38 | 39 | ```python 40 | index_template = { 41 | "mappings": { 42 | "tweet" : { 43 | "properties" : { 44 | "tweet" : { 45 | "type" : "text", 46 | "analyzer": "english" 47 | }, 48 | "date" : { 49 | "type" : "date" 50 | }, 51 | "name" : { 52 | "type" : "text" 53 | }, 54 | "user_id" : { 55 | "type" : "long" 56 | } 57 | } 58 | } 59 | } 60 | } 61 | index.populate(index_template) 62 | ``` 63 | 64 | (However, I usually make these calls where needed in the notebooks.) 65 | 66 | This is a WIP and I will continue to update it with all examples and later build out more complex examples as accompanying notebooks. 
67 | 68 | Note that this is **not** a comprehensive coverage of all examples in the book. I have skipped a few examples here and there, mostly because they are repetitive or because they deal with non-English languages. 69 | 70 | Also, I have added extra examples and included supplementary test data where useful (e.g. synonyms, stopwords files etc.) . This was to add further clarity of emphasis to some of the examples or to provide settings or info overlooked by the book (but covered in the API docs). -------------------------------------------------------------------------------- /.ipynb_checkpoints/populate-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 30, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from elasticsearch import Elasticsearch\n", 12 | "from pprint import pprint\n", 13 | "\n", 14 | "es = Elasticsearch(\n", 15 | " 'localhost',\n", 16 | " # sniff before doing anything\n", 17 | " sniff_on_start=True,\n", 18 | " # refresh nodes after a node fails to respond\n", 19 | " sniff_on_connection_fail=True,\n", 20 | " # and also every 60 seconds\n", 21 | " sniffer_timeout=60\n", 22 | ")\n", 23 | "\n", 24 | "f = open('examples.json', 'r')\n", 25 | "data = f.read()\n", 26 | "\n", 27 | "response = es.bulk(body=data)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 31, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "assert response['errors'] == False\n", 39 | "# Should not produce an AssertionError" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "source": [ 48 | "For the later chapters, you may want to delete the index and re-create, including using a different index:\n" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 27, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/plain": [ 61 | "{'acknowledged': True}" 62 | ] 63 | }, 64 | "execution_count": 27, 65 | "metadata": {}, 66 | "output_type": "execute_result" 67 | } 68 | ], 69 | "source": [ 70 | "es.indices.delete(index=['gb','us'])" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 28, 76 | "metadata": { 77 | "collapsed": true 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "index_template = {\n", 82 | " \"mappings\": {\n", 83 | " \"tweet\" : {\n", 84 | " \"properties\" : {\n", 85 | " \"tweet\" : {\n", 86 | " \"type\" : \"text\",\n", 87 | " \"analyzer\": \"english\"\n", 88 | " },\n", 89 | " \"date\" : {\n", 90 | " \"type\" : \"date\"\n", 91 | " },\n", 92 | " \"name\" : {\n", 93 | " \"type\" : \"text\"\n", 94 | " },\n", 95 | " \"user_id\" : {\n", 96 | " \"type\" : \"long\"\n", 97 | " }\n", 98 | " }\n", 99 | " }\n", 100 | " }\n", 101 | "}" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 29, 107 | "metadata": { 108 | "collapsed": false 109 | }, 110 | "outputs": [ 111 | { 112 | "data": { 113 | "text/plain": [ 114 | "{'acknowledged': True, 'shards_acknowledged': True}" 115 | ] 116 | }, 117 | "execution_count": 29, 118 | "metadata": {}, 119 | "output_type": "execute_result" 120 | } 121 | ], 122 | "source": [ 123 | "es.indices.create(index='gb', body=index_template)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": { 130 | "collapsed": true 131 | }, 132 | "outputs": [], 
133 | "source": [] 134 | } 135 | ], 136 | "metadata": { 137 | "kernelspec": { 138 | "display_name": "Python 3", 139 | "language": "python", 140 | "name": "python3" 141 | }, 142 | "language_info": { 143 | "codemirror_mode": { 144 | "name": "ipython", 145 | "version": 3 146 | }, 147 | "file_extension": ".py", 148 | "mimetype": "text/x-python", 149 | "name": "python", 150 | "nbconvert_exporter": "python", 151 | "pygments_lexer": "ipython3", 152 | "version": "3.5.1" 153 | } 154 | }, 155 | "nbformat": 4, 156 | "nbformat_minor": 0 157 | } 158 | -------------------------------------------------------------------------------- /index.py: -------------------------------------------------------------------------------- 1 | from elasticsearch import Elasticsearch 2 | from pprint import pprint 3 | import time 4 | import uuid 5 | from IPython.display import display_javascript, display_html, display 6 | import json 7 | 8 | es = Elasticsearch( 9 | 'localhost', 10 | # sniff before doing anything 11 | sniff_on_start=True, 12 | # refresh nodes after a node fails to respond 13 | sniff_on_connection_fail=True, 14 | # and also every 60 seconds 15 | sniffer_timeout=60 16 | ) 17 | 18 | # Shards = 1 because of https://www.elastic.co/guide/en/elasticsearch/guide/master/relevance-is-broken.html 19 | index_template = { 20 | "settings": { "number_of_shards": 1 }, 21 | "mappings": { 22 | "tweet" : { 23 | "properties" : { 24 | "tweet" : { 25 | "type" : "text", 26 | "analyzer": "english" 27 | }, 28 | "date" : { 29 | "type" : "date" 30 | }, 31 | "name" : { 32 | "type" : "text" 33 | }, 34 | "user_id" : { 35 | "type" : "long" 36 | } 37 | } 38 | } 39 | } 40 | } 41 | 42 | # Shards = 1 because of https://www.elastic.co/guide/en/elasticsearch/guide/master/relevance-is-broken.html 43 | multi_field_index_template = { 44 | "settings": { "number_of_shards": 1 }, 45 | "mappings": { 46 | "tweet" : { 47 | "properties" : { 48 | 49 | "tweet": { 50 | "type": "string", 51 | "analyzer": "english", 52 | "fields": { 53 | "raw": { 54 | "type": "string", 55 | "index": "not_analyzed" 56 | } 57 | } 58 | }, 59 | "date" : { 60 | "type" : "date" 61 | }, 62 | "name" : { 63 | "type" : "text" 64 | }, 65 | "user_id" : { 66 | "type" : "long" 67 | } 68 | } 69 | } 70 | } 71 | } 72 | 73 | f = open('../examples_main.json', 'r') 74 | data = f.read() 75 | 76 | def load_sid_examples(settings=None, set=None): 77 | if set==1: 78 | file_to_load='../examples_sid.json' 79 | idx = 'my_store' 80 | dt = 'produces' 81 | elif set==2: 82 | file_to_load='../examples_posts.json' 83 | idx = 'my_index' 84 | dt = 'posts' 85 | elif set==3: 86 | file_to_load='../examples_fox.json' 87 | idx = 'my_index' 88 | dt = 'my_type' 89 | else: 90 | file_to_load='../examples_sid.json' 91 | idx = 'my_store' 92 | dt = 'produces' 93 | 94 | try: 95 | f = open(file_to_load, 'r') 96 | sid_data = f.read() 97 | if es.indices.exists(idx): 98 | es.indices.delete(idx) 99 | if settings: 100 | es.indices.create(index=idx, body=settings) 101 | response = es.bulk(index=idx, doc_type=dt, body=sid_data) 102 | except Exception as e: 103 | print('Error loading examples') 104 | response = e 105 | return response 106 | 107 | def reset_all(): 108 | reset() 109 | if es.indices.exists('shows'): 110 | es.indices.delete(index='shows') 111 | if es.indices.exists('email'): 112 | es.indices.delete(index='email') 113 | 114 | def create_my_index(index_name='my_index', body=None): 115 | if es.indices.exists(index_name): 116 | es.indices.delete(index_name) 117 | es.indices.create(index=index_name, body=body) 118 | 119 | 
120 | def populate(template_num=None): 121 | if es.indices.exists('gb'): 122 | es.indices.delete(index='gb') 123 | # cautious wait on index deletion - prob. not needed 124 | time.sleep(1) 125 | if es.indices.exists('us'): 126 | es.indices.delete(index='us') 127 | # cautious wait on index deletion - prob. not needed 128 | time.sleep(1) 129 | if isinstance(template_num, int): 130 | if template==1: 131 | es.indices.create(index='gb', body=index_template) 132 | response = es.bulk(body=data) 133 | elif template==2: 134 | es.indices.create(index='gb', body=multi_field_index_template) 135 | es.indices.create(index='us', body=multi_field_index_template) 136 | response = es.bulk(body=data) 137 | else: 138 | response = es.bulk(body=data) 139 | return response 140 | 141 | 142 | def populate_tweets_using_mapping(template=None): 143 | if es.indices.exists('gb'): 144 | es.indices.delete(index='gb') 145 | # cautious wait on index deletion - prob. not needed 146 | time.sleep(1) 147 | if es.indices.exists('us'): 148 | es.indices.delete(index='us') 149 | # cautious wait on index deletion - prob. not needed 150 | time.sleep(1) 151 | if isinstance(template, dict): 152 | es.indices.create(index='gb', body=template) 153 | es.indices.create(index='us', body=template) 154 | response = es.bulk(body=data) 155 | else: 156 | response = es.bulk(body=data) 157 | return response 158 | 159 | 160 | def reset(): 161 | if es.indices.exists('gb'): 162 | es.indices.delete(index='gb') 163 | time.sleep(1) 164 | if es.indices.exists('us'): 165 | es.indices.delete(index='us') 166 | time.sleep(1) 167 | 168 | # A helped class to render long JSON objects from ES with collapsible elements 169 | class RenderJSON(object): 170 | def __init__(self, json_data): 171 | if isinstance(json_data, dict): 172 | self.json_str = json.dumps(json_data) 173 | else: 174 | self.json_str = json_data 175 | self.uuid = str(uuid.uuid4()) 176 | 177 | def _ipython_display_(self): 178 | display_html('
'.format(self.uuid), raw=True) 179 | display_javascript(""" 180 | require(["https://rawgit.com/caldwell/renderjson/master/renderjson.js"], function() { 181 | document.getElementById('%s').appendChild(renderjson(%s)) 182 | }); 183 | """ % (self.uuid, self.json_str), raw=True) 184 | 185 | reset_all() -------------------------------------------------------------------------------- /Dealing with Human Language/Normalizing Tokens.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Elasticsearch: The Definitive Guide - Python\n", 8 | "\n", 9 | "Following the examples in the book, here are Python snippets that achieve the same effect.\n", 10 | "\n", 11 | "Documentation for the Python libs:\n", 12 | "\n", 13 | "Low-level API:\n", 14 | "\n", 15 | "https://elasticsearch-py.readthedocs.io/en/master/index.html\n", 16 | "\n", 17 | "Expressive DSL API (more \"Pythonic\")\n", 18 | "\n", 19 | "http://elasticsearch-dsl.readthedocs.io/en/latest/index.html\n", 20 | "\n", 21 | "Github repo for DSL API:\n", 22 | "\n", 23 | "https://github.com/elastic/elasticsearch-dsl-py\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "import sys, os\n", 35 | "sys.path.insert(1, os.path.join(sys.path[0], '..'))" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "14 items created\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "import index\n", 55 | "from elasticsearch import Elasticsearch\n", 56 | "from elasticsearch_dsl import Search, Q\n", 57 | "from pprint import pprint\n", 58 | "\n", 59 | "es = Elasticsearch(\n", 60 | " 'localhost',\n", 61 | " # sniff before doing anything\n", 62 | " sniff_on_start=True,\n", 63 | " # refresh nodes after a node fails to respond\n", 64 | " sniff_on_connection_fail=True,\n", 65 | " # and also every 60 seconds\n", 66 | " sniffer_timeout=60\n", 67 | ")\n", 68 | "\n", 69 | "r = index.populate()\n", 70 | "print('{} items created'.format(len(r['items'])))\n", 71 | "\n", 72 | "# Let's repopulate the index as we deleted 'gb' in earlier chapters:\n", 73 | "# Run the script: populate.ipynb" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": { 79 | "collapsed": true 80 | }, 81 | "source": [ 82 | "### Normalizing Tokens\n", 83 | "\n", 84 | "Breaking text into tokens is only half the job. To make those tokens more easily searchable, they need to go through a normalization process to remove insignificant differences between otherwise identical words, such as uppercase versus lowercase. Perhaps we also need to remove significant differences, to make esta, ésta, and está all searchable as the same word. Would you search for déjà vu, or just for deja vu?\n", 85 | "\n", 86 | "This is the job of the token filters, which receive a stream of tokens from the tokenizer. You can have multiple token filters, each doing its particular job. 
Each receives the new token stream as output by the token filter before it.\n", 87 | "\n", 88 | "#### In That Case\n", 89 | "\n", 90 | "The most frequently used token filter is the lowercase filter, which does exactly what you would expect; it transforms each token into its lowercase form:\n", 91 | "\n", 92 | "```\n", 93 | "GET /_analyze?tokenizer=standard&filters=lowercase\n", 94 | "The QUICK Brown FOX! \n", 95 | "```" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 7, 101 | "metadata": { 102 | "collapsed": false 103 | }, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "the,quick,brown,fox\n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "text = 'The QUICK Brown FOX!'# contains some uppercase words\n", 115 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n", 116 | " (tokenizer='standard', filter=['lowercase'], text=text)['tokens']]\n", 117 | "print(','.join(analyzed_text))" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "To make this automatic as part of the analysis process, we can create a custom analyzer:\n", 125 | "```\n", 126 | "PUT /my_index\n", 127 | "{\n", 128 | " \"settings\": {\n", 129 | " \"analysis\": {\n", 130 | " \"analyzer\": {\n", 131 | " \"my_lowercaser\": {\n", 132 | " \"tokenizer\": \"standard\",\n", 133 | " \"filter\": [ \"lowercase\" ]\n", 134 | " }\n", 135 | " }\n", 136 | " }\n", 137 | " }\n", 138 | "}\n", 139 | "```" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 15, 145 | "metadata": { 146 | "collapsed": false 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "# first delete the index from previous chapters, if it exists\n", 151 | "if es.indices.exists('my_index'): \n", 152 | " es.indices.delete('my_index')" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 16, 158 | "metadata": { 159 | "collapsed": false 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "#es.indices.create('my_index')\n", 164 | "from elasticsearch_dsl import analyzer, Index\n", 165 | "my_custom_analyzer = analyzer('my_lowercaser',\n", 166 | " tokenizer='standard',\n", 167 | " filter='lowercase')\n", 168 | "i = Index('my_index')\n", 169 | "i.analyzer(my_custom_analyzer)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 17, 175 | "metadata": { 176 | "collapsed": false 177 | }, 178 | "outputs": [ 179 | { 180 | "data": { 181 | "text/plain": [ 182 | "{'tokens': [{'end_offset': 3,\n", 183 | " 'position': 0,\n", 184 | " 'start_offset': 0,\n", 185 | " 'token': 'the',\n", 186 | " 'type': 'Some déjà vu \">website\n", 209 | "```\n", 210 | "\n", 211 | "To use them as part of the analyzer, they should be added to a custom analyzer definition:\n", 212 | "\n", 213 | "```\n", 214 | "PUT /my_index\n", 215 | "{\n", 216 | " \"settings\": {\n", 217 | " \"analysis\": {\n", 218 | " \"analyzer\": {\n", 219 | " \"my_html_analyzer\": {\n", 220 | " \"tokenizer\": \"standard\",\n", 221 | " \"char_filter\": [ \"html_strip\" ]\n", 222 | " }\n", 223 | " }\n", 224 | " }\n", 225 | " }\n", 226 | "}\n", 227 | "```" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 31, 233 | "metadata": { 234 | "collapsed": false 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "text = '
<p>Some d&eacute;j&agrave; vu <a href=\"http://somedomain.com>\">website</a>'"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": 32,
244 | "metadata": {
245 | "collapsed": false
246 | },
247 | "outputs": [],
248 | "source": [
249 | "from elasticsearch_dsl import analyzer, Index"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": 33,
255 | "metadata": {
256 | "collapsed": false
257 | },
258 | "outputs": [],
259 | "source": [
260 | "my_custom_analyzer = analyzer('my_html_analyzer',\n",
261 | " tokenizer='standard',\n",
262 | " char_filter='html_strip')"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": 34,
268 | "metadata": {
269 | "collapsed": true
270 | },
271 | "outputs": [],
272 | "source": [
273 | "i = Index('my_index')"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": 35,
279 | "metadata": {
280 | "collapsed": true
281 | },
282 | "outputs": [],
283 | "source": [
284 | "i.analyzer(my_custom_analyzer)"
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": 38,
290 | "metadata": {
291 | "collapsed": false
292 | },
293 | "outputs": [
294 | {
295 | "name": "stdout",
296 | "output_type": "stream",
297 | "text": [
298 | "Some,déjà,vu,website\n"
299 | ]
300 | }
301 | ],
302 | "source": [
303 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n",
304 | " (index='my_index', analyzer='my_html_analyzer', text=text)['tokens']]\n",
305 | "print(','.join(analyzed_text))"
306 | ]
307 | },
308 | {
309 | "cell_type": "markdown",
310 | "metadata": {},
311 | "source": [
312 | "NOTE (and TO_DO): I cheated here because the above method call returned an illegal exception that I was unable to debug (related to passing in the char_filter param). So I created the index using the above params via the Kibana developer console before making the call."
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": null,
318 | "metadata": {
319 | "collapsed": true
320 | },
321 | "outputs": [],
322 | "source": []
323 | }
324 | ],
325 | "metadata": {
326 | "kernelspec": {
327 | "display_name": "Python 3",
328 | "language": "python",
329 | "name": "python3"
330 | },
331 | "language_info": {
332 | "codemirror_mode": {
333 | "name": "ipython",
334 | "version": 3
335 | },
336 | "file_extension": ".py",
337 | "mimetype": "text/x-python",
338 | "name": "python",
339 | "nbconvert_exporter": "python",
340 | "pygments_lexer": "ipython3",
341 | "version": "3.5.1"
342 | }
343 | },
344 | "nbformat": 4,
345 | "nbformat_minor": 0
346 | }
347 |
--------------------------------------------------------------------------------
/Search in Depth/Multifield Search (Pt.2).ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Elasticsearch: The Definitive Guide - Python\n",
8 | "\n",
9 | "Following the examples in the book, here are Python snippets that achieve the same effect.\n",
10 | "\n",
11 | "Documentation for the Python libs:\n",
12 | "\n",
13 | "Low-level API:\n",
14 | "\n",
15 | "https://elasticsearch-py.readthedocs.io/en/master/index.html\n",
16 | "\n",
17 | "Expressive DSL API (more \"Pythonic\")\n",
18 | "\n",
19 | "http://elasticsearch-dsl.readthedocs.io/en/latest/index.html\n",
20 | "\n",
21 | "Github repo for DSL API:\n",
22 | "\n",
23 | "https://github.com/elastic/elasticsearch-dsl-py\n"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 3,
29 | "metadata": {
30 | "collapsed": false
31 | },
32 | "outputs": [],
33 | "source": [
34 | "import sys, os\n",
35 | "sys.path.insert(1, os.path.join(sys.path[0], '..'))"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 4,
41 | "metadata": {
42 | "collapsed": false
43 | },
44 | "outputs": [
45 | {
46 | "name": "stdout",
47 | "output_type": "stream",
48 | "text": [
49 | "4 items created\n"
50 | ]
51 | }
52 | ],
53 | "source": [
54 | "import index\n",
55 | "from elasticsearch import Elasticsearch\n",
56 | "from elasticsearch_dsl import Search, Q, Index\n",
57 | "from pprint import pprint\n",
58 | "\n",
59 | "es = Elasticsearch(\n",
60 | " 'localhost',\n",
61 | " # sniff before doing anything\n",
62 | " sniff_on_start=True,\n",
63 | " # refresh nodes after a node fails to respond\n",
64 | " sniff_on_connection_fail=True,\n",
65 | " # and also every 60 seconds\n",
66 | " sniffer_timeout=60\n",
67 | ")\n",
68 | "\n",
69 | "r = index.load_sid_examples(settings={ \"settings\": { \"number_of_shards\": 1 }},set=3)\n",
70 | "print('{} items created'.format(len(r['items'])))\n",
71 | "\n",
72 | "# Let's repopulate the index as we deleted 'gb' in earlier chapters:\n",
73 | "# Run the script: populate.ipynb"
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "metadata": {
79 | "collapsed": true
80 | },
81 | "source": [
82 | "### Multifield Search\n",
83 | "\n",
84 | "#### Cross-fields Entity Search\n",
85 | "\n",
86 | "Data often spread across many fields:\n",
87 | "\n",
88 | "`\n",
89 | "{\n",
90 | " \"street\": \"5 Poland Street\",\n",
91 | " \"city\": \"London\",\n",
92 | " \"country\": \"United Kingdom\",\n",
93 | " \"postcode\": \"W1V 3DG\"\n",
94 | "}\n",
95 | "`\n",
96 | "\n",
97 | "Here we are not concerned with multiple-query strings. Here we want to look at a _single_ query string like \"Poland Street W1V.\" As parts of this string appear in different fields in the doc, using `dis_max / best_fields` will not work as they attempt to find the _single_ best-matching field.\n",
98 | "\n",
99 | "#### A Naive Approach\n",
100 | "\n",
101 | "We could try this:\n",
102 | "`\n",
103 | "{\n",
104 | " \"query\": {\n",
105 | " \"bool\": {\n",
106 | " \"should\": [\n",
107 | " { \"match\": { \"street\": \"Poland Street W1V\" }},\n",
108 | " { \"match\": { \"city\": \"Poland Street W1V\" }},\n",
109 | " { \"match\": { \"country\": \"Poland Street W1V\" }},\n",
110 | " { \"match\": { \"postcode\": \"Poland Street W1V\" }}\n",
111 | " ]\n",
112 | " }\n",
113 | " }\n",
114 | "}\n",
115 | "`\n",
116 | "\n",
117 | "Which is better issued as this:\n",
118 | "`\n",
119 | "{\n",
120 | " \"query\": {\n",
121 | " \"multi_match\": {\n",
122 | " \"query\": \"Poland Street W1V\",\n",
123 | " \"type\": \"most_fields\",\n",
124 | " \"fields\": [ \"street\", \"city\", \"country\", \"postcode\" ]\n",
125 | " }\n",
126 | " }\n",
127 | "}\n",
128 | "`\n",
129 | "\n",
130 | "However:\n",
131 | "\n",
132 | "The most_fields approach to entity search has some problems that are not immediately obvious:\n",
133 | "\n",
134 | "* It is designed to find the most fields matching **any words**, rather than to find the most matching words across **all fields.**\n",
135 | "* It can’t use the `operator` or `minimum_should_match` parameters to reduce the long tail of less-relevant results.\n",
136 | "* Term frequencies are different in each field and could interfere with each other to produce badly ordered results.\n",
137 | "\n",
138 | "#### Field-Centric Queries\n",
139 | "\n",
140 | "All three of the above problems come from `most_fields` being field-centric rather than term-centric - it looks for the most matching fields, not terms! (Ditto `best_fields`).\n",
141 | "\n",
142 | "Let's look at why these problems exist:\n",
143 | "\n",
144 | "##### Problem 1 - Matching the same word in multiple fields\n"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": 24,
150 | "metadata": {
151 | "collapsed": false
152 | },
153 | "outputs": [
154 | {
155 | "data": {
156 | "text/plain": [
157 | "'(city:poland city:street city:w1v) (country:poland country:street country:w1v) (postcode:poland postcode:street postcode:w1v) (street:poland street:street street:w1v)'"
158 | ]
159 | },
160 | "execution_count": 24,
161 | "metadata": {},
162 | "output_type": "execute_result"
163 | }
164 | ],
165 | "source": [
166 | "# Let's confirm how the most_fields query works by validating the query\n",
167 | "body= {\n",
168 | " \"query\": {\n",
169 | " \"multi_match\": {\n",
170 | " \"query\": \"Poland Street W1V\",\n",
171 | " \"type\": \"most_fields\",\n",
172 | " \"fields\": [ \"street\", \"city\", \"country\", \"postcode\" ]\n",
173 | " }\n",
174 | " }\n",
175 | "}\n",
176 | "es.indices.validate_query(index='my_index', body=body, explain=1)\\\n",
177 | " ['explanations'][0]['explanation']"
178 | ]
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "metadata": {},
183 | "source": [
184 | "You can see that a document matching just the word poland in two fields could score higher than a document matching poland and street in one field.\n",
185 | "\n",
186 | "NOTE: The validated explanation shows the query as a [query string](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html)\n",
187 | "\n",
188 | "##### Problem 2 - Trimming the long tail\n",
189 | "\n",
190 | "Perhaps we could try this:\n"
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": 25,
196 | "metadata": {
197 | "collapsed": false
198 | },
199 | "outputs": [
200 | {
201 | "data": {
202 | "text/plain": [
203 | "'(+city:poland +city:street +city:w1v) (+country:poland +country:street +country:w1v) (+postcode:poland +postcode:street +postcode:w1v) (+street:poland +street:street +street:w1v)'"
204 | ]
205 | },
206 | "execution_count": 25,
207 | "metadata": {},
208 | "output_type": "execute_result"
209 | }
210 | ],
211 | "source": [
212 | "# Adding the **and** operator\n",
213 | "body= {\n",
214 | " \"query\": {\n",
215 | " \"multi_match\": {\n",
216 | " \"query\": \"Poland Street W1V\",\n",
217 | " \"type\": \"most_fields\",\n",
218 | " \"operator\": \"and\",\n",
219 | " \"fields\": [ \"street\", \"city\", \"country\", \"postcode\" ]\n",
220 | " }\n",
221 | " }\n",
222 | "}\n",
223 | "es.indices.validate_query(index='my_index', body=body, explain=1)\\\n",
224 | " ['explanations'][0]['explanation']"
225 | ]
226 | },
227 | {
228 | "cell_type": "markdown",
229 | "metadata": {},
230 | "source": [
231 | "This shows that all words must exist (+) in the same field, which is clearly wrong! It is unlikely that any documents would match this query.\n",
232 | "\n",
233 | "##### Problem 3 - Term Frequencies\n",
234 | "\n",
235 | "In [What Is Relevance?](https://www.elastic.co/guide/en/elasticsearch/guide/master/relevance-intro.html), we explained that the default similarity algorithm used to calculate the relevance score for each term is TF/IDF:\n",
236 | "\n",
237 | "##### Term frequency\n",
238 | ">The more often a term appears in a field in a single document, the more relevant the document.\n",
239 | "\n",
240 | "##### Inverse document frequency\n",
241 | ">The more often a term appears in a field in all documents in the index, the less relevant is that term.\n",
242 | "\n",
243 | "When searching against multiple fields, TF/IDF can introduce some surprising results.\n",
244 | "\n",
245 | "Consider searching for “Peter Smith” using `first_name` and `last_name` fields. Peter is a common first name and Smith is a common last name, so both will have low IDFs. But what if we have another person in the index whose name is Smith Williams? Smith as a first name is very uncommon and so will have a high IDF!\n",
246 | "\n",
247 | "A simple query like the following may well return Smith Williams above Peter Smith in spite of the fact that the second person is a better match than the first.\n",
248 | "\n",
249 | "`\n",
250 | "{\n",
251 | " \"query\": {\n",
252 | " \"multi_match\": {\n",
253 | " \"query\": \"Peter Smith\",\n",
254 | " \"type\": \"most_fields\",\n",
255 | " \"fields\": [ \"*_name\" ]\n",
256 | " }\n",
257 | " }\n",
258 | "}\n",
259 | "`\n",
260 | "The high IDF of smith in the first name field can overwhelm the two low IDFs of peter as a first name and smith as a last name.\n",
261 | "\n",
262 | "#### Solution\n",
263 | "\n",
264 | "These problems only exist because we are dealing with multiple fields. If we were to combine all of these fields into a single field, the problems would vanish. We could achieve this by adding a full_name field to our person document:\n",
265 | "\n",
266 | "`\n",
267 | "{\n",
268 | " \"first_name\": \"Peter\",\n",
269 | " \"last_name\": \"Smith\",\n",
270 | " \"full_name\": \"Peter Smith\"\n",
271 | "}`\n",
272 | "\n",
273 | "When querying just the full_name field:\n",
274 | "\n",
275 | "* Documents with more matching words would trump documents with the same word repeated.\n",
276 | "* The minimum_should_match and operator parameters would function as expected.\n",
277 | "* The inverse document frequencies for first and last names would be combined so it wouldn’t matter whether Smith were a first or last name anymore.\n",
278 | "\n",
279 | "While this would work, we don’t like having to store redundant data. Instead, Elasticsearch offers us two solutions—one at index time and one at search time:\n",
280 | "\n",
281 | "#### Custom `_all` Fields\n",
282 | "\n",
283 | "The [Metadata: _all Field](https://www.elastic.co/guide/en/elasticsearch/guide/master/root-object.html#all-field) stored all values from all fields as one big string. A more flexible approach is an `_all` field for the person’s name, and another custom `_all` field for the address. \n",
284 | "\n",
285 | "This can be done using the `copy_to` parameter in field mappings:\n",
286 | "\n",
287 | "`PUT /my_index\n",
288 | "{\n",
289 | " \"mappings\": {\n",
290 | " \"person\": {\n",
291 | " \"properties\": {\n",
292 | " \"first_name\": {\n",
293 | " \"type\": \"string\",\n",
294 | " \"copy_to\": \"full_name\" \n",
295 | " },\n",
296 | " \"last_name\": {\n",
297 | " \"type\": \"string\",\n",
298 | " \"copy_to\": \"full_name\" \n",
299 | " },\n",
300 | " \"full_name\": {\n",
301 | " \"type\": \"string\"\n",
302 | " }\n",
303 | " }\n",
304 | " }\n",
305 | " }\n",
306 | "}\n",
307 | "`"
308 | ]
309 | },
310 | {
311 | "cell_type": "markdown",
312 | "metadata": {},
313 | "source": [
314 | "With this mapping in place, we can query the `first_name` field for first names, the `last_name` field for last name, or the `full_name` field for first and last names.\n",
315 | "\n",
316 | "**NOTE:** The copy_to setting will not work on a multi-field. If you attempt to configure your mapping this way, Elasticsearch will throw an exception.\n",
317 | "\n",
318 | "Just add the `copy_to` to the main field, **not** the multi-field:\n",
319 | "\n",
320 | "`\n",
321 | "PUT /my_index\n",
322 | "{\n",
323 | " \"mappings\": {\n",
324 | " \"person\": {\n",
325 | " \"properties\": {\n",
326 | " \"first_name\": {\n",
327 | " \"type\": \"string\",\n",
328 | " \"copy_to\": \"full_name\", \n",
329 | " \"fields\": {\n",
330 | " \"raw\": {\n",
331 | " \"type\": \"string\",\n",
332 | " \"index\": \"not_analyzed\"\n",
333 | " }\n",
334 | " }\n",
335 | " },\n",
336 | " \"full_name\": {\n",
337 | " \"type\": \"string\"\n",
338 | " }\n",
339 | " }\n",
340 | " }\n",
341 | " }\n",
342 | "}\n",
343 | "`"
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": null,
349 | "metadata": {
350 | "collapsed": true
351 | },
352 | "outputs": [],
353 | "source": []
354 | }
355 | ],
356 | "metadata": {
357 | "kernelspec": {
358 | "display_name": "Python 3",
359 | "language": "python",
360 | "name": "python3"
361 | },
362 | "language_info": {
363 | "codemirror_mode": {
364 | "name": "ipython",
365 | "version": 3
366 | },
367 | "file_extension": ".py",
368 | "mimetype": "text/x-python",
369 | "name": "python",
370 | "nbconvert_exporter": "python",
371 | "pygments_lexer": "ipython3",
372 | "version": "3.5.1"
373 | }
374 | },
375 | "nbformat": 4,
376 | "nbformat_minor": 0
377 | }
378 |
--------------------------------------------------------------------------------
/Dealing with Human Language/Getting Started with Languages.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Elasticsearch: The Definitive Guide - Python\n",
8 | "\n",
9 | "Following the examples in the book, here are Python snippets that achieve the same effect.\n",
10 | "\n",
11 | "Documentation for the Python libs:\n",
12 | "\n",
13 | "Low-level API:\n",
14 | "\n",
15 | "https://elasticsearch-py.readthedocs.io/en/master/index.html\n",
16 | "\n",
17 | "Expressive DSL API (more \"Pythonic\")\n",
18 | "\n",
19 | "http://elasticsearch-dsl.readthedocs.io/en/latest/index.html\n",
20 | "\n",
21 | "Github repo for DSL API:\n",
22 | "\n",
23 | "https://github.com/elastic/elasticsearch-dsl-py\n"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 9,
29 | "metadata": {
30 | "collapsed": false
31 | },
32 | "outputs": [],
33 | "source": [
34 | "import sys, os\n",
35 | "sys.path.insert(1, os.path.join(sys.path[0], '..'))"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 12,
41 | "metadata": {
42 | "collapsed": false
43 | },
44 | "outputs": [
45 | {
46 | "name": "stdout",
47 | "output_type": "stream",
48 | "text": [
49 | "14 items created\n"
50 | ]
51 | }
52 | ],
53 | "source": [
54 | "import index\n",
55 | "from elasticsearch import Elasticsearch\n",
56 | "from elasticsearch_dsl import Search, Q\n",
57 | "from pprint import pprint\n",
58 | "\n",
59 | "es = Elasticsearch(\n",
60 | " 'localhost',\n",
61 | " # sniff before doing anything\n",
62 | " sniff_on_start=True,\n",
63 | " # refresh nodes after a node fails to respond\n",
64 | " sniff_on_connection_fail=True,\n",
65 | " # and also every 60 seconds\n",
66 | " sniffer_timeout=60\n",
67 | ")\n",
68 | "\n",
69 | "r = index.populate()\n",
70 | "print('{} items created'.format(len(r['items'])))\n",
71 | "\n",
72 | "# Let's repopulate the index as we deleted 'gb' in earlier chapters:\n",
73 | "# Run the script: populate.ipynb"
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "metadata": {},
79 | "source": [
80 | "### Getting Started with Languages\n",
81 | "\n",
82 | "Full-text search is a battle between precision—returning as few irrelevant documents as possible—and recall—returning as many relevant documents as possible.\n",
83 | "\n",
84 | "Many tactics can be deployed to tackle precision and recall, such as modifying words: e.g. search for \"jumping\", \"jumps\" and \"jumped\" by reducing words to their stem (root form) - \"jump\".\n",
85 | "\n",
86 | "However, the first step is to identify words using an analyzer:\n",
87 | "\n",
88 | "##### Tokenize text into individual words:\n",
89 | "\n",
90 | "```The quick brown foxes → [The, quick, brown, foxes]```\n",
91 | "\n",
92 | "##### Lowercase tokens:\n",
93 | "\n",
94 | "```The → the```\n",
95 | "\n",
96 | "##### Remove common stopwords:\n",
97 | "\n",
98 | "```[The, quick, brown, foxes] → [quick, brown, foxes]```\n",
99 | "\n",
100 | "##### Stem tokens to their root form:\n",
101 | "\n",
102 | "```foxes → fox```\n",
103 | "\n",
104 | "Each analyzer may also apply other transformations specific to its language in order to make words from that language more searchable:\n",
105 | "\n",
106 | "##### The english analyzer removes the possessive 's:\n",
107 | "\n",
108 | "```John's → john```\n",
109 | "\n",
110 | "##### The french analyzer removes elisions like l' and qu' and diacritics like ¨ or ^:\n",
111 | "\n",
112 | "```l'église → eglis```\n",
113 | "\n",
114 | "##### The german analyzer normalizes terms, replacing ä and ae with a, or ß with ss, among others:\n",
115 | "\n",
116 | "```äußerst → ausserst```\n",
117 | "\n",
118 | "### Using Language Analyzers\n",
119 | "\n",
120 | "The built-in language analyzers are available globally and don’t need to be configured before being used. They can be specified directly in the field mapping:\n",
121 | "\n",
122 | "```\n",
123 | "PUT /my_index\n",
124 | "{\n",
125 | " \"mappings\": {\n",
126 | " \"blog\": {\n",
127 | " \"properties\": {\n",
128 | " \"title\": {\n",
129 | " \"type\": \"string\",\n",
130 | " \"analyzer\": \"english\" \n",
131 | " }\n",
132 | " }\n",
133 | " }\n",
134 | " }\n",
135 | "}\n",
136 | "```"
137 | ]
138 | },
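{
"cell_type": "markdown",
"metadata": {},
"source": [
"A sketch of the equivalent index creation with the low-level Python client (the `blogs` index name is illustrative, and `text` is used instead of the book's older `string` type to match Elasticsearch 5.x as used elsewhere in these notebooks):\n",
"\n",
"```python\n",
"# Hedged sketch: create the english-analyzed title mapping shown above\n",
"blog_mapping = {\n",
"    \"mappings\": {\n",
"        \"blog\": {\n",
"            \"properties\": {\n",
"                \"title\": {\"type\": \"text\", \"analyzer\": \"english\"}\n",
"            }\n",
"        }\n",
"    }\n",
"}\n",
"if es.indices.exists('blogs'):\n",
"    es.indices.delete('blogs')\n",
"es.indices.create(index='blogs', body=blog_mapping)\n",
"```"
]
},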
139 | {
140 | "cell_type": "code",
141 | "execution_count": 37,
142 | "metadata": {
143 | "collapsed": false
144 | },
145 | "outputs": [
146 | {
147 | "name": "stdout",
148 | "output_type": "stream",
149 | "text": [
150 | "i'm,happi,about,fox\n"
151 | ]
152 | }
153 | ],
154 | "source": [
155 | "#english (language)\n",
156 | "text = 'I\\'m not happy about the foxes'\n",
157 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n",
158 | " (analyzer='english', body=text)['tokens']]\n",
159 | "print(','.join(analyzed_text))"
160 | ]
161 | },
162 | {
163 | "cell_type": "markdown",
164 | "metadata": {},
165 | "source": [
166 | "We can’t tell if a document mentions one fox or many foxes; the word 'not' is a stopword and is removed, so we can’t tell whether the document is happy about foxes or not. By using the english analyzer, we have **increased recall** as we can match more loosely, but we have reduced our ability to rank documents accurately.\n",
167 | "\n",
168 | "To get the best of both worlds, we can use multifields to index the title field twice: once with the english analyzer and once with the standard analyzer:\n",
169 | "\n",
170 | "```\n",
171 | "PUT /my_index\n",
172 | "{\n",
173 | " \"mappings\": {\n",
174 | " \"blog\": {\n",
175 | " \"properties\": {\n",
176 | " \"title\": { \n",
177 | " \"type\": \"string\",\n",
178 | " \"fields\": {\n",
179 | " \"english\": { \n",
180 | " \"type\": \"string\",\n",
181 | " \"analyzer\": \"english\"\n",
182 | " }\n",
183 | " }\n",
184 | " }\n",
185 | " }\n",
186 | " }\n",
187 | " }\n",
188 | "}\n",
189 | "```"
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": 18,
195 | "metadata": {
196 | "collapsed": true
197 | },
198 | "outputs": [],
199 | "source": [
200 | "index_template = {\n",
201 | " \"mappings\": {\n",
202 | " \"blog\": {\n",
203 | " \"properties\": {\n",
204 | " \"title\": { \n",
205 | " \"type\": \"text\",\n",
206 | " \"fields\": {\n",
207 | " \"english\": { \n",
208 | " \"type\": \"text\",\n",
209 | " \"analyzer\": \"english\"\n",
210 | " }\n",
211 | " }\n",
212 | " }\n",
213 | " }\n",
214 | " }\n",
215 | " }\n",
216 | "}\n"
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": 19,
222 | "metadata": {
223 | "collapsed": false
224 | },
225 | "outputs": [
226 | {
227 | "data": {
228 | "text/plain": [
229 | "{'acknowledged': True, 'shards_acknowledged': True}"
230 | ]
231 | },
232 | "execution_count": 19,
233 | "metadata": {},
234 | "output_type": "execute_result"
235 | }
236 | ],
237 | "source": [
238 | "es.indices.create(index='my_index', body=index_template)"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": 21,
244 | "metadata": {
245 | "collapsed": false
246 | },
247 | "outputs": [
248 | {
249 | "data": {
250 | "text/plain": [
251 | "{'_id': '1',\n",
252 | " '_index': 'my_index',\n",
253 | " '_shards': {'failed': 0, 'successful': 1, 'total': 2},\n",
254 | " '_type': 'blog',\n",
255 | " '_version': 1,\n",
256 | " 'created': True,\n",
257 | " 'result': 'created'}"
258 | ]
259 | },
260 | "execution_count": 21,
261 | "metadata": {},
262 | "output_type": "execute_result"
263 | }
264 | ],
265 | "source": [
266 | "data = { \"title\": \"I'm happy for this fox\" }\n",
267 | "es.create(index='my_index', doc_type='blog', body=data, id=1)"
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": 22,
273 | "metadata": {
274 | "collapsed": false
275 | },
276 | "outputs": [
277 | {
278 | "data": {
279 | "text/plain": [
280 | "{'_id': '2',\n",
281 | " '_index': 'my_index',\n",
282 | " '_shards': {'failed': 0, 'successful': 1, 'total': 2},\n",
283 | " '_type': 'blog',\n",
284 | " '_version': 1,\n",
285 | " 'created': True,\n",
286 | " 'result': 'created'}"
287 | ]
288 | },
289 | "execution_count": 22,
290 | "metadata": {},
291 | "output_type": "execute_result"
292 | }
293 | ],
294 | "source": [
295 | "data = { \"title\": \"I'm not happy about my fox problem\" }\n",
296 | "es.create(index='my_index', doc_type='blog', body=data, id=2)"
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": 29,
302 | "metadata": {
303 | "collapsed": false
304 | },
305 | "outputs": [
306 | {
307 | "name": "stdout",
308 | "output_type": "stream",
309 | "text": [
310 | "I'm not happy about my fox problem\n",
311 | "I'm happy for this fox\n"
312 | ]
313 | }
314 | ],
315 | "source": [
316 | "s = Search(using=es, index='my_index', doc_type='blog')\n",
317 | "q = Q('multi_match', type='most_fields', query='not happy foxes', fields=['title', 'title.english'])\n",
318 | "s = s.query()\n",
319 | "res = s.execute()\n",
320 | "for hit in res:\n",
321 | " print(hit.title)"
322 | ]
323 | },
324 | {
325 | "cell_type": "markdown",
326 | "metadata": {},
327 | "source": [
328 | "Note that both hits **do not** contain the word foxes, but we got a hit on fox.\n",
329 | "\n",
330 | "Use the ```most_fields``` query type to match the same text in as many fields as possible."
331 | ]
332 | },
333 | {
334 | "cell_type": "markdown",
335 | "metadata": {},
336 | "source": [
337 | "### Configuring Lanuage Analyzers\n",
338 | "\n",
339 | "It might be useful to avoid stemming words (like \"organization\" --> organ) if you know this will sacrifice certain precision requirements (e.g. seaches for \"world health organization\"). It is possible to configure the analyzers, e.g. to exclude certain stop words or stems:\n",
340 | "\n",
341 | "```\n",
342 | "PUT /my_index\n",
343 | "{\n",
344 | " \"settings\": {\n",
345 | " \"analysis\": {\n",
346 | " \"analyzer\": {\n",
347 | " \"my_english\": {\n",
348 | " \"type\": \"english\",\n",
349 | " \"stem_exclusion\": [ \"organization\", \"organizations\" ], \n",
350 | " \"stopwords\": [ \n",
351 | " \"a\", \"an\", \"and\", \"are\", \"as\", \"at\", \"be\", \"but\", \"by\", \"for\",\n",
352 | " \"if\", \"in\", \"into\", \"is\", \"it\", \"of\", \"on\", \"or\", \"such\", \"that\",\n",
353 | " \"the\", \"their\", \"then\", \"there\", \"these\", \"they\", \"this\", \"to\",\n",
354 | " \"was\", \"will\", \"with\"\n",
355 | " ]\n",
356 | " }\n",
357 | " }\n",
358 | " }\n",
359 | " }\n",
360 | "}\n",
361 | "\n",
362 | "GET /my_index/_analyze?analyzer=my_english \n",
363 | "The World Health Organization does not sell organs.\n",
364 | "```\n",
365 | "\n"
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": 31,
371 | "metadata": {
372 | "collapsed": false
373 | },
374 | "outputs": [
375 | {
376 | "data": {
377 | "text/plain": [
378 | "{'acknowledged': True, 'shards_acknowledged': True}"
379 | ]
380 | },
381 | "execution_count": 31,
382 | "metadata": {},
383 | "output_type": "execute_result"
384 | }
385 | ],
386 | "source": [
387 | "es.indices.delete(index='my_index')\n",
388 | "index_template_with_exclusions = \\\n",
389 | "{\n",
390 | " \"settings\": {\n",
391 | " \"analysis\": {\n",
392 | " \"analyzer\": {\n",
393 | " \"my_english\": {\n",
394 | " \"type\": \"english\",\n",
395 | " \"stem_exclusion\": [ \"organization\", \"organizations\" ], \n",
396 | " \"stopwords\": [ \n",
397 | " \"a\", \"an\", \"and\", \"are\", \"as\", \"at\", \"be\", \"but\", \"by\", \"for\",\n",
398 | " \"if\", \"in\", \"into\", \"is\", \"it\", \"of\", \"on\", \"or\", \"such\", \"that\",\n",
399 | " \"the\", \"their\", \"then\", \"there\", \"these\", \"they\", \"this\", \"to\",\n",
400 | " \"was\", \"will\", \"with\"\n",
401 | " ]\n",
402 | " }\n",
403 | " }\n",
404 | " }\n",
405 | " }\n",
406 | "}\n",
407 | "\n",
408 | "es.indices.create(index='my_index', body=index_template_with_exclusions)"
409 | ]
410 | },
411 | {
412 | "cell_type": "code",
413 | "execution_count": 36,
414 | "metadata": {
415 | "collapsed": false
416 | },
417 | "outputs": [
418 | {
419 | "name": "stdout",
420 | "output_type": "stream",
421 | "text": [
422 | "world,health,organization,doe,not,sell,organ\n"
423 | ]
424 | }
425 | ],
426 | "source": [
427 | "#english (language) with exclusions - my_english\n",
428 | "text = 'The World Health Organization does not sell organs.'\n",
429 | "analyzed_text = [x['token'] for x in es.indices.analyze\\\n",
430 | " (index='my_index', analyzer='my_english', body=text)['tokens']]\n",
431 | "print(','.join(analyzed_text))"
432 | ]
433 | },
434 | {
435 | "cell_type": "code",
436 | "execution_count": null,
437 | "metadata": {
438 | "collapsed": true
439 | },
440 | "outputs": [],
441 | "source": []
442 | }
443 | ],
444 | "metadata": {
445 | "kernelspec": {
446 | "display_name": "Python 3",
447 | "language": "python",
448 | "name": "python3"
449 | },
450 | "language_info": {
451 | "codemirror_mode": {
452 | "name": "ipython",
453 | "version": 3
454 | },
455 | "file_extension": ".py",
456 | "mimetype": "text/x-python",
457 | "name": "python",
458 | "nbconvert_exporter": "python",
459 | "pygments_lexer": "ipython3",
460 | "version": "3.5.1"
461 | }
462 | },
463 | "nbformat": 4,
464 | "nbformat_minor": 0
465 | }
466 |
--------------------------------------------------------------------------------
/Dealing with Human Language/Typoes and Mispelings.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Elasticsearch: The Definitive Guide - Python\n",
8 | "\n",
9 | "Following the examples in the book, here are Python snippets that achieve the same effect.\n",
10 | "\n",
11 | "Documentation for the Python libs:\n",
12 | "\n",
13 | "Low-level API:\n",
14 | "\n",
15 | "https://elasticsearch-py.readthedocs.io/en/master/index.html\n",
16 | "\n",
17 | "Expressive DSL API (more \"Pythonic\")\n",
18 | "\n",
19 | "http://elasticsearch-dsl.readthedocs.io/en/latest/index.html\n",
20 | "\n",
21 | "Github repo for DSL API:\n",
22 | "\n",
23 | "https://github.com/elastic/elasticsearch-dsl-py\n"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 1,
29 | "metadata": {
30 | "collapsed": false
31 | },
32 | "outputs": [],
33 | "source": [
34 | "import sys, os\n",
35 | "sys.path.insert(1, os.path.join(sys.path[0], '..'))"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 2,
41 | "metadata": {
42 | "collapsed": false
43 | },
44 | "outputs": [
45 | {
46 | "name": "stdout",
47 | "output_type": "stream",
48 | "text": [
49 | "14 items created\n"
50 | ]
51 | }
52 | ],
53 | "source": [
54 | "import index\n",
55 | "from elasticsearch import Elasticsearch\n",
56 | "from elasticsearch_dsl import Search, Q\n",
57 | "from pprint import pprint\n",
58 | "\n",
59 | "es = Elasticsearch(\n",
60 | " 'localhost',\n",
61 | " # sniff before doing anything\n",
62 | " sniff_on_start=True,\n",
63 | " # refresh nodes after a node fails to respond\n",
64 | " sniff_on_connection_fail=True,\n",
65 | " # and also every 60 seconds\n",
66 | " sniffer_timeout=60\n",
67 | ")\n",
68 | "\n",
69 | "r = index.populate()\n",
70 | "print('{} items created'.format(len(r['items'])))\n",
71 | "\n",
72 | "# Let's repopulate the index as we deleted 'gb' in earlier chapters:\n",
73 | "# Run the script: populate.ipynb"
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "metadata": {
79 | "collapsed": true
80 | },
81 | "source": [
82 | "### Typoes and Mispelings\n",
83 | "\n",
 84 |     "Full-text search that only matches exactly will probably frustrate your users. Wouldn’t you expect a search for “quick brown fox” to match a document containing “fast brown foxes,” “Johnny Walker” to match “Johnnie Walker,” or “Arnold Shcwarzenneger” to match “Arnold Schwarzenegger”?\n",
85 | "\n",
86 | "Fuzzy matching allows for query-time matching of misspelled words, while phonetic token filters at index time can be used for sounds-like matching.\n",
87 | "\n",
88 | "#### Fuzziness\n",
89 | "\n",
 90 |     "Fuzzy matching treats two words that are “fuzzily” similar as if they were the same word. First, we need to define what we mean by fuzziness: it is expressed as an edit distance, e.g. the Damerau-Levenshtein distance, which counts the insertions, deletions, substitutions, and transpositions of adjacent characters needed to turn one word into another.\n",
91 | "\n",
92 | "Damerau observed that 80% of human misspellings have an edit distance of 1. In other words, 80% of misspellings could be corrected with a single edit to the original string.\n",
93 | "\n",
94 | "Elasticsearch supports a maximum edit distance, specified with the fuzziness parameter, of 2.\n",
95 | "\n",
96 | "Of course, the impact that a single edit has on a string depends on the length of the string. Two edits to the word hat can produce mad, so allowing two edits on a string of length 3 is overkill. The fuzziness parameter can be set to AUTO, which results in the following maximum edit distances:\n",
97 | "\n",
98 | "* 0 for strings of one or two characters\n",
99 | "* 1 for strings of three, four, or five characters\n",
100 | "* 2 for strings of more than five characters\n",
101 | "\n",
102 | "Of course, you may find that an edit distance of 2 is still overkill, and returns results that don’t appear to be related. You may get better results, and better performance, with a maximum fuzziness of 1."
103 | ]
104 | },
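The AUTO thresholds listed above are easy to sanity-check with a small pure-Python sketch; `auto_fuzziness` and `osa_distance` below are illustrative helpers, not part of the Elasticsearch API:

```python
# Hedged sketch of what fuzziness=AUTO allows, plus the edit distance it is based on.

def auto_fuzziness(term):
    """Maximum edit distance AUTO permits for a term of this length."""
    if len(term) <= 2:
        return 0
    if len(term) <= 5:
        return 1
    return 2

def osa_distance(a, b):
    """Damerau-Levenshtein (optimal string alignment) distance:
    insertions, deletions, substitutions and adjacent transpositions."""
    d = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for i in range(len(a) + 1):
        d[i][0] = i
    for j in range(len(b) + 1):
        d[0][j] = j
    for i in range(1, len(a) + 1):
        for j in range(1, len(b) + 1):
            cost = 0 if a[i - 1] == b[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # deletion
                          d[i][j - 1] + 1,         # insertion
                          d[i - 1][j - 1] + cost)  # substitution
            if i > 1 and j > 1 and a[i - 1] == b[j - 2] and a[i - 2] == b[j - 1]:
                d[i][j] = min(d[i][j], d[i - 2][j - 2] + 1)  # transposition
    return d[len(a)][len(b)]

print(auto_fuzziness('hat'), auto_fuzziness('surprize'))  # 1 2
print(osa_distance('surprize', 'surprise'))               # 1 (one substitution)
print(osa_distance('surprize', 'surprised'))              # 2 (substitution + insertion)
```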
105 | {
106 | "cell_type": "code",
107 | "execution_count": 33,
108 | "metadata": {
109 | "collapsed": false
110 | },
111 | "outputs": [],
112 | "source": [
113 | "data = ['Surprise me!', 'That was surprising.', 'I wasn\\'t surprised.']\n",
114 | "for i,txt in enumerate(data):\n",
115 | " body = { \"text\": \"\"}\n",
116 | " body['text'] = txt\n",
117 | " es.create(index='my_index', doc_type='my_type', id=i, body=body)"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 35,
123 | "metadata": {
124 | "collapsed": false
125 | },
126 | "outputs": [
127 | {
128 | "data": {
129 | "text/plain": [
130 | "{'_shards': {'failed': 0, 'successful': 16, 'total': 16},\n",
131 | " 'hits': {'hits': [{'_id': '0',\n",
132 | " '_index': 'my_index',\n",
133 | " '_score': 0.22585157,\n",
134 | " '_source': {'text': 'Surprise me!'},\n",
135 | " '_type': 'my_type'},\n",
136 | " {'_id': '2',\n",
137 | " '_index': 'my_index',\n",
138 | " '_score': 0.1898702,\n",
139 | " '_source': {'text': \"I wasn't surprised.\"},\n",
140 | " '_type': 'my_type'}],\n",
141 | " 'max_score': 0.22585157,\n",
142 | " 'total': 2},\n",
143 | " 'timed_out': False,\n",
144 | " 'took': 5}"
145 | ]
146 | },
147 | "execution_count": 35,
148 | "metadata": {},
149 | "output_type": "execute_result"
150 | }
151 | ],
152 | "source": [
153 | "body = {\n",
154 | " \"query\": {\n",
155 | " \"fuzzy\": {\n",
156 | " \"text\": \"surprize\"\n",
157 | " }\n",
158 | " }\n",
159 | "}\n",
160 | "es.search(body=body)"
161 | ]
162 | },
163 | {
164 | "cell_type": "markdown",
165 | "metadata": {},
166 | "source": [
167 | "The fuzzy query is a term-level query, so it doesn’t do any analysis. It takes a single term and finds all terms in the term dictionary that are within the specified fuzziness. The default fuzziness is AUTO.\n",
168 | "\n",
169 |     "In our example, surprize is within an edit distance of 2 from both surprise and surprised, so documents 0 and 2 match. We could reduce the matches to just surprise with the following query:\n"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": 36,
175 | "metadata": {
176 | "collapsed": false
177 | },
178 | "outputs": [
179 | {
180 | "data": {
181 | "text/plain": [
182 | "{'_shards': {'failed': 0, 'successful': 16, 'total': 16},\n",
183 | " 'hits': {'hits': [{'_id': '0',\n",
184 | " '_index': 'my_index',\n",
185 | " '_score': 0.22585157,\n",
186 | " '_source': {'text': 'Surprise me!'},\n",
187 | " '_type': 'my_type'}],\n",
188 | " 'max_score': 0.22585157,\n",
189 | " 'total': 1},\n",
190 | " 'timed_out': False,\n",
191 | " 'took': 3}"
192 | ]
193 | },
194 | "execution_count": 36,
195 | "metadata": {},
196 | "output_type": "execute_result"
197 | }
198 | ],
199 | "source": [
200 | "body = {\n",
201 | " \"query\": {\n",
202 | " \"fuzzy\": {\n",
203 | " \"text\": {\n",
204 | " \"value\": \"surprize\",\n",
205 | " \"fuzziness\": 1\n",
206 | " }\n",
207 | " }\n",
208 | " }\n",
209 | "}\n",
210 | "es.search(body=body)"
211 | ]
212 | },
213 | {
214 | "cell_type": "markdown",
215 | "metadata": {},
216 | "source": [
217 | "#### Improving Performance\n",
218 | "\n",
219 | "The fuzzy query works by taking the original term and building a Levenshtein automaton—like a big graph representing all the strings that are within the specified edit distance of the original string.\n",
220 | "\n",
221 | "The fuzzy query then uses the automaton to step efficiently through all of the terms in the term dictionary to see if they match. Once it has collected all of the matching terms that exist in the term dictionary, it can compute the list of matching documents.\n",
222 | "\n",
223 | "Of course, depending on the type of data stored in the index, a fuzzy query with an edit distance of 2 can match a very large number of terms and perform very badly. Two parameters can be used to limit the performance impact:\n",
224 | "\n",
225 | "##### prefix_length\n",
226 | "\n",
227 |     ">The number of initial characters that will not be “fuzzified.” **Most spelling errors occur toward the end of the word, not toward the beginning.** By using a prefix_length of 3, for example, you can significantly reduce the number of matching terms.\n",
228 | "\n",
229 | "##### max_expansions\n",
230 | "\n",
231 | ">If a fuzzy query expands to three or four fuzzy options, the new options may be meaningful. If it produces 1,000 options, they are essentially meaningless. Use max_expansions to limit the total number of options that will be produced. The fuzzy query will collect matching terms until it runs out of terms or reaches the max_expansions limit.\n",
232 | "\n",
233 | "#### Fuzzy Match Query\n",
234 | "\n",
235 | "The `match` query supports fuzzy matching out of the box:"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": 37,
241 | "metadata": {
242 | "collapsed": false
243 | },
244 | "outputs": [
245 | {
246 | "data": {
247 | "text/plain": [
248 | "{'_shards': {'failed': 0, 'successful': 16, 'total': 16},\n",
249 | " 'hits': {'hits': [{'_id': '0',\n",
250 | " '_index': 'my_index',\n",
251 | " '_score': 0.48396763,\n",
252 | " '_source': {'text': 'Surprise me!'},\n",
253 | " '_type': 'my_type'}],\n",
254 | " 'max_score': 0.48396763,\n",
255 | " 'total': 1},\n",
256 | " 'timed_out': False,\n",
257 | " 'took': 6}"
258 | ]
259 | },
260 | "execution_count": 37,
261 | "metadata": {},
262 | "output_type": "execute_result"
263 | }
264 | ],
265 | "source": [
266 | "body= {\n",
267 | " \"query\": {\n",
268 | " \"match\": {\n",
269 | " \"text\": {\n",
270 | " \"query\": \"SURPRIZE ME!\",\n",
271 | " \"fuzziness\": \"AUTO\",\n",
272 | " \"operator\": \"and\"\n",
273 | " }\n",
274 | " }\n",
275 | " }\n",
276 | "}\n",
277 | "es.search(body=body)"
278 | ]
279 | },
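The `prefix_length` and `max_expansions` parameters described earlier can be added to the fuzzy query to bound its cost. A minimal sketch against the same index; the specific values are illustrative, not recommendations:

```python
# Hedged sketch: limit how far the fuzzy expansion is allowed to go.
body = {
    "query": {
        "fuzzy": {
            "text": {
                "value": "surprize",
                "fuzziness": 2,
                "prefix_length": 3,    # first 3 characters must match exactly
                "max_expansions": 50   # cap the number of fuzzy term candidates
            }
        }
    }
}
es.search(index='my_index', body=body)
```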
280 | {
281 | "cell_type": "code",
282 | "execution_count": 38,
283 | "metadata": {
284 | "collapsed": false
285 | },
286 | "outputs": [
287 | {
288 | "data": {
289 | "text/plain": [
290 | "{'_shards': {'failed': 0, 'successful': 16, 'total': 16},\n",
291 | " 'hits': {'hits': [{'_id': '0',\n",
292 | " '_index': 'my_index',\n",
293 | " '_score': 0.48396763,\n",
294 | " '_source': {'text': 'Surprise me!'},\n",
295 | " '_type': 'my_type'},\n",
296 | " {'_id': '2',\n",
297 | " '_index': 'my_index',\n",
298 | " '_score': 0.1898702,\n",
299 | " '_source': {'text': \"I wasn't surprised.\"},\n",
300 | " '_type': 'my_type'}],\n",
301 | " 'max_score': 0.48396763,\n",
302 | " 'total': 2},\n",
303 | " 'timed_out': False,\n",
304 | " 'took': 7}"
305 | ]
306 | },
307 | "execution_count": 38,
308 | "metadata": {},
309 | "output_type": "execute_result"
310 | }
311 | ],
312 | "source": [
313 | "body = {\n",
314 | " \"query\": {\n",
315 | " \"multi_match\": {\n",
316 | " \"fields\": [ \"text\", \"title\" ],\n",
317 | " \"query\": \"SURPRIZE ME!\",\n",
318 | " \"fuzziness\": \"AUTO\"\n",
319 | " }\n",
320 | " }\n",
321 | "}\n",
322 | "es.search(body=body)"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": 39,
328 | "metadata": {
329 | "collapsed": true
330 | },
331 | "outputs": [],
332 | "source": [
333 | "# Let's add some more data to test how fuzziness relates to relevance:\n",
334 | "data = ['The element of surprize!', 'That is surprising.', 'Inside every Kinder egg is a surprise.']\n",
335 | "for i,txt in enumerate(data):\n",
336 | " body = { \"text\": \"\"}\n",
337 | " body['text'] = txt\n",
338 | " es.create(index='my_index', doc_type='my_type', id=i+3, body=body)"
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": 41,
344 | "metadata": {
345 | "collapsed": false
346 | },
347 | "outputs": [
348 | {
349 | "data": {
350 | "text/plain": [
351 | "{'_shards': {'failed': 0, 'successful': 16, 'total': 16},\n",
352 | " 'hits': {'hits': [{'_id': '2',\n",
353 | " '_index': 'my_index',\n",
354 | " '_score': 0.45747715,\n",
355 | " '_source': {'text': \"I wasn't surprised.\"},\n",
356 | " '_type': 'my_type'},\n",
357 | " {'_id': '3',\n",
358 | " '_index': 'my_index',\n",
359 | " '_score': 0.2876821,\n",
360 | " '_source': {'text': 'The element of surprize!'},\n",
361 | " '_type': 'my_type'},\n",
362 | " {'_id': '5',\n",
363 | " '_index': 'my_index',\n",
364 | " '_score': 0.2500978,\n",
365 | " '_source': {'text': 'Inside every Kinder egg is a surprise.'},\n",
366 | " '_type': 'my_type'},\n",
367 | " {'_id': '0',\n",
368 | " '_index': 'my_index',\n",
369 | " '_score': 0.22585157,\n",
370 | " '_source': {'text': 'Surprise me!'},\n",
371 | " '_type': 'my_type'}],\n",
372 | " 'max_score': 0.45747715,\n",
373 | " 'total': 4},\n",
374 | " 'timed_out': False,\n",
375 | " 'took': 8}"
376 | ]
377 | },
378 | "execution_count": 41,
379 | "metadata": {},
380 | "output_type": "execute_result"
381 | }
382 | ],
383 | "source": [
384 | "body= {\n",
385 | " \"query\": {\n",
386 | " \"match\": {\n",
387 | " \"text\": {\n",
388 | " \"query\": \"SURPRIZE!\",\n",
389 | " \"fuzziness\": \"AUTO\"\n",
390 | " }\n",
391 | " }\n",
392 | " }\n",
393 | "}\n",
394 | "es.search(body=body)"
395 | ]
396 | },
397 | {
398 | "cell_type": "markdown",
399 | "metadata": {},
400 | "source": [
401 | "#### Scoring Fuzziness\n",
402 | "\n",
403 | "Imagine that we have 1,000 documents containing “Schwarzenegger,” and just one document with the misspelling “Schwarzeneger.” According to the theory of term frequency/inverse document frequency, the misspelling is much more relevant than the correct spelling, because it appears in far fewer documents!\n",
404 | "\n",
405 | "\n",
406 | "Fuzzy queries alone are much less useful than they initially appear. They are better used as part of a “bigger” feature, such as the search-as-you-type completion suggester or the did-you-mean phrase suggester.\n",
407 | "\n",
408 | "#### Phonetic Matching\n",
409 | "\n",
410 | "It might be useful to match by phonetic similarity - words that sound similar (despite different spellings):\n"
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "execution_count": 44,
416 | "metadata": {
417 | "collapsed": false
418 | },
419 | "outputs": [],
420 | "source": [
421 | "settings = {\n",
422 | " \"settings\": {\n",
423 | " \"analysis\": {\n",
424 | " \"filter\": {\n",
425 | " \"dbl_metaphone\": { \n",
426 | " \"type\": \"phonetic\",\n",
427 | " \"encoder\": \"double_metaphone\"\n",
428 | " }\n",
429 | " },\n",
430 | " \"analyzer\": {\n",
431 | " \"dbl_metaphone\": {\n",
432 | " \"tokenizer\": \"standard\",\n",
433 | " \"filter\": \"dbl_metaphone\" \n",
434 | " }\n",
435 | " }\n",
436 | " }\n",
437 | " }\n",
438 | "}\n"
439 | ]
440 | },
441 | {
442 | "cell_type": "markdown",
443 | "metadata": {},
444 | "source": [
445 |     "This won't work as it stands, because it requires the [Phonetic Analysis plugin](\n",
446 |     "https://www.elastic.co/guide/en/elasticsearch/plugins/5.2/analysis-phonetic.html) to be installed."
447 | ]
448 | },
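For completeness, here is a hedged sketch of how the `dbl_metaphone` analyzer could be wired up once the plugin is installed (for example via `bin/elasticsearch-plugin install analysis-phonetic`); the index name, doc type and `name` field below are illustrative:

```python
# Hedged sketch, assuming the analysis-phonetic plugin is installed and the
# 'settings' dict from the cell above is reused. Names are illustrative.
body = dict(settings)
body['mappings'] = {
    "my_type": {
        "properties": {
            "name": {
                "type": "text",
                "fields": {
                    "phonetic": {                  # sounds-like sub-field
                        "type": "text",
                        "analyzer": "dbl_metaphone"
                    }
                }
            }
        }
    }
}
es.indices.create(index='phonetic_index', body=body)

# 'Smith' and 'Smythe' should encode to the same double-metaphone tokens:
tokens = es.indices.analyze(index='phonetic_index',
                            analyzer='dbl_metaphone', body='Smith Smythe')
print([t['token'] for t in tokens['tokens']])
```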
449 | {
450 | "cell_type": "code",
451 | "execution_count": null,
452 | "metadata": {
453 | "collapsed": true
454 | },
455 | "outputs": [],
456 | "source": []
457 | }
458 | ],
459 | "metadata": {
460 | "kernelspec": {
461 | "display_name": "Python 3",
462 | "language": "python",
463 | "name": "python3"
464 | },
465 | "language_info": {
466 | "codemirror_mode": {
467 | "name": "ipython",
468 | "version": 3
469 | },
470 | "file_extension": ".py",
471 | "mimetype": "text/x-python",
472 | "name": "python",
473 | "nbconvert_exporter": "python",
474 | "pygments_lexer": "ipython3",
475 | "version": "3.5.1"
476 | }
477 | },
478 | "nbformat": 4,
479 | "nbformat_minor": 0
480 | }
481 |
--------------------------------------------------------------------------------
/.ipynb_checkpoints/Searching - The Basic Tools-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Elasticsearch: The Definitive Guide - Python\n",
8 | "\n",
9 | "Following the examples in the book, here are Python snippets that achieve the same effect.\n",
10 | "\n",
11 | "Documentation for the Python libs:\n",
12 | "\n",
13 | "Low-level API:\n",
14 | "\n",
15 | "https://elasticsearch-py.readthedocs.io/en/master/index.html\n",
16 | "\n",
17 | "Expressive DSL API (more \"Pythonic\")\n",
18 | "\n",
19 | "http://elasticsearch-dsl.readthedocs.io/en/latest/index.html\n",
20 | "\n",
21 | "Github repo for DSL API:\n",
22 | "\n",
23 | "https://github.com/elastic/elasticsearch-dsl-py\n"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 1,
29 | "metadata": {
30 | "collapsed": false
31 | },
32 | "outputs": [],
33 | "source": [
34 | "from elasticsearch import Elasticsearch\n",
35 | "from elasticsearch_dsl import Search, Q\n",
36 | "from pprint import pprint\n",
37 | "\n",
38 | "es = Elasticsearch(\n",
39 | " 'localhost',\n",
40 | " # sniff before doing anything\n",
41 | " sniff_on_start=True,\n",
42 | " # refresh nodes after a node fails to respond\n",
43 | " sniff_on_connection_fail=True,\n",
44 | " # and also every 60 seconds\n",
45 | " sniffer_timeout=60\n",
46 | ")"
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "metadata": {},
52 | "source": [
53 | "### Empty Search\n",
54 | "From: https://www.elastic.co/guide/en/elasticsearch/guide/master/empty-search.html\n",
55 | "\n",
56 | ">GET _search"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": 2,
62 | "metadata": {
63 | "collapsed": true
64 | },
65 | "outputs": [],
66 | "source": [
67 | "res = es.search('_all') # same as es.search()"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 3,
73 | "metadata": {
74 | "collapsed": false
75 | },
76 | "outputs": [],
77 | "source": [
78 | "#from pprint import pprint\n",
79 | "#pprint(res)"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 4,
85 | "metadata": {
86 | "collapsed": false
87 | },
88 | "outputs": [
89 | {
90 | "data": {
91 | "text/plain": [
92 | "