├── .gitignore ├── README.md ├── docker-compose.yml ├── lessons ├── Lesson0_Prerequisites.md ├── Lesson1_Indexing.md ├── Lesson2_Searching.md ├── Lesson3_Analysis.md ├── Lesson4_ExtraCredit.md └── kibana │ ├── index.json │ └── search.json ├── requirements.txt ├── searchapp ├── __init__.py ├── app │ ├── __init__.py │ ├── app.py │ ├── search.py │ └── templates │ │ ├── base.html │ │ ├── index.html │ │ └── product.html ├── constants.py ├── data.py ├── index_products.py ├── products.json └── run.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | ### --- Copied from https://github.com/github/gitignore/blob/master/Python.gitignore --- 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | .static_storage/ 59 | .media/ 60 | local_settings.py 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Environments 88 | .env 89 | .venv 90 | env/ 91 | venv/ 92 | ENV/ 93 | env.bak/ 94 | venv.bak/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | 109 | 110 | # End of https://www.gitignore.io/api/python 111 | 112 | # vim 113 | *.swp 114 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [Build a Search Engine with Python + Elasticsearch](https://www.youtube.com/watch?v=6_P_h2bDwYs) 2 | 3 | - [Slides](http://bit.ly/pycon-es-slides) 4 | - [Video](https://www.youtube.com/watch?v=6_P_h2bDwYs) 5 | 6 | ## Instructors 7 | - [Julie Qiu](http://twitter.com/jqiu25) 8 | - [Jim Grandpre](https://twitter.com/jimtla) 9 | 10 | # [Tutorial Prerequisites](/lessons/Lesson0_Prerequisites.md) 11 | In this tutorial, you will be building a search engine to search for product attributes using a Flask app and Elasticsearch. 
12 | 13 | To participate in this tutorial, you need the following: 14 | 15 | 1) A computer with a good text editor (Vim, Sublime, Atom, etc.) installed on it. 16 | 17 | 2) Complete the [Tutorial Prerequisites](/lessons/Lesson0_Prerequisites.md) 18 | 19 | # [Lesson 2](/lessons/Lesson2_Searching.md) 20 | Continuing from Part 1: 21 | ``` 22 | git commit -am "session1 work" 23 | git fetch 24 | git checkout session2 25 | source venv/bin/activate 26 | python searchapp/index_products.py 27 | ``` 28 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | elasticsearch: 5 | image: docker.elastic.co/elasticsearch/elasticsearch:6.3.2 6 | environment: 7 | - cluster.name=docker-cluster 8 | - bootstrap.memory_lock=true 9 | - "ES_JAVA_OPTS=-Xms512m -Xmx512m" 10 | ulimits: 11 | memlock: 12 | soft: -1 13 | hard: -1 14 | ports: 15 | - "9200:9200" 16 | kibana: 17 | image: docker.elastic.co/kibana/kibana:6.3.2 18 | ports: 19 | - "5601:5601" 20 | -------------------------------------------------------------------------------- /lessons/Lesson0_Prerequisites.md: -------------------------------------------------------------------------------- 1 | # Tutorial Pre-Work 2 | 3 | In this tutorial, you will be building a search engine to search for product attributes using a Flask app and Elasticsearch. 4 | 5 | To participate in this tutorial, you need to complete the following prerequisites: 6 | 7 | 1. Install [Python 3.6.4](https://www.python.org/downloads/release/python-364/). 8 | 9 | 2. Install Elasticsearch 6.2 and Kibana 6.2. 
(Note: You may need to install [Java](https://java.com/en/download/)) 10 | 11 | - For OS X, you can use [Homebrew](https://brew.sh/): 12 | ``` 13 | brew update 14 | brew install kibana 15 | brew install elasticsearch 16 | 17 | brew services start elasticsearch 18 | brew services start kibana 19 | ``` 20 | - For Windows or Linux, see the Elastic downloads page for [Elasticsearch](https://www.elastic.co/downloads/elasticsearch) and [Kibana](https://www.elastic.co/downloads/kibana). 21 | 22 | - Make sure you can visit http://localhost:5601/ and http://localhost:9200/ in your browser. 23 | 24 | 3. Clone the `pycon-2018-pyelasticsearch` repository to your computer by running: 25 | ``` 26 | git clone https://github.com/julieqiu/pycon-2018-pyelasticsearch 27 | ``` 28 | 29 | 4. In the root of the repository, set up a virtualenv: 30 | ``` 31 | python3 -m venv venv 32 | source venv/bin/activate 33 | ``` 34 | 35 | 5. Install the necessary python requirements: 36 | ``` 37 | pip install -r requirements.txt 38 | ``` 39 | 40 | 6. Set up the searchapp: 41 | ``` 42 | pip install -e . 43 | ``` 44 | 45 | You're all set for the tutorial this Wednesday! :) 46 | 47 | --- 48 | 49 | ### Docker Compose Instructions 50 | 51 | 1. Clone the `pycon-2018-pyelasticsearch` repository to your computer by running: 52 | ``` 53 | git clone https://github.com/julieqiu/pycon-2018-pyelasticsearch 54 | ``` 55 | 56 | 2. Download elasticsearch and kibana images; start containers via docker-compose 57 | ``` 58 | docker-compose up 59 | ``` 60 | 61 | 3. In the root of the repository, set up a virtualenv: 62 | ``` 63 | python3 -m venv venv 64 | source venv/bin/activate 65 | ``` 66 | 67 | 4. Install the necessary python requirements: 68 | ``` 69 | pip install -r requirements.txt 70 | ``` 71 | 72 | 5. Set up the searchapp: 73 | ``` 74 | pip install -e . 
75 | ``` 76 | -------------------------------------------------------------------------------- /lessons/Lesson1_Indexing.md: -------------------------------------------------------------------------------- 1 | # Lesson 1: Indexing 2 | We’ve provided a basic example of indexing data into elasticsearch to get you started. Step one is just running the indexer, and examining the results. 3 | 4 | ## Part 1: Starting the Flask App 5 | 6 | ### What you need to do: 7 | 1. Run `python searchapp/index_products.py` 8 | 2. Run `python searchapp/run.py` 9 | 10 | ### How you’ll know it worked: 11 | 1. http://localhost:9200/products_index/product/1 shows information about “A Great Product”. 12 | 2. http://localhost:9200/products_index shows information about the products index. 13 | 3. http://127.0.0.1:5000 returns “A Great Product” for every search term. 14 | 15 | 16 | ## Part 2: Indexing a Single Product 17 | Now that we know that everything is working, it’s time to put real data in the index. 18 | 19 | ### What you need to do: 20 | Modify the `index_product` function in `searchapp/index_products.py`. 21 | 22 | The `product` object passed into `index_product` is currently not used for anything. Use it inside es.create. 23 | 24 | ### How you’ll know it worked: 25 | Once you re-run `searchapp/index_products.py`, you’ll find that “A Great Product” has been replaced by “835 Mid-RIse Capri” everywhere. 26 | 27 | ### Helpful information 28 | Open up `searchapp/index_products.py`. You’ll notice that `index_product` currently takes a `ProductData` object as an argument. 29 | 30 | Currently, `index_product` does not use this argument. It creates a single product in the index using hardcoded data for `A Great Product` with the image of a kitten. 31 | 32 | For this step, you’ll need to modify `index_product` to use the `ProductData` object passed in. 33 | 34 | #### Errors you might see: 35 |
36 | elasticsearch.exceptions.ConflictError 37 | When we create a document in elasticsearch, we must include a unique id. Are you passing the same id for every product in the es.create call? You should be passing `product.id`. 38 |
39 | 40 |
41 | 42 | ImportError: No module named searchapp.app.app 43 | 44 | Full Error Message: 45 | ``` 46 | Traceback (most recent call last): 47 | File "run.py", line 1, in 48 | from searchapp.app.app import app 49 | ImportError: No module named searchapp.app.app 50 | ``` 51 | All of our requirements were installed in a [virtual env](https://docs.python.org/3/library/venv.html). Is yours activated? 52 | 53 | Run `source venv/bin/activate` from the root of the repository to activate the venv. 54 |
55 | 56 | ## Part 3: Indexing 20,000 more products 57 | One product down, 19,999 to go. 58 | 59 | ### What you need to do: 60 | Modify index_product to insert everything from `searchapp/products.json`, instead of just the first item. 61 | 62 | ### How you’ll know it worked: 63 | http://127.0.0.1:5000 will now show nine products for every search term. 64 | 65 | ### Helpful information 66 | All of the product data that we will be using in this workshop is stored in a json file, `searchapp/products.json`. 67 | 68 | `searchapp/data.py` takes care of loading that json. 69 | 70 | Take a look at `searchapp/data.py`. It defines a class, `ProductData` and a function, `all_products`. 71 | 72 | `all_products` returns a list of `ProductData` objects created with the data in `searchapp/products.json`. 73 | 74 | In this problem, you will need to modify `index_products` in `searchapp/index_products.py` to index these products into your products index. 75 | 76 | ## Part 4: Bulk Indexing 77 | We’re ready to start searching now! But let’s take a quick diversion to make indexing faster. 78 | 79 | It takes about a minute to index 20,000 products. Inserting documents into elasticsearch one by one is slow. Fortunately, Elasticsearch has a bulk api, and elasticsearch-py provides a great wrapper around it. 80 | 81 | ### What you need to do: 82 | Write a function called `products_to_index` to bulk index all the products. 83 | 84 | ### How you’ll know it worked: 85 | Running index_products.py should now take only a few seconds, and http://127.0.0.1:5000 should continue to show nine results. 86 | 87 | 88 | ### Helpful information: 89 | You’re going to invoke the `bulk` helper from `elasticsearch.helpers`, and pass it an iterable containing one insert action for each product. Your iterable can just be a list, or you can write a generator function and pass that to bulk. 
90 | 91 | Each action is a dictionary containing some special fields that start with underscores, including a `_source` field that contains the actual document to index. 92 | 93 | You’ll want to: 94 | 95 | 1. Specify an `_op_type` of `index` 96 | 2. Set `_index` and `_type` (doc type) to the appropriate constants 97 | 3. Provide the document (under the `_source` key), and `_id` just like in index_product 98 | 99 | 100 | #### Where are the docs? 101 | https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html 102 | http://elasticsearch-py.readthedocs.io/en/master/helpers.html 103 | -------------------------------------------------------------------------------- /lessons/Lesson2_Searching.md: -------------------------------------------------------------------------------- 1 | # Lesson 2: Searching 2 | It’s time to return some real results! 3 | 4 | Open up `searchapp/app/search.py`, and take a look at the `search` function. You’ll notice that it never makes use of the `term` that is passed in, and that’s why we get the same results for every query. 5 | 6 | 7 | ## A note on the hints below: 8 | For each question below, we have provided hints in case you get stuck. Before looking at them, take some time to think through the question first! 9 | 10 | When you want to read the spoilers, just click on them. 11 | 12 | We recommend that you take your time and read the hints one at a time. 13 | 14 | The more you try on your own, the more you’ll learn :) 15 | 16 | 17 | ## Part 1: Necklaces! (Term Query) 18 | Let’s start by using a term query to search for products by their name. 19 | 20 | ### Where are the docs? 21 | https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-term-query.html 22 | 23 | ### What you need to do: 24 | Replace the match_all query with a term query. In the docs you’ll see "query" provided in the dictionary, but you don’t need to include that. Elasticsearch-dsl will automatically wrap the name_query for you. 
25 | 26 | ### How you’ll know it worked: 27 | Instead of seeing the same products for every query, you’ll see necklaces under the necklace query! 28 | 29 | All of the other queries will return nothing. 30 | 31 | 32 | ## Part 2: Metal Necklaces (Match Query) 33 | We made a search engine! But it’s not very good yet. 34 | 35 | Let's take a look at the behavior of `term` queries by reading through [the term query docs](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-term-query.html) and playing around in the [dev tools console](http://localhost:5601/app/kibana#/dev_tools/console). 36 | 37 | Does it make sense why these return different results? 38 | 39 | Note: you don't need `_source` in the queries below, but it is handy for limiting the fields returned by the search query for clarity. See [`_source` documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-source-field.html). 40 | 41 | ``` 42 | POST _search 43 | { 44 | "query": { 45 | "term" : { "name" : "necklace" } 46 | }, 47 | "_source": ["name"] 48 | } 49 | ``` 50 | ``` 51 | POST _search 52 | { 53 | "query": { 54 | "term" : { "name" : "Necklace" } 55 | }, 56 | "_source": ["name"] 57 | } 58 | ``` 59 | ``` 60 | POST _search 61 | { 62 | "query": { 63 | "term" : { "name" : "neklace" } 64 | }, 65 | "_source": ["name"] 66 | } 67 | ``` 68 | 69 | What happens when you run a [match query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query.html) instead? 70 | 71 | Our next goal is to make the query for `Metal Necklace` work. 72 | 73 | ### What you need to do: 74 | There are lots of products with metal and necklace in their name - return metal necklaces. 75 | 76 |
77 | Hint 78 | Replace your term query with a match query. 79 |
80 | 81 | ### How you’ll know it worked: 82 | You’ll see results for a few more queries on http://127.0.0.1:5000. The metal necklaces will look really solid, with one exception: there will also be a metal filing cabinet included. 83 | 84 | ### Where are the docs? 85 | https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-term-query.html 86 | https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query.html 87 | 88 | 89 | ## Part 3: Getting Rid of the Filing Cabinet 90 | The results for `metal necklace` returned 8 metal necklaces and 1 filing cabinet. 91 | 92 | Let’s deal with that filing cabinet. 93 | 94 | Term was way too restrictive for our purposes, but the default behavior of match is a bit too permissive. Take another look at the match docs, and see if you can figure out how to make it match the eight metal necklaces, but not the filing cabinet. 95 | 96 | ### What you need to do: 97 | What are some things that we can do with a [match query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query.html)? 98 | 99 |
100 | Hint 101 | The critical section of the documentation is right at the top, under match. 102 | Setting the operator flag to `and` will exclude the filing cabinet, without excluding any of our desired results. 103 | The match query should contain a single key called “name,” the field you’re searching. The name key should map to a dictionary containing a query key, and the operator key. 104 |
105 | 106 | ### How you’ll know it worked: 107 | The filing cabinet is gone! But everything else remains. 108 | 109 | ### Where are the docs? 110 | https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query.html 111 | 112 | 113 | ## Part 4: "Necklce" 114 | There’s just one search term in our example that doesn’t have any results. “Necklce.” We could suggest that our customers check their spelling, but we don’t need to. Elasticsearch can provide good results even with minor misspellings. 115 | 116 | ### What you need to do: 117 |
118 | Hint 119 | The critical section of the documentation is Fuzziness. 120 | You can add `{"fuzziness": 2}` right next to the operator setting. 121 |
122 | 123 | ### How you’ll know it worked: 124 | There are results for Necklce, that look just like the results for Necklace. 125 | 126 | ### Where are the docs? 127 | https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query.html 128 | 129 | ## Part 5: "OK" 130 | How do your results for “OK” look? If you fixed the Necklce query the same way I did, you’re seeing results like “OV Kit” when searching for “OK.” If you did it a little differently, things might look fine. If your results still look good, try to figure out what I did. If they’re broken now, then how can we fix them without breaking Necklce? 131 | 132 | ## What you need to do: 133 |
134 | Hint 135 | The difference between seeing a bunch of OV Kit, and getting good results is caused by the value of the fuzziness key. 136 | You’ve only got a few choices for the fuzziness value. Try it with 2 and try it with AUTO. 137 |
138 | 139 |
140 | Hint 141 | The critical section of the documentation is under [`AUTO`](https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness 142 | ): 143 | generates an edit distance based on the length of the term. Low and high distance arguments may be optionally provided AUTO:[low],[high], if not specified, the default values are 3 and 6, equivalent to AUTO:3,6 that make for lengths: 144 |
145 | 146 | 147 | ## How you’ll know it worked: 148 | OK is back to normal, and necklce still works. 149 | 150 | ### Where are the docs? 151 | https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness 152 | -------------------------------------------------------------------------------- /lessons/Lesson3_Analysis.md: -------------------------------------------------------------------------------- 1 | # Lesson 3: Analysis 2 | 3 | For the next couple of queries, we’re going to be making changes to our index’s mapping and settings. We need to tell Elasticsearch more about the structure of our data, so it can effectively process our queries. 4 | 5 | ## Part 1: A Brass Necklace (Standard Analyzers) 6 | 7 | The word `a` in `a brass necklace` is a *stop word*. 8 | 9 | A *stop word* is a word that doesn’t add any extra information and can be ignored. 10 | 11 | Elasticsearch comes with built-in support for removing stop words. We just have to tell it that the `name` field of our products contains english text. 12 | 13 | Once we do that, elasticsearch will filter out stop words in both the names of our products, and the search queries we run against them. I’ve left lots of spoilers visible below, because the documentation can easily lead you astray. 14 | 15 | #### Disclaimer: 16 | Unlike the previous problems, we are going to walk you through how to add stop words to your search app. 17 | 18 | If you really want to work through this on your own, don’t read any further! 19 | 20 | ### What you need to do: 21 | Head back to `index_products.py`. When you create the index, add a mapping that tells elasticsearch to analyze the `name` field as english. 22 | 23 | Let's look at the mapping that we are going to add to the index. 24 | ``` 25 | 'mappings': { 26 | DOC_TYPE: { # This mapping applies to products. 27 | 'properties': { # Just a magic word. 28 | 'name': { # The field we want to configure. 
29 | 'type': 'text', # The kind of data we’re working with. 30 | 'fields': { # create an analyzed field. 31 | 'english_analyzed': { # Name that field `name.english_analyzed`. 32 | 'type': 'text', # It’s also text. 33 | 'analyzer': 'english', # And here’s the analyzer we want to use. 34 | } 35 | } 36 | } 37 | } 38 | } 39 | } 40 | ``` 41 | Now that we have the field (don’t forget to re-run the index script), we need to use it in our search. 42 | 43 | Moving forward, simply swap in `name.english_analyzed` instead of `name` when writing search queries. 44 | 45 | ### How you’ll know it worked: 46 | `A brass necklace` returns the same results as `brass necklace`. 47 | 48 | ### Where are the docs? 49 | https://www.elastic.co/guide/en/elasticsearch/reference/current/analyzer.html 50 | https://www.elastic.co/guide/en/elasticsearch/guide/current/using-stopwords.html 51 | https://www.elastic.co/guide/en/elasticsearch/reference/6.2/analysis-lang-analyzer.html 52 | https://www.elastic.co/guide/en/elasticsearch/reference/current/multi-fields.html 53 | 54 | ## Part 2: Necklace Made of Brass 55 | The standard analyzers are powerful, but sometimes we need to augment them with information specific to our domain. We know that `made` doesn’t mean anything here, but elasticsearch doesn’t. So let’s create a custom analyzer that also includes `made` as a stop word. 56 | 57 | ### What you need to do: 58 | Create the custom analyzer in the settings when you create the index. Like before, I think you’ll learn more from reading the code, than trying to write it from scratch. 59 | ``` 60 | 'settings': { 61 | 'analysis': { # magic word. 62 | 'analyzer': { # yet another magic word. 63 | 'custom_english_analyzer': { # The name of our analyzer. 64 | 'type': 'english', # The built in analyzer we’re building on. 65 | 'stopwords': ['made', '_english_'], # Our custom stop words, plus the defaults. 
66 | }, 67 | }, 68 | }, 69 | } 70 | ``` 71 | 72 | Re-run `index_products`, and it will create the analyzer. You can query it directly using the curl request below. 73 | 74 | ### How you’ll know it worked: 75 | ``` 76 | curl -X POST localhost:9200/products_index/_analyze -d '{"analyzer": "custom_english_analyzer", "text": "necklace made of brass"}' -H 'Content-Type: application/json' 77 | ``` 78 | 79 | Yields: 80 | ``` 81 | { 82 | "tokens":[ 83 | { 84 | "token":"necklac", 85 | "start_offset":0, 86 | "end_offset":8, 87 | "type":"", 88 | "position":0 89 | }, 90 | { 91 | "token":"brass", 92 | "start_offset":17, 93 | "end_offset":22, 94 | "type":"", 95 | "position":3 96 | } 97 | ] 98 | } 99 | ``` 100 | 101 | ### Where are the docs? 102 | https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-stop-analyzer.html 103 | 104 | 105 | ## Part 3: Necklace Made of Brass (English Analyzer) 106 | Note that `made` and `of` are both missing. 107 | 108 | You can try the same query with the `english` analyzer directly, and you’ll see that `of` is removed, but `made` is not. 109 | 110 | `Necklaces made of brass` still doesn’t return anything – we need to actually use our custom analyzer. 111 | 112 | ### What you need to do: 113 |
114 | Hint 115 | Swap out the english analyzer in the mapping, for the `custom_english_analyzer`. 116 | 117 | As usual, don’t forget to re-run `index_products` after changing it. 118 |
119 | 120 | ### How you’ll know it worked: 121 | We get some `necklaces made of brass`! 122 | 123 | ### Where are the docs? 124 | https://www.elastic.co/guide/en/elasticsearch/reference/current/analyzer.html 125 | -------------------------------------------------------------------------------- /lessons/Lesson4_ExtraCredit.md: -------------------------------------------------------------------------------- 1 | # Lesson 4: Extra Credit 2 | 3 | Enough of necklaces, let’s buy a jacket. 4 | 5 | There are plenty of jackets available in our data set, but only one shows up here. 6 | 7 | The problem is that the words `men’s jacket` don’t usually show up in the product name, but they do show up in the product description. Let’s search the descriptions instead of the names. 8 | 9 | Where are the docs? 10 | 11 | There’s nothing new here, but you may want to review some of the documentation above. 12 | 13 | ## Part 1: Indexing Jackets (Indexing Product Description) 14 | 15 | ### What you need to do: 16 | The indexing and search flow here is very similar to what you've seen before already. Just adapt them for product description! 17 | 18 | In `searchapp/index_products.py`: 19 | 20 |
21 | Hint: Step 1 22 | Add the product description to the `_source` of your document. 23 |
24 | 25 |
26 | Hint: Step 2 27 | Add `description` to the mapping, and configure it to use our custom analyzer. 28 |
29 | 30 |
31 | Hint: Step 3 32 | Change `search.py` to reference `description.english_analyzed` instead of name. 33 |
34 | 35 |
36 | Hint: Step 4 37 | As usual, don’t forget to re-run `index_products` after changing it. 38 |
39 | 40 | ### How you’ll know it worked: 41 | There are actually Jackets returned for the `men’s jackets` query. 42 | 43 | This is a huge improvement for the jackets, but just about every other query has gotten worse. When there’s a match against the product name, the results are great, but we still need to support searches against the description. Fortunately, we can combine our name query and our description query, and search against both fields. 44 | 45 | ### Where are the docs? 46 | https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-dis-max-query.html 47 | 48 | ## Part 2: Searching for Jackets (Dis Max Queries) 49 | 50 | ### What you need to do: 51 |
52 | Hint 53 | There are a few different ways to combine queries in elasticsearch, but `dis_max` is the best for this. 54 | 55 | Separately create a `name_query` and `description_query`, and then combine them in the queries property of a `dis_max` query. Pass that dis_max query to `s.query`. 56 | 57 | If the results don’t look much better, try adding `"tie_breaker": 0.7` to the dis_max query. This will boost items that match in both the name and description. 58 |
59 | 60 | ### How you’ll know it worked: 61 | All of the results should look a bit better. 62 | 63 | Play around with your query, and see how changes impact the results. Start off by adding `boost` properties to your name and description queries. Try swapping out the dis_max query for a bool query and playing with the boost values. Take your time, and try to really understand the impact each of your changes has. There’s always a way to make things a little better! 64 | -------------------------------------------------------------------------------- /lessons/kibana/index.json: -------------------------------------------------------------------------------- 1 | # ------------------------------ 2 | # 1. Basic Indexing: Create a document with POST 3 | # ------------------------------ 4 | 5 | # We'll be adding products to a product catalog. Let's add our first JSON doc: 6 | 7 | POST /products_index_test/product 8 | { 9 | "name": "Orange Shirt", 10 | "description": "This is an orange shirt." 11 | } 12 | 13 | # Indexing the document automatically created the index for us, named "product_index_test" with mapping type "product" 14 | 15 | # The product is now in the index! 16 | 17 | # Let's find it in the index using a GET command: 18 | GET /products_index_test/_search 19 | 20 | # ------------------------------ 21 | # 2. Basic Indexing: Create a document with PUT 22 | # ------------------------------ 23 | # POST creates the document's ID for us 24 | # PUT requires an id for the document, as part of the URL 25 | 26 | # If we run the following we'll get an error 27 | PUT /products_index_test/product 28 | { 29 | "name": "Blue Shirt", 30 | "description": "This is a blue shirt." 31 | } 32 | 33 | # The error is expected! 34 | # Run this instead: 35 | PUT /products_index_test/product/1 36 | { 37 | "name": "Blue Shirt", 38 | "description": "This is an blue shirt." 
39 | } 40 | 41 | # Run this to get all of the products in our index 42 | GET /products_index_test/_search 43 | 44 | # We now have an Orange Shirt and a Blue Shirt in our index. 45 | 46 | # Instead of dynamically creating the index based on the first document we add, we can create the index beforehand, to set certain settings. We will do that later on in the tutorial today. 47 | 48 | # ------------------------------ 49 | # 3. Basic Indexing: Bulk adding products 50 | # ------------------------------ 51 | 52 | # When you need to index a lot of docs, you should use the bulk API, you may see signficant performance benefits 53 | 54 | POST /products_index_test/product/_bulk 55 | { "index": { "_id": 2 }} 56 | {"name": "Green Shirt", "description": "This is a green shirt."} 57 | { "index": { "_id": 3 }} 58 | {"name": "Purple Shirt", "description": "This is a purple shirt."} 59 | 60 | 61 | GET /products_index_test/_search 62 | 63 | # Let's delete this index, since we won't be using it again. 64 | DELETE /products_index_test 65 | -------------------------------------------------------------------------------- /lessons/kibana/search.json: -------------------------------------------------------------------------------- 1 | # Hello! This is the Kibana Dev tools console, we'll use this to interact with Elasticsearch 2 | 3 | 4 | #-------------------------------------- 5 | # Basic Search: Querying documents with GET 6 | #-------------------------------------- 7 | 8 | # We now have about 20k products in the index! 9 | # Let's search for them. 10 | # Find *all* documents 11 | 12 | GET /products_index/product/_search 13 | 14 | # This is the same query. 15 | GET /products_index/product/_search 16 | { 17 | "query": { 18 | "match_all": {} 19 | } 20 | } 21 | 22 | # Let's find all necklaces! 
23 | GET /products_index/product/_search 24 | { 25 | "query": { 26 | "match": { 27 | "name": "necklace" 28 | } 29 | } 30 | } 31 | 32 | # Results are ranked by "relevance" (_score) 33 | # More info: https://www.elastic.co/guide/en/elasticsearch/guide/current/relevance-intro.html 34 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==0.12.3 2 | Jinja2>=2.10.1 3 | MarkupSafe==1.0 4 | Werkzeug==0.15.3 5 | click==6.7 6 | elasticsearch-dsl>=6.1.0 7 | elasticsearch>=6.1.1 8 | itsdangerous==0.24 9 | urllib3==1.26.5 10 | -------------------------------------------------------------------------------- /searchapp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/julieqiu/pycon-2018-pyelasticsearch/0c5232f0b6c133239ad885ea8d7d8867c39b6f7b/searchapp/__init__.py -------------------------------------------------------------------------------- /searchapp/app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/julieqiu/pycon-2018-pyelasticsearch/0c5232f0b6c133239ad885ea8d7d8867c39b6f7b/searchapp/app/__init__.py -------------------------------------------------------------------------------- /searchapp/app/app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template, request 2 | 3 | from searchapp.data import all_products 4 | from searchapp.app.search import search 5 | 6 | app = Flask(__name__) 7 | 8 | 9 | @app.route('/') 10 | @app.route('/index') 11 | def index(): 12 | """ 13 | Search for products across a variety of terms, and show 9 results for each. 
14 | """ 15 | search_terms = [ 16 | 'necklace', 17 | 'metal necklace', 18 | 'necklce', 19 | 'OK', 20 | 'brass necklace', 21 | 'a brass necklace', 22 | 'necklaces made of brass', 23 | "men's jacket", 24 | ] 25 | 26 | num_results = 9 27 | products_by_category = [(t, search(t, num_results)) for t in search_terms] 28 | return render_template( 29 | 'index.html', 30 | products_by_category=products_by_category, 31 | ) 32 | 33 | 34 | @app.route('/search', methods=['GET', 'POST']) 35 | def search_single_product(): 36 | """ 37 | Execute a search for a specific search term. 38 | 39 | Return the top 50 results. 40 | """ 41 | query = request.args.get('search') 42 | num_results = 50 43 | products_by_category = [(query, search(query, num_results))] 44 | return render_template( 45 | 'index.html', 46 | products_by_category=products_by_category, 47 | search_term=query, 48 | ) 49 | 50 | 51 | @app.route('/product/<int:product_id>') 52 | def single_product(product_id): 53 | """ 54 | Display information about a specific product 55 | """ 56 | 57 | product = str(all_products()[product_id - 1]) 58 | 59 | return render_template( 60 | 'product.html', 61 | product_json=product, 62 | search_term='', 63 | ) 64 | -------------------------------------------------------------------------------- /searchapp/app/search.py: -------------------------------------------------------------------------------- 1 | from elasticsearch import Elasticsearch 2 | from elasticsearch_dsl import Search 3 | from typing import List 4 | 5 | from searchapp.constants import DOC_TYPE, INDEX_NAME 6 | 7 | HEADERS = {'content-type': 'application/json'} 8 | 9 | 10 | class SearchResult(): 11 | """Represents a product returned from elasticsearch.""" 12 | def __init__(self, id_, image, name): 13 | self.id = id_ 14 | self.image = image 15 | self.name = name 16 | 17 | def from_doc(doc) -> 'SearchResult': 18 | return SearchResult( 19 | id_ = doc.meta.id, 20 | image = doc.image, 21 | name = doc.name, 22 | ) 23 | 24 | 25 | def search(term: str,
count: int) -> List[SearchResult]: 26 | client = Elasticsearch() 27 | 28 | # Elasticsearch 6 requires the content-type header to be set, and this is 29 | # not included by default in the current version of elasticsearch-py 30 | client.transport.connection_pool.connection.headers.update(HEADERS) 31 | 32 | s = Search(using=client, index=INDEX_NAME, doc_type=DOC_TYPE) 33 | name_query = {'match_all': {}} 34 | docs = s.query(name_query)[:count].execute() 35 | 36 | 37 | return [SearchResult.from_doc(d) for d in docs] 38 | -------------------------------------------------------------------------------- /searchapp/app/templates/base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | PyCon 2018: Building a Search Engine with Python and Elasticsearch 4 | 5 | 6 |
7 | {% block content %}{% endblock %} 8 | 9 | 10 | -------------------------------------------------------------------------------- /searchapp/app/templates/index.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block content %} 4 | 69 | 70 | 71 | 75 | 76 |
77 | {% for category, products in products_by_category %} 78 | 93 | {% endfor %} 94 |
95 | {% endblock %} 96 | -------------------------------------------------------------------------------- /searchapp/app/templates/product.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block content %} 4 | 6 | 7 |
8 |
{{product_json}}
9 |
10 | {% endblock %} 11 | -------------------------------------------------------------------------------- /searchapp/constants.py: -------------------------------------------------------------------------------- 1 | DOC_TYPE = 'product' 2 | INDEX_NAME = 'products_index' 3 | -------------------------------------------------------------------------------- /searchapp/data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import textwrap 4 | _all_products = None 5 | 6 | 7 | class ProductData(): 8 | """ 9 | Our product records. In this case they come from a json file, but you could 10 | just as easily load them from a database, or anywhere else. 11 | """ 12 | 13 | def __init__(self, id_, name, description, image, taxonomy, price): 14 | self.id = id_ 15 | self.name = name 16 | self.description = description 17 | self.image = image 18 | self.taxonomy = taxonomy 19 | self.price = price 20 | 21 | def __str__(self): 22 | return textwrap.dedent("""\ 23 | Id: {} 24 | Name: {} 25 | ImageUrl: {} 26 | Taxonomy: {} 27 | Price: ${} 28 | Description: 29 | 30 | {} 31 | """).format(self.id, self.name, self.image, self.taxonomy, 32 | self.price, self.description) 33 | 34 | 35 | def all_products(): 36 | """ 37 | Returns a list of ~20,000 ProductData objects, loaded from 38 | searchapp/products.json 39 | """ 40 | 41 | global _all_products 42 | 43 | if _all_products is None: 44 | _all_products = [] 45 | 46 | # Load the product json from the same directory as this file. 
47 | dir_path = os.path.dirname(os.path.realpath(__file__)) 48 | products_path = os.path.join(dir_path, 'products.json') 49 | with open(products_path) as product_file: 50 | for idx, product in enumerate(json.load(product_file)): 51 | id_ = idx + 1 # ES indexes must be positive integers, so add 1 52 | product_data = ProductData(id_, **product) 53 | _all_products.append(product_data) 54 | 55 | return _all_products 56 | -------------------------------------------------------------------------------- /searchapp/index_products.py: -------------------------------------------------------------------------------- 1 | from elasticsearch import Elasticsearch 2 | 3 | from searchapp.constants import DOC_TYPE, INDEX_NAME 4 | from searchapp.data import all_products, ProductData 5 | 6 | 7 | def main(): 8 | # Connect to localhost:9200 by default. 9 | es = Elasticsearch() 10 | 11 | es.indices.delete(index=INDEX_NAME, ignore=404) 12 | es.indices.create( 13 | index=INDEX_NAME, 14 | body={ 15 | 'mappings': {}, 16 | 'settings': {}, 17 | }, 18 | ) 19 | 20 | index_product(es, all_products()[0]) 21 | 22 | 23 | def index_product(es, product: ProductData): 24 | """Add a single product to the ProductData index.""" 25 | 26 | es.create( 27 | index=INDEX_NAME, 28 | doc_type=DOC_TYPE, 29 | id=1, 30 | body={ 31 | "name": "A Great Product", 32 | "image": "http://placekitten.com/200/200", 33 | } 34 | ) 35 | 36 | # Don't delete this! You'll need it to see if your indexing job is working, 37 | # or if it has stalled. 
38 | print("Indexed {}".format("A Great Product")) 39 | 40 | 41 | if __name__ == '__main__': 42 | main() 43 | -------------------------------------------------------------------------------- /searchapp/run.py: -------------------------------------------------------------------------------- 1 | from searchapp.app.app import app 2 | 3 | 4 | def main(): 5 | app.run(debug=True) 6 | 7 | if __name__ == '__main__': 8 | main() 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='searchapp', 5 | packages=find_packages(), 6 | version='0.1', 7 | ) 8 | --------------------------------------------------------------------------------