├── .gitignore ├── README.md ├── docker-compose.yml ├── init.sql ├── movies.csv └── queries.sql /.gitignore: -------------------------------------------------------------------------------- 1 | db-data/* 2 | pgadmin-data/* 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Postgres Full Text Search is better than ... 2 | 3 | This repo is a companion to the blog post [Postgres Full Text Search is better than ...](https://admcpr.com/postgres-full-text-search-is-better-than-part1). It contains the data and queries used in the blog post along with a docker-compose config that will spin up postgres and pgadmin so you can follow along. 4 | 5 | ## Getting started 6 | 7 | ### Clone this repo 8 | ``` 9 | git clone git@github.com:admcpr/postgres-full-text-search-is-better-than.git 10 | ``` 11 | ### Use docker compose to spin up the containers 12 | ``` 13 | cd postgres-full-text-search-is-better-than 14 | docker compose up 15 | ``` 16 | ### Use the included pgAdmin to connect to Postgres 17 | 1. Open pgAdmin by navigating to `http://localhost:5050` and login with email: `admin@admin.com`, password: `root`. 18 | 1. From pgAdmin connect to PostgreSQL with: 19 | 1. Host name/Address: `` 20 | 2. Username: `root` 21 | 3. Password: `root`. 22 | 23 | See the [docker-compose.yml](docker-compose.yml) file for more details on configuration. 24 | 25 | ## FAQ 26 | Q: Why? 27 | 28 | A: Why not 29 | 30 | ## Database source 31 | The movies data used in this repo is sourced from [themoviedb.org](https://www.themoviedb.org/). 32 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | services: 3 | db: 4 | container_name: pg_container 5 | image: postgres 6 | restart: always 7 | environment: 8 | POSTGRES_USER: root 9 | POSTGRES_PASSWORD: root 10 | POSTGRES_DB: postgres 11 | ports: 12 | - "5432:5432" 13 | volumes: 14 | - ./db-data/:/var/lib/postgresql/data/ 15 | - ./movies.csv:/docker-entrypoint-initdb.d/movies.csv 16 | - ./init.sql:/docker-entrypoint-initdb.d/init.sql 17 | pgadmin: 18 | container_name: pgadmin4_container 19 | image: dpage/pgadmin4 20 | restart: always 21 | environment: 22 | PGADMIN_DEFAULT_EMAIL: admin@admin.com 23 | PGADMIN_DEFAULT_PASSWORD: root 24 | ports: 25 | - "5050:80" 26 | volumes: 27 | - ./pgadmin-data/:/var/lib/pgadmin/ 28 | -------------------------------------------------------------------------------- /init.sql: -------------------------------------------------------------------------------- 1 | -- create the movies table 2 | CREATE TABLE movies( 3 | id INT PRIMARY KEY, 4 | budget MONEY, 5 | genres JSONB, 6 | homepage TEXT, 7 | keywords JSONB, 8 | original_language TEXT, 9 | original_title TEXT, 10 | title TEXT, 11 | overview TEXT, 12 | popularity FLOAT, 13 | production_companies JSONB, 14 | production_countries JSONB, 15 | release_date DATE, 16 | revenue MONEY, 17 | runtime INT, 18 | spoken_languages JSONB, 19 | status TEXT, 20 | tagline TEXT, 21 | vote_average FLOAT, 22 | vote_count INT 23 | ); 24 | 25 | SET datestyle TO DMY; 26 | 27 | -- import the movies data 28 | COPY movies( 29 | budget, 30 | genres, 31 | homepage, 32 | id, 33 | keywords, 34 | original_language, 35 | original_title, 36 | overview, 37 | popularity, 38 | production_companies, 39 | production_countries, 40 | release_date, 41 | revenue, 42 | runtime, 43 | spoken_languages, 44 | status, 45 | tagline, 46 | title, 47 | vote_average, 48 | vote_count 49 | ) 50 | FROM '/docker-entrypoint-initdb.d/movies.csv' 51 | DELIMITER ',' 52 | CSV HEADER; 53 | -------------------------------------------------------------------------------- /queries.sql: -------------------------------------------------------------------------------- 1 | /* Naive title queries */ 2 | select title from movies where title like '%star wars%' -- :( No results 3 | 4 | select title from movies where title like '%star%' or title like '%wars%' -- :( More results, but still not Star Wars ?? 5 | 6 | -- Oh no like is case sensitive 7 | select title from movies where title ilike '%star%' or title ilike '%wars%' -- :( Even more results, and Star Wars but where is it? 8 | 9 | -- Help me tsvector you are my only hope 10 | select to_tsvector('I Am Altering The Deal. Pray I Don''t Alter It Any Further!'); 11 | 12 | -- Let's do that to some of our titles 13 | select to_tsvector(title), title from movies limit 20; 14 | 15 | -- So maybe we can query that with to_tsquery 16 | select title 17 | from movies 18 | where to_tsvector(title) @@ to_tsquery('Star') -- :| OK 19 | 20 | -- And can we rank the results perhaps? 21 | select title, ts_rank(to_tsvector(title), to_tsquery('Star')) as rank 22 | from movies 23 | where to_tsvector(title) @@ to_tsquery('star') -- :/ Interesting 24 | 25 | -- So let's find star wars 🎉 26 | select title, ts_rank(to_tsvector(title), to_tsquery('star wars')) as rank 27 | from movies 28 | where to_tsvector(title) @@ to_tsquery('star wars') -- :( Oh come on! 29 | 30 | -- Better error messages would be nice but hey ... 31 | select title, ts_rank(to_tsvector(title), plainto_tsquery('star wars')) as rank 32 | from movies 33 | where to_tsvector(title) @@ plainto_tsquery('star wars') 34 | order by rank desc -- :) OK that's cool 35 | 36 | -- I want to search like google though 37 | select title, ts_rank(to_tsvector(title), websearch_to_tsquery('"star wars" -clone')) as rank 38 | from movies 39 | where to_tsvector(title) @@ websearch_to_tsquery('"star wars" -clone') 40 | order by rank desc -- :) No more clone wars 41 | 42 | -- What if you love clone wars ... 43 | select title, ts_rank(to_tsvector(title), websearch_to_tsquery('"star wars" +clone')) as rank 44 | from movies 45 | where to_tsvector(title) @@ websearch_to_tsquery('"star wars" +clone') 46 | order by rank desc -- :) Only clone wars 47 | 48 | -- What if I need to query multiple columns? 49 | select to_tsvector(title), to_tsvector(original_title) 50 | from movies -- :( I don't want to query them separately though 51 | 52 | -- Let's concatenate them 53 | select to_tsvector(title) || ' ' || to_tsvector(original_title) 54 | from movies -- :| Oops, just remembered I need to weight them 55 | 56 | -- Set weights 57 | select setweight(to_tsvector(title), 'A') || ' ' || setweight(to_tsvector(original_title), 'B') 58 | from movies -- :\ Better, but how am I going to query this? 59 | 60 | -- Well we could store the weighted vectors in a column 61 | -- 62 | -- alter table movies add title_search tsvector; 63 | -- update movies set title_search = 64 | -- setweight(to_tsvector(title), 'A') || ' ' || setweight(to_tsvector(original_title), 'B') 65 | -- 66 | -- We're not going to though because running this would take too long and how do we keep the column 67 | -- up to date ... with triggers? Yuck ... 🤢 68 | 69 | -- Postgres is, once again, going to show how awesome it is now 70 | -- alter table movies drop title_search; 71 | alter table movies 72 | add title_search tsvector 73 | generated always as ( 74 | setweight(to_tsvector('simple', coalesce(title, '')), 'A') || ' ' || 75 | setweight(to_tsvector('simple', coalesce(original_title, '')), 'B') :: tsvector 76 | ) stored; 77 | 78 | -- Does that mean I've got a new search column I can query that's always up to date 79 | select title, original_title, ts_rank(title_search, websearch_to_tsquery('hero')) as rank 80 | from movies 81 | where title_search @@ websearch_to_tsquery('hero') 82 | order by rank desc 83 | -- :) Why yes it does, if only we could make this fast 84 | 85 | -- Like with some kind of special index optimised for vectors 86 | create index idx_search on movies using GIN(title_search) 87 | 88 | -- Is this fast enough? 89 | -- Remember we're querying multiple columns with full text search, weighting and web search features 90 | select title, ts_rank(title_search, websearch_to_tsquery('rush')) as rank 91 | from movies 92 | where title_search @@ websearch_to_tsquery('rush') 93 | order by rank desc -- :))))))) 94 | 95 | -- And now let's rank based on the length of the document (normalization yay https://www.postgresql.org/docs/current/textsearch-controls.html#TEXTSEARCH-RANKING) 96 | select title, ts_rank(title_search, websearch_to_tsquery('rush'), 1) as rank 97 | from movies 98 | where title_search @@ websearch_to_tsquery('rush') 99 | order by rank desc 100 | 101 | -- THE END 102 | --------------------------------------------------------------------------------