├── .gitignore
├── README.md
├── docker-compose.yml
├── init.sql
├── movies.csv
└── queries.sql


/.gitignore:
--------------------------------------------------------------------------------
1 | db-data/*
2 | pgadmin-data/*
3 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Postgres Full Text Search is better than ...
 2 | 
 3 | This repo is a companion to the blog post [Postgres Full Text Search is better than ...](https://admcpr.com/postgres-full-text-search-is-better-than-part1). It contains the data and queries used in the blog post along with a docker-compose config that will spin up postgres and pgadmin so you can follow along.
 4 | 
 5 | ## Getting started
 6 | 
 7 | ### Clone this repo
 8 | ```
 9 | git clone git@github.com:admcpr/postgres-full-text-search-is-better-than.git
10 | ```
11 | ### Use docker compose to spin up the containers
12 | ```
13 | cd postgres-full-text-search-is-better-than
14 | docker compose up
15 | ```
16 | ### Use the included pgAdmin to connect to Postgres
17 | 1. Open pgAdmin by navigating to `http://localhost:5050` and login with email: `admin@admin.com`, password: `root`.
18 | 1. From pgAdmin connect to PostgreSQL with:
19 |    1. Host name/Address: `<your ip>`
20 |    2. Username: `root`
21 |    3. Password: `root`.
22 | 
23 | See the [docker-compose.yml](docker-compose.yml) file for more details on configuration. 
24 | 
25 | ## FAQ
26 | Q: Why?
27 | 
28 | A: Why not
29 | 
30 | ## Database source
31 | The movies data used in this repo is sourced from [themoviedb.org](https://www.themoviedb.org/).
32 | 


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: '3.8'
 2 | services:
 3 |   db:
 4 |     container_name: pg_container
 5 |     image: postgres
 6 |     restart: always
 7 |     environment:
 8 |       POSTGRES_USER: root
 9 |       POSTGRES_PASSWORD: root
10 |       POSTGRES_DB: postgres
11 |     ports:
12 |       - "5432:5432"
13 |     volumes:
14 |       - ./db-data/:/var/lib/postgresql/data/
15 |       - ./movies.csv:/docker-entrypoint-initdb.d/movies.csv
16 |       - ./init.sql:/docker-entrypoint-initdb.d/init.sql
17 |   pgadmin:
18 |     container_name: pgadmin4_container
19 |     image: dpage/pgadmin4
20 |     restart: always
21 |     environment:
22 |       PGADMIN_DEFAULT_EMAIL: admin@admin.com
23 |       PGADMIN_DEFAULT_PASSWORD: root
24 |     ports:
25 |       - "5050:80"
26 |     volumes:
27 |       - ./pgadmin-data/:/var/lib/pgadmin/
28 | 


--------------------------------------------------------------------------------
/init.sql:
--------------------------------------------------------------------------------
 1 | -- create the movies table
 2 | CREATE TABLE movies(
 3 |   id INT PRIMARY KEY,
 4 |   budget MONEY,
 5 |   genres JSONB,
 6 |   homepage TEXT,
 7 |   keywords JSONB,
 8 |   original_language TEXT,
 9 |   original_title TEXT,
10 |   title TEXT,
11 |   overview TEXT,
12 |   popularity FLOAT,
13 |   production_companies JSONB,
14 |   production_countries JSONB,
15 |   release_date DATE,
16 |   revenue MONEY,
17 |   runtime INT,
18 |   spoken_languages JSONB,
19 |   status TEXT,
20 |   tagline TEXT,
21 |   vote_average FLOAT,
22 |   vote_count INT
23 | );
24 | 
25 | SET datestyle TO DMY;
26 | 
27 | -- import the movies data
28 | COPY movies(
29 |   budget,
30 |   genres,
31 |   homepage,
32 |   id,
33 |   keywords,
34 |   original_language,
35 |   original_title,
36 |   overview,
37 |   popularity,
38 |   production_companies,
39 |   production_countries,
40 |   release_date,
41 |   revenue,
42 |   runtime,
43 |   spoken_languages,
44 |   status,
45 |   tagline,
46 |   title,
47 |   vote_average,
48 |   vote_count
49 | )
50 | FROM '/docker-entrypoint-initdb.d/movies.csv'
51 | DELIMITER ','
52 | CSV HEADER;
53 | 


--------------------------------------------------------------------------------
/queries.sql:
--------------------------------------------------------------------------------
  1 | /* Naive title queries */
  2 | select title from movies where title like '%star wars%' -- :( No results
  3 | 
  4 | select title from movies where title like '%star%' or title like '%wars%' -- :( More results, but still not Star Wars ??
  5 | 
  6 | -- Oh no like is case sensitive
  7 | select title from movies where title ilike '%star%' or title ilike '%wars%' -- :( Even more results, and Star Wars but where is it?
  8 | 
  9 | -- Help me tsvector you are my only hope
 10 | select to_tsvector('I Am Altering The Deal. Pray I Don''t Alter It Any Further!');
 11 | 
 12 | -- Let's do that to some of our titles
 13 | select to_tsvector(title), title from movies limit 20;
 14 | 
 15 | -- So maybe we can query that with to_tsquery
 16 | select title 
 17 | from movies
 18 | where to_tsvector(title) @@ to_tsquery('Star') -- :| OK
 19 | 
 20 | -- And can we rank the results perhaps?
 21 | select title, ts_rank(to_tsvector(title), to_tsquery('Star')) as rank
 22 | from movies
 23 | where to_tsvector(title) @@ to_tsquery('star') -- :/ Interesting
 24 | 
 25 | -- So let's find star wars 🎉
 26 | select title, ts_rank(to_tsvector(title), to_tsquery('star wars')) as rank
 27 | from movies
 28 | where to_tsvector(title) @@ to_tsquery('star wars') -- :( Oh come on!
 29 | 
 30 | -- Better error messages would be nice but hey ...
 31 | select title, ts_rank(to_tsvector(title), plainto_tsquery('star wars')) as rank
 32 | from movies
 33 | where to_tsvector(title) @@ plainto_tsquery('star wars')
 34 | order by rank desc -- :) OK that's cool
 35 | 
 36 | -- I want to search like google though
 37 | select title, ts_rank(to_tsvector(title), websearch_to_tsquery('"star wars" -clone')) as rank
 38 | from movies
 39 | where to_tsvector(title) @@ websearch_to_tsquery('"star wars" -clone')
 40 | order by rank desc -- :) No more clone wars
 41 | 
 42 | -- What if you love clone wars ...
 43 | select title, ts_rank(to_tsvector(title), websearch_to_tsquery('"star wars" +clone')) as rank
 44 | from movies
 45 | where to_tsvector(title) @@ websearch_to_tsquery('"star wars" +clone')
 46 | order by rank desc -- :) Only clone wars
 47 | 
 48 | -- What if I need to query multiple columns? 
 49 | select to_tsvector(title), to_tsvector(original_title)
 50 | from movies -- :( I don't want to query them separately though
 51 | 
 52 | -- Let's concatenate them 
 53 | select to_tsvector(title) || ' ' || to_tsvector(original_title)
 54 | from movies -- :| Oops, just remembered I need to weight them
 55 | 
 56 | -- Set weights
 57 | select setweight(to_tsvector(title), 'A') || ' ' || setweight(to_tsvector(original_title), 'B')
 58 | from movies -- :\ Better, but how am I going to query this?
 59 | 
 60 | -- Well we could store the weighted vectors in a column
 61 | --
 62 | -- alter table movies add title_search tsvector;
 63 | -- update movies set title_search = 
 64 | -- 	setweight(to_tsvector(title), 'A') || ' ' || setweight(to_tsvector(original_title), 'B')
 65 | --
 66 | -- We're not going to though because running this would take too long and how do we keep the column
 67 | -- up to date ... with triggers? Yuck ... 🤢
 68 | 
 69 | -- Postgres is, once again, going to show how awesome it is now
 70 | -- alter table movies drop title_search;
 71 | alter table movies 
 72 | add title_search tsvector 
 73 | generated always as	(
 74 | 	setweight(to_tsvector('simple', coalesce(title, '')), 'A') || ' ' || 
 75 | 	setweight(to_tsvector('simple', coalesce(original_title, '')), 'B') :: tsvector
 76 | ) stored; 
 77 | 
 78 | -- Does that mean I've got a new search column I can query that's always up to date
 79 | select title, original_title, ts_rank(title_search, websearch_to_tsquery('hero')) as rank
 80 | from movies
 81 | where title_search @@ websearch_to_tsquery('hero')
 82 | order by rank desc
 83 | -- :) Why yes it does, if only we could make this fast 
 84 | 
 85 | -- Like with some kind of special index optimised for vectors
 86 | create index idx_search on movies using GIN(title_search)
 87 | 
 88 | -- Is this fast enough?
 89 | -- Remember we're querying multiple columns with full text search, weighting and web search features
 90 | select title, ts_rank(title_search, websearch_to_tsquery('rush')) as rank
 91 | from movies
 92 | where title_search @@ websearch_to_tsquery('rush')
 93 | order by rank desc -- :)))))))
 94 | 
 95 | -- And now let's rank based on the length of the document (normalization yay https://www.postgresql.org/docs/current/textsearch-controls.html#TEXTSEARCH-RANKING)
 96 | select title, ts_rank(title_search, websearch_to_tsquery('rush'), 1) as rank
 97 | from movies
 98 | where title_search @@ websearch_to_tsquery('rush')
 99 | order by rank desc
100 | 
101 | -- THE END
102 | 


--------------------------------------------------------------------------------