├── .flake8
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── dev_requirements.txt
├── pyproject.toml
├── requirements.txt
├── src
    ├── __init__.py
    ├── dataset.txt
    ├── db.py
    ├── filtering.py
    ├── init_db.py
    ├── models.py
    └── tests
    │   ├── conftest.py
    │   └── test_searching.py
└── tsearch_data
    ├── polish.affix
    ├── polish.dict
    └── polish.stop


/.flake8:
--------------------------------------------------------------------------------
 1 | # Flake8 explicitly will not support pyproject.toml :(
 2 | 
 3 | [flake8]
 4 | extend-ignore=
 5 |     E402,
 6 |     E501,
 7 |     W503,
 8 | extend-exclude=
 9 |     .venv,
10 |     migrations,
11 | 
12 | # vi: ft=dosini
13 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | default_language_version:
 2 |   python: python3.9
 3 | repos:
 4 |   - repo: local
 5 |     hooks:
 6 |       - id: black
 7 |         name: black
 8 |         language: system
 9 |         entry: black
10 |         args: ["--check"]
11 |         types: [python]
12 |       - id: isort
13 |         name: isort
14 |         language: system
15 |         entry: isort
16 |         args: ["--check", "--diff"]
17 |         types: [python]
18 |       - id: flake8
19 |         name: flake8
20 |         language: system
21 |         entry: flake8
22 |         types: [python]
23 |         exclude: "migrations"
24 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Jarosław Orzeł
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # postgres-full-text-search
  2 | Postgres full text search options (tsearch, trigram, ilike) examples.
  3 | 
  4 | 1. [Create DB](#create-db)
  5 | 2. [Full text search using simple `ilike`](#full-text-search-using-simple-ilike)
  6 | 3. [Full text search using `ilike` supported by trigram index](#full-text-search-using-ilike-supported-by-trigram-index)
  7 | 4. [Create non-default language configuration for tsearch full text search](#create-non-default-language-configuration-for-tsearch-full-text-search)
  8 | 5. [Tsearch full text search without stored index](#tsearch-full-text-search-without-stored-index)
  9 | 6. [Tsearch full text search with stored partial index](#tsearch-full-text-search-with-stored-partial-index)
 10 | 7. [Tsearch full text search for partial words](#tsearch-full-text-search-for-partial-words)
 11 | 8. [Tsearch full text search results ranking](#tsearch-full-text-search-results-ranking)
 12 | 9. [GiST vs GIN](#gist-vs-gin)
 13 | 10. [Inspiration and help](#inspiration-and-help)
 14 | 
 15 | 
 16 | ## Create DB
 17 | ```sql
 18 | >> CREATE DATABASE ftdb;
 19 | ```
 20 | To feed db with an example dataset (`dataset.txt`, 100k rows, 15 words each one) I used python `init_db.py` script.
 21 | 
 22 | ## Full text search using simple `ilike`
 23 | ```sql
 24 | >> EXPLAIN ANALYZE
 25 |    SELECT text, language
 26 |    FROM public.document
 27 |    WHERE
 28 |       text ilike '%field%'
 29 |       AND text ilike '%window%'
 30 |       AND text ilike '%lamp%'
 31 |       AND text ilike '%research%'
 32 |       AND language = 'en'
 33 |     LIMIT 1;
 34 |                                                                   QUERY PLAN
 35 | ----------------------------------------------------------------------------------------------------------------------------------------------
 36 |  Limit  (cost=0.00..3734.02 rows=1 width=105) (actual time=87.473..87.474 rows=0 loops=1)
 37 |    ->  Seq Scan on document  (cost=0.00..3734.02 rows=1 width=105) (actual time=87.466..87.466 rows=0 loops=1)
 38 |          Filter: ((text ~~* '%field%'::text) AND (text ~~* '%window%'::text) AND (text ~~* '%lamp%'::text) AND (text ~~* '%research%'::text))
 39 |          Rows Removed by Filter: 100001
 40 |  Planning Time: 2.193 ms
 41 |  Execution Time: 87.500 ms
 42 |  ```
 43 | 
 44 | ## Full text search using `ilike` supported by trigram index
 45 | What is a trigram? See this example:
 46 | ```sql
 47 | >> CREATE EXTENSION pg_trgm;
 48 | CREATE EXTENSION
 49 | >> select show_trgm('fielded');
 50 |                 show_trgm
 51 | -----------------------------------------
 52 |  {"  f"," fi",ded,"ed ",eld,fie,iel,lde}
 53 | ```
 54 | 
 55 | We can improve `ilike` performance using trigram index, e.g. `gin_trgm_ops`.
 56 | ```sql
 57 | >> CREATE INDEX  ix_document_text_trigram ON document USING gin (text gin_trgm_ops) where language = 'en';
 58 | CREATE INDEX
 59 | 
 60 | >> EXPLAIN ANALYZE SELECT text, language
 61 |    FROM public.document
 62 |    WHERE
 63 |       text ilike '%field%'
 64 |       AND text ilike '%window%'
 65 |       AND text ilike '%lamp%'
 66 |       AND text ilike '%research%'
 67 |       AND language = 'en'
 68 |     LIMIT 1;
 69 |                                                                                        QUERY PLAN
 70 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 71 |  Limit  (cost=176.00..180.02 rows=1 width=105) (actual time=1.473..1.474 rows=0 loops=1)
 72 |    ->  Bitmap Heap Scan on document  (cost=176.00..180.02 rows=1 width=105) (actual time=1.470..1.471 rows=0 loops=1)
 73 |          Recheck Cond: ((text ~~* '%field%'::text) AND (text ~~* '%window%'::text) AND (text ~~* '%lamp%'::text) AND (text ~~* '%research%'::text) AND ((language)::text = 'en'::text))
 74 |          ->  Bitmap Index Scan on ix_document_text_trigram  (cost=0.00..176.00 rows=1 width=0) (actual time=1.466..1.466 rows=0 loops=1)
 75 |                Index Cond: ((text ~~* '%field%'::text) AND (text ~~* '%window%'::text) AND (text ~~* '%lamp%'::text) AND (text ~~* '%research%'::text))
 76 |  Planning Time: 2.389 ms
 77 |  Execution Time: 1.524 ms
 78 | ```
 79 | 
 80 | ## Create non-default language configuration for tsearch full text search
 81 | Postgres does not provide support for many languages by default. However, you can
 82 | setup the configuration quite easily. You just need additional dictionary files.
 83 | Here is an example for polish language.
 84 | Polish dictionary files can be downloaded from: https://github.com/judehunter/polish-tsearch.
 85 | 
 86 | polish.affix, polish.stop and polish.dict files should be copied to postgresql sharedir `tsearch_data` location,
 87 | e.g. `/usr/share/postgresql/13/tsearch_data`. To determin your sharedir location you can use `pg_config --sharedir`
 88 | 
 89 | There also must be created a configuration (see the [docs](https://www.postgresql.org/docs/current/textsearch-dictionaries.html)) inside database:
 90 | ```sql
 91 | >> DROP TEXT SEARCH DICTIONARY IF EXISTS polish_hunspell CASCADE;
 92 |    CREATE TEXT SEARCH DICTIONARY polish_hunspell (
 93 |     TEMPLATE  = ispell,
 94 |     DictFile  = polish,
 95 |     AffFile   = polish,
 96 |     StopWords = polish
 97 |   );
 98 |   CREATE TEXT SEARCH CONFIGURATION public.polish (
 99 |     COPY = pg_catalog.english
100 |   );
101 |   ALTER TEXT SEARCH CONFIGURATION polish
102 |     ALTER MAPPING
103 |     FOR
104 |         asciiword, asciihword, hword_asciipart,  word, hword, hword_part
105 |     WITH
106 |         polish_hunspell, simple;
107 | 
108 | ```
109 | You need these files and configuration because full text search engine uses lexeme comparing to find best matches
110 | (both query pattern and stored text are lexemized):
111 | ```sql
112 | >> SELECT to_tsquery('english', 'fielded'), to_tsvector('english', text)
113 |    FROM document
114 |    LIMIT 1;
115 |  to_tsquery |                                                                    to_tsvector
116 | ------------+----------------------------------------------------------------------------------------------------------------------------------------------------
117 |  'field'    | '19':16 'bat':12 'dead':8 'degre':1 'depth':5 'field':15 'lamp':13 'men':6 'put':14 'ranch':2 'tall':4 'time':3 'underlin':11 'wast':10 'window':9
118 | ```
119 | If you cannot provide dictionary files you can use full text in "simple" form (without transformation to lexeme):
120 | 
121 | ```sql
122 | >> SELECT to_tsquery('simple', 'fielded'), to_tsvector('simple', text)
123 |    FROM document
124 |    LIMIT 1;
125 |  to_tsquery |                                                                             to_tsvector
126 | ------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------
127 |  'fielded'  | '19':16 'bat':12 'below':7 'dead':8 'degree':1 'depth':5 'field':15 'lamp':13 'men':6 'putting':14 'ranch':2 'tall':4 'time':3 'underline':11 'waste':10 'window':9
128 | ```
129 | 
130 | ## Tsearch full text search without stored index
131 | ```sql
132 | >> EXPLAIN ANALYZE SELECT text, language
133 |    FROM public.document
134 |    WHERE to_tsvector('english', text) @@ to_tsquery('english', 'fielded & window & lamp & depth & test ')
135 |    LIMIT 1;
136 |                                                                                   QUERY PLAN
137 | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
138 |  Limit  (cost=1000.00..18298.49 rows=1 width=103) (actual time=489.802..491.352 rows=0 loops=1)
139 |    ->  Gather  (cost=1000.00..18298.49 rows=1 width=103) (actual time=489.800..491.349 rows=0 loops=1)
140 |          Workers Planned: 1
141 |          Workers Launched: 1
142 |          ->  Parallel Seq Scan on document  (cost=0.00..17298.39 rows=1 width=103) (actual time=486.644..486.644 rows=0 loops=2)
143 |                Filter: (((language)::text = 'en'::text) AND (to_tsvector('english'::regconfig, text) @@ '''field'' & ''window'' & ''lamp'' & ''depth'' & ''test'''::tsquery))
144 |                Rows Removed by Filter: 50000
145 |  Planning Time: 0.272 ms
146 |  Execution Time: 491.376 ms
147 | (9 rows)
148 | ```
149 | 
150 | ## Tsearch full text search with stored partial index
151 | Partial index gives as a possibility to store records in different languages using one table and query them effectively.
152 | ```sql
153 | >> CREATE INDEX ix_en_document_tsvector_text ON public.document USING gin (to_tsvector('english'::regconfig, text)) WHERE language = 'en';
154 | CREATED INDEX
155 | >> EXPLAIN ANALYZE SELECT text, language
156 |    FROM public.document
157 |    WHERE to_tsvector('english', text) @@ to_tsquery('english', 'fielded & window & lamp & depth & test ')
158 |    LIMIT 1;
159 |                                                                QUERY PLAN
160 | ----------------------------------------------------------------------------------------------------------------------------------------
161 |  Limit  (cost=1000.00..18151.43 rows=1 width=103) (actual time=487.120..488.569 rows=0 loops=1)
162 |    ->  Gather  (cost=1000.00..18151.43 rows=1 width=103) (actual time=487.117..488.567 rows=0 loops=1)
163 |          Workers Planned: 1
164 |          Workers Launched: 1
165 |          ->  Parallel Seq Scan on document  (cost=0.00..17151.33 rows=1 width=103) (actual time=484.418..484.419 rows=0 loops=2)
166 |                Filter: (to_tsvector('english'::regconfig, text) @@ '''field'' & ''window'' & ''lamp'' & ''depth'' & ''test'''::tsquery)
167 |                Rows Removed by Filter: 50000
168 |  Planning Time: 0.193 ms
169 |  Execution Time: 488.596 ms
170 | ```
171 | 
172 | No difference? Index has not been used... Why is it not working?
173 | Ohh, looks to the partial index [docs](https://www.postgresql.org/docs/current/indexes-partial.html):
174 | > However, keep in mind that the predicate must match the conditions used in the queries that are supposed to benefit from the index.
175 | > To be precise, a partial index can be used in a query only if the system can recognize that the WHERE condition of the query mathematically implies the predicate of the index.
176 | > PostgreSQL does not have a sophisticated theorem prover that can recognize mathematically equivalent expressions that are written in different forms.
177 | > (Not only is such a general theorem prover extremely difficult to create, it would probably be too slow to be of any real use.)
178 | > The system can recognize simple inequality implications, for example "x < 1" implies "x < 2";
179 | > otherwise the predicate condition must exactly match part of the query's WHERE condition or the index will not be recognized as usable.
180 | > Matching takes place at query planning time, not at run time. As a result, parameterized query clauses do not work with a partial index.
181 | 
182 | We have to add to query a condition that was used to create partial index: `document.language = 'en'`:
183 | ```sql
184 | >> EXPLAIN ANALYZE SELECT text, language
185 |    FROM public.document
186 |    WHERE
187 |       to_tsvector('english', text) @@ to_tsquery('english', 'fielded & window & lamp & depth & test ')
188 |       AND language = 'en'
189 |    LIMIT 1;                                                                           QUERY PLAN
190 | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
191 |  Limit  (cost=64.00..68.27 rows=1 width=103) (actual time=0.546..0.548 rows=0 loops=1)
192 |    ->  Bitmap Heap Scan on document  (cost=64.00..68.27 rows=1 width=103) (actual time=0.544..0.545 rows=0 loops=1)
193 |          Recheck Cond: ((to_tsvector('english'::regconfig, text) @@ '''field'' & ''window'' & ''lamp'' & ''depth'' & ''test'''::tsquery) AND ((language)::text = 'en'::text))
194 |          ->  Bitmap Index Scan on ix_en_document_tsvector_text  (cost=0.00..64.00 rows=1 width=0) (actual time=0.540..0.540 rows=0 loops=1)
195 |                Index Cond: (to_tsvector('english'::regconfig, text) @@ '''field'' & ''window'' & ''lamp'' & ''depth'' & ''test'''::tsquery)
196 |  Planning Time: 0.244 ms
197 |  Execution Time: 0.590 ms
198 | ```
199 | 
200 | ## Tsearch full text search for partial words
201 | `:*` operator enables prefix search. It can be useful to execute full text search during typing a word.
202 | ```sql
203 | >> EXPLAIN ANALYZE SELECT text, language
204 |    FROM public.document
205 |    WHERE
206 |       to_tsvector('english', text) @@ to_tsquery('english', 'fielded & window & l:*')
207 |       AND language = 'en'
208 |    LIMIT 1;
209 |                                                                    QUERY PLAN
210 | ------------------------------------------------------------------------------------------------------------------------------------------------
211 |  Bitmap Heap Scan on document  (cost=168.00..172.27 rows=1 width=102) (actual time=5.207..5.210 rows=4 loops=1)
212 |    Recheck Cond: ((to_tsvector('english'::regconfig, text) @@ '''field'' & ''window'' & ''l'':*'::tsquery) AND ((language)::text = 'en'::text))
213 |    Heap Blocks: exact=4
214 |    ->  Bitmap Index Scan on ix_en_document_tsvector_text  (cost=0.00..168.00 rows=1 width=0) (actual time=5.202..5.202 rows=4 loops=1)
215 |          Index Cond: (to_tsvector('english'::regconfig, text) @@ '''field'' & ''window'' & ''l'':*'::tsquery)
216 |  Planning Time: 0.240 ms
217 |  Execution Time: 5.240 ms
218 | 
219 | >> SELECT id,  text
220 |    FROM public.document
221 |    WHERE
222 |       to_tsvector('english', text) @@ to_tsquery('english', 'fielded & window & l:*')
223 |       AND language = 'en'
224 |    LIMIT 20;
225 |   id   |                                                   text
226 | -------+-----------------------------------------------------------------------------------------------------------
227 |      1 | ﻿degree ranch time tall depth men below dead window waste underline bat lamp putting field               +
228 |  20152 | Law pony follow memory star whatever window sets oxygen longer word whom glass field actual              +
229 |  21478 | Dried symbol willing design managed shade window pick share faster education drive field land everybody  +
230 |  30293 | Pencil seen engineer labor image entire smallest serve field should riding smaller window imagine traffic+
231 | 
232 | ```
233 | 
234 | ## Tsearch full text search results ranking
235 | There are two quite similar functions to rank tsearch results:
236 | - `ts_rank`, that ranks vectors based on the frequency of their matching lexemes
237 | - `ts_rank_cd`, that computes the "cover density" ranking
238 | 
239 | For more info, see the [docs](https://www.postgresql.org/docs/13/textsearch-controls.html)
240 | ```sql
241 | >> SELECT
242 |      id,
243 |      ts_rank_cd(to_tsvector('english', text), to_tsquery('english', 'fielded & wind:*')) rank,
244 |      text
245 |     FROM public.document
246 |     WHERE to_tsvector('english', text) @@ to_tsquery('english', 'fielded & wind:*')
247 |     ORDER BY rank DESC
248 |     LIMIT 20;
249 |    id   |    rank     |                                                   text
250 | --------+-------------+-----------------------------------------------------------------------------------------------------------
251 |  100002 |         0.1 | fielded window
252 |    9376 |        0.05 | Own mouse girl effect surprise physical newspaper forgot eat upper field element window simply unhappy   +
253 |   96597 |        0.05 | Opinion fastened pencil rear more theory size window heading field understanding farm up position attack +
254 |   44626 | 0.033333335 | Symbol each halfway window swam spider field page shinning donkey chose until cow cabin congress         +
255 |   80922 | 0.033333335 | Victory famous field shelter girl wind adventure he divide rear tip few studied ruler judge              +
256 |   30293 |       0.025 | Pencil seen engineer labor image entire smallest serve field should riding smaller window imagine traffic+
257 |       1 | 0.016666668 | ﻿degree ranch time tall depth men below dead window waste underline bat lamp putting field               +
258 |   21478 | 0.016666668 | Dried symbol willing design managed shade window pick share faster education drive field land everybody  +
259 |   60059 | 0.016666668 | However hungry make proud kids come willing field officer row above highest round wind mile              +
260 |   26001 | 0.014285714 | Earth earlier pocket might sense window way frog fire court family mouth field somebody recognize        +
261 |   20152 | 0.014285714 | Law pony follow memory star whatever window sets oxygen longer word whom glass field actual              +
262 |   37470 |      0.0125 | Farm weight balloon buried wind water donkey grain pig week should damage field was he                   +
263 |   49433 |        0.01 | Wind scientist leaving atom year bad child drink shore spirit field facing indicate wagon here           +
264 |   37851 | 0.007142857 | Field cloud you wife rhythm upward applied weigh continued property replace ahead forgotten trip window  +
265 | 
266 | ```
267 | `text='fielded window'` record was added manually to show best match result.
268 | 
269 | ## GIST vs GIN
270 | We have created GIN index. But there is also GIST index option. Which one is better?
271 | It depends...
272 | ```sql
273 | >> EXPLAIN ANALYZE SELECT text, language
274 |    FROM public.document
275 |    WHERE
276 |       to_tsvector('english', text) @@ to_tsquery('english', 'fielded & window & lamp & depth & test ')
277 |       AND language = 'en'
278 |    LIMIT 1;
279 |                                                                   QUERY PLAN
280 | -----------------------------------------------------------------------------------------------------------------------------------------------
281 |  Limit  (cost=0.28..8.30 rows=1 width=103) (actual time=2.699..2.700 rows=0 loops=1)
282 |    ->  Index Scan using ix_en_document_tsvector_text on document  (cost=0.28..8.30 rows=1 width=103) (actual time=2.697..2.697 rows=0 loops=1)
283 |          Index Cond: (to_tsvector('english'::regconfig, text) @@ '''field'' & ''window'' & ''lamp'' & ''depth'' & ''test'''::tsquery)
284 |  Planning Time: 0.274 ms
285 |  Execution Time: 2.730 ms
286 |  ```
287 |  GIN seems to be a little bit faster. I don't think I could explain it better than the [docs](https://www.postgresql.org/docs/13/textsearch-indexes.html) already does:
288 | > In choosing which index type to use, GiST or GIN, consider these performance differences:
289 | > - GIN index lookups are about three times faster than GiST
290 | > - GIN indexes take about three times longer to build than GiST
291 | > - GIN indexes are moderately slower to update than GiST indexes, but about 10 times slower if fast-update support was disabled (see Section 58.4.1 for details)
292 | > - GIN indexes are two-to-three times larger than GiST indexes
293 | 
294 | ## Inspiration and help
295 | - https://about.gitlab.com/blog/2016/03/18/fast-search-using-postgresql-trigram-indexes/
296 | - http://rachbelaid.com/postgres-full-text-search-is-good-enough/
297 | - https://scoutapm.com/blog/how-to-make-text-searches-in-postgresql-faster-with-trigram-similarity
298 | - https://stackoverflow.com/questions/27443950/make-postgres-full-text-search-tsvector-act-like-ilike-to-search-inside-words
299 | - https://stackoverflow.com/questions/46122175/fulltext-search-combined-with-fuzzysearch-in-postgresql
300 | - https://stackoverflow.com/questions/58651852/use-postgresql-full-text-search-to-fuzzy-match-all-search-terms
301 | - https://stackoverflow.com/questions/52140727/fuzzy-search-in-full-text-search
302 | - https://stackoverflow.com/questions/2513501/postgresql-full-text-search-how-to-search-partial-words
303 | - https://stackoverflow.com/questions/28975517/difference-between-gist-and-gin-index
304 | - https://dba.stackexchange.com/questions/149765/postgresql-gin-index-not-used-when-ts-query-language-is-fetched-from-a-column
305 | - https://dba.stackexchange.com/questions/251177/postgres-full-text-search-on-words-not-lexemes
306 | 


--------------------------------------------------------------------------------
/dev_requirements.txt:
--------------------------------------------------------------------------------
1 | black==21.12b0
2 | flake8==4.0.1
3 | isort==5.10.1
4 | pre-commit==2.17.0
5 | pytest==6.2.5
6 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.isort]
2 | profile = "black"
3 | multi_line_output = 3
4 | src_paths = "src/"
5 | 
6 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | sqlalchemy==1.4.31
2 | psycopg2==2.9.3


--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jorzel/postgres-full-text-search/9aeed6a05c74c7c01d2f8678d00b7f136fb2355d/src/__init__.py


--------------------------------------------------------------------------------
/src/db.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import re
 3 | 
 4 | from sqlalchemy import Column, Integer, MetaData, create_engine
 5 | from sqlalchemy.ext.declarative import declarative_base, declared_attr
 6 | from sqlalchemy.orm import configure_mappers
 7 | 
 8 | logger = logging.getLogger(__name__)
 9 | 
10 | pattern = re.compile(r"(?<!^)(?=[A-Z])")
11 | 
12 | 
13 | def polish_text_search_configuration_command():
14 |     return """
15 |         DROP TEXT SEARCH DICTIONARY IF EXISTS polish_hunspell CASCADE;
16 |         CREATE TEXT SEARCH DICTIONARY polish_hunspell (
17 |             TEMPLATE  = ispell,
18 |             DictFile  = polish,
19 |             AffFile   = polish,
20 |             StopWords = polish
21 |         );
22 |         CREATE TEXT SEARCH CONFIGURATION public.polish (
23 |             COPY = pg_catalog.english
24 |         );
25 |         ALTER TEXT SEARCH CONFIGURATION polish
26 |             ALTER MAPPING
27 |             FOR
28 |                 asciiword, asciihword, hword_asciipart,  word, hword, hword_part
29 |             WITH
30 |                 polish_hunspell, simple;
31 |     """
32 | 
33 | 
34 | def toregconfig_function_command():
35 |     return """
36 |          CREATE OR REPLACE FUNCTION toregconfig(text)
37 |          RETURNS regconfig AS 'select (
38 |             case
39 |             when $1=''fr'' then ''french''
40 |             when $1=''es'' then ''spanish''
41 |             when $1=''pl'' then ''polish''
42 |             when $1=''ru'' then ''russian''
43 |             when $1=''en'' then ''english''
44 |             else ''simple''
45 |             end
46 |         )::regconfig;' LANGUAGE SQL IMMUTABLE
47 |         RETURNS NULL ON NULL INPUT;
48 |     """
49 | 
50 | 
51 | def camel_case_to_underscore(name: str) -> str:
52 |     return pattern.sub("_", name).lower()
53 | 
54 | 
55 | class Base:
56 |     """
57 |     Abstract model providing basic fields and tablename
58 |     """
59 | 
60 |     @declared_attr
61 |     def __tablename__(cls):
62 |         return camel_case_to_underscore(cls.__name__).lower()
63 | 
64 |     id = Column(Integer, primary_key=True, autoincrement=True)
65 | 
66 | 
67 | SQLALCHEMY_DATABASE_URI = "postgresql://postgres:postgres@localhost:5432/ftdb"
68 | 
69 | engine = create_engine(SQLALCHEMY_DATABASE_URI)
70 | metadata = MetaData(bind=engine)
71 | BaseModel = declarative_base(cls=Base, metadata=metadata)
72 | 
73 | 
74 | from models import Document  # noqa
75 | 
76 | configure_mappers()
77 | 


--------------------------------------------------------------------------------
/src/filtering.py:
--------------------------------------------------------------------------------
 1 | from sqlalchemy import func
 2 | from sqlalchemy.orm import Query, Session
 3 | 
 4 | from models import Document
 5 | 
 6 | 
 7 | def filter_documents(session: Session, language: str, search: str) -> Query:
 8 |     filter_args = []
 9 |     filter_args.append(Document.language == language)
10 |     if search:
11 |         words = search.split()
12 |         if search[-1] != " ":
13 |             # no whitespace at the end means that word is still typed and not completed
14 |             words[-1] = f"{words[-1]}:*"
15 |         ts_pattern = "&".join(words)
16 |         tsquery = func.to_tsquery(func.toregconfig(language), ts_pattern)
17 |         filter_args.append(Document.tsvector_text.op("@@")(tsquery))
18 |     return session.query(Document).filter(*filter_args)
19 | 


--------------------------------------------------------------------------------
/src/init_db.py:
--------------------------------------------------------------------------------
 1 | from sqlalchemy.orm import sessionmaker
 2 | 
 3 | from db import (
 4 |     engine,
 5 |     metadata,
 6 |     polish_text_search_configuration_command,
 7 |     toregconfig_function_command,
 8 | )
 9 | from models import Document
10 | 
11 | metadata.drop_all()
12 | engine.execute(toregconfig_function_command())
13 | engine.execute(polish_text_search_configuration_command())
14 | metadata.create_all()
15 | print("Database initialized")
16 | 
17 | Session = sessionmaker(bind=engine)
18 | 
19 | with Session() as session:
20 |     with open("dataset.txt", mode="rt") as textfile:
21 |         for row in textfile.readlines():
22 |             d = Document(
23 |                 text=row.replace('"', "").replace(",", " ").capitalize(),
24 |                 language="en",
25 |             )
26 |             session.add(d)
27 |     session.commit()
28 | print("Dataset loaded")
29 | 


--------------------------------------------------------------------------------
/src/models.py:
--------------------------------------------------------------------------------
 1 | from sqlalchemy import Column, Index, String, Text, func
 2 | 
 3 | from db import BaseModel
 4 | 
 5 | 
 6 | class Document(BaseModel):
 7 |     text = Column(Text, nullable=False, default="", server_default="")
 8 |     language = Column(String, nullable=False, default="en", server_default="en")
 9 | 
10 |     tsvector_text = func.to_tsvector(func.toregconfig(language), text)
11 | 
12 |     __table_args__ = (
13 |         Index(
14 |             "ix_pl_document_tsvector_text",
15 |             func.to_tsvector("polish", text),
16 |             postgresql_using="gin",
17 |             postgresql_where=(language == "pl"),
18 |         ),
19 |         Index(
20 |             "ix_en_document_tsvector_text",
21 |             func.to_tsvector("english", text),
22 |             postgresql_using="gin",
23 |             postgresql_where=(language == "en"),
24 |         ),
25 |     )
26 | 
27 |     def __str__(self):
28 |         return f"Document(text={self.text}, language={self.language}"
29 | 
30 |     __repr__ = __str__
31 | 


--------------------------------------------------------------------------------
/src/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from sqlalchemy import create_engine
 3 | from sqlalchemy.orm import Session, sessionmaker
 4 | 
 5 | from db import (
 6 |     BaseModel,
 7 |     polish_text_search_configuration_command,
 8 |     toregconfig_function_command,
 9 | )
10 | from models import Document
11 | 
12 | 
13 | @pytest.fixture(scope="session")
14 | def model_base():
15 |     return BaseModel
16 | 
17 | 
18 | @pytest.yield_fixture(scope="session")
19 | def db_connection(model_base):
20 |     SQLALCHEMY_DATABASE_URI = "postgresql://postgres:postgres@localhost:5432/testdb"
21 |     engine = create_engine(SQLALCHEMY_DATABASE_URI)
22 |     model_base.metadata.bind = engine
23 |     model_base.metadata.drop_all()
24 |     engine.execute(toregconfig_function_command())
25 |     engine.execute(polish_text_search_configuration_command())
26 |     model_base.metadata.create_all()
27 |     connection = engine.connect()
28 | 
29 |     yield connection
30 | 
31 |     model_base.metadata.drop_all()
32 |     engine.dispose()
33 | 
34 | 
35 | @pytest.yield_fixture
36 | def db_session(db_connection):
37 |     transaction = db_connection.begin()
38 |     session = sessionmaker(bind=db_connection, class_=Session)
39 |     db_session = session()
40 | 
41 |     yield db_session
42 | 
43 |     transaction.rollback()
44 |     db_session.close()
45 | 
46 | 
47 | @pytest.fixture
48 | def document_factory(db_session: Session):
49 |     def _document_factory(text: str, language: str):
50 |         document = Document(text=text, language=language)
51 |         db_session.add(document)
52 |         db_session.flush()
53 |         return document
54 | 
55 |     yield _document_factory
56 | 


--------------------------------------------------------------------------------
/src/tests/test_searching.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from filtering import filter_documents
 4 | 
 5 | 
 6 | @pytest.mark.parametrize(
 7 |     "text, search_pattern",
 8 |     [
 9 |         ("benefit", "benefits"),
10 |         ("benefits", "benefit"),
11 |         ("work", "working"),
12 |         ("working hours", "work hour"),
13 |         ("checking type", "check types"),
14 |         ("Best person at the world", "best world"),
15 |         ("I like swimming and fishing. I have finished a book about it", "swim finish"),
16 |     ],
17 | )
18 | def test_filter_full_text_search_en(text, search_pattern, db_session, document_factory):
19 |     language = "en"
20 |     document = document_factory(text=text, language=language)
21 |     _ = document_factory(text="", language=language)
22 | 
23 |     results = filter_documents(db_session, language, search=search_pattern)
24 | 
25 |     assert results.count() == 1
26 |     assert results.first() == document
27 | 
28 | 
29 | @pytest.mark.parametrize(
30 |     "text, search_pattern",
31 |     [
32 |         ("benefit", "benef"),
33 |         ("working hours", "work hou"),
34 |         ("checking type", "check t"),
35 |         ("Best person at the world", "best worl"),
36 |         ("I like swimming and fishing. I have finished a book about it", "swim fin"),
37 |     ],
38 | )
39 | def test_filter_partial_words_full_text_search_en(
40 |     text, search_pattern, db_session, document_factory
41 | ):
42 |     language = "en"
43 |     document = document_factory(text=text, language=language)
44 |     _ = document_factory(text="", language=language)
45 | 
46 |     results = filter_documents(db_session, language, search=search_pattern)
47 | 
48 |     assert results.count() == 1
49 |     assert results.first() == document
50 | 
51 | 
52 | @pytest.mark.parametrize(
53 |     "text, search_pattern",
54 |     [
55 |         ("test", "test"),
56 |         ("Benefity w firmie", "benefit"),
57 |         ("Benefity w firmie", "firma benefity"),
58 |         ("Praca zdalna w firmie", "praca firma"),
59 |         ("Benefity", "benefit"),
60 |         ("Delegacje", "delegacja"),
61 |         ("Praca zdalna", "zdalnie"),
62 |         ("Czytanie jest bardzo ważne. Zgadzasz się?", "czytać zgadzać"),
63 |     ],
64 | )
65 | def test_filter_full_text_search_pl(text, search_pattern, db_session, document_factory):
66 |     language = "pl"
67 |     document = document_factory(text=text, language=language)
68 |     _ = document_factory(text="", language=language)
69 | 
70 |     results = filter_documents(db_session, language, search=search_pattern)
71 | 
72 |     assert results.count() == 1
73 |     assert results.first() == document
74 | 


--------------------------------------------------------------------------------
/tsearch_data/polish.stop:
--------------------------------------------------------------------------------
  1 | a
  2 | aby
  3 | ach
  4 | acz
  5 | aczkolwiek
  6 | aj
  7 | albo
  8 | ale
  9 | alez
 10 | ależ
 11 | ani
 12 | az
 13 | aż
 14 | bardziej
 15 | bardzo
 16 | beda
 17 | bedzie
 18 | deda
 19 | będą
 20 | bede
 21 | będę
 22 | będzie
 23 | bo
 24 | bowiem
 25 | by
 26 | byc
 27 | być
 28 | byl
 29 | byla
 30 | byli
 31 | bylo
 32 | byly
 33 | był
 34 | była
 35 | było
 36 | były
 37 | bynajmniej
 38 | cala
 39 | cali
 40 | caly
 41 | cała
 42 | cały
 43 | ci
 44 | cie
 45 | ciebie
 46 | cię
 47 | co
 48 | cokolwiek
 49 | cos
 50 | coś
 51 | czasami
 52 | czasem
 53 | czemu
 54 | czy
 55 | czyli
 56 | daleko
 57 | dla
 58 | dlaczego
 59 | dlatego
 60 | do
 61 | dobrze
 62 | dokad
 63 | dokąd
 64 | dosc
 65 | dość
 66 | duzo
 67 | dużo
 68 | dwa
 69 | dwaj
 70 | dwie
 71 | dwoje
 72 | dzis
 73 | dzisiaj
 74 | dziś
 75 | gdy
 76 | gdyby
 77 | gdyz
 78 | gdyż
 79 | gdzie
 80 | gdziekolwiek
 81 | gdzies
 82 | gdzieś
 83 | i
 84 | ich
 85 | ile
 86 | im
 87 | inna
 88 | inne
 89 | inny
 90 | innych
 91 | iz
 92 | iż
 93 | ja
 94 | jak
 95 | jakas
 96 | jakaś
 97 | jakby
 98 | jaki
 99 | jakichs
100 | jakichś
101 | jakie
102 | jakis
103 | jakiś
104 | jakiz
105 | jakiż
106 | jakkolwiek
107 | jako
108 | jakos
109 | jakoś
110 | ją
111 | je
112 | jeden
113 | jedna
114 | jednak
115 | jednakze
116 | jednakże
117 | jedno
118 | jego
119 | jej
120 | jemu
121 | jesli
122 | jest
123 | jestem
124 | jeszcze
125 | jeśli
126 | jezeli
127 | jeżeli
128 | juz
129 | już
130 | kazdy
131 | każdy
132 | kiedy
133 | kilka
134 | kims
135 | kimś
136 | kto
137 | ktokolwiek
138 | ktora
139 | ktore
140 | ktorego
141 | ktorej
142 | ktory
143 | ktorych
144 | ktorym
145 | ktorzy
146 | ktos
147 | ktoś
148 | która
149 | które
150 | którego
151 | której
152 | który
153 | których
154 | którym
155 | którzy
156 | ku
157 | lat
158 | lecz
159 | lub
160 | ma
161 | mają
162 | mało
163 | mam
164 | mi
165 | miedzy
166 | między
167 | mimo
168 | mna
169 | mną
170 | mnie
171 | moga
172 | mogą
173 | moi
174 | moim
175 | moj
176 | moja
177 | moje
178 | moze
179 | mozliwe
180 | mozna
181 | może
182 | możliwe
183 | można
184 | mój
185 | mu
186 | musi
187 | my
188 | na
189 | nad
190 | nam
191 | nami
192 | nas
193 | nasi
194 | nasz
195 | nasza
196 | nasze
197 | naszego
198 | naszych
199 | natomiast
200 | natychmiast
201 | nawet
202 | nia
203 | nią
204 | nic
205 | nich
206 | nie
207 | niech
208 | niego
209 | niej
210 | niemu
211 | nigdy
212 | nim
213 | nimi
214 | niz
215 | niż
216 | no
217 | o
218 | obok
219 | od
220 | około
221 | on
222 | ona
223 | one
224 | oni
225 | ono
226 | oraz
227 | oto
228 | owszem
229 | pan
230 | pana
231 | pani
232 | po
233 | pod
234 | podczas
235 | pomimo
236 | ponad
237 | poniewaz
238 | ponieważ
239 | powinien
240 | powinna
241 | powinni
242 | powinno
243 | poza
244 | prawie
245 | przeciez
246 | przecież
247 | przed
248 | przede
249 | przedtem
250 | przez
251 | przy
252 | roku
253 | rowniez
254 | również
255 | sama
256 | są
257 | sie
258 | się
259 | skad
260 | skąd
261 | soba
262 | sobą
263 | sobie
264 | sposob
265 | sposób
266 | swoje
267 | ta
268 | tak
269 | taka
270 | taki
271 | takie
272 | takze
273 | także
274 | tam
275 | te
276 | tego
277 | tej
278 | ten
279 | teraz
280 | też
281 | to
282 | toba
283 | tobą
284 | tobie
285 | totez
286 | toteż
287 | totobą
288 | trzeba
289 | tu
290 | tutaj
291 | twoi
292 | twoim
293 | twoj
294 | twoja
295 | twoje
296 | twój
297 | twym
298 | ty
299 | tych
300 | tylko
301 | tym
302 | u
303 | w
304 | wam
305 | wami
306 | was
307 | wasz
308 | wasza
309 | wasze
310 | we
311 | według
312 | wiele
313 | wielu
314 | więc
315 | więcej
316 | wlasnie
317 | właśnie
318 | wszyscy
319 | wszystkich
320 | wszystkie
321 | wszystkim
322 | wszystko
323 | wtedy
324 | wy
325 | z
326 | za
327 | zaden
328 | zadna
329 | zadne
330 | zadnych
331 | zapewne
332 | zawsze
333 | ze
334 | zeby
335 | zeznowu
336 | zł
337 | znow
338 | znowu
339 | znów
340 | zostal
341 | został
342 | żaden
343 | żadna
344 | żadne
345 | żadnych
346 | że
347 | żeby
348 | 


--------------------------------------------------------------------------------