├── .github
└── workflows
│ └── ci.yml
├── .gitignore
├── .readthedocs.yaml
├── LICENSE.rst
├── Makefile
├── README.rst
├── advanced
├── admin_notes.rst
├── custom_weighting.rst
├── index.rst
├── postingsource.rst
├── replication.rst
├── scalability.rst
├── serialisation.rst
└── unigramlm.rst
├── attic
├── clustering.rst
├── eset.rst
├── geospatial.rst
├── pagination.rst
├── query_authorisation.rst
├── range_performance.rst
└── remote.rst
├── code
├── c++
│ ├── .gitignore
│ ├── delete1.cc
│ ├── index1.cc
│ ├── index_facets.cc
│ ├── index_filters.cc
│ ├── index_ranges.cc
│ ├── index_ranges2.cc
│ ├── index_values_with_geo.cc
│ ├── search1.cc
│ ├── search_facets.cc
│ ├── search_filters.cc
│ ├── search_filters2.cc
│ ├── search_ranges.cc
│ ├── search_ranges2.cc
│ ├── search_sorting.cc
│ ├── search_sorting2.cc
│ ├── search_sorting3.cc
│ ├── search_synonyms.cc
│ ├── support.cc
│ └── support.h
├── expected.out
│ ├── delete1.out
│ ├── index1.db_title=3asunwatch.out
│ ├── index1.out
│ ├── index_facets.out
│ ├── index_filters.out
│ ├── index_ranges.out
│ ├── index_ranges2.out
│ ├── index_values_with_geo.out
│ ├── search1.db_Dent_watch.out
│ ├── search1.db_description=3a=5c=22leather_case=5c=22_AND_title=3asundial.out
│ ├── search1.db_watch.out
│ ├── search1.out
│ ├── search_facets.out
│ ├── search_filters.out
│ ├── search_filters2.out
│ ├── search_ranges.db_..50mm.out
│ ├── search_ranges.db_1000..mm_1800..1899.out
│ ├── search_ranges.db_1980..1989.out
│ ├── search_ranges.db_clock_1960...out
│ ├── search_ranges2.statesdb_10000000...out
│ ├── search_ranges2.statesdb_11=2f08=2f1889..07=2f10=2f1890.out
│ ├── search_ranges2.statesdb_1780..1789_10000000...out
│ ├── search_ranges2.statesdb_1800..1899.out
│ ├── search_ranges2.statesdb_spanish.out
│ ├── search_sorting.out
│ ├── search_sorting2.out
│ ├── search_sorting3.out
│ ├── search_synonyms.db_time.out
│ └── search_synonyms.out
├── java
│ ├── .gitignore
│ ├── delete1.java
│ ├── index1.java
│ ├── index1.java.data=2f100-objects-v1.csv_db.out
│ ├── search1.java
│ └── support.java
├── perl
│ ├── Support.pm
│ ├── delete1.pl
│ ├── index1.pl
│ ├── index_facets.pl
│ ├── index_filters.pl
│ ├── index_ranges.pl
│ ├── index_ranges2.pl
│ ├── search1.pl
│ ├── search_facets.pl
│ ├── search_filters.pl
│ ├── search_filters2.pl
│ ├── search_sorting.pl
│ ├── search_sorting2.pl
│ ├── search_synonyms.pl
│ └── strings.t
├── php
│ ├── delete1.php
│ ├── index1.php
│ ├── index_facets.php
│ ├── index_filters.php
│ ├── logger.php
│ ├── parsecsv.php
│ ├── search1.php
│ ├── search_facets.php
│ ├── search_filters.php
│ └── search_filters2.php
├── python
│ ├── delete1.py
│ ├── from_wikipedia.py
│ ├── index1.py
│ ├── index_facets.py
│ ├── index_filters.py
│ ├── index_ranges.py
│ ├── index_ranges2.py
│ ├── index_sorting.py
│ ├── index_values_with_geo.py
│ ├── postingsource.py
│ ├── search1.py
│ ├── search_facets.py
│ ├── search_filters.py
│ ├── search_filters2.py
│ ├── search_ranges.py
│ ├── search_ranges2.py
│ ├── search_sorting.py
│ ├── search_sorting2.py
│ ├── search_sorting3.py
│ ├── search_synonyms.py
│ └── support.py
├── python3
│ ├── delete1.py
│ ├── index1.py
│ ├── index_facets.py
│ ├── index_filters.py
│ ├── index_ranges.py
│ ├── index_ranges2.py
│ ├── index_sorting.py
│ ├── index_values_with_geo.py
│ ├── postingsource.py
│ ├── search1.py
│ ├── search_facets.py
│ ├── search_filters.py
│ ├── search_filters2.py
│ ├── search_ranges.py
│ ├── search_ranges2.py
│ ├── search_sorting.py
│ ├── search_sorting2.py
│ ├── search_sorting3.py
│ ├── search_synonyms.py
│ └── support.py
└── ruby
│ ├── delete1.rb
│ ├── index1.rb
│ ├── index_facets.rb
│ ├── index_filters.rb
│ ├── index_ranges.rb
│ ├── index_ranges2.rb
│ ├── index_values_with_geo.rb
│ ├── search1.rb
│ ├── search_facets.rb
│ ├── search_filters.rb
│ ├── search_filters2.rb
│ ├── search_ranges.rb
│ ├── search_ranges2.rb
│ ├── search_sorting.rb
│ ├── search_sorting2.rb
│ ├── search_sorting3.rb
│ ├── search_synonyms.rb
│ └── support.rb
├── concepts
├── concurrency.rst
├── index.rst
├── indexing
│ ├── databases.rst
│ ├── documents.rst
│ ├── index.rst
│ ├── limitations.rst
│ ├── termgenerator.rst
│ ├── terms.rst
│ ├── uniqueness.rst
│ └── values.rst
├── introduction.rst
└── search
│ ├── index.rst
│ ├── queries.rst
│ ├── queryparser.rst
│ ├── ranked_matches.rst
│ └── search_limitations.rst
├── conf.py
├── data
├── 100-objects-v1.csv
├── 100-objects-v2.csv
├── states.csv
└── us_states_on_wikipedia
├── deprecation
├── deprecation.rst
├── features_deprecated.rst
├── features_removed.rst
└── index.rst
├── glossary.rst
├── howtos
├── boolean_filters.rst
├── collapsing.rst
├── facets.rst
├── index.rst
├── iterate_all_docs.rst
├── range_queries.rst
├── sorting.rst
├── spelling.rst
├── synonyms.rst
└── weighting_scheme.rst
├── index.rst
├── language_specific.rst
├── language_specific
├── c++
│ └── index.rst
├── csharp
│ └── index.rst
├── java
│ ├── index.rst
│ └── running_examples.rst
├── lua
│ └── index.rst
├── perl
│ └── index.rst
├── php
│ └── index.rst
├── python
│ └── index.rst
├── python3
│ └── index.rst
├── ruby
│ └── index.rst
└── tcl
│ └── index.rst
├── oldmanual
└── output.txt
├── overview.rst
├── practical_example
├── index.rst
├── indexing
│ ├── building_a_museum_catalogue.rst
│ ├── index.rst
│ ├── index_plan.rst
│ ├── updating_the_database.rst
│ ├── verifying_the_index.rst
│ ├── what_data_is_there.rst
│ ├── what_do_people_want_to_search_for.rst
│ └── writing_the_code.rst
└── searching
│ ├── building.rst
│ ├── database_modified.rst
│ ├── index.rst
│ ├── prefix.rst
│ └── running_the_search.rst
└── requirements.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | *~
2 | .*.sw?
3 | *.pyc
4 | _build
5 | ENV
6 | db
7 | statesdb
8 |
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yaml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 |
8 | # Set the version of Python and other tools you might need
9 | build:
10 | os: ubuntu-22.04
11 | tools:
12 | python: "3.11"
13 | jobs:
14 | # We want to replace the build command itself but the "Override the build
15 | # process" feature means we have to reimplement everything, so instead we
16 | # "Extend the build process" and set up a dummy no-op conf.py in
17 | # `pre_build` so `build` doesn't do anything, then actually build in
18 | # `post_build`.
19 | pre_build:
20 | - mkdir dummy
21 | - touch dummy/conf.py dummy/contents.rst
22 | post_build:
23 | - rm -rf dummy _readthedocs/html
24 | - make html LANGUAGE=python3 BUILDDIR=_readthedocs
25 |
26 | # (Don't) build documentation with Sphinx
27 | sphinx:
28 | configuration: dummy/conf.py
29 |
30 | # We recommend specifying your dependencies to enable reproducible builds:
31 | # https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
32 | python:
33 | install:
34 | - requirements: requirements.txt
35 |
--------------------------------------------------------------------------------
/LICENSE.rst:
--------------------------------------------------------------------------------
1 | License and authors
2 | ===================
3 |
4 | This license applies to all documentation and example code in this book.
5 | Data sets are provided under suitable Creative Commons licenses.
6 |
7 | | Copyright (c) 2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016 Olly Betts
8 | | Copyright (c) 2006,2007,2008,2009 Lemur Consulting Ltd
9 | | Copyright (c) 2007 Deron Meranda
10 | | Copyright (c) 2007 Jenny Black
11 | | Copyright (c) 2010,2011 Richard Boulton
12 | | Copyright (c) 2011 Justin Finkelstein
13 | | Copyright (c) 2011,2012 Dan Colish
14 | | Copyright (c) 2003,2006,2011,2012,2013,2014 James Aylett
15 | | Copyright (c) 2013 Aarsh Shah
16 | | Copyright (c) 2014 Jorge Carleitao
17 | | Copyright (c) 2014 Guarav Arora
18 | | Copyright (c) 2014 Assem Chelli
19 | | Copyright (c) 2014 Mayank Chaudhary
20 | | Copyright (c) 2016 Aakash Muttineni
21 | | Copyright (c) 2016 Vivek Pal
22 | | Copyright (c) 2016 Parth Gupta
23 | | Copyright (c) 2018 Marco Pessotto
24 |
25 |
26 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
27 |
28 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
29 |
30 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | Xapian documentation sprint
2 | ===========================
3 |
4 | This is the source for `Xapian's user
5 | guide <https://getting-started-with-xapian.readthedocs.io/>`_.
6 | Eventually this repository will be merged into the main Xapian tree.
7 |
8 | You will need the `Sphinx documentation tool <https://www.sphinx-doc.org/>`_
9 | installed to process this documentation. You can install the `python3-sphinx`
10 | or `python-sphinx` package on Debian, Fedora and Ubuntu, or ``pip install -r
11 | requirements.txt`` to install the python package directly.
12 |
13 | You can generate versions for different programming languages (with translated
14 | examples and adjustments to the text). For full details see ``make help``
15 | but for example to generate an HTML version for C++ use::
16 |
17 | make html LANGUAGE=c++
18 |
19 | The default if `LANGUAGE` isn't specified (e.g. when you run just ``make
20 | html``) is to build for `python3`.
21 |
22 | You can chat to us on Matrix or via our mailing lists. Links to
23 | these are `on our website <https://xapian.org/>`_.
24 |
--------------------------------------------------------------------------------
/advanced/index.rst:
--------------------------------------------------------------------------------
1 | Advanced features
2 | =================
3 |
4 | .. toctree::
5 | :maxdepth: 2
6 |
7 | postingsource
8 | unigramlm
9 | custom_weighting
10 | admin_notes
11 | scalability
12 | replication
13 | serialisation
14 |
--------------------------------------------------------------------------------
/attic/clustering.rst:
--------------------------------------------------------------------------------
1 | ==========
2 | Clustering
3 | ==========
4 |
5 | Document this once we have support for it.
6 |
--------------------------------------------------------------------------------
/attic/eset.rst:
--------------------------------------------------------------------------------
1 | ======================
2 | Building an expand set
3 | ======================
4 |
5 | .. todo::
6 |
7 | Introduction, rationale and example.
8 | Documentation of the algorithm used by Enquire::get_eset().
9 |
10 | See https://lists.xapian.org/pipermail/xapian-discuss/2008-February/005263.html
11 | for a short discussion on the size of the RSet to use.
12 |
--------------------------------------------------------------------------------
/attic/geospatial.rst:
--------------------------------------------------------------------------------
1 | Geospatial searches
2 | ===================
3 |
4 | Xapian's geospatial support is currently still a work in progress, and
5 | until it is generally available this documentation will either be
6 | empty or may be out of date.
7 |
8 | .. todo:: Write about how to index geolocation information, and how to use the
9 | geo posting sources and keymaker to sort by distance, bias results by
10 | distance, and limit results by distance. Discuss storing geo
11 | bounding box terms to accelerate distance-limited searches.
12 |
--------------------------------------------------------------------------------
/attic/pagination.rst:
--------------------------------------------------------------------------------
1 | Pagination
2 | ==========
3 |
4 | This will be a howto.
5 |
6 | .. todo:: pagination (ask for pagesize*#pages + 1)
7 | .. todo:: get_matches_estimated() and check_at_least
8 |
--------------------------------------------------------------------------------
/attic/query_authorisation.rst:
--------------------------------------------------------------------------------
1 | Query authorisation
2 | ===================
3 |
4 | Say you are building a system that allows people to write private
5 | diary entries, and only share them with specific people. You wouldn't
6 | want search to expose those entries to everyone, so you need to build
7 | understanding of your authorisation scheme into the search system.
8 |
9 | .. todo:: list up front the various methods
10 |
11 | .. todo:: mention that omindex now indexes Unix user and group permissions.
12 |
13 | Filtering results
14 | -----------------
15 |
16 | .. todo:: Discuss filtering results coming back from a query, and the problems
17 | with just doing that.
18 |
19 | Putting authorisation data into the search index
20 | ------------------------------------------------
21 |
22 | .. todo:: Discuss implementing auth schemes by indexing
23 | appropriate data.
24 |
25 | Hybrid schemes
26 | --------------
27 |
28 | .. todo:: Discuss hybrid schemes (implementing auth using
29 | indexed terms, and also filtering results).
30 |
31 | Timeliness of index authorisation
32 | ---------------------------------
33 |
34 | .. todo:: Discuss issues relating
35 | to updates (in particular, how fast does something need to be hidden
36 | if it is changed to being private).
37 |
--------------------------------------------------------------------------------
/attic/range_performance.rst:
--------------------------------------------------------------------------------
1 | Performance of Value Ranges
2 | ===========================
3 |
4 | If a value range is combined with a suitable term-based query (such as an
5 | `OP_AND` query over one or more terms), its performance impact will be less
6 | because the range operation will only have to run over the potential
7 | matches, which are reduced from the entire database by the term-based
8 | query.
9 |
10 | If, as well as using document values, you also convert groups of those
11 | values into terms, you can provide those term-based queries even when
12 | your users are only interested in a pure range search. For instance,
13 | consider the population information. If you divide the range of
14 | populations into a number of subranges, you can allocate a term to
15 | describe each. We'll use a prefix of `XP` (for "population") here.
16 |
17 | +------------------+------+
18 | | Population range | Term |
19 | +==================+======+
20 | | 0 - 10 million | XP0 |
21 | +------------------+------+
22 | | 10 - 20 million | XP1 |
23 | +------------------+------+
24 | | 20 - 30 million | XP2 |
25 | +------------------+------+
26 | | 30 - 40 million | XP3 |
27 | +------------------+------+
28 |
29 | Then you can use a custom :xapian-class:`RangeProcessor` to produce a
30 | query which uses :xapian-just-constant:`OP_VALUE_RANGE` to match the
31 | range exactly, but first limits the number of documents that this
32 | needs to consider by using the filter terms above. For instance, if the user asks
33 | for '..15000000', you can use :xapian-just-constant:`OP_FILTER` to limit
34 | the value range subquery to only considering documents matching a
35 | :xapian-just-constant:`OP_OR` subquery with terms `XP0` and `XP1`.
36 |
37 | .. todo:: possibly implementing this example would help make it more clear.
38 |
39 | .. todo:: Now ticket #663 is done and
40 | we have RangeProcessor, we can move this to advanced and the range
41 | queries howto should point here.
42 |
--------------------------------------------------------------------------------
/attic/remote.rst:
--------------------------------------------------------------------------------
1 | =============================
2 | Working with remote databases
3 | =============================
4 |
5 | .. contents:: Table of contents
6 |
7 | .. todo:: This needs writing from scratch; the previous document wasn't very good and is license entangled.
8 |
--------------------------------------------------------------------------------
/code/c++/.gitignore:
--------------------------------------------------------------------------------
1 | .libs
2 | *.lo
3 | *.o
4 | built
5 |
--------------------------------------------------------------------------------
/code/c++/delete1.cc:
--------------------------------------------------------------------------------
1 | #include
2 |
3 | #include
4 | #include
5 |
6 | using namespace std;
7 |
8 | // Start of example code.
9 | static void delete_docs(const string &dbpath, char ** identifiers)
10 | {
11 |     // Open the database we're deleting from (DB_OPEN: it must already exist).
12 |     Xapian::WritableDatabase db(dbpath, Xapian::DB_OPEN);
13 |
14 |     // identifiers must be NULL-terminated; main() passes argv + 2.
15 |     for (char ** id = identifiers; *id; ++id) {
16 |         // The term to delete by is the identifier with a "Q" prefix.
17 |         const string idterm = string("Q") + *id;
18 |         db.delete_document(idterm);
19 |     }
20 | }
21 | // End of example code.
22 |
23 | int main(int argc, char** argv) {
24 |     // With a database path and at least one identifier, do the deletion.
25 |     if (argc >= 3) {
26 |         delete_docs(argv[1], argv + 2);
27 |         return 0;
28 |     }
29 |     cerr << "Usage: " << argv[0] << " DBPATH ID...\n";
30 |     return 1;
31 | }
32 |
--------------------------------------------------------------------------------
/code/c++/search1.cc:
--------------------------------------------------------------------------------
1 | #include
2 |
3 | #include
4 | #include
5 | #include
6 | #include
7 |
8 | #include "support.h"
9 |
10 | using namespace std;
11 |
12 | // Start of example code.
13 | static void
14 | search(const string & dbpath, const string & querystring,
15 |        Xapian::doccount offset = 0, Xapian::doccount pagesize = 10)
16 | {
17 |     // offset and pagesize are handed to get_mset(): the rank of the first
18 |     // match to return and the maximum number of matches to return.
19 |
20 |     // Field number get_field() is asked for to obtain the title.
21 |     const size_t DOC_FIELD_TITLE = 1;
22 |
23 |     // Open the database we're going to search.
24 |     Xapian::Database db(dbpath);
25 |
26 |     // Configure a QueryParser: English stemmer plus the field prefixes.
27 |     Xapian::QueryParser queryparser;
28 |     queryparser.set_stemmer(Xapian::Stem("en"));
29 |     queryparser.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
30 |     // Start of prefix configuration.
31 |     queryparser.add_prefix("title", "S");
32 |     queryparser.add_prefix("description", "XD");
33 |     // End of prefix configuration.
34 |
35 |     // Turn the query string into a Query and run it with an Enquire object.
36 |     const Xapian::Query query = queryparser.parse_query(querystring);
37 |     Xapian::Enquire enquire(db);
38 |     enquire.set_query(query);
39 |     Xapian::MSet mset = enquire.get_mset(offset, pagesize);
40 |
41 |     // cout gets one "rank: #docid title" line per match, while clog gets a
42 |     // one-line summary: the query, the requested slice and the docids.
43 |     clog << "'" << querystring << "'[" << offset << ":";
44 |     clog << offset + pagesize << "] =";
45 |     for (Xapian::MSetIterator match = mset.begin(); match != mset.end();
46 |          ++match) {
47 |         const Xapian::docid docid = *match;
48 |         cout << match.get_rank() + 1 << ": #";
49 |         cout << setfill('0') << setw(3) << docid << ' ';
50 |
51 |         const string doc_data = match.get_document().get_data();
52 |         cout << get_field(doc_data, DOC_FIELD_TITLE) << '\n';
53 |         clog << ' ' << docid;
54 |     }
55 |     clog << '\n';
56 | }
57 | // End of example code.
58 |
59 | int main(int argc, char** argv) {
60 |     // A database path and at least one query term are required.
61 |     if (argc < 3) {
62 |         cerr << "Usage: " << argv[0] << " DBPATH QUERYTERM...\n";
63 |         return 1;
64 |     }
65 |
66 |     // Join argv[2] onwards with single spaces to make the query string.
67 |     string querystring;
68 |     for (int i = 2; i < argc; ++i) {
69 |         if (!querystring.empty()) querystring += ' ';
70 |         querystring += argv[i];
71 |     }
72 |
73 |     search(argv[1], querystring);
74 | }
75 |
--------------------------------------------------------------------------------
/code/c++/search_filters2.cc:
--------------------------------------------------------------------------------
1 | #include
2 |
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 |
9 | #include "support.h"
10 |
11 | using namespace std;
12 |
13 | static void
14 | search(const string & dbpath, const string & querystring,
15 |        Xapian::doccount offset = 0, Xapian::doccount pagesize = 10)
16 | {
17 |     // offset and pagesize are handed to get_mset(): the rank of the first
18 |     // match to return and the maximum number of matches to return.
19 |
20 |     // Field number get_field() is asked for to obtain the title.
21 |     const size_t DOC_FIELD_TITLE = 1;
22 |
23 |     // Open the database we're going to search.
24 |     Xapian::Database db(dbpath);
25 |
26 |     // Start of example code.
27 |     // Configure a QueryParser; "material" is registered as a boolean prefix.
28 |     Xapian::QueryParser queryparser;
29 |     queryparser.set_stemmer(Xapian::Stem("en"));
30 |     queryparser.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
31 |     queryparser.add_prefix("title", "S");
32 |     queryparser.add_prefix("description", "XD");
33 |     queryparser.add_boolean_prefix("material", "XM");
34 |
35 |     // And parse the query.
36 |     const Xapian::Query query = queryparser.parse_query(querystring);
37 |     // End of example code.
38 |
39 |     // Run the query with an Enquire object and fetch the requested slice.
40 |     Xapian::Enquire enquire(db);
41 |     enquire.set_query(query);
42 |     Xapian::MSet mset = enquire.get_mset(offset, pagesize);
43 |
44 |     // cout: one line per match; clog: a one-line summary with the docids.
45 |     clog << "'" << querystring << "'[" << offset << ":";
46 |     clog << offset + pagesize << "] =";
47 |     for (Xapian::MSetIterator match = mset.begin(); match != mset.end();
48 |          ++match) {
49 |         const Xapian::docid docid = *match;
50 |         cout << match.get_rank() + 1 << ": #";
51 |         cout << setfill('0') << setw(3) << docid << ' ';
52 |         const string doc_data = match.get_document().get_data();
53 |         cout << get_field(doc_data, DOC_FIELD_TITLE) << '\n';
54 |         clog << ' ' << docid;
55 |     }
56 |     clog << '\n';
57 | }
58 |
59 | int main(int argc, char** argv) {
60 |     // A database path and at least one query term are required.
61 |     if (argc < 3) {
62 |         cerr << "Usage: " << argv[0] << " DBPATH QUERYTERM...\n";
63 |         return 1;
64 |     }
65 |
66 |     // Join argv[2] onwards with single spaces to make the query string.
67 |     string querystring;
68 |     for (int i = 2; i < argc; ++i) {
69 |         if (!querystring.empty()) querystring += ' ';
70 |         querystring += argv[i];
71 |     }
72 |
73 |     search(argv[1], querystring);
74 | }
75 |
--------------------------------------------------------------------------------
/code/c++/search_synonyms.cc:
--------------------------------------------------------------------------------
1 | #include
2 |
3 | #include
4 | #include
5 | #include
6 | #include
7 |
8 | #include "support.h"
9 |
10 | using namespace std;
11 |
12 | // Start of example code.
13 | static void
14 | search(const string & dbpath, const string & querystring,
15 |        Xapian::doccount offset = 0, Xapian::doccount pagesize = 10)
16 | {
17 |     // offset and pagesize are handed to get_mset(): the rank of the first
18 |     // match to return and the maximum number of matches to return.
19 |
20 |     // Field number get_field() is asked for to obtain the title.
21 |     const size_t DOC_FIELD_TITLE = 1;
22 |
23 |     // Open the database writable: a synonym is added before searching.
24 |     Xapian::WritableDatabase db(dbpath);
25 |
26 |     // Start of adding synonyms
27 |     db.add_synonym("time", "calendar");
28 |     // End of adding synonyms
29 |
30 |     // Configure a QueryParser: English stemmer plus the field prefixes.
31 |     Xapian::QueryParser queryparser;
32 |     queryparser.set_stemmer(Xapian::Stem("en"));
33 |     queryparser.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
34 |     queryparser.add_prefix("title", "S");
35 |     queryparser.add_prefix("description", "XD");
36 |
37 |     // Start of set database
38 |     queryparser.set_database(db);
39 |     // End of set database
40 |
41 |     // Parse the query with FLAG_SYNONYM enabled on top of the defaults.
42 |     const unsigned flags = Xapian::QueryParser::FLAG_DEFAULT |
43 |                            Xapian::QueryParser::FLAG_SYNONYM;
44 |     const Xapian::Query query = queryparser.parse_query(querystring, flags);
45 |
46 |     // Run the query with an Enquire object and fetch the requested slice.
47 |     Xapian::Enquire enquire(db);
48 |     enquire.set_query(query);
49 |     Xapian::MSet mset = enquire.get_mset(offset, pagesize);
50 |
51 |     // cout: one line per match; clog: a one-line summary with the docids.
52 |     clog << "'" << querystring << "'[" << offset << ":";
53 |     clog << offset + pagesize << "] =";
54 |     for (Xapian::MSetIterator match = mset.begin(); match != mset.end();
55 |          ++match) {
56 |         const Xapian::docid docid = *match;
57 |         cout << match.get_rank() + 1 << ": #";
58 |         cout << setfill('0') << setw(3) << docid << ' ';
59 |         const string doc_data = match.get_document().get_data();
60 |         cout << get_field(doc_data, DOC_FIELD_TITLE) << '\n';
61 |         clog << ' ' << docid;
62 |     }
63 |     clog << '\n';
64 | }
65 | // End of example code.
66 |
67 | int main(int argc, char** argv) {
68 |     // A database path and at least one query term are required.
69 |     if (argc < 3) {
70 |         cerr << "Usage: " << argv[0] << " DBPATH QUERYTERM...\n";
71 |         return 1;
72 |     }
73 |
74 |     // Join argv[2] onwards with single spaces to make the query string.
75 |     string querystring;
76 |     for (int i = 2; i < argc; ++i) {
77 |         if (!querystring.empty()) querystring += ' ';
78 |         querystring += argv[i];
79 |     }
80 |
81 |     search(argv[1], querystring);
82 | }
83 |
--------------------------------------------------------------------------------
/code/c++/support.h:
--------------------------------------------------------------------------------
1 | // Prototypes for the helper functions shared by the C++ examples.
2 | // Only declarations live here; the definitions are in a separate file.
3 |
4 | #include <fstream>
5 | #include <string>
6 | #include <utility>
7 | #include <vector>
8 |
9 | bool csv_parse_line(std::ifstream & csv, std::vector<std::string> & fields);
10 |
11 | // The search examples call this with a document's data string and a field
12 | // number (1 for the title) and print the returned string.
13 | std::string get_field(const std::string & data, size_t field);
14 |
15 | bool max_number_in_string(const std::string & s, double *n_ptr);
16 |
17 | bool first_number_in_string(const std::string & s, double *n_ptr);
18 |
19 | std::string format_date(const std::string& yyyymmdd);
20 |
21 | std::string format_numeral(std::string n);
22 |
23 | // NOTE(review): element types restored as double to match the double
24 | // return type -- confirm against the definition.
25 | double distance_between_coords(const std::pair<double, double>& a,
26 |                                const std::pair<double, double>& b);
27 |
--------------------------------------------------------------------------------
/code/expected.out/delete1.out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xapian/xapian-docsprint/036dc8787a17b6a87d9c86fc4ef48a60f7abb3fa/code/expected.out/delete1.out
--------------------------------------------------------------------------------
/code/expected.out/index1.db_title=3asunwatch.out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xapian/xapian-docsprint/036dc8787a17b6a87d9c86fc4ef48a60f7abb3fa/code/expected.out/index1.db_title=3asunwatch.out
--------------------------------------------------------------------------------
/code/expected.out/index1.out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xapian/xapian-docsprint/036dc8787a17b6a87d9c86fc4ef48a60f7abb3fa/code/expected.out/index1.out
--------------------------------------------------------------------------------
/code/expected.out/index_facets.out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xapian/xapian-docsprint/036dc8787a17b6a87d9c86fc4ef48a60f7abb3fa/code/expected.out/index_facets.out
--------------------------------------------------------------------------------
/code/expected.out/index_filters.out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xapian/xapian-docsprint/036dc8787a17b6a87d9c86fc4ef48a60f7abb3fa/code/expected.out/index_filters.out
--------------------------------------------------------------------------------
/code/expected.out/index_ranges.out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xapian/xapian-docsprint/036dc8787a17b6a87d9c86fc4ef48a60f7abb3fa/code/expected.out/index_ranges.out
--------------------------------------------------------------------------------
/code/expected.out/index_ranges2.out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xapian/xapian-docsprint/036dc8787a17b6a87d9c86fc4ef48a60f7abb3fa/code/expected.out/index_ranges2.out
--------------------------------------------------------------------------------
/code/expected.out/index_values_with_geo.out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xapian/xapian-docsprint/036dc8787a17b6a87d9c86fc4ef48a60f7abb3fa/code/expected.out/index_values_with_geo.out
--------------------------------------------------------------------------------
/code/expected.out/search1.db_Dent_watch.out:
--------------------------------------------------------------------------------
1 | 1: #046 Model by Dent of mechanism for setting hands and winding up
2 | 2: #004 Watch with Chinese duplex escapement
3 | 3: #018 Solar/Sidereal verge watch with epicyclic maintaining power
4 | 4: #013 Watch timer by P
5 | 5: #094 Model of a Lever Escapement , 1850-1883
6 | 6: #093 Model of Graham's Cylinder Escapement, 1850-1883
7 | 7: #033 A device by Favag of Neuchatel which enables a stop watch to
8 | 8: #015 Ingersoll "Dan Dare" automaton pocket watch with pin-pallet
9 | 9: #086 Model representing Earnshaw's detent chronometer escapement, 1950-1883
10 | 10: #036 Universal 'Tri-Compax' chronographic wrist watch
11 | 'Dent watch'[0:10] = 46 4 18 13 94 93 33 15 86 36
12 |
--------------------------------------------------------------------------------
/code/expected.out/search1.db_description=3a=5c=22leather_case=5c=22_AND_title=3asundial.out:
--------------------------------------------------------------------------------
1 | 1: #055 Silver altitude sundial in leather case
2 | 'description:"leather case" AND title:sundial'[0:10] = 55
3 |
--------------------------------------------------------------------------------
/code/expected.out/search1.db_watch.out:
--------------------------------------------------------------------------------
1 | 1: #004 Watch with Chinese duplex escapement
2 | 2: #018 Solar/Sidereal verge watch with epicyclic maintaining power
3 | 3: #013 Watch timer by P
4 | 4: #033 A device by Favag of Neuchatel which enables a stop watch to
5 | 5: #015 Ingersoll "Dan Dare" automaton pocket watch with pin-pallet
6 | 6: #036 Universal 'Tri-Compax' chronographic wrist watch
7 | 7: #046 Model by Dent of mechanism for setting hands and winding up
8 | 'watch'[0:10] = 4 18 13 33 15 36 46
9 |
--------------------------------------------------------------------------------
/code/expected.out/search1.out:
--------------------------------------------------------------------------------
1 | 1: #001 Ansonia Sunwatch (pocket compas dial)
2 | 'title:sunwatch'[0:10] = 1
3 |
--------------------------------------------------------------------------------
/code/expected.out/search_facets.out:
--------------------------------------------------------------------------------
1 | 1: #044 Two-dial clock by the Self-Winding Clock Co; as used on the
2 | 2: #096 Clock with Hipp pendulum (an electric driven clock with Hipp
3 | 3: #012 Assembled and unassembled EXA electric clock kit
4 | 4: #098 'Pond' electric clock movement (no dial)
5 | 5: #083 Harrison's eight-day wooden clock movement, 1715.
6 | 6: #005 "Ever Ready" ceiling clock
7 | 7: #039 Electric clock of the Bain type
8 | 8: #061 Van der Plancke master clock
9 | 9: #064 Morse electrical clock, dial mechanism
10 | 10: #052 Reconstruction of Dondi's Astronomical Clock, 1974
11 | Facet: Bain, Alexander; count: 3
12 | Facet: Bloxam, J. M.; count: 1
13 | Facet: Braun (maker); count: 1
14 | Facet: British Horo-Electric Ltd. (maker); count: 1
15 | Facet: British Vacuum Cleaner and Engineering Co. Ltd., Magneto Time division (maker); count: 1
16 | Facet: EXA; count: 1
17 | Facet: Ever Ready Co. (maker); count: 2
18 | Facet: Ferranti Ltd.; count: 1
19 | Facet: Galilei, Galileo, 1564-1642; Galilei, Vincenzio, 1606-1649; count: 1
20 | Facet: Harrison, John (maker); count: 1
21 | Facet: Hipp, M.; count: 1
22 | Facet: La Précision Cie; count: 1
23 | Facet: Lund, J.; count: 1
24 | Facet: Morse, J. S.; count: 1
25 | Facet: Self Winding Clock Company; count: 1
26 | Facet: Self-Winding Clock Co. (maker); count: 1
27 | Facet: Synchronome Co. Ltd. (maker); count: 2
28 | Facet: Thwaites and Reed Ltd.; count: 1
29 | Facet: Thwaites and Reed Ltd. (maker); count: 1
30 | Facet: Viviani, Vincenzo; count: 1
31 | Facet: Vulliamy, Benjamin, 1747-1811; count: 1
32 | Facet: Whitefriars Glass Ltd. (maker); count: 1
33 | 'clock'[0:10] = 44 96 12 98 83 5 39 61 64 52
34 |
--------------------------------------------------------------------------------
/code/expected.out/search_filters.out:
--------------------------------------------------------------------------------
1 | 1: #012 Assembled and unassembled EXA electric clock kit
2 | 2: #098 'Pond' electric clock movement (no dial)
3 | 3: #052 Reconstruction of Dondi's Astronomical Clock, 1974
4 | 4: #059 Electrically operated clock controller
5 | 5: #024 Regulator Clock with Gravity Escapement
6 | 6: #097 Bain's subsidiary electric clock
7 | 7: #009 Copy of a Dwerrihouse skeleton clock with coup-perdu escape
8 | 8: #091 Pendulum clock designed by Galileo in 1642 and made by his son in 1649, model.
9 | 'clock'[0:10] = 12 98 52 59 24 97 9 91
10 |
--------------------------------------------------------------------------------
/code/expected.out/search_filters2.out:
--------------------------------------------------------------------------------
1 | 1: #012 Assembled and unassembled EXA electric clock kit
2 | 2: #098 'Pond' electric clock movement (no dial)
3 | 3: #052 Reconstruction of Dondi's Astronomical Clock, 1974
4 | 4: #059 Electrically operated clock controller
5 | 5: #024 Regulator Clock with Gravity Escapement
6 | 6: #097 Bain's subsidiary electric clock
7 | 7: #009 Copy of a Dwerrihouse skeleton clock with coup-perdu escape
8 | 8: #091 Pendulum clock designed by Galileo in 1642 and made by his son in 1649, model.
9 | 'clock material:"steel (metal)"'[0:10] = 12 98 52 59 24 97 9 91
10 |
--------------------------------------------------------------------------------
/code/expected.out/search_ranges.db_..50mm.out:
--------------------------------------------------------------------------------
1 | 1: #031 (1588) overall diameter: 50 mm
2 | Portable universal equinoctial sundial, in brass, signed "A
3 | 2: #073 (1701-1721) overall: 15 mm x 44.45 mm, weight: 0.055kg
4 | Universal pocket sundial
5 | 3: #074 (1596) overall: 13 mm x 44.45 mm x 44.45 mm, weight: 0.095kg
6 | Sundial, made as a locket, gilt metal, part silver
7 | '..50mm'[0:10] = 31 73 74
8 |
--------------------------------------------------------------------------------
/code/expected.out/search_ranges.db_1000..mm_1800..1899.out:
--------------------------------------------------------------------------------
1 | 1: #024 (1845-1855) overall: 1850 mm x 350 mm x 250 mm
2 | Regulator Clock with Gravity Escapement
3 | '1000..mm 1800..1899'[0:10] = 24
4 |
--------------------------------------------------------------------------------
/code/expected.out/search_ranges.db_1980..1989.out:
--------------------------------------------------------------------------------
1 | 1: #050 (1984) overall: 105 mm x 75 mm x 57 mm,
2 | Quartz Analogue "no battery" wristwatch by Pulsar Quartz (CA
3 | 2: #051 (1984) overall: 85 mm x 65 mm x 38 mm,
4 | Analogue quartz clock with voice controlled alarm by Braun,
5 | '1980..1989'[0:10] = 50 51
6 |
--------------------------------------------------------------------------------
/code/expected.out/search_ranges.db_clock_1960...out:
--------------------------------------------------------------------------------
1 | 1: #052 (1974) clock: 1185 x 780 mm, 122 kg; rewind unit: 460 x 640 x 350 mm
2 | Reconstruction of Dondi's Astronomical Clock, 1974
3 | 2: #051 (1984) overall: 85 mm x 65 mm x 38 mm,
4 | Analogue quartz clock with voice controlled alarm by Braun,
5 | 3: #009 (1973) overall: 380 mm x 300 mm x 192 mm, weight: 6.45kg
6 | Copy of a Dwerrihouse skeleton clock with coup-perdu escape
7 | 'clock 1960..'[0:10] = 52 51 9
8 |
--------------------------------------------------------------------------------
/code/expected.out/search_ranges2.statesdb_10000000...out:
--------------------------------------------------------------------------------
1 | 1: #007 State of California September 9, 1850
2 | Population 37,253,956
3 | 2: #019 State of Texas December 29, 1845
4 | Population 25,145,561
5 | 3: #027 State of Illinois December 3, 1818
6 | Population 12,830,632
7 | 4: #030 State of Ohio March 1, 1803
8 | Population 11,536,504
9 | 5: #035 State of Florida March 3, 1845
10 | Population 18,801,310
11 | 6: #040 Commonwealth of Pennsylvania December 12, 1787
12 | Population 12,702,379
13 | 7: #041 State of New York July 26, 1788
14 | Population 19,378,102
15 | '10000000..'[0:10] = 7 19 27 30 35 40 41
16 |
--------------------------------------------------------------------------------
/code/expected.out/search_ranges2.statesdb_11=2f08=2f1889..07=2f10=2f1890.out:
--------------------------------------------------------------------------------
1 | 1: #001 State of Washington November 11, 1889
2 | Population 6,744,496
3 | 2: #004 State of Montana November 8, 1889
4 | Population 989,415
5 | 3: #005 Idaho July 3, 1890
6 | Population 1,567,582
7 | 4: #010 State of Wyoming July 10, 1890
8 | Population 563,626
9 | '11/08/1889..07/10/1890'[0:10] = 1 4 5 10
10 |
--------------------------------------------------------------------------------
/code/expected.out/search_ranges2.statesdb_1780..1789_10000000...out:
--------------------------------------------------------------------------------
1 | 1: #040 Commonwealth of Pennsylvania December 12, 1787
2 | Population 12,702,379
3 | 2: #041 State of New York July 26, 1788
4 | Population 19,378,102
5 | '1780..1789 10000000..'[0:10] = 40 41
6 |
--------------------------------------------------------------------------------
/code/expected.out/search_ranges2.statesdb_1800..1899.out:
--------------------------------------------------------------------------------
1 | 1: #001 State of Washington November 11, 1889
2 | Population 6,744,496
3 | 2: #002 State of Arkansas June 15, 1836
4 | Population 2,915,918
5 | 3: #003 State of Oregon February 14, 1859
6 | Population 3,831,074
7 | 4: #004 State of Montana November 8, 1889
8 | Population 989,415
9 | 5: #005 Idaho July 3, 1890
10 | Population 1,567,582
11 | 6: #006 State of Nevada October 31, 1864
12 | Population 2,700,551
13 | 7: #007 State of California September 9, 1850
14 | Population 37,253,956
15 | 8: #009 State of Utah January 4, 1896
16 | Population 2,763,885
17 | 9: #010 State of Wyoming July 10, 1890
18 | Population 563,626
19 | 10: #011 State of Colorado August 1, 1876
20 | Population 5,029,196
21 | '1800..1899'[0:10] = 1 2 3 4 5 6 7 9 10 11
22 |
--------------------------------------------------------------------------------
/code/expected.out/search_ranges2.statesdb_spanish.out:
--------------------------------------------------------------------------------
1 | 1: #004 State of Montana November 8, 1889
2 | Population 989,415
3 | 2: #019 State of Texas December 29, 1845
4 | Population 25,145,561
5 | 'spanish'[0:10] = 4 19
6 |
--------------------------------------------------------------------------------
/code/expected.out/search_sorting.out:
--------------------------------------------------------------------------------
1 | 1: #019 State of Texas December 29, 1845
2 | Population 25,145,561
3 | 2: #004 State of Montana November 8, 1889
4 | Population 989,415
5 | 'spanish'[0:10] = 19 4
6 |
--------------------------------------------------------------------------------
/code/expected.out/search_sorting2.out:
--------------------------------------------------------------------------------
1 | 1: #040 Commonwealth of Pennsylvania December 12, 1787
2 | Population 12,702,379
3 | 2: #043 State of New Jersey December 18, 1787
4 | Population 8,791,894
5 | 3: #049 State of Delaware December 7, 1787
6 | Population 897,934
7 | 4: #041 State of New York July 26, 1788
8 | Population 19,378,102
9 | 5: #034 State of Georgia January 2, 1788
10 | Population 9,687,653
11 | 6: #038 Commonwealth of Virginia June 25, 1788
12 | Population 8,001,024
13 | 7: #046 Commonwealth of Massachusetts February 6, 1788
14 | Population 6,547,629
15 | 8: #050 State of Maryland April 28, 1788
16 | Population 5,773,552
17 | 9: #036 State of South Carolina May 23, 1788
18 | Population 4,625,384
19 | 10: #048 State of Connecticut January 9, 1788
20 | Population 3,574,097
21 | 'State'[0:10] = 40 43 49 41 34 38 46 50 36 48
22 |
--------------------------------------------------------------------------------
/code/expected.out/search_sorting3.out:
--------------------------------------------------------------------------------
1 | 1: #050 State of Maryland April 28, 1788
2 | Population 5,773,552
3 | 2: #049 State of Delaware December 7, 1787
4 | Population 897,934
5 | 3: #040 Commonwealth of Pennsylvania December 12, 1787
6 | Population 12,702,379
7 | 4: #043 State of New Jersey December 18, 1787
8 | Population 8,791,894
9 | 5: #039 State of West Virginia June 20, 1863
10 | Population 1,859,815
11 | 6: #037 State of North Carolina November 21, 1789
12 | Population 9,535,483
13 | 7: #041 State of New York July 26, 1788
14 | Population 19,378,102
15 | 8: #038 Commonwealth of Virginia June 25, 1788
16 | Population 8,001,024
17 | 9: #048 State of Connecticut January 9, 1788
18 | Population 3,574,097
19 | 10: #036 State of South Carolina May 23, 1788
20 | Population 4,625,384
21 | 'State'[0:10] = 50 49 40 43 39 37 41 38 48 36
22 |
--------------------------------------------------------------------------------
/code/expected.out/search_synonyms.db_time.out:
--------------------------------------------------------------------------------
1 | 1: #065 Electric time piece with hands but without dial (no pendulum
2 | 2: #058 The "Empire" clock, to show the time at various longitudes,
3 | 3: #041 Frequency and time measuring instrument type TSA3436 by Venn
4 | 4: #056 Single sandglass in 4 pillared wood mount, running time 15 1
5 | 5: #043 Loughborough-Hayes automatic timing apparatus. Used by the R
6 | 6: #011 "Timetrunk" by Hines and Co., Glasgow (a sandglass for timin
7 | 7: #016 Copy of the gearing of the Byzantine sundial-calendar (1983-
8 | 8: #045 Master clock of the "Silent Electric" type made by the Magne
9 | 9: #018 Solar/Sidereal verge watch with epicyclic maintaining power
10 | 'time'[0:10] = 65 58 41 56 43 11 16 45 18
11 |
--------------------------------------------------------------------------------
/code/expected.out/search_synonyms.out:
--------------------------------------------------------------------------------
1 | 1: #016 Copy of the gearing of the Byzantine sundial-calendar (1983-
2 | 2: #072 German Perpetual Calendar in gilt metal
3 | 3: #065 Electric time piece with hands but without dial (no pendulum
4 | 4: #068 Ornate brass Perpetual Calendar
5 | 5: #058 The "Empire" clock, to show the time at various longitudes,
6 | 6: #041 Frequency and time measuring instrument type TSA3436 by Venn
7 | 7: #056 Single sandglass in 4 pillared wood mount, running time 15 1
8 | 8: #043 Loughborough-Hayes automatic timing apparatus. Used by the R
9 | 9: #026 Sundial and compass with perpetual calendar and lunar circles
10 | 10: #036 Universal 'Tri-Compax' chronographic wrist watch
11 | '~time'[0:10] = 16 72 65 68 58 41 56 43 26 36
12 |
--------------------------------------------------------------------------------
/code/java/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 |
--------------------------------------------------------------------------------
/code/java/delete1.java:
--------------------------------------------------------------------------------
1 | package code.java;
2 |
3 | import org.xapian.WritableDatabase;
4 | import org.xapian.XapianConstants;
5 |
6 | public class delete1 {
7 |
8 | // Command line args - dbpath identifiers...
9 | public static void main(String[] args) {
10 | if (args.length < 2) {
11 | System.out.println("Insufficient number of arguments (should be dbpath identifiers...)");
12 | return;
13 | }
14 | // Splitting the array to obtain an array of identifiers
15 | String[] identifierArgs = new String[args.length - 1];
16 | System.arraycopy(args, 1, identifierArgs, 0, identifierArgs.length);
17 | deleteDocs(args[0], identifierArgs);
18 | }
19 |
20 | // Start of example code.
21 | public static void deleteDocs(String dbpath, String[] identifierArgs) {
22 | // Open the database we're going to be deleting from.
23 | WritableDatabase db = new WritableDatabase(dbpath, XapianConstants.DB_OPEN);
24 |
25 | for (String identifierArg : identifierArgs) {
26 | String idterm = "Q" + identifierArg;
27 | db.deleteDocument(idterm);
28 | }
29 |
30 | // Commit to delete documents from disk
31 | db.commit();
32 | }
33 | // End of example code.
34 | }
35 |
--------------------------------------------------------------------------------
/code/java/index1.java.data=2f100-objects-v1.csv_db.out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xapian/xapian-docsprint/036dc8787a17b6a87d9c86fc4ef48a60f7abb3fa/code/java/index1.java.data=2f100-objects-v1.csv_db.out
--------------------------------------------------------------------------------
/code/java/support.java:
--------------------------------------------------------------------------------
1 | /* Support code for Java examples */
2 | package code.java;
3 |
4 | import java.util.ArrayList;
5 |
public class support {
    /**
     * Splits one line of CSV text into its fields.
     *
     * A field that starts with a double quote runs to the matching closing
     * quote: commas inside it do not split the field, and a doubled quote
     * ("") stands for one literal quote character.  The surrounding quotes
     * are removed.  A quote that appears after the start of an unquoted
     * field is kept as an ordinary character.
     *
     * @param csvLine a single CSV record without its line terminator
     * @return the fields in order; always at least one element (an empty
     *         line yields one empty field)
     */
    public static ArrayList<String> parseCsvLine(String csvLine) {
        ArrayList<String> words = new ArrayList<String>();
        StringBuilder field = new StringBuilder();
        boolean insideQuote = false;
        // True until the current field has consumed its first character;
        // only there does a quote open a quoted field.
        boolean atFieldStart = true;
        final int length = csvLine.length();

        for (int i = 0; i < length; i++) {
            char c = csvLine.charAt(i);
            if (insideQuote) {
                if (c != '"') {
                    field.append(c);
                } else if (i + 1 < length && csvLine.charAt(i + 1) == '"') {
                    // Doubled quote: emit one quote and skip its partner.
                    field.append('"');
                    i++;
                } else {
                    // Closing quote of the field.
                    insideQuote = false;
                }
            } else if (c == ',') {
                words.add(field.toString());
                field.setLength(0);
                atFieldStart = true;
            } else if (c == '"' && atFieldStart) {
                insideQuote = true;
                atFieldStart = false;
            } else {
                field.append(c);
                atFieldStart = false;
            }
        }
        // The last field has no trailing comma to flush it.
        words.add(field.toString());
        return words;
    }
}
37 |
--------------------------------------------------------------------------------
/code/perl/Support.pm:
--------------------------------------------------------------------------------
1 | package Support;
2 | use strict;
3 | use warnings;
4 | use Text::CSV;
5 | use Data::Dumper;
6 | use DateTime;
7 | use DateTime::Format::Strptime;;
8 |
# Read the CSV file named by the first argument and return a list of hash
# references, one per row after the header, keyed by the header's column
# names.  Dies if the file cannot be opened or closed, or if reading stops
# before the end of the file.
sub parse_csv {
    my ($file) = @_;
    my %options = (
        eol      => "\r\n",
        sep_char => ',',
        binary   => 1,
    );
    my $parser = Text::CSV->new(\%options)
        or die "Cannot use CSV: ".Text::CSV->error_diag ();
    open(my $in, "<:encoding(UTF-8)", $file) or die "$file: $!";

    # The first row supplies the keys used for every following record.
    my $columns = $parser->getline($in);
    $parser->column_names(@$columns);

    my @rows;
    while (my $row = $parser->getline_hr($in)) {
        push @rows, $row;
    }
    # If the loop ended anywhere but end of file, report the parser's error.
    $parser->eof or die $parser->error_diag();
    close $in or die "$file: $!";
    return @rows;
}
30 |
# Print a one-line summary of a search to standard output: the quoted query,
# the [offset:offset+page_size] window and the space-separated entries of the
# array referenced by $matches.
sub log_matches {
    my ($query, $offset, $page_size, $matches) = @_;
    my $window_end = $offset + $page_size;
    my $listing    = join(' ', @$matches);
    printf(q{'%s'[%i:%i] = %s}, $query, $offset, $window_end, $listing);
    print "\n";
}
37 |
# Return, in order of appearance, every run of digits and dots in the given
# string that contains at least one digit (e.g. "15", "44.45", "0.055").
# Returns an empty list when the string is undefined or empty.
sub numbers_from_string {
    my $string = shift;
    # Test definedness and length rather than truth, so that the string "0"
    # is still scanned instead of being treated as "no input".
    return unless defined $string && length $string;
    my @all;
    while ($string =~ m/([\d\.]*\d[\d\.]*)/g) {
        push @all, $1;
    }
    return @all;
}
47 |
# Parse a CSV file with parse_csv (arguments are passed straight through)
# and return only the records whose "order" column is non-empty.
sub parse_states {
    my @kept;
    foreach my $record (parse_csv(@_)) {
        push @kept, $record if length($record->{order});
    }
    return @kept;
}
52 |
# Insert a comma between each group of three digits, counting from the right
# (e.g. "1234567" becomes "1,234,567").  Dies unless the argument consists
# solely of the digits 0-9.
sub format_numeral {
    my ($digits) = @_;
    die "Numeral should be an integer" unless $digits =~ m/\A[0-9]+\z/;

    # Work on the reversed string so groups can be counted from its start,
    # adding a comma after each full group that still has a digit after it.
    my $reversed = reverse $digits;
    $reversed =~ s/([0-9]{3})(?=[0-9])/$1,/g;
    return scalar reverse $reversed;
}
75 |
# Parse a date written as YYYYMMDD (strptime pattern '%Y%m%d') and return it
# as "<month name> <day>, <year>".
sub format_date {
    my ($date) = @_;
    my $parser = DateTime::Format::Strptime->new(pattern => '%Y%m%d');
    my $dt     = $parser->parse_datetime($date);
    return join(' ', $dt->month_name, $dt->day . ',', $dt->year);
};
82 |
83 | 1;
84 |
--------------------------------------------------------------------------------
/code/perl/delete1.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 |
3 | use strict;
4 | use warnings;
5 |
6 | BEGIN {
7 | eval {
8 | require Xapian;
9 | Xapian->import(':all');
10 | Xapian::search_xapian_compat();
11 | };
12 | if ($@) {
13 | require Search::Xapian;
14 | Search::Xapian->import(':all');
15 | }
16 | }
17 |
18 |
# Command line: the database path followed by one or more identifiers.
my ($db_path, @ids) = @ARGV;
if (!$db_path || !@ids) {
    die "Usage $0 DBPATH ID...";
}

delete_docs($db_path, @ids);
23 |
24 | ### Start of example code.
# Delete from the database at $db_path every document indexed under the
# term "Q<id>" for each of the given ids.
sub delete_docs {
    my ($db_path, @ids) = @_;
    # Open the database we're going to be deleting from.  DB_OPEN (rather
    # than DB_CREATE_OR_OPEN) makes a mistyped path an error instead of
    # silently creating a new, empty database.
    my $db = Search::Xapian::WritableDatabase->new($db_path, DB_OPEN);
    foreach my $id (@ids) {
        $db->delete_document_by_term("Q$id");
    }
}
32 | ### End of example code.
33 |
--------------------------------------------------------------------------------
/code/perl/index1.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 |
3 | use strict;
4 | use warnings;
5 |
6 | BEGIN {
7 | eval {
8 | require Xapian;
9 | Xapian->import(':all');
10 | Xapian::search_xapian_compat();
11 | };
12 | if ($@) {
13 | require Search::Xapian;
14 | Search::Xapian->import(':all');
15 | }
16 | }
17 |
18 | use JSON::MaybeXS;
19 | use FindBin qw($Bin);
20 | use lib $Bin;
21 | use Support;
22 | use Data::Dumper;
23 |
24 |
# Command line: the CSV data file, then the database path.
my ($data_path, $db_path) = @ARGV;
if (!$data_path || !$db_path) {
    die "Usage $0 DATAPATH DBPATH";
}

index_csv($data_path, $db_path);
29 |
30 | ### Start of example code.
# Index every record of the CSV file at $data_path into the database at
# $db_path (opened with DB_CREATE_OR_OPEN).
sub index_csv {
    my ($data_path, $db_path) = @_;

    # Create or open the database we're going to be writing to.
    my $db = Search::Xapian::WritableDatabase->new($db_path, DB_CREATE_OR_OPEN);

    # Set up a TermGenerator that we'll use in indexing.
    my $indexer = Search::Xapian::TermGenerator->new;
    my $stemmer = Search::Xapian::Stem->new('en');
    $indexer->set_stemmer($stemmer);

    my @records = Support::parse_csv($data_path);
    for my $record (@records) {
        my $doc = Search::Xapian::Document->new;
        $indexer->set_document($doc);

        # Index each field with a suitable prefix.
        $indexer->index_text($record->{TITLE}, 1, 'S');
        $indexer->index_text($record->{DESCRIPTION}, 1, 'XD');

        # Index fields without prefixes for general search.
        $indexer->index_text($record->{TITLE});
        $indexer->increase_termpos();
        $indexer->index_text($record->{DESCRIPTION});

        # Store all the fields for display purposes.
        $doc->set_data(encode_json($record));

        # Replacing by the identifier term means each object ends up in
        # the database only once, however often the indexer is run.
        my $idterm = "Q" . $record->{id_NUMBER};
        $doc->add_boolean_term($idterm);
        $db->replace_document_by_term($idterm, $doc);
    }
}
62 | ### End of example code.
63 |
--------------------------------------------------------------------------------
/code/perl/index_facets.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 |
3 | use strict;
4 | use warnings;
5 |
6 | BEGIN {
7 | eval {
8 | require Xapian;
9 | Xapian->import(':all');
10 | Xapian::search_xapian_compat();
11 | };
12 | if ($@) {
13 | require Search::Xapian;
14 | Search::Xapian->import(':all');
15 | }
16 | }
17 |
18 | use JSON::MaybeXS;
19 | use FindBin qw($Bin);
20 | use lib $Bin;
21 | use Support;
22 | use Data::Dumper;
23 | use Encode qw/encode/;
24 |
25 |
# Command line: the CSV data file, then the database path.
my ($data_path, $db_path) = @ARGV;
if (!$data_path || !$db_path) {
    die "Usage $0 DATAPATH DBPATH";
}

index_csv($data_path, $db_path);
30 |
31 | ### Start of example code.
# Index every record of the CSV file at $data_path into the database at
# $db_path, additionally storing the COLLECTION and MAKER fields in value
# slots 0 and 1.
sub index_csv {
    my ($data_path, $db_path) = @_;

    # Create or open the database we're going to be writing to.
    my $db = Search::Xapian::WritableDatabase->new($db_path, DB_CREATE_OR_OPEN);

    # Set up a TermGenerator that we'll use in indexing.
    my $indexer = Search::Xapian::TermGenerator->new;
    my $stemmer = Search::Xapian::Stem->new('en');
    $indexer->set_stemmer($stemmer);

    my @records = Support::parse_csv($data_path);
    for my $record (@records) {
        my $doc = Search::Xapian::Document->new;
        $indexer->set_document($doc);

        # Index each field with a suitable prefix.
        $indexer->index_text($record->{TITLE}, 1, 'S');
        $indexer->index_text($record->{DESCRIPTION}, 1, 'XD');

        # Index fields without prefixes for general search.
        $indexer->index_text($record->{TITLE});
        $indexer->increase_termpos();
        $indexer->index_text($record->{DESCRIPTION});

        # Store all the fields for display purposes.
        $doc->set_data(encode_json($record));

        # Add the collection and the maker into value slots; records where
        # the field is empty get no value in that slot.
        $doc->add_value(0, encode('UTF-8', $record->{COLLECTION}))
            if $record->{COLLECTION};
        $doc->add_value(1, encode('UTF-8', $record->{MAKER}))
            if $record->{MAKER};

        # Replacing by the identifier term means each object ends up in
        # the database only once, however often the indexer is run.
        my $idterm = "Q" . $record->{id_NUMBER};
        $doc->add_boolean_term($idterm);
        $db->replace_document_by_term($idterm, $doc);
    }
}
72 | ### End of example code.
73 |
--------------------------------------------------------------------------------
/code/perl/index_filters.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 |
3 | use strict;
4 | use warnings;
5 |
6 | BEGIN {
7 | eval {
8 | require Xapian;
9 | Xapian->import(':all');
10 | Xapian::search_xapian_compat();
11 | };
12 | if ($@) {
13 | require Search::Xapian;
14 | Search::Xapian->import(':all');
15 | }
16 | }
17 |
18 | use JSON::MaybeXS;
19 | use FindBin qw($Bin);
20 | use lib $Bin;
21 | use Support;
22 | use Data::Dumper;
23 |
24 |
# Command line: the CSV data file, then the database path.
my ($data_path, $db_path) = @ARGV;
if (!$data_path || !$db_path) {
    die "Usage $0 DATAPATH DBPATH";
}

index_csv($data_path, $db_path);
29 |
# Index every record of the CSV file at $data_path into the database at
# $db_path, additionally adding one "XM"-prefixed boolean term per entry of
# the semicolon-separated MATERIALS field.
sub index_csv {
    my ($data_path, $db_path) = @_;

    # Create or open the database we're going to be writing to.
    my $db = Search::Xapian::WritableDatabase->new($db_path, DB_CREATE_OR_OPEN);

    # Set up a TermGenerator that we'll use in indexing.
    my $indexer = Search::Xapian::TermGenerator->new;
    my $stemmer = Search::Xapian::Stem->new('en');
    $indexer->set_stemmer($stemmer);

    my @records = Support::parse_csv($data_path);
    for my $record (@records) {
        my $doc = Search::Xapian::Document->new;
        $indexer->set_document($doc);

        # Index each field with a suitable prefix.
        $indexer->index_text($record->{TITLE}, 1, 'S');
        $indexer->index_text($record->{DESCRIPTION}, 1, 'XD');

        ### Start of new indexing code.
        # Index the MATERIALS field, splitting on semicolons.
        for my $material (split(/;/, $record->{MATERIALS})) {
            # Trim surrounding whitespace; skip entries that were blank.
            $material =~ s/\A\s*//;
            $material =~ s/\s*\z//;
            next unless length($material);
            $doc->add_boolean_term('XM' . lc($material));
        }
        ### End of new indexing code.

        # Index fields without prefixes for general search.
        $indexer->index_text($record->{TITLE});
        $indexer->increase_termpos();
        $indexer->index_text($record->{DESCRIPTION});

        # Store all the fields for display purposes.
        $doc->set_data(encode_json($record));

        # Replacing by the identifier term means each object ends up in
        # the database only once, however often the indexer is run.
        my $idterm = "Q" . $record->{id_NUMBER};
        $doc->add_boolean_term($idterm);
        $db->replace_document_by_term($idterm, $doc);
    }
}
73 |
74 |
--------------------------------------------------------------------------------
/code/perl/index_ranges.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 |
3 | use strict;
4 | use warnings;
5 |
6 | BEGIN {
7 | eval {
8 | require Xapian;
9 | Xapian->import(':all');
10 | Xapian::search_xapian_compat();
11 | };
12 | if ($@) {
13 | require Search::Xapian;
14 | Search::Xapian->import(':all');
15 | }
16 | }
17 |
18 | use JSON::MaybeXS;
19 | use FindBin qw($Bin);
20 | use lib $Bin;
21 | use Support;
22 | use Data::Dumper;
23 |
24 |
# Command line: the CSV data file, then the database path.
my ($data_path, $db_path) = @ARGV;
if (!$data_path || !$db_path) {
    die "Usage $0 DATAPATH DBPATH";
}

index_csv($data_path, $db_path);
29 |
# Index every record of the CSV file at $data_path into the database at
# $db_path, additionally storing the largest number found in MEASUREMENTS in
# value slot 0 and the first number found in DATE_MADE in value slot 1.
sub index_csv {
    my ($data_path, $db_path) = @_;

    # Create or open the database we're going to be writing to.
    my $db = Search::Xapian::WritableDatabase->new($db_path, DB_CREATE_OR_OPEN);

    # Set up a TermGenerator that we'll use in indexing.
    my $indexer = Search::Xapian::TermGenerator->new;
    my $stemmer = Search::Xapian::Stem->new('en');
    $indexer->set_stemmer($stemmer);

    my @records = Support::parse_csv($data_path);
    for my $record (@records) {
        my $doc = Search::Xapian::Document->new;
        $indexer->set_document($doc);

        # Index each field with a suitable prefix.
        $indexer->index_text($record->{TITLE}, 1, 'S');
        $indexer->index_text($record->{DESCRIPTION}, 1, 'XD');

        ### Start of example code.
        my @numbers = Support::numbers_from_string($record->{MEASUREMENTS});
        if (@numbers) {
            # index the higher one
            my ($largest) = sort { $b <=> $a } @numbers;
            $doc->add_value(0, Search::Xapian::sortable_serialise($largest));
        }
        my @years = Support::numbers_from_string($record->{DATE_MADE});
        if (@years) {
            # index the first one
            $doc->add_value(1, Search::Xapian::sortable_serialise($years[0]));
        }
        ### End of example code.

        # Index fields without prefixes for general search.
        $indexer->index_text($record->{TITLE});
        $indexer->increase_termpos();
        $indexer->index_text($record->{DESCRIPTION});

        # Store all the fields for display purposes.
        $doc->set_data(encode_json($record));

        # Replacing by the identifier term means each object ends up in
        # the database only once, however often the indexer is run.
        my $idterm = "Q" . $record->{id_NUMBER};
        $doc->add_boolean_term($idterm);
        $db->replace_document_by_term($idterm, $doc);
    }
}
72 |
73 |
--------------------------------------------------------------------------------
/code/perl/index_ranges2.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 |
3 | use strict;
4 | use warnings;
5 |
6 | BEGIN {
7 | eval {
8 | require Xapian;
9 | Xapian->import(':all');
10 | Xapian::search_xapian_compat();
11 | };
12 | if ($@) {
13 | require Search::Xapian;
14 | Search::Xapian->import(':all');
15 | }
16 | }
17 |
18 | use JSON::MaybeXS;
19 | use FindBin qw($Bin);
20 | use lib $Bin;
21 | use Support;
22 | use Data::Dumper;
23 |
24 |
# Command line: the CSV data file, then the database path.
my ($data_path, $db_path) = @ARGV;
if (!$data_path || !$db_path) {
    die "Usage $0 DATAPATH DBPATH";
}

index_csv($data_path, $db_path);
29 |
30 | ### Start of example code.
# Index every record returned by Support::parse_states for $data_path into
# the database at $db_path.  The first four characters of "admitted" go into
# value slot 1, the whole "admitted" string into slot 2 and the population
# into slot 3.
sub index_csv {
    my ($data_path, $db_path) = @_;

    # Create or open the database we're going to be writing to.
    my $db = Search::Xapian::WritableDatabase->new($db_path, DB_CREATE_OR_OPEN);

    # Set up a TermGenerator that we'll use in indexing.
    my $indexer = Search::Xapian::TermGenerator->new;
    my $stemmer = Search::Xapian::Stem->new('en');
    $indexer->set_stemmer($stemmer);

    my @states = Support::parse_states($data_path);
    for my $state (@states) {
        my $doc = Search::Xapian::Document->new;
        $indexer->set_document($doc);

        ### Start of example code.
        # Index each field with a suitable prefix.
        $indexer->index_text($state->{name}, 1, 'S');
        $indexer->index_text($state->{description}, 1, 'XD');
        $indexer->index_text($state->{motto}, 1, 'XD');

        # Index fields without prefixes for general search.
        $indexer->index_text($state->{name});
        $indexer->increase_termpos();
        $indexer->index_text($state->{description});
        $indexer->increase_termpos();
        $indexer->index_text($state->{motto});

        if (length($state->{admitted})) {
            my $leading_four = substr($state->{admitted}, 0, 4);
            $doc->add_value(1, Search::Xapian::sortable_serialise($leading_four));
            $doc->add_value(2, $state->{admitted});
        }
        if (length($state->{population})) {
            my $population = int($state->{population});
            $doc->add_value(3, Search::Xapian::sortable_serialise($population));
        }
        ### End of example code.

        # Store all the fields for display purposes.
        $doc->set_data(encode_json($state));

        # Replacing by the identifier term means each record ends up in
        # the database only once, however often the indexer is run.
        my $idterm = "Q" . $state->{order};
        $doc->add_boolean_term($idterm);
        $db->replace_document_by_term($idterm, $doc);
    }
}
75 | ### End of example code.
76 |
--------------------------------------------------------------------------------
/code/perl/search1.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 |
3 | use strict;
4 | use warnings;
5 |
6 | BEGIN {
7 | eval {
8 | require Xapian;
9 | Xapian->import(':all');
10 | Xapian::search_xapian_compat();
11 | };
12 | if ($@) {
13 | require Search::Xapian;
14 | Search::Xapian->import(':all');
15 | }
16 | }
17 |
18 | use JSON::MaybeXS;
19 | use FindBin qw($Bin);
20 | use lib $Bin;
21 | use Support;
22 |
23 |
# Command line: the database path followed by the words of the query.
my ($db_path, @terms) = @ARGV;
if (!$db_path || !@terms) {
    die "Usage: $0 DB_PATH QUERY...";
}

my $query_text = join(' ', @terms);
search($db_path, $query_text);
28 |
29 | ### Start of example code.
# Run $query_string against the database at $db_path and print one line per
# match (rank, document id, title) followed by a summary line.  $offset and
# $pagesize select the window of results; they default to 0 and 10.
sub search {
    my ($db_path, $query_string, $offset, $pagesize) = @_;
    $offset   = 0  unless $offset;
    $pagesize = 10 unless $pagesize;

    my $db = Search::Xapian::Database->new($db_path);

    # Set up a QueryParser with a stemmer and suitable prefixes
    my $parser = Search::Xapian::QueryParser->new;
    $parser->set_stemmer(Search::Xapian::Stem->new('en'));
    $parser->set_stemming_strategy(STEM_SOME);

    # Start of prefix configuration.
    $parser->add_prefix(title => "S");
    $parser->add_prefix(description => "XD");
    # End of prefix configuration.

    # And parse the query
    my $query = $parser->parse_query($query_string);

    # Use an Enquire object on the database to run the query
    my $enquire = $db->enquire($query);
    my $mset    = $enquire->get_mset($offset, $pagesize);

    # And print out something about each match
    my @docids;
    for my $hit ($mset->items) {
        my $fields = decode_json($hit->get_document->get_data);
        my $docid  = $hit->get_docid;
        printf(q{%i: #%3.3i %s}, $hit->get_rank + 1, $docid, $fields->{TITLE});
        print "\n";
        push @docids, $docid;
    }
    Support::log_matches($query_string, $offset, $pagesize, \@docids);
}
63 | ### End of example code.
64 |
--------------------------------------------------------------------------------
/code/perl/search_facets.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 |
3 | use strict;
4 | use warnings;
5 |
6 | BEGIN {
7 | eval {
8 | require Xapian;
9 | Xapian->import(':all');
10 | Xapian::search_xapian_compat();
11 | };
12 | if ($@) {
13 | require Search::Xapian;
14 | Search::Xapian->import(':all');
15 | }
16 | }
17 |
18 | use JSON::MaybeXS;
19 | use FindBin qw($Bin);
20 | use lib $Bin;
21 | use Encode qw/decode/;
22 | use Support;
23 | use Data::Dumper;
24 | binmode STDOUT, ":encoding(UTF-8)";
25 | binmode STDERR, ":encoding(UTF-8)";
26 |
27 |
28 | my ($db_path, @terms) = @ARGV;
29 | die "Usage: $0 DB_PATH QUERY..." unless $db_path && @terms;
30 |
31 | search($db_path, join(' ', @terms));
32 |
33 | sub search {
34 | my ($db_path, $query_string, $offset, $pagesize) = @_;
35 | $offset ||= 0;
36 | $pagesize ||= 10;
37 | my $db = Search::Xapian::Database->new($db_path);
38 | # Set up a QueryParser with a stemmer and suitable prefixes
39 | my $queryparser = Search::Xapian::QueryParser->new;
40 | $queryparser->set_stemmer(Search::Xapian::Stem->new('en'));
41 | $queryparser->set_stemming_strategy(STEM_SOME);
42 |
43 | # Start of prefix configuration.
44 | $queryparser->add_prefix(title => "S");
45 | $queryparser->add_prefix(description => "XD");
46 | # End of prefix configuration.
47 |
48 | # And parse the query
49 | my $query = $queryparser->parse_query($query_string);
50 |
51 | # Use an Enquire object on the database to run the query
52 | my $enquire = $db->enquire($query);
53 |
54 | # And print out something about each match
55 | my @matches;
56 |
57 | ### Start of example code.
58 |
59 | # Set up a spy to inspect the MAKER value at slot 1
60 | my $spy = Search::Xapian::ValueCountMatchSpy->new(1);
61 | $enquire->add_matchspy($spy);
62 |
63 | my $mset = $enquire->get_mset($offset, $pagesize, 100);
64 | foreach my $item ($mset->items) {
65 | my $fields = decode_json($item->get_document->get_data);
66 | printf(q{%i: #%3.3i %s}, $item->get_rank + 1, $item->get_docid, $fields->{TITLE});
67 | print "\n";
68 | push @matches, $item->get_docid;
69 | }
70 | # Fetch and display the spy values
71 | my $end = $spy->values_end;
72 | # The spy returns values as raw UTF-8 bytes, so decode them before printing.
73 | for (my $it = $spy->values_begin; $it != $end; $it++) {
74 | print "Facet: " . decode('UTF-8', $it->get_termname) . "; count: " . $it->get_termfreq . "\n"
75 | }
76 |
77 |
78 | Support::log_matches($query_string, $offset, $pagesize, \@matches);
79 | ### End of example code.
80 | }
81 |
--------------------------------------------------------------------------------
/code/perl/search_filters.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 |
3 | use strict;
4 | use warnings;
5 |
6 | BEGIN {
7 | eval {
8 | require Xapian;
9 | Xapian->import(':all');
10 | Xapian::search_xapian_compat();
11 | };
12 | if ($@) {
13 | require Search::Xapian;
14 | Search::Xapian->import(':all');
15 | }
16 | }
17 |
18 | use JSON::MaybeXS;
19 | use FindBin qw($Bin);
20 | use lib $Bin;
21 | use Support;
22 | use Data::Dumper;
23 |
24 |
25 | my ($db_path, $query_string, @materials) = @ARGV;
26 | die "Usage: $0 DB_PATH QUERY MATERIALS..." unless $db_path && $query_string;
27 |
28 | search($db_path, $query_string, \@materials);
29 |
30 | sub search {
31 | my ($db_path, $query_string, $materials, $offset, $pagesize) = @_;
32 | $materials ||= [];
33 | $offset ||= 0;
34 | $pagesize ||= 10;
35 |
36 | my $db = Search::Xapian::Database->new($db_path);
37 |
38 | ### Start of example code.
39 | # Set up a QueryParser with a stemmer and suitable prefixes
40 | my $queryparser = Search::Xapian::QueryParser->new;
41 | $queryparser->set_stemmer(Search::Xapian::Stem->new('en'));
42 | $queryparser->set_stemming_strategy(STEM_SOME);
43 |
44 | # Start of prefix configuration.
45 | $queryparser->add_prefix(title => "S");
46 | $queryparser->add_prefix(description => "XD");
47 |
48 | # End of prefix configuration.
49 |
50 | # And parse the query
51 | my $query = $queryparser->parse_query($query_string);
52 |
53 | # Search::Xapian::Query has no POD, but this works: the constructor takes an operator followed by a list of subqueries.
54 |
55 | if (@$materials) {
56 | my $material_query = Search::Xapian::Query->new(OP_OR,
57 | map { Search::Xapian::Query->new('XM' . lc($_)) }
58 | @$materials);
59 | $query = Search::Xapian::Query->new(OP_FILTER, $query, $material_query);
60 | }
61 | ### End of example code.
62 |
63 | # Use an Enquire object on the database to run the query
64 | my $enquire = $db->enquire($query);
65 |
66 | # And print out something about each match
67 | my @matches;
68 |
69 | my $mset = $enquire->get_mset($offset, $pagesize);
70 | foreach my $item ($mset->items) {
71 | my $fields = decode_json($item->get_document->get_data);
72 | printf(q{%i: #%3.3i %s}, $item->get_rank + 1, $item->get_docid, $fields->{TITLE});
73 | print "\n";
74 | push @matches, $item->get_docid;
75 | }
76 | Support::log_matches($query_string, $offset, $pagesize, \@matches);
77 | }
78 |
--------------------------------------------------------------------------------
/code/perl/search_filters2.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 |
3 | use strict;
4 | use warnings;
5 |
6 | BEGIN {
7 | eval {
8 | require Xapian;
9 | Xapian->import(':all');
10 | Xapian::search_xapian_compat();
11 | };
12 | if ($@) {
13 | require Search::Xapian;
14 | Search::Xapian->import(':all');
15 | }
16 | }
17 |
18 | use JSON::MaybeXS;
19 | use FindBin qw($Bin);
20 | use lib $Bin;
21 | use Support;
22 | use Data::Dumper;
23 |
24 |
25 | my ($db_path, @terms) = @ARGV;
26 | die "Usage: $0 DB_PATH QUERY..." unless $db_path && @terms;
27 |
28 | search($db_path, join(' ', @terms));
29 |
30 | sub search {
31 | my ($db_path, $query_string, $offset, $pagesize) = @_;
32 | $offset ||= 0;
33 | $pagesize ||= 10;
34 |
35 | my $db = Search::Xapian::Database->new($db_path);
36 |
37 | ### Start of example code.
38 | # Set up a QueryParser with a stemmer and suitable prefixes
39 | my $queryparser = Search::Xapian::QueryParser->new;
40 | $queryparser->set_stemmer(Search::Xapian::Stem->new('en'));
41 | $queryparser->set_stemming_strategy(STEM_SOME);
42 |
43 | # Start of prefix configuration.
44 | $queryparser->add_prefix(title => "S");
45 | $queryparser->add_prefix(description => "XD");
46 |
47 | # allow the user to specify material:.... in the query
48 | $queryparser->add_boolean_prefix(material => "XM");
49 | # End of prefix configuration.
50 |
51 | # And parse the query
52 | my $query = $queryparser->parse_query($query_string);
53 |
54 | ### End of example code.
55 |
56 | # Use an Enquire object on the database to run the query
57 | my $enquire = $db->enquire($query);
58 |
59 | # And print out something about each match
60 | my @matches;
61 |
62 | my $mset = $enquire->get_mset($offset, $pagesize);
63 | foreach my $item ($mset->items) {
64 | my $fields = decode_json($item->get_document->get_data);
65 | printf(q{%i: #%3.3i %s}, $item->get_rank + 1, $item->get_docid, $fields->{TITLE});
66 | print "\n";
67 | push @matches, $item->get_docid;
68 | }
69 | Support::log_matches($query_string, $offset, $pagesize, \@matches);
70 | }
71 |
--------------------------------------------------------------------------------
/code/perl/search_sorting.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 |
3 | use strict;
4 | use warnings;
5 |
6 | BEGIN {
7 | eval {
8 | require Xapian;
9 | Xapian->import(':all');
10 | Xapian::search_xapian_compat();
11 | };
12 | if ($@) {
13 | require Search::Xapian;
14 | Search::Xapian->import(':all');
15 | }
16 | }
17 |
18 | use JSON::MaybeXS;
19 | use FindBin qw($Bin);
20 | use lib $Bin;
21 | use Support;
22 |
23 |
24 | my ($db_path, @terms) = @ARGV;
25 | die "Usage: $0 DB_PATH QUERY..." unless $db_path && @terms;
26 |
27 | search($db_path, join(' ', @terms));
28 |
29 | sub search {
30 | my ($db_path, $query_string, $offset, $pagesize) = @_;
31 | $offset ||= 0;
32 | $pagesize ||= 10;
33 | my $db = Search::Xapian::Database->new($db_path);
34 | # Set up a QueryParser with a stemmer and suitable prefixes
35 | my $queryparser = Search::Xapian::QueryParser->new;
36 | $queryparser->set_stemmer(Search::Xapian::Stem->new('en'));
37 | $queryparser->set_stemming_strategy(STEM_SOME);
38 |
39 | # Start of prefix configuration.
40 | $queryparser->add_prefix(title => "S");
41 | $queryparser->add_prefix(description => "XD");
42 | # End of prefix configuration.
43 |
44 | # And parse the query
45 | my $query = $queryparser->parse_query($query_string);
46 |
47 | # Use an Enquire object on the database to run the query
48 | my $enquire = $db->enquire($query);
49 |
50 | # Start of example code.
51 | $enquire->set_sort_by_value_then_relevance(1, 0);
52 | # End of example code.
53 |
54 |
55 | # And print out something about each match
56 | my @matches;
57 |
58 | my $mset = $enquire->get_mset($offset, $pagesize);
59 | foreach my $item ($mset->items) {
60 | my $fields = decode_json($item->get_document->get_data);
61 | printf(qq{%i: #%3.3i %s %s\n Population %s\n},
62 | $item->get_rank + 1,
63 | $item->get_docid,
64 | $fields->{name},
65 | Support::format_date($fields->{admitted}),
66 | Support::format_numeral($fields->{population}));
67 | push @matches, $item->get_docid;
68 | }
69 | Support::log_matches($query_string, $offset, $pagesize, \@matches);
70 | }
71 |
72 |
--------------------------------------------------------------------------------
/code/perl/search_synonyms.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 |
3 | use strict;
4 | use warnings;
5 |
6 | BEGIN {
7 | eval {
8 | require Xapian;
9 | Xapian->import(':all');
10 | Xapian::search_xapian_compat();
11 | };
12 | if ($@) {
13 | require Search::Xapian;
14 | Search::Xapian->import(':all');
15 | }
16 | }
17 |
18 | use JSON::MaybeXS;
19 | use FindBin qw($Bin);
20 | use lib $Bin;
21 | use Support;
22 |
23 |
24 | my ($db_path, @terms) = @ARGV;
25 | die "Usage: $0 DB_PATH QUERY..." unless $db_path && @terms;
26 |
27 | search($db_path, join(' ', @terms));
28 |
29 | ### Start of example code.
30 | sub search {
31 | my ($db_path, $query_string, $offset, $pagesize) = @_;
32 | $offset ||= 0;
33 | $pagesize ||= 10;
34 | my $db = Search::Xapian::WritableDatabase->new($db_path, DB_OPEN);
35 |
36 | # Start of adding synonyms
37 | $db->add_synonym(time => 'calendar');
38 | # End of adding synonyms
39 |
40 | # Set up a QueryParser with a stemmer and suitable prefixes
41 | my $queryparser = Search::Xapian::QueryParser->new;
42 | $queryparser->set_stemmer(Search::Xapian::Stem->new('en'));
43 | $queryparser->set_stemming_strategy(STEM_SOME);
44 |
45 | # Start of prefix configuration.
46 | $queryparser->add_prefix(title => "S");
47 | $queryparser->add_prefix(description => "XD");
48 | # End of prefix configuration.
49 |
50 | # Start of set database
51 | $queryparser->set_database($db);
52 | # End of set database
53 |
54 | # And parse the query
55 | my $query = $queryparser->parse_query($query_string, FLAG_SYNONYM);
56 |
57 | # Use an Enquire object on the database to run the query
58 | my $enquire = $db->enquire($query);
59 |
60 | # And print out something about each match
61 | my @matches;
62 |
63 | my $mset = $enquire->get_mset($offset, $pagesize);
64 | foreach my $item ($mset->items) {
65 | my $fields = decode_json($item->get_document->get_data);
66 | printf(q{%i: #%3.3i %s}, $item->get_rank + 1, $item->get_docid, $fields->{TITLE});
67 | print "\n";
68 | push @matches, $item->get_docid;
69 | }
70 | Support::log_matches($query_string, $offset, $pagesize, \@matches);
71 | }
72 | ### End of example code.
73 |
--------------------------------------------------------------------------------
/code/perl/strings.t:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 |
3 | use utf8;
4 | use strict;
5 | use warnings;
6 | use Encode qw/decode encode is_utf8/;
7 | use Test::More;
8 | use File::Temp;
9 | use Search::Xapian ':all';
10 |
11 | my $decoded_string = "Đe ši Šu";
12 | my $encoded_string = encode('UTF-8', $decoded_string);
13 | # most misnamed function ever, and it's internal anyway. Should be is_decoded()
14 | ok is_utf8($decoded_string), "decode is decoded";
15 | ok !is_utf8($encoded_string), "encoded is encoded";
16 | isnt $decoded_string, $encoded_string, "Strings differ";
17 |
18 | my $tmp = File::Temp->newdir;
19 | my $db_path = $tmp->dirname;
20 |
21 | foreach my $store_encoded (0..1) {
22 | # Index one document, storing the string encoded or decoded depending on $store_encoded.
23 | {
24 | my $db = Search::Xapian::WritableDatabase->new($db_path, DB_CREATE_OR_OPEN);
25 | my $term_generator = Search::Xapian::TermGenerator->new;
26 | $term_generator->set_stemmer(Search::Xapian::Stem->new('none'));
27 | my $doc = Search::Xapian::Document->new;
28 | $term_generator->set_document($doc); # otherwise index_text() would fill the generator's own internal document, not $doc
29 | $term_generator->index_text('try');
30 | # This is the gist of the demonstration. It doesn't matter whether the
31 | # stored string is encoded or decoded. We always get back the
32 | # encoded one.
33 | if ($store_encoded) {
34 | $doc->set_data($encoded_string);
35 | $doc->add_value(0, $encoded_string);
36 | }
37 | else {
38 | $doc->set_data($decoded_string);
39 | $doc->add_value(0, $decoded_string);
40 | }
41 |
42 | my $id = 'Qtry1'; # unique ID term, so the second pass replaces the first document
43 | $doc->add_boolean_term($id);
44 | $db->replace_document_by_term($id, $doc);
45 | }
46 | # Search for the document by its ID term and check what comes back.
47 | {
48 | my $db = Search::Xapian::Database->new($db_path);
49 | my $query = Search::Xapian::Query->new('Qtry1');
50 | my $enquire = $db->enquire($query);
51 | my ($res) = $enquire->get_mset(0, 1)->items;
52 | my $doc = $res->get_document;
53 | is $doc->get_data, $encoded_string, "data comes back encoded";
54 | isnt $doc->get_data, $decoded_string, "data is binary";
55 | is $doc->get_value(0), $encoded_string, "value comes back encoded";
56 | isnt $doc->get_value(0), $decoded_string, "value is binary as well";
57 | }
58 | }
59 |
60 | done_testing;
61 |
--------------------------------------------------------------------------------
/code/php/delete1.php:
--------------------------------------------------------------------------------
1 | delete_document($idterm);
14 | }
15 | }
16 | // End of example code.
17 |
18 | if ($argc < 3) {
19 | print "Usage: php $argv[0] DBPATH ID...\n";
20 | die();
21 | }
22 |
23 | // Call the delete_docs function.
24 | delete_docs($argv[1], array_slice($argv, 2));
25 | ?>
26 |
--------------------------------------------------------------------------------
/code/php/index1.php:
--------------------------------------------------------------------------------
1 | set_stemmer(new XapianStem('en'));
14 |
15 | // Open the file.
16 | $fH = open_file($datapath);
17 |
18 | // Read the header row in.
19 | $headers = get_csv_headers($fH);
20 |
21 | while (($row = parse_csv_row($fH, $headers)) !== false) {
22 | // '$row' maps field name to value. The field names come from the
23 | // first row of the CSV file.
24 | //
25 | // We're just going to use DESCRIPTION, TITLE and id_NUMBER.
26 | $description = $row['DESCRIPTION'];
27 | $title = $row['TITLE'];
28 | $identifier = $row['id_NUMBER'];
29 |
30 | // We make a document and tell the term generator to use this.
31 | $doc = new XapianDocument();
32 | $termgenerator->set_document($doc);
33 |
34 | // Index each field with a suitable prefix.
35 | $termgenerator->index_text($title, 1, 'S');
36 | $termgenerator->index_text($description, 1, 'XD');
37 |
38 | // Index fields without prefixes for general search.
39 | $termgenerator->index_text($title);
40 | $termgenerator->increase_termpos();
41 | $termgenerator->index_text($description);
42 |
43 | // Store all the fields for display purposes.
44 | $doc->set_data(json_encode($row));
45 |
46 | // We use the identifier to ensure each object ends up in the
47 | // database only once no matter how many times we run the
48 | // indexer.
49 | $idterm = "Q".$identifier;
50 | $doc->add_boolean_term($idterm);
51 | $db->replace_document($idterm, $doc);
52 | }
53 | }
54 | // End of example code.
55 |
56 | if ($argc != 3) {
57 | print "Usage: php $argv[0] DATAPATH DBPATH\n";
58 | die();
59 | }
60 |
61 | // Call the index function.
62 | index($argv[1], $argv[2]);
63 | ?>
64 |
--------------------------------------------------------------------------------
/code/php/index_facets.php:
--------------------------------------------------------------------------------
1 | set_stemmer(new XapianStem('en'));
14 |
15 | // open the file
16 | $fH = open_file($datapath);
17 |
18 | // Read the header row in
19 | $headers = get_csv_headers($fH);
20 |
21 | while (($row = parse_csv_row($fH, $headers)) !== false) {
22 | // '$row' maps field name to value; the field names come from the first row of the CSV file.
23 | // We're going to use id_NUMBER, TITLE, DESCRIPTION, COLLECTION and MAKER.
24 | $description = $row['DESCRIPTION'];
25 | $title = $row['TITLE'];
26 | $identifier = $row['id_NUMBER'];
27 | $collection = $row['COLLECTION'];
28 | $maker = $row['MAKER'];
29 |
30 | // we make a document and tell the term generator to use this
31 | $doc = new XapianDocument();
32 | $termgenerator->set_document($doc);
33 |
34 | // index each field with a suitable prefix
35 | $termgenerator->index_text($title, 1, 'S');
36 | $termgenerator->index_text($description, 1, 'XD');
37 |
38 | // index fields without prefixes for general search
39 | $termgenerator->index_text($title);
40 | $termgenerator->increase_termpos();
41 | $termgenerator->index_text($description);
42 |
43 | // add the collection as a value in slot 0
44 | $doc->add_value(0, $collection);
45 |
46 | // add the maker as a value in slot 1
47 | $doc->add_value(1, $maker);
48 |
49 | // store all the fields for display purposes
50 | $doc->set_data(json_encode($row));
51 |
52 | // we use the identifier to ensure each object ends up
53 | // in the database only once no matter how many times
54 | // we run the indexer
55 | $idterm = "Q".$identifier;
56 | $doc->add_boolean_term($idterm);
57 | $db->replace_document($idterm, $doc);
58 | }
59 | }
60 | // End of example code
61 |
62 | if ($argc != 3) {
63 | print "Usage: php $argv[0] DATAPATH DBPATH\n";
64 | die();
65 | }
66 |
67 | // call the index function
68 | index($argv[1], $argv[2]);
69 | ?>
70 |
--------------------------------------------------------------------------------
/code/php/logger.php:
--------------------------------------------------------------------------------
1 |
14 |
--------------------------------------------------------------------------------
/code/php/parsecsv.php:
--------------------------------------------------------------------------------
1 | column associations from open file
4 | *
5 | * @param resource $fH Open file resource
6 | *
7 | * @return array|false Indexed array of column names (column number => column name), or false if no row could be read
8 | */
9 | function get_csv_headers ($fH)
10 | {
11 | return fgetcsv($fH);
12 | }
13 |
14 | /**
15 | * Handles file opening and error reporting if file is unavailable
16 | *
17 | * @param string $file Path of file to open
18 | *
19 | * @return resource Open file handle
20 | */
21 | function open_file ($file)
22 | {
23 | // Open the CSV file
24 | $fH = fopen($file, "r");
25 | if ($fH === false) {
26 | die("Failed to open input file {$file} for reading\n");
27 | }
28 |
29 | return $fH;
30 | }
31 |
32 | /**
33 | * Reads a row of data from a CSV file
34 | *
35 | * @param resource $fH Open file handle
36 | * @param array $headers Indexed array of column names
37 | *
38 | * @return mixed False if no row could be read (e.g. EOF); otherwise an associative array of column name => value
39 | */
40 | function parse_csv_row ($fH, $headers)
41 | {
42 | $row = fgetcsv($fH);
43 | $data = array();
44 |
45 | if (is_array($row) === false)
46 | {
47 | return false;
48 | }
49 |
50 | foreach ($row as $key => $value) {
51 | $data[$headers[$key]] = $value;
52 | }
53 |
54 | return $data;
55 | }
56 | ?>
57 |
--------------------------------------------------------------------------------
/code/php/search1.php:
--------------------------------------------------------------------------------
1 | set_stemmer(new XapianStem("en"));
18 | $queryparser->set_stemming_strategy(XapianQueryParser::STEM_SOME);
19 | // Start of prefix configuration.
20 | $queryparser->add_prefix("title", "S");
21 | $queryparser->add_prefix("description", "XD");
22 | // End of prefix configuration.
23 |
24 | // And parse the query
25 | $query = $queryparser->parse_query($querystring);
26 |
27 | // Use an Enquire object on the database to run the query
28 | $enquire = new XapianEnquire($db);
29 | $enquire->set_query($query);
30 |
31 | // Retrieve the matches and compute start and end points
32 | $matches = $enquire->get_mset($offset, $pagesize);
33 | $match = $matches->begin();
34 | $end = $matches->end();
35 |
36 | // Use an array to record the DocIds of each match
37 | $docids = array();
38 |
39 | while (!($match->equals($end)))
40 | {
41 | // retrieve the document and its data
42 | $doc = $match->get_document();
43 | $fields = json_decode($doc->get_data());
44 | $position = $match->get_rank() + 1;
45 |
46 | // record the docid
47 | $docid = $match->get_docid();
48 | $docids[] = $docid;
49 |
50 | // display the results
51 | printf("%d: #%3.3d %s\n", $position, $docid, $fields->TITLE);
52 |
53 | // advance the MSet iterator to the next match
54 | $match->next();
55 | }
56 |
57 | // Finally, make sure we log the query and displayed results
58 | printf(
59 | "'%s'[%d:%d] = %s\n",
60 | $querystring,
61 | $offset,
62 | $offset+$pagesize,
63 | implode(" ", $docids)
64 | );
65 | }
66 | ## End of example code.
67 |
68 | if ($argc < 3) {
69 | print "Usage: php $argv[0] DBPATH QUERYTERM...\n";
70 | die();
71 | }
72 |
73 | search($argv[1], join(' ', array_slice($argv, 2)));
74 | ?>
75 |
--------------------------------------------------------------------------------
/code/php/search_filters2.php:
--------------------------------------------------------------------------------
1 | set_stemmer(new XapianStem("en"));
18 | $queryparser->set_stemming_strategy(XapianQueryParser::STEM_SOME);
19 | $queryparser->add_prefix("title", "S");
20 | $queryparser->add_prefix("description", "XD");
21 | $queryparser->add_boolean_prefix("material", "XM");
22 |
23 | // And parse the query
24 | $query = $queryparser->parse_query($querystring);
25 | ### End of example code.
26 |
27 | // Use an Enquire object on the database to run the query
28 | $enquire = new XapianEnquire($db);
29 | $enquire->set_query($query);
30 |
31 | // Retrieve the matches and compute start and end points
32 | $matches = $enquire->get_mset($offset, $pagesize);
33 | $match = $matches->begin();
34 | $end = $matches->end();
35 |
36 | // Use an array to record the DocIds of each match
37 | $docids = array();
38 |
39 | while (!($match->equals($end)))
40 | {
41 | // retrieve the document and its data
42 | $doc = $match->get_document();
43 | $fields = json_decode($doc->get_data());
44 | $position = $match->get_rank() + 1;
45 |
46 | // record the docid
47 | $docid = $match->get_docid();
48 | $docids[] = $docid;
49 |
50 | // display the results
51 | printf("%d: #%3.3d %s\n", $position, $docid, $fields->TITLE);
52 |
53 | // advance the MSet iterator to the next match
54 | $match->next();
55 | }
56 |
57 | // Finally, make sure we log the query and displayed results
58 | printf("'%s'[%d:%d] = %s\n",
59 | $querystring,
60 | $offset,
61 | $offset+$pagesize,
62 | implode(" ", $docids)
63 | );
64 | }
65 |
66 | if ($argc < 3) { // need DBPATH plus at least one query term, as in search1.php
67 | print "Usage: php $argv[0] DBPATH QUERYTERM...\n";
68 | die();
69 | }
70 | // Join all remaining arguments into a single query string.
71 | search($argv[1], join(' ', array_slice($argv, 2)));
72 | ?>
73 |
--------------------------------------------------------------------------------
/code/python/delete1.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys
4 | import xapian
5 |
6 | ### Start of example code.
7 | def delete_docs(dbpath, identifiers):
8 | # Open the database we're going to be deleting from.
9 | db = xapian.WritableDatabase(dbpath, xapian.DB_OPEN)
10 |
11 | for identifier in identifiers:
12 | idterm = u'Q' + identifier
13 | db.delete_document(idterm)
14 | ### End of example code.
15 |
16 | if len(sys.argv) < 3:
17 | print("Usage: %s DBPATH ID..." % sys.argv[0])
18 | sys.exit(1)
19 |
20 | delete_docs(dbpath = sys.argv[1], identifiers=sys.argv[2:])
21 |
--------------------------------------------------------------------------------
/code/python/index1.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | import sys
5 | import xapian
6 | from support import parse_csv_file
7 |
8 | ### Start of example code.
9 | def index(datapath, dbpath):
10 | # Create or open the database we're going to be writing to.
11 | db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)
12 |
13 | # Set up a TermGenerator that we'll use in indexing.
14 | termgenerator = xapian.TermGenerator()
15 | termgenerator.set_stemmer(xapian.Stem("en"))
16 |
17 | for fields in parse_csv_file(datapath):
18 | # 'fields' is a dictionary mapping from field name to value.
19 | # Pick out the fields we're going to index.
20 | description = fields.get('DESCRIPTION', u'')
21 | title = fields.get('TITLE', u'')
22 | identifier = fields.get('id_NUMBER', u'')
23 |
24 | # We make a document and tell the term generator to use this.
25 | doc = xapian.Document()
26 | termgenerator.set_document(doc)
27 |
28 | # Index each field with a suitable prefix.
29 | termgenerator.index_text(title, 1, 'S')
30 | termgenerator.index_text(description, 1, 'XD')
31 |
32 | # Index fields without prefixes for general search.
33 | termgenerator.index_text(title)
34 | termgenerator.increase_termpos()
35 | termgenerator.index_text(description)
36 |
37 | # Store all the fields for display purposes.
38 | doc.set_data(json.dumps(fields))
39 |
40 | # We use the identifier to ensure each object ends up in the
41 | # database only once no matter how many times we run the
42 | # indexer.
43 | idterm = u"Q" + identifier
44 | doc.add_boolean_term(idterm)
45 | db.replace_document(idterm, doc)
46 | ### End of example code.
47 |
48 | if len(sys.argv) != 3:
49 | print("Usage: %s DATAPATH DBPATH" % sys.argv[0])
50 | sys.exit(1)
51 |
52 | index(datapath = sys.argv[1], dbpath = sys.argv[2])
53 |
--------------------------------------------------------------------------------
/code/python/index_facets.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | import sys
5 | import xapian
6 | from support import parse_csv_file
7 |
8 | ### Start of example code.
9 | def index(datapath, dbpath):
10 | # Create or open the database we're going to be writing to.
11 | db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)
12 |
13 | # Set up a TermGenerator that we'll use in indexing.
14 | termgenerator = xapian.TermGenerator()
15 | termgenerator.set_stemmer(xapian.Stem("en"))
16 |
17 | for fields in parse_csv_file(datapath):
18 | # 'fields' is a dictionary mapping from field name to value.
19 | # Pick out the fields we're going to index.
20 | description = fields.get('DESCRIPTION', u'')
21 | title = fields.get('TITLE', u'')
22 | identifier = fields.get('id_NUMBER', u'')
23 | collection = fields.get('COLLECTION', u'')
24 | maker = fields.get('MAKER', u'')
25 |
26 | # We make a document and tell the term generator to use this.
27 | doc = xapian.Document()
28 | termgenerator.set_document(doc)
29 |
30 | # Index each field with a suitable prefix.
31 | termgenerator.index_text(title, 1, 'S')
32 | termgenerator.index_text(description, 1, 'XD')
33 |
34 | # Index fields without prefixes for general search.
35 | termgenerator.index_text(title)
36 | termgenerator.increase_termpos()
37 | termgenerator.index_text(description)
38 |
39 | # Add the collection as a value in slot 0.
40 | doc.add_value(0, collection)
41 |
42 | # Add the maker as a value in slot 1.
43 | doc.add_value(1, maker)
44 |
45 | # Store all the fields for display purposes.
46 | doc.set_data(json.dumps(fields))
47 |
48 | # We use the identifier to ensure each object ends up in the
49 | # database only once no matter how many times we run the
50 | # indexer.
51 | idterm = u"Q" + identifier
52 | doc.add_boolean_term(idterm)
53 | db.replace_document(idterm, doc)
54 | ### End of example code.
55 |
56 | if len(sys.argv) != 3:
57 | print("Usage: %s DATAPATH DBPATH" % sys.argv[0])
58 | sys.exit(1)
59 |
60 | index(datapath = sys.argv[1], dbpath = sys.argv[2])
61 |
--------------------------------------------------------------------------------
/code/python/index_filters.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | import sys
5 | import xapian
6 | from support import parse_csv_file
7 |
8 | def index(datapath, dbpath):
9 | # Create or open the database we're going to be writing to.
10 | db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)
11 |
12 | # Set up a TermGenerator that we'll use in indexing.
13 | termgenerator = xapian.TermGenerator()
14 | termgenerator.set_stemmer(xapian.Stem("en"))
15 |
16 | for fields in parse_csv_file(datapath):
17 | # 'fields' is a dictionary mapping from field name to value.
18 | # Pick out the fields we're going to index.
19 | description = fields.get('DESCRIPTION', u'')
20 | title = fields.get('TITLE', u'')
21 | identifier = fields.get('id_NUMBER', u'')
22 |
23 | # We make a document and tell the term generator to use this.
24 | doc = xapian.Document()
25 | termgenerator.set_document(doc)
26 |
27 | # Index each field with a suitable prefix.
28 | termgenerator.index_text(title, 1, 'S')
29 | termgenerator.index_text(description, 1, 'XD')
30 |
31 | # Index fields without prefixes for general search.
32 | termgenerator.index_text(title)
33 | termgenerator.increase_termpos()
34 | termgenerator.index_text(description)
35 |
36 | ### Start of new indexing code.
37 | # Index the MATERIALS field, splitting on semicolons.
38 | for material in fields.get('MATERIALS', u'').split(';'):
39 | material = material.strip().lower()
40 | if len(material) > 0:
41 | doc.add_boolean_term('XM' + material)
42 | ### End of new indexing code.
43 |
44 | # Store all the fields for display purposes.
45 | doc.set_data(json.dumps(fields))
46 |
47 | # We use the identifier to ensure each object ends up in the
48 | # database only once no matter how many times we run the
49 | # indexer.
50 | idterm = u"Q" + identifier
51 | doc.add_boolean_term(idterm)
52 | db.replace_document(idterm, doc)
53 |
54 | if len(sys.argv) != 3:
55 | print("Usage: %s DATAPATH DBPATH" % sys.argv[0])
56 | sys.exit(1)
57 |
58 | index(datapath = sys.argv[1], dbpath = sys.argv[2])
59 |
--------------------------------------------------------------------------------
/code/python/index_ranges.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | import sys
5 | import xapian
6 | from support import numbers_from_string, parse_csv_file
7 |
def index(datapath, dbpath):
    """Index the CSV file at datapath into the Xapian database at dbpath.

    Title and description text is indexed both with field prefixes and
    without, and up to two sortable numeric values are stored per
    document: slot 0 from MEASUREMENTS and slot 1 from DATE_MADE.
    """
    # Open the target database for writing (created if it does not exist).
    database = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

    # One English-stemming TermGenerator is reused for every record.
    indexer = xapian.TermGenerator()
    indexer.set_stemmer(xapian.Stem("en"))

    for fields in parse_csv_file(datapath):
        # Each record is a dictionary mapping field name to value.
        title = fields.get('TITLE', u'')
        description = fields.get('DESCRIPTION', u'')
        identifier = fields.get('id_NUMBER', u'')

        # Start a fresh document and point the term generator at it.
        doc = xapian.Document()
        indexer.set_document(doc)

        # Prefixed terms, so searches can be restricted to one field.
        for text, prefix in ((title, 'S'), (description, 'XD')):
            indexer.index_text(text, 1, prefix)

        # Unprefixed terms for general search, with a term-position gap
        # between the title and the description.
        indexer.index_text(title)
        indexer.increase_termpos()
        indexer.index_text(description)

        # Keep the whole record as JSON so results can be displayed.
        doc.set_data(json.dumps(fields))

        ### Start of example code.
        # Slot 0: the largest number found in MEASUREMENTS, if any.
        measurements = fields.get('MEASUREMENTS', u'')
        if len(measurements) > 0:
            sizes = numbers_from_string(measurements)
            if len(sizes) > 0:
                doc.add_value(0, xapian.sortable_serialise(max(sizes)))

        # Slot 1: the first number found in DATE_MADE, if any.
        years = numbers_from_string(fields.get('DATE_MADE', u''))
        if len(years) > 0:
            doc.add_value(1, xapian.sortable_serialise(years[0]))
        ### End of example code.

        # Replacing by the 'Q'-prefixed identifier term means each object
        # ends up in the database only once, however often we re-index.
        unique_term = u"Q" + identifier
        doc.add_boolean_term(unique_term)
        database.replace_document(unique_term, doc)
59 |
# Script entry point: run the indexer only when both DATAPATH and DBPATH
# were supplied; otherwise show the usage line and exit with status 1.
if len(sys.argv) == 3:
    index(datapath=sys.argv[1], dbpath=sys.argv[2])
else:
    print("Usage: %s DATAPATH DBPATH" % sys.argv[0])
    sys.exit(1)
65 |
--------------------------------------------------------------------------------
/code/python/index_ranges2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | from support import parse_states
5 | import sys
6 | import xapian
7 |
def index(datapath, dbpath):
    """Index the states data at datapath into the Xapian database at dbpath.

    Name, description and motto are indexed as text (prefixed and
    unprefixed); the admission date and population, when present, are
    stored as document values in slots 1, 2 and 3.
    """
    # Open the target database for writing (created if it does not exist).
    database = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

    # One English-stemming TermGenerator is reused for every record.
    indexer = xapian.TermGenerator()
    indexer.set_stemmer(xapian.Stem("en"))

    for fields in parse_states(datapath):
        # Each record is a dictionary mapping field name to value.
        name = fields.get('name', u'')
        description = fields.get('description', u'')
        motto = fields.get('motto', u'')
        order = fields.get('order', u'')
        # These two stay None when absent so they can be skipped below.
        admitted = fields.get('admitted')
        population = fields.get('population')

        # Start a fresh document and point the term generator at it.
        doc = xapian.Document()
        indexer.set_document(doc)

        ### Start of example code.
        # Prefixed terms, so searches can be restricted to one field.
        for text, prefix in ((name, 'S'), (description, 'XD'), (motto, 'XM')):
            indexer.index_text(text, 1, prefix)

        # Unprefixed terms for general search, with a term-position gap
        # between consecutive fields.
        indexer.index_text(name)
        indexer.increase_termpos()
        indexer.index_text(description)
        indexer.increase_termpos()
        indexer.index_text(motto)

        # Document values: year of admission, full admission date and
        # population.
        if admitted is not None:
            year = int(admitted[:4])
            doc.add_value(1, xapian.sortable_serialise(year))
            doc.add_value(2, admitted) # YYYYMMDD
        if population is not None:
            doc.add_value(3, xapian.sortable_serialise(int(population)))
        ### End of example code.

        # Keep the whole record as JSON so results can be displayed.
        doc.set_data(json.dumps(fields))

        # Replacing by the 'Q'-prefixed order term means each object ends
        # up in the database only once, however often we re-index.
        unique_term = u"Q" + order
        doc.add_boolean_term(unique_term)
        database.replace_document(unique_term, doc)
60 |
# Script entry point: both DATAPATH and DBPATH must be given, otherwise
# the usage line is printed and the script exits with status 1.
if len(sys.argv) == 3:
    index(datapath=sys.argv[1], dbpath=sys.argv[2])
else:
    print("Usage: %s DATAPATH DBPATH" % sys.argv[0])
    sys.exit(1)
66 |
--------------------------------------------------------------------------------
/code/python/index_sorting.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | import sys
5 | import xapian
6 | from support import parse_csv_file
7 |
def index(datapath, dbpath):
    """Index the CSV file at datapath into the Xapian database at dbpath.

    Besides the text terms for title and description, the COLLECTION and
    MAKER fields are stored as document values in slots 0 and 1.
    """
    # Open the target database for writing (created if it does not exist).
    database = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

    # One English-stemming TermGenerator is reused for every record.
    indexer = xapian.TermGenerator()
    indexer.set_stemmer(xapian.Stem("en"))

    for fields in parse_csv_file(datapath):
        # Each record is a dictionary mapping field name to value.
        title = fields.get('TITLE', u'')
        description = fields.get('DESCRIPTION', u'')
        identifier = fields.get('id_NUMBER', u'')
        collection = fields.get('COLLECTION', u'')
        maker = fields.get('MAKER', u'')

        # Start a fresh document and point the term generator at it.
        doc = xapian.Document()
        indexer.set_document(doc)

        # Prefixed terms, so searches can be restricted to one field.
        for text, prefix in ((title, 'S'), (description, 'XD')):
            indexer.index_text(text, 1, prefix)

        # Unprefixed terms for general search, with a term-position gap
        # between the title and the description.
        indexer.index_text(title)
        indexer.increase_termpos()
        indexer.index_text(description)

        ### Start of example code.
        # Value slot 0 holds the collection, value slot 1 the maker.
        for slot, value in ((0, collection), (1, maker)):
            doc.add_value(slot, value)
        ### End of example code.

        # Keep the whole record as JSON so results can be displayed.
        doc.set_data(json.dumps(fields))

        # Replacing by the 'Q'-prefixed identifier term means each object
        # ends up in the database only once, however often we re-index.
        unique_term = u"Q" + identifier
        doc.add_boolean_term(unique_term)
        database.replace_document(unique_term, doc)
55 |
# Script entry point: index only when DATAPATH and DBPATH were both
# supplied; otherwise print the usage line and exit with status 1.
if len(sys.argv) == 3:
    index(datapath=sys.argv[1], dbpath=sys.argv[2])
else:
    print("Usage: %s DATAPATH DBPATH" % sys.argv[0])
    sys.exit(1)
61 |
--------------------------------------------------------------------------------
/code/python/postingsource.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import xapian
4 |
5 | ### Start of class header and constructor.
class ExternalWeightPostingSource(xapian.PostingSource):
    """
    A Xapian posting source returning weights from an external source.

    The source iterates over the postlist of the empty term (kept in
    ``alldocs``) and asks ``wtsource`` for the weight of each document.
    """
    def __init__(self, wtsource):
        # wtsource must provide get_maxweight() and get_weight(doc); both
        # are called by the methods of this class.
        xapian.PostingSource.__init__(self)
        self.wtsource = wtsource
### End of class header and constructor.

### Start of init.
    def init(self, db):
        """Attach the source to db and begin a new pass over its documents."""
        self.db = db
        self.alldocs = db.postlist('')
        self.set_maxweight(self.wtsource.get_maxweight())
### End of init.

### Start of termfreq methods.
    # All three term-frequency figures are reported as the document count
    # of the database.
    def get_termfreq_min(self): return self.db.get_doccount()
    def get_termfreq_est(self): return self.db.get_doccount()
    def get_termfreq_max(self): return self.db.get_doccount()
### End of termfreq methods.

### Start of get_weight.
    def get_weight(self):
        """Return the external weight of the document at the current position."""
        doc = self.db.get_document(self.current.docid)
        return self.wtsource.get_weight(doc)
### End of get_weight.

### Start of get_docid.
    def get_docid(self):
        """Return the document id at the current position."""
        return self.current.docid
### End of get_docid.

### Start of at_end.
    def at_end(self):
        """Return True once next() or skip_to() has run off the end."""
        return self.current is None
### End of at_end.

### Start of next.
    def next(self, minweight):
        """Advance to the next document; minweight is not used."""
        try:
            # The next() builtin is used instead of the Python-2-only
            # .next() method so either iterator protocol works.  Inside
            # this method the bare name still refers to the builtin.
            self.current = next(self.alldocs)
        except StopIteration:
            self.current = None
### End of next.

### Start of skip_to.
    def skip_to(self, docid, minweight):
        """Advance via the postlist's skip_to(docid); minweight is not used."""
        try:
            self.current = self.alldocs.skip_to(docid)
        except StopIteration:
            self.current = None
### End of skip_to.
59 |
--------------------------------------------------------------------------------
/code/python/search1.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | import sys
5 | import xapian
6 | import support
7 |
8 | ### Start of example code.
def search(dbpath, querystring, offset=0, pagesize=10):
    """Run querystring against the database at dbpath and print the matches.

    offset is the starting point within the result set and pagesize the
    number of records to retrieve.  The displayed document ids are
    handed to support.log_matches.
    """
    # The database is opened read-only for searching.
    database = xapian.Database(dbpath)

    # Query parser with English stemming and the field prefixes.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    # Start of prefix configuration.
    parser.add_prefix("title", "S")
    parser.add_prefix("description", "XD")
    # End of prefix configuration.

    # Turn the user's query string into a Query object.
    query = parser.parse_query(querystring)

    # The Enquire object runs the query against the database.
    enquire = xapian.Enquire(database)
    enquire.set_query(query)

    # Print one line per match and remember which documents were shown.
    shown = []
    for match in enquire.get_mset(offset, pagesize):
        docid = match.docid
        fields = json.loads(match.document.get_data())
        summary = {
            'rank': match.rank + 1,
            'docid': docid,
            'title': fields.get('TITLE', u''),
            }
        print(u"%(rank)i: #%(docid)3.3i %(title)s" % summary)
        shown.append(docid)

    # Log the query together with the results that were displayed.
    support.log_matches(querystring, offset, pagesize, shown)
45 | ### End of example code.
46 |
# Script entry point: DBPATH plus at least one query term is required;
# all query terms are joined with spaces into a single query string.
if len(sys.argv) >= 3:
    search(dbpath=sys.argv[1], querystring=" ".join(sys.argv[2:]))
else:
    print("Usage: %s DBPATH QUERYTERM..." % sys.argv[0])
    sys.exit(1)
52 |
--------------------------------------------------------------------------------
/code/python/search_facets.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | import sys
5 | import xapian
6 | import support
7 |
def search(dbpath, querystring, offset=0, pagesize=10):
    """Run querystring against dbpath, printing the matches and facet counts.

    offset is the starting point within the result set and pagesize the
    number of records to retrieve.  The displayed document ids are
    handed to support.log_matches.
    """
    # The database is opened read-only for searching.
    database = xapian.Database(dbpath)

    # Query parser with English stemming and the field prefixes.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    for field, prefix in (("title", "S"), ("description", "XD")):
        parser.add_prefix(field, prefix)

    # Turn the user's query string into a Query object.
    query = parser.parse_query(querystring)

    # The Enquire object runs the query against the database.
    enquire = xapian.Enquire(database)
    enquire.set_query(query)

    # Document ids of the matches that get printed.
    shown = []

    ### Start of example code.
    # Attach a match spy that counts the values found in slot 1 (MAKER).
    spy = xapian.ValueCountMatchSpy(1)
    enquire.add_matchspy(spy)

    # The third get_mset argument (100) is passed through unchanged.
    for match in enquire.get_mset(offset, pagesize, 100):
        docid = match.docid
        fields = json.loads(match.document.get_data())
        summary = {
            'rank': match.rank + 1,
            'docid': docid,
            'title': fields.get('TITLE', u''),
            }
        print(u"%(rank)i: #%(docid)3.3i %(title)s" % summary)
        shown.append(docid)

    # Report each value the spy saw together with its count.
    for facet in spy.values():
        counts = {
            'term' : facet.term,
            'count' : facet.termfreq
        }
        print("Facet: %(term)s; count: %(count)i" % counts)

    # Log the query together with the results that were displayed.
    support.log_matches(querystring, offset, pagesize, shown)
    ### End of example code.
56 |
# Script entry point: DBPATH plus at least one query term is required;
# all query terms are joined with spaces into a single query string.
if len(sys.argv) >= 3:
    search(dbpath=sys.argv[1], querystring=" ".join(sys.argv[2:]))
else:
    print("Usage: %s DBPATH QUERYTERM..." % sys.argv[0])
    sys.exit(1)
62 |
--------------------------------------------------------------------------------
/code/python/search_filters.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | import sys
5 | import xapian
6 | import support
7 |
def search(dbpath, querystring, materials, offset=0, pagesize=10):
    """Run querystring against dbpath, optionally filtered by materials.

    materials is a sequence of material names; when it is non-empty only
    documents carrying at least one matching 'XM' term are returned.
    offset is the starting point within the result set and pagesize the
    number of records to retrieve.  The displayed document ids are
    handed to support.log_matches.
    """
    # The database is opened read-only for searching.
    database = xapian.Database(dbpath)

    ### Start of example code.
    # Query parser with English stemming and the field prefixes.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    for field, prefix in (("title", "S"), ("description", "XD")):
        parser.add_prefix(field, prefix)

    # Turn the user's query string into a Query object.
    query = parser.parse_query(querystring)

    if len(materials) > 0:
        # One term query per requested material (lower-cased, 'XM'
        # prefix), OR-ed together so that any one of them suffices.
        any_material = xapian.Query(
            xapian.Query.OP_OR,
            [xapian.Query('XM' + name.lower()) for name in materials])

        # Restrict the main query to documents matching the filter.
        query = xapian.Query(xapian.Query.OP_FILTER, query, any_material)
    ### End of example code.

    # The Enquire object runs the query against the database.
    enquire = xapian.Enquire(database)
    enquire.set_query(query)

    # Print one line per match and remember which documents were shown.
    shown = []
    for match in enquire.get_mset(offset, pagesize):
        docid = match.docid
        fields = json.loads(match.document.get_data())
        summary = {
            'rank': match.rank + 1,
            'docid': docid,
            'title': fields.get('TITLE', u''),
            }
        print(u"%(rank)i: #%(docid)3.3i %(title)s" % summary)
        shown.append(docid)

    # Log the query together with the results that were displayed.
    support.log_matches(querystring, offset, pagesize, shown)
60 |
# Script entry point: DBPATH and QUERY are mandatory; any further
# arguments are passed to search() as the materials list.
if len(sys.argv) >= 3:
    search(dbpath=sys.argv[1], querystring=sys.argv[2],
           materials=sys.argv[3:])
else:
    print("Usage: %s DBPATH QUERY [MATERIALS...]" % sys.argv[0])
    sys.exit(1)
67 |
--------------------------------------------------------------------------------
/code/python/search_filters2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | import sys
5 | import xapian
6 | import support
7 |
def search(dbpath, querystring, offset=0, pagesize=10):
    """Run querystring against the database at dbpath and print the matches.

    The query parser also accepts a boolean "material" prefix, mapped to
    'XM' terms.  offset is the starting point within the result set and
    pagesize the number of records to retrieve.  The displayed document
    ids are handed to support.log_matches.
    """
    # The database is opened read-only for searching.
    database = xapian.Database(dbpath)

    ### Start of example code.
    # Query parser with English stemming, the two free-text prefixes and
    # the boolean prefix for materials.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    for field, prefix in (("title", "S"), ("description", "XD")):
        parser.add_prefix(field, prefix)
    parser.add_boolean_prefix("material", "XM")

    # Turn the user's query string into a Query object.
    query = parser.parse_query(querystring)
    ### End of example code.

    # The Enquire object runs the query against the database.
    enquire = xapian.Enquire(database)
    enquire.set_query(query)

    # Print one line per match and remember which documents were shown.
    shown = []
    for match in enquire.get_mset(offset, pagesize):
        docid = match.docid
        fields = json.loads(match.document.get_data())
        summary = {
            'rank': match.rank + 1,
            'docid': docid,
            'title': fields.get('TITLE', u''),
            }
        print(u"%(rank)i: #%(docid)3.3i %(title)s" % summary)
        shown.append(docid)

    # Log the query together with the results that were displayed.
    support.log_matches(querystring, offset, pagesize, shown)
44 |
# Script entry point: DBPATH plus at least one query term is required;
# all query terms are joined with spaces into a single query string.
if len(sys.argv) >= 3:
    search(dbpath=sys.argv[1], querystring=" ".join(sys.argv[2:]))
else:
    print("Usage: %s DBPATH QUERYTERM..." % sys.argv[0])
    sys.exit(1)
50 |
--------------------------------------------------------------------------------
/code/python/search_ranges.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | import sys
5 | import xapian
6 | import support
7 |
def search(dbpath, querystring, offset=0, pagesize=10):
    """Run querystring (which may contain ranges) against dbpath and print matches.

    Two number range processors are registered: one on value slot 0 for
    ranges carrying an 'mm' suffix, and one on value slot 1.  offset is
    the starting point within the result set and pagesize the number of
    records to retrieve.  The displayed document ids are handed to
    support.log_matches.
    """
    # The database is opened read-only for searching.
    database = xapian.Database(dbpath)

    # Query parser with English stemming and the field prefixes.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    for field, prefix in (("title", "S"), ("description", "XD")):
        parser.add_prefix(field, prefix)

    # Register the range processors in the same order as before: the
    # 'mm'-suffixed one on slot 0 first, then the plain one on slot 1.
    size_ranges = xapian.NumberRangeProcessor(0, 'mm', xapian.RP_SUFFIX)
    parser.add_rangeprocessor(size_ranges)
    year_ranges = xapian.NumberRangeProcessor(1)
    parser.add_rangeprocessor(year_ranges)

    # Turn the user's query string into a Query object.
    query = parser.parse_query(querystring)

    # The Enquire object runs the query against the database.
    enquire = xapian.Enquire(database)
    enquire.set_query(query)

    # Print each match and remember which documents were shown.
    shown = []
    for match in enquire.get_mset(offset, pagesize):
        docid = match.docid
        fields = json.loads(match.document.get_data())
        summary = {
            'rank': match.rank + 1,
            'docid': docid,
            'measurements': fields.get('MEASUREMENTS', u''),
            'date': fields.get('DATE_MADE', u''),
            'title': fields.get('TITLE', u''),
            }
        print(u"%(rank)i: #%(docid)3.3i (%(date)s) %(measurements)s\n %(title)s" % summary)
        shown.append(docid)

    # Log the query together with the results that were displayed.
    support.log_matches(querystring, offset, pagesize, shown)
51 |
# Script entry point: DBPATH plus at least one query term is required;
# all query terms are joined with spaces into a single query string.
if len(sys.argv) >= 3:
    search(dbpath=sys.argv[1], querystring=" ".join(sys.argv[2:]))
else:
    print("Usage: %s DBPATH QUERYTERM..." % sys.argv[0])
    sys.exit(1)
57 |
--------------------------------------------------------------------------------
/code/python/search_sorting.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | import sys
5 | import xapian
6 | import support
7 |
def search(dbpath, querystring, offset=0, pagesize=10):
    """Run querystring against dbpath, sorted by value slot 1 then relevance.

    offset is the starting point within the result set and pagesize the
    number of records to retrieve.  The displayed document ids are
    handed to support.log_matches.
    """
    # The database is opened read-only for searching.
    database = xapian.Database(dbpath)

    # Query parser with English stemming and the field prefixes.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    for field, prefix in (("title", "S"), ("description", "XD")):
        parser.add_prefix(field, prefix)

    # Turn the user's query string into a Query object.
    query = parser.parse_query(querystring)

    # The Enquire object runs the query against the database.
    enquire = xapian.Enquire(database)
    enquire.set_query(query)
    # Start of example code.
    enquire.set_sort_by_value_then_relevance(1, False)
    # End of example code.

    # Print each match and remember which documents were shown.
    shown = []
    for match in enquire.get_mset(offset, pagesize):
        docid = match.docid
        fields = json.loads(match.document.get_data())
        summary = {
            'rank': match.rank + 1,
            'docid': docid,
            'name': fields.get('name', u''),
            'date': support.format_date(fields.get('admitted', u'')),
            'pop': support.format_numeral(int(fields.get('population', 0))),
            'lat': fields.get('latitude', u''),
            'lon': fields.get('longitude', u''),
            }
        print(u"%(rank)i: #%(docid)3.3i %(name)s %(date)s\n Population %(pop)s" % summary)
        shown.append(docid)

    # Log the query together with the results that were displayed.
    support.log_matches(querystring, offset, pagesize, shown)
49 |
# Script entry point: DBPATH plus at least one query term is required;
# all query terms are joined with spaces into a single query string.
if len(sys.argv) >= 3:
    search(dbpath=sys.argv[1], querystring=" ".join(sys.argv[2:]))
else:
    print("Usage: %s DBPATH QUERYTERM..." % sys.argv[0])
    sys.exit(1)
55 |
--------------------------------------------------------------------------------
/code/python/search_sorting2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | import sys
5 | import xapian
6 | import support
7 |
def search(dbpath, querystring, offset=0, pagesize=10):
    """Run querystring against dbpath, sorted by a multi-value key then relevance.

    The sort key is built from value slots 1 and 3.  offset is the
    starting point within the result set and pagesize the number of
    records to retrieve.  The displayed document ids are handed to
    support.log_matches.
    """
    # The database is opened read-only for searching.
    database = xapian.Database(dbpath)

    # Query parser with English stemming and the field prefixes.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    for field, prefix in (("title", "S"), ("description", "XD")):
        parser.add_prefix(field, prefix)

    # Turn the user's query string into a Query object.
    query = parser.parse_query(querystring)

    # The Enquire object runs the query against the database.
    enquire = xapian.Enquire(database)
    enquire.set_query(query)
    # Start of example code.
    keymaker = xapian.MultiValueKeyMaker()
    for slot, reverse in ((1, False), (3, True)):
        keymaker.add_value(slot, reverse)
    enquire.set_sort_by_key_then_relevance(keymaker, False)
    # End of example code.

    # Print each match and remember which documents were shown.
    shown = []
    for match in enquire.get_mset(offset, pagesize):
        docid = match.docid
        fields = json.loads(match.document.get_data())
        summary = {
            'rank': match.rank + 1,
            'docid': docid,
            'name': fields.get('name', u''),
            'date': support.format_date(fields.get('admitted', u'')),
            'pop': support.format_numeral(int(fields.get('population', 0))),
            'lat': fields.get('latitude', u''),
            'lon': fields.get('longitude', u''),
            }
        print(u"%(rank)i: #%(docid)3.3i %(name)s %(date)s\n Population %(pop)s" % summary)
        shown.append(docid)

    # Log the query together with the results that were displayed.
    support.log_matches(querystring, offset, pagesize, shown)
52 |
# Script entry point: DBPATH plus at least one query term is required;
# all query terms are joined with spaces into a single query string.
if len(sys.argv) >= 3:
    search(dbpath=sys.argv[1], querystring=" ".join(sys.argv[2:]))
else:
    print("Usage: %s DBPATH QUERYTERM..." % sys.argv[0])
    sys.exit(1)
58 |
--------------------------------------------------------------------------------
/code/python/search_sorting3.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | import sys
5 | import xapian
6 | import support
7 |
def search(dbpath, querystring, offset=0, pagesize=10):
    """Run querystring against dbpath, sorted by a custom distance key.

    Matches are ordered by a key computed from value slot 4 of each
    document, then by relevance.  offset is the starting point within
    the result set and pagesize the number of records to retrieve.  The
    displayed document ids are handed to support.log_matches.
    """
    # The database is opened read-only for searching.
    database = xapian.Database(dbpath)

    # Query parser with English stemming and the field prefixes.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    for field, prefix in (("title", "S"), ("description", "XD")):
        parser.add_prefix(field, prefix)

    # Turn the user's query string into a Query object.
    query = parser.parse_query(querystring)

    # The Enquire object runs the query against the database.
    enquire = xapian.Enquire(database)
    enquire.set_query(query)
    # Start of example code.
    class DistanceKeyMaker(xapian.KeyMaker):
        def __call__(self, doc):
            # The key is a sortable string representing the distance
            # from Washington, DC to the coordinates stored in value
            # slot 4 (the middle of this state).
            coords = map(float, doc.get_value(4).split(','))
            washington = (38.012, -77.037)
            distance = support.distance_between_coords(coords, washington)
            return xapian.sortable_serialise(distance)
    enquire.set_sort_by_key_then_relevance(DistanceKeyMaker(), False)
    # End of example code.

    # Print each match and remember which documents were shown.
    shown = []
    for match in enquire.get_mset(offset, pagesize):
        docid = match.docid
        fields = json.loads(match.document.get_data())
        summary = {
            'rank': match.rank + 1,
            'docid': docid,
            'name': fields.get('name', u''),
            'date': support.format_date(fields.get('admitted', u'')),
            'pop': support.format_numeral(int(fields.get('population', 0))),
            'lat': fields.get('latitude', u''),
            'lon': fields.get('longitude', u''),
            }
        print(u"%(rank)i: #%(docid)3.3i %(name)s %(date)s\n Population %(pop)s" % summary)
        shown.append(docid)

    # Log the query together with the results that were displayed.
    support.log_matches(querystring, offset, pagesize, shown)
59 |
# Script entry point: DBPATH plus at least one query term is required;
# all query terms are joined with spaces into a single query string.
if len(sys.argv) >= 3:
    search(dbpath=sys.argv[1], querystring=" ".join(sys.argv[2:]))
else:
    print("Usage: %s DBPATH QUERYTERM..." % sys.argv[0])
    sys.exit(1)
65 |
--------------------------------------------------------------------------------
/code/python/search_synonyms.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | import sys
5 | import xapian
6 | import support
7 |
8 | ### Start of example code.
def search(dbpath, querystring, offset=0, pagesize=10):
    """Run querystring against dbpath with synonym expansion and print matches.

    The database is opened writable because the synonym "calendar" for
    "time" is added to it before the query is parsed.  offset is the
    starting point within the result set and pagesize the number of
    records to retrieve.  The displayed document ids are handed to
    support.log_matches.
    """
    # A writable handle is needed for add_synonym() below.
    database = xapian.WritableDatabase(dbpath)

    # Start of adding synonyms
    database.add_synonym("time", "calendar")
    # End of adding synonyms

    # Query parser with English stemming and the field prefixes.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    for field, prefix in (("title", "S"), ("description", "XD")):
        parser.add_prefix(field, prefix)

    # Start of set database
    parser.set_database(database)
    # End of set database

    # Parse the query string with the synonym flag set.
    query = parser.parse_query(querystring, parser.FLAG_SYNONYM)

    # The Enquire object runs the query against the database.
    enquire = xapian.Enquire(database)
    enquire.set_query(query)

    # Print one line per match and remember which documents were shown.
    shown = []
    for match in enquire.get_mset(offset, pagesize):
        docid = match.docid
        fields = json.loads(match.document.get_data())
        summary = {
            'rank': match.rank + 1,
            'docid': docid,
            'title': fields.get('TITLE', u''),
            }
        print(u"%(rank)i: #%(docid)3.3i %(title)s" % summary)
        shown.append(docid)

    # Log the query together with the results that were displayed.
    support.log_matches(querystring, offset, pagesize, shown)
51 | ### End of example code.
52 |
# Script entry point: DBPATH plus at least one query term is required;
# all query terms are joined with spaces into a single query string.
if len(sys.argv) >= 3:
    search(dbpath=sys.argv[1], querystring=" ".join(sys.argv[2:]))
else:
    print("Usage: %s DBPATH QUERYTERM..." % sys.argv[0])
    sys.exit(1)
58 |
--------------------------------------------------------------------------------
/code/python3/delete1.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys
4 | import xapian
5 |
6 | ### Start of example code.
def delete_docs(dbpath, identifiers):
    """Delete the documents named by identifiers from the database at dbpath.

    Each identifier is turned into its 'Q'-prefixed term, and that term
    is passed to delete_document().
    """
    # The database must already exist: it is opened with DB_OPEN.
    database = xapian.WritableDatabase(dbpath, xapian.DB_OPEN)

    for unique_term in (u'Q' + identifier for identifier in identifiers):
        database.delete_document(unique_term)
14 | ### End of example code.
15 |
# Script entry point: DBPATH plus at least one identifier is required;
# otherwise the usage line is printed and the script exits with status 1.
if len(sys.argv) >= 3:
    delete_docs(dbpath=sys.argv[1], identifiers=sys.argv[2:])
else:
    print("Usage: %s DBPATH ID..." % sys.argv[0])
    sys.exit(1)
21 |
--------------------------------------------------------------------------------
/code/python3/index1.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | import sys
5 | import xapian
6 | from support import parse_csv_file
7 |
8 | ### Start of example code.
def index(datapath, dbpath):
    """Index the CSV file at datapath into the Xapian database at dbpath.

    For every record the title and description are indexed as text
    (prefixed and unprefixed) and the whole record is stored as JSON
    document data.
    """
    # Open the target database for writing (created if it does not exist).
    database = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

    # One English-stemming TermGenerator is reused for every record.
    indexer = xapian.TermGenerator()
    indexer.set_stemmer(xapian.Stem("en"))

    for fields in parse_csv_file(datapath):
        # Each record is a dictionary mapping field name to value.
        title = fields.get('TITLE', u'')
        description = fields.get('DESCRIPTION', u'')
        identifier = fields.get('id_NUMBER', u'')

        # Start a fresh document and point the term generator at it.
        doc = xapian.Document()
        indexer.set_document(doc)

        # Prefixed terms, so searches can be restricted to one field.
        for text, prefix in ((title, 'S'), (description, 'XD')):
            indexer.index_text(text, 1, prefix)

        # Unprefixed terms for general search, with a term-position gap
        # between the title and the description.
        indexer.index_text(title)
        indexer.increase_termpos()
        indexer.index_text(description)

        # Keep the whole record as JSON so results can be displayed.
        doc.set_data(json.dumps(fields))

        # Replacing by the 'Q'-prefixed identifier term means each object
        # ends up in the database only once, however often we re-index.
        unique_term = u"Q" + identifier
        doc.add_boolean_term(unique_term)
        database.replace_document(unique_term, doc)
46 | ### End of example code.
47 |
# Script entry point: exactly two arguments (DATAPATH and DBPATH) are
# required; anything else prints the usage line and exits with status 1.
if len(sys.argv) == 3:
    index(datapath=sys.argv[1], dbpath=sys.argv[2])
else:
    print("Usage: %s DATAPATH DBPATH" % sys.argv[0])
    sys.exit(1)
53 |
--------------------------------------------------------------------------------
/code/python3/index_facets.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | import sys
5 | import xapian
6 | from support import parse_csv_file
7 |
8 | ### Start of example code.
def index(datapath, dbpath):
    """Index the CSV file at datapath into the Xapian database at dbpath.

    Besides the text terms for title and description, the COLLECTION and
    MAKER fields are stored as document values in slots 0 and 1.
    """
    # Open the target database for writing (created if it does not exist).
    database = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

    # One English-stemming TermGenerator is reused for every record.
    indexer = xapian.TermGenerator()
    indexer.set_stemmer(xapian.Stem("en"))

    for fields in parse_csv_file(datapath):
        # Each record is a dictionary mapping field name to value.
        title = fields.get('TITLE', u'')
        description = fields.get('DESCRIPTION', u'')
        identifier = fields.get('id_NUMBER', u'')
        collection = fields.get('COLLECTION', u'')
        maker = fields.get('MAKER', u'')

        # Start a fresh document and point the term generator at it.
        doc = xapian.Document()
        indexer.set_document(doc)

        # Prefixed terms, so searches can be restricted to one field.
        for text, prefix in ((title, 'S'), (description, 'XD')):
            indexer.index_text(text, 1, prefix)

        # Unprefixed terms for general search, with a term-position gap
        # between the title and the description.
        indexer.index_text(title)
        indexer.increase_termpos()
        indexer.index_text(description)

        # Value slot 0 holds the collection, value slot 1 the maker.
        for slot, value in ((0, collection), (1, maker)):
            doc.add_value(slot, value)

        # Keep the whole record as JSON so results can be displayed.
        doc.set_data(json.dumps(fields))

        # Replacing by the 'Q'-prefixed identifier term means each object
        # ends up in the database only once, however often we re-index.
        unique_term = u"Q" + identifier
        doc.add_boolean_term(unique_term)
        database.replace_document(unique_term, doc)
54 | ### End of example code.
55 |
# Command-line entry point: exactly two arguments (DATAPATH and DBPATH) are
# required; anything else prints the usage line and exits with status 1.
if len(sys.argv) == 3:
    index(datapath=sys.argv[1], dbpath=sys.argv[2])
else:
    print("Usage: %s DATAPATH DBPATH" % sys.argv[0])
    sys.exit(1)
61 |
--------------------------------------------------------------------------------
/code/python3/index_filters.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | import sys
5 | import xapian
6 | from support import parse_csv_file
7 |
def index(datapath, dbpath):
    """Index the CSV records in `datapath` into the database at `dbpath`.

    In addition to the free-text title and description, every entry of the
    semicolon-separated MATERIALS field is added as an 'XM'-prefixed
    boolean term. The complete record is kept as JSON document data.
    """
    # Open the target database for writing, creating it when missing.
    database = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

    # A single English-stemming TermGenerator is reused for all documents.
    indexer = xapian.TermGenerator()
    indexer.set_stemmer(xapian.Stem("en"))

    for record in parse_csv_file(datapath):
        # `record` maps field names to values; absent fields default to ''.
        title = record.get('TITLE', u'')
        description = record.get('DESCRIPTION', u'')
        identifier = record.get('id_NUMBER', u'')

        # Fresh document, wired up as the generator's output.
        document = xapian.Document()
        indexer.set_document(document)

        # Prefixed terms allow field-restricted searches.
        indexer.index_text(title, 1, 'S')
        indexer.index_text(description, 1, 'XD')

        # Unprefixed terms serve general searches; bump the term position
        # between the two fields.
        indexer.index_text(title)
        indexer.increase_termpos()
        indexer.index_text(description)

### Start of new indexing code.
        # Split MATERIALS on semicolons; each entry is trimmed and
        # lower-cased, and blank entries are skipped.
        for entry in record.get('MATERIALS', u'').split(';'):
            material = entry.strip().lower()
            if material:
                document.add_boolean_term('XM' + material)
### End of new indexing code.

        # Keep every field so results can be displayed later.
        document.set_data(json.dumps(record))

        # Replacing by the unique 'Q' term means re-running the indexer
        # never duplicates an object.
        unique_term = u"Q" + identifier
        document.add_boolean_term(unique_term)
        database.replace_document(unique_term, document)
53 |
# Command-line entry point: exactly two arguments (DATAPATH and DBPATH) are
# required; anything else prints the usage line and exits with status 1.
if len(sys.argv) == 3:
    index(datapath=sys.argv[1], dbpath=sys.argv[2])
else:
    print("Usage: %s DATAPATH DBPATH" % sys.argv[0])
    sys.exit(1)
59 |
--------------------------------------------------------------------------------
/code/python3/index_ranges.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | import sys
5 | import xapian
6 | from support import numbers_from_string, parse_csv_file
7 |
def index(datapath, dbpath):
    """Index the CSV records in `datapath` into the database at `dbpath`.

    Besides the free-text title and description, two sortable numeric
    values are stored per document: slot 0 gets the maximum of the numbers
    extracted from MEASUREMENTS and slot 1 the first number extracted from
    DATE_MADE. A slot is left unset when no number is extracted.
    """
    # Open the target database for writing, creating it when missing.
    database = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

    # A single English-stemming TermGenerator is reused for all documents.
    indexer = xapian.TermGenerator()
    indexer.set_stemmer(xapian.Stem("en"))

    for record in parse_csv_file(datapath):
        # `record` maps field names to values; absent fields default to ''.
        title = record.get('TITLE', u'')
        description = record.get('DESCRIPTION', u'')
        identifier = record.get('id_NUMBER', u'')

        # Fresh document, wired up as the generator's output.
        document = xapian.Document()
        indexer.set_document(document)

        # Prefixed terms allow field-restricted searches.
        indexer.index_text(title, 1, 'S')
        indexer.index_text(description, 1, 'XD')

        # Unprefixed terms serve general searches; bump the term position
        # between the two fields.
        indexer.index_text(title)
        indexer.increase_termpos()
        indexer.index_text(description)

        # Keep every field so results can be displayed later.
        document.set_data(json.dumps(record))

### Start of example code.
        # Slot 0: largest number pulled out of MEASUREMENTS (the helper is
        # only consulted when the field is non-empty).
        measurements = record.get('MEASUREMENTS', u'')
        sizes = numbers_from_string(measurements) if len(measurements) > 0 else []
        if len(sizes) > 0:
            document.add_value(0, xapian.sortable_serialise(max(sizes)))

        # Slot 1: first number pulled out of DATE_MADE.
        years = numbers_from_string(record.get('DATE_MADE', u''))
        if len(years) > 0:
            document.add_value(1, xapian.sortable_serialise(years[0]))
### End of example code.

        # Replacing by the unique 'Q' term means re-running the indexer
        # never duplicates an object.
        unique_term = u"Q" + identifier
        document.add_boolean_term(unique_term)
        database.replace_document(unique_term, document)
59 |
# Command-line entry point: exactly two arguments (DATAPATH and DBPATH) are
# required; anything else prints the usage line and exits with status 1.
if len(sys.argv) == 3:
    index(datapath=sys.argv[1], dbpath=sys.argv[2])
else:
    print("Usage: %s DATAPATH DBPATH" % sys.argv[0])
    sys.exit(1)
65 |
--------------------------------------------------------------------------------
/code/python3/index_ranges2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | from support import parse_states
5 | import sys
6 | import xapian
7 |
def index(datapath, dbpath):
    """Index the state records in `datapath` into the database at `dbpath`.

    Name, description and motto are indexed as text (with and without
    prefixes). When present, the admission year goes to value slot 1, the
    raw admission date to slot 2 and the population to slot 3. The complete
    record is kept as JSON document data.
    """
    # Open the target database for writing, creating it when missing.
    database = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

    # A single English-stemming TermGenerator is reused for all documents.
    indexer = xapian.TermGenerator()
    indexer.set_stemmer(xapian.Stem("en"))

    for state in parse_states(datapath):
        # `state` maps field names to values. Text fields default to '';
        # the two optional numeric fields default to None.
        name = state.get('name', u'')
        description = state.get('description', u'')
        motto = state.get('motto', u'')
        order = state.get('order', u'')
        admitted = state.get('admitted')
        population = state.get('population')

        # Fresh document, wired up as the generator's output.
        document = xapian.Document()
        indexer.set_document(document)

### Start of example code.
        # Prefixed terms allow field-restricted searches.
        indexer.index_text(name, 1, 'S')
        indexer.index_text(description, 1, 'XD')
        indexer.index_text(motto, 1, 'XM')

        # Unprefixed terms serve general searches; bump the term position
        # between consecutive fields.
        indexer.index_text(name)
        indexer.increase_termpos()
        indexer.index_text(description)
        indexer.increase_termpos()
        indexer.index_text(motto)

        # Document values: year (first four characters of the admission
        # date), the date itself, and the population.
        if admitted is not None:
            admission_year = int(admitted[:4])
            document.add_value(1, xapian.sortable_serialise(admission_year))
            document.add_value(2, admitted) # YYYYMMDD
        if population is not None:
            document.add_value(3, xapian.sortable_serialise(int(population)))
### End of example code.

        # Keep every field so results can be displayed later.
        document.set_data(json.dumps(state))

        # Replacing by the unique 'Q' term (built from the order field)
        # means re-running the indexer never duplicates an object.
        unique_term = u"Q" + order
        document.add_boolean_term(unique_term)
        database.replace_document(unique_term, document)
60 |
# Command-line entry point: exactly two arguments (DATAPATH and DBPATH) are
# required; anything else prints the usage line and exits with status 1.
if len(sys.argv) == 3:
    index(datapath=sys.argv[1], dbpath=sys.argv[2])
else:
    print("Usage: %s DATAPATH DBPATH" % sys.argv[0])
    sys.exit(1)
66 |
--------------------------------------------------------------------------------
/code/python3/index_sorting.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | import sys
5 | import xapian
6 | from support import parse_csv_file
7 |
def index(datapath, dbpath):
    """Index the CSV records in `datapath` into the database at `dbpath`.

    Title and description are indexed as text (with and without prefixes),
    COLLECTION and MAKER are stored in value slots 0 and 1, and the
    complete record is kept as JSON document data.
    """
    # Open the target database for writing, creating it when missing.
    database = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

    # A single English-stemming TermGenerator is reused for all documents.
    indexer = xapian.TermGenerator()
    indexer.set_stemmer(xapian.Stem("en"))

    for record in parse_csv_file(datapath):
        # `record` maps field names to values; absent fields default to ''.
        title, description, identifier, collection, maker = (
            record.get(key, u'')
            for key in ('TITLE', 'DESCRIPTION', 'id_NUMBER',
                        'COLLECTION', 'MAKER')
        )

        # Fresh document, wired up as the generator's output.
        document = xapian.Document()
        indexer.set_document(document)

        # Prefixed terms allow field-restricted searches.
        indexer.index_text(title, 1, 'S')
        indexer.index_text(description, 1, 'XD')

        # Unprefixed terms serve general searches; bump the term position
        # between the two fields.
        indexer.index_text(title)
        indexer.increase_termpos()
        indexer.index_text(description)

### Start of example code.
        # slot 0 receives the collection
        document.add_value(0, collection)

        # slot 1 receives the maker
        document.add_value(1, maker)
### End of example code.

        # Keep every field so results can be displayed later.
        document.set_data(json.dumps(record))

        # Replacing by the unique 'Q' term means re-running the indexer
        # never duplicates an object.
        unique_term = u"Q" + identifier
        document.add_boolean_term(unique_term)
        database.replace_document(unique_term, document)
55 |
# Command-line entry point: exactly two arguments (DATAPATH and DBPATH) are
# required; anything else prints the usage line and exits with status 1.
if len(sys.argv) == 3:
    index(datapath=sys.argv[1], dbpath=sys.argv[2])
else:
    print("Usage: %s DATAPATH DBPATH" % sys.argv[0])
    sys.exit(1)
61 |
--------------------------------------------------------------------------------
/code/python3/postingsource.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import xapian
4 |
5 | ### Start of class header and constructor.
class ExternalWeightPostingSource(xapian.PostingSource):
    """
    A Xapian posting source returning weights from an external source.

    The source walks every document in the database and delegates the
    weight of each one to `wtsource`, an object that must provide
    `get_maxweight()` and `get_weight(doc)`.
    """
    def __init__(self, wtsource):
        """Store the external weight provider `wtsource`."""
        xapian.PostingSource.__init__(self)
        self.wtsource = wtsource
### End of class header and constructor.

### Start of init.
    def init(self, db):
        """Bind to `db`: start a fresh iteration over all of its documents
        and publish the maximum weight reported by the weight source."""
        self.db = db
        # The empty term's posting list is used to iterate all documents.
        self.alldocs = db.postlist('')
        self.set_maxweight(self.wtsource.get_maxweight())
### End of init.

### Start of termfreq methods.
    # Every document is returned, so all three term-frequency figures are
    # the database's document count.
    def get_termfreq_min(self): return self.db.get_doccount()
    def get_termfreq_est(self): return self.db.get_doccount()
    def get_termfreq_max(self): return self.db.get_doccount()
### End of termfreq methods.

### Start of get_weight.
    def get_weight(self):
        """Return the external weight of the document at the current position."""
        doc = self.db.get_document(self.current.docid)
        return self.wtsource.get_weight(doc)
### End of get_weight.

### Start of get_docid.
    def get_docid(self):
        """Return the document id at the current position."""
        return self.current.docid
### End of get_docid.

### Start of at_end.
    def at_end(self):
        """Return True once iteration has run past the last document."""
        return self.current is None
### End of at_end.

### Start of next.
    def next(self, minweight):
        """Advance to the next document; `minweight` is not used."""
        try:
            # Use the builtin next(): Python 3 iterators implement
            # __next__ and are not guaranteed to have a .next() method.
            self.current = next(self.alldocs)
        except StopIteration:
            self.current = None
### End of next.

### Start of skip_to.
    def skip_to(self, docid, minweight):
        """Move to `docid` using the posting iterator's skip_to();
        `minweight` is not used."""
        try:
            self.current = self.alldocs.skip_to(docid)
        except StopIteration:
            self.current = None
58 | ### End of skip_to.
59 |
--------------------------------------------------------------------------------
/code/python3/search1.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | import sys
5 | import xapian
6 | import support
7 |
8 | ### Start of example code.
def search(dbpath, querystring, offset=0, pagesize=10):
    """Run `querystring` against the database at `dbpath` and print one page
    of results.

    offset   -- starting point within the result set
    pagesize -- number of records to retrieve
    """
    # The database is only read from here.
    database = xapian.Database(dbpath)

    # Query parser with English stemming and the field prefixes used at
    # indexing time.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    # Start of prefix configuration.
    parser.add_prefix("title", "S")
    parser.add_prefix("description", "XD")
    # End of prefix configuration.

    # Turn the user's text into a Query object.
    parsed_query = parser.parse_query(querystring)

    # The Enquire object executes the query on the database.
    enquire = xapian.Enquire(database)
    enquire.set_query(parsed_query)

    # Print a line per hit and remember which documents were shown.
    shown_docids = []
    for hit in enquire.get_mset(offset, pagesize):
        record = json.loads(hit.document.get_data().decode('utf8'))
        summary = {
            'rank': hit.rank + 1,
            'docid': hit.docid,
            'title': record.get('TITLE', u''),
        }
        print(u"%(rank)i: #%(docid)3.3i %(title)s" % summary)
        shown_docids.append(hit.docid)

    # Record the query together with the results that were displayed.
    support.log_matches(querystring, offset, pagesize, shown_docids)
45 | ### End of example code.
46 |
# Command-line entry point: DBPATH followed by at least one query term;
# the terms are joined with spaces to form the query string.
if len(sys.argv) >= 3:
    search(dbpath=sys.argv[1], querystring=" ".join(sys.argv[2:]))
else:
    print("Usage: %s DBPATH QUERYTERM..." % sys.argv[0])
    sys.exit(1)
52 |
--------------------------------------------------------------------------------
/code/python3/search_facets.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | import sys
5 | import xapian
6 | import support
7 |
def search(dbpath, querystring, offset=0, pagesize=10):
    """Run `querystring` against the database at `dbpath`, print one page of
    results and then the facet counts gathered from value slot 1.

    offset   -- starting point within the result set
    pagesize -- number of records to retrieve
    """
    # The database is only read from here.
    database = xapian.Database(dbpath)

    # Query parser with English stemming and the field prefixes used at
    # indexing time.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    parser.add_prefix("title", "S")
    parser.add_prefix("description", "XD")

    # Turn the user's text into a Query object.
    parsed_query = parser.parse_query(querystring)

    # The Enquire object executes the query on the database.
    enquire = xapian.Enquire(database)
    enquire.set_query(parsed_query)

    # Document ids of the hits that get printed.
    shown_docids = []

### Start of example code.
    # Attach a spy that tallies the MAKER values stored in slot 1.
    maker_spy = xapian.ValueCountMatchSpy(1)
    enquire.add_matchspy(maker_spy)

    # The extra argument (100) is passed through to get_mset() alongside
    # the page bounds.
    for hit in enquire.get_mset(offset, pagesize, 100):
        record = json.loads(hit.document.get_data().decode('utf8'))
        summary = {
            'rank': hit.rank + 1,
            'docid': hit.docid,
            'title': record.get('TITLE', u''),
        }
        print(u"%(rank)i: #%(docid)3.3i %(title)s" % summary)
        shown_docids.append(hit.docid)

    # Report each value the spy saw together with its frequency.
    for facet in maker_spy.values():
        facet_info = {
            'term' : facet.term.decode('utf-8'),
            'count' : facet.termfreq
        }
        print("Facet: %(term)s; count: %(count)i" % facet_info)

    # Record the query together with the results that were displayed.
    support.log_matches(querystring, offset, pagesize, shown_docids)
55 | ### End of example code.
56 |
# Command-line entry point: DBPATH followed by at least one query term;
# the terms are joined with spaces to form the query string.
if len(sys.argv) >= 3:
    search(dbpath=sys.argv[1], querystring=" ".join(sys.argv[2:]))
else:
    print("Usage: %s DBPATH QUERYTERM..." % sys.argv[0])
    sys.exit(1)
62 |
--------------------------------------------------------------------------------
/code/python3/search_filters.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | import sys
5 | import xapian
6 | import support
7 |
def search(dbpath, querystring, materials, offset=0, pagesize=10):
    """Run `querystring` against the database at `dbpath`, optionally
    restricted to documents matching at least one of `materials`, and print
    one page of results.

    materials -- material names; when empty no filtering is applied
    offset    -- starting point within the result set
    pagesize  -- number of records to retrieve
    """
    # The database is only read from here.
    database = xapian.Database(dbpath)

### Start of example code.
    # Query parser with English stemming and the field prefixes used at
    # indexing time.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    parser.add_prefix("title", "S")
    parser.add_prefix("description", "XD")

    # Turn the user's text into a Query object.
    query = parser.parse_query(querystring)

    if len(materials) > 0:
        # Keep only results carrying at least one requested material.

        # One term query per material, using the lower-cased 'XM' term.
        per_material = []
        for name in materials:
            per_material.append(xapian.Query('XM' + name.lower()))

        # Any one of the materials is enough, hence OR.
        any_material = xapian.Query(xapian.Query.OP_OR, per_material)

        # Apply the material query as a filter on the parsed query.
        query = xapian.Query(xapian.Query.OP_FILTER, query, any_material)
### End of example code.

    # The Enquire object executes the query on the database.
    enquire = xapian.Enquire(database)
    enquire.set_query(query)

    # Print a line per hit and remember which documents were shown.
    shown_docids = []
    for hit in enquire.get_mset(offset, pagesize):
        record = json.loads(hit.document.get_data().decode('utf8'))
        summary = {
            'rank': hit.rank + 1,
            'docid': hit.docid,
            'title': record.get('TITLE', u''),
        }
        print(u"%(rank)i: #%(docid)3.3i %(title)s" % summary)
        shown_docids.append(hit.docid)

    # Record the query together with the results that were displayed.
    support.log_matches(querystring, offset, pagesize, shown_docids)
60 |
# Command-line entry point: DBPATH and QUERY are mandatory; any further
# arguments are treated as material names to filter by.
if len(sys.argv) >= 3:
    search(dbpath=sys.argv[1], querystring=sys.argv[2],
           materials=sys.argv[3:])
else:
    print("Usage: %s DBPATH QUERY [MATERIALS...]" % sys.argv[0])
    sys.exit(1)
67 |
--------------------------------------------------------------------------------
/code/python3/search_filters2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | import sys
5 | import xapian
6 | import support
7 |
def search(dbpath, querystring, offset=0, pagesize=10):
    """Run `querystring` against the database at `dbpath` and print one page
    of results. The query may use a boolean `material:` prefix.

    offset   -- starting point within the result set
    pagesize -- number of records to retrieve
    """
    # The database is only read from here.
    database = xapian.Database(dbpath)

### Start of example code.
    # Query parser with English stemming, the two text prefixes and a
    # boolean prefix mapping "material" to 'XM' terms.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    parser.add_prefix("title", "S")
    parser.add_prefix("description", "XD")
    parser.add_boolean_prefix("material", "XM")

    # Turn the user's text into a Query object.
    parsed_query = parser.parse_query(querystring)
### End of example code.

    # The Enquire object executes the query on the database.
    enquire = xapian.Enquire(database)
    enquire.set_query(parsed_query)

    # Print a line per hit and remember which documents were shown.
    shown_docids = []
    for hit in enquire.get_mset(offset, pagesize):
        record = json.loads(hit.document.get_data().decode('utf8'))
        summary = {
            'rank': hit.rank + 1,
            'docid': hit.docid,
            'title': record.get('TITLE', u''),
        }
        print(u"%(rank)i: #%(docid)3.3i %(title)s" % summary)
        shown_docids.append(hit.docid)
    # Record the query together with the results that were displayed.
    support.log_matches(querystring, offset, pagesize, shown_docids)
44 |
# Command-line entry point: DBPATH followed by at least one query term;
# the terms are joined with spaces to form the query string.
if len(sys.argv) >= 3:
    search(dbpath=sys.argv[1], querystring=" ".join(sys.argv[2:]))
else:
    print("Usage: %s DBPATH QUERYTERM..." % sys.argv[0])
    sys.exit(1)
50 |
--------------------------------------------------------------------------------
/code/python3/search_ranges.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | import sys
5 | import xapian
6 | import support
7 |
def search(dbpath, querystring, offset=0, pagesize=10):
    """Run `querystring` (which may contain numeric ranges) against the
    database at `dbpath` and print one page of results.

    offset   -- starting point within the result set
    pagesize -- number of records to retrieve
    """
    # The database is only read from here.
    database = xapian.Database(dbpath)

    # Query parser with English stemming and the field prefixes used at
    # indexing time.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    parser.add_prefix("title", "S")
    parser.add_prefix("description", "XD")
    # Range processors: slot 0 for numbers carrying an 'mm' suffix, slot 1
    # for plain numbers. They are registered in this order.
    size_ranges = xapian.NumberRangeProcessor(0, 'mm', xapian.RP_SUFFIX)
    year_ranges = xapian.NumberRangeProcessor(1)
    parser.add_rangeprocessor(size_ranges)
    parser.add_rangeprocessor(year_ranges)

    # Turn the user's text into a Query object.
    parsed_query = parser.parse_query(querystring)

    # The Enquire object executes the query on the database.
    enquire = xapian.Enquire(database)
    enquire.set_query(parsed_query)

    # Print two lines per hit and remember which documents were shown.
    shown_docids = []
    for hit in enquire.get_mset(offset, pagesize):
        record = json.loads(hit.document.get_data().decode('utf8'))
        summary = {
            'rank': hit.rank + 1,
            'docid': hit.docid,
            'measurements': record.get('MEASUREMENTS', u''),
            'date': record.get('DATE_MADE', u''),
            'title': record.get('TITLE', u''),
        }
        print(u"%(rank)i: #%(docid)3.3i (%(date)s) %(measurements)s\n %(title)s" % summary)
        shown_docids.append(hit.docid)

    # Record the query together with the results that were displayed.
    support.log_matches(querystring, offset, pagesize, shown_docids)
51 |
# Command-line entry point: DBPATH followed by at least one query term;
# the terms are joined with spaces to form the query string.
if len(sys.argv) >= 3:
    search(dbpath=sys.argv[1], querystring=" ".join(sys.argv[2:]))
else:
    print("Usage: %s DBPATH QUERYTERM..." % sys.argv[0])
    sys.exit(1)
57 |
--------------------------------------------------------------------------------
/code/python3/search_sorting.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | import sys
5 | import xapian
6 | import support
7 |
def search(dbpath, querystring, offset=0, pagesize=10):
    """Run `querystring` against the database at `dbpath`, ordering hits by
    value slot 1 and then by relevance, and print one page of results.

    offset   -- starting point within the result set
    pagesize -- number of records to retrieve
    """
    # The database is only read from here.
    database = xapian.Database(dbpath)

    # Query parser with English stemming and the field prefixes used at
    # indexing time.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    parser.add_prefix("title", "S")
    parser.add_prefix("description", "XD")

    # Turn the user's text into a Query object.
    parsed_query = parser.parse_query(querystring)

    # The Enquire object executes the query on the database.
    enquire = xapian.Enquire(database)
    enquire.set_query(parsed_query)
    # Start of example code.
    enquire.set_sort_by_value_then_relevance(1, False)
    # End of example code.

    # Print two lines per hit and remember which documents were shown.
    shown_docids = []
    for hit in enquire.get_mset(offset, pagesize):
        record = json.loads(hit.document.get_data().decode('utf8'))
        # 'lat' and 'lon' are supplied although the format string does
        # not reference them.
        summary = {
            'rank': hit.rank + 1,
            'docid': hit.docid,
            'name': record.get('name', u''),
            'date': support.format_date(record.get('admitted', u'')),
            'pop': support.format_numeral(int(record.get('population', 0))),
            'lat': record.get('latitude', u''),
            'lon': record.get('longitude', u''),
        }
        print(u"%(rank)i: #%(docid)3.3i %(name)s %(date)s\n Population %(pop)s" % summary)
        shown_docids.append(hit.docid)

    # Record the query together with the results that were displayed.
    support.log_matches(querystring, offset, pagesize, shown_docids)
49 |
# Command-line entry point: DBPATH followed by at least one query term;
# the terms are joined with spaces to form the query string.
if len(sys.argv) >= 3:
    search(dbpath=sys.argv[1], querystring=" ".join(sys.argv[2:]))
else:
    print("Usage: %s DBPATH QUERYTERM..." % sys.argv[0])
    sys.exit(1)
55 |
--------------------------------------------------------------------------------
/code/python3/search_sorting2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | import sys
5 | import xapian
6 | import support
7 |
def search(dbpath, querystring, offset=0, pagesize=10):
    """Run `querystring` against the database at `dbpath`, ordering hits by
    a key built from value slots 1 and 3 and then by relevance, and print
    one page of results.

    offset   -- starting point within the result set
    pagesize -- number of records to retrieve
    """
    # The database is only read from here.
    database = xapian.Database(dbpath)

    # Query parser with English stemming and the field prefixes used at
    # indexing time.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    parser.add_prefix("title", "S")
    parser.add_prefix("description", "XD")

    # Turn the user's text into a Query object.
    parsed_query = parser.parse_query(querystring)

    # The Enquire object executes the query on the database.
    enquire = xapian.Enquire(database)
    enquire.set_query(parsed_query)
    # Start of example code.
    sort_key = xapian.MultiValueKeyMaker()
    sort_key.add_value(1, False)
    sort_key.add_value(3, True)
    enquire.set_sort_by_key_then_relevance(sort_key, False)
    # End of example code.

    # Print two lines per hit and remember which documents were shown.
    shown_docids = []
    for hit in enquire.get_mset(offset, pagesize):
        record = json.loads(hit.document.get_data().decode('utf8'))
        # 'lat' and 'lon' are supplied although the format string does
        # not reference them.
        summary = {
            'rank': hit.rank + 1,
            'docid': hit.docid,
            'name': record.get('name', u''),
            'date': support.format_date(record.get('admitted', u'')),
            'pop': support.format_numeral(int(record.get('population', 0))),
            'lat': record.get('latitude', u''),
            'lon': record.get('longitude', u''),
        }
        print(u"%(rank)i: #%(docid)3.3i %(name)s %(date)s\n Population %(pop)s" % summary)
        shown_docids.append(hit.docid)

    # Record the query together with the results that were displayed.
    support.log_matches(querystring, offset, pagesize, shown_docids)
52 |
# Command-line entry point: DBPATH followed by at least one query term;
# the terms are joined with spaces to form the query string.
if len(sys.argv) >= 3:
    search(dbpath=sys.argv[1], querystring=" ".join(sys.argv[2:]))
else:
    print("Usage: %s DBPATH QUERYTERM..." % sys.argv[0])
    sys.exit(1)
58 |
--------------------------------------------------------------------------------
/code/python3/search_synonyms.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import json
4 | import sys
5 | import xapian
6 | import support
7 |
8 | ### Start of example code.
def search(dbpath, querystring, offset=0, pagesize=10):
    """Register a synonym in the database at `dbpath`, then run
    `querystring` with synonym expansion enabled and print one page of
    results.

    offset   -- starting point within the result set
    pagesize -- number of records to retrieve
    """
    # Opened writable because a synonym is added before searching.
    database = xapian.WritableDatabase(dbpath)

    # Start of adding synonyms
    database.add_synonym("time", "calendar")
    # End of adding synonyms

    # Query parser with English stemming and the field prefixes used at
    # indexing time.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    parser.add_prefix("title", "S")
    parser.add_prefix("description", "XD")

    # Start of set database
    parser.set_database(database)
    # End of set database

    # Parse the user's text with the synonym flag switched on.
    parsed_query = parser.parse_query(querystring, parser.FLAG_SYNONYM)

    # The Enquire object executes the query on the database.
    enquire = xapian.Enquire(database)
    enquire.set_query(parsed_query)

    # Print a line per hit and remember which documents were shown.
    shown_docids = []
    for hit in enquire.get_mset(offset, pagesize):
        record = json.loads(hit.document.get_data().decode('utf8'))
        summary = {
            'rank': hit.rank + 1,
            'docid': hit.docid,
            'title': record.get('TITLE', u''),
        }
        print(u"%(rank)i: #%(docid)3.3i %(title)s" % summary)
        shown_docids.append(hit.docid)

    # Record the query together with the results that were displayed.
    support.log_matches(querystring, offset, pagesize, shown_docids)
51 | ### End of example code.
52 |
# Command-line entry point: DBPATH followed by at least one query term;
# the terms are joined with spaces to form the query string.
if len(sys.argv) >= 3:
    search(dbpath=sys.argv[1], querystring=" ".join(sys.argv[2:]))
else:
    print("Usage: %s DBPATH QUERYTERM..." % sys.argv[0])
    sys.exit(1)
58 |
--------------------------------------------------------------------------------
/code/ruby/delete1.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 |
4 | require 'xapian'
5 |
6 | ### Start of example code.
# Deletes, from the existing database at +dbpath+, every document whose
# unique ID term ("Q" followed by the identifier) matches one of
# +identifiers+.
def delete_docs(dbpath, identifiers)
  # DB_OPEN: the database is opened for writing, not created.
  database = Xapian::WritableDatabase.new(dbpath, Xapian::DB_OPEN)
  identifiers.each { |id| database.delete_document("Q#{id}") }
end
14 | ### End of example code.
15 |
# Command-line entry point: DBPATH plus at least one document ID.
if ARGV.length >= 2
  delete_docs(ARGV.first, ARGV.drop(1))
else
  abort "Usage #{__FILE__} DBPATH ID..."
end
19 |
--------------------------------------------------------------------------------
/code/ruby/index1.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 |
4 | require 'xapian'
5 | require 'json'
6 | require_relative 'support'
7 |
8 | ### Start of example code.
# Indexes every row of the CSV file at +data_path+ into the Xapian database
# at +db_path+ (created when missing). TITLE and DESCRIPTION are indexed as
# text with and without prefixes, and the whole row is stored as JSON.
def index_csv(data_path, db_path)
  database = Xapian::WritableDatabase.new(db_path, Xapian::DB_CREATE_OR_OPEN)

  # One English-stemming term generator is shared by all documents.
  indexer = Xapian::TermGenerator.new
  indexer.stemmer = Xapian::Stem.new('en')

  parse_csv_file(data_path).each do |record|
    title = record['TITLE'].to_s
    description = record['DESCRIPTION'].to_s

    document = Xapian::Document.new
    indexer.document = document

    # Prefixed terms for field-restricted searches.
    indexer.index_text(title, 1, 'S')
    indexer.index_text(description, 1, 'XD')

    # Unprefixed terms for general searches; bump the term position
    # between the two fields.
    indexer.index_text(title)
    indexer.increase_termpos
    indexer.index_text(description)

    # Keep the full row for display purposes.
    document.data = record.to_h.to_json

    # Replacing by the unique "Q" term keeps re-runs from duplicating rows.
    unique_term = "Q#{record['id_NUMBER']}"
    document.add_boolean_term(unique_term)
    database.replace_document(unique_term, document)
  end
end
27 | ### End of example code.
28 |
# Command-line entry point: DATAPATH and DBPATH are required.
if ARGV.length >= 2
  index_csv(ARGV.first, ARGV[1])
else
  abort "Usage #{__FILE__} DATAPATH DBPATH"
end
32 |
--------------------------------------------------------------------------------
/code/ruby/index_facets.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 |
4 | require 'xapian'
5 | require 'json'
6 | require_relative 'support'
7 |
8 | ### Start of example code.
# Indexes every row of the CSV file at +data_path+ into the Xapian database
# at +db_path+ (created when missing). TITLE and DESCRIPTION are indexed as
# text, COLLECTION and MAKER go to value slots 0 and 1, and the whole row is
# stored as JSON.
def index(data_path, db_path)
  database = Xapian::WritableDatabase.new(db_path, Xapian::DB_CREATE_OR_OPEN)

  # One English-stemming term generator is shared by all documents.
  indexer = Xapian::TermGenerator.new
  indexer.stemmer = Xapian::Stem.new('en')

  parse_csv_file(data_path).each do |record|
    # Missing columns come back as nil, which to_s turns into ''.
    title, description, identifier, collection, maker =
      %w[TITLE DESCRIPTION id_NUMBER COLLECTION MAKER].map { |column| record[column].to_s }

    # Fresh document, wired up as the generator's output.
    document = Xapian::Document.new
    indexer.document = document

    # Prefixed terms for field-restricted searches.
    indexer.index_text(title, 1, 'S')
    indexer.index_text(description, 1, 'XD')

    # Unprefixed terms for general searches; bump the term position
    # between the two fields.
    indexer.index_text(title)
    indexer.increase_termpos
    indexer.index_text(description)

    # Value slot 0 holds the collection, slot 1 the maker.
    document.add_value(0, collection)
    document.add_value(1, maker)

    # Keep the full row for display purposes.
    document.data = record.to_h.to_json

    # Replacing by the unique "Q" term keeps re-runs from duplicating rows.
    unique_term = "Q#{identifier}"
    document.add_boolean_term(unique_term)
    database.replace_document(unique_term, document)
  end
end
52 | ### End of example code.
53 |
54 | abort "Usage #{__FILE__} DATAPATH DBPATH" if ARGV.length < 2
55 |
56 | index(ARGV[0], ARGV[1])
57 |
--------------------------------------------------------------------------------
/code/ruby/index_filters.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 |
4 | require 'xapian'
5 | require 'json'
6 | require_relative 'support'
7 |
8 | ### Start of example code.
# Index every row of the CSV file at +data_path+ into the Xapian database
# at +db_path+, adding one 'XM'-prefixed boolean term per material.
def index(data_path, db_path)
  writable_db = Xapian::WritableDatabase.new(db_path, Xapian::DB_CREATE_OR_OPEN)
  indexer = Xapian::TermGenerator.new
  indexer.stemmer = Xapian::Stem.new('en')

  parse_csv_file(data_path).each do |record|
    title = record['TITLE'].to_s
    description = record['DESCRIPTION'].to_s

    document = Xapian::Document.new
    indexer.document = document
    indexer.index_text(title, 1, 'S')
    indexer.index_text(description, 1, 'XD')
    indexer.index_text(title)
    indexer.increase_termpos
    indexer.index_text(description)

    ### Start of new indexing code.
    # Index the MATERIALS field, splitting on semicolons; each entry is
    # trimmed and lower-cased, and blank entries are skipped.
    record['MATERIALS'].to_s.split(';')
                       .map { |material| material.strip.downcase }
                       .reject(&:empty?)
                       .each { |material| document.add_boolean_term("XM#{material}") }
    ### End of new indexing code.

    document.data = record.to_h.to_json
    unique_term = "Q#{record['id_NUMBER']}"
    document.add_boolean_term(unique_term)
    writable_db.replace_document(unique_term, document)
  end
end
### End of example code.

abort "Usage #{__FILE__} DATAPATH DBPATH" unless ARGV.length >= 2

index(*ARGV.first(2))
42 |
--------------------------------------------------------------------------------
/code/ruby/index_ranges.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 |
4 | require 'xapian'
5 | require 'json'
6 | require_relative 'support'
7 |
# Index every row of the CSV file at +data_path+ into the Xapian database
# at +db_path+, storing two sortable numeric values per document:
#
# slot 0 - the largest number found in the MEASUREMENTS column
# slot 1 - the first number found in the DATE_MADE column
#
# A slot is left unset when its column yields no numbers.
def index(data_path, db_path)
  db = Xapian::WritableDatabase.new(db_path, Xapian::DB_CREATE_OR_OPEN)

  # Set up a TermGenerator that we'll use in indexing.
  term_generator = Xapian::TermGenerator.new
  term_generator.stemmer = Xapian::Stem.new('en')

  parse_csv_file(data_path).each do |row|
    doc = Xapian::Document.new
    term_generator.document = doc

    # Index each field with a suitable prefix.
    term_generator.index_text(row['TITLE'].to_s, 1, 'S')
    term_generator.index_text(row['DESCRIPTION'].to_s, 1, 'XD')

    # Index fields without prefixes for general search.
    term_generator.index_text(row['TITLE'].to_s)
    term_generator.increase_termpos
    term_generator.index_text(row['DESCRIPTION'].to_s)

    doc.data = row.to_h.to_json

    ### Start of example code.

    # parse the two values we need
    # An empty MEASUREMENTS column simply yields no numbers.
    numbers = numbers_from_string(row['MEASUREMENTS'].to_s)
    doc.add_value(0, Xapian.sortable_serialise(numbers.max)) unless numbers.empty?

    # The date is handled independently of the measurements, so a row with
    # a DATE_MADE but no MEASUREMENTS still gets its year value.
    years = numbers_from_string(row['DATE_MADE'].to_s)
    doc.add_value(1, Xapian.sortable_serialise(years[0])) unless years.empty?
    ### End of example code.

    # We use the identifier to ensure each object ends up in the
    # database only once no matter how many times we run the indexer.

    idterm = "Q#{row['id_NUMBER']}"
    doc.add_boolean_term(idterm)
    db.replace_document(idterm, doc)
  end
end

abort "Usage #{__FILE__} DATAPATH DBPATH" if ARGV.length < 2

index(ARGV[0], ARGV[1])
56 |
--------------------------------------------------------------------------------
/code/ruby/index_ranges2.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 |
4 | require 'xapian'
5 | require 'json'
6 | require_relative 'support'
7 |
# Index the rows returned by parse_states(data_path) into the Xapian
# database at +db_path+, storing admission year, admission date and
# population as document values.
def index(data_path, db_path)
  # Create or open the database we're going to be writing to.
  writable_db = Xapian::WritableDatabase.new(db_path, Xapian::DB_CREATE_OR_OPEN)

  # A single English-stemming TermGenerator is reused for every document.
  indexer = Xapian::TermGenerator.new
  indexer.stemmer = Xapian::Stem.new('en')

  parse_states(data_path).each do |state|
    # Point the term generator at a fresh document for this row.
    document = Xapian::Document.new
    indexer.document = document

    ### Start of example code.
    name = state['name'].to_s
    description = state['description'].to_s
    motto = state['motto'].to_s

    # Index each field with a suitable prefix.
    indexer.index_text(name, 1, 'S')
    indexer.index_text(description, 1, 'XD')
    indexer.index_text(motto, 1, 'XM')

    # Index fields without prefixes for general search.
    indexer.index_text(name)
    indexer.increase_termpos
    indexer.index_text(description)
    indexer.increase_termpos
    indexer.index_text(motto)

    # Add document values.
    admitted = state['admitted'].to_s
    unless admitted.empty?
      # Slot 1: the first four characters of the date, as a sortable number.
      document.add_value(1, Xapian.sortable_serialise(admitted[0, 4].to_i))
      # Slot 2: the date string as given (YYYYMMDD).
      document.add_value(2, admitted)
    end

    population = state['population']
    document.add_value(3, Xapian.sortable_serialise(population.to_i)) if population
    ### End of example code.

    document.data = state.to_h.to_json

    # Replacing by the 'Q'-prefixed order term keeps one document per row
    # no matter how often the indexer is run.
    unique_term = "Q#{state['order']}"
    document.add_boolean_term(unique_term)
    writable_db.replace_document(unique_term, document)
  end
end

abort "Usage #{__FILE__} DATAPATH DBPATH" unless ARGV.length >= 2

index(*ARGV.first(2))
58 |
--------------------------------------------------------------------------------
/code/ruby/index_values_with_geo.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 |
4 | require 'xapian'
5 | require 'json'
6 | require_relative 'support'
7 |
# Index the rows returned by parse_states(data_path) into the Xapian
# database at +db_path+, additionally storing "lat,lon" in value slot 4.
def index(data_path, db_path)
  # Create or open the database we're going to be writing to.
  writable_db = Xapian::WritableDatabase.new(db_path, Xapian::DB_CREATE_OR_OPEN)

  # A single English-stemming TermGenerator is reused for every document.
  indexer = Xapian::TermGenerator.new
  indexer.stemmer = Xapian::Stem.new('en')

  parse_states(data_path).each do |state|
    # Point the term generator at a fresh document for this row.
    document = Xapian::Document.new
    indexer.document = document

    name = state['name'].to_s
    description = state['description'].to_s
    motto = state['motto'].to_s

    # Index each field with a suitable prefix.
    indexer.index_text(name, 1, 'S')
    indexer.index_text(description, 1, 'XD')
    indexer.index_text(motto, 1, 'XM')

    # Index fields without prefixes for general search.
    indexer.index_text(name)
    indexer.increase_termpos
    indexer.index_text(description)
    indexer.increase_termpos
    indexer.index_text(motto)

    # Add document values.
    admitted = state['admitted'].to_s
    unless admitted.empty?
      # Slot 1: the first four characters of the date, as a sortable number.
      document.add_value(1, Xapian.sortable_serialise(admitted[0, 4].to_i))
      # Slot 2: the date string as given (YYYYMMDD).
      document.add_value(2, admitted)
    end

    population = state['population']
    document.add_value(3, Xapian.sortable_serialise(population.to_i)) if population

    ### Start of example code.
    # Slot 4: "<latitude>,<longitude>", only when both columns are present.
    midlat = state['midlat']
    midlon = state['midlon']
    document.add_value(4, [midlat.to_f, midlon.to_f].join(',')) if midlat && midlon
    ### End of example code.

    document.data = state.to_h.to_json

    # Replacing by the 'Q'-prefixed order term keeps one document per row
    # no matter how often the indexer is run.
    unique_term = "Q#{state['order']}"
    document.add_boolean_term(unique_term)
    writable_db.replace_document(unique_term, document)
  end
end

abort "Usage #{__FILE__} DATAPATH DBPATH" unless ARGV.length >= 2

index(*ARGV.first(2))
60 |
--------------------------------------------------------------------------------
/code/ruby/search1.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 |
4 | require 'xapian'
5 | require 'json'
6 | require_relative 'support'
7 |
8 | ### Start of example code.
# Run +querystring+ against the database at +dbpath+, print one line per
# match on the requested page, then log the matching document ids.
#
# dbpath      - path of the Xapian database to search
# querystring - query text; supports the 'title:' and 'description:' prefixes
# offset      - defines starting point within result set
# pagesize    - defines number of records to retrieve
def search(dbpath, querystring, offset: 0, pagesize: 10)
  # Open the database we're going to search.
  db = Xapian::Database.new(dbpath)

  # Set up a QueryParser with a stemmer and suitable prefixes
  queryparser = Xapian::QueryParser.new
  queryparser.stemmer = Xapian::Stem.new('en')
  queryparser.stemming_strategy = Xapian::QueryParser::STEM_SOME

  # Start of prefix configuration.
  queryparser.add_prefix('title', 'S')
  queryparser.add_prefix('description', 'XD')
  # End of prefix configuration.

  # And parse the query
  query = queryparser.parse_query(querystring)

  # Use an Enquire object on the database to run the query
  enquire = Xapian::Enquire.new(db)
  enquire.query = query

  # And print out something about each match
  matches = []
  enquire.mset(offset, pagesize).matches.each do |match|
    fields = JSON.parse(match.document.data)
    # The values are passed as one keyword hash, so the format string must
    # use named %<key> references; positional %i/%s would raise.
    printf "%<rank>i: #%<docid>3.3i %<title>s\n",
           rank: match.rank + 1,
           docid: match.docid,
           title: fields['TITLE']
    matches << match.docid
  end

  # Finally, make sure we log the query and displayed results
  log_matches(querystring, offset, pagesize, matches)
end
### End of example code.

abort "Usage #{__FILE__} DBPATH QUERY..." if ARGV.length < 2

search(ARGV[0], ARGV[1..].join(' '))
47 |
--------------------------------------------------------------------------------
/code/ruby/search_facets.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 |
4 | require 'xapian'
5 | require 'json'
6 | require_relative 'support'
7 |
# Run +querystring+ against the database at +dbpath+, print the matches on
# the requested page followed by the facet counts gathered from value
# slot 1, then log the matching document ids.
#
# dbpath      - path of the Xapian database to search
# querystring - query text; supports the 'title:' and 'description:' prefixes
# offset      - defines starting point within result set
# pagesize    - defines number of records to retrieve
def search(dbpath, querystring, offset: 0, pagesize: 10)
  # Open the database we're going to search.
  db = Xapian::Database.new(dbpath)

  # Set up a QueryParser with a stemmer and suitable prefixes
  queryparser = Xapian::QueryParser.new
  queryparser.stemmer = Xapian::Stem.new('en')
  queryparser.stemming_strategy = Xapian::QueryParser::STEM_SOME
  queryparser.add_prefix('title', 'S')
  queryparser.add_prefix('description', 'XD')

  # And parse the query
  query = queryparser.parse_query(querystring)

  # Use an Enquire object on the database to run the query
  enquire = Xapian::Enquire.new(db)
  enquire.query = query

  # And print out something about each match
  matches = []

  ### Start of example code.
  # Set up a spy to inspect the MAKER value at slot 1
  spy = Xapian::ValueCountMatchSpy.new(1)
  enquire.add_matchspy(spy)

  # NOTE(review): the third argument (100) is presumably Xapian's
  # "check at least" count, so the spy sees more than one page - confirm.
  enquire.mset(offset, pagesize, 100).matches.each do |match|
    fields = JSON.parse(match.document.data)
    # The values are passed as one keyword hash, so the format string must
    # use named %<key> references; positional %i/%s would raise.
    printf "%<rank>i: #%<docid>3.3i %<title>s\n",
           rank: match.rank + 1,
           docid: match.docid,
           title: fields['TITLE']
    matches << match.docid
  end
  spy.values.each do |facet|
    printf "Facet: %<term>s; count: %<count>i\n",
           term: facet.term,
           count: facet.termfreq
  end

  # Finally, make sure we log the query and displayed results
  log_matches(querystring, offset, pagesize, matches)
  ### End of example code.
end

abort "Usage #{__FILE__} DBPATH QUERY..." if ARGV.length < 2

search(ARGV[0], ARGV[1..].join(' '))
59 |
--------------------------------------------------------------------------------
/code/ruby/search_filters.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 |
4 | require 'xapian'
5 | require 'json'
6 | require_relative 'support'
7 |
# Run +querystring+ against the database at +dbpath+, optionally keeping
# only documents that carry at least one of the given materials, print the
# matches on the requested page, then log the matching document ids.
#
# dbpath      - path of the Xapian database to search
# querystring - query text; supports the 'title:' and 'description:' prefixes
# materials   - array of material names; an empty array disables filtering
# offset      - defines starting point within result set
# pagesize    - defines number of records to retrieve
def search(dbpath, querystring, materials, offset: 0, pagesize: 10)
  # Open the database we're going to search.
  db = Xapian::Database.new(dbpath)

  ### Start of example code.
  # Set up a QueryParser with a stemmer and suitable prefixes
  queryparser = Xapian::QueryParser.new
  queryparser.stemmer = Xapian::Stem.new('en')
  queryparser.stemming_strategy = Xapian::QueryParser::STEM_SOME
  queryparser.add_prefix('title', 'S')
  queryparser.add_prefix('description', 'XD')

  # And parse the query
  query = queryparser.parse_query(querystring)

  if materials.length.positive?
    # Filter the results to ones which contain at least one of the
    # materials.

    # Build the 'XM'-prefixed, lower-cased term for each material value
    material_queries = materials.map { |material| "XM#{material.downcase}" }

    # Combine these terms with an OR operator
    material_query = Xapian::Query.new(Xapian::Query::OP_OR, material_queries)

    # Use the material query to filter the main query
    query = Xapian::Query.new(Xapian::Query::OP_FILTER, query, material_query)
  end
  ### End of example code.

  # Use an Enquire object on the database to run the query
  enquire = Xapian::Enquire.new(db)
  enquire.query = query

  # And print out something about each match
  matches = []
  enquire.mset(offset, pagesize).matches.each do |match|
    fields = JSON.parse(match.document.data)
    # The values are passed as one keyword hash, so the format string must
    # use named %<key> references; positional %i/%s would raise.
    printf "%<rank>i: #%<docid>3.3i %<title>s\n",
           rank: match.rank + 1,
           docid: match.docid,
           title: fields['TITLE']
    matches << match.docid
  end

  # Finally, make sure we log the query and displayed results
  log_matches(querystring, offset, pagesize, matches)
end
### End of example code.

# The query is a single argument here; everything after it is a material.
abort "Usage #{__FILE__} DBPATH QUERY [MATERIAL...]" if ARGV.length < 2

search(ARGV[0], ARGV[1], ARGV[2..])
62 |
--------------------------------------------------------------------------------
/code/ruby/search_filters2.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 |
4 | require 'xapian'
5 | require 'json'
6 | require_relative 'support'
7 |
# Run +querystring+ against the database at +dbpath+, print the matches on
# the requested page, then log the matching document ids. The query may use
# 'material:' as a boolean filter prefix.
#
# dbpath      - path of the Xapian database to search
# querystring - query text; supports 'title:', 'description:' and 'material:'
# offset      - defines starting point within result set
# pagesize    - defines number of records to retrieve
def search(dbpath, querystring, offset: 0, pagesize: 10)
  # Open the database we're going to search.
  db = Xapian::Database.new(dbpath)

  ### Start of example code.
  # Set up a QueryParser with a stemmer and suitable prefixes
  queryparser = Xapian::QueryParser.new
  queryparser.stemmer = Xapian::Stem.new('en')
  queryparser.stemming_strategy = Xapian::QueryParser::STEM_SOME
  queryparser.add_prefix('title', 'S')
  queryparser.add_prefix('description', 'XD')
  queryparser.add_boolean_prefix('material', 'XM')

  # And parse the query
  query = queryparser.parse_query(querystring)
  ### End of example code.

  # Use an Enquire object on the database to run the query
  enquire = Xapian::Enquire.new(db)
  enquire.query = query

  # And print out something about each match
  matches = []
  enquire.mset(offset, pagesize).matches.each do |match|
    fields = JSON.parse(match.document.data)
    # The values are passed as one keyword hash, so the format string must
    # use named %<key> references; positional %i/%s would raise.
    printf "%<rank>i: #%<docid>3.3i %<title>s\n",
           rank: match.rank + 1,
           docid: match.docid,
           title: fields['TITLE']
    matches << match.docid
  end

  # Finally, make sure we log the query and displayed results
  log_matches(querystring, offset, pagesize, matches)
end
### End of example code.

abort "Usage #{__FILE__} DBPATH QUERY..." if ARGV.length < 2

# Join every remaining argument so multi-word queries are not truncated
# to their first word.
search(ARGV[0], ARGV[1..].join(' '))
49 |
--------------------------------------------------------------------------------
/code/ruby/search_ranges.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 |
4 | require 'xapian'
5 | require 'json'
6 | require_relative 'support'
7 |
# Run +querystring+ against the database at +dbpath+ with numeric range
# support, print the matches on the requested page, then log the matching
# document ids.
#
# dbpath      - path of the Xapian database to search
# querystring - query text; supports the 'title:' and 'description:'
#               prefixes plus ranges such as '..50mm' or '1980..1989'
# offset      - defines starting point within result set
# pagesize    - defines number of records to retrieve
def search(dbpath, querystring, offset: 0, pagesize: 10)
  # Open the database we're going to search.
  db = Xapian::Database.new(dbpath)

  # Set up a QueryParser with a stemmer and suitable prefixes
  queryparser = Xapian::QueryParser.new
  queryparser.stemmer = Xapian::Stem.new('en')
  queryparser.stemming_strategy = Xapian::QueryParser::STEM_SOME
  queryparser.add_prefix('title', 'S')
  queryparser.add_prefix('description', 'XD')
  # and add in range processors: 'mm'-suffixed ranges use value slot 0,
  # plain numeric ranges use value slot 1
  queryparser.add_rangeprocessor(Xapian::NumberRangeProcessor.new(0, 'mm', Xapian::RP_SUFFIX))
  queryparser.add_rangeprocessor(Xapian::NumberRangeProcessor.new(1))

  # And parse the query
  query = queryparser.parse_query(querystring)

  # Use an Enquire object on the database to run the query
  enquire = Xapian::Enquire.new(db)
  enquire.query = query

  # And print out something about each match
  matches = []
  enquire.mset(offset, pagesize).matches.each do |match|
    fields = JSON.parse(match.document.data)
    # The values are passed as one keyword hash, so the format string must
    # use named %<key> references; positional %i/%s would raise.
    # NOTE(review): date-in-parentheses ordering is assumed - confirm
    # against the expected output files.
    printf "%<rank>i: #%<docid>3.3i (%<date>s) %<measurements>s\n %<title>s\n",
           rank: match.rank + 1,
           docid: match.docid,
           measurements: fields['MEASUREMENTS'],
           date: fields['DATE_MADE'],
           title: fields['TITLE']
    matches << match.docid
  end

  # Finally, make sure we log the query and displayed results
  log_matches(querystring, offset, pagesize, matches)
end

abort "Usage #{__FILE__} DBPATH QUERY..." if ARGV.length < 2

search(ARGV[0], ARGV[1..].join(' '))
50 |
--------------------------------------------------------------------------------
/code/ruby/search_sorting.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 |
4 | require 'xapian'
5 | require 'json'
6 | require_relative 'support'
7 |
# Run +querystring+ against the database at +dbpath+, ordering results by
# value slot 1 and then by relevance, print the matches on the requested
# page, then log the matching document ids.
#
# dbpath      - path of the Xapian database to search
# querystring - query text; supports the 'title:' and 'description:' prefixes
# offset      - defines starting point within result set
# pagesize    - defines number of records to retrieve
def search(dbpath, querystring, offset: 0, pagesize: 10)
  # Open the database we're going to search.
  db = Xapian::Database.new(dbpath)

  # Set up a QueryParser with a stemmer and suitable prefixes
  queryparser = Xapian::QueryParser.new
  queryparser.stemmer = Xapian::Stem.new('en')
  queryparser.stemming_strategy = Xapian::QueryParser::STEM_SOME
  queryparser.add_prefix('title', 'S')
  queryparser.add_prefix('description', 'XD')

  # And parse the query
  query = queryparser.parse_query(querystring)

  # Use an Enquire object on the database to run the query
  enquire = Xapian::Enquire.new(db)
  enquire.query = query

  # Start of example code.
  enquire.sort_by_value_then_relevance!(1, false)
  # End of example code.

  # And print out something about each match
  matches = []
  enquire.mset(offset, pagesize).matches.each do |match|
    fields = JSON.parse(match.document.data)
    # The values are passed as one keyword hash, so the format string must
    # use named %<key> references; positional %i/%s would raise.
    printf "%<rank>i: #%<docid>3.3i %<name>s %<date>s\n Population %<pop>s\n",
           rank: match.rank + 1,
           docid: match.docid,
           name: fields['name'],
           date: format_date(fields['admitted'].to_s),
           pop: format_numeral(fields['population'].to_i)

    matches << match.docid
  end
  # Finally, make sure we log the query and displayed results
  log_matches(querystring, offset, pagesize, matches)
end

abort "Usage #{__FILE__} DBPATH QUERY..." if ARGV.length < 2

search(ARGV[0], ARGV[1..].join(' '))
57 |
--------------------------------------------------------------------------------
/code/ruby/search_sorting2.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 |
4 | require 'xapian'
5 | require 'json'
6 | require_relative 'support'
7 |
# Run +querystring+ against the database at +dbpath+, ordering results by a
# key built from value slots 1 and 3 and then by relevance, print the
# matches on the requested page, then log the matching document ids.
#
# dbpath      - path of the Xapian database to search
# querystring - query text; supports the 'title:' and 'description:' prefixes
# offset      - defines starting point within result set
# pagesize    - defines number of records to retrieve
def search(dbpath, querystring, offset: 0, pagesize: 10)
  # Open the database we're going to search.
  db = Xapian::Database.new(dbpath)

  # Set up a QueryParser with a stemmer and suitable prefixes
  queryparser = Xapian::QueryParser.new
  queryparser.stemmer = Xapian::Stem.new('en')
  queryparser.stemming_strategy = Xapian::QueryParser::STEM_SOME
  queryparser.add_prefix('title', 'S')
  queryparser.add_prefix('description', 'XD')

  # And parse the query
  query = queryparser.parse_query(querystring)

  # Use an Enquire object on the database to run the query
  enquire = Xapian::Enquire.new(db)
  enquire.query = query

  # Start of example code.
  keymaker = Xapian::MultiValueKeyMaker.new
  keymaker.add_value(1, false)
  keymaker.add_value(3, true)
  enquire.set_sort_by_key_then_relevance(keymaker, false)
  # End of example code.

  # And print out something about each match
  matches = []
  enquire.mset(offset, pagesize).matches.each do |match|
    fields = JSON.parse(match.document.data)
    # The values are passed as one keyword hash, so the format string must
    # use named %<key> references; positional %i/%s would raise.
    printf "%<rank>i: #%<docid>3.3i %<name>s %<date>s\n Population %<pop>s\n",
           rank: match.rank + 1,
           docid: match.docid,
           name: fields['name'],
           date: format_date(fields['admitted'].to_s),
           pop: format_numeral(fields['population'].to_i)

    matches << match.docid
  end
  # Finally, make sure we log the query and displayed results
  log_matches(querystring, offset, pagesize, matches)
end

abort "Usage #{__FILE__} DBPATH QUERY..." if ARGV.length < 2

search(ARGV[0], ARGV[1..].join(' '))
60 |
--------------------------------------------------------------------------------
/code/ruby/search_sorting3.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 |
4 | require 'xapian'
5 | require 'json'
6 | require_relative 'support'
7 |
8 | # Start of example code.
# Sort key maker that orders documents by their distance from a fixed
# reference point (the "washington" coordinates below).
class DistanceKeyMaker < Xapian::KeyMaker
  # Builds the sort key for +doc+ from value slot 4, which is read as a
  # "latitude,longitude" string, and returns the serialised distance.
  def __call__(doc)
    latitude, longitude = doc.value(4).split(',').map(&:to_f)
    washington = [38.012, -77.037]
    distance = distance_between_coords([latitude, longitude], washington)
    Xapian.sortable_serialise(distance)
  end
end
16 | ## and later
17 | # enquire.set_sort_by_key_then_relevance(DistanceKeyMaker.new, false)
18 | # End of example code.
19 |
# Run +querystring+ against the database at +dbpath+, ordering results by
# the key produced by DistanceKeyMaker and then by relevance, print the
# matches on the requested page, then log the matching document ids.
#
# dbpath      - path of the Xapian database to search
# querystring - query text; supports the 'title:' and 'description:' prefixes
# offset      - defines starting point within result set
# pagesize    - defines number of records to retrieve
def search(dbpath, querystring, offset: 0, pagesize: 10)
  # Open the database we're going to search.
  db = Xapian::Database.new(dbpath)

  # Set up a QueryParser with a stemmer and suitable prefixes
  queryparser = Xapian::QueryParser.new
  queryparser.stemmer = Xapian::Stem.new('en')
  queryparser.stemming_strategy = Xapian::QueryParser::STEM_SOME
  queryparser.add_prefix('title', 'S')
  queryparser.add_prefix('description', 'XD')

  # And parse the query
  query = queryparser.parse_query(querystring)

  # Use an Enquire object on the database to run the query
  enquire = Xapian::Enquire.new(db)
  enquire.query = query

  enquire.set_sort_by_key_then_relevance(DistanceKeyMaker.new, false)

  # And print out something about each match
  matches = []
  enquire.mset(offset, pagesize).matches.each do |match|
    fields = JSON.parse(match.document.data)
    # The values are passed as one keyword hash, so the format string must
    # use named %<key> references; positional %i/%s would raise.
    printf "%<rank>i: #%<docid>3.3i %<name>s %<date>s\n Population %<pop>s\n",
           rank: match.rank + 1,
           docid: match.docid,
           name: fields['name'],
           date: format_date(fields['admitted'].to_s),
           pop: format_numeral(fields['population'].to_i)

    matches << match.docid
  end
  # Finally, make sure we log the query and displayed results
  log_matches(querystring, offset, pagesize, matches)
end

abort "Usage #{__FILE__} DBPATH QUERY..." if ARGV.length < 2

search(ARGV[0], ARGV[1..].join(' '))
65 |
--------------------------------------------------------------------------------
/code/ruby/search_synonyms.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | # frozen_string_literal: true
3 |
4 | require 'xapian'
5 | require 'json'
6 | require_relative 'support'
7 |
8 | ### Start of example code.
# Register a synonym in the database at +dbpath+, run +querystring+ with
# synonym expansion enabled, print the matches on the requested page, then
# log the matching document ids.
#
# dbpath      - path of the Xapian database; opened writable because a
#               synonym is added before searching
# querystring - query text; supports the 'title:' and 'description:' prefixes
# offset      - defines starting point within result set
# pagesize    - defines number of records to retrieve
def search(dbpath, querystring, offset: 0, pagesize: 10)
  # Open the database we're going to search.
  db = Xapian::WritableDatabase.new(dbpath)

  # Start of adding synonyms
  db.add_synonym('time', 'calendar')
  # End of adding synonyms

  # Set up a QueryParser with a stemmer and suitable prefixes
  queryparser = Xapian::QueryParser.new
  queryparser.stemmer = Xapian::Stem.new('en')
  queryparser.stemming_strategy = Xapian::QueryParser::STEM_SOME
  queryparser.add_prefix('title', 'S')
  queryparser.add_prefix('description', 'XD')

  # Start of set database
  queryparser.database = db
  # End of set database

  # And parse the query
  query = queryparser.parse_query(querystring,
                                  Xapian::QueryParser::FLAG_SYNONYM)

  # Use an Enquire object on the database to run the query
  enquire = Xapian::Enquire.new(db)
  enquire.query = query

  # And print out something about each match
  matches = []
  enquire.mset(offset, pagesize).matches.each do |match|
    fields = JSON.parse(match.document.data)
    # The values are passed as one keyword hash, so the format string must
    # use named %<key> references; positional %i/%s would raise.
    printf "%<rank>i: #%<docid>3.3i %<title>s\n",
           rank: match.rank + 1,
           docid: match.docid,
           title: fields['TITLE']
    matches << match.docid
  end

  # Finally, make sure we log the query and displayed results
  log_matches(querystring, offset, pagesize, matches)
end
### End of example code.

abort "Usage #{__FILE__} DBPATH QUERY..." if ARGV.length < 2

search(ARGV[0], ARGV[1..].join(' '))
56 |
--------------------------------------------------------------------------------
/code/ruby/support.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'csv'
4 | require 'date'
5 |
# Read the CSV file at +datapath+, treating its first line as the header
# row, and return the parsed table.
def parse_csv_file(datapath)
  CSV.open(datapath, headers: true, &:read)
end
9 |
# Read the CSV file at +datapath+ (first line is the header row) and return
# an array of only those rows that have a value in the 'order' column.
def parse_states(datapath)
  table = CSV.read(datapath, headers: true)
  table.filter { |state| state['order'] }
end
13 |
# Print a one-line summary of a search page to standard output: the quoted
# query, the [offset:offset+pagesize] window and the space-separated ids.
def log_matches(querystring, offset, pagesize, matches)
  window_end = offset + pagesize
  ids = matches.join(' ')
  puts "'#{querystring}'[#{offset}:#{window_end}] = #{ids}"
end
17 |
# Return every number found in +string+ as an array of floats, in order of
# appearance. A number is any run of digits and dots containing at least
# one digit; each run is converted with String#to_f.
def numbers_from_string(string)
  string.scan(/[\d.]*\d[\d.]*/).map(&:to_f)
end
25 |
# Return the straight-line (Euclidean) distance between two [lat, lon]
# pairs, computed directly on the coordinate values.
def distance_between_coords(latlon1, latlon2)
  delta_lat = latlon2[0] - latlon1[0]
  delta_lon = latlon2[1] - latlon1[1]
  Math.sqrt((delta_lat**2) + (delta_lon**2))
end
30 |
# Format an integer with a thousands separator, e.g. 1234567 -> "1,234,567".
#
# numeral - the Integer to format; may be negative
# sep     - string inserted between groups of three digits (default ',')
#
# Returns the formatted String.
# Raises RuntimeError if +numeral+ is not an Integer.
def format_numeral(numeral, sep: ',')
  raise 'Numeral must be an int type to format' unless numeral.is_a?(Integer)

  # Group only the digits; the sign is re-attached afterwards so that a
  # negative number never gets a separator directly after the '-'.
  sign = numeral.negative? ? '-' : ''
  groups = numeral.abs.to_s.reverse.scan(/\d{1,3}/).map(&:reverse).reverse
  "#{sign}#{groups.join(sep)}"
end
41 |
# Convert a 'YYYYMMDD' string into a long-form date such as
# "November 8, 1889".
#
# Raises RuntimeError when +datestr+ is not a String; strings that do not
# parse as '%Y%m%d' raise whatever DateTime.strptime raises.
def format_date(datestr)
  raise "Could not parse date to format 'YYYYMMDD'" unless datestr.is_a?(String)

  parsed = DateTime.strptime(datestr, '%Y%m%d')
  month_name = parsed.strftime('%B')
  [month_name, ' ', parsed.day, ', ', parsed.year].join
end
48 |
--------------------------------------------------------------------------------
/concepts/concurrency.rst:
--------------------------------------------------------------------------------
1 | ===========
2 | Concurrency
3 | ===========
4 |
5 | ---------
6 | Threading
7 | ---------
8 |
9 | Xapian does not provide explicit support for multi-threading, though it
10 | can be used in a multi-threaded program if you are aware of the details
11 | described below.
12 |
13 | Xapian doesn't maintain any global state, so you can safely use Xapian in a
14 | multi-threaded program provided you don't share objects between threads.
15 | In practice this restriction is often not a problem - each thread can
16 | create its own :xapian-class:`Database` object, and everything will work
17 | fine.
18 |
19 | If you really want to access the same Xapian object from multiple threads,
20 | then you need to ensure that it won't ever be accessed concurrently (if you
21 | don't ensure this, bad things are likely to happen - for example, crashes
22 | or even data corruption). One way to prevent concurrent access is to
23 | require that a thread gets an exclusive lock on a mutex while the access is
24 | made.
25 |
26 | Xapian doesn't include thread locking code to avoid imposing an overhead
27 | when it isn't needed. And in practice the caller can often lock over
28 | several operations, which wouldn't work if the locking code was in
29 | Xapian itself.
30 |
31 | Be aware that some Xapian objects will keep internal references to others
32 | - for example, if you call :xapian-method:`Database::get_document()`, the
33 | resulting :xapian-class:`Document` object will keep a reference to the
34 | :xapian-class:`Database` object, and so you can't safely use the
35 | :xapian-class:`Database` object in one thread at the same time as using the
36 | :xapian-class:`Document` object in another.
37 |
--------------------------------------------------------------------------------
/concepts/index.rst:
--------------------------------------------------------------------------------
1 | Core concepts
2 | =============
3 |
4 | .. toctree::
5 | :maxdepth: 2
6 |
7 | concurrency
8 | indexing/index
9 | search/index
10 | introduction
11 |
--------------------------------------------------------------------------------
/concepts/indexing/documents.rst:
--------------------------------------------------------------------------------
1 | Documents
2 | =========
3 |
4 | A document in Xapian is simply an item which is returned by a search. When
5 | building a new search system, a key thing to decide is what the documents
6 | in your system are going to be. There's often an obvious choice here, but
7 | in many cases there are alternatives. For example, for a search over a
8 | website, it seems natural to have one document for each page of the site.
9 | However, you could instead choose to use one document for each paragraph of
10 | each page, or to group pages together into subjects and have one document
11 | for each subject.
12 |
13 | Documents are identified in a database by a unique positive integer id,
14 | known as the `document ID`. Currently this is a 32-bit quantity by
15 | default (you can configure xapian-core with ``--enable-64bit-docid``
16 | to get 64-bit docids).
17 |
18 | Documents have three components: `data`, `terms` and `values`. We'll
19 | discuss terms and data first - values are useful for some more advanced
20 | search types.
21 |
22 | Document Data
23 | -------------
24 |
25 | The `document data` is an arbitrary binary blob of data associated with the
26 | document. Xapian treats this as completely opaque, and does nothing with
27 | this data other than storing it in the database (compressed with zlib if it
28 | is compressible) and returning it when requested.
29 |
30 | It can be used to hold a reference to the document elsewhere (such as the
31 | primary key in an external database table), or could be used to store the
32 | full text of the document.
33 |
34 | Generally you use the document data to store any information you need in order
35 | to display the resulting document to the user (or to whatever process consumes
36 | the results of searches). Xapian doesn't enforce a serialisation scheme for
37 | putting structured information into the document data, so you can use whatever
38 | is most appropriate for your application.
39 |
40 | Protocol buffers are often a good choice - there's support for them for most
41 | programming languages, they use a compact representation which doesn't
42 | explicitly store the field names, and you can add new fields without
43 | invalidating existing encoded data.
44 |
45 | Some other possible options are a simple scheme using newlines to separate
46 | ``name=value`` entries (like Omega uses), JSON, XML, or a language-specific
47 | serialisation such as ``pickle`` in Python.
48 |
49 | .. todo:: Talk about the importance of batching changes where feasible
50 |
--------------------------------------------------------------------------------
/concepts/indexing/index.rst:
--------------------------------------------------------------------------------
1 | Indexing concepts
2 | =================
3 |
4 | .. toctree::
5 | :maxdepth: 2
6 |
7 | databases
8 | documents
9 | terms
10 | termgenerator
11 | uniqueness
12 | values
13 | limitations
14 |
15 | .. todo:: cover user metadata; note it is included in transactions
16 |
--------------------------------------------------------------------------------
/concepts/indexing/termgenerator.rst:
--------------------------------------------------------------------------------
1 | Term Generator
2 | ==============
3 |
4 | Rather than force all users to write their own code to process text into terms
5 | for indexing, Xapian provides a :xapian-class:`TermGenerator` class. This parses
6 | chunks of text, producing appropriate terms, and adds them to a document.
7 |
8 | The :xapian-class:`TermGenerator` can be configured to perform stemming (and
9 | stopwording) when generating terms. It can optionally store information about
10 | the positions of words within the text, and can apply field-specific prefixes
11 | to the generated terms to allow searches to be restricted to specific
12 | fields. It can also add additional information to the database for use
13 | when performing spelling correction.
14 |
15 | If you're using the :xapian-class:`TermGenerator` to process text in this way,
16 | you will probably want to use the :doc:`QueryParser <../search/queryparser>`
17 | (described later) when performing searches.
18 |
--------------------------------------------------------------------------------
/concepts/indexing/uniqueness.rst:
--------------------------------------------------------------------------------
1 | Using identifiers with Xapian
2 | =============================
3 |
4 | Every document stored in a Xapian database has a unique positive integer
5 | id, either assigned automatically or manually.
6 |
7 | Often the documents which you're indexing with Xapian will already have
8 | unique ids, and you'll want to be able to use these to reindex an updated
9 | version of an existing document, or delete an expired document from the
10 | Xapian index. There are two ways of approaching this.
11 |
12 | One is to use a one-to-one mapping between your identifiers and Xapian
13 | docids. This will work if your identifiers are positive integers and they
14 | all fit within 32 bits (under about 4 billion), or if they are 64-bit
15 | and you configure xapian-core with ``--enable-64bit-docid``.
16 |
17 | The other is to use a special term containing your identifier, which will
18 | work for any type of identifier. Typically you will prefix this (by
19 | convention with 'Q') to avoid collisions with other terms. Terms have a
20 | limited length (245 bytes in glass and chert), so if your unique identifiers
21 | are really long you'll need to do something more complicated.
22 |
23 | For more information on both techniques, `see our FAQ on this`_.
24 |
25 | .. _see our FAQ on this: https://trac.xapian.org/wiki/FAQ/UniqueIds
26 |
--------------------------------------------------------------------------------
/concepts/search/index.rst:
--------------------------------------------------------------------------------
1 | Search concepts
2 | ===============
3 |
4 | .. toctree::
5 | :maxdepth: 2
6 |
7 | queries
8 | queryparser
9 | ranked_matches
10 | search_limitations
11 |
--------------------------------------------------------------------------------
/concepts/search/ranked_matches.rst:
--------------------------------------------------------------------------------
1 | Ranked matches
2 | ==============
3 |
4 | When you run a Query using Xapian, what you get is a list of `ranked`
5 | `matches`.
6 |
7 | Each match is a Xapian Document which satisfies the Query, with a
8 | `weight`, and the list is ordered by decreasing weight, the weight
9 | being an indicator of how good a match that Document is for the query
10 | that was run: a higher weight means a better match. The `rank` of each
11 | match is simply the position in the list of all matches, starting from
12 | 0. Some other search systems use the word "score" instead of weight.
13 |
14 | The actual weight is calculated by a `weighting scheme`; Xapian comes
15 | with a few different ones or you can write your own, although often
16 | the default is fine. (It uses a scheme called BM25, which takes into
17 | account things like how common a matching term is in a matching
18 | document compared to in the entire database, and the lengths of
19 | different matching documents.)
20 |
21 | Rather than having to run through the entire list of matches from the
22 | beginning, you actually ask for a sub-range of the entire list of
23 | matches, from an offset and extending for a given number of
24 | matches. Many search applications will provide the user with a way of
25 | "paging" through the matches, so the first page might be starting at 0
26 | for 10 matches, the second page starting at 10 for 10 matches, and so
27 | on.
28 |
29 | A page of matches in Xapian is called an MSet (for "match set").
30 |
31 | Alternative sort orders
32 | -----------------------
33 |
34 | Sometimes, rather than getting results sorted by `weight`, it would be more
35 | useful to get them in some other order. For example, it might be desirable
36 | to get results in order of the values stored in a date field.
37 |
38 | To do this, you first need to store the information used for the sort in a
39 | value slot, as described in the indexing documentation. You can then tell
40 | Xapian at search time to sort by the value in that slot. It is also
41 | possible to sort by the values in several slots (e.g., to sort items which
42 | have the same value in a particular slot by the value in a secondary slot).
43 |
44 | Finally, it is possible to ask Xapian to return the documents in order of
45 | the Xapian document ID numbers.
46 |
--------------------------------------------------------------------------------
/data/us_states_on_wikipedia:
--------------------------------------------------------------------------------
1 | Washington_(state)
2 | Oregon
3 | Montana
4 | Idaho
5 | Nevada
6 | California
7 | Utah
8 | Wyoming
9 | Colorado
10 | New_Mexico
11 | Alaska
12 | Hawaii
13 | North_Dakota
14 | South_Dakota
15 | Nebraska
16 | Kansas
17 | Texas
18 | Minnesota
19 | Wisconsin
20 | Iowa
21 | Oklahoma
22 | Missouri
23 | Arizona
24 | Arkansas
25 | Louisiana
26 | Mississippi
27 | Illinois
28 | Indiana
29 | Michigan
30 | Ohio
31 | Kentucky
32 | Tennessee
33 | Alabama
34 | Georgia_(U.S._state)
35 | Florida
36 | South_Carolina
37 | North_Carolina
38 | Virginia
39 | West_Virginia
40 | Pennsylvania
41 | New_York
42 | Vermont
43 | New_Jersey
44 | Maine
45 | New_Hampshire
46 | Massachusetts
47 | Rhode_Island
48 | Connecticut
49 | Delaware
50 | Maryland
51 |
--------------------------------------------------------------------------------
/deprecation/index.rst:
--------------------------------------------------------------------------------
1 | =======================
2 | Deprecation of features
3 | =======================
4 |
5 | .. toctree::
6 | :maxdepth: 2
7 |
8 | deprecation
9 | features_deprecated
10 | features_removed
11 |
--------------------------------------------------------------------------------
/howtos/index.rst:
--------------------------------------------------------------------------------
1 | How To...
2 | =========
3 |
4 | .. toctree::
5 | :maxdepth: 2
6 |
7 | boolean_filters
8 | range_queries
9 | facets
10 | sorting
11 | collapsing
12 | spelling
13 | synonyms
14 | weighting_scheme
15 | iterate_all_docs
16 |
--------------------------------------------------------------------------------
/howtos/iterate_all_docs.rst:
--------------------------------------------------------------------------------
1 | Iterate through all documents
2 | =============================
3 |
4 | Sometimes you want to access all the documents in a Xapian database. This can actually be done in two ways:
5 |
6 | .. _match-all:
7 |
8 | MatchAll Queries
9 | ----------------
10 |
11 | The `Xapian::Query::MatchAll` query is a special static query which will match all documents in the database.
12 | If you run this query on its own, with appropriate start and end parameters, you could retrieve all the documents.
13 | However be aware that even if you paged through the result sets, when you try to access a page deep in the result
14 | set a lot of processing and memory will be used even if the page is small, so running a plain `MatchAll` query is
15 | rarely a good idea.
16 |
17 | However, this method *is* appropriate if you're constructing a complicated query, and one of the components of that
18 | query should be all the documents. In particular, since Xapian doesn't support a unary `NOT` operator, if you want to
19 | run a "pure NOT" query to retrieve all documents which do not contain a given term, this can only be done using a
20 | `MatchAll` query and the binary `NOT` operator.
21 |
22 | .. todo:: Need an example here, and probably some rewording of the previous paragraph.
23 |
24 | .. note: MatchAll queries can also be created by constructing a query with an empty term: the MatchAll class is
25 | .. syntactic sugar for this, and avoids you needing to create an instance of a query for this.
26 |
27 | Iterating through all documents
28 | -------------------------------
29 |
30 | If you do need access to all the documents in the database, it is better to use a "posting list iterator".
31 | Such an iterator, which returns all documents in the database, can be created using::
32 |
33 | Xapian::Database::postlist_begin("")
34 |
35 | In Xapian, a postlist is a list of the documents in which a term exists. Here, we're again using the special
36 | "empty" term, which implicitly matches all documents, to get an iterator over all documents.
37 |
38 | The iterator can be dereferenced to get the document IDs; to get the actual documents, the
39 | :xapian-method:`Database::get_document()` method should be used.
40 |
41 | .. todo:: Need an example here, and probably some rewording.
42 |
--------------------------------------------------------------------------------
/index.rst:
--------------------------------------------------------------------------------
1 | Getting Started with Xapian |version|
2 | =====================================
3 |
4 | .. note::
5 |
6 | Not all Xapian functionality is yet documented in this guide, so
7 | once you've gone through it you may wish to look at our `online API
8 | documentation`_ and also at some of the additional help available
9 | on `the Xapian wiki`_.
10 |
11 | .. _online API documentation: https://xapian.org/docs/apidoc/html/annotated.html
12 | .. _the Xapian wiki: https://trac.xapian.org/wiki/
13 |
14 | .. toctree::
15 | :maxdepth: 2
16 |
17 | overview
18 | language_specific
19 | concepts/index
20 | practical_example/index
21 | howtos/index
22 | advanced/index
23 | deprecation/index
24 | glossary
25 | LICENSE
26 |
--------------------------------------------------------------------------------
/language_specific.rst:
--------------------------------------------------------------------------------
1 | .. xapianinclude:: language_specific/LANGUAGE/index.rst
2 |
--------------------------------------------------------------------------------
/language_specific/c++/index.rst:
--------------------------------------------------------------------------------
1 | ==================
2 | C++ Specific Notes
3 | ==================
4 |
5 | Exceptions
6 | ==========
7 |
8 | Xapian reports errors by throwing exceptions. For failures in things like
9 | memory allocation, you will see exceptions derived from ``std::exception``,
10 | but exceptions related to Xapian-specific issues will be derived from
11 | ``Xapian::Error``.
12 |
13 | Uncaught exceptions will cause your program to terminate, so it's wise
14 | to at least have a top-level exception handler which can catch any
15 | exceptions and report what they were. You can call the ``get_description()``
16 | method on a ``Xapian::Error`` object to get a human readable string including
17 | all the information the object contains.
18 |
19 | Because ``Xapian::Error`` is an abstract base class you need to catch
20 | it by reference::
21 |
22 | try {
23 | do_something_with_xapian();
24 | } catch (const Xapian::Error & e) {
25 | cout << "Exception: " << e.get_description() << '\n';
26 | } catch (const std::exception & e) {
27 | cout << "Exception: " << e.what() << '\n';
28 | }
29 |
30 | .. todo:: Xapian::Error hierarchy
31 |
32 | Object Copying
33 | ==============
34 |
35 | Objects are either reference counted handles or relatively cheap to copy.
36 |
37 | Object Ownership
38 | ================
39 |
40 | Creator owns.
41 |
42 | .. todo:: write me
43 |
44 | STL Compatibility
45 | =================
46 |
47 | .. todo:: write me
48 |
--------------------------------------------------------------------------------
/language_specific/csharp/index.rst:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xapian/xapian-docsprint/036dc8787a17b6a87d9c86fc4ef48a60f7abb3fa/language_specific/csharp/index.rst
--------------------------------------------------------------------------------
/language_specific/java/index.rst:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xapian/xapian-docsprint/036dc8787a17b6a87d9c86fc4ef48a60f7abb3fa/language_specific/java/index.rst
--------------------------------------------------------------------------------
/language_specific/java/running_examples.rst:
--------------------------------------------------------------------------------
1 | Since there isn't a standard location to install third-party Java
2 | libraries, you will likely have to set the ``CLASSPATH`` variable
3 | appropriately to indicate that you wish to use the Xapian Java
4 | bindings.
5 |
6 | There are two parts to the bindings: a jarfile (``xapian.jar``)
7 | containing the Java classes, and the JNI library (such as
8 | ``libxapian_jni.so`` on Linux, or ``libxapian_jni.jnilib`` on macOS)
9 | that connects them to Xapian itself. The easiest way to get this
10 | working is to copy those two files to the top-level directory of this
11 | repository. If you built your own Java bindings, the files will be in
12 | ``java/built`` in the bindings source code. Then you can use the
13 | following classpath (if on Linux)::
14 |
15 | xapian.jar:libxapian_jni.so:.
16 |
17 | If you set the ``CLASSPATH`` variable to this, then the example
18 | commands will work as shown. For instance, if you're using the
19 | ``bash`` shell, you should run the following before any example
20 | commands (again, on Linux)::
21 |
22 | export CLASSPATH=xapian.jar:libxapian_jni.so:.
23 |
--------------------------------------------------------------------------------
/language_specific/lua/index.rst:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xapian/xapian-docsprint/036dc8787a17b6a87d9c86fc4ef48a60f7abb3fa/language_specific/lua/index.rst
--------------------------------------------------------------------------------
/language_specific/perl/index.rst:
--------------------------------------------------------------------------------
1 | ===================
2 | Perl Specific Notes
3 | ===================
4 |
5 | Unicode
6 | #######
7 |
8 | The Unicode support in Perl is good and straightforward as long as you
9 | understand how it works. A string can be either a byte string
10 | (encoded) or a character string (decoded). The correct way to deal
11 | with this matter is to decode the strings on input and encode them on
12 | output, while the code should deal with characters (not bytes, so a
13 | character with diacritics is seen as a single character, not 2 or more
14 | bytes).
15 |
16 | Typically, this is done this way::
17 |
18 | #!/usr/bin/env perl
19 | use utf8; # this says that in this file we can use unicode and will be decoded
20 | use strict;
21 | use warnings;
22 | # this encodes on output
23 | binmode STDOUT, ":encoding(UTF-8)";
24 | binmode STDERR, ":encoding(UTF-8)";
25 |
26 | # this opens a file and decodes it.
27 | open (my $in, '<:encoding(UTF-8)', $file);
28 | while (<$in>) { .... }
29 |
30 | # this opens a file for writing and encodes the output on print
31 | open (my $out, '>:encoding(UTF-8)', $file);
32 | print $out "Đe ši Šu\n";
33 |
34 | Also, database drivers usually have a (recommended) setting to decode
35 | the strings coming out of the DB and encode them before storing
36 | them, so the code deals transparently with characters.
37 |
38 | How does this apply to Xapian? You usually store strings with
39 | ``set_data`` and ``add_value``. Such fields are binary fields, so they
40 | want bytes. If you pass a decoded string, it will be silently encoded.
41 | When you are going to retrieve them, the data will come out encoded,
42 | as a string of bytes, and you need to be prepared for it. You can do
43 | this using serialization. The example code stores the document data
44 | using ``encode_json`` (which produces a byte string) and on retrieving
45 | it calls ``decode_json`` (which returns decoded values). When you
46 | store a value, you encode it with ``encode`` or with
47 | ``sortable_serialise``. Both functions produce bytes::
48 |
49 | use Encode qw/encode decode/;
50 | use JSON::MaybeXS;
51 | # ....
52 | $doc->set_data(encode_json($rec));
53 | $doc->add_value(0, encode('UTF-8', $string));
54 | $doc->add_value(1, Search::Xapian::sortable_serialise($value));
55 |
56 | If you retrieve a stored value, you need to decode it::
57 |
58 | use Encode qw/encode decode/;
59 | use JSON::MaybeXS;
60 | # ...
61 | my $string = decode('UTF-8', $doc->get_value(0));
62 | my $fields = decode_json($doc->get_data);
63 |
64 | See :xapian-code-example:`index_facets` and
65 | :xapian-code-example:`search_facets` for some example code.
66 |
67 |
68 |
69 |
--------------------------------------------------------------------------------
/language_specific/php/index.rst:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xapian/xapian-docsprint/036dc8787a17b6a87d9c86fc4ef48a60f7abb3fa/language_specific/php/index.rst
--------------------------------------------------------------------------------
/language_specific/ruby/index.rst:
--------------------------------------------------------------------------------
1 | ===================
2 | Ruby Specific Notes
3 | ===================
4 |
--------------------------------------------------------------------------------
/language_specific/tcl/index.rst:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xapian/xapian-docsprint/036dc8787a17b6a87d9c86fc4ef48a60f7abb3fa/language_specific/tcl/index.rst
--------------------------------------------------------------------------------
/practical_example/index.rst:
--------------------------------------------------------------------------------
1 | .. _a-practical-example:
2 |
3 | A practical example
4 | ===================
5 |
6 | .. toctree::
7 | :maxdepth: 2
8 |
9 | indexing/index
10 | searching/index
11 |
--------------------------------------------------------------------------------
/practical_example/indexing/building_a_museum_catalogue.rst:
--------------------------------------------------------------------------------
1 | Building a museum catalogue
2 | ===========================
3 |
4 | We're going to build a simple search system based on museum catalogue
5 | data released under the `Creative Commons
6 | Attribution-NonCommercial-ShareAlike
7 | <https://creativecommons.org/licenses/by-nc-sa/3.0/>`_ license by the
8 | `Science Museum in London, UK <https://www.sciencemuseum.org.uk/>`_.
9 |
10 | Preparing to run the examples
11 | -----------------------------
12 |
13 | You should download the two sample datasets and the example code as
14 | described in the :doc:`overview </overview>`,
15 | and also check that you've installed Xapian as detailed there.
16 |
17 | .. The code is provided as a gzipped tar file, which you should unpack
18 | .. into the directory you're going to use while working through this
19 | .. guide. The datasets are gzipped CSV files, which should be
20 | .. uncompressed into the same directory. You should then open an
21 | .. interactive shell in that directory. For instance, if you're using
22 | .. Python for the examples, run something like the following::
23 | ..
24 | .. $ mkdir xapian-guide
25 | .. $ cd xapian-guide
26 | .. $ wget https://xapian.org/docs/examples/python.tgz
27 | .. $ wget https://xapian.org/data/muscat-data.csv.gz
28 | .. $ wget https://xapian.org/data/states-data.csv.gz
29 | .. $ gzip -dc python.tgz | tar xvf - && rm python.tgz
30 | .. $ gzip -d muscat-data.csv.gz
31 | .. $ gzip -d states-data.csv.gz
32 | ..
33 | .. This will leave you with two files, `muscat.csv` and `states.csv`, and
34 | .. a directory `code` which itself contains a directory `python` which
35 | .. contains all the example code.
36 |
--------------------------------------------------------------------------------
/practical_example/indexing/index.rst:
--------------------------------------------------------------------------------
1 | Indexing
2 | ========
3 |
4 | .. toctree::
5 | :maxdepth: 2
6 |
7 | building_a_museum_catalogue
8 | what_data_is_there
9 | what_do_people_want_to_search_for
10 | index_plan
11 | writing_the_code
12 | verifying_the_index
13 | updating_the_database
14 |
--------------------------------------------------------------------------------
/practical_example/indexing/index_plan.rst:
--------------------------------------------------------------------------------
1 | The index plan
2 | --------------
3 | In order to index the CSV, we want to take two fields from each row, title
4 | and description, and turn them into suitable terms. For straightforward
5 | textual search we don't need document values.
6 |
7 | Because we're dealing with free text, and because we know the whole dataset
8 | is in English, we can use stemming so that for instance searching for
9 | "sundial" and "sundials" will both match the same documents. This way
10 | people don't need to worry too much about exactly which words to use in
11 | their query.
12 |
13 | Finally, we want a way of separating the two fields. In Xapian this is done
14 | using `term prefixes`, basically by putting short strings at the beginning
15 | of terms to indicate which field the term indexes. As well as prefixed
16 | terms, we also want to generate unprefixed terms, so that as well as
17 | searching within fields you can also search for text in any field.
18 |
19 | There are some conventional prefixes used, which is helpful if you ever need to
20 | interoperate with omega (a web-based search engine) or other compatible
21 | systems. From this, we'll use 'S' to prefix title (it stands for 'subject'), and
22 | for description we'll use 'XD'. A full list of conventional prefixes is given at
23 | the top of the `omega documentation on termprefixes`_.
24 |
25 | .. _omega documentation on termprefixes: https://xapian.org/docs/omega/termprefixes
26 |
27 | When you're indexing multiple fields like this, the term positions used for
28 | each field when indexed unprefixed need to be kept apart. Say you have a
29 | title of "The Saints", and description "Don't like rabbits? Keep reading."
30 | If you index those fields without a gap, the phrase search "Saints don't
31 | like rabbits" will match, where it really shouldn't. Usually a gap of 100
32 | between each field is enough.
33 |
34 | To write to a database, we use the WritableDatabase class, which allows us
35 | to create, update or overwrite a database.
36 |
37 | To create terms, we use Xapian's TermGenerator, a built-in class to make
38 | turning free text into terms easier. It will split text into words, apply
39 | stemming, and then add term prefixes as needed. It can also take care of
40 | term positions, including the gap between different fields.
41 |
--------------------------------------------------------------------------------
/practical_example/indexing/updating_the_database.rst:
--------------------------------------------------------------------------------
1 | Updating the database
2 | ---------------------
3 |
4 | If you look back at the step where we verified the index, you may notice
5 | that the first item we have indexed has the word 'compass' spelled
6 | incorrectly, which means that we will need to either update just that
7 | document, or to re-index the entire database.
8 |
9 | Reindexing the database can be done immediately using the :xapian-basename-code-example:`index1` script
10 | we used for the initial indexing; this is because we are using an external
11 | ID for each document we add to the database, taken from the `id_NUMBER`
12 | field from the original data set. We then pass this to the :xapian-method:`Database::replace_document()`
13 | method, which updates if there's already a document under that external ID,
14 | or adds a document to the database otherwise.
15 |
16 | In fact, because of this, :xapian-basename-code-example:`index1` can update just part of the
17 | database. Just give it a file with only the rows that correspond to
18 | documents that need updating. Everything else in the database will be
19 | left untouched.
20 |
21 | Deleting documents
22 | ~~~~~~~~~~~~~~~~~~
23 |
24 | It is also possible to delete documents from the index using the
25 | :xapian-method:`Database::delete_document()` method on a
26 | :xapian-class:`WritableDatabase` object. This can be done either by Xapian docid
27 | or using unique ID terms, as with :xapian-method:`Database::replace_document()`.
28 |
29 | .. xapianexample:: delete1
30 |
31 | A copy of this code is available in :xapian-code-example:`^`.
32 |
33 | Then we just run our deletion tool, giving it identifiers taken from
34 | the `id_NUMBER` field in the data set:
35 |
36 | .. xapianrunexample:: index1
37 | :silent:
38 | :args: data/100-objects-v1.csv db
39 |
40 | .. xapianrunexample:: delete1
41 | :args: db 1953-448 1985-438
42 |
43 | After that, we expect to see two fewer documents in our database using xapian-delve:
44 |
45 | .. code-block:: none
46 |
47 | $ xapian-delve db
48 | UUID = 1820ef0a-055b-4946-ae73-67aa4ef5c226
49 | number of documents = 98
50 | average document length = 100.041
51 | document length lower bound = 33
52 | document length upper bound = 251
53 | highest document id ever used = 100
54 | has positional information = true
55 |
--------------------------------------------------------------------------------
/practical_example/indexing/verifying_the_index.rst:
--------------------------------------------------------------------------------
1 | Verifying the index using xapian-delve
2 | --------------------------------------
3 |
4 | Xapian comes with a handy utility called `xapian-delve` which can be used to
5 | inspect a database, so let's look at the one you just built. If you just
6 | pass a database path as a parameter you'll get an overview: how many documents,
7 | average document length, and some other statistics:
8 |
9 | .. code-block:: none
10 |
11 | $ xapian-delve db
12 | UUID = 1820ef0a-055b-4946-ae73-67aa4ef5c226
13 | number of documents = 100
14 | average document length = 100.58
15 | document length lower bound = 33
16 | document length upper bound = 251
17 | highest document id ever used = 100
18 | has positional information = true
19 |
20 | You can also look at an individual document, using Xapian's docid (``-d``
21 | means output document data as well):
22 |
23 | .. code-block:: none
24 |
25 | $ xapian-delve -r 1 -d db # output has been reformatted
26 | Data for record #1:
27 | {
28 | "MEASUREMENTS": "",
29 | "DESCRIPTION": "Ansonia Sunwatch (pocket compas dial)",
30 | "PLACE_MADE": "New York county, New York state, United States",
31 | "id_NUMBER": "1974-100",
32 | "WHOLE_PART": "WHOLE",
33 | "TITLE": "Ansonia Sunwatch (pocket compas dial)",
34 | "DATE_MADE": "1922-1939",
35 | "COLLECTION": "SCM - Time Measurement",
36 | "ITEM_NAME": "Pocket horizontal sundial",
37 | "MATERIALS": "",
38 | "MAKER": "Ansonia Clock Co."
39 | }
40 | Term List for record #1: Q1974-100 Sansonia Scompas Sdial Spocket
41 | Ssunwatch XDansonia XDcompass XDdial XDpocket XDsunwatch ZSansonia
42 | ZScompas ZSdial ZSpocket ZSsunwatch ZXDansonia ZXDcompas ZXDdial
43 | ZXDpocket ZXDsunwatch Zansonia Zcompass Zdial Zpocket Zsunwatch
44 | ansonia compass dial pocket sunwatch
45 |
46 | You can also go the other way, starting with a term and finding both
47 | statistics and which documents it indexes:
48 |
49 | .. code-block:: none
50 |
51 | $ xapian-delve -t Stime db
52 | Posting List for term `Stime' (termfreq 4, collfreq 4, wdf_max 4):
53 | 41 56 58 65
54 |
55 | This means you can look documents up by identifier:
56 |
57 | .. code-block:: none
58 |
59 | $ xapian-delve -t Q1974-100 db
60 | Posting List for term `Q1974-100' (termfreq 1, collfreq 1, wdf_max 1):
61 | 1
62 |
63 | ``xapian-delve`` is frequently useful if you aren't getting the behaviour you
64 | expect from a search system, to check that the database contains the
65 | documents and terms you expect.
66 |
--------------------------------------------------------------------------------
/practical_example/indexing/what_data_is_there.rst:
--------------------------------------------------------------------------------
1 | What data is there?
2 | -------------------
3 |
4 | Each row in the CSV file is an object from the catalogue, and has a number
5 | of fields. There are:
6 |
7 | id_NUMBER:
8 | a unique identifier
9 | ITEM_NAME:
10 | a simple name, often from an established thesaurus
11 | TITLE:
12 | a short caption
13 | MAKER:
14 | the name of who made the object
15 | DATE_MADE:
16 | when the object was made, which may be a range, approximate date or
17 | unknown
18 | PLACE_MADE:
19 | where the object was made
20 | MATERIALS:
21 | what the object is made of
22 | MEASUREMENTS:
23 | the dimensions of the object
24 | DESCRIPTION:
25 | a description of the object
26 | COLLECTION:
27 | the collection the object came from (e.g. Science Museum - Space
28 | Technology)
29 |
30 | There are obviously a number of different types of data here: free text,
31 | identifiers, dates, places (which could be geocoded to geo coordinates),
32 | and dimensions. Additionally, COLLECTION and MAKER both come from a list of
33 | possible values.
34 |
--------------------------------------------------------------------------------
/practical_example/indexing/what_do_people_want_to_search_for.rst:
--------------------------------------------------------------------------------
1 | What do people want to search for?
2 | ----------------------------------
3 |
4 | We can think of a large number of different things that people might want
5 | to find from our catalogue. For instance, they may want to find objects
6 | that were created in Nantes, or after 1812, or by Hurd-Brown Ltd. They may
7 | want to find everything made of brass, or not containing wood, or more than
8 | a metre in length. They may care only about objects in the National Railway
9 | Museum, or in their Railway Heraldry collection, or everything not in the
10 | Railway Heraldry collection. And of course they may want to look up objects
11 | that have certain words or phrases in the title or description - "free text
12 | search", one of the most common uses of search today.
13 |
14 | In order to support all of this we'll need to use many of the features of
15 | Xapian, but to get started we'll just look at one: free text search of the
16 | title and description.
17 |
18 | In later sections of this guide we'll use the same data and build on the
19 | system we create here.
20 |
--------------------------------------------------------------------------------
/practical_example/indexing/writing_the_code.rst:
--------------------------------------------------------------------------------
1 | Let's write some code
2 | ---------------------
3 |
4 | Here's the significant part of some example code to implement this index plan.
5 |
6 | .. xapianexample:: index1
7 |
8 | A full copy of this code is available in :xapian-code-example:`^`.
9 |
10 | You can run this code to index a sample data file (held in
11 | :xapian-example:`data/100-objects-v1.csv`) to a database at path ``db`` as follows:
12 |
13 | .. xapianrunexample:: index1
14 | :cleanfirst: db
15 | :args: data/100-objects-v1.csv db
16 |
--------------------------------------------------------------------------------
/practical_example/searching/building.rst:
--------------------------------------------------------------------------------
1 | Building the search
2 | -------------------
3 |
4 | Now that we have our database populated with some values, it is time for
5 | the code to search the database and display some results.
6 |
7 | We want to take some text from the user, and search for it in the
8 | database; to do that we need to convert it into a Xapian Query, which
9 | you will recall is a tree made up of terms (which in this case will be
10 | the stemmed forms of words in the text from the user), and operations
11 | such as AND, OR and so forth.
12 |
13 | There are many ways to go from the user's text to a Query, but the
14 | most simple of these is to use the QueryParser. We then pass the Query
15 | to an Enquire object, which also needs setting up with a database, and
16 | is where you'd set various other options that affect how the query is
17 | run (such as sorting, for instance) which we won't address here.
18 |
19 | .. xapianexample:: search1
20 |
21 | A full copy of this code is available in :xapian-code-example:`^`.
22 |
--------------------------------------------------------------------------------
/practical_example/searching/database_modified.rst:
--------------------------------------------------------------------------------
1 | Database Modified
2 | -----------------
3 |
4 | If you're updating the same database you search from (rather than
5 | updating a separate database and then "flipping" between them, using a
6 | stub database), you may run into :xapian-class:`DatabaseModifiedError`, an
7 | exception that can be raised while reading from the database. What
8 | this means is that the database has changed too much since you opened
9 | it for Xapian to be able to continue supplying you with
10 | information. The solution here is to re-open the database with its
11 | :xapian-just-method:`reopen()` method.
12 |
13 |
--------------------------------------------------------------------------------
/practical_example/searching/index.rst:
--------------------------------------------------------------------------------
1 | Searching
2 | =========
3 |
4 | .. toctree::
5 | :maxdepth: 2
6 |
7 | building
8 | running_the_search
9 | prefix
10 | database_modified
11 |
--------------------------------------------------------------------------------
/practical_example/searching/prefix.rst:
--------------------------------------------------------------------------------
1 | Searching separate fields
2 | -------------------------
3 |
4 | When we built our index, we used prefixes to separate the terms generated from
5 | the title and description fields. This allows us to perform searches which are
6 | restricted to the text in just one of those fields, by searching only terms
7 | with the desired prefix.
8 |
9 | When using the Query Parser, it is possible to restrict your search to
10 | certain prefixed terms (e.g. title, or description). These can be searched
11 | for either by using a search prefix (which can correlate to an indexing
12 | prefix) or as a general text document.
13 |
14 | To set up a search prefix, the QueryParser needs to be told which prefixes
15 | in the search query relate to those in the index. We did that in the previous
16 | search code:
17 |
18 | .. xapianrunexample:: index1
19 | :silent:
20 | :args: data/100-objects-v1.csv db
21 |
22 | .. xapianrunexample:: delete1
23 | :silent:
24 | :args: db 1953-448 1985-438
25 |
26 | .. xapianexample:: search1
27 | :marker: prefix configuration.
28 |
29 | This allows us to perform a search based on either field, for example:
30 |
31 | .. xapianrunexample:: search1
32 | :args: db title:sunwatch
33 |
34 | We can also combine prefixes with the logical operators to perform more
35 | complex queries (note that we need to escape quotes or else the shell
36 | will eat them):
37 |
38 | .. xapianrunexample:: search1
39 | :args: db description:\"leather case\" AND title:sundial
40 |
--------------------------------------------------------------------------------
/practical_example/searching/running_the_search.rst:
--------------------------------------------------------------------------------
1 | Running a Search
2 | ----------------
3 | To search the database we've built, you just run our simple search engine:
4 |
5 | .. xapianrunexample:: index1
6 | :silent:
7 | :args: data/100-objects-v1.csv db
8 |
9 | .. xapianrunexample:: delete1
10 | :silent:
11 | :args: db 1953-448 1985-438
12 |
13 | .. xapianrunexample:: search1
14 | :args: db watch
15 |
16 | These results show that 7 documents match our search for the term
17 | 'watch', providing the document IDs (e.g. #004) and title for each.
18 | If you want to search for multiple words, just chain them together on
19 | the command line:
20 |
21 | .. xapianrunexample:: search1
22 | :args: db Dent watch
23 |
24 | You'll notice that all of the results from the first time come back
25 | the second time also, with additional ones (they match 'Dent' but not
26 | 'watch'), because by default QueryParser will use the OR operator to
27 | combine the different search terms. Also, because #046 contains both
28 | 'Dent' and 'watch', it now ranks highest of all the matches.
29 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Sphinx~=1.8.0
2 |
--------------------------------------------------------------------------------