├── .github └── workflows │ └── ci.yml ├── .gitignore ├── .readthedocs.yaml ├── LICENSE.rst ├── Makefile ├── README.rst ├── advanced ├── admin_notes.rst ├── custom_weighting.rst ├── index.rst ├── postingsource.rst ├── replication.rst ├── scalability.rst ├── serialisation.rst └── unigramlm.rst ├── attic ├── clustering.rst ├── eset.rst ├── geospatial.rst ├── pagination.rst ├── query_authorisation.rst ├── range_performance.rst └── remote.rst ├── code ├── c++ │ ├── .gitignore │ ├── delete1.cc │ ├── index1.cc │ ├── index_facets.cc │ ├── index_filters.cc │ ├── index_ranges.cc │ ├── index_ranges2.cc │ ├── index_values_with_geo.cc │ ├── search1.cc │ ├── search_facets.cc │ ├── search_filters.cc │ ├── search_filters2.cc │ ├── search_ranges.cc │ ├── search_ranges2.cc │ ├── search_sorting.cc │ ├── search_sorting2.cc │ ├── search_sorting3.cc │ ├── search_synonyms.cc │ ├── support.cc │ └── support.h ├── expected.out │ ├── delete1.out │ ├── index1.db_title=3asunwatch.out │ ├── index1.out │ ├── index_facets.out │ ├── index_filters.out │ ├── index_ranges.out │ ├── index_ranges2.out │ ├── index_values_with_geo.out │ ├── search1.db_Dent_watch.out │ ├── search1.db_description=3a=5c=22leather_case=5c=22_AND_title=3asundial.out │ ├── search1.db_watch.out │ ├── search1.out │ ├── search_facets.out │ ├── search_filters.out │ ├── search_filters2.out │ ├── search_ranges.db_..50mm.out │ ├── search_ranges.db_1000..mm_1800..1899.out │ ├── search_ranges.db_1980..1989.out │ ├── search_ranges.db_clock_1960...out │ ├── search_ranges2.statesdb_10000000...out │ ├── search_ranges2.statesdb_11=2f08=2f1889..07=2f10=2f1890.out │ ├── search_ranges2.statesdb_1780..1789_10000000...out │ ├── search_ranges2.statesdb_1800..1899.out │ ├── search_ranges2.statesdb_spanish.out │ ├── search_sorting.out │ ├── search_sorting2.out │ ├── search_sorting3.out │ ├── search_synonyms.db_time.out │ └── search_synonyms.out ├── java │ ├── .gitignore │ ├── delete1.java │ ├── index1.java │ ├── 
index1.java.data=2f100-objects-v1.csv_db.out │ ├── search1.java │ └── support.java ├── perl │ ├── Support.pm │ ├── delete1.pl │ ├── index1.pl │ ├── index_facets.pl │ ├── index_filters.pl │ ├── index_ranges.pl │ ├── index_ranges2.pl │ ├── search1.pl │ ├── search_facets.pl │ ├── search_filters.pl │ ├── search_filters2.pl │ ├── search_sorting.pl │ ├── search_sorting2.pl │ ├── search_synonyms.pl │ └── strings.t ├── php │ ├── delete1.php │ ├── index1.php │ ├── index_facets.php │ ├── index_filters.php │ ├── logger.php │ ├── parsecsv.php │ ├── search1.php │ ├── search_facets.php │ ├── search_filters.php │ └── search_filters2.php ├── python │ ├── delete1.py │ ├── from_wikipedia.py │ ├── index1.py │ ├── index_facets.py │ ├── index_filters.py │ ├── index_ranges.py │ ├── index_ranges2.py │ ├── index_sorting.py │ ├── index_values_with_geo.py │ ├── postingsource.py │ ├── search1.py │ ├── search_facets.py │ ├── search_filters.py │ ├── search_filters2.py │ ├── search_ranges.py │ ├── search_ranges2.py │ ├── search_sorting.py │ ├── search_sorting2.py │ ├── search_sorting3.py │ ├── search_synonyms.py │ └── support.py ├── python3 │ ├── delete1.py │ ├── index1.py │ ├── index_facets.py │ ├── index_filters.py │ ├── index_ranges.py │ ├── index_ranges2.py │ ├── index_sorting.py │ ├── index_values_with_geo.py │ ├── postingsource.py │ ├── search1.py │ ├── search_facets.py │ ├── search_filters.py │ ├── search_filters2.py │ ├── search_ranges.py │ ├── search_ranges2.py │ ├── search_sorting.py │ ├── search_sorting2.py │ ├── search_sorting3.py │ ├── search_synonyms.py │ └── support.py └── ruby │ ├── delete1.rb │ ├── index1.rb │ ├── index_facets.rb │ ├── index_filters.rb │ ├── index_ranges.rb │ ├── index_ranges2.rb │ ├── index_values_with_geo.rb │ ├── search1.rb │ ├── search_facets.rb │ ├── search_filters.rb │ ├── search_filters2.rb │ ├── search_ranges.rb │ ├── search_ranges2.rb │ ├── search_sorting.rb │ ├── search_sorting2.rb │ ├── search_sorting3.rb │ ├── search_synonyms.rb │ └── support.rb ├── 
concepts ├── concurrency.rst ├── index.rst ├── indexing │ ├── databases.rst │ ├── documents.rst │ ├── index.rst │ ├── limitations.rst │ ├── termgenerator.rst │ ├── terms.rst │ ├── uniqueness.rst │ └── values.rst ├── introduction.rst └── search │ ├── index.rst │ ├── queries.rst │ ├── queryparser.rst │ ├── ranked_matches.rst │ └── search_limitations.rst ├── conf.py ├── data ├── 100-objects-v1.csv ├── 100-objects-v2.csv ├── states.csv └── us_states_on_wikipedia ├── deprecation ├── deprecation.rst ├── features_deprecated.rst ├── features_removed.rst └── index.rst ├── glossary.rst ├── howtos ├── boolean_filters.rst ├── collapsing.rst ├── facets.rst ├── index.rst ├── iterate_all_docs.rst ├── range_queries.rst ├── sorting.rst ├── spelling.rst ├── synonyms.rst └── weighting_scheme.rst ├── index.rst ├── language_specific.rst ├── language_specific ├── c++ │ └── index.rst ├── csharp │ └── index.rst ├── java │ ├── index.rst │ └── running_examples.rst ├── lua │ └── index.rst ├── perl │ └── index.rst ├── php │ └── index.rst ├── python │ └── index.rst ├── python3 │ └── index.rst ├── ruby │ └── index.rst └── tcl │ └── index.rst ├── oldmanual └── output.txt ├── overview.rst ├── practical_example ├── index.rst ├── indexing │ ├── building_a_museum_catalogue.rst │ ├── index.rst │ ├── index_plan.rst │ ├── updating_the_database.rst │ ├── verifying_the_index.rst │ ├── what_data_is_there.rst │ ├── what_do_people_want_to_search_for.rst │ └── writing_the_code.rst └── searching │ ├── building.rst │ ├── database_modified.rst │ ├── index.rst │ ├── prefix.rst │ └── running_the_search.rst └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | .*.sw? 
3 | *.pyc 4 | _build 5 | ENV 6 | db 7 | statesdb 8 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.11" 13 | jobs: 14 | # We want to replace the build command itself but the "Override the build 15 | # process" feature means we have to reimplement everything, so instead we 16 | # "Extend the build process" and set up a dummy no-op conf.py in 17 | # `pre_build` so `build` doesn't do anything, then actually build in 18 | # `post_build`. 19 | pre_build: 20 | - mkdir dummy 21 | - touch dummy/conf.py dummy/contents.rst 22 | post_build: 23 | - rm -rf dummy _readthedocs/html 24 | - make html LANGUAGE=python3 BUILDDIR=_readthedocs 25 | 26 | # (Don't) build documentation with Sphinx 27 | sphinx: 28 | configuration: dummy/conf.py 29 | 30 | # We recommend specifying your dependencies to enable reproducible builds: 31 | # https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 32 | python: 33 | install: 34 | - requirements: requirements.txt 35 | -------------------------------------------------------------------------------- /LICENSE.rst: -------------------------------------------------------------------------------- 1 | License and authors 2 | =================== 3 | 4 | This license applies to all documentation and example code in this book. 5 | Data sets are provided under suitable Creative Commons licenses. 
6 | 7 | | Copyright (c) 2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016 Olly Betts 8 | | Copyright (c) 2006,2007,2008,2009 Lemur Consulting Ltd 9 | | Copyright (c) 2007 Deron Meranda 10 | | Copyright (c) 2007 Jenny Black 11 | | Copyright (c) 2010,2011 Richard Boulton 12 | | Copyright (c) 2011 Justin Finkelstein 13 | | Copyright (c) 2011,2012 Dan Colish 14 | | Copyright (c) 2003,2006,2011,2012,2013,2014 James Aylett 15 | | Copyright (c) 2013 Aarsh Shah 16 | | Copyright (c) 2014 Jorge Carleitao 17 | | Copyright (c) 2014 Guarav Arora 18 | | Copyright (c) 2014 Assem Chelli 19 | | Copyright (c) 2014 Mayank Chaudhary 20 | | Copyright (c) 2016 Aakash Muttineni 21 | | Copyright (c) 2016 Vivek Pal 22 | | Copyright (c) 2016 Parth Gupta 23 | | Copyright (c) 2018 Marco Pessotto 24 | 25 | 26 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 27 | 28 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 29 | 30 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
31 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Xapian documentation sprint 2 | =========================== 3 | 4 | This is the source for `Xapian's user 5 | guide `_. 6 | Eventually this repository will be merged into the main Xapian tree. 7 | 8 | You will need the `Sphinx documentation tool `_ 9 | installed to process this documentation. You can install the `python3-sphinx` 10 | or `python-sphinx` package on Debian, Fedora and Ubuntu, or ``pip install -r 11 | requirements.txt`` to install the python package directly. 12 | 13 | You can generate versions for different programming languages (with translated 14 | examples and adjustments to the text). For full details see ``make help`` 15 | but for example to generate an HTML version for C++ use:: 16 | 17 | make html LANGUAGE=c++ 18 | 19 | The default if `LANGUAGE` isn't specified (e.g. when you run just ``make 20 | html``) is to build for `python3`. 21 | 22 | You can chat to us on matrix or via our mailing lists. Links to 23 | these are `on our website `_. 24 | -------------------------------------------------------------------------------- /advanced/index.rst: -------------------------------------------------------------------------------- 1 | Advanced features 2 | ================= 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | 7 | postingsource 8 | unigramlm 9 | custom_weighting 10 | admin_notes 11 | scalability 12 | replication 13 | serialisation 14 | -------------------------------------------------------------------------------- /attic/clustering.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Clustering 3 | ========== 4 | 5 | Document this once we have support for it. 
6 | -------------------------------------------------------------------------------- /attic/eset.rst: -------------------------------------------------------------------------------- 1 | ====================== 2 | Building an expand set 3 | ====================== 4 | 5 | .. todo:: 6 | 7 | Introduction, rationale and example. 8 | Documentation of the algorithm used by Enquire::get_eset(). 9 | 10 | See https://lists.xapian.org/pipermail/xapian-discuss/2008-February/005263.html 11 | for a short discussion on the size of the RSet to use. 12 | -------------------------------------------------------------------------------- /attic/geospatial.rst: -------------------------------------------------------------------------------- 1 | Geospatial searches 2 | =================== 3 | 4 | Xapian's geospatial support is currently still a work in progress, and 5 | until it is generally available this documentation will either be 6 | empty or out of date. 7 | 8 | .. todo:: Write about how to index geolocation information, and how to use the 9 | geo posting sources and keymaker to sort by distance, bias results by 10 | distance, and limit results by distance. Discuss storing geo 11 | bounding box terms for accelerating distance-limited searches. 12 | -------------------------------------------------------------------------------- /attic/pagination.rst: -------------------------------------------------------------------------------- 1 | Pagination 2 | ========== 3 | 4 | This will be a howto. 5 | 6 | .. todo:: pagination (ask for pagesize*#pages + 1) 7 | .. 
todo:: get_matches_estimated() and check_at_least 8 | -------------------------------------------------------------------------------- /attic/query_authorisation.rst: -------------------------------------------------------------------------------- 1 | Query authorisation 2 | =================== 3 | 4 | Say you are building a system that allows people to write private 5 | diary entries, and only share them with specific people. You wouldn't 6 | want search to expose those entries to everyone, so you need to build 7 | understanding of your authorisation scheme into the search system. 8 | 9 | .. todo:: list up front the various methods 10 | 11 | .. todo:: mention that omindex now indexes Unix user and group permissions. 12 | 13 | Filtering results 14 | ----------------- 15 | 16 | .. todo:: Discuss filtering results coming back from a query, and the problems 17 | with just doing that. 18 | 19 | Putting authorisation data into the search index 20 | ------------------------------------------------ 21 | 22 | .. todo:: Discuss implementing auth schemes by indexing 23 | appropriate data. 24 | 25 | Hybrid schemes 26 | -------------- 27 | 28 | .. todo:: Discuss hybrid schemes (implementing auth using 29 | indexed terms, and also filtering results). 30 | 31 | Timeliness of index authorisation 32 | --------------------------------- 33 | 34 | .. todo:: Discuss issues relating 35 | to updates (in particular, how fast does something need to be hidden 36 | if it is changed to being private). 
37 | -------------------------------------------------------------------------------- /attic/range_performance.rst: -------------------------------------------------------------------------------- 1 | Performance of Value Ranges 2 | =========================== 3 | 4 | If combined with a suitable term-based query (such as an `OP_AND` 5 | query over one or more terms), this performance impact will be less 6 | because the range operation will only have to run over the potential 7 | matches, which are reduced from the entire database by the term-based 8 | query. 9 | 10 | If, as well as using document values, you also convert groups of those 11 | values into terms, you can provide those term-based queries even when 12 | your users are only interested in a pure range search. For instance, 13 | consider the population information. If you divide the range of 14 | populations into a number of subranges, you can allocate a term to 15 | describe each. We'll use a prefix of `XP` (for "population") here. 16 | 17 | +------------------+------+ 18 | | Population range | Term | 19 | +==================+======+ 20 | | 0 - 10 million | XP0 | 21 | +------------------+------+ 22 | | 10 - 20 million | XP1 | 23 | +------------------+------+ 24 | | 20 - 30 million | XP2 | 25 | +------------------+------+ 26 | | 30 - 40 million | XP3 | 27 | +------------------+------+ 28 | 29 | Then you can use a custom :xapian-class:`RangeProcessor` to produce a 30 | query which uses :xapian-just-constant:`OP_VALUE_RANGE` to match the 31 | range exactly, but first limits the number of documents that this 32 | needs to consider using the filter terms above. For instance, if the user asks 33 | for '..15000000', you can use :xapian-just-constant:`OP_FILTER` to limit 34 | the value range subquery to only considering documents matching an 35 | :xapian-just-constant:`OP_OR` subquery with terms `XP0` and `XP1`. 36 | 37 | .. todo:: possibly implementing this example would help make it more clear. 38 | 39 | .. 
todo:: Now ticket #663 is done and 40 | we have RangeProcessor, we can move this to advanced and the range 41 | queries howto should point here. 42 | -------------------------------------------------------------------------------- /attic/remote.rst: -------------------------------------------------------------------------------- 1 | ============================= 2 | Working with remote databases 3 | ============================= 4 | 5 | .. contents:: Table of contents 6 | 7 | .. todo:: This needs writing from scratch; the previous document wasn't very good and is license entangled. 8 | -------------------------------------------------------------------------------- /code/c++/.gitignore: -------------------------------------------------------------------------------- 1 | .libs 2 | *.lo 3 | *.o 4 | built 5 | -------------------------------------------------------------------------------- /code/c++/delete1.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | using namespace std; 7 | 8 | // Start of example code. 9 | static void delete_docs(const string &dbpath, char ** identifiers) 10 | { 11 | // Open the database we're going to be deleting from. 12 | Xapian::WritableDatabase db(dbpath, Xapian::DB_OPEN); 13 | 14 | while (*identifiers) { 15 | string idterm = "Q"; 16 | idterm += *identifiers; 17 | db.delete_document(idterm); 18 | ++identifiers; 19 | } 20 | } 21 | // End of example code. 
22 | 23 | int main(int argc, char** argv) { 24 | if (argc < 3) { 25 | cerr << "Usage: " << argv[0] << " DBPATH ID...\n"; 26 | return 1; 27 | } 28 | delete_docs(argv[1], argv + 2); 29 | } 30 | -------------------------------------------------------------------------------- /code/c++/search1.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "support.h" 9 | 10 | using namespace std; 11 | 12 | // Start of example code. 13 | static void 14 | search(const string & dbpath, const string & querystring, 15 | Xapian::doccount offset = 0, Xapian::doccount pagesize = 10) 16 | { 17 | // offset - defines starting point within result set. 18 | // pagesize - defines number of records to retrieve. 19 | 20 | // Open the database we're going to search. 21 | Xapian::Database db(dbpath); 22 | 23 | // Set up a QueryParser with a stemmer and suitable prefixes. 24 | Xapian::QueryParser queryparser; 25 | queryparser.set_stemmer(Xapian::Stem("en")); 26 | queryparser.set_stemming_strategy(queryparser.STEM_SOME); 27 | // Start of prefix configuration. 28 | queryparser.add_prefix("title", "S"); 29 | queryparser.add_prefix("description", "XD"); 30 | // End of prefix configuration. 31 | 32 | // And parse the query. 33 | Xapian::Query query = queryparser.parse_query(querystring); 34 | 35 | // Use an Enquire object on the database to run the query. 36 | Xapian::Enquire enquire(db); 37 | enquire.set_query(query); 38 | 39 | // And print out something about each match. 
40 | Xapian::MSet mset = enquire.get_mset(offset, pagesize); 41 | 42 | clog << "'" << querystring << "'[" << offset << ":" << offset + pagesize 43 | << "] ="; 44 | for (Xapian::MSetIterator m = mset.begin(); m != mset.end(); ++m) { 45 | Xapian::docid did = *m; 46 | cout << m.get_rank() + 1 << ": #" << setfill('0') << setw(3) << did 47 | << ' '; 48 | 49 | const size_t DOC_FIELD_TITLE = 1; 50 | const string & data = m.get_document().get_data(); 51 | cout << get_field(data, DOC_FIELD_TITLE) << '\n'; 52 | // Log the document id. 53 | clog << ' ' << did; 54 | } 55 | clog << '\n'; 56 | } 57 | // End of example code. 58 | 59 | int main(int argc, char** argv) { 60 | if (argc < 3) { 61 | cerr << "Usage: " << argv[0] << " DBPATH QUERYTERM...\n"; 62 | return 1; 63 | } 64 | const char * dbpath = argv[1]; 65 | 66 | // Join the rest of the arguments with spaces to make the query string. 67 | string querystring; 68 | for (argv += 2; *argv; ++argv) { 69 | if (!querystring.empty()) querystring += ' '; 70 | querystring += *argv; 71 | } 72 | 73 | search(dbpath, querystring); 74 | } 75 | -------------------------------------------------------------------------------- /code/c++/search_filters2.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "support.h" 10 | 11 | using namespace std; 12 | 13 | static void 14 | search(const string & dbpath, const string & querystring, 15 | Xapian::doccount offset = 0, Xapian::doccount pagesize = 10) 16 | { 17 | // offset - defines starting point within result set. 18 | // pagesize - defines number of records to retrieve. 19 | 20 | // Open the database we're going to search. 21 | Xapian::Database db(dbpath); 22 | 23 | // Start of example code. 24 | // Set up a QueryParser with a stemmer and suitable prefixes. 
25 | Xapian::QueryParser queryparser; 26 | queryparser.set_stemmer(Xapian::Stem("en")); 27 | queryparser.set_stemming_strategy(queryparser.STEM_SOME); 28 | queryparser.add_prefix("title", "S"); 29 | queryparser.add_prefix("description", "XD"); 30 | queryparser.add_boolean_prefix("material", "XM"); 31 | 32 | // And parse the query. 33 | Xapian::Query query = queryparser.parse_query(querystring); 34 | // End of example code. 35 | 36 | // Use an Enquire object on the database to run the query. 37 | Xapian::Enquire enquire(db); 38 | enquire.set_query(query); 39 | 40 | // And print out something about each match. 41 | Xapian::MSet mset = enquire.get_mset(offset, pagesize); 42 | 43 | clog << "'" << querystring << "'[" << offset << ":" << offset + pagesize 44 | << "] ="; 45 | for (Xapian::MSetIterator m = mset.begin(); m != mset.end(); ++m) { 46 | Xapian::docid did = *m; 47 | cout << m.get_rank() + 1 << ": #" << setfill('0') << setw(3) << did 48 | << ' '; 49 | 50 | const size_t DOC_FIELD_TITLE = 1; 51 | const string & data = m.get_document().get_data(); 52 | cout << get_field(data, DOC_FIELD_TITLE) << '\n'; 53 | // Log the document id. 54 | clog << ' ' << did; 55 | } 56 | clog << '\n'; 57 | } 58 | 59 | int main(int argc, char** argv) { 60 | if (argc < 3) { 61 | cerr << "Usage: " << argv[0] << " DBPATH QUERYTERM...\n"; 62 | return 1; 63 | } 64 | const char * dbpath = argv[1]; 65 | 66 | // Join the rest of the arguments with spaces to make the query string. 
67 | string querystring; 68 | for (argv += 2; *argv; ++argv) { 69 | if (!querystring.empty()) querystring += ' '; 70 | querystring += *argv; 71 | } 72 | 73 | search(dbpath, querystring); 74 | } 75 | -------------------------------------------------------------------------------- /code/c++/search_synonyms.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "support.h" 9 | 10 | using namespace std; 11 | 12 | // Start of example code. 13 | static void 14 | search(const string & dbpath, const string & querystring, 15 | Xapian::doccount offset = 0, Xapian::doccount pagesize = 10) 16 | { 17 | // offset - defines starting point within result set. 18 | // pagesize - defines number of records to retrieve. 19 | 20 | // Open the database we're going to search. 21 | Xapian::WritableDatabase db(dbpath); 22 | 23 | // Start of adding synonyms 24 | db.add_synonym("time", "calendar"); 25 | // End of adding synonyms 26 | 27 | // Set up a QueryParser with a stemmer and suitable prefixes. 28 | Xapian::QueryParser queryparser; 29 | queryparser.set_stemmer(Xapian::Stem("en")); 30 | queryparser.set_stemming_strategy(queryparser.STEM_SOME); 31 | queryparser.add_prefix("title", "S"); 32 | queryparser.add_prefix("description", "XD"); 33 | 34 | // Start of set database 35 | queryparser.set_database(db); 36 | // End of set database 37 | 38 | // And parse the query. 39 | Xapian::Query query = queryparser.parse_query(querystring, 40 | queryparser.FLAG_DEFAULT | 41 | queryparser.FLAG_SYNONYM); 42 | 43 | // Use an Enquire object on the database to run the query. 44 | Xapian::Enquire enquire(db); 45 | enquire.set_query(query); 46 | 47 | // And print out something about each match. 
48 | Xapian::MSet mset = enquire.get_mset(offset, pagesize); 49 | 50 | clog << "'" << querystring << "'[" << offset << ":" << offset + pagesize 51 | << "] ="; 52 | for (Xapian::MSetIterator m = mset.begin(); m != mset.end(); ++m) { 53 | Xapian::docid did = *m; 54 | cout << m.get_rank() + 1 << ": #" << setfill('0') << setw(3) << did 55 | << ' '; 56 | 57 | const size_t DOC_FIELD_TITLE = 1; 58 | const string & data = m.get_document().get_data(); 59 | cout << get_field(data, DOC_FIELD_TITLE) << '\n'; 60 | // Log the document id. 61 | clog << ' ' << did; 62 | } 63 | clog << '\n'; 64 | } 65 | // End of example code. 66 | 67 | int main(int argc, char** argv) { 68 | if (argc < 3) { 69 | cerr << "Usage: " << argv[0] << " DBPATH QUERYTERM...\n"; 70 | return 1; 71 | } 72 | const char * dbpath = argv[1]; 73 | 74 | // Join the rest of the arguments with spaces to make the query string. 75 | string querystring; 76 | for (argv += 2; *argv; ++argv) { 77 | if (!querystring.empty()) querystring += ' '; 78 | querystring += *argv; 79 | } 80 | 81 | search(dbpath, querystring); 82 | } 83 | -------------------------------------------------------------------------------- /code/c++/support.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | bool csv_parse_line(std::ifstream & csv, std::vector & fields); 6 | 7 | std::string get_field(const std::string & data, size_t field); 8 | 9 | bool max_number_in_string(const std::string & s, double *n_ptr); 10 | 11 | bool first_number_in_string(const std::string & s, double *n_ptr); 12 | 13 | std::string format_date(const std::string& yyyymmdd); 14 | 15 | std::string format_numeral(std::string n); 16 | 17 | double distance_between_coords(const std::pair& a, 18 | const std::pair& b); 19 | -------------------------------------------------------------------------------- /code/expected.out/delete1.out: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xapian/xapian-docsprint/036dc8787a17b6a87d9c86fc4ef48a60f7abb3fa/code/expected.out/delete1.out -------------------------------------------------------------------------------- /code/expected.out/index1.db_title=3asunwatch.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xapian/xapian-docsprint/036dc8787a17b6a87d9c86fc4ef48a60f7abb3fa/code/expected.out/index1.db_title=3asunwatch.out -------------------------------------------------------------------------------- /code/expected.out/index1.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xapian/xapian-docsprint/036dc8787a17b6a87d9c86fc4ef48a60f7abb3fa/code/expected.out/index1.out -------------------------------------------------------------------------------- /code/expected.out/index_facets.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xapian/xapian-docsprint/036dc8787a17b6a87d9c86fc4ef48a60f7abb3fa/code/expected.out/index_facets.out -------------------------------------------------------------------------------- /code/expected.out/index_filters.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xapian/xapian-docsprint/036dc8787a17b6a87d9c86fc4ef48a60f7abb3fa/code/expected.out/index_filters.out -------------------------------------------------------------------------------- /code/expected.out/index_ranges.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xapian/xapian-docsprint/036dc8787a17b6a87d9c86fc4ef48a60f7abb3fa/code/expected.out/index_ranges.out -------------------------------------------------------------------------------- 
/code/expected.out/index_ranges2.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xapian/xapian-docsprint/036dc8787a17b6a87d9c86fc4ef48a60f7abb3fa/code/expected.out/index_ranges2.out -------------------------------------------------------------------------------- /code/expected.out/index_values_with_geo.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xapian/xapian-docsprint/036dc8787a17b6a87d9c86fc4ef48a60f7abb3fa/code/expected.out/index_values_with_geo.out -------------------------------------------------------------------------------- /code/expected.out/search1.db_Dent_watch.out: -------------------------------------------------------------------------------- 1 | 1: #046 Model by Dent of mechanism for setting hands and winding up 2 | 2: #004 Watch with Chinese duplex escapement 3 | 3: #018 Solar/Sidereal verge watch with epicyclic maintaining power 4 | 4: #013 Watch timer by P 5 | 5: #094 Model of a Lever Escapement , 1850-1883 6 | 6: #093 Model of Graham's Cylinder Escapement, 1850-1883 7 | 7: #033 A device by Favag of Neuchatel which enables a stop watch to 8 | 8: #015 Ingersoll "Dan Dare" automaton pocket watch with pin-pallet 9 | 9: #086 Model representing Earnshaw's detent chronometer escapement, 1950-1883 10 | 10: #036 Universal 'Tri-Compax' chronographic wrist watch 11 | 'Dent watch'[0:10] = 46 4 18 13 94 93 33 15 86 36 12 | -------------------------------------------------------------------------------- /code/expected.out/search1.db_description=3a=5c=22leather_case=5c=22_AND_title=3asundial.out: -------------------------------------------------------------------------------- 1 | 1: #055 Silver altitude sundial in leather case 2 | 'description:"leather case" AND title:sundial'[0:10] = 55 3 | -------------------------------------------------------------------------------- /code/expected.out/search1.db_watch.out: 
-------------------------------------------------------------------------------- 1 | 1: #004 Watch with Chinese duplex escapement 2 | 2: #018 Solar/Sidereal verge watch with epicyclic maintaining power 3 | 3: #013 Watch timer by P 4 | 4: #033 A device by Favag of Neuchatel which enables a stop watch to 5 | 5: #015 Ingersoll "Dan Dare" automaton pocket watch with pin-pallet 6 | 6: #036 Universal 'Tri-Compax' chronographic wrist watch 7 | 7: #046 Model by Dent of mechanism for setting hands and winding up 8 | 'watch'[0:10] = 4 18 13 33 15 36 46 9 | -------------------------------------------------------------------------------- /code/expected.out/search1.out: -------------------------------------------------------------------------------- 1 | 1: #001 Ansonia Sunwatch (pocket compas dial) 2 | 'title:sunwatch'[0:10] = 1 3 | -------------------------------------------------------------------------------- /code/expected.out/search_facets.out: -------------------------------------------------------------------------------- 1 | 1: #044 Two-dial clock by the Self-Winding Clock Co; as used on the 2 | 2: #096 Clock with Hipp pendulum (an electric driven clock with Hipp 3 | 3: #012 Assembled and unassembled EXA electric clock kit 4 | 4: #098 'Pond' electric clock movement (no dial) 5 | 5: #083 Harrison's eight-day wooden clock movement, 1715. 6 | 6: #005 "Ever Ready" ceiling clock 7 | 7: #039 Electric clock of the Bain type 8 | 8: #061 Van der Plancke master clock 9 | 9: #064 Morse electrical clock, dial mechanism 10 | 10: #052 Reconstruction of Dondi's Astronomical Clock, 1974 11 | Facet: Bain, Alexander; count: 3 12 | Facet: Bloxam, J. M.; count: 1 13 | Facet: Braun (maker); count: 1 14 | Facet: British Horo-Electric Ltd. (maker); count: 1 15 | Facet: British Vacuum Cleaner and Engineering Co. Ltd., Magneto Time division (maker); count: 1 16 | Facet: EXA; count: 1 17 | Facet: Ever Ready Co. 
(maker); count: 2 18 | Facet: Ferranti Ltd.; count: 1 19 | Facet: Galilei, Galileo, 1564-1642; Galilei, Vincenzio, 1606-1649; count: 1 20 | Facet: Harrison, John (maker); count: 1 21 | Facet: Hipp, M.; count: 1 22 | Facet: La Précision Cie; count: 1 23 | Facet: Lund, J.; count: 1 24 | Facet: Morse, J. S.; count: 1 25 | Facet: Self Winding Clock Company; count: 1 26 | Facet: Self-Winding Clock Co. (maker); count: 1 27 | Facet: Synchronome Co. Ltd. (maker); count: 2 28 | Facet: Thwaites and Reed Ltd.; count: 1 29 | Facet: Thwaites and Reed Ltd. (maker); count: 1 30 | Facet: Viviani, Vincenzo; count: 1 31 | Facet: Vulliamy, Benjamin, 1747-1811; count: 1 32 | Facet: Whitefriars Glass Ltd. (maker); count: 1 33 | 'clock'[0:10] = 44 96 12 98 83 5 39 61 64 52 34 | -------------------------------------------------------------------------------- /code/expected.out/search_filters.out: -------------------------------------------------------------------------------- 1 | 1: #012 Assembled and unassembled EXA electric clock kit 2 | 2: #098 'Pond' electric clock movement (no dial) 3 | 3: #052 Reconstruction of Dondi's Astronomical Clock, 1974 4 | 4: #059 Electrically operated clock controller 5 | 5: #024 Regulator Clock with Gravity Escapement 6 | 6: #097 Bain's subsidiary electric clock 7 | 7: #009 Copy of a Dwerrihouse skeleton clock with coup-perdu escape 8 | 8: #091 Pendulum clock designed by Galileo in 1642 and made by his son in 1649, model. 
9 | 'clock'[0:10] = 12 98 52 59 24 97 9 91 10 | -------------------------------------------------------------------------------- /code/expected.out/search_filters2.out: -------------------------------------------------------------------------------- 1 | 1: #012 Assembled and unassembled EXA electric clock kit 2 | 2: #098 'Pond' electric clock movement (no dial) 3 | 3: #052 Reconstruction of Dondi's Astronomical Clock, 1974 4 | 4: #059 Electrically operated clock controller 5 | 5: #024 Regulator Clock with Gravity Escapement 6 | 6: #097 Bain's subsidiary electric clock 7 | 7: #009 Copy of a Dwerrihouse skeleton clock with coup-perdu escape 8 | 8: #091 Pendulum clock designed by Galileo in 1642 and made by his son in 1649, model. 9 | 'clock material:"steel (metal)"'[0:10] = 12 98 52 59 24 97 9 91 10 | -------------------------------------------------------------------------------- /code/expected.out/search_ranges.db_..50mm.out: -------------------------------------------------------------------------------- 1 | 1: #031 (1588) overall diameter: 50 mm 2 | Portable universal equinoctial sundial, in brass, signed "A 3 | 2: #073 (1701-1721) overall: 15 mm x 44.45 mm, weight: 0.055kg 4 | Universal pocket sundial 5 | 3: #074 (1596) overall: 13 mm x 44.45 mm x 44.45 mm, weight: 0.095kg 6 | Sundial, made as a locket, gilt metal, part silver 7 | '..50mm'[0:10] = 31 73 74 8 | -------------------------------------------------------------------------------- /code/expected.out/search_ranges.db_1000..mm_1800..1899.out: -------------------------------------------------------------------------------- 1 | 1: #024 (1845-1855) overall: 1850 mm x 350 mm x 250 mm 2 | Regulator Clock with Gravity Escapement 3 | '1000..mm 1800..1899'[0:10] = 24 4 | -------------------------------------------------------------------------------- /code/expected.out/search_ranges.db_1980..1989.out: -------------------------------------------------------------------------------- 1 | 1: #050 (1984) overall: 105 
mm x 75 mm x 57 mm, 2 | Quartz Analogue "no battery" wristwatch by Pulsar Quartz (CA 3 | 2: #051 (1984) overall: 85 mm x 65 mm x 38 mm, 4 | Analogue quartz clock with voice controlled alarm by Braun, 5 | '1980..1989'[0:10] = 50 51 6 | -------------------------------------------------------------------------------- /code/expected.out/search_ranges.db_clock_1960...out: -------------------------------------------------------------------------------- 1 | 1: #052 (1974) clock: 1185 x 780 mm, 122 kg; rewind unit: 460 x 640 x 350 mm 2 | Reconstruction of Dondi's Astronomical Clock, 1974 3 | 2: #051 (1984) overall: 85 mm x 65 mm x 38 mm, 4 | Analogue quartz clock with voice controlled alarm by Braun, 5 | 3: #009 (1973) overall: 380 mm x 300 mm x 192 mm, weight: 6.45kg 6 | Copy of a Dwerrihouse skeleton clock with coup-perdu escape 7 | 'clock 1960..'[0:10] = 52 51 9 8 | -------------------------------------------------------------------------------- /code/expected.out/search_ranges2.statesdb_10000000...out: -------------------------------------------------------------------------------- 1 | 1: #007 State of California September 9, 1850 2 | Population 37,253,956 3 | 2: #019 State of Texas December 29, 1845 4 | Population 25,145,561 5 | 3: #027 State of Illinois December 3, 1818 6 | Population 12,830,632 7 | 4: #030 State of Ohio March 1, 1803 8 | Population 11,536,504 9 | 5: #035 State of Florida March 3, 1845 10 | Population 18,801,310 11 | 6: #040 Commonwealth of Pennsylvania December 12, 1787 12 | Population 12,702,379 13 | 7: #041 State of New York July 26, 1788 14 | Population 19,378,102 15 | '10000000..'[0:10] = 7 19 27 30 35 40 41 16 | -------------------------------------------------------------------------------- /code/expected.out/search_ranges2.statesdb_11=2f08=2f1889..07=2f10=2f1890.out: -------------------------------------------------------------------------------- 1 | 1: #001 State of Washington November 11, 1889 2 | Population 6,744,496 3 | 2: #004 State of 
Montana November 8, 1889 4 | Population 989,415 5 | 3: #005 Idaho July 3, 1890 6 | Population 1,567,582 7 | 4: #010 State of Wyoming July 10, 1890 8 | Population 563,626 9 | '11/08/1889..07/10/1890'[0:10] = 1 4 5 10 10 | -------------------------------------------------------------------------------- /code/expected.out/search_ranges2.statesdb_1780..1789_10000000...out: -------------------------------------------------------------------------------- 1 | 1: #040 Commonwealth of Pennsylvania December 12, 1787 2 | Population 12,702,379 3 | 2: #041 State of New York July 26, 1788 4 | Population 19,378,102 5 | '1780..1789 10000000..'[0:10] = 40 41 6 | -------------------------------------------------------------------------------- /code/expected.out/search_ranges2.statesdb_1800..1899.out: -------------------------------------------------------------------------------- 1 | 1: #001 State of Washington November 11, 1889 2 | Population 6,744,496 3 | 2: #002 State of Arkansas June 15, 1836 4 | Population 2,915,918 5 | 3: #003 State of Oregon February 14, 1859 6 | Population 3,831,074 7 | 4: #004 State of Montana November 8, 1889 8 | Population 989,415 9 | 5: #005 Idaho July 3, 1890 10 | Population 1,567,582 11 | 6: #006 State of Nevada October 31, 1864 12 | Population 2,700,551 13 | 7: #007 State of California September 9, 1850 14 | Population 37,253,956 15 | 8: #009 State of Utah January 4, 1896 16 | Population 2,763,885 17 | 9: #010 State of Wyoming July 10, 1890 18 | Population 563,626 19 | 10: #011 State of Colorado August 1, 1876 20 | Population 5,029,196 21 | '1800..1899'[0:10] = 1 2 3 4 5 6 7 9 10 11 22 | -------------------------------------------------------------------------------- /code/expected.out/search_ranges2.statesdb_spanish.out: -------------------------------------------------------------------------------- 1 | 1: #004 State of Montana November 8, 1889 2 | Population 989,415 3 | 2: #019 State of Texas December 29, 1845 4 | Population 25,145,561 5 | 
'spanish'[0:10] = 4 19 6 | -------------------------------------------------------------------------------- /code/expected.out/search_sorting.out: -------------------------------------------------------------------------------- 1 | 1: #019 State of Texas December 29, 1845 2 | Population 25,145,561 3 | 2: #004 State of Montana November 8, 1889 4 | Population 989,415 5 | 'spanish'[0:10] = 19 4 6 | -------------------------------------------------------------------------------- /code/expected.out/search_sorting2.out: -------------------------------------------------------------------------------- 1 | 1: #040 Commonwealth of Pennsylvania December 12, 1787 2 | Population 12,702,379 3 | 2: #043 State of New Jersey December 18, 1787 4 | Population 8,791,894 5 | 3: #049 State of Delaware December 7, 1787 6 | Population 897,934 7 | 4: #041 State of New York July 26, 1788 8 | Population 19,378,102 9 | 5: #034 State of Georgia January 2, 1788 10 | Population 9,687,653 11 | 6: #038 Commonwealth of Virginia June 25, 1788 12 | Population 8,001,024 13 | 7: #046 Commonwealth of Massachusetts February 6, 1788 14 | Population 6,547,629 15 | 8: #050 State of Maryland April 28, 1788 16 | Population 5,773,552 17 | 9: #036 State of South Carolina May 23, 1788 18 | Population 4,625,384 19 | 10: #048 State of Connecticut January 9, 1788 20 | Population 3,574,097 21 | 'State'[0:10] = 40 43 49 41 34 38 46 50 36 48 22 | -------------------------------------------------------------------------------- /code/expected.out/search_sorting3.out: -------------------------------------------------------------------------------- 1 | 1: #050 State of Maryland April 28, 1788 2 | Population 5,773,552 3 | 2: #049 State of Delaware December 7, 1787 4 | Population 897,934 5 | 3: #040 Commonwealth of Pennsylvania December 12, 1787 6 | Population 12,702,379 7 | 4: #043 State of New Jersey December 18, 1787 8 | Population 8,791,894 9 | 5: #039 State of West Virginia June 20, 1863 10 | Population 1,859,815 11 | 
6: #037 State of North Carolina November 21, 1789 12 | Population 9,535,483 13 | 7: #041 State of New York July 26, 1788 14 | Population 19,378,102 15 | 8: #038 Commonwealth of Virginia June 25, 1788 16 | Population 8,001,024 17 | 9: #048 State of Connecticut January 9, 1788 18 | Population 3,574,097 19 | 10: #036 State of South Carolina May 23, 1788 20 | Population 4,625,384 21 | 'State'[0:10] = 50 49 40 43 39 37 41 38 48 36 22 | -------------------------------------------------------------------------------- /code/expected.out/search_synonyms.db_time.out: -------------------------------------------------------------------------------- 1 | 1: #065 Electric time piece with hands but without dial (no pendulum 2 | 2: #058 The "Empire" clock, to show the time at various longitudes, 3 | 3: #041 Frequency and time measuring instrument type TSA3436 by Venn 4 | 4: #056 Single sandglass in 4 pillared wood mount, running time 15 1 5 | 5: #043 Loughborough-Hayes automatic timing apparatus. Used by the R 6 | 6: #011 "Timetrunk" by Hines and Co., Glasgow (a sandglass for timin 7 | 7: #016 Copy of the gearing of the Byzantine sundial-calendar (1983- 8 | 8: #045 Master clock of the "Silent Electric" type made by the Magne 9 | 9: #018 Solar/Sidereal verge watch with epicyclic maintaining power 10 | 'time'[0:10] = 65 58 41 56 43 11 16 45 18 11 | -------------------------------------------------------------------------------- /code/expected.out/search_synonyms.out: -------------------------------------------------------------------------------- 1 | 1: #016 Copy of the gearing of the Byzantine sundial-calendar (1983- 2 | 2: #072 German Perpetual Calendar in gilt metal 3 | 3: #065 Electric time piece with hands but without dial (no pendulum 4 | 4: #068 Ornate brass Perpetual Calendar 5 | 5: #058 The "Empire" clock, to show the time at various longitudes, 6 | 6: #041 Frequency and time measuring instrument type TSA3436 by Venn 7 | 7: #056 Single sandglass in 4 pillared wood mount, 
running time 15 1 8 | 8: #043 Loughborough-Hayes automatic timing apparatus. Used by the R 9 | 9: #026 Sundial and compass with perpetual calendar and lunar circles 10 | 10: #036 Universal 'Tri-Compax' chronographic wrist watch 11 | '~time'[0:10] = 16 72 65 68 58 41 56 43 26 36 12 | -------------------------------------------------------------------------------- /code/java/.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | -------------------------------------------------------------------------------- /code/java/delete1.java: -------------------------------------------------------------------------------- 1 | package code.java; 2 | 3 | import org.xapian.WritableDatabase; 4 | import org.xapian.XapianConstants; 5 | 6 | public class delete1 { 7 | 8 | // Command line args - dbpath identifiers... 9 | public static void main(String[] args) { 10 | if (args.length < 2) { 11 | System.out.println("Insufficient number of arguments (should be dbpath identifiers...)"); 12 | return; 13 | } 14 | // Splitting the array to obtain an array of identifiers 15 | String[] identifierArgs = new String[args.length - 1]; 16 | System.arraycopy(args, 1, identifierArgs, 0, identifierArgs.length); 17 | deleteDocs(args[0], identifierArgs); 18 | } 19 | 20 | // Start of example code. 21 | public static void deleteDocs(String dbpath, String[] identifierArgs) { 22 | // Open the database we're going to be deleting from. 23 | WritableDatabase db = new WritableDatabase(dbpath, XapianConstants.DB_OPEN); 24 | 25 | for (String identifierArg : identifierArgs) { 26 | String idterm = "Q" + identifierArg; 27 | db.deleteDocument(idterm); 28 | } 29 | 30 | // Commit to delete documents from disk 31 | db.commit(); 32 | } 33 | // End of example code. 
34 | } 35 | -------------------------------------------------------------------------------- /code/java/index1.java.data=2f100-objects-v1.csv_db.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xapian/xapian-docsprint/036dc8787a17b6a87d9c86fc4ef48a60f7abb3fa/code/java/index1.java.data=2f100-objects-v1.csv_db.out -------------------------------------------------------------------------------- /code/java/support.java: -------------------------------------------------------------------------------- 1 | /* Support code for Java examples */ 2 | package code.java; 3 | 4 | import java.util.ArrayList; 5 | 6 | public class support { 7 | // Returns an ArrayList of the parsed CSV line 8 | public static ArrayList parseCsvLine(String csvLine) { 9 | ArrayList words = new ArrayList(); 10 | boolean insideQuote = false, endEarly = false; 11 | int start = 0, end = 0; 12 | for (int i = 0; i < csvLine.length()-1; i++) { 13 | if(csvLine.charAt(i) == ',' && !insideQuote) { 14 | if (endEarly) { 15 | words.add(csvLine.substring(start,i-1).replace("\"\"","\"")); 16 | endEarly = false; 17 | } else { 18 | words.add(csvLine.substring(start,i)); 19 | } 20 | 21 | if(csvLine.charAt(i+1) == '"') { 22 | start = i + 2; 23 | i++; 24 | endEarly = true; 25 | insideQuote = true; 26 | } else { 27 | start = i + 1; 28 | } 29 | } else if (csvLine.charAt(i) == '"') { 30 | insideQuote = !insideQuote; 31 | } 32 | } 33 | words.add(csvLine.substring(start)); 34 | return words; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /code/perl/Support.pm: -------------------------------------------------------------------------------- 1 | package Support; 2 | use strict; 3 | use warnings; 4 | use Text::CSV; 5 | use Data::Dumper; 6 | use DateTime; 7 | use DateTime::Format::Strptime;; 8 | 9 | sub parse_csv { 10 | my $file = shift; 11 | my $csv = Text::CSV->new ({ 12 | eol => "\r\n", 13 | sep_char => 
',', 14 | binary => 1, 15 | }) 16 | or die "Cannot use CSV: ".Text::CSV->error_diag (); 17 | open(my $fh, "<:encoding(UTF-8)", $file) or die "$file: $!"; 18 | 19 | my $header = $csv->getline($fh); 20 | 21 | $csv->column_names(@$header); 22 | my @out; 23 | while (my $ref = $csv->getline_hr($fh)) { 24 | push @out, $ref; 25 | } 26 | $csv->eof or die $csv->error_diag(); 27 | close $fh or die "$file: $!"; 28 | return @out; 29 | } 30 | 31 | sub log_matches { 32 | my ($query, $offset, $page_size, $matches) = @_; 33 | printf(q{'%s'[%i:%i] = %s}, $query, $offset, $offset + $page_size, 34 | join(' ', @$matches)); 35 | print "\n"; 36 | } 37 | 38 | sub numbers_from_string { 39 | my $string = shift; 40 | return unless $string; 41 | my @all; 42 | while ($string =~ m/([\d\.]*\d[\d\.]*)/g) { 43 | push @all, $1; 44 | } 45 | return @all; 46 | } 47 | 48 | sub parse_states { 49 | my @records = parse_csv(@_); 50 | return grep { length($_->{order}) } @records; 51 | } 52 | 53 | sub format_numeral { 54 | my $number = shift; 55 | if ($number =~ m/\A[0-9]+\z/) { 56 | if ($number eq '0') { 57 | return $number; 58 | } 59 | else { 60 | my @out; 61 | my @all = reverse(split('', $number)); 62 | for (my $i = 0; $i < @all; $i++) { 63 | if ($i and (($i % 3) == 0)) { 64 | push @out, ','; 65 | } 66 | push @out, $all[$i]; 67 | } 68 | return join('', reverse @out); 69 | } 70 | } 71 | else { 72 | die "Numeral should be an integer"; 73 | } 74 | } 75 | 76 | sub format_date { 77 | my $date = shift; 78 | my $strp = DateTime::Format::Strptime->new(pattern => '%Y%m%d'); 79 | my $dt = $strp->parse_datetime($date); 80 | return $dt->month_name . ' ' . $dt->day . ', ' . 
$dt->year; 81 | }; 82 | 83 | 1; 84 | -------------------------------------------------------------------------------- /code/perl/delete1.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | BEGIN { 7 | eval { 8 | require Xapian; 9 | Xapian->import(':all'); 10 | Xapian::search_xapian_compat(); 11 | }; 12 | if ($@) { 13 | require Search::Xapian; 14 | Search::Xapian->import(':all'); 15 | } 16 | } 17 | 18 | 19 | my ($db_path, @ids) = @ARGV; 20 | die "Usage $0 DBPATH ID..." unless $db_path && @ids; 21 | 22 | delete_docs($db_path, @ids); 23 | 24 | ### Start of example code. 25 | sub delete_docs { 26 | my ($db_path, @ids) = @_; 27 | my $db = Search::Xapian::WritableDatabase->new($db_path, DB_CREATE_OR_OPEN); 28 | foreach my $id (@ids) { 29 | $db->delete_document_by_term("Q$id"); 30 | } 31 | } 32 | ### End of example code. 33 | -------------------------------------------------------------------------------- /code/perl/index1.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | BEGIN { 7 | eval { 8 | require Xapian; 9 | Xapian->import(':all'); 10 | Xapian::search_xapian_compat(); 11 | }; 12 | if ($@) { 13 | require Search::Xapian; 14 | Search::Xapian->import(':all'); 15 | } 16 | } 17 | 18 | use JSON::MaybeXS; 19 | use FindBin qw($Bin); 20 | use lib $Bin; 21 | use Support; 22 | use Data::Dumper; 23 | 24 | 25 | my ($data_path, $db_path) = @ARGV; 26 | die "Usage $0 DATAPATH DBPATH" unless $data_path && $db_path; 27 | 28 | index_csv($data_path, $db_path); 29 | 30 | ### Start of example code. 31 | sub index_csv { 32 | my ($data_path, $db_path) = @_; 33 | # Create or open the database we're going to be writing to. 34 | my $db = Search::Xapian::WritableDatabase->new($db_path, DB_CREATE_OR_OPEN); 35 | # Set up a TermGenerator that we'll use in indexing. 
36 | my $term_generator = Search::Xapian::TermGenerator->new; 37 | $term_generator->set_stemmer(Search::Xapian::Stem->new('en')); 38 | foreach my $rec (Support::parse_csv($data_path)) { 39 | # print Dumper($rec); 40 | my $doc = Search::Xapian::Document->new; 41 | $term_generator->set_document($doc); 42 | # Index each field with a suitable prefix. 43 | $term_generator->index_text($rec->{TITLE}, 1, 'S'); 44 | $term_generator->index_text($rec->{DESCRIPTION}, 1, 'XD'); 45 | 46 | # Index fields without prefixes for general search. 47 | $term_generator->index_text($rec->{TITLE}); 48 | $term_generator->increase_termpos(); 49 | $term_generator->index_text($rec->{DESCRIPTION}); 50 | 51 | # Store all the fields for display purposes. 52 | $doc->set_data(encode_json($rec)); 53 | 54 | # We use the identifier to ensure each object ends up in the 55 | # database only once no matter how many times we run the 56 | # indexer. 57 | my $idterm = "Q" . $rec->{id_NUMBER}; 58 | $doc->add_boolean_term($idterm); 59 | $db->replace_document_by_term($idterm, $doc); 60 | } 61 | } 62 | ### End of example code. 63 | -------------------------------------------------------------------------------- /code/perl/index_facets.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | BEGIN { 7 | eval { 8 | require Xapian; 9 | Xapian->import(':all'); 10 | Xapian::search_xapian_compat(); 11 | }; 12 | if ($@) { 13 | require Search::Xapian; 14 | Search::Xapian->import(':all'); 15 | } 16 | } 17 | 18 | use JSON::MaybeXS; 19 | use FindBin qw($Bin); 20 | use lib $Bin; 21 | use Support; 22 | use Data::Dumper; 23 | use Encode qw/encode/; 24 | 25 | 26 | my ($data_path, $db_path) = @ARGV; 27 | die "Usage $0 DATAPATH DBPATH" unless $data_path && $db_path; 28 | 29 | index_csv($data_path, $db_path); 30 | 31 | ### Start of example code. 
# Index every record of the CSV file at $data_path into the Xapian database
# at $db_path (the database is created if it does not exist yet).
#
# For each record:
#   * TITLE and DESCRIPTION are indexed with the prefixes 'S' and 'XD', and
#     again without a prefix for general search;
#   * the whole record is stored as JSON in the document data;
#   * COLLECTION and MAKER are stored in value slots 0 and 1 for faceting;
#   * the boolean term "Q<id_NUMBER>" identifies the document, so running the
#     indexer again replaces documents instead of duplicating them.
sub index_csv {
    my ($data_path, $db_path) = @_;

    # Create or open the database we're going to be writing to.
    my $db = Search::Xapian::WritableDatabase->new($db_path, DB_CREATE_OR_OPEN);

    # Set up a TermGenerator that we'll use in indexing.
    my $term_generator = Search::Xapian::TermGenerator->new;
    $term_generator->set_stemmer(Search::Xapian::Stem->new('en'));

    foreach my $rec (Support::parse_csv($data_path)) {
        my $doc = Search::Xapian::Document->new;
        $term_generator->set_document($doc);

        # Index each field with a suitable prefix.
        $term_generator->index_text($rec->{TITLE}, 1, 'S');
        $term_generator->index_text($rec->{DESCRIPTION}, 1, 'XD');

        # Index fields without prefixes for general search.
        $term_generator->index_text($rec->{TITLE});
        $term_generator->increase_termpos();
        $term_generator->index_text($rec->{DESCRIPTION});

        # Store all the fields for display purposes.
        $doc->set_data(encode_json($rec));

        # Add the collection and the maker into value slots 0 and 1.
        # Test for "defined and non-empty" rather than for truth: in Perl the
        # string "0" is false, so a plain truth test would silently skip a
        # field whose text is exactly "0".  The text is encoded to UTF-8
        # bytes before being stored.
        if (defined $rec->{COLLECTION} && length $rec->{COLLECTION}) {
            $doc->add_value(0, encode('UTF-8', $rec->{COLLECTION}));
        }
        if (defined $rec->{MAKER} && length $rec->{MAKER}) {
            $doc->add_value(1, encode('UTF-8', $rec->{MAKER}));
        }

        # We use the identifier to ensure each object ends up in the
        # database only once no matter how many times we run the
        # indexer.
        my $idterm = "Q" . $rec->{id_NUMBER};
        $doc->add_boolean_term($idterm);
        $db->replace_document_by_term($idterm, $doc);
    }
}
### End of example code.
73 | -------------------------------------------------------------------------------- /code/perl/index_filters.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | BEGIN { 7 | eval { 8 | require Xapian; 9 | Xapian->import(':all'); 10 | Xapian::search_xapian_compat(); 11 | }; 12 | if ($@) { 13 | require Search::Xapian; 14 | Search::Xapian->import(':all'); 15 | } 16 | } 17 | 18 | use JSON::MaybeXS; 19 | use FindBin qw($Bin); 20 | use lib $Bin; 21 | use Support; 22 | use Data::Dumper; 23 | 24 | 25 | my ($data_path, $db_path) = @ARGV; 26 | die "Usage $0 DATAPATH DBPATH" unless $data_path && $db_path; 27 | 28 | index_csv($data_path, $db_path); 29 | 30 | sub index_csv { 31 | my ($data_path, $db_path) = @_; 32 | # Create or open the database we're going to be writing to. 33 | my $db = Search::Xapian::WritableDatabase->new($db_path, DB_CREATE_OR_OPEN); 34 | # Set up a TermGenerator that we'll use in indexing. 35 | my $term_generator = Search::Xapian::TermGenerator->new; 36 | $term_generator->set_stemmer(Search::Xapian::Stem->new('en')); 37 | foreach my $rec (Support::parse_csv($data_path)) { 38 | # print Dumper($rec); 39 | my $doc = Search::Xapian::Document->new; 40 | $term_generator->set_document($doc); 41 | # Index each field with a suitable prefix. 42 | $term_generator->index_text($rec->{TITLE}, 1, 'S'); 43 | $term_generator->index_text($rec->{DESCRIPTION}, 1, 'XD'); 44 | 45 | ### Start of new indexing code. 46 | # Index the MATERIALS field, splitting on semicolons. 47 | foreach my $material (split(/;/, $rec->{MATERIALS})) { 48 | $material =~ s/\A\s*//; 49 | $material =~ s/\s*\z//; 50 | $material = lc($material); 51 | if (length($material)) { 52 | $doc->add_boolean_term('XM' . $material); 53 | } 54 | } 55 | ### End of new indexing code. 56 | 57 | # Index fields without prefixes for general search. 
58 | $term_generator->index_text($rec->{TITLE}); 59 | $term_generator->increase_termpos(); 60 | $term_generator->index_text($rec->{DESCRIPTION}); 61 | 62 | # Store all the fields for display purposes. 63 | $doc->set_data(encode_json($rec)); 64 | 65 | # We use the identifier to ensure each object ends up in the 66 | # database only once no matter how many times we run the 67 | # indexer. 68 | my $idterm = "Q" . $rec->{id_NUMBER}; 69 | $doc->add_boolean_term($idterm); 70 | $db->replace_document_by_term($idterm, $doc); 71 | } 72 | } 73 | 74 | -------------------------------------------------------------------------------- /code/perl/index_ranges.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | BEGIN { 7 | eval { 8 | require Xapian; 9 | Xapian->import(':all'); 10 | Xapian::search_xapian_compat(); 11 | }; 12 | if ($@) { 13 | require Search::Xapian; 14 | Search::Xapian->import(':all'); 15 | } 16 | } 17 | 18 | use JSON::MaybeXS; 19 | use FindBin qw($Bin); 20 | use lib $Bin; 21 | use Support; 22 | use Data::Dumper; 23 | 24 | 25 | my ($data_path, $db_path) = @ARGV; 26 | die "Usage $0 DATAPATH DBPATH" unless $data_path && $db_path; 27 | 28 | index_csv($data_path, $db_path); 29 | 30 | sub index_csv { 31 | my ($data_path, $db_path) = @_; 32 | # Create or open the database we're going to be writing to. 33 | my $db = Search::Xapian::WritableDatabase->new($db_path, DB_CREATE_OR_OPEN); 34 | # Set up a TermGenerator that we'll use in indexing. 35 | my $term_generator = Search::Xapian::TermGenerator->new; 36 | $term_generator->set_stemmer(Search::Xapian::Stem->new('en')); 37 | foreach my $rec (Support::parse_csv($data_path)) { 38 | # print Dumper($rec); 39 | my $doc = Search::Xapian::Document->new; 40 | $term_generator->set_document($doc); 41 | # Index each field with a suitable prefix. 
42 | $term_generator->index_text($rec->{TITLE}, 1, 'S'); 43 | $term_generator->index_text($rec->{DESCRIPTION}, 1, 'XD'); 44 | 45 | ### Start of example code. 46 | if (my @numbers = Support::numbers_from_string($rec->{MEASUREMENTS})) { 47 | # index the higher one 48 | $doc->add_value(0, Search::Xapian::sortable_serialise((sort { $b <=> $a } @numbers)[0])); 49 | } 50 | if (my @years = Support::numbers_from_string($rec->{DATE_MADE})) { 51 | # index the first one 52 | $doc->add_value(1, Search::Xapian::sortable_serialise($years[0])); 53 | } 54 | ### End of example code. 55 | 56 | # Index fields without prefixes for general search. 57 | $term_generator->index_text($rec->{TITLE}); 58 | $term_generator->increase_termpos(); 59 | $term_generator->index_text($rec->{DESCRIPTION}); 60 | 61 | # Store all the fields for display purposes. 62 | $doc->set_data(encode_json($rec)); 63 | 64 | # We use the identifier to ensure each object ends up in the 65 | # database only once no matter how many times we run the 66 | # indexer. 67 | my $idterm = "Q" . $rec->{id_NUMBER}; 68 | $doc->add_boolean_term($idterm); 69 | $db->replace_document_by_term($idterm, $doc); 70 | } 71 | } 72 | 73 | -------------------------------------------------------------------------------- /code/perl/index_ranges2.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | BEGIN { 7 | eval { 8 | require Xapian; 9 | Xapian->import(':all'); 10 | Xapian::search_xapian_compat(); 11 | }; 12 | if ($@) { 13 | require Search::Xapian; 14 | Search::Xapian->import(':all'); 15 | } 16 | } 17 | 18 | use JSON::MaybeXS; 19 | use FindBin qw($Bin); 20 | use lib $Bin; 21 | use Support; 22 | use Data::Dumper; 23 | 24 | 25 | my ($data_path, $db_path) = @ARGV; 26 | die "Usage $0 DATAPATH DBPATH" unless $data_path && $db_path; 27 | 28 | index_csv($data_path, $db_path); 29 | 30 | ### Start of example code. 
# Build (or update) the Xapian database at $db_path from the US states CSV
# file at $data_path.  Records come from Support::parse_states.  Each one is
# indexed as free text, gets its admission year, admission date and
# population stored in value slots 1, 2 and 3, and is keyed by the boolean
# term "Q<order>" so that re-running the indexer replaces existing documents.
sub index_csv {
    my ($data_path, $db_path) = @_;

    # Create or open the database we're going to be writing to.
    my $db = Search::Xapian::WritableDatabase->new($db_path, DB_CREATE_OR_OPEN);

    # One TermGenerator, with an English stemmer, is shared by all documents.
    my $indexer = Search::Xapian::TermGenerator->new;
    $indexer->set_stemmer(Search::Xapian::Stem->new('en'));

    for my $state (Support::parse_states($data_path)) {
        my $doc = Search::Xapian::Document->new;
        $indexer->set_document($doc);

        ### Start of example code.
        # Index each field with a suitable prefix.
        for my $pair ([name => 'S'], [description => 'XD'], [motto => 'XD']) {
            my ($field, $prefix) = @$pair;
            $indexer->index_text($state->{$field}, 1, $prefix);
        }

        # Index fields without prefixes for general search, bumping the term
        # position between one field and the next.
        $indexer->index_text($state->{name});
        for my $field (qw(description motto)) {
            $indexer->increase_termpos();
            $indexer->index_text($state->{$field});
        }

        # Slot 1: the first four characters of the admission date, as a
        # sortable number.  Slot 2: the admission date string as-is.
        my $admitted = $state->{admitted};
        if (length($admitted)) {
            my $year = substr($admitted, 0, 4);
            $doc->add_value(1, Search::Xapian::sortable_serialise($year));
            $doc->add_value(2, $admitted);
        }

        # Slot 3: the population, as a sortable number.
        my $population = $state->{population};
        if (length($population)) {
            $doc->add_value(3, Search::Xapian::sortable_serialise(int($population)));
        }
        ### End of example code.

        # Store all the fields for display purposes.
        $doc->set_data(encode_json($state));

        # We use the identifier to ensure each object ends up in the
        # database only once no matter how many times we run the
        # indexer.
        my $idterm = "Q" . $state->{order};
        $doc->add_boolean_term($idterm);
        $db->replace_document_by_term($idterm, $doc);
    }
}
### End of example code.
76 | -------------------------------------------------------------------------------- /code/perl/search1.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | BEGIN { 7 | eval { 8 | require Xapian; 9 | Xapian->import(':all'); 10 | Xapian::search_xapian_compat(); 11 | }; 12 | if ($@) { 13 | require Search::Xapian; 14 | Search::Xapian->import(':all'); 15 | } 16 | } 17 | 18 | use JSON::MaybeXS; 19 | use FindBin qw($Bin); 20 | use lib $Bin; 21 | use Support; 22 | 23 | 24 | my ($db_path, @terms) = @ARGV; 25 | die "Usage: $0 DB_PATH QUERY..." unless $db_path && @terms; 26 | 27 | search($db_path, join(' ', @terms)); 28 | 29 | ### Start of example code. 30 | sub search { 31 | my ($db_path, $query_string, $offset, $pagesize) = @_; 32 | $offset ||= 0; 33 | $pagesize ||= 10; 34 | my $db = Search::Xapian::Database->new($db_path); 35 | # Set up a QueryParser with a stemmer and suitable prefixes 36 | my $queryparser = Search::Xapian::QueryParser->new; 37 | $queryparser->set_stemmer(Search::Xapian::Stem->new('en')); 38 | $queryparser->set_stemming_strategy(STEM_SOME); 39 | 40 | # Start of prefix configuration. 41 | $queryparser->add_prefix(title => "S"); 42 | $queryparser->add_prefix(description => "XD"); 43 | # End of prefix configuration. 
44 | 45 | # And parse the query 46 | my $query = $queryparser->parse_query($query_string); 47 | 48 | # Use an Enquire object on the database to run the query 49 | my $enquire = $db->enquire($query); 50 | 51 | # And print out something about each match 52 | my @matches; 53 | 54 | my $mset = $enquire->get_mset($offset, $pagesize); 55 | foreach my $item ($mset->items) { 56 | my $fields = decode_json($item->get_document->get_data); 57 | printf(q{%i: #%3.3i %s}, $item->get_rank + 1, $item->get_docid, $fields->{TITLE}); 58 | print "\n"; 59 | push @matches, $item->get_docid; 60 | } 61 | Support::log_matches($query_string, $offset, $pagesize, \@matches); 62 | } 63 | ### End of example code. 64 | -------------------------------------------------------------------------------- /code/perl/search_facets.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | BEGIN { 7 | eval { 8 | require Xapian; 9 | Xapian->import(':all'); 10 | Xapian::search_xapian_compat(); 11 | }; 12 | if ($@) { 13 | require Search::Xapian; 14 | Search::Xapian->import(':all'); 15 | } 16 | } 17 | 18 | use JSON::MaybeXS; 19 | use FindBin qw($Bin); 20 | use lib $Bin; 21 | use Encode qw/decode/; 22 | use Support; 23 | use Data::Dumper; 24 | binmode STDOUT, ":encoding(UTF-8)"; 25 | binmode STDERR, ":encoding(UTF-8)"; 26 | 27 | 28 | my ($db_path, @terms) = @ARGV; 29 | die "Usage: $0 DB_PATH QUERY..." 
unless $db_path && @terms; 30 | 31 | search($db_path, join(' ', @terms)); 32 | 33 | sub search { 34 | my ($db_path, $query_string, $offset, $pagesize) = @_; 35 | $offset ||= 0; 36 | $pagesize ||= 10; 37 | my $db = Search::Xapian::Database->new($db_path); 38 | # Set up a QueryParser with a stemmer and suitable prefixes 39 | my $queryparser = Search::Xapian::QueryParser->new; 40 | $queryparser->set_stemmer(Search::Xapian::Stem->new('en')); 41 | $queryparser->set_stemming_strategy(STEM_SOME); 42 | 43 | # Start of prefix configuration. 44 | $queryparser->add_prefix(title => "S"); 45 | $queryparser->add_prefix(description => "XD"); 46 | # End of prefix configuration. 47 | 48 | # And parse the query 49 | my $query = $queryparser->parse_query($query_string); 50 | 51 | # Use an Enquire object on the database to run the query 52 | my $enquire = $db->enquire($query); 53 | 54 | # And print out something about each match 55 | my @matches; 56 | 57 | ### Start of example code. 58 | 59 | # Set up a spy to inspect the MAKER value at slot 1 60 | my $spy = Search::Xapian::ValueCountMatchSpy->new(1); 61 | $enquire->add_matchspy($spy); 62 | 63 | my $mset = $enquire->get_mset($offset, $pagesize, 100); 64 | foreach my $item ($mset->items) { 65 | my $fields = decode_json($item->get_document->get_data); 66 | printf(q{%i: #%3.3i %s}, $item->get_rank + 1, $item->get_docid, $fields->{TITLE}); 67 | print "\n"; 68 | push @matches, $item->get_docid; 69 | } 70 | # Fetch and display the spy values 71 | my $end = $spy->values_end; 72 | # it looks like the values are not decoded coming out. 73 | for (my $it = $spy->values_begin; $it != $end; $it++) { 74 | print "Facet: " . decode('UTF-8', $it->get_termname) . "; count: " . $it->get_termfreq . "\n" 75 | } 76 | 77 | 78 | Support::log_matches($query_string, $offset, $pagesize, \@matches); 79 | ### End of example code. 
80 | } 81 | -------------------------------------------------------------------------------- /code/perl/search_filters.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | BEGIN { 7 | eval { 8 | require Xapian; 9 | Xapian->import(':all'); 10 | Xapian::search_xapian_compat(); 11 | }; 12 | if ($@) { 13 | require Search::Xapian; 14 | Search::Xapian->import(':all'); 15 | } 16 | } 17 | 18 | use JSON::MaybeXS; 19 | use FindBin qw($Bin); 20 | use lib $Bin; 21 | use Support; 22 | use Data::Dumper; 23 | 24 | 25 | my ($db_path, $query_string, @materials) = @ARGV; 26 | die "Usage: $0 DB_PATH QUERY MATERIALS..." unless $db_path && $query_string; 27 | 28 | search($db_path, $query_string, \@materials); 29 | 30 | sub search { 31 | my ($db_path, $query_string, $materials, $offset, $pagesize) = @_; 32 | $materials ||= []; 33 | $offset ||= 0; 34 | $pagesize ||= 10; 35 | 36 | my $db = Search::Xapian::Database->new($db_path); 37 | 38 | ### Start of example code. 39 | # Set up a QueryParser with a stemmer and suitable prefixes 40 | my $queryparser = Search::Xapian::QueryParser->new; 41 | $queryparser->set_stemmer(Search::Xapian::Stem->new('en')); 42 | $queryparser->set_stemming_strategy(STEM_SOME); 43 | 44 | # Start of prefix configuration. 45 | $queryparser->add_prefix(title => "S"); 46 | $queryparser->add_prefix(description => "XD"); 47 | 48 | # End of prefix configuration. 49 | 50 | # And parse the query 51 | my $query = $queryparser->parse_query($query_string); 52 | 53 | # there is no pod for Search::Xapian::Query, but works anyway. Operator + list. 54 | 55 | if (@$materials) { 56 | my $material_query = Search::Xapian::Query->new(OP_OR, 57 | map { Search::Xapian::Query->new('XM' . lc($_)) } 58 | @$materials); 59 | $query = Search::Xapian::Query->new(OP_FILTER, $query, $material_query); 60 | } 61 | ### End of example code. 
62 | 63 | # Use an Enquire object on the database to run the query 64 | my $enquire = $db->enquire($query); 65 | 66 | # And print out something about each match 67 | my @matches; 68 | 69 | my $mset = $enquire->get_mset($offset, $pagesize); 70 | foreach my $item ($mset->items) { 71 | my $fields = decode_json($item->get_document->get_data); 72 | printf(q{%i: #%3.3i %s}, $item->get_rank + 1, $item->get_docid, $fields->{TITLE}); 73 | print "\n"; 74 | push @matches, $item->get_docid; 75 | } 76 | Support::log_matches($query_string, $offset, $pagesize, \@matches); 77 | } 78 | -------------------------------------------------------------------------------- /code/perl/search_filters2.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | BEGIN { 7 | eval { 8 | require Xapian; 9 | Xapian->import(':all'); 10 | Xapian::search_xapian_compat(); 11 | }; 12 | if ($@) { 13 | require Search::Xapian; 14 | Search::Xapian->import(':all'); 15 | } 16 | } 17 | 18 | use JSON::MaybeXS; 19 | use FindBin qw($Bin); 20 | use lib $Bin; 21 | use Support; 22 | use Data::Dumper; 23 | 24 | 25 | my ($db_path, @terms) = @ARGV; 26 | die "Usage: $0 DB_PATH QUERY..." unless $db_path && @terms; 27 | 28 | search($db_path, join(' ', @terms)); 29 | 30 | sub search { 31 | my ($db_path, $query_string, $offset, $pagesize) = @_; 32 | $offset ||= 0; 33 | $pagesize ||= 10; 34 | 35 | my $db = Search::Xapian::Database->new($db_path); 36 | 37 | ### Start of example code. 38 | # Set up a QueryParser with a stemmer and suitable prefixes 39 | my $queryparser = Search::Xapian::QueryParser->new; 40 | $queryparser->set_stemmer(Search::Xapian::Stem->new('en')); 41 | $queryparser->set_stemming_strategy(STEM_SOME); 42 | 43 | # Start of prefix configuration. 44 | $queryparser->add_prefix(title => "S"); 45 | $queryparser->add_prefix(description => "XD"); 46 | 47 | # allow the user to specify material:.... 
in the query 48 | $queryparser->add_boolean_prefix(material => "XM"); 49 | # End of prefix configuration. 50 | 51 | # And parse the query 52 | my $query = $queryparser->parse_query($query_string); 53 | 54 | ### End of example code. 55 | 56 | # Use an Enquire object on the database to run the query 57 | my $enquire = $db->enquire($query); 58 | 59 | # And print out something about each match 60 | my @matches; 61 | 62 | my $mset = $enquire->get_mset($offset, $pagesize); 63 | foreach my $item ($mset->items) { 64 | my $fields = decode_json($item->get_document->get_data); 65 | printf(q{%i: #%3.3i %s}, $item->get_rank + 1, $item->get_docid, $fields->{TITLE}); 66 | print "\n"; 67 | push @matches, $item->get_docid; 68 | } 69 | Support::log_matches($query_string, $offset, $pagesize, \@matches); 70 | } 71 | -------------------------------------------------------------------------------- /code/perl/search_sorting.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | BEGIN { 7 | eval { 8 | require Xapian; 9 | Xapian->import(':all'); 10 | Xapian::search_xapian_compat(); 11 | }; 12 | if ($@) { 13 | require Search::Xapian; 14 | Search::Xapian->import(':all'); 15 | } 16 | } 17 | 18 | use JSON::MaybeXS; 19 | use FindBin qw($Bin); 20 | use lib $Bin; 21 | use Support; 22 | 23 | 24 | my ($db_path, @terms) = @ARGV; 25 | die "Usage: $0 DB_PATH QUERY..." unless $db_path && @terms; 26 | 27 | search($db_path, join(' ', @terms)); 28 | 29 | sub search { 30 | my ($db_path, $query_string, $offset, $pagesize) = @_; 31 | $offset ||= 0; 32 | $pagesize ||= 10; 33 | my $db = Search::Xapian::Database->new($db_path); 34 | # Set up a QueryParser with a stemmer and suitable prefixes 35 | my $queryparser = Search::Xapian::QueryParser->new; 36 | $queryparser->set_stemmer(Search::Xapian::Stem->new('en')); 37 | $queryparser->set_stemming_strategy(STEM_SOME); 38 | 39 | # Start of prefix configuration. 
40 | $queryparser->add_prefix(title => "S"); 41 | $queryparser->add_prefix(description => "XD"); 42 | # End of prefix configuration. 43 | 44 | # And parse the query 45 | my $query = $queryparser->parse_query($query_string); 46 | 47 | # Use an Enquire object on the database to run the query 48 | my $enquire = $db->enquire($query); 49 | 50 | # Start of example code. 51 | $enquire->set_sort_by_value_then_relevance(1, 0); 52 | # End of example code. 53 | 54 | 55 | # And print out something about each match 56 | my @matches; 57 | 58 | my $mset = $enquire->get_mset($offset, $pagesize); 59 | foreach my $item ($mset->items) { 60 | my $fields = decode_json($item->get_document->get_data); 61 | printf(qq{%i: #%3.3i %s %s\n Population %s\n}, 62 | $item->get_rank + 1, 63 | $item->get_docid, 64 | $fields->{name}, 65 | Support::format_date($fields->{admitted}), 66 | Support::format_numeral($fields->{population})); 67 | push @matches, $item->get_docid; 68 | } 69 | Support::log_matches($query_string, $offset, $pagesize, \@matches); 70 | } 71 | 72 | -------------------------------------------------------------------------------- /code/perl/search_synonyms.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | BEGIN { 7 | eval { 8 | require Xapian; 9 | Xapian->import(':all'); 10 | Xapian::search_xapian_compat(); 11 | }; 12 | if ($@) { 13 | require Search::Xapian; 14 | Search::Xapian->import(':all'); 15 | } 16 | } 17 | 18 | use JSON::MaybeXS; 19 | use FindBin qw($Bin); 20 | use lib $Bin; 21 | use Support; 22 | 23 | 24 | my ($db_path, @terms) = @ARGV; 25 | die "Usage: $0 DB_PATH QUERY..." unless $db_path && @terms; 26 | 27 | search($db_path, join(' ', @terms)); 28 | 29 | ### Start of example code. 
# Run $query_string against the Xapian database at $db_path with synonym
# support enabled, print one line per match and log the matching docids.
#
#   $db_path      - path of the database to open
#   $query_string - the user's query text
#   $offset       - rank of the first match to fetch (0 if omitted or false)
#   $pagesize     - maximum number of matches to fetch (10 if omitted or false)
sub search {
    my ($db_path, $query_string, $offset, $pagesize) = @_;
    $offset   = $offset   || 0;
    $pagesize = $pagesize || 10;

    # add_synonym() is called on this handle below, hence a WritableDatabase
    # rather than a read-only Database.
    my $db = Search::Xapian::WritableDatabase->new($db_path, DB_OPEN);

    # Start of adding synonyms
    $db->add_synonym(time => 'calendar');
    # End of adding synonyms

    # Set up a QueryParser with a stemmer and suitable prefixes
    my $queryparser = Search::Xapian::QueryParser->new;
    $queryparser->set_stemmer(Search::Xapian::Stem->new('en'));
    $queryparser->set_stemming_strategy(STEM_SOME);

    # Start of prefix configuration.
    $queryparser->add_prefix(title => "S");
    $queryparser->add_prefix(description => "XD");
    # End of prefix configuration.

    # Start of set database
    $queryparser->set_database($db);
    # End of set database

    # Parse the query, passing FLAG_SYNONYM as the parser flags
    my $query = $queryparser->parse_query($query_string, FLAG_SYNONYM);

    # Run the parsed query through an Enquire object and fetch one page
    my $enquire = $db->enquire($query);
    my $page    = $enquire->get_mset($offset, $pagesize);

    # Report each hit as "<rank>: #<docid> <title>" and remember its docid
    my @docids;
    for my $hit ($page->items) {
        my $record = decode_json($hit->get_document->get_data);
        my $docid  = $hit->get_docid;
        printf(q{%i: #%3.3i %s}, $hit->get_rank + 1, $docid, $record->{TITLE});
        print "\n";
        push @docids, $docid;
    }

    Support::log_matches($query_string, $offset, $pagesize, \@docids);
}
### End of example code.
73 | -------------------------------------------------------------------------------- /code/perl/strings.t: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use utf8; 4 | use strict; 5 | use warnings; 6 | use Encode qw/decode encode is_utf8/; 7 | use Test::More; 8 | use File::Temp; 9 | use Search::Xapian ':all'; 10 | 11 | my $decoded_string = "Đe ši Šu"; 12 | my $encoded_string = encode('UTF-8', $decoded_string); 13 | # most misnamed function ever, and it's internal anyway. Should be is_decoded() 14 | ok is_utf8($decoded_string), "decode is decoded"; 15 | ok !is_utf8($encoded_string), "encoded is encoded"; 16 | isnt $decoded_string, $encoded_string, "Strings differ"; 17 | 18 | my $tmp = File::Temp->newdir; 19 | my $db_path = $tmp->dirname; 20 | 21 | foreach my $store_encoded (0..1) { 22 | # index 23 | { 24 | my $db = Search::Xapian::WritableDatabase->new($db_path, DB_CREATE_OR_OPEN); 25 | my $term_generator = Search::Xapian::TermGenerator->new; 26 | $term_generator->set_stemmer(Search::Xapian::Stem->new('none')); 27 | my $doc = Search::Xapian::Document->new; 28 | $term_generator->index_text('try'); 29 | 30 | # this is the gist of the demostration. It doesn't care if the 31 | # stored string is encoded or decoded. We always get back the 32 | # encoded one. 
33 | if ($store_encoded) { 34 | $doc->set_data($encoded_string); 35 | $doc->add_value(0, $encoded_string); 36 | } 37 | else { 38 | $doc->set_data($decoded_string); 39 | $doc->add_value(0, $decoded_string); 40 | } 41 | 42 | my $id = 'Qtry1'; 43 | $doc->add_boolean_term($id); 44 | $db->replace_document_by_term($id, $doc); 45 | } 46 | # search and test 47 | { 48 | my $db = Search::Xapian::Database->new($db_path); 49 | my $query = Search::Xapian::Query->new('Qtry1'); 50 | my $enquire = $db->enquire($query); 51 | my ($res) = $enquire->get_mset(0, 1)->items; 52 | my $doc = $res->get_document; 53 | is $doc->get_data, $encoded_string; 54 | isnt $doc->get_data, $decoded_string, "data is binary"; 55 | is $doc->get_value(0), $encoded_string; 56 | isnt $doc->get_value(0), $decoded_string, "value is binary as well"; 57 | } 58 | } 59 | 60 | done_testing; 61 | -------------------------------------------------------------------------------- /code/php/delete1.php: -------------------------------------------------------------------------------- 1 | delete_document($idterm); 14 | } 15 | } 16 | // End of example code. 17 | 18 | if ($argc < 3) { 19 | print "Usage: php $argv[0] DBPATH ID...\n"; 20 | die(); 21 | } 22 | 23 | // Call the delete_docs function. 24 | delete_docs($argv[1], array_slice($argv, 2)); 25 | ?> 26 | -------------------------------------------------------------------------------- /code/php/index1.php: -------------------------------------------------------------------------------- 1 | set_stemmer(new XapianStem('en')); 14 | 15 | // Open the file. 16 | $fH = open_file($datapath); 17 | 18 | // Read the header row in. 19 | $headers = get_csv_headers($fH); 20 | 21 | while (($row = parse_csv_row($fH, $headers)) !== false) { 22 | // '$row' maps field name to value. The field names come from the 23 | // first row of the CSV file. 24 | // 25 | // We're just going to use DESCRIPTION, TITLE and id_NUMBER. 
26 | $description = $row['DESCRIPTION']; 27 | $title = $row['TITLE']; 28 | $identifier = $row['id_NUMBER']; 29 | 30 | // We make a document and tell the term generator to use this. 31 | $doc = new XapianDocument(); 32 | $termgenerator->set_document($doc); 33 | 34 | // Index each field with a suitable prefix. 35 | $termgenerator->index_text($title, 1, 'S'); 36 | $termgenerator->index_text($description, 1, 'XD'); 37 | 38 | // Index fields without prefixes for general search. 39 | $termgenerator->index_text($title); 40 | $termgenerator->increase_termpos(); 41 | $termgenerator->index_text($description); 42 | 43 | // Store all the fields for display purposes. 44 | $doc->set_data(json_encode($row)); 45 | 46 | // We use the identifier to ensure each object ends up in the 47 | // database only once no matter how many times we run the 48 | // indexer. 49 | $idterm = "Q".$identifier; 50 | $doc->add_boolean_term($idterm); 51 | $db->replace_document($idterm, $doc); 52 | } 53 | } 54 | // End of example code. 55 | 56 | if ($argc != 3) { 57 | print "Usage: php $argv[0] DATAPATH DBPATH\n"; 58 | die(); 59 | } 60 | 61 | // Call the index function. 
62 | index($argv[1], $argv[2]); 63 | ?> 64 | -------------------------------------------------------------------------------- /code/php/index_facets.php: -------------------------------------------------------------------------------- 1 | set_stemmer(new XapianStem('en')); 14 | 15 | // open the file 16 | $fH = open_file($datapath); 17 | 18 | // Read the header row in 19 | $headers = get_csv_headers($fH); 20 | 21 | while (($row = parse_csv_row($fH, $headers)) !== false) { 22 | // mapping from field name to value using first row headers 23 | // We're just going to use id_NUMBER, TITLE and DESCRIPTION 24 | $description = $row['DESCRIPTION']; 25 | $title = $row['TITLE']; 26 | $identifier = $row['id_NUMBER']; 27 | $collection = $row['COLLECTION']; 28 | $maker = $row['MAKER']; 29 | 30 | // we make a document and tell the term generator to use this 31 | $doc = new XapianDocument(); 32 | $termgenerator->set_document($doc); 33 | 34 | // index each field with a suitable prefix 35 | $termgenerator->index_text($title, 1, 'S'); 36 | $termgenerator->index_text($description, 1, 'XD'); 37 | 38 | // index fields without prefixes for general search 39 | $termgenerator->index_text($title); 40 | $termgenerator->increase_termpos(); 41 | $termgenerator->index_text($description); 42 | 43 | // add the collection as a value in slot 0 44 | $doc->add_value(0, $collection); 45 | 46 | // add the maker as a value in slot 1 47 | $doc->add_value(1, $maker); 48 | 49 | // store all the fields for display purposes 50 | $doc->set_data(json_encode($row)); 51 | 52 | // we use the identifier to ensure each object ends up 53 | // in the database only once no matter how many times 54 | // we run the indexer 55 | $idterm = "Q".$identifier; 56 | $doc->add_boolean_term($idterm); 57 | $db->replace_document($idterm, $doc); 58 | } 59 | } 60 | // End of example code 61 | 62 | if ($argc != 3) { 63 | print "Usage: php $argv[0] DATAPATH DBPATH\n"; 64 | die(); 65 | } 66 | 67 | // call the index function 68 | 
index($argv[1], $argv[2]); 69 | ?> 70 | -------------------------------------------------------------------------------- /code/php/logger.php: -------------------------------------------------------------------------------- 1 | 14 | -------------------------------------------------------------------------------- /code/php/parsecsv.php: -------------------------------------------------------------------------------- 1 | column associations from open file 4 | * 5 | * @param resource $fH Open file resource 6 | * 7 | * @return array Associative array of column name => column number 8 | */ 9 | function get_csv_headers ($fH) 10 | { 11 | return fgetcsv($fH); 12 | } 13 | 14 | /** 15 | * Handles file opening and error reporting if file in unavailable 16 | * 17 | * @param string $file Path of file to open 18 | * 19 | * @return resource Open file handle 20 | */ 21 | function open_file ($file) 22 | { 23 | // Open the CSV file 24 | $fH = fopen($file, "r"); 25 | if ($fH === false) { 26 | die("Failed to open input file {$file} for reading\n"); 27 | } 28 | 29 | return $fH; 30 | } 31 | 32 | /** 33 | * Reads a row of data from a CSV file 34 | * 35 | * @param resource $fH Open file handle 36 | * @param array $headers Indexed array of column names 37 | * 38 | * @return mixed False if EOF; indexed array of data otherwise 39 | */ 40 | function parse_csv_row ($fH, $headers) 41 | { 42 | $row = fgetcsv($fH); 43 | $data = array(); 44 | 45 | if (is_array($row) === false) 46 | { 47 | return false; 48 | } 49 | 50 | foreach ($row as $key => $value) { 51 | $data[$headers[$key]] = $value; 52 | } 53 | 54 | return $data; 55 | } 56 | ?> 57 | -------------------------------------------------------------------------------- /code/php/search1.php: -------------------------------------------------------------------------------- 1 | set_stemmer(new XapianStem("en")); 18 | $queryparser->set_stemming_strategy(XapianQueryParser::STEM_SOME); 19 | // Start of prefix configuration. 
20 | $queryparser->add_prefix("title", "S"); 21 | $queryparser->add_prefix("description", "XD"); 22 | // End of prefix configuration. 23 | 24 | // And parse the query 25 | $query = $queryparser->parse_query($querystring); 26 | 27 | // Use an Enquire object on the database to run the query 28 | $enquire = new XapianEnquire($db); 29 | $enquire->set_query($query); 30 | 31 | // Retrieve the matches and compute start and end points 32 | $matches = $enquire->get_mset($offset, $pagesize); 33 | $match = $matches->begin(); 34 | $end = $matches->end(); 35 | 36 | // Use an array to record the DocIds of each match 37 | $docids = array(); 38 | 39 | while (!($match->equals($end))) 40 | { 41 | // retrieve the document and its data 42 | $doc = $match->get_document(); 43 | $fields = json_decode($doc->get_data()); 44 | $position = $match->get_rank() + 1; 45 | 46 | // record the docid 47 | $docid = $match->get_docid(); 48 | $docids[] = $docid; 49 | 50 | // display the results 51 | printf("%d: #%3.3d %s\n", $position, $docid, $fields->TITLE); 52 | 53 | // increment MSet iterator and our counter 54 | $match->next(); 55 | } 56 | 57 | // Finally, make sure we log the query and displayed results 58 | printf( 59 | "'%s'[%d:%d] = %s\n", 60 | $querystring, 61 | $offset, 62 | $offset+$pagesize, 63 | implode(" ", $docids) 64 | ); 65 | } 66 | ## End of example code. 
67 | 68 | if ($argc < 3) { 69 | print "Usage: php $argv[0] DBPATH QUERYTERM...\n"; 70 | die(); 71 | } 72 | 73 | search($argv[1], join(' ', array_slice($argv, 2))); 74 | ?> 75 | -------------------------------------------------------------------------------- /code/php/search_filters2.php: -------------------------------------------------------------------------------- 1 | set_stemmer(new XapianStem("en")); 18 | $queryparser->set_stemming_strategy(XapianQueryParser::STEM_SOME); 19 | $queryparser->add_prefix("title", "S"); 20 | $queryparser->add_prefix("description", "XD"); 21 | $queryparser->add_boolean_prefix("material", "XM"); 22 | 23 | // And parse the query 24 | $query = $queryparser->parse_query($querystring); 25 | ### End of example code. 26 | 27 | // Use an Enquire object on the database to run the query 28 | $enquire = new XapianEnquire($db); 29 | $enquire->set_query($query); 30 | 31 | // Retrieve the matches and compute start and end points 32 | $matches = $enquire->get_mset($offset, $pagesize); 33 | $match = $matches->begin(); 34 | $end = $matches->end(); 35 | 36 | // Use an array to record the DocIds of each match 37 | $docids = array(); 38 | 39 | while (!($match->equals($end))) 40 | { 41 | // retrieve the document and its data 42 | $doc = $match->get_document(); 43 | $fields = json_decode($doc->get_data()); 44 | $position = $match->get_rank() + 1; 45 | 46 | // record the docid 47 | $docid = $match->get_docid(); 48 | $docids[] = $docid; 49 | 50 | // display the results 51 | printf("%d: #%3.3d %s\n", $position, $docid, $fields->TITLE); 52 | 53 | // increment MSet iterator and our counter 54 | $match->next(); 55 | } 56 | 57 | // Finally, make sure we log the query and displayed results 58 | printf("'%s'[%d:%d] = %s\n", 59 | $querystring, 60 | $offset, 61 | $offset+$pagesize, 62 | implode(" ", $docids) 63 | ); 64 | } 65 | 66 | if ($argc < 2) { 67 | print "Usage: php $argv[0] DBPATH QUERYTERM...\n"; 68 | die(); 69 | } 70 | 71 | search($argv[1], join(' ', 
array_slice($argv, 2))); 72 | ?> 73 | -------------------------------------------------------------------------------- /code/python/delete1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import xapian 5 | 6 | ### Start of example code. 7 | def delete_docs(dbpath, identifiers): 8 | # Open the database we're going to be deleting from. 9 | db = xapian.WritableDatabase(dbpath, xapian.DB_OPEN) 10 | 11 | for identifier in identifiers: 12 | idterm = u'Q' + identifier 13 | db.delete_document(idterm) 14 | ### End of example code. 15 | 16 | if len(sys.argv) < 3: 17 | print("Usage: %s DBPATH ID..." % sys.argv[0]) 18 | sys.exit(1) 19 | 20 | delete_docs(dbpath = sys.argv[1], identifiers=sys.argv[2:]) 21 | -------------------------------------------------------------------------------- /code/python/index1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import sys 5 | import xapian 6 | from support import parse_csv_file 7 | 8 | ### Start of example code. 9 | def index(datapath, dbpath): 10 | # Create or open the database we're going to be writing to. 11 | db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN) 12 | 13 | # Set up a TermGenerator that we'll use in indexing. 14 | termgenerator = xapian.TermGenerator() 15 | termgenerator.set_stemmer(xapian.Stem("en")) 16 | 17 | for fields in parse_csv_file(datapath): 18 | # 'fields' is a dictionary mapping from field name to value. 19 | # Pick out the fields we're going to index. 20 | description = fields.get('DESCRIPTION', u'') 21 | title = fields.get('TITLE', u'') 22 | identifier = fields.get('id_NUMBER', u'') 23 | 24 | # We make a document and tell the term generator to use this. 25 | doc = xapian.Document() 26 | termgenerator.set_document(doc) 27 | 28 | # Index each field with a suitable prefix. 
29 | termgenerator.index_text(title, 1, 'S') 30 | termgenerator.index_text(description, 1, 'XD') 31 | 32 | # Index fields without prefixes for general search. 33 | termgenerator.index_text(title) 34 | termgenerator.increase_termpos() 35 | termgenerator.index_text(description) 36 | 37 | # Store all the fields for display purposes. 38 | doc.set_data(json.dumps(fields)) 39 | 40 | # We use the identifier to ensure each object ends up in the 41 | # database only once no matter how many times we run the 42 | # indexer. 43 | idterm = u"Q" + identifier 44 | doc.add_boolean_term(idterm) 45 | db.replace_document(idterm, doc) 46 | ### End of example code. 47 | 48 | if len(sys.argv) != 3: 49 | print("Usage: %s DATAPATH DBPATH" % sys.argv[0]) 50 | sys.exit(1) 51 | 52 | index(datapath = sys.argv[1], dbpath = sys.argv[2]) 53 | -------------------------------------------------------------------------------- /code/python/index_facets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import sys 5 | import xapian 6 | from support import parse_csv_file 7 | 8 | ### Start of example code. 9 | def index(datapath, dbpath): 10 | # Create or open the database we're going to be writing to. 11 | db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN) 12 | 13 | # Set up a TermGenerator that we'll use in indexing. 14 | termgenerator = xapian.TermGenerator() 15 | termgenerator.set_stemmer(xapian.Stem("en")) 16 | 17 | for fields in parse_csv_file(datapath): 18 | # 'fields' is a dictionary mapping from field name to value. 19 | # Pick out the fields we're going to index. 20 | description = fields.get('DESCRIPTION', u'') 21 | title = fields.get('TITLE', u'') 22 | identifier = fields.get('id_NUMBER', u'') 23 | collection = fields.get('COLLECTION', u'') 24 | maker = fields.get('MAKER', u'') 25 | 26 | # We make a document and tell the term generator to use this. 
27 | doc = xapian.Document() 28 | termgenerator.set_document(doc) 29 | 30 | # Index each field with a suitable prefix. 31 | termgenerator.index_text(title, 1, 'S') 32 | termgenerator.index_text(description, 1, 'XD') 33 | 34 | # Index fields without prefixes for general search. 35 | termgenerator.index_text(title) 36 | termgenerator.increase_termpos() 37 | termgenerator.index_text(description) 38 | 39 | # Add the collection as a value in slot 0. 40 | doc.add_value(0, collection) 41 | 42 | # Add the maker as a value in slot 1. 43 | doc.add_value(1, maker) 44 | 45 | # Store all the fields for display purposes. 46 | doc.set_data(json.dumps(fields)) 47 | 48 | # We use the identifier to ensure each object ends up in the 49 | # database only once no matter how many times we run the 50 | # indexer. 51 | idterm = u"Q" + identifier 52 | doc.add_boolean_term(idterm) 53 | db.replace_document(idterm, doc) 54 | ### End of example code. 55 | 56 | if len(sys.argv) != 3: 57 | print("Usage: %s DATAPATH DBPATH" % sys.argv[0]) 58 | sys.exit(1) 59 | 60 | index(datapath = sys.argv[1], dbpath = sys.argv[2]) 61 | -------------------------------------------------------------------------------- /code/python/index_filters.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import sys 5 | import xapian 6 | from support import parse_csv_file 7 | 8 | def index(datapath, dbpath): 9 | # Create or open the database we're going to be writing to. 10 | db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN) 11 | 12 | # Set up a TermGenerator that we'll use in indexing. 13 | termgenerator = xapian.TermGenerator() 14 | termgenerator.set_stemmer(xapian.Stem("en")) 15 | 16 | for fields in parse_csv_file(datapath): 17 | # 'fields' is a dictionary mapping from field name to value. 18 | # Pick out the fields we're going to index. 
def index(datapath, dbpath):
    """Index the records read from ``datapath`` into the database at ``dbpath``.

    For every record the title and description are indexed as text (once
    with a field prefix, once without), the whole record is stored as JSON
    document data, and up to two sortable values are attached:

    * slot 0 - the largest number found in the MEASUREMENTS field;
    * slot 1 - the first number found in the DATE_MADE field.

    :param datapath: path handed to ``parse_csv_file`` to read the records.
    :param dbpath: path of the Xapian database, created if it does not exist.
    """
    # Create or open the database we are going to write to.
    database = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

    # A single TermGenerator with an English stemmer serves every record.
    indexer = xapian.TermGenerator()
    indexer.set_stemmer(xapian.Stem("en"))

    for record in parse_csv_file(datapath):
        # 'record' maps field names to values; pick out what we index.
        body_text = record.get('DESCRIPTION', u'')
        heading = record.get('TITLE', u'')
        unique_term = u"Q" + record.get('id_NUMBER', u'')

        # Each record gets a fresh document that the indexer writes into.
        document = xapian.Document()
        indexer.set_document(document)

        # Prefixed terms, one prefix per field.
        indexer.index_text(heading, 1, 'S')
        indexer.index_text(body_text, 1, 'XD')

        # The same text again without prefixes, for general search.  The
        # term position is increased between the two fields.
        indexer.index_text(heading)
        indexer.increase_termpos()
        indexer.index_text(body_text)

        # Keep the complete record for display purposes.
        document.set_data(json.dumps(record))

        ### Start of example code.
        # Slot 0: largest number in MEASUREMENTS (skipped when empty).
        measurements = record.get('MEASUREMENTS', u'')
        sizes = numbers_from_string(measurements) if len(measurements) > 0 else []
        if len(sizes) > 0:
            document.add_value(0, xapian.sortable_serialise(max(sizes)))

        # Slot 1: first number in DATE_MADE (skipped when there is none).
        years = numbers_from_string(record.get('DATE_MADE', u''))
        if len(years) > 0:
            document.add_value(1, xapian.sortable_serialise(years[0]))
        ### End of example code.

        # Replacing by the identifier term keeps each object in the
        # database only once, however often the indexer is run.
        document.add_boolean_term(unique_term)
        database.replace_document(unique_term, document)
44 | if admitted is not None: 45 | doc.add_value(1, xapian.sortable_serialise(int(admitted[:4]))) 46 | doc.add_value(2, admitted) # YYYYMMDD 47 | if population is not None: 48 | doc.add_value(3, xapian.sortable_serialise(int(population))) 49 | ### End of example code. 50 | 51 | # Store all the fields for display purposes. 52 | doc.set_data(json.dumps(fields)) 53 | 54 | # We use the order to ensure each object ends up in the 55 | # database only once no matter how many times we run the 56 | # indexer. 57 | idterm = u"Q" + order 58 | doc.add_boolean_term(idterm) 59 | db.replace_document(idterm, doc) 60 | 61 | if len(sys.argv) != 3: 62 | print("Usage: %s DATAPATH DBPATH" % sys.argv[0]) 63 | sys.exit(1) 64 | 65 | index(datapath = sys.argv[1], dbpath = sys.argv[2]) 66 | -------------------------------------------------------------------------------- /code/python/index_sorting.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import sys 5 | import xapian 6 | from support import parse_csv_file 7 | 8 | def index(datapath, dbpath): 9 | # Create or open the database we're going to be writing to. 10 | db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN) 11 | 12 | # Set up a TermGenerator that we'll use in indexing. 13 | termgenerator = xapian.TermGenerator() 14 | termgenerator.set_stemmer(xapian.Stem("en")) 15 | 16 | for fields in parse_csv_file(datapath): 17 | # 'fields' is a dictionary mapping from field name to value. 18 | # Pick out the fields we're going to index. 19 | description = fields.get('DESCRIPTION', u'') 20 | title = fields.get('TITLE', u'') 21 | identifier = fields.get('id_NUMBER', u'') 22 | collection = fields.get('COLLECTION', u'') 23 | maker = fields.get('MAKER', u'') 24 | 25 | # We make a document and tell the term generator to use this. 26 | doc = xapian.Document() 27 | termgenerator.set_document(doc) 28 | 29 | # Index each field with a suitable prefix. 
### Start of class header and constructor.
class ExternalWeightPostingSource(xapian.PostingSource):
    """
    A Xapian posting source returning weights from an external source.

    The documents of the database passed to init() are visited in turn and
    the weight of each one is obtained from ``wtsource``, which must provide
    ``get_maxweight()`` and ``get_weight(doc)``.
    """
    def __init__(self, wtsource):
        xapian.PostingSource.__init__(self)
        self.wtsource = wtsource
    ### End of class header and constructor.

    ### Start of init.
    def init(self, db):
        """Attach the source to ``db`` and reset the iteration state."""
        self.db = db
        # Posting list of the empty term, used to walk the documents of db.
        self.alldocs = db.postlist('')
        # No posting is selected until next() or skip_to() has been called;
        # resetting here also discards the position of any earlier run.
        self.current = None
        self.set_maxweight(self.wtsource.get_maxweight())
    ### End of init.

    ### Start of termfreq methods.
    # Lower bound, estimate and upper bound are all the document count of
    # the database given to init().
    def get_termfreq_min(self):
        return self.db.get_doccount()

    def get_termfreq_est(self):
        return self.db.get_doccount()

    def get_termfreq_max(self):
        return self.db.get_doccount()
    ### End of termfreq methods.

    ### Start of get_weight.
    def get_weight(self):
        """Return the external weight of the current document."""
        doc = self.db.get_document(self.current.docid)
        return self.wtsource.get_weight(doc)
    ### End of get_weight.

    ### Start of get_docid.
    def get_docid(self):
        """Return the document id of the current posting."""
        return self.current.docid
    ### End of get_docid.

    ### Start of at_end.
    def at_end(self):
        """Return True when no posting is currently selected."""
        return self.current is None
    ### End of at_end.

    ### Start of next.
    def next(self, minweight):
        """Advance to the next posting; ``minweight`` is not used."""
        # Inside a method body the name ``next`` is the built-in, not this
        # method.  The built-in works with both the ``.next()`` and the
        # ``__next__`` iterator protocols and returns None on exhaustion.
        self.current = next(self.alldocs, None)
    ### End of next.

    ### Start of skip_to.
    def skip_to(self, docid, minweight):
        """Move to ``docid`` via the posting list; ``minweight`` is not used."""
        try:
            self.current = self.alldocs.skip_to(docid)
        except StopIteration:
            # Ran off the end of the posting list.
            self.current = None
    ### End of skip_to.
def search(dbpath, querystring, offset=0, pagesize=10):
    """Run ``querystring`` against ``dbpath`` and print matches plus facets.

    One line is printed per match on the requested page, followed by one
    line per distinct value that the match spy counted in value slot 1.
    The query and the displayed document ids are then passed to
    ``support.log_matches``.

    :param dbpath: path of the Xapian database to search.
    :param querystring: query text handed to the QueryParser.
    :param offset: defines starting point within result set.
    :param pagesize: defines number of records to retrieve.
    """
    # Open the database we're going to search.
    database = xapian.Database(dbpath)

    # QueryParser with an English stemmer and the two field prefixes.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    for field, prefix in (("title", "S"), ("description", "XD")):
        parser.add_prefix(field, prefix)

    parsed = parser.parse_query(querystring)

    # The Enquire object runs the query against the database.
    enquire = xapian.Enquire(database)
    enquire.set_query(parsed)

    # Document ids of the matches we display, collected for logging.
    shown = []

    ### Start of example code.
    # Set up a spy to inspect the MAKER value at slot 1
    maker_spy = xapian.ValueCountMatchSpy(1)
    enquire.add_matchspy(maker_spy)

    # 100 is get_mset's third argument (the check-at-least count in the
    # Xapian API).
    for hit in enquire.get_mset(offset, pagesize, 100):
        record = json.loads(hit.document.get_data())
        summary = {
            'rank': hit.rank + 1,
            'docid': hit.docid,
            'title': record.get('TITLE', u''),
        }
        print(u"%(rank)i: #%(docid)3.3i %(title)s" % summary)
        shown.append(hit.docid)

    # Fetch and display the spy values
    for facet in maker_spy.values():
        tally = {'term': facet.term, 'count': facet.termfreq}
        print("Facet: %(term)s; count: %(count)i" % tally)

    # Finally, make sure we log the query and displayed results
    support.log_matches(querystring, offset, pagesize, shown)
    ### End of example code.
def search(dbpath, querystring, offset=0, pagesize=10):
    """Run ``querystring`` against ``dbpath`` and print one line per match.

    Besides the ``title`` and ``description`` text prefixes, the parser is
    given ``material`` as a boolean prefix mapped to ``XM``.  The query and
    the displayed document ids are passed to ``support.log_matches``.

    :param dbpath: path of the Xapian database to search.
    :param querystring: query text handed to the QueryParser.
    :param offset: defines starting point within result set.
    :param pagesize: defines number of records to retrieve.
    """
    # Open the database we're going to search.
    database = xapian.Database(dbpath)

    ### Start of example code.
    # QueryParser with an English stemmer and suitable prefixes.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    for field, prefix in (("title", "S"), ("description", "XD")):
        parser.add_prefix(field, prefix)
    parser.add_boolean_prefix("material", "XM")

    # And parse the query
    parsed = parser.parse_query(querystring)
    ### End of example code.

    # The Enquire object runs the query against the database.
    enquire = xapian.Enquire(database)
    enquire.set_query(parsed)

    # Print something about each match and remember its document id.
    shown = []
    for hit in enquire.get_mset(offset, pagesize):
        record = json.loads(hit.document.get_data())
        summary = {
            'rank': hit.rank + 1,
            'docid': hit.docid,
            'title': record.get('TITLE', u''),
        }
        print(u"%(rank)i: #%(docid)3.3i %(title)s" % summary)
        shown.append(hit.docid)

    # Finally, make sure we log the query and displayed results
    support.log_matches(querystring, offset, pagesize, shown)
13 | db = xapian.Database(dbpath) 14 | 15 | # Set up a QueryParser with a stemmer and suitable prefixes 16 | queryparser = xapian.QueryParser() 17 | queryparser.set_stemmer(xapian.Stem("en")) 18 | queryparser.set_stemming_strategy(queryparser.STEM_SOME) 19 | queryparser.add_prefix("title", "S") 20 | queryparser.add_prefix("description", "XD") 21 | # and add in range processors 22 | queryparser.add_rangeprocessor( 23 | xapian.NumberRangeProcessor(0, 'mm', xapian.RP_SUFFIX) 24 | ) 25 | queryparser.add_rangeprocessor( 26 | xapian.NumberRangeProcessor(1) 27 | ) 28 | 29 | # And parse the query 30 | query = queryparser.parse_query(querystring) 31 | 32 | # Use an Enquire object on the database to run the query 33 | enquire = xapian.Enquire(db) 34 | enquire.set_query(query) 35 | 36 | # And print out something about each match 37 | matches = [] 38 | for match in enquire.get_mset(offset, pagesize): 39 | fields = json.loads(match.document.get_data()) 40 | print(u"%(rank)i: #%(docid)3.3i (%(date)s) %(measurements)s\n %(title)s" % { 41 | 'rank': match.rank + 1, 42 | 'docid': match.docid, 43 | 'measurements': fields.get('MEASUREMENTS', u''), 44 | 'date': fields.get('DATE_MADE', u''), 45 | 'title': fields.get('TITLE', u''), 46 | }) 47 | matches.append(match.docid) 48 | 49 | # Finally, make sure we log the query and displayed results 50 | support.log_matches(querystring, offset, pagesize, matches) 51 | 52 | if len(sys.argv) < 3: 53 | print("Usage: %s DBPATH QUERYTERM..." 
def search(dbpath, querystring, offset=0, pagesize=10):
    """Run ``querystring`` against ``dbpath``, sorted on value slot 1.

    Matches are ordered with ``set_sort_by_value_then_relevance(1, False)``.
    Each match on the requested page is printed with its name, admission
    date and population, and the displayed document ids are passed to
    ``support.log_matches``.

    :param dbpath: path of the Xapian database to search.
    :param querystring: query text handed to the QueryParser.
    :param offset: defines starting point within result set.
    :param pagesize: defines number of records to retrieve.
    """
    # Open the database we're going to search.
    database = xapian.Database(dbpath)

    # QueryParser with an English stemmer and the two field prefixes.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    for field, prefix in (("title", "S"), ("description", "XD")):
        parser.add_prefix(field, prefix)

    parsed = parser.parse_query(querystring)

    # The Enquire object runs the query against the database.
    enquire = xapian.Enquire(database)
    enquire.set_query(parsed)
    # Start of example code.
    enquire.set_sort_by_value_then_relevance(1, False)
    # End of example code.

    line_format = u"%(rank)i: #%(docid)3.3i %(name)s %(date)s\n Population %(pop)s"

    # Print something about each match and remember its document id.
    shown = []
    for hit in enquire.get_mset(offset, pagesize):
        record = json.loads(hit.document.get_data())
        # 'lat' and 'lon' are supplied although the format does not use them.
        print(line_format % {
            'rank': hit.rank + 1,
            'docid': hit.docid,
            'name': record.get('name', u''),
            'date': support.format_date(record.get('admitted', u'')),
            'pop': support.format_numeral(int(record.get('population', 0))),
            'lat': record.get('latitude', u''),
            'lon': record.get('longitude', u''),
        })
        shown.append(hit.docid)

    # Finally, make sure we log the query and displayed results
    support.log_matches(querystring, offset, pagesize, shown)
13 | db = xapian.Database(dbpath) 14 | 15 | # Set up a QueryParser with a stemmer and suitable prefixes 16 | queryparser = xapian.QueryParser() 17 | queryparser.set_stemmer(xapian.Stem("en")) 18 | queryparser.set_stemming_strategy(queryparser.STEM_SOME) 19 | queryparser.add_prefix("title", "S") 20 | queryparser.add_prefix("description", "XD") 21 | 22 | # And parse the query 23 | query = queryparser.parse_query(querystring) 24 | 25 | # Use an Enquire object on the database to run the query 26 | enquire = xapian.Enquire(db) 27 | enquire.set_query(query) 28 | # Start of example code. 29 | keymaker = xapian.MultiValueKeyMaker() 30 | keymaker.add_value(1, False) 31 | keymaker.add_value(3, True) 32 | enquire.set_sort_by_key_then_relevance(keymaker, False) 33 | # End of example code. 34 | 35 | # And print out something about each match 36 | matches = [] 37 | for match in enquire.get_mset(offset, pagesize): 38 | fields = json.loads(match.document.get_data()) 39 | print(u"%(rank)i: #%(docid)3.3i %(name)s %(date)s\n Population %(pop)s" % { 40 | 'rank': match.rank + 1, 41 | 'docid': match.docid, 42 | 'name': fields.get('name', u''), 43 | 'date': support.format_date(fields.get('admitted', u'')), 44 | 'pop': support.format_numeral(int(fields.get('population', 0))), 45 | 'lat': fields.get('latitude', u''), 46 | 'lon': fields.get('longitude', u''), 47 | }) 48 | matches.append(match.docid) 49 | 50 | # Finally, make sure we log the query and displayed results 51 | support.log_matches(querystring, offset, pagesize, matches) 52 | 53 | if len(sys.argv) < 3: 54 | print("Usage: %s DBPATH QUERYTERM..." 
def search(dbpath, querystring, offset=0, pagesize=10):
    """Run ``querystring`` against ``dbpath``, sorted by a distance key.

    The sort key of a document is the serialised distance between the
    coordinates read from its value slot 4 and a fixed reference point.
    Each match on the requested page is printed with its name, admission
    date and population, and the displayed document ids are passed to
    ``support.log_matches``.

    :param dbpath: path of the Xapian database to search.
    :param querystring: query text handed to the QueryParser.
    :param offset: defines starting point within result set.
    :param pagesize: defines number of records to retrieve.
    """
    # Open the database we're going to search.
    database = xapian.Database(dbpath)

    # QueryParser with an English stemmer and the two field prefixes.
    parser = xapian.QueryParser()
    parser.set_stemmer(xapian.Stem("en"))
    parser.set_stemming_strategy(parser.STEM_SOME)
    for field, prefix in (("title", "S"), ("description", "XD")):
        parser.add_prefix(field, prefix)

    parsed = parser.parse_query(querystring)

    # The Enquire object runs the query against the database.
    enquire = xapian.Enquire(database)
    enquire.set_query(parsed)
    # Start of example code.
    class DistanceKeyMaker(xapian.KeyMaker):
        def __call__(self, doc):
            # Return a sortable string representing the distance from
            # Washington, DC to the coordinates stored for this document.
            # NOTE(review): latitude 38.012 looks low for Washington, DC
            # (about 38.9) - confirm before changing, since it affects the
            # resulting order.
            washington = (38.012, -77.037)
            # Value slot 4 is read as comma-separated numbers.
            coords = [float(part) for part in doc.get_value(4).split(',')]
            distance = support.distance_between_coords(coords, washington)
            return xapian.sortable_serialise(distance)
    enquire.set_sort_by_key_then_relevance(DistanceKeyMaker(), False)
    # End of example code.

    line_format = u"%(rank)i: #%(docid)3.3i %(name)s %(date)s\n Population %(pop)s"

    # Print something about each match and remember its document id.
    shown = []
    for hit in enquire.get_mset(offset, pagesize):
        record = json.loads(hit.document.get_data())
        # 'lat' and 'lon' are supplied although the format does not use them.
        print(line_format % {
            'rank': hit.rank + 1,
            'docid': hit.docid,
            'name': record.get('name', u''),
            'date': support.format_date(record.get('admitted', u'')),
            'pop': support.format_numeral(int(record.get('population', 0))),
            'lat': record.get('latitude', u''),
            'lon': record.get('longitude', u''),
        })
        shown.append(hit.docid)

    # Finally, make sure we log the query and displayed results
    support.log_matches(querystring, offset, pagesize, shown)
### Start of example code.
def delete_docs(dbpath, identifiers):
    """Delete the documents identified by ``identifiers`` from ``dbpath``.

    :param dbpath: path of an existing Xapian database (opened with DB_OPEN).
    :param identifiers: iterable of identifier strings; each is prefixed
        with ``Q`` to form the term passed to ``delete_document``.
    """
    # Open the database we're going to be deleting from.
    database = xapian.WritableDatabase(dbpath, xapian.DB_OPEN)

    # Build each unique term lazily and delete by it, one at a time.
    for unique_term in (u'Q' + ident for ident in identifiers):
        database.delete_document(unique_term)
### End of example code.
### Start of example code.
def index(datapath, dbpath):
    """Index the records read from ``datapath`` into the database at ``dbpath``.

    For every record the title and description are indexed as text (once
    with a field prefix, once without), COLLECTION and MAKER are stored in
    value slots 0 and 1, and the whole record is kept as JSON document data.

    :param datapath: path handed to ``parse_csv_file`` to read the records.
    :param dbpath: path of the Xapian database, created if it does not exist.
    """
    # Create or open the database we are going to write to.
    database = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

    # A single TermGenerator with an English stemmer serves every record.
    indexer = xapian.TermGenerator()
    indexer.set_stemmer(xapian.Stem("en"))

    for record in parse_csv_file(datapath):
        # 'record' maps field names to values; pick out what we index.
        body_text = record.get('DESCRIPTION', u'')
        heading = record.get('TITLE', u'')
        unique_term = u"Q" + record.get('id_NUMBER', u'')
        slot_values = (
            (0, record.get('COLLECTION', u'')),
            (1, record.get('MAKER', u'')),
        )

        # Each record gets a fresh document that the indexer writes into.
        document = xapian.Document()
        indexer.set_document(document)

        # Prefixed terms, one prefix per field.
        indexer.index_text(heading, 1, 'S')
        indexer.index_text(body_text, 1, 'XD')

        # The same text again without prefixes, for general search.  The
        # term position is increased between the two fields.
        indexer.index_text(heading)
        indexer.increase_termpos()
        indexer.index_text(body_text)

        # Collection goes into value slot 0, maker into value slot 1.
        for slot, value in slot_values:
            document.add_value(slot, value)

        # Keep the complete record for display purposes.
        document.set_data(json.dumps(record))

        # Replacing by the identifier term keeps each object in the
        # database only once, however often the indexer is run.
        document.add_boolean_term(unique_term)
        database.replace_document(unique_term, document)
### End of example code.
28 | termgenerator.index_text(title, 1, 'S') 29 | termgenerator.index_text(description, 1, 'XD') 30 | 31 | # Index fields without prefixes for general search. 32 | termgenerator.index_text(title) 33 | termgenerator.increase_termpos() 34 | termgenerator.index_text(description) 35 | 36 | ### Start of new indexing code. 37 | # Index the MATERIALS field, splitting on semicolons. 38 | for material in fields.get('MATERIALS', u'').split(';'): 39 | material = material.strip().lower() 40 | if len(material) > 0: 41 | doc.add_boolean_term('XM' + material) 42 | ### End of new indexing code. 43 | 44 | # Store all the fields for display purposes. 45 | doc.set_data(json.dumps(fields)) 46 | 47 | # We use the identifier to ensure each object ends up in the 48 | # database only once no matter how many times we run the 49 | # indexer. 50 | idterm = u"Q" + identifier 51 | doc.add_boolean_term(idterm) 52 | db.replace_document(idterm, doc) 53 | 54 | if len(sys.argv) != 3: 55 | print("Usage: %s DATAPATH DBPATH" % sys.argv[0]) 56 | sys.exit(1) 57 | 58 | index(datapath = sys.argv[1], dbpath = sys.argv[2]) 59 | -------------------------------------------------------------------------------- /code/python3/index_ranges.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import sys 5 | import xapian 6 | from support import numbers_from_string, parse_csv_file 7 | 8 | def index(datapath, dbpath): 9 | # Create or open the database we're going to be writing to. 10 | db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN) 11 | 12 | # Set up a TermGenerator that we'll use in indexing. 13 | termgenerator = xapian.TermGenerator() 14 | termgenerator.set_stemmer(xapian.Stem("en")) 15 | 16 | for fields in parse_csv_file(datapath): 17 | # 'fields' is a dictionary mapping from field name to value. 18 | # Pick out the fields we're going to index. 
19 | description = fields.get('DESCRIPTION', u'') 20 | title = fields.get('TITLE', u'') 21 | identifier = fields.get('id_NUMBER', u'') 22 | 23 | # We make a document and tell the term generator to use this. 24 | doc = xapian.Document() 25 | termgenerator.set_document(doc) 26 | 27 | # Index each field with a suitable prefix. 28 | termgenerator.index_text(title, 1, 'S') 29 | termgenerator.index_text(description, 1, 'XD') 30 | 31 | # Index fields without prefixes for general search. 32 | termgenerator.index_text(title) 33 | termgenerator.increase_termpos() 34 | termgenerator.index_text(description) 35 | 36 | # Store all the fields for display purposes. 37 | doc.set_data(json.dumps(fields)) 38 | 39 | ### Start of example code. 40 | # parse the two values we need 41 | measurements = fields.get('MEASUREMENTS', u'') 42 | if len(measurements) > 0: 43 | numbers = numbers_from_string(measurements) 44 | if len(numbers) > 0: 45 | doc.add_value(0, xapian.sortable_serialise(max(numbers))) 46 | 47 | date_made = fields.get('DATE_MADE', u'') 48 | years = numbers_from_string(date_made) 49 | if len(years) > 0: 50 | doc.add_value(1, xapian.sortable_serialise(years[0])) 51 | ### End of example code. 52 | 53 | # We use the identifier to ensure each object ends up in the 54 | # database only once no matter how many times we run the 55 | # indexer. 
56 | idterm = u"Q" + identifier 57 | doc.add_boolean_term(idterm) 58 | db.replace_document(idterm, doc) 59 | 60 | if len(sys.argv) != 3: 61 | print("Usage: %s DATAPATH DBPATH" % sys.argv[0]) 62 | sys.exit(1) 63 | 64 | index(datapath = sys.argv[1], dbpath = sys.argv[2]) 65 | -------------------------------------------------------------------------------- /code/python3/index_ranges2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | from support import parse_states 5 | import sys 6 | import xapian 7 | 8 | def index(datapath, dbpath): 9 | # Create or open the database we're going to be writing to. 10 | db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN) 11 | 12 | # Set up a TermGenerator that we'll use in indexing. 13 | termgenerator = xapian.TermGenerator() 14 | termgenerator.set_stemmer(xapian.Stem("en")) 15 | 16 | for fields in parse_states(datapath): 17 | # 'fields' is a dictionary mapping from field name to value. 18 | # Pick out the fields we're going to index. 19 | name = fields.get('name', u'') 20 | description = fields.get('description', u'') 21 | motto = fields.get('motto', u'') 22 | admitted = fields.get('admitted', None) 23 | population = fields.get('population', None) 24 | order = fields.get('order', u'') 25 | 26 | # We make a document and tell the term generator to use this. 27 | doc = xapian.Document() 28 | termgenerator.set_document(doc) 29 | 30 | ### Start of example code. 31 | # Index each field with a suitable prefix. 32 | termgenerator.index_text(name, 1, 'S') 33 | termgenerator.index_text(description, 1, 'XD') 34 | termgenerator.index_text(motto, 1, 'XM') 35 | 36 | # Index fields without prefixes for general search. 37 | termgenerator.index_text(name) 38 | termgenerator.increase_termpos() 39 | termgenerator.index_text(description) 40 | termgenerator.increase_termpos() 41 | termgenerator.index_text(motto) 42 | 43 | # Add document values. 
44 | if admitted is not None: 45 | doc.add_value(1, xapian.sortable_serialise(int(admitted[:4]))) 46 | doc.add_value(2, admitted) # YYYYMMDD 47 | if population is not None: 48 | doc.add_value(3, xapian.sortable_serialise(int(population))) 49 | ### End of example code. 50 | 51 | # Store all the fields for display purposes. 52 | doc.set_data(json.dumps(fields)) 53 | 54 | # We use the order to ensure each object ends up in the 55 | # database only once no matter how many times we run the 56 | # indexer. 57 | idterm = u"Q" + order 58 | doc.add_boolean_term(idterm) 59 | db.replace_document(idterm, doc) 60 | 61 | if len(sys.argv) != 3: 62 | print("Usage: %s DATAPATH DBPATH" % sys.argv[0]) 63 | sys.exit(1) 64 | 65 | index(datapath = sys.argv[1], dbpath = sys.argv[2]) 66 | -------------------------------------------------------------------------------- /code/python3/index_sorting.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import sys 5 | import xapian 6 | from support import parse_csv_file 7 | 8 | def index(datapath, dbpath): 9 | # Create or open the database we're going to be writing to. 10 | db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN) 11 | 12 | # Set up a TermGenerator that we'll use in indexing. 13 | termgenerator = xapian.TermGenerator() 14 | termgenerator.set_stemmer(xapian.Stem("en")) 15 | 16 | for fields in parse_csv_file(datapath): 17 | # 'fields' is a dictionary mapping from field name to value. 18 | # Pick out the fields we're going to index. 19 | description = fields.get('DESCRIPTION', u'') 20 | title = fields.get('TITLE', u'') 21 | identifier = fields.get('id_NUMBER', u'') 22 | collection = fields.get('COLLECTION', u'') 23 | maker = fields.get('MAKER', u'') 24 | 25 | # We make a document and tell the term generator to use this. 26 | doc = xapian.Document() 27 | termgenerator.set_document(doc) 28 | 29 | # Index each field with a suitable prefix. 
30 | termgenerator.index_text(title, 1, 'S') 31 | termgenerator.index_text(description, 1, 'XD') 32 | 33 | # Index fields without prefixes for general search. 34 | termgenerator.index_text(title) 35 | termgenerator.increase_termpos() 36 | termgenerator.index_text(description) 37 | 38 | ### Start of example code. 39 | # add the collection as a value in slot 0 40 | doc.add_value(0, collection) 41 | 42 | # add the maker as a value in slot 1 43 | doc.add_value(1, maker) 44 | ### End of example code. 45 | 46 | # Store all the fields for display purposes. 47 | doc.set_data(json.dumps(fields)) 48 | 49 | # We use the identifier to ensure each object ends up in the 50 | # database only once no matter how many times we run the 51 | # indexer. 52 | idterm = u"Q" + identifier 53 | doc.add_boolean_term(idterm) 54 | db.replace_document(idterm, doc) 55 | 56 | if len(sys.argv) != 3: 57 | print("Usage: %s DATAPATH DBPATH" % sys.argv[0]) 58 | sys.exit(1) 59 | 60 | index(datapath = sys.argv[1], dbpath = sys.argv[2]) 61 | -------------------------------------------------------------------------------- /code/python3/postingsource.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import xapian 4 | 5 | ### Start of class header and constructor. 6 | class ExternalWeightPostingSource(xapian.PostingSource): 7 | """ 8 | A Xapian posting source returning weights from an external source. 9 | """ 10 | def __init__(self, wtsource): 11 | xapian.PostingSource.__init__(self) 12 | self.wtsource = wtsource 13 | ### End of class header and constructor. 14 | 15 | ### Start of init. 16 | def init(self, db): 17 | self.db = db 18 | self.alldocs = db.postlist('') 19 | self.set_maxweight(self.wtsource.get_maxweight()) 20 | ### End of init. 21 | 22 | ### Start of termfreq methods. 
23 | def get_termfreq_min(self): return self.db.get_doccount() 24 | def get_termfreq_est(self): return self.db.get_doccount() 25 | def get_termfreq_max(self): return self.db.get_doccount() 26 | ### End of termfreq methods. 27 | 28 | ### Start of get_weight. 29 | def get_weight(self): 30 | doc = self.db.get_document(self.current.docid) 31 | return self.wtsource.get_weight(doc) 32 | ### End of get_weight. 33 | 34 | ### Start of get_docid. 35 | def get_docid(self): 36 | return self.current.docid 37 | ### End of get_docid. 38 | 39 | ### Start of at_end. 40 | def at_end(self): 41 | return self.current is None 42 | ### End of at_end. 43 | 44 | ### Start of next. 45 | def next(self, minweight): 46 | try: 47 | self.current = self.alldocs.next() 48 | except StopIteration: 49 | self.current = None 50 | ### End of next. 51 | 52 | ### Start of skip_to. 53 | def skip_to(self, docid, minweight): 54 | try: 55 | self.current = self.alldocs.skip_to(docid) 56 | except StopIteration: 57 | self.current = None 58 | ### End of skip_to. 59 | -------------------------------------------------------------------------------- /code/python3/search1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import sys 5 | import xapian 6 | import support 7 | 8 | ### Start of example code. 9 | def search(dbpath, querystring, offset=0, pagesize=10): 10 | # offset - defines starting point within result set 11 | # pagesize - defines number of records to retrieve 12 | 13 | # Open the database we're going to search. 14 | db = xapian.Database(dbpath) 15 | 16 | # Set up a QueryParser with a stemmer and suitable prefixes 17 | queryparser = xapian.QueryParser() 18 | queryparser.set_stemmer(xapian.Stem("en")) 19 | queryparser.set_stemming_strategy(queryparser.STEM_SOME) 20 | # Start of prefix configuration. 
21 | queryparser.add_prefix("title", "S") 22 | queryparser.add_prefix("description", "XD") 23 | # End of prefix configuration. 24 | 25 | # And parse the query 26 | query = queryparser.parse_query(querystring) 27 | 28 | # Use an Enquire object on the database to run the query 29 | enquire = xapian.Enquire(db) 30 | enquire.set_query(query) 31 | 32 | # And print out something about each match 33 | matches = [] 34 | for match in enquire.get_mset(offset, pagesize): 35 | fields = json.loads(match.document.get_data().decode('utf8')) 36 | print(u"%(rank)i: #%(docid)3.3i %(title)s" % { 37 | 'rank': match.rank + 1, 38 | 'docid': match.docid, 39 | 'title': fields.get('TITLE', u''), 40 | }) 41 | matches.append(match.docid) 42 | 43 | # Finally, make sure we log the query and displayed results 44 | support.log_matches(querystring, offset, pagesize, matches) 45 | ### End of example code. 46 | 47 | if len(sys.argv) < 3: 48 | print("Usage: %s DBPATH QUERYTERM..." % sys.argv[0]) 49 | sys.exit(1) 50 | 51 | search(dbpath = sys.argv[1], querystring = " ".join(sys.argv[2:])) 52 | -------------------------------------------------------------------------------- /code/python3/search_facets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import sys 5 | import xapian 6 | import support 7 | 8 | def search(dbpath, querystring, offset=0, pagesize=10): 9 | # offset - defines starting point within result set 10 | # pagesize - defines number of records to retrieve 11 | 12 | # Open the database we're going to search. 
13 | db = xapian.Database(dbpath) 14 | 15 | # Set up a QueryParser with a stemmer and suitable prefixes 16 | queryparser = xapian.QueryParser() 17 | queryparser.set_stemmer(xapian.Stem("en")) 18 | queryparser.set_stemming_strategy(queryparser.STEM_SOME) 19 | queryparser.add_prefix("title", "S") 20 | queryparser.add_prefix("description", "XD") 21 | 22 | # And parse the query 23 | query = queryparser.parse_query(querystring) 24 | 25 | # Use an Enquire object on the database to run the query 26 | enquire = xapian.Enquire(db) 27 | enquire.set_query(query) 28 | 29 | # And print out something about each match 30 | matches = [] 31 | 32 | ### Start of example code. 33 | # Set up a spy to inspect the MAKER value at slot 1 34 | spy = xapian.ValueCountMatchSpy(1) 35 | enquire.add_matchspy(spy) 36 | 37 | for match in enquire.get_mset(offset, pagesize, 100): 38 | fields = json.loads(match.document.get_data().decode('utf8')) 39 | print(u"%(rank)i: #%(docid)3.3i %(title)s" % { 40 | 'rank': match.rank + 1, 41 | 'docid': match.docid, 42 | 'title': fields.get('TITLE', u''), 43 | }) 44 | matches.append(match.docid) 45 | 46 | # Fetch and display the spy values 47 | for facet in spy.values(): 48 | print("Facet: %(term)s; count: %(count)i" % { 49 | 'term' : facet.term.decode('utf-8'), 50 | 'count' : facet.termfreq 51 | }) 52 | 53 | # Finally, make sure we log the query and displayed results 54 | support.log_matches(querystring, offset, pagesize, matches) 55 | ### End of example code. 56 | 57 | if len(sys.argv) < 3: 58 | print("Usage: %s DBPATH QUERYTERM..." 
% sys.argv[0]) 59 | sys.exit(1) 60 | 61 | search(dbpath = sys.argv[1], querystring = " ".join(sys.argv[2:])) 62 | -------------------------------------------------------------------------------- /code/python3/search_filters.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import sys 5 | import xapian 6 | import support 7 | 8 | def search(dbpath, querystring, materials, offset=0, pagesize=10): 9 | # offset - defines starting point within result set 10 | # pagesize - defines number of records to retrieve 11 | 12 | # Open the database we're going to search. 13 | db = xapian.Database(dbpath) 14 | 15 | ### Start of example code. 16 | # Set up a QueryParser with a stemmer and suitable prefixes 17 | queryparser = xapian.QueryParser() 18 | queryparser.set_stemmer(xapian.Stem("en")) 19 | queryparser.set_stemming_strategy(queryparser.STEM_SOME) 20 | queryparser.add_prefix("title", "S") 21 | queryparser.add_prefix("description", "XD") 22 | 23 | # And parse the query 24 | query = queryparser.parse_query(querystring) 25 | 26 | if len(materials) > 0: 27 | # Filter the results to ones which contain at least one of the 28 | # materials. 29 | 30 | # Build a query for each material value 31 | material_queries = [ 32 | xapian.Query('XM' + material.lower()) 33 | for material in materials 34 | ] 35 | 36 | # Combine these queries with an OR operator 37 | material_query = xapian.Query(xapian.Query.OP_OR, material_queries) 38 | 39 | # Use the material query to filter the main query 40 | query = xapian.Query(xapian.Query.OP_FILTER, query, material_query) 41 | ### End of example code. 
42 | 43 | # Use an Enquire object on the database to run the query 44 | enquire = xapian.Enquire(db) 45 | enquire.set_query(query) 46 | 47 | # And print out something about each match 48 | matches = [] 49 | for match in enquire.get_mset(offset, pagesize): 50 | fields = json.loads(match.document.get_data().decode('utf8')) 51 | print(u"%(rank)i: #%(docid)3.3i %(title)s" % { 52 | 'rank': match.rank + 1, 53 | 'docid': match.docid, 54 | 'title': fields.get('TITLE', u''), 55 | }) 56 | matches.append(match.docid) 57 | 58 | # Finally, make sure we log the query and displayed results 59 | support.log_matches(querystring, offset, pagesize, matches) 60 | 61 | if len(sys.argv) < 3: 62 | print("Usage: %s DBPATH QUERY [MATERIALS...]" % sys.argv[0]) 63 | sys.exit(1) 64 | 65 | search(dbpath = sys.argv[1], querystring = sys.argv[2], 66 | materials = sys.argv[3:]) 67 | -------------------------------------------------------------------------------- /code/python3/search_filters2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import sys 5 | import xapian 6 | import support 7 | 8 | def search(dbpath, querystring, offset=0, pagesize=10): 9 | # offset - defines starting point within result set 10 | # pagesize - defines number of records to retrieve 11 | 12 | # Open the database we're going to search. 13 | db = xapian.Database(dbpath) 14 | 15 | ### Start of example code. 16 | # Set up a QueryParser with a stemmer and suitable prefixes 17 | queryparser = xapian.QueryParser() 18 | queryparser.set_stemmer(xapian.Stem("en")) 19 | queryparser.set_stemming_strategy(queryparser.STEM_SOME) 20 | queryparser.add_prefix("title", "S") 21 | queryparser.add_prefix("description", "XD") 22 | queryparser.add_boolean_prefix("material", "XM") 23 | 24 | # And parse the query 25 | query = queryparser.parse_query(querystring) 26 | ### End of example code. 
27 | 28 | # Use an Enquire object on the database to run the query 29 | enquire = xapian.Enquire(db) 30 | enquire.set_query(query) 31 | 32 | # And print out something about each match 33 | matches = [] 34 | for match in enquire.get_mset(offset, pagesize): 35 | fields = json.loads(match.document.get_data().decode('utf8')) 36 | print(u"%(rank)i: #%(docid)3.3i %(title)s" % { 37 | 'rank': match.rank + 1, 38 | 'docid': match.docid, 39 | 'title': fields.get('TITLE', u''), 40 | }) 41 | matches.append(match.docid) 42 | # Finally, make sure we log the query and displayed results 43 | support.log_matches(querystring, offset, pagesize, matches) 44 | 45 | if len(sys.argv) < 3: 46 | print("Usage: %s DBPATH QUERYTERM..." % sys.argv[0]) 47 | sys.exit(1) 48 | 49 | search(dbpath = sys.argv[1], querystring = " ".join(sys.argv[2:])) 50 | -------------------------------------------------------------------------------- /code/python3/search_ranges.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import sys 5 | import xapian 6 | import support 7 | 8 | def search(dbpath, querystring, offset=0, pagesize=10): 9 | # offset - defines starting point within result set 10 | # pagesize - defines number of records to retrieve 11 | 12 | # Open the database we're going to search. 
13 | db = xapian.Database(dbpath) 14 | 15 | # Set up a QueryParser with a stemmer and suitable prefixes 16 | queryparser = xapian.QueryParser() 17 | queryparser.set_stemmer(xapian.Stem("en")) 18 | queryparser.set_stemming_strategy(queryparser.STEM_SOME) 19 | queryparser.add_prefix("title", "S") 20 | queryparser.add_prefix("description", "XD") 21 | # and add in range processors 22 | queryparser.add_rangeprocessor( 23 | xapian.NumberRangeProcessor(0, 'mm', xapian.RP_SUFFIX) 24 | ) 25 | queryparser.add_rangeprocessor( 26 | xapian.NumberRangeProcessor(1) 27 | ) 28 | 29 | # And parse the query 30 | query = queryparser.parse_query(querystring) 31 | 32 | # Use an Enquire object on the database to run the query 33 | enquire = xapian.Enquire(db) 34 | enquire.set_query(query) 35 | 36 | # And print out something about each match 37 | matches = [] 38 | for match in enquire.get_mset(offset, pagesize): 39 | fields = json.loads(match.document.get_data().decode('utf8')) 40 | print(u"%(rank)i: #%(docid)3.3i (%(date)s) %(measurements)s\n %(title)s" % { 41 | 'rank': match.rank + 1, 42 | 'docid': match.docid, 43 | 'measurements': fields.get('MEASUREMENTS', u''), 44 | 'date': fields.get('DATE_MADE', u''), 45 | 'title': fields.get('TITLE', u''), 46 | }) 47 | matches.append(match.docid) 48 | 49 | # Finally, make sure we log the query and displayed results 50 | support.log_matches(querystring, offset, pagesize, matches) 51 | 52 | if len(sys.argv) < 3: 53 | print("Usage: %s DBPATH QUERYTERM..." 
% sys.argv[0]) 54 | sys.exit(1) 55 | 56 | search(dbpath = sys.argv[1], querystring = " ".join(sys.argv[2:])) 57 | -------------------------------------------------------------------------------- /code/python3/search_sorting.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import sys 5 | import xapian 6 | import support 7 | 8 | def search(dbpath, querystring, offset=0, pagesize=10): 9 | # offset - defines starting point within result set 10 | # pagesize - defines number of records to retrieve 11 | 12 | # Open the database we're going to search. 13 | db = xapian.Database(dbpath) 14 | 15 | # Set up a QueryParser with a stemmer and suitable prefixes 16 | queryparser = xapian.QueryParser() 17 | queryparser.set_stemmer(xapian.Stem("en")) 18 | queryparser.set_stemming_strategy(queryparser.STEM_SOME) 19 | queryparser.add_prefix("title", "S") 20 | queryparser.add_prefix("description", "XD") 21 | 22 | # And parse the query 23 | query = queryparser.parse_query(querystring) 24 | 25 | # Use an Enquire object on the database to run the query 26 | enquire = xapian.Enquire(db) 27 | enquire.set_query(query) 28 | # Start of example code. 29 | enquire.set_sort_by_value_then_relevance(1, False) 30 | # End of example code. 
31 | 32 | # And print out something about each match 33 | matches = [] 34 | for match in enquire.get_mset(offset, pagesize): 35 | fields = json.loads(match.document.get_data().decode('utf8')) 36 | print(u"%(rank)i: #%(docid)3.3i %(name)s %(date)s\n Population %(pop)s" % { 37 | 'rank': match.rank + 1, 38 | 'docid': match.docid, 39 | 'name': fields.get('name', u''), 40 | 'date': support.format_date(fields.get('admitted', u'')), 41 | 'pop': support.format_numeral(int(fields.get('population', 0))), 42 | 'lat': fields.get('latitude', u''), 43 | 'lon': fields.get('longitude', u''), 44 | }) 45 | matches.append(match.docid) 46 | 47 | # Finally, make sure we log the query and displayed results 48 | support.log_matches(querystring, offset, pagesize, matches) 49 | 50 | if len(sys.argv) < 3: 51 | print("Usage: %s DBPATH QUERYTERM..." % sys.argv[0]) 52 | sys.exit(1) 53 | 54 | search(dbpath = sys.argv[1], querystring = " ".join(sys.argv[2:])) 55 | -------------------------------------------------------------------------------- /code/python3/search_sorting2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import sys 5 | import xapian 6 | import support 7 | 8 | def search(dbpath, querystring, offset=0, pagesize=10): 9 | # offset - defines starting point within result set 10 | # pagesize - defines number of records to retrieve 11 | 12 | # Open the database we're going to search. 
13 | db = xapian.Database(dbpath) 14 | 15 | # Set up a QueryParser with a stemmer and suitable prefixes 16 | queryparser = xapian.QueryParser() 17 | queryparser.set_stemmer(xapian.Stem("en")) 18 | queryparser.set_stemming_strategy(queryparser.STEM_SOME) 19 | queryparser.add_prefix("title", "S") 20 | queryparser.add_prefix("description", "XD") 21 | 22 | # And parse the query 23 | query = queryparser.parse_query(querystring) 24 | 25 | # Use an Enquire object on the database to run the query 26 | enquire = xapian.Enquire(db) 27 | enquire.set_query(query) 28 | # Start of example code. 29 | keymaker = xapian.MultiValueKeyMaker() 30 | keymaker.add_value(1, False) 31 | keymaker.add_value(3, True) 32 | enquire.set_sort_by_key_then_relevance(keymaker, False) 33 | # End of example code. 34 | 35 | # And print out something about each match 36 | matches = [] 37 | for match in enquire.get_mset(offset, pagesize): 38 | fields = json.loads(match.document.get_data().decode('utf8')) 39 | print(u"%(rank)i: #%(docid)3.3i %(name)s %(date)s\n Population %(pop)s" % { 40 | 'rank': match.rank + 1, 41 | 'docid': match.docid, 42 | 'name': fields.get('name', u''), 43 | 'date': support.format_date(fields.get('admitted', u'')), 44 | 'pop': support.format_numeral(int(fields.get('population', 0))), 45 | 'lat': fields.get('latitude', u''), 46 | 'lon': fields.get('longitude', u''), 47 | }) 48 | matches.append(match.docid) 49 | 50 | # Finally, make sure we log the query and displayed results 51 | support.log_matches(querystring, offset, pagesize, matches) 52 | 53 | if len(sys.argv) < 3: 54 | print("Usage: %s DBPATH QUERYTERM..." 
% sys.argv[0]) 55 | sys.exit(1) 56 | 57 | search(dbpath = sys.argv[1], querystring = " ".join(sys.argv[2:])) 58 | -------------------------------------------------------------------------------- /code/python3/search_synonyms.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import json 4 | import sys 5 | import xapian 6 | import support 7 | 8 | ### Start of example code. 9 | def search(dbpath, querystring, offset=0, pagesize=10): 10 | # offset - defines starting point within result set 11 | # pagesize - defines number of records to retrieve 12 | 13 | # Open the database we're going to search. 14 | db = xapian.WritableDatabase(dbpath) 15 | 16 | # Start of adding synonyms 17 | db.add_synonym("time", "calendar") 18 | # End of adding synonyms 19 | 20 | # Set up a QueryParser with a stemmer and suitable prefixes 21 | queryparser = xapian.QueryParser() 22 | queryparser.set_stemmer(xapian.Stem("en")) 23 | queryparser.set_stemming_strategy(queryparser.STEM_SOME) 24 | queryparser.add_prefix("title", "S") 25 | queryparser.add_prefix("description", "XD") 26 | 27 | # Start of set database 28 | queryparser.set_database(db) 29 | # End of set database 30 | 31 | # And parse the query 32 | query = queryparser.parse_query(querystring, queryparser.FLAG_SYNONYM) 33 | 34 | # Use an Enquire object on the database to run the query 35 | enquire = xapian.Enquire(db) 36 | enquire.set_query(query) 37 | 38 | # And print out something about each match 39 | matches = [] 40 | for match in enquire.get_mset(offset, pagesize): 41 | fields = json.loads(match.document.get_data().decode('utf8')) 42 | print(u"%(rank)i: #%(docid)3.3i %(title)s" % { 43 | 'rank': match.rank + 1, 44 | 'docid': match.docid, 45 | 'title': fields.get('TITLE', u''), 46 | }) 47 | matches.append(match.docid) 48 | 49 | # Finally, make sure we log the query and displayed results 50 | support.log_matches(querystring, offset, pagesize, matches) 51 | ### End of 
example code. 52 | 53 | if len(sys.argv) < 3: 54 | print("Usage: %s DBPATH QUERYTERM..." % sys.argv[0]) 55 | sys.exit(1) 56 | 57 | search(dbpath = sys.argv[1], querystring = " ".join(sys.argv[2:])) 58 | -------------------------------------------------------------------------------- /code/ruby/delete1.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | require 'xapian' 5 | 6 | ### Start of example code. 7 | def delete_docs(dbpath, identifiers) 8 | db = Xapian::WritableDatabase.new(dbpath, Xapian::DB_OPEN) 9 | identifiers.each do |identifier| 10 | idterm = "Q#{identifier}" 11 | db.delete_document(idterm) 12 | end 13 | end 14 | ### End of example code. 15 | 16 | abort "Usage #{__FILE__} DBPATH ID..." if ARGV.length < 2 17 | 18 | delete_docs(ARGV[0], ARGV[1..]) 19 | -------------------------------------------------------------------------------- /code/ruby/index1.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | require 'xapian' 5 | require 'json' 6 | require_relative 'support' 7 | 8 | ### Start of example code. 
9 | def index_csv(data_path, db_path) 10 | db = Xapian::WritableDatabase.new(db_path, Xapian::DB_CREATE_OR_OPEN) 11 | term_generator = Xapian::TermGenerator.new 12 | term_generator.stemmer = Xapian::Stem.new('en') 13 | parse_csv_file(data_path).each do |row| 14 | doc = Xapian::Document.new 15 | term_generator.document = doc 16 | term_generator.index_text(row['TITLE'].to_s, 1, 'S') 17 | term_generator.index_text(row['DESCRIPTION'].to_s, 1, 'XD') 18 | term_generator.index_text(row['TITLE'].to_s) 19 | term_generator.increase_termpos 20 | term_generator.index_text(row['DESCRIPTION'].to_s) 21 | doc.data = row.to_h.to_json 22 | idterm = "Q#{row['id_NUMBER']}" 23 | doc.add_boolean_term(idterm) 24 | db.replace_document(idterm, doc) 25 | end 26 | end 27 | ### End of example code. 28 | 29 | abort "Usage #{__FILE__} DATAPATH DBPATH" if ARGV.length < 2 30 | 31 | index_csv(ARGV[0], ARGV[1]) 32 | -------------------------------------------------------------------------------- /code/ruby/index_facets.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | require 'xapian' 5 | require 'json' 6 | require_relative 'support' 7 | 8 | ### Start of example code. 9 | def index(data_path, db_path) 10 | db = Xapian::WritableDatabase.new(db_path, Xapian::DB_CREATE_OR_OPEN) 11 | 12 | # Set up a TermGenerator that we'll use in indexing. 13 | term_generator = Xapian::TermGenerator.new 14 | term_generator.stemmer = Xapian::Stem.new('en') 15 | 16 | parse_csv_file(data_path).each do |row| 17 | title = row['TITLE'].to_s 18 | description = row['DESCRIPTION'].to_s 19 | identifier = row['id_NUMBER'].to_s 20 | collection = row['COLLECTION'].to_s 21 | maker = row['MAKER'].to_s 22 | 23 | # We make a document and tell the term generator to use this. 24 | doc = Xapian::Document.new 25 | term_generator.document = doc 26 | 27 | # Index each field with a suitable prefix. 
28 | term_generator.index_text(title, 1, 'S') 29 | term_generator.index_text(description, 1, 'XD') 30 | 31 | # Index fields without prefixes for general search. 32 | term_generator.index_text(title) 33 | term_generator.increase_termpos 34 | term_generator.index_text(description) 35 | 36 | # Add the collection as a value in slot 0. 37 | doc.add_value(0, collection) 38 | 39 | # Add the maker as a value in slot 1. 40 | doc.add_value(1, maker) 41 | 42 | # Store all the fields for display purposes. 43 | doc.data = row.to_h.to_json 44 | 45 | # We use the identifier to ensure each object ends up in the 46 | # database only once no matter how many times we run the indexer. 47 | idterm = "Q#{identifier}" 48 | doc.add_boolean_term(idterm) 49 | db.replace_document(idterm, doc) 50 | end 51 | end 52 | ### End of example code. 53 | 54 | abort "Usage #{__FILE__} DATAPATH DBPATH" if ARGV.length < 2 55 | 56 | index(ARGV[0], ARGV[1]) 57 | -------------------------------------------------------------------------------- /code/ruby/index_filters.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | require 'xapian' 5 | require 'json' 6 | require_relative 'support' 7 | 8 | ### Start of example code. 9 | def index(data_path, db_path) 10 | db = Xapian::WritableDatabase.new(db_path, Xapian::DB_CREATE_OR_OPEN) 11 | term_generator = Xapian::TermGenerator.new 12 | term_generator.stemmer = Xapian::Stem.new('en') 13 | parse_csv_file(data_path).each do |row| 14 | doc = Xapian::Document.new 15 | term_generator.document = doc 16 | term_generator.index_text(row['TITLE'].to_s, 1, 'S') 17 | term_generator.index_text(row['DESCRIPTION'].to_s, 1, 'XD') 18 | term_generator.index_text(row['TITLE'].to_s) 19 | term_generator.increase_termpos 20 | term_generator.index_text(row['DESCRIPTION'].to_s) 21 | 22 | ### Start of new indexing code. 23 | # Index the MATERIALS field, splitting on semicolons. 
# Index the CSV file at data_path into the Xapian database at db_path
# (created if missing), adding sortable values for range searches:
#
# slot 0 - the largest number found in the MEASUREMENTS field
# slot 1 - the first number found in the DATE_MADE field
#
# A slot is left unset when the corresponding field contains no number.
def index(data_path, db_path)
  db = Xapian::WritableDatabase.new(db_path, Xapian::DB_CREATE_OR_OPEN)

  # Set up a TermGenerator that we'll use in indexing.
  term_generator = Xapian::TermGenerator.new
  term_generator.stemmer = Xapian::Stem.new('en')

  parse_csv_file(data_path).each do |row|
    doc = Xapian::Document.new
    term_generator.document = doc

    # Index each field with a suitable prefix.
    term_generator.index_text(row['TITLE'].to_s, 1, 'S')
    term_generator.index_text(row['DESCRIPTION'].to_s, 1, 'XD')

    # Index fields without prefixes for general search.
    term_generator.index_text(row['TITLE'].to_s)
    term_generator.increase_termpos
    term_generator.index_text(row['DESCRIPTION'].to_s)

    doc.data = row.to_h.to_json

    ### Start of example code.

    # Parse the two values we need.  They are handled independently of each
    # other, so an object with no measurements still gets its date indexed.
    # (numbers_from_string returns an empty list for an empty field.)
    numbers = numbers_from_string(row['MEASUREMENTS'].to_s)
    doc.add_value(0, Xapian.sortable_serialise(numbers.max)) unless numbers.empty?

    years = numbers_from_string(row['DATE_MADE'].to_s)
    doc.add_value(1, Xapian.sortable_serialise(years[0])) unless years.empty?
    ### End of example code.

    # We use the identifier to ensure each object ends up in the
    # database only once no matter how many times we run the indexer.
    idterm = "Q#{row['id_NUMBER']}"
    doc.add_boolean_term(idterm)
    db.replace_document(idterm, doc)
  end
end
28 | term_generator.index_text(row['name'].to_s) 29 | term_generator.increase_termpos 30 | term_generator.index_text(row['description'].to_s) 31 | term_generator.increase_termpos 32 | term_generator.index_text(row['motto'].to_s) 33 | 34 | admitted = row['admitted'].to_s 35 | # Add document values. 36 | unless admitted.empty? 37 | doc.add_value(1, Xapian.sortable_serialise(admitted[0..3].to_i)) 38 | doc.add_value(2, admitted) # YYYYMMDD 39 | end 40 | 41 | doc.add_value(3, Xapian.sortable_serialise(row['population'].to_i)) if row['population'] 42 | ### End of example code. 43 | 44 | doc.data = row.to_h.to_json 45 | 46 | # We use the identifier to ensure each object ends up in the 47 | # database only once no matter how many times we run the indexer. 48 | 49 | idterm = "Q#{row['order']}" 50 | doc.add_boolean_term(idterm) 51 | db.replace_document(idterm, doc) 52 | end 53 | end 54 | 55 | abort "Usage #{__FILE__} DATAPATH DBPATH" if ARGV.length < 2 56 | 57 | index(ARGV[0], ARGV[1]) 58 | -------------------------------------------------------------------------------- /code/ruby/index_values_with_geo.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | require 'xapian' 5 | require 'json' 6 | require_relative 'support' 7 | 8 | def index(data_path, db_path) 9 | # Create or open the database we're going to be writing to. 10 | db = Xapian::WritableDatabase.new(db_path, Xapian::DB_CREATE_OR_OPEN) 11 | 12 | # Set up a TermGenerator that we'll use in indexing. 13 | term_generator = Xapian::TermGenerator.new 14 | term_generator.stemmer = Xapian::Stem.new('en') 15 | 16 | parse_states(data_path).each do |row| 17 | # We make a document and tell the term generator to use this. 18 | doc = Xapian::Document.new 19 | term_generator.document = doc 20 | 21 | # Index each field with a suitable prefix. 
# Run querystring against the Xapian database at dbpath, print one line per
# match on the requested page, and log the ids of the matches shown.
#
# offset   - defines starting point within result set
# pagesize - defines number of records to retrieve
def search(dbpath, querystring, offset: 0, pagesize: 10)
  # Open the database we're going to search.
  db = Xapian::Database.new(dbpath)

  # Set up a QueryParser with a stemmer and suitable prefixes
  queryparser = Xapian::QueryParser.new
  queryparser.stemmer = Xapian::Stem.new('en')
  queryparser.stemming_strategy = Xapian::QueryParser::STEM_SOME

  # Start of prefix configuration.
  queryparser.add_prefix('title', 'S')
  queryparser.add_prefix('description', 'XD')
  # End of prefix configuration.

  # And parse the query
  query = queryparser.parse_query(querystring)

  # Use an Enquire object on the database to run the query
  enquire = Xapian::Enquire.new(db)
  enquire.query = query

  # And print out something about each match
  matches = []
  enquire.mset(offset, pagesize).matches.each do |match|
    fields = JSON.parse(match.document.data)
    # The values are passed by name, so the format string must use
    # %<name> references; positional fields would each be handed the
    # whole hash and raise.
    printf "%<rank>i: #%<docid>3.3i %<title>s\n",
           rank: match.rank + 1,
           docid: match.docid,
           title: fields['TITLE']
    matches << match.docid
  end

  # Finally, make sure we log the query and displayed results
  log_matches(querystring, offset, pagesize, matches)
end
13 | db = Xapian::Database.new(dbpath) 14 | 15 | # Set up a QueryParser with a stemmer and suitable prefixes 16 | queryparser = Xapian::QueryParser.new 17 | queryparser.stemmer = Xapian::Stem.new('en') 18 | queryparser.stemming_strategy = Xapian::QueryParser::STEM_SOME 19 | queryparser.add_prefix('title', 'S') 20 | queryparser.add_prefix('description', 'XD') 21 | 22 | # And parse the query 23 | query = queryparser.parse_query(querystring) 24 | 25 | # Use an Enquire object on the database to run the query 26 | enquire = Xapian::Enquire.new(db) 27 | enquire.query = query 28 | 29 | # And print out something about each match 30 | matches = [] 31 | 32 | ### Start of example code. 33 | # Set up a spy to inspect the MAKER value at slot 1 34 | spy = Xapian::ValueCountMatchSpy.new(1) 35 | enquire.add_matchspy(spy) 36 | 37 | enquire.mset(offset, pagesize, 100).matches.each do |match| 38 | fields = JSON.parse(match.document.data) 39 | printf "%<rank>i: #%<docid>3.3i %<title>s\n", 40 | rank: match.rank + 1, 41 | docid: match.docid, 42 | title: fields['TITLE'] 43 | matches << match.docid 44 | end 45 | spy.values.each do |facet| 46 | printf "Facet: %<term>s; count: %<count>i\n", 47 | term: facet.term, 48 | count: facet.termfreq 49 | end 50 | 51 | # Finally, make sure we log the query and displayed results 52 | log_matches(querystring, offset, pagesize, matches) 53 | ### End of example code. 54 | end 55 | 56 | abort "Usage #{__FILE__} DBPATH QUERY..." 
# Search the database at dbpath for querystring, optionally restricted to
# documents made of at least one of the given materials, print the matches
# on the requested page, and log their ids.
#
# materials - list of material names; an empty list means no filtering
# offset    - defines starting point within result set
# pagesize  - defines number of records to retrieve
def search(dbpath, querystring, materials, offset: 0, pagesize: 10)
  database = Xapian::Database.new(dbpath)

  ### Start of example code.
  # Query parser with English stemming and the field prefixes used at
  # index time.
  parser = Xapian::QueryParser.new
  parser.stemmer = Xapian::Stem.new('en')
  parser.stemming_strategy = Xapian::QueryParser::STEM_SOME
  parser.add_prefix('title', 'S')
  parser.add_prefix('description', 'XD')

  query = parser.parse_query(querystring)

  unless materials.empty?
    # One XM-prefixed boolean term per requested material ...
    material_terms = materials.map { |name| "XM#{name.downcase}" }

    # ... combined with OR, so that any one of them is enough ...
    any_material = Xapian::Query.new(Xapian::Query::OP_OR, material_terms)

    # ... and applied as a filter on the parsed query.
    query = Xapian::Query.new(Xapian::Query::OP_FILTER, query, any_material)
  end
  ### End of example code.

  # Run the query through an Enquire object.
  enquire = Xapian::Enquire.new(database)
  enquire.query = query

  # Print a line per match, collecting the document ids as we go.
  shown = enquire.mset(offset, pagesize).matches.map do |hit|
    record = JSON.parse(hit.document.data)
    printf "%<rank>i: #%<docid>3.3i %<title>s\n",
           rank: hit.rank + 1,
           docid: hit.docid,
           title: record['TITLE']
    hit.docid
  end

  log_matches(querystring, offset, pagesize, shown)
end
# Search the database at dbpath for querystring, which may contain number
# ranges, print the matches on the requested page, and log their ids.
#
# offset   - defines starting point within result set
# pagesize - defines number of records to retrieve
def search(dbpath, querystring, offset: 0, pagesize: 10)
  database = Xapian::Database.new(dbpath)

  # Query parser with English stemming and the field prefixes used at
  # index time.
  parser = Xapian::QueryParser.new
  parser.stemmer = Xapian::Stem.new('en')
  parser.stemming_strategy = Xapian::QueryParser::STEM_SOME
  parser.add_prefix('title', 'S')
  parser.add_prefix('description', 'XD')

  # Range processors: one for value slot 0 taking an 'mm' suffix, and a
  # plain one for value slot 1.
  parser.add_rangeprocessor(Xapian::NumberRangeProcessor.new(0, 'mm', Xapian::RP_SUFFIX))
  parser.add_rangeprocessor(Xapian::NumberRangeProcessor.new(1))

  query = parser.parse_query(querystring)

  # Run the query through an Enquire object.
  enquire = Xapian::Enquire.new(database)
  enquire.query = query

  # Print two lines per match, collecting the document ids as we go.
  shown = enquire.mset(offset, pagesize).matches.map do |hit|
    record = JSON.parse(hit.document.data)
    printf "%<rank>i: #%<docid>3.3i (%<date>s) %<measurements>s\n %<title>s\n",
           rank: hit.rank + 1,
           docid: hit.docid,
           measurements: record['MEASUREMENTS'],
           date: record['DATE_MADE'],
           title: record['TITLE']
    hit.docid
  end

  log_matches(querystring, offset, pagesize, shown)
end
13 | db = Xapian::Database.new(dbpath) 14 | 15 | # Set up a QueryParser with a stemmer and suitable prefixes 16 | queryparser = Xapian::QueryParser.new 17 | queryparser.stemmer = Xapian::Stem.new('en') 18 | queryparser.stemming_strategy = Xapian::QueryParser::STEM_SOME 19 | queryparser.add_prefix('title', 'S') 20 | queryparser.add_prefix('description', 'XD') 21 | 22 | # And parse the query 23 | query = queryparser.parse_query(querystring) 24 | 25 | # Use an Enquire object on the database to run the query 26 | enquire = Xapian::Enquire.new(db) 27 | enquire.query = query 28 | 29 | # puts enquire.methods 30 | 31 | # Start of example code. 32 | enquire.sort_by_value_then_relevance!(1, false) 33 | # End of example code. 34 | 35 | # And print out something about each match 36 | matches = [] 37 | enquire.mset(offset, pagesize).matches.each do |match| 38 | fields = JSON.parse(match.document.data) 39 | printf "%<rank>i: #%<docid>3.3i %<name>s %<date>s\n Population %<pop>s\n", 40 | rank: match.rank + 1, 41 | docid: match.docid, 42 | name: fields['name'], 43 | date: format_date(fields['admitted'].to_s), 44 | pop: format_numeral(fields['population'].to_i), 45 | lat: fields['latitude'].to_s, 46 | lon: fields['longitude'].to_s 47 | 48 | matches << match.docid 49 | end 50 | # Finally, make sure we log the query and displayed results 51 | log_matches(querystring, offset, pagesize, matches) 52 | end 53 | 54 | abort "Usage #{__FILE__} DBPATH QUERY..." 
if ARGV.length < 2 55 | 56 | search(ARGV[0], ARGV[1..].join(' ')) 57 | -------------------------------------------------------------------------------- /code/ruby/search_sorting2.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | require 'xapian' 5 | require 'json' 6 | require_relative 'support' 7 | 8 | def search(dbpath, querystring, offset: 0, pagesize: 10) 9 | # offset - defines starting point within result set 10 | # pagesize - defines number of records to retrieve 11 | 12 | # Open the database we're going to search. 13 | db = Xapian::Database.new(dbpath) 14 | 15 | # Set up a QueryParser with a stemmer and suitable prefixes 16 | queryparser = Xapian::QueryParser.new 17 | queryparser.stemmer = Xapian::Stem.new('en') 18 | queryparser.stemming_strategy = Xapian::QueryParser::STEM_SOME 19 | queryparser.add_prefix('title', 'S') 20 | queryparser.add_prefix('description', 'XD') 21 | 22 | # And parse the query 23 | query = queryparser.parse_query(querystring) 24 | 25 | # Use an Enquire object on the database to run the query 26 | enquire = Xapian::Enquire.new(db) 27 | enquire.query = query 28 | 29 | # puts enquire.methods 30 | 31 | # Start of example code. 32 | keymaker = Xapian::MultiValueKeyMaker.new 33 | keymaker.add_value(1, false) 34 | keymaker.add_value(3, true) 35 | enquire.set_sort_by_key_then_relevance(keymaker, false) 36 | # End of example code. 
# Key maker that builds a sort key from the distance between the
# coordinates stored in a document's value slot 4 and a fixed reference
# point.
class DistanceKeyMaker < Xapian::KeyMaker
  # Reference point the distance is measured from.
  WASHINGTON = [38.012, -77.037].freeze

  # Return the sortable serialisation of doc's distance from WASHINGTON.
  # Slot 4 is read as two comma-separated numbers.
  def __call__(doc)
    position = doc.value(4).split(',').map { |part| part.to_f }
    separation = distance_between_coords(position, WASHINGTON)
    Xapian.sortable_serialise(separation)
  end
end
25 | db = Xapian::Database.new(dbpath) 26 | 27 | # Set up a QueryParser with a stemmer and suitable prefixes 28 | queryparser = Xapian::QueryParser.new 29 | queryparser.stemmer = Xapian::Stem.new('en') 30 | queryparser.stemming_strategy = Xapian::QueryParser::STEM_SOME 31 | queryparser.add_prefix('title', 'S') 32 | queryparser.add_prefix('description', 'XD') 33 | 34 | # And parse the query 35 | query = queryparser.parse_query(querystring) 36 | 37 | # Use an Enquire object on the database to run the query 38 | enquire = Xapian::Enquire.new(db) 39 | enquire.query = query 40 | 41 | enquire.set_sort_by_key_then_relevance(DistanceKeyMaker.new, false) 42 | 43 | # And print out something about each match 44 | matches = [] 45 | enquire.mset(offset, pagesize).matches.each do |match| 46 | fields = JSON.parse(match.document.data) 47 | printf "%<rank>i: #%<docid>3.3i %<name>s %<date>s\n Population %<pop>s\n", 48 | rank: match.rank + 1, 49 | docid: match.docid, 50 | name: fields['name'], 51 | date: format_date(fields['admitted'].to_s), 52 | pop: format_numeral(fields['population'].to_i), 53 | lat: fields['latitude'].to_s, 54 | lon: fields['longitude'].to_s 55 | 56 | matches << match.docid 57 | end 58 | # Finally, make sure we log the query and displayed results 59 | log_matches(querystring, offset, pagesize, matches) 60 | end 61 | 62 | abort "Usage #{__FILE__} DBPATH QUERY..." if ARGV.length < 2 63 | 64 | search(ARGV[0], ARGV[1..].join(' ')) 65 | -------------------------------------------------------------------------------- /code/ruby/search_synonyms.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | require 'xapian' 5 | require 'json' 6 | require_relative 'support' 7 | 8 | ### Start of example code. 
# Search the database at dbpath for querystring with synonym expansion
# enabled, print the matches on the requested page, and log their ids.
#
# The database is opened for writing because a synonym is added to it
# before the query is parsed.
#
# offset   - defines starting point within result set
# pagesize - defines number of records to retrieve
def search(dbpath, querystring, offset: 0, pagesize: 10)
  database = Xapian::WritableDatabase.new(dbpath)

  # Start of adding synonyms
  database.add_synonym("time", "calendar")
  # End of adding synonyms

  # Query parser with English stemming and the field prefixes used at
  # index time.
  parser = Xapian::QueryParser.new
  parser.stemmer = Xapian::Stem.new('en')
  parser.stemming_strategy = Xapian::QueryParser::STEM_SOME
  parser.add_prefix('title', 'S')
  parser.add_prefix('description', 'XD')

  # Start of set database
  parser.database = database
  # End of set database

  # Parse the query with the synonym flag switched on.
  query = parser.parse_query(querystring,
                             Xapian::QueryParser::FLAG_SYNONYM)

  # Run the query through an Enquire object.
  enquire = Xapian::Enquire.new(database)
  enquire.query = query

  # Print a line per match, collecting the document ids as we go.
  shown = enquire.mset(offset, pagesize).matches.map do |hit|
    record = JSON.parse(hit.document.data)
    printf "%<rank>i: #%<docid>3.3i %<title>s\n",
           rank: hit.rank + 1,
           docid: hit.docid,
           title: record['TITLE']
    hit.docid
  end

  log_matches(querystring, offset, pagesize, shown)
end
# frozen_string_literal: true

require 'csv'
require 'date'

# Read the CSV file at datapath, treating its first line as the header row,
# and return the parsed rows.
def parse_csv_file(datapath)
  CSV.read(datapath, headers: true)
end

# Read the CSV file at datapath like parse_csv_file, but keep only the rows
# that have a value in the 'order' column.
def parse_states(datapath)
  CSV.read(datapath, headers: true).select { |r| r['order'] }
end

# Print a one-line summary of a search: the query string, the requested
# result window and the ids of the matches shown, separated by spaces.
def log_matches(querystring, offset, pagesize, matches)
  puts "'#{querystring}'[#{offset}:#{offset + pagesize}] = #{matches.join(' ')}"
end

# Return every number found in string as a list of floats, in order of
# appearance.  A "number" is any run of digits and dots containing at least
# one digit.  Returns an empty list when there are none.
def numbers_from_string(string)
  string.scan(/[\d.]*\d[\d.]*/).map(&:to_f)
end

# Return the straight-line distance between two coordinate pairs, computed
# directly on the coordinate values.
def distance_between_coords(latlon1, latlon2)
  Math.sqrt(((latlon2[0] - latlon1[0])**2) +
            ((latlon2[1] - latlon1[1])**2))
end

# Format an integer with sep between each group of three digits, e.g.
# 1234567 -> "1,234,567".
#
# Raises RuntimeError unless numeral is an Integer.
def format_numeral(numeral, sep: ',')
  raise 'Numeral must be an int type to format' unless numeral.is_a?(Integer)

  # Group the digits only, from the right, and put the sign back afterwards
  # so that a negative number never gets a separator right after its "-".
  groups = numeral.abs.to_s.reverse.scan(/\d{1,3}/).map(&:reverse).reverse
  "#{'-' if numeral.negative?}#{groups.join(sep)}"
end

# Convert a 'YYYYMMDD' date string into "Month D, YYYY", e.g.
# "18891108" -> "November 8, 1889".
#
# Raises RuntimeError when datestr is not a String; a string which is not a
# valid 'YYYYMMDD' date makes Date.strptime raise.
def format_date(datestr)
  raise "Could not parse date to format 'YYYYMMDD'" unless datestr.is_a? String

  date = Date.strptime(datestr, '%Y%m%d')
  "#{date.strftime('%B')} #{date.day}, #{date.year}"
end
30 | 31 | Be aware that some Xapian objects will keep internal references to others 32 | - for example, if you call :xapian-method:`Database::get_document()`, the 33 | resulting :xapian-class:`Document` object will keep a reference to the 34 | :xapian-class:`Database` object, and so you can't safely use the 35 | :xapian-class:`Database` object in one thread at the same time as using the 36 | :xapian-class:`Document` object in another. 37 | -------------------------------------------------------------------------------- /concepts/index.rst: -------------------------------------------------------------------------------- 1 | Core concepts 2 | ============= 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | 7 | concurrency 8 | indexing/index 9 | search/index 10 | introduction 11 | -------------------------------------------------------------------------------- /concepts/indexing/documents.rst: -------------------------------------------------------------------------------- 1 | Documents 2 | ========= 3 | 4 | A document in Xapian is simply an item which is returned by a search. When 5 | building a new search system, a key thing to decide is what the documents 6 | in your system are going to be. There's often an obvious choice here, but 7 | in many cases there are alternatives. For example, for a search over a 8 | website, it seems natural to have one document for each page of the site. 9 | However, you could instead choose to use one document for each paragraph of 10 | each page, or to group pages together into subjects and have one document 11 | for each subject. 12 | 13 | Documents are identified in a database by a unique positive integer id, 14 | known as the `document ID`. Currently this is a 32-bit quantity by 15 | default (you can configure xapian-core with ``--enable-64bit-docid`` 16 | to get 64-bit docids). 17 | 18 | Documents have three components: `data`, `terms` and `values`. 
We'll 19 | discuss terms and data first - values are useful for some more advanced 20 | search types. 21 | 22 | Document Data 23 | ------------- 24 | 25 | The `document data` is an arbitrary binary blob of data associated with the 26 | document. Xapian treats this as completely opaque, and does nothing with 27 | this data other than storing it in the database (compressed with zlib if it 28 | is compressible) and returning it when requested. 29 | 30 | It can be used to hold a reference to the document elsewhere (such as the 31 | primary key in an external database table), or could be used to store the 32 | full text of the document. 33 | 34 | Generally you use the document data to store any information you need in order 35 | to display the resulting document to the user (or to whatever process consumes 36 | the results of searches). Xapian doesn't enforce a serialisation scheme for 37 | putting structured information into the document data, so you can use whatever 38 | is most appropriate for your application. 39 | 40 | Protocol buffers are often a good choice - there's support for them for most 41 | programming languages, they use a compact representation which doesn't 42 | explicitly store the field names, and you can add new fields without 43 | invalidating existing encoded data. 44 | 45 | Some other possible options are a simple scheme using newlines to separate 46 | ``name=value`` entries (like Omega uses), JSON, XML, or a language-specific 47 | serialisation such as ``pickle`` in Python. 48 | 49 | .. todo:: Talk about the importance of batching changes where feasible 50 | -------------------------------------------------------------------------------- /concepts/indexing/index.rst: -------------------------------------------------------------------------------- 1 | Indexing concepts 2 | ================= 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | 7 | databases 8 | documents 9 | terms 10 | termgenerator 11 | uniqueness 12 | values 13 | limitations 14 | 15 | .. 
todo:: cover user metadata; note it is included in transactions 16 | -------------------------------------------------------------------------------- /concepts/indexing/termgenerator.rst: -------------------------------------------------------------------------------- 1 | Term Generator 2 | ============== 3 | 4 | Rather than force all users to write their own code to process text into terms 5 | for indexing, Xapian provides a :xapian-class:`TermGenerator` class. This parses 6 | chunks of text, producing appropriate terms, and adds them to a document. 7 | 8 | The :xapian-class:`TermGenerator` can be configured to perform stemming (and 9 | stopwording) when generating terms. It can optionally store information about 10 | the positions of words within the text, and can apply field-specific prefixes 11 | to the generated terms to allow searches to be restricted to specific 12 | fields. It can also add additional information to the database for use 13 | when performing spelling correction. 14 | 15 | If you're using the :xapian-class:`TermGenerator` to process text in this way, 16 | you will probably want to use the :doc:`QueryParser <../search/queryparser>` 17 | (described later) when performing searches. 18 | -------------------------------------------------------------------------------- /concepts/indexing/uniqueness.rst: -------------------------------------------------------------------------------- 1 | Using identifiers with Xapian 2 | ============================= 3 | 4 | Every document stored in a Xapian database has a unique positive integer 5 | id, either assigned automatically or manually. 6 | 7 | Often the documents which you're indexing with Xapian will already have 8 | unique ids, and you'll want to be able to use these to reindex an updated 9 | version of an existing document, or delete an expired document from the 10 | Xapian index. There are two ways of approaching this. 
11 | 12 | One is to use a one-to-one mapping between your identifiers and Xapian 13 | docids. This will work if your identifiers are positive integers and they 14 | all fit within 32 bits (under about 4 billion), or if they are 64-bit 15 | and you configure xapian-core with ``--enable-64bit-docid``. 16 | 17 | The other is to use a special term containing your identifier, which will 18 | work for any type of identifier. Typically you will prefix this (by 19 | convention with 'Q') to avoid collisions with other terms. Terms have a 20 | limited length (245 bytes in glass and chert), so if your unique identifiers 21 | are really long you'll need to do something more complicated. 22 | 23 | For more information on both techniques, `see our FAQ on this`_. 24 | 25 | .. _see our FAQ on this: https://trac.xapian.org/wiki/FAQ/UniqueIds 26 | -------------------------------------------------------------------------------- /concepts/search/index.rst: -------------------------------------------------------------------------------- 1 | Search concepts 2 | =============== 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | 7 | queries 8 | queryparser 9 | ranked_matches 10 | search_limitations 11 | -------------------------------------------------------------------------------- /concepts/search/ranked_matches.rst: -------------------------------------------------------------------------------- 1 | Ranked matches 2 | ============== 3 | 4 | When you run a Query using Xapian, what you get is a list of `ranked` 5 | `matches`. 6 | 7 | Each match is a Xapian Document which satisfies the Query, with a 8 | `weight`, and the list is ordered by decreasing weight, the weight 9 | being an indicator of how good a match that Document is for the query 10 | that was run: a higher weight means a better match. The `rank` of each 11 | match is simply the position in the list of all matches, starting from 12 | 0. Some other search systems use the word "score" instead of weight.
13 | 14 | The actual weight is calculated by a `weighting scheme`; Xapian comes 15 | with a few different ones or you can write your own, although often 16 | the default is fine. (It uses a scheme called BM25, which takes into 17 | account things like how common a matching term is in a matching 18 | document compared to in the entire database, and the lengths of 19 | different matching documents.) 20 | 21 | Rather than having to run through the entire list of matches from the 22 | beginning, you actually ask for a sub-range of the entire list of 23 | matches, from an offset and extending for a given number of 24 | matches. Many search applications will provide the user with a way of 25 | "paging" through the matches, so the first page might be starting at 0 26 | for 10 matches, the second page starting at 10 for 10 matches, and so 27 | on. 28 | 29 | A page of matches in Xapian is called an MSet (for "match set"). 30 | 31 | Alternative sort orders 32 | ----------------------- 33 | 34 | Sometimes, rather than getting results sorted by `weight`, it would be more 35 | useful to get them in some other order. For example, it might be desirable 36 | to get results in order of the values stored in a date field. 37 | 38 | To do this, you first need to store the information used for the sort in a 39 | value slot, as described in the indexing documentation. You can then tell 40 | Xapian at search time to sort by the value in that slot. It is also 41 | possible to sort by the values in several slots (e.g., to sort items which 42 | have the same value in a particular slot by the value in a secondary slot). 43 | 44 | Finally, it is possible to ask Xapian to return the documents in order of 45 | the Xapian document ID numbers. 
46 | -------------------------------------------------------------------------------- /data/us_states_on_wikipedia: -------------------------------------------------------------------------------- 1 | Washington_(state) 2 | Oregon 3 | Montana 4 | Idaho 5 | Nevada 6 | California 7 | Utah 8 | Wyoming 9 | Colorado 10 | New_Mexico 11 | Alaska 12 | Hawaii 13 | North_Dakota 14 | South_Dakota 15 | Nebraska 16 | Kansas 17 | Texas 18 | Minnesota 19 | Wisconsin 20 | Iowa 21 | Oklahoma 22 | Missouri 23 | Arizona 24 | Arkansas 25 | Louisiana 26 | Mississippi 27 | Illinois 28 | Indiana 29 | Michigan 30 | Ohio 31 | Kentucky 32 | Tennessee 33 | Alabama 34 | Georgia_(U.S._state) 35 | Florida 36 | South_Carolina 37 | North_Carolina 38 | Virginia 39 | West_Virginia 40 | Pennsylvania 41 | New_York 42 | Vermont 43 | New_Jersey 44 | Maine 45 | New_Hampshire 46 | Massachusetts 47 | Rhode_Island 48 | Connecticut 49 | Delaware 50 | Maryland 51 | -------------------------------------------------------------------------------- /deprecation/index.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | Deprecation of features 3 | ======================= 4 | 5 | .. toctree:: 6 | :maxdepth: 2 7 | 8 | deprecation 9 | features_deprecated 10 | features_removed 11 | -------------------------------------------------------------------------------- /howtos/index.rst: -------------------------------------------------------------------------------- 1 | How To... 2 | ========= 3 | 4 | .. 
toctree:: 5 | :maxdepth: 2 6 | 7 | boolean_filters 8 | range_queries 9 | facets 10 | sorting 11 | collapsing 12 | spelling 13 | synonyms 14 | weighting_scheme 15 | iterate_all_docs 16 | -------------------------------------------------------------------------------- /howtos/iterate_all_docs.rst: -------------------------------------------------------------------------------- 1 | Iterate through all documents 2 | ============================= 3 | 4 | Sometimes you want to access all the documents in a Xapian database. This can actually be done in two ways: 5 | 6 | .. _match-all: 7 | 8 | MatchAll Queries 9 | ---------------- 10 | 11 | The `Xapian::Query::MatchAll` query is a special static query which will match all documents in the database. 12 | If you run this query on its own, with appropriate start and end parameters, you could retrieve all the documents. 13 | However, be aware that even if you paged through the result sets, when you try to access a page deep in the result 14 | set a lot of processing and memory will be used even if the page is small, so running a plain `MatchAll` query is 15 | rarely a good idea. 16 | 17 | However, this method *is* appropriate if you're constructing a complicated query, and one of the components of that 18 | query should be all the documents. In particular, since Xapian doesn't support a unary `NOT` operator, if you want to 19 | run a "pure NOT" query to retrieve all documents which do not contain a given term, this can only be done using a 20 | `MatchAll` query and the binary `NOT` operator. 21 | 22 | .. todo: Need an example here, and probably some rewording of the previous paragraph. 23 | 24 | .. note: MatchAll queries can also be created by constructing a query with an empty term: the MatchAll class is 25 | .. syntactic sugar for this, and avoids you needing to create an instance of a query for this.
26 | 27 | Iterating through all documents 28 | ------------------------------- 29 | 30 | If you do need access to all the documents in the database, it is better to use a "posting list iterator". 31 | Such an iterator, which returns all documents in the database, can be created using:: 32 | 33 | Xapian::Database::postlist_begin("") 34 | 35 | In Xapian, a postlist is a list of the documents in which a term exists. Here, we're again using the special 36 | "empty" term, which implicitly matches all documents, to get an iterator over all documents. 37 | 38 | The iterator can be dereferenced to get the document IDs; to get the actual documents, the 39 | :xapian-method:`Database::get_document()` method should be used. 40 | 41 | .. todo: Need an example here, and probably some rewording. 42 | -------------------------------------------------------------------------------- /index.rst: -------------------------------------------------------------------------------- 1 | Getting Started with Xapian |version| 2 | ===================================== 3 | 4 | .. note:: 5 | 6 | Not all Xapian functionality is yet documented in this guide, so 7 | once you've gone through it you may wish to look at our `online API 8 | documentation`_ and also at some of the additional help available 9 | on `the Xapian wiki`_. 10 | 11 | .. _online API documentation: https://xapian.org/docs/apidoc/html/annotated.html 12 | .. _the Xapian wiki: https://trac.xapian.org/wiki/ 13 | 14 | .. toctree:: 15 | :maxdepth: 2 16 | 17 | overview 18 | language_specific 19 | concepts/index 20 | practical_example/index 21 | howtos/index 22 | advanced/index 23 | deprecation/index 24 | glossary 25 | LICENSE 26 | -------------------------------------------------------------------------------- /language_specific.rst: -------------------------------------------------------------------------------- 1 | .. 
xapianinclude:: language_specific/LANGUAGE/index.rst 2 | -------------------------------------------------------------------------------- /language_specific/c++/index.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | C++ Specific Notes 3 | ================== 4 | 5 | Exceptions 6 | ========== 7 | 8 | Xapian reports errors by throwing exceptions. For failures in things like 9 | memory allocation, you will see exceptions derived from ``std::exception``, 10 | but exceptions related to Xapian-specific issues will be derived from 11 | ``Xapian::Error``. 12 | 13 | Uncaught exceptions will cause your program to terminate, so it's wise 14 | to at least have a top-level exception handler which can catch any 15 | exceptions and report what they were. You can call the ``get_description()`` 16 | method on a ``Xapian::Error`` object to get a human readable string including 17 | all the information the object contains. 18 | 19 | Because ``Xapian::Error`` is an abstract base class you need to catch 20 | it by reference:: 21 | 22 | try { 23 | do_something_with_xapian(); 24 | } catch (const Xapian::Error & e) { 25 | cout << "Exception: " << e.get_description() << '\n'; 26 | } catch (const std::exception & e) { 27 | cout << "Exception: " << e.what() << '\n'; 28 | } 29 | 30 | .. todo:: Xapian::Error hierarchy 31 | 32 | Object Copying 33 | ============== 34 | 35 | Objects are either reference counted handles or relatively cheap to copy. 36 | 37 | Object Ownership 38 | ================ 39 | 40 | Creator owns. 41 | 42 | .. todo:: write me 43 | 44 | STL Compatibility 45 | ================= 46 | 47 | .. 
todo:: write me 48 | -------------------------------------------------------------------------------- /language_specific/csharp/index.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xapian/xapian-docsprint/036dc8787a17b6a87d9c86fc4ef48a60f7abb3fa/language_specific/csharp/index.rst -------------------------------------------------------------------------------- /language_specific/java/index.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xapian/xapian-docsprint/036dc8787a17b6a87d9c86fc4ef48a60f7abb3fa/language_specific/java/index.rst -------------------------------------------------------------------------------- /language_specific/java/running_examples.rst: -------------------------------------------------------------------------------- 1 | Since there isn't a standard location to install third-party Java 2 | libraries, you will likely have to set the ``CLASSPATH`` variable 3 | appropriately to indicate that you wish to use the Xapian Java 4 | bindings. 5 | 6 | There are two parts to the bindings: a jarfile (``xapian.jar``) 7 | containing the Java classes, and the JNI library (such as 8 | ``libxapian_jni.so`` on Linux, or ``libxapian_jni.jnilib`` on macOS) 9 | that connects them to Xapian itself. The easiest way to get this 10 | working is to copy those two files to the top-level directory of this 11 | repository. If you built your own Java bindings, the files will be in 12 | ``java/built`` in the bindings source code. Then you can use the 13 | following classpath (if on Linux):: 14 | 15 | xapian.jar:libxapian_jni.so:. 16 | 17 | If you set the ``CLASSPATH`` variable to this, then the example 18 | commands will work as shown. For instance, if you're using the 19 | ``bash`` shell, you should run the following before any example 20 | commands (again, on Linux):: 21 | 22 | export CLASSPATH=xapian.jar:libxapian_jni.so:. 
23 | -------------------------------------------------------------------------------- /language_specific/lua/index.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xapian/xapian-docsprint/036dc8787a17b6a87d9c86fc4ef48a60f7abb3fa/language_specific/lua/index.rst -------------------------------------------------------------------------------- /language_specific/perl/index.rst: -------------------------------------------------------------------------------- 1 | =================== 2 | Perl Specific Notes 3 | =================== 4 | 5 | Unicode 6 | ####### 7 | 8 | The Unicode support in Perl is good and straightforward as long as you 9 | understand how it works. A string can be either a byte string 10 | (encoded) or a character string (decoded). The correct way to deal 11 | with this matter is to decode the strings on input and encode them on 12 | output, while the code should deal with characters (not bytes, so a 13 | character with diacritics is seen as a single character, not 2 or more 14 | bytes). 15 | 16 | Typically, this is done this way:: 17 | 18 | #!/usr/bin/env perl 19 | use utf8; # this says that in this file we can use unicode and will be decoded 20 | use strict; 21 | use warnings; 22 | # this encodes on output 23 | binmode STDOUT, ":encoding(UTF-8)"; 24 | binmode STDERR, ":encoding(UTF-8)"; 25 | 26 | # this opens a file and decodes it. 27 | open (my $in, '<:encoding(UTF-8)', $file); 28 | while (<$in>) { .... } 29 | 30 | # this opens a file for writing and encodes the output on print 31 | open (my $out, '>:encoding(UTF-8)', $file); 32 | print $out "Đe ši Šu\n"; 33 | 34 | Also, database drivers usually have a (recommended) setting to decode 35 | the strings coming out from the DB and encode them before storing 36 | them, so the code deals transparently with characters. 37 | 38 | How does this apply to Xapian? You usually store strings with 39 | ``set_data`` and ``add_value``.
Such fields are binary fields, so they 40 | want bytes. If you pass a decoded string, it will be silently encoded. 41 | When you are going to retrieve them, the data will come out encoded, 42 | as a string of bytes, and you need to be prepared for it. You can do 43 | this using serialization. The example code stores the document data 44 | using ``encode_json`` (which produces a byte string) and on retrieving 45 | it calls ``decode_json`` (which returns decoded values). When you 46 | store a value, you encode it with ``encode`` or with the 47 | ``sortable_serialise``. Both functions produce bytes:: 48 | 49 | use Encode qw/encode decode/; 50 | use JSON::MaybeXS; 51 | # .... 52 | $doc->set_data(encode_json($rec)); 53 | $doc->add_value(0, encode('UTF-8', $string)); 54 | $doc->add_value(1, Search::Xapian::sortable_serialise($value)); 55 | 56 | If you retrieve a stored value, you need to decode it:: 57 | 58 | use Encode qw/encode decode/; 59 | use JSON::MaybeXS; 60 | # ... 61 | my $string = decode('UTF-8', $doc->get_value(0)); 62 | my $fields = decode_json($doc->get_data); 63 | 64 | See :xapian-code-example:`index_facets` and 65 | :xapian-code-example:`search_facets` for some example code. 
66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /language_specific/php/index.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xapian/xapian-docsprint/036dc8787a17b6a87d9c86fc4ef48a60f7abb3fa/language_specific/php/index.rst -------------------------------------------------------------------------------- /language_specific/ruby/index.rst: -------------------------------------------------------------------------------- 1 | =================== 2 | Ruby Specific Notes 3 | =================== 4 | -------------------------------------------------------------------------------- /language_specific/tcl/index.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xapian/xapian-docsprint/036dc8787a17b6a87d9c86fc4ef48a60f7abb3fa/language_specific/tcl/index.rst -------------------------------------------------------------------------------- /practical_example/index.rst: -------------------------------------------------------------------------------- 1 | .. _a-practical-example: 2 | 3 | A practical example 4 | =================== 5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | 9 | indexing/index 10 | searching/index 11 | -------------------------------------------------------------------------------- /practical_example/indexing/building_a_museum_catalogue.rst: -------------------------------------------------------------------------------- 1 | Building a museum catalogue 2 | =========================== 3 | 4 | We're going to build a simple search system based on museum catalogue 5 | data released under the `Creative Commons 6 | Attribution-NonCommercial-ShareAlike 7 | <https://creativecommons.org/licenses/by-nc-sa/3.0/>`_ license by the 8 | `Science Museum in London, UK <https://www.sciencemuseum.org.uk/>`_. 
9 | 10 | Preparing to run the examples 11 | ----------------------------- 12 | 13 | You should download both the two sample datasets and example code as 14 | described in the :ref:`overview <overview>`, 15 | and also check that you've installed Xapian as detailed there. 16 | 17 | .. The code is provided as a gzipped tar file, which you should unpack 18 | .. into the directory you're going to use while working through this 19 | .. guide. The datasets are gzipped CSV files, which should be 20 | .. uncompressed into the same directory. You should then open an 21 | .. interactive shell in that directory. For instance, if you're using 22 | .. Python for the examples, run something like the following:: 23 | .. 24 | .. $ mkdir xapian-guide 25 | .. $ cd xapian-guide 26 | .. $ wget https://xapian.org/docs/examples/python.tgz 27 | .. $ wget https://xapian.org/data/muscat-data.csv.gz 28 | .. $ wget https://xapian.org/data/states-data.csv.gz 29 | .. $ gzip -dc python.tgz | tar xvf - && rm python.tgz 30 | .. $ gzip -d muscat-data.csv.gz 31 | .. $ gzip -d states-data.csv.gz 32 | .. 33 | .. This will leave you with two files, `muscat.csv` and `states.csv`, and 34 | .. a directory `code` which itself contains a directory `python` which 35 | .. contains all the example code. 36 | -------------------------------------------------------------------------------- /practical_example/indexing/index.rst: -------------------------------------------------------------------------------- 1 | Indexing 2 | ======== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 2 6 | 7 | building_a_museum_catalogue 8 | what_data_is_there 9 | what_do_people_want_to_search_for 10 | index_plan 11 | writing_the_code 12 | verifying_the_index 13 | updating_the_database 14 | -------------------------------------------------------------------------------- /practical_example/indexing/index_plan.rst: -------------------------------------------------------------------------------- 1 | The index plan 2 | -------------- 3 | In order to index the CSV, we want to take two fields from each row, title 4 | and description, and turn them into suitable terms. For straightforward 5 | textual search we don't need document values. 6 | 7 | Because we're dealing with free text, and because we know the whole dataset 8 | is in English, we can use stemming so that for instance searching for 9 | "sundial" and "sundials" will both match the same documents. This way 10 | people don't need to worry too much about exactly which words to use in 11 | their query. 12 | 13 | Finally, we want a way of separating the two fields. In Xapian this is done 14 | using `term prefixes`, basically by putting short strings at the beginning 15 | of terms to indicate which field the term indexes. As well as prefixed 16 | terms, we also want to generate unprefixed terms, so that as well as 17 | searching within fields you can also search for text in any field. 18 | 19 | There are some conventional prefixes used, which is helpful if you ever need to 20 | interoperate with omega (a web-based search engine) or other compatible 21 | systems. From this, we'll use 'S' to prefix title (it stands for 'subject'), and 22 | for description we'll use 'XD'. A full list of conventional prefixes is given at 23 | the top of the `omega documentation on termprefixes`_. 24 | 25 | .. 
_omega documentation on termprefixes: https://xapian.org/docs/omega/termprefixes 26 | 27 | When you're indexing multiple fields like this, the term positions used for 28 | each field when indexed unprefixed need to be kept apart. Say you have a 29 | title of "The Saints", and description "Don't like rabbits? Keep reading." 30 | If you index those fields without a gap, the phrase search "Saints don't 31 | like rabbits" will match, where it really shouldn't. Usually a gap of 100 32 | between each field is enough. 33 | 34 | To write to a database, we use the WritableDatabase class, which allows us 35 | to create, update or overwrite a database. 36 | 37 | To create terms, we use Xapian's TermGenerator, a built-in class to make 38 | turning free text into terms easier. It will split into words, apply 39 | stemming, and then add term prefixes as needed. It can also take care of 40 | term positions, including the gap between different fields. 41 | -------------------------------------------------------------------------------- /practical_example/indexing/updating_the_database.rst: -------------------------------------------------------------------------------- 1 | Updating the database 2 | --------------------- 3 | 4 | If you look back at the verifying step of the database, you may notice 5 | that the first item we have indexed has the word 'compass' spelled 6 | incorrectly, which means that we will need to either update just that 7 | document, or to re-index the entire database. 8 | 9 | Reindexing the database can be done immediately using the :xapian-basename-code-example:`index1` script 10 | we used for the initial indexing; this is because we are using an external 11 | ID for each document we add to the database, taken from the `id_NUMBER` 12 | field from the original data set. 
We then pass this to the :xapian-method:`Database::replace_document()` 13 | method, which updates if there's already a document under that external ID, 14 | or adds a document to the database otherwise. 15 | 16 | In fact, because of this, :xapian-basename-code-example:`index1` can update just part of the 17 | database. Just give it a file with only the rows that correspond to 18 | documents that need updating. Everything else in the database will be 19 | left untouched. 20 | 21 | Deleting documents 22 | ~~~~~~~~~~~~~~~~~~ 23 | 24 | It is also possible to delete documents from the index using the 25 | :xapian-method:`Database::delete_document()` method on a 26 | :xapian-class:`WritableDatabase` object. This can be done either by Xapian docid 27 | or using unique ID terms, as with :xapian-method:`Database::replace_document()`. 28 | 29 | .. xapianexample:: delete1 30 | 31 | A copy of this code is available in :xapian-code-example:`^`. 32 | 33 | Then we just run our deletion tool, giving it identifiers taken from 34 | the `id_NUMBER` field in the data set: 35 | 36 | .. xapianrunexample:: index1 37 | :silent: 38 | :args: data/100-objects-v1.csv db 39 | 40 | .. xapianrunexample:: delete1 41 | :args: db 1953-448 1985-438 42 | 43 | After that, we expect to see two fewer documents in our database using xapian-delve: 44 | 45 | .. 
code-block:: none 46 | 47 | $ xapian-delve db 48 | UUID = 1820ef0a-055b-4946-ae73-67aa4ef5c226 49 | number of documents = 98 50 | average document length = 100.041 51 | document length lower bound = 33 52 | document length upper bound = 251 53 | highest document id ever used = 100 54 | has positional information = true 55 | -------------------------------------------------------------------------------- /practical_example/indexing/verifying_the_index.rst: -------------------------------------------------------------------------------- 1 | Verifying the index using xapian-delve 2 | -------------------------------------- 3 | 4 | Xapian comes with a handy utility called `xapian-delve` which can be used to 5 | inspect a database, so let's look at the one you just built. If you just 6 | pass a database path as a parameter you'll get an overview: how many documents, 7 | average document length, and some other statistics: 8 | 9 | .. code-block:: none 10 | 11 | $ xapian-delve db 12 | UUID = 1820ef0a-055b-4946-ae73-67aa4ef5c226 13 | number of documents = 100 14 | average document length = 100.58 15 | document length lower bound = 33 16 | document length upper bound = 251 17 | highest document id ever used = 100 18 | has positional information = true 19 | 20 | You can also look at an individual document, using Xapian's docid (``-d`` 21 | means output document data as well): 22 | 23 | .. code-block:: none 24 | 25 | $ xapian-delve -r 1 -d db # output has been reformatted 26 | Data for record #1: 27 | { 28 | "MEASUREMENTS": "", 29 | "DESCRIPTION": "Ansonia Sunwatch (pocket compas dial)", 30 | "PLACE_MADE": "New York county, New York state, United States", 31 | "id_NUMBER": "1974-100", 32 | "WHOLE_PART": "WHOLE", 33 | "TITLE": "Ansonia Sunwatch (pocket compas dial)", 34 | "DATE_MADE": "1922-1939", 35 | "COLLECTION": "SCM - Time Measurement", 36 | "ITEM_NAME": "Pocket horizontal sundial", 37 | "MATERIALS": "", 38 | "MAKER": "Ansonia Clock Co."
39 | } 40 | Term List for record #1: Q1974-100 Sansonia Scompas Sdial Spocket 41 | Ssunwatch XDansonia XDcompass XDdial XDpocket XDsunwatch ZSansonia 42 | ZScompas ZSdial ZSpocket ZSsunwatch ZXDansonia ZXDcompas ZXDdial 43 | ZXDpocket ZXDsunwatch Zansonia Zcompass Zdial Zpocket Zsunwatch 44 | ansonia compass dial pocket sunwatch 45 | 46 | You can also go the other way, starting with a term and finding both 47 | statistics and which documents it indexes: 48 | 49 | .. code-block:: none 50 | 51 | $ xapian-delve -t Stime db 52 | Posting List for term `Stime' (termfreq 4, collfreq 4, wdf_max 4): 53 | 41 56 58 65 54 | 55 | This means you can look documents up by identifier: 56 | 57 | .. code-block:: none 58 | 59 | $ xapian-delve -t Q1974-100 db 60 | Posting List for term `Q1974-100' (termfreq 1, collfreq 1, wdf_max 1): 61 | 1 62 | 63 | ``xapian-delve`` is frequently useful if you aren't getting the behaviour you 64 | expect from a search system, to check that the database contains the 65 | documents and terms you expect. 66 | -------------------------------------------------------------------------------- /practical_example/indexing/what_data_is_there.rst: -------------------------------------------------------------------------------- 1 | What data is there? 2 | ------------------- 3 | 4 | Each row in the CSV file is an object from the catalogue, and has a number 5 | of fields. 
There are: 6 | 7 | id_NUMBER: 8 | a unique identifier 9 | ITEM_NAME: 10 | a simple name, often from an established thesaurus 11 | TITLE: 12 | a short caption 13 | MAKER: 14 | the name of who made the object 15 | DATE_MADE: 16 | when the object was made, which may be a range, approximate date or 17 | unknown 18 | PLACE_MADE: 19 | where the object was made 20 | MATERIALS: 21 | what the object is made of 22 | MEASUREMENTS: 23 | the dimensions of the object 24 | DESCRIPTION: 25 | a description of the object 26 | COLLECTION: 27 | the collection the object came from (e.g. Science Museum - Space 28 | Technology) 29 | 30 | There are obviously a number of different types of data here: free text, 31 | identifiers, dates, places (which could be geocoded to geo coordinates), 32 | and dimensions. Additionally, COLLECTION and MAKER both come from a list of 33 | possible values. 34 | -------------------------------------------------------------------------------- /practical_example/indexing/what_do_people_want_to_search_for.rst: -------------------------------------------------------------------------------- 1 | What do people want to search for? 2 | ---------------------------------- 3 | 4 | We can think of a large number of different things that people might want 5 | to find from our catalogue. For instance, they may want to find objects 6 | that were created in Nantes, or after 1812, or by Hurd-Brown Ltd. They may 7 | want to find everything made of brass, or not containing wood, or more than 8 | a metre in length. They may care only about objects in the National Railway 9 | Museum, or in their Railway Heraldry collection, or everything not in the 10 | Railway Heraldry collection. And of course they may want to look up objects 11 | that have certain words or phrases in the title or description - "free text 12 | search", one of the most common uses of search today.
13 | 14 | In order to support all of this we'll need to use many of the features of 15 | Xapian, but to get started we'll just look at one: free text search of the 16 | title and description. 17 | 18 | In later sections of this guide we'll use the same data and build on the 19 | system we create here. 20 | -------------------------------------------------------------------------------- /practical_example/indexing/writing_the_code.rst: -------------------------------------------------------------------------------- 1 | Let's write some code 2 | --------------------- 3 | 4 | Here's the significant part of some example code to implement this index plan. 5 | 6 | .. xapianexample:: index1 7 | 8 | A full copy of this code is available in :xapian-code-example:`^`. 9 | 10 | You can run this code to index a sample data file (held in 11 | :xapian-example:`data/100-objects-v1.csv`) to a database at path ``db`` as follows: 12 | 13 | .. xapianrunexample:: index1 14 | :cleanfirst: db 15 | :args: data/100-objects-v1.csv db 16 | -------------------------------------------------------------------------------- /practical_example/searching/building.rst: -------------------------------------------------------------------------------- 1 | Building the search 2 | ------------------- 3 | 4 | Now we have our database populated with some values, it is time for 5 | the code to search the database and display some results. 6 | 7 | We want to take some text from the user, and search for it in the 8 | database; to do that we need to convert it into a Xapian Query, which 9 | you will recall is a tree made up of terms (which in this case will be 10 | the stemmed forms of words in the text from the user), and operations 11 | such as AND, OR and so forth. 12 | 13 | There are many ways to go from the user's text to a Query, but the 14 | most simple of these is to use the QueryParser. 
We then pass the Query 15 | to an Enquire object, which also needs setting up with a database, and 16 | is where you'd set various other options that affect how the query is 17 | run (such as sorting, for instance) which we won't address here. 18 | 19 | .. xapianexample:: search1 20 | 21 | A full copy of this code is available in :xapian-code-example:`^`. 22 | -------------------------------------------------------------------------------- /practical_example/searching/database_modified.rst: -------------------------------------------------------------------------------- 1 | Database Modified 2 | ----------------- 3 | 4 | If you're updating the same database you search from (rather than 5 | updating a separate database and then "flipping" between them, using a 6 | stub database), you may run into :xapian-class:`DatabaseModifiedError`, an 7 | exception that can be raised while reading from the database. What 8 | this means is that the database has changed too much since you opened 9 | it for Xapian to be able to continue supplying you with 10 | information. The solution here is to re-open the database with its 11 | :xapian-just-method:`reopen()` method. 12 | 13 | -------------------------------------------------------------------------------- /practical_example/searching/index.rst: -------------------------------------------------------------------------------- 1 | Searching 2 | ========= 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | 7 | building 8 | running_the_search 9 | prefix 10 | database_modified 11 | -------------------------------------------------------------------------------- /practical_example/searching/prefix.rst: -------------------------------------------------------------------------------- 1 | Searching separate fields 2 | ------------------------- 3 | 4 | When we built our index, we used prefixes to separate the terms generated from 5 | the title and description fields. 
This allows us to perform searches which are 6 | restricted to the text in just one of those fields, by searching only terms 7 | with the desired prefix. 8 | 9 | When using the Query Parser, it is possible to restrict your search to 10 | certain prefixed terms (e.g. title, or description). These can be searched 11 | for either by using a search prefix (which can correlate to an indexing 12 | prefix) or as a general text document. 13 | 14 | To set up a search prefix, the QueryParser needs to be told which prefixes 15 | in the search query relate to those in the index. We did that in the previous 16 | search code: 17 | 18 | .. xapianrunexample:: index1 19 | :silent: 20 | :args: data/100-objects-v1.csv db 21 | 22 | .. xapianrunexample:: delete1 23 | :silent: 24 | :args: db 1953-448 1985-438 25 | 26 | .. xapianexample:: search1 27 | :marker: prefix configuration. 28 | 29 | This allows us to perform a search based on either field, for example: 30 | 31 | .. xapianrunexample:: search1 32 | :args: db title:sunwatch 33 | 34 | We can also combine prefixes with the logical operators to perform more 35 | complex queries (note that we need to escape quotes or else the shell 36 | will eat them): 37 | 38 | .. xapianrunexample:: search1 39 | :args: db description:\"leather case\" AND title:sundial 40 | -------------------------------------------------------------------------------- /practical_example/searching/running_the_search.rst: -------------------------------------------------------------------------------- 1 | Running a Search 2 | ---------------- 3 | To search the database we've built, you just run our simple search engine: 4 | 5 | .. xapianrunexample:: index1 6 | :silent: 7 | :args: data/100-objects-v1.csv db 8 | 9 | .. xapianrunexample:: delete1 10 | :silent: 11 | :args: db 1953-448 1985-438 12 | 13 | .. 
xapianrunexample:: search1 14 | :args: db watch 15 | 16 | These results show that 7 documents match our search for the term 17 | 'watch', providing the document IDs (e.g. #004) and title for each. 18 | If you want to search for multiple words, just chain them together on 19 | the command line: 20 | 21 | .. xapianrunexample:: search1 22 | :args: db Dent watch 23 | 24 | You'll notice that all of the results from the first time come back 25 | the second time also, with additional ones (those that match 'Dent' but not 26 | 'watch'), because by default QueryParser will use the OR operator to 27 | combine the different search terms. Also, because #046 contains both 28 | 'Dent' and 'watch', it now ranks highest of all the matches. 29 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Sphinx~=1.8.0 2 | --------------------------------------------------------------------------------