├── NEWS ├── AUTHORS ├── swig ├── perl │ ├── run_sample.sh │ ├── t │ │ ├── 00_compile.t │ │ ├── 02_usage.t │ │ └── 01_usage.t │ ├── prepare.sh │ ├── Makefile.PL.in │ └── sample.pl ├── java │ ├── run_sample.sh │ ├── prepare.sh │ ├── Makefile │ └── Sample.java ├── php │ ├── build_ext.sh │ ├── prepare.sh │ └── Makefile ├── doc │ ├── footer.html │ └── header.html ├── ruby │ ├── extconf.rb.in │ ├── prepare.sh │ ├── sample_unicode.rb │ └── sample.rb ├── python │ ├── prepare.sh │ ├── sample.py │ ├── sample_unicode.py │ └── setup.py.in ├── export.i ├── Makefile.am ├── INSTALL ├── export.h └── export.cpp ├── frontend ├── frontend.vcproj ├── Makefile.am ├── optparse.h └── main.cpp ├── doc ├── footer.html └── header.html ├── ChangeLog ├── sample ├── Makefile.am ├── sample_unicode.cpp └── sample.cpp ├── Makefile.am ├── include ├── Makefile.am └── simstring │ ├── memory_mapped_file.h │ ├── measure.h │ ├── memory_mapped_file_posix.h │ ├── memory_mapped_file_win32.h │ ├── ngram.h │ ├── cdbpp.h │ └── simstring.h ├── autogen.sh ├── simstring.sln ├── COPYING ├── README ├── configure.in └── win32 └── stdint.h /NEWS: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Naoaki Okazaki 2 | -------------------------------------------------------------------------------- /swig/perl/run_sample.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | perl -I./blib/arch sample.pl 3 | -------------------------------------------------------------------------------- /swig/java/run_sample.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH java Sample 3 | 4 | -------------------------------------------------------------------------------- 
/frontend/frontend.vcproj: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chokkan/simstring/HEAD/frontend/frontend.vcproj -------------------------------------------------------------------------------- /swig/php/build_ext.sh: -------------------------------------------------------------------------------- 1 | g++ `php-config --includes` -fPIC -c export_wrap.cpp export.cpp 2 | g++ -shared export.o export_wrap.o -o simstring.so 3 | -------------------------------------------------------------------------------- /doc/footer.html: -------------------------------------------------------------------------------- 1 |
2 |
3 | Copyright (c) 2002-2010 by Naoaki Okazaki 4 |
$datetime 5 |
6 | 7 | 8 | -------------------------------------------------------------------------------- /swig/doc/footer.html: -------------------------------------------------------------------------------- 1 |
2 |
3 | Copyright (c) 2002-2010 by Naoaki Okazaki 4 |
$datetime 5 |
6 | 7 | 8 | -------------------------------------------------------------------------------- /swig/ruby/extconf.rb.in: -------------------------------------------------------------------------------- 1 | require 'mkmf' 2 | $CFLAGS='-I../../include' 3 | $CXXFLAGS='-I../../include' 4 | $LDFLAGS="-lstdc++" 5 | 6 | create_makefile('@PACKAGE@') 7 | 8 | -------------------------------------------------------------------------------- /swig/perl/t/00_compile.t: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | use Test::More tests => 2; 4 | use simstring; 5 | 6 | BEGIN{ 7 | use_ok 'simstring'; 8 | } 9 | require_ok( 'simstring' ); 10 | -------------------------------------------------------------------------------- /swig/perl/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # $Id:$ 3 | 4 | ln -s ../export.cpp 5 | ln -s ../export.h 6 | ln -s ../export.i 7 | 8 | if [ "$1" = "--swig" ]; 9 | then 10 | swig -c++ -perl -o export_wrap.cpp export.i 11 | fi 12 | -------------------------------------------------------------------------------- /swig/php/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | ln -s ../export.cpp 4 | ln -s ../export.h 5 | ln -s ../export.i 6 | 7 | if [ "$1" = "--swig" ]; 8 | then 9 | swig -c++ -php -prefix Simstring_ -o export_wrap.cpp export.i 10 | fi 11 | -------------------------------------------------------------------------------- /swig/python/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # $Id:$ 3 | 4 | ln -s ../export.cpp 5 | ln -s ../export.h 6 | ln -s ../export.i 7 | 8 | if [ "$1" = "--swig" ]; 9 | then 10 | swig -c++ -python -o export_wrap.cpp export.i 11 | fi 12 | -------------------------------------------------------------------------------- /swig/ruby/prepare.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # $Id:$ 3 | 4 | ln -s ../export.cpp 5 | ln -s ../export.h 6 | ln -s ../export.i 7 | 8 | if [ "$1" = "--swig" ]; 9 | then 10 | swig -c++ -ruby -o export_wrap.cpp export.i 11 | fi 12 | 13 | 14 | -------------------------------------------------------------------------------- /ChangeLog: -------------------------------------------------------------------------------- 1 | 2011-02-xx Naoaki Okazaki 2 | 3 | * SimString 1.1: 4 | - Implemented check() member function. 5 | 6 | 7 | 2010-03-07 Naoaki Okazaki 8 | 9 | * SimString 1.0: 10 | - Initial release. 11 | 12 | -------------------------------------------------------------------------------- /swig/java/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | ln -s ../export.cpp 4 | ln -s ../export.h 5 | ln -s ../export.i 6 | 7 | if [ "$1" = "--swig" ]; 8 | then 9 | mkdir simstring 10 | swig -c++ -java -package simstring -outdir simstring -o export_wrap.cpp export.i 11 | fi 12 | -------------------------------------------------------------------------------- /frontend/Makefile.am: -------------------------------------------------------------------------------- 1 | # $Id$ 2 | 3 | bin_PROGRAMS = simstring 4 | #man_MANS = simstring.1 5 | EXTRA_DIST = \ 6 | frontend.vcproj 7 | 8 | simstring_SOURCES = \ 9 | optparse.h \ 10 | main.cpp 11 | 12 | AM_CXXFLAGS = @CXXFLAGS@ 13 | INCLUDES = @INCLUDES@ 14 | AM_LDFLAGS = @LDFLAGS@ 15 | -------------------------------------------------------------------------------- /sample/Makefile.am: -------------------------------------------------------------------------------- 1 | # $Id: Makefile.am 3 2009-07-08 06:14:45Z naoaki $ 2 | 3 | noinst_PROGRAMS = sample sample_unicode 4 | 5 | sample_SOURCES = sample.cpp 6 | 7 | sample_unicode_SOURCES = sample_unicode.cpp 8 | 9 | AM_CXXFLAGS = @CXXFLAGS@ 10 | INCLUDES = @INCLUDES@ 11 | AM_LDFLAGS = 
@LDFLAGS@ 12 | -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | # $Id$ 2 | 3 | SUBDIRS = include frontend sample swig 4 | 5 | docdir = $(prefix)/share/doc/@PACKAGE@ 6 | doc_DATA = README INSTALL COPYING AUTHORS ChangeLog 7 | 8 | EXTRA_DIST = \ 9 | autogen.sh \ 10 | win32/stdint.h \ 11 | simstring.sln 12 | 13 | #AUTOMAKE_OPTIONS = foreign 14 | #ACLOCAL_AMFLAGS = -I m4 15 | -------------------------------------------------------------------------------- /swig/perl/Makefile.PL.in: -------------------------------------------------------------------------------- 1 | use 5.008; 2 | use strict; 3 | use warnings; 4 | use ExtUtils::MakeMaker; 5 | 6 | WriteMakefile( 7 | 'CC' => 'g++', 8 | 'LD' => 'g++', 9 | 'NAME' => '@PACKAGE@', 10 | 'VERSION' => '@VERSION@', 11 | 'OBJECT' => 'export.o export_wrap.o', 12 | 'OPTIMIZE' => '-O2', 13 | 'INC' => '-I../../include', 14 | ); 15 | -------------------------------------------------------------------------------- /include/Makefile.am: -------------------------------------------------------------------------------- 1 | # $Id$ 2 | 3 | simstringincludedir = $(includedir)/simstring 4 | 5 | simstringinclude_HEADERS = \ 6 | simstring/cdbpp.h \ 7 | simstring/memory_mapped_file.h \ 8 | simstring/memory_mapped_file_posix.h \ 9 | simstring/ngram.h \ 10 | simstring/measure.h \ 11 | simstring/simstring.h 12 | 13 | EXTRA_DIST = \ 14 | simstring/memory_mapped_file_win32.h 15 | -------------------------------------------------------------------------------- /swig/php/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # SWIG example make file for Linux 3 | # 4 | all: 5 | g++ `php-config --includes` -fPIC -c export.cpp -I../../include 6 | g++ `php-config --includes` -fPIC -c export_wrap.cpp 7 | g++ -shared export.o export_wrap.o -o simstring.so 8 | 9 | clean: 10 | rm -rf *.o 
*.so *.h *.cpp *.php export.* 11 | 12 | purge: 13 | rm -rf *.o *.so *.h *.cpp export.* 14 | -------------------------------------------------------------------------------- /swig/java/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # SWIG example make file for Linux 3 | # 4 | all: 5 | g++ -fPIC -c export.cpp -I../../include 6 | g++ -fPIC -c export_wrap.cpp 7 | g++ -shared export.o export_wrap.o -o libSimString.so 8 | 9 | sample: 10 | javac simstring/*.java 11 | javac Sample.java 12 | 13 | clean: 14 | rm -rf *.o *.so *.cpp *.class export.* simstring 15 | 16 | purge: 17 | rm -rf *.o *.so *.cpp export.* simstring 18 | -------------------------------------------------------------------------------- /swig/ruby/sample_unicode.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby -Ku 2 | 3 | require 'simstring' 4 | 5 | # Open a SimString database for writing with Unicode mode. 6 | db = Simstring::Writer.new('sample_unicode.db', 3, false, true) 7 | 8 | # Write a string, and close the database. 9 | db.insert('スパゲティ') 10 | db.close() 11 | 12 | 13 | # Open the database for reading. 14 | db = Simstring::Reader.new('sample_unicode.db') 15 | 16 | # Set a similarity measure and threshold. 17 | db.measure = Simstring::Cosine 18 | db.threshold = 0.6 19 | 20 | # Use an 8-bit string in UTF-8 encoding. 21 | p(db.retrieve('スパゲティー')) 22 | -------------------------------------------------------------------------------- /autogen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # $Id$ 3 | 4 | if [ "$1" = "--force" ]; 5 | then 6 | FORCE=--force 7 | NOFORCE= 8 | FORCE_MISSING=--force-missing 9 | else 10 | FORCE= 11 | NOFORCE=--no-force 12 | FORCE_MISSING= 13 | fi 14 | 15 | #libtoolize --copy $FORCE 2>&1 | sed '/^You should/d' || { 16 | # echo "libtoolize failed!" 
17 | # exit 1 18 | #} 19 | 20 | aclocal $FORCE || { 21 | echo "aclocal failed!" 22 | exit 1 23 | } 24 | 25 | autoheader $FORCE || { 26 | echo "autoheader failed!" 27 | exit 1 28 | } 29 | 30 | automake -a -c $NOFORCE || { 31 | echo "automake failed!" 32 | exit 1 33 | } 34 | 35 | autoconf $FORCE || { 36 | echo "autoconf failed!" 37 | exit 1 38 | } 39 | -------------------------------------------------------------------------------- /swig/export.i: -------------------------------------------------------------------------------- 1 | %module simstring 2 | 3 | %{ 4 | #include "export.h" 5 | %} 6 | 7 | %include "std_string.i" 8 | %include "std_vector.i" 9 | %include "exception.i" 10 | 11 | namespace std { 12 | %template(StringVector) vector; 13 | } 14 | 15 | %exception { 16 | try { 17 | $action 18 | } catch(const std::invalid_argument& e) { 19 | SWIG_exception(SWIG_IOError, e.what()); 20 | } catch(const std::runtime_error& e) { 21 | SWIG_exception(SWIG_RuntimeError, e.what()); 22 | } catch (const std::exception& e) { 23 | SWIG_exception(SWIG_RuntimeError, e.what()); 24 | } catch(...) 
{ 25 | SWIG_exception(SWIG_RuntimeError,"Unknown exception"); 26 | } 27 | } 28 | 29 | %include "export.h" 30 | 31 | -------------------------------------------------------------------------------- /swig/Makefile.am: -------------------------------------------------------------------------------- 1 | # $Id:$ 2 | 3 | EXTRA_DIST = \ 4 | python/prepare.sh \ 5 | python/setup.py.in \ 6 | python/export_wrap.cpp \ 7 | python/simstring.py \ 8 | python/sample.py \ 9 | python/sample_unicode.py \ 10 | ruby/prepare.sh \ 11 | ruby/extconf.rb.in \ 12 | ruby/export_wrap.cpp \ 13 | ruby/sample.rb \ 14 | ruby/sample_unicode.rb \ 15 | perl/prepare.sh \ 16 | perl/Makefile.PL.in \ 17 | perl/export_wrap.cpp \ 18 | perl/simstring.pm \ 19 | perl/run_sample.sh \ 20 | perl/sample.pl \ 21 | perl/t/00_compile.t \ 22 | perl/t/01_usage.t \ 23 | perl/t/02_usage.t \ 24 | java/prepare.sh \ 25 | java/Makefile \ 26 | java/export_wrap.cpp \ 27 | java/simstring/*.java \ 28 | java/run_sample.sh \ 29 | java/Sample.java \ 30 | export.i \ 31 | export.h \ 32 | export.cpp 33 | -------------------------------------------------------------------------------- /swig/ruby/sample.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'simstring' 4 | 5 | # Create a SimString database with two person names. 6 | db = Simstring::Writer.new('sample.db') 7 | db.insert('Barack Hussein Obama II') 8 | db.insert('James Gordon Brown') 9 | db.close() 10 | 11 | 12 | # Open the database for reading. 13 | db = Simstring::Reader.new('sample.db') 14 | 15 | # Use cosine similarity and threshold 0.6. 16 | db.measure = Simstring::Cosine 17 | db.threshold = 0.6 18 | p(db.retrieve('Barack Obama')) # OK. 19 | p(db.retrieve('Gordon Brown')) # OK. 20 | p(db.retrieve('Obama')) # Too dissimilar! 21 | 22 | # Use overlap coefficient and threshold 1.0. 23 | db.measure = Simstring::Overlap 24 | db.threshold = 1 25 | p(db.retrieve('Obama')) # OK. 
26 | -------------------------------------------------------------------------------- /doc/header.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | SimString: A fast and efficient implementation of approximate string matching 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /swig/python/sample.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import simstring 4 | 5 | # Create a SimString database with two person names. 6 | db = simstring.writer('sample.db') 7 | db.insert('Barack Hussein Obama II') 8 | db.insert('James Gordon Brown') 9 | db.close() 10 | 11 | 12 | # Open the database for reading. 13 | db = simstring.reader('sample.db') 14 | 15 | # Use cosine similarity and threshold 0.6. 16 | db.measure = simstring.cosine 17 | db.threshold = 0.6 18 | print(db.retrieve('Barack Obama')) # OK. 19 | print(db.retrieve('Gordon Brown')) # OK. 20 | print(db.retrieve('Obama')) # Too dissimilar! 21 | 22 | # Use overlap coefficient and threshold 1.0. 23 | db.measure = simstring.overlap 24 | db.threshold = 1. 25 | print(db.retrieve('Obama')) # OK. 26 | -------------------------------------------------------------------------------- /swig/doc/header.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | SimString: A fast and efficient implementation of approximate string matching 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /swig/perl/sample.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use warnings; 4 | use strict; 5 | use simstring; 6 | 7 | # Create a SimString database with two person names. 
8 | my $db = simstring::writer->new('sample.db'); 9 | $db->insert('Barack Hussein Obama II'); 10 | $db->insert('James Gordon Brown'); 11 | $db->close; 12 | 13 | # Open the database for reading. 14 | $db = simstring::reader->new('sample.db'); 15 | 16 | # Use cosine similarity and threshold 0.6. 17 | $db->swig_measure_set($simstring::cosine); 18 | $db->swig_threshold_set(0.6); 19 | print @{ $db->retrieve('Barack Obama') }, "\n"; # OK. 20 | print @{ $db->retrieve('Gordon Brown') }, "\n"; # OK. 21 | print @{ $db->retrieve('Obama') }, "\n"; # Too dissimilar! 22 | 23 | # Use overlap coefficient and threshold 1.0. 24 | $db->swig_measure_set($simstring::overlap); 25 | $db->swig_threshold_set(1); 26 | print @{ $db->retrieve('Obama') }, "\n"; # OK. 27 | 28 | __END__ 29 | -------------------------------------------------------------------------------- /simstring.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 11.00 3 | # Visual Studio 2010 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "frontend", "frontend\frontend.vcxproj", "{DE6A2D5D-8AFA-4471-A9A2-C8E671CCC301}" 5 | EndProject 6 | Global 7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 8 | Debug|Win32 = Debug|Win32 9 | Release|Win32 = Release|Win32 10 | EndGlobalSection 11 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 12 | {DE6A2D5D-8AFA-4471-A9A2-C8E671CCC301}.Debug|Win32.ActiveCfg = Debug|Win32 13 | {DE6A2D5D-8AFA-4471-A9A2-C8E671CCC301}.Debug|Win32.Build.0 = Debug|Win32 14 | {DE6A2D5D-8AFA-4471-A9A2-C8E671CCC301}.Release|Win32.ActiveCfg = Release|Win32 15 | {DE6A2D5D-8AFA-4471-A9A2-C8E671CCC301}.Release|Win32.Build.0 = Release|Win32 16 | EndGlobalSection 17 | GlobalSection(SolutionProperties) = preSolution 18 | HideSolutionNode = FALSE 19 | EndGlobalSection 20 | EndGlobal 21 | -------------------------------------------------------------------------------- 
/swig/python/sample_unicode.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | """ 5 | A Unicode sample. 6 | 7 | We assume that the source code is written in UTF-8 encoding (see the 8 | encoding declaration in line 2). We can use 8-bit strings as they are 9 | with SimString. 10 | """ 11 | 12 | import simstring 13 | 14 | # Open a SimString database for writing with Unicode mode. 15 | db = simstring.writer('sample_unicode.db', 3, False, True) 16 | 17 | # Write a string, and close the database. 18 | db.insert('スパゲティ') 19 | db.close() 20 | 21 | 22 | # Open the SimString database for reading. 23 | db = simstring.reader('sample_unicode.db') 24 | 25 | # Set a similarity measure and threshold. 26 | db.measure = simstring.cosine 27 | db.threshold = 0.6 28 | 29 | # Use an 8-bit string encoded in UTF-8. 30 | print ' '.join(db.retrieve('スパゲティー')) 31 | 32 | # Convert a Unicode object into an UTF-8 query string. 
33 | print ' '.join(db.retrieve(u'スパゲティー'.encode('utf-8'))) 34 | 35 | -------------------------------------------------------------------------------- /swig/python/setup.py.in: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | setup.py file for SWIG example 5 | """ 6 | 7 | import sys 8 | import os.path 9 | 10 | def get_rootdir(): 11 | return os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')) 12 | def get_includedir(): 13 | return os.path.join(get_rootdir(), 'include') 14 | 15 | def get_swigdir(): 16 | return os.path.join(get_rootdir(), 'swig') 17 | 18 | import os; os.environ['CC'] = 'g++'; os.environ['CXX'] = 'g++'; 19 | os.environ['CPP'] = 'g++'; os.environ['LDSHARED'] = 'g++' 20 | 21 | from distutils.core import setup, Extension 22 | 23 | simstring_module = Extension( 24 | '_simstring', 25 | sources = [ 26 | 'export.cpp', 27 | 'export_wrap.cpp', 28 | ], 29 | include_dirs=[get_includedir(),], 30 | extra_link_args=['-shared', '-liconv', '-lpython'], 31 | language='c++', 32 | ) 33 | 34 | setup( 35 | name = '@PACKAGE@', 36 | version = '@VERSION@', 37 | author = 'Naoaki Okazaki', 38 | description = """SimString Python module""", 39 | ext_modules = [simstring_module], 40 | py_modules = ["simstring"], 41 | ) 42 | 43 | -------------------------------------------------------------------------------- /swig/perl/t/02_usage.t: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | use Test::More tests => 5; 4 | use simstring; 5 | use utf8; 6 | 7 | # *insert = *simstringc::writer_insert; 8 | # *close = *simstringc::writer_close; 9 | 10 | # *retrieve = *simstringc::reader_retrieve; 11 | # *close = *simstringc::reader_close; 12 | # *swig_measure_get = *simstringc::reader_measure_get; 13 | # *swig_measure_set = *simstringc::reader_measure_set; 14 | # *swig_threshold_get = *simstringc::reader_threshold_get; 15 | # 
*swig_threshold_set = *simstringc::reader_threshold_set; 16 | 17 | # *exact = *simstringc::exact; 0 18 | # *dice = *simstringc::dice; 1 19 | # *cosine = *simstringc::cosine; 2 20 | # *jaccard = *simstringc::jaccard; 3 21 | # *overlap = *simstringc::overlap; 4 22 | 23 | my $db = simstring::writer->new('sample_unicode.db', 3, undef, 1); 24 | ok( $db, 'writer->new' ); 25 | $db->insert('スパゲティ'); 26 | $db->close; 27 | 28 | $db = simstring::reader->new('sample_unicode.db'); 29 | ok( $db, 'reader->new' ); 30 | $db->swig_measure_set($simstring::cosine); 31 | ok( $db->swig_measure_get == 2, 'measure_get: cosine' ); 32 | $db->swig_threshold_set(0.6); 33 | ok( $db->swig_threshold_get == 0.6, 'threshold_get: 0.6' ); 34 | 35 | ok( scalar @{ $db->retrieve('スパゲティー') } == 1, 'retrieve (cosine 0.6)' ); 36 | 37 | -------------------------------------------------------------------------------- /swig/INSTALL: -------------------------------------------------------------------------------- 1 | 2 | SimString SWIG interface 3 | http://www.chokkan.org/software/simstring/swig/ 4 | 5 | 6 | 7 | * DIRECTORY STRUCTURE 8 | [The current "swig" directory] 9 | ./export.i SWIG interface definition 10 | ./export.h C++ header file defining the API 11 | ./export.cpp C++ implementation of the API 12 | 13 | [Language directories] 14 | The "swig" directory contains language subdirectories, e.g., python, ruby, 15 | perl, java. These language directories store language-dependent stuff 16 | (e.g., build/install scripts). 17 | 18 | 19 | * BUILD INSTRUCTION 20 | [Using the distribution package (*.tar.gz)] 21 | $ cd language-directory 22 | $ ./prepare.sh 23 | $ # Run the language-dependent build script. 24 | 25 | [Using the source repository] 26 | $ cd language-directory 27 | $ ./prepare.sh --swig 28 | $ # Run the language-dependent build script. 
29 | 30 | 31 | The distribution package (*.tar.gz) includes a SWIG-generated wrapper for 32 | every language, which is not managed by (committed in) the source 33 | repository. It may be ideal for a user to generate a SWIG wrapper by 34 | themselves, but the official distribution includes wrappers for the 35 | following reasons: 36 | - To build SWIG bindings on the machine where SWIG is not installed 37 | - To avoid errors caused by the version difference of SWIG 38 | -------------------------------------------------------------------------------- /sample/sample_unicode.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | void retrieve( 7 | simstring::reader& dbr, 8 | const std::wstring& query, 9 | int measure, 10 | double threshold 11 | ) 12 | { 13 | // Retrieve similar strings into a string vector. 14 | std::vector xstrs; 15 | dbr.retrieve(query, measure, threshold, std::back_inserter(xstrs)); 16 | 17 | // Output the retrieved strings separated by ", ". 18 | for (int i = 0;i < (int)xstrs.size();++i) { 19 | std::wcout << (i != 0 ? L", " : L"") << xstrs[i]; 20 | } 21 | std::wcout << std::endl; 22 | } 23 | 24 | int main(int argc, char *argv[]) 25 | { 26 | // Activate std::wcout. 27 | std::locale::global(std::locale("")); 28 | std::wcout.imbue(std::locale("")); 29 | 30 | // Open a SimString database for writing (with std::wstring). 31 | simstring::ngram_generator gen(3, false); 32 | simstring::writer_base dbw(gen, "sample_unicode.db"); 33 | dbw.insert(L"スパゲティ"); 34 | dbw.close(); 35 | 36 | // Open the database for reading. 37 | simstring::reader dbr; 38 | dbr.open("sample_unicode.db"); 39 | 40 | // Output similar strings from Unicode queries. 
41 | retrieve(dbr, L"スパゲティー", simstring::cosine, 0.6); 42 | 43 | return 0; 44 | } 45 | -------------------------------------------------------------------------------- /sample/sample.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | void retrieve( 6 | simstring::reader& dbr, 7 | const std::string& query, 8 | int measure, 9 | double threshold 10 | ) 11 | { 12 | // Retrieve similar strings into a string vector. 13 | std::vector xstrs; 14 | dbr.retrieve(query, measure, threshold, std::back_inserter(xstrs)); 15 | 16 | // Output the retrieved strings separated by ", ". 17 | for (int i = 0;i < (int)xstrs.size();++i) { 18 | std::cout << (i != 0 ? ", " : "") << xstrs[i]; 19 | } 20 | std::cout << std::endl; 21 | } 22 | 23 | int main(int argc, char *argv[]) 24 | { 25 | // Create a SimString database with two person names. 26 | simstring::ngram_generator gen(3, false); 27 | simstring::writer_base dbw(gen, "sample.db"); 28 | 29 | dbw.insert("Barack Hussein Obama II"); 30 | dbw.insert("James Gordon Brown"); 31 | dbw.close(); 32 | 33 | // Open the database for reading. 34 | simstring::reader dbr; 35 | 36 | dbr.open("sample.db"); 37 | retrieve(dbr, "Barack Obama", simstring::cosine, 0.6); 38 | retrieve(dbr, "Gordon Brown", simstring::cosine, 0.6); 39 | retrieve(dbr, "Obama", simstring::cosine, 0.6); 40 | retrieve(dbr, "Obama", simstring::overlap, 1.0); 41 | 42 | return 0; 43 | } 44 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | The BSD license. 2 | 3 | Copyright (c) 2009,2010 Naoaki Okazaki 4 | All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | * Neither the names of the authors nor the names of its contributors 14 | may be used to endorse or promote products derived from this 15 | software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 21 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 25 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | -------------------------------------------------------------------------------- /swig/perl/t/01_usage.t: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | use Test::More tests => 8; 4 | use simstring; 5 | 6 | # *insert = *simstringc::writer_insert; 7 | # *close = *simstringc::writer_close; 8 | 9 | # *retrieve = *simstringc::reader_retrieve; 10 | # *close = *simstringc::reader_close; 11 | # *swig_measure_get = *simstringc::reader_measure_get; 12 | # *swig_measure_set = *simstringc::reader_measure_set; 13 | # *swig_threshold_get = *simstringc::reader_threshold_get; 14 | # *swig_threshold_set = *simstringc::reader_threshold_set; 15 | 16 | # *exact = *simstringc::exact; 0 17 | # *dice = *simstringc::dice; 1 18 | # *cosine = *simstringc::cosine; 2 19 | # *jaccard = *simstringc::jaccard; 3 20 | # *overlap = *simstringc::overlap; 4 21 | 22 | my $db = simstring::writer->new('sample.db'); 23 | ok( $db, 'writer->new' ); 24 | $db->insert('Barack Hussein Obama II'); 25 | $db->insert('James Gordon Brown'); 26 | $db->close; 27 | 28 | $db = simstring::reader->new('sample.db'); 29 | ok( $db, 'reader->new' ); 30 | $db->swig_measure_set($simstring::cosine); 31 | ok( $db->swig_measure_get == 2, 'measure_get: cosine' ); 32 | $db->swig_threshold_set(0.6); 33 | ok( $db->swig_threshold_get == 0.6, 'threshold_get: 0.6' ); 34 | 35 | ok( ( scalar @{ $db->retrieve('Barack Obama') } == 1 36 | && scalar @{ $db->retrieve('Gordon Brown') } == 1 37 | && scalar @{ $db->retrieve('Obama') } == 0), 'retrieve (cosine 0.6)' ); 38 | 39 | $db->swig_measure_set($simstring::overlap); 40 | ok( $db->swig_measure_get == 4, 'measure_get: overlap' ); 41 | $db->swig_threshold_set(1); 42 | ok( $db->swig_threshold_get == 1, 'threshold_get: 1' ); 43 | 44 | ok( scalar @{ $db->retrieve('Obama') } == 1, 'retrieve (overlap 1)' ); 45 | -------------------------------------------------------------------------------- /swig/java/Sample.java: 
-------------------------------------------------------------------------------- 1 | import simstring.*; 2 | 3 | public class Sample { 4 | static { 5 | try { 6 | System.loadLibrary("SimString"); 7 | } catch (UnsatisfiedLinkError e) { 8 | System.err.println("Couldn't find the SimString library."); 9 | } 10 | } 11 | 12 | private static void sampleWriter() { 13 | // Create a SimString database with two person names. 14 | writer db = new writer("sample.db", 3, false, false); 15 | db.insert("Barack Hussein Obama II"); 16 | db.insert("James Gordon Brown"); 17 | db.close(); 18 | } 19 | 20 | private static void output(StringVector strs) { 21 | // Output the retrieved strings separated by ", ". 22 | for (int i = 0;i < strs.size();++i) { 23 | if (i != 0) { 24 | System.out.print(", "); 25 | } 26 | System.out.print(strs.get(i)); 27 | } 28 | System.out.print('\n'); 29 | } 30 | 31 | private static void sampleReader() { 32 | // Open the database for reading. 33 | reader db = new reader("sample.db"); 34 | 35 | // Use cosine similarity and threshold 0.6. 36 | db.setMeasure(simstringConstants.cosine); 37 | db.setThreshold(0.6); 38 | output(db.retrieve("Barack Obama")); 39 | output(db.retrieve("Gordon Brown")); 40 | output(db.retrieve("Obama")); 41 | 42 | // Use overlap coefficient and threshold 1.0. 43 | db.setMeasure(simstringConstants.overlap); 44 | db.setThreshold(1.); 45 | output(db.retrieve("Obama")); 46 | } 47 | 48 | public static void main(String[] args) { 49 | sampleWriter(); 50 | sampleReader(); 51 | } 52 | } 53 | 54 | -------------------------------------------------------------------------------- /include/simstring/memory_mapped_file.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Memory-mapped-file library compatible with Win32 and POSIX. 3 | * 4 | * Copyright (c) 2008-2010 Naoaki Okazaki 5 | * All rights reserved. 
6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions are met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * * Neither the name of the authors nor the names of its contributors may 15 | * be used to endorse or promote products derived from this software 16 | * without specific prior written permission. 17 | * 18 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 22 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 23 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 25 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 26 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 27 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 28 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | */ 30 | 31 | /* $Id$ */ 32 | 33 | #ifndef __MEMORY_MAPPED_FILE_H__ 34 | #define __MEMORY_MAPPED_FILE_H__ 35 | 36 | #include 37 | #include 38 | 39 | class memory_mapped_file_base 40 | { 41 | public: 42 | typedef size_t size_type; 43 | 44 | memory_mapped_file_base() {} 45 | virtual ~memory_mapped_file_base() {} 46 | 47 | void open(const std::string& path, std::ios_base::openmode mode) {} 48 | bool is_open() const {return false; } 49 | void close() {} 50 | void resize(size_type size) {} 51 | size_type size() const {return 0; } 52 | char* data() const {return NULL; } 53 | const char* const_data() const {return NULL; } 54 | static int alignment() {return 0; } 55 | }; 56 | 57 | #if defined(_WIN32) 58 | #include "memory_mapped_file_win32.h" 59 | #define memory_mapped_file memory_mapped_file_win32 60 | 61 | #else 62 | #include "memory_mapped_file_posix.h" 63 | #define memory_mapped_file memory_mapped_file_posix 64 | 65 | #endif 66 | 67 | #endif/*__MEMORY_MAPPED_FILE_H__*/ 68 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | SimString 2 | Version 1.1 3 | http://www.chokkan.org/software/simstring/ 4 | 5 | 6 | 7 | * INTRODUCTION 8 | SimString is an implementation of a simple and efficient algorithm 9 | for approximate string matching. Please refer to the web site for 10 | more information about this software. 11 | 12 | 13 | 14 | * COPYRIGHT AND LICENSING INFORMATION 15 | 16 | This program is distributed under the modified BSD license. Refer to 17 | COPYING file for the precise description of the license. 18 | 19 | 20 | Portions of this software are based on CDB++. 21 | 22 | The MIT License 23 | 24 | Copyright (c) 2008,2009, Naoaki Okazaki 25 | All rights reserved. 
26 | 27 | Redistribution and use in source and binary forms, with or without 28 | modification, are permitted provided that the following conditions are met: 29 | * Redistributions of source code must retain the above copyright 30 | notice, this list of conditions and the following disclaimer. 31 | * Redistributions in binary form must reproduce the above copyright 32 | notice, this list of conditions and the following disclaimer in the 33 | documentation and/or other materials provided with the distribution. 34 | * Neither the name of the authors nor the names of its contributors may 35 | be used to endorse or promote products derived from this software 36 | without specific prior written permission. 37 | 38 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 39 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 40 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 41 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 42 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 43 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 44 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 45 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 46 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 47 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 48 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 49 | 50 | 51 | Portions of this software are based on a portable stdint.h (for MSVC). 52 | 53 | Copyright (c) 2005-2007 Paul Hsieh 54 | 55 | Redistribution and use in source and binary forms, with or without 56 | modification, are permitted provided that the following conditions 57 | are met: 58 | 59 | Redistributions of source code must retain the above copyright 60 | notice, this list of conditions and the following disclaimer. 
61 | 62 | Redistributions in binary form must not misrepresent the orignal 63 | source in the documentation and/or other materials provided 64 | with the distribution. 65 | 66 | The names of the authors nor its contributors may be used to 67 | endorse or promote products derived from this software without 68 | specific prior written permission. 69 | 70 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 71 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 72 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 73 | FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 74 | COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 75 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 76 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 77 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 78 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 79 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 80 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 81 | OF THE POSSIBILITY OF SUCH DAMAGE. 
82 | -------------------------------------------------------------------------------- /configure.in: -------------------------------------------------------------------------------- 1 | dnl $Id: configure.in 102 2009-06-23 15:50:57Z naoaki $ 2 | dnl 3 | dnl 4 | dnl Exported and configured variables: 5 | dnl CXXFLAGS 6 | dnl LDFLAGS 7 | dnl INCLUDES 8 | 9 | 10 | dnl ------------------------------------------------------------------ 11 | dnl Initialization for autoconf 12 | dnl ------------------------------------------------------------------ 13 | AC_PREREQ(2.59) 14 | AC_INIT 15 | AC_CONFIG_SRCDIR([frontend/main.cpp]) 16 | 17 | 18 | dnl ------------------------------------------------------------------ 19 | dnl Checks for system 20 | dnl ------------------------------------------------------------------ 21 | AC_CANONICAL_HOST 22 | AC_AIX 23 | AC_GNU_SOURCE 24 | AC_ISC_POSIX 25 | AC_MINIX 26 | 27 | 28 | dnl ------------------------------------------------------------------ 29 | dnl Initialization for automake 30 | dnl ------------------------------------------------------------------ 31 | AM_INIT_AUTOMAKE(simstring, 1.1) 32 | AC_CONFIG_HEADERS(config.h) 33 | AM_MAINTAINER_MODE 34 | 35 | 36 | dnl ------------------------------------------------------------------ 37 | dnl Checks for program 38 | dnl ------------------------------------------------------------------ 39 | AC_PROG_CXX 40 | AC_PROG_INSTALL 41 | AC_PROG_LN_S 42 | 43 | 44 | dnl ------------------------------------------------------------------ 45 | dnl Initialization for variables 46 | dnl ------------------------------------------------------------------ 47 | CXXFLAGS="${ac_save_CXXFLAGS}" 48 | LDFLAGS="${ac_save_LDFLAGS}" 49 | INCLUDES="${ac_save_INCLUDES}" 50 | 51 | dnl ------------------------------------------------------------------ 52 | dnl Checks for header files. 
53 | dnl ------------------------------------------------------------------ 54 | AC_HEADER_STDC 55 | AC_CHECK_HEADERS(stdint.h) 56 | AC_CHECK_HEADERS(sys/mman.h) 57 | 58 | 59 | dnl ------------------------------------------------------------------ 60 | dnl Checks for typedefs, structures, and compiler characteristics. 61 | dnl ------------------------------------------------------------------ 62 | AC_C_CONST 63 | AC_CHECK_SIZEOF 64 | AC_TYPE_SIZE_T 65 | AC_STRUCT_TM 66 | AC_CHECK_TYPES([uint32_t]) 67 | 68 | dnl ------------------------------------------------------------------ 69 | dnl Checks for debugging mode 70 | dnl ------------------------------------------------------------------ 71 | AC_ARG_ENABLE( 72 | debug, 73 | [AS_HELP_STRING([--enable-debug],[Turn on debugging])] 74 | ) 75 | 76 | if test "x$enable_debug" = "xyes"; then 77 | CXXFLAGS="-DDEBUG -O -g ${CXXFLAGS}" 78 | else 79 | CXXFLAGS="-O3 ${CXXFLAGS}" 80 | fi 81 | 82 | dnl ------------------------------------------------------------------ 83 | dnl Checks for profiling mode 84 | dnl ------------------------------------------------------------------ 85 | AC_ARG_ENABLE( 86 | profile, 87 | [AS_HELP_STRING([--enable-profile],[Turn on profiling])] 88 | ) 89 | 90 | if test "x$enable_profile" = "xyes"; then 91 | CXXFLAGS="-DPROFILE -pg ${CXXFLAGS}" 92 | fi 93 | 94 | 95 | dnl ------------------------------------------------------------------ 96 | dnl Checks for library functions. 
97 | dnl ------------------------------------------------------------------ 98 | 99 | dnl Check for math library 100 | AC_CHECK_LIB(m, sqrt) 101 | AC_CHECK_LIB(mmap, mmap) 102 | 103 | INCLUDES="-I\$(top_srcdir) -I\$(top_srcdir)/include" 104 | 105 | dnl ------------------------------------------------------------------ 106 | dnl Export variables 107 | dnl ------------------------------------------------------------------ 108 | AC_SUBST(CXXFLAGS) 109 | AC_SUBST(LDFLAGS) 110 | AC_SUBST(INCLUDES) 111 | 112 | dnl ------------------------------------------------------------------ 113 | dnl Output the configure results. 114 | dnl ------------------------------------------------------------------ 115 | AC_CONFIG_FILES(Makefile include/Makefile frontend/Makefile sample/Makefile swig/Makefile swig/python/setup.py swig/ruby/extconf.rb swig/perl/Makefile.PL) 116 | AC_OUTPUT 117 | -------------------------------------------------------------------------------- /include/simstring/measure.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SimString similarity measures. 3 | * 4 | * Copyright (c) 2009,2010 Naoaki Okazaki 5 | * All rights reserved. 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions are met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * * Neither the name of the authors nor the names of its contributors may 15 | * be used to endorse or promote products derived from this software 16 | * without specific prior written permission. 
17 | * 18 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 22 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 23 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 25 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 26 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 27 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 28 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | */ 30 | 31 | /* $Id$ */ 32 | 33 | 34 | #ifndef __SIMSTRING_MEASURE_H__ 35 | #define __SIMSTRING_MEASURE_H__ 36 | 37 | #include 38 | 39 | namespace simstring { namespace measure { 40 | 41 | /** 42 | * This class implements the traits of exact matching. 43 | */ 44 | struct exact 45 | { 46 | inline static int min_size(int qsize, double alpha) 47 | { 48 | return qsize; 49 | } 50 | 51 | inline static int max_size(int qsize, double alpha) 52 | { 53 | return qsize; 54 | } 55 | 56 | inline static int min_match(int qsize, int rsize, double alpha) 57 | { 58 | return qsize; 59 | } 60 | }; 61 | 62 | /** 63 | * This class implements the traits of dice coefficient. 64 | */ 65 | struct dice 66 | { 67 | inline static int min_size(int qsize, double alpha) 68 | { 69 | return (int)std::ceil(alpha * qsize / (2. - qsize)); 70 | } 71 | 72 | inline static int max_size(int qsize, double alpha) 73 | { 74 | return (int)std::floor((2. 
- alpha) * qsize / alpha); 75 | } 76 | 77 | inline static int min_match(int qsize, int rsize, double alpha) 78 | { 79 | return (int)std::ceil(0.5 * alpha * (qsize + rsize)); 80 | } 81 | }; 82 | 83 | /** 84 | * This class implements the traits of cosine coefficient. 85 | */ 86 | struct cosine 87 | { 88 | inline static int min_size(int qsize, double alpha) 89 | { 90 | return (int)std::ceil(alpha * alpha * qsize); 91 | } 92 | 93 | inline static int max_size(int qsize, double alpha) 94 | { 95 | return (int)std::floor(qsize / (alpha * alpha)); 96 | } 97 | 98 | inline static int min_match(int qsize, int rsize, double alpha) 99 | { 100 | return (int)std::ceil(alpha * std::sqrt((double)qsize * rsize)); 101 | } 102 | }; 103 | 104 | /** 105 | * This class implements the traits of Jaccard coefficient. 106 | */ 107 | struct jaccard 108 | { 109 | inline static int min_size(int qsize, double alpha) 110 | { 111 | return (int)std::ceil(alpha * qsize); 112 | } 113 | 114 | inline static int max_size(int qsize, double alpha) 115 | { 116 | return (int)std::floor(qsize / alpha); 117 | } 118 | 119 | inline static int min_match(int qsize, int rsize, double alpha) 120 | { 121 | return (int)std::ceil(alpha * (qsize + rsize) / (1 + alpha)); 122 | } 123 | }; 124 | 125 | /** 126 | * This class implements the traits of overlap coefficient. 
127 | */ 128 | struct overlap 129 | { 130 | inline static int min_size(int qsize, double alpha) 131 | { 132 | return 1; 133 | } 134 | 135 | inline static int max_size(int qsize, double alpha) 136 | { 137 | return (int)INT_MAX; 138 | } 139 | 140 | inline static int min_match(int qsize, int rsize, double alpha) 141 | { 142 | return (int)std::ceil(alpha * std::min(qsize, rsize)); 143 | } 144 | }; 145 | 146 | }; }; 147 | 148 | #endif/*__SIMSTRING_MEASURE_H__*/ 149 | -------------------------------------------------------------------------------- /include/simstring/memory_mapped_file_posix.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Memory-mapped-file implementation for POSIX. 3 | * 4 | * Copyright (c) 2008-2010 Naoaki Okazaki 5 | * All rights reserved. 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions are met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * * Neither the name of the authors nor the names of its contributors may 15 | * be used to endorse or promote products derived from this software 16 | * without specific prior written permission. 17 | * 18 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER 22 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 23 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 25 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 26 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 27 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 28 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | */ 30 | 31 | /* $Id$ */ 32 | 33 | #ifndef __MEMORY_MAPPED_FILE_POSIX_H__ 34 | #define __MEMORY_MAPPED_FILE_POSIX_H__ 35 | 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | 46 | class memory_mapped_file_posix : 47 | public memory_mapped_file_base 48 | { 49 | public: 50 | typedef size_t size_type; 51 | 52 | protected: 53 | int m_fd; 54 | std::ios_base::openmode m_mode; 55 | void* m_data; 56 | size_type m_size; 57 | 58 | public: 59 | memory_mapped_file_posix() 60 | { 61 | m_fd = -1; 62 | m_mode = std::ios_base::in; 63 | m_data = NULL; 64 | m_size = 0; 65 | } 66 | 67 | virtual ~memory_mapped_file_posix() 68 | { 69 | close(); 70 | } 71 | 72 | void open(const std::string& path, std::ios_base::openmode mode) 73 | { 74 | int flags = 0; 75 | struct stat buf; 76 | 77 | if (mode & std::ios_base::in) { 78 | flags = O_RDONLY; 79 | } 80 | if (mode & std::ios_base::out) { 81 | flags = O_RDWR | O_CREAT; 82 | } 83 | if (mode & std::ios_base::trunc) { 84 | flags |= (O_RDWR | O_TRUNC); 85 | } 86 | 87 | m_fd = ::open(path.c_str(), flags, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); 88 | if (m_fd != -1) { 89 | if (::fstat(m_fd, &buf) == 0) { 90 | m_mode = mode; 91 | this->resize((size_type)buf.st_size); 92 | } else { 93 | ::close(m_fd); 94 | m_fd = -1; 95 | } 96 | } 97 | } 98 | 99 | bool is_open() const 100 | { 101 | return (m_fd != -1); 102 | } 103 | 
104 | void close() 105 | { 106 | this->free(); 107 | if (m_fd != -1) { 108 | ::close(m_fd); 109 | m_fd = -1; 110 | } 111 | } 112 | 113 | bool resize(size_type size) 114 | { 115 | if (size == 0) { 116 | this->free(); 117 | return true; 118 | } 119 | 120 | if (m_fd == -1) { 121 | return false; 122 | } 123 | 124 | this->free(); 125 | 126 | if ((m_mode & std::ios_base::out) && m_size < size) { 127 | /* Try to expand the file to the specified size. */ 128 | if (::lseek(m_size, size, SEEK_SET) >= 0) { 129 | char c; 130 | if (read(m_fd, &c, sizeof(char)) == -1) { 131 | c = 0; 132 | } 133 | if (write(m_fd, &c, sizeof(char)) == -1) { 134 | return false; // Failed to write the last position. 135 | } 136 | } else { 137 | return false; // Failed to expand the file. 138 | } 139 | } 140 | 141 | /* Map the file into process memory. */ 142 | m_data = ::mmap( 143 | NULL, 144 | size, 145 | (m_mode & std::ios_base::out) ? (PROT_READ | PROT_WRITE) : PROT_READ, 146 | MAP_SHARED, 147 | m_fd, 148 | 0); 149 | 150 | m_size = size; 151 | return true; 152 | } 153 | 154 | void free() 155 | { 156 | if (m_data != NULL) { 157 | ::munmap(m_data, m_size); 158 | m_data = NULL; 159 | } 160 | m_size = 0; 161 | } 162 | 163 | size_type size() const 164 | { 165 | return m_size; 166 | } 167 | 168 | char* data() const 169 | { 170 | return reinterpret_cast(m_data); 171 | } 172 | 173 | const char* const_data() const 174 | { 175 | return reinterpret_cast(m_data); 176 | } 177 | 178 | static int alignment() 179 | { 180 | return 0; 181 | } 182 | }; 183 | 184 | #endif/*__MEMORY_MAPPED_FILE_POSIX_H__*/ 185 | -------------------------------------------------------------------------------- /include/simstring/memory_mapped_file_win32.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Memory-mapped-file implementation for Win32. 3 | * 4 | * Copyright (c) 2008-2010 Naoaki Okazaki 5 | * All rights reserved. 
6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions are met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * * Neither the name of the authors nor the names of its contributors may 15 | * be used to endorse or promote products derived from this software 16 | * without specific prior written permission. 17 | * 18 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 22 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 23 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 25 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 26 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 27 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 28 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | */ 30 | 31 | /* $Id$ */ 32 | 33 | #ifndef __MEMORY_MAPPED_FILE_WIN32_H__ 34 | #define __MEMORY_MAPPED_FILE_WIN32_H__ 35 | 36 | #ifndef NOMINMAX 37 | #define NOMINMAX // To fix min/max conflicts with STL. 
38 | #endif 39 | 40 | #include 41 | #include 42 | 43 | class memory_mapped_file_win32 : 44 | public memory_mapped_file_base 45 | { 46 | public: 47 | typedef size_t size_type; 48 | 49 | protected: 50 | HANDLE m_hFile; 51 | HANDLE m_hMapping; 52 | std::ios_base::openmode m_mode; 53 | char* m_data; 54 | size_type m_size; 55 | 56 | public: 57 | memory_mapped_file_win32() 58 | { 59 | m_hFile = INVALID_HANDLE_VALUE; 60 | m_hMapping = INVALID_HANDLE_VALUE; 61 | m_mode = 0; 62 | m_data = NULL; 63 | m_size = 0; 64 | } 65 | 66 | virtual ~memory_mapped_file_win32() 67 | { 68 | close(); 69 | } 70 | 71 | void open(const std::string& path, std::ios_base::openmode mode) 72 | { 73 | DWORD dwDesiredAccess = 0; 74 | DWORD dwCreationDisposition = 0; 75 | 76 | if (mode & std::ios_base::in) { 77 | dwDesiredAccess |= GENERIC_READ; 78 | dwCreationDisposition = OPEN_EXISTING; 79 | } 80 | if (mode & std::ios_base::out) { 81 | dwDesiredAccess |= GENERIC_WRITE; 82 | dwCreationDisposition = CREATE_NEW; 83 | } 84 | if (mode & std::ios_base::trunc) { 85 | dwDesiredAccess = (GENERIC_READ | GENERIC_WRITE); 86 | dwCreationDisposition = CREATE_ALWAYS; 87 | } 88 | 89 | m_hFile = CreateFileA( 90 | path.c_str(), 91 | dwDesiredAccess, 92 | 0, 93 | NULL, 94 | dwCreationDisposition, 95 | FILE_ATTRIBUTE_NORMAL, 96 | NULL 97 | ); 98 | 99 | if (m_hFile != INVALID_HANDLE_VALUE) { 100 | m_mode = mode; 101 | this->resize((size_type)GetFileSize(m_hFile, NULL)); 102 | } 103 | } 104 | 105 | bool is_open() const 106 | { 107 | return (m_hFile != INVALID_HANDLE_VALUE); 108 | } 109 | 110 | void close() 111 | { 112 | this->free(); 113 | if (m_hFile != INVALID_HANDLE_VALUE) { 114 | CloseHandle(m_hFile); 115 | m_hFile = INVALID_HANDLE_VALUE; 116 | } 117 | } 118 | 119 | bool resize(size_type size) 120 | { 121 | if (size == 0) { 122 | this->free(); 123 | return true; 124 | } 125 | 126 | if (m_hFile == INVALID_HANDLE_VALUE) { 127 | return false; 128 | } 129 | 130 | this->free(); 131 | DWORD flProtect = (m_mode & 
std::ios_base::out) ? PAGE_READWRITE : PAGE_READONLY; 132 | m_hMapping = CreateFileMappingA( 133 | m_hFile, 134 | NULL, 135 | flProtect, 136 | 0, 137 | (DWORD)size, 138 | NULL 139 | ); 140 | 141 | if (m_hMapping == NULL) { 142 | CloseHandle(m_hFile); 143 | m_hFile = NULL; 144 | return false; 145 | } 146 | 147 | DWORD dwDesiredAccess = (m_mode & std::ios_base::out) ? FILE_MAP_ALL_ACCESS : FILE_MAP_READ; 148 | m_data = (char*)MapViewOfFile( 149 | m_hMapping, 150 | dwDesiredAccess, 151 | 0, 152 | 0, 153 | 0 154 | ); 155 | 156 | if (m_data == NULL) { 157 | CloseHandle(m_hMapping); 158 | m_hMapping = NULL; 159 | CloseHandle(m_hFile); 160 | m_hFile = NULL; 161 | return false; 162 | } 163 | 164 | m_size = size; 165 | return true; 166 | } 167 | 168 | void free() 169 | { 170 | if (m_data != NULL) { 171 | UnmapViewOfFile(m_data); 172 | m_data = NULL; 173 | } 174 | if (m_hMapping != INVALID_HANDLE_VALUE) { 175 | CloseHandle(m_hMapping); 176 | m_hMapping = NULL; 177 | } 178 | m_size = 0; 179 | } 180 | 181 | size_type size() const 182 | { 183 | return m_size; 184 | } 185 | 186 | char* data() const 187 | { 188 | return m_data; 189 | } 190 | 191 | const char* const_data() const 192 | { 193 | return m_data; 194 | } 195 | 196 | static int alignment() 197 | { 198 | return 0; 199 | } 200 | }; 201 | 202 | #endif/*__MEMORY_MAPPED_FILE_WIN32_H__*/ -------------------------------------------------------------------------------- /include/simstring/ngram.h: -------------------------------------------------------------------------------- 1 | /* 2 | * N-gram generator. 3 | * 4 | * Copyright (c) 2009,2010 Naoaki Okazaki 5 | * All rights reserved. 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions are met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 
11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * * Neither the name of the authors nor the names of its contributors may 15 | * be used to endorse or promote products derived from this software 16 | * without specific prior written permission. 17 | * 18 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 22 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 23 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 25 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 26 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 27 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 28 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | */ 30 | 31 | /* $Id$ */ 32 | 33 | #ifndef __NGRAM_H__ 34 | #define __NGRAM_H__ 35 | 36 | #include 37 | #include 38 | #include 39 | 40 | namespace simstring 41 | { 42 | 43 | /** 44 | * Obtain a set of letter n-grams in a string. 45 | * @param str The string. 46 | * @param ins The insert iterator that receives the set of n-grams. 47 | * @param n The unit of n-grams. 48 | * @param be \c true to generate n-grams that encode begin and end of 49 | * a string. 
50 | */ 51 | template < 52 | class string_type, 53 | class insert_iterator 54 | > 55 | static void 56 | ngrams( 57 | const string_type& str, 58 | insert_iterator ins, 59 | int n, 60 | bool be 61 | ) 62 | { 63 | typedef typename string_type::value_type char_type; 64 | typedef std::basic_stringstream stringstream_type; 65 | typedef std::map ngram_stat_type; 66 | const char_type mark = (char_type)0x01; 67 | 68 | string_type src; 69 | if (be) { 70 | // Append marks for begin/end of the string. 71 | for (int i = 0;i < n-1;++i) src += mark; 72 | src += str; 73 | for (int i = 0;i < n-1;++i) src += mark; 74 | } else if ((int)str.length() < n) { 75 | // Pad marks when the string is shorter than n. 76 | src = str; 77 | for (int i = 0;i < n - (int)str.length();++i) { 78 | src += mark; 79 | } 80 | } else { 81 | src = str; 82 | } 83 | 84 | // Count n-grams in the string. 85 | ngram_stat_type stat; 86 | for (typename string_type::size_type i = 0;i < src.length()-n+1;++i) { 87 | string_type ngram = src.substr(i, n); 88 | ++stat[ngram]; 89 | } 90 | 91 | // Convert the n-gram stat into a set. 92 | typename ngram_stat_type::const_iterator it; 93 | for (it = stat.begin();it != stat.end();++it) { 94 | *ins = it->first; 95 | // Append numbers if the same n-gram occurs more than once. 96 | for (int i = 2;i <= it->second;++i) { 97 | stringstream_type ss; 98 | ss << it->first << i; 99 | *ins = ss.str(); 100 | } 101 | } 102 | } 103 | 104 | /** 105 | * N-gram generator. 106 | * 107 | * This class generates n-grams for a string. 108 | */ 109 | class ngram_generator 110 | { 111 | protected: 112 | int m_n; ///< The unit of n-grams. 113 | bool m_be; ///< The flag for begin/end of tokens. 114 | 115 | public: 116 | /** 117 | * Constructs an instance as a tri-gram generator. 118 | */ 119 | ngram_generator() : m_n(3), m_be(false) 120 | { 121 | } 122 | 123 | /** 124 | * Constructs an instance as an n-gram generator. 125 | * @param n The unit of n-grams. 
126 | * @param be \c true to generate n-grams that encode begin and 127 | * end of a string. 128 | */ 129 | ngram_generator(int n, bool be=false) : m_n(n), m_be(be) 130 | { 131 | } 132 | 133 | /** 134 | * Sets the parameters for n-gram generation. 135 | * @param n The unit of n-grams. 136 | * @param be \c true to generate n-grams that encode begin and 137 | * end of a string. 138 | */ 139 | void set(int n, bool be=false) 140 | { 141 | m_n = n; 142 | m_be = be; 143 | } 144 | 145 | /** 146 | * Gets the unit of n-grams. 147 | * @return int The unit of n-grams. 148 | */ 149 | int get_n() const 150 | { 151 | return m_n; 152 | } 153 | 154 | /** 155 | * Gets the flag for representing a begin/end of letters. 156 | * @return bool \c true if n-grams encoding the begin and end of a 157 | * string are generated. 158 | */ 159 | bool get_be() const 160 | { 161 | return m_be; 162 | } 163 | 164 | /** 165 | * Obtain a set of letter n-grams in a string. 166 | * @param str The string. 167 | * @param ins The insert iterator that receives the set of n-grams. 168 | */ 169 | template 170 | void operator()(const string_type& str, insert_iterator ins) const 171 | { 172 | ngrams(str, ins, m_n, m_be); 173 | } 174 | }; 175 | 176 | }; 177 | 178 | #endif/*__NGRAM_H__*/ 179 | -------------------------------------------------------------------------------- /swig/export.h: -------------------------------------------------------------------------------- 1 | #ifndef __EXPORT_H__ 2 | #define __EXPORT_H__ 3 | 4 | #include 5 | #include 6 | 7 | /** 8 | * \addtogroup swig_interface SimString SWIG interface 9 | * @{ 10 | * 11 | * The SimString SWIG interface. 12 | */ 13 | 14 | /** 15 | * Similarity measures. 16 | */ 17 | enum { 18 | /// Exact matching. 19 | exact, 20 | /// Dice coefficient. 21 | dice, 22 | /// Cosine coefficient. 23 | cosine, 24 | /// Jaccard coefficient. 25 | jaccard, 26 | /// Overlap coefficient. 27 | overlap, 28 | }; 29 | 30 | /** 31 | * SimString database writer. 
32 | */ 33 | class writer 34 | { 35 | protected: 36 | void *m_dbw; 37 | void *m_gen; 38 | bool m_unicode; 39 | 40 | public: 41 | /** 42 | * Creates a new database. 43 | * This function creates an instance of SimString database writer 44 | * for creating a new database. If this function failes to open 45 | * the database, it throws SWIG_IOError. 46 | * 47 | * @param filename The database filename. 48 | * @param n The unit of character n-grams. 49 | * @param be \c true to represent a begin and end of strings 50 | * in character n-grams. 51 | * @param unicode \c true to use Unicode mode. In Unicode mode, 52 | * wide (\c wchar_t) characters are used in n-grams. 53 | * @throw SWIG_IOError 54 | */ 55 | writer(const char *filename, int n = 3, bool be = false, bool unicode = false); 56 | 57 | /** 58 | * Destructs the writer. 59 | * Destructing a writer object automatically closes the database. 60 | * @throw SWIG_IOError 61 | */ 62 | virtual ~writer(); 63 | 64 | /** 65 | * Inserts a string into the database. 66 | * @param string A string to be inserted to the database. This 67 | * argument must be a null-terminated byte stream. 68 | * If the database is created with Unicode mode, this 69 | * function assumes that the byte stream is encoded in 70 | * UTF-8, and converts it into a \c wchar_t string. 71 | * @throw SWIG_IOError 72 | */ 73 | void insert(const char *string); 74 | 75 | /** 76 | * Closes the database. 77 | * This function flushes and closes the database. If this function failes 78 | * to close the database, it throws SWIG_IOError. 79 | * @throw SWIG_IOError 80 | */ 81 | void close(); 82 | }; 83 | 84 | /** 85 | * SimString database reader. 86 | */ 87 | class reader 88 | { 89 | protected: 90 | void *m_dbr; 91 | 92 | public: 93 | /** 94 | * Opens a database for retrieving strings. 95 | * This function creates an instance of SimString database reader 96 | * by opening an existing database. If this function failes to open 97 | * the database, it throws SWIG_IOError. 
98 | * 99 | * @param filename The database filename. 100 | * @throw SWIG_IOError 101 | */ 102 | reader(const char *filename); 103 | 104 | /** 105 | * Destructs the database reader. 106 | * Destructing the reader object automatically closes the database. 107 | */ 108 | virtual ~reader(); 109 | 110 | /** 111 | * Retrieves strings that are similar to the query string. 112 | * This function retrieves strings whose similarity with the query string 113 | * are no smaller than a threshold. Before calling this function, set the 114 | * similarity measure and threshold to \ref measure and \ref threshold 115 | * attributes of the reader object. 116 | * 117 | * @param query The query string. This argument must be a 118 | * null-terminated byte stream. If the database was 119 | * created with Unicode mode, this function assumes 120 | * that the byte stream is encoded in UTF-8, and 121 | * converts it into a wchar_t string. 122 | * @return The array of strings retrieved for the query. 123 | * If the database was created with Unicode mode, 124 | * this function returns strings in UTF-8. 125 | * @see measure The similarity function used by this function. 126 | * @see threshold The similarity value used by this function. 127 | */ 128 | std::vector retrieve(const char *query); 129 | 130 | /** 131 | * Checks the existence of a string that is similar to the query string. 132 | * This function examines the existence of a string whose similarity with 133 | * the query string is no smaller than a threshold. Before calling this 134 | * function, set the similarity measure and threshold to \ref measure and 135 | * \ref threshold attributes of the reader object. 136 | * 137 | * @param query The query string. This argument must be a 138 | * null-terminated byte stream. If the database was 139 | * created with Unicode mode, this function assumes 140 | * that the byte stream is encoded in UTF-8, and 141 | * converts it into a wchar_t string. 
142 | * @return \c true if a similar string exists, 143 | * \c false otherwise. 144 | * @see measure The similarity function used by this function. 145 | * @see threshold The similarity value used by this function. 146 | */ 147 | bool check(const char *query); 148 | 149 | /** 150 | * Closes a database. 151 | */ 152 | void close(); 153 | 154 | public: 155 | /** 156 | * Similarity measure. 157 | * Specify a similarity measure for approximate string retrieval used 158 | * by retrieve() function. 159 | * @see exact, cosine, dice, jaccard, overlap 160 | */ 161 | int measure; 162 | 163 | /** 164 | * Threshold for the similarity measure. 165 | * Specify a threshold for approximate string retrieval used by 166 | * retrieve() function. 167 | */ 168 | double threshold; 169 | }; 170 | 171 | /** @} */ 172 | 173 | /** 174 | @mainpage SimString SWIG interface 175 | 176 | @section intro Introduction 177 | 178 | This document describes a SWIG interface that bridges SimString with various 179 | programming languages including Python and Ruby. Although the current SimString 180 | distribution provides SWIG wrappers for Python and Ruby, it may be easy to 181 | build libraries for other languages. 182 | 183 | SimString module provides two simple classes ::writer and ::reader. 184 | In the ::writer class, one can create a SimString database using the 185 | constructor writer::writer, and call the member function writer::insert for 186 | inserting a string into the database. 187 | In the ::reader class, one can open an existing SimString database with the 188 | constructor reader::reader, specify a similarity measure and threshold with 189 | two attributes reader::measure and reader::threshold, and call the member 190 | function reader::retrieve for performing approximate string matching. 191 | 192 | SimString module always uses 8-bit null-terminated byte streams in 193 | writer::insert and reader::retrieve functions.
The encoding of byte streams 194 | can be arbitrary, but must be UTF-8 for a database in Unicode mode. 195 | 196 | @section api Documentation 197 | 198 | - @ref swig_interface "SWIG interface" 199 | 200 | @section language Language-specific Notes 201 | 202 | @subsection language-ruby Ruby 203 | - Because of the naming convention, the initial letter of a name is 204 | capitalized as follows: 205 | the module name (Simstring), class names (Writer and Reader), 206 | and similarity measures (Exact, Dice, Cosine, Jaccard, Overlap). 207 | 208 | @section sample Sample Programs 209 | 210 | @subsection python Python 211 | 212 | A basic sample. 213 | 214 | @include python/sample.py 215 | 216 | A Unicode sample. 217 | 218 | @include python/sample_unicode.py 219 | 220 | @subsection ruby Ruby 221 | 222 | A basic sample. 223 | 224 | @include ruby/sample.rb 225 | 226 | A Unicode sample. 227 | 228 | @include ruby/sample_unicode.rb 229 | 230 | @subsection perl Perl 231 | 232 | A basic sample. 233 | 234 | @include perl/sample.pl 235 | 236 | @subsection java Java 237 | 238 | A basic sample. 239 | 240 | @include java/Sample.java 241 | 242 | */ 243 | 244 | #endif/*__EXPORT_H__*/ 245 | -------------------------------------------------------------------------------- /frontend/optparse.h: -------------------------------------------------------------------------------- 1 | /* 2 | * An event-driven parser for command-line arguments. 3 | * 4 | * Copyright (c) 2004-2005 by Naoaki Okazaki 5 | * 6 | * This software is provided 'as-is', without any express or implied 7 | * warranty. In no event will the authors be held liable for any damages 8 | * arising from the use of this software. 9 | * 10 | * Permission is granted to anyone to use this software for any purpose, 11 | * including commercial applications, and to alter it and redistribute it 12 | * freely, subject to the following restrictions (known as zlib license): 13 | * 14 | * 1. 
/**
 * An event-driven parser for GNU-style command-line arguments.
 *
 * Derive from this class and override handle_option() (usually through the
 * BEGIN_OPTION_MAP / ON_OPTION / END_OPTION_MAP macros) to receive one
 * callback for every option found on the command line.
 *  @author Naoaki Okazaki
 */
class optparse {
public:
    /**
     * Exception class for unrecognized options.
     * what() returns the option as it was typed ("-x" or "--name").
     */
    class unrecognized_option : public std::invalid_argument {
    public:
        unrecognized_option(char shortopt)
            : std::invalid_argument(std::string("-") + shortopt) {}
        unrecognized_option(const std::string& longopt)
            : std::invalid_argument(std::string("--") + longopt) {}
    };

    /**
     * Exception class for invalid values.
     * Intended to be thrown by option handlers that reject an argument.
     */
    class invalid_value : public std::invalid_argument {
    public:
        invalid_value(const std::string& message)
            : std::invalid_argument(message) {}
    };

public:
    /** Construct. */
    optparse() {}
    /** Destruct. */
    virtual ~optparse() {}

    /**
     * Parse options.
     *
     * Parsing starts at argv[1] (argv[0] is taken to be the program name)
     * and stops at the first non-option argument or at a lone "-".
     *
     *  @param  argv        array of null-terminated strings to be parsed
     *  @param  num_argv    specifies the number, in strings, of the array
     *  @return             the index of the first argument that was not
     *                      consumed as an option or an option argument;
     *                      never greater than \p num_argv
     *  @throws unrecognized_option  if handle_option() returns a negative
     *                      value for an option
     */
    int parse(char * const argv[], int num_argv)
    {
        int i;
        for (i = 1;i < num_argv;++i) {
            const char *token = argv[i];
            if (*token++ != '-') {
                break;  // a non-option argument was found.
            }
            if (!*token) {
                break;  // only '-' was found.
            }

            // The following command-line token is a candidate argument for
            // this option. When there is none, an empty string is handed to
            // the handler, but the index must not advance past the array.
            const bool has_next = (i+1 < num_argv);
            const char *next_token = has_next ? argv[i+1] : "";

            if (*token == '-') {
                // Long option: "--name=value" or "--name value".
                const char *arg = std::strchr(++token, '=');
                if (arg) {
                    arg++;
                } else {
                    arg = next_token;
                }
                int ret = handle_option(0, token, arg);
                if (ret < 0) {
                    throw unrecognized_option(token);
                }
                if (arg == next_token && has_next) {
                    i += ret;   // the handler consumed the next token.
                }
            } else {
                // A cluster of short options: "-abc", "-ovalue", "-o value".
                char c;
                while ((c = *token++) != '\0') {
                    const char *arg = *token ? token : next_token;
                    int ret = handle_option(c, token, arg);
                    if (ret < 0) {
                        throw unrecognized_option(c);
                    }
                    if (ret > 0) {
                        if (arg == token) {
                            token = "";     // the rest of the cluster was the argument.
                        } else if (has_next) {
                            i++;            // the next token was the argument.
                        }
                    }
                } // while
            }
        } // for (i)

        return i;
    }

protected:
    /**
     * Option handler
     *  This function should be overridden by the derived class.
     *  @param  c           short option character, 0 for long option
     *  @param  longname    long option name (may be followed by "=value")
     *  @param  arg         an argument for the option
     *  @return             0 (success);
     *                      1 (success with use of an argument);
     *                      -1 (failed, unrecognized option)
     *  @throws invalid_value  may be thrown by an overriding handler
     */
    virtual int handle_option(char c, const char *longname, const char *arg)
    {
        return 0;
    }

    /**
     * Compares the name part of a long option with an option name.
     *  @param  option      the text after "--"; everything from the first
     *                      '=' on is ignored
     *  @param  longname    the option name to compare with
     *  @return             0 if the name part of \p option equals
     *                      \p longname exactly; non-zero otherwise
     */
    int __optstrcmp(const char *option, const char *longname)
    {
        const char *p = std::strchr(option, '=');
        if (!p) {
            return std::strcmp(option, longname);
        }

        const std::size_t n = static_cast<std::size_t>(p - option);
        const int cmp = std::strncmp(option, longname, n);
        if (cmp != 0) {
            return cmp;
        }
        // The first n characters agree, but that alone would also accept
        // "--na=value" for "name" (and "--=value" for anything). It is a
        // match only when longname ends exactly where the name part ends.
        return longname[n] == '\0' ? 0 : -1;
    }
};
*/ 179 | #define BEGIN_OPTION_MAP(_Class) \ 180 | int _Class::handle_option(char __c, const char *__longname, const char *arg) \ 181 | { \ 182 | int used_args = 0; \ 183 | if (0) { \ 184 | 185 | /** An entry of option map */ 186 | #define ON_OPTION(test) \ 187 | return used_args; \ 188 | } else if (test) { \ 189 | used_args = 0; \ 190 | 191 | #define ON_OPTION_WITH_ARG(test) \ 192 | return used_args; \ 193 | } else if (test) { \ 194 | used_args = 1; \ 195 | 196 | /** The end of option map implementation */ 197 | #define END_OPTION_MAP() \ 198 | return used_args; \ 199 | } \ 200 | return -1; \ 201 | } \ 202 | 203 | /** A predicator for short options */ 204 | #define SHORTOPT(x) (__c == x) 205 | /** A predicator for long options */ 206 | #define LONGOPT(x) (!__c && __optstrcmp(__longname, x) == 0) 207 | 208 | 209 | #ifdef USE_NCL_NAMESPACE 210 | }; 211 | #endif/*USE_NCL_NAMESPACE*/ 212 | 213 | 214 | 215 | 216 | 217 | 218 | #ifdef _BUILD_NCL_SAMPLE 219 | 220 | #include 221 | #include 222 | 223 | /** 224 | * A class to store parameters specified by command-line arguments 225 | */ 226 | class option : public optparse { 227 | public: 228 | int bytes; 229 | int lines; 230 | bool quiet; 231 | 232 | option() : bytes(0), lines(0), quiet(false) {} 233 | 234 | BEGIN_OPTION_MAP_INLINE() 235 | ON_OPTION(SHORTOPT('b') || LONGOPT("bytes")) 236 | bytes = std::atoi(arg); 237 | used_args = 1; // Notify the parser of a consumption of argument. 238 | 239 | ON_OPTION_WITH_ARG(SHORTOPT('l') || LONGOPT("lines")) 240 | lines = std::atoi(arg); 241 | // no need of the notification: used_args variable will be set to 1. 242 | 243 | ON_OPTION(SHORTOPT('q') || LONGOPT("quiet") || LONGOPT("silent")) 244 | quiet = true; 245 | 246 | END_OPTION_MAP() 247 | }; 248 | 249 | int main(int argc, char *argv[]) 250 | { 251 | try { 252 | option opt; 253 | int argused = opt.parse(&argv[1], argc-1); // Skip argv[0]. 
254 | 255 | std::cout << "used argv: " << argused << std::endl; 256 | std::cout << "bytes: " << opt.bytes << std::endl; 257 | std::cout << "lines: " << opt.lines << std::endl; 258 | std::cout << "quiet: " << opt.quiet << std::endl; 259 | } catch (const optparse::unrecognized_option& e) { 260 | std::cout << "unrecognized option: " << e.what() << std::endl; 261 | return 1; 262 | } catch (const optparse::invalid_value& e) { 263 | std::cout << "invalid value: " << e.what() << std::endl; 264 | return 1; 265 | } 266 | 267 | return 0; 268 | } 269 | 270 | #endif/*_BUILD_NCL_SAMPLE*/ 271 | 272 | 273 | #endif/*__NCL_OPTPRASE_H__*/ 274 | -------------------------------------------------------------------------------- /swig/export.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "export.h" 11 | 12 | #define UTF16 "UTF-16LE" 13 | #define UTF32 "UTF-32LE" 14 | 15 | #ifdef USE_LIBICONV_GNU 16 | #define iconv_open libiconv_open 17 | #define iconv_convert libiconv_convert 18 | #define iconv_close libiconv_close 19 | #endif/*USE_LIBICONV_GNU*/ 20 | 21 | #ifndef ICONV_CONST 22 | #define ICONV_CONST 23 | #endif/*ICONV_CONST*/ 24 | 25 | template 26 | bool iconv_convert(iconv_t cd, const source_type& src, destination_type& dst) 27 | { 28 | typedef typename source_type::value_type source_char_type; 29 | typedef typename destination_type::value_type destination_char_type; 30 | 31 | const char *inbuf = reinterpret_cast(src.c_str()); 32 | size_t inbytesleft = sizeof(source_char_type) * src.length(); 33 | while (inbytesleft > 0) { 34 | char buffer[1024]; 35 | char *p = buffer; 36 | size_t outbytesleft = 1024; 37 | int ret = iconv(cd, (ICONV_CONST char **)&inbuf, &inbytesleft, &p, &outbytesleft); 38 | if (ret == -1 && errno != E2BIG) { 39 | return false; 40 | } 41 | dst.append( 42 | reinterpret_cast(buffer), 43 | (1024 - outbytesleft) / 
sizeof(destination_char_type) 44 | ); 45 | } 46 | return true; 47 | } 48 | 49 | int translate_measure(int measure) 50 | { 51 | switch (measure) { 52 | case exact: 53 | return simstring::exact; 54 | case dice: 55 | return simstring::dice; 56 | case cosine: 57 | return simstring::cosine; 58 | case jaccard: 59 | return simstring::jaccard; 60 | case overlap: 61 | return simstring::overlap; 62 | } 63 | throw std::invalid_argument("Unknown similarity measure specified"); 64 | } 65 | 66 | 67 | 68 | typedef simstring::ngram_generator ngram_generator_type; 69 | typedef simstring::writer_base writer_type; 70 | typedef simstring::writer_base uwriter_type; 71 | typedef simstring::reader reader_type; 72 | 73 | writer::writer(const char *filename, int n, bool be, bool unicode) 74 | : m_dbw(NULL), m_gen(NULL), m_unicode(unicode) 75 | { 76 | ngram_generator_type *gen = new ngram_generator_type(n, be); 77 | if (unicode) { 78 | uwriter_type *dbw = new uwriter_type(*gen, filename); 79 | if (dbw->fail()) { 80 | std::string message = dbw->error(); 81 | delete dbw; 82 | delete gen; 83 | throw std::invalid_argument(message); 84 | } 85 | m_dbw = dbw; 86 | m_gen = gen; 87 | 88 | } else { 89 | writer_type *dbw = new writer_type(*gen, filename); 90 | if (dbw->fail()) { 91 | std::string message = dbw->error(); 92 | delete dbw; 93 | delete gen; 94 | throw std::invalid_argument(message); 95 | } 96 | m_dbw = dbw; 97 | m_gen = gen; 98 | } 99 | } 100 | 101 | writer::~writer() 102 | { 103 | if (m_unicode) { 104 | uwriter_type* dbw = reinterpret_cast(m_dbw); 105 | ngram_generator_type* gen = reinterpret_cast(m_gen); 106 | 107 | dbw->close(); 108 | if (dbw->fail()) { 109 | std::string message = dbw->error(); 110 | delete dbw; 111 | delete gen; 112 | throw std::runtime_error(message); 113 | } 114 | delete dbw; 115 | delete gen; 116 | 117 | } else { 118 | writer_type* dbw = reinterpret_cast(m_dbw); 119 | ngram_generator_type* gen = reinterpret_cast(m_gen); 120 | 121 | dbw->close(); 122 | if 
(dbw->fail()) { 123 | std::string message = dbw->error(); 124 | delete dbw; 125 | delete gen; 126 | throw std::runtime_error(message); 127 | } 128 | delete dbw; 129 | delete gen; 130 | } 131 | } 132 | 133 | void writer::insert(const char *string) 134 | { 135 | if (m_unicode) { 136 | uwriter_type* dbw = reinterpret_cast(m_dbw); 137 | 138 | std::wstring str; 139 | iconv_t cd = iconv_open("WCHAR_T", "UTF-8"); 140 | iconv_convert(cd, std::string(string), str); 141 | iconv_close(cd); 142 | 143 | dbw->insert(str); 144 | if (dbw->fail()) { 145 | throw std::runtime_error(dbw->error()); 146 | } 147 | 148 | } else { 149 | writer_type* dbw = reinterpret_cast(m_dbw); 150 | dbw->insert(string); 151 | if (dbw->fail()) { 152 | throw std::runtime_error(dbw->error()); 153 | } 154 | } 155 | } 156 | 157 | void writer::close() 158 | { 159 | if (m_unicode) { 160 | uwriter_type* dbw = reinterpret_cast(m_dbw); 161 | dbw->close(); 162 | if (dbw->fail()) { 163 | throw std::runtime_error(dbw->error()); 164 | } 165 | 166 | } else { 167 | writer_type* dbw = reinterpret_cast(m_dbw); 168 | dbw->close(); 169 | if (dbw->fail()) { 170 | throw std::runtime_error(dbw->error()); 171 | } 172 | } 173 | } 174 | 175 | 176 | 177 | reader::reader(const char *filename) 178 | : m_dbr(NULL), measure(cosine), threshold(0.7) 179 | { 180 | reader_type *dbr = new reader_type; 181 | 182 | if (!dbr->open(filename)) { 183 | delete dbr; 184 | throw std::invalid_argument("Failed to open the database"); 185 | } 186 | 187 | m_dbr = dbr; 188 | } 189 | 190 | reader::~reader() 191 | { 192 | this->close(); 193 | delete reinterpret_cast(m_dbr); 194 | } 195 | 196 | template 197 | void retrieve_thru( 198 | reader_type& dbr, 199 | const std::string& query, 200 | int measure, 201 | double threshold, 202 | insert_iterator_type ins 203 | ) 204 | { 205 | switch (measure) { 206 | case exact: 207 | dbr.retrieve(query, threshold, ins); 208 | break; 209 | case dice: 210 | dbr.retrieve(query, threshold, ins); 211 | break; 212 | case 
cosine: 213 | dbr.retrieve(query, threshold, ins); 214 | break; 215 | case jaccard: 216 | dbr.retrieve(query, threshold, ins); 217 | break; 218 | case overlap: 219 | dbr.retrieve(query, threshold, ins); 220 | break; 221 | } 222 | } 223 | 224 | template 225 | void retrieve_iconv( 226 | reader_type& dbr, 227 | const std::string& query, 228 | const char *encoding, 229 | int measure, 230 | double threshold, 231 | insert_iterator_type ins 232 | ) 233 | { 234 | typedef std::basic_string string_type; 235 | typedef std::vector strings_type; 236 | 237 | // Translate the character encoding of the query string from UTF-8 to the target encoding. 238 | string_type qstr; 239 | iconv_t fwd = iconv_open(encoding, "UTF-8"); 240 | iconv_convert(fwd, query, qstr); 241 | iconv_close(fwd); 242 | 243 | strings_type xstrs; 244 | switch (measure) { 245 | case exact: 246 | dbr.retrieve(qstr, threshold, std::back_inserter(xstrs)); 247 | break; 248 | case dice: 249 | dbr.retrieve(qstr, threshold, std::back_inserter(xstrs)); 250 | break; 251 | case cosine: 252 | dbr.retrieve(qstr, threshold, std::back_inserter(xstrs)); 253 | break; 254 | case jaccard: 255 | dbr.retrieve(qstr, threshold, std::back_inserter(xstrs)); 256 | break; 257 | case overlap: 258 | dbr.retrieve(qstr, threshold, std::back_inserter(xstrs)); 259 | break; 260 | } 261 | 262 | // Translate back the character encoding of retrieved strings into UTF-8. 
263 | iconv_t bwd = iconv_open("UTF-8", encoding); 264 | for (typename strings_type::const_iterator it = xstrs.begin();it != xstrs.end();++it) { 265 | std::string dst; 266 | iconv_convert(bwd, *it, dst); 267 | *ins = dst; 268 | } 269 | iconv_close(bwd); 270 | } 271 | 272 | std::vector reader::retrieve(const char *query) 273 | { 274 | reader_type& dbr = *reinterpret_cast(m_dbr); 275 | std::vector ret; 276 | 277 | switch (dbr.char_size()) { 278 | case 1: 279 | retrieve_thru(dbr, query, this->measure, this->threshold, std::back_inserter(ret)); 280 | break; 281 | case 2: 282 | #if defined(__apple_build_version__) 283 | throw std::runtime_error("UTF16 not supported in macOS, due to compatibility issues with libc++."); 284 | #else 285 | retrieve_iconv(dbr, query, UTF16, this->measure, this->threshold, std::back_inserter(ret)); 286 | #endif 287 | break; 288 | case 4: 289 | #if defined(__apple_build_version__) 290 | throw std::runtime_error("UTF32 not supported in macOS, due to compatibility issues with libc++."); 291 | #else 292 | retrieve_iconv(dbr, query, UTF32, this->measure, this->threshold, std::back_inserter(ret)); 293 | #endif 294 | break; 295 | } 296 | 297 | return ret; 298 | } 299 | 300 | bool reader::check(const char *query) 301 | { 302 | reader_type& dbr = *reinterpret_cast(m_dbr); 303 | 304 | if (dbr.char_size() == 1) { 305 | std::string qstr = query; 306 | return dbr.check(qstr, translate_measure(this->measure), this->threshold); 307 | #if defined(__apple_build_version__) 308 | } else { 309 | throw std::runtime_error("UTF16/32 not supported in macOS, due to compatibility issues with libc++."); 310 | #else 311 | } else if (dbr.char_size() == 2) { 312 | std::basic_string qstr; 313 | iconv_t fwd = iconv_open(UTF16, "UTF-8"); 314 | iconv_convert(fwd, std::string(query), qstr); 315 | iconv_close(fwd); 316 | return dbr.check(qstr, translate_measure(this->measure), this->threshold); 317 | } else if (dbr.char_size() == 4) { 318 | std::basic_string qstr; 319 | iconv_t 
fwd = iconv_open(UTF32, "UTF-8"); 320 | iconv_convert(fwd, std::string(query), qstr); 321 | iconv_close(fwd); 322 | return dbr.check(qstr, translate_measure(this->measure), this->threshold); 323 | #endif 324 | } 325 | 326 | return false; 327 | } 328 | 329 | void reader::close() 330 | { 331 | reader_type& dbr = *reinterpret_cast(m_dbr); 332 | dbr.close(); 333 | } 334 | -------------------------------------------------------------------------------- /frontend/main.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * SimString frontend. 3 | * 4 | * Copyright (c) 2009,2010 Naoaki Okazaki 5 | * All rights reserved. 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions are met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * * Neither the name of the authors nor the names of its contributors may 15 | * be used to endorse or promote products derived from this software 16 | * without specific prior written permission. 17 | * 18 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER 22 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 23 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 25 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 26 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 27 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 28 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | */ 30 | 31 | /* $Id$ */ 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | 45 | #include "optparse.h" 46 | 47 | class option 48 | { 49 | public: 50 | enum { 51 | MODE_RETRIEVE = 0, 52 | MODE_BUILD, 53 | MODE_HELP, 54 | MODE_VERSION, 55 | }; 56 | 57 | enum { 58 | CC_CHAR = 0, // char 59 | CC_WCHAR, // wchar_t 60 | }; 61 | 62 | int mode; 63 | int code; 64 | std::string name; 65 | 66 | int ngram_size; 67 | bool be; 68 | int measure; 69 | double threshold; 70 | bool echo_back; 71 | bool quiet; 72 | bool benchmark; 73 | 74 | public: 75 | option() : 76 | mode(MODE_RETRIEVE), 77 | code(CC_CHAR), 78 | name(""), 79 | ngram_size(3), 80 | be(false), 81 | measure(simstring::cosine), 82 | threshold(0.7), 83 | echo_back(false), 84 | quiet(false), 85 | benchmark(false) 86 | { 87 | } 88 | }; 89 | 90 | class option_parser : 91 | public option, 92 | public optparse 93 | { 94 | BEGIN_OPTION_MAP_INLINE() 95 | ON_OPTION(SHORTOPT('b') || LONGOPT("build")) 96 | mode = MODE_BUILD; 97 | 98 | ON_OPTION_WITH_ARG(SHORTOPT('d') || LONGOPT("database")) 99 | name = arg; 100 | 101 | ON_OPTION(SHORTOPT('u') || LONGOPT("unicode")) 102 | code = CC_WCHAR; 103 | 104 | ON_OPTION_WITH_ARG(SHORTOPT('n') || LONGOPT("ngram")) 105 | ngram_size = std::atoi(arg); 106 | 107 | ON_OPTION(SHORTOPT('m') || LONGOPT("mark")) 108 | be = true; 109 | 
110 | ON_OPTION_WITH_ARG(SHORTOPT('s') || LONGOPT("similarity")) 111 | if (std::strcmp(arg, "exact") == 0) { 112 | measure = simstring::exact; 113 | } else if (std::strcmp(arg, "dice") == 0) { 114 | measure = simstring::dice; 115 | } else if (std::strcmp(arg, "cosine") == 0) { 116 | measure = simstring::cosine; 117 | } else if (std::strcmp(arg, "jaccard") == 0) { 118 | measure = simstring::jaccard; 119 | } else if (std::strcmp(arg, "overlap") == 0) { 120 | measure = simstring::overlap; 121 | } 122 | 123 | ON_OPTION_WITH_ARG(SHORTOPT('t') || LONGOPT("threshold")) 124 | threshold = std::atof(arg); 125 | 126 | ON_OPTION(SHORTOPT('e') || LONGOPT("echo")) 127 | echo_back = true; 128 | 129 | ON_OPTION(SHORTOPT('q') || LONGOPT("quiet")) 130 | quiet = true; 131 | 132 | ON_OPTION(SHORTOPT('p') || LONGOPT("benchmark")) 133 | benchmark = true; 134 | 135 | ON_OPTION(SHORTOPT('v') || LONGOPT("version")) 136 | mode = MODE_VERSION; 137 | 138 | ON_OPTION(SHORTOPT('h') || LONGOPT("help")) 139 | mode = MODE_HELP; 140 | 141 | END_OPTION_MAP() 142 | }; 143 | 144 | int usage(std::ostream& os, const char *argv0) 145 | { 146 | os << "USAGE: " << argv0 << " [OPTIONS]" << std::endl; 147 | os << "This utility finds strings in the database (DB) such that they have similarity," << std::endl; 148 | os << "in the similarity measure (SIM), no smaller than the threshold (TH) with" << std::endl; 149 | os << "queries read from STDIN. When -b (--build) option is specified, this utility" << std::endl; 150 | os << "builds a database (DB) for strings read from STDIN." 
<< std::endl; 151 | os << std::endl; 152 | os << "OPTIONS:" << std::endl; 153 | os << " -b, --build build a database for strings read from STDIN" << std::endl; 154 | os << " -d, --database=DB specify a database file" << std::endl; 155 | os << " -u, --unicode use Unicode (wchar_t) for representing characters" << std::endl; 156 | os << " -n, --ngram=N specify the unit of n-grams (DEFAULT=3)" << std::endl; 157 | os << " -m, --mark include marks for begins and ends of strings" << std::endl; 158 | os << " -s, --similarity=SIM specify a similarity measure (DEFAULT='cosine'):" << std::endl; 159 | os << " exact exact match" << std::endl; 160 | os << " dice dice coefficient" << std::endl; 161 | os << " cosine cosine coefficient" << std::endl; 162 | os << " jaccard jaccard coefficient" << std::endl; 163 | os << " overlap overlap coefficient" << std::endl; 164 | os << " -t, --threshold=TH specify the threshold (DEFAULT=0.7)" << std::endl; 165 | os << " -e, --echo-back echo back query strings to the output" << std::endl; 166 | os << " -q, --quiet suppress supplemental information from the output" << std::endl; 167 | os << " -p, --benchmark show benchmark result (retrieved strings are suppressed)" << std::endl; 168 | os << " -v, --version show this version information and exit" << std::endl; 169 | os << " -h, --help show this help message and exit" << std::endl; 170 | os << std::endl; 171 | return 0; 172 | } 173 | 174 | int version(std::ostream& os) 175 | { 176 | os << SIMSTRING_NAME " "; 177 | os << SIMSTRING_MAJOR_VERSION << "." 
<< SIMSTRING_MINOR_VERSION << " "; 178 | os << SIMSTRING_COPYRIGHT << std::endl; 179 | os << std::endl; 180 | return 0; 181 | } 182 | 183 | template 184 | int build(option& opt, istream_type& is) 185 | { 186 | typedef std::basic_string string_type; 187 | typedef simstring::ngram_generator ngram_generator_type; 188 | typedef simstring::writer_base writer_type; 189 | 190 | std::ostream& os = std::cout; 191 | std::ostream& es = std::cerr; 192 | 193 | // Show the copyright information. 194 | version(os); 195 | 196 | // Show parameters for database construction. 197 | os << "Constructing the database" << std::endl; 198 | os << "Database name: " << opt.name << std::endl; 199 | os << "N-gram length: " << opt.ngram_size << std::endl; 200 | os << "Begin/end marks: " << std::boolalpha << opt.be << std::endl; 201 | os << "Char type: " << typeid(char_type).name() << " (" << sizeof(char_type) << ")" << std::endl; 202 | os.flush(); 203 | 204 | // Open the database for construction. 205 | clock_t clk = std::clock(); 206 | ngram_generator_type gen(opt.ngram_size, opt.be); 207 | writer_type db(gen, opt.name); 208 | if (db.fail()) { 209 | es << "ERROR: " << db.error() << std::endl; 210 | return 1; 211 | } 212 | 213 | // Insert every string from STDIN into the database. 214 | int n = 0; 215 | for (;;) { 216 | // Read a line. 217 | string_type line; 218 | std::getline(is, line); 219 | if (is.eof()) { 220 | break; 221 | } 222 | 223 | // Insert the string. 224 | if (!db.insert(line)) { 225 | es << "ERROR: " << db.error() << std::endl; 226 | return 1; 227 | } 228 | 229 | // Progress report. 230 | if (!opt.quiet && ++n % 10000 == 0) { 231 | os << "Number of strings: " << n << std::endl; 232 | os.flush(); 233 | } 234 | } 235 | os << "Number of strings: " << n << std::endl; 236 | os << std::endl; 237 | os.flush(); 238 | 239 | // Finalize the database. 
240 | os << "Flushing the database" << std::endl; 241 | if (!db.close()) { 242 | es << "ERROR: " << db.error() << std::endl; 243 | return 1; 244 | } 245 | os << std::endl; 246 | 247 | // Report the elaped time for construction. 248 | os << "Total number of strings: " << n << std::endl; 249 | os << "Seconds required: " 250 | << (std::clock() - clk) / (double)CLOCKS_PER_SEC << std::endl; 251 | os << std::endl; 252 | os.flush(); 253 | 254 | return 0; 255 | } 256 | 257 | // widen for strings only with ASCII characters. 258 | template 259 | std::basic_string widen(const std::string& str) 260 | { 261 | std::basic_string dst; 262 | std::string::const_iterator it; 263 | for (it = str.begin();it != str.end();++it) { 264 | dst += static_cast(*it); 265 | } 266 | return dst; 267 | } 268 | 269 | template 270 | int retrieve(option& opt, istream_type& is, ostream_type& os) 271 | { 272 | typedef std::basic_string string_type; 273 | typedef std::vector strings_type; 274 | typedef simstring::reader reader_type; 275 | 276 | std::ostream& es = std::cerr; 277 | 278 | // Open the database. 279 | reader_type db; 280 | if (!db.open(opt.name)) { 281 | es << "ERROR: " << db.error() << std::endl; 282 | return 1; 283 | } 284 | 285 | // Check the size of characters. 286 | if (db.char_size() != sizeof(char_type)) { 287 | es << "ERROR: Inconsistent character encoding " << 288 | "(DB:" << db.char_size() << ", " << 289 | "CUR:" << sizeof(char_type) << "): " << std::endl; 290 | es << "This problem may be solved by specifying -u (--unicode) option." << std::endl; 291 | return 1; 292 | } 293 | 294 | int num_queries = 0; 295 | int num_retrieved = 0; 296 | clock_t clk_total = 0; 297 | for (;;) { 298 | // Read a line. 299 | string_type line; 300 | std::getline(is, line); 301 | if (is.eof()) { 302 | break; 303 | } 304 | 305 | // Issue a query. 
306 | strings_type xstrs; 307 | clock_t clk = std::clock(); 308 | db.retrieve(line, opt.measure, opt.threshold, std::back_inserter(xstrs)); 309 | clock_t elapsed = (std::clock() - clk); 310 | 311 | // Update stats. 312 | clk_total += elapsed; 313 | num_retrieved += (int)xstrs.size(); 314 | ++num_queries; 315 | 316 | // Do not output results when the benchmarking flag is on. 317 | if (!opt.benchmark) { 318 | // Output the query string if necessary. 319 | if (opt.echo_back) { 320 | os << line << std::endl; 321 | } 322 | 323 | // Output the retrieved strings. 324 | typename strings_type::const_iterator it; 325 | for (it = xstrs.begin();it != xstrs.end();++it) { 326 | os << os.widen('\t') << *it << std::endl; 327 | } 328 | os.flush(); 329 | } 330 | 331 | // Do not output information when the quiet flag is on. 332 | if (!opt.quiet) { 333 | os << 334 | xstrs.size() << 335 | widen(" strings retrieved (") << 336 | (std::clock() - clk) / (double)CLOCKS_PER_SEC << 337 | widen(" sec)") << std::endl; 338 | } 339 | } 340 | 341 | // Output the benchmark information if necessary. 342 | if (opt.benchmark) { 343 | os << 344 | widen("Total number of queries: ") << 345 | num_queries << std::endl; 346 | os << 347 | widen("Seconds per query: ") << 348 | clk_total / (double)CLOCKS_PER_SEC / num_queries << std::endl; 349 | os << 350 | widen("Number of retrieved strings per query: ") << 351 | num_retrieved / (double)num_queries << std::endl; 352 | } 353 | 354 | return 0; 355 | } 356 | 357 | int main(int argc, char *argv[]) 358 | { 359 | // Parse the command-line options. 360 | option_parser opt; 361 | try { 362 | int arg_used = opt.parse(argv, argc); 363 | } catch (const optparse::unrecognized_option& e) { 364 | std::cerr << "ERROR: unrecognized option: " << e.what() << std::endl; 365 | return 1; 366 | } catch (const optparse::invalid_value& e) { 367 | std::cerr << "ERROR: " << e.what() << std::endl; 368 | return 1; 369 | } 370 | 371 | // Change the locale of wcin and wcout if necessary. 
372 | if (opt.code == option::CC_WCHAR) { 373 | std::ios_base::sync_with_stdio(false); 374 | std::locale::global(std::locale("")); 375 | std::wcout.imbue(std::locale("")); 376 | std::wcin.imbue(std::locale("")); 377 | } 378 | 379 | // Branches for the processing mode. 380 | switch (opt.mode) { 381 | case option::MODE_HELP: 382 | return usage(std::cout, argv[0]); 383 | case option::MODE_VERSION: 384 | return version(std::cout); 385 | case option::MODE_BUILD: 386 | if (opt.code == option::CC_CHAR) { 387 | return build(opt, std::cin); 388 | } else if (opt.code == option::CC_WCHAR) { 389 | return build(opt, std::wcin); 390 | } 391 | break; 392 | case option::MODE_RETRIEVE: 393 | if (opt.code == option::CC_CHAR) { 394 | return retrieve(opt, std::cin, std::cout); 395 | } else if (opt.code == option::CC_WCHAR) { 396 | return retrieve(opt, std::wcin, std::wcout); 397 | } 398 | break; 399 | } 400 | 401 | // An unknown processing mode. 402 | return 1; 403 | } 404 | -------------------------------------------------------------------------------- /win32/stdint.h: -------------------------------------------------------------------------------- 1 | /* A portable stdint.h 2 | * 3 | * Copyright (c) 2005-2007 Paul Hsieh 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions 7 | * are met: 8 | * 9 | * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 11 | * 12 | * Redistributions in binary form must not misrepresent the orignal 13 | * source in the documentation and/or other materials provided 14 | * with the distribution. 15 | * 16 | * The names of the authors nor its contributors may be used to 17 | * endorse or promote products derived from this software without 18 | * specific prior written permission. 
19 | * 20 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 23 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 24 | * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 25 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 29 | * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 31 | * OF THE POSSIBILITY OF SUCH DAMAGE. 32 | * 33 | **************************************************************************** 34 | * 35 | * Version 0.1.8 36 | * 37 | * The ANSI C standard committee, for the C99 standard, specified the 38 | * inclusion of a new standard include file called stdint.h. This is 39 | * a very useful and long desired include file which contains several 40 | * very precise definitions for integer scalar types that is 41 | * critically important for making portable several classes of 42 | * applications including cryptography, hashing, variable length 43 | * integer libraries and so on. But for most developers its likely 44 | * useful just for programming sanity. 45 | * 46 | * The problem is that most compiler vendors have decided not to 47 | * implement the C99 standard, and the next C++ language standard 48 | * (which has a lot more mindshare these days) will be a long time in 49 | * coming and its unknown whether or not it will include stdint.h or 50 | * how much adoption it will have. 
Either way, it will be a long time 51 | * before all compilers come with a stdint.h and it also does nothing 52 | * for the extremely large number of compilers available today which 53 | * do not include this file, or anything comparable to it. 54 | * 55 | * So that's what this file is all about. Its an attempt to build a 56 | * single universal include file that works on as many platforms as 57 | * possible to deliver what stdint.h is supposed to. A few things 58 | * that should be noted about this file: 59 | * 60 | * 1) It is not guaranteed to be portable and/or present an identical 61 | * interface on all platforms. The extreme variability of the 62 | * ANSI C standard makes this an impossibility right from the 63 | * very get go. Its really only meant to be useful for the vast 64 | * majority of platforms that possess the capability of 65 | * implementing usefully and precisely defined, standard sized 66 | * integer scalars. Systems which are not intrinsically 2s 67 | * complement may produce invalid constants. 68 | * 69 | * 2) There is an unavoidable use of non-reserved symbols. 70 | * 71 | * 3) Other standard include files are invoked. 72 | * 73 | * 4) This file may come in conflict with future platforms that do 74 | * include stdint.h. The hope is that one or the other can be 75 | * used with no real difference. 76 | * 77 | * 5) In the current verison, if your platform can't represent 78 | * int32_t, int16_t and int8_t, it just dumps out with a compiler 79 | * error. 80 | * 81 | * 6) 64 bit integers may or may not be defined. Test for their 82 | * presence with the test: #ifdef INT64_MAX or #ifdef UINT64_MAX. 83 | * Note that this is different from the C99 specification which 84 | * requires the existence of 64 bit support in the compiler. If 85 | * this is not defined for your platform, yet it is capable of 86 | * dealing with 64 bits then it is because this file has not yet 87 | * been extended to cover all of your system's capabilities. 
88 | * 89 | * 7) (u)intptr_t may or may not be defined. Test for its presence 90 | * with the test: #ifdef PTRDIFF_MAX. If this is not defined 91 | * for your platform, then it is because this file has not yet 92 | * been extended to cover all of your system's capabilities, not 93 | * because its optional. 94 | * 95 | * 8) The following might not been defined even if your platform is 96 | * capable of defining it: 97 | * 98 | * WCHAR_MIN 99 | * WCHAR_MAX 100 | * (u)int64_t 101 | * PTRDIFF_MIN 102 | * PTRDIFF_MAX 103 | * (u)intptr_t 104 | * 105 | * 9) The following have not been defined: 106 | * 107 | * WINT_MIN 108 | * WINT_MAX 109 | * 110 | * 10) The criteria for defining (u)int_least(*)_t isn't clear, 111 | * except for systems which don't have a type that precisely 112 | * defined 8, 16, or 32 bit types (which this include file does 113 | * not support anyways). Default definitions have been given. 114 | * 115 | * 11) The criteria for defining (u)int_fast(*)_t isn't something I 116 | * would trust to any particular compiler vendor or the ANSI C 117 | * committee. It is well known that "compatible systems" are 118 | * commonly created that have very different performance 119 | * characteristics from the systems they are compatible with, 120 | * especially those whose vendors make both the compiler and the 121 | * system. Default definitions have been given, but its strongly 122 | * recommended that users never use these definitions for any 123 | * reason (they do *NOT* deliver any serious guarantee of 124 | * improved performance -- not in this file, nor any vendor's 125 | * stdint.h). 
126 | * 127 | * 12) The following macros: 128 | * 129 | * PRINTF_INTMAX_MODIFIER 130 | * PRINTF_INT64_MODIFIER 131 | * PRINTF_INT32_MODIFIER 132 | * PRINTF_INT16_MODIFIER 133 | * PRINTF_LEAST64_MODIFIER 134 | * PRINTF_LEAST32_MODIFIER 135 | * PRINTF_LEAST16_MODIFIER 136 | * PRINTF_INTPTR_MODIFIER 137 | * 138 | * are strings which have been defined as the modifiers required 139 | * for the "d", "u" and "x" printf formats to correctly output 140 | * (u)intmax_t, (u)int64_t, (u)int32_t, (u)int16_t, (u)least64_t, 141 | * (u)least32_t, (u)least16_t and (u)intptr_t types respectively. 142 | * PRINTF_INTPTR_MODIFIER is not defined for some systems which 143 | * provide their own stdint.h. PRINTF_INT64_MODIFIER is not 144 | * defined if INT64_MAX is not defined. These are an extension 145 | * beyond what C99 specifies must be in stdint.h. 146 | * 147 | * In addition, the following macros are defined: 148 | * 149 | * PRINTF_INTMAX_HEX_WIDTH 150 | * PRINTF_INT64_HEX_WIDTH 151 | * PRINTF_INT32_HEX_WIDTH 152 | * PRINTF_INT16_HEX_WIDTH 153 | * PRINTF_INT8_HEX_WIDTH 154 | * PRINTF_INTMAX_DEC_WIDTH 155 | * PRINTF_INT64_DEC_WIDTH 156 | * PRINTF_INT32_DEC_WIDTH 157 | * PRINTF_INT16_DEC_WIDTH 158 | * PRINTF_INT8_DEC_WIDTH 159 | * 160 | * Which specifies the maximum number of characters required to 161 | * print the number of that type in either hexadecimal or decimal. 162 | * These are an extension beyond what C99 specifies must be in 163 | * stdint.h. 164 | * 165 | * Compilers tested (all with 0 warnings at their highest respective 166 | * settings): Borland Turbo C 2.0, WATCOM C/C++ 11.0 (16 bits and 32 167 | * bits), Microsoft Visual C++ 6.0 (32 bit), Microsoft Visual Studio 168 | * .net (VC7), Intel C++ 4.0, GNU gcc v3.3.3 169 | * 170 | * This file should be considered a work in progress. Suggestions for 171 | * improvements, especially those which increase coverage are strongly 172 | * encouraged. 
173 | * 174 | * Acknowledgements 175 | * 176 | * The following people have made significant contributions to the 177 | * development and testing of this file: 178 | * 179 | * Chris Howie 180 | * John Steele Scott 181 | * Dave Thorup 182 | * 183 | */ 184 | 185 | #include 186 | #include 187 | #include 188 | 189 | /* 190 | * For gcc with _STDINT_H, fill in the PRINTF_INT*_MODIFIER macros, and 191 | * do nothing else. On the Mac OS X version of gcc this is _STDINT_H_. 192 | */ 193 | 194 | #if ((defined(__STDC__) && __STDC__ && __STDC_VERSION__ >= 199901L) || (defined (__WATCOMC__) && (defined (_STDINT_H_INCLUDED) || __WATCOMC__ >= 1250)) || (defined(__GNUC__) && (defined(_STDINT_H) || defined(_STDINT_H_)) )) && !defined (_PSTDINT_H_INCLUDED) 195 | #include 196 | #define _PSTDINT_H_INCLUDED 197 | # ifndef PRINTF_INT64_MODIFIER 198 | # define PRINTF_INT64_MODIFIER "ll" 199 | # endif 200 | # ifndef PRINTF_INT32_MODIFIER 201 | # define PRINTF_INT32_MODIFIER "l" 202 | # endif 203 | # ifndef PRINTF_INT16_MODIFIER 204 | # define PRINTF_INT16_MODIFIER "h" 205 | # endif 206 | # ifndef PRINTF_INTMAX_MODIFIER 207 | # define PRINTF_INTMAX_MODIFIER PRINTF_INT64_MODIFIER 208 | # endif 209 | # ifndef PRINTF_INT64_HEX_WIDTH 210 | # define PRINTF_INT64_HEX_WIDTH "16" 211 | # endif 212 | # ifndef PRINTF_INT32_HEX_WIDTH 213 | # define PRINTF_INT32_HEX_WIDTH "8" 214 | # endif 215 | # ifndef PRINTF_INT16_HEX_WIDTH 216 | # define PRINTF_INT16_HEX_WIDTH "4" 217 | # endif 218 | # ifndef PRINTF_INT8_HEX_WIDTH 219 | # define PRINTF_INT8_HEX_WIDTH "2" 220 | # endif 221 | # ifndef PRINTF_INT64_DEC_WIDTH 222 | # define PRINTF_INT64_DEC_WIDTH "20" 223 | # endif 224 | # ifndef PRINTF_INT32_DEC_WIDTH 225 | # define PRINTF_INT32_DEC_WIDTH "10" 226 | # endif 227 | # ifndef PRINTF_INT16_DEC_WIDTH 228 | # define PRINTF_INT16_DEC_WIDTH "5" 229 | # endif 230 | # ifndef PRINTF_INT8_DEC_WIDTH 231 | # define PRINTF_INT8_DEC_WIDTH "3" 232 | # endif 233 | # ifndef PRINTF_INTMAX_HEX_WIDTH 234 | # define 
PRINTF_INTMAX_HEX_WIDTH PRINTF_INT64_HEX_WIDTH 235 | # endif 236 | # ifndef PRINTF_INTMAX_DEC_WIDTH 237 | # define PRINTF_INTMAX_DEC_WIDTH PRINTF_INT64_DEC_WIDTH 238 | # endif 239 | #endif 240 | 241 | #ifndef _PSTDINT_H_INCLUDED 242 | #define _PSTDINT_H_INCLUDED 243 | 244 | #ifndef SIZE_MAX 245 | # define SIZE_MAX (~(size_t)0) 246 | #endif 247 | 248 | /* 249 | * Deduce the type assignments from limits.h under the assumption that 250 | * integer sizes in bits are powers of 2, and follow the ANSI 251 | * definitions. 252 | */ 253 | 254 | #ifndef UINT8_MAX 255 | # define UINT8_MAX 0xff 256 | #endif 257 | #ifndef uint8_t 258 | # if (UCHAR_MAX == UINT8_MAX) || defined (S_SPLINT_S) 259 | typedef unsigned char uint8_t; 260 | # define UINT8_C(v) ((uint8_t) v) 261 | # else 262 | # error "Platform not supported" 263 | # endif 264 | #endif 265 | 266 | #ifndef INT8_MAX 267 | # define INT8_MAX 0x7f 268 | #endif 269 | #ifndef INT8_MIN 270 | # define INT8_MIN INT8_C(0x80) 271 | #endif 272 | #ifndef int8_t 273 | # if (SCHAR_MAX == INT8_MAX) || defined (S_SPLINT_S) 274 | typedef signed char int8_t; 275 | # define INT8_C(v) ((int8_t) v) 276 | # else 277 | # error "Platform not supported" 278 | # endif 279 | #endif 280 | 281 | #ifndef UINT16_MAX 282 | # define UINT16_MAX 0xffff 283 | #endif 284 | #ifndef uint16_t 285 | #if (UINT_MAX == UINT16_MAX) || defined (S_SPLINT_S) 286 | typedef unsigned int uint16_t; 287 | # ifndef PRINTF_INT16_MODIFIER 288 | # define PRINTF_INT16_MODIFIER "" 289 | # endif 290 | # define UINT16_C(v) ((uint16_t) (v)) 291 | #elif (USHRT_MAX == UINT16_MAX) 292 | typedef unsigned short uint16_t; 293 | # define UINT16_C(v) ((uint16_t) (v)) 294 | # ifndef PRINTF_INT16_MODIFIER 295 | # define PRINTF_INT16_MODIFIER "h" 296 | # endif 297 | #else 298 | #error "Platform not supported" 299 | #endif 300 | #endif 301 | 302 | #ifndef INT16_MAX 303 | # define INT16_MAX 0x7fff 304 | #endif 305 | #ifndef INT16_MIN 306 | # define INT16_MIN INT16_C(0x8000) 307 | #endif 308 | 
#ifndef int16_t 309 | #if (INT_MAX == INT16_MAX) || defined (S_SPLINT_S) 310 | typedef signed int int16_t; 311 | # define INT16_C(v) ((int16_t) (v)) 312 | # ifndef PRINTF_INT16_MODIFIER 313 | # define PRINTF_INT16_MODIFIER "" 314 | # endif 315 | #elif (SHRT_MAX == INT16_MAX) 316 | typedef signed short int16_t; 317 | # define INT16_C(v) ((int16_t) (v)) 318 | # ifndef PRINTF_INT16_MODIFIER 319 | # define PRINTF_INT16_MODIFIER "h" 320 | # endif 321 | #else 322 | #error "Platform not supported" 323 | #endif 324 | #endif 325 | 326 | #ifndef UINT32_MAX 327 | # define UINT32_MAX (0xffffffffUL) 328 | #endif 329 | #ifndef uint32_t 330 | #if (ULONG_MAX == UINT32_MAX) || defined (S_SPLINT_S) 331 | typedef unsigned long uint32_t; 332 | # define UINT32_C(v) v ## UL 333 | # ifndef PRINTF_INT32_MODIFIER 334 | # define PRINTF_INT32_MODIFIER "l" 335 | # endif 336 | #elif (UINT_MAX == UINT32_MAX) 337 | typedef unsigned int uint32_t; 338 | # ifndef PRINTF_INT32_MODIFIER 339 | # define PRINTF_INT32_MODIFIER "" 340 | # endif 341 | # define UINT32_C(v) v ## U 342 | #elif (USHRT_MAX == UINT32_MAX) 343 | typedef unsigned short uint32_t; 344 | # define UINT32_C(v) ((unsigned short) (v)) 345 | # ifndef PRINTF_INT32_MODIFIER 346 | # define PRINTF_INT32_MODIFIER "" 347 | # endif 348 | #else 349 | #error "Platform not supported" 350 | #endif 351 | #endif 352 | 353 | #ifndef INT32_MAX 354 | # define INT32_MAX (0x7fffffffL) 355 | #endif 356 | #ifndef INT32_MIN 357 | # define INT32_MIN INT32_C(0x80000000) 358 | #endif 359 | #ifndef int32_t 360 | #if (LONG_MAX == INT32_MAX) || defined (S_SPLINT_S) 361 | typedef signed long int32_t; 362 | # define INT32_C(v) v ## L 363 | # ifndef PRINTF_INT32_MODIFIER 364 | # define PRINTF_INT32_MODIFIER "l" 365 | # endif 366 | #elif (INT_MAX == INT32_MAX) 367 | typedef signed int int32_t; 368 | # define INT32_C(v) v 369 | # ifndef PRINTF_INT32_MODIFIER 370 | # define PRINTF_INT32_MODIFIER "" 371 | # endif 372 | #elif (SHRT_MAX == INT32_MAX) 373 | typedef signed 
short int32_t; 374 | # define INT32_C(v) ((short) (v)) 375 | # ifndef PRINTF_INT32_MODIFIER 376 | # define PRINTF_INT32_MODIFIER "" 377 | # endif 378 | #else 379 | #error "Platform not supported" 380 | #endif 381 | #endif 382 | 383 | /* 384 | * The macro stdint_int64_defined is temporarily used to record 385 | * whether or not 64 integer support is available. It must be 386 | * defined for any 64 integer extensions for new platforms that are 387 | * added. 388 | */ 389 | 390 | #undef stdint_int64_defined 391 | #if (defined(__STDC__) && defined(__STDC_VERSION__)) || defined (S_SPLINT_S) 392 | # if (__STDC__ && __STDC_VERSION >= 199901L) || defined (S_SPLINT_S) 393 | # define stdint_int64_defined 394 | typedef long long int64_t; 395 | typedef unsigned long long uint64_t; 396 | # define UINT64_C(v) v ## ULL 397 | # define INT64_C(v) v ## LL 398 | # ifndef PRINTF_INT64_MODIFIER 399 | # define PRINTF_INT64_MODIFIER "ll" 400 | # endif 401 | # endif 402 | #endif 403 | 404 | #if !defined (stdint_int64_defined) 405 | # if defined(__GNUC__) 406 | # define stdint_int64_defined 407 | __extension__ typedef long long int64_t; 408 | __extension__ typedef unsigned long long uint64_t; 409 | # define UINT64_C(v) v ## ULL 410 | # define INT64_C(v) v ## LL 411 | # ifndef PRINTF_INT64_MODIFIER 412 | # define PRINTF_INT64_MODIFIER "ll" 413 | # endif 414 | # elif defined(__MWERKS__) || defined (__SUNPRO_C) || defined (__SUNPRO_CC) || defined (__APPLE_CC__) || defined (_LONG_LONG) || defined (_CRAYC) || defined (S_SPLINT_S) 415 | # define stdint_int64_defined 416 | typedef long long int64_t; 417 | typedef unsigned long long uint64_t; 418 | # define UINT64_C(v) v ## ULL 419 | # define INT64_C(v) v ## LL 420 | # ifndef PRINTF_INT64_MODIFIER 421 | # define PRINTF_INT64_MODIFIER "ll" 422 | # endif 423 | # elif (defined(__WATCOMC__) && defined(__WATCOM_INT64__)) || (defined(_MSC_VER) && _INTEGRAL_MAX_BITS >= 64) || (defined (__BORLANDC__) && __BORLANDC__ > 0x460) || defined (__alpha) || 
defined (__DECC) 424 | # define stdint_int64_defined 425 | typedef __int64 int64_t; 426 | typedef unsigned __int64 uint64_t; 427 | # define UINT64_C(v) v ## UI64 428 | # define INT64_C(v) v ## I64 429 | # ifndef PRINTF_INT64_MODIFIER 430 | # define PRINTF_INT64_MODIFIER "I64" 431 | # endif 432 | # endif 433 | #endif 434 | 435 | #if !defined (LONG_LONG_MAX) && defined (INT64_C) 436 | # define LONG_LONG_MAX INT64_C (9223372036854775807) 437 | #endif 438 | #ifndef ULONG_LONG_MAX 439 | # define ULONG_LONG_MAX UINT64_C (18446744073709551615) 440 | #endif 441 | 442 | #if !defined (INT64_MAX) && defined (INT64_C) 443 | # define INT64_MAX INT64_C (9223372036854775807) 444 | #endif 445 | #if !defined (INT64_MIN) && defined (INT64_C) 446 | # define INT64_MIN INT64_C (-9223372036854775808) 447 | #endif 448 | #if !defined (UINT64_MAX) && defined (INT64_C) 449 | # define UINT64_MAX UINT64_C (18446744073709551615) 450 | #endif 451 | 452 | /* 453 | * Width of hexadecimal for number field. 454 | */ 455 | 456 | #ifndef PRINTF_INT64_HEX_WIDTH 457 | # define PRINTF_INT64_HEX_WIDTH "16" 458 | #endif 459 | #ifndef PRINTF_INT32_HEX_WIDTH 460 | # define PRINTF_INT32_HEX_WIDTH "8" 461 | #endif 462 | #ifndef PRINTF_INT16_HEX_WIDTH 463 | # define PRINTF_INT16_HEX_WIDTH "4" 464 | #endif 465 | #ifndef PRINTF_INT8_HEX_WIDTH 466 | # define PRINTF_INT8_HEX_WIDTH "2" 467 | #endif 468 | 469 | #ifndef PRINTF_INT64_DEC_WIDTH 470 | # define PRINTF_INT64_DEC_WIDTH "20" 471 | #endif 472 | #ifndef PRINTF_INT32_DEC_WIDTH 473 | # define PRINTF_INT32_DEC_WIDTH "10" 474 | #endif 475 | #ifndef PRINTF_INT16_DEC_WIDTH 476 | # define PRINTF_INT16_DEC_WIDTH "5" 477 | #endif 478 | #ifndef PRINTF_INT8_DEC_WIDTH 479 | # define PRINTF_INT8_DEC_WIDTH "3" 480 | #endif 481 | 482 | /* 483 | * Ok, lets not worry about 128 bit integers for now. Moore's law says 484 | * we don't need to worry about that until about 2040 at which point 485 | * we'll have bigger things to worry about. 
486 | */ 487 | 488 | #ifdef stdint_int64_defined 489 | typedef int64_t intmax_t; 490 | typedef uint64_t uintmax_t; 491 | # define INTMAX_MAX INT64_MAX 492 | # define INTMAX_MIN INT64_MIN 493 | # define UINTMAX_MAX UINT64_MAX 494 | # define UINTMAX_C(v) UINT64_C(v) 495 | # define INTMAX_C(v) INT64_C(v) 496 | # ifndef PRINTF_INTMAX_MODIFIER 497 | # define PRINTF_INTMAX_MODIFIER PRINTF_INT64_MODIFIER 498 | # endif 499 | # ifndef PRINTF_INTMAX_HEX_WIDTH 500 | # define PRINTF_INTMAX_HEX_WIDTH PRINTF_INT64_HEX_WIDTH 501 | # endif 502 | # ifndef PRINTF_INTMAX_DEC_WIDTH 503 | # define PRINTF_INTMAX_DEC_WIDTH PRINTF_INT64_DEC_WIDTH 504 | # endif 505 | #else 506 | typedef int32_t intmax_t; 507 | typedef uint32_t uintmax_t; 508 | # define INTMAX_MAX INT32_MAX 509 | # define UINTMAX_MAX UINT32_MAX 510 | # define UINTMAX_C(v) UINT32_C(v) 511 | # define INTMAX_C(v) INT32_C(v) 512 | # ifndef PRINTF_INTMAX_MODIFIER 513 | # define PRINTF_INTMAX_MODIFIER PRINTF_INT32_MODIFIER 514 | # endif 515 | # ifndef PRINTF_INTMAX_HEX_WIDTH 516 | # define PRINTF_INTMAX_HEX_WIDTH PRINTF_INT32_HEX_WIDTH 517 | # endif 518 | # ifndef PRINTF_INTMAX_DEC_WIDTH 519 | # define PRINTF_INTMAX_DEC_WIDTH PRINTF_INT32_DEC_WIDTH 520 | # endif 521 | #endif 522 | 523 | /* 524 | * Because this file currently only supports platforms which have 525 | * precise powers of 2 as bit sizes for the default integers, the 526 | * least definitions are all trivial. Its possible that a future 527 | * version of this file could have different definitions. 
528 | */ 529 | 530 | #ifndef stdint_least_defined 531 | typedef int8_t int_least8_t; 532 | typedef uint8_t uint_least8_t; 533 | typedef int16_t int_least16_t; 534 | typedef uint16_t uint_least16_t; 535 | typedef int32_t int_least32_t; 536 | typedef uint32_t uint_least32_t; 537 | # define PRINTF_LEAST32_MODIFIER PRINTF_INT32_MODIFIER 538 | # define PRINTF_LEAST16_MODIFIER PRINTF_INT16_MODIFIER 539 | # define UINT_LEAST8_MAX UINT8_MAX 540 | # define INT_LEAST8_MAX INT8_MAX 541 | # define UINT_LEAST16_MAX UINT16_MAX 542 | # define INT_LEAST16_MAX INT16_MAX 543 | # define UINT_LEAST32_MAX UINT32_MAX 544 | # define INT_LEAST32_MAX INT32_MAX 545 | # define INT_LEAST8_MIN INT8_MIN 546 | # define INT_LEAST16_MIN INT16_MIN 547 | # define INT_LEAST32_MIN INT32_MIN 548 | # ifdef stdint_int64_defined 549 | typedef int64_t int_least64_t; 550 | typedef uint64_t uint_least64_t; 551 | # define PRINTF_LEAST64_MODIFIER PRINTF_INT64_MODIFIER 552 | # define UINT_LEAST64_MAX UINT64_MAX 553 | # define INT_LEAST64_MAX INT64_MAX 554 | # define INT_LEAST64_MIN INT64_MIN 555 | # endif 556 | #endif 557 | #undef stdint_least_defined 558 | 559 | /* 560 | * The ANSI C committee pretending to know or specify anything about 561 | * performance is the epitome of misguided arrogance. The mandate of 562 | * this file is to *ONLY* ever support that absolute minimum 563 | * definition of the fast integer types, for compatibility purposes. 564 | * No extensions, and no attempt to suggest what may or may not be a 565 | * faster integer type will ever be made in this file. Developers are 566 | * warned to stay away from these types when using this or any other 567 | * stdint.h. 
568 | */ 569 | 570 | typedef int_least8_t int_fast8_t; 571 | typedef uint_least8_t uint_fast8_t; 572 | typedef int_least16_t int_fast16_t; 573 | typedef uint_least16_t uint_fast16_t; 574 | typedef int_least32_t int_fast32_t; 575 | typedef uint_least32_t uint_fast32_t; 576 | #define UINT_FAST8_MAX UINT_LEAST8_MAX 577 | #define INT_FAST8_MAX INT_LEAST8_MAX 578 | #define UINT_FAST16_MAX UINT_LEAST16_MAX 579 | #define INT_FAST16_MAX INT_LEAST16_MAX 580 | #define UINT_FAST32_MAX UINT_LEAST32_MAX 581 | #define INT_FAST32_MAX INT_LEAST32_MAX 582 | #define INT_FAST8_MIN INT_LEAST8_MIN 583 | #define INT_FAST16_MIN INT_LEAST16_MIN 584 | #define INT_FAST32_MIN INT_LEAST32_MIN 585 | #ifdef stdint_int64_defined 586 | typedef int_least64_t int_fast64_t; 587 | typedef uint_least64_t uint_fast64_t; 588 | # define UINT_FAST64_MAX UINT_LEAST64_MAX 589 | # define INT_FAST64_MAX INT_LEAST64_MAX 590 | # define INT_FAST64_MIN INT_LEAST64_MIN 591 | #endif 592 | 593 | #undef stdint_int64_defined 594 | 595 | /* 596 | * Whatever piecemeal, per compiler thing we can do about the wchar_t 597 | * type limits. 598 | */ 599 | 600 | #if defined(__WATCOMC__) || defined(_MSC_VER) || defined (__GNUC__) 601 | # include 602 | # ifndef WCHAR_MIN 603 | # define WCHAR_MIN 0 604 | # endif 605 | # ifndef WCHAR_MAX 606 | # define WCHAR_MAX ((wchar_t)-1) 607 | # endif 608 | #endif 609 | 610 | /* 611 | * Whatever piecemeal, per compiler/platform thing we can do about the 612 | * (u)intptr_t types and limits.
613 | */ 614 | 615 | #if defined (_MSC_VER) && defined (_UINTPTR_T_DEFINED) 616 | # define STDINT_H_UINTPTR_T_DEFINED 617 | #endif 618 | 619 | #ifndef STDINT_H_UINTPTR_T_DEFINED 620 | # if defined (__alpha__) || defined (__ia64__) || defined (__x86_64__) || defined (_WIN64) 621 | # define stdint_intptr_bits 64 622 | # elif defined (__WATCOMC__) || defined (__TURBOC__) 623 | # if defined(__TINY__) || defined(__SMALL__) || defined(__MEDIUM__) 624 | # define stdint_intptr_bits 16 625 | # else 626 | # define stdint_intptr_bits 32 627 | # endif 628 | # elif defined (__i386__) || defined (_WIN32) || defined (WIN32) 629 | # define stdint_intptr_bits 32 630 | # elif defined (__INTEL_COMPILER) 631 | /* TODO -- what will Intel do about x86-64? */ 632 | # endif 633 | 634 | # ifdef stdint_intptr_bits 635 | # define stdint_intptr_glue3_i(a,b,c) a##b##c 636 | # define stdint_intptr_glue3(a,b,c) stdint_intptr_glue3_i(a,b,c) 637 | # ifndef PRINTF_INTPTR_MODIFIER 638 | # define PRINTF_INTPTR_MODIFIER stdint_intptr_glue3(PRINTF_INT,stdint_intptr_bits,_MODIFIER) 639 | # endif 640 | # ifndef PTRDIFF_MAX 641 | # define PTRDIFF_MAX stdint_intptr_glue3(INT,stdint_intptr_bits,_MAX) 642 | # endif 643 | # ifndef PTRDIFF_MIN 644 | # define PTRDIFF_MIN stdint_intptr_glue3(INT,stdint_intptr_bits,_MIN) 645 | # endif 646 | # ifndef UINTPTR_MAX 647 | # define UINTPTR_MAX stdint_intptr_glue3(UINT,stdint_intptr_bits,_MAX) 648 | # endif 649 | # ifndef INTPTR_MAX 650 | # define INTPTR_MAX stdint_intptr_glue3(INT,stdint_intptr_bits,_MAX) 651 | # endif 652 | # ifndef INTPTR_MIN 653 | # define INTPTR_MIN stdint_intptr_glue3(INT,stdint_intptr_bits,_MIN) 654 | # endif 655 | # ifndef INTPTR_C 656 | # define INTPTR_C(x) stdint_intptr_glue3(INT,stdint_intptr_bits,_C)(x) 657 | # endif 658 | # ifndef UINTPTR_C 659 | # define UINTPTR_C(x) stdint_intptr_glue3(UINT,stdint_intptr_bits,_C)(x) 660 | # endif 661 | typedef stdint_intptr_glue3(uint,stdint_intptr_bits,_t) uintptr_t; 662 | typedef stdint_intptr_glue3( 
int,stdint_intptr_bits,_t) intptr_t; 663 | # else 664 | /* TODO -- This following is likely wrong for some platforms, and does 665 | nothing for the definition of uintptr_t. */ 666 | typedef ptrdiff_t intptr_t; 667 | # endif 668 | # define STDINT_H_UINTPTR_T_DEFINED 669 | #endif 670 | 671 | /* 672 | * Assumes sig_atomic_t is signed and we have a 2s complement machine. 673 | */ 674 | 675 | #ifndef SIG_ATOMIC_MAX 676 | # define SIG_ATOMIC_MAX ((((sig_atomic_t) 1) << (sizeof (sig_atomic_t)*CHAR_BIT-1)) - 1) 677 | #endif 678 | 679 | #endif 680 | -------------------------------------------------------------------------------- /include/simstring/cdbpp.h: -------------------------------------------------------------------------------- 1 | /* 2 | * C++ implementation of Constant Database (CDB++) 3 | * 4 | * Copyright (c) 2008-2010 Naoaki Okazaki 5 | * All rights reserved. 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions are met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * * Neither the name of the authors nor the names of its contributors may 15 | * be used to endorse or promote products derived from this software 16 | * without specific prior written permission. 17 | * 18 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER 22 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 23 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 25 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 26 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 27 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 28 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | */ 30 | 31 | /* $Id$ */ 32 | 33 | #ifndef __CDBPP_H__ 34 | #define __CDBPP_H__ 35 | 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | 44 | namespace cdbpp 45 | { 46 | 47 | /** 48 | * \addtogroup cdbpp_api CDB++ API 49 | * @{ 50 | * 51 | * The CDB++ API. 52 | */ 53 | 54 | // Global constants. 55 | enum { 56 | // Version number. 57 | CDBPP_VERSION = 1, 58 | // The number of hash tables. 59 | NUM_TABLES = 256, 60 | // A constant for byte-order checking. 61 | BYTEORDER_CHECK = 0x62445371, 62 | }; 63 | 64 | 65 | 66 | 67 | /** 68 | * MurmurHash2. 69 | * 70 | * This code makes the following assumption about how your machine behaves 71 | * - We can read a 4-byte value from any address without crashing. 72 | * 73 | * It also has a few limitations: 74 | * - It will not work incrementally. 75 | * - It will not produce the same results on little-endian and big-endian 76 | * machines. 77 | * 78 | * @author Austin Appleby 79 | */ 80 | class murmurhash2 : 81 | public std::binary_function 82 | { 83 | protected: 84 | inline static uint32_t get32bits(const char *d) 85 | { 86 | return *reinterpret_cast(d); 87 | } 88 | 89 | public: 90 | inline uint32_t operator() (const void *key, size_t size) const 91 | { 92 | // 'm' and 'r' are mixing constants generated offline. 93 | // They're not really 'magic', they just happen to work well. 
94 | 95 | const uint32_t m = 0x5bd1e995; 96 | const int32_t r = 24; 97 | 98 | // Initialize the hash to a 'random' value 99 | 100 | const uint32_t seed = 0x87654321; 101 | uint32_t h = seed ^ size; 102 | 103 | // Mix 4 bytes at a time into the hash 104 | 105 | const char * data = (const char *)key; 106 | 107 | while (size >= 4) 108 | { 109 | uint32_t k = get32bits(data); 110 | 111 | k *= m; 112 | k ^= k >> r; 113 | k *= m; 114 | 115 | h *= m; 116 | h ^= k; 117 | 118 | data += 4; 119 | size -= 4; 120 | } 121 | 122 | // Handle the last few bytes of the input array 123 | 124 | switch (size) 125 | { 126 | case 3: h ^= data[2] << 16; 127 | case 2: h ^= data[1] << 8; 128 | case 1: h ^= data[0]; 129 | h *= m; 130 | }; 131 | 132 | // Do a few final mixes of the hash to ensure the last few 133 | // bytes are well-incorporated. 134 | 135 | h ^= h >> 13; 136 | h *= m; 137 | h ^= h >> 15; 138 | 139 | return h; 140 | } 141 | }; 142 | 143 | 144 | 145 | 146 | struct tableref_t 147 | { 148 | uint32_t offset; // Offset to a hash table. 149 | uint32_t num; // Number of elements in the hash table. 150 | }; 151 | 152 | 153 | static uint32_t get_data_begin() 154 | { 155 | return (16 + sizeof(tableref_t) * NUM_TABLES); 156 | } 157 | 158 | 159 | 160 | /** 161 | * Exception class for the CDB++ builder. 162 | */ 163 | class builder_exception : public std::invalid_argument 164 | { 165 | public: 166 | builder_exception(const std::string& msg) 167 | : std::invalid_argument(msg) 168 | { 169 | } 170 | }; 171 | 172 | 173 | 174 | /** 175 | * CDB++ builder. 176 | */ 177 | template 178 | class builder_base 179 | { 180 | protected: 181 | // A bucket structure. 182 | struct bucket 183 | { 184 | uint32_t hash; // Hash value of the record. 185 | uint32_t offset; // Offset address to the actual record. 186 | 187 | bucket() : hash(0), offset(0) 188 | { 189 | } 190 | 191 | bucket(uint32_t h, uint32_t o) : hash(h), offset(o) 192 | { 193 | } 194 | }; 195 | 196 | // A hash table is a vector of buckets. 
197 | typedef std::vector hashtable; 198 | 199 | protected: 200 | std::ofstream& m_os; // Output stream. 201 | uint32_t m_begin; 202 | uint32_t m_cur; 203 | hashtable m_ht[NUM_TABLES]; // Hash tables. 204 | 205 | public: 206 | /** 207 | * Constructs an object. 208 | * @param os The output stream to which this class write the 209 | * database. This stream must be opened in the 210 | * binary mode (\c std::ios_base::binary). 211 | */ 212 | builder_base(std::ofstream& os) : m_os(os) 213 | { 214 | m_begin = (uint32_t)m_os.tellp(); 215 | m_cur = get_data_begin(); 216 | m_os.seekp(m_begin + m_cur); 217 | } 218 | 219 | /** 220 | * Destructs an object. 221 | */ 222 | virtual ~builder_base() 223 | { 224 | this->close(); 225 | } 226 | 227 | /** 228 | * Inserts a pair of key and value to the database. 229 | * Any key in the database should be unique, but this library does not 230 | * check duplicated keys. 231 | * @param key The pointer to the key. 232 | * @param ksize The size of the key. 233 | * @param value The pointer to the value. 234 | * @param vsize The size of the value. 235 | */ 236 | template 237 | void put(const key_t *key, size_t ksize, const value_t *value, size_t vsize) 238 | { 239 | // Write out the current record. 240 | write_uint32((uint32_t)ksize); 241 | m_os.write(reinterpret_cast(key), ksize); 242 | write_uint32((uint32_t)vsize); 243 | m_os.write(reinterpret_cast(value), vsize); 244 | 245 | // Compute the hash value and choose a hash table. 246 | uint32_t hv = hash_function()(static_cast(key), ksize); 247 | hashtable& ht = m_ht[hv % NUM_TABLES]; 248 | 249 | // Store the hash value and offset to the hash table. 250 | ht.push_back(bucket(hv, m_cur)); 251 | 252 | // Increment the current position. 253 | m_cur += sizeof(uint32_t) + ksize + sizeof(uint32_t) + vsize; 254 | } 255 | 256 | protected: 257 | void close() 258 | { 259 | // Check the consistency of the stream offset. 
260 | if (m_begin + m_cur != (uint32_t)m_os.tellp()) { 261 | throw builder_exception("Inconsistent stream offset"); 262 | } 263 | 264 | // Store the hash tables. At this moment, the file pointer refers to 265 | // the offset succeeding the last key/value pair. 266 | for (size_t i = 0;i < NUM_TABLES;++i) { 267 | hashtable& ht = m_ht[i]; 268 | 269 | // Do not write an empty hash table. 270 | if (!ht.empty()) { 271 | // An actual table will have the double size; half elements 272 | // in the table are kept empty. 273 | int n = ht.size() * 2; 274 | 275 | // Allocate the actual table. 276 | bucket* dst = new bucket[n]; 277 | 278 | // Put hash elements to the table with the open-address method. 279 | typename hashtable::const_iterator it; 280 | for (it = ht.begin();it != ht.end();++it) { 281 | int k = (it->hash >> 8) % n; 282 | 283 | // Find a vacant element. 284 | while (dst[k].offset != 0) { 285 | k = (k+1) % n; 286 | } 287 | 288 | // Store the hash element. 289 | dst[k].hash = it->hash; 290 | dst[k].offset = it->offset; 291 | } 292 | 293 | // Write out the new table. 294 | for (int k = 0;k < n;++k) { 295 | write_uint32(dst[k].hash); 296 | write_uint32(dst[k].offset); 297 | } 298 | 299 | // Free the table. 300 | delete[] dst; 301 | } 302 | } 303 | 304 | // Store the current position. 305 | uint32_t offset = (uint32_t)m_os.tellp(); 306 | 307 | // Rewind the stream position to the beginning. 308 | m_os.seekp(m_begin); 309 | 310 | // Write the file header. 311 | char chunkid[4] = {'C','D','B','+'}; 312 | m_os.write(chunkid, 4); 313 | write_uint32(offset - m_begin); 314 | write_uint32(CDBPP_VERSION); 315 | write_uint32(BYTEORDER_CHECK); 316 | 317 | // Write references to hash tables. At this moment, dbw->cur points 318 | // to the offset succeeding the last key/data pair. 319 | for (size_t i = 0;i < NUM_TABLES;++i) { 320 | // Offset to the hash table (or zero for non-existent tables). 321 | write_uint32(m_ht[i].empty() ? 
0 : m_cur); 322 | // Bucket size is double to the number of elements. 323 | write_uint32(m_ht[i].size() * 2); 324 | // Advance the offset counter. 325 | m_cur += sizeof(uint32_t) * 2 * m_ht[i].size() * 2; 326 | } 327 | 328 | // Seek to the last position. 329 | m_os.seekp(offset); 330 | } 331 | 332 | inline void write_uint32(uint32_t value) 333 | { 334 | m_os.write(reinterpret_cast(&value), sizeof(value)); 335 | } 336 | }; 337 | 338 | 339 | 340 | /** 341 | * Exception class for the CDB++ reader. 342 | */ 343 | class cdbpp_exception : public std::invalid_argument 344 | { 345 | public: 346 | cdbpp_exception(const std::string& msg) 347 | : std::invalid_argument(msg) 348 | { 349 | } 350 | }; 351 | 352 | /** 353 | * CDB++ reader. 354 | */ 355 | template 356 | class cdbpp_base 357 | { 358 | protected: 359 | struct bucket_t 360 | { 361 | uint32_t hash; // Hash value of the record. 362 | uint32_t offset; // Offset address to the actual record. 363 | }; 364 | 365 | 366 | struct hashtable_t 367 | { 368 | uint32_t num; // Number of elements in the table. 369 | const bucket_t* buckets; // Buckets (array of bucket). 370 | }; 371 | 372 | 373 | protected: 374 | const uint8_t* m_buffer; // Pointer to the memory block. 375 | size_t m_size; // Size of the memory block. 376 | bool m_own; // 377 | 378 | hashtable_t m_ht[NUM_TABLES]; // Hash tables. 379 | size_t m_n; 380 | 381 | public: 382 | /** 383 | * Constructs an object. 384 | */ 385 | cdbpp_base() 386 | : m_buffer(NULL), m_size(0), m_own(false), m_n(0) 387 | { 388 | } 389 | 390 | /** 391 | * Constructs an object by opening a database on memory. 392 | * @param buffer The pointer to the memory image of the database. 393 | * @param size The size of the memory image. 394 | * @param own If this is set to \c true, this library will call 395 | * delete[] when the database is closed. 
396 | */ 397 | cdbpp_base(const void *buffer, size_t size, bool own) 398 | : m_buffer(NULL), m_size(0), m_own(false), m_n(0) 399 | { 400 | this->open(buffer, size, own); 401 | } 402 | 403 | /** 404 | * Constructs an object by opening a database from an input stream. 405 | * @param ifs The input stream from which this library reads 406 | * a database. 407 | */ 408 | cdbpp_base(std::ifstream& ifs) 409 | : m_buffer(NULL), m_size(0), m_own(false), m_n(0) 410 | { 411 | this->open(ifs); 412 | } 413 | 414 | /** 415 | * Destructs the object. 416 | */ 417 | virtual ~cdbpp_base() 418 | { 419 | close(); 420 | } 421 | 422 | /** 423 | * Tests if the database is opened. 424 | * @return bool \c true if the database is opened, 425 | * \c false otherwise. 426 | */ 427 | bool is_open() const 428 | { 429 | return (m_buffer != NULL); 430 | } 431 | 432 | /** 433 | * Obtains the number of elements in the database. 434 | * @return size_t The number of elements. 435 | */ 436 | size_t size() const 437 | { 438 | return m_n; 439 | } 440 | 441 | /** 442 | * Tests if the database is empty. 443 | * @return bool \c true if the number of records is zero, 444 | * \c false otherwise. 445 | */ 446 | bool empty() const 447 | { 448 | return (m_n == 0); 449 | } 450 | 451 | /** 452 | * Opens the database from an input stream. 453 | * @param ifs The input stream from which this library reads 454 | * a database. 455 | */ 456 | size_t open(std::ifstream& ifs) 457 | { 458 | char chunk[4], size[4]; 459 | std::istream::pos_type offset = ifs.tellg(); 460 | 461 | do { 462 | // Read a chunk identifier. 463 | ifs.read(chunk, 4); 464 | if (ifs.fail()) { 465 | break; 466 | } 467 | 468 | // Check the chunk identifier. 469 | if (std::strncmp(chunk, "CDB+", 4) != 0) { 470 | break; 471 | } 472 | 473 | // Read the size of the chunk. 474 | ifs.read(size, 4); 475 | if (ifs.fail()) { 476 | break; 477 | } 478 | 479 | // Allocate a memory block for the chunk. 
480 | uint32_t chunk_size = read_uint32(reinterpret_cast(size)); 481 | uint8_t* block = new uint8_t[chunk_size]; 482 | 483 | // Read the memory image from the stream. 484 | ifs.seekg(0, std::ios_base::beg); 485 | if (ifs.fail()) { 486 | break; 487 | } 488 | ifs.read(reinterpret_cast(block), chunk_size); 489 | if (ifs.fail()) { 490 | break; 491 | } 492 | 493 | return this->open(block, chunk_size, true); 494 | 495 | } while (0); 496 | 497 | ifs.seekg(offset, std::ios::beg); 498 | return 0; 499 | } 500 | 501 | /** 502 | * Opens the database from a memory image. 503 | * @param buffer The pointer to the memory image of the database. 504 | * @param size The size of the memory image. 505 | * @param own If this is set to \c true, this library will call 506 | * delete[] when the database is closed. 507 | */ 508 | size_t open(const void *buffer, size_t size, bool own = false) 509 | { 510 | const uint8_t *p = reinterpret_cast(buffer); 511 | 512 | // Make sure that the size of the chunk is larger than the minimum size. 513 | if (size < get_data_begin()) { 514 | throw cdbpp_exception("The memory image is smaller than a chunk header."); 515 | } 516 | 517 | // Check the chunk identifier. 518 | if (memcmp(p, "CDB+", 4) != 0) { 519 | throw cdbpp_exception("Incorrect chunk header"); 520 | } 521 | p += 4; 522 | 523 | // Read the chunk header. 524 | uint32_t csize = read_uint32(p); 525 | p += sizeof(uint32_t); 526 | uint32_t version = read_uint32(p); 527 | p += sizeof(uint32_t); 528 | uint32_t byteorder = read_uint32(p); 529 | p += sizeof(uint32_t); 530 | 531 | // Check the byte-order consistency. 532 | if (byteorder != BYTEORDER_CHECK) { 533 | throw cdbpp_exception("Inconsistent byte order"); 534 | } 535 | // Check the version number. 536 | if (version != CDBPP_VERSION) { 537 | throw cdbpp_exception("Incompatible CDB++ versions"); 538 | } 539 | // Check the chunk size. 
540 | if (size < csize) { 541 | throw cdbpp_exception("The memory image is smaller than a chunk size."); 542 | } 543 | 544 | // Set memory block and size. 545 | m_buffer = reinterpret_cast(buffer); 546 | m_size = size; 547 | m_own = own; 548 | 549 | // Set pointers to the hash tables. 550 | m_n = 0; 551 | const tableref_t* ref = reinterpret_cast(p); 552 | for (size_t i = 0;i < NUM_TABLES;++i) { 553 | if (ref[i].offset) { 554 | // Set the buckets. 555 | m_ht[i].buckets = reinterpret_cast(m_buffer + ref[i].offset); 556 | m_ht[i].num = ref[i].num; 557 | } else { 558 | // An empty hash table. 559 | m_ht[i].buckets = NULL; 560 | m_ht[i].num = 0; 561 | } 562 | 563 | // The number of records is the half of the table size. 564 | m_n += (ref[i].num / 2); 565 | } 566 | 567 | return (size_t)csize; 568 | } 569 | 570 | /** 571 | * Closes the database. 572 | */ 573 | void close() 574 | { 575 | if (m_own && m_buffer != NULL) { 576 | delete[] m_buffer; 577 | } 578 | m_buffer = NULL; 579 | m_size = 0; 580 | m_n = 0; 581 | } 582 | 583 | /** 584 | * Finds the key in the database. 585 | * @param key The pointer to the key. 586 | * @param ksize The size of the key. 587 | * @param vsize The pointer of a variable to which the size of the 588 | * value returned. This parameter can be \c NULL. 589 | * @return const void* The pointer to the value. 
590 | */ 591 | const void* get(const void *key, size_t ksize, size_t* vsize) const 592 | { 593 | uint32_t hv = hash_function()(key, ksize); 594 | const hashtable_t* ht = &m_ht[hv % NUM_TABLES]; 595 | 596 | if (ht->num && ht->buckets != NULL) { 597 | int n = ht->num; 598 | int k = (hv >> 8) % n; 599 | const bucket_t* p = NULL; 600 | 601 | while (p = &ht->buckets[k], p->offset) { 602 | if (p->hash == hv) { 603 | const uint8_t *q = m_buffer + p->offset; 604 | if (read_uint32(q) == ksize && 605 | memcmp(key, q + sizeof(uint32_t), ksize) == 0) { 606 | q += sizeof(uint32_t) + ksize; 607 | if (vsize != NULL) { 608 | *vsize = read_uint32(q); 609 | } 610 | return q + sizeof(uint32_t); 611 | } 612 | } 613 | k = (k+1) % n; 614 | } 615 | } 616 | 617 | if (vsize != NULL) { 618 | *vsize = 0; 619 | } 620 | return NULL; 621 | } 622 | 623 | protected: 624 | inline uint32_t read_uint32(const uint8_t* p) const 625 | { 626 | return *reinterpret_cast(p); 627 | } 628 | }; 629 | 630 | /// CDB++ builder with MurmurHash2. 631 | typedef builder_base builder; 632 | /// CDB++ reader with MurmurHash2. 633 | typedef cdbpp_base cdbpp; 634 | 635 | 636 | }; 637 | 638 | /** @} */ 639 | 640 | /** 641 | @mainpage C++ implementation of Constant Database (CDB++) 642 | 643 | @section intro Introduction 644 | 645 | Constant Database PlusPlus (CDB++) is a C++ implementation of hash database 646 | specialized for serialization and retrieval of static associations 647 | between keys and their values. The database provides several features: 648 | - Fast look-ups. This library implements the data structure of the 649 | Constant Database proposed by 650 | Daniel J. Bernstein. 651 | - Low footprint. A CDB++ database consists of a chunk header (16 bytes), 652 | hash tables (2048 bytes and 16 bytes per record), and actual records (8 bytes 653 | plus key/value size per record). 654 | - Fast hash function. 
CDB++ incorporates the fast and 655 | collision-resistant hash function for strings 656 | (MurmurHash 2.0) 657 | implemented by Austin Appleby. 658 | - Chunk format. The structure of CDB++ is designed to store the data in 659 | a chunk of a file; CDB++ database can be embedded into a file with other 660 | arbitrary data. 661 | - Simple write interface. CDB++ can serialize a hash database to C++ 662 | output streams (\c std::ostream). 663 | - Simple read interface. CDB++ can prepare a hash database from an input 664 | stream (\c std::istream) or from a memory block on which a database image is 665 | read or memory-mapped from a file. 666 | - Cross platform. The source code can be compiled on Microsoft Visual 667 | Studio 2008, GNU C Compiler (gcc), etc. 668 | - Very simple API. The CDB++ API exposes only a few functions; one can 669 | use this library just by looking at the sample code. 670 | - Single C++ header implementation. CDB++ is implemented in a single 671 | header file (cdbpp.h); one can use the CDB++ API only by including cdbpp.h 672 | in a source code. 673 | 674 | CDB++ does not support these for simplicity: 675 | - modifying associations 676 | - checking collisions in keys 677 | - compatibility of the database format on different byte-order architectures 678 | 679 | @section sample Sample code 680 | This sample code constructs a database "test.cdb" with 100,000 string/integer 681 | associations, "000000"/0, "000001"/1, ..., "100000"/100000 (in build function). 682 | Then the code issues string queries "000000", ..., "100000", and checks 683 | whether the values are correct (in read function). 684 | 685 | @include sample.cpp 686 | 687 | @section download Download 688 | 689 | - Source code 690 | 691 | CDB++ is distributed under the term of the 692 | modified BSD license. 693 | 694 | @section changelog History 695 | - Version 1.1 (2009-07-14): 696 | - Fixed a compile issue (a patch submitted by Takashi Imamichi). 
697 | - Replaced SuperFastHash with MurmurHash 2.0 (a patch submitted by 698 | Takashi Imamichi). 699 | - Classes cdbpp::builder_base and cdbpp::cdbpp_base taking a template 700 | argument to configure a hash function. Classes cdbpp::builder and 701 | cdbpp::cdbpp are now the synonyms of 702 | \c cdbpp::builder_base and 703 | \c cdbpp::cdbpp_base, respectively. 704 | - Split the sample code into build and read functions. 705 | - Version 1.0 (2009-07-09): 706 | - Initial release. 707 | 708 | @section api Documentation 709 | 710 | - @ref cdbpp_api "CDB++ API" 711 | 712 | @section acknowledgements Acknowledgements 713 | 714 | The data structure of the constant database was originally proposed by 715 | Daniel J. Bernstein. 716 | 717 | The source code of CDB++ includes the 718 | MurmurHash 2.0 719 | implemented by Austin Appleby. 720 | 721 | The CDB++ distribution contains "a portable stdint.h", which is released by 722 | Paul Hsieh under the term of 723 | the modified BSD license, for addressing the compatibility issue of Microsoft 724 | Visual Studio 2008. The original code is available at: 725 | http://www.azillionmonkeys.com/qed/pstdint.h 726 | 727 | @section reference References 728 | - cdb by Daniel J. Bernstein. 729 | - TinyCDB - a Constant DataBase by Michael Tokarev. 730 | - Constant Database C++ Bindings by Stanislav Ievlev. 731 | - Constant Database (cdb) Internals by Yusuke Shinyama. 732 | - MurmurHash 2.0 by Austin Appleby. 733 | 734 | */ 735 | 736 | #endif/*__CDBPP_H__*/ 737 | -------------------------------------------------------------------------------- /include/simstring/simstring.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SimString. 3 | * 4 | * Copyright (c) 2009,2010 Naoaki Okazaki 5 | * All rights reserved. 
6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions are met: 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * * Neither the name of the authors nor the names of its contributors may 15 | * be used to endorse or promote products derived from this software 16 | * without specific prior written permission. 17 | * 18 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 22 | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 23 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 25 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 26 | * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 27 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 28 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | */ 30 | 31 | /* $Id$ */ 32 | 33 | #ifndef __SIMSTRING_H__ 34 | #define __SIMSTRING_H__ 35 | 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include 48 | 49 | #include "ngram.h" 50 | #include "measure.h" 51 | #include "cdbpp.h" 52 | #include "memory_mapped_file.h" 53 | 54 | #define SIMSTRING_NAME "SimString" 55 | #define SIMSTRING_COPYRIGHT "Copyright (c) 2009-2011 Naoaki Okazaki" 56 | #define SIMSTRING_MAJOR_VERSION 1 57 | #define SIMSTRING_MINOR_VERSION 1 58 | #define SIMSTRING_STREAM_VERSION 2 59 | 60 | /** 61 | * \addtogroup api SimString C++ API 62 | * @{ 63 | * 64 | * The SimString C++ API. 65 | */ 66 | 67 | namespace simstring 68 | { 69 | 70 | enum { 71 | BYTEORDER_CHECK = 0x62445371, 72 | }; 73 | 74 | /** 75 | * Query types. 76 | */ 77 | enum { 78 | /// Exact match. 79 | exact = 0, 80 | /// Approximate string matching with dice coefficient. 81 | dice, 82 | /// Approximate string matching with cosine coefficient. 83 | cosine, 84 | /// Approximate string matching with jaccard coefficient. 85 | jaccard, 86 | /// Approximate string matching with overlap coefficient. 87 | overlap, 88 | }; 89 | 90 | 91 | 92 | /** 93 | * A writer for an n-gram database. 94 | * This template class builds an n-gram database. The first template 95 | * argument (string_tmpl) specifies the type of a key (string), the second 96 | * template argument (value_tmpl) specifies the type of a value associated 97 | * with a key, and the third template argument (ngram_generator_tmpl) 98 | * customizes generation of feature sets (n-grams) from keys. 99 | * 100 | * This class is inherited by writer_base, which adds the functionality of 101 | * managing a master string table (list of strings). 102 | * 103 | * @param string_tmpl The type of a string. 104 | * @param value_tmpl The value type. 105 | * This is required to be an integer type. 
106 | * @param ngram_generator_tmpl The type of an n-gram generator. 107 | */ 108 | template < 109 | class string_tmpl, 110 | class value_tmpl, 111 | class ngram_generator_tmpl 112 | > 113 | class ngramdb_writer_base 114 | { 115 | public: 116 | /// The type representing a string. 117 | typedef string_tmpl string_type; 118 | /// The type of values associated with key strings. 119 | typedef value_tmpl value_type; 120 | /// The function type for generating n-grams from a key string. 121 | typedef ngram_generator_tmpl ngram_generator_type; 122 | /// The type representing a character. 123 | typedef typename string_type::value_type char_type; 124 | 125 | protected: 126 | /// The type of an array of n-grams. 127 | typedef std::vector ngrams_type; 128 | /// The vector type of values associated with an n-gram. 129 | typedef std::vector values_type; 130 | /// The type implementing an index (associations from n-grams to values). 131 | typedef std::map hashdb_type; 132 | /// The vector of indices for different n-gram sizes. 133 | typedef std::vector indices_type; 134 | 135 | protected: 136 | /// The vector of indices. 137 | indices_type m_indices; 138 | /// The n-gram generator. 139 | const ngram_generator_type& m_gen; 140 | /// The error message. 141 | std::stringstream m_error; 142 | 143 | public: 144 | /** 145 | * Constructs an object. 146 | * @param gen The n-gram generator. 147 | */ 148 | ngramdb_writer_base(const ngram_generator_type& gen) 149 | : m_gen(gen) 150 | { 151 | } 152 | 153 | /** 154 | * Destructs an object. 155 | */ 156 | virtual ~ngramdb_writer_base() 157 | { 158 | } 159 | 160 | /** 161 | * Clears the database. 162 | */ 163 | void clear() 164 | { 165 | m_indices.clear(); 166 | m_error.str(""); 167 | } 168 | 169 | /** 170 | * Checks whether the database is empty. 171 | * @return bool \c true if the database is empty, \c false otherwise. 
172 | */ 173 | bool empty() 174 | { 175 | return m_indices.empty(); 176 | } 177 | 178 | /** 179 | * Returns the maximum length of keys in the n-gram database. 180 | * @return int The maximum length of keys. 181 | */ 182 | int max_size() const 183 | { 184 | return (int)m_indices.size(); 185 | } 186 | 187 | /** 188 | * Checks whether an error has occurred. 189 | * @return bool \c true if an error has occurred. 190 | */ 191 | bool fail() const 192 | { 193 | return !m_error.str().empty(); 194 | } 195 | 196 | /** 197 | * Returns an error message. 198 | * @return std::string The string of the error message. 199 | */ 200 | std::string error() const 201 | { 202 | return m_error.str(); 203 | } 204 | 205 | /** 206 | * Inserts a string to the n-gram database. 207 | * @param key The key string. 208 | * @param value The value associated with the string. 209 | */ 210 | bool insert(const string_type& key, const value_type& value) 211 | { 212 | // Generate n-grams from the key string. 213 | ngrams_type ngrams; 214 | m_gen(key, std::back_inserter(ngrams)); 215 | if (ngrams.empty()) { 216 | return false; 217 | } 218 | 219 | // Resize the index array for the number of the n-grams; 220 | // we build an index for each n-gram number. 221 | if (m_indices.size() < ngrams.size()) { 222 | m_indices.resize(ngrams.size()); 223 | } 224 | hashdb_type& index = m_indices[ngrams.size()-1]; 225 | 226 | // Store the associations from the n-grams to the value. 227 | typename ngrams_type::const_iterator it; 228 | for (it = ngrams.begin();it != ngrams.end();++it) { 229 | const string_type& ngram = *it; 230 | typename hashdb_type::iterator iti = index.find(ngram); 231 | if (iti == index.end()) { 232 | // Create a new posting array. 233 | values_type v(1); 234 | v[0] = value; 235 | index.insert(typename hashdb_type::value_type(ngram, v)); 236 | } else { 237 | // Append the value to the existing posting array. 
238 | iti->second.push_back(value); 239 | } 240 | } 241 | 242 | return true; 243 | } 244 | 245 | /** 246 | * Stores the n-gram database to files. 247 | * @param name The prefix of file names. 248 | * @return bool \c true if the database is successfully stored, 249 | * \c false otherwise. 250 | */ 251 | bool store(const std::string& base) 252 | { 253 | // Write out all the indices to files. 254 | for (int i = 0;i < (int)m_indices.size();++i) { 255 | if (!m_indices[i].empty()) { 256 | std::stringstream ss; 257 | ss << base << '.' << i+1 << ".cdb"; 258 | bool b = this->store(ss.str(), m_indices[i]); 259 | if (!b) { 260 | return false; 261 | } 262 | } 263 | } 264 | 265 | return true; 266 | } 267 | 268 | protected: 269 | bool store(const std::string& name, const hashdb_type& index) 270 | { 271 | // Open the database file with binary mode. 272 | std::ofstream ofs(name.c_str(), std::ios::binary); 273 | if (ofs.fail()) { 274 | m_error << "Failed to open a file for writing: " << name; 275 | return false; 276 | } 277 | 278 | try { 279 | // Open a CDB++ writer. 280 | cdbpp::builder dbw(ofs); 281 | 282 | // Put associations: n-gram -> values. 283 | typename hashdb_type::const_iterator it; 284 | for (it = index.begin();it != index.end();++it) { 285 | // Put an association from an n-gram to its values. 286 | dbw.put( 287 | it->first.c_str(), 288 | sizeof(char_type) * it->first.length(), 289 | &it->second[0], 290 | sizeof(it->second[0]) * it->second.size() 291 | ); 292 | } 293 | 294 | } catch (const cdbpp::builder_exception& e) { 295 | m_error << "CDB++ error: " << e.what(); 296 | return false; 297 | } 298 | 299 | return true; 300 | } 301 | }; 302 | 303 | 304 | 305 | /** 306 | * A SimString database writer. 307 | * This template class builds a SimString database. The first template 308 | * argument (string_tmpl) specifies the type of a character, and the second 309 | * template argument (ngram_generator_tmpl) customizes generation of feature 310 | * sets (n-grams) from strings. 
311 | * 312 | * Inheriting the base class ngramdb_writer_base that builds indices from 313 | * n-grams to string IDs, this class maintains associations between strings 314 | * and string IDs. 315 | * 316 | * @param string_tmpl The type of a string. 317 | * @param ngram_generator_tmpl The type of an n-gram generator. 318 | */ 319 | template < 320 | class string_tmpl, 321 | class ngram_generator_tmpl = ngram_generator 322 | > 323 | class writer_base : 324 | public ngramdb_writer_base 325 | { 326 | public: 327 | /// The type representing a string. 328 | typedef string_tmpl string_type; 329 | /// The type of values associated with key strings. 330 | typedef uint32_t value_type; 331 | /// The function type for generating n-grams from a key string. 332 | typedef ngram_generator_tmpl ngram_generator_type; 333 | /// The type representing a character. 334 | typedef typename string_type::value_type char_type; 335 | // The type of the base class. 336 | typedef ngramdb_writer_base base_type; 337 | 338 | protected: 339 | /// The base name of the database. 340 | std::string m_name; 341 | /// The output stream for the string collection. 342 | std::ofstream m_ofs; 343 | /// The number of strings in the database. 344 | int m_num_entries; 345 | 346 | public: 347 | /** 348 | * Constructs a writer object. 349 | * @param gen The n-gram generator used by this writer. 350 | */ 351 | writer_base(const ngram_generator_type& gen) 352 | : base_type(gen), m_num_entries(0) 353 | { 354 | } 355 | 356 | /** 357 | * Constructs a writer object by opening a database. 358 | * @param gen The n-gram generator used by this writer. 359 | * @param name The name of the database. 360 | */ 361 | writer_base( 362 | const ngram_generator_type& gen, 363 | const std::string& name 364 | ) 365 | : base_type(gen), m_num_entries(0) 366 | { 367 | this->open(name); 368 | } 369 | 370 | /** 371 | * Destructs a writer object. 
372 | */ 373 | virtual ~writer_base() 374 | { 375 | close(); 376 | } 377 | 378 | /** 379 | * Opens a database. 380 | * @param name The name of the database. 381 | * @return bool \c true if the database is successfully opened, 382 | * \c false otherwise. 383 | */ 384 | bool open(const std::string& name) 385 | { 386 | m_num_entries = 0; 387 | 388 | // Open the master file for writing. 389 | m_ofs.open(name.c_str(), std::ios::binary); 390 | if (m_ofs.fail()) { 391 | this->m_error << "Failed to open a file for writing: " << name; 392 | return false; 393 | } 394 | 395 | // Reserve the region for a file header. 396 | if (!this->write_header(m_ofs)) { 397 | m_ofs.close(); 398 | return false; 399 | } 400 | 401 | m_name = name; 402 | return true; 403 | } 404 | 405 | /** 406 | * Closes the database. 407 | * @param name The name of the database. 408 | * @return bool \c true if the database is successfully opened, 409 | * \c false otherwise. 410 | */ 411 | bool close() 412 | { 413 | bool b = true; 414 | 415 | // Write the n-gram database to files. 416 | if (!m_name.empty()) { 417 | b &= this->store(m_name); 418 | } 419 | 420 | // Finalize the file header, and close the file. 421 | if (m_ofs.is_open()) { 422 | b &= this->write_header(m_ofs); 423 | m_ofs.close(); 424 | } 425 | 426 | // Initialize the members. 427 | m_name.clear(); 428 | m_num_entries = 0; 429 | return b; 430 | } 431 | 432 | /** 433 | * Inserts a string to the database. 434 | * @param str The string to be inserted. 435 | * @return bool \c true if the string is successfully inserted, 436 | * \c false otherwise. 437 | */ 438 | bool insert(const string_type& str) 439 | { 440 | // This will be the offset address to access the key string. 441 | value_type off = (value_type)(std::streamoff)m_ofs.tellp(); 442 | 443 | // Write the key string to the master file. 
444 | m_ofs.write(reinterpret_cast(str.c_str()), sizeof(char_type) * (str.length()+1)); 445 | if (m_ofs.fail()) { 446 | this->m_error << "Failed to write a string to the master file."; 447 | return false; 448 | } 449 | ++m_num_entries; 450 | 451 | // Insert the n-grams of the key string to the database. 452 | return base_type::insert(str, off); 453 | } 454 | 455 | protected: 456 | bool write_header(std::ofstream& ofs) 457 | { 458 | uint32_t num_entries = m_num_entries; 459 | uint32_t max_size = (uint32_t)this->max_size(); 460 | uint32_t size = (uint32_t)m_ofs.tellp(); 461 | 462 | // Seek to the beginning of the master file, to which the file header 463 | // is to be written. 464 | ofs.seekp(0); 465 | if (ofs.fail()) { 466 | this->m_error << "Failed to seek the file pointer for the master file."; 467 | return false; 468 | } 469 | 470 | // Write the file header. 471 | m_ofs.write("SSDB", 4); 472 | write_uint32(BYTEORDER_CHECK); 473 | write_uint32(SIMSTRING_STREAM_VERSION); 474 | write_uint32(size); 475 | write_uint32(sizeof(char_type)); 476 | write_uint32(this->m_gen.get_n()); 477 | write_uint32(static_cast(this->m_gen.get_be())); 478 | write_uint32(num_entries); 479 | write_uint32(max_size); 480 | if (ofs.fail()) { 481 | this->m_error << "Failed to write a file header to the master file."; 482 | return false; 483 | } 484 | 485 | return true; 486 | } 487 | 488 | inline void write_uint32(uint32_t value) 489 | { 490 | m_ofs.write(reinterpret_cast(&value), sizeof(value)); 491 | } 492 | }; 493 | 494 | 495 | 496 | /** 497 | * A reader for an n-gram database. 498 | * @param value_tmpl The value type. 499 | * This is required to be an integer type. 500 | */ 501 | template < 502 | class value_tmpl 503 | > 504 | class ngramdb_reader_base 505 | { 506 | public: 507 | /// The type of a value. 508 | typedef value_tmpl value_type; 509 | 510 | protected: 511 | // An inverted list of SIDs. 
512 | struct inverted_list_type 513 | { 514 | int num; 515 | const value_type* values; 516 | 517 | friend bool operator<( 518 | const inverted_list_type& x, 519 | const inverted_list_type& y 520 | ) 521 | { 522 | return (x.num < y.num); 523 | } 524 | }; 525 | // An array of inverted lists. 526 | typedef std::vector inverted_lists_type; 527 | 528 | // A hash table that retrieves SIDs from n-grams. 529 | typedef cdbpp::cdbpp hashtbl_type; 530 | 531 | // An index containing strings of the same size. 532 | struct index_type 533 | { 534 | // The memory image of the database. 535 | memory_mapped_file image; 536 | // The index. 537 | hashtbl_type table; 538 | }; 539 | 540 | // Indices with different sizes of strings. 541 | typedef std::vector indices_type; 542 | 543 | // A candidate string of retrieved results. 544 | struct candidate_type 545 | { 546 | // The SID. 547 | value_type value; 548 | // The overlap count (frequency of the SID in the inverted lists). 549 | int num; 550 | 551 | candidate_type(value_type v, int n) 552 | : value(v), num(n) 553 | { 554 | } 555 | }; 556 | 557 | // An array of candidates. 558 | typedef std::vector candidates_type; 559 | 560 | // An array of SIDs retrieved. 561 | typedef std::vector results_type; 562 | 563 | protected: 564 | // The array of the indices. 565 | indices_type m_indices; 566 | // The maximum size of strings in the database. 567 | int m_max_size; 568 | // The database name (base name of indices). 569 | std::string m_name; 570 | // The error message. 571 | std::stringstream m_error; 572 | 573 | 574 | public: 575 | /** 576 | * Constructs an object. 577 | */ 578 | ngramdb_reader_base() 579 | { 580 | } 581 | 582 | /** 583 | * Destructs an object. 584 | */ 585 | virtual ~ngramdb_reader_base() 586 | { 587 | } 588 | 589 | /** 590 | * Checks whether an error has occurred. 591 | * @return bool \c true if an error has occurred. 
592 | */ 593 | bool fail() const 594 | { 595 | return !m_error.str().empty(); 596 | } 597 | 598 | /** 599 | * Returns an error message. 600 | * @return std::string The string of the error message. 601 | */ 602 | std::string error() const 603 | { 604 | return m_error.str(); 605 | } 606 | 607 | /** 608 | * Opens an n-gram database. 609 | * @param name The name of the database. 610 | * @param max_size The maximum size of the strings. 611 | */ 612 | void open(const std::string& name, int max_size) 613 | { 614 | m_name = name; 615 | m_max_size = max_size; 616 | // The maximum size corresponds to the number of indices in the database. 617 | m_indices.resize(max_size); 618 | } 619 | 620 | /** 621 | * Closes an n-gram database. 622 | */ 623 | void close() 624 | { 625 | m_name.clear(); 626 | m_indices.clear(); 627 | m_error.str(""); 628 | } 629 | 630 | /** 631 | * Performs an overlap join on inverted lists retrieved for the query. 632 | * @param query The query object that stores query n-grams, 633 | * threshold, and conditions for the similarity 634 | * measure. 635 | * @param results The SIDs that satisfies the overlap join. 636 | */ 637 | template 638 | bool overlapjoin(const query_type& query, double alpha, results_type& results, bool check) 639 | { 640 | int i; 641 | const int qsize = query.size(); 642 | 643 | // Allocate a vector of postings corresponding to n-gram queries. 644 | inverted_lists_type posts(qsize); 645 | 646 | // Compute the range of n-gram lengths for the candidate strings; 647 | // in other words, we do not have to search for strings whose n-gram 648 | // lengths are out of this range. 649 | const int xmin = std::max(measure_type::min_size(query.size(), alpha), 1); 650 | const int xmax = std::min(measure_type::max_size(query.size(), alpha), m_max_size); 651 | 652 | // Loop for each length in the range. 653 | for (int xsize = xmin;xsize <= xmax;++xsize) { 654 | // Access to the n-gram index for the length. 
655 | hashtbl_type& tbl = open_index(m_name, xsize); 656 | if (!tbl.is_open()) { 657 | // Ignore an empty index. 658 | continue; 659 | } 660 | 661 | // Search for string entries that match to each query n-gram. 662 | // Note that we do not traverse each entry here, but only obtain 663 | // the number of and the pointer to the entries. 664 | typename query_type::const_iterator it; 665 | for (it = query.begin(), i = 0;it != query.end();++it, ++i) { 666 | size_t vsize; 667 | const void *values = tbl.get( 668 | it->c_str(), 669 | sizeof(it->at(0)) * it->length(), 670 | &vsize 671 | ); 672 | posts[i].num = (int)(vsize / sizeof(value_type)); 673 | posts[i].values = reinterpret_cast(values); 674 | } 675 | 676 | // Sort the query n-grams by ascending order of their frequencies. 677 | // This reduces the number of initial candidates. 678 | std::sort(posts.begin(), posts.end()); 679 | 680 | // The minimum number of n-gram matches required for the query. 681 | const int mmin = measure_type::min_match(qsize, xsize, alpha); 682 | // A candidate must match to one of n-grams in these queries. 683 | const int min_queries = qsize - mmin + 1; 684 | 685 | // Step 1: collect candidates that match to the initial queries. 
686 | candidates_type cands; 687 | for (i = 0;i < min_queries;++i) { 688 | candidates_type tmp; 689 | typename candidates_type::const_iterator itc = cands.begin(); 690 | const value_type* p = posts[i].values; 691 | const value_type* last = posts[i].values + posts[i].num; 692 | 693 | while (itc != cands.end() || p != last) { 694 | if (itc == cands.end() || (p != last && itc->value > *p)) { 695 | tmp.push_back(candidate_type(*p, 1)); 696 | ++p; 697 | } else if (p == last || (itc != cands.end() && itc->value < *p)) { 698 | tmp.push_back(candidate_type(itc->value, itc->num)); 699 | ++itc; 700 | } else { 701 | tmp.push_back(candidate_type(itc->value, itc->num+1)); 702 | ++itc; 703 | ++p; 704 | } 705 | } 706 | std::swap(cands, tmp); 707 | } 708 | 709 | // No initial candidate is found. 710 | if (cands.empty()) { 711 | continue; 712 | } 713 | 714 | // Step 2: count the number of matches with remaining queries. 715 | for (;i < qsize;++i) { 716 | candidates_type tmp; 717 | typename candidates_type::const_iterator itc; 718 | const value_type* first = posts[i].values; 719 | const value_type* last = posts[i].values + posts[i].num; 720 | 721 | // For each active candidate. 722 | for (itc = cands.begin();itc != cands.end();++itc) { 723 | int num = itc->num; 724 | if (std::binary_search(first, last, itc->value)) { 725 | ++num; 726 | } 727 | 728 | if (mmin <= num) { 729 | // This candidate has sufficient matches. 730 | if (check) { 731 | return true; 732 | } 733 | results.push_back(itc->value); 734 | } else if (num + (qsize - i - 1) >= mmin) { 735 | // This candidate still has the chance. 736 | tmp.push_back(candidate_type(itc->value, num)); 737 | } 738 | } 739 | std::swap(cands, tmp); 740 | 741 | // Exit the loop if all candidates are pruned. 742 | if (cands.empty()) { 743 | break; 744 | } 745 | } 746 | 747 | if (!cands.empty()) { 748 | // Step 2 was not performed. 
749 | typename candidates_type::const_iterator itc; 750 | for (itc = cands.begin();itc != cands.end();++itc) { 751 | if (mmin <= itc->num) { 752 | if (check) { 753 | return true; 754 | } 755 | results.push_back(itc->value); 756 | } 757 | } 758 | } 759 | } 760 | 761 | return !results.empty(); 762 | } 763 | 764 | protected: 765 | /** 766 | * Open the index storing strings of the specific size. 767 | * @param base The base name of the indices. 768 | * @param size The size of strings. 769 | * @return hashtbl_type& The hash table of the index. 770 | */ 771 | hashtbl_type& open_index(const std::string& base, int size) 772 | { 773 | index_type& index = m_indices[size-1]; 774 | if (!index.table.is_open()) { 775 | std::stringstream ss; 776 | ss << base << '.' << size << ".cdb"; 777 | index.image.open(ss.str().c_str(), std::ios::in); 778 | if (index.image.is_open()) { 779 | index.table.open(index.image.data(), index.image.size()); 780 | } 781 | } 782 | 783 | return index.table; 784 | } 785 | }; 786 | 787 | 788 | 789 | /** 790 | * A SimString database reader. 791 | * This template class retrieves string from a SimString database. 792 | * 793 | * Inheriting the base class ngramdb_reader_base that retrieves string IDs 794 | * from a query feature set, this class manages the master string table, 795 | * which maintains associations between strings and string IDs. 796 | */ 797 | class reader 798 | : public ngramdb_reader_base 799 | { 800 | public: 801 | /// The type of an n-gram generator. 802 | typedef ngram_generator ngram_generator_type; 803 | /// The type of the base class. 804 | typedef ngramdb_reader_base base_type; 805 | 806 | protected: 807 | int m_ngram_unit; 808 | bool m_be; 809 | int m_char_size; 810 | 811 | /// The content of the master file. 812 | std::vector m_strings; 813 | 814 | public: 815 | /** 816 | * Constructs an object. 817 | */ 818 | reader() 819 | { 820 | } 821 | 822 | /** 823 | * Destructs an object. 
824 | */ 825 | virtual ~reader() 826 | { 827 | close(); 828 | } 829 | 830 | /** 831 | * Opens a SimString database. 832 | * @param name The name of the SimString database. 833 | * @return bool \c true if the database is successfully opened, 834 | * \c false otherwise. 835 | */ 836 | bool open(const std::string& name) 837 | { 838 | uint32_t num_entries, max_size; 839 | 840 | // Open the master file. 841 | std::ifstream ifs(name.c_str(), std::ios_base::in | std::ios_base::binary); 842 | if (ifs.fail()) { 843 | this->m_error << "Failed to open the master file: " << name; 844 | return false; 845 | } 846 | 847 | // Obtain the size of the master file. 848 | ifs.seekg(0, std::ios_base::end); 849 | size_t size = (size_t)ifs.tellg(); 850 | ifs.seekg(0, std::ios_base::beg); 851 | 852 | // Read the image of the master file. 853 | m_strings.resize(size); 854 | ifs.read(&m_strings[0], size); 855 | ifs.close(); 856 | 857 | // Check the file header. 858 | const char* p = &m_strings[0]; 859 | if (size < 36 || std::strncmp(p, "SSDB", 4) != 0) { 860 | this->m_error << "Incorrect file format"; 861 | return false; 862 | } 863 | p += 4; 864 | 865 | // Check the byte order. 866 | if (BYTEORDER_CHECK != read_uint32(p)) { 867 | this->m_error << "Incompatible byte order"; 868 | return false; 869 | } 870 | p += 4; 871 | 872 | // Check the version. 873 | if (SIMSTRING_STREAM_VERSION != read_uint32(p)) { 874 | this->m_error << "Incompatible stream version"; 875 | return false; 876 | } 877 | p += 4; 878 | 879 | // Check the chunk size. 880 | if (size != read_uint32(p)) { 881 | this->m_error << "Inconsistent chunk size"; 882 | return false; 883 | } 884 | p += 4; 885 | 886 | // Read the unit of n-grams, begin/end flag. 887 | m_char_size = (int)read_uint32(p); 888 | p += 4; 889 | m_ngram_unit = (int)read_uint32(p); 890 | p += 4; 891 | m_be = (read_uint32(p) != 0); 892 | p += 4; 893 | 894 | // Read the number of enties. 
895 | num_entries = read_uint32(p); 896 | p += 4; 897 | 898 | // Read the maximum size of strings in the database. 899 | max_size = read_uint32(p); 900 | 901 | base_type::open(name, (int)max_size); 902 | return true; 903 | } 904 | 905 | /** 906 | * Closes the database. 907 | */ 908 | void close() 909 | { 910 | base_type::close(); 911 | } 912 | 913 | int char_size() const 914 | { 915 | return m_char_size; 916 | } 917 | 918 | /** 919 | * Retrieves strings that are similar to the query. 920 | * @param query The query string. 921 | * @param measure The similarity measure. 922 | * @param alpha The threshold for approximate string matching. 923 | * @param ins The insert iterator that receives retrieved 924 | * strings. 925 | * @see ::simstring::exact, ::simstring::dice, ::simstring::cosine, 926 | * ::simstring::jaccard, ::simstring::overlap 927 | */ 928 | template 929 | void retrieve( 930 | const string_type& query, 931 | int measure, 932 | double alpha, 933 | insert_iterator ins 934 | ) 935 | { 936 | switch (measure) { 937 | case exact: 938 | this->retrieve(query, alpha, ins); 939 | break; 940 | case dice: 941 | this->retrieve(query, alpha, ins); 942 | break; 943 | case cosine: 944 | this->retrieve(query, alpha, ins); 945 | break; 946 | case jaccard: 947 | this->retrieve(query, alpha, ins); 948 | break; 949 | case overlap: 950 | this->retrieve(query, alpha, ins); 951 | break; 952 | } 953 | } 954 | 955 | /** 956 | * Retrieves strings that are similar to the query. 957 | * @param measure_type The similarity measure. 958 | * @param query The query string. 959 | * @param alpha The threshold for approximate string matching. 960 | * @param ins The insert iterator that receives retrieved 961 | * strings. 
962 | * @see ::simstring::measure::exact, ::simstring::measure::dice, 963 | * ::simstring::measure::cosine, ::simstring::measure::jaccard, 964 | * ::simstring::measure::overlap 965 | */ 966 | template 967 | void retrieve( 968 | const string_type& query, 969 | double alpha, 970 | insert_iterator ins 971 | ) 972 | { 973 | typedef std::vector ngrams_type; 974 | typedef typename string_type::value_type char_type; 975 | 976 | ngram_generator_type gen(m_ngram_unit, m_be); 977 | ngrams_type ngrams; 978 | gen(query, std::back_inserter(ngrams)); 979 | 980 | typename base_type::results_type results; 981 | base_type::overlapjoin(ngrams, alpha, results, false); 982 | 983 | typename base_type::results_type::const_iterator it; 984 | const char* strings = &m_strings[0]; 985 | for (it = results.begin();it != results.end();++it) { 986 | const char_type* xstr = reinterpret_cast(strings + *it); 987 | *ins = xstr; 988 | } 989 | } 990 | 991 | template 992 | bool check( 993 | const string_type& query, 994 | int measure, 995 | double alpha 996 | ) 997 | { 998 | switch (measure) { 999 | case exact: 1000 | return this->check(query, alpha); 1001 | case dice: 1002 | return this->check(query, alpha); 1003 | case cosine: 1004 | return this->check(query, alpha); 1005 | case jaccard: 1006 | return this->check(query, alpha); 1007 | case overlap: 1008 | return this->check(query, alpha); 1009 | } 1010 | return false; 1011 | } 1012 | 1013 | template 1014 | bool check( 1015 | const string_type& query, 1016 | double alpha 1017 | ) 1018 | { 1019 | typedef std::vector ngrams_type; 1020 | typedef typename string_type::value_type char_type; 1021 | 1022 | ngram_generator_type gen(m_ngram_unit, m_be); 1023 | ngrams_type ngrams; 1024 | gen(query, std::back_inserter(ngrams)); 1025 | 1026 | typename base_type::results_type results; 1027 | return base_type::overlapjoin(ngrams, alpha, results, true); 1028 | } 1029 | 1030 | protected: 1031 | inline uint32_t read_uint32(const char* p) const 1032 | { 1033 | return 
*reinterpret_cast(p); 1034 | } 1035 | }; 1036 | 1037 | }; 1038 | 1039 | /** @} */ 1040 | 1041 | /** 1042 | @mainpage SimString - A fast and efficient implementation for approximate string matching 1043 | 1044 | @section documentation Documentation 1045 | 1046 | - @ref api "SimString C++ API" 1047 | 1048 | @section sample Sample Programs 1049 | 1050 | A basic sample. 1051 | 1052 | @include sample.cpp 1053 | 1054 | A Unicode sample. 1055 | 1056 | @include sample_unicode.cpp 1057 | 1058 | */ 1059 | 1060 | #endif/*__SIMSTRING_H__*/ 1061 | --------------------------------------------------------------------------------