├── .gitignore ├── .travis.yml ├── 3rd-party ├── README.md └── sparsehash │ ├── .gitignore │ ├── AUTHORS │ ├── ChangeLog │ ├── Makefile.in │ ├── README │ ├── config.guess │ ├── config.sub │ ├── configure │ ├── depcomp │ ├── install-sh │ ├── m4 │ ├── acx_pthread.m4 │ ├── google_namespace.m4 │ ├── namespaces.m4 │ ├── stl_hash.m4 │ └── stl_hash_fun.m4 │ ├── missing │ └── src │ └── config.h.in ├── COPYING ├── LICENSES ├── MANIFEST.in ├── README.rst ├── TODO.md ├── arv ├── __init__.py ├── __main__.py ├── match.py ├── traits.py └── util.py ├── cpp ├── .gitignore ├── arv.cpp ├── arv.hpp ├── export.hpp ├── file.cpp ├── file.hpp ├── filesize.cpp ├── filesize.hpp ├── google │ ├── dense_hash_map │ ├── dense_hash_set │ ├── sparse_hash_map │ ├── sparse_hash_set │ ├── sparsehash │ │ ├── densehashtable.h │ │ ├── hashtable-common.h │ │ ├── libc_allocator_with_realloc.h │ │ └── sparsehashtable.h │ ├── sparsetable │ ├── template_util.h │ └── type_traits.h ├── mmap.cpp ├── mmap.hpp ├── parse.cpp ├── public_py_init_sym.hpp └── sparsehash │ ├── dense_hash_map │ ├── dense_hash_set │ ├── internal │ ├── densehashtable.h │ ├── hashtable-common.h │ ├── libc_allocator_with_realloc.h │ └── sparsehashtable.h │ ├── sparse_hash_map │ ├── sparse_hash_set │ ├── sparsetable │ ├── template_util.h │ └── type_traits.h ├── cython ├── .gitignore └── _arv.pyx ├── publish.sh ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── example.py ├── fake_genome.txt ├── fake_genome_female.txt ├── test_arv.py ├── test_benchmark.py ├── test_commandline.py ├── test_infer.py └── test_traits.py └── tox.ini /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | *.o 3 | *.pyc 4 | *.so 5 | .eggs 6 | .tox 7 | __pycache__ 8 | build 9 | config.log 10 | config.status 11 | cpp/sparsehash/internal/sparseconfig.h 12 | dist 13 | genome.txt 14 | genomes 15 | GPATH 16 | GRTAGS 17 | GTAGS 18 | Makefile 19 | MANIFEST 20 | src 21 | Testing 22 | 
-------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: python 3 | python: 4 | - "2.7" 5 | - "3.2" 6 | - "3.3" 7 | - "3.4" 8 | - "3.5" 9 | - "3.6" 10 | compiler: 11 | - gcc 12 | addons: 13 | apt: 14 | sources: 15 | - ubuntu-toolchain-r-test 16 | packages: 17 | - g++-4.8 18 | install: 19 | - if [ "$CXX" = "g++" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi 20 | script: 21 | - CC=g++-4.8 CXX=g++-4.8 python setup.py test 22 | -------------------------------------------------------------------------------- /3rd-party/README.md: -------------------------------------------------------------------------------- 1 | This contains copies of other open source projects. 2 | 3 | Google sparsehash is the exact same, except that I have deleted files I don't 4 | need. The only thing I'm interested in is generating its config.h file. 5 | -------------------------------------------------------------------------------- /3rd-party/sparsehash/.gitignore: -------------------------------------------------------------------------------- 1 | *.Po 2 | *.o 3 | *.dSYM/* 4 | -------------------------------------------------------------------------------- /3rd-party/sparsehash/AUTHORS: -------------------------------------------------------------------------------- 1 | google-sparsehash@googlegroups.com 2 | 3 | -------------------------------------------------------------------------------- /3rd-party/sparsehash/ChangeLog: -------------------------------------------------------------------------------- 1 | Mon Oct 12 21:00:00 2015 Google Inc. 2 | 3 | * sparsehash: version 2.0.3 4 | * Fix compilation on modern compilers and operating systems 5 | 6 | Thu Feb 23 23:47:18 2012 Google Inc. 7 | 8 | * sparsehash: version 2.0.2 9 | * BUGFIX: Fix backwards compatibility for include folders 10 | 11 | Wed Feb 01 02:57:48 2012 Google Inc. 
12 | 13 | * sparsehash: version 2.0.1 14 | * BUGFIX: Fix path to malloc_extension.h in time_hash_map.cc 15 | 16 | Tue Jan 31 11:33:04 2012 Google Inc. 17 | 18 | * sparsehash: version 2.0 19 | * Renamed include directory from google/ to sparsehash/ (csilvers) 20 | * Changed the 'official' sparsehash email in setup.py/etc 21 | * Renamed google-sparsehash.sln to sparsehash.sln 22 | * Changed copyright text to reflect Google's relinquished ownership 23 | 24 | Tue Dec 20 21:04:04 2011 Google Inc. 25 | 26 | * sparsehash: version 1.12 release 27 | * Add support for serializing/unserializing dense_hash_map/set to disk 28 | * New simpler and more flexible serialization API 29 | * Be more consistent about clearing on unserialize() even if it fails 30 | * Quiet some compiler warnings about unused variables 31 | * Add a timing test for iterating (suggested by google code issue 77) 32 | * Add offset_to_pos, the opposite of pos_to_offset, to sparsetable 33 | * PORTING: Add some missing #includes, needed on some systems 34 | * Die at configure-time when g++ isn't installed 35 | * Successfully make rpm's even when dpkg is missing 36 | * Improve deleted key test in util/gtl/{dense,sparse}hashtable 37 | * Update automake to 1.10.1, and autoconf to 2.62 38 | 39 | Thu Jun 23 21:12:58 2011 Google Inc. 40 | 41 | * sparsehash: version 1.11 release 42 | * Improve performance on pointer keys by ignoring always-0 low bits 43 | * Fix missing $(top_srcdir) in Makefile.am, which broke some compiles 44 | * BUGFIX: Fix a crashing typo-bug in swap() 45 | * PORTING: Remove support for old compilers that do not use 'std' 46 | * Add some new benchmarks to test for a place dense_hash_* does badly 47 | * Some cosmetic changes due to a switch to a new releasing tool 48 | 49 | Thu Jan 20 16:07:39 2011 Google Inc. 
50 | 51 | * sparsehash: version 1.10 release 52 | * Follow ExtractKey return type, allowing it to return a reference 53 | * PORTING: fix MSVC 10 warnings (constifying result_type, placement-new) 54 | * Update from autoconf 2.61 to autoconf 2.65 55 | 56 | Fri Sep 24 11:37:50 2010 Google Inc. 57 | 58 | * sparsehash: version 1.9 release 59 | * Add is_enum; make all enums PODs by default (romanp) 60 | * Make find_or_insert() usable directly (dawidk) 61 | * Use zero-memory trick for allocators to reduce space use (guilin) 62 | * Fix some compiler warnings (chandlerc, eraman) 63 | * BUGFIX: int -> size_type in one function we missed (csilvers) 64 | * Added sparsehash.pc, for pkg-config (csilvers) 65 | 66 | Thu Jul 29 15:01:29 2010 Google Inc. 67 | 68 | * sparsehash: version 1.8.1 release 69 | * Remove -Werror from Makefile: gcc 4.3 gives spurious warnings 70 | 71 | Thu Jul 29 09:53:26 2010 Google Inc. 72 | 73 | * sparsehash: version 1.8 release 74 | * More support for Allocator, including allocator ctor arg (csilvers) 75 | * Repack hasthable vars to reduce container size *more* (giao) 76 | * Speed up clear() (csilvers) 77 | * Change HT_{OCCUPANCY,SHRINK}_FLT from float to int (csilvers) 78 | * Revamp test suite for more complete code & timing coverage (csilvers) 79 | * BUGFIX: Enforce max_size for dense/sparse_hashtable (giao, csilvers) 80 | * BUGFIX: Raise exception instead of crashing on overflow (csilvers) 81 | * BUGFIX: Allow extraneous const in key type (csilvers) 82 | * BUGFIX: Allow same functor for both hasher and key_equals (giao) 83 | * PORTING: remove is_convertible, which gives AIX cc fits (csilvers) 84 | * PORTING: Renamed README.windows to README_windows.txt (csilvers) 85 | * Created non-empty NEWS file (csilvers) 86 | 87 | Wed Mar 31 12:32:03 2010 Google Inc. 
88 | 89 | * sparsehash: version 1.7 release 90 | * Add support for Allocator (guilin) 91 | * Add libc_allocator_with_realloc as the new default allocator (guilin) 92 | * Repack {sparse,dense}hashtable vars to reduce container size (giao) 93 | * BUGFIX: operator== no longer requires same table ordering (csilvers) 94 | * BUGFIX: fix dense_hash_*(it,it) by requiring empty-key too (csilvers) 95 | * PORTING: fix language bugs that gcc allowed (csilvers, chandlerc) 96 | * Update from autoconf 2.61 to autoconf 2.64 97 | 98 | Fri Jan 8 14:47:55 2010 Google Inc. 99 | 100 | * sparsehash: version 1.6 release 101 | * New accessor methods for deleted_key, empty_key (sjackman) 102 | * Use explicit hash functions in sparsehash tests (csilvers) 103 | * BUGFIX: Cast resize to fix SUNWspro bug (csilvers) 104 | * Check for sz overflow in min_size (csilvers) 105 | * Speed up clear() for dense and sparse hashtables (jeff) 106 | * Avoid shrinking in all cases when min-load is 0 (shaunj, csilvers) 107 | * Improve densehashtable code for the deleted key (gpike) 108 | * BUGFIX: Fix operator= when the 2 empty-keys differ (andreidam) 109 | * BUGFIX: Fix ht copying when empty-key isn't set (andreidam) 110 | * PORTING: Use TmpFile() instead of /tmp on MinGW (csilvers) 111 | * PORTING: Use filenames that work with Stratus VOS. 112 | 113 | Tue May 12 14:16:38 2009 Google Inc. 114 | 115 | * sparsehash: version 1.5.2 release 116 | * Fix compile error: not initializing set_key in all constructors 117 | 118 | Fri May 8 15:23:44 2009 Google Inc. 119 | 120 | * sparsehash: version 1.5.1 release 121 | * Fix broken equal_range() for all the hash-classes (csilvers) 122 | 123 | Wed May 6 11:28:49 2009 Google Inc. 
124 | 125 | * sparsehash: version 1.5 release 126 | * Support the tr1 unordered_map (and unordered_set) API (csilvers) 127 | * Store only key for delkey; reduces need for 0-arg c-tor (csilvers) 128 | * Prefer unordered_map to hash_map for the timing test (csilvers) 129 | * PORTING: update the resource use for 64-bit machines (csilvers) 130 | * PORTING: fix MIN/MAX collisions by un-#including windows.h (csilvers) 131 | * Updated autoconf version to 2.61 and libtool version to 1.5.26 132 | 133 | Wed Jan 28 17:11:31 2009 Google Inc. 134 | 135 | * sparsehash: version 1.4 release 136 | * Allow hashtables to be <32 buckets (csilvers) 137 | * Fix initial-sizing bug: was sizing tables too small (csilvers) 138 | * Add asserts that clients don't abuse deleted/empty key (csilvers) 139 | * Improve determination of 32/64 bit for C code (csilvers) 140 | * Small fix for doc files in rpm (csilvers) 141 | 142 | Thu Nov 6 15:06:09 2008 Google Inc. 143 | 144 | * sparsehash: version 1.3 release 145 | * Add an interface to change the parameters for resizing (myl) 146 | * Document another potentially good hash function (csilvers) 147 | 148 | Thu Sep 18 13:53:20 2008 Google Inc. 149 | 150 | * sparsehash: version 1.2 release 151 | * Augment documentation to better describe namespace issues (csilvers) 152 | * BUG FIX: replace hash<> with SPARSEHASH_HASH, for windows (csilvers) 153 | * Add timing test to unittest to test repeated add+delete (csilvers) 154 | * Do better picking a new size when resizing (csilvers) 155 | * Use ::google instead of google as a namespace (csilvers) 156 | * Improve threading test at config time (csilvers) 157 | 158 | Mon Feb 11 16:30:11 2008 Google Inc. 159 | 160 | * sparsehash: version 1.1 release 161 | * Fix brown-paper-bag bug in some constructors (rafferty) 162 | * Fix problem with variables shadowing member vars, add -Wshadow 163 | 164 | Thu Nov 29 11:44:38 2007 Google Inc. 
165 | 166 | * sparsehash: version 1.0.2 release 167 | * Fix a final reference to hash<> to use SPARSEHASH_HASH<> instead. 168 | 169 | Wed Nov 14 08:47:48 2007 Google Inc. 170 | 171 | * sparsehash: version 1.0.1 release :-( 172 | * Remove an unnecessary (harmful) "#define hash" in windows' config.h 173 | 174 | Tue Nov 13 15:15:46 2007 Google Inc. 175 | 176 | * sparsehash: version 1.0 release! We are now out of beta. 177 | * Clean up Makefile awk script to be more readable (csilvers) 178 | * Namespace fixes: use fewer #defines, move typedefs into namespace 179 | 180 | Fri Oct 12 12:35:24 2007 Google Inc. 181 | 182 | * sparsehash: version 0.9.1 release 183 | * Fix Makefile awk script to work on more architectures (csilvers) 184 | * Add test to test code in more 'real life' situations (csilvers) 185 | 186 | Tue Oct 9 14:15:21 2007 Google Inc. 187 | 188 | * sparsehash: version 0.9 release 189 | * More type-hygiene improvements, especially for 64-bit (csilvers) 190 | * Some configure improvements to improve portability, utility (austern) 191 | * Small bugfix for operator== for dense_hash_map (jeff) 192 | 193 | Tue Jul 3 12:55:04 2007 Google Inc. 194 | 195 | * sparsehash: version 0.8 release 196 | * Minor type-hygiene improvements: size_t for int, etc. (csilvers) 197 | * Porting improvements: tests pass on OS X, FreeBSD, Solaris (csilvers) 198 | * Full windows port! VS solution provided for all unittests (csilvers) 199 | 200 | Mon Jun 11 11:33:41 2007 Google Inc. 
201 | 202 | * sparsehash: version 0.7 release 203 | * Syntax fixes to better support gcc 4.3 and VC++ 7 (mec, csilvers) 204 | * Improved windows/VC++ support (see README.windows) (csilvers) 205 | * Config improvements: better tcmalloc support and config.h (csilvers) 206 | * More robust with missing hash_map + nix 'trampoline' .h's (csilvers) 207 | * Support for STLport's hash_map/hash_fun locations (csilvers) 208 | * Add .m4 files to distribution; now all source is there (csilvers) 209 | * Tiny modification of shrink-threshhold to allow never-shrinking (amc) 210 | * Protect timing tests against aggressive optimizers (csilvers) 211 | * Extend time_hash_map to test bigger objects (csilvers) 212 | * Extend type-trait support to work with const objects (csilvers) 213 | * USER VISIBLE: speed up all code by replacing memmove with memcpy 214 | (csilvers) 215 | 216 | Tue Mar 20 17:29:34 2007 Google Inc. 217 | 218 | * sparsehash: version 0.6 release 219 | * Some improvement to type-traits (jyasskin) 220 | * Better timing results when google-perftools is installed (sanjay) 221 | * Updates and fixes to html documentation and README (csilvers) 222 | * A bit more careful about #includes (csilvers) 223 | * Fix for typo that broken compilation on some systems (csilvers) 224 | * USER VISIBLE: New clear_no_resize() method added to dense_hash_map 225 | (uszkoreit) 226 | 227 | Sat Oct 21 13:47:47 2006 Google Inc. 228 | 229 | * sparsehash: version 0.5 release 230 | * Support uint16_t (SunOS) in addition to u_int16_t (BSD) (csilvers) 231 | * Get rid of UNDERSTANDS_ITERATOR_TAGS; everyone understands (csilvers) 232 | * Test that empty-key and deleted-key differ (rbayardo) 233 | * Fix example docs: strcmp needs to test for NULL (csilvers) 234 | 235 | Sun Apr 23 22:42:35 2006 Google Inc. 236 | 237 | * sparsehash: version 0.4 release 238 | * Remove POD requirement for keys and values! (austern) 239 | * Add tr1-compatible type-traits system to speed up POD ops. 
(austern) 240 | * Fixed const-iterator bug where postfix ++ didn't compile. (csilvers) 241 | * Fixed iterator comparison bugs where <= was incorrect. (csilvers) 242 | * Clean up config.h to keep its #defines from conflicting. (csilvers) 243 | * Big documentation sweep and cleanup. (csilvers) 244 | * Update documentation to talk more about good hash fns. (csilvers) 245 | * Fixes to compile on MSVC (working around some MSVC bugs). (rennie) 246 | * Avoid resizing hashtable on operator[] lookups (austern) 247 | 248 | Thu Nov 3 20:12:31 2005 Google Inc. 249 | 250 | * sparsehash: version 0.3 release 251 | * Quiet compiler warnings on some compilers. (csilvers) 252 | * Some documentation fixes: example code for dense_hash_map. (csilvers) 253 | * Fix a bug where swap() wasn't swapping delete_key(). (csilvers) 254 | * set_deleted_key() and set_empty_key() now take a key only, 255 | allowing hash-map values to be forward-declared. (csilvers) 256 | * support for std::insert_iterator (and std::inserter). (csilvers) 257 | 258 | Mon May 2 07:04:46 2005 Google Inc. 259 | 260 | * sparsehash: version 0.2 release 261 | * Preliminary support for msvc++ compilation. (csilvers) 262 | * Documentation fixes -- some example code was incomplete! (csilvers) 263 | * Minimize size of config.h to avoid other-package conflicts (csilvers) 264 | * Contribute a C-based version of sparsehash that served as the 265 | inspiration for this code. One day, I hope to clean it up and 266 | support it, but for now it's just in experimental/, for playing 267 | around with. (csilvers) 268 | * Change default namespace from std to google. (csilvers) 269 | 270 | Fri Jan 14 16:53:32 2005 Google Inc. 271 | 272 | * sparsehash: initial release: 273 | The sparsehash package contains several hash-map implementations, 274 | similar in API to SGI's hash_map class, but with different 275 | performance characteristics. sparse_hash_map uses very little 276 | space overhead: 1-2 bits per entry. 
dense_hash_map is typically 277 | faster than the default SGI STL implementation. This package 278 | also includes hash-set analogues of these classes. 279 | 280 | -------------------------------------------------------------------------------- /3rd-party/sparsehash/Makefile.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cslarsen/arv/3999e00361f13d404d0d86d76bcedd4d8d49c393/3rd-party/sparsehash/Makefile.in -------------------------------------------------------------------------------- /3rd-party/sparsehash/README: -------------------------------------------------------------------------------- 1 | This directory contains several hash-map implementations, similar in 2 | API to SGI's hash_map class, but with different performance 3 | characteristics. sparse_hash_map uses very little space overhead, 1-2 4 | bits per entry. dense_hash_map is very fast, particularly on lookup. 5 | (sparse_hash_set and dense_hash_set are the set versions of these 6 | routines.) On the other hand, these classes have requirements that 7 | may not make them appropriate for all applications. 8 | 9 | All these implementations use a hashtable with internal quadratic 10 | probing. This method is space-efficient -- there is no pointer 11 | overhead -- and time-efficient for good hash functions. 12 | 13 | COMPILING 14 | --------- 15 | To compile test applications with these classes, run ./configure 16 | followed by make. To install these header files on your system, run 17 | 'make install'. (On Windows, the instructions are different; see 18 | README_windows.txt.) See INSTALL for more details. 19 | 20 | This code should work on any modern C++ system. It has been tested on 21 | Linux (Ubuntu, Fedora, RedHat, Debian), Solaris 10 x86, FreeBSD 6.0, 22 | OS X 10.3 and 10.4, and Windows under both VC++7 and VC++8. 
23 | 24 | USING 25 | ----- 26 | See the html files in the doc directory for small example programs 27 | that use these classes. It's enough to just include the header file: 28 | 29 | #include <sparsehash/sparse_hash_map> // or sparse_hash_set, dense_hash_map, ... 30 | google::sparse_hash_set<int, int> number_mapper; 31 | 32 | and use the class the way you would other hash-map implementations. 33 | (Though see "API" below for caveats.) 34 | 35 | By default (you can change it via a flag to ./configure), these hash 36 | implementations are defined in the google namespace. 37 | 38 | API 39 | --- 40 | The API for sparse_hash_map, dense_hash_map, sparse_hash_set, and 41 | dense_hash_set, are a superset of the API of SGI's hash_map class. 42 | See doc/sparse_hash_map.html, et al., for more information about the 43 | API. 44 | 45 | The usage of these classes differs from SGI's hash_map, and other 46 | hashtable implementations, in the following major ways: 47 | 48 | 1) dense_hash_map requires you to set aside one key value as the 49 | 'empty bucket' value, set via the set_empty_key() method. This 50 | *MUST* be called before you can use the dense_hash_map. It is 51 | illegal to insert any elements into a dense_hash_map whose key is 52 | equal to the empty-key. 53 | 54 | 2) For both dense_hash_map and sparse_hash_map, if you wish to delete 55 | elements from the hashtable, you must set aside a key value as the 56 | 'deleted bucket' value, set via the set_deleted_key() method. If 57 | your hash-map is insert-only, there is no need to call this 58 | method. If you call set_deleted_key(), it is illegal to insert any 59 | elements into a dense_hash_map or sparse_hash_map whose key is 60 | equal to the deleted-key. 61 | 62 | 3) These hash-map implementations support I/O. See below. 63 | 64 | There are also some smaller differences: 65 | 66 | 1) The constructor takes an optional argument that specifies the 67 | number of elements you expect to insert into the hashtable. 
This 68 | differs from SGI's hash_map implementation, which takes an optional 69 | number of buckets. 70 | 71 | 2) erase() does not immediately reclaim memory. As a consequence, 72 | erase() does not invalidate any iterators, making loops like this 73 | correct: 74 | for (it = ht.begin(); it != ht.end(); ++it) 75 | if (...) ht.erase(it); 76 | As another consequence, a series of erase() calls can leave your 77 | hashtable using more memory than it needs to. The hashtable will 78 | automatically compact at the next call to insert(), but to 79 | manually compact a hashtable, you can call 80 | ht.resize(0) 81 | 82 | I/O 83 | --- 84 | In addition to the normal hash-map operations, sparse_hash_map can 85 | read and write hashtables to disk. (dense_hash_map also has the API, 86 | but it has not yet been implemented, and writes will always fail.) 87 | 88 | In the simplest case, writing a hashtable is as easy as calling two 89 | methods on the hashtable: 90 | ht.write_metadata(fp); 91 | ht.write_nopointer_data(fp); 92 | 93 | Reading in this data is equally simple: 94 | google::sparse_hash_map<...> ht; 95 | ht.read_metadata(fp); 96 | ht.read_nopointer_data(fp); 97 | 98 | The above is sufficient if the key and value do not contain any 99 | pointers: they are basic C types or agglomerations of basic C types. 100 | If the key and/or value do contain pointers, you can still store the 101 | hashtable by replacing write_nopointer_data() with a custom writing 102 | routine. See sparse_hash_map.html et al. for more information. 103 | 104 | SPARSETABLE 105 | ----------- 106 | In addition to the hash-map and hash-set classes, this package also 107 | provides sparsetable.h, an array implementation that uses space 108 | proportional to the number of elements in the array, rather than the 109 | maximum element index. It uses very little space overhead: 2 to 5 110 | bits per entry. See doc/sparsetable.html for the API. 
111 | 112 | RESOURCE USAGE 113 | -------------- 114 | * sparse_hash_map has memory overhead of about 4 to 10 bits per 115 | hash-map entry, assuming a typical average occupancy of 50%. 116 | * dense_hash_map has a factor of 2-3 memory overhead: if your 117 | hashtable data takes X bytes, dense_hash_map will use 3X-4X memory 118 | total. 119 | 120 | Hashtables tend to double in size when resizing, creating an 121 | additional 50% space overhead. dense_hash_map does in fact have a 122 | significant "high water mark" memory use requirement, which is 6 times 123 | the size of hash entries in the table when resizing (when reaching 124 | 50% occupancy, the table resizes to double the previous size, and the 125 | old table (2x) is copied to the new table (4x)). 126 | 127 | sparse_hash_map, however, is written to need very little space 128 | overhead when resizing: only a few bits per hashtable entry. 129 | 130 | PERFORMANCE 131 | ----------- 132 | You can compile and run the included file time_hash_map.cc to examine 133 | the performance of sparse_hash_map, dense_hash_map, and your native 134 | hash_map implementation on your system. One test against the 135 | SGI hash_map implementation gave the following timing information for 136 | a simple find() call: 137 | SGI hash_map: 22 ns 138 | dense_hash_map: 13 ns 139 | sparse_hash_map: 117 ns 140 | SGI map: 113 ns 141 | 142 | See doc/performance.html for more detailed charts on resource usage 143 | and performance data. 144 | 145 | --- 146 | 16 March 2005 147 | (Last updated: 12 September 2010) 148 | -------------------------------------------------------------------------------- /3rd-party/sparsehash/m4/acx_pthread.m4: -------------------------------------------------------------------------------- 1 | # This was retrieved from 2 | # http://svn.0pointer.de/viewvc/trunk/common/acx_pthread.m4?revision=1277&root=avahi 3 | # See also (perhaps for new versions?) 
4 | # http://svn.0pointer.de/viewvc/trunk/common/acx_pthread.m4?root=avahi 5 | # 6 | # We've rewritten the inconsistency check code (from avahi), to work 7 | # more broadly. In particular, it no longer assumes ld accepts -zdefs. 8 | # This caused a restructuring of the code, but the functionality has only 9 | # changed a little. 10 | 11 | dnl @synopsis ACX_PTHREAD([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]]) 12 | dnl 13 | dnl @summary figure out how to build C programs using POSIX threads 14 | dnl 15 | dnl This macro figures out how to build C programs using POSIX threads. 16 | dnl It sets the PTHREAD_LIBS output variable to the threads library and 17 | dnl linker flags, and the PTHREAD_CFLAGS output variable to any special 18 | dnl C compiler flags that are needed. (The user can also force certain 19 | dnl compiler flags/libs to be tested by setting these environment 20 | dnl variables.) 21 | dnl 22 | dnl Also sets PTHREAD_CC to any special C compiler that is needed for 23 | dnl multi-threaded programs (defaults to the value of CC otherwise). 24 | dnl (This is necessary on AIX to use the special cc_r compiler alias.) 25 | dnl 26 | dnl NOTE: You are assumed to not only compile your program with these 27 | dnl flags, but also link it with them as well. e.g. you should link 28 | dnl with $PTHREAD_CC $CFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS 29 | dnl $LIBS 30 | dnl 31 | dnl If you are only building threads programs, you may wish to use 32 | dnl these variables in your default LIBS, CFLAGS, and CC: 33 | dnl 34 | dnl LIBS="$PTHREAD_LIBS $LIBS" 35 | dnl CFLAGS="$CFLAGS $PTHREAD_CFLAGS" 36 | dnl CC="$PTHREAD_CC" 37 | dnl 38 | dnl In addition, if the PTHREAD_CREATE_JOINABLE thread-attribute 39 | dnl constant has a nonstandard name, defines PTHREAD_CREATE_JOINABLE to 40 | dnl that name (e.g. PTHREAD_CREATE_UNDETACHED on AIX). 
41 | dnl 42 | dnl ACTION-IF-FOUND is a list of shell commands to run if a threads 43 | dnl library is found, and ACTION-IF-NOT-FOUND is a list of commands to 44 | dnl run it if it is not found. If ACTION-IF-FOUND is not specified, the 45 | dnl default action will define HAVE_PTHREAD. 46 | dnl 47 | dnl Please let the authors know if this macro fails on any platform, or 48 | dnl if you have any other suggestions or comments. This macro was based 49 | dnl on work by SGJ on autoconf scripts for FFTW (www.fftw.org) (with 50 | dnl help from M. Frigo), as well as ac_pthread and hb_pthread macros 51 | dnl posted by Alejandro Forero Cuervo to the autoconf macro repository. 52 | dnl We are also grateful for the helpful feedback of numerous users. 53 | dnl 54 | dnl @category InstalledPackages 55 | dnl @author Steven G. Johnson 56 | dnl @version 2006-05-29 57 | dnl @license GPLWithACException 58 | dnl 59 | dnl Checks for GCC shared/pthread inconsistency based on work by 60 | dnl Marcin Owsiany 61 | 62 | 63 | AC_DEFUN([ACX_PTHREAD], [ 64 | AC_REQUIRE([AC_CANONICAL_HOST]) 65 | AC_LANG_SAVE 66 | AC_LANG_C 67 | acx_pthread_ok=no 68 | 69 | # We used to check for pthread.h first, but this fails if pthread.h 70 | # requires special compiler flags (e.g. on True64 or Sequent). 71 | # It gets checked for in the link test anyway. 
72 | 73 | # First of all, check if the user has set any of the PTHREAD_LIBS, 74 | # etcetera environment variables, and if threads linking works using 75 | # them: 76 | if test x"$PTHREAD_LIBS$PTHREAD_CFLAGS" != x; then 77 | save_CFLAGS="$CFLAGS" 78 | CFLAGS="$CFLAGS $PTHREAD_CFLAGS" 79 | save_LIBS="$LIBS" 80 | LIBS="$PTHREAD_LIBS $LIBS" 81 | AC_MSG_CHECKING([for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS]) 82 | AC_TRY_LINK_FUNC(pthread_join, acx_pthread_ok=yes) 83 | AC_MSG_RESULT($acx_pthread_ok) 84 | if test x"$acx_pthread_ok" = xno; then 85 | PTHREAD_LIBS="" 86 | PTHREAD_CFLAGS="" 87 | fi 88 | LIBS="$save_LIBS" 89 | CFLAGS="$save_CFLAGS" 90 | fi 91 | 92 | # We must check for the threads library under a number of different 93 | # names; the ordering is very important because some systems 94 | # (e.g. DEC) have both -lpthread and -lpthreads, where one of the 95 | # libraries is broken (non-POSIX). 96 | 97 | # Create a list of thread flags to try. Items starting with a "-" are 98 | # C compiler flags, and other items are library names, except for "none" 99 | # which indicates that we try without any flags at all, and "pthread-config" 100 | # which is a program returning the flags for the Pth emulation library. 101 | 102 | acx_pthread_flags="pthreads none -Kthread -kthread lthread -pthread -pthreads -mthreads pthread --thread-safe -mt pthread-config" 103 | 104 | # The ordering *is* (sometimes) important. 
Some notes on the 105 | # individual items follow: 106 | 107 | # pthreads: AIX (must check this before -lpthread) 108 | # none: in case threads are in libc; should be tried before -Kthread and 109 | # other compiler flags to prevent continual compiler warnings 110 | # -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h) 111 | # -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able) 112 | # lthread: LinuxThreads port on FreeBSD (also preferred to -pthread) 113 | # -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads) 114 | # -pthreads: Solaris/gcc 115 | # -mthreads: Mingw32/gcc, Lynx/gcc 116 | # -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it 117 | # doesn't hurt to check since this sometimes defines pthreads too; 118 | # also defines -D_REENTRANT) 119 | # ... -mt is also the pthreads flag for HP/aCC 120 | # pthread: Linux, etcetera 121 | # --thread-safe: KAI C++ 122 | # pthread-config: use pthread-config program (for GNU Pth library) 123 | 124 | case "${host_cpu}-${host_os}" in 125 | *solaris*) 126 | 127 | # On Solaris (at least, for some versions), libc contains stubbed 128 | # (non-functional) versions of the pthreads routines, so link-based 129 | # tests will erroneously succeed. (We need to link with -pthreads/-mt/ 130 | # -lpthread.) (The stubs are missing pthread_cleanup_push, or rather 131 | # a function called by this macro, so we could check for that, but 132 | # who knows whether they'll stub that too in a future libc.) 
So, 133 | # we'll just look for -pthreads and -lpthread first: 134 | 135 | acx_pthread_flags="-pthreads pthread -mt -pthread $acx_pthread_flags" 136 | ;; 137 | esac 138 | 139 | if test x"$acx_pthread_ok" = xno; then 140 | for flag in $acx_pthread_flags; do 141 | 142 | case $flag in 143 | none) 144 | AC_MSG_CHECKING([whether pthreads work without any flags]) 145 | ;; 146 | 147 | -*) 148 | AC_MSG_CHECKING([whether pthreads work with $flag]) 149 | PTHREAD_CFLAGS="$flag" 150 | ;; 151 | 152 | pthread-config) 153 | AC_CHECK_PROG(acx_pthread_config, pthread-config, yes, no) 154 | if test x"$acx_pthread_config" = xno; then continue; fi 155 | PTHREAD_CFLAGS="`pthread-config --cflags`" 156 | PTHREAD_LIBS="`pthread-config --ldflags` `pthread-config --libs`" 157 | ;; 158 | 159 | *) 160 | AC_MSG_CHECKING([for the pthreads library -l$flag]) 161 | PTHREAD_LIBS="-l$flag" 162 | ;; 163 | esac 164 | 165 | save_LIBS="$LIBS" 166 | save_CFLAGS="$CFLAGS" 167 | LIBS="$PTHREAD_LIBS $LIBS" 168 | CFLAGS="$CFLAGS $PTHREAD_CFLAGS" 169 | 170 | # Check for various functions. We must include pthread.h, 171 | # since some functions may be macros. (On the Sequent, we 172 | # need a special flag -Kthread to make this header compile.) 173 | # We check for pthread_join because it is in -lpthread on IRIX 174 | # while pthread_create is in libc. We check for pthread_attr_init 175 | # due to DEC craziness with -lpthreads. We check for 176 | # pthread_cleanup_push because it is one of the few pthread 177 | # functions on Solaris that doesn't have a non-functional libc stub. 178 | # We try pthread_create on general principles. 
179 | AC_TRY_LINK([#include ], 180 | [pthread_t th; pthread_join(th, 0); 181 | pthread_attr_init(0); pthread_cleanup_push(0, 0); 182 | pthread_create(0,0,0,0); pthread_cleanup_pop(0); ], 183 | [acx_pthread_ok=yes]) 184 | 185 | LIBS="$save_LIBS" 186 | CFLAGS="$save_CFLAGS" 187 | 188 | AC_MSG_RESULT($acx_pthread_ok) 189 | if test "x$acx_pthread_ok" = xyes; then 190 | break; 191 | fi 192 | 193 | PTHREAD_LIBS="" 194 | PTHREAD_CFLAGS="" 195 | done 196 | fi 197 | 198 | # Various other checks: 199 | if test "x$acx_pthread_ok" = xyes; then 200 | save_LIBS="$LIBS" 201 | LIBS="$PTHREAD_LIBS $LIBS" 202 | save_CFLAGS="$CFLAGS" 203 | CFLAGS="$CFLAGS $PTHREAD_CFLAGS" 204 | 205 | # Detect AIX lossage: JOINABLE attribute is called UNDETACHED. 206 | AC_MSG_CHECKING([for joinable pthread attribute]) 207 | attr_name=unknown 208 | for attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do 209 | AC_TRY_LINK([#include ], [int attr=$attr; return attr;], 210 | [attr_name=$attr; break]) 211 | done 212 | AC_MSG_RESULT($attr_name) 213 | if test "$attr_name" != PTHREAD_CREATE_JOINABLE; then 214 | AC_DEFINE_UNQUOTED(PTHREAD_CREATE_JOINABLE, $attr_name, 215 | [Define to necessary symbol if this constant 216 | uses a non-standard name on your system.]) 217 | fi 218 | 219 | AC_MSG_CHECKING([if more special flags are required for pthreads]) 220 | flag=no 221 | case "${host_cpu}-${host_os}" in 222 | *-aix* | *-freebsd* | *-darwin*) flag="-D_THREAD_SAFE";; 223 | *solaris* | *-osf* | *-hpux*) flag="-D_REENTRANT";; 224 | esac 225 | AC_MSG_RESULT(${flag}) 226 | if test "x$flag" != xno; then 227 | PTHREAD_CFLAGS="$flag $PTHREAD_CFLAGS" 228 | fi 229 | 230 | LIBS="$save_LIBS" 231 | CFLAGS="$save_CFLAGS" 232 | # More AIX lossage: must compile with xlc_r or cc_r 233 | if test x"$GCC" != xyes; then 234 | AC_CHECK_PROGS(PTHREAD_CC, xlc_r cc_r, ${CC}) 235 | else 236 | PTHREAD_CC=$CC 237 | fi 238 | 239 | # The next part tries to detect GCC inconsistency with -shared on some 240 | # architectures and 
systems. The problem is that in certain 241 | # configurations, when -shared is specified, GCC "forgets" to 242 | # internally use various flags which are still necessary. 243 | 244 | # 245 | # Prepare the flags 246 | # 247 | save_CFLAGS="$CFLAGS" 248 | save_LIBS="$LIBS" 249 | save_CC="$CC" 250 | 251 | # Try with the flags determined by the earlier checks. 252 | # 253 | # -Wl,-z,defs forces link-time symbol resolution, so that the 254 | # linking checks with -shared actually have any value 255 | # 256 | # FIXME: -fPIC is required for -shared on many architectures, 257 | # so we specify it here, but the right way would probably be to 258 | # properly detect whether it is actually required. 259 | CFLAGS="-shared -fPIC -Wl,-z,defs $CFLAGS $PTHREAD_CFLAGS" 260 | LIBS="$PTHREAD_LIBS $LIBS" 261 | CC="$PTHREAD_CC" 262 | 263 | # In order not to create several levels of indentation, we test 264 | # the value of "$done" until we find the cure or run out of ideas. 265 | done="no" 266 | 267 | # First, make sure the CFLAGS we added are actually accepted by our 268 | # compiler. If not (and OS X's ld, for instance, does not accept -z), 269 | # then we can't do this test. 
270 | if test x"$done" = xno; then 271 | AC_MSG_CHECKING([whether to check for GCC pthread/shared inconsistencies]) 272 | AC_TRY_LINK(,, , [done=yes]) 273 | 274 | if test "x$done" = xyes ; then 275 | AC_MSG_RESULT([no]) 276 | else 277 | AC_MSG_RESULT([yes]) 278 | fi 279 | fi 280 | 281 | if test x"$done" = xno; then 282 | AC_MSG_CHECKING([whether -pthread is sufficient with -shared]) 283 | AC_TRY_LINK([#include ], 284 | [pthread_t th; pthread_join(th, 0); 285 | pthread_attr_init(0); pthread_cleanup_push(0, 0); 286 | pthread_create(0,0,0,0); pthread_cleanup_pop(0); ], 287 | [done=yes]) 288 | 289 | if test "x$done" = xyes; then 290 | AC_MSG_RESULT([yes]) 291 | else 292 | AC_MSG_RESULT([no]) 293 | fi 294 | fi 295 | 296 | # 297 | # Linux gcc on some architectures such as mips/mipsel forgets 298 | # about -lpthread 299 | # 300 | if test x"$done" = xno; then 301 | AC_MSG_CHECKING([whether -lpthread fixes that]) 302 | LIBS="-lpthread $PTHREAD_LIBS $save_LIBS" 303 | AC_TRY_LINK([#include ], 304 | [pthread_t th; pthread_join(th, 0); 305 | pthread_attr_init(0); pthread_cleanup_push(0, 0); 306 | pthread_create(0,0,0,0); pthread_cleanup_pop(0); ], 307 | [done=yes]) 308 | 309 | if test "x$done" = xyes; then 310 | AC_MSG_RESULT([yes]) 311 | PTHREAD_LIBS="-lpthread $PTHREAD_LIBS" 312 | else 313 | AC_MSG_RESULT([no]) 314 | fi 315 | fi 316 | # 317 | # FreeBSD 4.10 gcc forgets to use -lc_r instead of -lc 318 | # 319 | if test x"$done" = xno; then 320 | AC_MSG_CHECKING([whether -lc_r fixes that]) 321 | LIBS="-lc_r $PTHREAD_LIBS $save_LIBS" 322 | AC_TRY_LINK([#include ], 323 | [pthread_t th; pthread_join(th, 0); 324 | pthread_attr_init(0); pthread_cleanup_push(0, 0); 325 | pthread_create(0,0,0,0); pthread_cleanup_pop(0); ], 326 | [done=yes]) 327 | 328 | if test "x$done" = xyes; then 329 | AC_MSG_RESULT([yes]) 330 | PTHREAD_LIBS="-lc_r $PTHREAD_LIBS" 331 | else 332 | AC_MSG_RESULT([no]) 333 | fi 334 | fi 335 | if test x"$done" = xno; then 336 | # OK, we have run out of ideas 337 | 
AC_MSG_WARN([Impossible to determine how to use pthreads with shared libraries]) 338 | 339 | # so it's not safe to assume that we may use pthreads 340 | acx_pthread_ok=no 341 | fi 342 | 343 | AC_MSG_CHECKING([whether what we have so far is sufficient with -nostdlib]) 344 | CFLAGS="-nostdlib $CFLAGS" 345 | # we need c with nostdlib 346 | LIBS="$LIBS -lc" 347 | AC_TRY_LINK([#include ], 348 | [pthread_t th; pthread_join(th, 0); 349 | pthread_attr_init(0); pthread_cleanup_push(0, 0); 350 | pthread_create(0,0,0,0); pthread_cleanup_pop(0); ], 351 | [done=yes],[done=no]) 352 | 353 | if test "x$done" = xyes; then 354 | AC_MSG_RESULT([yes]) 355 | else 356 | AC_MSG_RESULT([no]) 357 | fi 358 | 359 | if test x"$done" = xno; then 360 | AC_MSG_CHECKING([whether -lpthread saves the day]) 361 | LIBS="-lpthread $LIBS" 362 | AC_TRY_LINK([#include ], 363 | [pthread_t th; pthread_join(th, 0); 364 | pthread_attr_init(0); pthread_cleanup_push(0, 0); 365 | pthread_create(0,0,0,0); pthread_cleanup_pop(0); ], 366 | [done=yes],[done=no]) 367 | 368 | if test "x$done" = xyes; then 369 | AC_MSG_RESULT([yes]) 370 | PTHREAD_LIBS="$PTHREAD_LIBS -lpthread" 371 | else 372 | AC_MSG_RESULT([no]) 373 | AC_MSG_WARN([Impossible to determine how to use pthreads with shared libraries and -nostdlib]) 374 | fi 375 | fi 376 | 377 | CFLAGS="$save_CFLAGS" 378 | LIBS="$save_LIBS" 379 | CC="$save_CC" 380 | else 381 | PTHREAD_CC="$CC" 382 | fi 383 | 384 | AC_SUBST(PTHREAD_LIBS) 385 | AC_SUBST(PTHREAD_CFLAGS) 386 | AC_SUBST(PTHREAD_CC) 387 | 388 | # Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND: 389 | if test x"$acx_pthread_ok" = xyes; then 390 | ifelse([$1],,AC_DEFINE(HAVE_PTHREAD,1,[Define if you have POSIX threads libraries and header files.]),[$1]) 391 | : 392 | else 393 | acx_pthread_ok=no 394 | $2 395 | fi 396 | AC_LANG_RESTORE 397 | ])dnl ACX_PTHREAD 398 | -------------------------------------------------------------------------------- /3rd-party/sparsehash/m4/google_namespace.m4: 
-------------------------------------------------------------------------------- 1 | # Allow users to override the namespace we define our application's classes in 2 | # Arg $1 is the default namespace to use if --enable-namespace isn't present. 3 | 4 | # In general, $1 should be 'google', so we put all our exported symbols in a 5 | # unique namespace that is not likely to conflict with anyone else. However, 6 | # when it makes sense -- for instance, when publishing stl-like code -- you 7 | # may want to go with a different default, like 'std'. 8 | 9 | # We guarantee the invariant that GOOGLE_NAMESPACE starts with ::, 10 | # unless it's the empty string. Thus, it's always safe to do 11 | # GOOGLE_NAMESPACE::foo and be sure you're getting the foo that's 12 | # actually in the google namespace, and not some other namespace that 13 | # the namespace rules might kick in. 14 | 15 | AC_DEFUN([AC_DEFINE_GOOGLE_NAMESPACE], 16 | [google_namespace_default=[$1] 17 | AC_ARG_ENABLE(namespace, [ --enable-namespace=FOO to define these Google 18 | classes in the FOO namespace. --disable-namespace 19 | to define them in the global namespace. 
Default 20 | is to define them in namespace $1.], 21 | [case "$enableval" in 22 | yes) google_namespace="$google_namespace_default" ;; 23 | no) google_namespace="" ;; 24 | *) google_namespace="$enableval" ;; 25 | esac], 26 | [google_namespace="$google_namespace_default"]) 27 | if test -n "$google_namespace"; then 28 | ac_google_namespace="::$google_namespace" 29 | ac_google_start_namespace="namespace $google_namespace {" 30 | ac_google_end_namespace="}" 31 | else 32 | ac_google_namespace="" 33 | ac_google_start_namespace="" 34 | ac_google_end_namespace="" 35 | fi 36 | AC_DEFINE_UNQUOTED(GOOGLE_NAMESPACE, $ac_google_namespace, 37 | Namespace for Google classes) 38 | AC_DEFINE_UNQUOTED(_START_GOOGLE_NAMESPACE_, $ac_google_start_namespace, 39 | Puts following code inside the Google namespace) 40 | AC_DEFINE_UNQUOTED(_END_GOOGLE_NAMESPACE_, $ac_google_end_namespace, 41 | Stops putting the code inside the Google namespace) 42 | ]) 43 | -------------------------------------------------------------------------------- /3rd-party/sparsehash/m4/namespaces.m4: -------------------------------------------------------------------------------- 1 | # Checks whether the compiler implements namespaces 2 | AC_DEFUN([AC_CXX_NAMESPACES], 3 | [AC_CACHE_CHECK(whether the compiler implements namespaces, 4 | ac_cv_cxx_namespaces, 5 | [AC_LANG_SAVE 6 | AC_LANG_CPLUSPLUS 7 | AC_TRY_COMPILE([namespace Outer { 8 | namespace Inner { int i = 0; }}], 9 | [using namespace Outer::Inner; return i;], 10 | ac_cv_cxx_namespaces=yes, 11 | ac_cv_cxx_namespaces=no) 12 | AC_LANG_RESTORE]) 13 | if test "$ac_cv_cxx_namespaces" = yes; then 14 | AC_DEFINE(HAVE_NAMESPACES, 1, [define if the compiler implements namespaces]) 15 | fi]) 16 | -------------------------------------------------------------------------------- /3rd-party/sparsehash/m4/stl_hash.m4: -------------------------------------------------------------------------------- 1 | # We check two things: where the include file is for 2 | # 
unordered_map/hash_map (we prefer the first form), and what 3 | # namespace unordered/hash_map lives in within that include file. We 4 | # include AC_TRY_COMPILE for all the combinations we've seen in the 5 | # wild. We define HASH_MAP_H to the location of the header file, and 6 | # HASH_NAMESPACE to the namespace the class (unordered_map or 7 | # hash_map) is in. We define HAVE_UNORDERED_MAP if the class we found 8 | # is named unordered_map, or leave it undefined if not. 9 | 10 | # This also checks if unordered map exists. 11 | AC_DEFUN([AC_CXX_STL_HASH], 12 | [AC_REQUIRE([AC_CXX_NAMESPACES]) 13 | AC_MSG_CHECKING(the location of hash_map) 14 | AC_LANG_SAVE 15 | AC_LANG_CPLUSPLUS 16 | ac_cv_cxx_hash_map="" 17 | # First try unordered_map, but not on gcc's before 4.2 -- I've 18 | # seen unexplainable unordered_map bugs with -O2 on older gcc's. 19 | AC_TRY_COMPILE([#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2)) 20 | # error GCC too old for unordered_map 21 | #endif 22 | ], 23 | [/* no program body necessary */], 24 | [stl_hash_old_gcc=no], 25 | [stl_hash_old_gcc=yes]) 26 | for location in unordered_map tr1/unordered_map; do 27 | for namespace in std std::tr1; do 28 | if test -z "$ac_cv_cxx_hash_map" -a "$stl_hash_old_gcc" != yes; then 29 | # Some older gcc's have a buggy tr1, so test a bit of code. 
30 | AC_TRY_COMPILE([#include <$location>], 31 | [const ${namespace}::unordered_map t; 32 | return t.find(5) == t.end();], 33 | [ac_cv_cxx_hash_map="<$location>"; 34 | ac_cv_cxx_hash_namespace="$namespace"; 35 | ac_cv_cxx_have_unordered_map="yes";]) 36 | fi 37 | done 38 | done 39 | # Now try hash_map 40 | for location in ext/hash_map hash_map; do 41 | for namespace in __gnu_cxx "" std stdext; do 42 | if test -z "$ac_cv_cxx_hash_map"; then 43 | AC_TRY_COMPILE([#include <$location>], 44 | [${namespace}::hash_map t], 45 | [ac_cv_cxx_hash_map="<$location>"; 46 | ac_cv_cxx_hash_namespace="$namespace"; 47 | ac_cv_cxx_have_unordered_map="no";]) 48 | fi 49 | done 50 | done 51 | ac_cv_cxx_hash_set=`echo "$ac_cv_cxx_hash_map" | sed s/map/set/`; 52 | if test -n "$ac_cv_cxx_hash_map"; then 53 | AC_DEFINE(HAVE_HASH_MAP, 1, [define if the compiler has hash_map]) 54 | AC_DEFINE(HAVE_HASH_SET, 1, [define if the compiler has hash_set]) 55 | AC_DEFINE_UNQUOTED(HASH_MAP_H,$ac_cv_cxx_hash_map, 56 | [the location of or ]) 57 | AC_DEFINE_UNQUOTED(HASH_SET_H,$ac_cv_cxx_hash_set, 58 | [the location of or ]) 59 | AC_DEFINE_UNQUOTED(HASH_NAMESPACE,$ac_cv_cxx_hash_namespace, 60 | [the namespace of hash_map/hash_set]) 61 | if test "$ac_cv_cxx_have_unordered_map" = yes; then 62 | AC_DEFINE(HAVE_UNORDERED_MAP,1, 63 | [define if the compiler supports unordered_{map,set}]) 64 | fi 65 | AC_MSG_RESULT([$ac_cv_cxx_hash_map]) 66 | else 67 | AC_MSG_RESULT() 68 | AC_MSG_WARN([could not find an STL hash_map]) 69 | fi 70 | ]) 71 | -------------------------------------------------------------------------------- /3rd-party/sparsehash/m4/stl_hash_fun.m4: -------------------------------------------------------------------------------- 1 | # We just try to figure out where hash<> is defined. It's in some file 2 | # that ends in hash_fun.h... 3 | # 4 | # Ideally we'd use AC_CACHE_CHECK, but that only lets us store one value 5 | # at a time, and we need to store two (filename and namespace). 
6 | # prints messages itself, so we have to do the message-printing ourselves 7 | # via AC_MSG_CHECKING + AC_MSG_RESULT. (TODO(csilvers): can we cache?) 8 | # 9 | # tr1/functional_hash.h: new gcc's with tr1 support 10 | # stl_hash_fun.h: old gcc's (gc2.95?) 11 | # ext/hash_fun.h: newer gcc's (gcc4) 12 | # stl/_hash_fun.h: STLport 13 | 14 | AC_DEFUN([AC_CXX_STL_HASH_FUN], 15 | [AC_REQUIRE([AC_CXX_STL_HASH]) 16 | AC_MSG_CHECKING(how to include hash_fun directly) 17 | AC_LANG_SAVE 18 | AC_LANG_CPLUSPLUS 19 | ac_cv_cxx_stl_hash_fun="" 20 | for location in functional tr1/functional \ 21 | ext/hash_fun.h ext/stl_hash_fun.h \ 22 | hash_fun.h stl_hash_fun.h \ 23 | stl/_hash_fun.h; do 24 | if test -z "$ac_cv_cxx_stl_hash_fun"; then 25 | AC_TRY_COMPILE([#include <$location>], 26 | [int x = ${ac_cv_cxx_hash_namespace}::hash()(5)], 27 | [ac_cv_cxx_stl_hash_fun="<$location>";]) 28 | fi 29 | done 30 | AC_LANG_RESTORE 31 | AC_DEFINE_UNQUOTED(HASH_FUN_H,$ac_cv_cxx_stl_hash_fun, 32 | [the location of the header defining hash functions]) 33 | AC_DEFINE_UNQUOTED(HASH_NAMESPACE,$ac_cv_cxx_hash_namespace, 34 | [the namespace of the hash<> function]) 35 | AC_MSG_RESULT([$ac_cv_cxx_stl_hash_fun]) 36 | ]) 37 | -------------------------------------------------------------------------------- /3rd-party/sparsehash/missing: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # Common stub for a few missing GNU programs while installing. 3 | 4 | scriptversion=2009-04-28.21; # UTC 5 | 6 | # Copyright (C) 1996, 1997, 1999, 2000, 2002, 2003, 2004, 2005, 2006, 7 | # 2008, 2009 Free Software Foundation, Inc. 8 | # Originally by Fran,cois Pinard , 1996. 9 | 10 | # This program is free software; you can redistribute it and/or modify 11 | # it under the terms of the GNU General Public License as published by 12 | # the Free Software Foundation; either version 2, or (at your option) 13 | # any later version. 
14 | 15 | # This program is distributed in the hope that it will be useful, 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | # GNU General Public License for more details. 19 | 20 | # You should have received a copy of the GNU General Public License 21 | # along with this program. If not, see . 22 | 23 | # As a special exception to the GNU General Public License, if you 24 | # distribute this file as part of a program that contains a 25 | # configuration script generated by Autoconf, you may include it under 26 | # the same distribution terms that you use for the rest of that program. 27 | 28 | if test $# -eq 0; then 29 | echo 1>&2 "Try \`$0 --help' for more information" 30 | exit 1 31 | fi 32 | 33 | run=: 34 | sed_output='s/.* --output[ =]\([^ ]*\).*/\1/p' 35 | sed_minuso='s/.* -o \([^ ]*\).*/\1/p' 36 | 37 | # In the cases where this matters, `missing' is being run in the 38 | # srcdir already. 39 | if test -f configure.ac; then 40 | configure_ac=configure.ac 41 | else 42 | configure_ac=configure.in 43 | fi 44 | 45 | msg="missing on your system" 46 | 47 | case $1 in 48 | --run) 49 | # Try to run requested program, and just exit if it succeeds. 50 | run= 51 | shift 52 | "$@" && exit 0 53 | # Exit code 63 means version mismatch. This often happens 54 | # when the user try to use an ancient version of a tool on 55 | # a file that requires a minimum version. In this case we 56 | # we should proceed has if the program had been absent, or 57 | # if --run hadn't been passed. 58 | if test $? = 63; then 59 | run=: 60 | msg="probably too old" 61 | fi 62 | ;; 63 | 64 | -h|--h|--he|--hel|--help) 65 | echo "\ 66 | $0 [OPTION]... PROGRAM [ARGUMENT]... 67 | 68 | Handle \`PROGRAM [ARGUMENT]...' for when PROGRAM is missing, or return an 69 | error status if there is no known handling for PROGRAM. 
70 | 71 | Options: 72 | -h, --help display this help and exit 73 | -v, --version output version information and exit 74 | --run try to run the given command, and emulate it if it fails 75 | 76 | Supported PROGRAM values: 77 | aclocal touch file \`aclocal.m4' 78 | autoconf touch file \`configure' 79 | autoheader touch file \`config.h.in' 80 | autom4te touch the output file, or create a stub one 81 | automake touch all \`Makefile.in' files 82 | bison create \`y.tab.[ch]', if possible, from existing .[ch] 83 | flex create \`lex.yy.c', if possible, from existing .c 84 | help2man touch the output file 85 | lex create \`lex.yy.c', if possible, from existing .c 86 | makeinfo touch the output file 87 | tar try tar, gnutar, gtar, then tar without non-portable flags 88 | yacc create \`y.tab.[ch]', if possible, from existing .[ch] 89 | 90 | Version suffixes to PROGRAM as well as the prefixes \`gnu-', \`gnu', and 91 | \`g' are ignored when checking the name. 92 | 93 | Send bug reports to ." 94 | exit $? 95 | ;; 96 | 97 | -v|--v|--ve|--ver|--vers|--versi|--versio|--version) 98 | echo "missing $scriptversion (GNU Automake)" 99 | exit $? 100 | ;; 101 | 102 | -*) 103 | echo 1>&2 "$0: Unknown \`$1' option" 104 | echo 1>&2 "Try \`$0 --help' for more information" 105 | exit 1 106 | ;; 107 | 108 | esac 109 | 110 | # normalize program name to check for. 111 | program=`echo "$1" | sed ' 112 | s/^gnu-//; t 113 | s/^gnu//; t 114 | s/^g//; t'` 115 | 116 | # Now exit if we have it, but it failed. Also exit now if we 117 | # don't have it and --version was passed (most likely to detect 118 | # the program). This is about non-GNU programs, so use $1 not 119 | # $program. 120 | case $1 in 121 | lex*|yacc*) 122 | # Not GNU programs, they don't have --version. 
123 | ;; 124 | 125 | tar*) 126 | if test -n "$run"; then 127 | echo 1>&2 "ERROR: \`tar' requires --run" 128 | exit 1 129 | elif test "x$2" = "x--version" || test "x$2" = "x--help"; then 130 | exit 1 131 | fi 132 | ;; 133 | 134 | *) 135 | if test -z "$run" && ($1 --version) > /dev/null 2>&1; then 136 | # We have it, but it failed. 137 | exit 1 138 | elif test "x$2" = "x--version" || test "x$2" = "x--help"; then 139 | # Could not run --version or --help. This is probably someone 140 | # running `$TOOL --version' or `$TOOL --help' to check whether 141 | # $TOOL exists and not knowing $TOOL uses missing. 142 | exit 1 143 | fi 144 | ;; 145 | esac 146 | 147 | # If it does not exist, or fails to run (possibly an outdated version), 148 | # try to emulate it. 149 | case $program in 150 | aclocal*) 151 | echo 1>&2 "\ 152 | WARNING: \`$1' is $msg. You should only need it if 153 | you modified \`acinclude.m4' or \`${configure_ac}'. You might want 154 | to install the \`Automake' and \`Perl' packages. Grab them from 155 | any GNU archive site." 156 | touch aclocal.m4 157 | ;; 158 | 159 | autoconf*) 160 | echo 1>&2 "\ 161 | WARNING: \`$1' is $msg. You should only need it if 162 | you modified \`${configure_ac}'. You might want to install the 163 | \`Autoconf' and \`GNU m4' packages. Grab them from any GNU 164 | archive site." 165 | touch configure 166 | ;; 167 | 168 | autoheader*) 169 | echo 1>&2 "\ 170 | WARNING: \`$1' is $msg. You should only need it if 171 | you modified \`acconfig.h' or \`${configure_ac}'. You might want 172 | to install the \`Autoconf' and \`GNU m4' packages. Grab them 173 | from any GNU archive site." 
174 | files=`sed -n 's/^[ ]*A[CM]_CONFIG_HEADER(\([^)]*\)).*/\1/p' ${configure_ac}` 175 | test -z "$files" && files="config.h" 176 | touch_files= 177 | for f in $files; do 178 | case $f in 179 | *:*) touch_files="$touch_files "`echo "$f" | 180 | sed -e 's/^[^:]*://' -e 's/:.*//'`;; 181 | *) touch_files="$touch_files $f.in";; 182 | esac 183 | done 184 | touch $touch_files 185 | ;; 186 | 187 | automake*) 188 | echo 1>&2 "\ 189 | WARNING: \`$1' is $msg. You should only need it if 190 | you modified \`Makefile.am', \`acinclude.m4' or \`${configure_ac}'. 191 | You might want to install the \`Automake' and \`Perl' packages. 192 | Grab them from any GNU archive site." 193 | find . -type f -name Makefile.am -print | 194 | sed 's/\.am$/.in/' | 195 | while read f; do touch "$f"; done 196 | ;; 197 | 198 | autom4te*) 199 | echo 1>&2 "\ 200 | WARNING: \`$1' is needed, but is $msg. 201 | You might have modified some files without having the 202 | proper tools for further handling them. 203 | You can get \`$1' as part of \`Autoconf' from any GNU 204 | archive site." 205 | 206 | file=`echo "$*" | sed -n "$sed_output"` 207 | test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"` 208 | if test -f "$file"; then 209 | touch $file 210 | else 211 | test -z "$file" || exec >$file 212 | echo "#! /bin/sh" 213 | echo "# Created by GNU Automake missing as a replacement of" 214 | echo "# $ $@" 215 | echo "exit 0" 216 | chmod +x $file 217 | exit 1 218 | fi 219 | ;; 220 | 221 | bison*|yacc*) 222 | echo 1>&2 "\ 223 | WARNING: \`$1' $msg. You should only need it if 224 | you modified a \`.y' file. You may need the \`Bison' package 225 | in order for those modifications to take effect. You can get 226 | \`Bison' from any GNU archive site." 
227 | rm -f y.tab.c y.tab.h 228 | if test $# -ne 1; then 229 | eval LASTARG="\${$#}" 230 | case $LASTARG in 231 | *.y) 232 | SRCFILE=`echo "$LASTARG" | sed 's/y$/c/'` 233 | if test -f "$SRCFILE"; then 234 | cp "$SRCFILE" y.tab.c 235 | fi 236 | SRCFILE=`echo "$LASTARG" | sed 's/y$/h/'` 237 | if test -f "$SRCFILE"; then 238 | cp "$SRCFILE" y.tab.h 239 | fi 240 | ;; 241 | esac 242 | fi 243 | if test ! -f y.tab.h; then 244 | echo >y.tab.h 245 | fi 246 | if test ! -f y.tab.c; then 247 | echo 'main() { return 0; }' >y.tab.c 248 | fi 249 | ;; 250 | 251 | lex*|flex*) 252 | echo 1>&2 "\ 253 | WARNING: \`$1' is $msg. You should only need it if 254 | you modified a \`.l' file. You may need the \`Flex' package 255 | in order for those modifications to take effect. You can get 256 | \`Flex' from any GNU archive site." 257 | rm -f lex.yy.c 258 | if test $# -ne 1; then 259 | eval LASTARG="\${$#}" 260 | case $LASTARG in 261 | *.l) 262 | SRCFILE=`echo "$LASTARG" | sed 's/l$/c/'` 263 | if test -f "$SRCFILE"; then 264 | cp "$SRCFILE" lex.yy.c 265 | fi 266 | ;; 267 | esac 268 | fi 269 | if test ! -f lex.yy.c; then 270 | echo 'main() { return 0; }' >lex.yy.c 271 | fi 272 | ;; 273 | 274 | help2man*) 275 | echo 1>&2 "\ 276 | WARNING: \`$1' is $msg. You should only need it if 277 | you modified a dependency of a manual page. You may need the 278 | \`Help2man' package in order for those modifications to take 279 | effect. You can get \`Help2man' from any GNU archive site." 280 | 281 | file=`echo "$*" | sed -n "$sed_output"` 282 | test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"` 283 | if test -f "$file"; then 284 | touch $file 285 | else 286 | test -z "$file" || exec >$file 287 | echo ".ab help2man is required to generate this page" 288 | exit $? 289 | fi 290 | ;; 291 | 292 | makeinfo*) 293 | echo 1>&2 "\ 294 | WARNING: \`$1' is $msg. You should only need it if 295 | you modified a \`.texi' or \`.texinfo' file, or any other file 296 | indirectly affecting the aspect of the manual. 
The spurious 297 | call might also be the consequence of using a buggy \`make' (AIX, 298 | DU, IRIX). You might want to install the \`Texinfo' package or 299 | the \`GNU make' package. Grab either from any GNU archive site." 300 | # The file to touch is that specified with -o ... 301 | file=`echo "$*" | sed -n "$sed_output"` 302 | test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"` 303 | if test -z "$file"; then 304 | # ... or it is the one specified with @setfilename ... 305 | infile=`echo "$*" | sed 's/.* \([^ ]*\) *$/\1/'` 306 | file=`sed -n ' 307 | /^@setfilename/{ 308 | s/.* \([^ ]*\) *$/\1/ 309 | p 310 | q 311 | }' $infile` 312 | # ... or it is derived from the source name (dir/f.texi becomes f.info) 313 | test -z "$file" && file=`echo "$infile" | sed 's,.*/,,;s,.[^.]*$,,'`.info 314 | fi 315 | # If the file does not exist, the user really needs makeinfo; 316 | # let's fail without touching anything. 317 | test -f $file || exit 1 318 | touch $file 319 | ;; 320 | 321 | tar*) 322 | shift 323 | 324 | # We have already tried tar in the generic part. 325 | # Look for gnutar/gtar before invocation to avoid ugly error 326 | # messages. 327 | if (gnutar --version > /dev/null 2>&1); then 328 | gnutar "$@" && exit 0 329 | fi 330 | if (gtar --version > /dev/null 2>&1); then 331 | gtar "$@" && exit 0 332 | fi 333 | firstarg="$1" 334 | if shift; then 335 | case $firstarg in 336 | *o*) 337 | firstarg=`echo "$firstarg" | sed s/o//` 338 | tar "$firstarg" "$@" && exit 0 339 | ;; 340 | esac 341 | case $firstarg in 342 | *h*) 343 | firstarg=`echo "$firstarg" | sed s/h//` 344 | tar "$firstarg" "$@" && exit 0 345 | ;; 346 | esac 347 | fi 348 | 349 | echo 1>&2 "\ 350 | WARNING: I can't seem to be able to run \`tar' with the given arguments. 351 | You may want to install GNU tar or Free paxutils, or check the 352 | command line arguments." 353 | exit 1 354 | ;; 355 | 356 | *) 357 | echo 1>&2 "\ 358 | WARNING: \`$1' is needed, and is $msg. 
359 | You might have modified some files without having the 360 | proper tools for further handling them. Check the \`README' file, 361 | it often tells you about the needed prerequisites for installing 362 | this package. You may also peek at any GNU archive site, in case 363 | some other package would contain this missing \`$1' program." 364 | exit 1 365 | ;; 366 | esac 367 | 368 | exit 0 369 | 370 | # Local variables: 371 | # eval: (add-hook 'write-file-hooks 'time-stamp) 372 | # time-stamp-start: "scriptversion=" 373 | # time-stamp-format: "%:y-%02m-%02d.%02H" 374 | # time-stamp-time-zone: "UTC" 375 | # time-stamp-end: "; # UTC" 376 | # End: 377 | -------------------------------------------------------------------------------- /3rd-party/sparsehash/src/config.h.in: -------------------------------------------------------------------------------- 1 | /* src/config.h.in. Generated from configure.ac by autoheader. */ 2 | 3 | /* Namespace for Google classes */ 4 | #undef GOOGLE_NAMESPACE 5 | 6 | /* the location of the header defining hash functions */ 7 | #undef HASH_FUN_H 8 | 9 | /* the location of or */ 10 | #undef HASH_MAP_H 11 | 12 | /* the namespace of the hash<> function */ 13 | #undef HASH_NAMESPACE 14 | 15 | /* the location of or */ 16 | #undef HASH_SET_H 17 | 18 | /* Define to 1 if you have the header file. */ 19 | #undef HAVE_GOOGLE_MALLOC_EXTENSION_H 20 | 21 | /* define if the compiler has hash_map */ 22 | #undef HAVE_HASH_MAP 23 | 24 | /* define if the compiler has hash_set */ 25 | #undef HAVE_HASH_SET 26 | 27 | /* Define to 1 if you have the header file. */ 28 | #undef HAVE_INTTYPES_H 29 | 30 | /* Define to 1 if the system has the type `long long'. */ 31 | #undef HAVE_LONG_LONG 32 | 33 | /* Define to 1 if you have the `memcpy' function. */ 34 | #undef HAVE_MEMCPY 35 | 36 | /* Define to 1 if you have the `memmove' function. */ 37 | #undef HAVE_MEMMOVE 38 | 39 | /* Define to 1 if you have the header file. 
*/ 40 | #undef HAVE_MEMORY_H 41 | 42 | /* define if the compiler implements namespaces */ 43 | #undef HAVE_NAMESPACES 44 | 45 | /* Define if you have POSIX threads libraries and header files. */ 46 | #undef HAVE_PTHREAD 47 | 48 | /* Define to 1 if you have the header file. */ 49 | #undef HAVE_STDINT_H 50 | 51 | /* Define to 1 if you have the header file. */ 52 | #undef HAVE_STDLIB_H 53 | 54 | /* Define to 1 if you have the header file. */ 55 | #undef HAVE_STRINGS_H 56 | 57 | /* Define to 1 if you have the header file. */ 58 | #undef HAVE_STRING_H 59 | 60 | /* Define to 1 if you have the header file. */ 61 | #undef HAVE_SYS_RESOURCE_H 62 | 63 | /* Define to 1 if you have the header file. */ 64 | #undef HAVE_SYS_STAT_H 65 | 66 | /* Define to 1 if you have the header file. */ 67 | #undef HAVE_SYS_TIME_H 68 | 69 | /* Define to 1 if you have the header file. */ 70 | #undef HAVE_SYS_TYPES_H 71 | 72 | /* Define to 1 if you have the header file. */ 73 | #undef HAVE_SYS_UTSNAME_H 74 | 75 | /* Define to 1 if the system has the type `uint16_t'. */ 76 | #undef HAVE_UINT16_T 77 | 78 | /* Define to 1 if you have the header file. */ 79 | #undef HAVE_UNISTD_H 80 | 81 | /* define if the compiler supports unordered_{map,set} */ 82 | #undef HAVE_UNORDERED_MAP 83 | 84 | /* Define to 1 if the system has the type `u_int16_t'. */ 85 | #undef HAVE_U_INT16_T 86 | 87 | /* Define to 1 if the system has the type `__uint16'. */ 88 | #undef HAVE___UINT16 89 | 90 | /* Name of package */ 91 | #undef PACKAGE 92 | 93 | /* Define to the address where bug reports for this package should be sent. */ 94 | #undef PACKAGE_BUGREPORT 95 | 96 | /* Define to the full name of this package. */ 97 | #undef PACKAGE_NAME 98 | 99 | /* Define to the full name and version of this package. */ 100 | #undef PACKAGE_STRING 101 | 102 | /* Define to the one symbol short name of this package. */ 103 | #undef PACKAGE_TARNAME 104 | 105 | /* Define to the home page for this package. 
*/ 106 | #undef PACKAGE_URL 107 | 108 | /* Define to the version of this package. */ 109 | #undef PACKAGE_VERSION 110 | 111 | /* Define to necessary symbol if this constant uses a non-standard name on 112 | your system. */ 113 | #undef PTHREAD_CREATE_JOINABLE 114 | 115 | /* The system-provided hash function including the namespace. */ 116 | #undef SPARSEHASH_HASH 117 | 118 | /* The system-provided hash function, in namespace HASH_NAMESPACE. */ 119 | #undef SPARSEHASH_HASH_NO_NAMESPACE 120 | 121 | /* Define to 1 if you have the ANSI C header files. */ 122 | #undef STDC_HEADERS 123 | 124 | /* Version number of package */ 125 | #undef VERSION 126 | 127 | /* Stops putting the code inside the Google namespace */ 128 | #undef _END_GOOGLE_NAMESPACE_ 129 | 130 | /* Puts following code inside the Google namespace */ 131 | #undef _START_GOOGLE_NAMESPACE_ 132 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft 3rd-party 2 | graft cpp 3 | graft cython 4 | graft tests 5 | include COPYING 6 | include LICENSES 7 | include README.rst 8 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | arv — a fast 23andMe parser for Python 2 | ====================================== 3 | |travis-status| |versions| |license| |pypi| 4 | 5 | Arv (Norwegian; "heritage" or "inheritance") is a Python module for parsing raw 6 | 23andMe genome files. It lets you lookup SNPs from RSIDs. 7 | 8 | .. 
code:: python 9 | 10 | from arv import load, unphased_match as match 11 | 12 | genome = load("genome.txt") 13 | 14 | print("You are a {gender} with {color} eyes and {complexion} skin.".format( 15 | gender = "man" if genome.y_chromosome else "woman", 16 | complexion = "light" if genome["rs1426654"] == "AA" else "dark", 17 | color = match(genome["rs12913832"], {"AA": "brown", 18 | "AG": "brown or green", 19 | "GG": "blue"}))) 20 | 21 | For my genome, this little program produces:: 22 | 23 | You are a man with blue eyes and light skin. 24 | 25 | The parser is insanely fast, having been written in finely tuned C++, exposed 26 | via Cython. A 2013 Xeon machine I've tested on parses a 24 Mb file into a hash 27 | table in about 78 ms. The newer 23andMe files are smaller, and parses in a mere 28 | 62 ms! 29 | 30 | Works with Python 2.7+ and 3+. Installable with pip! 31 | 32 | .. code:: bash 33 | 34 | $ pip install --upgrade arv 35 | 36 | See below for software requirements. 37 | 38 | Important disclaimer 39 | ==================== 40 | 41 | It's very important to tell you that I, the author of arv, am merely a 42 | *hobbyist*! I *am* a professional software developer, but *not* a geneticist, 43 | biologist, medical doctor or anything like that. 44 | 45 | Because of that, this software may not only look weird to people in the field, 46 | it may also contain serious errors. If you find any problem whatsoever, please 47 | submit a GitHub issue. 48 | 49 | This a slightly modified version of what I wrote for the original software 50 | called "dna-traits", and the same goes for this software: 51 | 52 | In addition to the GPL v3 licensing terms, and given that this code deals with 53 | health-related issues, I want to stress that the provided code most likely 54 | contains errors, or invalid genome reports. Results from this code must be 55 | interpreted as HIGHLY SPECULATIVE and may even be downright INCORRECT. Always 56 | consult an expert (medical doctor, geneticist, etc.) 
for guidance. I take NO 57 | RESPONSIBILITY whatsoever for any consequences of using this code, including 58 | but not limited to loss of life, money, spouses, self-esteem and so on. Use at 59 | YOUR OWN RISK. 60 | 61 | The intended use is for casual, educational purposes. If this code is used for 62 | research purposes, please cross-check key results with other software: The 63 | parser code may contain serious errors, for example. 64 | 65 | An interesting story about the research part: I once released a pretty good 66 | Mersenne Twister PRNG for C++ that ended up being used in research. Turned out 67 | the engine had bugs, and by the time I had fixed them, a poor researcher had 68 | already produced results with it (hopefully not published; I don't know). The 69 | guy had to go back and fix his stuff, and I felt terribly bad about it. 70 | 71 | So beware! 72 | 73 | Installation 74 | ============ 75 | 76 | The recommended way is to install from PyPI. 77 | 78 | .. code:: bash 79 | 80 | $ pip install arv 81 | 82 | This will most likely build Arv from source. The package will automatically 83 | install Cython, but it doesn't check if you have a C++11 compiler. Furthermore, 84 | it passes some additional compilation flags that are specific to clang/gcc. 85 | 86 | If you have problems running ``pip install arv``, please open an issue on 87 | GitHub with as much detail as possible (``g++/clang++ --version``, ``uname 88 | -a``, ``python --version`` and so on). 89 | 90 | If you set the environment variable ``ARV_DEBUG``, it will build with full 91 | warnings and debug symbols. 92 | 93 | You can also install it locally through ``setup.py``. The following builds and 94 | tests, but does not install, arv: 95 | 96 | .. code:: bash 97 | 98 | $ python setup.py test 99 | 100 | If you set the environment variable ``ARV_BENCHMARK`` to a genome filename and 101 | run the tests, it will perform a short benchmark, reporting the best parsing 102 | time on it.
You can also set ``ARV_BENCHMARK_COUNT=`` to change how 103 | many times it should parse the given file. 104 | 105 | Usage 106 | ===== 107 | 108 | First you need to dump the raw genome file from 23andMe. You'll find it under 109 | the raw genome browser, and download the file. You may have to unzip it first: 110 | The parser works on the pure text files. 111 | 112 | Then you load the genome in Python with 113 | 114 | .. code:: python 115 | 116 | >>> genome = arv.load("filename.txt") 117 | >>> genome 118 | 119 | 120 | To see if there are any Y-chromosomes present in the genome, 121 | 122 | .. code:: python 123 | 124 | >>> genome.y_chromosome 125 | True 126 | 127 | The genome provides a ``dict``-like interface. To get a given SNP, just enter the RSID. 128 | 129 | .. code:: python 130 | 131 | >>> snp = genome["rs123"] 132 | >>> snp 133 | 134 | >>> snp.chromosome 135 | 7 136 | >>> snp.position 137 | 24966446 138 | >>> snp.genotype 139 | 140 | 141 | The ``Genotype`` object can be converted to a string with ``str``, but it also 142 | allows rich comparisons with strings directly: 143 | 144 | .. code:: python 145 | 146 | >>> snp.genotype == "AA" 147 | True 148 | 149 | You can get its complement with the ``~``-operator. 150 | 151 | .. code:: python 152 | 153 | >>> type(snp.genotype) 154 | 155 | >>> ~snp.genotype 156 | 157 | 158 | The complement is important due to each SNP's orientation. All of 23andMe's SNPs 159 | are oriented towards the positive ("plus") strand, based on the `GRCh37 160 | `_ reference human genome assembly 161 | build. But some SNPs on SNPedia are given with the `minus orientation 162 | `_. 163 | 164 | For example, to determine if the human in question is likely lactose tolerant 165 | or not, we can look at `rs4988235 `_. 166 | SNPedia reports its *Stabilized* orientation to be minus, so we need to use the 167 | complement: 168 | 169 | .. 
code:: python 170 | 171 | >>> genome["rs4988235"].genotype 172 | 173 | >>> ~genome["rs4988235"].genotype 174 | 175 | 176 | By reading a few `GWAS 177 | `_ research 178 | papers, we can build a rule to determine a human's likelihood for lactose 179 | tolerance: 180 | 181 | .. code:: python 182 | 183 | >>> arv.unphased_match(~genome["rs4988235"].genotype, { 184 | "TT": "Likely lactose tolerant", 185 | "TC": "Likely lactose tolerant", 186 | "CC": "Likely lactose intolerant", 187 | None: "Unable to determine (genotype not present)"}) 188 | 'Likely lactose tolerant' 189 | 190 | Note that reading GWAS papers for hobbyists can be a bit tricky. If you are a 191 | hobbyist, be sure to spend some time reading the paper closely, checking up 192 | SNPs on places like `SNPedia `_, `dbSNP 193 | `_ and `OpenSNP 194 | `_. Finally, have fun, but be extremely careful 195 | about drawing conclusions from your results. 196 | 197 | Command line interface 198 | ====================== 199 | 200 | You can also invoke ``arv`` from the command line: 201 | 202 | .. code:: bash 203 | 204 | $ python -m arv --help 205 | 206 | For example, you can drop into a Python REPL like so: 207 | 208 | .. code:: bash 209 | 210 | $ python -m arv --repl genome.txt 211 | genome.txt ... 960614 SNPs, male 212 | Type `genome` to see the parsed 23andMe raw genome file 213 | >>> genome 214 | 215 | >>> genome["rs123"] 216 | > 217 | 218 | If you specify several files, you can access them through the variable 219 | ``genomes``. 220 | 221 | The example at the top of this document can be run with ``--example``: 222 | 223 | .. code:: bash 224 | 225 | $ python -m arv --example genome.txt 226 | genome.txt ... 960614 SNPs, male 227 | 228 | genome.txt ... A man with blue eyes and light skin 229 | 230 | License 231 | ======= 232 | 233 | Copyright 2017 Christian Stigen Larsen 234 | 235 | Distributed under the GNU GPL v3 or later. See the file COPYING for the full 236 | license text. 
This software makes use of open source software; see LICENSES for 237 | details. 238 | 239 | .. |travis-status| image:: https://travis-ci.org/cslarsen/arv.svg?branch=master 240 | :alt: Travis build status 241 | :scale: 100% 242 | :target: https://travis-ci.org/cslarsen/arv 243 | 244 | .. |license| image:: https://img.shields.io/badge/license-GPL%20v3%2B-blue.svg 245 | :target: http://www.gnu.org/licenses/old-licenses/gpl-3.en.html 246 | :alt: Project License 247 | 248 | .. |versions| image:: https://img.shields.io/badge/python-2.7%2B%2C%203%2B-blue.svg 249 | :target: https://pypi.python.org/pypi/arv/ 250 | :alt: Supported Python versions 251 | 252 | .. |pypi| image:: https://badge.fury.io/py/arv.svg 253 | :target: https://badge.fury.io/py/arv 254 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | - Parse GRCh37/38 build 2 | - Make parsing safe 3 | - detect overflow while parsing integers 4 | - use fuzzing (afl, e.g.) to break the parser 5 | - Benchmark other ops 6 | - iteration 7 | - random access 8 | - Modify google dense hash map to move/emplace from buffer 9 | - try to use a nearly full buffer to make this faster 10 | - Try to move y-chromo detection out of the loop 11 | - Remove the pimpl pattern, don't need it anymore 12 | - Build with profiling 13 | - Build with gcov, do coverage testing 14 | - Use coveralls 15 | -------------------------------------------------------------------------------- /arv/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | A fast 23andMe raw genome file parser. 
3 | 4 | To load a genome file, 5 | 6 | >>> import arv 7 | >>> genome = arv.load("genome.txt") 8 | 9 | You can then look up genotypes from RSIDs 10 | 11 | >>> genome["rs123"] 12 | 'AA' 13 | 14 | You can also access SNPs 15 | 16 | >>> genome["rs123"] 17 | 18 | 19 | By using ``unphased_match``, you can match genotypes while disregarding the 20 | ordering of the two nucleotides. For example, ``AT`` and ``TA`` would be 21 | considered equal. Here is an example usage: 22 | 23 | genotype = genome["rs12913832"] 24 | eyecolor = unphased_match(genotype, { 25 | "AA": "brown", 26 | "AG": "brown or green", 27 | "GG": "blue", 28 | None: "unknown"}) 29 | 30 | A full example would be: 31 | 32 | import arv 33 | 34 | genome = arv.load("genome.txt") 35 | 36 | print("You are a {gender} with {color} eyes and {complexion} skin.".format( 37 | gender = "man" if genome.y_chromosome else "woman", 38 | complexion = "light" if genome["rs1426654"] == "AA" else "dark", 39 | color = unphased_match(genome["rs12913832"], { 40 | "AA": "brown", 41 | "AG": "brown or green", 42 | "GG": "blue"}))) 43 | 44 | For a given genome, this might print 45 | 46 | You are a man with blue eyes and light skin. 47 | 48 | Copyright 2014, 2016, 2017 Christian Stigen Larsen 49 | Distributed under the GNU GPL v3 or later. 
50 | """ 51 | 52 | from _arv import ( 53 | _sizes, 54 | Genome, 55 | Genotype, 56 | load, 57 | SNP, 58 | ) 59 | 60 | from .match import unphased_match 61 | 62 | __author__ = "Christian Stigen Larsen" 63 | __copyright__ = "Copyright 2017 Christian Stigen Larsen" 64 | __credits__ = ["Christian Stigen Larsen", "Google"] 65 | __email__ = "csl@csl.name" 66 | __license__ = "GNU General Public License v3 or later" 67 | __maintainer__ = "Christian Stigen Larsen" 68 | __status__ = "Prototype" 69 | __version__ = "0.9.3" 70 | 71 | __all__ = [ 72 | "_sizes", 73 | "Genome", 74 | "Genotype", 75 | "load", 76 | "SNP", 77 | "unphased_match", 78 | ] 79 | -------------------------------------------------------------------------------- /arv/__main__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Command line interface to arv. 3 | 4 | This can be invoked with ``python -m arv``. 5 | 6 | Copyright 2017 Christian Stigen Larsen 7 | Distributed under the GNU GPL v3 or later. See COPYING. 
def summary(genome):
    """Describe a genome as "<SNP count> SNPs, <male|female>".

    The sex label is "male" when ``genome.y_chromosome`` is truthy and
    "female" otherwise; the count is ``len(genome)``.
    """
    if genome.y_chromosome:
        gender = "male"
    else:
        gender = "female"

    return "%d SNPs, %s" % (len(genome), gender)
def unphased_match(snp, phenotypes):
    """Match SNP with unphased genotypes and return phenotype.

    Disregards phasing when comparing genotypes, meaning that an input value of
    "AG" will be matched against both "AG" and "GA".

    Arguments:
        snp: Genotype string (str), arv.Genotype or arv.SNP to match.
        phenotypes: Dict mapping (unphased) genotype to phenotype.

    Example:
        unphased_match(genome.rs4988235, {
            "AA": "Likely lactose tolerant",
            "AG": "Likely lactose tolerant",
            "GG": "Likely lactose intolerant",
            None: "Unknown genotype"})

    The above example could return "Likely lactose tolerant", for example,
    or "Unknown genotype" if there was no match. Note that the key "AG"
    will match both "AG" and "GA" in the snp.

    Returns:
        Matching phenotype. If the `phenotypes` dict has a `None` key, its
        value will be returned in case there is no match.

    Raises:
        TypeError: If `snp` is not a str, arv.Genotype or arv.SNP.
        KeyError: If nothing matches and `phenotypes` has no `None` key.
    """
    # Normalize every accepted input type to a plain genotype string.
    if isinstance(snp, str):
        genotype = snp
    elif isinstance(snp, arv.Genotype):
        genotype = str(snp)
    elif isinstance(snp, arv.SNP):
        genotype = str(snp.genotype)
    else:
        raise TypeError(type(snp))

    # Try the genotype as given ("IJ"), then with the alleles swapped ("JI").
    # The swap is done on the normalized genotype string, not on str(snp), so
    # it does not depend on how the input object happens to stringify.
    for candidate in (genotype, genotype[::-1]):
        if candidate in phenotypes:
            return phenotypes[candidate]

    # Use default value?
    if None in phenotypes:
        return phenotypes[None]

    raise KeyError(genotype)
def breastfeeding_iq(genome):
    "Breastfeeding and IQ."
    # The docstring text up to its first "." is what arv.util.make_report
    # uses as this trait's title, so it is kept verbatim.
    assert_european(genome)

    # One message per SNP; each message carries its own trailing newline.
    if "C" in str(genome["rs174575"].genotype):
        first = "Being breastfed raised subjects' IQ by 6-7 points on average (rs174575)\n"
    else:
        first = "Little to no effect of being breastfed on IQ (rs174575)\n"

    if "A" in str(genome["rs1535"].genotype):
        second = "Being breastfed raised subjects' IQ by 4-5 points on average (rs1535)\n"
    else:
        second = "Little or no effect of being breastfed on IQ (rs1535)\n"

    return "".join((first, second))
def norovirus_resistance(genome):
    """Norovirus resistance (most common strain)."""
    # Look the SNP up first so a missing RSID fails before anything else.
    snp = genome["rs601338"]

    # Only the AA genotype is reported as resistant; AG and GG share a message.
    not_resistant = "Likely not resistant to most common strain"
    outcomes = {
        "AA": "Resistant to most common strain",
        "AG": not_resistant,
        "GG": not_resistant,
        None: "Unable to determine",
    }
    return unphased_match(snp, outcomes)
def red_hair(genome):
    """Hair color; odds for red hair."""
    # Look the SNP up first so a missing RSID fails before anything else.
    snp = genome["rs1805007"]

    # Phenotype text per unphased genotype; None is the no-match fallback.
    odds_by_genotype = {
        "CC": "Typical odds for red hair",
        "CT": "Substantially increased odds for red hair",
        "TT": "Greatly increased odds for red hair",
        None: "Unable to determine",
    }
    return unphased_match(snp, odds_by_genotype)
def hair_curl(genome):
    """Hair curl."""
    # The docstring yields the same report title ("Hair curl") that
    # arv.util.make_report previously derived from the function name.
    assert_european(genome)

    snp = genome["rs17646946"]

    # There is deliberately no None key here: an unmatched genotype raises
    # KeyError from unphased_match instead of returning a fallback text.
    straighter = "Straighter hair on average"
    outcomes = {
        "AA": straighter,
        "AG": straighter,
        "GG": "Slightly curlier hair on average",
    }
    return unphased_match(snp, outcomes)
def biological_age(genome):
    """Biological aging (telomere lengths)."""
    assert_european(genome)

    # Per-SNP offsets, reported in years by the messages below. A genotype
    # that matches nothing contributes 0 through the None key.
    offsets_by_rsid = {
        "rs10936599": {"TT": 7.82, "CT": 3.91, "CC": 0, None: 0},
        "rs2736100":  {"AA": 3.14, "AC": 0, "CC": -3.14, None: 0},
        "rs9420907":  {"AA": 0, "AC": -2.76, "CC": -5.52, None: 0},
        "rs755017":   {"AA": 0, "AG": -2.47, "GG": -4.94, None: 0},
        "rs11100479": {"CC": 5.98, "CT": -2.99, "TT": 0, None: 0},
        "rs10165485": {"TT": 0, "CT": -2.23, "CC": -4.46, None: 0},
    }

    def direction(years):
        # Zero and negative offsets are both worded as "younger".
        return "older" if years > 0 else "younger"

    offsets = []
    for rsid, table in offsets_by_rsid.items():
        offsets.append(unphased_match(genome[rsid], table))

    lowest = min(offsets)
    highest = max(offsets)
    total = sum(offsets)

    range_line = "From %.1f years %s to %.1f years %s than actual age\n" % (
        abs(lowest), direction(lowest), abs(highest), direction(highest))
    sum_line = "The sum is %.1f years %s, compared to actual age" % (
        abs(total), direction(total))
    return range_line + sum_line
def traits_report(genome):
    """Infer traits from genome."""
    # Every trait function is called with the genome by make_report, which
    # also decides how failures of individual traits are handled.
    trait_functions = [
        adiponectin_levels,
        alcohol_flush_reaction,
        aspargus_detection,
        biological_age,
        birth_weight,
        bitter_taste,
        blond_vs_brown_hair,
        blood_glucose,
        breastfeeding_iq,
        caffeine_metabolism,
        earwax_type,
        eye_color,
        hair_curl,
        heroin_addiction,
        hiv_aids_resistance,
        lactose_intolerance,
        malaria_resistance,
        male_pattern_baldness,
        muscle_performance,
        norovirus_resistance,
        pain_sensitivity,
        red_hair,
        smoking_behaviour,
    ]
    return make_report(genome, trait_functions)
def make_report(genome, functions, verbose=False):
    """Runs each function with genome as argument, returning a dict of
    results.

    Arguments:
        genome: Object passed unchanged as the only argument to each function.
        functions: Iterable of callables taking the genome.
        verbose: If true, ValueError and AssertionError raised by a function
            are recorded in the report as "Error: <message>"; otherwise such
            functions are left out.

    Returns:
        Dict mapping a title to each function's result. The title is the text
        of the function's docstring up to its first period, or a title built
        from the function name when there is no docstring. Functions that
        return None, or that raise KeyError or NotImplementedError, are
        omitted.
    """
    report = {}

    for func in functions:
        doc = func.__doc__
        if doc:
            # partition() tolerates a docstring without a period, where
            # index(".") would raise an uncaught ValueError.
            title = doc.partition(".")[0].strip()
        else:
            title = func.__name__.replace("_", " ").capitalize()

        try:
            result = func(genome)
        except (ValueError, AssertionError) as e:
            # The trait is not applicable to this genome.
            if verbose:
                report[title] = "Error: %s" % e
        except (KeyError, NotImplementedError):
            # Missing SNP/genotype or a placeholder trait: leave it out.
            continue
        else:
            if result is not None:
                report[title] = result

    return report
// Returns the complementary nucleotide: A <-> T and C <-> G.
// D, I and NONE have no counterpart here and are returned unchanged.
Nucleotide complement(const Nucleotide& n)
{
  switch ( n ) {
    case A: return T;
    case C: return G;
    case D: return D;
    case G: return C;
    case I: return I;
    case NONE: return NONE;
    case T: return A;
  }
  // Reached only for a value not covered by the cases above.
  return NONE;
}
nucleotide_char(second); 109 | 110 | if ( s[0] != '-' && s[1] == '-' ) 111 | s[1] = '\0'; 112 | 113 | return std::string(s); 114 | } 115 | 116 | SNP::SNP() : 117 | chromosome(CHR_NO), 118 | position(0), 119 | genotype(NONE, NONE) 120 | { 121 | } 122 | 123 | SNP::SNP(const Chromosome& chr, const Position& pos, const Genotype& gt) : 124 | chromosome(chr), 125 | position(pos), 126 | genotype(gt) 127 | { 128 | } 129 | 130 | SNP::SNP(const SNP& snp) : 131 | chromosome(snp.chromosome), 132 | position(snp.position), 133 | genotype(snp.genotype) 134 | { 135 | } 136 | 137 | SNP& SNP::operator=(const SNP& snp) { 138 | if ( this != &snp ) { 139 | genotype = snp.genotype; 140 | chromosome = snp.chromosome; 141 | position = snp.position; 142 | } 143 | return *this; 144 | } 145 | 146 | bool SNP::operator==(const SNP& snp) const 147 | { 148 | return position == snp.position && 149 | chromosome == snp.chromosome && 150 | genotype == snp.genotype; 151 | } 152 | 153 | bool SNP::operator<(const SNP& snp) const 154 | { 155 | if ( position > snp.position ) 156 | return false; 157 | if ( position < snp.position ) 158 | return true; 159 | 160 | // equal position 161 | if ( chromosome > snp.chromosome ) 162 | return false; 163 | if ( chromosome < snp.chromosome ) 164 | return true; 165 | 166 | // equal chromosome 167 | return genotype < snp.genotype; 168 | } 169 | 170 | bool SNP::operator>(const SNP& snp) const 171 | { 172 | return !(*this <= snp); 173 | } 174 | 175 | bool SNP::operator<=(const SNP& snp) const 176 | { 177 | return *this == snp || *this < snp; 178 | } 179 | 180 | bool SNP::operator>=(const SNP& snp) const 181 | { 182 | return *this == snp || *this > snp; 183 | } 184 | 185 | bool SNP::operator!=(const SNP& snp) const 186 | { 187 | return !(*this == snp); 188 | } 189 | 190 | bool SNP::operator==(const Genotype& g) const 191 | { 192 | return genotype == g; 193 | } 194 | 195 | struct GenomeIteratorImpl { 196 | SNPMap::const_iterator it; 197 | 198 | 
GenomeIteratorImpl(SNPMap::const_iterator& i): 199 | it(i) 200 | { 201 | } 202 | }; 203 | 204 | GenomeIterator::GenomeIterator(): 205 | pimpl(NULL) 206 | { 207 | } 208 | 209 | GenomeIterator::GenomeIterator(GenomeIteratorImpl* p): /* stores p; the destructor deletes it, so the iterator owns p from here on */ 210 | pimpl(p) 211 | { 212 | } 213 | 214 | GenomeIterator::~GenomeIterator() 215 | { 216 | delete pimpl; 217 | } 218 | 219 | GenomeIterator::GenomeIterator(const GenomeIterator& o): /* deep copy: allocates a fresh impl holding a copy of o's map iterator */ 220 | pimpl(new GenomeIteratorImpl(o.pimpl->it)) /* NOTE(review): dereferences o.pimpl, which is NULL when o was default-constructed */ 221 | { 222 | } 223 | 224 | GenomeIterator& GenomeIterator::operator=(const GenomeIterator& o) 225 | { 226 | if ( pimpl != o.pimpl ) { 227 | delete pimpl; 228 | pimpl = new GenomeIteratorImpl(o.pimpl->it); /* NOTE(review): same NULL dereference as the copy constructor when o.pimpl is NULL */ 229 | } 230 | return *this; 231 | } 232 | 233 | void GenomeIterator::next() 234 | { 235 | ++pimpl->it; 236 | } 237 | 238 | RsidSNP GenomeIterator::value() const 239 | { 240 | return *pimpl->it; 241 | } 242 | 243 | bool GenomeIterator::operator==(const GenomeIterator& o) const 244 | { 245 | return pimpl->it == o.pimpl->it; 246 | } 247 | 248 | bool GenomeIterator::operator!=(const GenomeIterator& o) const 249 | { 250 | return pimpl->it != o.pimpl->it; 251 | } 252 | 253 | struct Genome::GenomeImpl { 254 | SNPMap snps; 255 | 256 | GenomeImpl(const std::size_t size) : 257 | snps(size) 258 | { 259 | snps.set_empty_key(0); /* key 0 is handed to the map as its empty-key marker; presumably RSID 0 can then not be stored -- verify against the dense_hash_map docs */ 260 | } 261 | 262 | GenomeImpl(const GenomeImpl& g) : 263 | snps(g.snps) 264 | { 265 | snps.set_empty_key(0); /* NOTE(review): snps was just copy-constructed from g.snps, whose empty key is already set; confirm dense_hash_map allows a second set_empty_key call */ 266 | } 267 | 268 | GenomeImpl& operator=(const GenomeImpl& g) 269 | { 270 | if ( this != &g ) 271 | snps = g.snps; 272 | 273 | return *this; 274 | } 275 | 276 | bool contains(const RSID& rsid) const { 277 | return snps.find(rsid) != snps.end(); 278 | } 279 | 280 | const SNP& operator[](const RSID& rsid) const { /* absent keys yield NONE_SNP instead of inserting into the map */ 281 | return !contains(rsid)? 
NONE_SNP : const_cast(snps)[rsid]; 282 | } 283 | 284 | std::string genotype(const RSID& id) const { 285 | const SNP& snp = operator[](id); 286 | return snp.genotype.to_string(); 287 | } 288 | }; 289 | 290 | Genome::Genome(): 291 | y_chromosome(false), 292 | pimpl(new GenomeImpl(0)) 293 | { 294 | } 295 | 296 | Genome::Genome(const std::size_t size): 297 | y_chromosome(false), 298 | pimpl(new GenomeImpl(size)) 299 | { 300 | } 301 | 302 | Genome::Genome(const Genome& g) : 303 | y_chromosome(g.y_chromosome), 304 | pimpl(new GenomeImpl(*g.pimpl)) 305 | { 306 | } 307 | 308 | Genome& Genome::operator=(const Genome& g) 309 | { 310 | if ( this != &g ) { 311 | *pimpl = *g.pimpl; /* assigns into the existing impl object; pimpl itself is not reallocated */ 312 | y_chromosome = g.y_chromosome; 313 | } 314 | return *this; 315 | } 316 | 317 | Genome::~Genome() 318 | { 319 | delete pimpl; 320 | } 321 | 322 | const SNP& Genome::operator[](const RSID& rsid) const /* forwards to GenomeImpl::operator[], which returns NONE_SNP for an rsid that is not stored */ 323 | { 324 | return (*pimpl)[rsid]; 325 | } 326 | 327 | bool Genome::has(const RSID& rsid) const 328 | { 329 | return pimpl->contains(rsid); 330 | } 331 | 332 | std::size_t Genome::size() const 333 | { 334 | return pimpl->snps.size(); 335 | } 336 | 337 | double Genome::load_factor() const 338 | { 339 | return pimpl->snps.load_factor(); 340 | } 341 | 342 | void Genome::insert(const RsidSNP& obj) 343 | { 344 | pimpl->snps.insert(obj); 345 | } 346 | 347 | bool Genome::operator==(const Genome& o) const 348 | { 349 | // cheap tests first: the Y-chromosome flag and the element count, before comparing the maps 350 | if ( !(y_chromosome == o.y_chromosome && size() == o.size() ) ) 351 | return false; 352 | else 353 | return o.pimpl->snps == pimpl->snps; 354 | } 355 | 356 | bool Genome::operator!=(const Genome& o) const 357 | { 358 | return !(*this == o); 359 | } 360 | 361 | GenomeIterator Genome::begin() const 362 | { 363 | auto i = const_cast(pimpl->snps).begin(); 364 | auto p = new GenomeIteratorImpl(i); /* heap-allocated impl is handed to the GenomeIterator constructed below, whose destructor deletes it */ 365 | return GenomeIterator(p); 366 | } 367 | 368 | GenomeIterator Genome::end() const 369 | { 370 | auto i = const_cast(pimpl->snps).end(); 371 | auto p = new 
GenomeIteratorImpl(i); 372 | return GenomeIterator(p); 373 | } 374 | 375 | } // namespace arv 376 | -------------------------------------------------------------------------------- /cpp/arv.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * dna-traits 3 | * Copyright 2014, 2016 Christian Stigen Larsen 4 | * Distributed under the GNU GPL v3 or later. See COPYING. 5 | * 6 | * arv 7 | * Copyright 2017 Christian Stigen Larsen 8 | * Distributed under the GNU GPL v3 or later. See COPYING. 9 | */ 10 | 11 | #ifndef ARV_ARV_HPP 12 | #define ARV_ARV_HPP 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | namespace arv { 20 | 21 | typedef std::uint32_t Position; 22 | typedef std::int32_t RSID; 23 | 24 | enum Nucleotide { /* declaration order is the order Genotype::operator< compares by; D and I presumably denote deletion/insertion -- TODO confirm */ 25 | NONE, A, G, C, T, D, I 26 | }; 27 | 28 | enum Chromosome { 29 | CHR_NO = 0, 30 | CHR_01 = 1, 31 | CHR_02 = 2, 32 | CHR_03 = 3, 33 | CHR_04 = 4, 34 | CHR_05 = 5, 35 | CHR_06 = 6, 36 | CHR_07 = 7, 37 | CHR_08 = 8, 38 | CHR_09 = 9, 39 | CHR_10 = 10, 40 | CHR_11 = 11, 41 | CHR_12 = 12, 42 | CHR_13 = 13, 43 | CHR_14 = 14, 44 | CHR_15 = 15, 45 | CHR_16 = 16, 46 | CHR_17 = 17, 47 | CHR_18 = 18, 48 | CHR_19 = 19, 49 | CHR_20 = 20, 50 | CHR_21 = 21, 51 | CHR_22 = 22, 52 | CHR_X = 23, 53 | CHR_Y = 24, 54 | CHR_MT = 25 // Mitochondrial DNA 55 | }; 56 | 57 | // We can get this down to a byte if we want to 58 | #pragma pack(1) 59 | struct Genotype { 60 | Nucleotide first : 3; /* 3-bit field; Nucleotide has seven enumerators (0..6) */ 61 | Nucleotide second : 3; 62 | 63 | Genotype(); 64 | Genotype(const Nucleotide& a, const Nucleotide& b); 65 | 66 | friend Genotype operator~(const Genotype&); 67 | bool operator==(const Genotype& g) const; 68 | bool operator<(const Genotype& g) const; 69 | 70 | std::string to_string() const; 71 | }; 72 | 73 | #pragma pack(1) 74 | struct SNP { 75 | Chromosome chromosome : 5; /* 5-bit field; Chromosome values run from 0 to 25 */ 76 | Position position; 77 | Genotype genotype; 78 | 79 | SNP(); 80 | SNP(const Chromosome&, const Position&, const Genotype&); 81 | SNP(const SNP&); 82 | 
SNP& operator=(const SNP&); 83 | 84 | // Comparisons are based on the tuple (position, chromosome, genotype) 85 | bool operator!=(const SNP&) const; 86 | bool operator<(const SNP&) const; 87 | bool operator<=(const SNP&) const; 88 | bool operator==(const Genotype&) const; /* exception to the tuple rule: compares the genotype only */ 89 | bool operator==(const SNP&) const; 90 | bool operator>(const SNP&) const; 91 | bool operator>=(const SNP&) const; 92 | }; 93 | 94 | extern const SNP NONE_SNP; /* sentinel (CHR_NO, position 0, NONE/NONE genotype) returned by Genome::operator[] for an RSID that is not stored */ 95 | 96 | struct GenomeIteratorImpl; 97 | 98 | typedef std::pair RsidSNP; 99 | 100 | struct GenomeIterator { 101 | GenomeIterator(); 102 | GenomeIterator(const GenomeIterator&); 103 | GenomeIterator(GenomeIteratorImpl*); 104 | GenomeIterator& operator=(const GenomeIterator&); 105 | ~GenomeIterator(); 106 | 107 | bool operator==(const GenomeIterator&) const; 108 | bool operator!=(const GenomeIterator&) const; 109 | 110 | void next(); 111 | RsidSNP value() const; 112 | 113 | private: 114 | GenomeIteratorImpl* pimpl; 115 | }; 116 | 117 | struct Genome { 118 | /*! 119 | * True if genome contains a Y-chromosome (with non-empty genotypes). 120 | */ 121 | bool y_chromosome; 122 | 123 | Genome(); 124 | Genome(const std::size_t size); 125 | Genome(const Genome&); 126 | Genome& operator=(const Genome&); 127 | ~Genome(); 128 | 129 | /*! 130 | * Access SNP. Returns NONE_SNP if the RSID is not found. 131 | */ 132 | const SNP& operator[](const RSID& id) const; 133 | 134 | /*! 135 | * Checks if hash table contains given RSID. 136 | */ 137 | bool has(const RSID& id) const; 138 | 139 | /*! 140 | * Add a SNP to the hash table. 141 | */ 142 | void insert(const RsidSNP&); 143 | 144 | /*! 145 | * Underlying hash table's load factor. (For developer purposes) 146 | */ 147 | double load_factor() const; 148 | 149 | /*! 150 | * Number of SNPs. 
151 | */ 152 | std::size_t size() const; 153 | 154 | bool operator==(const Genome&) const; 155 | bool operator!=(const Genome&) const; 156 | 157 | GenomeIterator begin() const; 158 | GenomeIterator end() const; 159 | 160 | private: 161 | struct GenomeImpl; 162 | GenomeImpl* pimpl; 163 | }; 164 | 165 | Nucleotide complement(const Nucleotide& n); /* A<->T and C<->G; D, I and NONE map to themselves */ 166 | 167 | /*! 168 | * Parse a 23andMe genome text file and put contents into genome. 169 | */ 170 | void parse_file(const std::string& filename, Genome&); 171 | 172 | Genotype complement(const Genotype& g); /* complements both nucleotides; same result as operator~ */ 173 | 174 | } // namespace arv 175 | 176 | #endif // include guard 177 | -------------------------------------------------------------------------------- /cpp/export.hpp: -------------------------------------------------------------------------------- 1 | #ifndef ARV_EXPORT_HPP 2 | #define ARV_EXPORT_HPP 3 | 4 | // Taken from https://gcc.gnu.org/wiki/Visibility 5 | #if defined _WIN32 || defined __CYGWIN__ 6 | #ifdef BUILDING_DLL 7 | #ifdef __GNUC__ 8 | #define DLL_PUBLIC __attribute__ ((dllexport)) 9 | #else 10 | #define DLL_PUBLIC __declspec(dllexport) // Note: gcc also seems to support this syntax. 11 | #endif 12 | #else 13 | #ifdef __GNUC__ 14 | #define DLL_PUBLIC __attribute__ ((dllimport)) 15 | #else 16 | #define DLL_PUBLIC __declspec(dllimport) // Note: gcc also seems to support this syntax. 17 | #endif 18 | #endif 19 | #define DLL_LOCAL 20 | #else 21 | #if __GNUC__ >= 4 22 | #define DLL_PUBLIC __attribute__ ((visibility ("default"))) 23 | #define DLL_LOCAL __attribute__ ((visibility ("hidden"))) 24 | #else 25 | #define DLL_PUBLIC 26 | #define DLL_LOCAL 27 | #endif 28 | #endif 29 | 30 | #endif // ARV_EXPORT_HPP 31 | -------------------------------------------------------------------------------- /cpp/file.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * dna-traits 3 | * Copyright 2014, 2016 Christian Stigen Larsen 4 | * Distributed under the GNU GPL v3 or later. 
See COPYING. 5 | * 6 | * arv 7 | * Copyright 2017 Christian Stigen Larsen 8 | * Distributed under the GNU GPL v3 or later. See COPYING. 9 | */ 10 | 11 | #include "file.hpp" 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | namespace arv { 19 | 20 | File::File(const char* filename, const int flags): /* opens filename with the given open() flags; throws std::runtime_error when open() returns a negative descriptor */ 21 | fd(open(filename, flags, S_IRUSR)) 22 | { 23 | if ( fd < 0 ) { 24 | std::string msg = "Could not open "; 25 | throw std::runtime_error((msg + filename).c_str()); 26 | } 27 | } 28 | 29 | File::~File() { 30 | close(fd); /* the return value of close() is ignored */ 31 | } 32 | 33 | } // ns arv 34 | -------------------------------------------------------------------------------- /cpp/file.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * dna-traits 3 | * Copyright 2014, 2016 Christian Stigen Larsen 4 | * Distributed under the GPL v3 or later. See COPYING. 5 | * 6 | * arv 7 | * Copyright 2017 Christian Stigen Larsen 8 | * Distributed under the GNU GPL v3 or later. See COPYING. 9 | */ 10 | 11 | #ifndef ARV_FILE_HPP 12 | #define ARV_FILE_HPP 13 | 14 | namespace arv { 15 | 16 | class File { /* wraps a file descriptor and closes it in the destructor. NOTE(review): no copy constructor or assignment is declared, so a copied File would close the same descriptor twice */ 17 | int fd; 18 | public: 19 | File(const char* filename, const int flags); 20 | ~File(); 21 | 22 | inline operator int() const { /* implicit conversion to the raw descriptor */ 23 | return fd; 24 | } 25 | }; 26 | 27 | } // ns arv 28 | 29 | #endif // guard 30 | -------------------------------------------------------------------------------- /cpp/filesize.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2014, 2016 Christian Stigen Larsen 3 | * Distributed under the GPL v3 or later. See COPYING. 
4 | */ 5 | 6 | #include "filesize.hpp" 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | namespace arv { 13 | 14 | std::size_t filesize(const int file_descriptor) /* size reported by fstat() for the descriptor; throws std::runtime_error when fstat() fails; a negative st_size is returned as 0 */ 15 | { 16 | struct stat st; 17 | memset(&st, 0, sizeof(struct stat)); 18 | 19 | if ( fstat(file_descriptor, &st) < 0 ) 20 | throw std::runtime_error("Could not stat file"); 21 | 22 | const off_t size = st.st_size; 23 | return size < 0 ? 0 : static_cast(size); 24 | } 25 | 26 | } // namespace arv 27 | -------------------------------------------------------------------------------- /cpp/filesize.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * dna-traits 3 | * Copyright 2014, 2016 Christian Stigen Larsen 4 | * Distributed under the GNU GPL v3 or later. See COPYING. 5 | * 6 | * arv 7 | * Copyright 2017 Christian Stigen Larsen 8 | * Distributed under the GNU GPL v3 or later. See COPYING. 9 | */ 10 | 11 | #ifndef ARV_FILESIZE_HPP 12 | #define ARV_FILESIZE_HPP 13 | 14 | #include 15 | 16 | namespace arv { 17 | 18 | std::size_t filesize(const int file_descriptor); 19 | 20 | } // namespace arv 21 | 22 | #endif // guard 23 | -------------------------------------------------------------------------------- /cpp/google/dense_hash_map: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2012, Google Inc. 2 | // All rights reserved. 3 | // 4 | // Redistribution and use in source and binary forms, with or without 5 | // modification, are permitted provided that the following conditions are 6 | // met: 7 | // 8 | // * Redistributions of source code must retain the above copyright 9 | // notice, this list of conditions and the following disclaimer. 10 | // * Redistributions in binary form must reproduce the above 11 | // copyright notice, this list of conditions and the following disclaimer 12 | // in the documentation and/or other materials provided with the 13 | // distribution. 14 | // * Neither the name of Google Inc. 
nor the names of its 15 | // contributors may be used to endorse or promote products derived from 16 | // this software without specific prior written permission. 17 | // 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | // Header files have moved from the google directory to the sparsehash 31 | // directory. This forwarding file is provided only for backwards 32 | // compatibility. Use <sparsehash/dense_hash_map> in all new code. 33 | 34 | #include <sparsehash/dense_hash_map> 35 | -------------------------------------------------------------------------------- /cpp/google/dense_hash_set: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2012, Google Inc. 2 | // All rights reserved. 3 | // 4 | // Redistribution and use in source and binary forms, with or without 5 | // modification, are permitted provided that the following conditions are 6 | // met: 7 | // 8 | // * Redistributions of source code must retain the above copyright 9 | // notice, this list of conditions and the following disclaimer. 
10 | // * Redistributions in binary form must reproduce the above 11 | // copyright notice, this list of conditions and the following disclaimer 12 | // in the documentation and/or other materials provided with the 13 | // distribution. 14 | // * Neither the name of Google Inc. nor the names of its 15 | // contributors may be used to endorse or promote products derived from 16 | // this software without specific prior written permission. 17 | // 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | // Header files have moved from the google directory to the sparsehash 31 | // directory. This forwarding file is provided only for backwards 32 | // compatibility. Use <sparsehash/dense_hash_set> in all new code. 33 | 34 | #include <sparsehash/dense_hash_set> 35 | -------------------------------------------------------------------------------- /cpp/google/sparse_hash_map: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2012, Google Inc. 2 | // All rights reserved. 
3 | // 4 | // Redistribution and use in source and binary forms, with or without 5 | // modification, are permitted provided that the following conditions are 6 | // met: 7 | // 8 | // * Redistributions of source code must retain the above copyright 9 | // notice, this list of conditions and the following disclaimer. 10 | // * Redistributions in binary form must reproduce the above 11 | // copyright notice, this list of conditions and the following disclaimer 12 | // in the documentation and/or other materials provided with the 13 | // distribution. 14 | // * Neither the name of Google Inc. nor the names of its 15 | // contributors may be used to endorse or promote products derived from 16 | // this software without specific prior written permission. 17 | // 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | // Header files have moved from the google directory to the sparsehash 31 | // directory. This forwarding file is provided only for backwards 32 | // compatibility. Use <sparsehash/sparse_hash_map> in all new code. 
33 | 34 | #include <sparsehash/sparse_hash_map> 35 | -------------------------------------------------------------------------------- /cpp/google/sparse_hash_set: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2012, Google Inc. 2 | // All rights reserved. 3 | // 4 | // Redistribution and use in source and binary forms, with or without 5 | // modification, are permitted provided that the following conditions are 6 | // met: 7 | // 8 | // * Redistributions of source code must retain the above copyright 9 | // notice, this list of conditions and the following disclaimer. 10 | // * Redistributions in binary form must reproduce the above 11 | // copyright notice, this list of conditions and the following disclaimer 12 | // in the documentation and/or other materials provided with the 13 | // distribution. 14 | // * Neither the name of Google Inc. nor the names of its 15 | // contributors may be used to endorse or promote products derived from 16 | // this software without specific prior written permission. 17 | // 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | // Header files have moved from the google directory to the sparsehash 31 | // directory. 
This forwarding file is provided only for backwards 32 | // compatibility. Use <sparsehash/sparse_hash_set> in all new code. 33 | 34 | #include <sparsehash/sparse_hash_set> 35 | -------------------------------------------------------------------------------- /cpp/google/sparsehash/densehashtable.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2012, Google Inc. 2 | // All rights reserved. 3 | // 4 | // Redistribution and use in source and binary forms, with or without 5 | // modification, are permitted provided that the following conditions are 6 | // met: 7 | // 8 | // * Redistributions of source code must retain the above copyright 9 | // notice, this list of conditions and the following disclaimer. 10 | // * Redistributions in binary form must reproduce the above 11 | // copyright notice, this list of conditions and the following disclaimer 12 | // in the documentation and/or other materials provided with the 13 | // distribution. 14 | // * Neither the name of Google Inc. nor the names of its 15 | // contributors may be used to endorse or promote products derived from 16 | // this software without specific prior written permission. 17 | // 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | // Header files have moved from the google directory to the sparsehash 31 | // directory. This forwarding file is provided only for backwards 32 | // compatibility. Use <sparsehash/internal/densehashtable.h> in all new code. 33 | 34 | #include <sparsehash/internal/densehashtable.h> 35 | -------------------------------------------------------------------------------- /cpp/google/sparsehash/hashtable-common.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2012, Google Inc. 2 | // All rights reserved. 3 | // 4 | // Redistribution and use in source and binary forms, with or without 5 | // modification, are permitted provided that the following conditions are 6 | // met: 7 | // 8 | // * Redistributions of source code must retain the above copyright 9 | // notice, this list of conditions and the following disclaimer. 10 | // * Redistributions in binary form must reproduce the above 11 | // copyright notice, this list of conditions and the following disclaimer 12 | // in the documentation and/or other materials provided with the 13 | // distribution. 14 | // * Neither the name of Google Inc. nor the names of its 15 | // contributors may be used to endorse or promote products derived from 16 | // this software without specific prior written permission. 
17 | // 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | // Header files have moved from the google directory to the sparsehash 31 | // directory. This forwarding file is provided only for backwards 32 | // compatibility. Use <sparsehash/internal/hashtable-common.h> in all new code. 33 | 34 | #include <sparsehash/internal/hashtable-common.h> 35 | -------------------------------------------------------------------------------- /cpp/google/sparsehash/libc_allocator_with_realloc.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2012, Google Inc. 2 | // All rights reserved. 3 | // 4 | // Redistribution and use in source and binary forms, with or without 5 | // modification, are permitted provided that the following conditions are 6 | // met: 7 | // 8 | // * Redistributions of source code must retain the above copyright 9 | // notice, this list of conditions and the following disclaimer. 10 | // * Redistributions in binary form must reproduce the above 11 | // copyright notice, this list of conditions and the following disclaimer 12 | // in the documentation and/or other materials provided with the 13 | // distribution. 14 | // * Neither the name of Google Inc. 
nor the names of its 15 | // contributors may be used to endorse or promote products derived from 16 | // this software without specific prior written permission. 17 | // 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | // Header files have moved from the google directory to the sparsehash 31 | // directory. This forwarding file is provided only for backwards 32 | // compatibility. Use <sparsehash/internal/libc_allocator_with_realloc.h> in all new code. 33 | 34 | #include <sparsehash/internal/libc_allocator_with_realloc.h> 35 | -------------------------------------------------------------------------------- /cpp/google/sparsehash/sparsehashtable.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2012, Google Inc. 2 | // All rights reserved. 3 | // 4 | // Redistribution and use in source and binary forms, with or without 5 | // modification, are permitted provided that the following conditions are 6 | // met: 7 | // 8 | // * Redistributions of source code must retain the above copyright 9 | // notice, this list of conditions and the following disclaimer. 
10 | // * Redistributions in binary form must reproduce the above 11 | // copyright notice, this list of conditions and the following disclaimer 12 | // in the documentation and/or other materials provided with the 13 | // distribution. 14 | // * Neither the name of Google Inc. nor the names of its 15 | // contributors may be used to endorse or promote products derived from 16 | // this software without specific prior written permission. 17 | // 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | // Header files have moved from the google directory to the sparsehash 31 | // directory. This forwarding file is provided only for backwards 32 | // compatibility. Use <sparsehash/internal/sparsehashtable.h> in all new code. 33 | 34 | #include <sparsehash/internal/sparsehashtable.h> 35 | -------------------------------------------------------------------------------- /cpp/google/sparsetable: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2012, Google Inc. 2 | // All rights reserved. 
3 | // 4 | // Redistribution and use in source and binary forms, with or without 5 | // modification, are permitted provided that the following conditions are 6 | // met: 7 | // 8 | // * Redistributions of source code must retain the above copyright 9 | // notice, this list of conditions and the following disclaimer. 10 | // * Redistributions in binary form must reproduce the above 11 | // copyright notice, this list of conditions and the following disclaimer 12 | // in the documentation and/or other materials provided with the 13 | // distribution. 14 | // * Neither the name of Google Inc. nor the names of its 15 | // contributors may be used to endorse or promote products derived from 16 | // this software without specific prior written permission. 17 | // 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | // Header files have moved from the google directory to the sparsehash 31 | // directory. This forwarding file is provided only for backwards 32 | // compatibility. Use <sparsehash/sparsetable> in all new code. 
33 | 34 | #include <sparsehash/sparsetable> 35 | -------------------------------------------------------------------------------- /cpp/google/template_util.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2012, Google Inc. 2 | // All rights reserved. 3 | // 4 | // Redistribution and use in source and binary forms, with or without 5 | // modification, are permitted provided that the following conditions are 6 | // met: 7 | // 8 | // * Redistributions of source code must retain the above copyright 9 | // notice, this list of conditions and the following disclaimer. 10 | // * Redistributions in binary form must reproduce the above 11 | // copyright notice, this list of conditions and the following disclaimer 12 | // in the documentation and/or other materials provided with the 13 | // distribution. 14 | // * Neither the name of Google Inc. nor the names of its 15 | // contributors may be used to endorse or promote products derived from 16 | // this software without specific prior written permission. 17 | // 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | // Header files have moved from the google directory to the sparsehash 31 | // directory. 
This forwarding file is provided only for backwards 32 | // compatibility. Use in all new code. 33 | 34 | #include 35 | -------------------------------------------------------------------------------- /cpp/google/type_traits.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2012, Google Inc. 2 | // All rights reserved. 3 | // 4 | // Redistribution and use in source and binary forms, with or without 5 | // modification, are permitted provided that the following conditions are 6 | // met: 7 | // 8 | // * Redistributions of source code must retain the above copyright 9 | // notice, this list of conditions and the following disclaimer. 10 | // * Redistributions in binary form must reproduce the above 11 | // copyright notice, this list of conditions and the following disclaimer 12 | // in the documentation and/or other materials provided with the 13 | // distribution. 14 | // * Neither the name of Google Inc. nor the names of its 15 | // contributors may be used to endorse or promote products derived from 16 | // this software without specific prior written permission. 17 | // 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | 30 | // Header files have moved from the google directory to the sparsehash 31 | // directory. This forwarding file is provided only for backwards 32 | // compatibility. Use in all new code. 33 | 34 | #include 35 | -------------------------------------------------------------------------------- /cpp/mmap.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * dna-traits 3 | * Copyright 2014, 2016 Christian Stigen Larsen 4 | * Distributed under the GNU GPL v3 or later. See COPYING. 5 | * 6 | * arv 7 | * Copyright 2017 Christian Stigen Larsen 8 | * Distributed under the GNU GPL v3 or later. See COPYING. 9 | */ 10 | 11 | #include "mmap.hpp" 12 | 13 | #include 14 | 15 | namespace arv { 16 | 17 | MMap::MMap(void *address, 18 | const std::size_t length, 19 | const int protection_level, 20 | const int flags, 21 | const int file_descriptor, 22 | const off_t offset) 23 | : l(length), 24 | p(mmap(address, length, protection_level, flags, file_descriptor, offset)) 25 | { 26 | if ( p == reinterpret_cast(-1) ) 27 | throw std::runtime_error("mmap error"); 28 | } 29 | 30 | MMap::~MMap() { 31 | munmap(p, l); 32 | } 33 | 34 | } // ns arv 35 | -------------------------------------------------------------------------------- /cpp/mmap.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * dna-traits 3 | * Copyright 2014, 2016 Christian Stigen Larsen 4 | * Distributed under the GNU GPL v3 or later. See COPYING. 5 | * 6 | * arv 7 | * Copyright 2017 Christian Stigen Larsen 8 | * Distributed under the GNU GPL v3 or later. See COPYING. 
9 | */ 10 | 11 | #ifndef ARV_MMAP_HPP 12 | #define ARV_MMAP_HPP 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | namespace arv { 21 | 22 | class MMap { 23 | std::size_t l; 24 | void *p; 25 | public: 26 | MMap(void *address, 27 | const std::size_t length, 28 | const int protection_level, 29 | const int flags, 30 | const int file_descriptor, 31 | const off_t offset); 32 | ~MMap(); 33 | 34 | inline void* ptr() const { 35 | return p; 36 | } 37 | 38 | inline const char* c_str() const { 39 | return static_cast(p); 40 | } 41 | }; 42 | 43 | } // ns arv 44 | 45 | #endif // guard 46 | -------------------------------------------------------------------------------- /cpp/parse.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * dna-traits 3 | * Copyright 2014, 2016 Christian Stigen Larsen 4 | * Distributed under the GNU GPL v3 or later. See COPYING. 5 | * 6 | * arv 7 | * Copyright 2017 Christian Stigen Larsen 8 | * Distributed under the GNU GPL V3 or later. See COPYING. 
9 | */ 10 | 11 | #include "arv.hpp" 12 | #include "file.hpp" 13 | #include "filesize.hpp" 14 | #include "mmap.hpp" 15 | 16 | #ifdef __GNUC__ 17 | #define likely(x) __builtin_expect((x),1) 18 | #define unlikely(x) __builtin_expect((x),0) 19 | #else 20 | #define likely(x) (x) 21 | #define unlikely(x) (x) 22 | #endif 23 | 24 | namespace arv { 25 | 26 | static Nucleotide CharToNucleotide[256] = {NONE}; 27 | 28 | static void skip_comments(const char*& s) 29 | { 30 | while ( *s == '#' ) 31 | while ( *s++ != '\n' ) 32 | ; // loop 33 | } 34 | 35 | static bool iswhite(const char c) 36 | { 37 | return c=='\t' || c=='\n' || c=='\r'; 38 | } 39 | 40 | static const char*& skipwhite(const char*& s) 41 | { 42 | while ( iswhite(*s) ) ++s; 43 | return s; 44 | } 45 | 46 | static uint32_t parse_uint32(const char*& s) 47 | { 48 | uint32_t n = 0; 49 | 50 | while ( isdigit(*s) ) 51 | n = n*10 - '0' + *s++; 52 | 53 | return n; 54 | } 55 | 56 | static int32_t parse_int32(const char*& s) 57 | { 58 | int32_t n = 0; 59 | 60 | while ( isdigit(*s) ) 61 | n = n*10 - '0' + *s++; 62 | 63 | return n; 64 | } 65 | 66 | static Nucleotide parse_nucleotide(const char*& s) 67 | { 68 | return CharToNucleotide[static_cast(*s++)]; 69 | } 70 | 71 | static Chromosome parse_chromo(const char*& s) 72 | { 73 | if ( likely(isdigit(*s)) ) 74 | return static_cast(parse_uint32(s)); 75 | 76 | const char c = *s++; 77 | 78 | if ( c == 'X' ) 79 | return CHR_X; 80 | 81 | if ( c == 'M' ) { 82 | ++s; // skip T in "MT" 83 | return CHR_MT; 84 | } 85 | 86 | if ( c == 'Y' ) 87 | return CHR_Y; 88 | 89 | return CHR_NO; 90 | } 91 | 92 | static Genotype parse_genotype(const char*& s) 93 | { 94 | const Nucleotide first = parse_nucleotide(s); 95 | const Nucleotide second = parse_nucleotide(s); 96 | return Genotype(first, second); 97 | } 98 | 99 | static void skipline(const char*& s) 100 | { 101 | while ( *s != '\n' ) ++s; 102 | } 103 | 104 | /** 105 | * Reads a 23andMe-formatted genome file. 
It currently uses reference human 106 | * assembly build 37 (annotation release 104). 107 | */ 108 | void parse_file(const std::string& name, Genome& genome) 109 | { 110 | using namespace arv; 111 | 112 | CharToNucleotide[static_cast('-')] = NONE; 113 | CharToNucleotide[static_cast('A')] = A; 114 | CharToNucleotide[static_cast('C')] = C; 115 | CharToNucleotide[static_cast('D')] = D; 116 | CharToNucleotide[static_cast('G')] = G; 117 | CharToNucleotide[static_cast('I')] = I; 118 | CharToNucleotide[static_cast('T')] = T; 119 | 120 | File fd(name.c_str(), O_RDONLY); 121 | MMap fmap(0, filesize(fd), PROT_READ, MAP_PRIVATE, fd, 0); 122 | auto s = fmap.c_str(); 123 | 124 | skip_comments(s); 125 | 126 | // Local cache of SNPs and RSIDs, for more locality and hence more speed. Its 127 | // size is somewhat arbitrary, but shouldn't be too big. 128 | const std::size_t BUFFER_SIZE = 200; 129 | RsidSNP buffer[BUFFER_SIZE]; 130 | size_t buffer_pos = 0; 131 | 132 | bool internal = false; // rsid or internal id 133 | 134 | for ( ; *s; ++s ) { 135 | if ( *s == 'r' ) 136 | internal = false; 137 | else if ( *s == 'i' ) 138 | internal = true; 139 | else { 140 | skipline(s); 141 | continue; 142 | } 143 | 144 | RSID& rsid = buffer[buffer_pos].first; 145 | SNP& snp = buffer[buffer_pos].second; 146 | 147 | // Skip i/rs prefix and parse number 148 | if ( !internal ) 149 | rsid = parse_int32(s += 2); 150 | else 151 | rsid = -parse_int32(s += 1); 152 | 153 | snp.chromosome = parse_chromo(skipwhite(s)); 154 | snp.position = parse_uint32(skipwhite(s)); 155 | snp.genotype = parse_genotype(skipwhite(s)); 156 | 157 | genome.y_chromosome |= (snp.chromosome == CHR_Y && snp.genotype.first != 158 | NONE); 159 | 160 | // Ordinarly, we would just call `genome.insert(rsid, snp)` here, but it's 161 | // a tad faster to stage them in an array first, and then flush it to the 162 | // hash map when it's full. 
163 | 164 | if ( ++buffer_pos == BUFFER_SIZE ) { 165 | buffer_pos = 0; 166 | for ( size_t n = 0; n < BUFFER_SIZE; ++n ) 167 | genome.insert(buffer[n]); 168 | } 169 | } 170 | 171 | // Store the rest of the buffer 172 | for ( size_t n = 0; n < buffer_pos; ++n ) 173 | genome.insert(buffer[n]); 174 | } 175 | 176 | } // namespace arv 177 | -------------------------------------------------------------------------------- /cpp/public_py_init_sym.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * arv 3 | * Copyright 2017 Christian Stigen Larsen 4 | * Distributed under the GNU GPL v3 or later. See COPYING. 5 | */ 6 | 7 | #ifndef ARV_CYTHON_HPP 8 | #define ARV_CYTHON_HPP 9 | 10 | #include 11 | #include "export.hpp" 12 | 13 | // When compiling with hidden symbols (-fvisibility=hidden), we still need to 14 | // make the init function's symbol global (i.e. public). 15 | 16 | #if PY_MAJOR_VERSION < 3 17 | PyMODINIT_FUNC init_arv(void) DLL_PUBLIC; 18 | #else 19 | PyMODINIT_FUNC PyInit__arv(void) DLL_PUBLIC; 20 | #endif 21 | 22 | #endif // guard 23 | -------------------------------------------------------------------------------- /cpp/sparsehash/dense_hash_set: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2005, Google Inc. 2 | // All rights reserved. 3 | // 4 | // Redistribution and use in source and binary forms, with or without 5 | // modification, are permitted provided that the following conditions are 6 | // met: 7 | // 8 | // * Redistributions of source code must retain the above copyright 9 | // notice, this list of conditions and the following disclaimer. 10 | // * Redistributions in binary form must reproduce the above 11 | // copyright notice, this list of conditions and the following disclaimer 12 | // in the documentation and/or other materials provided with the 13 | // distribution. 14 | // * Neither the name of Google Inc. 
nor the names of its 15 | // contributors may be used to endorse or promote products derived from 16 | // this software without specific prior written permission. 17 | // 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | // --- 31 | // 32 | // This is just a very thin wrapper over densehashtable.h, just 33 | // like sgi stl's stl_hash_set is a very thin wrapper over 34 | // stl_hashtable. The major thing we define is operator[], because 35 | // we have a concept of a data_type which stl_hashtable doesn't 36 | // (it only has a key and a value). 37 | // 38 | // This is more different from dense_hash_map than you might think, 39 | // because all iterators for sets are const (you obviously can't 40 | // change the key, and for sets there is no value). 41 | // 42 | // NOTE: this is exactly like sparse_hash_set.h, with the word 43 | // "sparse" replaced by "dense", except for the addition of 44 | // set_empty_key(). 45 | // 46 | // YOU MUST CALL SET_EMPTY_KEY() IMMEDIATELY AFTER CONSTRUCTION. 47 | // 48 | // Otherwise your program will die in mysterious ways. 
(Note if you 49 | // use the constructor that takes an InputIterator range, you pass in 50 | // the empty key in the constructor, rather than after. As a result, 51 | // this constructor differs from the standard STL version.) 52 | // 53 | // In other respects, we adhere mostly to the STL semantics for 54 | // hash-map. One important exception is that insert() may invalidate 55 | // iterators entirely -- STL semantics are that insert() may reorder 56 | // iterators, but they all still refer to something valid in the 57 | // hashtable. Not so for us. Likewise, insert() may invalidate 58 | // pointers into the hashtable. (Whether insert invalidates iterators 59 | // and pointers depends on whether it results in a hashtable resize). 60 | // On the plus side, delete() doesn't invalidate iterators or pointers 61 | // at all, or even change the ordering of elements. 62 | // 63 | // Here are a few "power user" tips: 64 | // 65 | // 1) set_deleted_key(): 66 | // If you want to use erase() you must call set_deleted_key(), 67 | // in addition to set_empty_key(), after construction. 68 | // The deleted and empty keys must differ. 69 | // 70 | // 2) resize(0): 71 | // When an item is deleted, its memory isn't freed right 72 | // away. This allows you to iterate over a hashtable, 73 | // and call erase(), without invalidating the iterator. 74 | // To force the memory to be freed, call resize(0). 75 | // For tr1 compatibility, this can also be called as rehash(0). 76 | // 77 | // 3) min_load_factor(0.0) 78 | // Setting the minimum load factor to 0.0 guarantees that 79 | // the hash table will never shrink. 80 | // 81 | // Roughly speaking: 82 | // (1) dense_hash_set: fastest, uses the most memory unless entries are small 83 | // (2) sparse_hash_set: slowest, uses the least memory 84 | // (3) hash_set / unordered_set (STL): in the middle 85 | // 86 | // Typically I use sparse_hash_set when I care about space and/or when 87 | // I need to save the hashtable on disk. 
I use hash_set otherwise. I 88 | // don't personally use dense_hash_set ever; some people use it for 89 | // small sets with lots of lookups. 90 | // 91 | // - dense_hash_set has, typically, about 78% memory overhead (if your 92 | // data takes up X bytes, the hash_set uses .78X more bytes in overhead). 93 | // - sparse_hash_set has about 4 bits overhead per entry. 94 | // - sparse_hash_set can be 3-7 times slower than the others for lookup and, 95 | // especially, inserts. See time_hash_map.cc for details. 96 | // 97 | // See /usr/(local/)?doc/sparsehash-*/dense_hash_set.html 98 | // for information about how to use this class. 99 | 100 | #ifndef _DENSE_HASH_SET_H_ 101 | #define _DENSE_HASH_SET_H_ 102 | 103 | #include 104 | #include // needed by stl_alloc 105 | #include // for equal_to<>, select1st<>, etc 106 | #include // for alloc 107 | #include // for pair<> 108 | #include // IWYU pragma: export 109 | #include 110 | #include HASH_FUN_H // for hash<> 111 | _START_GOOGLE_NAMESPACE_ 112 | 113 | template , // defined in sparseconfig.h 115 | class EqualKey = std::equal_to, 116 | class Alloc = libc_allocator_with_realloc > 117 | class dense_hash_set { 118 | private: 119 | // Apparently identity is not stl-standard, so we define our own 120 | struct Identity { 121 | typedef const Value& result_type; 122 | const Value& operator()(const Value& v) const { return v; } 123 | }; 124 | struct SetKey { 125 | void operator()(Value* value, const Value& new_key) const { 126 | *value = new_key; 127 | } 128 | }; 129 | 130 | // The actual data 131 | typedef dense_hashtable ht; 133 | ht rep; 134 | 135 | public: 136 | typedef typename ht::key_type key_type; 137 | typedef typename ht::value_type value_type; 138 | typedef typename ht::hasher hasher; 139 | typedef typename ht::key_equal key_equal; 140 | typedef Alloc allocator_type; 141 | 142 | typedef typename ht::size_type size_type; 143 | typedef typename ht::difference_type difference_type; 144 | typedef typename ht::const_pointer 
pointer; 145 | typedef typename ht::const_pointer const_pointer; 146 | typedef typename ht::const_reference reference; 147 | typedef typename ht::const_reference const_reference; 148 | 149 | typedef typename ht::const_iterator iterator; 150 | typedef typename ht::const_iterator const_iterator; 151 | typedef typename ht::const_local_iterator local_iterator; 152 | typedef typename ht::const_local_iterator const_local_iterator; 153 | 154 | 155 | // Iterator functions -- recall all iterators are const 156 | iterator begin() const { return rep.begin(); } 157 | iterator end() const { return rep.end(); } 158 | 159 | // These come from tr1's unordered_set. For us, a bucket has 0 or 1 elements. 160 | local_iterator begin(size_type i) const { return rep.begin(i); } 161 | local_iterator end(size_type i) const { return rep.end(i); } 162 | 163 | 164 | // Accessor functions 165 | allocator_type get_allocator() const { return rep.get_allocator(); } 166 | hasher hash_funct() const { return rep.hash_funct(); } 167 | hasher hash_function() const { return hash_funct(); } // tr1 name 168 | key_equal key_eq() const { return rep.key_eq(); } 169 | 170 | 171 | // Constructors 172 | explicit dense_hash_set(size_type expected_max_items_in_table = 0, 173 | const hasher& hf = hasher(), 174 | const key_equal& eql = key_equal(), 175 | const allocator_type& alloc = allocator_type()) 176 | : rep(expected_max_items_in_table, hf, eql, Identity(), SetKey(), alloc) { 177 | } 178 | 179 | template 180 | dense_hash_set(InputIterator f, InputIterator l, 181 | const key_type& empty_key_val, 182 | size_type expected_max_items_in_table = 0, 183 | const hasher& hf = hasher(), 184 | const key_equal& eql = key_equal(), 185 | const allocator_type& alloc = allocator_type()) 186 | : rep(expected_max_items_in_table, hf, eql, Identity(), SetKey(), alloc) { 187 | set_empty_key(empty_key_val); 188 | rep.insert(f, l); 189 | } 190 | // We use the default copy constructor 191 | // We use the default operator=() 192 | // 
We use the default destructor 193 | 194 | void clear() { rep.clear(); } 195 | // This clears the hash set without resizing it down to the minimum 196 | // bucket count, but rather keeps the number of buckets constant 197 | void clear_no_resize() { rep.clear_no_resize(); } 198 | void swap(dense_hash_set& hs) { rep.swap(hs.rep); } 199 | 200 | 201 | // Functions concerning size 202 | size_type size() const { return rep.size(); } 203 | size_type max_size() const { return rep.max_size(); } 204 | bool empty() const { return rep.empty(); } 205 | size_type bucket_count() const { return rep.bucket_count(); } 206 | size_type max_bucket_count() const { return rep.max_bucket_count(); } 207 | 208 | // These are tr1 methods. bucket() is the bucket the key is or would be in. 209 | size_type bucket_size(size_type i) const { return rep.bucket_size(i); } 210 | size_type bucket(const key_type& key) const { return rep.bucket(key); } 211 | float load_factor() const { 212 | return size() * 1.0f / bucket_count(); 213 | } 214 | float max_load_factor() const { 215 | float shrink, grow; 216 | rep.get_resizing_parameters(&shrink, &grow); 217 | return grow; 218 | } 219 | void max_load_factor(float new_grow) { 220 | float shrink, grow; 221 | rep.get_resizing_parameters(&shrink, &grow); 222 | rep.set_resizing_parameters(shrink, new_grow); 223 | } 224 | // These aren't tr1 methods but perhaps ought to be. 225 | float min_load_factor() const { 226 | float shrink, grow; 227 | rep.get_resizing_parameters(&shrink, &grow); 228 | return shrink; 229 | } 230 | void min_load_factor(float new_shrink) { 231 | float shrink, grow; 232 | rep.get_resizing_parameters(&shrink, &grow); 233 | rep.set_resizing_parameters(new_shrink, grow); 234 | } 235 | // Deprecated; use min_load_factor() or max_load_factor() instead. 
236 | void set_resizing_parameters(float shrink, float grow) { 237 | rep.set_resizing_parameters(shrink, grow); 238 | } 239 | 240 | void resize(size_type hint) { rep.resize(hint); } 241 | void rehash(size_type hint) { resize(hint); } // the tr1 name 242 | 243 | // Lookup routines 244 | iterator find(const key_type& key) const { return rep.find(key); } 245 | 246 | size_type count(const key_type& key) const { return rep.count(key); } 247 | 248 | std::pair equal_range(const key_type& key) const { 249 | return rep.equal_range(key); 250 | } 251 | 252 | 253 | // Insertion routines 254 | std::pair insert(const value_type& obj) { 255 | std::pair p = rep.insert(obj); 256 | return std::pair(p.first, p.second); // const to non-const 257 | } 258 | template void insert(InputIterator f, InputIterator l) { 259 | rep.insert(f, l); 260 | } 261 | void insert(const_iterator f, const_iterator l) { 262 | rep.insert(f, l); 263 | } 264 | // Required for std::insert_iterator; the passed-in iterator is ignored. 265 | iterator insert(iterator, const value_type& obj) { 266 | return insert(obj).first; 267 | } 268 | 269 | // Deletion and empty routines 270 | // THESE ARE NON-STANDARD! I make you specify an "impossible" key 271 | // value to identify deleted and empty buckets. You can change the 272 | // deleted key as time goes on, or get rid of it entirely to be insert-only. 
273 | void set_empty_key(const key_type& key) { rep.set_empty_key(key); } 274 | key_type empty_key() const { return rep.empty_key(); } 275 | 276 | void set_deleted_key(const key_type& key) { rep.set_deleted_key(key); } 277 | void clear_deleted_key() { rep.clear_deleted_key(); } 278 | key_type deleted_key() const { return rep.deleted_key(); } 279 | 280 | // These are standard 281 | size_type erase(const key_type& key) { return rep.erase(key); } 282 | void erase(iterator it) { rep.erase(it); } 283 | void erase(iterator f, iterator l) { rep.erase(f, l); } 284 | 285 | 286 | // Comparison 287 | bool operator==(const dense_hash_set& hs) const { return rep == hs.rep; } 288 | bool operator!=(const dense_hash_set& hs) const { return rep != hs.rep; } 289 | 290 | 291 | // I/O -- this is an add-on for writing metainformation to disk 292 | // 293 | // For maximum flexibility, this does not assume a particular 294 | // file type (though it will probably be a FILE *). We just pass 295 | // the fp through to rep. 296 | 297 | // If your keys and values are simple enough, you can pass this 298 | // serializer to serialize()/unserialize(). "Simple enough" means 299 | // value_type is a POD type that contains no pointers. Note, 300 | // however, we don't try to normalize endianness. 301 | typedef typename ht::NopointerSerializer NopointerSerializer; 302 | 303 | // serializer: a class providing operator()(OUTPUT*, const value_type&) 304 | // (writing value_type to OUTPUT). You can specify a 305 | // NopointerSerializer object if appropriate (see above). 306 | // fp: either a FILE*, OR an ostream*/subclass_of_ostream*, OR a 307 | // pointer to a class providing size_t Write(const void*, size_t), 308 | // which writes a buffer into a stream (which fp presumably 309 | // owns) and returns the number of bytes successfully written. 310 | // Note basic_ostream is not currently supported. 
311 | template 312 | bool serialize(ValueSerializer serializer, OUTPUT* fp) { 313 | return rep.serialize(serializer, fp); 314 | } 315 | 316 | // serializer: a functor providing operator()(INPUT*, value_type*) 317 | // (reading from INPUT and into value_type). You can specify a 318 | // NopointerSerializer object if appropriate (see above). 319 | // fp: either a FILE*, OR an istream*/subclass_of_istream*, OR a 320 | // pointer to a class providing size_t Read(void*, size_t), 321 | // which reads into a buffer from a stream (which fp presumably 322 | // owns) and returns the number of bytes successfully read. 323 | // Note basic_istream is not currently supported. 324 | template 325 | bool unserialize(ValueSerializer serializer, INPUT* fp) { 326 | return rep.unserialize(serializer, fp); 327 | } 328 | }; 329 | 330 | template 331 | inline void swap(dense_hash_set& hs1, 332 | dense_hash_set& hs2) { 333 | hs1.swap(hs2); 334 | } 335 | 336 | _END_GOOGLE_NAMESPACE_ 337 | 338 | #endif /* _DENSE_HASH_SET_H_ */ 339 | -------------------------------------------------------------------------------- /cpp/sparsehash/internal/hashtable-common.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2010, Google Inc. 2 | // All rights reserved. 3 | // 4 | // Redistribution and use in source and binary forms, with or without 5 | // modification, are permitted provided that the following conditions are 6 | // met: 7 | // 8 | // * Redistributions of source code must retain the above copyright 9 | // notice, this list of conditions and the following disclaimer. 10 | // * Redistributions in binary form must reproduce the above 11 | // copyright notice, this list of conditions and the following disclaimer 12 | // in the documentation and/or other materials provided with the 13 | // distribution. 14 | // * Neither the name of Google Inc. 
nor the names of its 15 | // contributors may be used to endorse or promote products derived from 16 | // this software without specific prior written permission. 17 | // 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | // --- 31 | // 32 | // Provides classes shared by both sparse and dense hashtable. 33 | // 34 | // sh_hashtable_settings has parameters for growing and shrinking 35 | // a hashtable. It also packages zero-size functor (ie. hasher). 36 | // 37 | // Other functions and classes provide common code for serializing 38 | // and deserializing hashtables to a stream (such as a FILE*). 39 | 40 | #ifndef UTIL_GTL_HASHTABLE_COMMON_H_ 41 | #define UTIL_GTL_HASHTABLE_COMMON_H_ 42 | 43 | #include 44 | #include 45 | #include 46 | #include // for size_t 47 | #include 48 | #include // For length_error 49 | 50 | _START_GOOGLE_NAMESPACE_ 51 | 52 | template struct SparsehashCompileAssert { }; 53 | #define SPARSEHASH_COMPILE_ASSERT(expr, msg) \ 54 | __attribute__((unused)) typedef SparsehashCompileAssert<(bool(expr))> msg[bool(expr) ? 
1 : -1] 55 | 56 | namespace sparsehash_internal { 57 | 58 | // Adaptor methods for reading/writing data from an INPUT or OUPTUT 59 | // variable passed to serialize() or unserialize(). For now we 60 | // have implemented INPUT/OUTPUT for FILE*, istream*/ostream* (note 61 | // they are pointers, unlike typical use), or else a pointer to 62 | // something that supports a Read()/Write() method. 63 | // 64 | // For technical reasons, we implement read_data/write_data in two 65 | // stages. The actual work is done in *_data_internal, which takes 66 | // the stream argument twice: once as a template type, and once with 67 | // normal type information. (We only use the second version.) We do 68 | // this because of how C++ picks what function overload to use. If we 69 | // implemented this the naive way: 70 | // bool read_data(istream* is, const void* data, size_t length); 71 | // template read_data(T* fp, const void* data, size_t length); 72 | // C++ would prefer the second version for every stream type except 73 | // istream. However, we want C++ to prefer the first version for 74 | // streams that are *subclasses* of istream, such as istringstream. 75 | // This is not possible given the way template types are resolved. So 76 | // we split the stream argument in two, one of which is templated and 77 | // one of which is not. The specialized functions (like the istream 78 | // version above) ignore the template arg and use the second, 'type' 79 | // arg, getting subclass matching as normal. The 'catch-all' 80 | // functions (the second version above) use the template arg to deduce 81 | // the type, and use a second, void* arg to achieve the desired 82 | // 'catch-all' semantics. 
83 | 84 | // ----- low-level I/O for FILE* ---- 85 | 86 | template 87 | inline bool read_data_internal(Ignored*, FILE* fp, 88 | void* data, size_t length) { 89 | return fread(data, length, 1, fp) == 1; 90 | } 91 | 92 | template 93 | inline bool write_data_internal(Ignored*, FILE* fp, 94 | const void* data, size_t length) { 95 | return fwrite(data, length, 1, fp) == 1; 96 | } 97 | 98 | // ----- low-level I/O for iostream ---- 99 | 100 | // We want the caller to be responsible for #including , not 101 | // us, because iostream is a big header! According to the standard, 102 | // it's only legal to delay the instantiation the way we want to if 103 | // the istream/ostream is a template type. So we jump through hoops. 104 | template 105 | inline bool read_data_internal_for_istream(ISTREAM* fp, 106 | void* data, size_t length) { 107 | return fp->read(reinterpret_cast(data), length).good(); 108 | } 109 | template 110 | inline bool read_data_internal(Ignored*, std::istream* fp, 111 | void* data, size_t length) { 112 | return read_data_internal_for_istream(fp, data, length); 113 | } 114 | 115 | template 116 | inline bool write_data_internal_for_ostream(OSTREAM* fp, 117 | const void* data, size_t length) { 118 | return fp->write(reinterpret_cast(data), length).good(); 119 | } 120 | template 121 | inline bool write_data_internal(Ignored*, std::ostream* fp, 122 | const void* data, size_t length) { 123 | return write_data_internal_for_ostream(fp, data, length); 124 | } 125 | 126 | // ----- low-level I/O for custom streams ---- 127 | 128 | // The INPUT type needs to support a Read() method that takes a 129 | // buffer and a length and returns the number of bytes read. 
130 | template 131 | inline bool read_data_internal(INPUT* fp, void*, 132 | void* data, size_t length) { 133 | return static_cast(fp->Read(data, length)) == length; 134 | } 135 | 136 | // The OUTPUT type needs to support a Write() operation that takes 137 | // a buffer and a length and returns the number of bytes written. 138 | template 139 | inline bool write_data_internal(OUTPUT* fp, void*, 140 | const void* data, size_t length) { 141 | return static_cast(fp->Write(data, length)) == length; 142 | } 143 | 144 | // ----- low-level I/O: the public API ---- 145 | 146 | template 147 | inline bool read_data(INPUT* fp, void* data, size_t length) { 148 | return read_data_internal(fp, fp, data, length); 149 | } 150 | 151 | template 152 | inline bool write_data(OUTPUT* fp, const void* data, size_t length) { 153 | return write_data_internal(fp, fp, data, length); 154 | } 155 | 156 | // Uses read_data() and write_data() to read/write an integer. 157 | // length is the number of bytes to read/write (which may differ 158 | // from sizeof(IntType), allowing us to save on a 32-bit system 159 | // and load on a 64-bit system). Excess bytes are taken to be 0. 160 | // INPUT and OUTPUT must match legal inputs to read/write_data (above). 161 | template 162 | bool read_bigendian_number(INPUT* fp, IntType* value, size_t length) { 163 | *value = 0; 164 | unsigned char byte; 165 | // We require IntType to be unsigned or else the shifting gets all screwy. 166 | SPARSEHASH_COMPILE_ASSERT(static_cast(-1) > static_cast(0), 167 | serializing_int_requires_an_unsigned_type); 168 | for (size_t i = 0; i < length; ++i) { 169 | if (!read_data(fp, &byte, sizeof(byte))) return false; 170 | *value |= static_cast(byte) << ((length - 1 - i) * 8); 171 | } 172 | return true; 173 | } 174 | 175 | template 176 | bool write_bigendian_number(OUTPUT* fp, IntType value, size_t length) { 177 | unsigned char byte; 178 | // We require IntType to be unsigned or else the shifting gets all screwy. 
179 | SPARSEHASH_COMPILE_ASSERT(static_cast(-1) > static_cast(0), 180 | serializing_int_requires_an_unsigned_type); 181 | for (size_t i = 0; i < length; ++i) { 182 | byte = (sizeof(value) <= length-1 - i) 183 | ? 0 : static_cast((value >> ((length-1 - i) * 8)) & 255); 184 | if (!write_data(fp, &byte, sizeof(byte))) return false; 185 | } 186 | return true; 187 | } 188 | 189 | // If your keys and values are simple enough, you can pass this 190 | // serializer to serialize()/unserialize(). "Simple enough" means 191 | // value_type is a POD type that contains no pointers. Note, 192 | // however, we don't try to normalize endianness. 193 | // This is the type used for NopointerSerializer. 194 | template struct pod_serializer { 195 | template 196 | bool operator()(INPUT* fp, value_type* value) const { 197 | return read_data(fp, value, sizeof(*value)); 198 | } 199 | 200 | template 201 | bool operator()(OUTPUT* fp, const value_type& value) const { 202 | return write_data(fp, &value, sizeof(value)); 203 | } 204 | }; 205 | 206 | 207 | // Settings contains parameters for growing and shrinking the table. 208 | // It also packages zero-size functor (ie. hasher). 209 | // 210 | // It does some munging of the hash value in cases where we think 211 | // (fear) the original hash function might not be very good. In 212 | // particular, the default hash of pointers is the identity hash, 213 | // so probably all the low bits are 0. We identify when we think 214 | // we're hashing a pointer, and chop off the low bits. Note this 215 | // isn't perfect: even when the key is a pointer, we can't tell 216 | // for sure that the hash is the identity hash. If it's not, this 217 | // is needless work (and possibly, though not likely, harmful). 
218 | 219 | template 221 | class sh_hashtable_settings : public HashFunc { 222 | public: 223 | typedef Key key_type; 224 | typedef HashFunc hasher; 225 | typedef SizeType size_type; 226 | 227 | public: 228 | sh_hashtable_settings(const hasher& hf, 229 | const float ht_occupancy_flt, 230 | const float ht_empty_flt) 231 | : hasher(hf), 232 | enlarge_threshold_(0), 233 | shrink_threshold_(0), 234 | consider_shrink_(false), 235 | use_empty_(false), 236 | use_deleted_(false), 237 | num_ht_copies_(0) { 238 | set_enlarge_factor(ht_occupancy_flt); 239 | set_shrink_factor(ht_empty_flt); 240 | } 241 | 242 | size_type hash(const key_type& v) const { 243 | // We munge the hash value when we don't trust hasher::operator(). 244 | return hash_munger::MungedHash(hasher::operator()(v)); 245 | } 246 | 247 | float enlarge_factor() const { 248 | return enlarge_factor_; 249 | } 250 | void set_enlarge_factor(float f) { 251 | enlarge_factor_ = f; 252 | } 253 | float shrink_factor() const { 254 | return shrink_factor_; 255 | } 256 | void set_shrink_factor(float f) { 257 | shrink_factor_ = f; 258 | } 259 | 260 | size_type enlarge_threshold() const { 261 | return enlarge_threshold_; 262 | } 263 | void set_enlarge_threshold(size_type t) { 264 | enlarge_threshold_ = t; 265 | } 266 | size_type shrink_threshold() const { 267 | return shrink_threshold_; 268 | } 269 | void set_shrink_threshold(size_type t) { 270 | shrink_threshold_ = t; 271 | } 272 | 273 | size_type enlarge_size(size_type x) const { 274 | return static_cast(x * enlarge_factor_); 275 | } 276 | size_type shrink_size(size_type x) const { 277 | return static_cast(x * shrink_factor_); 278 | } 279 | 280 | bool consider_shrink() const { 281 | return consider_shrink_; 282 | } 283 | void set_consider_shrink(bool t) { 284 | consider_shrink_ = t; 285 | } 286 | 287 | bool use_empty() const { 288 | return use_empty_; 289 | } 290 | void set_use_empty(bool t) { 291 | use_empty_ = t; 292 | } 293 | 294 | bool use_deleted() const { 295 | return 
use_deleted_; 296 | } 297 | void set_use_deleted(bool t) { 298 | use_deleted_ = t; 299 | } 300 | 301 | size_type num_ht_copies() const { 302 | return static_cast(num_ht_copies_); 303 | } 304 | void inc_num_ht_copies() { 305 | ++num_ht_copies_; 306 | } 307 | 308 | // Reset the enlarge and shrink thresholds 309 | void reset_thresholds(size_type num_buckets) { 310 | set_enlarge_threshold(enlarge_size(num_buckets)); 311 | set_shrink_threshold(shrink_size(num_buckets)); 312 | // whatever caused us to reset already considered 313 | set_consider_shrink(false); 314 | } 315 | 316 | // Caller is resposible for calling reset_threshold right after 317 | // set_resizing_parameters. 318 | void set_resizing_parameters(float shrink, float grow) { 319 | assert(shrink >= 0.0); 320 | assert(grow <= 1.0); 321 | if (shrink > grow/2.0f) 322 | shrink = grow / 2.0f; // otherwise we thrash hashtable size 323 | set_shrink_factor(shrink); 324 | set_enlarge_factor(grow); 325 | } 326 | 327 | // This is the smallest size a hashtable can be without being too crowded 328 | // If you like, you can give a min #buckets as well as a min #elts 329 | size_type min_buckets(size_type num_elts, size_type min_buckets_wanted) { 330 | float enlarge = enlarge_factor(); 331 | size_type sz = HT_MIN_BUCKETS; // min buckets allowed 332 | while ( sz < min_buckets_wanted || 333 | num_elts >= static_cast(sz * enlarge) ) { 334 | // This just prevents overflowing size_type, since sz can exceed 335 | // max_size() here. 336 | if (static_cast(sz * 2) < sz) { 337 | throw std::length_error("resize overflow"); // protect against overflow 338 | } 339 | sz *= 2; 340 | } 341 | return sz; 342 | } 343 | 344 | private: 345 | template class hash_munger { 346 | public: 347 | static size_t MungedHash(size_t hash) { 348 | return hash; 349 | } 350 | }; 351 | // This matches when the hashtable key is a pointer. 
352 | template class hash_munger { 353 | public: 354 | static size_t MungedHash(size_t hash) { 355 | // TODO(csilvers): consider rotating instead: 356 | // static const int shift = (sizeof(void *) == 4) ? 2 : 3; 357 | // return (hash << (sizeof(hash) * 8) - shift)) | (hash >> shift); 358 | // This matters if we ever change sparse/dense_hash_* to compare 359 | // hashes before comparing actual values. It's speedy on x86. 360 | return hash / sizeof(void*); // get rid of known-0 bits 361 | } 362 | }; 363 | 364 | size_type enlarge_threshold_; // table.size() * enlarge_factor 365 | size_type shrink_threshold_; // table.size() * shrink_factor 366 | float enlarge_factor_; // how full before resize 367 | float shrink_factor_; // how empty before resize 368 | // consider_shrink=true if we should try to shrink before next insert 369 | bool consider_shrink_; 370 | bool use_empty_; // used only by densehashtable, not sparsehashtable 371 | bool use_deleted_; // false until delkey has been set 372 | // num_ht_copies is a counter incremented every Copy/Move 373 | unsigned int num_ht_copies_; 374 | }; 375 | 376 | } // namespace sparsehash_internal 377 | 378 | #undef SPARSEHASH_COMPILE_ASSERT 379 | _END_GOOGLE_NAMESPACE_ 380 | 381 | #endif // UTIL_GTL_HASHTABLE_COMMON_H_ 382 | -------------------------------------------------------------------------------- /cpp/sparsehash/internal/libc_allocator_with_realloc.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2010, Google Inc. 2 | // All rights reserved. 3 | // 4 | // Redistribution and use in source and binary forms, with or without 5 | // modification, are permitted provided that the following conditions are 6 | // met: 7 | // 8 | // * Redistributions of source code must retain the above copyright 9 | // notice, this list of conditions and the following disclaimer. 
10 | // * Redistributions in binary form must reproduce the above 11 | // copyright notice, this list of conditions and the following disclaimer 12 | // in the documentation and/or other materials provided with the 13 | // distribution. 14 | // * Neither the name of Google Inc. nor the names of its 15 | // contributors may be used to endorse or promote products derived from 16 | // this software without specific prior written permission. 17 | // 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | 30 | // --- 31 | 32 | #ifndef UTIL_GTL_LIBC_ALLOCATOR_WITH_REALLOC_H_ 33 | #define UTIL_GTL_LIBC_ALLOCATOR_WITH_REALLOC_H_ 34 | 35 | #include 36 | #include // for malloc/realloc/free 37 | #include // for ptrdiff_t 38 | #include // for placement new 39 | 40 | _START_GOOGLE_NAMESPACE_ 41 | 42 | template 43 | class libc_allocator_with_realloc { 44 | public: 45 | typedef T value_type; 46 | typedef size_t size_type; 47 | typedef ptrdiff_t difference_type; 48 | 49 | typedef T* pointer; 50 | typedef const T* const_pointer; 51 | typedef T& reference; 52 | typedef const T& const_reference; 53 | 54 | libc_allocator_with_realloc() {} 55 | libc_allocator_with_realloc(const libc_allocator_with_realloc&) {} 56 | ~libc_allocator_with_realloc() {} 57 | 58 | pointer address(reference r) const { return &r; } 59 | const_pointer address(const_reference r) const { return &r; } 60 | 61 | pointer allocate(size_type n, const_pointer = 0) { 62 | return static_cast(malloc(n * sizeof(value_type))); 63 | } 64 | void deallocate(pointer p, size_type) { 65 | free(p); 66 | } 67 | pointer reallocate(pointer p, size_type n) { 68 | return static_cast(realloc(p, n * sizeof(value_type))); 69 | } 70 | 71 | size_type max_size() const { 72 | return static_cast(-1) / sizeof(value_type); 73 | } 74 | 75 | void construct(pointer p, const value_type& val) { 76 | new(p) value_type(val); 77 | } 78 | void destroy(pointer p) { p->~value_type(); } 79 | 80 | template 81 | libc_allocator_with_realloc(const libc_allocator_with_realloc&) {} 82 | 83 | template 84 | struct rebind { 85 | typedef libc_allocator_with_realloc other; 86 | }; 87 | }; 88 | 89 | // libc_allocator_with_realloc specialization. 
90 | template<> 91 | class libc_allocator_with_realloc { 92 | public: 93 | typedef void value_type; 94 | typedef size_t size_type; 95 | typedef ptrdiff_t difference_type; 96 | typedef void* pointer; 97 | typedef const void* const_pointer; 98 | 99 | template 100 | struct rebind { 101 | typedef libc_allocator_with_realloc other; 102 | }; 103 | }; 104 | 105 | template 106 | inline bool operator==(const libc_allocator_with_realloc&, 107 | const libc_allocator_with_realloc&) { 108 | return true; 109 | } 110 | 111 | template 112 | inline bool operator!=(const libc_allocator_with_realloc&, 113 | const libc_allocator_with_realloc&) { 114 | return false; 115 | } 116 | 117 | _END_GOOGLE_NAMESPACE_ 118 | 119 | #endif // UTIL_GTL_LIBC_ALLOCATOR_WITH_REALLOC_H_ 120 | -------------------------------------------------------------------------------- /cpp/sparsehash/sparse_hash_set: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2005, Google Inc. 2 | // All rights reserved. 3 | // 4 | // Redistribution and use in source and binary forms, with or without 5 | // modification, are permitted provided that the following conditions are 6 | // met: 7 | // 8 | // * Redistributions of source code must retain the above copyright 9 | // notice, this list of conditions and the following disclaimer. 10 | // * Redistributions in binary form must reproduce the above 11 | // copyright notice, this list of conditions and the following disclaimer 12 | // in the documentation and/or other materials provided with the 13 | // distribution. 14 | // * Neither the name of Google Inc. nor the names of its 15 | // contributors may be used to endorse or promote products derived from 16 | // this software without specific prior written permission. 
17 | // 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | // --- 31 | // 32 | // This is just a very thin wrapper over sparsehashtable.h, just 33 | // like sgi stl's stl_hash_set is a very thin wrapper over 34 | // stl_hashtable. The major thing we define is operator[], because 35 | // we have a concept of a data_type which stl_hashtable doesn't 36 | // (it only has a key and a value). 37 | // 38 | // This is more different from sparse_hash_map than you might think, 39 | // because all iterators for sets are const (you obviously can't 40 | // change the key, and for sets there is no value). 41 | // 42 | // We adhere mostly to the STL semantics for hash-map. One important 43 | // exception is that insert() may invalidate iterators entirely -- STL 44 | // semantics are that insert() may reorder iterators, but they all 45 | // still refer to something valid in the hashtable. Not so for us. 46 | // Likewise, insert() may invalidate pointers into the hashtable. 47 | // (Whether insert invalidates iterators and pointers depends on 48 | // whether it results in a hashtable resize). 
On the plus side, 49 | // delete() doesn't invalidate iterators or pointers at all, or even 50 | // change the ordering of elements. 51 | // 52 | // Here are a few "power user" tips: 53 | // 54 | // 1) set_deleted_key(): 55 | // Unlike STL's hash_map, if you want to use erase() you 56 | // *must* call set_deleted_key() after construction. 57 | // 58 | // 2) resize(0): 59 | // When an item is deleted, its memory isn't freed right 60 | // away. This allows you to iterate over a hashtable, 61 | // and call erase(), without invalidating the iterator. 62 | // To force the memory to be freed, call resize(0). 63 | // For tr1 compatibility, this can also be called as rehash(0). 64 | // 65 | // 3) min_load_factor(0.0) 66 | // Setting the minimum load factor to 0.0 guarantees that 67 | // the hash table will never shrink. 68 | // 69 | // Roughly speaking: 70 | // (1) dense_hash_set: fastest, uses the most memory unless entries are small 71 | // (2) sparse_hash_set: slowest, uses the least memory 72 | // (3) hash_set / unordered_set (STL): in the middle 73 | // 74 | // Typically I use sparse_hash_set when I care about space and/or when 75 | // I need to save the hashtable on disk. I use hash_set otherwise. I 76 | // don't personally use dense_hash_set ever; some people use it for 77 | // small sets with lots of lookups. 78 | // 79 | // - dense_hash_set has, typically, about 78% memory overhead (if your 80 | // data takes up X bytes, the hash_set uses .78X more bytes in overhead). 81 | // - sparse_hash_set has about 4 bits overhead per entry. 82 | // - sparse_hash_set can be 3-7 times slower than the others for lookup and, 83 | // especially, inserts. See time_hash_map.cc for details. 84 | // 85 | // See /usr/(local/)?doc/sparsehash-*/sparse_hash_set.html 86 | // for information about how to use this class. 
87 | 88 | #ifndef _SPARSE_HASH_SET_H_ 89 | #define _SPARSE_HASH_SET_H_ 90 | 91 | #include 92 | #include // needed by stl_alloc 93 | #include // for equal_to<> 94 | #include // for alloc (which we don't use) 95 | #include // for pair<> 96 | #include 97 | #include // IWYU pragma: export 98 | #include HASH_FUN_H // for hash<> 99 | 100 | _START_GOOGLE_NAMESPACE_ 101 | 102 | template , // defined in sparseconfig.h 104 | class EqualKey = std::equal_to, 105 | class Alloc = libc_allocator_with_realloc > 106 | class sparse_hash_set { 107 | private: 108 | // Apparently identity is not stl-standard, so we define our own 109 | struct Identity { 110 | typedef const Value& result_type; 111 | const Value& operator()(const Value& v) const { return v; } 112 | }; 113 | struct SetKey { 114 | void operator()(Value* value, const Value& new_key) const { 115 | *value = new_key; 116 | } 117 | }; 118 | 119 | typedef sparse_hashtable ht; 121 | ht rep; 122 | 123 | public: 124 | typedef typename ht::key_type key_type; 125 | typedef typename ht::value_type value_type; 126 | typedef typename ht::hasher hasher; 127 | typedef typename ht::key_equal key_equal; 128 | typedef Alloc allocator_type; 129 | 130 | typedef typename ht::size_type size_type; 131 | typedef typename ht::difference_type difference_type; 132 | typedef typename ht::const_pointer pointer; 133 | typedef typename ht::const_pointer const_pointer; 134 | typedef typename ht::const_reference reference; 135 | typedef typename ht::const_reference const_reference; 136 | 137 | typedef typename ht::const_iterator iterator; 138 | typedef typename ht::const_iterator const_iterator; 139 | typedef typename ht::const_local_iterator local_iterator; 140 | typedef typename ht::const_local_iterator const_local_iterator; 141 | 142 | 143 | // Iterator functions -- recall all iterators are const 144 | iterator begin() const { return rep.begin(); } 145 | iterator end() const { return rep.end(); } 146 | 147 | // These come from tr1's unordered_set. 
For us, a bucket has 0 or 1 elements. 148 | local_iterator begin(size_type i) const { return rep.begin(i); } 149 | local_iterator end(size_type i) const { return rep.end(i); } 150 | 151 | 152 | // Accessor functions 153 | allocator_type get_allocator() const { return rep.get_allocator(); } 154 | hasher hash_funct() const { return rep.hash_funct(); } 155 | hasher hash_function() const { return hash_funct(); } // tr1 name 156 | key_equal key_eq() const { return rep.key_eq(); } 157 | 158 | 159 | // Constructors 160 | explicit sparse_hash_set(size_type expected_max_items_in_table = 0, 161 | const hasher& hf = hasher(), 162 | const key_equal& eql = key_equal(), 163 | const allocator_type& alloc = allocator_type()) 164 | : rep(expected_max_items_in_table, hf, eql, Identity(), SetKey(), alloc) { 165 | } 166 | 167 | template 168 | sparse_hash_set(InputIterator f, InputIterator l, 169 | size_type expected_max_items_in_table = 0, 170 | const hasher& hf = hasher(), 171 | const key_equal& eql = key_equal(), 172 | const allocator_type& alloc = allocator_type()) 173 | : rep(expected_max_items_in_table, hf, eql, Identity(), SetKey(), alloc) { 174 | rep.insert(f, l); 175 | } 176 | // We use the default copy constructor 177 | // We use the default operator=() 178 | // We use the default destructor 179 | 180 | void clear() { rep.clear(); } 181 | void swap(sparse_hash_set& hs) { rep.swap(hs.rep); } 182 | 183 | 184 | // Functions concerning size 185 | size_type size() const { return rep.size(); } 186 | size_type max_size() const { return rep.max_size(); } 187 | bool empty() const { return rep.empty(); } 188 | size_type bucket_count() const { return rep.bucket_count(); } 189 | size_type max_bucket_count() const { return rep.max_bucket_count(); } 190 | 191 | // These are tr1 methods. bucket() is the bucket the key is or would be in. 
192 | size_type bucket_size(size_type i) const { return rep.bucket_size(i); } 193 | size_type bucket(const key_type& key) const { return rep.bucket(key); } 194 | float load_factor() const { 195 | return size() * 1.0f / bucket_count(); 196 | } 197 | float max_load_factor() const { 198 | float shrink, grow; 199 | rep.get_resizing_parameters(&shrink, &grow); 200 | return grow; 201 | } 202 | void max_load_factor(float new_grow) { 203 | float shrink, grow; 204 | rep.get_resizing_parameters(&shrink, &grow); 205 | rep.set_resizing_parameters(shrink, new_grow); 206 | } 207 | // These aren't tr1 methods but perhaps ought to be. 208 | float min_load_factor() const { 209 | float shrink, grow; 210 | rep.get_resizing_parameters(&shrink, &grow); 211 | return shrink; 212 | } 213 | void min_load_factor(float new_shrink) { 214 | float shrink, grow; 215 | rep.get_resizing_parameters(&shrink, &grow); 216 | rep.set_resizing_parameters(new_shrink, grow); 217 | } 218 | // Deprecated; use min_load_factor() or max_load_factor() instead. 
219 | void set_resizing_parameters(float shrink, float grow) { 220 | rep.set_resizing_parameters(shrink, grow); 221 | } 222 | 223 | void resize(size_type hint) { rep.resize(hint); } 224 | void rehash(size_type hint) { resize(hint); } // the tr1 name 225 | 226 | // Lookup routines 227 | iterator find(const key_type& key) const { return rep.find(key); } 228 | 229 | size_type count(const key_type& key) const { return rep.count(key); } 230 | 231 | std::pair equal_range(const key_type& key) const { 232 | return rep.equal_range(key); 233 | } 234 | 235 | 236 | // Insertion routines 237 | std::pair insert(const value_type& obj) { 238 | std::pair p = rep.insert(obj); 239 | return std::pair(p.first, p.second); // const to non-const 240 | } 241 | template void insert(InputIterator f, InputIterator l) { 242 | rep.insert(f, l); 243 | } 244 | void insert(const_iterator f, const_iterator l) { 245 | rep.insert(f, l); 246 | } 247 | // Required for std::insert_iterator; the passed-in iterator is ignored. 248 | iterator insert(iterator, const value_type& obj) { 249 | return insert(obj).first; 250 | } 251 | 252 | // Deletion routines 253 | // THESE ARE NON-STANDARD! I make you specify an "impossible" key 254 | // value to identify deleted buckets. You can change the key as 255 | // time goes on, or get rid of it entirely to be insert-only. 
256 | void set_deleted_key(const key_type& key) { rep.set_deleted_key(key); } 257 | void clear_deleted_key() { rep.clear_deleted_key(); } 258 | key_type deleted_key() const { return rep.deleted_key(); } 259 | 260 | // These are standard 261 | size_type erase(const key_type& key) { return rep.erase(key); } 262 | void erase(iterator it) { rep.erase(it); } 263 | void erase(iterator f, iterator l) { rep.erase(f, l); } 264 | 265 | 266 | // Comparison 267 | bool operator==(const sparse_hash_set& hs) const { return rep == hs.rep; } 268 | bool operator!=(const sparse_hash_set& hs) const { return rep != hs.rep; } 269 | 270 | 271 | // I/O -- this is an add-on for writing metainformation to disk 272 | // 273 | // For maximum flexibility, this does not assume a particular 274 | // file type (though it will probably be a FILE *). We just pass 275 | // the fp through to rep. 276 | 277 | // If your keys and values are simple enough, you can pass this 278 | // serializer to serialize()/unserialize(). "Simple enough" means 279 | // value_type is a POD type that contains no pointers. Note, 280 | // however, we don't try to normalize endianness. 281 | typedef typename ht::NopointerSerializer NopointerSerializer; 282 | 283 | // serializer: a class providing operator()(OUTPUT*, const value_type&) 284 | // (writing value_type to OUTPUT). You can specify a 285 | // NopointerSerializer object if appropriate (see above). 286 | // fp: either a FILE*, OR an ostream*/subclass_of_ostream*, OR a 287 | // pointer to a class providing size_t Write(const void*, size_t), 288 | // which writes a buffer into a stream (which fp presumably 289 | // owns) and returns the number of bytes successfully written. 290 | // Note basic_ostream is not currently supported. 
291 | template 292 | bool serialize(ValueSerializer serializer, OUTPUT* fp) { 293 | return rep.serialize(serializer, fp); 294 | } 295 | 296 | // serializer: a functor providing operator()(INPUT*, value_type*) 297 | // (reading from INPUT and into value_type). You can specify a 298 | // NopointerSerializer object if appropriate (see above). 299 | // fp: either a FILE*, OR an istream*/subclass_of_istream*, OR a 300 | // pointer to a class providing size_t Read(void*, size_t), 301 | // which reads into a buffer from a stream (which fp presumably 302 | // owns) and returns the number of bytes successfully read. 303 | // Note basic_istream is not currently supported. 304 | // NOTE: Since value_type is const Key, ValueSerializer 305 | // may need to do a const cast in order to fill in the key. 306 | // NOTE: if Key is not a POD type, the serializer MUST use 307 | // placement-new to initialize its value, rather than a normal 308 | // equals-assignment or similar. (The value_type* passed into 309 | // the serializer points to garbage memory.) 310 | template 311 | bool unserialize(ValueSerializer serializer, INPUT* fp) { 312 | return rep.unserialize(serializer, fp); 313 | } 314 | 315 | // The four methods below are DEPRECATED. 316 | // Use serialize() and unserialize() for new code. 
317 | template 318 | bool write_metadata(OUTPUT *fp) { return rep.write_metadata(fp); } 319 | 320 | template 321 | bool read_metadata(INPUT *fp) { return rep.read_metadata(fp); } 322 | 323 | template 324 | bool write_nopointer_data(OUTPUT *fp) { return rep.write_nopointer_data(fp); } 325 | 326 | template 327 | bool read_nopointer_data(INPUT *fp) { return rep.read_nopointer_data(fp); } 328 | }; 329 | 330 | template 331 | inline void swap(sparse_hash_set& hs1, 332 | sparse_hash_set& hs2) { 333 | hs1.swap(hs2); 334 | } 335 | 336 | _END_GOOGLE_NAMESPACE_ 337 | 338 | #endif /* _SPARSE_HASH_SET_H_ */ 339 | -------------------------------------------------------------------------------- /cpp/sparsehash/template_util.h: -------------------------------------------------------------------------------- 1 | // Copyright 2005 Google Inc. 2 | // All rights reserved. 3 | // 4 | // Redistribution and use in source and binary forms, with or without 5 | // modification, are permitted provided that the following conditions are 6 | // met: 7 | // 8 | // * Redistributions of source code must retain the above copyright 9 | // notice, this list of conditions and the following disclaimer. 10 | // * Redistributions in binary form must reproduce the above 11 | // copyright notice, this list of conditions and the following disclaimer 12 | // in the documentation and/or other materials provided with the 13 | // distribution. 14 | // * Neither the name of Google Inc. nor the names of its 15 | // contributors may be used to endorse or promote products derived from 16 | // this software without specific prior written permission. 17 | // 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | // ---- 31 | // 32 | // Template metaprogramming utility functions. 33 | // 34 | // This code is compiled directly on many platforms, including client 35 | // platforms like Windows, Mac, and embedded systems. Before making 36 | // any changes here, make sure that you're not breaking any platforms. 37 | // 38 | // 39 | // The names choosen here reflect those used in tr1 and the boost::mpl 40 | // library, there are similar operations used in the Loki library as 41 | // well. I prefer the boost names for 2 reasons: 42 | // 1. I think that portions of the Boost libraries are more likely to 43 | // be included in the c++ standard. 44 | // 2. It is not impossible that some of the boost libraries will be 45 | // included in our own build in the future. 46 | // Both of these outcomes means that we may be able to directly replace 47 | // some of these with boost equivalents. 48 | // 49 | #ifndef BASE_TEMPLATE_UTIL_H_ 50 | #define BASE_TEMPLATE_UTIL_H_ 51 | 52 | #include 53 | _START_GOOGLE_NAMESPACE_ 54 | 55 | // Types small_ and big_ are guaranteed such that sizeof(small_) < 56 | // sizeof(big_) 57 | typedef char small_; 58 | 59 | struct big_ { 60 | char dummy[2]; 61 | }; 62 | 63 | // Identity metafunction. 64 | template 65 | struct identity_ { 66 | typedef T type; 67 | }; 68 | 69 | // integral_constant, defined in tr1, is a wrapper for an integer 70 | // value. 
We don't really need this generality; we could get away 71 | // with hardcoding the integer type to bool. We use the fully 72 | // general integer_constant for compatibility with tr1. 73 | 74 | template 75 | struct integral_constant { 76 | static const T value = v; 77 | typedef T value_type; 78 | typedef integral_constant type; 79 | }; 80 | 81 | template const T integral_constant::value; 82 | 83 | 84 | // Abbreviations: true_type and false_type are structs that represent boolean 85 | // true and false values. Also define the boost::mpl versions of those names, 86 | // true_ and false_. 87 | typedef integral_constant true_type; 88 | typedef integral_constant false_type; 89 | typedef true_type true_; 90 | typedef false_type false_; 91 | 92 | // if_ is a templatized conditional statement. 93 | // if_ is a compile time evaluation of cond. 94 | // if_<>::type contains A if cond is true, B otherwise. 95 | template 96 | struct if_{ 97 | typedef A type; 98 | }; 99 | 100 | template 101 | struct if_ { 102 | typedef B type; 103 | }; 104 | 105 | 106 | // type_equals_ is a template type comparator, similar to Loki IsSameType. 107 | // type_equals_::value is true iff "A" is the same type as "B". 108 | // 109 | // New code should prefer base::is_same, defined in base/type_traits.h. 110 | // It is functionally identical, but is_same is the standard spelling. 111 | template 112 | struct type_equals_ : public false_ { 113 | }; 114 | 115 | template 116 | struct type_equals_ : public true_ { 117 | }; 118 | 119 | // and_ is a template && operator. 120 | // and_::value evaluates "A::value && B::value". 121 | template 122 | struct and_ : public integral_constant { 123 | }; 124 | 125 | // or_ is a template || operator. 126 | // or_::value evaluates "A::value || B::value". 
def _env_flag(name, default=False):
    """Interpret environment variable *name* as a boolean switch.

    Returns *default* when the variable is unset.  An empty value and the
    usual "off" spellings ("0", "false", "no", "off", case-insensitive)
    are False; anything else is True.  Without this, ``ARV_DEBUG=0`` would
    count as enabled because any non-empty string is truthy.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() not in ("", "0", "false", "no", "off")


class ArvOptions:
    """Build switches and the compiler/linker flags derived from them."""

    # Debug build: leaves out the optimisation flags. Set ARV_DEBUG=1.
    debug = _env_flag("ARV_DEBUG")
    # Keep debug symbols (otherwise -g0 is passed).
    debug_symbols = False
    # Hide all symbols except those exported via public_py_init_sym.hpp.
    hidden_symbols = True
    # Ask the linker to strip the resulting binary.
    strip = True
    # Enable -W -Wall.
    warnings = True

    # Currently just assume we have gcc/clang
    is_gcc = True

    @staticmethod
    def compile_flags():
        """Return the list of extra compiler arguments for the extension.

        An empty list is returned when the compiler is not assumed to be
        gcc/clang, since every flag below uses gcc syntax.
        """
        flags = []

        if not ArvOptions.is_gcc:
            return flags

        flags += ["--std=c++11",     # REQUIRED
                  "-DBUILDING_DLL"]  # REQUIRED

        if ArvOptions.warnings:
            flags += ["-W", "-Wall"]

        if not ArvOptions.debug:
            # Make the binary a good bit faster
            flags += [
                "-fdata-sections",      # small impact, i.e. not important
                "-ffunction-sections",  # small impact, i.e. not important
                "-fno-rtti",            # small impact, i.e. not important
                "-march=native",        # important
                "-mtune=native",        # important, but could use march=generic
                "-O3",                  # important, but O2 also works fine
            ]

        if not ArvOptions.debug_symbols:
            flags += ["-g0"]

        if ArvOptions.hidden_symbols:
            flags += ["-fvisibility=hidden",  # I like clean binaries
                      "-include", "cpp/public_py_init_sym.hpp"]
        return flags

    @staticmethod
    def link_flags():
        """Return the list of extra linker arguments for the extension."""
        flags = []

        if not ArvOptions.is_gcc:
            return flags

        if ArvOptions.strip:
            flags += ["-Wl,-s"]

        return flags


# From http://stackoverflow.com/a/26698408/21028
class lazy_cythonize(list):
    """A list whose contents are produced by *callback* on first access.

    The callback runs the first time the list is iterated, indexed or
    measured; its result is cached and reused afterwards.  The underlying
    ``list`` storage itself is never populated.
    """

    def __init__(self, callback):
        self._list, self.callback = None, callback

    def c_list(self):
        """Return the cached callback result, computing it if necessary."""
        if self._list is None:
            self._list = self.callback()
        return self._list

    def __iter__(self):
        for e in self.c_list():
            yield e

    def __getitem__(self, ii):
        return self.c_list()[ii]

    def __len__(self):
        return len(self.c_list())


def configure_google_hashmap():
    """Generate cpp/sparsehash/internal/sparseconfig.h if it is missing.

    Runs the bundled sparsehash ``configure`` script and copies the
    ``src/config.h`` it produces into place.  All paths are relative to the
    current working directory.

    Raises:
        RuntimeError: if the configure script exits with a non-zero status.
    """
    script = os.path.join("3rd-party", "sparsehash", "configure")
    config = os.path.join("cpp", "sparsehash", "internal", "sparseconfig.h")

    if os.path.isfile(config):
        return  # already configured

    print("Configuring Google hash map")
    if os.system(script) != 0:
        raise RuntimeError("Error configuring Google hash map")
    shutil.copy(os.path.join("src", "config.h"), config)
def slurp(filename):
    """Return the entire contents of *filename* as text.

    The file is decoded as UTF-8 explicitly, so the result does not depend
    on the locale of the machine running setup.py.
    """
    # Local import: io.open accepts ``encoding`` on both Python 2.7 and 3.x.
    import io

    with io.open(filename, "rt", encoding="utf-8") as f:
        return f.read()


def get_testsuite():
    """Return a test suite with every ``test*.py`` module under ``tests``."""
    return unittest.TestLoader().discover("tests", pattern="test*.py")
"""Print a one-line description inferred from the genome in genome.txt."""

from arv import load, unphased_match as match

genome = load("genome.txt")

# Evaluate the three inferences in the same order the original call did.
gender = "man" if genome.y_chromosome else "woman"
complexion = "light" if genome["rs1426654"] == "AA" else "dark"
color = match(genome["rs12913832"], {
    "AA": "brown",
    "AG": "brown or green",
    "GG": "blue",
})

sentence = "You are a {gender} with {color} eyes and {complexion} skin."
print(sentence.format(gender=gender, color=color, complexion=complexion))
More information about these changes can be found at: 10 | # https://www.23andme.com/you/download/revisions/ 11 | # 12 | # More information on reference human assembly build 37 (aka Annotation Release 104): 13 | # http://www.ncbi.nlm.nih.gov/mapview/map_search.cgi?taxid=9606 14 | # 15 | # rsid chromosome position genotype 16 | rs4477212 1 82154 AT 17 | rs4672279 2 59444675 GT 18 | rs4536786 3 140049121 CA 19 | rs7715122 5 94197884 AT 20 | rs11980927 7 20010422 GG 21 | rs10810289 9 14899708 AA 22 | rs10488822 11 35984271 TC 23 | rs913897 13 73892459 AC 24 | rs1540613 16 80476182 AG 25 | rs6123756 20 56556146 TT 26 | rs6015286 20 57048415 -- 27 | rs6026400 20 57183524 CC 28 | rs742927 Y 57183914 GG 29 | i3001754 MT 16256 A 30 | i3001755 MT 16257 -- 31 | i3001759 MT 16258 -- 32 | i3001761 MT 16259 -- 33 | i3001773 MT 16265 T 34 | i4000755 MT 16548 C 35 | i4000759 MT 16567 G 36 | rs1426654 15 48426484 AA 37 | rs12913832 15 28365618 GG 38 | rs28504042 MT 1549 -- 39 | rs3135027 MT 1598 G 40 | rs671 12 112241766 GG 41 | -------------------------------------------------------------------------------- /tests/fake_genome_female.txt: -------------------------------------------------------------------------------- 1 | # This data file generated by 23andMe at: Wed Mar 15 12:34:56 2017 2 | # 3 | # Below is a text version of your data. Fields are TAB-separated 4 | # Each line corresponds to a single SNP. For each SNP, we provide its identifier 5 | # (an rsid or an internal id), its location on the reference human genome, and the 6 | # genotype call oriented with respect to the plus strand on the human reference sequence. 7 | # We are using reference human assembly build 37 (also known as Annotation Release 104). 8 | # Note that it is possible that data downloaded at different times may be different due to ongoing 9 | # improvements in our ability to call genotypes. 
More information about these changes can be found at: 10 | # https://www.23andme.com/you/download/revisions/ 11 | # 12 | # More information on reference human assembly build 37 (aka Annotation Release 104): 13 | # http://www.ncbi.nlm.nih.gov/mapview/map_search.cgi?taxid=9606 14 | # 15 | # rsid chromosome position genotype 16 | rs4477212 1 82154 AT 17 | rs4672279 2 59444675 GT 18 | rs4536786 3 140049121 CA 19 | rs7715122 5 94197884 AT 20 | rs11980927 7 20010422 GG 21 | rs10810289 9 14899708 AA 22 | rs10488822 11 35984271 TC 23 | rs913897 13 73892459 AC 24 | rs1540613 16 80476182 AG 25 | rs6123756 20 56556146 TT 26 | rs6015286 20 57048415 -- 27 | rs6026400 20 57183524 CC 28 | i3001754 MT 16256 A 29 | i3001755 MT 16257 -- 30 | i3001759 MT 16258 -- 31 | i3001761 MT 16259 -- 32 | i3001773 MT 16265 T 33 | i4000755 MT 16548 C 34 | i4000759 MT 16567 G 35 | rs1426654 15 48426484 AA 36 | rs12913832 15 28365618 GG 37 | rs28504042 MT 1549 -- 38 | rs3135027 MT 1598 G 39 | rs671 12 112241766 GG 40 | -------------------------------------------------------------------------------- /tests/test_benchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | Various benchmarks for arv. 3 | 4 | arv 5 | Copyright 2017 Christian Stigen Larsen 6 | Distributed under the GNU GPL v3 or later; see COPYING. 
# Benchmark name -> Python source executed by benchmark().  The snippets see
# ``arv``, ``sys``, ``xrange`` plus whatever keyword arguments the caller
# passes (``filename``, ``genome``, ``random``).
benchmarks = {
    "parsing": "arv.load(filename)",

    "random access":
r"""
for n in xrange(5000):
    try:
        pos = random.randint(genome.first, genome.last)
        snp = genome[pos]
    except KeyError:
        # RSIDs are not contiguous
        pass
""",

    "iterate items in genome":
r"""
assert(False) # this is too slow at the moment
num = 0
for snp in genome:
    num += 1
assert(num == len(genome))
""",

    "iterate rsids":
r"""
num = 0
for snp in genome.rsids:
    num += 1
assert(num == len(genome))
""",

    "iterate snps":
r"""
num = 0
for snp in genome.snps:
    num += 1
assert(num == len(genome))
""",
}


def log(msg, stream=sys.stdout):
    """Write *msg* to *stream* and flush immediately (no newline added)."""
    stream.write(msg)
    stream.flush()


# time.perf_counter only exists from Python 3.3; fall back to time.clock.
if sys.version_info[:2] >= (3, 3):
    mark_time = time.perf_counter
else:
    mark_time = time.clock


@contextlib.contextmanager
def timed_block():
    """Time the body of a ``with`` block.

    Yields a zero-argument callable that returns the elapsed time in the
    units of ``mark_time`` once the block has finished (None while it is
    still running).  The time is recorded even if the body raises.
    """
    start = mark_time()
    elapsed = None
    try:
        yield lambda: elapsed
    finally:
        elapsed = mark_time() - start


def _snps_per_second(count, seconds):
    """Return count/seconds, or infinity if no time was measured."""
    if seconds <= 0:
        return float("inf")
    return count / seconds


def benchmark(times, code, **local_args):
    """Execute *code* *times* times and return the best (lowest) run time.

    *local_args* are made available to the executed code as globals.  Two
    of them are also read by the harness itself: ``stream`` (where progress
    is logged, default ``sys.stdout``) and ``prefix`` (printed before each
    new best time, default empty).
    """
    stream = local_args.get("stream", sys.stdout)
    prefix = local_args.get("prefix", "")
    best = 1e9
    for no in range(times):
        localvars = {
            "arv": arv,
            "sys": sys
        }
        if sys.version_info[0] >= 3:
            localvars["xrange"] = range

        localvars.update(local_args)

        with timed_block() as elapsed:
            exec(code, localvars)

        elapsed = elapsed()
        # NOTE(review): nesting reconstructed from a whitespace-collapsed
        # listing -- a new best is always kept, but only printed when it
        # differs at the displayed precision.
        if elapsed < best:
            if round(elapsed, 4) < round(best, 4):
                log("\n%s%6.4fs " % (prefix, elapsed), stream=stream)
            best = elapsed
        else:
            log(".", stream=stream)
    return best


def all_benchmarks(filename, times):
    """Run every entry of ``benchmarks`` against *filename*.

    Returns a dict mapping benchmark name to its best time.  A benchmark
    that raises is reported and left out of the result; the remaining
    benchmarks still run.
    """
    log("Benchmarking arv %s at %s\n" % (arv.__version__, arv.__file__))
    log("Measuring time with %s\n\n" % mark_time)

    results = {}
    genome = arv.load(filename)

    for name, code in sorted(benchmarks.items()):
        log("Benchmarking %s x %d ... " % (repr(name), times))
        try:
            results[name] = benchmark(times, code, filename=filename,
                                      genome=genome, random=random)
        except Exception as e:
            # Best effort: report the failure and carry on.  There is no
            # timing for this benchmark, so no throughput line either.
            log("failed: %s: %s" % (type(e).__name__, e))
        else:
            if name == "parsing":
                genome = arv.load(filename)
            log(" %.2g SNPs / second" %
                _snps_per_second(len(genome), results[name]))
        finally:
            log("\n")

    return results


class BenchmarkTests(unittest.TestCase):
    """Opt-in parser speed measurement, enabled through ARV_BENCHMARK."""

    @unittest.skipUnless(os.getenv("ARV_BENCHMARK", None) is not None,
                         "Specify ARV_BENCHMARK=<genome file> to benchmark")
    def test_parser_speed(self):
        """Report the best parse time for the file named by ARV_BENCHMARK.

        ARV_BENCHMARK_COUNT sets the number of runs (default 40; also used
        when the value is not an integer).
        """
        filename = os.getenv("ARV_BENCHMARK")
        self.assertTrue(os.path.isfile(filename),
                        "File not found: %s" % filename)
        try:
            times = int(os.getenv("ARV_BENCHMARK_COUNT", "40"))
        except ValueError:
            times = 40
        code = benchmarks["parsing"]
        seconds = benchmark(times, code, filename=filename, stream=sys.stderr,
                            prefix=" ")
        genome = arv.load(filename)
        sys.stderr.flush()
        sys.stderr.write(" %d SNPs in ~%dms or %.1g SNPs/second ... " % (
            len(genome), int(round(seconds, 3)*1000),
            _snps_per_second(len(genome), seconds)))
        sys.stderr.flush()


if __name__ == "__main__":
    p = argparse.ArgumentParser()
    p.add_argument("--times", "-t", default=20, type=int)
    p.add_argument("--filename", "-f", required=True, type=str)
    args = p.parse_args()
    all_benchmarks(args.filename, args.times)
3 | 4 | arv 5 | Copyright 2017 Christian Stigen Larsen 6 | Distributed under the GNU GPL v3 or later; see COPYING. 7 | """ 8 | 9 | import arv 10 | import os 11 | import subprocess 12 | import sys 13 | import unittest 14 | 15 | class ArvModuleTests(unittest.TestCase): 16 | @classmethod 17 | def setUpClass(cls): 18 | cls.genome_path = os.path.join(os.path.dirname(__file__), 19 | "fake_genome.txt") 20 | 21 | def _execute(self, *args): 22 | output = subprocess.check_output([sys.executable, "-m", "arv"] + 23 | list(args), universal_newlines=True) 24 | return output.replace("\r\n", "\n").split("\n") 25 | 26 | def test_help(self): 27 | self.assertTrue("\n".join(self._execute("--help")). 28 | startswith("usage: arv [-h]")) 29 | 30 | def test_example(self): 31 | self.assertEqual(self._execute("--example", "--ethnicity=europan", 32 | self.genome_path), 33 | ["fake_genome.txt ... 25 SNPs, male", 34 | "fake_genome.txt ... ", 35 | " Alcohol flush reaction: Little to no reaction (two copies of the ALDH2 gene)", 36 | " Description : A man with blue eyes and light skin", 37 | ""]) 38 | 39 | -------------------------------------------------------------------------------- /tests/test_infer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Inferring tests for arv. 3 | 4 | arv 5 | Copyright 2017 Christian Stigen Larsen 6 | Distributed under the GNU GPL v3 or later; see COPYING. 
class ArvInferTests(unittest.TestCase):
    """Simple inferences drawn from the fake_genome.txt fixture."""

    @classmethod
    def setUpClass(cls):
        # Locate the fixture next to this module so the tests pass no matter
        # which directory they are started from.
        import os
        here = os.path.dirname(os.path.abspath(__file__))
        cls.genome = arv.load(os.path.join(here, "fake_genome.txt"))

    def test_infer_gender(self):
        """The fixture is expected to report a Y chromosome."""
        gender = "man" if self.genome.y_chromosome else "woman"
        self.assertEqual(gender, "man")

    def test_infer_complexion(self):
        """rs1426654 compares equal to "AA" via .genotype and directly."""
        complexion = "light" if self.genome["rs1426654"].genotype == "AA" else "dark"
        self.assertEqual(complexion, "light")

        # Rich comparison with string
        complexion = "light" if self.genome["rs1426654"] == "AA" else "dark"
        self.assertEqual(complexion, "light")

    def test_infer_unphased_match_eyecolor(self):
        """unphased_match maps the rs12913832 genotype to its description."""
        eyecolor = arv.unphased_match(self.genome["rs12913832"], {
            "AA": "brown eyes",
            "AG": "brown or green eyes",
            "GG": "blue eyes",
            None: "unknown"})
        self.assertEqual(eyecolor, "blue eyes")
class ArvTraitsTest(unittest.TestCase):
    """Checks arv.traits helpers against the fake_genome.txt fixture."""

    @classmethod
    def setUpClass(cls):
        # Locate the fixture next to this module so the tests pass no matter
        # which directory they are started from.
        import os
        here = os.path.dirname(os.path.abspath(__file__))
        cls.genome = arv.load(os.path.join(here, "fake_genome.txt"),
                              ethnicity="european")

    def test_alcohol_flush_reaction(self):
        """A GG genotype at rs671 is reported as little to no reaction."""
        self.assertEqual(self.genome["rs671"], "GG")
        self.assertEqual(arv.traits.alcohol_flush_reaction(self.genome),
                         "Little to no reaction (two copies of the ALDH2 gene)")