├── .gitignore
├── .travis.yml
├── 3rd-party
    ├── README.md
    └── sparsehash
    │   ├── .gitignore
    │   ├── AUTHORS
    │   ├── ChangeLog
    │   ├── Makefile.in
    │   ├── README
    │   ├── config.guess
    │   ├── config.sub
    │   ├── configure
    │   ├── depcomp
    │   ├── install-sh
    │   ├── m4
    │       ├── acx_pthread.m4
    │       ├── google_namespace.m4
    │       ├── namespaces.m4
    │       ├── stl_hash.m4
    │       └── stl_hash_fun.m4
    │   ├── missing
    │   └── src
    │       └── config.h.in
├── COPYING
├── LICENSES
├── MANIFEST.in
├── README.rst
├── TODO.md
├── arv
    ├── __init__.py
    ├── __main__.py
    ├── match.py
    ├── traits.py
    └── util.py
├── cpp
    ├── .gitignore
    ├── arv.cpp
    ├── arv.hpp
    ├── export.hpp
    ├── file.cpp
    ├── file.hpp
    ├── filesize.cpp
    ├── filesize.hpp
    ├── google
    │   ├── dense_hash_map
    │   ├── dense_hash_set
    │   ├── sparse_hash_map
    │   ├── sparse_hash_set
    │   ├── sparsehash
    │   │   ├── densehashtable.h
    │   │   ├── hashtable-common.h
    │   │   ├── libc_allocator_with_realloc.h
    │   │   └── sparsehashtable.h
    │   ├── sparsetable
    │   ├── template_util.h
    │   └── type_traits.h
    ├── mmap.cpp
    ├── mmap.hpp
    ├── parse.cpp
    ├── public_py_init_sym.hpp
    └── sparsehash
    │   ├── dense_hash_map
    │   ├── dense_hash_set
    │   ├── internal
    │       ├── densehashtable.h
    │       ├── hashtable-common.h
    │       ├── libc_allocator_with_realloc.h
    │       └── sparsehashtable.h
    │   ├── sparse_hash_map
    │   ├── sparse_hash_set
    │   ├── sparsetable
    │   ├── template_util.h
    │   └── type_traits.h
├── cython
    ├── .gitignore
    └── _arv.pyx
├── publish.sh
├── setup.cfg
├── setup.py
├── tests
    ├── __init__.py
    ├── example.py
    ├── fake_genome.txt
    ├── fake_genome_female.txt
    ├── test_arv.py
    ├── test_benchmark.py
    ├── test_commandline.py
    ├── test_infer.py
    └── test_traits.py
└── tox.ini


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.egg-info
 2 | *.o
 3 | *.pyc
 4 | *.so
 5 | .eggs
 6 | .tox
 7 | __pycache__
 8 | build
 9 | config.log
10 | config.status
11 | cpp/sparsehash/internal/sparseconfig.h
12 | dist
13 | genome.txt
14 | genomes
15 | GPATH
16 | GRTAGS
17 | GTAGS
18 | Makefile
19 | MANIFEST
20 | src
21 | Testing
22 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | sudo: false
 2 | language: python
 3 | python:
 4 |   - "2.7"
 5 |   - "3.2"
 6 |   - "3.3"
 7 |   - "3.4"
 8 |   - "3.5"
 9 |   - "3.6"
10 | compiler:
11 |     - gcc
12 | addons:
13 |   apt:
14 |     sources:
15 |       - ubuntu-toolchain-r-test
16 |     packages:
17 |       - g++-4.8
18 | install:
19 |   - if [ "$CXX" = "g++" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi
20 | script:
21 |   - CC=g++-4.8 CXX=g++-4.8 python setup.py test
22 | 


--------------------------------------------------------------------------------
/3rd-party/README.md:
--------------------------------------------------------------------------------
1 | This contains copies of other open source projects.
2 | 
3 | The Google sparsehash copy is identical to upstream, except that I have deleted
4 | the files I don't need. The only thing I'm interested in is generating its config.h file.
5 | 


--------------------------------------------------------------------------------
/3rd-party/sparsehash/.gitignore:
--------------------------------------------------------------------------------
1 | *.Po
2 | *.o
3 | *.dSYM/*
4 | 


--------------------------------------------------------------------------------
/3rd-party/sparsehash/AUTHORS:
--------------------------------------------------------------------------------
1 | google-sparsehash@googlegroups.com
2 | 
3 | 


--------------------------------------------------------------------------------
/3rd-party/sparsehash/ChangeLog:
--------------------------------------------------------------------------------
  1 | Mon Oct 12 21:00:00 2015 Google Inc. <google-sparsehash@googlegroups.com>
  2 | 
  3 | 	* sparsehash: version 2.0.3
  4 | 	* Fix compilation on modern compilers and operating systems
  5 | 
  6 | Thu Feb 23 23:47:18 2012 Google Inc. <google-sparsehash@googlegroups.com>
  7 | 
  8 | 	* sparsehash: version 2.0.2
  9 | 	* BUGFIX: Fix backwards compatibility for <google> include folders
 10 | 
 11 | Wed Feb 01 02:57:48 2012 Google Inc. <google-sparsehash@googlegroups.com>
 12 | 
 13 | 	* sparsehash: version 2.0.1
 14 | 	* BUGFIX: Fix path to malloc_extension.h in time_hash_map.cc
 15 | 
 16 | Tue Jan 31 11:33:04 2012  Google Inc. <google-sparsehash@googlegroups.com>
 17 | 
 18 | 	* sparsehash: version 2.0
 19 | 	* Renamed include directory from google/ to sparsehash/ (csilvers)
 20 | 	* Changed the 'official' sparsehash email in setup.py/etc
 21 | 	* Renamed google-sparsehash.sln to sparsehash.sln
 22 | 	* Changed copyright text to reflect Google's relinquished ownership
 23 | 
 24 | Tue Dec 20 21:04:04 2011  Google Inc. <opensource@google.com>
 25 | 
 26 | 	* sparsehash: version 1.12 release
 27 | 	* Add support for serializing/unserializing dense_hash_map/set to disk
 28 | 	* New simpler and more flexible serialization API
 29 | 	* Be more consistent about clearing on unserialize() even if it fails
 30 | 	* Quiet some compiler warnings about unused variables
 31 | 	* Add a timing test for iterating (suggested by google code issue 77)
 32 | 	* Add offset_to_pos, the opposite of pos_to_offset, to sparsetable
 33 | 	* PORTING: Add some missing #includes, needed on some systems
 34 | 	* Die at configure-time when g++ isn't installed
 35 | 	* Successfully make rpm's even when dpkg is missing
 36 | 	* Improve deleted key test in util/gtl/{dense,sparse}hashtable
 37 | 	* Update automake to 1.10.1, and autoconf to 2.62
 38 | 
 39 | Thu Jun 23 21:12:58 2011  Google Inc. <opensource@google.com>
 40 | 
 41 | 	* sparsehash: version 1.11 release
 42 | 	* Improve performance on pointer keys by ignoring always-0 low bits
 43 | 	* Fix missing $(top_srcdir) in Makefile.am, which broke some compiles
 44 | 	* BUGFIX: Fix a crashing typo-bug in swap()
 45 | 	* PORTING: Remove support for old compilers that do not use 'std'
 46 | 	* Add some new benchmarks to test for a place dense_hash_* does badly
 47 | 	* Some cosmetic changes due to a switch to a new releasing tool
 48 | 
 49 | Thu Jan 20 16:07:39 2011  Google Inc. <opensource@google.com>
 50 | 
 51 | 	* sparsehash: version 1.10 release
 52 | 	* Follow ExtractKey return type, allowing it to return a reference
 53 | 	* PORTING: fix MSVC 10 warnings (constifying result_type, placement-new)
 54 | 	* Update from autoconf 2.61 to autoconf 2.65
 55 | 	
 56 | Fri Sep 24 11:37:50 2010  Google Inc. <opensource@google.com>
 57 | 
 58 | 	* sparsehash: version 1.9 release
 59 | 	* Add is_enum; make all enums PODs by default (romanp)
 60 | 	* Make find_or_insert() usable directly (dawidk)
 61 | 	* Use zero-memory trick for allocators to reduce space use (guilin)
 62 | 	* Fix some compiler warnings (chandlerc, eraman)
 63 | 	* BUGFIX: int -> size_type in one function we missed (csilvers)
 64 | 	* Added sparsehash.pc, for pkg-config (csilvers)
 65 | 
 66 | Thu Jul 29 15:01:29 2010  Google Inc. <opensource@google.com>
 67 | 
 68 | 	* sparsehash: version 1.8.1 release
 69 | 	* Remove -Werror from Makefile: gcc 4.3 gives spurious warnings
 70 | 
 71 | Thu Jul 29 09:53:26 2010  Google Inc. <opensource@google.com>
 72 | 
 73 | 	* sparsehash: version 1.8 release
 74 | 	* More support for Allocator, including allocator ctor arg (csilvers)
  75 | 	* Repack hashtable vars to reduce container size *more* (giao)
 76 | 	* Speed up clear() (csilvers)
 77 | 	* Change HT_{OCCUPANCY,SHRINK}_FLT from float to int (csilvers)
 78 | 	* Revamp test suite for more complete code & timing coverage (csilvers)
 79 | 	* BUGFIX: Enforce max_size for dense/sparse_hashtable (giao, csilvers)
 80 | 	* BUGFIX: Raise exception instead of crashing on overflow (csilvers)
 81 | 	* BUGFIX: Allow extraneous const in key type (csilvers)
 82 | 	* BUGFIX: Allow same functor for both hasher and key_equals (giao)
 83 | 	* PORTING: remove is_convertible, which gives AIX cc fits (csilvers)
 84 | 	* PORTING: Renamed README.windows to README_windows.txt (csilvers)
 85 | 	* Created non-empty NEWS file (csilvers)
 86 | 
 87 | Wed Mar 31 12:32:03 2010  Google Inc. <opensource@google.com>
 88 | 
 89 | 	* sparsehash: version 1.7 release
 90 | 	* Add support for Allocator (guilin)
 91 | 	* Add libc_allocator_with_realloc as the new default allocator (guilin)
 92 | 	* Repack {sparse,dense}hashtable vars to reduce container size (giao)
 93 | 	* BUGFIX: operator== no longer requires same table ordering (csilvers)
 94 | 	* BUGFIX: fix dense_hash_*(it,it) by requiring empty-key too (csilvers)
 95 | 	* PORTING: fix language bugs that gcc allowed (csilvers, chandlerc)
 96 | 	* Update from autoconf 2.61 to autoconf 2.64
 97 | 
 98 | Fri Jan  8 14:47:55 2010  Google Inc. <opensource@google.com>
 99 | 
100 | 	* sparsehash: version 1.6 release
101 | 	* New accessor methods for deleted_key, empty_key (sjackman)
102 | 	* Use explicit hash functions in sparsehash tests (csilvers)
103 | 	* BUGFIX: Cast resize to fix SUNWspro bug (csilvers)
104 | 	* Check for sz overflow in min_size (csilvers)
105 | 	* Speed up clear() for dense and sparse hashtables (jeff)
106 | 	* Avoid shrinking in all cases when min-load is 0 (shaunj, csilvers)
107 | 	* Improve densehashtable code for the deleted key (gpike)
108 | 	* BUGFIX: Fix operator= when the 2 empty-keys differ (andreidam)
109 | 	* BUGFIX: Fix ht copying when empty-key isn't set (andreidam)
110 | 	* PORTING: Use TmpFile() instead of /tmp on MinGW (csilvers)
111 | 	* PORTING: Use filenames that work with Stratus VOS.
112 | 
113 | Tue May 12 14:16:38 2009  Google Inc. <opensource@google.com>
114 | 
115 | 	* sparsehash: version 1.5.2 release
116 | 	* Fix compile error: not initializing set_key in all constructors
117 | 
118 | Fri May  8 15:23:44 2009  Google Inc. <opensource@google.com>
119 | 
120 | 	* sparsehash: version 1.5.1 release
121 | 	* Fix broken equal_range() for all the hash-classes (csilvers)
122 | 
123 | Wed May  6 11:28:49 2009  Google Inc. <opensource@google.com>
124 | 
125 | 	* sparsehash: version 1.5 release
126 | 	* Support the tr1 unordered_map (and unordered_set) API (csilvers)
127 | 	* Store only key for delkey; reduces need for 0-arg c-tor (csilvers)
128 | 	* Prefer unordered_map to hash_map for the timing test (csilvers)
129 | 	* PORTING: update the resource use for 64-bit machines (csilvers)
130 | 	* PORTING: fix MIN/MAX collisions by un-#including windows.h (csilvers)
131 | 	* Updated autoconf version to 2.61 and libtool version to 1.5.26
132 | 
133 | Wed Jan 28 17:11:31 2009  Google Inc. <opensource@google.com>
134 | 
135 | 	* sparsehash: version 1.4 release
136 | 	* Allow hashtables to be <32 buckets (csilvers)
137 | 	* Fix initial-sizing bug: was sizing tables too small (csilvers)
138 | 	* Add asserts that clients don't abuse deleted/empty key (csilvers)
139 | 	* Improve determination of 32/64 bit for C code (csilvers)
140 | 	* Small fix for doc files in rpm (csilvers)
141 | 
142 | Thu Nov  6 15:06:09 2008  Google Inc. <opensource@google.com>
143 | 
144 | 	* sparsehash: version 1.3 release
145 | 	* Add an interface to change the parameters for resizing (myl)
146 | 	* Document another potentially good hash function (csilvers)
147 | 
148 | Thu Sep 18 13:53:20 2008  Google Inc. <opensource@google.com>
149 | 
150 | 	* sparsehash: version 1.2 release
151 | 	* Augment documentation to better describe namespace issues (csilvers)
152 | 	* BUG FIX: replace hash<> with SPARSEHASH_HASH, for windows (csilvers)
153 | 	* Add timing test to unittest to test repeated add+delete (csilvers)
154 | 	* Do better picking a new size when resizing (csilvers)
155 | 	* Use ::google instead of google as a namespace (csilvers)
156 | 	* Improve threading test at config time (csilvers)
157 | 
158 | Mon Feb 11 16:30:11 2008  Google Inc. <opensource@google.com>
159 | 
160 | 	* sparsehash: version 1.1 release
161 | 	* Fix brown-paper-bag bug in some constructors (rafferty)
162 | 	* Fix problem with variables shadowing member vars, add -Wshadow
163 | 	
164 | Thu Nov 29 11:44:38 2007  Google Inc. <opensource@google.com>
165 | 
166 | 	* sparsehash: version 1.0.2 release
167 | 	* Fix a final reference to hash<> to use SPARSEHASH_HASH<> instead.
168 | 	
169 | Wed Nov 14 08:47:48 2007  Google Inc. <opensource@google.com>
170 | 
171 | 	* sparsehash: version 1.0.1 release :-(
172 | 	* Remove an unnecessary (harmful) "#define hash" in windows' config.h
173 | 	
174 | Tue Nov 13 15:15:46 2007  Google Inc. <opensource@google.com>
175 | 
176 | 	* sparsehash: version 1.0 release!  We are now out of beta.
177 | 	* Clean up Makefile awk script to be more readable (csilvers)
178 | 	* Namespace fixes: use fewer #defines, move typedefs into namespace
179 | 	
180 | Fri Oct 12 12:35:24 2007  Google Inc. <opensource@google.com>
181 | 
182 | 	* sparsehash: version 0.9.1 release
183 | 	* Fix Makefile awk script to work on more architectures (csilvers)
184 | 	* Add test to test code in more 'real life' situations (csilvers)
185 | 
186 | Tue Oct  9 14:15:21 2007  Google Inc. <opensource@google.com>
187 | 
188 | 	* sparsehash: version 0.9 release
189 | 	* More type-hygiene improvements, especially for 64-bit (csilvers)
190 | 	* Some configure improvements to improve portability, utility (austern)
191 | 	* Small bugfix for operator== for dense_hash_map (jeff)
192 | 
193 | Tue Jul  3 12:55:04 2007  Google Inc. <opensource@google.com>
194 | 
195 | 	* sparsehash: version 0.8 release
196 | 	* Minor type-hygiene improvements: size_t for int, etc. (csilvers)
197 | 	* Porting improvements: tests pass on OS X, FreeBSD, Solaris (csilvers)
198 | 	* Full windows port!  VS solution provided for all unittests (csilvers)
199 | 
200 | Mon Jun 11 11:33:41 2007  Google Inc. <opensource@google.com>
201 | 
202 | 	* sparsehash: version 0.7 release
203 | 	* Syntax fixes to better support gcc 4.3 and VC++ 7 (mec, csilvers)
204 | 	* Improved windows/VC++ support (see README.windows) (csilvers)
205 | 	* Config improvements: better tcmalloc support and config.h (csilvers)
206 | 	* More robust with missing hash_map + nix 'trampoline' .h's (csilvers)
207 | 	* Support for STLport's hash_map/hash_fun locations (csilvers)
208 | 	* Add .m4 files to distribution; now all source is there (csilvers)
 209 | 	* Tiny modification of shrink-threshold to allow never-shrinking (amc)
210 | 	* Protect timing tests against aggressive optimizers (csilvers)
211 | 	* Extend time_hash_map to test bigger objects (csilvers)
212 | 	* Extend type-trait support to work with const objects (csilvers)
213 | 	* USER VISIBLE: speed up all code by replacing memmove with memcpy
214 | 	  (csilvers)
215 | 
216 | Tue Mar 20 17:29:34 2007  Google Inc. <opensource@google.com>
217 | 
218 | 	* sparsehash: version 0.6 release
219 | 	* Some improvement to type-traits (jyasskin)
220 | 	* Better timing results when google-perftools is installed (sanjay)
221 | 	* Updates and fixes to html documentation and README (csilvers)
222 | 	* A bit more careful about #includes (csilvers)
 223 | 	* Fix for typo that broke compilation on some systems (csilvers)
224 | 	* USER VISIBLE: New clear_no_resize() method added to dense_hash_map
225 |           (uszkoreit)
226 | 
227 | Sat Oct 21 13:47:47 2006  Google Inc. <opensource@google.com>
228 | 
229 | 	* sparsehash: version 0.5 release
230 | 	* Support uint16_t (SunOS) in addition to u_int16_t (BSD) (csilvers)
231 | 	* Get rid of UNDERSTANDS_ITERATOR_TAGS; everyone understands (csilvers)
232 | 	* Test that empty-key and deleted-key differ (rbayardo)
233 | 	* Fix example docs: strcmp needs to test for NULL (csilvers)
234 | 
235 | Sun Apr 23 22:42:35 2006  Google Inc. <opensource@google.com>
236 | 
237 | 	* sparsehash: version 0.4 release
238 | 	* Remove POD requirement for keys and values! (austern)
239 | 	* Add tr1-compatible type-traits system to speed up POD ops. (austern)
240 | 	* Fixed const-iterator bug where postfix ++ didn't compile. (csilvers)
241 | 	* Fixed iterator comparison bugs where <= was incorrect. (csilvers)
242 | 	* Clean up config.h to keep its #defines from conflicting. (csilvers)
243 | 	* Big documentation sweep and cleanup. (csilvers)
244 | 	* Update documentation to talk more about good hash fns. (csilvers)
245 | 	* Fixes to compile on MSVC (working around some MSVC bugs). (rennie)
246 | 	* Avoid resizing hashtable on operator[] lookups (austern)
247 | 
248 | Thu Nov  3 20:12:31 2005  Google Inc. <opensource@google.com>
249 | 
250 | 	* sparsehash: version 0.3 release
251 | 	* Quiet compiler warnings on some compilers. (csilvers)
252 | 	* Some documentation fixes: example code for dense_hash_map. (csilvers)
253 | 	* Fix a bug where swap() wasn't swapping delete_key(). (csilvers)
254 | 	* set_deleted_key() and set_empty_key() now take a key only,
255 | 	  allowing hash-map values to be forward-declared. (csilvers)
256 | 	* support for std::insert_iterator (and std::inserter). (csilvers)
257 | 
258 | Mon May  2 07:04:46 2005  Google Inc. <opensource@google.com>
259 | 
260 | 	* sparsehash: version 0.2 release
261 | 	* Preliminary support for msvc++ compilation. (csilvers)
262 | 	* Documentation fixes -- some example code was incomplete! (csilvers)
263 | 	* Minimize size of config.h to avoid other-package conflicts (csilvers)
264 | 	* Contribute a C-based version of sparsehash that served as the
265 | 	  inspiration for this code.  One day, I hope to clean it up and
266 | 	  support it, but for now it's just in experimental/, for playing
267 | 	  around with. (csilvers)
268 | 	* Change default namespace from std to google. (csilvers)
269 | 
270 | Fri Jan 14 16:53:32 2005  Google Inc. <opensource@google.com>
271 | 
272 | 	* sparsehash: initial release:
273 | 	  The sparsehash package contains several hash-map implementations,
274 | 	  similar in API to SGI's hash_map class, but with different
275 | 	  performance characteristics.  sparse_hash_map uses very little
276 | 	  space overhead: 1-2 bits per entry.  dense_hash_map is typically
277 | 	  faster than the default SGI STL implementation.  This package
278 | 	  also includes hash-set analogues of these classes.
279 | 
280 | 


--------------------------------------------------------------------------------
/3rd-party/sparsehash/Makefile.in:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cslarsen/arv/3999e00361f13d404d0d86d76bcedd4d8d49c393/3rd-party/sparsehash/Makefile.in


--------------------------------------------------------------------------------
/3rd-party/sparsehash/README:
--------------------------------------------------------------------------------
  1 | This directory contains several hash-map implementations, similar in
  2 | API to SGI's hash_map class, but with different performance
  3 | characteristics.  sparse_hash_map uses very little space overhead, 1-2
  4 | bits per entry.  dense_hash_map is very fast, particularly on lookup.
  5 | (sparse_hash_set and dense_hash_set are the set versions of these
  6 | routines.)  On the other hand, these classes have requirements that
  7 | may not make them appropriate for all applications.
  8 | 
  9 | All these implementations use a hashtable with internal quadratic
 10 | probing.  This method is space-efficient -- there is no pointer
 11 | overhead -- and time-efficient for good hash functions.
 12 | 
 13 | COMPILING
 14 | ---------
 15 | To compile test applications with these classes, run ./configure
 16 | followed by make.  To install these header files on your system, run
 17 | 'make install'.  (On Windows, the instructions are different; see
 18 | README_windows.txt.)  See INSTALL for more details.
 19 | 
 20 | This code should work on any modern C++ system.  It has been tested on
 21 | Linux (Ubuntu, Fedora, RedHat, Debian), Solaris 10 x86, FreeBSD 6.0,
 22 | OS X 10.3 and 10.4, and Windows under both VC++7 and VC++8.
 23 | 
 24 | USING
 25 | -----
 26 | See the html files in the doc directory for small example programs
 27 | that use these classes.  It's enough to just include the header file:
 28 | 
 29 |    #include <sparsehash/sparse_hash_map> // or sparse_hash_set, dense_hash_map, ...
 30 |    google::sparse_hash_map<int, int> number_mapper;
 31 | 
 32 | and use the class the way you would other hash-map implementations.
 33 | (Though see "API" below for caveats.)
 34 | 
 35 | By default (you can change it via a flag to ./configure), these hash
 36 | implementations are defined in the google namespace.
 37 | 
 38 | API
 39 | ---
 40 | The API for sparse_hash_map, dense_hash_map, sparse_hash_set, and
 41 | dense_hash_set is a superset of the API of SGI's hash_map class.
 42 | See doc/sparse_hash_map.html, et al., for more information about the
 43 | API.
 44 | 
 45 | The usage of these classes differs from SGI's hash_map, and other
 46 | hashtable implementations, in the following major ways:
 47 | 
 48 | 1) dense_hash_map requires you to set aside one key value as the
 49 |    'empty bucket' value, set via the set_empty_key() method.  This
 50 |    *MUST* be called before you can use the dense_hash_map.  It is
 51 |    illegal to insert any elements into a dense_hash_map whose key is
 52 |    equal to the empty-key.
 53 | 
 54 | 2) For both dense_hash_map and sparse_hash_map, if you wish to delete
 55 |    elements from the hashtable, you must set aside a key value as the
 56 |    'deleted bucket' value, set via the set_deleted_key() method.  If
 57 |    your hash-map is insert-only, there is no need to call this
 58 |    method.  If you call set_deleted_key(), it is illegal to insert any
 59 |    elements into a dense_hash_map or sparse_hash_map whose key is
 60 |    equal to the deleted-key.
 61 | 
 62 | 3) These hash-map implementations support I/O.  See below.
 63 | 
 64 | There are also some smaller differences:
 65 | 
 66 | 1) The constructor takes an optional argument that specifies the
 67 |    number of elements you expect to insert into the hashtable.  This
 68 |    differs from SGI's hash_map implementation, which takes an optional
 69 |    number of buckets.
 70 | 
 71 | 2) erase() does not immediately reclaim memory.  As a consequence,
 72 |    erase() does not invalidate any iterators, making loops like this
 73 |    correct:
 74 |       for (it = ht.begin(); it != ht.end(); ++it)
 75 |         if (...) ht.erase(it);
 76 |    As another consequence, a series of erase() calls can leave your
 77 |    hashtable using more memory than it needs to.  The hashtable will
 78 |    automatically compact at the next call to insert(), but to
 79 |    manually compact a hashtable, you can call
 80 |       ht.resize(0)
 81 | 
 82 | I/O
 83 | ---
 84 | In addition to the normal hash-map operations, sparse_hash_map can
 85 | read and write hashtables to disk.  (dense_hash_map also has the API,
 86 | but it has not yet been implemented, and writes will always fail.)
 87 | 
 88 | In the simplest case, writing a hashtable is as easy as calling two
 89 | methods on the hashtable:
 90 |    ht.write_metadata(fp);
 91 |    ht.write_nopointer_data(fp);
 92 | 
 93 | Reading in this data is equally simple:
 94 |    google::sparse_hash_map<...> ht;
 95 |    ht.read_metadata(fp);
 96 |    ht.read_nopointer_data(fp);
 97 | 
 98 | The above is sufficient if the key and value do not contain any
 99 | pointers: they are basic C types or agglomerations of basic C types.
100 | If the key and/or value do contain pointers, you can still store the
101 | hashtable by replacing write_nopointer_data() with a custom writing
102 | routine.  See sparse_hash_map.html et al. for more information.
103 | 
104 | SPARSETABLE
105 | -----------
106 | In addition to the hash-map and hash-set classes, this package also
107 | provides sparsetable.h, an array implementation that uses space
108 | proportional to the number of elements in the array, rather than the
109 | maximum element index.  It uses very little space overhead: 2 to 5
110 | bits per entry.  See doc/sparsetable.html for the API.
111 | 
112 | RESOURCE USAGE
113 | --------------
114 | * sparse_hash_map has memory overhead of about 4 to 10 bits per 
115 |   hash-map entry, assuming a typical average occupancy of 50%.
116 | * dense_hash_map has a factor of 2-3 memory overhead: if your
117 |   hashtable data takes X bytes, dense_hash_map will use 3X-4X memory
118 |   total.
119 | 
120 | Hashtables tend to double in size when resizing, creating an
121 | additional 50% space overhead.  dense_hash_map does in fact have a
122 | significant "high water mark" memory use requirement, which is 6 times
123 | the size of hash entries in the table when resizing (when reaching 
124 | 50% occupancy, the table resizes to double the previous size, and the 
125 | old table (2x) is copied to the new table (4x)).
126 | 
127 | sparse_hash_map, however, is written to need very little space
128 | overhead when resizing: only a few bits per hashtable entry.
129 | 
130 | PERFORMANCE
131 | -----------
132 | You can compile and run the included file time_hash_map.cc to examine
133 | the performance of sparse_hash_map, dense_hash_map, and your native
134 | hash_map implementation on your system.  One test against the
135 | SGI hash_map implementation gave the following timing information for
136 | a simple find() call:
137 |    SGI hash_map:     22 ns
138 |    dense_hash_map:   13 ns
139 |    sparse_hash_map: 117 ns
140 |    SGI map:         113 ns
141 | 
142 | See doc/performance.html for more detailed charts on resource usage
143 | and performance data.
144 | 
145 | ---
146 | 16 March 2005
147 | (Last updated: 12 September 2010)
148 | 


--------------------------------------------------------------------------------
/3rd-party/sparsehash/m4/acx_pthread.m4:
--------------------------------------------------------------------------------
  1 | # This was retrieved from
  2 | #    http://svn.0pointer.de/viewvc/trunk/common/acx_pthread.m4?revision=1277&root=avahi
  3 | # See also (perhaps for new versions?)
  4 | #    http://svn.0pointer.de/viewvc/trunk/common/acx_pthread.m4?root=avahi
  5 | #
  6 | # We've rewritten the inconsistency check code (from avahi), to work
  7 | # more broadly.  In particular, it no longer assumes ld accepts -zdefs.
  8 | # This caused a restructuring of the code, but the functionality has only
  9 | # changed a little.
 10 | 
 11 | dnl @synopsis ACX_PTHREAD([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]])
 12 | dnl
 13 | dnl @summary figure out how to build C programs using POSIX threads
 14 | dnl
 15 | dnl This macro figures out how to build C programs using POSIX threads.
 16 | dnl It sets the PTHREAD_LIBS output variable to the threads library and
 17 | dnl linker flags, and the PTHREAD_CFLAGS output variable to any special
 18 | dnl C compiler flags that are needed. (The user can also force certain
 19 | dnl compiler flags/libs to be tested by setting these environment
 20 | dnl variables.)
 21 | dnl
 22 | dnl Also sets PTHREAD_CC to any special C compiler that is needed for
 23 | dnl multi-threaded programs (defaults to the value of CC otherwise).
 24 | dnl (This is necessary on AIX to use the special cc_r compiler alias.)
 25 | dnl
 26 | dnl NOTE: You are assumed to not only compile your program with these
 27 | dnl flags, but also link it with them as well. e.g. you should link
 28 | dnl with $PTHREAD_CC $CFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS
 29 | dnl $LIBS
 30 | dnl
 31 | dnl If you are only building threads programs, you may wish to use
 32 | dnl these variables in your default LIBS, CFLAGS, and CC:
 33 | dnl
 34 | dnl        LIBS="$PTHREAD_LIBS $LIBS"
 35 | dnl        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
 36 | dnl        CC="$PTHREAD_CC"
 37 | dnl
 38 | dnl In addition, if the PTHREAD_CREATE_JOINABLE thread-attribute
 39 | dnl constant has a nonstandard name, defines PTHREAD_CREATE_JOINABLE to
 40 | dnl that name (e.g. PTHREAD_CREATE_UNDETACHED on AIX).
 41 | dnl
 42 | dnl ACTION-IF-FOUND is a list of shell commands to run if a threads
 43 | dnl library is found, and ACTION-IF-NOT-FOUND is a list of commands to
 44 | dnl run if it is not found. If ACTION-IF-FOUND is not specified, the
 45 | dnl default action will define HAVE_PTHREAD.
 46 | dnl
 47 | dnl Please let the authors know if this macro fails on any platform, or
 48 | dnl if you have any other suggestions or comments. This macro was based
 49 | dnl on work by SGJ on autoconf scripts for FFTW (www.fftw.org) (with
 50 | dnl help from M. Frigo), as well as ac_pthread and hb_pthread macros
 51 | dnl posted by Alejandro Forero Cuervo to the autoconf macro repository.
 52 | dnl We are also grateful for the helpful feedback of numerous users.
 53 | dnl
 54 | dnl @category InstalledPackages
 55 | dnl @author Steven G. Johnson <stevenj@alum.mit.edu>
 56 | dnl @version 2006-05-29
 57 | dnl @license GPLWithACException
 58 | dnl 
 59 | dnl Checks for GCC shared/pthread inconsistency based on work by
 60 | dnl Marcin Owsiany <marcin@owsiany.pl>
 61 | 
 62 | 
 63 | AC_DEFUN([ACX_PTHREAD], [
 64 | AC_REQUIRE([AC_CANONICAL_HOST])
 65 | AC_LANG_SAVE
 66 | AC_LANG_C
 67 | acx_pthread_ok=no
 68 | 
 69 | # We used to check for pthread.h first, but this fails if pthread.h
 70 | # requires special compiler flags (e.g. on True64 or Sequent).
 71 | # It gets checked for in the link test anyway.
 72 | 
 73 | # First of all, check if the user has set any of the PTHREAD_LIBS,
 74 | # etcetera environment variables, and if threads linking works using
 75 | # them:
 76 | if test x"$PTHREAD_LIBS$PTHREAD_CFLAGS" != x; then
 77 |         save_CFLAGS="$CFLAGS"
 78 |         CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
 79 |         save_LIBS="$LIBS"
 80 |         LIBS="$PTHREAD_LIBS $LIBS"
 81 |         AC_MSG_CHECKING([for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS])
 82 |         AC_TRY_LINK_FUNC(pthread_join, acx_pthread_ok=yes)
 83 |         AC_MSG_RESULT($acx_pthread_ok)
 84 |         if test x"$acx_pthread_ok" = xno; then
 85 |                 PTHREAD_LIBS=""
 86 |                 PTHREAD_CFLAGS=""
 87 |         fi
 88 |         LIBS="$save_LIBS"
 89 |         CFLAGS="$save_CFLAGS"
 90 | fi
 91 | 
 92 | # We must check for the threads library under a number of different
 93 | # names; the ordering is very important because some systems
 94 | # (e.g. DEC) have both -lpthread and -lpthreads, where one of the
 95 | # libraries is broken (non-POSIX).
 96 | 
 97 | # Create a list of thread flags to try.  Items starting with a "-" are
 98 | # C compiler flags, and other items are library names, except for "none"
 99 | # which indicates that we try without any flags at all, and "pthread-config"
100 | # which is a program returning the flags for the Pth emulation library.
101 | 
102 | acx_pthread_flags="pthreads none -Kthread -kthread lthread -pthread -pthreads -mthreads pthread --thread-safe -mt pthread-config"
103 | 
104 | # The ordering *is* (sometimes) important.  Some notes on the
105 | # individual items follow:
106 | 
107 | # pthreads: AIX (must check this before -lpthread)
108 | # none: in case threads are in libc; should be tried before -Kthread and
109 | #       other compiler flags to prevent continual compiler warnings
110 | # -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h)
111 | # -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able)
112 | # lthread: LinuxThreads port on FreeBSD (also preferred to -pthread)
113 | # -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads)
114 | # -pthreads: Solaris/gcc
115 | # -mthreads: Mingw32/gcc, Lynx/gcc
116 | # -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it
117 | #      doesn't hurt to check since this sometimes defines pthreads too;
118 | #      also defines -D_REENTRANT)
119 | #      ... -mt is also the pthreads flag for HP/aCC
120 | # pthread: Linux, etcetera
121 | # --thread-safe: KAI C++
122 | # pthread-config: use pthread-config program (for GNU Pth library)
123 | 
124 | case "${host_cpu}-${host_os}" in
125 |         *solaris*)
126 | 
127 |         # On Solaris (at least, for some versions), libc contains stubbed
128 |         # (non-functional) versions of the pthreads routines, so link-based
129 |         # tests will erroneously succeed.  (We need to link with -pthreads/-mt/
130 |         # -lpthread.)  (The stubs are missing pthread_cleanup_push, or rather
131 |         # a function called by this macro, so we could check for that, but
132 |         # who knows whether they'll stub that too in a future libc.)  So,
133 |         # we'll just look for -pthreads and -lpthread first:
134 | 
135 |         acx_pthread_flags="-pthreads pthread -mt -pthread $acx_pthread_flags"
136 |         ;;
137 | esac
138 | 
139 | if test x"$acx_pthread_ok" = xno; then
140 | for flag in $acx_pthread_flags; do
141 | 
142 |         case $flag in
143 |                 none)
144 |                 AC_MSG_CHECKING([whether pthreads work without any flags])
145 |                 ;;
146 | 
147 |                 -*)
148 |                 AC_MSG_CHECKING([whether pthreads work with $flag])
149 |                 PTHREAD_CFLAGS="$flag"
150 |                 ;;
151 | 
152 | 		pthread-config)
153 | 		AC_CHECK_PROG(acx_pthread_config, pthread-config, yes, no)
154 | 		if test x"$acx_pthread_config" = xno; then continue; fi
155 | 		PTHREAD_CFLAGS="`pthread-config --cflags`"
156 | 		PTHREAD_LIBS="`pthread-config --ldflags` `pthread-config --libs`"
157 | 		;;
158 | 
159 |                 *)
160 |                 AC_MSG_CHECKING([for the pthreads library -l$flag])
161 |                 PTHREAD_LIBS="-l$flag"
162 |                 ;;
163 |         esac
164 | 
165 |         save_LIBS="$LIBS"
166 |         save_CFLAGS="$CFLAGS"
167 |         LIBS="$PTHREAD_LIBS $LIBS"
168 |         CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
169 | 
170 |         # Check for various functions.  We must include pthread.h,
171 |         # since some functions may be macros.  (On the Sequent, we
172 |         # need a special flag -Kthread to make this header compile.)
173 |         # We check for pthread_join because it is in -lpthread on IRIX
174 |         # while pthread_create is in libc.  We check for pthread_attr_init
175 |         # due to DEC craziness with -lpthreads.  We check for
176 |         # pthread_cleanup_push because it is one of the few pthread
177 |         # functions on Solaris that doesn't have a non-functional libc stub.
178 |         # We try pthread_create on general principles.
179 |         AC_TRY_LINK([#include <pthread.h>],
180 |                     [pthread_t th; pthread_join(th, 0);
181 |                      pthread_attr_init(0); pthread_cleanup_push(0, 0);
182 |                      pthread_create(0,0,0,0); pthread_cleanup_pop(0); ],
183 |                     [acx_pthread_ok=yes])
184 | 
185 |         LIBS="$save_LIBS"
186 |         CFLAGS="$save_CFLAGS"
187 | 
188 |         AC_MSG_RESULT($acx_pthread_ok)
189 |         if test "x$acx_pthread_ok" = xyes; then
190 |                 break;
191 |         fi
192 | 
193 |         PTHREAD_LIBS=""
194 |         PTHREAD_CFLAGS=""
195 | done
196 | fi
197 | 
198 | # Various other checks:
199 | if test "x$acx_pthread_ok" = xyes; then
200 |         save_LIBS="$LIBS"
201 |         LIBS="$PTHREAD_LIBS $LIBS"
202 |         save_CFLAGS="$CFLAGS"
203 |         CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
204 | 
205 |         # Detect AIX lossage: JOINABLE attribute is called UNDETACHED.
206 | 	AC_MSG_CHECKING([for joinable pthread attribute])
207 | 	attr_name=unknown
208 | 	for attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do
209 | 	    AC_TRY_LINK([#include <pthread.h>], [int attr=$attr; return attr;],
210 |                         [attr_name=$attr; break])
211 | 	done
212 |         AC_MSG_RESULT($attr_name)
213 |         if test "$attr_name" != PTHREAD_CREATE_JOINABLE; then
214 |             AC_DEFINE_UNQUOTED(PTHREAD_CREATE_JOINABLE, $attr_name,
215 |                                [Define to necessary symbol if this constant
216 |                                 uses a non-standard name on your system.])
217 |         fi
218 | 
219 |         AC_MSG_CHECKING([if more special flags are required for pthreads])
220 |         flag=no
221 |         case "${host_cpu}-${host_os}" in
222 |             *-aix* | *-freebsd* | *-darwin*) flag="-D_THREAD_SAFE";;
223 |             *solaris* | *-osf* | *-hpux*) flag="-D_REENTRANT";;
224 |         esac
225 |         AC_MSG_RESULT(${flag})
226 |         if test "x$flag" != xno; then
227 |             PTHREAD_CFLAGS="$flag $PTHREAD_CFLAGS"
228 |         fi
229 | 
230 |         LIBS="$save_LIBS"
231 |         CFLAGS="$save_CFLAGS"
232 |         # More AIX lossage: must compile with xlc_r or cc_r
233 | 	if test x"$GCC" != xyes; then
234 |           AC_CHECK_PROGS(PTHREAD_CC, xlc_r cc_r, ${CC})
235 |         else
236 |           PTHREAD_CC=$CC
237 | 	fi
238 | 
239 | 	# The next part tries to detect GCC inconsistency with -shared on some
240 | 	# architectures and systems. The problem is that in certain
241 | 	# configurations, when -shared is specified, GCC "forgets" to
242 | 	# internally use various flags which are still necessary.
243 | 	
244 | 	#
245 | 	# Prepare the flags
246 | 	#
247 | 	save_CFLAGS="$CFLAGS"
248 | 	save_LIBS="$LIBS"
249 | 	save_CC="$CC"
250 | 	
251 | 	# Try with the flags determined by the earlier checks.
252 | 	#
253 | 	# -Wl,-z,defs forces link-time symbol resolution, so that the
254 | 	# linking checks with -shared actually have any value
255 | 	#
256 | 	# FIXME: -fPIC is required for -shared on many architectures,
257 | 	# so we specify it here, but the right way would probably be to
258 | 	# properly detect whether it is actually required.
259 | 	CFLAGS="-shared -fPIC -Wl,-z,defs $CFLAGS $PTHREAD_CFLAGS"
260 | 	LIBS="$PTHREAD_LIBS $LIBS"
261 | 	CC="$PTHREAD_CC"
262 | 	
263 | 	# In order not to create several levels of indentation, we test
264 | 	# the value of "$done" until we find the cure or run out of ideas.
265 | 	done="no"
266 | 	
267 | 	# First, make sure the CFLAGS we added are actually accepted by our
268 | 	# compiler.  If not (and OS X's ld, for instance, does not accept -z),
269 | 	# then we can't do this test.
270 | 	if test x"$done" = xno; then
271 | 	   AC_MSG_CHECKING([whether to check for GCC pthread/shared inconsistencies])
272 | 	   AC_TRY_LINK(,, , [done=yes])
273 | 	
274 | 	   if test "x$done" = xyes ; then
275 | 	      AC_MSG_RESULT([no])
276 | 	   else
277 | 	      AC_MSG_RESULT([yes])
278 | 	   fi
279 | 	fi
280 | 	
281 | 	if test x"$done" = xno; then
282 | 	   AC_MSG_CHECKING([whether -pthread is sufficient with -shared])
283 | 	   AC_TRY_LINK([#include <pthread.h>],
284 | 	      [pthread_t th; pthread_join(th, 0);
285 | 	      pthread_attr_init(0); pthread_cleanup_push(0, 0);
286 | 	      pthread_create(0,0,0,0); pthread_cleanup_pop(0); ],
287 | 	      [done=yes])
288 | 	   
289 | 	   if test "x$done" = xyes; then
290 | 	      AC_MSG_RESULT([yes])
291 | 	   else
292 | 	      AC_MSG_RESULT([no])
293 | 	   fi
294 | 	fi
295 | 	
296 | 	#
297 | 	# Linux gcc on some architectures such as mips/mipsel forgets
298 | 	# about -lpthread
299 | 	#
300 | 	if test x"$done" = xno; then
301 | 	   AC_MSG_CHECKING([whether -lpthread fixes that])
302 | 	   LIBS="-lpthread $PTHREAD_LIBS $save_LIBS"
303 | 	   AC_TRY_LINK([#include <pthread.h>],
304 | 	      [pthread_t th; pthread_join(th, 0);
305 | 	      pthread_attr_init(0); pthread_cleanup_push(0, 0);
306 | 	      pthread_create(0,0,0,0); pthread_cleanup_pop(0); ],
307 | 	      [done=yes])
308 | 	
309 | 	   if test "x$done" = xyes; then
310 | 	      AC_MSG_RESULT([yes])
311 | 	      PTHREAD_LIBS="-lpthread $PTHREAD_LIBS"
312 | 	   else
313 | 	      AC_MSG_RESULT([no])
314 | 	   fi
315 | 	fi
316 | 	#
317 | 	# FreeBSD 4.10 gcc forgets to use -lc_r instead of -lc
318 | 	#
319 | 	if test x"$done" = xno; then
320 | 	   AC_MSG_CHECKING([whether -lc_r fixes that])
321 | 	   LIBS="-lc_r $PTHREAD_LIBS $save_LIBS"
322 | 	   AC_TRY_LINK([#include <pthread.h>],
323 | 	       [pthread_t th; pthread_join(th, 0);
324 | 	        pthread_attr_init(0); pthread_cleanup_push(0, 0);
325 | 	        pthread_create(0,0,0,0); pthread_cleanup_pop(0); ],
326 | 	       [done=yes])
327 | 	
328 | 	   if test "x$done" = xyes; then
329 | 	      AC_MSG_RESULT([yes])
330 | 	      PTHREAD_LIBS="-lc_r $PTHREAD_LIBS"
331 | 	   else
332 | 	      AC_MSG_RESULT([no])
333 | 	   fi
334 | 	fi
335 | 	if test x"$done" = xno; then
336 | 	   # OK, we have run out of ideas
337 | 	   AC_MSG_WARN([Impossible to determine how to use pthreads with shared libraries])
338 | 	
339 | 	   # so it's not safe to assume that we may use pthreads
340 | 	   acx_pthread_ok=no
341 | 	fi
342 | 	
343 | 	AC_MSG_CHECKING([whether what we have so far is sufficient with -nostdlib])
344 | 	CFLAGS="-nostdlib $CFLAGS"
345 | 	# we need c with nostdlib
346 | 	LIBS="$LIBS -lc"
347 | 	AC_TRY_LINK([#include <pthread.h>],
348 | 	      [pthread_t th; pthread_join(th, 0);
349 | 	       pthread_attr_init(0); pthread_cleanup_push(0, 0);
350 | 	       pthread_create(0,0,0,0); pthread_cleanup_pop(0); ],
351 | 	      [done=yes],[done=no])
352 | 
353 | 	if test "x$done" = xyes; then
354 | 	   AC_MSG_RESULT([yes])
355 | 	else
356 | 	   AC_MSG_RESULT([no])
357 | 	fi
358 | 	
359 | 	if test x"$done" = xno; then
360 | 	   AC_MSG_CHECKING([whether -lpthread saves the day])
361 | 	   LIBS="-lpthread $LIBS"
362 | 	   AC_TRY_LINK([#include <pthread.h>],
363 | 	      [pthread_t th; pthread_join(th, 0);
364 | 	       pthread_attr_init(0); pthread_cleanup_push(0, 0);
365 | 	       pthread_create(0,0,0,0); pthread_cleanup_pop(0); ],
366 | 	      [done=yes],[done=no])
367 | 
368 | 	   if test "x$done" = xyes; then
369 | 	      AC_MSG_RESULT([yes])
370 | 	      PTHREAD_LIBS="$PTHREAD_LIBS -lpthread"
371 | 	   else
372 | 	      AC_MSG_RESULT([no])
373 | 	      AC_MSG_WARN([Impossible to determine how to use pthreads with shared libraries and -nostdlib])
374 | 	   fi
375 | 	fi
376 | 
377 | 	CFLAGS="$save_CFLAGS"
378 | 	LIBS="$save_LIBS"
379 | 	CC="$save_CC"
380 | else
381 |         PTHREAD_CC="$CC"
382 | fi
383 | 
384 | AC_SUBST(PTHREAD_LIBS)
385 | AC_SUBST(PTHREAD_CFLAGS)
386 | AC_SUBST(PTHREAD_CC)
387 | 
388 | # Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
389 | if test x"$acx_pthread_ok" = xyes; then
390 |         ifelse([$1],,AC_DEFINE(HAVE_PTHREAD,1,[Define if you have POSIX threads libraries and header files.]),[$1])
391 |         :
392 | else
393 |         acx_pthread_ok=no
394 |         $2
395 | fi
396 | AC_LANG_RESTORE
397 | ])dnl ACX_PTHREAD
398 | 


--------------------------------------------------------------------------------
/3rd-party/sparsehash/m4/google_namespace.m4:
--------------------------------------------------------------------------------
 1 | # AC_DEFINE_GOOGLE_NAMESPACE(DEFAULT): choose the C++ namespace that holds
 2 | # our classes.  $1 is the default used when --enable-namespace isn't present.
 3 | 
 4 | # $1 is normally 'google', which keeps all our exported symbols in a unique
 5 | # namespace that is unlikely to clash with anyone else's.  A different
 6 | # default, like 'std', can make sense in some cases -- for instance, when
 7 | # publishing stl-like code.
 8 | 
 9 | # Invariant: GOOGLE_NAMESPACE is either the empty string or starts with ::.
10 | # GOOGLE_NAMESPACE::foo therefore always names the foo that really lives in
11 | # the chosen namespace, never a foo from some other namespace that name
12 | # lookup might otherwise pick.  _START_GOOGLE_NAMESPACE_ and
13 | # _END_GOOGLE_NAMESPACE_ are defined to open and close that namespace.
14 | 
15 | AC_DEFUN([AC_DEFINE_GOOGLE_NAMESPACE],
16 |   [google_namespace_default=[$1]
17 |    AC_ARG_ENABLE(namespace, [  --enable-namespace=FOO to define these Google
18 |                              classes in the FOO namespace. --disable-namespace
19 |                              to define them in the global namespace. Default
20 |                              is to define them in namespace $1.],
21 |                  [case "$enableval" in
22 |                      no) google_namespace="" ;;
23 |                     yes) google_namespace="$google_namespace_default" ;;
24 |                       *) google_namespace="$enableval" ;;
25 |                   esac],
26 |                  [google_namespace="$google_namespace_default"])
27 |    if test -z "$google_namespace"; then
28 |      ac_google_namespace=""
29 |      ac_google_start_namespace=""
30 |      ac_google_end_namespace=""
31 |    else
32 |      ac_google_namespace="::$google_namespace"
33 |      ac_google_start_namespace="namespace $google_namespace {"
34 |      ac_google_end_namespace="}"
35 |    fi
36 |    AC_DEFINE_UNQUOTED([GOOGLE_NAMESPACE], [$ac_google_namespace],
37 |                       [Namespace for Google classes])
38 |    AC_DEFINE_UNQUOTED([_START_GOOGLE_NAMESPACE_], [$ac_google_start_namespace],
39 |                       [Puts following code inside the Google namespace])
40 |    AC_DEFINE_UNQUOTED([_END_GOOGLE_NAMESPACE_], [$ac_google_end_namespace],
41 |                       [Stops putting the code inside the Google namespace])
42 | ])
43 | 


--------------------------------------------------------------------------------
/3rd-party/sparsehash/m4/namespaces.m4:
--------------------------------------------------------------------------------
 1 | # AC_CXX_NAMESPACES: test (and cache) whether the C++ compiler implements namespaces.
 2 | AC_DEFUN([AC_CXX_NAMESPACES],
 3 | [AC_CACHE_CHECK([whether the compiler implements namespaces],
 4 |   [ac_cv_cxx_namespaces],
 5 |   [AC_LANG_SAVE
 6 |    AC_LANG_CPLUSPLUS
 7 |    AC_TRY_COMPILE(
 8 |      [namespace Outer {
 9 |         namespace Inner { int i = 0; }}],
10 |      [using namespace Outer::Inner; return i;],
11 |      [ac_cv_cxx_namespaces=yes], [ac_cv_cxx_namespaces=no])
12 |    AC_LANG_RESTORE])
13 |  if test yes = "$ac_cv_cxx_namespaces"; then
14 |    AC_DEFINE([HAVE_NAMESPACES], [1], [define if the compiler implements namespaces])
15 |  fi])
16 | 


--------------------------------------------------------------------------------
/3rd-party/sparsehash/m4/stl_hash.m4:
--------------------------------------------------------------------------------
 1 | # We check two things: where the include file is for
 2 | # unordered_map/hash_map (we prefer the first form), and what
 3 | # namespace unordered/hash_map lives in within that include file.  We
 4 | # include AC_TRY_COMPILE for all the combinations we've seen in the
 5 | # wild.  We define HASH_MAP_H to the location of the header file, and
 6 | # HASH_NAMESPACE to the namespace the class (unordered_map or
 7 | # hash_map) is in.  We define HAVE_UNORDERED_MAP if the class we found
 8 | # is named unordered_map, or leave it undefined if not.
 9 | 
10 | # This also checks if unordered map exists.  The language that was current
11 | # before the macro ran is restored once the compile tests are done.
12 | AC_DEFUN([AC_CXX_STL_HASH],
13 |   [AC_REQUIRE([AC_CXX_NAMESPACES])
14 |    AC_MSG_CHECKING(the location of hash_map)
15 |    AC_LANG_SAVE
16 |    AC_LANG_CPLUSPLUS
17 |    ac_cv_cxx_hash_map=""
18 |    # First try unordered_map, but not on gcc's before 4.2 -- I've
19 |    # seen unexplainable unordered_map bugs with -O2 on older gcc's.
20 |    AC_TRY_COMPILE([#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
21 |                    # error GCC too old for unordered_map
22 |                    #endif
23 | 		   ],
24 |                    [/* no program body necessary */],
25 | 		   [stl_hash_old_gcc=no],
26 |                    [stl_hash_old_gcc=yes])
27 |    for location in unordered_map tr1/unordered_map; do
28 |      for namespace in std std::tr1; do
29 |        # Two test invocations joined with && -- test's -a operator is not portable.
30 |        if test -z "$ac_cv_cxx_hash_map" && test "$stl_hash_old_gcc" != yes; then
31 |          # Some older gcc's have a buggy tr1, so test a bit of code.
32 |          AC_TRY_COMPILE([#include <$location>],
33 |                         [const ${namespace}::unordered_map<int, int> t;
34 |                          return t.find(5) == t.end();],
35 |                         [ac_cv_cxx_hash_map="<$location>";
36 |                          ac_cv_cxx_hash_namespace="$namespace";
37 | 			 ac_cv_cxx_have_unordered_map="yes";])
38 |        fi
39 |      done
40 |    done
41 |    # Now try hash_map
42 |    for location in ext/hash_map hash_map; do
43 |      for namespace in __gnu_cxx "" std stdext; do
44 |        if test -z "$ac_cv_cxx_hash_map"; then
45 |          AC_TRY_COMPILE([#include <$location>],
46 |                         [${namespace}::hash_map<int, int> t],
47 |                         [ac_cv_cxx_hash_map="<$location>";
48 |                          ac_cv_cxx_hash_namespace="$namespace";
49 | 			 ac_cv_cxx_have_unordered_map="no";])
50 |        fi
51 |      done
52 |    done
53 |    # All compile tests are done; go back to the language in effect on entry.
54 |    AC_LANG_RESTORE
55 |    ac_cv_cxx_hash_set=`echo "$ac_cv_cxx_hash_map" | sed s/map/set/`;
56 |    if test -n "$ac_cv_cxx_hash_map"; then
57 |       AC_DEFINE(HAVE_HASH_MAP, 1, [define if the compiler has hash_map])
58 |       AC_DEFINE(HAVE_HASH_SET, 1, [define if the compiler has hash_set])
59 |       AC_DEFINE_UNQUOTED(HASH_MAP_H,$ac_cv_cxx_hash_map,
60 |                          [the location of <unordered_map> or <hash_map>])
61 |       AC_DEFINE_UNQUOTED(HASH_SET_H,$ac_cv_cxx_hash_set,
62 |                          [the location of <unordered_set> or <hash_set>])
63 |       AC_DEFINE_UNQUOTED(HASH_NAMESPACE,$ac_cv_cxx_hash_namespace,
64 |                          [the namespace of hash_map/hash_set])
65 |       if test "$ac_cv_cxx_have_unordered_map" = yes; then
66 |         AC_DEFINE(HAVE_UNORDERED_MAP,1,
67 |                   [define if the compiler supports unordered_{map,set}])
68 |       fi
69 |       AC_MSG_RESULT([$ac_cv_cxx_hash_map])
70 |    else
71 |       AC_MSG_RESULT()
72 |       AC_MSG_WARN([could not find an STL hash_map])
73 |    fi
74 | ])
75 | 


--------------------------------------------------------------------------------
/3rd-party/sparsehash/m4/stl_hash_fun.m4:
--------------------------------------------------------------------------------
 1 | # AC_CXX_STL_HASH_FUN: find the header that defines hash<> (<functional>,
 2 | # <tr1/functional>, or a *hash_fun.h file); sets HASH_FUN_H and HASH_NAMESPACE.
 3 | #
 4 | # Ideally we'd use AC_CACHE_CHECK, but that only lets us store one value
 5 | # at a time, and we need to store two (filename and namespace).  It also
 6 | # prints messages itself, so we have to do the message-printing ourselves
 7 | # via AC_MSG_CHECKING + AC_MSG_RESULT.  (TODO(csilvers): can we cache?)
 8 | #
 9 | # tr1/functional: new gcc's with tr1 support
10 | # stl_hash_fun.h: old gcc's (gcc 2.95?)
11 | # ext/hash_fun.h: newer gcc's (gcc4)
12 | # stl/_hash_fun.h: STLport
13 | 
14 | AC_DEFUN([AC_CXX_STL_HASH_FUN],
15 |   [AC_REQUIRE([AC_CXX_STL_HASH])
16 |    AC_MSG_CHECKING(how to include hash_fun directly)
17 |    AC_LANG_SAVE
18 |    AC_LANG_CPLUSPLUS
19 |    ac_cv_cxx_stl_hash_fun=""
20 |    for location in functional tr1/functional \
21 |                    ext/hash_fun.h ext/stl_hash_fun.h \
22 |                    hash_fun.h stl_hash_fun.h \
23 |                    stl/_hash_fun.h; do
24 |      if test -z "$ac_cv_cxx_stl_hash_fun"; then
25 |        AC_TRY_COMPILE([#include <$location>],
26 |                       [int x = ${ac_cv_cxx_hash_namespace}::hash<int>()(5)],
27 |                       [ac_cv_cxx_stl_hash_fun="<$location>";])
28 |      fi
29 |    done
30 |    AC_LANG_RESTORE
31 |    AC_DEFINE_UNQUOTED(HASH_FUN_H,$ac_cv_cxx_stl_hash_fun,
32 |                       [the location of the header defining hash functions])
33 |    AC_DEFINE_UNQUOTED(HASH_NAMESPACE,$ac_cv_cxx_hash_namespace,
34 |                       [the namespace of the hash<> function])
35 |    AC_MSG_RESULT([$ac_cv_cxx_stl_hash_fun])
36 | ])
37 | 


--------------------------------------------------------------------------------
/3rd-party/sparsehash/missing:
--------------------------------------------------------------------------------
  1 | #! /bin/sh
  2 | # Common stub for a few missing GNU programs while installing.
  3 | 
  4 | scriptversion=2009-04-28.21; # UTC
  5 | 
  6 | # Copyright (C) 1996, 1997, 1999, 2000, 2002, 2003, 2004, 2005, 2006,
  7 | # 2008, 2009 Free Software Foundation, Inc.
  8 | # Originally by Fran,cois Pinard <pinard@iro.umontreal.ca>, 1996.
  9 | 
 10 | # This program is free software; you can redistribute it and/or modify
 11 | # it under the terms of the GNU General Public License as published by
 12 | # the Free Software Foundation; either version 2, or (at your option)
 13 | # any later version.
 14 | 
 15 | # This program is distributed in the hope that it will be useful,
 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 18 | # GNU General Public License for more details.
 19 | 
 20 | # You should have received a copy of the GNU General Public License
 21 | # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 22 | 
 23 | # As a special exception to the GNU General Public License, if you
 24 | # distribute this file as part of a program that contains a
 25 | # configuration script generated by Autoconf, you may include it under
 26 | # the same distribution terms that you use for the rest of that program.
 27 | 
 28 | if test $# -eq 0; then
 29 |   echo 1>&2 "Try \`$0 --help' for more information"
 30 |   exit 1
 31 | fi
 32 | 
 33 | run=:  # stays ":" unless --run is given (then it is emptied)
 34 | sed_output='s/.* --output[ =]\([^ ]*\).*/\1/p'  # prints the word after --output
 35 | sed_minuso='s/.* -o \([^ ]*\).*/\1/p'  # prints the word after -o
 36 | 
 37 | # In the cases where this matters, `missing' is being run in the
 38 | # srcdir already.
 39 | if test -f configure.ac; then
 40 |   configure_ac=configure.ac
 41 | else
 42 |   configure_ac=configure.in
 43 | fi
 44 | 
 45 | msg="missing on your system"
 46 | 
 47 | case $1 in
 48 | --run)
 49 |   # Try to run requested program, and just exit if it succeeds.
 50 |   run=
 51 |   shift
 52 |   "$@" && exit 0
 53 |   # Exit code 63 means version mismatch.  This often happens
 54 |   # when the user tries to use an ancient version of a tool on
 55 |   # a file that requires a minimum version.  In this case
 56 |   # we should proceed as if the program had been absent, or
 57 |   # if --run hadn't been passed.
 58 |   if test $? = 63; then
 59 |     run=:
 60 |     msg="probably too old"
 61 |   fi
 62 |   ;;
 63 | 
 64 |   -h|--h|--he|--hel|--help)
 65 |     echo "\
 66 | $0 [OPTION]... PROGRAM [ARGUMENT]...
 67 | 
 68 | Handle \`PROGRAM [ARGUMENT]...' for when PROGRAM is missing, or return an
 69 | error status if there is no known handling for PROGRAM.
 70 | 
 71 | Options:
 72 |   -h, --help      display this help and exit
 73 |   -v, --version   output version information and exit
 74 |   --run           try to run the given command, and emulate it if it fails
 75 | 
 76 | Supported PROGRAM values:
 77 |   aclocal      touch file \`aclocal.m4'
 78 |   autoconf     touch file \`configure'
 79 |   autoheader   touch file \`config.h.in'
 80 |   autom4te     touch the output file, or create a stub one
 81 |   automake     touch all \`Makefile.in' files
 82 |   bison        create \`y.tab.[ch]', if possible, from existing .[ch]
 83 |   flex         create \`lex.yy.c', if possible, from existing .c
 84 |   help2man     touch the output file
 85 |   lex          create \`lex.yy.c', if possible, from existing .c
 86 |   makeinfo     touch the output file
 87 |   tar          try tar, gnutar, gtar, then tar without non-portable flags
 88 |   yacc         create \`y.tab.[ch]', if possible, from existing .[ch]
 89 | 
 90 | Version suffixes to PROGRAM as well as the prefixes \`gnu-', \`gnu', and
 91 | \`g' are ignored when checking the name.
 92 | 
 93 | Send bug reports to <bug-automake@gnu.org>."
 94 |     exit $?
 95 |     ;;
 96 | 
 97 |   -v|--v|--ve|--ver|--vers|--versi|--versio|--version)
 98 |     echo "missing $scriptversion (GNU Automake)"
 99 |     exit $?
100 |     ;;
101 | 
102 |   -*)
103 |     echo 1>&2 "$0: Unknown \`$1' option"
104 |     echo 1>&2 "Try \`$0 --help' for more information"
105 |     exit 1
106 |     ;;
107 | 
108 | esac
109 | 
110 | # Normalize the program name to check for: drop a gnu-, gnu or g prefix.
111 | program=`echo "$1" | sed '
112 |   s/^gnu-//; t
113 |   s/^gnu//; t
114 |   s/^g//; t'`
115 | 
116 | # Now exit if we have it, but it failed.  Also exit now if we
117 | # don't have it and --version was passed (most likely to detect
118 | # the program).  This is about non-GNU programs, so use $1 not
119 | # $program.
120 | case $1 in
121 |   lex*|yacc*)
122 |     # Not GNU programs, they don't have --version.
123 |     ;;
124 | 
125 |   tar*)
126 |     if test -n "$run"; then
127 |        echo 1>&2 "ERROR: \`tar' requires --run"
128 |        exit 1
129 |     elif test "x$2" = "x--version" || test "x$2" = "x--help"; then
130 |        exit 1
131 |     fi
132 |     ;;
133 | 
134 |   *)
135 |     if test -z "$run" && ($1 --version) > /dev/null 2>&1; then
136 |        # --run was given and `$1 --version' works: we have it, but it failed.
137 |        exit 1
138 |     elif test "x$2" = "x--version" || test "x$2" = "x--help"; then
139 |        # Could not run --version or --help.  This is probably someone
140 |        # running `$TOOL --version' or `$TOOL --help' to check whether
141 |        # $TOOL exists and not knowing $TOOL uses missing.
142 |        exit 1
143 |     fi
144 |     ;;
145 | esac
146 | 
147 | # If it does not exist, or fails to run (possibly an outdated version),
148 | # try to emulate it.  File names taken from the command line are quoted.
149 | case $program in
150 |   aclocal*)
151 |     echo 1>&2 "\
152 | WARNING: \`$1' is $msg.  You should only need it if
153 |          you modified \`acinclude.m4' or \`${configure_ac}'.  You might want
154 |          to install the \`Automake' and \`Perl' packages.  Grab them from
155 |          any GNU archive site."
156 |     touch aclocal.m4
157 |     ;;
158 | 
159 |   autoconf*)
160 |     echo 1>&2 "\
161 | WARNING: \`$1' is $msg.  You should only need it if
162 |          you modified \`${configure_ac}'.  You might want to install the
163 |          \`Autoconf' and \`GNU m4' packages.  Grab them from any GNU
164 |          archive site."
165 |     touch configure
166 |     ;;
167 | 
168 |   autoheader*)
169 |     echo 1>&2 "\
170 | WARNING: \`$1' is $msg.  You should only need it if
171 |          you modified \`acconfig.h' or \`${configure_ac}'.  You might want
172 |          to install the \`Autoconf' and \`GNU m4' packages.  Grab them
173 |          from any GNU archive site."
174 |     files=`sed -n 's/^[ ]*A[CM]_CONFIG_HEADER(\([^)]*\)).*/\1/p' ${configure_ac}`
175 |     test -z "$files" && files="config.h"
176 |     touch_files=
177 |     for f in $files; do
178 |       case $f in
179 |       *:*) touch_files="$touch_files "`echo "$f" |
180 | 				       sed -e 's/^[^:]*://' -e 's/:.*//'`;;
181 |       *) touch_files="$touch_files $f.in";;
182 |       esac
183 |     done
184 |     touch $touch_files
185 |     ;;
186 | 
187 |   automake*)
188 |     echo 1>&2 "\
189 | WARNING: \`$1' is $msg.  You should only need it if
190 |          you modified \`Makefile.am', \`acinclude.m4' or \`${configure_ac}'.
191 |          You might want to install the \`Automake' and \`Perl' packages.
192 |          Grab them from any GNU archive site."
193 |     find . -type f -name Makefile.am -print |
194 | 	   sed 's/\.am$/.in/' |
195 | 	   while read f; do touch "$f"; done
196 |     ;;
197 | 
198 |   autom4te*)
199 |     echo 1>&2 "\
200 | WARNING: \`$1' is needed, but is $msg.
201 |          You might have modified some files without having the
202 |          proper tools for further handling them.
203 |          You can get \`$1' as part of \`Autoconf' from any GNU
204 |          archive site."
205 | 
206 |     file=`echo "$*" | sed -n "$sed_output"`
207 |     test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"`
208 |     if test -f "$file"; then
209 | 	touch "$file"
210 |     else
211 | 	test -z "$file" || exec >"$file"
212 | 	echo "#! /bin/sh"
213 | 	echo "# Created by GNU Automake missing as a replacement of"
214 | 	echo "#  $ $@"
215 | 	echo "exit 0"
216 | 	test -z "$file" || chmod +x "$file"
217 | 	exit 1
218 |     fi
219 |     ;;
220 | 
221 |   bison*|yacc*)
222 |     echo 1>&2 "\
223 | WARNING: \`$1' $msg.  You should only need it if
224 |          you modified a \`.y' file.  You may need the \`Bison' package
225 |          in order for those modifications to take effect.  You can get
226 |          \`Bison' from any GNU archive site."
227 |     rm -f y.tab.c y.tab.h
228 |     if test $# -ne 1; then
229 |         eval LASTARG="\${$#}"
230 | 	case $LASTARG in
231 | 	*.y)
232 | 	    SRCFILE=`echo "$LASTARG" | sed 's/y$/c/'`
233 | 	    if test -f "$SRCFILE"; then
234 | 	         cp "$SRCFILE" y.tab.c
235 | 	    fi
236 | 	    SRCFILE=`echo "$LASTARG" | sed 's/y$/h/'`
237 | 	    if test -f "$SRCFILE"; then
238 | 	         cp "$SRCFILE" y.tab.h
239 | 	    fi
240 | 	  ;;
241 | 	esac
242 |     fi
243 |     if test ! -f y.tab.h; then
244 | 	echo >y.tab.h
245 |     fi
246 |     if test ! -f y.tab.c; then
247 | 	echo 'main() { return 0; }' >y.tab.c
248 |     fi
249 |     ;;
250 | 
251 |   lex*|flex*)
252 |     echo 1>&2 "\
253 | WARNING: \`$1' is $msg.  You should only need it if
254 |          you modified a \`.l' file.  You may need the \`Flex' package
255 |          in order for those modifications to take effect.  You can get
256 |          \`Flex' from any GNU archive site."
257 |     rm -f lex.yy.c
258 |     if test $# -ne 1; then
259 |         eval LASTARG="\${$#}"
260 | 	case $LASTARG in
261 | 	*.l)
262 | 	    SRCFILE=`echo "$LASTARG" | sed 's/l$/c/'`
263 | 	    if test -f "$SRCFILE"; then
264 | 	         cp "$SRCFILE" lex.yy.c
265 | 	    fi
266 | 	  ;;
267 | 	esac
268 |     fi
269 |     if test ! -f lex.yy.c; then
270 | 	echo 'main() { return 0; }' >lex.yy.c
271 |     fi
272 |     ;;
273 | 
274 |   help2man*)
275 |     echo 1>&2 "\
276 | WARNING: \`$1' is $msg.  You should only need it if
277 | 	 you modified a dependency of a manual page.  You may need the
278 | 	 \`Help2man' package in order for those modifications to take
279 | 	 effect.  You can get \`Help2man' from any GNU archive site."
280 | 
281 |     file=`echo "$*" | sed -n "$sed_output"`
282 |     test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"`
283 |     if test -f "$file"; then
284 | 	touch "$file"
285 |     else
286 | 	test -z "$file" || exec >"$file"
287 | 	echo ".ab help2man is required to generate this page"
288 | 	exit $?
289 |     fi
290 |     ;;
291 | 
292 |   makeinfo*)
293 |     echo 1>&2 "\
294 | WARNING: \`$1' is $msg.  You should only need it if
295 |          you modified a \`.texi' or \`.texinfo' file, or any other file
296 |          indirectly affecting the aspect of the manual.  The spurious
297 |          call might also be the consequence of using a buggy \`make' (AIX,
298 |          DU, IRIX).  You might want to install the \`Texinfo' package or
299 |          the \`GNU make' package.  Grab either from any GNU archive site."
300 |     # The file to touch is that specified with -o ...
301 |     file=`echo "$*" | sed -n "$sed_output"`
302 |     test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"`
303 |     if test -z "$file"; then
304 |       # ... or it is the one specified with @setfilename ...
305 |       infile=`echo "$*" | sed 's/.* \([^ ]*\) *$/\1/'`
306 |       file=`sed -n '
307 | 	/^@setfilename/{
308 | 	  s/.* \([^ ]*\) *$/\1/
309 | 	  p
310 | 	  q
311 | 	}' "$infile"`
312 |       # ... or it is derived from the source name (dir/f.texi becomes f.info)
313 |       test -z "$file" && file=`echo "$infile" | sed 's,.*/,,;s,.[^.]*$,,'`.info
314 |     fi
315 |     # If the file does not exist, the user really needs makeinfo;
316 |     # let's fail without touching anything.
317 |     test -f "$file" || exit 1
318 |     touch "$file"
319 |     ;;
320 | 
321 |   tar*)
322 |     shift
323 | 
324 |     # We have already tried tar in the generic part.
325 |     # Look for gnutar/gtar before invocation to avoid ugly error
326 |     # messages.
327 |     if (gnutar --version > /dev/null 2>&1); then
328 |        gnutar "$@" && exit 0
329 |     fi
330 |     if (gtar --version > /dev/null 2>&1); then
331 |        gtar "$@" && exit 0
332 |     fi
333 |     firstarg="$1"
334 |     if shift; then
335 | 	case $firstarg in
336 | 	*o*)
337 | 	    firstarg=`echo "$firstarg" | sed s/o//`
338 | 	    tar "$firstarg" "$@" && exit 0
339 | 	    ;;
340 | 	esac
341 | 	case $firstarg in
342 | 	*h*)
343 | 	    firstarg=`echo "$firstarg" | sed s/h//`
344 | 	    tar "$firstarg" "$@" && exit 0
345 | 	    ;;
346 | 	esac
347 |     fi
348 | 
349 |     echo 1>&2 "\
350 | WARNING: I can't seem to be able to run \`tar' with the given arguments.
351 |          You may want to install GNU tar or Free paxutils, or check the
352 |          command line arguments."
353 |     exit 1
354 |     ;;
355 | 
356 |   *)
357 |     echo 1>&2 "\
358 | WARNING: \`$1' is needed, and is $msg.
359 |          You might have modified some files without having the
360 |          proper tools for further handling them.  Check the \`README' file,
361 |          it often tells you about the needed prerequisites for installing
362 |          this package.  You may also peek at any GNU archive site, in case
363 |          some other package would contain this missing \`$1' program."
364 |     exit 1
365 |     ;;
366 | esac
367 | 
368 | exit 0
369 | 
370 | # Local variables:
371 | # eval: (add-hook 'write-file-hooks 'time-stamp)
372 | # time-stamp-start: "scriptversion="
373 | # time-stamp-format: "%:y-%02m-%02d.%02H"
374 | # time-stamp-time-zone: "UTC"
375 | # time-stamp-end: "; # UTC"
376 | # End:
377 | 


--------------------------------------------------------------------------------
/3rd-party/sparsehash/src/config.h.in:
--------------------------------------------------------------------------------
  1 | /* src/config.h.in.  Generated from configure.ac by autoheader.  */
  2 | 
  3 | /* Namespace for Google classes */
  4 | #undef GOOGLE_NAMESPACE
  5 | 
  6 | /* the location of the header defining hash functions */
  7 | #undef HASH_FUN_H
  8 | 
  9 | /* the location of <unordered_map> or <hash_map> */
 10 | #undef HASH_MAP_H
 11 | 
 12 | /* the namespace of the hash<> function */
 13 | #undef HASH_NAMESPACE
 14 | 
 15 | /* the location of <unordered_set> or <hash_set> */
 16 | #undef HASH_SET_H
 17 | 
 18 | /* Define to 1 if you have the <google/malloc_extension.h> header file. */
 19 | #undef HAVE_GOOGLE_MALLOC_EXTENSION_H
 20 | 
 21 | /* define if the compiler has hash_map */
 22 | #undef HAVE_HASH_MAP
 23 | 
 24 | /* define if the compiler has hash_set */
 25 | #undef HAVE_HASH_SET
 26 | 
 27 | /* Define to 1 if you have the <inttypes.h> header file. */
 28 | #undef HAVE_INTTYPES_H
 29 | 
 30 | /* Define to 1 if the system has the type `long long'. */
 31 | #undef HAVE_LONG_LONG
 32 | 
 33 | /* Define to 1 if you have the `memcpy' function. */
 34 | #undef HAVE_MEMCPY
 35 | 
 36 | /* Define to 1 if you have the `memmove' function. */
 37 | #undef HAVE_MEMMOVE
 38 | 
 39 | /* Define to 1 if you have the <memory.h> header file. */
 40 | #undef HAVE_MEMORY_H
 41 | 
 42 | /* define if the compiler implements namespaces */
 43 | #undef HAVE_NAMESPACES
 44 | 
 45 | /* Define if you have POSIX threads libraries and header files. */
 46 | #undef HAVE_PTHREAD
 47 | 
 48 | /* Define to 1 if you have the <stdint.h> header file. */
 49 | #undef HAVE_STDINT_H
 50 | 
 51 | /* Define to 1 if you have the <stdlib.h> header file. */
 52 | #undef HAVE_STDLIB_H
 53 | 
 54 | /* Define to 1 if you have the <strings.h> header file. */
 55 | #undef HAVE_STRINGS_H
 56 | 
 57 | /* Define to 1 if you have the <string.h> header file. */
 58 | #undef HAVE_STRING_H
 59 | 
 60 | /* Define to 1 if you have the <sys/resource.h> header file. */
 61 | #undef HAVE_SYS_RESOURCE_H
 62 | 
 63 | /* Define to 1 if you have the <sys/stat.h> header file. */
 64 | #undef HAVE_SYS_STAT_H
 65 | 
 66 | /* Define to 1 if you have the <sys/time.h> header file. */
 67 | #undef HAVE_SYS_TIME_H
 68 | 
 69 | /* Define to 1 if you have the <sys/types.h> header file. */
 70 | #undef HAVE_SYS_TYPES_H
 71 | 
 72 | /* Define to 1 if you have the <sys/utsname.h> header file. */
 73 | #undef HAVE_SYS_UTSNAME_H
 74 | 
 75 | /* Define to 1 if the system has the type `uint16_t'. */
 76 | #undef HAVE_UINT16_T
 77 | 
 78 | /* Define to 1 if you have the <unistd.h> header file. */
 79 | #undef HAVE_UNISTD_H
 80 | 
 81 | /* define if the compiler supports unordered_{map,set} */
 82 | #undef HAVE_UNORDERED_MAP
 83 | 
 84 | /* Define to 1 if the system has the type `u_int16_t'. */
 85 | #undef HAVE_U_INT16_T
 86 | 
 87 | /* Define to 1 if the system has the type `__uint16'. */
 88 | #undef HAVE___UINT16
 89 | 
 90 | /* Name of package */
 91 | #undef PACKAGE
 92 | 
 93 | /* Define to the address where bug reports for this package should be sent. */
 94 | #undef PACKAGE_BUGREPORT
 95 | 
 96 | /* Define to the full name of this package. */
 97 | #undef PACKAGE_NAME
 98 | 
 99 | /* Define to the full name and version of this package. */
100 | #undef PACKAGE_STRING
101 | 
102 | /* Define to the one symbol short name of this package. */
103 | #undef PACKAGE_TARNAME
104 | 
105 | /* Define to the home page for this package. */
106 | #undef PACKAGE_URL
107 | 
108 | /* Define to the version of this package. */
109 | #undef PACKAGE_VERSION
110 | 
111 | /* Define to necessary symbol if this constant uses a non-standard name on
112 |    your system. */
113 | #undef PTHREAD_CREATE_JOINABLE
114 | 
115 | /* The system-provided hash function including the namespace. */
116 | #undef SPARSEHASH_HASH
117 | 
118 | /* The system-provided hash function, in namespace HASH_NAMESPACE. */
119 | #undef SPARSEHASH_HASH_NO_NAMESPACE
120 | 
121 | /* Define to 1 if you have the ANSI C header files. */
122 | #undef STDC_HEADERS
123 | 
124 | /* Version number of package */
125 | #undef VERSION
126 | 
127 | /* Stops putting the code inside the Google namespace */
128 | #undef _END_GOOGLE_NAMESPACE_
129 | 
130 | /* Puts following code inside the Google namespace */
131 | #undef _START_GOOGLE_NAMESPACE_
132 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | graft 3rd-party
2 | graft cpp
3 | graft cython
4 | graft tests
5 | include COPYING
6 | include LICENSES
7 | include README.rst
8 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | arv — a fast 23andMe parser for Python
  2 | ======================================
  3 | |travis-status| |versions| |license| |pypi|
  4 | 
  5 | Arv (Norwegian; "heritage" or "inheritance") is a Python module for parsing raw
  6 | 23andMe genome files. It lets you lookup SNPs from RSIDs.
  7 | 
  8 | .. code:: python
  9 | 
 10 |   from arv import load, unphased_match as match
 11 | 
 12 |   genome = load("genome.txt")
 13 | 
 14 |   print("You are a {gender} with {color} eyes and {complexion} skin.".format(
 15 |     gender     = "man" if genome.y_chromosome else "woman",
 16 |     complexion = "light" if genome["rs1426654"] == "AA" else "dark",
 17 |     color      = match(genome["rs12913832"], {"AA": "brown",
 18 |                                               "AG": "brown or green",
 19 |                                               "GG": "blue"})))
 20 | 
 21 | For my genome, this little program produces::
 22 | 
 23 |     You are a man with blue eyes and light skin.
 24 | 
 25 | The parser is insanely fast, having been written in finely tuned C++, exposed
 26 | via Cython. A 2013 Xeon machine I've tested on parses a 24 MB file into a hash
 27 | table in about 78 ms. The newer 23andMe files are smaller, and parse in a mere
 28 | 62 ms!
 29 | 
 30 | Works with Python 2.7+ and 3+. Installable with pip!
 31 | 
 32 | .. code:: bash
 33 | 
 34 |     $ pip install --upgrade arv
 35 | 
 36 | See below for software requirements.
 37 | 
 38 | Important disclaimer
 39 | ====================
 40 | 
 41 | It's very important to tell you that I, the author of arv, am merely a
 42 | *hobbyist*! I *am* a professional software developer, but *not* a geneticist,
 43 | biologist, medical doctor or anything like that.
 44 | 
 45 | Because of that, this software may not only look weird to people in the field,
 46 | it may also contain serious errors. If you find any problem whatsoever, please
 47 | submit a GitHub issue.
 48 | 
 49 | This is a slightly modified version of what I wrote for the original software
 50 | called "dna-traits", and the same goes for this software:
 51 | 
 52 | In addition to the GPL v3 licensing terms, and given that this code deals with
 53 | health-related issues, I want to stress that the provided code most likely
 54 | contains errors, or invalid genome reports. Results from this code must be
 55 | interpreted as HIGHLY SPECULATIVE and may even be downright INCORRECT. Always
 56 | consult an expert (medical doctor, geneticist, etc.) for guidance. I take NO
 57 | RESPONSIBILITY whatsoever for any consequences of using this code, including
 58 | but not limited to loss of life, money, spouses, self-esteem and so on. Use at
 59 | YOUR OWN RISK.
 60 | 
 61 | The intended use is for casual, educational purposes. If this code is used for
 62 | research purposes, please cross-check key results with other software: The
 63 | parser code may contain serious errors, for example.
 64 | 
 65 | An interesting story about the research part: I once released a pretty good
 66 | Mersenne Twister PRNG for C++ that ended up being used in research. Turned out
 67 | the engine had bugs, and by the time I had fixed them, a poor researcher had
 68 | already produced results with it (hopefully not published; I don't know). The
 69 | guy had to go back and fix his stuff, and I felt terribly bad about it.
 70 | 
 71 | So beware!
 72 | 
 73 | Installation
 74 | ============
 75 | 
 76 | The recommended way is to install from PyPI.
 77 | 
 78 | .. code:: bash
 79 | 
 80 |     $ pip install arv
 81 | 
 82 | This will most likely build Arv from source. The package will automatically
 83 | install Cython, but it doesn't check if you have a C++11 compiler. Furthermore,
 84 | it passes some additional compilation flags that are specific to clang/gcc.
 85 | 
 86 | If you have problems running ``pip install arv``, please open an issue on
 87 | GitHub with as much detail as possible (``g++/clang++ --version``, ``uname
 88 | -a``, ``python --version`` and so on).
 89 | 
 90 | If you set the environment variable ``ARV_DEBUG``, it will build with full
 91 | warnings and debug symbols.
 92 | 
 93 | You can also install it locally through ``setup.py``. The following builds and
 94 | tests, but does not install, arv:
 95 | 
 96 | .. code:: bash
 97 | 
 98 |     $ python setup.py test
 99 | 
100 | If you set the environment variable ``ARV_BENCHMARK`` to a genome filename and
101 | run the tests, it will perform a short benchmark, reporting the best parsing
102 | time on it. You can also set ``ARV_BENCHMARK_COUNT=<number>`` to change how
103 | many times it should parse the given file.
104 | 
105 | Usage
106 | =====
107 | 
108 | First you need to download the raw genome file from 23andMe; you'll find it
109 | under the raw genome browser. You may have to unzip it first: the parser works
110 | on plain text files.
111 | 
112 | Then you load the genome in Python with
113 | 
114 | .. code:: python
115 | 
116 |     >>> genome = arv.load("filename.txt")
117 |     >>> genome
118 |     <Genome: SNPs=960613, name='filename.txt'>
119 | 
120 | To see if there are any Y-chromosomes present in the genome,
121 | 
122 | .. code:: python
123 | 
124 |     >>> genome.y_chromosome
125 |     True
126 | 
127 | The genome provides a ``dict``-like interface. To get a given SNP, just enter the RSID.
128 | 
129 | .. code:: python
130 | 
131 |     >>> snp = genome["rs123"]
132 |     >>> snp
133 |     <SNP: chromosome=7 position=24966446 genotype='AA'>
134 |     >>> snp.chromosome
135 |     7
136 |     >>> snp.position
137 |     24966446
138 |     >>> snp.genotype
139 |     <Genotype 'AA'>
140 | 
141 | The ``Genotype`` object can be converted to a string with ``str``, but it also
142 | allows rich comparisons with strings directly:
143 | 
144 | .. code:: python
145 | 
146 |     >>> snp.genotype == "AA"
147 |     True
148 | 
149 | You can get its complement with the ``~``-operator.
150 | 
151 | .. code:: python
152 | 
153 |     >>> type(snp.genotype)
154 |     <class '_arv.Genotype'>
155 |     >>> ~snp.genotype
156 |     <Genotype 'TT'>
157 | 
158 | The complement is important due to each SNP's orientation. All of 23andMe's SNPs
159 | are oriented towards the positive ("plus") strand, based on the `GRCh37
160 | <https://www.ncbi.nlm.nih.gov/grc/human>`_ reference human genome assembly
161 | build. But some SNPs on SNPedia are given with the `minus orientation
162 | <http://snpedia.com/index.php/Orientation>`_.
163 | 
164 | For example, to determine if the human in question is likely lactose tolerant
165 | or not, we can look at `rs4988235 <http://snpedia.com/index.php/Rs4988235>`_.
166 | SNPedia reports its *Stabilized* orientation to be minus, so we need to use the
167 | complement:
168 | 
169 | .. code:: python
170 | 
171 |     >>> genome["rs4988235"].genotype
172 |     <Genotype 'AA'>
173 |     >>> ~genome["rs4988235"].genotype
174 |     <Genotype 'TT'>
175 | 
176 | By reading a few `GWAS
177 | <https://en.wikipedia.org/wiki/Genome-wide_association_study>`_ research
178 | papers, we can build a rule to determine a human's likelihood for lactose
179 | tolerance:
180 | 
181 | .. code:: python
182 | 
183 |     >>> arv.unphased_match(~genome["rs4988235"].genotype, {
184 |         "TT": "Likely lactose tolerant",
185 |         "TC": "Likely lactose tolerant",
186 |         "CC": "Likely lactose intolerant",
187 |         None: "Unable to determine (genotype not present)"})
188 |     'Likely lactose tolerant'
189 | 
190 | Note that reading GWAS papers can be a bit tricky for hobbyists. If you are a
191 | hobbyist, be sure to spend some time reading the paper closely, looking up
192 | SNPs on sites like `SNPedia <http://snpedia.com>`_, `dbSNP
193 | <https://www.ncbi.nlm.nih.gov/projects/SNP/>`_ and `OpenSNP
194 | <https://opensnp.org/genotypes>`_. Finally, have fun, but be extremely careful
195 | about drawing conclusions from your results.
196 | 
197 | Command line interface
198 | ======================
199 | 
200 | You can also invoke ``arv`` from the command line:
201 | 
202 | .. code:: bash
203 | 
204 |     $ python -m arv --help
205 | 
206 | For example, you can drop into a Python REPL like so:
207 | 
208 | .. code:: bash
209 | 
210 |     $ python -m arv --repl genome.txt
211 |     genome.txt ... 960614 SNPs, male
212 |     Type `genome` to see the parsed 23andMe raw genome file
213 |     >>> genome
214 |     <Genome: SNPs=960614, name='genome.txt'>
215 |     >>> genome["rs123"]
216 |     <SNP: chromosome=7 position=24966446 genotype=<Genotype 'AA'>>
217 | 
218 | If you specify several files, you can access them through the variable
219 | ``genomes``.
220 | 
221 | The example at the top of this document can be run with ``--example``:
222 | 
223 | .. code:: bash
224 | 
225 |     $ python -m arv --example genome.txt
226 |     genome.txt ... 960614 SNPs, male
227 | 
228 |     genome.txt ... A man with blue eyes and light skin
229 | 
230 | License
231 | =======
232 | 
233 | Copyright 2017 Christian Stigen Larsen
234 | 
235 | Distributed under the GNU GPL v3 or later. See the file COPYING for the full
236 | license text. This software makes use of open source software; see LICENSES for
237 | details.
238 | 
239 | .. |travis-status| image:: https://travis-ci.org/cslarsen/arv.svg?branch=master
240 |     :alt: Travis build status
241 |     :scale: 100%
242 |     :target: https://travis-ci.org/cslarsen/arv
243 | 
244 | .. |license| image:: https://img.shields.io/badge/license-GPL%20v3%2B-blue.svg
245 |     :target: https://www.gnu.org/licenses/gpl-3.0.en.html
246 |     :alt: Project License
247 | 
248 | .. |versions| image:: https://img.shields.io/badge/python-2.7%2B%2C%203%2B-blue.svg
249 |     :target: https://pypi.python.org/pypi/arv/
250 |     :alt: Supported Python versions
251 | 
252 | .. |pypi| image:: https://badge.fury.io/py/arv.svg
253 |     :target: https://badge.fury.io/py/arv
254 | 


--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
 1 | - Parse GRCh37/38 build
 2 | - Make parsing safe
 3 |   - detect overflow while parsing integers
 4 |   - use fuzzing (afl, e.g.) to break the parser
 5 | - Benchmark other ops
 6 |   - iteration
 7 |   - random access
 8 | - Modify google dense hash map to move/emplace from buffer
 9 |   - try to use a nearly full buffer to make this faster
10 | - Try to move y-chromo detection out of the loop
11 | - Remove the pimpl pattern, don't need it anymore
12 | - Build with profiling
13 | - Build with gcov, do coverage testing
14 |   - Use coveralls
15 | 


--------------------------------------------------------------------------------
/arv/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | A fast 23andMe raw genome file parser.
 3 | 
 4 | To load a genome file,
 5 | 
 6 |     >>> import arv
 7 |     >>> genome = arv.load("genome.txt")
 8 | 
 9 | You can then look up genotypes from RSIDs
10 | 
11 |     >>> genome["rs123"].genotype
12 |     <Genotype 'AA'>
13 | 
14 | You can also access SNPs
15 | 
16 |     >>> genome["rs123"]
17 |     <SNP: chromosome=1 position=112233 genotype='AA'>
18 | 
19 | By using ``unphased_match``, you can match genotypes while disregarding the
20 | ordering of the two nucleotides. For example, ``AT`` and ``TA`` would be
21 | considered equal. Here is an example usage:
22 | 
23 |         genotype = genome["rs12913832"]
24 |         eyecolor = unphased_match(genotype, {
25 |                         "AA": "brown",
26 |                         "AG": "brown or green",
27 |                         "GG": "blue",
28 |                         None: "unknown"})
29 | 
30 | A full example would be:
31 | 
32 |     import arv
33 | 
34 |     genome = arv.load("genome.txt")
35 | 
36 |     print("You are a {gender} with {color} eyes and {complexion} skin.".format(
37 |         gender     = "man" if genome.y_chromosome else "woman",
38 |         complexion = "light" if genome["rs1426654"] == "AA" else "dark",
39 |         color      = unphased_match(genome["rs12913832"], {
40 |                         "AA": "brown",
41 |                         "AG": "brown or green",
42 |                         "GG": "blue"})))
43 | 
44 | For a given genome, this might print
45 | 
46 |     You are a man with blue eyes and light skin.
47 | 
48 | Copyright 2014, 2016, 2017 Christian Stigen Larsen
49 | Distributed under the GNU GPL v3 or later.
50 | """
51 | 
52 | from _arv import (
53 |     _sizes,
54 |     Genome,
55 |     Genotype,
56 |     load,
57 |     SNP,
58 | )
59 | 
60 | from .match import unphased_match
61 | 
62 | __author__ = "Christian Stigen Larsen"
63 | __copyright__ = "Copyright 2017 Christian Stigen Larsen"
64 | __credits__ = ["Christian Stigen Larsen", "Google"]
65 | __email__ = "csl@csl.name"
66 | __license__ = "GNU General Public License v3 or later"
67 | __maintainer__ = "Christian Stigen Larsen"
68 | __status__ = "Prototype"
69 | __version__ = "0.9.3"
70 | 
71 | __all__ = [
72 |     "_sizes",
73 |     "Genome",
74 |     "Genotype",
75 |     "load",
76 |     "SNP",
77 |     "unphased_match",
78 | ]
79 | 


--------------------------------------------------------------------------------
/arv/__main__.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Command line interface to arv.
  3 | 
  4 | This can be invoked with ``python -m arv``.
  5 | 
  6 | Copyright 2017 Christian Stigen Larsen
  7 | Distributed under the GNU GPL v3 or later. See COPYING.
  8 | """
  9 | 
 10 | import argparse
 11 | import arv
 12 | import arv.traits
 13 | import os
 14 | import sys
 15 | 
 16 | class ArvError(RuntimeError):
 17 |     pass
 18 | 
 19 | def log(msg="\n"):
 20 |     sys.stdout.write(msg)
 21 |     sys.stdout.flush()
 22 | 
 23 | def _parse_args():
 24 |     p = argparse.ArgumentParser(prog="arv",
 25 |             description="Arv - a fast 23andMe parser",
 26 |             epilog=arv.__copyright__)
 27 | 
 28 |     p.add_argument("--repl", default=False, action="store_true",
 29 |             help="Open a Python REPL loaded with the given genomes")
 30 | 
 31 |     p.add_argument("--example", default=False, action="store_true",
 32 |             help="Shows an example report for the genome(s)")
 33 | 
 34 |     p.add_argument("--version", "-V", default=False, action="store_true",
 35 |             help="Shows version and exits")
 36 | 
 37 |     p.add_argument("--ethnicity", default="", type=str,
 38 |             help="Sets ethnicity for all genomes")
 39 | 
 40 |     p.add_argument("files", nargs="*",
 41 |             help="23andMe raw genome file name(s)")
 42 | 
 43 |     opts = p.parse_args()
 44 | 
 45 |     if opts.version:
 46 |         print("arv %s" % arv.__version__)
 47 |         print(arv.__copyright__)
 48 |         print("Distributed under the %s" % arv.__license__)
 49 |         sys.exit(0)
 50 | 
 51 |     if len(opts.files) == 0:
 52 |         p.print_help()
 53 |         sys.exit(1)
 54 | 
 55 |     return opts
 56 | 
 57 | def summary(genome):
 58 |     """Returns a textual summary of the genome."""
 59 |     return "{count} SNPs, {gender}".format(
 60 |             count=len(genome), gender="male" if
 61 |             genome.y_chromosome else "female",)
 62 | 
 63 | def example(genome):
 64 |     """Returns an example report for the genome."""
 65 |     gender = "man" if genome.y_chromosome else "woman"
 66 |     complexion = "light" if genome["rs1426654"] == "AA" else "dark"
 67 | 
 68 |     color = arv.unphased_match(genome["rs12913832"], {
 69 |         "AA": "brown",
 70 |         "AG": "brown or green",
 71 |         "GG": "blue"})
 72 | 
 73 |     report = {"Description":
 74 |         "A {gender} with {color} eyes and {complexion} skin".format(**locals())}
 75 | 
 76 |     report.update(arv.traits.traits_report(genome))
 77 | 
 78 |     # Format report
 79 |     out = []
 80 |     width = max(map(len, report.keys()))
 81 |     for k, v in sorted(report.items()):
 82 |         out.append("  %-*s: %s" % (width, k, v))
 83 | 
 84 |     return "\n" + "\n".join(out)
 85 | 
 86 | def _main():
 87 |     opts = _parse_args()
 88 | 
 89 |     genomes = []
 90 |     for filename in opts.files:
 91 |         log("%s ... " % os.path.basename(filename))
 92 |         genome = arv.load(filename, ethnicity=opts.ethnicity)
 93 |         log("%s\n" % summary(genome))
 94 |         genomes.append(genome)
 95 | 
 96 |     if opts.example:
 97 |         for filename, genome in zip(opts.files, genomes):
 98 |             log("%s ... %s\n" % (os.path.basename(filename), example(genome)))
 99 | 
100 |     if opts.repl:
101 |         env = dict(globals())
102 | 
103 |         if len(genomes) == 1:
104 |             env.update({"genome": genomes[0]})
105 |             message = "Type `genome` to see the parsed 23andMe raw genome file"
106 |         else:
107 |             env.update({"genomes": genomes})
108 |             message = "Type `genomes` to see the parsed 23andMe raw genome files"
109 | 
110 |         import code
111 |         code.interact(message, local=env)
112 | 
113 | if __name__ == "__main__":
114 |     try:
115 |         _main()
116 |         sys.exit(0)
117 |     except ArvError as e:
118 |         log("Error: %s\n" % e)
119 |         sys.exit(1)
120 | 


--------------------------------------------------------------------------------
/arv/match.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Contains matching functions.
 3 | 
 4 | Part of arv
 5 | Copyright 2014, 2016, 2017 Christian Stigen Larsen
 6 | Distributed under the GPL v3 or later. See COPYING.
 7 | """
 8 | 
 9 | import arv
10 | 
11 | def assert_european(genome):
12 |     """If ethnicity is set, make sure it's European."""
13 |     if genome.ethnicity not in [None, "european"]:
14 |         raise ValueError("Only applicable to Europeans")
15 | 
16 | def unphased_match(snp, phenotypes):
17 |     """Match SNP with unphased genotypes and return phenotype.
18 | 
19 |     Disregards phasing when comparing genotypes, meaning that an input value of
20 |     "AG" will be matched against both "AG" and "GA".
21 | 
22 |     Arguments:
23 |         snp: Genotype (str or arv.Genotype) or SNP (arv.SNP) to match.
24 |         phenotypes: Dict mapping (unphased) genotype to phenotype.
25 | 
26 |     Example:
27 |         unphased_match(genome.rs4988235, {
28 |             "AA": "Likely lactose tolerant",
29 |             "AG": "Likely lactose tolerant",
30 |             "GG": "Likely lactose intolerant",
31 |             None: "Unknown genotype"})
32 | 
33 |         The above example could return "Likely lactose tolerant", for example,
34 |         or "Unknown genotype" if there was no match. Note that the key "AG"
35 |         will match both "AG" and "GA" in the snp.
36 | 
37 |     Returns:
38 |         Matching phenotype. If the `phenotypes` dict has a `None` key, it will
39 |         be returned in case there is no match.
40 |     """
41 |     if isinstance(snp, str):
42 |         genotype = snp
43 |     elif isinstance(snp, arv.Genotype):
44 |         genotype = str(snp)
45 |     elif isinstance(snp, arv.SNP):
46 |         genotype = str(snp.genotype)
47 |     else:
48 |         raise TypeError(type(snp))
49 | 
50 |     # Look for "IJ"
51 |     if genotype in phenotypes:
52 |         return phenotypes[genotype]
53 | 
54 |     # Look for "JI"
55 |     genotype = "".join(reversed(str(snp)))
56 |     if genotype in phenotypes:
57 |         return phenotypes[genotype]
58 | 
59 |     # Use default value?
60 |     if None in phenotypes:
61 |         return phenotypes[None]
62 |     else:
63 |         raise KeyError(str(snp))
64 | 


--------------------------------------------------------------------------------
/arv/traits.py:
--------------------------------------------------------------------------------
  1 | # -*- encoding: utf-8 -*-
  2 | 
  3 | """
  4 | Used to infer some traits.
  5 | 
  6 | Use with caution, this code may contain errors!
  7 | 
  8 | Copyright (C) 2014, 2016 Christian Stigen Larsen
  9 | Distributed under the GPL v3 or later. See COPYING.
 10 | """
 11 | 
 12 | from arv.match import unphased_match, assert_european
 13 | from arv.util import make_report
 14 | 
 15 | def bitter_taste(genome):
 16 |     "Bitter taste perception."
 17 |     return unphased_match(genome["rs713598"], {
 18 |         "CC": "Probably can't taste certain bitter flavours",
 19 |         "CG": "Can taste bitter flavours that others can't",
 20 |         "GG": "Can taste bitter flavours that others can't",
 21 |         None: "Unable to determine"})
 22 | 
 23 | def breastfeeding_iq(genome):
 24 |     "Breastfeeding and IQ."
 25 |     assert_european(genome)
 26 |     s = ""
 27 | 
 28 |     if "C" in str(genome["rs174575"].genotype):
 29 |         s += "Being breastfed raised subjects' IQ by 6-7 points on average (rs174575)\n"
 30 |     else:
 31 |         s += "Little to no effect of being breastfed on IQ (rs174575)\n"
 32 | 
 33 |     if "A" in str(genome["rs1535"].genotype):
 34 |         s += "Being breastfed raised subjects' IQ by 4-5 points on average (rs1535)\n"
 35 |     else:
 36 |         s += "Little or no effect of being breastfed on IQ (rs1535)\n"
 37 | 
 38 |     return s
 39 | 
 40 | def alcohol_flush_reaction(genome):
 41 |     "Alcohol flush reaction."
 42 |     return unphased_match(genome["rs671"], {
 43 |         "AA": "Extreme reaction (no copies of the ALDH2 gene)",
 44 |         "AG": "Moderate reaction (one copy of the ALDH2 gene)",
 45 |         "GG": "Little to no reaction (two copies of the ALDH2 gene)",
 46 |         None: "Unable to determine"})
 47 | 
 48 | def earwax_type(genome):
 49 |     "Earwax type."
 50 |     return unphased_match(genome["rs17822931"], {
 51 |         "CC": "Wet earwax (sticky, honey-colored)",
 52 |         "CT": "Wet earwax (sticky, honey-colored)",
 53 |         "TT": "Dry earwax (flaky, pale)",
 54 |         None: "Unable to determine"})
 55 | 
 56 | def eye_color(genome):
 57 |     "Eye color."
 58 |     assert_european(genome)
 59 |     return unphased_match(genome["rs12913832"], {
 60 |         "AA": "Brown eyes, although 14% have green and 1% have blue",
 61 |         "AG": "Most likely brown or green, but 7% have blue",
 62 |         "GG": "Most likely blue, but 30% have green and 1% brown",
 63 |         None: "Unable to determine"})
 64 | 
 65 | def lactose_intolerance(genome):
 66 |     "Lactose intolerance."
 67 |     return unphased_match(genome["rs4988235"], {
 68 |         "AA": "Likely lactose tolerant",
 69 |         "AG": "Likely lactose tolerant",
 70 |         "GG": "Likely lactose intolerant",
 71 |         None: "Unable to determine"})
 72 | 
 73 | def malaria_resistance(genome):
 74 |     "Malaria resistance (Duffy antigen)."
 75 |     return unphased_match(genome["rs2814778"], {
 76 |         "CC": "Likely resistant to P. vivax",
 77 |         "CT": "Likely to have some resistance to P. vivax",
 78 |         "TT": "Likely not resistant to P. vivax",
 79 |         None: "Unable to determine"})
 80 | 
 81 | def male_pattern_baldness(genome):
 82 |     """Male pattern baldness.
 83 | 
 84 |     Studies:
 85 |         http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Search&db=PubMed&term=18849991
 86 |         http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Search&db=PubMed&term=15902657
 87 |         http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Search&db=PubMed&term=18849994
 88 |     """
 89 |     raise NotImplementedError()
 90 | 
 91 |     # X-chromosome: rs6625163, A->G is risk mutation (OR 1.17)
 92 |     # rs6113491, A->C is risk mutation (AA has OR 1.77)
 93 |     # TODO: Attempt to match ORs
 94 | 
 95 | def norovirus_resistance(genome):
 96 |     """Norovirus resistance (most common strain)."""
 97 |     return unphased_match(genome["rs601338"], {
 98 |         "AA": "Resistant to most common strain",
 99 |         "AG": "Likely not resistant to most common strain",
100 |         "GG": "Likely not resistant to most common strain",
101 |         None: "Unable to determine"})
102 | 
103 | def muscle_performance(genome):
104 |     """Muscle performance."""
105 |     return unphased_match(genome["rs1815739"], {
106 |         "CC": "Likely sprinter, perhaps endurance athlete (two copies)",
107 |         "CT": "Likely sprinter, perhaps endurance athlete (one copy)",
108 |         "TT": "Unlikely sprinter, but likely endurance athlete (no copies)",
109 |         None: "Unable to determine"})
110 | 
def smoking_behaviour(genome):
    """Smoking behaviour.

    Runs the assert_european check first, then maps the unphased genotype
    at rs1051730 to a description.
    """
    assert_european(genome)
    phenotypes = {
        "AA": "Likely to smoke more than average",
        "AG": "Likely to smoke a little bit more than average",
        "GG": "Likely to smoke typical amount of cigarettes per day",
        None: "Unable to determine",
    }
    snp = genome["rs1051730"]
    return unphased_match(snp, phenotypes)
119 | 
def red_hair(genome):
    """Hair color; odds for red hair.

    Maps the unphased genotype at rs1805007 to a description of the odds.
    """
    phenotypes = {
        "CC": "Typical odds for red hair",
        "CT": "Substantially increased odds for red hair",
        "TT": "Greatly increased odds for red hair",
        None: "Unable to determine",
    }
    snp = genome["rs1805007"]
    return unphased_match(snp, phenotypes)
127 | 
def blond_vs_brown_hair(genome):
    """Hair color; blond versus brown.

    Maps the unphased genotype at rs1667394 to a description of the odds.
    """
    phenotypes = {
        "CC": "Greatly decreased odds of having blond hair vs. brown",
        "CT": "Decreased odds of having blond hair vs. brown",
        "TT": "Typical odds of having blond hair vs. brown hair",
        None: "Unable to determine",
    }
    snp = genome["rs1667394"]
    return unphased_match(snp, phenotypes)
135 | 
def pain_sensitivity(genome):
    """Pain sensitivity.

    Maps the unphased genotype at rs6269 to a description.
    """
    phenotypes = {
        "AA": "Increased sensitivity to pain",
        "AG": "Typical sensitivity to pain",
        "GG": "Less sensitive to pain",
        None: "Unable to determine",
    }
    snp = genome["rs6269"]
    return unphased_match(snp, phenotypes)
143 | 
def caffeine_metabolism(genome):
    """Caffeine metabolism.

    Runs the assert_european check first, then maps the unphased genotype
    at rs762551 to a description; only AA is reported as a fast metabolizer.
    """
    assert_european(genome)
    phenotypes = {
        "AA": "Fast metabolizer",
        "AC": "Slow metabolizer",
        "CC": "Slow metabolizer",
        None: "Unable to determine",
    }
    snp = genome["rs762551"]
    return unphased_match(snp, phenotypes)
152 | 
def heroin_addiction(genome):
    """Heroin addiction.

    Runs the assert_european check first, then maps the unphased genotype
    at rs1799971 to a description of the odds.
    """
    assert_european(genome)
    phenotypes = {
        "AA": "Typical odds of addiction",
        "AG": "Higher odds of addiction",
        "GG": "Higher odds of addiction",
        None: "Unable to determine",
    }
    snp = genome["rs1799971"]
    return unphased_match(snp, phenotypes)
161 | 
def hair_curl(genome):
    """Hair curl.

    Runs the assert_european check first, then maps the unphased genotype
    at rs17646946 to a description.

    The first sentence of this docstring matches the title that would
    otherwise be derived from the function name ("Hair curl").

    NOTE(review): unlike most sibling traits, the table has no None entry;
    what unphased_match does for an unmatched genotype is not visible here.
    """
    assert_european(genome)
    return unphased_match(genome["rs17646946"], {
        "AA": "Straighter hair on average",
        "AG": "Straighter hair on average",
        "GG": "Slightly curlier hair on average"})
168 | 
def hiv_aids_resistance(genome):
    """Resistance to HIV/AIDS.

    Maps the unphased genotype at i3003626 (D/I alleles) to a description.

    NOTE(review): the table has no None entry; what unphased_match does for
    an unmatched genotype is not visible here.
    """
    phenotypes = {
        "DD": "Some resistance to most common strain of HIV",
        "DI": "Not resistant, but may have slower disease progression",
        "II": "Not resistant",
    }
    snp = genome["i3003626"]
    return unphased_match(snp, phenotypes)
175 | 
def aspargus_detection(genome):
    """Aspargus metabolite detection.

    Runs the assert_european check first, then maps the unphased genotype
    at rs4481887 to a description of the odds.
    """
    assert_european(genome)
    phenotypes = {
        "AA": "Higher odds of smelling aspargus in urine",
        "AG": "Medium odds of smelling aspargus in urine",
        "GG": "Typical odds of smelling aspargus in urine",
        None: "Unable to determine",
    }
    snp = genome["rs4481887"]
    return unphased_match(snp, phenotypes)
184 | 
def adiponectin_levels(genome):
    """Adiponectin levels.

    The markers used depend on genome.ethnicity:

    * "asian": rs1851665 and rs7193788 are matched separately and the two
      descriptions are joined into one two-line string.
    * None or "european": rs6444175 alone is matched.
    * anything else: returns None.
    """
    ethnicity = genome.ethnicity

    if ethnicity == "asian":
        first = unphased_match(genome["rs1851665"], {
            "AA": "Slightly lower, which may be bad (rs1851665)\n",
            "AG": "Typical (rs1851665)\n",
            "GG": "Slightly higher, which is good (rs1851665)\n",
            None: "Unable to determine for rs1851665\n"})
        second = unphased_match(genome["rs7193788"], {
            "AA": "Slightly higher, which is good (rs7193788)",
            "AG": "Typical (rs7193788)",
            "GG": "Slightly lower, which may be bad (rs7193788)",
            None: "Unable to determine for rs7193788"})
        return first + second

    if ethnicity in (None, "european"):
        return unphased_match(genome["rs6444175"], {
            "AA": "Lower, which may be bad",
            "AG": "Slightly lower, which may be bad",
            "GG": "Typical levels",
            None: "Unable to determine"})

    return None
207 | 
def biological_age(genome):
    """Biological aging (telomere lengths).

    Runs the assert_european check, looks up a per-SNP offset in years for
    six markers, and returns a two-line message with the smallest and
    largest offset followed by their sum.
    """
    assert_european(genome)

    # Offset in years per genotype; None (no match) contributes nothing.
    offsets_by_rsid = {
        "rs10936599": {"TT": 7.82, "CT":  3.91, "CC":  0,    None: 0},
        "rs2736100":  {"AA": 3.14, "AC": 0,     "CC": -3.14, None: 0},
        "rs9420907":  {"AA": 0,    "AC": -2.76, "CC": -5.52, None: 0},
        "rs755017":   {"AA": 0,    "AG": -2.47, "GG": -4.94, None: 0},
        "rs11100479": {"CC": 5.98, "CT": -2.99, "TT":  0,    None: 0},
        "rs10165485": {"TT": 0,    "CT": -2.23, "CC": -4.46, None: 0},
    }

    offsets = []
    for rsid, table in offsets_by_rsid.items():
        offsets.append(unphased_match(genome[rsid], table))

    def direction(years):
        # Zero counts as "younger", as does any negative offset.
        if years > 0:
            return "older"
        if years <= 0:
            return "younger"

    lowest = min(offsets)
    highest = max(offsets)
    total = sum(offsets)

    first_line = "From %.1f years %s to %.1f years %s than actual age\n" % (
            abs(lowest), direction(lowest), abs(highest), direction(highest))
    second_line = "The sum is %.1f years %s, compared to actual age" % (
            abs(total), direction(total))
    return first_line + second_line
234 | 
def birth_weight(genome):
    """Birth weight.

    Runs the assert_european check, looks up a per-SNP offset in grams for
    rs7903146 and rs1799884, and reports the smallest offset, the largest
    offset and their sum.
    """
    assert_european(genome)

    # Offset in grams per genotype; None (no match) contributes nothing.
    offsets_by_rsid = {
        "rs7903146": {"TT":   0, "CT": -30, "CC": -60, None: 0},
        "rs1799884": {"TT": +54, "CT": +27, "CC":   0, None: 0},
    }

    offsets = []
    for rsid, table in offsets_by_rsid.items():
        offsets.append(unphased_match(genome[rsid], table))

    summary = (min(offsets), max(offsets), sum(offsets))
    return "From %.1fg to %.1fg (sum: %.1fg) compared to typical weight" % summary
245 | 
def blood_glucose(genome):
    """Blood glucose.

    Runs the assert_european check first, then maps the unphased genotype
    at rs560887 to an average fasting plasma glucose level.
    """
    assert_european(genome)
    phenotypes = {
        "CC": "Average fasting plasma glucose levels of 5.18mmol/L",
        "CT": "Average fasting plasma glucose levels of 5.12mmol/L",
        "TT": "Average fasting plasma glucose levels of 5.06mmol/L",
        None: "Unable to determine",
    }
    snp = genome["rs560887"]
    return unphased_match(snp, phenotypes)
254 | 
def traits_report(genome):
    """Infer traits from genome.

    Passes every trait function listed below, in this order, to
    make_report together with the genome and returns its result.
    """
    trait_functions = [
        adiponectin_levels,
        alcohol_flush_reaction,
        aspargus_detection,
        biological_age,
        birth_weight,
        bitter_taste,
        blond_vs_brown_hair,
        blood_glucose,
        breastfeeding_iq,
        caffeine_metabolism,
        earwax_type,
        eye_color,
        hair_curl,
        heroin_addiction,
        hiv_aids_resistance,
        lactose_intolerance,
        malaria_resistance,
        male_pattern_baldness,
        muscle_performance,
        norovirus_resistance,
        pain_sensitivity,
        red_hair,
        smoking_behaviour,
    ]
    return make_report(genome, trait_functions)
282 | 


--------------------------------------------------------------------------------
/arv/util.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Part of arv
 3 | Copyright 2017 Christian Stigen Larsen
 4 | 
 5 | dna-traits
 6 | Copyright 2014, 2016, 2017 Christian Stigen Larsen
 7 | 
 8 | Distributed under the GPL v3 or later. See COPYING.
 9 | """
10 | 
def make_report(genome, functions, verbose=False):
    """Runs each function with genome as argument, returning a dict of
    results.

    The report key for a function is its docstring up to (not including)
    the first period. If the docstring has no period, its first non-empty
    line is used. If there is no usable docstring, the key is derived from
    the function name ("hair_curl" -> "Hair curl").

    Args:
        genome: Object handed unchanged to every function.
        functions: Iterable of callables taking the genome as sole argument.
        verbose: If true, ValueError and AssertionError raised by a function
            are recorded as "Error: <message>" instead of being dropped.

    Returns:
        Dict mapping title to result. Functions that return None, or raise
        KeyError or NotImplementedError, are left out.
    """
    report = {}

    for func in functions:
        title = ""
        doc = func.__doc__
        if doc:
            # partition() never raises; index() used to abort the whole
            # report with ValueError on a docstring without a period.
            head, period, _ = doc.partition(".")
            if period:
                title = head
            else:
                lines = doc.strip().splitlines()
                title = lines[0].strip() if lines else ""
        if not title:
            title = func.__name__.replace("_", " ").capitalize()

        try:
            result = func(genome)
        except (ValueError, AssertionError) as e:
            if verbose:
                report[title] = "Error: %s" % e
            continue
        except (KeyError, NotImplementedError):
            # Missing SNPs and unimplemented traits are skipped silently.
            continue

        if result is not None:
            report[title] = result

    return report
38 | 


--------------------------------------------------------------------------------
/cpp/.gitignore:
--------------------------------------------------------------------------------
1 | arv.c
2 | 


--------------------------------------------------------------------------------
/cpp/arv.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * dna-traits
  3 |  * Copyright 2014, 2016, 2017 Christian Stigen Larsen
  4 |  * Distributed under the GNU GPL v3 or later. See COPYING.
  5 |  *
  6 |  * arv
  7 |  * Copyright 2017 Christian Stigen Larsen
  8 |  * Distributed under the GNU GPL v3 or later. See COPYING.
  9 |  */
 10 | 
 11 | #include <google/dense_hash_map>
 12 | 
 13 | #include "arv.hpp"
 14 | 
 15 | namespace {
 16 | 
 17 | struct RSIDHash {
 18 |   inline std::size_t operator() (const arv::RSID& rsid) const
 19 |   {
 20 |     return static_cast<std::size_t>(rsid);
 21 |   }
 22 | };
 23 | 
 24 | struct RSIDEq {
 25 |   inline bool operator()(const arv::RSID& a, const arv::RSID& b) const
 26 |   {
 27 |     return a == b;
 28 |   }
 29 | };
 30 | 
 31 | typedef google::dense_hash_map<arv::RSID, arv::SNP, RSIDHash, RSIDEq> SNPMap;
 32 | 
 33 | } // anonymous namespace
 34 | 
 35 | namespace arv {
 36 | 
 37 | const SNP NONE_SNP(CHR_NO, 0, Genotype(NONE, NONE));
 38 | 
 39 | static char nucleotide_char(const Nucleotide& n)
 40 | {
 41 |   switch ( n ) {
 42 |     case A:    return 'A';
 43 |     case C:    return 'C';
 44 |     case D:    return 'D';
 45 |     case G:    return 'G';
 46 |     case I:    return 'I';
 47 |     case NONE: return '-';
 48 |     case T:    return 'T';
 49 |   }
 50 |   return '-';
 51 | }
 52 | 
 53 | Nucleotide complement(const Nucleotide& n)
 54 | {
 55 |   switch ( n ) {
 56 |     case A:    return T;
 57 |     case C:    return G;
 58 |     case D:    return D;
 59 |     case G:    return C;
 60 |     case I:    return I;
 61 |     case NONE: return NONE;
 62 |     case T:    return A;
 63 |   }
 64 |   return NONE;
 65 | }
 66 | 
 67 | Genotype::Genotype() : first(NONE), second(NONE)
 68 | {
 69 | }
 70 | 
 71 | Genotype::Genotype(const Nucleotide& a, const Nucleotide& b)
 72 |   : first(a), second(b)
 73 | {
 74 | }
 75 | 
 76 | Genotype operator~(const Genotype& g)
 77 | {
 78 |   return Genotype(complement(g.first),
 79 |                   complement(g.second));
 80 | }
 81 | 
 82 | Genotype complement(const Genotype& g)
 83 | {
 84 |   return ~g;
 85 | }
 86 | 
 87 | bool Genotype::operator==(const Genotype& g) const
 88 | {
 89 |   return first == g.first && second == g.second;
 90 | }
 91 | 
 92 | bool Genotype::operator<(const Genotype& g) const
 93 | {
 94 |   if ( first < g.first )
 95 |     return true;
 96 | 
 97 |   if ( first > g.first )
 98 |     return false;
 99 | 
100 |   return second < g.second;
101 | }
102 | 
103 | std::string Genotype::to_string() const
104 | {
105 |   char s[3] = {0};
106 | 
107 |   s[0] = nucleotide_char(first);
108 |   s[1] = nucleotide_char(second);
109 | 
110 |   if ( s[0] != '-' && s[1] == '-' )
111 |     s[1] = '\0';
112 | 
113 |   return std::string(s);
114 | }
115 | 
116 | SNP::SNP() :
117 |   chromosome(CHR_NO),
118 |   position(0),
119 |   genotype(NONE, NONE)
120 | {
121 | }
122 | 
123 | SNP::SNP(const Chromosome& chr, const Position& pos, const Genotype& gt) :
124 |   chromosome(chr),
125 |   position(pos),
126 |   genotype(gt)
127 | {
128 | }
129 | 
130 | SNP::SNP(const SNP& snp) :
131 |   chromosome(snp.chromosome),
132 |   position(snp.position),
133 |   genotype(snp.genotype)
134 | {
135 | }
136 | 
137 | SNP& SNP::operator=(const SNP& snp) {
138 |   if ( this != &snp ) {
139 |     genotype = snp.genotype;
140 |     chromosome = snp.chromosome;
141 |     position = snp.position;
142 |   }
143 |   return *this;
144 | }
145 | 
146 | bool SNP::operator==(const SNP& snp) const
147 | {
148 |   return position == snp.position &&
149 |          chromosome == snp.chromosome &&
150 |          genotype == snp.genotype;
151 | }
152 | 
153 | bool SNP::operator<(const SNP& snp) const
154 | {
155 |   if ( position > snp.position )
156 |     return false;
157 |   if ( position < snp.position )
158 |     return true;
159 | 
160 |   // equal position
161 |   if ( chromosome > snp.chromosome )
162 |     return false;
163 |   if ( chromosome < snp.chromosome )
164 |     return true;
165 | 
166 |   // equal chromosome
167 |   return genotype < snp.genotype;
168 | }
169 | 
170 | bool SNP::operator>(const SNP& snp) const
171 | {
172 |   return !(*this <= snp);
173 | }
174 | 
175 | bool SNP::operator<=(const SNP& snp) const
176 | {
177 |   return *this == snp || *this < snp;
178 | }
179 | 
180 | bool SNP::operator>=(const SNP& snp) const
181 | {
182 |   return *this == snp || *this > snp;
183 | }
184 | 
185 | bool SNP::operator!=(const SNP& snp) const
186 | {
187 |   return !(*this == snp);
188 | }
189 | 
190 | bool SNP::operator==(const Genotype& g) const
191 | {
192 |   return genotype == g;
193 | }
194 | 
195 | struct GenomeIteratorImpl {
196 |   SNPMap::const_iterator it;
197 | 
198 |   GenomeIteratorImpl(SNPMap::const_iterator& i):
199 |     it(i)
200 |   {
201 |   }
202 | };
203 | 
204 | GenomeIterator::GenomeIterator():
205 |   pimpl(NULL)
206 | {
207 | }
208 | 
209 | GenomeIterator::GenomeIterator(GenomeIteratorImpl* p):
210 |   pimpl(p)
211 | {
212 | }
213 | 
214 | GenomeIterator::~GenomeIterator()
215 | {
216 |   delete pimpl;
217 | }
218 | 
219 | GenomeIterator::GenomeIterator(const GenomeIterator& o):
220 |   pimpl(new GenomeIteratorImpl(o.pimpl->it))
221 | {
222 | }
223 | 
224 | GenomeIterator& GenomeIterator::operator=(const GenomeIterator& o)
225 | {
226 |   if ( pimpl != o.pimpl ) {
227 |     delete pimpl;
228 |     pimpl = new GenomeIteratorImpl(o.pimpl->it);
229 |   }
230 |   return *this;
231 | }
232 | 
233 | void GenomeIterator::next()
234 | {
235 |   ++pimpl->it;
236 | }
237 | 
238 | RsidSNP GenomeIterator::value() const
239 | {
240 |   return *pimpl->it;
241 | }
242 | 
243 | bool GenomeIterator::operator==(const GenomeIterator& o) const
244 | {
245 |   return pimpl->it == o.pimpl->it;
246 | }
247 | 
248 | bool GenomeIterator::operator!=(const GenomeIterator& o) const
249 | {
250 |   return pimpl->it != o.pimpl->it;
251 | }
252 | 
253 | struct Genome::GenomeImpl {
254 |   SNPMap snps;
255 | 
256 |   GenomeImpl(const std::size_t size) :
257 |     snps(size)
258 |   {
259 |     snps.set_empty_key(0);
260 |   }
261 | 
262 |   GenomeImpl(const GenomeImpl& g) :
263 |     snps(g.snps)
264 |   {
265 |     snps.set_empty_key(0);
266 |   }
267 | 
268 |   GenomeImpl& operator=(const GenomeImpl& g)
269 |   {
270 |     if ( this != &g )
271 |       snps = g.snps;
272 | 
273 |     return *this;
274 |   }
275 | 
276 |   bool contains(const RSID& rsid) const {
277 |     return snps.find(rsid) != snps.end();
278 |   }
279 | 
280 |   const SNP& operator[](const RSID& rsid) const {
281 |     return !contains(rsid)? NONE_SNP : const_cast<SNPMap&>(snps)[rsid];
282 |   }
283 | 
284 |   std::string genotype(const RSID& id) const {
285 |     const SNP& snp = operator[](id);
286 |     return snp.genotype.to_string();
287 |   }
288 | };
289 | 
290 | Genome::Genome():
291 |   y_chromosome(false),
292 |   pimpl(new GenomeImpl(0))
293 | {
294 | }
295 | 
296 | Genome::Genome(const std::size_t size):
297 |   y_chromosome(false),
298 |   pimpl(new GenomeImpl(size))
299 | {
300 | }
301 | 
302 | Genome::Genome(const Genome& g) :
303 |   y_chromosome(g.y_chromosome),
304 |   pimpl(new GenomeImpl(*g.pimpl))
305 | {
306 | }
307 | 
308 | Genome& Genome::operator=(const Genome& g)
309 | {
310 |   if ( this != &g ) {
311 |     *pimpl = *g.pimpl;
312 |     y_chromosome = g.y_chromosome;
313 |   }
314 |   return *this;
315 | }
316 | 
317 | Genome::~Genome()
318 | {
319 |   delete pimpl;
320 | }
321 | 
322 | const SNP& Genome::operator[](const RSID& rsid) const
323 | {
324 |   return (*pimpl)[rsid];
325 | }
326 | 
327 | bool Genome::has(const RSID& rsid) const
328 | {
329 |   return pimpl->contains(rsid);
330 | }
331 | 
332 | std::size_t Genome::size() const
333 | {
334 |   return pimpl->snps.size();
335 | }
336 | 
337 | double Genome::load_factor() const
338 | {
339 |   return pimpl->snps.load_factor();
340 | }
341 | 
342 | void Genome::insert(const RsidSNP& obj)
343 | {
344 |   pimpl->snps.insert(obj);
345 | }
346 | 
347 | bool Genome::operator==(const Genome& o) const
348 | {
349 |   // cheap tests first
350 |   if ( !(y_chromosome == o.y_chromosome && size() == o.size() ) )
351 |     return false;
352 |   else
353 |     return o.pimpl->snps == pimpl->snps;
354 | }
355 | 
356 | bool Genome::operator!=(const Genome& o) const
357 | {
358 |   return !(*this == o);
359 | }
360 | 
361 | GenomeIterator Genome::begin() const
362 | {
363 |   auto i = const_cast<const SNPMap&>(pimpl->snps).begin();
364 |   auto p = new GenomeIteratorImpl(i);
365 |   return GenomeIterator(p);
366 | }
367 | 
368 | GenomeIterator Genome::end() const
369 | {
370 |   auto i = const_cast<const SNPMap&>(pimpl->snps).end();
371 |   auto p = new GenomeIteratorImpl(i);
372 |   return GenomeIterator(p);
373 | }
374 | 
375 | } // namespace arv
376 | 


--------------------------------------------------------------------------------
/cpp/arv.hpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * dna-traits
  3 |  * Copyright 2014, 2016 Christian Stigen Larsen
  4 |  * Distributed under the GNU GPL v3 or later. See COPYING.
  5 |  *
  6 |  * arv
  7 |  * Copyright 2017 Christian Stigen Larsen
  8 |  * Distributed under the GNU GPL v3 or later. See COPYING.
  9 |  */
 10 | 
 11 | #ifndef ARV_ARV_HPP
 12 | #define ARV_ARV_HPP
 13 | 
 14 | #include <cstddef>
 15 | #include <cstdint>
 16 | #include <string>
 17 | #include <vector>
 18 | 
 19 | namespace arv {
 20 | 
 21 | typedef std::uint32_t Position;
 22 | typedef std::int32_t RSID;
 23 | 
/*!
 * Nucleotide codes. NONE is the value used by default-constructed genotypes.
 * D and I presumably denote deletion and insertion -- TODO confirm against
 * the parser. Genotype stores each value in a 3-bit field.
 */
enum Nucleotide {
  NONE, A, G, C, T, D, I
};
 27 | 
/*!
 * Chromosome identifiers. CHR_NO is the value used by default-constructed
 * SNPs. SNP stores the value in a 5-bit field.
 */
enum Chromosome {
  CHR_NO =  0,
  CHR_01 =  1,
  CHR_02 =  2,
  CHR_03 =  3,
  CHR_04 =  4,
  CHR_05 =  5,
  CHR_06 =  6,
  CHR_07 =  7,
  CHR_08 =  8,
  CHR_09 =  9,
  CHR_10 = 10,
  CHR_11 = 11,
  CHR_12 = 12,
  CHR_13 = 13,
  CHR_14 = 14,
  CHR_15 = 15,
  CHR_16 = 16,
  CHR_17 = 17,
  CHR_18 = 18,
  CHR_19 = 19,
  CHR_20 = 20,
  CHR_21 = 21,
  CHR_22 = 22,
  CHR_X  = 23,
  CHR_Y  = 24,
  CHR_MT = 25 // Mitochondrial DNA
};
 56 | 
 57 | // We can get this down to a byte if we want to
 58 | #pragma pack(1)
 59 | struct Genotype {
 60 |   Nucleotide first  : 3;
 61 |   Nucleotide second : 3;
 62 | 
 63 |   Genotype();
 64 |   Genotype(const Nucleotide& a, const Nucleotide& b);
 65 | 
 66 |   friend Genotype operator~(const Genotype&);
 67 |   bool operator==(const Genotype& g) const;
 68 |   bool operator<(const Genotype& g) const;
 69 | 
 70 |   std::string to_string() const;
 71 | };
 72 | 
 73 | #pragma pack(1)
 74 | struct SNP {
 75 |   Chromosome chromosome : 5;
 76 |   Position position;
 77 |   Genotype genotype;
 78 | 
 79 |   SNP();
 80 |   SNP(const Chromosome&, const Position&, const Genotype&);
 81 |   SNP(const SNP&);
 82 |   SNP& operator=(const SNP&);
 83 | 
 84 |   // Comparisons are based on the tuple (position, chromosome, genotype)
 85 |   bool operator!=(const SNP&) const;
 86 |   bool operator<(const SNP&) const;
 87 |   bool operator<=(const SNP&) const;
 88 |   bool operator==(const Genotype&) const;
 89 |   bool operator==(const SNP&) const;
 90 |   bool operator>(const SNP&) const;
 91 |   bool operator>=(const SNP&) const;
 92 | };
 93 | 
 94 | extern const SNP NONE_SNP;
 95 | 
 96 | struct GenomeIteratorImpl;
 97 | 
 98 | typedef std::pair<RSID, SNP> RsidSNP;
 99 | 
/*!
 * Iterator over the (RSID, SNP) pairs of a Genome. The hash-table iterator
 * it wraps is hidden behind an opaque implementation pointer.
 */
struct GenomeIterator {
  /*! Creates an iterator without state (null implementation pointer). */
  GenomeIterator();
  /*! Copies the wrapped iterator into a new implementation object. */
  GenomeIterator(const GenomeIterator&);
  /*! Takes ownership of the pointer; it is deleted by the destructor. */
  GenomeIterator(GenomeIteratorImpl*);
  GenomeIterator& operator=(const GenomeIterator&);
  ~GenomeIterator();

  /*! Compare the wrapped hash-table iterators. */
  bool operator==(const GenomeIterator&) const;
  bool operator!=(const GenomeIterator&) const;

  /*! Advances to the next entry. */
  void next();
  /*! Returns a copy of the current (RSID, SNP) pair. */
  RsidSNP value() const;

private:
  GenomeIteratorImpl* pimpl;
};
116 | 
/*!
 * A set of SNPs keyed by RSID, stored in a hash table that is hidden behind
 * an opaque implementation pointer.
 */
struct Genome {
  /*!
   * True if genome contains a Y-chromosome (with non-empty genotypes).
   */
  bool y_chromosome;

  /*! Creates an empty genome. */
  Genome();
  /*! Creates an empty genome; size is passed to the hash table constructor. */
  Genome(const std::size_t size);
  /*! Deep copy: the new genome gets its own copy of the hash table. */
  Genome(const Genome&);
  Genome& operator=(const Genome&);
  ~Genome();

  /*!
   * Access SNP. Does not throw on a missing RSID: returns a reference to
   * NONE_SNP instead. Use has() to tell a missing SNP from a stored one.
   */
  const SNP& operator[](const RSID& id) const;

  /*!
   * Checks if hash table contains given RSID.
   */
  bool has(const RSID& id) const;

  /*!
   * Add a SNP to the hash table.
   */
  void insert(const RsidSNP&);

  /*!
   * Underlying hash table's load factor. (For developer purposes)
   */
  double load_factor() const;

  /*!
   * Number of SNPs.
   */
  std::size_t size() const;

  /*!
   * Genomes are equal when the Y-chromosome flag and the stored SNPs match.
   */
  bool operator==(const Genome&) const;
  bool operator!=(const Genome&) const;

  /*! Iterators over the stored (RSID, SNP) pairs. */
  GenomeIterator begin() const;
  GenomeIterator end() const;

private:
  struct GenomeImpl;
  GenomeImpl* pimpl;
};
164 | 
/*!
 * Complement of a single nucleotide: A<->T and C<->G. D, I and NONE map to
 * themselves.
 */
Nucleotide complement(const Nucleotide& n);

/*!
 * Parse a 23andMe genome text file and put contents into genome.
 */
void parse_file(const std::string& filename, Genome&);

/*!
 * Complement of both nucleotides of a genotype (same result as operator~).
 */
Genotype complement(const Genotype& g);
173 | 
174 | } // namespace arv
175 | 
176 | #endif // include guard
177 | 


--------------------------------------------------------------------------------
/cpp/export.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef ARV_EXPORT_HPP
 2 | #define ARV_EXPORT_HPP
 3 | 
// Symbol visibility macros, taken from https://gcc.gnu.org/wiki/Visibility
//
// DLL_PUBLIC marks a symbol as exported (or imported, on Windows/Cygwin
// when BUILDING_DLL is not defined); DLL_LOCAL marks it as hidden. Both
// expand to nothing where the compiler has no matching attribute.
#if defined _WIN32 || defined __CYGWIN__
  #ifdef BUILDING_DLL
    #ifdef __GNUC__
      #define DLL_PUBLIC __attribute__ ((dllexport))
    #else
      #define DLL_PUBLIC __declspec(dllexport) // Note: GCC appears to accept this syntax as well.
    #endif
  #else
    #ifdef __GNUC__
      #define DLL_PUBLIC __attribute__ ((dllimport))
    #else
      #define DLL_PUBLIC __declspec(dllimport) // Note: GCC appears to accept this syntax as well.
    #endif
  #endif
  #define DLL_LOCAL
#else
  #if __GNUC__ >= 4
    #define DLL_PUBLIC __attribute__ ((visibility ("default")))
    #define DLL_LOCAL  __attribute__ ((visibility ("hidden")))
  #else
    #define DLL_PUBLIC
    #define DLL_LOCAL
  #endif
#endif
29 | 
30 | #endif
31 | 


--------------------------------------------------------------------------------
/cpp/file.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * dna-traits
 3 |  * Copyright 2014, 2016 Christian Stigen Larsen
 4 |  * Distributed under the GNU GPL v3 or later. See COPYING.
 5 |  *
 6 |  * arv
 7 |  * Copyright 2017 Christian Stigen Larsen
 8 |  * Distributed under the GNU GPL v3 or later. See COPYING.
 9 |  */
10 | 
11 | #include "file.hpp"
12 | 
13 | #include <unistd.h>
14 | #include <fcntl.h>
15 | #include <stdexcept>
16 | #include <string>
17 | 
18 | namespace arv {
19 | 
20 | File::File(const char* filename, const int flags):
21 |   fd(open(filename, flags, S_IRUSR))
22 | {
23 |   if ( fd < 0 ) {
24 |     std::string msg = "Could not open ";
25 |     throw std::runtime_error((msg + filename).c_str());
26 |   }
27 | }
28 | 
29 | File::~File() {
30 |   close(fd);
31 | }
32 | 
33 | } // ns arv
34 | 


--------------------------------------------------------------------------------
/cpp/file.hpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * dna-traits
 3 |  * Copyright 2014, 2016 Christian Stigen Larsen
 4 |  * Distributed under the GPL v3 or later. See COPYING.
 5 |  *
 6 |  * arv
 7 |  * Copyright 2017 Christian Stigen Larsen
 8 |  * Distributed under the GNU GPL v3 or later. See COPYING.
 9 |  */
10 | 
11 | #ifndef ARV_FILE_HPP
12 | #define ARV_FILE_HPP
13 | 
14 | namespace arv {
15 | 
// Owns a file descriptor: opened in the constructor, closed in the
// destructor. Converts implicitly to int so it can be handed to functions
// that take a raw descriptor.
class File {
  int fd;

  // Non-copyable: a copy would hold the same descriptor and close it a
  // second time when destroyed.
  File(const File&) = delete;
  File& operator=(const File&) = delete;
public:
  // Opens filename with the given open(2) flags.
  File(const char* filename, const int flags);
  ~File();

  // The underlying file descriptor.
  inline operator int() const {
    return fd;
  }
};
26 | 
27 | } // ns arv
28 | 
29 | #endif // guard
30 | 


--------------------------------------------------------------------------------
/cpp/filesize.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (C) 2014, 2016 Christian Stigen Larsen
 3 |  * Distributed under the GPL v3 or later. See COPYING.
 4 |  */
 5 | 
 6 | #include "filesize.hpp"
 7 | 
 8 | #include <stdexcept>
 9 | #include <string.h>
10 | #include <sys/stat.h>
11 | 
12 | namespace arv {
13 | 
// Size in bytes of the file behind file_descriptor, as reported by fstat().
// Throws std::runtime_error if fstat() fails; a negative reported size is
// returned as 0.
std::size_t filesize(const int file_descriptor)
{
  struct stat info;
  memset(&info, 0, sizeof(info));

  const int status = fstat(file_descriptor, &info);
  if ( status < 0 )
    throw std::runtime_error("Could not stat file");

  if ( info.st_size < 0 )
    return 0;

  return static_cast<std::size_t>(info.st_size);
}
25 | 
26 | }
27 | 


--------------------------------------------------------------------------------
/cpp/filesize.hpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * dna-traits
 3 |  * Copyright 2014, 2016 Christian Stigen Larsen
 4 |  * Distributed under the GNU GPL v3 or later. See COPYING.
 5 |  *
 6 |  * arv
 7 |  * Copyright 2017 Christian Stigen Larsen
 8 |  * Distributed under the GNU GPL V3 or later. See COPYING.
 9 |  */
10 | 
11 | #ifndef ARV_FILESIZE_HPP
12 | #define ARV_FILESIZE_HPP
13 | 
14 | #include <cstddef>
15 | 
16 | namespace arv {
17 | 
18 | std::size_t filesize(const int file_descriptor);
19 | 
20 | } // namespace arv
21 | 
22 | #endif // guard
23 | 


--------------------------------------------------------------------------------
/cpp/google/dense_hash_map:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2012, Google Inc.
 2 | // All rights reserved.
 3 | //
 4 | // Redistribution and use in source and binary forms, with or without
 5 | // modification, are permitted provided that the following conditions are
 6 | // met:
 7 | //
 8 | //     * Redistributions of source code must retain the above copyright
 9 | // notice, this list of conditions and the following disclaimer.
10 | //     * Redistributions in binary form must reproduce the above
11 | // copyright notice, this list of conditions and the following disclaimer
12 | // in the documentation and/or other materials provided with the
13 | // distribution.
14 | //     * Neither the name of Google Inc. nor the names of its
15 | // contributors may be used to endorse or promote products derived from
16 | // this software without specific prior written permission.
17 | //
18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 
30 | // Header files have moved from the google directory to the sparsehash
31 | // directory.  This forwarding file is provided only for backwards
32 | // compatibility.  Use <sparsehash/*> in all new code.
33 | 
34 | #include <sparsehash/dense_hash_map>
35 | 


--------------------------------------------------------------------------------
/cpp/google/dense_hash_set:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2012, Google Inc.
 2 | // All rights reserved.
 3 | //
 4 | // Redistribution and use in source and binary forms, with or without
 5 | // modification, are permitted provided that the following conditions are
 6 | // met:
 7 | //
 8 | //     * Redistributions of source code must retain the above copyright
 9 | // notice, this list of conditions and the following disclaimer.
10 | //     * Redistributions in binary form must reproduce the above
11 | // copyright notice, this list of conditions and the following disclaimer
12 | // in the documentation and/or other materials provided with the
13 | // distribution.
14 | //     * Neither the name of Google Inc. nor the names of its
15 | // contributors may be used to endorse or promote products derived from
16 | // this software without specific prior written permission.
17 | //
18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 
30 | // Header files have moved from the google directory to the sparsehash
31 | // directory.  This forwarding file is provided only for backwards
32 | // compatibility.  Use <sparsehash/*> in all new code.
33 | 
34 | #include <sparsehash/dense_hash_set>
35 | 


--------------------------------------------------------------------------------
/cpp/google/sparse_hash_map:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2012, Google Inc.
 2 | // All rights reserved.
 3 | //
 4 | // Redistribution and use in source and binary forms, with or without
 5 | // modification, are permitted provided that the following conditions are
 6 | // met:
 7 | //
 8 | //     * Redistributions of source code must retain the above copyright
 9 | // notice, this list of conditions and the following disclaimer.
10 | //     * Redistributions in binary form must reproduce the above
11 | // copyright notice, this list of conditions and the following disclaimer
12 | // in the documentation and/or other materials provided with the
13 | // distribution.
14 | //     * Neither the name of Google Inc. nor the names of its
15 | // contributors may be used to endorse or promote products derived from
16 | // this software without specific prior written permission.
17 | //
18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 
30 | // Header files have moved from the google directory to the sparsehash
31 | // directory.  This forwarding file is provided only for backwards
32 | // compatibility.  Use <sparsehash/*> in all new code.
33 | 
34 | #include <sparsehash/sparse_hash_map>
35 | 


--------------------------------------------------------------------------------
/cpp/google/sparse_hash_set:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2012, Google Inc.
 2 | // All rights reserved.
 3 | //
 4 | // Redistribution and use in source and binary forms, with or without
 5 | // modification, are permitted provided that the following conditions are
 6 | // met:
 7 | //
 8 | //     * Redistributions of source code must retain the above copyright
 9 | // notice, this list of conditions and the following disclaimer.
10 | //     * Redistributions in binary form must reproduce the above
11 | // copyright notice, this list of conditions and the following disclaimer
12 | // in the documentation and/or other materials provided with the
13 | // distribution.
14 | //     * Neither the name of Google Inc. nor the names of its
15 | // contributors may be used to endorse or promote products derived from
16 | // this software without specific prior written permission.
17 | //
18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 
30 | // Header files have moved from the google directory to the sparsehash
31 | // directory.  This forwarding file is provided only for backwards
32 | // compatibility.  Use <sparsehash/*> in all new code.
33 | 
34 | #include <sparsehash/sparse_hash_set>
35 | 


--------------------------------------------------------------------------------
/cpp/google/sparsehash/densehashtable.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2012, Google Inc.
 2 | // All rights reserved.
 3 | //
 4 | // Redistribution and use in source and binary forms, with or without
 5 | // modification, are permitted provided that the following conditions are
 6 | // met:
 7 | //
 8 | //     * Redistributions of source code must retain the above copyright
 9 | // notice, this list of conditions and the following disclaimer.
10 | //     * Redistributions in binary form must reproduce the above
11 | // copyright notice, this list of conditions and the following disclaimer
12 | // in the documentation and/or other materials provided with the
13 | // distribution.
14 | //     * Neither the name of Google Inc. nor the names of its
15 | // contributors may be used to endorse or promote products derived from
16 | // this software without specific prior written permission.
17 | //
18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 
30 | // Header files have moved from the google directory to the sparsehash
31 | // directory.  This forwarding file is provided only for backwards
32 | // compatibility.  Use <sparsehash/*> in all new code.
33 | 
34 | #include <sparsehash/internal/densehashtable.h>
35 | 


--------------------------------------------------------------------------------
/cpp/google/sparsehash/hashtable-common.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2012, Google Inc.
 2 | // All rights reserved.
 3 | //
 4 | // Redistribution and use in source and binary forms, with or without
 5 | // modification, are permitted provided that the following conditions are
 6 | // met:
 7 | //
 8 | //     * Redistributions of source code must retain the above copyright
 9 | // notice, this list of conditions and the following disclaimer.
10 | //     * Redistributions in binary form must reproduce the above
11 | // copyright notice, this list of conditions and the following disclaimer
12 | // in the documentation and/or other materials provided with the
13 | // distribution.
14 | //     * Neither the name of Google Inc. nor the names of its
15 | // contributors may be used to endorse or promote products derived from
16 | // this software without specific prior written permission.
17 | //
18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 
30 | // Header files have moved from the google directory to the sparsehash
31 | // directory.  This forwarding file is provided only for backwards
32 | // compatibility.  Use <sparsehash/*> in all new code.
33 | 
34 | #include <sparsehash/internal/hashtable-common.h>
35 | 


--------------------------------------------------------------------------------
/cpp/google/sparsehash/libc_allocator_with_realloc.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2012, Google Inc.
 2 | // All rights reserved.
 3 | //
 4 | // Redistribution and use in source and binary forms, with or without
 5 | // modification, are permitted provided that the following conditions are
 6 | // met:
 7 | //
 8 | //     * Redistributions of source code must retain the above copyright
 9 | // notice, this list of conditions and the following disclaimer.
10 | //     * Redistributions in binary form must reproduce the above
11 | // copyright notice, this list of conditions and the following disclaimer
12 | // in the documentation and/or other materials provided with the
13 | // distribution.
14 | //     * Neither the name of Google Inc. nor the names of its
15 | // contributors may be used to endorse or promote products derived from
16 | // this software without specific prior written permission.
17 | //
18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 
30 | // Header files have moved from the google directory to the sparsehash
31 | // directory.  This forwarding file is provided only for backwards
32 | // compatibility.  Use <sparsehash/*> in all new code.
33 | 
34 | #include <sparsehash/internal/libc_allocator_with_realloc.h>
35 | 


--------------------------------------------------------------------------------
/cpp/google/sparsehash/sparsehashtable.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2012, Google Inc.
 2 | // All rights reserved.
 3 | //
 4 | // Redistribution and use in source and binary forms, with or without
 5 | // modification, are permitted provided that the following conditions are
 6 | // met:
 7 | //
 8 | //     * Redistributions of source code must retain the above copyright
 9 | // notice, this list of conditions and the following disclaimer.
10 | //     * Redistributions in binary form must reproduce the above
11 | // copyright notice, this list of conditions and the following disclaimer
12 | // in the documentation and/or other materials provided with the
13 | // distribution.
14 | //     * Neither the name of Google Inc. nor the names of its
15 | // contributors may be used to endorse or promote products derived from
16 | // this software without specific prior written permission.
17 | //
18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 
30 | // Header files have moved from the google directory to the sparsehash
31 | // directory.  This forwarding file is provided only for backwards
32 | // compatibility.  Use <sparsehash/*> in all new code.
33 | 
34 | #include <sparsehash/internal/sparsehashtable.h>
35 | 


--------------------------------------------------------------------------------
/cpp/google/sparsetable:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2012, Google Inc.
 2 | // All rights reserved.
 3 | //
 4 | // Redistribution and use in source and binary forms, with or without
 5 | // modification, are permitted provided that the following conditions are
 6 | // met:
 7 | //
 8 | //     * Redistributions of source code must retain the above copyright
 9 | // notice, this list of conditions and the following disclaimer.
10 | //     * Redistributions in binary form must reproduce the above
11 | // copyright notice, this list of conditions and the following disclaimer
12 | // in the documentation and/or other materials provided with the
13 | // distribution.
14 | //     * Neither the name of Google Inc. nor the names of its
15 | // contributors may be used to endorse or promote products derived from
16 | // this software without specific prior written permission.
17 | //
18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 
30 | // Header files have moved from the google directory to the sparsehash
31 | // directory.  This forwarding file is provided only for backwards
32 | // compatibility.  Use <sparsehash/*> in all new code.
33 | 
34 | #include <sparsehash/sparsetable>
35 | 


--------------------------------------------------------------------------------
/cpp/google/template_util.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2012, Google Inc.
 2 | // All rights reserved.
 3 | //
 4 | // Redistribution and use in source and binary forms, with or without
 5 | // modification, are permitted provided that the following conditions are
 6 | // met:
 7 | //
 8 | //     * Redistributions of source code must retain the above copyright
 9 | // notice, this list of conditions and the following disclaimer.
10 | //     * Redistributions in binary form must reproduce the above
11 | // copyright notice, this list of conditions and the following disclaimer
12 | // in the documentation and/or other materials provided with the
13 | // distribution.
14 | //     * Neither the name of Google Inc. nor the names of its
15 | // contributors may be used to endorse or promote products derived from
16 | // this software without specific prior written permission.
17 | //
18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 
30 | // Header files have moved from the google directory to the sparsehash
31 | // directory.  This forwarding file is provided only for backwards
32 | // compatibility.  Use <sparsehash/*> in all new code.
33 | 
34 | #include <sparsehash/template_util.h>
35 | 


--------------------------------------------------------------------------------
/cpp/google/type_traits.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2012, Google Inc.
 2 | // All rights reserved.
 3 | //
 4 | // Redistribution and use in source and binary forms, with or without
 5 | // modification, are permitted provided that the following conditions are
 6 | // met:
 7 | //
 8 | //     * Redistributions of source code must retain the above copyright
 9 | // notice, this list of conditions and the following disclaimer.
10 | //     * Redistributions in binary form must reproduce the above
11 | // copyright notice, this list of conditions and the following disclaimer
12 | // in the documentation and/or other materials provided with the
13 | // distribution.
14 | //     * Neither the name of Google Inc. nor the names of its
15 | // contributors may be used to endorse or promote products derived from
16 | // this software without specific prior written permission.
17 | //
18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 
30 | // Header files have moved from the google directory to the sparsehash
31 | // directory.  This forwarding file is provided only for backwards
32 | // compatibility.  Use <sparsehash/*> in all new code.
33 | 
34 | #include <sparsehash/type_traits.h>
35 | 


--------------------------------------------------------------------------------
/cpp/mmap.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * dna-traits
 3 |  * Copyright 2014, 2016 Christian Stigen Larsen
 4 |  * Distributed under the GNU GPL v3 or later. See COPYING.
 5 |  *
 6 |  * arv
 7 |  * Copyright 2017 Christian Stigen Larsen
 8 |  * Distributed under the GNU GPL v3 or later. See COPYING.
 9 |  */
10 | 
11 | #include "mmap.hpp"
12 | 
13 | #include <stdexcept>
14 | 
15 | namespace arv {
16 | 
17 | MMap::MMap(void *address,
18 |            const std::size_t length,
19 |            const int protection_level,
20 |            const int flags,
21 |            const int file_descriptor,
22 |            const off_t offset)
23 |   : l(length),
24 |     p(mmap(address, length, protection_level, flags, file_descriptor, offset))
25 | {
26 |   if ( p == reinterpret_cast<caddr_t>(-1) )
27 |     throw std::runtime_error("mmap error");
28 | }
29 | 
30 | MMap::~MMap() {
31 |   munmap(p, l);
32 | }
33 | 
34 | } // ns arv
35 | 


--------------------------------------------------------------------------------
/cpp/mmap.hpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * dna-traits
 3 |  * Copyright 2014, 2016 Christian Stigen Larsen
 4 |  * Distributed under the GNU GPL v3 or later. See COPYING.
 5 |  *
 6 |  * arv
 7 |  * Copyright 2017 Christian Stigen Larsen
 8 |  * Distributed under the GNU GPL v3 or later. See COPYING.
 9 |  */
10 | 
11 | #ifndef ARV_MMAP_HPP
12 | #define ARV_MMAP_HPP
13 | 
14 | #include <cstddef>
15 | #include <stdlib.h>
16 | #include <fcntl.h>
17 | #include <sys/mman.h>
18 | #include <sys/types.h>
19 | 
20 | namespace arv {
21 | 
/**
 * Owns a memory mapping obtained from mmap(2) and releases it with munmap(2)
 * on destruction.
 */
class MMap {
  std::size_t l; // length in bytes, as passed to mmap()
  void *p;       // start address returned by mmap()

  // Not copyable: a copy would share `p`, and both destructors would then
  // munmap() the same region.  Declared private and intentionally left
  // undefined so that accidental copies fail to compile or link.
  MMap(const MMap&);
  MMap& operator=(const MMap&);

public:
  /**
   * Creates the mapping; the arguments are those of mmap(2).
   */
  MMap(void *address,
       const std::size_t length,
       const int protection_level,
       const int flags,
       const int file_descriptor,
       const off_t offset);
  ~MMap();

  /** Start address of the mapping. */
  inline void* ptr() const {
    return p;
  }

  /**
   * Start address of the mapping viewed as characters.  This is only a cast;
   * the class itself does not append a terminating NUL.
   */
  inline const char* c_str() const {
    return static_cast<const char*>(p);
  }
};
42 | 
43 | } // ns arv
44 | 
45 | #endif // guard
46 | 


--------------------------------------------------------------------------------
/cpp/parse.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * dna-traits
  3 |  * Copyright 2014, 2016 Christian Stigen Larsen
  4 |  * Distributed under the GNU GPL v3 or later. See COPYING.
  5 |  *
  6 |  * arv
  7 |  * Copyright 2017 Christian Stigen Larsen
  8 |  * Distributed under the GNU GPL V3 or later. See COPYING.
  9 |  */
 10 | 
 11 | #include "arv.hpp"
 12 | #include "file.hpp"
 13 | #include "filesize.hpp"
 14 | #include "mmap.hpp"
 15 | 
 16 | #ifdef __GNUC__
 17 | #define likely(x)   __builtin_expect((x),1)
 18 | #define unlikely(x) __builtin_expect((x),0)
 19 | #else
 20 | #define likely(x)   (x)
 21 | #define unlikely(x) (x)
 22 | #endif
 23 | 
 24 | namespace arv {
 25 | 
// Lookup table from a character code to its Nucleotide.  The initializer only
// names element 0 (NONE); the remaining elements are value-initialized.  The
// entries for '-', 'A', 'C', 'D', 'G', 'I' and 'T' are assigned at the start
// of every parse_file() call.
static Nucleotide CharToNucleotide[256] = {NONE};
 27 | 
 28 | static void skip_comments(const char*& s)
 29 | {
 30 |   while ( *s == '#' )
 31 |     while ( *s++ != '\n' )
 32 |       ; // loop
 33 | }
 34 | 
 35 | static bool iswhite(const char c)
 36 | {
 37 |   return c=='\t' || c=='\n' || c=='\r';
 38 | }
 39 | 
 40 | static const char*& skipwhite(const char*& s)
 41 | {
 42 |   while ( iswhite(*s) ) ++s;
 43 |   return s;
 44 | }
 45 | 
 46 | static uint32_t parse_uint32(const char*& s)
 47 | {
 48 |   uint32_t n = 0;
 49 | 
 50 |   while ( isdigit(*s) )
 51 |     n = n*10 - '0' + *s++;
 52 | 
 53 |   return n;
 54 | }
 55 | 
 56 | static int32_t parse_int32(const char*& s)
 57 | {
 58 |   int32_t n = 0;
 59 | 
 60 |   while ( isdigit(*s) )
 61 |     n = n*10 - '0' + *s++;
 62 | 
 63 |   return n;
 64 | }
 65 | 
 66 | static Nucleotide parse_nucleotide(const char*& s)
 67 | {
 68 |   return CharToNucleotide[static_cast<short>(*s++)];
 69 | }
 70 | 
 71 | static Chromosome parse_chromo(const char*& s)
 72 | {
 73 |   if ( likely(isdigit(*s)) )
 74 |       return static_cast<Chromosome>(parse_uint32(s));
 75 | 
 76 |   const char c = *s++;
 77 | 
 78 |   if ( c == 'X' )
 79 |     return CHR_X;
 80 | 
 81 |   if ( c == 'M' ) {
 82 |     ++s; // skip T in "MT"
 83 |     return CHR_MT;
 84 |   }
 85 | 
 86 |   if ( c == 'Y' )
 87 |     return CHR_Y;
 88 | 
 89 |   return CHR_NO;
 90 | }
 91 | 
 92 | static Genotype parse_genotype(const char*& s)
 93 | {
 94 |   const Nucleotide first = parse_nucleotide(s);
 95 |   const Nucleotide second = parse_nucleotide(s);
 96 |   return Genotype(first, second);
 97 | }
 98 | 
 99 | static void skipline(const char*& s)
100 | {
101 |   while ( *s != '\n' ) ++s;
102 | }
103 | 
104 | /**
105 |  * Reads a 23andMe-formatted genome file.  It currently uses reference human
106 |  * assembly build 37 (annotation release 104).
107 |  */
108 | void parse_file(const std::string& name, Genome& genome)
109 | {
110 |   using namespace arv;
111 | 
112 |   CharToNucleotide[static_cast<short>('-')] = NONE;
113 |   CharToNucleotide[static_cast<short>('A')] = A;
114 |   CharToNucleotide[static_cast<short>('C')] = C;
115 |   CharToNucleotide[static_cast<short>('D')] = D;
116 |   CharToNucleotide[static_cast<short>('G')] = G;
117 |   CharToNucleotide[static_cast<short>('I')] = I;
118 |   CharToNucleotide[static_cast<short>('T')] = T;
119 | 
120 |   File fd(name.c_str(), O_RDONLY);
121 |   MMap fmap(0, filesize(fd), PROT_READ, MAP_PRIVATE, fd, 0);
122 |   auto s = fmap.c_str();
123 | 
124 |   skip_comments(s);
125 | 
126 |   // Local cache of SNPs and RSIDs, for more locality and hence more speed. Its
127 |   // size is somewhat arbitrary, but shouldn't be too big.
128 |   const std::size_t BUFFER_SIZE = 200;
129 |   RsidSNP buffer[BUFFER_SIZE];
130 |   size_t buffer_pos = 0;
131 | 
132 |   bool internal = false; // rsid or internal id
133 | 
134 |   for ( ; *s; ++s ) {
135 |     if ( *s == 'r' )
136 |       internal = false;
137 |     else if ( *s == 'i' )
138 |       internal = true;
139 |     else {
140 |       skipline(s);
141 |       continue;
142 |     }
143 | 
144 |     RSID& rsid = buffer[buffer_pos].first;
145 |     SNP& snp = buffer[buffer_pos].second;
146 | 
147 |     // Skip i/rs prefix and parse number
148 |     if ( !internal )
149 |       rsid = parse_int32(s += 2);
150 |     else
151 |       rsid = -parse_int32(s += 1);
152 | 
153 |     snp.chromosome = parse_chromo(skipwhite(s));
154 |     snp.position = parse_uint32(skipwhite(s));
155 |     snp.genotype = parse_genotype(skipwhite(s));
156 | 
157 |     genome.y_chromosome |= (snp.chromosome == CHR_Y && snp.genotype.first !=
158 |         NONE);
159 | 
160 |     // Ordinarly, we would just call `genome.insert(rsid, snp)` here, but it's
161 |     // a tad faster to stage them in an array first, and then flush it to the
162 |     // hash map when it's full.
163 | 
164 |     if ( ++buffer_pos == BUFFER_SIZE ) {
165 |       buffer_pos = 0;
166 |       for ( size_t n = 0; n < BUFFER_SIZE; ++n )
167 |         genome.insert(buffer[n]);
168 |     }
169 |   }
170 | 
171 |   // Store the rest of the buffer
172 |   for ( size_t n = 0; n < buffer_pos; ++n )
173 |     genome.insert(buffer[n]);
174 | }
175 | 
176 | } // namespace arv
177 | 


--------------------------------------------------------------------------------
/cpp/public_py_init_sym.hpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * arv
 3 |  * Copyright 2017 Christian Stigen Larsen
 4 |  * Distributed under the GNU GPL v3 or later. See COPYING.
 5 |  */
 6 | 
 7 | #ifndef ARV_CYTHON_HPP
 8 | #define ARV_CYTHON_HPP
 9 | 
10 | #include <Python.h>
11 | #include "export.hpp"
12 | 
13 | // When compiling with hidden symbols (-fvisibility=hidden), we still need to
14 | // make the init function's symbol global (i.e. public).
15 | 
// Declares the module init function under the name that matches the Python
// major version being compiled against: init_arv() before Python 3,
// PyInit__arv() from Python 3 on.  DLL_PUBLIC presumably comes from
// export.hpp and marks the symbol as exported -- verify there.
#if PY_MAJOR_VERSION < 3
PyMODINIT_FUNC init_arv(void) DLL_PUBLIC;
#else
PyMODINIT_FUNC PyInit__arv(void) DLL_PUBLIC;
#endif
21 | 
22 | #endif // guard
23 | 


--------------------------------------------------------------------------------
/cpp/sparsehash/dense_hash_set:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2005, Google Inc.
  2 | // All rights reserved.
  3 | //
  4 | // Redistribution and use in source and binary forms, with or without
  5 | // modification, are permitted provided that the following conditions are
  6 | // met:
  7 | //
  8 | //     * Redistributions of source code must retain the above copyright
  9 | // notice, this list of conditions and the following disclaimer.
 10 | //     * Redistributions in binary form must reproduce the above
 11 | // copyright notice, this list of conditions and the following disclaimer
 12 | // in the documentation and/or other materials provided with the
 13 | // distribution.
 14 | //     * Neither the name of Google Inc. nor the names of its
 15 | // contributors may be used to endorse or promote products derived from
 16 | // this software without specific prior written permission.
 17 | //
 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | 
 30 | // ---
 31 | //
 32 | // This is just a very thin wrapper over densehashtable.h, just
 33 | // like sgi stl's stl_hash_set is a very thin wrapper over
 34 | // stl_hashtable.  The major thing we define is operator[], because
 35 | // we have a concept of a data_type which stl_hashtable doesn't
 36 | // (it only has a key and a value).
 37 | //
 38 | // This is more different from dense_hash_map than you might think,
 39 | // because all iterators for sets are const (you obviously can't
 40 | // change the key, and for sets there is no value).
 41 | //
 42 | // NOTE: this is exactly like sparse_hash_set.h, with the word
 43 | // "sparse" replaced by "dense", except for the addition of
 44 | // set_empty_key().
 45 | //
 46 | //   YOU MUST CALL SET_EMPTY_KEY() IMMEDIATELY AFTER CONSTRUCTION.
 47 | //
 48 | // Otherwise your program will die in mysterious ways.  (Note if you
 49 | // use the constructor that takes an InputIterator range, you pass in
 50 | // the empty key in the constructor, rather than after.  As a result,
 51 | // this constructor differs from the standard STL version.)
 52 | //
 53 | // In other respects, we adhere mostly to the STL semantics for
 54 | // hash-map.  One important exception is that insert() may invalidate
 55 | // iterators entirely -- STL semantics are that insert() may reorder
 56 | // iterators, but they all still refer to something valid in the
 57 | // hashtable.  Not so for us.  Likewise, insert() may invalidate
 58 | // pointers into the hashtable.  (Whether insert invalidates iterators
 59 | // and pointers depends on whether it results in a hashtable resize).
 60 | // On the plus side, delete() doesn't invalidate iterators or pointers
 61 | // at all, or even change the ordering of elements.
 62 | //
 63 | // Here are a few "power user" tips:
 64 | //
 65 | //    1) set_deleted_key():
 66 | //         If you want to use erase() you must call set_deleted_key(),
 67 | //         in addition to set_empty_key(), after construction.
 68 | //         The deleted and empty keys must differ.
 69 | //
 70 | //    2) resize(0):
 71 | //         When an item is deleted, its memory isn't freed right
 72 | //         away.  This allows you to iterate over a hashtable,
 73 | //         and call erase(), without invalidating the iterator.
 74 | //         To force the memory to be freed, call resize(0).
 75 | //         For tr1 compatibility, this can also be called as rehash(0).
 76 | //
 77 | //    3) min_load_factor(0.0)
 78 | //         Setting the minimum load factor to 0.0 guarantees that
 79 | //         the hash table will never shrink.
 80 | //
 81 | // Roughly speaking:
 82 | //   (1) dense_hash_set: fastest, uses the most memory unless entries are small
 83 | //   (2) sparse_hash_set: slowest, uses the least memory
 84 | //   (3) hash_set / unordered_set (STL): in the middle
 85 | //
 86 | // Typically I use sparse_hash_set when I care about space and/or when
 87 | // I need to save the hashtable on disk.  I use hash_set otherwise.  I
 88 | // don't personally use dense_hash_set ever; some people use it for
 89 | // small sets with lots of lookups.
 90 | //
 91 | // - dense_hash_set has, typically, about 78% memory overhead (if your
 92 | //   data takes up X bytes, the hash_set uses .78X more bytes in overhead).
 93 | // - sparse_hash_set has about 4 bits overhead per entry.
 94 | // - sparse_hash_set can be 3-7 times slower than the others for lookup and,
 95 | //   especially, inserts.  See time_hash_map.cc for details.
 96 | //
 97 | // See /usr/(local/)?doc/sparsehash-*/dense_hash_set.html
 98 | // for information about how to use this class.
 99 | 
100 | #ifndef _DENSE_HASH_SET_H_
101 | #define _DENSE_HASH_SET_H_
102 | 
103 | #include <sparsehash/internal/sparseconfig.h>
104 | #include <algorithm>                        // needed by stl_alloc
105 | #include <functional>                       // for equal_to<>, select1st<>, etc
106 | #include <memory>                           // for alloc
107 | #include <utility>                          // for pair<>
108 | #include <sparsehash/internal/densehashtable.h>        // IWYU pragma: export
109 | #include <sparsehash/internal/libc_allocator_with_realloc.h>
110 | #include HASH_FUN_H                 // for hash<>
111 | _START_GOOGLE_NAMESPACE_
112 | 
113 | template <class Value,
114 |           class HashFcn = SPARSEHASH_HASH<Value>,   // defined in sparseconfig.h
115 |           class EqualKey = std::equal_to<Value>,
116 |           class Alloc = libc_allocator_with_realloc<Value> >
117 | class dense_hash_set {
118 |  private:
119 |   // Apparently identity is not stl-standard, so we define our own
120 |   struct Identity {
121 |     typedef const Value& result_type;
122 |     const Value& operator()(const Value& v) const { return v; }
123 |   };
124 |   struct SetKey {
125 |     void operator()(Value* value, const Value& new_key) const {
126 |       *value = new_key;
127 |     }
128 |   };
129 | 
130 |   // The actual data
131 |   typedef dense_hashtable<Value, Value, HashFcn, Identity, SetKey,
132 |                           EqualKey, Alloc> ht;
133 |   ht rep;
134 | 
135 |  public:
136 |   typedef typename ht::key_type key_type;
137 |   typedef typename ht::value_type value_type;
138 |   typedef typename ht::hasher hasher;
139 |   typedef typename ht::key_equal key_equal;
140 |   typedef Alloc allocator_type;
141 | 
142 |   typedef typename ht::size_type size_type;
143 |   typedef typename ht::difference_type difference_type;
144 |   typedef typename ht::const_pointer pointer;
145 |   typedef typename ht::const_pointer const_pointer;
146 |   typedef typename ht::const_reference reference;
147 |   typedef typename ht::const_reference const_reference;
148 | 
149 |   typedef typename ht::const_iterator iterator;
150 |   typedef typename ht::const_iterator const_iterator;
151 |   typedef typename ht::const_local_iterator local_iterator;
152 |   typedef typename ht::const_local_iterator const_local_iterator;
153 | 
154 | 
155 |   // Iterator functions -- recall all iterators are const
156 |   iterator begin() const                  { return rep.begin(); }
157 |   iterator end() const                    { return rep.end(); }
158 | 
159 |   // These come from tr1's unordered_set. For us, a bucket has 0 or 1 elements.
160 |   local_iterator begin(size_type i) const { return rep.begin(i); }
161 |   local_iterator end(size_type i) const   { return rep.end(i); }
162 | 
163 | 
164 |   // Accessor functions
165 |   allocator_type get_allocator() const    { return rep.get_allocator(); }
166 |   hasher hash_funct() const               { return rep.hash_funct(); }
167 |   hasher hash_function() const            { return hash_funct(); }  // tr1 name
168 |   key_equal key_eq() const                { return rep.key_eq(); }
169 | 
170 | 
171 |   // Constructors
172 |   explicit dense_hash_set(size_type expected_max_items_in_table = 0,
173 |                           const hasher& hf = hasher(),
174 |                           const key_equal& eql = key_equal(),
175 |                           const allocator_type& alloc = allocator_type())
176 |       : rep(expected_max_items_in_table, hf, eql, Identity(), SetKey(), alloc) {
177 |   }
178 | 
179 |   template <class InputIterator>
180 |   dense_hash_set(InputIterator f, InputIterator l,
181 |                  const key_type& empty_key_val,
182 |                  size_type expected_max_items_in_table = 0,
183 |                  const hasher& hf = hasher(),
184 |                  const key_equal& eql = key_equal(),
185 |                  const allocator_type& alloc = allocator_type())
186 |       : rep(expected_max_items_in_table, hf, eql, Identity(), SetKey(), alloc) {
187 |     set_empty_key(empty_key_val);
188 |     rep.insert(f, l);
189 |   }
190 |   // We use the default copy constructor
191 |   // We use the default operator=()
192 |   // We use the default destructor
193 | 
194 |   void clear()                        { rep.clear(); }
195 |   // This clears the hash set without resizing it down to the minimum
196 |   // bucket count, but rather keeps the number of buckets constant
197 |   void clear_no_resize()              { rep.clear_no_resize(); }
198 |   void swap(dense_hash_set& hs)       { rep.swap(hs.rep); }
199 | 
200 | 
201 |   // Functions concerning size
202 |   size_type size() const              { return rep.size(); }
203 |   size_type max_size() const          { return rep.max_size(); }
204 |   bool empty() const                  { return rep.empty(); }
205 |   size_type bucket_count() const      { return rep.bucket_count(); }
206 |   size_type max_bucket_count() const  { return rep.max_bucket_count(); }
207 | 
208 |   // These are tr1 methods.  bucket() is the bucket the key is or would be in.
209 |   size_type bucket_size(size_type i) const    { return rep.bucket_size(i); }
210 |   size_type bucket(const key_type& key) const { return rep.bucket(key); }
211 |   float load_factor() const {
212 |     return size() * 1.0f / bucket_count();
213 |   }
214 |   float max_load_factor() const {
215 |     float shrink, grow;
216 |     rep.get_resizing_parameters(&shrink, &grow);
217 |     return grow;
218 |   }
219 |   void max_load_factor(float new_grow) {
220 |     float shrink, grow;
221 |     rep.get_resizing_parameters(&shrink, &grow);
222 |     rep.set_resizing_parameters(shrink, new_grow);
223 |   }
224 |   // These aren't tr1 methods but perhaps ought to be.
225 |   float min_load_factor() const {
226 |     float shrink, grow;
227 |     rep.get_resizing_parameters(&shrink, &grow);
228 |     return shrink;
229 |   }
230 |   void min_load_factor(float new_shrink) {
231 |     float shrink, grow;
232 |     rep.get_resizing_parameters(&shrink, &grow);
233 |     rep.set_resizing_parameters(new_shrink, grow);
234 |   }
235 |   // Deprecated; use min_load_factor() or max_load_factor() instead.
236 |   void set_resizing_parameters(float shrink, float grow) {
237 |     rep.set_resizing_parameters(shrink, grow);
238 |   }
239 | 
240 |   void resize(size_type hint)         { rep.resize(hint); }
241 |   void rehash(size_type hint)         { resize(hint); }     // the tr1 name
242 | 
243 |   // Lookup routines
244 |   iterator find(const key_type& key) const           { return rep.find(key); }
245 | 
246 |   size_type count(const key_type& key) const         { return rep.count(key); }
247 | 
248 |   std::pair<iterator, iterator> equal_range(const key_type& key) const {
249 |     return rep.equal_range(key);
250 |   }
251 | 
252 | 
253 |   // Insertion routines
254 |   std::pair<iterator, bool> insert(const value_type& obj) {
255 |     std::pair<typename ht::iterator, bool> p = rep.insert(obj);
256 |     return std::pair<iterator, bool>(p.first, p.second);   // const to non-const
257 |   }
258 |   template <class InputIterator> void insert(InputIterator f, InputIterator l) {
259 |     rep.insert(f, l);
260 |   }
261 |   void insert(const_iterator f, const_iterator l) {
262 |     rep.insert(f, l);
263 |   }
264 |   // Required for std::insert_iterator; the passed-in iterator is ignored.
265 |   iterator insert(iterator, const value_type& obj)   {
266 |     return insert(obj).first;
267 |   }
268 | 
269 |   // Deletion and empty routines
270 |   // THESE ARE NON-STANDARD!  I make you specify an "impossible" key
271 |   // value to identify deleted and empty buckets.  You can change the
272 |   // deleted key as time goes on, or get rid of it entirely to be insert-only.
273 |   void set_empty_key(const key_type& key)     { rep.set_empty_key(key); }
274 |   key_type empty_key() const                  { return rep.empty_key(); }
275 | 
276 |   void set_deleted_key(const key_type& key)   { rep.set_deleted_key(key); }
277 |   void clear_deleted_key()                    { rep.clear_deleted_key(); }
278 |   key_type deleted_key() const                { return rep.deleted_key(); }
279 | 
280 |   // These are standard
281 |   size_type erase(const key_type& key)               { return rep.erase(key); }
282 |   void erase(iterator it)                            { rep.erase(it); }
283 |   void erase(iterator f, iterator l)                 { rep.erase(f, l); }
284 | 
285 | 
286 |   // Comparison
287 |   bool operator==(const dense_hash_set& hs) const    { return rep == hs.rep; }
288 |   bool operator!=(const dense_hash_set& hs) const    { return rep != hs.rep; }
289 | 
290 | 
291 |   // I/O -- this is an add-on for writing metainformation to disk
292 |   //
293 |   // For maximum flexibility, this does not assume a particular
294 |   // file type (though it will probably be a FILE *).  We just pass
295 |   // the fp through to rep.
296 | 
297 |   // If your keys and values are simple enough, you can pass this
298 |   // serializer to serialize()/unserialize().  "Simple enough" means
299 |   // value_type is a POD type that contains no pointers.  Note,
300 |   // however, we don't try to normalize endianness.
301 |   typedef typename ht::NopointerSerializer NopointerSerializer;
302 | 
303 |   // serializer: a class providing operator()(OUTPUT*, const value_type&)
304 |   //    (writing value_type to OUTPUT).  You can specify a
305 |   //    NopointerSerializer object if appropriate (see above).
306 |   // fp: either a FILE*, OR an ostream*/subclass_of_ostream*, OR a
307 |   //    pointer to a class providing size_t Write(const void*, size_t),
308 |   //    which writes a buffer into a stream (which fp presumably
309 |   //    owns) and returns the number of bytes successfully written.
310 |   //    Note basic_ostream<not_char> is not currently supported.
311 |   template <typename ValueSerializer, typename OUTPUT>
312 |   bool serialize(ValueSerializer serializer, OUTPUT* fp) {
313 |     return rep.serialize(serializer, fp);
314 |   }
315 | 
316 |   // serializer: a functor providing operator()(INPUT*, value_type*)
317 |   //    (reading from INPUT and into value_type).  You can specify a
318 |   //    NopointerSerializer object if appropriate (see above).
319 |   // fp: either a FILE*, OR an istream*/subclass_of_istream*, OR a
320 |   //    pointer to a class providing size_t Read(void*, size_t),
321 |   //    which reads into a buffer from a stream (which fp presumably
322 |   //    owns) and returns the number of bytes successfully read.
323 |   //    Note basic_istream<not_char> is not currently supported.
324 |   template <typename ValueSerializer, typename INPUT>
325 |   bool unserialize(ValueSerializer serializer, INPUT* fp) {
326 |     return rep.unserialize(serializer, fp);
327 |   }
328 | };
329 | 
330 | template <class Val, class HashFcn, class EqualKey, class Alloc>
331 | inline void swap(dense_hash_set<Val, HashFcn, EqualKey, Alloc>& hs1,
332 |                  dense_hash_set<Val, HashFcn, EqualKey, Alloc>& hs2) {
333 |   hs1.swap(hs2);
334 | }
335 | 
336 | _END_GOOGLE_NAMESPACE_
337 | 
338 | #endif /* _DENSE_HASH_SET_H_ */
339 | 


--------------------------------------------------------------------------------
/cpp/sparsehash/internal/hashtable-common.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2010, Google Inc.
  2 | // All rights reserved.
  3 | //
  4 | // Redistribution and use in source and binary forms, with or without
  5 | // modification, are permitted provided that the following conditions are
  6 | // met:
  7 | //
  8 | //     * Redistributions of source code must retain the above copyright
  9 | // notice, this list of conditions and the following disclaimer.
 10 | //     * Redistributions in binary form must reproduce the above
 11 | // copyright notice, this list of conditions and the following disclaimer
 12 | // in the documentation and/or other materials provided with the
 13 | // distribution.
 14 | //     * Neither the name of Google Inc. nor the names of its
 15 | // contributors may be used to endorse or promote products derived from
 16 | // this software without specific prior written permission.
 17 | //
 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | 
 30 | // ---
 31 | //
 32 | // Provides classes shared by both sparse and dense hashtable.
 33 | //
 34 | // sh_hashtable_settings has parameters for growing and shrinking
 35 | // a hashtable.  It also packages zero-size functor (ie. hasher).
 36 | //
 37 | // Other functions and classes provide common code for serializing
 38 | // and deserializing hashtables to a stream (such as a FILE*).
 39 | 
 40 | #ifndef UTIL_GTL_HASHTABLE_COMMON_H_
 41 | #define UTIL_GTL_HASHTABLE_COMMON_H_
 42 | 
 43 | #include <sparsehash/internal/sparseconfig.h>
 44 | #include <assert.h>
 45 | #include <stdio.h>
 46 | #include <stddef.h>                  // for size_t
 47 | #include <iosfwd>
 48 | #include <stdexcept>                 // For length_error
 49 | 
 50 | _START_GOOGLE_NAMESPACE_
 51 | 
 52 | template <bool> struct SparsehashCompileAssert { };
 53 | #define SPARSEHASH_COMPILE_ASSERT(expr, msg) \
 54 |   __attribute__((unused)) typedef SparsehashCompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1]
 55 | 
 56 | namespace sparsehash_internal {
 57 | 
 58 | // Adaptor methods for reading/writing data from an INPUT or OUTPUT
 59 | // variable passed to serialize() or unserialize().  For now we
 60 | // have implemented INPUT/OUTPUT for FILE*, istream*/ostream* (note
 61 | // they are pointers, unlike typical use), or else a pointer to
 62 | // something that supports a Read()/Write() method.
 63 | //
 64 | // For technical reasons, we implement read_data/write_data in two
 65 | // stages.  The actual work is done in *_data_internal, which takes
 66 | // the stream argument twice: once as a template type, and once with
 67 | // normal type information.  (We only use the second version.)  We do
 68 | // this because of how C++ picks what function overload to use.  If we
 69 | // implemented this the naive way:
 70 | //    bool read_data(istream* is, const void* data, size_t length);
 71 | //    template<typename T> read_data(T* fp,  const void* data, size_t length);
 72 | // C++ would prefer the second version for every stream type except
 73 | // istream.  However, we want C++ to prefer the first version for
 74 | // streams that are *subclasses* of istream, such as istringstream.
 75 | // This is not possible given the way template types are resolved.  So
 76 | // we split the stream argument in two, one of which is templated and
 77 | // one of which is not.  The specialized functions (like the istream
 78 | // version above) ignore the template arg and use the second, 'type'
 79 | // arg, getting subclass matching as normal.  The 'catch-all'
 80 | // functions (the second version above) use the template arg to deduce
 81 | // the type, and use a second, void* arg to achieve the desired
 82 | // 'catch-all' semantics.
 83 | 
 84 | // ----- low-level I/O for FILE* ----
 85 | 
 86 | template<typename Ignored>
 87 | inline bool read_data_internal(Ignored*, FILE* fp,
 88 |                                void* data, size_t length) {
 89 |   return fread(data, length, 1, fp) == 1;
 90 | }
 91 | 
 92 | template<typename Ignored>
 93 | inline bool write_data_internal(Ignored*, FILE* fp,
 94 |                                 const void* data, size_t length) {
 95 |   return fwrite(data, length, 1, fp) == 1;
 96 | }
 97 | 
 98 | // ----- low-level I/O for iostream ----
 99 | 
100 | // We want the caller to be responsible for #including <iostream>, not
101 | // us, because iostream is a big header!  According to the standard,
102 | // it's only legal to delay the instantiation the way we want to if
103 | // the istream/ostream is a template type.  So we jump through hoops.
// Reads `length` bytes from *fp into `data` via the stream's read() and
// reports whether the stream is still good() afterwards.  The stream type is
// a template parameter so this header need not include <iostream> itself.
template<typename ISTREAM>
inline bool read_data_internal_for_istream(ISTREAM* fp,
                                           void* data, size_t length) {
  char* const destination = reinterpret_cast<char*>(data);
  return fp->read(destination, length).good();
}
109 | template<typename Ignored>
110 | inline bool read_data_internal(Ignored*, std::istream* fp,
111 |                                void* data, size_t length) {
112 |   return read_data_internal_for_istream(fp, data, length);
113 | }
114 | 
// Writes `length` bytes from `data` to *fp via the stream's write() and
// reports whether the stream is still good() afterwards.  The stream type is
// a template parameter so this header need not include <iostream> itself.
template<typename OSTREAM>
inline bool write_data_internal_for_ostream(OSTREAM* fp,
                                            const void* data, size_t length) {
  const char* const source = reinterpret_cast<const char*>(data);
  return fp->write(source, length).good();
}
120 | template<typename Ignored>
121 | inline bool write_data_internal(Ignored*, std::ostream* fp,
122 |                                 const void* data, size_t length) {
123 |   return write_data_internal_for_ostream(fp, data, length);
124 | }
125 | 
126 | // ----- low-level I/O for custom streams ----
127 | 
// Catch-all flavor for custom input streams.  INPUT must provide a Read()
// method taking a buffer and a length and returning the number of bytes
// read; the read succeeds only when that count equals `length`.  The second
// (void*) argument exists only for overload selection.
template <typename INPUT>
inline bool read_data_internal(INPUT* fp, void*,
                               void* data, size_t length) {
  const size_t bytes_read = static_cast<size_t>(fp->Read(data, length));
  return bytes_read == length;
}
135 | 
// Catch-all flavor for custom output streams.  OUTPUT must provide a Write()
// method taking a buffer and a length and returning the number of bytes
// written; the write succeeds only when that count equals `length`.  The
// second (void*) argument exists only for overload selection.
template <typename OUTPUT>
inline bool write_data_internal(OUTPUT* fp, void*,
                                const void* data, size_t length) {
  const size_t bytes_written = static_cast<size_t>(fp->Write(data, length));
  return bytes_written == length;
}
143 | 
144 | // ----- low-level I/O: the public API ----
145 | 
// Reads `length` bytes from `fp` into `data`; returns true on success.
// `fp` is passed twice on purpose: once as the deduced template type and
// once so that ordinary overload resolution (including derived-to-base
// pointer conversion) can pick the matching read_data_internal flavor.
template <typename INPUT>
inline bool read_data(INPUT* fp, void* data, size_t length) {
  const bool succeeded = read_data_internal(fp, fp, data, length);
  return succeeded;
}
150 | 
// Writes `length` bytes from `data` to `fp`; returns true on success.
// `fp` is passed twice on purpose: once as the deduced template type and
// once so that ordinary overload resolution (including derived-to-base
// pointer conversion) can pick the matching write_data_internal flavor.
template <typename OUTPUT>
inline bool write_data(OUTPUT* fp, const void* data, size_t length) {
  const bool succeeded = write_data_internal(fp, fp, data, length);
  return succeeded;
}
155 | 
156 | // Uses read_data() and write_data() to read/write an integer.
157 | // length is the number of bytes to read/write (which may differ
158 | // from sizeof(IntType), allowing us to save on a 32-bit system
159 | // and load on a 64-bit system).  Excess bytes are taken to be 0.
160 | // INPUT and OUTPUT must match legal inputs to read/write_data (above).
161 | template <typename INPUT, typename IntType>
162 | bool read_bigendian_number(INPUT* fp, IntType* value, size_t length) {
163 |   *value = 0;
164 |   unsigned char byte;
165 |   // We require IntType to be unsigned or else the shifting gets all screwy.
166 |   SPARSEHASH_COMPILE_ASSERT(static_cast<IntType>(-1) > static_cast<IntType>(0),
167 |                             serializing_int_requires_an_unsigned_type);
168 |   for (size_t i = 0; i < length; ++i) {
169 |     if (!read_data(fp, &byte, sizeof(byte))) return false;
170 |     *value |= static_cast<IntType>(byte) << ((length - 1 - i) * 8);
171 |   }
172 |   return true;
173 | }
174 | 
175 | template <typename OUTPUT, typename IntType>
176 | bool write_bigendian_number(OUTPUT* fp, IntType value, size_t length) {
177 |   unsigned char byte;
178 |   // We require IntType to be unsigned or else the shifting gets all screwy.
179 |   SPARSEHASH_COMPILE_ASSERT(static_cast<IntType>(-1) > static_cast<IntType>(0),
180 |                             serializing_int_requires_an_unsigned_type);
181 |   for (size_t i = 0; i < length; ++i) {
182 |     byte = (sizeof(value) <= length-1 - i)
183 |         ? 0 : static_cast<unsigned char>((value >> ((length-1 - i) * 8)) & 255);
184 |     if (!write_data(fp, &byte, sizeof(byte))) return false;
185 |   }
186 |   return true;
187 | }
188 | 
// Serializer for keys and values that are "simple enough": value_type must
// be a POD type that contains no pointers.  The object's bytes are copied
// to/from the stream as-is; endianness is not normalized.
// This is the type used for NopointerSerializer.
template <typename value_type> struct pod_serializer {
  // Reads sizeof(value_type) bytes from fp into *value.
  template <typename INPUT>
  bool operator()(INPUT* fp, value_type* value) const {
    const size_t object_size = sizeof(*value);
    return read_data(fp, value, object_size);
  }

  // Writes the sizeof(value_type) bytes of value to fp.
  template <typename OUTPUT>
  bool operator()(OUTPUT* fp, const value_type& value) const {
    const size_t object_size = sizeof(value);
    return write_data(fp, &value, object_size);
  }
};
205 | 
206 | 
// sh_hashtable_settings holds the parameters that govern growing and
// shrinking a hashtable.  It derives from the hasher, which packages that
// (typically zero-size) functor together with the settings.
//
// The hash value is munged in cases where we think (fear) the original
// hash function might not be very good.  In particular, the default hash
// of a pointer is the identity hash, so probably all the low bits are 0.
// When the key type is a pointer we chop off those low bits.  This isn't
// perfect: even when the key is a pointer, we can't tell for sure that the
// hash is the identity hash.  If it's not, the munging is needless work
// (and possibly, though not likely, harmful).

template<typename Key, typename HashFunc,
         typename SizeType, int HT_MIN_BUCKETS>
class sh_hashtable_settings : public HashFunc {
 public:
  typedef Key key_type;
  typedef HashFunc hasher;
  typedef SizeType size_type;

 public:
  // hf: the hash functor to wrap.
  // ht_occupancy_flt: initial enlarge factor (how full before growing).
  // ht_empty_flt: initial shrink factor (how empty before shrinking).
  // Thresholds start at 0; all flags start out false.
  sh_hashtable_settings(const hasher& hf,
                        const float ht_occupancy_flt,
                        const float ht_empty_flt)
      : hasher(hf),
        enlarge_threshold_(0),
        shrink_threshold_(0),
        enlarge_factor_(ht_occupancy_flt),
        shrink_factor_(ht_empty_flt),
        consider_shrink_(false),
        use_empty_(false),
        use_deleted_(false),
        num_ht_copies_(0) {
  }

  // Hashes v with the wrapped hasher, munging the result when we don't
  // trust hasher::operator() (see hash_munger below).
  size_type hash(const key_type& v) const {
    const size_t raw_hash = hasher::operator()(v);
    return hash_munger<Key>::MungedHash(raw_hash);
  }

  // ---- Load factors ----
  float enlarge_factor() const       { return enlarge_factor_; }
  void set_enlarge_factor(float f)   { enlarge_factor_ = f; }
  float shrink_factor() const        { return shrink_factor_; }
  void set_shrink_factor(float f)    { shrink_factor_ = f; }

  // ---- Element-count thresholds ----
  size_type enlarge_threshold() const       { return enlarge_threshold_; }
  void set_enlarge_threshold(size_type t)   { enlarge_threshold_ = t; }
  size_type shrink_threshold() const        { return shrink_threshold_; }
  void set_shrink_threshold(size_type t)    { shrink_threshold_ = t; }

  // x scaled by the enlarge / shrink factor, truncated to size_type.
  size_type enlarge_size(size_type x) const {
    return static_cast<size_type>(x * enlarge_factor_);
  }
  size_type shrink_size(size_type x) const {
    return static_cast<size_type>(x * shrink_factor_);
  }

  // ---- Flags ----
  bool consider_shrink() const       { return consider_shrink_; }
  void set_consider_shrink(bool t)   { consider_shrink_ = t; }

  bool use_empty() const             { return use_empty_; }
  void set_use_empty(bool t)         { use_empty_ = t; }

  bool use_deleted() const           { return use_deleted_; }
  void set_use_deleted(bool t)       { use_deleted_ = t; }

  // ---- Copy counter ----
  size_type num_ht_copies() const {
    return static_cast<size_type>(num_ht_copies_);
  }
  void inc_num_ht_copies() {
    ++num_ht_copies_;
  }

  // Recomputes the enlarge and shrink thresholds for a table of
  // num_buckets buckets.
  void reset_thresholds(size_type num_buckets) {
    enlarge_threshold_ = enlarge_size(num_buckets);
    shrink_threshold_ = shrink_size(num_buckets);
    // Whatever caused this reset has already considered shrinking.
    consider_shrink_ = false;
  }

  // Installs new shrink/grow factors.  The shrink factor is capped at half
  // the grow factor; otherwise we would thrash the hashtable size.
  // The caller is responsible for calling reset_thresholds right after
  // set_resizing_parameters.
  void set_resizing_parameters(float shrink, float grow) {
    assert(shrink >= 0.0);
    assert(grow <= 1.0);
    const float shrink_cap = grow / 2.0f;
    set_shrink_factor(shrink > shrink_cap ? shrink_cap : shrink);
    set_enlarge_factor(grow);
  }

  // Returns the smallest bucket count -- HT_MIN_BUCKETS doubled as often as
  // needed -- that is at least min_buckets_wanted and that holds num_elts
  // elements without being too crowded.  Throws std::length_error if the
  // doubling would overflow size_type.
  size_type min_buckets(size_type num_elts, size_type min_buckets_wanted) {
    const float grow_at = enlarge_factor();
    size_type candidate = HT_MIN_BUCKETS;      // min buckets allowed
    for (;;) {
      if (candidate >= min_buckets_wanted &&
          num_elts < static_cast<size_type>(candidate * grow_at)) {
        break;
      }
      // This just prevents overflowing size_type, since candidate can
      // exceed max_size() here.
      if (static_cast<size_type>(candidate * 2) < candidate) {
        throw std::length_error("resize overflow");  // protect against overflow
      }
      candidate *= 2;
    }
    return candidate;
  }

 private:
  // Default: the hash value is trusted and passed through unchanged.
  template<class HashKey> class hash_munger {
   public:
    static size_t MungedHash(size_t hash) {
      return hash;
    }
  };
  // Selected when the hashtable key is a pointer.
  template<class HashKey> class hash_munger<HashKey*> {
   public:
    static size_t MungedHash(size_t hash) {
      // TODO(csilvers): consider rotating instead:
      //    static const int shift = (sizeof(void *) == 4) ? 2 : 3;
      //    return (hash << (sizeof(hash) * 8) - shift)) | (hash >> shift);
      // This matters if we ever change sparse/dense_hash_* to compare
      // hashes before comparing actual values.  It's speedy on x86.
      return hash / sizeof(void*);   // get rid of known-0 bits
    }
  };

  size_type enlarge_threshold_;  // table.size() * enlarge_factor
  size_type shrink_threshold_;   // table.size() * shrink_factor
  float enlarge_factor_;         // how full before resize
  float shrink_factor_;          // how empty before resize
  // consider_shrink=true if we should try to shrink before next insert
  bool consider_shrink_;
  bool use_empty_;    // used only by densehashtable, not sparsehashtable
  bool use_deleted_;  // false until delkey has been set
  // num_ht_copies is a counter incremented every Copy/Move
  unsigned int num_ht_copies_;
};
375 | 
376 | }  // namespace sparsehash_internal
377 | 
378 | #undef SPARSEHASH_COMPILE_ASSERT
379 | _END_GOOGLE_NAMESPACE_
380 | 
381 | #endif  // UTIL_GTL_HASHTABLE_COMMON_H_
382 | 


--------------------------------------------------------------------------------
/cpp/sparsehash/internal/libc_allocator_with_realloc.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2010, Google Inc.
  2 | // All rights reserved.
  3 | //
  4 | // Redistribution and use in source and binary forms, with or without
  5 | // modification, are permitted provided that the following conditions are
  6 | // met:
  7 | //
  8 | //     * Redistributions of source code must retain the above copyright
  9 | // notice, this list of conditions and the following disclaimer.
 10 | //     * Redistributions in binary form must reproduce the above
 11 | // copyright notice, this list of conditions and the following disclaimer
 12 | // in the documentation and/or other materials provided with the
 13 | // distribution.
 14 | //     * Neither the name of Google Inc. nor the names of its
 15 | // contributors may be used to endorse or promote products derived from
 16 | // this software without specific prior written permission.
 17 | //
 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | 
 30 | // ---
 31 | 
 32 | #ifndef UTIL_GTL_LIBC_ALLOCATOR_WITH_REALLOC_H_
 33 | #define UTIL_GTL_LIBC_ALLOCATOR_WITH_REALLOC_H_
 34 | 
 35 | #include <sparsehash/internal/sparseconfig.h>
 36 | #include <stdlib.h>           // for malloc/realloc/free
 37 | #include <stddef.h>           // for ptrdiff_t
 38 | #include <new>                // for placement new
 39 | 
 40 | _START_GOOGLE_NAMESPACE_
 41 | 
 42 | template<class T>
 43 | class libc_allocator_with_realloc {
 44 |  public:
 45 |   typedef T value_type;
 46 |   typedef size_t size_type;
 47 |   typedef ptrdiff_t difference_type;
 48 | 
 49 |   typedef T* pointer;
 50 |   typedef const T* const_pointer;
 51 |   typedef T& reference;
 52 |   typedef const T& const_reference;
 53 | 
 54 |   libc_allocator_with_realloc() {}
 55 |   libc_allocator_with_realloc(const libc_allocator_with_realloc&) {}
 56 |   ~libc_allocator_with_realloc() {}
 57 | 
 58 |   pointer address(reference r) const  { return &r; }
 59 |   const_pointer address(const_reference r) const  { return &r; }
 60 | 
 61 |   pointer allocate(size_type n, const_pointer = 0) {
 62 |     return static_cast<pointer>(malloc(n * sizeof(value_type)));
 63 |   }
 64 |   void deallocate(pointer p, size_type) {
 65 |     free(p);
 66 |   }
 67 |   pointer reallocate(pointer p, size_type n) {
 68 |     return static_cast<pointer>(realloc(p, n * sizeof(value_type)));
 69 |   }
 70 | 
 71 |   size_type max_size() const  {
 72 |     return static_cast<size_type>(-1) / sizeof(value_type);
 73 |   }
 74 | 
 75 |   void construct(pointer p, const value_type& val) {
 76 |     new(p) value_type(val);
 77 |   }
 78 |   void destroy(pointer p) { p->~value_type(); }
 79 | 
 80 |   template <class U>
 81 |   libc_allocator_with_realloc(const libc_allocator_with_realloc<U>&) {}
 82 | 
 83 |   template<class U>
 84 |   struct rebind {
 85 |     typedef libc_allocator_with_realloc<U> other;
 86 |   };
 87 | };
 88 | 
 89 | // libc_allocator_with_realloc<void> specialization.
 90 | template<>
 91 | class libc_allocator_with_realloc<void> {
 92 |  public:
 93 |   typedef void value_type;
 94 |   typedef size_t size_type;
 95 |   typedef ptrdiff_t difference_type;
 96 |   typedef void* pointer;
 97 |   typedef const void* const_pointer;
 98 | 
 99 |   template<class U>
100 |   struct rebind {
101 |     typedef libc_allocator_with_realloc<U> other;
102 |   };
103 | };
104 | 
105 | template<class T>
106 | inline bool operator==(const libc_allocator_with_realloc<T>&,
107 |                        const libc_allocator_with_realloc<T>&) {
108 |   return true;
109 | }
110 | 
111 | template<class T>
112 | inline bool operator!=(const libc_allocator_with_realloc<T>&,
113 |                        const libc_allocator_with_realloc<T>&) {
114 |   return false;
115 | }
116 | 
117 | _END_GOOGLE_NAMESPACE_
118 | 
119 | #endif  // UTIL_GTL_LIBC_ALLOCATOR_WITH_REALLOC_H_
120 | 


--------------------------------------------------------------------------------
/cpp/sparsehash/sparse_hash_set:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2005, Google Inc.
  2 | // All rights reserved.
  3 | //
  4 | // Redistribution and use in source and binary forms, with or without
  5 | // modification, are permitted provided that the following conditions are
  6 | // met:
  7 | //
  8 | //     * Redistributions of source code must retain the above copyright
  9 | // notice, this list of conditions and the following disclaimer.
 10 | //     * Redistributions in binary form must reproduce the above
 11 | // copyright notice, this list of conditions and the following disclaimer
 12 | // in the documentation and/or other materials provided with the
 13 | // distribution.
 14 | //     * Neither the name of Google Inc. nor the names of its
 15 | // contributors may be used to endorse or promote products derived from
 16 | // this software without specific prior written permission.
 17 | //
 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | 
 30 | // ---
 31 | //
 32 | // This is just a very thin wrapper over sparsehashtable.h, just
 33 | // like sgi stl's stl_hash_set is a very thin wrapper over
 34 | // stl_hashtable.  Unlike sparse_hash_map, a set has no data_type
 35 | // and therefore no operator[]: the stored value is its own key,
 36 | // so nearly every method simply forwards to the hashtable.
 37 | //
 38 | // This is more different from sparse_hash_map than you might think,
 39 | // because all iterators for sets are const (you obviously can't
 40 | // change the key, and for sets there is no value).
 41 | //
 42 | // We adhere mostly to the STL semantics for hash-set.  One important
 43 | // exception is that insert() may invalidate iterators entirely -- STL
 44 | // semantics are that insert() may reorder iterators, but they all
 45 | // still refer to something valid in the hashtable.  Not so for us.
 46 | // Likewise, insert() may invalidate pointers into the hashtable.
 47 | // (Whether insert invalidates iterators and pointers depends on
 48 | // whether it results in a hashtable resize).  On the plus side,
 49 | // delete() doesn't invalidate iterators or pointers at all, or even
 50 | // change the ordering of elements.
 51 | //
 52 | // Here are a few "power user" tips:
 53 | //
 54 | //    1) set_deleted_key():
 55 | //         Unlike STL's hash_set, if you want to use erase() you
 56 | //         *must* call set_deleted_key() after construction.
 57 | //
 58 | //    2) resize(0):
 59 | //         When an item is deleted, its memory isn't freed right
 60 | //         away.  This allows you to iterate over a hashtable,
 61 | //         and call erase(), without invalidating the iterator.
 62 | //         To force the memory to be freed, call resize(0).
 63 | //         For tr1 compatibility, this can also be called as rehash(0).
 64 | //
 65 | //    3) min_load_factor(0.0)
 66 | //         Setting the minimum load factor to 0.0 guarantees that
 67 | //         the hash table will never shrink.
 68 | //
 69 | // Roughly speaking:
 70 | //   (1) dense_hash_set: fastest, uses the most memory unless entries are small
 71 | //   (2) sparse_hash_set: slowest, uses the least memory
 72 | //   (3) hash_set / unordered_set (STL): in the middle
 73 | //
 74 | // Typically I use sparse_hash_set when I care about space and/or when
 75 | // I need to save the hashtable on disk.  I use hash_set otherwise.  I
 76 | // don't personally use dense_hash_set ever; some people use it for
 77 | // small sets with lots of lookups.
 78 | //
 79 | // - dense_hash_set has, typically, about 78% memory overhead (if your
 80 | //   data takes up X bytes, the hash_set uses .78X more bytes in overhead).
 81 | // - sparse_hash_set has about 4 bits overhead per entry.
 82 | // - sparse_hash_set can be 3-7 times slower than the others for lookup and,
 83 | //   especially, inserts.  See time_hash_map.cc for details.
 84 | //
 85 | // See /usr/(local/)?doc/sparsehash-*/sparse_hash_set.html
 86 | // for information about how to use this class.
 87 | 
 88 | #ifndef _SPARSE_HASH_SET_H_
 89 | #define _SPARSE_HASH_SET_H_
 90 | 
 91 | #include <sparsehash/internal/sparseconfig.h>
 92 | #include <algorithm>                       // needed by stl_alloc
 93 | #include <functional>                      // for equal_to<>
 94 | #include <memory>                          // for alloc (which we don't use)
 95 | #include <utility>                         // for pair<>
 96 | #include <sparsehash/internal/libc_allocator_with_realloc.h>
 97 | #include <sparsehash/internal/sparsehashtable.h>      // IWYU pragma: export
 98 | #include HASH_FUN_H                // for hash<>
 99 | 
100 | _START_GOOGLE_NAMESPACE_
101 | 
102 | template <class Value,
103 |           class HashFcn = SPARSEHASH_HASH<Value>,   // defined in sparseconfig.h
104 |           class EqualKey = std::equal_to<Value>,
105 |           class Alloc = libc_allocator_with_realloc<Value> >
106 | class sparse_hash_set {
107 |  private:
108 |   // Apparently identity is not stl-standard, so we define our own
109 |   struct Identity {
110 |     typedef const Value& result_type;
111 |     const Value& operator()(const Value& v) const { return v; }
112 |   };
113 |   struct SetKey {
114 |     void operator()(Value* value, const Value& new_key) const {
115 |       *value = new_key;
116 |     }
117 |   };
118 | 
119 |   typedef sparse_hashtable<Value, Value, HashFcn, Identity, SetKey,
120 |                            EqualKey, Alloc> ht;
121 |   ht rep;
122 | 
123 |  public:
124 |   typedef typename ht::key_type key_type;
125 |   typedef typename ht::value_type value_type;
126 |   typedef typename ht::hasher hasher;
127 |   typedef typename ht::key_equal key_equal;
128 |   typedef Alloc allocator_type;
129 | 
130 |   typedef typename ht::size_type size_type;
131 |   typedef typename ht::difference_type difference_type;
132 |   typedef typename ht::const_pointer pointer;
133 |   typedef typename ht::const_pointer const_pointer;
134 |   typedef typename ht::const_reference reference;
135 |   typedef typename ht::const_reference const_reference;
136 | 
137 |   typedef typename ht::const_iterator iterator;
138 |   typedef typename ht::const_iterator const_iterator;
139 |   typedef typename ht::const_local_iterator local_iterator;
140 |   typedef typename ht::const_local_iterator const_local_iterator;
141 | 
142 | 
143 |   // Iterator functions -- recall all iterators are const
144 |   iterator begin() const                  { return rep.begin(); }
145 |   iterator end() const                    { return rep.end(); }
146 | 
147 |   // These come from tr1's unordered_set. For us, a bucket has 0 or 1 elements.
148 |   local_iterator begin(size_type i) const { return rep.begin(i); }
149 |   local_iterator end(size_type i) const   { return rep.end(i); }
150 | 
151 | 
152 |   // Accessor functions
153 |   allocator_type get_allocator() const    { return rep.get_allocator(); }
154 |   hasher hash_funct() const               { return rep.hash_funct(); }
155 |   hasher hash_function() const            { return hash_funct(); }  // tr1 name
156 |   key_equal key_eq() const                { return rep.key_eq(); }
157 | 
158 | 
159 |   // Constructors
160 |   explicit sparse_hash_set(size_type expected_max_items_in_table = 0,
161 |                            const hasher& hf = hasher(),
162 |                            const key_equal& eql = key_equal(),
163 |                            const allocator_type& alloc = allocator_type())
164 |       : rep(expected_max_items_in_table, hf, eql, Identity(), SetKey(), alloc) {
165 |   }
166 | 
167 |   template <class InputIterator>
168 |   sparse_hash_set(InputIterator f, InputIterator l,
169 |                   size_type expected_max_items_in_table = 0,
170 |                   const hasher& hf = hasher(),
171 |                   const key_equal& eql = key_equal(),
172 |                   const allocator_type& alloc = allocator_type())
173 |       : rep(expected_max_items_in_table, hf, eql, Identity(), SetKey(), alloc) {
174 |     rep.insert(f, l);
175 |   }
176 |   // We use the default copy constructor
177 |   // We use the default operator=()
178 |   // We use the default destructor
179 | 
180 |   void clear()                        { rep.clear(); }
181 |   void swap(sparse_hash_set& hs)      { rep.swap(hs.rep); }
182 | 
183 | 
184 |   // Functions concerning size
185 |   size_type size() const              { return rep.size(); }
186 |   size_type max_size() const          { return rep.max_size(); }
187 |   bool empty() const                  { return rep.empty(); }
188 |   size_type bucket_count() const      { return rep.bucket_count(); }
189 |   size_type max_bucket_count() const  { return rep.max_bucket_count(); }
190 | 
191 |   // These are tr1 methods.  bucket() is the bucket the key is or would be in.
192 |   size_type bucket_size(size_type i) const    { return rep.bucket_size(i); }
193 |   size_type bucket(const key_type& key) const { return rep.bucket(key); }
194 |   float load_factor() const {
195 |     return size() * 1.0f / bucket_count();
196 |   }
197 |   float max_load_factor() const {
198 |     float shrink, grow;
199 |     rep.get_resizing_parameters(&shrink, &grow);
200 |     return grow;
201 |   }
202 |   void max_load_factor(float new_grow) {
203 |     float shrink, grow;
204 |     rep.get_resizing_parameters(&shrink, &grow);
205 |     rep.set_resizing_parameters(shrink, new_grow);
206 |   }
207 |   // These aren't tr1 methods but perhaps ought to be.
208 |   float min_load_factor() const {
209 |     float shrink, grow;
210 |     rep.get_resizing_parameters(&shrink, &grow);
211 |     return shrink;
212 |   }
213 |   void min_load_factor(float new_shrink) {
214 |     float shrink, grow;
215 |     rep.get_resizing_parameters(&shrink, &grow);
216 |     rep.set_resizing_parameters(new_shrink, grow);
217 |   }
218 |   // Deprecated; use min_load_factor() or max_load_factor() instead.
219 |   void set_resizing_parameters(float shrink, float grow) {
220 |     rep.set_resizing_parameters(shrink, grow);
221 |   }
222 | 
223 |   void resize(size_type hint)         { rep.resize(hint); }
224 |   void rehash(size_type hint)         { resize(hint); }     // the tr1 name
225 | 
226 |   // Lookup routines
227 |   iterator find(const key_type& key) const           { return rep.find(key); }
228 | 
229 |   size_type count(const key_type& key) const         { return rep.count(key); }
230 | 
231 |   std::pair<iterator, iterator> equal_range(const key_type& key) const {
232 |     return rep.equal_range(key);
233 |   }
234 | 
235 | 
236 |   // Insertion routines
237 |   std::pair<iterator, bool> insert(const value_type& obj) {
238 |     std::pair<typename ht::iterator, bool> p = rep.insert(obj);
239 |     return std::pair<iterator, bool>(p.first, p.second);   // const to non-const
240 |   }
241 |   template <class InputIterator> void insert(InputIterator f, InputIterator l) {
242 |     rep.insert(f, l);
243 |   }
244 |   void insert(const_iterator f, const_iterator l) {
245 |     rep.insert(f, l);
246 |   }
247 |   // Required for std::insert_iterator; the passed-in iterator is ignored.
248 |   iterator insert(iterator, const value_type& obj)   {
249 |     return insert(obj).first;
250 |   }
251 | 
252 |   // Deletion routines
253 |   // THESE ARE NON-STANDARD!  I make you specify an "impossible" key
254 |   // value to identify deleted buckets.  You can change the key as
255 |   // time goes on, or get rid of it entirely to be insert-only.
256 |   void set_deleted_key(const key_type& key)   { rep.set_deleted_key(key); }
257 |   void clear_deleted_key()                    { rep.clear_deleted_key(); }
258 |   key_type deleted_key() const                { return rep.deleted_key(); }
259 | 
260 |   // These are standard
261 |   size_type erase(const key_type& key)               { return rep.erase(key); }
262 |   void erase(iterator it)                            { rep.erase(it); }
263 |   void erase(iterator f, iterator l)                 { rep.erase(f, l); }
264 | 
265 | 
266 |   // Comparison
267 |   bool operator==(const sparse_hash_set& hs) const   { return rep == hs.rep; }
268 |   bool operator!=(const sparse_hash_set& hs) const   { return rep != hs.rep; }
269 | 
270 | 
271 |   // I/O -- this is an add-on for writing metainformation to disk
272 |   //
273 |   // For maximum flexibility, this does not assume a particular
274 |   // file type (though it will probably be a FILE *).  We just pass
275 |   // the fp through to rep.
276 | 
277 |   // If your keys and values are simple enough, you can pass this
278 |   // serializer to serialize()/unserialize().  "Simple enough" means
279 |   // value_type is a POD type that contains no pointers.  Note,
280 |   // however, we don't try to normalize endianness.
281 |   typedef typename ht::NopointerSerializer NopointerSerializer;
282 | 
283 |   // serializer: a class providing operator()(OUTPUT*, const value_type&)
284 |   //    (writing value_type to OUTPUT).  You can specify a
285 |   //    NopointerSerializer object if appropriate (see above).
286 |   // fp: either a FILE*, OR an ostream*/subclass_of_ostream*, OR a
287 |   //    pointer to a class providing size_t Write(const void*, size_t),
288 |   //    which writes a buffer into a stream (which fp presumably
289 |   //    owns) and returns the number of bytes successfully written.
290 |   //    Note basic_ostream<not_char> is not currently supported.
291 |   template <typename ValueSerializer, typename OUTPUT>
292 |   bool serialize(ValueSerializer serializer, OUTPUT* fp) {
293 |     return rep.serialize(serializer, fp);
294 |   }
295 | 
296 |   // serializer: a functor providing operator()(INPUT*, value_type*)
297 |   //    (reading from INPUT and into value_type).  You can specify a
298 |   //    NopointerSerializer object if appropriate (see above).
299 |   // fp: either a FILE*, OR an istream*/subclass_of_istream*, OR a
300 |   //    pointer to a class providing size_t Read(void*, size_t),
301 |   //    which reads into a buffer from a stream (which fp presumably
302 |   //    owns) and returns the number of bytes successfully read.
303 |   //    Note basic_istream<not_char> is not currently supported.
304 |   // NOTE: Since value_type is const Key, ValueSerializer
305 |   // may need to do a const cast in order to fill in the key.
306 |   // NOTE: if Key is not a POD type, the serializer MUST use
307 |   // placement-new to initialize its value, rather than a normal
308 |   // equals-assignment or similar.  (The value_type* passed into
309 |   // the serializer points to garbage memory.)
310 |   template <typename ValueSerializer, typename INPUT>
311 |   bool unserialize(ValueSerializer serializer, INPUT* fp) {
312 |     return rep.unserialize(serializer, fp);
313 |   }
314 | 
315 |   // The four methods below are DEPRECATED.
316 |   // Use serialize() and unserialize() for new code.
317 |   template <typename OUTPUT>
318 |   bool write_metadata(OUTPUT *fp)       { return rep.write_metadata(fp); }
319 | 
320 |   template <typename INPUT>
321 |   bool read_metadata(INPUT *fp)         { return rep.read_metadata(fp); }
322 | 
323 |   template <typename OUTPUT>
324 |   bool write_nopointer_data(OUTPUT *fp) { return rep.write_nopointer_data(fp); }
325 | 
326 |   template <typename INPUT>
327 |   bool read_nopointer_data(INPUT *fp)   { return rep.read_nopointer_data(fp); }
328 | };
329 | 
330 | template <class Val, class HashFcn, class EqualKey, class Alloc>
331 | inline void swap(sparse_hash_set<Val, HashFcn, EqualKey, Alloc>& hs1,
332 |                  sparse_hash_set<Val, HashFcn, EqualKey, Alloc>& hs2) {
333 |   hs1.swap(hs2);
334 | }
335 | 
336 | _END_GOOGLE_NAMESPACE_
337 | 
338 | #endif /* _SPARSE_HASH_SET_H_ */
339 | 


--------------------------------------------------------------------------------
/cpp/sparsehash/template_util.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2005 Google Inc.
  2 | // All rights reserved.
  3 | //
  4 | // Redistribution and use in source and binary forms, with or without
  5 | // modification, are permitted provided that the following conditions are
  6 | // met:
  7 | //
  8 | //     * Redistributions of source code must retain the above copyright
  9 | // notice, this list of conditions and the following disclaimer.
 10 | //     * Redistributions in binary form must reproduce the above
 11 | // copyright notice, this list of conditions and the following disclaimer
 12 | // in the documentation and/or other materials provided with the
 13 | // distribution.
 14 | //     * Neither the name of Google Inc. nor the names of its
 15 | // contributors may be used to endorse or promote products derived from
 16 | // this software without specific prior written permission.
 17 | //
 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | 
 30 | // ----
 31 | //
 32 | // Template metaprogramming utility functions.
 33 | //
 34 | // This code is compiled directly on many platforms, including client
 35 | // platforms like Windows, Mac, and embedded systems.  Before making
 36 | // any changes here, make sure that you're not breaking any platforms.
 37 | //
 38 | //
 39 | // The names chosen here reflect those used in tr1 and the boost::mpl
 40 | // library, there are similar operations used in the Loki library as
 41 | // well.  I prefer the boost names for 2 reasons:
 42 | // 1.  I think that portions of the Boost libraries are more likely to
 43 | // be included in the c++ standard.
 44 | // 2.  It is not impossible that some of the boost libraries will be
 45 | // included in our own build in the future.
 46 | // Both of these outcomes means that we may be able to directly replace
 47 | // some of these with boost equivalents.
 48 | //
 49 | #ifndef BASE_TEMPLATE_UTIL_H_
 50 | #define BASE_TEMPLATE_UTIL_H_
 51 | 
 52 | #include <sparsehash/internal/sparseconfig.h>
 53 | _START_GOOGLE_NAMESPACE_
 54 | 
 55 | // Types small_ and big_ are guaranteed such that sizeof(small_) <
 56 | // sizeof(big_)
 57 | typedef char small_;
 58 | 
 59 | struct big_ {
 60 |   char dummy[2];
 61 | };
 62 | 
 63 | // Identity metafunction.
 64 | template <class T>
 65 | struct identity_ {
 66 |   typedef T type;
 67 | };
 68 | 
 69 | // integral_constant, defined in tr1, is a wrapper for an integer
 70 | // value. We don't really need this generality; we could get away
 71 | // with hardcoding the integer type to bool. We use the fully
 72 | // general integer_constant for compatibility with tr1.
 73 | 
 74 | template<class T, T v>
 75 | struct integral_constant {
 76 |   static const T value = v;
 77 |   typedef T value_type;
 78 |   typedef integral_constant<T, v> type;
 79 | };
 80 | 
 81 | template <class T, T v> const T integral_constant<T, v>::value;
 82 | 
 83 | 
 84 | // Abbreviations: true_type and false_type are structs that represent boolean
 85 | // true and false values. Also define the boost::mpl versions of those names,
 86 | // true_ and false_.
 87 | typedef integral_constant<bool, true>  true_type;
 88 | typedef integral_constant<bool, false> false_type;
 89 | typedef true_type  true_;
 90 | typedef false_type false_;
 91 | 
 92 | // if_ is a templatized conditional statement.
 93 | // if_<cond, A, B> is a compile time evaluation of cond.
 94 | // if_<>::type contains A if cond is true, B otherwise.
 95 | template<bool cond, typename A, typename B>
 96 | struct if_{
 97 |   typedef A type;
 98 | };
 99 | 
100 | template<typename A, typename B>
101 | struct if_<false, A, B> {
102 |   typedef B type;
103 | };
104 | 
105 | 
106 | // type_equals_ is a template type comparator, similar to Loki IsSameType.
107 | // type_equals_<A, B>::value is true iff "A" is the same type as "B".
108 | //
109 | // New code should prefer base::is_same, defined in base/type_traits.h.
110 | // It is functionally identical, but is_same is the standard spelling.
111 | template<typename A, typename B>
112 | struct type_equals_ : public false_ {
113 | };
114 | 
115 | template<typename A>
116 | struct type_equals_<A, A> : public true_ {
117 | };
118 | 
119 | // and_ is a template && operator.
120 | // and_<A, B>::value evaluates "A::value && B::value".
121 | template<typename A, typename B>
122 | struct and_ : public integral_constant<bool, (A::value && B::value)> {
123 | };
124 | 
125 | // or_ is a template || operator.
126 | // or_<A, B>::value evaluates "A::value || B::value".
127 | template<typename A, typename B>
128 | struct or_ : public integral_constant<bool, (A::value || B::value)> {
129 | };
130 | 
131 | 
132 | _END_GOOGLE_NAMESPACE_
133 | 
134 | #endif  // BASE_TEMPLATE_UTIL_H_
135 | 


--------------------------------------------------------------------------------
/cython/.gitignore:
--------------------------------------------------------------------------------
1 | *.c
2 | *.cpp
3 | 


--------------------------------------------------------------------------------
/publish.sh:
--------------------------------------------------------------------------------
#! /bin/sh
# Build the release artifacts, sign each of them, and upload the result.

# Stop at the first failing command.
set -e

# Build the sdist and the wheel with Python 2.7; WHEEL_TOOL is set to the
# path of the `wheel` executable in the environment of that command.
WHEEL_TOOL=$(which wheel) /usr/bin/python2.7 setup.py sdist bdist_wheel

# Write an ASCII-armored detached signature for every file under dist/.
find dist -type f -exec gpg2 --detach-sign --armor {} \;

# Upload everything in dist/.
twine upload dist/*
6 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cslarsen/arv/3999e00361f13d404d0d86d76bcedd4d8d49c393/setup.cfg


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
  1 | from setuptools import setup, Extension
  2 | from setuptools.command.build_ext import build_ext
  3 | import os
  4 | import shutil
  5 | import unittest
  6 | 
  7 | class ArvOptions:
  8 |     debug = os.getenv("ARV_DEBUG", False)
  9 |     debug_symbols = False
 10 |     hidden_symbols = True
 11 |     strip = True
 12 |     warnings = True
 13 | 
 14 |     # Currently just assume we have gcc/clang
 15 |     is_gcc = True
 16 | 
 17 |     @staticmethod
 18 |     def compile_flags():
 19 |         flags = []
 20 | 
 21 |         if not ArvOptions.is_gcc:
 22 |             return flags
 23 | 
 24 |         flags += ["--std=c++11", # REQUIRED
 25 |                   "-DBUILDING_DLL"] # REQUIRED
 26 | 
 27 |         if ArvOptions.warnings:
 28 |             flags += ["-W", "-Wall"]
 29 | 
 30 |         if not ArvOptions.debug:
 31 |             # Make the binary a good bit faster
 32 |             flags += [
 33 |                 "-fdata-sections", # small impact, i.e. not important
 34 |                 "-ffunction-sections", # small impact, i.e. not important
 35 |                 "-fno-rtti", # small impact, i.e. not important
 36 |                 "-march=native", # important
 37 |                 "-mtune=native", # important, but could use march=generic
 38 |                 "-O3", # important, but O2 also works fine
 39 |             ]
 40 | 
 41 |             if not ArvOptions.debug_symbols:
 42 |                 flags += ["-g0"]
 43 | 
 44 |             if ArvOptions.hidden_symbols:
 45 |                 flags += ["-fvisibility=hidden", # I like clean binaries
 46 |                           "-include", "cpp/public_py_init_sym.hpp"]
 47 |         return flags
 48 | 
 49 |     @staticmethod
 50 |     def link_flags():
 51 |         flags = []
 52 | 
 53 |         if not ArvOptions.is_gcc:
 54 |             return flags
 55 | 
 56 |         if ArvOptions.strip:
 57 |             flags += ["-Wl,-s"]
 58 | 
 59 |         return flags
 60 | 
 61 | # From http://stackoverflow.com/a/26698408/21028
 62 | class lazy_cythonize(list):
 63 |     def __init__(self, callback):
 64 |         self._list, self.callback = None, callback
 65 |     def c_list(self):
 66 |         if self._list is None: self._list = self.callback()
 67 |         return self._list
 68 |     def __iter__(self):
 69 |         for e in self.c_list(): yield e
 70 |     def __getitem__(self, ii): return self.c_list()[ii]
 71 |     def __len__(self): return len(self.c_list())
 72 | 
 73 | def configure_google_hashmap():
 74 |     script = os.path.join("3rd-party", "sparsehash", "configure")
 75 |     config = os.path.join("cpp", "sparsehash", "internal", "sparseconfig.h")
 76 | 
 77 |     if not os.path.isfile(config):
 78 |         print("Configuring Google hash map")
 79 |         if os.system(script) == 0:
 80 |             shutil.copy(os.path.join("src", "config.h"), config)
 81 |         else:
 82 |             raise RuntimeError("Error configuring Google hash map")
 83 | 
 84 | class BuildExt(build_ext):
 85 |     def run(self):
 86 |         configure_google_hashmap()
 87 |         return build_ext.run(self)
 88 | 
 89 | def extensions():
 90 |     from Cython.Build import cythonize
 91 |     import multiprocessing
 92 | 
 93 |     exts = [
 94 |         Extension("_arv", [
 95 |                 "cpp/arv.cpp",
 96 |                 "cpp/file.cpp",
 97 |                 "cpp/filesize.cpp",
 98 |                 "cpp/mmap.cpp",
 99 |                 "cpp/parse.cpp",
100 |                 "cython/_arv.pyx",
101 |             ],
102 |             language="c++",
103 |             include_dirs=["cpp"],
104 |             extra_compile_args=ArvOptions.compile_flags(),
105 |             extra_link_args=ArvOptions.link_flags(),
106 |         ),
107 |     ]
108 |     #configure_google_hashmap()
109 |     return cythonize(exts, nthreads=multiprocessing.cpu_count())
110 | 
def slurp(filename):
    """Return the whole content of ``filename``, opened in text mode."""
    handle = open(filename, "rt")
    try:
        return handle.read()
    finally:
        handle.close()
114 | 
def get_testsuite():
    """Discover ``test*.py`` modules under ``tests`` and return them as a suite."""
    return unittest.TestLoader().discover("tests", pattern="test*.py")
119 | 
# Package definition.  Running this module calls setup() right away, which
# also reads README.rst from the current directory for the long description.
setup(
    name="arv",
    packages=["arv"],
    version="0.9.3",
    description="A fast 23andMe raw genome file parser",
    author="Christian Stigen Larsen",
    author_email="csl@csl.name",
    url="https://github.com/cslarsen/arv",
    # NOTE(review): this is a URL to the license text rather than a license
    # name -- confirm that this is what the package index should display.
    license="https://www.gnu.org/licenses/gpl-3.0.html",
    long_description=slurp("README.rst"),
    keywords=[
        "23andMe",
        "bio",
        "biology",
        "biopython",
        "disease",
        "DNA",
        "gene",
        "genome",
        "health",
        "protein",
        "RNA",
        "RSID",
        "SNP",
    ],
    platforms=["unix", "linux", "osx"],
    install_requires=["cython>=0.25"],
    setup_requires=["cython>=0.25"],
    # Wrapped in lazy_cythonize so that extensions() -- and with it the
    # Cython import -- only runs once the extension list is first accessed.
    ext_modules=lazy_cythonize(extensions),
    # Refers to get_testsuite() defined in this file.
    test_suite="setup.get_testsuite",
    # Custom build_ext that runs configure_google_hashmap() before building.
    cmdclass={'build_ext': BuildExt},
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Natural Language :: English",
        "Operating System :: MacOS :: MacOS X",
        "Operating System :: POSIX",
        "Operating System :: Unix",
        "Programming Language :: Python",
        "Programming Language :: Python :: 2",
        "Programming Language :: Python :: 2.7",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.2",
        "Programming Language :: Python :: 3.3",
        "Programming Language :: Python :: 3.4",
        "Programming Language :: Python :: 3.5",
        "Programming Language :: Python :: 3.6",
    ],
)
168 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cslarsen/arv/3999e00361f13d404d0d86d76bcedd4d8d49c393/tests/__init__.py


--------------------------------------------------------------------------------
/tests/example.py:
--------------------------------------------------------------------------------
 1 | from arv import load, unphased_match as match
 2 | 
 3 | genome = load("genome.txt")
 4 | 
 5 | print("You are a {gender} with {color} eyes and {complexion} skin.".format(
 6 |     gender     = "man" if genome.y_chromosome else "woman",
 7 |     complexion = "light" if genome["rs1426654"] == "AA" else "dark",
 8 |     color      = match(genome["rs12913832"], {"AA": "brown",
 9 |                                               "AG": "brown or green",
10 |                                               "GG": "blue"})))
11 | 


--------------------------------------------------------------------------------
/tests/fake_genome.txt:
--------------------------------------------------------------------------------
 1 | # This data file generated by 23andMe at: Wed Mar 15 12:34:56 2017
 2 | #
 3 | # Below is a text version of your data.  Fields are TAB-separated
 4 | # Each line corresponds to a single SNP.  For each SNP, we provide its identifier 
 5 | # (an rsid or an internal id), its location on the reference human genome, and the 
 6 | # genotype call oriented with respect to the plus strand on the human reference sequence.
 7 | # We are using reference human assembly build 37 (also known as Annotation Release 104).
 8 | # Note that it is possible that data downloaded at different times may be different due to ongoing 
 9 | # improvements in our ability to call genotypes. More information about these changes can be found at:
10 | # https://www.23andme.com/you/download/revisions/
11 | # 
12 | # More information on reference human assembly build 37 (aka Annotation Release 104):
13 | # http://www.ncbi.nlm.nih.gov/mapview/map_search.cgi?taxid=9606
14 | #
15 | # rsid	chromosome	position	genotype
16 | rs4477212	1	82154	AT
17 | rs4672279	2	59444675	GT
18 | rs4536786	3	140049121	CA
19 | rs7715122	5	94197884	AT
20 | rs11980927	7	20010422	GG
21 | rs10810289	9	14899708	AA
22 | rs10488822	11	35984271	TC
23 | rs913897	13	73892459	AC
24 | rs1540613	16	80476182	AG
25 | rs6123756	20	56556146	TT
26 | rs6015286	20	57048415	--
27 | rs6026400	20	57183524	CC
28 | rs742927	Y	57183914	GG
29 | i3001754	MT	16256	A
30 | i3001755	MT	16257	--
31 | i3001759	MT	16258	--
32 | i3001761	MT	16259	--
33 | i3001773	MT	16265	T
34 | i4000755	MT	16548	C
35 | i4000759	MT	16567	G
36 | rs1426654	15	48426484	AA
37 | rs12913832	15	28365618	GG
38 | rs28504042	MT	1549	--
39 | rs3135027	MT	1598	G
40 | rs671	12	112241766	GG
41 | 


--------------------------------------------------------------------------------
/tests/fake_genome_female.txt:
--------------------------------------------------------------------------------
 1 | # This data file generated by 23andMe at: Wed Mar 15 12:34:56 2017
 2 | #
 3 | # Below is a text version of your data.  Fields are TAB-separated
 4 | # Each line corresponds to a single SNP.  For each SNP, we provide its identifier 
 5 | # (an rsid or an internal id), its location on the reference human genome, and the 
 6 | # genotype call oriented with respect to the plus strand on the human reference sequence.
 7 | # We are using reference human assembly build 37 (also known as Annotation Release 104).
 8 | # Note that it is possible that data downloaded at different times may be different due to ongoing 
 9 | # improvements in our ability to call genotypes. More information about these changes can be found at:
10 | # https://www.23andme.com/you/download/revisions/
11 | # 
12 | # More information on reference human assembly build 37 (aka Annotation Release 104):
13 | # http://www.ncbi.nlm.nih.gov/mapview/map_search.cgi?taxid=9606
14 | #
15 | # rsid	chromosome	position	genotype
16 | rs4477212	1	82154	AT
17 | rs4672279	2	59444675	GT
18 | rs4536786	3	140049121	CA
19 | rs7715122	5	94197884	AT
20 | rs11980927	7	20010422	GG
21 | rs10810289	9	14899708	AA
22 | rs10488822	11	35984271	TC
23 | rs913897	13	73892459	AC
24 | rs1540613	16	80476182	AG
25 | rs6123756	20	56556146	TT
26 | rs6015286	20	57048415	--
27 | rs6026400	20	57183524	CC
28 | i3001754	MT	16256	A
29 | i3001755	MT	16257	--
30 | i3001759	MT	16258	--
31 | i3001761	MT	16259	--
32 | i3001773	MT	16265	T
33 | i4000755	MT	16548	C
34 | i4000759	MT	16567	G
35 | rs1426654	15	48426484	AA
36 | rs12913832	15	28365618	GG
37 | rs28504042	MT	1549	--
38 | rs3135027	MT	1598	G
39 | rs671	12	112241766	GG
40 | 


--------------------------------------------------------------------------------
/tests/test_benchmark.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Various benchmarks for arv.
  3 | 
  4 | arv
  5 | Copyright 2017 Christian Stigen Larsen
  6 | Distributed under the GNU GPL v3 or later; see COPYING.
  7 | """
  8 | 
  9 | import argparse
 10 | import arv
 11 | import contextlib
 12 | import os
 13 | import random
 14 | import sys
 15 | import time
 16 | import unittest
 17 | 
# Named benchmark snippets. Each value is a string of Python source that
# benchmark() runs with exec(). The names used inside the snippets are
# provided by the callers: benchmark() supplies `arv`, `sys` and (on
# Python 3) `xrange`; all_benchmarks() passes `filename`, `genome` and
# `random`.
benchmarks = {
    # Parse the whole genome file from scratch.
    "parsing": "arv.load(filename)",

    # 5000 lookups at random positions between genome.first and genome.last;
    # positions without an entry raise KeyError and are skipped.
    "random access":
r"""
for n in xrange(5000):
    try:
        pos = random.randint(genome.first, genome.last)
        snp = genome[pos]
    except KeyError:
        # RSIDs are not contiguous
        pass
""",

    # Deliberately disabled: the leading assert(False) makes the snippet
    # raise immediately, so the loop below it never runs.
    "iterate items in genome":
r"""
assert(False) # this is too slow at the moment
num = 0
for snp in genome:
    num += 1
assert(num == len(genome))
""",

    # Walk genome.rsids and check that the count equals len(genome).
    "iterate rsids":
r"""
num = 0
for snp in genome.rsids:
    num += 1
assert(num == len(genome))
""",

    # Walk genome.snps and check that the count equals len(genome).
    "iterate snps":
r"""
num = 0
for snp in genome.snps:
    num += 1
assert(num == len(genome))
""",
}
 57 | 
 58 | def log(msg, stream=sys.stdout):
 59 |     stream.write(msg)
 60 |     stream.flush()
 61 | 
 62 | if sys.version_info[:2] >= (3, 3):
 63 |     mark_time = time.perf_counter
 64 | else:
 65 |     mark_time = time.clock
 66 | 
 67 | @contextlib.contextmanager
 68 | def timed_block():
 69 |     start = mark_time()
 70 |     elapsed = None
 71 |     yield lambda: elapsed
 72 |     elapsed = mark_time() - start
 73 | 
 74 | def benchmark(times, code, **local_args):
 75 |     stream = local_args.get("stream", sys.stdout)
 76 |     prefix = local_args.get("prefix", "")
 77 |     best = 1e9
 78 |     for no in range(times):
 79 |         localvars = {
 80 |             "arv": arv,
 81 |             "sys": sys
 82 |         }
 83 |         if sys.version_info[0] >= 3:
 84 |             localvars["xrange"] = range
 85 | 
 86 |         localvars.update(local_args)
 87 | 
 88 |         with timed_block() as elapsed:
 89 |             exec(code, localvars)
 90 | 
 91 |         elapsed = elapsed()
 92 |         if elapsed < best:
 93 |             if round(elapsed, 4) < round(best, 4):
 94 |                 log("\n%s%6.4fs " % (prefix, elapsed), stream=stream)
 95 |             best = elapsed
 96 |         else:
 97 |             log(".", stream=stream)
 98 |     return best
 99 | 
def all_benchmarks(filename, times):
    """Run every snippet in ``benchmarks`` against ``filename``.

    Args:
        filename: Path of the genome file to load and benchmark with.
        times: Number of repetitions per benchmark.

    Returns:
        Dict mapping benchmark name to its best time in seconds. Benchmarks
        that raised an exception are logged and left out of the dict.
    """
    log("Benchmarking arv %s at %s\n" % (arv.__version__, arv.__file__))
    log("Measuring time with %s\n\n" % mark_time)

    results = {}
    genome = arv.load(filename)

    for name, code in sorted(benchmarks.items()):
        log("Benchmarking %s x %d ... " % (repr(name), times))
        try:
            results[name] = benchmark(times, code, filename=filename,
                    genome=genome, random=random)
        except Exception as e:
            log(str(e))
        else:
            # Report the parsing rate only when the benchmark produced a
            # result; looking up results[name] after a failure would raise
            # KeyError and abort the remaining benchmarks.
            if name == "parsing":
                genome = arv.load(filename)
                log(" %.2g SNPs / second" % (len(genome)/results[name]))
        finally:
            log("\n")

    return results
121 | 
class BenchmarkTests(unittest.TestCase):
    """Opt-in parser speed test.

    Skipped unless the ARV_BENCHMARK environment variable names a genome
    file. ARV_BENCHMARK_COUNT optionally sets the number of repetitions.
    """

    @staticmethod
    def _repeat_count(default=40):
        """Return the repetition count from ARV_BENCHMARK_COUNT.

        Falls back to ``default`` when the variable is unset or is not a
        valid integer. Only ValueError is caught, so unrelated exceptions
        such as KeyboardInterrupt are not swallowed.
        """
        try:
            return int(os.getenv("ARV_BENCHMARK_COUNT", str(default)))
        except ValueError:
            return default

    @unittest.skipUnless(os.getenv("ARV_BENCHMARK", None) is not None,
        "Specify ARV_BENCHMARK=<genome filename> to benchmark")
    def test_parser_speed(self):
        """Time ``arv.load`` on the given file and report SNPs per second."""
        filename = os.getenv("ARV_BENCHMARK")
        self.assertTrue(os.path.isfile(filename),
                "File not found: %s" % filename)
        times = self._repeat_count()
        code = benchmarks["parsing"]
        seconds = benchmark(times, code, filename=filename, stream=sys.stderr,
                prefix="  ")
        # Load once more outside the timed runs to get the SNP count.
        genome = arv.load(filename)
        sys.stderr.flush()
        sys.stderr.write(" %d SNPs in ~%dms or %.1g SNPs/second ... " % (
                len(genome), int(round(seconds, 3)*1000), len(genome)/seconds))
        sys.stderr.flush()
141 | 
142 | 
if __name__ == "__main__":
    # Command-line entry point: run all benchmarks against the given file.
    parser = argparse.ArgumentParser()
    parser.add_argument("--times", "-t", default=20, type=int)
    parser.add_argument("--filename", "-f", required=True, type=str)
    options = parser.parse_args()
    all_benchmarks(options.filename, options.times)
149 | 


--------------------------------------------------------------------------------
/tests/test_commandline.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Tests for arv.
 3 | 
 4 | arv
 5 | Copyright 2017 Christian Stigen Larsen
 6 | Distributed under the GNU GPL v3 or later; see COPYING.
 7 | """
 8 | 
 9 | import arv
10 | import os
11 | import subprocess
12 | import sys
13 | import unittest
14 | 
class ArvModuleTests(unittest.TestCase):
    """Runs ``python -m arv`` in a subprocess and checks what it prints."""

    @classmethod
    def setUpClass(cls):
        # The fixture lives next to this test module.
        here = os.path.dirname(__file__)
        cls.genome_path = os.path.join(here, "fake_genome.txt")

    def _execute(self, *args):
        """Run the arv module with ``args`` and return its output as lines."""
        command = [sys.executable, "-m", "arv"]
        command.extend(args)
        raw = subprocess.check_output(command, universal_newlines=True)
        # Normalise Windows line endings before splitting.
        return raw.replace("\r\n", "\n").split("\n")

    def test_help(self):
        help_text = "\n".join(self._execute("--help"))
        self.assertTrue(help_text.startswith("usage: arv [-h]"))

    def test_example(self):
        expected = [
            "fake_genome.txt ... 25 SNPs, male",
            "fake_genome.txt ... ",
            "  Alcohol flush reaction: Little to no reaction (two copies of the ALDH2 gene)",
            "  Description           : A man with blue eyes and light skin",
            "",
        ]
        # NOTE(review): "europan" is the spelling in the original test; it may
        # be a typo for "european" -- confirm against the CLI before changing.
        actual = self._execute("--example", "--ethnicity=europan",
                               self.genome_path)
        self.assertEqual(actual, expected)
38 | 
39 | 


--------------------------------------------------------------------------------
/tests/test_infer.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Inferring tests for arv.
 3 | 
 4 | arv
 5 | Copyright 2017 Christian Stigen Larsen
 6 | Distributed under the GNU GPL v3 or later; see COPYING.
 7 | """
 8 | 
 9 | import arv
10 | import unittest
11 | 
class ArvInferTests(unittest.TestCase):
    """Checks simple inferences made from the SNPs in fake_genome.txt."""

    @classmethod
    def setUpClass(cls):
        # Resolve the fixture relative to this file so the tests do not
        # depend on the current working directory.
        import os
        cls.genome = arv.load(os.path.join(os.path.dirname(__file__),
                "fake_genome.txt"))

    def test_infer_gender(self):
        # The fixture contains a SNP on the Y chromosome.
        gender = "man" if self.genome.y_chromosome else "woman"
        self.assertEqual(gender, "man")

    def test_infer_complexion(self):
        # Comparison through the explicit genotype attribute
        complexion = "light" if self.genome["rs1426654"].genotype == "AA" else "dark"
        self.assertEqual(complexion, "light")

        # Rich comparison with string
        complexion = "light" if self.genome["rs1426654"] == "AA" else "dark"
        self.assertEqual(complexion, "light")

    def test_infer_unphased_match_eyecolor(self):
        # rs12913832 is "GG" in the fixture.
        eyecolor = arv.unphased_match(self.genome["rs12913832"], {
            "AA": "brown eyes",
            "AG": "brown or green eyes",
            "GG": "blue eyes",
            None: "unknown"})
        self.assertEqual(eyecolor, "blue eyes")
36 | 


--------------------------------------------------------------------------------
/tests/test_traits.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Trait inference tests for arv.
 3 | 
 4 | arv
 5 | Copyright 2017 Christian Stigen Larsen
 6 | Distributed under the GNU GPL v3 or later; see COPYING.
 7 | """
 8 | 
 9 | import arv
10 | import arv.traits
11 | import unittest
12 | 
class ArvTraitsTest(unittest.TestCase):
    """Checks functions in arv.traits against the fake_genome.txt fixture."""

    @classmethod
    def setUpClass(cls):
        # Resolve the fixture relative to this file so the tests do not
        # depend on the current working directory.
        import os
        cls.genome = arv.load(os.path.join(os.path.dirname(__file__),
                "fake_genome.txt"), ethnicity="european")

    def test_alcohol_flush_reaction(self):
        # rs671 is "GG" in the fixture.
        self.assertEqual(self.genome["rs671"], "GG")
        self.assertEqual(arv.traits.alcohol_flush_reaction(self.genome),
                "Little to no reaction (two copies of the ALDH2 gene)")
22 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py27, py34
3 | 
4 | [testenv]
5 | commands = {envpython} setup.py test
6 | 


--------------------------------------------------------------------------------