├── MANIFEST.in ├── PySeqArray ├── data │ └── 1KG_phase1_release_v3_chr22.gds └── __init__.py ├── src ├── LinkGDS.c ├── ReadByVariant.h ├── Methods.cpp ├── vectorization.h ├── Index.h ├── GetData.cpp ├── ReadByVariant.cpp ├── PySeqArray.cpp └── Index.cpp ├── .travis.yml ├── setup.py ├── .gitignore ├── README.md └── docs └── demo └── tutorial_parallel.ipynb /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | recursive-include src *.h 3 | recursive-include src *.cpp 4 | recursive-include src *.c 5 | -------------------------------------------------------------------------------- /PySeqArray/data/1KG_phase1_release_v3_chr22.gds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoreArray/PySeqArray/HEAD/PySeqArray/data/1KG_phase1_release_v3_chr22.gds -------------------------------------------------------------------------------- /src/LinkGDS.c: -------------------------------------------------------------------------------- 1 | // 2 | // LinkGDS.c: Link to the pygds package 3 | // 4 | 5 | #include 6 | 7 | // do not modify this file, PyGDS2.h is from the pygds package 8 | 9 | #include 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.3" 5 | - "3.4" 6 | - "3.5" 7 | - "3.6" 8 | # command to install dependencies 9 | install: "pip install git+git://github.com/CoreArray/pygds.git" 10 | # command to run tests 11 | script: "pip install ." 
12 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup, Extension 2 | import numpy 3 | import os 4 | import pygds 5 | 6 | 7 | src_fnlst = [ os.path.join('src', fn) for fn in [ 8 | 'GetData.cpp', 'Index.cpp', 'Methods.cpp', 'ReadByVariant.cpp', 9 | 'PySeqArray.cpp', 'LinkGDS.c', 'vectorization.c' ] ] 10 | 11 | 12 | setup(name='PySeqArray', 13 | version = '0.1.0', 14 | description = 'Python Interface to SeqArray Files for Data Management of Whole-Genome Sequence Variant Calls', 15 | url = 'http://github.com/CoreArray/PySeqArray', 16 | author = 'Xiuwen Zheng', 17 | author_email = 'zhengxwen@gmail.com', 18 | license = 'GPLv3', 19 | packages = [ 'PySeqArray' ], 20 | install_requires = [ 'numpy', 'pygds' ], # 'multiprocessing' ], 21 | ext_modules = [ Extension('PySeqArray.ccall', 22 | src_fnlst, 23 | include_dirs = [ pygds.get_include(), numpy.get_include() ], 24 | define_macros = [ ('USING_PYTHON', None) ], 25 | ) ], 26 | package_data = { 27 | 'PySeqArray': [ 'data/*.gds' ] 28 | } 29 | ) 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | PySeqArray: data manipulation of whole-genome sequencing variants with SeqArray files in Python 2 | === 3 | 4 | ![GPLv3](http://www.gnu.org/graphics/gplv3-88x31.png) 5 | [GNU General Public License, GPLv3](http://www.gnu.org/copyleft/gpl.html) (2017) 6 | 7 | [![Build Status](https://travis-ci.org/CoreArray/PySeqArray.png)](https://travis-ci.org/CoreArray/PySeqArray) 8 | 9 | pre-release version: v0.1 10 | 11 | 12 | ## Features 13 | 14 | Data management of whole-genome sequence variant calls with thousands of individuals: genotypic data (e.g., SNVs, indels and structural variation calls) and annotations in SeqArray files are stored in an array-oriented and compressed manner, with efficient data access using the Python programming language. 15 | 16 | The SeqArray format is built on top of Genomic Data Structure (GDS) data format, and defines required data structure. 
GDS is a flexible and portable data container with a hierarchical structure to store multiple scalable array-oriented data sets. It is suited for large-scale datasets, especially for data which are much larger than the available random-access memory. It also offers efficient operations specifically designed for integers of less than 8 bits, since a diploid genotype usually occupies fewer bits than a byte. Data compression and decompression are available with relatively efficient random access. 17 | 18 | 19 | ## Prerequisites 20 | 21 | Python 2 (2.6-2.7), and Python 3 (3.3-3.6) 22 | 23 | NumPy 1.6.0 or later 24 | 25 | [pygds](https://github.com/CoreArray/pygds) 26 | 27 | 28 | ## Installation 29 | 30 | ```sh 31 | ## require the pygds package 32 | pip install git+git://github.com/CoreArray/pygds.git 33 | ## install PySeqArray 34 | pip install git+git://github.com/CoreArray/PySeqArray.git 35 | ``` 36 | 37 | 38 | ## Citation 39 | 40 | #### Original paper (implemented in an [R/Bioconductor](http://bioconductor.org/packages/SeqArray) package): 41 | 42 | [SeqArray](http://bioconductor.org/packages/SeqArray) 43 | 44 | Zheng X, Gogarten S, Lawrence M, Stilp A, Conomos M, Weir BS, Laurie C, Levine D (2017). SeqArray -- A storage-efficient high-performance data format for WGS variant calls. *Bioinformatics*. [DOI: 10.1093/bioinformatics/btx145](http://dx.doi.org/10.1093/bioinformatics/btx145). 
45 | 46 | 47 | 48 | ## SeqArray File Download 49 | 50 | * [1000 Genomes Project](http://bochet.gcc.biostat.washington.edu/seqarray/1000genomes) 51 | 52 | 53 | ## Examples 54 | 55 | ```python 56 | import PySeqArray as ps 57 | 58 | fn = ps.seqExample('1KG_phase1_release_v3_chr22.gds') 59 | f = ps.SeqArrayFile() 60 | f.open(fn) 61 | f.show() 62 | f.close() 63 | ``` 64 | 65 | ``` 66 | File: PySeqArray/data/1KG_phase1_release_v3_chr22.gds (1.1M) 67 | + [ ] * 68 | |--+ description [ ] * 69 | |--+ sample.id { Str8 1092 LZMA_ra(10.5%), 914B } * 70 | |--+ variant.id { Int32 19773 LZMA_ra(8.39%), 6.5K } * 71 | |--+ position { Int32 19773 LZMA_ra(52.0%), 40.1K } * 72 | |--+ chromosome { Str8 19773 LZMA_ra(0.28%), 166B } * 73 | |--+ allele { Str8 19773 LZMA_ra(22.7%), 109.2K } * 74 | |--+ genotype [ ] * 75 | | |--+ data { Bit2 19773x1092x2 LZMA_ra(8.17%), 861.8K } * 76 | | |--+ extra.index { Int32 0x3 LZMA_ra, 19B } * 77 | | \--+ extra { Int16 0 LZMA_ra, 19B } 78 | |--+ phase [ ] 79 | | |--+ data { Bit1 19773x1092 LZMA_ra(0.02%), 550B } * 80 | | |--+ extra.index { Int32 0x3 LZMA_ra, 19B } * 81 | | \--+ extra { Bit1 0 LZMA_ra, 19B } 82 | |--+ annotation [ ] 83 | | |--+ id { Str8 19773 LZMA_ra(35.2%), 75.2K } * 84 | | |--+ qual { Float32 19773 LZMA_ra(3.62%), 2.8K } * 85 | | |--+ filter { Int32,factor 19773 LZMA_ra(0.21%), 170B } * 86 | | |--+ info [ ] 87 | | \--+ format [ ] 88 | \--+ sample.annotation [ ] 89 | |--+ Family.ID { Str8 1092 LZMA_ra(15.3%), 1.1K } 90 | |--+ Population { Str8 1092 LZMA_ra(5.08%), 222B } 91 | |--+ Gender { Str8 1092 LZMA_ra(5.85%), 386B } 92 | \--+ Ancestry { Str8 1092 LZMA_ra(2.43%), 233B } 93 | ``` 94 | 95 | 96 | ### More examples 97 | 98 | Python tutorial with SeqArray files: [docs/demo/tutorial.ipynb](docs/demo/tutorial.ipynb) 99 | 100 | Python tutorial with multiprocessing: [docs/demo/tutorial_parallel.ipynb](docs/demo/tutorial_parallel.ipynb) 101 | 102 | -------------------------------------------------------------------------------- 
/src/ReadByVariant.h: -------------------------------------------------------------------------------- 1 | // =========================================================== 2 | // 3 | // ReadByVariant.h: Read data variant by variant 4 | // 5 | // Copyright (C) 2017 Xiuwen Zheng 6 | // 7 | // This file is part of PySeqArray. 8 | // 9 | // PySeqArray is free software: you can redistribute it and/or modify it 10 | // under the terms of the GNU General Public License Version 3 as 11 | // published by the Free Software Foundation. 12 | // 13 | // PySeqArray is distributed in the hope that it will be useful, but 14 | // WITHOUT ANY WARRANTY; without even the implied warranty of 15 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | // GNU General Public License for more details. 17 | // 18 | // You should have received a copy of the GNU General Public License 19 | // along with PySeqArray. 20 | // If not, see . 21 | 22 | #include "Index.h" 23 | 24 | 25 | namespace PySeqArray 26 | { 27 | 28 | using namespace Vectorization; 29 | 30 | 31 | // ===================================================================== 32 | 33 | /// Object for reading basic variables variant by variant 34 | class COREARRAY_DLL_LOCAL CApply_Variant_Basic: public CApply_Variant 35 | { 36 | protected: 37 | C_SVType SVType; 38 | public: 39 | /// constructor 40 | CApply_Variant_Basic(CFileInfo &File, const char *var_name); 41 | virtual void ReadData(PyObject *val); 42 | virtual PyObject *NeedArray(); 43 | }; 44 | 45 | 46 | /// Object for reading positions variant by variant 47 | class COREARRAY_DLL_LOCAL CApply_Variant_Pos: public CApply_Variant 48 | { 49 | protected: 50 | int *PtrPos; 51 | PyObject *VarNode; ///< R object 52 | public: 53 | /// constructor 54 | CApply_Variant_Pos(CFileInfo &File); 55 | virtual void ReadData(PyObject *val); 56 | virtual PyObject *NeedArray(); 57 | }; 58 | 59 | 60 | /// Object for reading chromosomes variant by variant 61 | class COREARRAY_DLL_LOCAL 
CApply_Variant_Chrom: public CApply_Variant 62 | { 63 | protected: 64 | CChromIndex *ChromIndex; 65 | PyObject *VarNode; ///< R object 66 | public: 67 | /// constructor 68 | CApply_Variant_Chrom(CFileInfo &File); 69 | virtual void ReadData(PyObject *val); 70 | virtual PyObject *NeedArray(); 71 | }; 72 | 73 | 74 | // ===================================================================== 75 | 76 | /// Object for reading genotypes variant by variant 77 | class COREARRAY_DLL_LOCAL CApply_Variant_Geno: public CApply_Variant 78 | { 79 | protected: 80 | CGenoIndex *GenoIndex; ///< indexing genotypes 81 | ssize_t SiteCount; ///< the total number of entries at a site 82 | ssize_t CellCount; ///< the selected number of entries at a site 83 | vector Selection; ///< the buffer of selection 84 | VEC_AUTO_PTR ExtPtr; ///< a pointer to the additional buffer 85 | PyObject *VarIntGeno; ///< genotype R integer object 86 | 87 | inline int _ReadGenoData(int *Base); 88 | inline C_UInt8 _ReadGenoData(C_UInt8 *Base); 89 | 90 | public: 91 | ssize_t SampNum; ///< the number of selected samples 92 | int Ploidy; ///< ploidy 93 | 94 | /// constructor 95 | CApply_Variant_Geno(); 96 | CApply_Variant_Geno(CFileInfo &File); 97 | ~CApply_Variant_Geno(); 98 | 99 | void Init(CFileInfo &File); 100 | 101 | virtual PyObject *NeedArray(); 102 | virtual void ReadData(PyObject *val); 103 | 104 | /// read genotypes in 32-bit integer 105 | void ReadGenoData(int *Base); 106 | /// read genotypes in unsigned 8-bit intetger 107 | void ReadGenoData(C_UInt8 *Base); 108 | }; 109 | 110 | 111 | // ===================================================================== 112 | 113 | /// Object for reading genotypes (dosages) variant by variant 114 | class COREARRAY_DLL_LOCAL CApply_Variant_Dosage: public CApply_Variant_Geno 115 | { 116 | protected: 117 | VEC_AUTO_PTR ExtPtr2; ///< a pointer to the additional buffer for dosages 118 | public: 119 | /// constructor 120 | CApply_Variant_Dosage(CFileInfo &File); 121 | 122 | 
virtual PyObject *NeedArray(); 123 | virtual void ReadData(PyObject *val); 124 | 125 | /// read dosages in 32-bit integer 126 | void ReadDosage(int *Base); 127 | /// read dosages in unsigned 8-bit intetger 128 | void ReadDosage(C_UInt8 *Base); 129 | }; 130 | 131 | 132 | // ===================================================================== 133 | 134 | /// Object for reading phasing information variant by variant 135 | class COREARRAY_DLL_LOCAL CApply_Variant_Phase: public CApply_Variant 136 | { 137 | protected: 138 | ssize_t SiteCount; ///< the total number of entries at a site 139 | ssize_t CellCount; ///< the selected number of entries at a site 140 | bool UseRaw; ///< whether use RAW type 141 | vector Selection; ///< the buffer of selection 142 | PyObject *VarPhase; ///< genotype R object 143 | 144 | public: 145 | ssize_t SampNum; ///< the number of selected samples 146 | int Ploidy; ///< ploidy 147 | 148 | /// constructor 149 | CApply_Variant_Phase(); 150 | CApply_Variant_Phase(CFileInfo &File, bool use_raw); 151 | 152 | void Init(CFileInfo &File, bool use_raw); 153 | 154 | virtual void ReadData(PyObject *val); 155 | virtual PyObject *NeedArray(); 156 | }; 157 | 158 | 159 | // ===================================================================== 160 | 161 | /// Object for reading info variables variant by variant 162 | class COREARRAY_DLL_LOCAL CApply_Variant_Info: public CApply_Variant 163 | { 164 | protected: 165 | CIndex *VarIndex; ///< indexing the format variable 166 | C_SVType SVType; ///< data type for GDS reading 167 | C_Int32 BaseNum; ///< if 2-dim, the size of the first dimension 168 | map VarList; ///< a list of PyObject variables 169 | 170 | public: 171 | /// constructor 172 | CApply_Variant_Info(CFileInfo &File, const char *var_name); 173 | 174 | virtual void ReadData(PyObject *val); 175 | virtual PyObject *NeedArray(); 176 | }; 177 | 178 | 179 | // ===================================================================== 180 | 181 | /// Object for 
reading format variables variant by variant 182 | class COREARRAY_DLL_LOCAL CApply_Variant_Format: public CApply_Variant 183 | { 184 | protected: 185 | CIndex *VarIndex; ///< indexing the format variable 186 | ssize_t _TotalSampNum; ///< the total number of samples 187 | 188 | C_SVType SVType; ///< data type for GDS reading 189 | C_BOOL *SelPtr[2]; ///< pointers to selection 190 | map VarList; ///< a list of PyObject variables 191 | 192 | public: 193 | ssize_t SampNum; ///< the number of selected samples 194 | 195 | /// constructor 196 | CApply_Variant_Format(); 197 | CApply_Variant_Format(CFileInfo &File, const char *var_name); 198 | 199 | void Init(CFileInfo &File, const char *var_name); 200 | 201 | virtual void ReadData(PyObject *val); 202 | virtual PyObject *NeedArray(); 203 | }; 204 | 205 | 206 | // ===================================================================== 207 | 208 | /// Object for calculating the number of distinct alleles variant by variant 209 | class COREARRAY_DLL_LOCAL CApply_Variant_NumAllele: public CApply_Variant 210 | { 211 | private: 212 | string strbuf; 213 | public: 214 | /// constructor 215 | CApply_Variant_NumAllele(CFileInfo &File); 216 | 217 | virtual PyObject *NeedArray(); 218 | virtual void ReadData(PyObject *val); 219 | int GetNumAllele(); 220 | }; 221 | 222 | } 223 | 224 | 225 | extern "C" 226 | { 227 | 228 | /// Apply functions over margins on a working space 229 | COREARRAY_DLL_EXPORT PyObject *SEQ_Apply_Variant(PyObject *gdsfile, PyObject *var_name, 230 | PyObject *FUN, PyObject *as_is, PyObject *var_index, PyObject *param, PyObject *rho); 231 | 232 | } // extern "C" 233 | -------------------------------------------------------------------------------- /PySeqArray/__init__.py: -------------------------------------------------------------------------------- 1 | # import numpy 2 | import numpy as np 3 | # import os 4 | import os 5 | # import multiprocessing 6 | import multiprocessing as mp 7 | import multiprocessing.pool as pl 
# NOTE: numpy (np), os, multiprocessing (mp) and multiprocessing.pool (pl)
# are imported at the very top of this module, just before these lines.
# import pygds
import pygds
# import c library
import PySeqArray.ccall as cc
# other ...
from sys import platform
from functools import reduce


## export version number
__version__ = '0.1.0'



# ===========================================================================

def seqExample(filename=None):
    """Example files

    Return a file name in the folder of example data.

    Parameters
    ----------
    filename : str
        a file name in the folder of example data, or None for returning the path of example folder

    Returns
    -------
    string

    Examples
    --------
    >>> seqExample('1KG_phase1_release_v3_chr22.gds')
    """
    # this module *is* the PySeqArray package, so its own __file__ locates the package folder
    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    if filename is None:
        return data_dir
    return os.path.join(data_dir, filename)



# ===========================================================================

def _uses_fork():
    """Return True if new worker processes are created with fork()."""
    get_method = getattr(mp, 'get_start_method', None)
    if get_method is not None:
        # Python >= 3.4: ask multiprocessing directly, because the default is
        # not tied to the platform name (e.g. macOS uses 'spawn' since 3.8)
        return get_method() == 'fork'
    # older Python: fall back to the platform name
    return platform in ('linux', 'linux2', 'unix', 'darwin')


# define internal function using forking
def _proc_fork_func(x):
    """Run the user function in a forked worker on the i-th part of the data.

    `x` is [i, ncpu, file, fun, param, split]; the SeqArrayFile object itself is
    passed in, and its current selection is narrowed to part `i` of `ncpu`.
    """
    i, ncpu, file, fun, param, split = x
    cc.flt_split(file.fileid, i, ncpu, split)
    return fun(file, param)


# define a process function
def _proc_func(x):
    """Run the user function in a worker that has to reopen the file itself.

    `x` is [i, ncpu, filename, fun, param, sel, split], where `sel` holds the
    sample and variant filters of the parent process.
    """
    i, ncpu, fn, fun, param, sel, split = x
    file = SeqArrayFile()
    file.open(fn, allow_dup=True)
    try:
        # replay the filter of the parent, then narrow it to part i of ncpu
        file.FilterSet2(sel[0], sel[1], verbose=False)
        cc.flt_split(file.fileid, i, ncpu, split)
        return fun(file, param)
    finally:
        # do not leave the duplicated file handle open in the worker
        file.close()


# allele frequencies of the current selection; a module-level function (not a
# lambda) so that it can be pickled and sent to the worker processes
def _allele_freq_func(file, param):
    # NOTE(review): `param` carries [ref, verbose] but is not consumed here yet,
    # and `cc.calc_af` has to be exported by the C extension -- confirm
    return file.Apply('genotype', cc.calc_af, asis='unlist')



# ===========================================================================

class SeqArrayFile(pygds.gdsfile):
    """
    Class for SeqArray GDS files
    """

    def __init__(self):
        pygds.gdsfile.__init__(self)

    def __del__(self):
        cc.file_done(self.fileid)
        pygds.gdsfile.__del__(self)


    def create(self, filename, allow_dup=False):
        """Creating a new SeqArray file is not supported; always raises."""
        # NotImplementedError is still an `Exception`, so existing handlers keep working
        raise NotImplementedError('not supported!')


    def open(self, filename, readonly=True, allow_dup=False):
        """Open an SeqArray file

        Open an existing file of SeqArray GDS for reading or writing.

        Parameters
        ----------
        filename : str
            the file name of an existing SeqArray GDS file to be opened
        readonly : bool
            if True, the file is opened read-only; otherwise, it is allowed to write data to the file
        allow_dup : bool
            if True, it is allowed to open a GDS file with read-only mode when it has been opened in the same session

        Returns
        -------
        None

        See Also
        --------
        close: close a SeqArray file
        """
        pygds.gdsfile.open(self, filename, readonly, allow_dup)
        cc.file_init(self.fileid)
        # TODO: file checking


    def close(self):
        """Close a SeqArray file

        Close a SeqArray GDS file.

        Returns
        -------
        None

        See Also
        --------
        open : open an existing SeqArray file
        """
        cc.file_done(self.fileid)
        pygds.gdsfile.close(self)


    def FilterSet(self, sample_id=None, variant_id=None, intersect=False, verbose=True):
        """Set a filter

        Set a filter to sample and/or variant with IDs.

        Parameters
        ----------
        sample_id : object
            sample id(s) to be selected; None leaves the sample filter unchanged
        variant_id : object
            variant id(s) to be selected; None leaves the variant filter unchanged
        intersect : bool
            if False, the candidate variants for selection are all possible variants (by default);
            if True, the candidate variants are from the selected variants defined via the previous call
        verbose : bool
            if True, show information

        Returns
        -------
        None

        See Also
        --------
        FilterReset : reset the filter
        """
        if sample_id is not None:
            cc.set_sample(self.fileid, sample_id, intersect, verbose)
        if variant_id is not None:
            cc.set_variant(self.fileid, variant_id, intersect, verbose)


    def FilterSet2(self, sample=None, variant=None, intersect=False, verbose=True):
        """Set a filter

        Set a filter to sample and/or variant with a bool vector or an index vector.

        Parameters
        ----------
        sample : vector, range
            a bool vector, an indexing vector for selecting samples or a range object
        variant : vector, range
            a bool vector, an indexing vector for selecting variants or a range object
        intersect : bool
            if False, the candidate variants for selection are all possible variants (by default);
            if True, the candidate variants are from the selected variants defined via the previous call
        verbose : bool
            if True, show information

        Returns
        -------
        None

        See Also
        --------
        FilterSet : set a filter
        FilterReset : reset the filter
        """
        if sample is not None:
            cc.set_sample2(self.fileid, sample, intersect, verbose)
        if variant is not None:
            cc.set_variant2(self.fileid, variant, intersect, verbose)


    def FilterReset(self, sample=True, variant=True, verbose=True):
        """Reset the filter

        Clear the existing filter on sample and/or variant.

        Parameters
        ----------
        sample : bool
            if True, reset the filter of sample
        variant : bool
            if True, reset the filter of variant
        verbose : bool
            if True, show information

        Returns
        -------
        None

        See Also
        --------
        FilterSet : set a filter
        """
        if sample:
            cc.set_sample(self.fileid, None, False, verbose)
        if variant:
            cc.set_variant(self.fileid, None, False, verbose)


    def FilterPush(self, reset=True):
        """Push a filter

        Push the current filter to the stack.

        Parameters
        ----------
        reset : bool
            if True, reset the filter of sample and variant after pushing

        Returns
        -------
        None

        See Also
        --------
        FilterPop : recover the last filter
        """
        cc.flt_push(self.fileid, reset)


    def FilterPop(self):
        """Pop a filter

        Pop or recover the last filter in the stack.

        Returns
        -------
        None

        See Also
        --------
        FilterPush : push the current filter to the stack
        """
        cc.flt_pop(self.fileid)


    def FilterGet(self, sample=True):
        """Get a sample/variant filter

        Get a sample or variant filter.

        Parameters
        ----------
        sample : bool
            If True, return the sample filter; otherwise, return the variant filter

        Returns
        -------
        A numpy object (a bool vector)

        See Also
        --------
        FilterSet : set a filter
        """
        return cc.get_filter(self.fileid, sample)


    def GetData(self, name):
        """Get data

        Get data from a SeqArray file with a given variable name and a sample/variant filter

        Parameters
        ----------
        name : str
            the variable name

        Returns
        -------
        a numpy array object

        See Also
        --------
        FilterSet : set a filter
        """
        return cc.get_data(self.fileid, name)


    def Apply(self, name, fun, param=None, asis='none', bsize=1024, verbose=False):
        """Apply function over array margins

        Apply a user-defined function to margins of genotypes and annotations via blocking

        Parameters
        ----------
        name : str, list
            the variable name, or a list of variable names
        fun : function
            the user-defined function
        param: object
            the parameter passed to the user-defined function if it is not None
        asis : str
            'none', no return; 'list', a list of the returned values from the user-defined function;
            'unlist', flatten the returned values from the user-defined function
        bsize : int
            block size
        verbose : bool
            show progress information if True

        Returns
        -------
        None, a list or a numpy array object

        See Also
        --------
        FilterSet : set a filter
        """
        v = cc.apply(self.fileid, name, fun, param, asis, bsize, verbose)
        if asis == 'unlist':
            v = np.hstack(v)
        return v


    def RunParallel(self, fun, param=None, ncpu=0, split='by.variant', combine='unlist'):
        """Apply Functions in Parallel

        Apply a user-defined function in parallel over array margins

        Parameters
        ----------
        fun : function
            the user-defined function, called as fun(file, param); it is sent to the
            worker processes, so it has to be picklable (e.g. a module-level function)
        param : object
            the parameter passed to the user-defined function if it is not None
        ncpu : int
            the number of cores or an instance of 'multiprocessing.pool.Pool';
            0 to use the number of cores minus 1
        split : str
            'by.variant', 'by.sample', 'none': split the dataset by variant or sample according to multiple processes, or "none" for no split
        combine : str, function
            'none', no return; 'list', a list of the returned values from the user-defined function;
            'unlist', flatten the returned values from the user-defined function;
            or a function used to reduce the returned values pairwise

        Returns
        -------
        None, a list or a numpy array object; with a single process the value
        of fun(self, param) is returned as it is
        """
        # check
        if not isinstance(ncpu, (int, float, pl.Pool)):
            raise ValueError('`ncpu` should be a numeric value or `multiprocessing.pool.Pool`.')
        if not (combine is None or isinstance(combine, str) or callable(combine)):
            raise ValueError('`combine` should be None, a string or a function.')

        # remember who owns the pool *before* `ncpu` is turned into a plain number
        own_pool = not isinstance(ncpu, pl.Pool)
        if own_pool:
            pool = None
            if ncpu <= 0:
                ncpu = max(mp.cpu_count() - 1, 1)
            ncpu = int(ncpu)
        else:
            pool = ncpu
            ncpu = pool._processes

        # a single process: no pool is needed
        if ncpu <= 1:
            return fun(self, param)

        if own_pool:
            pool = pl.Pool(processes=ncpu)
        try:
            # Workers forked right here inherit the opened file and its filter;
            # workers of a user-supplied pool (or spawned ones) must reopen the
            # file and replay the filter.
            if own_pool and _uses_fork():
                pm = [[i, ncpu, self, fun, param, split] for i in range(ncpu)]
                v = pool.map(_proc_fork_func, pm)
            else:
                sel = [self.FilterGet(True), self.FilterGet(False)]
                pm = [[i, ncpu, self.filename, fun, param, sel, split] for i in range(ncpu)]
                v = pool.map(_proc_func, pm)
        finally:
            # only shut down a pool created here; a user-supplied pool stays usable
            if own_pool:
                pool.close()
                pool.join()

        # output
        if combine is None or combine == 'none':
            v = None
        elif combine == 'unlist':
            v = np.hstack(v)
        elif callable(combine):
            v = reduce(combine, v)
        elif combine != 'list':
            raise ValueError('`combine` is invalid.')
        return v


    #### Methods ####

    def AlleleFreq(self, ref, ncpu, verbose=False):
        """Allele frequencies

        Calculate allele frequencies over the selected variants, in parallel.

        Parameters
        ----------
        ref : int, float, None
            a numeric value or None (forwarded to the workers in `param`)
        ncpu : int
            the number of cores or an instance of 'multiprocessing.pool.Pool', see RunParallel
        verbose : bool
            forwarded to the workers in `param`

        Returns
        -------
        the combined result of RunParallel (a numpy array object)

        Raises
        ------
        ValueError
            if `ref` is neither numeric nor None
        """
        if not (isinstance(ref, (int, float)) or ref is None):
            raise ValueError('`ref` should be a numeric value or None.')
        return self.RunParallel(_allele_freq_func, param=[ref, verbose], ncpu=ncpu)
21 | 22 | #include "Index.h" 23 | 24 | #include 25 | #include 26 | 27 | // #include "ReadByVariant.h" 28 | // #include "ReadBySample.h" 29 | #include 30 | 31 | using namespace PySeqArray; 32 | 33 | 34 | extern "C" 35 | { 36 | /* 37 | // ====================================================================== 38 | 39 | /// Calculate the missing rate per variant 40 | COREARRAY_DLL_EXPORT PyObject* FC_Missing_PerVariant(PyObject* Geno) 41 | { 42 | size_t n = XLENGTH(Geno), m; 43 | if (TYPEOF(Geno) == RAWSXP) 44 | m = vec_i8_count((char*)RAW(Geno), n, NA_RAW); 45 | else 46 | m = vec_i32_count(INTEGER(Geno), n, NA_INTEGER); 47 | return ScalarReal((n > 0) ? (double(m) / n) : R_NaN); 48 | } 49 | 50 | /// Calculate the missing rate per sample 51 | COREARRAY_DLL_EXPORT PyObject* FC_Missing_PerSample(PyObject* Geno, PyObject* sum) 52 | { 53 | int *pdim = INTEGER(GET_DIM(Geno)); 54 | int num_ploidy=pdim[0], num_sample=pdim[1]; 55 | 56 | int *pG = INTEGER(Geno); 57 | int *pS = INTEGER(sum); 58 | 59 | for (int i=0; i < num_sample; i++) 60 | { 61 | for (int j=0; j < num_ploidy; j++) 62 | { 63 | if (*pG++ == NA_INTEGER) 64 | pS[i] ++; 65 | } 66 | } 67 | 68 | return R_NilValue; 69 | } 70 | */ 71 | 72 | // ====================================================================== 73 | /* 74 | /// Get a list of allele frequencies 75 | COREARRAY_DLL_LOCAL PyObject* FC_CalcAF(PyObject *self, PyObject *args) 76 | { 77 | PyObject *ref, *geno; 78 | if (!PyArg_ParseTuple(args, "OO", &ref, &geno)) 79 | return NULL; 80 | 81 | 82 | PyObject* Geno = VECTOR_ELT(List, 0); 83 | const size_t N = XLENGTH(Geno); 84 | 85 | int nAllele = Rf_asInteger(VECTOR_ELT(List, 1)); 86 | PyObject* rv = NEW_NUMERIC(nAllele); 87 | double *pV = REAL(rv); 88 | 89 | size_t n1, n2, n3; 90 | switch (nAllele) 91 | { 92 | case 2: 93 | if (TYPEOF(Geno) == RAWSXP) 94 | vec_i8_count3((const char*)RAW(Geno), N, 0, 1, NA_RAW, &n1, &n2, &n3); 95 | else 96 | vec_i32_count3(INTEGER(Geno), N, 0, 1, NA_INTEGER, &n1, &n2, &n3); 97 | n3 
= N - n3; 98 | if (n3 > 0) 99 | { 100 | pV[0] = (double)n1 / n3; 101 | pV[1] = (double)n2 / n3; 102 | } else 103 | pV[0] = pV[1] = R_NaN; 104 | break; 105 | 106 | case 1: 107 | if (TYPEOF(Geno) == RAWSXP) 108 | vec_i8_count2((const char*)RAW(Geno), N, 0, NA_RAW, &n1, &n2); 109 | else 110 | vec_i32_count2(INTEGER(Geno), N, 0, NA_INTEGER, &n1, &n2); 111 | n2 = N - n2; 112 | pV[0] = (n2 > 0) ? (double)n1 / n2 : R_NaN; 113 | break; 114 | 115 | default: 116 | int num = 0; 117 | memset((void*)pV, 0, sizeof(double)*nAllele); 118 | if (TYPEOF(Geno) == RAWSXP) 119 | { 120 | C_UInt8 *p = (C_UInt8*)RAW(Geno); 121 | for (size_t n=N; n > 0; n--) 122 | { 123 | C_UInt8 g = *p++; 124 | if (g != NA_RAW) 125 | { 126 | num ++; 127 | if (g < nAllele) pV[g] ++; 128 | } 129 | } 130 | } else { 131 | int *p = INTEGER(Geno); 132 | for (size_t n=N; n > 0; n--) 133 | { 134 | int g = *p++; 135 | if (g != NA_INTEGER) 136 | { 137 | num ++; 138 | if ((0 <= g) && (g < nAllele)) 139 | pV[g] ++; 140 | } 141 | } 142 | } 143 | if (num > 0) 144 | { 145 | const double scale = 1.0 / num; 146 | for (; (nAllele--) > 0;) (*pV++) *= scale; 147 | } else { 148 | for (; (nAllele--) > 0;) (*pV++) = R_NaN; 149 | } 150 | } 151 | 152 | return rv; 153 | } 154 | */ 155 | 156 | /* 157 | // ====================================================================== 158 | 159 | static ssize_t AlleleFreq_Index = 0; 160 | static int *AlleleFreq_RefPtr = NULL; 161 | static PyObject* AlleleFreq_Allele = R_NilValue; 162 | 163 | /// Set the reference allele with an index 164 | COREARRAY_DLL_EXPORT PyObject* FC_AF_SetIndex(PyObject* RefIndex) 165 | { 166 | if (XLENGTH(RefIndex) == 1) 167 | { 168 | AlleleFreq_Index = Rf_asInteger(RefIndex); 169 | AlleleFreq_RefPtr = NULL; 170 | } else { 171 | AlleleFreq_Index = 0; 172 | AlleleFreq_RefPtr = INTEGER(RefIndex); 173 | } 174 | return R_NilValue; 175 | } 176 | 177 | /// Get allele frequencies 178 | COREARRAY_DLL_EXPORT PyObject* FC_AF_Index(PyObject* List) 179 | { 180 | PyObject* Geno = 
VECTOR_ELT(List, 0); 181 | const int nAllele = Rf_asInteger(VECTOR_ELT(List, 1)); 182 | 183 | const size_t N = XLENGTH(Geno); 184 | size_t n = 0, m = 0; 185 | int A = (AlleleFreq_RefPtr==NULL) ? 186 | AlleleFreq_Index : AlleleFreq_RefPtr[AlleleFreq_Index++]; 187 | 188 | if (A < nAllele) 189 | { 190 | if (TYPEOF(Geno) == RAWSXP) 191 | vec_i8_count2((const char*)RAW(Geno), N, A, NA_RAW, &m, &n); 192 | else 193 | vec_i32_count2(INTEGER(Geno), N, A, NA_INTEGER, &m, &n); 194 | n = N - n; 195 | } 196 | 197 | return ScalarReal((n > 0) ? (double(m) / n) : R_NaN); 198 | } 199 | 200 | /// Set the reference allele with string 201 | COREARRAY_DLL_EXPORT PyObject* FC_AF_SetAllele(PyObject* RefAllele) 202 | { 203 | AlleleFreq_Allele = RefAllele; 204 | AlleleFreq_Index = 0; 205 | return R_NilValue; 206 | } 207 | 208 | /// Get allele frequencies 209 | COREARRAY_DLL_EXPORT PyObject* FC_AF_Allele(PyObject* List) 210 | { 211 | PyObject* Geno = VECTOR_ELT(List, 0); 212 | int A = GetIndexOfAllele( 213 | CHAR(STRING_ELT(AlleleFreq_Allele, AlleleFreq_Index++)), 214 | CHAR(STRING_ELT(VECTOR_ELT(List, 1), 0))); 215 | 216 | size_t n = 0, m = 0; 217 | if (A >= 0) 218 | { 219 | const size_t N = XLENGTH(Geno); 220 | if (TYPEOF(Geno) == RAWSXP) 221 | { 222 | if (A < 255) 223 | vec_i8_count2((const char*)RAW(Geno), N, A, NA_RAW, &m, &n); 224 | else 225 | n = N; 226 | } else 227 | vec_i32_count2(INTEGER(Geno), N, A, NA_INTEGER, &m, &n); 228 | n = N - n; 229 | } 230 | 231 | return ScalarReal((n > 0) ? 
(double(m) / n) : R_NaN); 232 | } 233 | 234 | 235 | // ====================================================================== 236 | 237 | /// Convert a Sequencing GDS file to a SNP GDS file in `seqGDS2SNP()` 238 | COREARRAY_DLL_EXPORT PyObject* FC_AlleleStr(PyObject* allele) 239 | { 240 | const R_xlen_t n = XLENGTH(allele); 241 | for (R_xlen_t i=0; i < n; i++) 242 | { 243 | char *s = (char*)CHAR(STRING_ELT(allele, i)); 244 | while (*s) 245 | { 246 | if (*s == ',') 247 | { *s = '/'; break; } 248 | s ++; 249 | } 250 | } 251 | return allele; 252 | } 253 | 254 | /// Convert a Sequencing GDS file to a SNP GDS file in `seqGDS2SNP()` 255 | COREARRAY_DLL_EXPORT PyObject* FC_AlleleStr2(PyObject* allele) 256 | { 257 | const R_xlen_t n = XLENGTH(allele); 258 | for (R_xlen_t i=0; i < n; i++) 259 | { 260 | char *s = (char*)CHAR(STRING_ELT(allele, i)); 261 | while (*s) 262 | { 263 | if (*s == '/') 264 | { *s = ','; break; } 265 | s ++; 266 | } 267 | } 268 | return allele; 269 | } 270 | 271 | 272 | // ====================================================================== 273 | 274 | /// Get a list of allele counts 275 | COREARRAY_DLL_EXPORT PyObject* FC_AlleleCount(PyObject* List) 276 | { 277 | PyObject* Geno = VECTOR_ELT(List, 0); 278 | const size_t N = XLENGTH(Geno); 279 | 280 | int nAllele = Rf_asInteger(VECTOR_ELT(List, 1)); 281 | PyObject* rv = NEW_INTEGER(nAllele); 282 | int *pV = INTEGER(rv); 283 | 284 | size_t n1, n2; 285 | switch (nAllele) 286 | { 287 | case 2: 288 | if (TYPEOF(Geno) == RAWSXP) 289 | vec_i8_count2((const char*)RAW(Geno), N, 0, 1, &n1, &n2); 290 | else 291 | vec_i32_count2(INTEGER(Geno), N, 0, 1, &n1, &n2); 292 | pV[0] = n1; pV[1] = n2; 293 | break; 294 | 295 | case 1: 296 | if (TYPEOF(Geno) == RAWSXP) 297 | pV[0] = vec_i8_count((const char*)RAW(Geno), N, 0); 298 | else 299 | pV[0] = vec_i32_count(INTEGER(Geno), N, 0); 300 | break; 301 | 302 | default: 303 | memset((void*)pV, 0, sizeof(int)*nAllele); 304 | if (TYPEOF(Geno) == RAWSXP) 305 | { 306 | C_UInt8 
*p = (C_UInt8*)RAW(Geno); 307 | for (size_t n=N; n > 0; n--) 308 | { 309 | C_UInt8 g = *p++; 310 | if (g < nAllele) pV[g] ++; 311 | } 312 | } else { 313 | int *p = INTEGER(Geno); 314 | for (size_t n=N; n > 0; n--) 315 | { 316 | int g = *p++; 317 | if ((0 <= g) && (g < nAllele)) pV[g] ++; 318 | } 319 | } 320 | } 321 | 322 | return rv; 323 | } 324 | 325 | 326 | /// Get a list of reference allele counts 327 | COREARRAY_DLL_EXPORT PyObject* FC_AlleleCount2(PyObject* Geno) 328 | { 329 | const size_t N = XLENGTH(Geno); 330 | size_t n0, nmiss; 331 | if (TYPEOF(Geno) == RAWSXP) 332 | vec_i8_count2((const char*)RAW(Geno), N, 0, 0xFF, &n0, &nmiss); 333 | else 334 | vec_i32_count2(INTEGER(Geno), N, 0, NA_INTEGER, &n0, &nmiss); 335 | PyObject* rv = NEW_INTEGER(2); 336 | int *p = INTEGER(rv); 337 | p[0] = n0; p[1] = nmiss; 338 | return rv; 339 | } 340 | 341 | 342 | 343 | // ====================================================================== 344 | 345 | static const char *pkg_digest = "digest"; 346 | 347 | #define PKG_LOAD(name) \ 348 | *(DL_FUNC*)(&name) = R_FindSymbol(#name, pkg_digest, NULL); \ 349 | if (!name) \ 350 | error("No function '%s' in the %s package", #name, pkg_digest); 351 | 352 | typedef struct _md5_context 353 | { 354 | C_UInt32 total[2]; 355 | C_UInt32 state[4]; 356 | C_UInt8 buffer[64]; 357 | char tmp[1024]; 358 | } md5_context; 359 | 360 | 361 | static void (*md5_starts)(md5_context*) = NULL; 362 | static void (*md5_update)(md5_context*, void*, C_UInt32) = NULL; 363 | static void (*md5_finish)(md5_context*, C_UInt8[16]) = NULL; 364 | static md5_context md5_ctx; 365 | 366 | /// Initialize digest method 367 | COREARRAY_DLL_EXPORT PyObject* FC_DigestInit(PyObject* Algo) 368 | { 369 | if (!md5_starts) PKG_LOAD(md5_starts); 370 | if (!md5_update) PKG_LOAD(md5_update); 371 | if (!md5_finish) PKG_LOAD(md5_finish); 372 | (*md5_starts)(&md5_ctx); 373 | return R_NilValue; 374 | } 375 | 376 | /// Finalize digest method 377 | COREARRAY_DLL_EXPORT PyObject* 
FC_DigestDone(PyObject* Algo) 378 | { 379 | C_UInt8 digest[16]; 380 | (*md5_finish)(&md5_ctx, digest); 381 | 382 | int Len = sizeof(digest); 383 | char buffer[1024 + 1]; 384 | char *p = buffer; 385 | C_UInt8 *Code = digest; 386 | for (; Len > 0; Code++, Len--) 387 | { 388 | C_UInt8 v1 = (*Code) & 0x0F; 389 | C_UInt8 v2 = (*Code) >> 4; 390 | *p++ = (v2 < 10) ? (v2 + '0') : (v2 - 10 + 'a'); 391 | *p++ = (v1 < 10) ? (v1 + '0') : (v1 - 10 + 'a'); 392 | } 393 | *p = 0; 394 | 395 | return mkString(buffer); 396 | } 397 | 398 | /// Applied digest function 399 | COREARRAY_DLL_EXPORT PyObject* FC_DigestScan(PyObject* Data) 400 | { 401 | if (Rf_isInteger(Data)) 402 | { 403 | const size_t n = XLENGTH(Data); 404 | (*md5_update)(&md5_ctx, INTEGER(Data), n*sizeof(int)); 405 | } else if (Rf_isLogical(Data)) 406 | { 407 | const size_t n = XLENGTH(Data); 408 | (*md5_update)(&md5_ctx, LOGICAL(Data), n*sizeof(int)); 409 | } else if (Rf_isReal(Data)) 410 | { 411 | const size_t n = XLENGTH(Data); 412 | (*md5_update)(&md5_ctx, REAL(Data), n*sizeof(double)); 413 | } else if (Rf_isString(Data)) 414 | { 415 | const size_t n = XLENGTH(Data); 416 | for (size_t i=0; i < n; i++) 417 | { 418 | const char *s = CHAR(STRING_ELT(Data, i)); 419 | (*md5_update)(&md5_ctx, (void*)s, strlen(s)+1); 420 | } 421 | } else if (!Rf_isNull(Data)) 422 | { 423 | error("Not support data type."); 424 | } 425 | 426 | return R_NilValue; 427 | } 428 | */ 429 | 430 | } // extern "C" 431 | -------------------------------------------------------------------------------- /src/vectorization.h: -------------------------------------------------------------------------------- 1 | // =========================================================== 2 | // 3 | // vectorization.h: compiler optimization with vectorization 4 | // 5 | // Copyright (C) 2016-2017 Xiuwen Zheng 6 | // 7 | // This file is part of PySeqArray. 
8 | // 9 | // PySeqArray is free software: you can redistribute it and/or modify it 10 | // under the terms of the GNU General Public License Version 3 as 11 | // published by the Free Software Foundation. 12 | // 13 | // PySeqArray is distributed in the hope that it will be useful, but 14 | // WITHOUT ANY WARRANTY; without even the implied warranty of 15 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | // GNU General Public License for more details. 17 | // 18 | // You should have received a copy of the GNU General Public 19 | // License along with PySeqArray. 20 | // If not, see . 21 | 22 | /** 23 | * \file vectorization.h 24 | * \author Xiuwen Zheng [zhengx@u.washington.edu] 25 | * \version 1.0 26 | * \date 2016 27 | * \brief compiler optimization with vectorization 28 | * \details 29 | **/ 30 | 31 | 32 | #ifndef _HEADER_COREARRAY_VECTORIZATION_ 33 | #define _HEADER_COREARRAY_VECTORIZATION_ 34 | 35 | #include "CoreDEF.h" 36 | #include 37 | #include 38 | #include 39 | 40 | #if defined(COREARRAY_SIMD_SSE) && defined(COREARRAY_SIMD_SSE2) 41 | 42 | # include // SSE 43 | # include // SSE2 44 | 45 | # if defined(COREARRAY_SIMD_SSE3) // SSE3 46 | # include 47 | # endif 48 | 49 | # if defined(COREARRAY_SIMD_SSSE3) // SSSE3 50 | # include 51 | # endif 52 | 53 | # if defined(COREARRAY_SIMD_SSE4_1) // SSE_4_1 54 | # include 55 | # endif 56 | 57 | # if defined(COREARRAY_SIMD_SSE4_2) || defined(__POPCNT__) 58 | # define COREARRAY_HARDWARE_POPCNT 59 | # include // COREARRAY_SIMD_SSE4_2, for POPCNT 60 | # endif 61 | 62 | # if defined(COREARRAY_SIMD_AVX) || defined(COREARRAY_SIMD_AVX2) 63 | # include // AVX, AVX2 64 | # endif 65 | 66 | #endif 67 | 68 | 69 | 70 | #ifdef __cplusplus 71 | 72 | namespace Vectorization 73 | { 74 | /// an aligned pointer 75 | struct COREARRAY_DLL_DEFAULT ALIGN_PTR 76 | { 77 | private: 78 | void *alloc_ptr, *base_ptr; 79 | 80 | public: 81 | ALIGN_PTR() 82 | { alloc_ptr = base_ptr = NULL; } 83 | ALIGN_PTR(size_t n, size_t align) 
84 | { 85 | alloc_ptr = base_ptr = NULL; 86 | reset(n, align); 87 | } 88 | ~ALIGN_PTR() 89 | { 90 | if (alloc_ptr) free(alloc_ptr); 91 | alloc_ptr = base_ptr = NULL; 92 | } 93 | 94 | void reset(size_t n, size_t align) 95 | { 96 | if (n > 0) 97 | { 98 | if (align < 1) align = 1; 99 | alloc_ptr = realloc(alloc_ptr, n + align - 1); 100 | if (!alloc_ptr) 101 | throw "Insufficient memory."; 102 | size_t r = ((size_t)alloc_ptr) % align; 103 | base_ptr = r ? (void*)((char*)alloc_ptr + align - r) : alloc_ptr; 104 | } else { 105 | if (alloc_ptr) free(alloc_ptr); 106 | alloc_ptr = base_ptr = NULL; 107 | } 108 | } 109 | 110 | inline void *get() { return base_ptr; } 111 | }; 112 | 113 | /// an aligned pointer with 16-byte alignment 114 | struct COREARRAY_DLL_DEFAULT ALIGN_PTR_SSE: public ALIGN_PTR 115 | { 116 | public: 117 | ALIGN_PTR_SSE(): ALIGN_PTR() 118 | { } 119 | ALIGN_PTR_SSE(size_t n): ALIGN_PTR(n, 16u) 120 | { } 121 | void reset(size_t n) 122 | { ALIGN_PTR::reset(n, 16u); } 123 | }; 124 | 125 | /// an aligned pointer with 32-byte alignment 126 | struct COREARRAY_DLL_DEFAULT ALIGN_PTR_AVX: public ALIGN_PTR 127 | { 128 | public: 129 | ALIGN_PTR_AVX(): ALIGN_PTR() 130 | { } 131 | ALIGN_PTR_AVX(size_t n): ALIGN_PTR(n, 32u) 132 | { } 133 | void reset(size_t n) 134 | { ALIGN_PTR::reset(n, 32u); } 135 | }; 136 | 137 | 138 | // auto pointer 139 | 140 | #if defined(COREARRAY_SIMD_AVX) 141 | 142 | typedef ALIGN_PTR_AVX VEC_AUTO_PTR; 143 | 144 | #elif defined(COREARRAY_SIMD_SSE) 145 | 146 | typedef ALIGN_PTR_SSE VEC_AUTO_PTR; 147 | 148 | #else 149 | struct COREARRAY_DLL_DEFAULT VEC_AUTO_PTR: public ALIGN_PTR 150 | { 151 | public: 152 | VEC_AUTO_PTR(): ALIGN_PTR() 153 | { } 154 | VEC_AUTO_PTR(size_t n): ALIGN_PTR(n, 1u) 155 | { } 156 | void reset(size_t n) 157 | { ALIGN_PTR::reset(n, 1u); } 158 | }; 159 | #endif 160 | } 161 | 162 | #endif 163 | 164 | 165 | 166 | #ifdef __cplusplus 167 | extern "C" { 168 | #endif 169 | 170 | 171 | #ifdef COREARRAY_HARDWARE_POPCNT 172 | 173 | # 
define POPCNT_U32(x) _mm_popcnt_u32((uint32_t)(x)) 174 | # ifdef COREARRAY_REGISTER_BIT32 175 | # define POPCNT_U64(x) \\ 176 | # _mm_popcnt_u32((uint32_t)(x)) + _mm_popcnt_u32((uint64_t)(x) >> 32) 177 | # else 178 | # define POPCNT_U64(x) _mm_popcnt_u64((uint64_t)(x)) 179 | # endif 180 | 181 | #else 182 | 183 | inline static int POPCNT_U32(uint32_t x) 184 | { 185 | x = x - ((x >> 1) & 0x55555555); 186 | x = (x & 0x33333333) + ((x >> 2) & 0x33333333); 187 | return (((x + (x >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24; 188 | } 189 | 190 | inline static int POPCNT_U64(uint64_t x) 191 | { 192 | x -= ((x >> 1) & 0x5555555555555555LLU); 193 | x = (x & 0x3333333333333333LLU) + ((x >> 2) & 0x3333333333333333LLU); 194 | x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FLLU; 195 | return (x * 0x0101010101010101LLU) >> 56; 196 | } 197 | 198 | #endif 199 | 200 | 201 | 202 | // =========================================================== 203 | 204 | #ifdef COREARRAY_SIMD_SSE 205 | 206 | # ifdef COREARRAY_SIMD_SSE3 207 | # define MM_LOADU_128(p) _mm_lddqu_si128((__m128i const*)(p)) 208 | # else 209 | # define MM_LOADU_128(p) _mm_loadu_si128((__m128i const*)(p)) 210 | # endif 211 | 212 | # ifdef COREARRAY_SIMD_SSE2 213 | # define MM_BLEND_128(a, b, mask) \ 214 | _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b)) 215 | # endif 216 | 217 | #endif 218 | 219 | 220 | #ifdef COREARRAY_SIMD_AVX 221 | 222 | # define MM_LOADU_256(p) _mm256_loadu_si256((__m256i const *)(p)) 223 | # define MM_SET_M128(v1, v0) \ 224 | _mm256_insertf128_si256(_mm256_castsi128_si256(v0), (v1), 1) 225 | 226 | # ifdef COREARRAY_SIMD_AVX2 227 | # define MM_BLEND_256(a, b, mask) \ 228 | _mm256_or_si256(_mm256_and_si256(mask, a), _mm256_andnot_si256(mask, b)) 229 | # endif 230 | 231 | #endif 232 | 233 | 234 | 235 | // =========================================================== 236 | // Sum all elements in a SIMD register 237 | // =========================================================== 238 | 239 | #ifdef 
COREARRAY_SIMD_SSE2 240 | inline static double vec_sum_f64(__m128d s) 241 | { 242 | return _mm_cvtsd_f64(_mm_add_pd(s, _mm_shuffle_pd(s, s, 1))); 243 | } 244 | inline static int vec_sum_i32(__m128i s) 245 | { 246 | s = _mm_add_epi32(s, _mm_shuffle_epi32(s, _MM_SHUFFLE(1,0,3,2))); 247 | s = _mm_add_epi32(s, _mm_shuffle_epi32(s, _MM_SHUFFLE(0,0,0,1))); 248 | return _mm_cvtsi128_si32(s); 249 | } 250 | inline static int vec_sum_u8(__m128i s) 251 | { 252 | s = _mm_sad_epu8(s, _mm_setzero_si128()); 253 | s = _mm_add_epi32(s, _mm_shuffle_epi32(s, 2)); 254 | return _mm_cvtsi128_si32(s); 255 | } 256 | #endif 257 | 258 | #ifdef COREARRAY_SIMD_AVX 259 | inline static double vec_avx_sum_f64(__m256d s) 260 | { 261 | s = _mm256_add_pd(_mm256_permute_pd(s, 5), s); 262 | s = _mm256_add_pd(s, _mm256_permute2f128_pd(s, s, 1)); 263 | return _mm_cvtsd_f64(_mm256_castpd256_pd128(s)); 264 | } 265 | #endif 266 | 267 | #ifdef COREARRAY_SIMD_AVX2 268 | inline static int vec_avx_sum_i32(__m256i s) 269 | { 270 | s = _mm256_hadd_epi32(s, s); 271 | s = _mm256_add_epi32(s, _mm256_permute4x64_epi64(s, _MM_SHUFFLE(1,0,3,2))); 272 | __m128i a = _mm256_castsi256_si128(s); 273 | a = _mm_add_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,0,1))); 274 | return _mm_cvtsi128_si32(a); 275 | } 276 | inline static int vec_avx_sum_u8(__m256i s) 277 | { 278 | s = _mm256_sad_epu8(s, _mm256_setzero_si256()); 279 | s = _mm256_add_epi64(s, _mm256_permute4x64_epi64(s, _MM_SHUFFLE(1,0,3,2))); 280 | return _mm256_extract_epi32(s,0) + _mm256_extract_epi32(s,2); 281 | } 282 | #endif 283 | 284 | 285 | 286 | /// get the number of non-zeros 287 | COREARRAY_DLL_DEFAULT size_t vec_i8_cnt_nonzero(const int8_t *p, size_t n); 288 | 289 | /// get the number of non-zeros and the pointer to the first non-zero value 290 | COREARRAY_DLL_DEFAULT const int8_t *vec_i8_cnt_nonzero_ptr(const int8_t *p, 291 | size_t n, size_t *out_n); 292 | 293 | 294 | 295 | // =========================================================== 296 | // 
functions for int8 297 | // =========================================================== 298 | 299 | /// return the pointer to the non-zeros character starting from p 300 | COREARRAY_DLL_DEFAULT const char *vec_i8_ptr_nonzero(const char *p, size_t n); 301 | 302 | /// count how many 'val' in 'p' 303 | COREARRAY_DLL_DEFAULT size_t vec_i8_count(const char *p, size_t n, char val); 304 | 305 | /// count how many val1 and val2 in p 306 | COREARRAY_DLL_DEFAULT void vec_i8_count2(const char *p, size_t n, 307 | char val1, char val2, size_t *out_n1, size_t *out_n2); 308 | 309 | /// count how many val1, val2 and val3 in p 310 | COREARRAY_DLL_DEFAULT void vec_i8_count3(const char *p, size_t n, 311 | char val1, char val2, char val3, size_t *out_n1, size_t *out_n2, 312 | size_t *out_n3); 313 | 314 | /// replace 'val' in the array of 'p' by 'substitute' 315 | COREARRAY_DLL_DEFAULT void vec_i8_replace(int8_t *p, size_t n, int8_t val, 316 | int8_t substitute); 317 | 318 | /// 319 | COREARRAY_DLL_DEFAULT void vec_i8_cnt_dosage2(const int8_t *p, 320 | int8_t *out, size_t n, int8_t val, int8_t missing, 321 | int8_t missing_substitute); 322 | 323 | 324 | 325 | // =========================================================== 326 | // functions for uint8 327 | // =========================================================== 328 | 329 | /// shifting *p right by 2 bits, assuming p is 2-byte aligned 330 | COREARRAY_DLL_DEFAULT void vec_u8_shr_b2(uint8_t *p, size_t n); 331 | 332 | 333 | 334 | // =========================================================== 335 | // functions for int16 336 | // =========================================================== 337 | 338 | /// shifting *p right by 2 bits, assuming p is 2-byte aligned 339 | COREARRAY_DLL_DEFAULT void vec_i16_shr_b2(int16_t *p, size_t n); 340 | 341 | 342 | 343 | // =========================================================== 344 | // functions for int32 345 | // =========================================================== 346 | 347 | /// count 
how many val in p, assuming p is 4-byte aligned 348 | COREARRAY_DLL_DEFAULT size_t vec_i32_count(const int32_t *p, size_t n, int32_t val); 349 | 350 | /// count how many val1 and val2 in p, assuming p is 4-byte aligned 351 | COREARRAY_DLL_DEFAULT void vec_i32_count2(const int32_t *p, size_t n, 352 | int32_t val1, int32_t val2, size_t *out_n1, size_t *out_n2); 353 | 354 | /// count how many val1, val2 and val3 in p, assuming p is 4-byte aligned 355 | COREARRAY_DLL_DEFAULT void vec_i32_count3(const int32_t *p, size_t n, 356 | int32_t val1, int32_t val2, int32_t val3, size_t *out_n1, size_t *out_n2, 357 | size_t *out_n3); 358 | 359 | /// 360 | COREARRAY_DLL_DEFAULT void vec_int32_set(int32_t *p, size_t n, int32_t val); 361 | 362 | /// replace 'val' in the array of 'p' by 'substitute', assuming 'p' is 4-byte aligned 363 | COREARRAY_DLL_DEFAULT void vec_i32_replace(int32_t *p, size_t n, int32_t val, 364 | int32_t substitute); 365 | 366 | /// assuming 'out' is 4-byte aligned, output (p[0]==val) + (p[1]==val) or missing_substitute 367 | COREARRAY_DLL_DEFAULT void vec_i32_cnt_dosage2(const int32_t *p, 368 | int32_t *out, size_t n, int32_t val, int32_t missing, 369 | int32_t missing_substitute); 370 | 371 | /// shifting *p right by 2 bits, assuming p is 4-byte aligned 372 | COREARRAY_DLL_DEFAULT void vec_i32_shr_b2(int32_t *p, size_t n); 373 | 374 | 375 | 376 | // =========================================================== 377 | // functions for float64 378 | // =========================================================== 379 | 380 | 381 | 382 | // =========================================================== 383 | // functions for char 384 | // =========================================================== 385 | 386 | COREARRAY_DLL_DEFAULT const char *vec_char_find_CRLF(const char *p, size_t n); 387 | 388 | 389 | 390 | #ifdef __cplusplus 391 | } 392 | #endif 393 | 394 | #endif /* _HEADER_COREARRAY_VECTORIZATION_ */ 395 | 
-------------------------------------------------------------------------------- /src/Index.h: -------------------------------------------------------------------------------- 1 | // =========================================================== 2 | // 3 | // Index.h: Indexing Objects 4 | // 5 | // Copyright (C) 2017 Xiuwen Zheng 6 | // 7 | // This file is part of PySeqArray. 8 | // 9 | // PySeqArray is free software: you can redistribute it and/or modify it 10 | // under the terms of the GNU General Public License Version 3 as 11 | // published by the Free Software Foundation. 12 | // 13 | // PySeqArray is distributed in the hope that it will be useful, but 14 | // WITHOUT ANY WARRANTY; without even the implied warranty of 15 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | // GNU General Public License for more details. 17 | // 18 | // You should have received a copy of the GNU General Public License 19 | // along with PySeqArray. 20 | // If not, see . 21 | 22 | 23 | #ifndef _HEADER_SEQ_INDEX_ 24 | #define _HEADER_SEQ_INDEX_ 25 | 26 | #include 27 | #include 28 | 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | #include 37 | #include 38 | #include "vectorization.h" 39 | 40 | 41 | #ifndef TRUE 42 | # define TRUE 1 43 | #endif 44 | #ifndef FALSE 45 | # define FALSE 0 46 | #endif 47 | 48 | 49 | namespace PySeqArray 50 | { 51 | 52 | using namespace std; 53 | using namespace CoreArray; 54 | 55 | 56 | class ErrSeqArray; 57 | 58 | 59 | // =========================================================== 60 | // Run-length encoding (RLE) object 61 | // =========================================================== 62 | 63 | /// object with run-length encoding 64 | template class COREARRAY_DLL_LOCAL C_RLE 65 | { 66 | public: 67 | /// constructor 68 | C_RLE() 69 | { 70 | TotalLength = 0; 71 | Position = AccIndex = AccOffset = 0; 72 | } 73 | 74 | void Init() 75 | { 76 | TotalLength = 0; 77 | vector::iterator p; 78 | for 
(p=Lengths.begin(); p != Lengths.end(); p++) 79 | TotalLength += *p; 80 | Position = AccIndex = AccOffset = 0; 81 | } 82 | 83 | void Add(TYPE &val, C_UInt32 len) 84 | { 85 | Values.push_back(val); 86 | Lengths.push_back(len); 87 | } 88 | 89 | void Clear() 90 | { 91 | Values.clear(); Lengths.clear(); 92 | TotalLength = 0; 93 | Position = AccIndex = AccOffset = 0; 94 | } 95 | 96 | const TYPE &operator [](size_t pos) 97 | { 98 | if (pos >= TotalLength) 99 | throw "Invalid position in C_RLE."; 100 | if (pos < Position) 101 | Position = AccIndex = AccOffset = 0; 102 | for (; Position < pos; ) 103 | { 104 | size_t L = Lengths[AccIndex]; 105 | size_t n = L - AccOffset; 106 | if ((Position + n) <= pos) 107 | { 108 | AccIndex ++; AccOffset = 0; 109 | } else { 110 | n = pos - Position; AccOffset += n; 111 | } 112 | Position += n; 113 | } 114 | return Values[AccIndex]; 115 | } 116 | 117 | inline bool Empty() const { return (TotalLength <= 0); } 118 | 119 | protected: 120 | /// values according to Lengths, used in run-length encoding 121 | vector Values; 122 | /// lengths according to Values, used in run-length encoding 123 | vector Lengths; 124 | /// total number, = sum(Lengths) 125 | size_t TotalLength; 126 | /// the position relative to the total length 127 | size_t Position; 128 | /// the index in Lengths according to Position 129 | size_t AccIndex; 130 | /// the offset according the value of Lengths[AccIndex] 131 | size_t AccOffset; 132 | }; 133 | 134 | 135 | // =========================================================== 136 | // Indexing object 137 | // =========================================================== 138 | 139 | /// Indexing object with run-length encoding 140 | class COREARRAY_DLL_LOCAL CIndex 141 | { 142 | public: 143 | /// values according to Lengths, used in run-length encoding 144 | vector Values; 145 | /// lengths according to Values, used in run-length encoding 146 | vector Lengths; 147 | 148 | /// constructor 149 | CIndex(); 150 | 151 | /// load data 
and represent as run-length encoding 152 | void Init(PdContainer Obj); 153 | /// load data and represent as run-length encoding 154 | void InitOne(int num); 155 | /// return the accumulated sum of values and current value in Lengths and Values given by a position 156 | void GetInfo(size_t pos, C_Int64 &Sum, int &Value); 157 | /// get lengths with selection 158 | PyObject* GetLen_Sel(const C_BOOL sel[]); 159 | /// get lengths and bool selection from a set of selected variants 160 | PyObject* GetLen_Sel(const C_BOOL sel[], int &out_var_start, int &out_var_count, 161 | vector &out_var_sel); 162 | /// return true if empty 163 | inline bool Empty() const { return (TotalLength <= 0); } 164 | 165 | protected: 166 | /// total number, = sum(Lengths) 167 | size_t TotalLength; 168 | /// the position relative to the total length 169 | size_t Position; 170 | /// the accumulated sum of values in Lengths and Values according to Position 171 | C_Int64 AccSum; 172 | /// the index in Lengths according to Position 173 | size_t AccIndex; 174 | /// the offset according the value of Lengths[AccIndex] 175 | size_t AccOffset; 176 | }; 177 | 178 | 179 | /// Indexing object with run-length encoding for genotype indexing 180 | class COREARRAY_DLL_LOCAL CGenoIndex 181 | { 182 | public: 183 | /// values according to Lengths, used in run-length encoding 184 | vector Values; 185 | /// lengths according to Values, used in run-length encoding 186 | vector Lengths; 187 | 188 | /// constructor 189 | CGenoIndex(); 190 | 191 | /// load data and represent as run-length encoding 192 | void Init(PdContainer Obj); 193 | /// return the accumulated sum of values and current value in Lengths and Values given by a position 194 | void GetInfo(size_t pos, C_Int64 &Sum, C_UInt8 &Value); 195 | /// return true if empty 196 | inline bool Empty() const { return (TotalLength <= 0); } 197 | 198 | protected: 199 | /// total number, = sum(Lengths) 200 | size_t TotalLength; 201 | /// the position relative to the total 
length 202 | size_t Position; 203 | /// the accumulated sum of values in Lengths and Values according to Position 204 | C_Int64 AccSum; 205 | /// the index in Lengths according to Position 206 | size_t AccIndex; 207 | /// the offset according the value of Lengths[AccIndex] 208 | size_t AccOffset; 209 | }; 210 | 211 | 212 | 213 | // =========================================================== 214 | // Chromosome indexing 215 | // =========================================================== 216 | 217 | /// Chromosome indexing object 218 | class COREARRAY_DLL_LOCAL CChromIndex 219 | { 220 | public: 221 | /// range object 222 | struct TRange 223 | { 224 | int Start; ///< the starting position 225 | int Length; ///< the length 226 | }; 227 | 228 | typedef vector TRangeList; 229 | 230 | /// constructor 231 | CChromIndex(); 232 | 233 | /// clear 234 | void Clear(); 235 | 236 | /// represent chromosome codes as a RLE object in Map 237 | void AddChrom(PdGDSFolder Root); 238 | 239 | /// the total length of a TRangeList object 240 | size_t RangeTotalLength(const TRangeList &RngList); 241 | 242 | /// whether it is empty 243 | inline bool Empty() const { return Map.empty(); } 244 | 245 | inline const string &operator [](size_t pos) { return PosToChr[pos]; } 246 | 247 | /// map to TRangeList from chromosome coding 248 | map Map; 249 | 250 | protected: 251 | /// position to chromosome 252 | C_RLE PosToChr; 253 | }; 254 | 255 | 256 | 257 | // =========================================================== 258 | // Genomic Range Sets 259 | // =========================================================== 260 | 261 | /// Genomic Range Set Object 262 | class COREARRAY_DLL_LOCAL CRangeSet 263 | { 264 | public: 265 | /// range object 266 | struct TRange 267 | { 268 | int Start; ///< the starting position 269 | int End; ///< the ending (always, End >= Start) 270 | }; 271 | 272 | void Clear(); 273 | void AddRange(int start, int end); 274 | bool IsIncluded(int point); 275 | 276 | protected: 277 | 
/// strict weak ordering for non-overlapping, == when overlapping 278 | struct less_range 279 | { 280 | bool operator()(const TRange &lhs, const TRange &rhs) const; 281 | }; 282 | 283 | /// 284 | set _RangeSet; 285 | }; 286 | 287 | 288 | 289 | 290 | 291 | // =========================================================== 292 | // SeqArray GDS file information 293 | // =========================================================== 294 | 295 | /// selection object used in GDS file 296 | struct COREARRAY_DLL_LOCAL TSelection 297 | { 298 | vector Sample; ///< sample selection 299 | vector Variant; ///< variant selection 300 | 301 | inline C_BOOL *pSample() 302 | { return Sample.empty() ? NULL : &Sample[0]; } 303 | inline C_BOOL *pVariant() 304 | { return Variant.empty() ? NULL : &Variant[0]; } 305 | }; 306 | 307 | 308 | /// GDS file object 309 | class COREARRAY_DLL_LOCAL CFileInfo 310 | { 311 | public: 312 | list SelList; ///< a list of sample and variant selections 313 | 314 | /// constructor 315 | CFileInfo(PdGDSFolder root=NULL); 316 | /// destructor 317 | ~CFileInfo(); 318 | 319 | /// reset the root of GDS file 320 | void ResetRoot(PdGDSFolder root); 321 | /// get selection 322 | TSelection &Selection(); 323 | 324 | /// return _Chrom which has been initialized 325 | CChromIndex &Chromosome(); 326 | /// return _Position which has been initialized 327 | vector &Position(); 328 | 329 | /// return _GenoIndex which has been initialized 330 | CGenoIndex &GenoIndex(); 331 | 332 | /// return the indexing object according to variable name 333 | CIndex &VarIndex(const string &varname); 334 | 335 | /// get gds object 336 | PdAbstractArray GetObj(const char *name, C_BOOL MustExist); 337 | 338 | /// the root of gds file 339 | inline PdGDSFolder Root() { return _Root; } 340 | /// the total number of samples 341 | inline int SampleNum() const { return _SampleNum; } 342 | /// the total number of variants 343 | inline int VariantNum() const { return _VariantNum; } 344 | /// ploidy 345 | 
inline int Ploidy() const { return _Ploidy; } 346 | 347 | int SampleSelNum(); 348 | int VariantSelNum(); 349 | 350 | protected: 351 | PdGDSFolder _Root; ///< the root of GDS file 352 | int _SampleNum; ///< the total number of samples 353 | int _VariantNum; ///< the total number of variants 354 | int _Ploidy; ///< ploidy 355 | 356 | CChromIndex _Chrom; ///< chromosome indexing 357 | vector _Position; ///< position 358 | CGenoIndex _GenoIndex; ///< the indexing object for genotypes 359 | map _VarIndex; ///< the indexing objects for INFO/FORMAT variables 360 | }; 361 | 362 | 363 | extern std::map COREARRAY_DLL_LOCAL GDSFile_ID_Info; 364 | 365 | /// get the associated CFileInfo 366 | COREARRAY_DLL_LOCAL CFileInfo &GetFileInfo(int file_id); 367 | 368 | 369 | 370 | 371 | // =========================================================== 372 | // GDS Variable Type 373 | // =========================================================== 374 | 375 | class COREARRAY_DLL_LOCAL CVariable 376 | { 377 | public: 378 | enum TVarType 379 | { 380 | ctNone, 381 | ctBasic, ///< sample.id, variant.id, etc 382 | ctGenotype, ///< genotypes or alleles 383 | ctDosage, ///< dosage of reference or specified allele 384 | ctPhase, ///< phase information 385 | ctInfo, ///< variant annotation info field 386 | ctFormat, ///< variant annotation format field 387 | ctSampleAnnot ///< sample annotation 388 | }; 389 | }; 390 | 391 | 392 | /// The abstract class for applying functions marginally 393 | class COREARRAY_DLL_LOCAL CVarApply: public CVariable 394 | { 395 | protected: 396 | TVarType fVarType; ///< VCF data type 397 | ssize_t MarginalSize; ///< the size in MarginalSelect 398 | C_BOOL *MarginalSelect; ///< pointer to variant selection 399 | 400 | public: 401 | PdAbstractArray Node; ///< the GDS variable 402 | C_Int32 Position; ///< the index of variant/sample, starting from ZERO 403 | 404 | /// constructor 405 | CVarApply(); 406 | /// destructor 407 | virtual ~CVarApply(); 408 | 409 | /// reset 410 | 
virtual void Reset(); 411 | /// move to the next element 412 | virtual bool Next(); 413 | 414 | /// return a NumPy array object for the next call to 'ReadData()' 415 | virtual PyObject* NeedArray() = 0; 416 | /// read data into the Python object 'val' 417 | virtual void ReadData(PyObject *val) = 0; 418 | 419 | /// variable type 420 | inline TVarType VarType() const { return fVarType; } 421 | 422 | /// return a pointer to a buffer of 'size' TRUE values 423 | C_BOOL *NeedTRUEs(size_t size); 424 | 425 | private: 426 | vector _TRUE; 427 | }; 428 | 429 | 430 | /// The abstract class for applying functions by variant 431 | class COREARRAY_DLL_LOCAL CApply_Variant: public CVarApply 432 | { 433 | protected: 434 | PyObject *VarNode; ///< Python object 435 | public: 436 | /// constructor 437 | CApply_Variant(); 438 | /// constructor with file information 439 | CApply_Variant(CFileInfo &File); 440 | /// destructor 441 | ~CApply_Variant(); 442 | }; 443 | 444 | 445 | class COREARRAY_DLL_LOCAL CVarApplyList: public vector 446 | { 447 | public: 448 | ~CVarApplyList(); 449 | 450 | /// return false if any return false, otherwise return true 451 | bool CallNext(); 452 | }; 453 | 454 | 455 | 456 | // =========================================================== 457 | // Progress object 458 | // =========================================================== 459 | 460 | class COREARRAY_DLL_LOCAL CProgress 461 | { 462 | public: 463 | CProgress(C_Int64 start, C_Int64 count, FILE *conn, bool newline); 464 | virtual ~CProgress(); 465 | 466 | void Forward(); 467 | virtual void ShowProgress(); 468 | 469 | protected: 470 | C_Int64 TotalCount; ///< the total number 471 | C_Int64 Counter; ///< the current counter 472 | FILE *File; ///< file object 473 | time_t _start_time; ///< the starting time 474 | bool NewLine; 475 | double _start, _step; 476 | C_Int64 _hit; 477 | vector< pair > _timer; 478 | }; 479 | 480 | class COREARRAY_DLL_LOCAL CProgressStdOut: public CProgress 481 | { 482 | public: 483 | CProgressStdOut(C_Int64 count, bool
verbose); 484 | virtual void ShowProgress(); 485 | 486 | protected: 487 | time_t _last_time; 488 | bool Verbose; 489 | }; 490 | 491 | 492 | 493 | // =========================================================== 494 | // Define Functions 495 | // =========================================================== 496 | 497 | /// Get the number of TRUEs 498 | #define GetNumOfTRUE(ptr, n) vec_i8_cnt_nonzero((C_Int8*)(ptr), n) 499 | 500 | 501 | /// requires a vector of TRUEs 502 | COREARRAY_DLL_LOCAL C_BOOL *NeedArrayTRUEs(size_t len); 503 | 504 | /// Get pretty text for an integer with comma 505 | COREARRAY_DLL_LOCAL const char *PrettyInt(int val); 506 | 507 | /// Text matching, return -1 when there is no match 508 | COREARRAY_DLL_LOCAL int MatchText(const char *txt, const char *list[]); 509 | 510 | /// Get the number of alleles 511 | COREARRAY_DLL_LOCAL int GetNumOfAllele(const char *allele_list); 512 | 513 | /// Get the index in an allele list 514 | COREARRAY_DLL_LOCAL int GetIndexOfAllele(const char *allele, const char *allele_list); 515 | 516 | /// Get strings split by comma 517 | COREARRAY_DLL_LOCAL void GetAlleles(const char *alleles, vector &out); 518 | 519 | 520 | /// check the prefix of a GDS variable path 521 | COREARRAY_DLL_LOCAL void GDS_PATH_PREFIX_CHECK(const char *path); 522 | 523 | /// check variable name 524 | COREARRAY_DLL_LOCAL void GDS_VARIABLE_NAME_CHECK(const char *p); 525 | 526 | /// build a GDS path from 'path' using the prefix character (e.g., '@' for an index variable) 527 | COREARRAY_DLL_LOCAL string GDS_PATH_PREFIX(const string &path, char prefix); 528 | 529 | 530 | 531 | // =========================================================== 532 | // Define Exception 533 | // =========================================================== 534 | 535 | class ErrSeqArray: public ErrCoreArray 536 | { 537 | public: 538 | ErrSeqArray(): ErrCoreArray() 539 | { } 540 | ErrSeqArray(const char *fmt, ...): ErrCoreArray() 541 | { _COREARRAY_ERRMACRO_(fmt); } 542 | ErrSeqArray(const std::string &msg): ErrCoreArray() 543 | { fMessage = msg; } 544
| }; 545 | 546 | 547 | 548 | // =========================================================== 549 | // Import the NumPy Package 550 | // =========================================================== 551 | 552 | const C_Int32 NA_INTEGER = 0x80000000; 553 | const C_UInt8 NA_UINT8 = 0xFF; 554 | 555 | 556 | COREARRAY_DLL_LOCAL bool numpy_init(); 557 | 558 | COREARRAY_DLL_LOCAL PyObject* numpy_new_bool(size_t n); 559 | 560 | COREARRAY_DLL_LOCAL PyObject* numpy_new_uint8(size_t n); 561 | COREARRAY_DLL_LOCAL PyObject* numpy_new_uint8_mat(size_t n1, size_t n2); 562 | COREARRAY_DLL_LOCAL PyObject* numpy_new_uint8_dim3(size_t n1, size_t n2, size_t n3); 563 | 564 | COREARRAY_DLL_LOCAL PyObject* numpy_new_int32(size_t n); 565 | COREARRAY_DLL_LOCAL PyObject* numpy_new_int32_mat(size_t n1, size_t n2); 566 | COREARRAY_DLL_LOCAL PyObject* numpy_new_int32_dim3(size_t n1, size_t n2, size_t n3); 567 | 568 | COREARRAY_DLL_LOCAL PyObject* numpy_new_string(size_t n); 569 | 570 | COREARRAY_DLL_LOCAL PyObject* numpy_new_list(size_t n); 571 | 572 | COREARRAY_DLL_LOCAL bool numpy_is_array(PyObject *obj); 573 | COREARRAY_DLL_LOCAL bool numpy_is_array_or_list(PyObject *obj); 574 | COREARRAY_DLL_LOCAL bool numpy_is_array_int(PyObject *obj); 575 | 576 | COREARRAY_DLL_LOCAL bool numpy_is_bool(PyObject *obj); 577 | COREARRAY_DLL_LOCAL bool numpy_is_uint8(PyObject *obj); 578 | COREARRAY_DLL_LOCAL bool numpy_is_int(PyObject *obj); 579 | COREARRAY_DLL_LOCAL bool numpy_is_string(PyObject *obj); 580 | 581 | 582 | COREARRAY_DLL_LOCAL size_t numpy_size(PyObject *obj); // assuming obj is PyArray 583 | 584 | COREARRAY_DLL_LOCAL void* numpy_getptr(PyObject *obj); // assuming obj is PyArray 585 | COREARRAY_DLL_LOCAL void numpy_setval(PyObject *obj, void *ptr, PyObject *val); // assuming obj is PyArray 586 | 587 | COREARRAY_DLL_LOCAL void numpy_to_int32(PyObject *obj, vector &out); 588 | COREARRAY_DLL_LOCAL void numpy_to_string(PyObject *obj, vector &out); 589 | 590 | } 591 | 592 | #endif /* _HEADER_SEQ_INDEX_ 
*/ 593 | -------------------------------------------------------------------------------- /src/GetData.cpp: -------------------------------------------------------------------------------- 1 | // =========================================================== 2 | // 3 | // GetData.cpp: Get data from the GDS file 4 | // 5 | // Copyright (C) 2017 Xiuwen Zheng 6 | // 7 | // This file is part of PySeqArray. 8 | // 9 | // PySeqArray is free software: you can redistribute it and/or modify it 10 | // under the terms of the GNU General Public License Version 3 as 11 | // published by the Free Software Foundation. 12 | // 13 | // PySeqArray is distributed in the hope that it will be useful, but 14 | // WITHOUT ANY WARRANTY; without even the implied warranty of 15 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | // GNU General Public License for more details. 17 | // 18 | // You should have received a copy of the GNU General Public License 19 | // along with PySeqArray. 20 | // If not, see . 
21 | 22 | #include "Index.h" 23 | #include "ReadByVariant.h" 24 | // #include "ReadBySample.h" 25 | 26 | 27 | using namespace PySeqArray; 28 | 29 | extern "C" 30 | { 31 | 32 | // =========================================================== 33 | // Get data from a working space 34 | // =========================================================== 35 | 36 | /* 37 | static bool is_logical(PdGDSObj Node) 38 | { 39 | char classname[32]; 40 | classname[0] = 0; 41 | GDS_Node_GetClassName(Node, classname, sizeof(classname)); 42 | return (strcmp(classname, "dBit1") == 0); 43 | } 44 | */ 45 | 46 | 47 | // get data 48 | static PyObject* VarGetData(CFileInfo &File, const char *name) 49 | { 50 | static const char *ERR_DIM = "Invalid dimension of '%s'."; 51 | 52 | PyObject *rv_ans = NULL; 53 | TSelection &Sel = File.Selection(); 54 | 55 | if (strcmp(name, "sample.id") == 0) 56 | { 57 | // =========================================================== 58 | // sample.id 59 | 60 | PdAbstractArray N = File.GetObj(name, TRUE); 61 | // check 62 | if ((GDS_Array_DimCnt(N) != 1) || 63 | (GDS_Array_GetTotalCount(N) != File.SampleNum())) 64 | throw ErrSeqArray(ERR_DIM, name); 65 | // read 66 | C_BOOL *ss = Sel.pSample(); 67 | rv_ans = GDS_Py_Array_Read(N, NULL, NULL, &ss, svCustom); 68 | 69 | } else if (strcmp(name, "position") == 0) 70 | { 71 | int n = File.VariantSelNum(); 72 | rv_ans = numpy_new_int32(n); 73 | if (n > 0) 74 | { 75 | const int *base = &File.Position()[0]; 76 | int *p = (int*)numpy_getptr(rv_ans); 77 | C_BOOL *s = Sel.pVariant(); 78 | for (size_t m=File.VariantNum(); m > 0; m--) 79 | { 80 | if (*s++) *p++ = *base; 81 | base ++; 82 | } 83 | } 84 | 85 | } else if (strcmp(name, "chromosome") == 0) 86 | { 87 | int n = File.VariantSelNum(); 88 | rv_ans = numpy_new_string(n); 89 | if (n > 0) 90 | { 91 | CChromIndex &Chrom = File.Chromosome(); 92 | PyObject **p = (PyObject**)numpy_getptr(rv_ans); 93 | C_BOOL *s = Sel.pVariant(); 94 | size_t m = File.VariantNum(); 95 | string lastss; 
96 | PyObject *last = NULL; 97 | for (size_t i=0; i < m; i++) 98 | { 99 | if (*s++) 100 | { 101 | const string &ss = Chrom[i]; 102 | if (ss != lastss) 103 | { 104 | lastss = ss; 105 | last = NULL; 106 | } 107 | if (!last) 108 | last = PYSTR_SET2(&lastss[0], lastss.size()); 109 | numpy_setval(rv_ans, p++, last); 110 | } 111 | } 112 | } 113 | 114 | } else if ( (strcmp(name, "variant.id")==0) || 115 | (strcmp(name, "allele")==0) || 116 | (strcmp(name, "annotation/id")==0) || 117 | (strcmp(name, "annotation/qual")==0) || 118 | (strcmp(name, "annotation/filter")==0) ) 119 | { 120 | // =========================================================== 121 | // variant.id, allele, annotation/id, annotation/qual, annotation/filter 122 | 123 | PdAbstractArray N = File.GetObj(name, TRUE); 124 | // check 125 | if ((GDS_Array_DimCnt(N) != 1) || 126 | (GDS_Array_GetTotalCount(N) != File.VariantNum())) 127 | throw ErrSeqArray(ERR_DIM, name); 128 | // read 129 | C_BOOL *ss = Sel.pVariant(); 130 | rv_ans = GDS_Py_Array_Read(N, NULL, NULL, &ss, svCustom); 131 | 132 | } else if (strcmp(name, "genotype") == 0) 133 | { 134 | // =========================================================== 135 | // genotypic data 136 | 137 | int nSample = File.SampleSelNum(); 138 | int nVariant = File.VariantSelNum(); 139 | 140 | if ((nSample > 0) && (nVariant > 0)) 141 | { 142 | // initialize GDS genotype Node 143 | CApply_Variant_Geno NodeVar(File); 144 | // set 145 | rv_ans = numpy_new_uint8_dim3(nVariant, nSample, File.Ploidy()); 146 | C_UInt8 *base = (C_UInt8*)numpy_getptr(rv_ans); 147 | ssize_t SIZE = (ssize_t)nSample * File.Ploidy(); 148 | do { 149 | NodeVar.ReadGenoData(base); 150 | base += SIZE; 151 | } while (NodeVar.Next()); 152 | } else 153 | rv_ans = numpy_new_uint8(0); 154 | 155 | } else if (strcmp(name, "@genotype") == 0) 156 | { 157 | static const char *VarName = "genotype/@data"; 158 | PdAbstractArray N = File.GetObj(VarName, TRUE); 159 | // check 160 | if ((GDS_Array_DimCnt(N) != 1) || 161 | 
(GDS_Array_GetTotalCount(N) != File.VariantNum())) 162 | throw ErrSeqArray(ERR_DIM, VarName); 163 | // read 164 | C_BOOL *ss = Sel.pVariant(); 165 | rv_ans = GDS_Py_Array_Read(N, NULL, NULL, &ss, svInt32); 166 | 167 | } else if (strcmp(name, "$dosage")==0 || strcmp(name, "#dosage")==0) 168 | { 169 | // =========================================================== 170 | // dosage data 171 | 172 | ssize_t nSample = File.SampleSelNum(); 173 | ssize_t nVariant = File.VariantSelNum(); 174 | 175 | if ((nSample > 0) && (nVariant > 0)) 176 | { 177 | // initialize GDS genotype Node 178 | CApply_Variant_Dosage NodeVar(File); 179 | // set 180 | rv_ans = numpy_new_uint8_mat(nVariant, nSample); 181 | C_UInt8 *base = (C_UInt8*)numpy_getptr(rv_ans); 182 | do { 183 | NodeVar.ReadDosage(base); 184 | base += nSample; 185 | } while (NodeVar.Next()); 186 | } else 187 | rv_ans = numpy_new_uint8(0); 188 | 189 | } else if (strcmp(name, "phase") == 0) 190 | { 191 | // =========================================================== 192 | // phase/ 193 | 194 | PdAbstractArray N = File.GetObj("phase/data", TRUE); 195 | // check 196 | int ndim = GDS_Array_DimCnt(N); 197 | C_Int32 dim[4]; 198 | GDS_Array_GetDim(N, dim, 3); 199 | if (ndim<2 || ndim>3 || dim[0]!= File.VariantNum() || 200 | dim[1]!=File.SampleNum()) 201 | throw ErrSeqArray(ERR_DIM, name); 202 | // read 203 | C_BOOL *ss[3] = { Sel.pVariant(), Sel.pSample(), NULL }; 204 | if (ndim == 3) 205 | ss[2] = NeedArrayTRUEs(dim[2]); 206 | rv_ans = GDS_Py_Array_Read(N, NULL, NULL, ss, svCustom); 207 | 208 | } else if (strncmp(name, "annotation/info/@", 17) == 0) 209 | { 210 | if (File.GetObj(name, FALSE) != NULL) 211 | { 212 | CIndex &V = File.VarIndex(name); 213 | rv_ans = V.GetLen_Sel(Sel.pVariant()); 214 | } 215 | 216 | } else if (strncmp(name, "annotation/info/", 16) == 0) 217 | { 218 | // =========================================================== 219 | // annotation/info 220 | 221 | GDS_PATH_PREFIX_CHECK(name); 222 | PdAbstractArray N = 
File.GetObj(name, TRUE); 223 | int ndim = GDS_Array_DimCnt(N); 224 | if ((ndim!=1) && (ndim!=2)) 225 | throw ErrSeqArray(ERR_DIM, name); 226 | 227 | string name2 = GDS_PATH_PREFIX(name, '@'); 228 | PdAbstractArray N_idx = File.GetObj(name2.c_str(), FALSE); 229 | if (N_idx == NULL) 230 | { 231 | // no index 232 | C_Int32 dim[4]; 233 | GDS_Array_GetDim(N, dim, 2); 234 | C_BOOL *ss[2] = { Sel.pVariant(), NULL }; 235 | if (ndim == 2) 236 | ss[1] = NeedArrayTRUEs(dim[1]); 237 | C_SVType SV = svCustom; // is_logical 238 | rv_ans = GDS_Py_Array_Read(N, NULL, NULL, ss, SV); 239 | 240 | } else { 241 | // with index 242 | CIndex &V = File.VarIndex(name2); 243 | int var_start, var_count; 244 | vector var_sel; 245 | PyObject *Index = V.GetLen_Sel(Sel.pVariant(), var_start, var_count, var_sel); 246 | 247 | C_BOOL *ss[2] = { &var_sel[0], NULL }; 248 | C_Int32 dimst[2] = { var_start, 0 }; 249 | C_Int32 dimcnt[2] = { var_count, 0 }; 250 | if (ndim == 2) 251 | { 252 | GDS_Array_GetDim(N, dimcnt, 2); 253 | dimcnt[0] = var_count; 254 | } 255 | PyObject *Val = GDS_Py_Array_Read(N, dimst, dimcnt, ss, svCustom); 256 | 257 | rv_ans = Py_BuildValue("{s:N,s:N}", "index", Index, "data", Val); 258 | } 259 | 260 | } else if (strncmp(name, "annotation/format/@", 19) == 0) 261 | { 262 | string name2(name); 263 | name2.erase(18, 1).append("/@data"); 264 | if (File.GetObj(name2.c_str(), FALSE) != NULL) 265 | { 266 | CIndex &V = File.VarIndex(name2.c_str()); 267 | rv_ans = V.GetLen_Sel(Sel.pVariant()); 268 | } 269 | 270 | } else if (strncmp(name, "annotation/format/", 18) == 0) 271 | { 272 | // =========================================================== 273 | // annotation/format 274 | 275 | GDS_PATH_PREFIX_CHECK(name); 276 | string name1 = string(name) + "/data"; 277 | string name2 = string(name) + "/@data"; 278 | PdAbstractArray N = File.GetObj(name1.c_str(), TRUE); 279 | 280 | // with index 281 | CIndex &V = File.VarIndex(name2); 282 | int var_start, var_count; 283 | vector var_sel; 284 | 
PyObject *Index = V.GetLen_Sel(Sel.pVariant(), var_start, var_count, var_sel); 285 | 286 | C_BOOL *ss[2] = { &var_sel[0], Sel.pSample() }; 287 | C_Int32 dimst[2] = { var_start, 0 }; 288 | C_Int32 dimcnt[2]; 289 | GDS_Array_GetDim(N, dimcnt, 2); 290 | dimcnt[0] = var_count; 291 | PyObject *Val = GDS_Py_Array_Read(N, dimst, dimcnt, ss, svCustom); 292 | 293 | rv_ans = Py_BuildValue("{s:N,s:N}", "index", Index, "data", Val); 294 | 295 | } else if (strncmp(name, "sample.annotation/", 18) == 0) 296 | { 297 | // =========================================================== 298 | // sample.annotation 299 | 300 | GDS_PATH_PREFIX_CHECK(name); 301 | PdAbstractArray N = File.GetObj(name, TRUE); 302 | // check 303 | int ndim = GDS_Array_DimCnt(N); 304 | if ((ndim!=1) && (ndim!=2)) 305 | throw ErrSeqArray(ERR_DIM, name); 306 | C_Int32 dim[2]; 307 | GDS_Array_GetDim(N, dim, 2); 308 | if (dim[0] != File.SampleNum()) 309 | throw ErrSeqArray(ERR_DIM, name); 310 | 311 | C_BOOL *ss[2] = { Sel.pSample(), NULL }; 312 | if (ndim == 2) 313 | ss[1] = NeedArrayTRUEs(dim[1]); 314 | rv_ans = GDS_Py_Array_Read(N, NULL, NULL, ss, svCustom); 315 | 316 | } else if (strcmp(name, "$chrom_pos")==0 || strcmp(name, "#chrom_pos")==0) 317 | { 318 | // =========================================================== 319 | // chromosome-position 320 | 321 | PdAbstractArray N1 = File.GetObj("chromosome", TRUE); 322 | PdAbstractArray N2 = File.GetObj("position", TRUE); 323 | C_Int64 n1 = GDS_Array_GetTotalCount(N1); 324 | C_Int64 n2 = GDS_Array_GetTotalCount(N2); 325 | if ((n1 != n2) || (n1 != File.VariantNum())) 326 | throw ErrSeqArray("Invalid dimension of 'chromosome' and 'position'."); 327 | 328 | vector chr; 329 | vector pos; 330 | 331 | int n = File.VariantSelNum(); 332 | chr.resize(n); 333 | pos.resize(n); 334 | C_BOOL *ss = Sel.pVariant(); 335 | 336 | GDS_Array_ReadDataEx(N1, NULL, NULL, &ss, &chr[0], svStrUTF8); 337 | GDS_Array_ReadDataEx(N2, NULL, NULL, &ss, &pos[0], svInt32); 338 | 339 | char buf1[1024] 
= { 0 }; 340 | char buf2[1024] = { 0 }; 341 | char *p1 = buf1, *p2 = buf2; 342 | int dup = 0; 343 | rv_ans = numpy_new_string(n); 344 | PyObject **p = (PyObject**)numpy_getptr(rv_ans); 345 | for (size_t i=0; i < (size_t)n; i++,p++) 346 | { 347 | snprintf(p1, sizeof(buf1), "%s_%d", chr[i].c_str(), pos[i]); 348 | if (strcmp(p1, p2) == 0) 349 | { 350 | dup ++; 351 | snprintf(p1, sizeof(buf1), "%s_%d_%d", chr[i].c_str(), 352 | pos[i], dup); 353 | numpy_setval(rv_ans, p, PYSTR_SET(p1)); 354 | } else { 355 | char *tmp; 356 | tmp = p1; p1 = p2; p2 = tmp; 357 | numpy_setval(rv_ans, p, PYSTR_SET(p2)); 358 | dup = 0; 359 | } 360 | } 361 | 362 | } else if (strcmp(name, "$num_allele")==0 || strcmp(name, "#num_allele")==0) 363 | { 364 | // =========================================================== 365 | // the number of distinct alleles 366 | 367 | ssize_t nVariant = File.VariantSelNum(); 368 | rv_ans = numpy_new_int32(nVariant); 369 | int *p = (int*)numpy_getptr(rv_ans); 370 | 371 | CApply_Variant_NumAllele NodeVar(File); 372 | for (ssize_t i=0; i < nVariant; i++) 373 | { 374 | p[i] = NodeVar.GetNumAllele(); 375 | NodeVar.Next(); 376 | } 377 | 378 | } else if (strcmp(name, "$ref")==0 || strcmp(name, "#ref")==0) 379 | { 380 | // =========================================================== 381 | // the reference allele 382 | 383 | PdAbstractArray N = File.GetObj("allele", TRUE); 384 | // check 385 | if ((GDS_Array_DimCnt(N) != 1) || 386 | (GDS_Array_GetTotalCount(N) != File.VariantNum())) 387 | throw ErrSeqArray(ERR_DIM, name); 388 | // read 389 | size_t n = File.VariantSelNum(); 390 | vector buffer(n); 391 | C_BOOL *ss = Sel.pVariant(); 392 | GDS_Array_ReadDataEx(N, NULL, NULL, &ss, &buffer[0], svStrUTF8); 393 | // output 394 | rv_ans = numpy_new_string(n); 395 | PyObject **pi = (PyObject**)numpy_getptr(rv_ans); 396 | for (size_t i=0; i < n; i++) 397 | { 398 | const char *p = buffer[i].c_str(); 399 | size_t m = 0; 400 | for (const char *s=p; *s!=',' && *s!=0; s++) m++; 401 | 
numpy_setval(rv_ans, pi, PYSTR_SET2(p, m)); 402 | pi ++; 403 | } 404 | 405 | } else if (strcmp(name, "$alt")==0 || strcmp(name, "#alt")==0) 406 | { 407 | // =========================================================== 408 | // the reference allele 409 | 410 | PdAbstractArray N = File.GetObj("allele", TRUE); 411 | // check 412 | if ((GDS_Array_DimCnt(N) != 1) || 413 | (GDS_Array_GetTotalCount(N) != File.VariantNum())) 414 | throw ErrSeqArray(ERR_DIM, name); 415 | // read 416 | size_t n = File.VariantSelNum(); 417 | vector buffer(n); 418 | C_BOOL *ss = Sel.pVariant(); 419 | GDS_Array_ReadDataEx(N, NULL, NULL, &ss, &buffer[0], svStrUTF8); 420 | // output 421 | rv_ans = numpy_new_string(n); 422 | PyObject **pi = (PyObject**)numpy_getptr(rv_ans); 423 | for (size_t i=0; i < n; i++) 424 | { 425 | const char *p = buffer[i].c_str(); 426 | for (; *p!=',' && *p!=0; p++); 427 | if (*p == ',') p++; 428 | numpy_setval(rv_ans, pi, PYSTR_SET(p)); 429 | pi ++; 430 | } 431 | 432 | } else { 433 | throw ErrSeqArray( 434 | "'%s' is not a standard variable name, and the standard format:\n" 435 | " sample.id, variant.id, position, chromosome, allele, genotype\n" 436 | " annotation/id, annotation/qual, annotation/filter\n" 437 | " annotation/info/VARIABLE_NAME, annotation/format/VARIABLE_NAME\n" 438 | " sample.annotation/VARIABLE_NAME", name); 439 | } 440 | 441 | return rv_ans; 442 | } 443 | 444 | 445 | /// Get data from a working space 446 | COREARRAY_DLL_EXPORT PyObject* SEQ_GetData(PyObject *self, PyObject *args) 447 | { 448 | int file_id; 449 | const char *name; 450 | if (!PyArg_ParseTuple(args, "is", &file_id, &name)) 451 | return NULL; 452 | 453 | COREARRAY_TRY 454 | // File information 455 | CFileInfo &File = GetFileInfo(file_id); 456 | // Get data 457 | return VarGetData(File, name); 458 | COREARRAY_CATCH_NONE 459 | } 460 | 461 | 462 | /// Apply functions over variants in block 463 | COREARRAY_DLL_EXPORT PyObject* SEQ_BApply_Variant(PyObject *self, PyObject *args) 464 | { 465 | int 
file_id; 466 | PyObject *name; 467 | PyObject *func; 468 | PyObject *obj; 469 | const char *as_is; 470 | int bsize; 471 | int verbose; 472 | if (!PyArg_ParseTuple(args, "iOOOsi" BSTR, &file_id, &name, &func, 473 | &obj, &as_is, &bsize, &verbose)) 474 | return NULL; 475 | 476 | if (!PyCallable_Check(func)) 477 | { 478 | PyErr_SetString(PyExc_TypeError, "'fun' must be callable."); 479 | return NULL; 480 | } 481 | if (bsize < 1) 482 | { 483 | PyErr_SetString(PyExc_ValueError, "'bsize' must be >= 1."); 484 | return NULL; 485 | } 486 | 487 | COREARRAY_TRY 488 | 489 | vector name_list; 490 | numpy_to_string(name, name_list); 491 | if (name_list.empty()) 492 | throw ErrSeqArray("'name' should be specified."); 493 | 494 | PyObject *rv_ans = NULL; 495 | 496 | // File information 497 | CFileInfo &File = GetFileInfo(file_id); 498 | // Selection 499 | TSelection &Selection = File.Selection(); 500 | 501 | // the number of selected variants 502 | int nVariant = File.VariantSelNum(); 503 | if (nVariant <= 0) 504 | throw ErrSeqArray("There is no selected variant."); 505 | 506 | // the number of data blocks 507 | int NumBlock = nVariant / bsize; 508 | if (nVariant % bsize) NumBlock ++; 509 | 510 | // as_is 511 | if (strcmp(as_is, "list")==0 || strcmp(as_is, "unlist")==0) 512 | { 513 | rv_ans = PyList_New(NumBlock); 514 | } else if (strcmp(as_is, "none") != 0) 515 | { 516 | throw ErrSeqArray("'asis' should be 'none', 'list' or 'unlist'."); 517 | } 518 | 519 | // function arguments 520 | int num_var = name_list.size(); 521 | int st_var = 0; 522 | if (obj != Py_None) { num_var++; st_var = 1; } 523 | PyObject *args = PyTuple_New(num_var); 524 | if (obj != Py_None) 525 | { 526 | Py_INCREF(obj); 527 | PyTuple_SetItem(args, 0, obj); 528 | } 529 | 530 | // local selection 531 | File.SelList.push_back(TSelection()); 532 | TSelection &Sel = File.SelList.back(); 533 | Sel.Sample = Selection.Sample; 534 | Sel.Variant.resize(File.VariantNum()); 535 | 536 | C_BOOL *pBase, *pSel, *pEnd; 537 | 
pBase = pSel = Selection.pVariant(); 538 | pEnd = pBase + Selection.Variant.size(); 539 | 540 | // progress object 541 | CProgressStdOut progress(NumBlock, verbose!=0); 542 | 543 | // for-loop 544 | for (int idx=0; idx < NumBlock; idx++) 545 | { 546 | // assign sub-selection 547 | { 548 | C_BOOL *pNewSel = Sel.pVariant(); 549 | memset(pNewSel, 0, Sel.Variant.size()); 550 | // for-loop 551 | for (int bs=bsize; bs > 0; bs--) 552 | { 553 | while ((pSel < pEnd) && (*pSel == FALSE)) 554 | pSel ++; 555 | if (pSel < pEnd) 556 | { 557 | pNewSel[pSel - pBase] = TRUE; 558 | pSel ++; 559 | } else 560 | break; 561 | } 562 | } 563 | 564 | // load data 565 | for (int i=st_var; i < num_var; i++) 566 | { 567 | PyObject *v = VarGetData(File, name_list[i-st_var].c_str()); 568 | PyTuple_SetItem(args, i, v); 569 | } 570 | 571 | // call Python function 572 | PyObject *val = PyObject_CallObject(func, args); 573 | if (val == NULL) 574 | { 575 | Py_DECREF(args); 576 | if (rv_ans) Py_DECREF(rv_ans); 577 | return NULL; 578 | } 579 | 580 | // store data 581 | if (rv_ans && val!=Py_None) 582 | PyList_SetItem(rv_ans, idx, val); 583 | 584 | progress.Forward(); 585 | } 586 | 587 | File.SelList.pop_back(); 588 | Py_DECREF(args); 589 | 590 | // finally 591 | if (rv_ans) return rv_ans; 592 | 593 | COREARRAY_CATCH_NONE 594 | } 595 | 596 | } // extern "C" 597 | -------------------------------------------------------------------------------- /docs/demo/tutorial_parallel.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Python tutorial with multiprocessing" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import numpy.ma as ma\n", 20 | "import PySeqArray as ps\n", 21 | "import pandas\n", 22 | "import matplotlib.pyplot as 
plt\n", 23 | "import matplotlib.patches as mpatches\n", 24 | "import types\n", 25 | "\n", 26 | "%matplotlib inline" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "Open an existing SeqArray file and display its structure. The dimensions of sample.id and variant.id tell you the total numbers of samples and variants, i.e., 1092 samples and 19,773 variants." 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": { 40 | "collapsed": false 41 | }, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "File: /Users/sts/anaconda/lib/python3.6/site-packages/PySeqArray/data/1KG_phase1_release_v3_chr22.gds (1.1M)\n", 48 | "+ [ ] *\n", 49 | "|--+ description [ ] *\n", 50 | "|--+ sample.id { Str8 1092 LZMA_ra(10.5%), 914B } *\n", 51 | "|--+ variant.id { Int32 19773 LZMA_ra(8.39%), 6.5K } *\n", 52 | "|--+ position { Int32 19773 LZMA_ra(52.0%), 40.1K } *\n", 53 | "|--+ chromosome { Str8 19773 LZMA_ra(0.28%), 166B } *\n", 54 | "|--+ allele { Str8 19773 LZMA_ra(22.7%), 109.2K } *\n", 55 | "|--+ genotype [ ] *\n", 56 | "| |--+ data { Bit2 19773x1092x2 LZMA_ra(8.17%), 861.8K } *\n", 57 | "| |--+ extra.index { Int32 0x3 LZMA_ra, 19B } *\n", 58 | "| \\--+ extra { Int16 0 LZMA_ra, 19B }\n", 59 | "|--+ phase [ ]\n", 60 | "| |--+ data { Bit1 19773x1092 LZMA_ra(0.02%), 550B } *\n", 61 | "| |--+ extra.index { Int32 0x3 LZMA_ra, 19B } *\n", 62 | "| \\--+ extra { Bit1 0 LZMA_ra, 19B }\n", 63 | "|--+ annotation [ ]\n", 64 | "| |--+ id { Str8 19773 LZMA_ra(35.2%), 75.2K } *\n", 65 | "| |--+ qual { Float32 19773 LZMA_ra(3.62%), 2.8K } *\n", 66 | "| |--+ filter { Int32,factor 19773 LZMA_ra(0.21%), 170B } *\n", 67 | "| |--+ info [ ]\n", 68 | "| \\--+ format [ ]\n", 69 | "\\--+ sample.annotation [ ]\n", 70 | " |--+ Family.ID { Str8 1092 LZMA_ra(15.3%), 1.1K }\n", 71 | " |--+ Population { Str8 1092 LZMA_ra(5.08%), 222B }\n", 72 | " |--+ Gender { Str8 1092 LZMA_ra(5.85%), 
386B }\n", 73 | " \\--+ Ancestry { Str8 1092 LZMA_ra(2.43%), 233B }\n" 74 | ] 75 | } 76 | ], 77 | "source": [ 78 | "fn = ps.seqExample('1KG_phase1_release_v3_chr22.gds')\n", 79 | "f = ps.SeqArrayFile()\n", 80 | "f.open(fn)\n", 81 | "f.show()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "Genotypic data and annotations are stored in an array-oriented manner, providing efficient data access using the Python programming language. FilterSet() and GetData() can be used together to retrieve data for a selected set of samples from a defined genomic region. Apply() applies a user-defined function to array margins of genotypes and annotations." 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "## Calculation of Allele Frequencies" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 3, 101 | "metadata": { 102 | "collapsed": true 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "f.FilterReset() # reset the filter" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 4, 112 | "metadata": { 113 | "collapsed": false 114 | }, 115 | "outputs": [ 116 | { 117 | "data": { 118 | "text/plain": [ 119 | "array([ 0.69505495, 0.94322344, 0.99954212, ..., 0.99679487,\n", 120 | " 0.65842491, 0.91346154])" 121 | ] 122 | }, 123 | "execution_count": 4, 124 | "metadata": {}, 125 | "output_type": "execute_result" 126 | } 127 | ], 128 | "source": [ 129 | "def CalcAF(geno):\n", 130 | " gm = ma.masked_array(geno==0, mask=geno==255) # create a masked array with missing genotypes\n", 131 | " v = np.mean(gm, axis=(1,2)) # gm is a 3D array\n", 132 | " v.data[v.mask] = np.nan\n", 133 | " return(v.data)\n", 134 | "\n", 135 | "f.Apply('genotype', CalcAF, asis='unlist')" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "### Using 4 cores" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | 
"execution_count": 5, 148 | "metadata": { 149 | "collapsed": false, 150 | "scrolled": true 151 | }, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/plain": [ 156 | "array([ 0.69505495, 0.94322344, 0.99954212, ..., 0.99679487,\n", 157 | " 0.65842491, 0.91346154])" 158 | ] 159 | }, 160 | "execution_count": 5, 161 | "metadata": {}, 162 | "output_type": "execute_result" 163 | } 164 | ], 165 | "source": [ 166 | "def ParallelCalcAF(file, param):\n", 167 | " return file.Apply('genotype', CalcAF, asis='unlist')\n", 168 | "\n", 169 | "f.RunParallel(ParallelCalcAF, ncpu=4)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "## Principal Component Analysis" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 6, 182 | "metadata": { 183 | "collapsed": false 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "def PCA(val, geno):\n", 188 | " gm = ma.masked_array(geno, geno==255) # create a masked array with missing genotypes\n", 189 | " p = np.mean(gm, axis=1).data * 0.5 # allele frequencies (a vector)\n", 190 | " g = np.array(geno, 'double')\n", 191 | " g = (g.transpose() - 2*p) / np.sqrt(p * (1 - p)) # normalized by allele frequency\n", 192 | " g[np.isnan(g)] = 0 # correct missing values\n", 193 | " g[gm.mask.transpose()] = 0\n", 194 | " val.cov = val.cov + np.matmul(g, g.transpose()) # update the cov matrix" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "### Using 2 cores" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 7, 207 | "metadata": { 208 | "collapsed": false 209 | }, 210 | "outputs": [ 211 | { 212 | "name": "stderr", 213 | "output_type": "stream", 214 | "text": [ 215 | "/Users/sts/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:5: RuntimeWarning: invalid value encountered in true_divide\n", 216 | "/Users/sts/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:5: RuntimeWarning: 
invalid value encountered in true_divide\n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "def ParallelPCA(file, param):\n", 222 | " s = types.SimpleNamespace(cov=0)\n", 223 | " file.Apply('$dosage', PCA, s)\n", 224 | " return s.cov\n", 225 | "\n", 226 | "mat = f.RunParallel(ParallelPCA, ncpu=2, combine=(lambda x,y: x+y))" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 8, 232 | "metadata": { 233 | "collapsed": false 234 | }, 235 | "outputs": [ 236 | { 237 | "data": { 238 | "text/plain": [ 239 | "array([[ 0.70169957, 0.00830316, 0.05113733, ..., 0.01978629,\n", 240 | " 0.05134734, 0.012317 ],\n", 241 | " [ 0.00830316, 0.62280866, 0.03326604, ..., 0.02716621,\n", 242 | " 0.02789678, 0.02354706],\n", 243 | " [ 0.05113733, 0.03326604, 0.77340245, ..., 0.01013458,\n", 244 | " 0.03042339, 0.02440108],\n", 245 | " ..., \n", 246 | " [ 0.01978629, 0.02716621, 0.01013458, ..., 0.92201769,\n", 247 | " 0.02244575, 0.01091362],\n", 248 | " [ 0.05134734, 0.02789678, 0.03042339, ..., 0.02244575,\n", 249 | " 0.63179936, 0.00977053],\n", 250 | " [ 0.012317 , 0.02354706, 0.02440108, ..., 0.01091362,\n", 251 | " 0.00977053, 0.97954933]])" 252 | ] 253 | }, 254 | "execution_count": 8, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "cov = mat * (mat.shape[0] / sum(np.diag(mat)))\n", 261 | "cov" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 9, 267 | "metadata": { 268 | "collapsed": false 269 | }, 270 | "outputs": [ 271 | { 272 | "data": { 273 | "text/plain": [ 274 | "array([ 39.60517218, 16.51956609, 5.18983317, ..., 0.52165852,\n", 275 | " 0.51818269, 0.55006929])" 276 | ] 277 | }, 278 | "execution_count": 9, 279 | "metadata": {}, 280 | "output_type": "execute_result" 281 | } 282 | ], 283 | "source": [ 284 | "w, v = np.linalg.eig(cov)\n", 285 | "w # eigenvalues" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 10, 291 | "metadata": { 292 | 
"collapsed": false 293 | }, 294 | "outputs": [ 295 | { 296 | "data": { 297 | "text/plain": [ 298 | "" 299 | ] 300 | }, 301 | "execution_count": 10, 302 | "metadata": {}, 303 | "output_type": "execute_result" 304 | }, 305 | { 306 | "data": { 307 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZQAAAEKCAYAAAA1qaOTAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3X90XPV55/H3M2MJG2wwxkIYYxsHC2UV04Djg0PB/Fhw\n+RG2Tk+7e4JJaEMb1l0o6WmXHNJ0OT05u6dJndNNOevikIRsaEzYJm1imvXWKygkDosdDKaxEREW\nJsJ2bCEwNja2sS09+8fMHd8ZzYxGM3dm7pU+r3N0pLn3zuh7pZn73O+v52vujoiISK1SzS6AiIiM\nDwooIiISCQUUERGJhAKKiIhEQgFFREQioYAiIiKRUEAREZFIKKCIiEgkFFBERCQSk5pdgEaaOXOm\nX3jhhc0uhohIorzwwgtvuXvbaMdNqIBy4YUXsmXLlmYXQ0QkUcysv5Lj1OQlIiKRUEAREZFIKKCI\niEgkFFBERCQSCigiIhIJBRQREYmEAoqIiERCAUVERCKhgCIiIpFoakAxs5vMrNfM+szs/iL7zcwe\nzO7/uZktCu2bbmbfN7NfmNkrZnZFY0svIiJhTQsoZpYGVgM3A13AbWbWVXDYzUBH9usu4KHQvr8B\n/tndPwh8GHil7oUWEZGSmllDuRzoc/ed7n4ceBxYXnDMcuBRz9gETDezWWZ2FnA18E0Adz/u7gca\nWXgREcnXzIAyG9gVerw7u62SY+YDg8C3zGyrmX3DzM6oZ2FFRKS8pHbKTwIWAQ+5+2XAe8CIPhgA\nM7vLzLaY2ZbBwcFGllFEZEJpZkDZA8wJPb4gu62SY3YDu919c3b798kEmBHc/WF3X+zui9vaRk3n\nLyIiVWpmQHke6DCz+WbWCnwCeKLgmCeAO7KjvT4KHHT3ve6+D9hlZp3Z464HehpWchERGaFpC2y5\n+0kzuwfYAKSBR9z9ZTNbmd2/BlgP3AL0AUeAT4de4o+AtdlgtLNgn4iINJi5e7PL0DCLFy92rdgo\nIjI2ZvaCuy8e7bikdsqLiEjMTKg15eOmu2eAjTsGWdrRxrKu9or3iYjEkWooTdLdM8C9393Ko8/1\nc+93t9LdM1DRPhGRuFJAaZKNOwY5emIIgKMnhnhsc3/JfRt3aP6MiMSfAkoddfcM8MC67SNqGKs2\n9PLUKwNYaNuPXx1k1YZeAJZ2tDGlJQ3AlJY0Szs0f0ZE4k99KHUSNFsdPTHE97bs5sHbLmNZVzur\nNvSy+um+EccPO6x5po9L50xnWVc7D952mfpQRCRRVEOpk1LNVj/curvkc4ac3HHLutr54vKFCiYi\nkhgKKHVSrNmqu2eAgXffL/u8aZNbSjaViYjEmSY21lHh0N8H1m3n0ef6yz6nbWor+987zpBD2mDl\ntQu478bOss8REaknTWyMgcJmq3CtxUo8Z/BwJphApglszY9fU01FRBJBAaVBgtrKnVfN55LZZ1Jp\nvXBo2DVsWEQSQaO8GiA84mtKS5oprWOL49Mmt9SpZCIi0VENpQEKR3ztf+/EmJ7/ZM8+NXuJSOwp\noDRAJX0n5fQOHObutS8qqIhIrCmgNMhF505lxuktFfedFDo+NJyXnkVEJG7Uh1Jn3T0D3L32RY4P\nDTe7KCIidaUaSp1t3DEYWTBZsWReJK8jIlIPCih1trSjjdZ07X/mqaellYZFRGJNAaUBrlxwDgtn\nn8XU1nTVr/G7vz4/whKJiER
PfSh1FJ5/kjaYUkNAERGJO9VQ6ig8/2TI4b33h6p+rYee6dOwYRGJ\nNQWUOlra0UY6NPHEqW4eCmTWS9GwYRGJMwWUOlrW1c7KaxeQTp0KI7Xkdn513yHVUkQkthRQ6uy+\nGztZ88mPMHv65Jpfa8/BY9z73a0KKiISSwooDbCsq52L26dF8lrh1R9FROJEAaVBViyZR6raDpSQ\n1nSKpR1ttb+QiEjEFFAaaFJKf24RGb+aeoUzs5vMrNfM+szs/iL7zcwezO7/uZktKtifNrOtZvaj\nxpW6Oo9t7o8kBcvxoWE1eYlILDVtYqOZpYHVwDJgN/C8mT3h7j2hw24GOrJfS4CHst8DnwVeAc5s\nSKHHKFilcdrkFp7tezuS15zSklaTl4jEUjNnyl8O9Ln7TgAzexxYDoQDynLgUXd3YJOZTTezWe6+\n18wuAD4G/DfgTxpc9lGFZ8mngCjSQ6YN7rxqvnJ6iUgsNbPJazawK/R4d3Zbpcd8Ffgc0VyrIxee\nJR8u4KSU0Ta1tarXHHI4dGxsqz2KiDRKInuJzexW4E13f6GCY+8ysy1mtmVwsHF9D+FVGvO3z+T5\nP1/G3Bmnj/k1NcJLROKsmQFlDzAn9PiC7LZKjrkS+E0z+yXwOPBvzew7xX6Juz/s7ovdfXFbW+Mu\nxsu62nnwtsu4rvNU+vrge3fPAP/l1q4xv+aVC85Rc5eIxFYzA8rzQIeZzTezVuATwBMFxzwB3JEd\n7fVR4KC773X3z7v7Be5+YfZ5/+Lun2xo6SuwrKudb336clbfvojrOjPB7OneQe797lZe2nWgorxe\nwdyVKS1pLbAlIrHWtE55dz9pZvcAG4A08Ii7v2xmK7P71wDrgVuAPuAI8OlmlbcWy7rasys3Zprc\njp4Y4ocv7q4or9c1F7cxZ8bpLO1oU+1ERGKtqeuhuPt6MkEjvG1N6GcH7h7lNZ4BnqlD8SK1tKON\n723ZzdETQ7SmUwwcer/ocZfMPpOevYcYGvZcrUSBRESSQAtsNUjQp7JxxyC79h/h6d7iAwSuvvhc\n7r3+YjbuGFStREQSRQGlgZZ1tbOsq53ungE27dyfG1Yc9pNX3+S+GzsVSEQkcRI5bDjplnW1c+dV\n8/MW3wps2/Mu/2nti40vlIhIjRRQmuTQsRMMleiVX79tL6s29Da2QCIiNVJAaZLwxMdJRfLar930\nSy2kJSKJooDSJEEn/R1XzOOhT36EWy6Zlbf/wNGTWp1RRBJFnfJNFHTSBz+v2tDL2k2/5MDRk8Cp\n1RnVQS8iSaAaSozcd2Mnq/79pbmmMKWqF5EkUQ0lZsLzVTQPRUSSRAElhsJNYSIiSaEmLxERiYQC\nioiIREIBRUREIqGAIiIikVBAERGRSCigiIhIJBRQREQkEgooIiISCQUUERGJhAKKiIhEQgFFREQi\noYAiIiKRUEAREZFIKNtwTHX3DCiFvYgkimooMdTdM8C9393Ko8/1axlgEUkMBZQY2rhjkKMnhoBT\nywCLiMSdAkoMLe1o0zLAIpI4TQ0oZnaTmfWaWZ+Z3V9kv5nZg9n9PzezRdntc8zsaTPrMbOXzeyz\njS99/QTLAN9xxTwevO0y9aGISCI0rVPezNLAamAZsBt43syecPee0GE3Ax3ZryXAQ9nvJ4E/dfcX\nzWwa8IKZdRc8N9G0DLCIJE0zayiXA33uvtPdjwOPA8sLjlkOPOoZm4DpZjbL3fe6+4sA7n4IeAWY\n3cjCN1p3zwAPrNuuDnoRia1mBpTZwK7Q492MDAqjHmNmFwKXAZsjL2FMaNSXiCRBojvlzWwq8A/A\nH7v7uyWOucvMtpjZlsHBZI6W0qgvEUmCZgaUPcCc0OMLstsqOsbMWsgEk7Xu/o+lfom7P+zui919\ncVtbMkdLadSXiCRBM2fKPw90mNl8MkHiE8CKgmOeAO4xs8fJdMYfdPe9ZmbAN4FX3P2vG1noZ
ghG\nfWnmvIjEWdMCirufNLN7gA1AGnjE3V82s5XZ/WuA9cAtQB9wBPh09ulXAp8CtpnZS9ltf+bu6xt5\nDo2kUV8iEnfm7s0uQ8MsXrzYt2zZ0uxiiIgkipm94O6LRzsu0Z3yIiISHwooIiISCQUUERGJhNZD\niRmtgyIyMYzHz7pqKDGiGfEiE8N4/awroMSIZsSLJFepfHvFto/Xz7qavGJkaUcb39uym6MnhjQj\nXiRBghrH0RNDrN38BiuvuYj7buzM2/74z3Zx5YJzWLFk3rj9rCugxIhmxIskU7jGMTTsrHmmj0vn\nTM/bfnxomKd7B3m27206z5vKgnPPYObU01ixZN64+ayXDShm9kEy2X03u/vh0Pab3P2f6124iUgz\n4kXqr9YO8eD50ya3cOjYCaZNbiGdMoaGMxPFh5zc6wc1kcDxoWG27cnksp2Ustz28fC5LzlT3szu\nBe4ms9bIpcBn3X1ddt+L7r6oYaWMiGbKi0i4GWpKS7rkqqjhoAPk/Xz32hc5PjScOzadMi6dM52t\n/e8wDLSmU6y+fRHLutrp7hngsc39PNv3dt5zwsqVIw4qnSlfrobyGeAj7n44u+bI983sQnf/G8DK\nPE9EJLaKdYgXXsgL+z4gU7P43pbdLDj3jBGBYWjY2dr/DsHt+cnh/P1zZpzOZ64+ix9u3c2eA8dG\nlKlUOQrFfahxuYCSCpq53P2XZnYtmaAyDwUUEYmhSi640ya3lH0MjOj7CBw9McT+944Xfd1wCBl2\nePCpV4FTtZmUwWVzz2bvwWMMFzQMWYlyFJ5bEOS+t2V3LGs05YYND5jZpcGDbHC5FZgJXFLvgkl1\ntFSwTFSVzu04dOxE2ceQvwbRpJTl3UEPFUaDEvYcOMpjm/tzAWnY4YX+d0YEEwAHvv6TnWU/t0kY\nalwuoNwB7AtvcPeT7n4HcHVdSyVVGa+TpUQqMdoFN7jZmja5ZdQF64IRl9d1ZvaFY8C+d9+vqDwH\njpzg+V/ur7j8x4eG+cqGX5T83CZhob2SAcXdd7v7vhL7nq1fkaRaSbiDEamXchfc8M3WIz99nes+\neC6d7VO586r5JZuNgu0nK6yRFBp2OPz+UNF94dFdYb0Dh0veDAZB7o4r5sWyuQs0D6XpirX5Vtvx\nNl4nS0k04t6hW6tS87i6ewb4yoZf5N1sbdi+lyGHvjf7cs//4dbdzDijlasvPjc3FPjZvrcjL+fU\n09IcyQaaFHDO1FbeOnw8Vws6emKIxzb3F/0fxX1agRbYaqJiwxeBioY0lnvN8XzRkOpUOlR2vAmf\nd8DIb8Iyg2KXwcLj6mX29MkjRn6Fhx3HQc0LbJnZAjO7ssj2K83soloLKMWbqAq3Pba5f0yvuayr\nnS8uXxibN6LEw0RtDn1sc39eMIGRQaLUPXWjbrX3HjxGazr/Unx8aDiR/6NynfJfBd4tsv3d7D6p\nUbE236UdbXlvrmf73lbnutQsCR26pVQ7crG7ZyCvyaraTLhntKarfGZxhb0nww6d501lxhmjD2eO\nu3J9KO3uvq1wo7tvy050lBqVavO9csE5PN2buTsJ36mEUz2oSUvGIql54grnXtx51fyK3/8bdwzm\nzSH50Owz6XvzvRE1ltG8d3xsx49mUso4EeroTxn07js8YrJkseHMcVcuoEwvs29K1AWZqIp1sq1Y\nMo9NO/fn2runTW4Z0Q4c14lNEl/lOnTj2vdW2FS35sevMTTseZl7S5W3cJDK1Refy8ypB9m25yBv\nHS4+OTGQMorOF4lCKkXeLMhZZ43sQ0laLTJQrha4xcw+U7jRzP4AeKF+RZLC4YGHjp0YcVc1kdrB\npb7iPH8p3FSXtlOTCoPMveXKG/4c3XnVfB756es83TvI/lGCSb0tnD09r/nxw3POLth/VmJvFsvV\nUP4Y+IGZ3c6pALIYaAV+q94Fm+jCd5M/2LpnxP6k3sFI/
FSS22qsoqrxhJvqpk1u4ZGfvp53c1VJ\nebe+8Q47B081dRVPz5ivXrUTgJ5fvZvXdFd4Y7ho7vREBhMoE1DcfQD4dTO7DliY3fy/3f1fGlIy\nATIfzA0v588v7Wyfyn++8YOJfdNJvEQ9fynqnFPhm6tL50zPy9wblLdYAFu1oZe/fbqvYaO1KnX0\nxBCHjp3gi8sX5raNl/ljJQOKmU0GVgILgG3AN939ZKMKJhkbdwzm5Q5KGwomWXFt90+aqDvs61Hj\nCQTBpTC1fGEAA1jzzMhg0qi5JaMJj+BK6oCJYso1eX0bOAFsBG4G/g2ZZjBpgPACPlNa0hw9MUQ6\nZay85qKa3nDj5SKchMyrSRLlDOxGZGwIl/eBdduLzrEZKhI5PtB2Bq8Nvpe3rTWdYti9ZIqVoIM+\nbcVfsxqFI7jiPgO+UuUCSpe7XwJgZt8EftaYIknhrOaxDJUs9XqFbdBJvwjX8y5YatPoO+5wAGtN\np9i1/whd559FazqVG4o744wWbrt8Hj2/OpgXUGZPn8xf/Gam6SmYRDyldVIuNUv48/dUzwB7Do5c\nywRg3owp7DlwbERQMoOF559J66Q0L/S/k9uexDkmlSgXUHIh1N1PmkW/BIqZ3QT8DZAGvuHuXyrY\nb9n9twBHgN9z9xcreW6SFV4sC9tbxyIcnMJ3WEm/CCtvWbzVesddriZduC8IYEHfSrBu+3B2Cnxr\nOsWXf/vDQCZfV9jF7dNyTWhzZpyeu+ka8swqjHdeNZ/7buwEMkFg9dOncn9NShknh50pLWn+/NYP\nAafmivX86iBAbljzA+u25wWUJM4xqUS5gPJhMwtmyhswJfvYAHf3M2v5xWaWBlYDy4DdwPNm9oS7\n94QOuxnoyH4tAR4CllT43MQay8VytCascHAKPiRD2Q9Bki/C46ndWfKVa85ctaG35FyUzETGUxOC\nA8eHhnlsc39ublegNZ1ixZJ5+Tdd4XXhhz3vwh8Elid79nFD13lcOmf6iPdfpXNikvzZK6fcKK9o\n8w2MdDnQ5+47AczscWA5EA4Ky4FHPZPBcpOZTTezWcCFFTw3UQoDQyUXy/AHYe2mflZeuyD3pg8U\nvpFrbT6Lk/HS7iynFMsMHNSku3sGWPNMX66WHcxF2bRzPw/edtmIpq/gmGDORziYhEdKhvtghoY9\nV5MvduG/78bO3Ges0vk6wWd7PH32Smlm+vrZwK7Q491kaiGjHTO7wucCYGZ3AXcBzJ07t7YS10mx\nO7JixxQGmMLax5ofv8alc/LHsIebA4AR+0Xiolhm4PBFfeOOwaKd4kHQ+eLyhXk3YsFzlna08dKu\nA7l0RgA3dJ2X+xxUc9NV6aCQiZbledyvh+LuDwMPQyZ9fZOLU1SxDMNB9TzIX1SsM31pRxtrN/Xn\nPmRDw16yXyR4veBubjy/qSWZwp8DGDnfKnzhTwGpUB9GEEAKa63hm6+wcFNWNc2nlQ4KmWiDR6pN\nwBmFPcCc0OMLstsqOaaS5yZGYSZYIO9N+MOtu4sOi1zW1c7KaxeQzq7+VqpttlGpy7We/cRS7v9d\nzXuh8HNQON8qnErla3cs5qFPfqTi1QtHy7Y81mUfKs3enOQsz9Vo2gJbZjYJeBW4nkwweB5Y4e4v\nh475GHAPmVFeS4AH3f3ySp5bTNwW2AorNVGrcIx8sYV3RuuYb0S1e6JV7Se6cv/vUvsqmQNVz3lS\nUb92pa83HuZ+VbrAVtOavLJDke8BNpAZ+vuIu79sZiuz+9cA68kEkz4yw4Y/Xe65TTiNyBRW1YMq\n+K79R/Lafq9ccM6IN2X4ucXevI0YETXRqvYTXbn/d6kacSV9DvUcaBH1a1f6ehNp8EhT+1DcfT2Z\noBHetib0swN3V/rc8SScYiKcyn7Fknl5x42WgiJ4jbGs/FjNHdVEGRYpmffHrv1HchMHg8mE3T0D\nub69wveCbjgmBq0pn
wClLvDhMflTWtJ89AMz8mozd1wxj6Udbdy99sXcuPzR1qqupelqPFTtpbzw\n+6M1naLzvKm5xaHKNW8lpUlU7+HiYt/kJZUrVmUuHJMf3P0Feb/Cd4aFk7zK3R3WcidZadVeH9rk\nCr8/jg8Nc/zkcO79FX6/FBttFfeJqMoPVzsFlIQJLsa79h/JG5OfskyahxVL5uXSPwTfwzmNWtOp\nks1RhU0Z9Wi60oc2GUoF/cLmrBu6zuON/a9X1NQZ974ENcvVTgElQQqbG4JcQgCpbK614AMQbl64\noaudTTvfAuC2y4svmRp+7ZRlVo377PUdFY/OqVQ1H1rVaBqru2cg10z6+M925TWRFqtpFEtBkkTq\nB6ydAkqCFDY3zDi9hf1HMhO0ToYmNT62uT/vov3P2/bmVqn7+k92Fp0tH37tYYdXssntoq5RjPVD\nm+QaTVID4WOb+3M12iAPVqnJgi/tOlBzOpG4/J2S0CwXdwooCVKY8vrg0VOzfSeljLcOH+fKv3yK\nfe+eSrEdrOUQOD40zFc2/AJgRFPG2s1vnEqM56dmF0fZDDDWD22xLAJJ+KAnORAWGjx8nAfWbS/a\nwR6o9hzj9neKe7Nc3DVzpryMUWHK63AfyslhZ/22vew5eCxv+5yzTx/xOr0Dh7n3u1vzZjEv62pn\n5TUXkc6uUhDUHuox03css5KXdrTlEv0BPNv3diJm4jcqO0E9rFgyL/c3n5QyXt13iEef68+9ZwpT\npED155jkv5OMpICSIOGLe2s6RWqUJWqmtKT5QNsZRfcV+/Ded2Mnaz61OC+dRTjdRbPuHtumteZ+\nDkapxV2SU24s62pn9e2LssPOZ44YxRU+t0C155jkv5OMpHkoCRFedTFos/7B1j2s37a36PFBpzqQ\n62CdlDJSZiPmDMRVqeyzcS93oJKUOM1orx/L7y0cCBKsPwKMeD8mvQ9FStM8lHGk1KSwUnfqKYO2\nqa1FthufufoDVV8AGv3BHy37bJyUSnlTyQTSRvYdjPX3BjXU8GqIQcbqalcRLfY74vg/lbFTQImx\n8JyTYh3jSzvaePxnu/ImLkKmEz744H/0AzPyRuxUu5xwMy6AhSPC4hxMxvq3adach3K/N5yip+v8\ns/JuPMKrIWqOhpSigBJThU0NxSYbLutqp/O8qWzb827R1yg1e74a9bwAlqr5xGEYZyW1smr+NvWY\n81BJWUv93vDcEyCXwicIkJqjIZVQQImpwjkn13W2MWfG6SMuFjOnnpb3vBlntHD42FAuaR8QydKj\n9bqgjHZ3P9bmkCib5SqteRQO5y58XEzUwbLSspb6vYUpegKlVkNU7USKUUCJqcIL+IolxWe4r1gy\nj2f73s4FkC//9ocBirZ5A3nzCcaiXrWFKGs+UTfLVVq2wuHcwePRglupHG3V/I3H8ncMB5Hgcanm\n03KrIYoUUkCJqdEu4OFRX1cuOAcgL+gUtnkXLitczcW2HheUKGs+UTfLhctWmKI9rFgNpZrgNtpz\nygWbsfwdS/2e1bcvKtmHIlIJBZQYK3UBLzWcNrxWSuFF7q3D74+YcR6H5otSgTMOa7KUG+EULlOx\nGko1wW20DvPRmgYrrUGW+j2qgUitFFASqNRM5XCQKLzIzZx6Wq5zvjWdyjaTDfL4z3bl5hbUcjEp\nNk+m0tcrvJBV23RVj2a5SkY4lQpk5YJbsYBZLiBWEqAqDQjqYJd60cTGhAmGdm7c8VYu03AgyD5c\nuPBRMCEtaMYoXFYYapswGPUExAfWbefR506tMHnHFfMim/NQjUoWhyoWIErVskZbj32sz6n2nOJQ\nQ5Vk0MTGcWjVht7colrF0q4EAeb40DDb9rzLpJRxyewz6d13eETnfNCRH6ilz6FcbqdK+g3Gcqfe\nDJXUfIrVDkrVGMrVNko9J+ral5q3pB4UUBKiu2cgs9xvtlIyXEHF8uSws/+94yNyMX1
x+UKuXHBO\nXi0lBVVfuMMBIFBLavo4zD8JlzEoR1S1pGoDpoKAxJ0CSkJs3DGYSy0/FkGfSeGkyPBwY4DUaJkm\nywgHgLH0oVRzp95I9coOUBgwofrh3CJxomzDCTFtcgvp7EV/LJf+/e9lOuev62zjzqvms3HHYG7o\nazDcGE4t0DWa7p4BHli3fUQK+WVdmZT0l86ZXnHZ4p5ptp6p1YO/F2RW1wynhxdJKgWUBOjuGeCR\nn76eq6Esmnd23oX4lktmlU1lH9RCvv6TnTz6XD93r32R7p4BViyZN6YLenDHXuriN9r+QsGdejNT\n45fTiICn9UBkPFGTVwIUdnq/9MY7rLx2Qa5pCeDJnoERs5zTKWNo2JnSkuatw++PWNb1W5++vKbV\nEws73auZexGHpq1SGtGXE7cBCCK1UEBJgKUdbXxnU3+uI37IyQWTxzb38+rAoRHBJBg6PHPqaaxY\nMi83A7pQsTkg1c7Grvbi2IghrNX+jvDfpx7ljNMABJFaKaAkRMqM4eycodZ0immTW/Kyw546Duac\nPYU9B46xbc+7pMik0SjM+RWeVQ+n5rcEx5SajX3nVfN5smcfN3SdV/RCO9aLY7jj+++e6+fmS2bx\nt7cvqvGvVfp3VNK5XmpOSb3S98e5liYyFupDSYCNOwbzJjFefN40nuzZVzQ77LDDrv1Hc8cPAw89\n0weQW9Z19e2Lik6ae7p3cMQQ47CgL6d34DCP/PR1unsGRvSbABWvFx+cW9BM5sD6bXtZtaG3sj9M\nhcbST1GqH0h9HSKja0pAMbMZZtZtZjuy388ucdxNZtZrZn1mdn9o+yoz+4WZ/dzMfmBmlQ8tSqDC\nteRf3XeI3oHDeceEO+ULw8ywk+vPCEYWhUdqFZuYmE7ZiHxgxS6qtV5ol3a0jRi19mTPvjG9RiW/\no9LO9VLnE/cRaSJx0Kwayv3AU+7eATyVfZzHzNLAauBmoAu4zcy6sru7gYXu/mvAq8DnG1LqJgmP\nhrpywTl5NZPZZ03mus42/vDaBXlBJxxggiayB9ZtZ9WG3hF34OGL5aSUYcDQsPP1n+zMG6lV7KJa\n64V2WVc7N18yK2/bDV3njek1KvkdlY4mK3U+cR+RJhIHTcnlZWa9wLXuvtfMZgHPuHtnwTFXAH/h\n7jdmH38ewN3/suC43wJ+x91vH+33Jj2XV3fPAA8+9WreCo23XDKL+TPP4MmefVx07jRmTm1l2uQW\nen51kLcOv8/MqafRdf5ZPPLT1zl6Yoi0kZttD6fyZAX9BlvfeCfv9S+ZfSb/9EdL88oQRWbgQqs2\n9Ob6Zu67sXP0J9SR8lyJ5Ks0l1ezAsoBd5+e/dmAd4LHoWN+B7jJ3f8g+/hTwBJ3v6fguH8C/pe7\nf6fE77oLuAtg7ty5H+nvLz7aKc4KO8zDjEzfQ2DujNPZ/c4Rhh3SBiuvXcCTPfvymsjCw4kL77Y/\n/a2f5adkMfjapxbrwioygTU9OaSZPQkUa7v4QviBu7uZVRXVzOwLwElgbalj3P1h4GHI1FCq+T3N\nVCyTb1guFIFbAAAMPElEQVThCb2x/0ju5yGH1U/35e0vHE5cGChWLJnHj3sHc/0w4f4XEZFy6hZQ\n3P2GUvvMbMDMZoWavN4sctgeYE7o8QXZbcFr/B5wK3C9j+Mc/MU6zFNWWXLIQgYMu7Ntz7t5C3IV\nNvH84XULMokos7UYdUCLSCWaNQ/lCeB3gS9lv68rcszzQIeZzScTSD4BrIDM6C/gc8A17n6kyHPH\njfBkQTM4ozXTYXz4/eI1lnKcUynuwyOYCudX3HdjJ5fOma5+BBEZk2aN8voSsMzMdgA3ZB9jZueb\n2XoAdz8J3ANsAF4B/t7dX84+/38A04BuM3vJzNY0+gQaJZhMmALcM4GkmmBSqDWdYmlH24hhso9t\n7ueBdduBsc0nERFpSg3F3d8Gri+y/VfALaHH64H
1RY5bUNcCxsyhYydGzC0Zi0kpOFnwAieHh3lp\n14G8GlB4aeCgtgLUZVSXiIw/WgI4Abp7BviPf7elqn6TclLA1+7IDNzYuGNwxNLA13W2sWnn/rxl\nZ+FUE1k6Zay85qKmD/OtlgKjSGUqHeWl1CsxF1z0LptbNJlATYaBxzb352bQF6azB8rOjB8adtY8\n05fINTzGmmpfREan5JAxVW7uSb0UW0kwXEMJtq3d/EZubZahhA4rribVvoiUp4ASQ6PNPYnKpGx+\nllUbevOW7Q1fWItlD155zUWseaaPIS+ebiUJTUlah0QkeupDiaEH1m3n0efqO6N/6mlpjp0Yzsti\nXGzmfCmlgkY4GI7l9ZohCYFPJA6aPlNeqhe+e56UyqyDEnWHPFheMIHiTT+lLrql1vCIsimp3hd8\nrUMiEi11ysdQOLPtQ5/8CF/71GIWzj4r0t9x+P2TI7YVNv2s2tDLyu+8MKaO68JsvUGW47F2eqvT\nXCR5VEOJqcK75407Btm+52Ddft/s6ZP5+GUX5M01CfpJYGxrxAf9LtMmt+SyHI91lUN1moskj2oo\nCbG0o4104UpUNQheKkWmc37PgWO5VRghc0EPp7lPp6zijutgGPKhYyeqXnxLC1qJJI8CSkIs62rn\nxoWzRj+wQumUcV1nG9d0tuXl9/rKhl+MWHQrbZmRXWOtIdQSFLSglUjyaJRXQtRjKHFn+1Ru6Dov\n1ywVCM+Kr7VTXCOpRJJPo7zGmWJp7GvVO3CYN/a/znUfPJf/1zfIgaOZjvqgeapUcsixBAmNpBKZ\nOBRQEmLa5Ja6vO7RE0Ns2L43r7+kXPNUuKY01o52ERnf1IeSEIeOnajba4eDyezpk8sGiWKjr0RE\nQAElMd46fLwur2ucSsECMHio/O/R6CsRKUVNXgnx2puH6vK6DrSfeRp7DhwD4PjQ8Kiz5Yvl9xIR\nUUBJiBu6zqN3oC/y153Skubjl12QG+lVWOso1WeiQCIihRRQEuK+Gzt5/a33WL9tb82vNXfG6fy7\nD5+fl2G41BrymrEuIpVSQEmQv719Ed09A/zXH71M//6jVb/ORW1njFhlsVStQ2neRaRSCigJs6yr\nnQefehWoPqBA5XNJ1GciIpVSQEmY7p4BXt7zbk2v0XX+WWOaS6I+ExGphIYNJ8zGHYPUsiBwazpF\nz68Oai6JiEROASVhwvNAqhGsTx/FmiUiImFKDplAQf/Hj3vfHHPnfGHix/CaJXFfsldEmqPS5JCq\noSRQsN7In9/6IcayRMq8GafnzSOpdc0SEZEwBZQEW9bVzsLZZ1Z8/O4DI2szSqUiIlFpSkAxsxlm\n1m1mO7Lfzy5x3E1m1mtmfWZ2f5H9f2pmbmYz61/qeLr3+otpTRf/N4ZzdAEMDfuIGogWshKRqDSr\nhnI/8JS7dwBPZR/nMbM0sBq4GegCbjOzrtD+OcBvAG80pMQxtayrndW3L2Lh7LPyti+cfRZLO/Lj\nbNooWgMJmr8UTESkFs0KKMuBb2d//jbw8SLHXA70uftOdz8OPJ59XuC/A58jk99wQlvW1U7b1Na8\nbW1TW1mxZN6pZXxTxsprFyhoiEjdNGtiY7u7B0mp9gHFrnKzgV2hx7uBJQBmthzY4+7/ajaWbumJ\npXCWO8AD67ZrxruI1EXdAoqZPQmcV2TXF8IP3N3NrOJahpmdDvwZmeauSo6/C7gLYO7cuZX+msRZ\nsWQez/a9zfGhYVrTKVYsmQecmuWulRZFpN7qFlDc/YZS+8xswMxmufteM5sFvFnksD3AnNDjC7Lb\nLgLmA0Ht5ALgRTO73N33FSnHw8DDkJmHUu35xF3Ql1Iq55ayBotIvTWryesJ4HeBL2W/rytyzPNA\nh5nNJxNIPgGscPeXgXODg8zsl8Bid3+r3oWOu3I5t5Q1WETqrVkB5UvA35vZ7wP9wH8AMLPzgW+4\n+y3uftLM7gE
2AGngkWwwkSooa7CI1JtSr4iISFlKvSIiIg2lgCIiIpFQQBERkUgooIiISCQUUERE\nJBIKKCIiEgkFFBERiYQCioiIREIBRUREIqGAIiIikVBAERGRSCigiIhIJBRQREQkEgooIiISCQUU\nERGJhAKKiIhEQgFFREQioYAiIiKRUEAREZFITKg15c1sEOhvdjlGMRN4q9mFaJCJdK4wsc5X5zq+\nzHP3ttEOmlABJQnMbIu7L252ORphIp0rTKzz1blOTGryEhGRSCigiIhIJBRQ4ufhZheggSbSucLE\nOl+d6wSkPhQREYmEaigiIhIJBZQmMLMZZtZtZjuy388ucdxNZtZrZn1mdn9o+yoz+4WZ/dzMfmBm\n0xtX+sqUKntov5nZg9n9PzezRZU+N26qPVczm2NmT5tZj5m9bGafbXzpx6aW/2t2f9rMtprZjxpX\n6urV+D6ebmbfz35WXzGzKxpb+iZwd301+Av4K+D+7M/3A18uckwaeA34ANAK/CvQld33G8Ck7M9f\nLvb8Jp9fybKHjrkF+D+AAR8FNlf63Dh91Xius4BF2Z+nAa+O13MN7f8T4DHgR80+n3qfL/Bt4A+y\nP7cC05t9TvX+Ug2lOZaTebOR/f7xIsdcDvS5+053Pw48nn0e7v5/3f1k9rhNwAV1Lu9YlSx7yHLg\nUc/YBEw3s1kVPjdOqj5Xd9/r7i8CuPsh4BVgdiMLP0a1/F8xswuAjwHfaGSha1D1+ZrZWcDVwDcB\n3P24ux9oZOGbQQGlOdrdfW/2531Ae5FjZgO7Qo93U/xicyeZO6Q4qaTspY6p9LzjopZzzTGzC4HL\ngM2RlzA6tZ7rV4HPAcP1KmDEajnf+cAg8K1sE983zOyMehY2DhRQ6sTMnjSz7UW+8u5wPFMfrmqo\nnZl9ATgJrI2gyNIkZjYV+Afgj9393WaXpx7M7FbgTXd/odllaZBJwCLgIXe/DHiPTPP2uDap2QUY\nr9z9hlL7zGwgaPLINge8WeSwPcCc0OMLstuC1/g94Fbg+mxQipOyZR/lmJYKnhsntZwrZtZCJpis\ndfd/rGM5o1DLuf428JtmdgswGTjTzL7j7p+sY3lrVcv5OrDb3YMa5/eZAAGl6Z04E/ELWEV+p/xf\nFTlmErCTTNU56BD8UHbfTUAP0NbscylxfiXLHjrmY+R3Zv6s0ufG6avGczXgUeCrzT6Pep9rwTHX\nkoxO+ZrOF9gIdGZ//gtgVbPPqe5/s2YXYCJ+AecATwE7gCeBGdnt5wPrQ8fdQmbkz2vAF0Lb+8i0\n276U/VrT7HMqco4jyg6sBFZmfzZgdXb/NmDxaOcd169qzxW4isyd7M9D/8tbmn0+9fq/hl4jEQGl\n1vMFLgW2ZP+/PwTObvb51PtLM+VFRCQS6pQXEZFIKKCIiEgkFFBERCQSCigiIhIJBRQREYmEAopI\nHZnZkJm9lM2S8D0zOz27/Twze9zMXjOzF8xsvZldXOT5j5jZm2a2vfGlFxkbBRSR+jrq7pe6+0Lg\nOLDSzAz4AfCMu1/k7h8BPk/xnG7/k8xEVpHYU+oVkcbZCPwacB1wwt3XBDvc/V+LPcHdf5JNHCkS\ne6qhiDSAmU0CbiYzm3ohMFGSJMoEooAiUl9TzOwlMik43iC7PobIeKQmL5H6Ourul4Y3mNnLwO80\nqTwidaMaikjj/QtwmpndFWwws18zs6VNLJNIzRRQRBrMMxlZfwu4ITts+GXgL8ms3pnHzL4LPAd0\nmtluM/v9xpZWpHLKNiwiIpFQDUVERCKhgCIiIpFQQBERkUgooIiISCQUUEREJBIKKCIiEgkFFBER\niYQCioiIROL/A2P8Ld9mMTL8AAAAAElFTkSuQmCC\n", 308 | "text/plain": [ 309 | "" 310 | ] 311 | }, 312 | "metadata": {}, 313 | "output_type": 
"display_data" 314 | } 315 | ], 316 | "source": [ 317 | "plt.scatter(v[:,0], v[:,1], s=10)\n", 318 | "plt.xlabel('PC 1')\n", 319 | "plt.ylabel('PC 2')" 320 | ] 321 | } 322 | ], 323 | "metadata": { 324 | "kernelspec": { 325 | "display_name": "Python 3", 326 | "language": "python", 327 | "name": "python3" 328 | }, 329 | "language_info": { 330 | "codemirror_mode": { 331 | "name": "ipython", 332 | "version": 3 333 | }, 334 | "file_extension": ".py", 335 | "mimetype": "text/x-python", 336 | "name": "python", 337 | "nbconvert_exporter": "python", 338 | "pygments_lexer": "ipython3", 339 | "version": "3.6.0" 340 | } 341 | }, 342 | "nbformat": 4, 343 | "nbformat_minor": 2 344 | } 345 | -------------------------------------------------------------------------------- /src/ReadByVariant.cpp: -------------------------------------------------------------------------------- 1 | // =========================================================== 2 | // 3 | // ReadByVariant.cpp: Read data variant by variant 4 | // 5 | // Copyright (C) 2017 Xiuwen Zheng 6 | // 7 | // This file is part of PySeqArray. 8 | // 9 | // PySeqArray is free software: you can redistribute it and/or modify it 10 | // under the terms of the GNU General Public License Version 3 as 11 | // published by the Free Software Foundation. 12 | // 13 | // PySeqArray is distributed in the hope that it will be useful, but 14 | // WITHOUT ANY WARRANTY; without even the implied warranty of 15 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | // GNU General Public License for more details. 17 | // 18 | // You should have received a copy of the GNU General Public License 19 | // along with PySeqArray. 20 | // If not, see . 
21 | 22 | #include "ReadByVariant.h" 23 | 24 | 25 | namespace PySeqArray 26 | { 27 | 28 | using namespace Vectorization; 29 | 30 | static const char *ERR_DIM = "Invalid dimension of '%s'."; 31 | // static const char *ERR_DIM_EX = "Invalid dimension of '%s': %s."; 32 | 33 | 34 | // ===================================================================== 35 | // Object for reading basic variables variant by variant 36 | 37 | /* 38 | CApply_Variant_Basic::CApply_Variant_Basic(CFileInfo &File, 39 | const char *var_name): CApply_Variant(File) 40 | { 41 | fVarType = ctBasic; 42 | Node = File.GetObj(var_name, TRUE); 43 | SVType = GDS_Array_GetSVType(Node); 44 | Reset(); 45 | } 46 | 47 | void CApply_Variant_Basic::ReadData(PyObject *val) 48 | { 49 | C_Int32 st = Position, one = 1; 50 | if (COREARRAY_SV_INTEGER(SVType)) 51 | { 52 | GDS_Array_ReadData(Node, &st, &one, INTEGER(val), svInt32); 53 | } else if (COREARRAY_SV_FLOAT(SVType)) 54 | { 55 | GDS_Array_ReadData(Node, &st, &one, REAL(val), svFloat64); 56 | } else if (COREARRAY_SV_STRING(SVType)) 57 | { 58 | string s; 59 | GDS_Array_ReadData(Node, &st, &one, &s, svStrUTF8); 60 | SET_STRING_ELT(val, 0, mkChar(s.c_str())); 61 | } 62 | } 63 | 64 | PyObject* CApply_Variant_Basic::NeedArray(int &nProtected) 65 | { 66 | if (VarNode == NULL) 67 | VarNode = RObject_GDS(Node, 1, nProtected, false); 68 | return VarNode; 69 | } 70 | */ 71 | 72 | /* 73 | // ==== 74 | 75 | CApply_Variant_Pos::CApply_Variant_Pos(CFileInfo &File): 76 | CApply_Variant(File) 77 | { 78 | fVarType = ctBasic; 79 | Node = File.GetObj("position", TRUE); 80 | PtrPos = &File.Position()[0]; 81 | VarNode = NULL; 82 | Reset(); 83 | } 84 | 85 | void CApply_Variant_Pos::ReadData(PyObject* val) 86 | { 87 | INTEGER(val)[0] = PtrPos[Position]; 88 | } 89 | 90 | PyObject* CApply_Variant_Pos::NeedArray(int &nProtected) 91 | { 92 | if (VarNode == NULL) 93 | { 94 | VarNode = PROTECT(NEW_INTEGER(1)); 95 | nProtected ++; 96 | } 97 | return VarNode; 98 | } 99 | 100 | // ==== 101 | 
102 | CApply_Variant_Chrom::CApply_Variant_Chrom(CFileInfo &File): 103 | CApply_Variant(File) 104 | { 105 | fVarType = ctBasic; 106 | Node = File.GetObj("chromosome", TRUE); 107 | ChromIndex = &File.Chromosome(); 108 | VarNode = NULL; 109 | Reset(); 110 | } 111 | 112 | void CApply_Variant_Chrom::ReadData(PyObject* val) 113 | { 114 | const string &s1 = (*ChromIndex)[Position]; 115 | const char *s2 = CHAR(STRING_ELT(val, 0)); 116 | if (s1 != s2) 117 | SET_STRING_ELT(val, 0, mkChar(s1.c_str())); 118 | } 119 | 120 | PyObject* CApply_Variant_Chrom::NeedArray(int &nProtected) 121 | { 122 | if (VarNode == NULL) 123 | { 124 | VarNode = PROTECT(mkString("")); 125 | nProtected ++; 126 | } 127 | return VarNode; 128 | } 129 | */ 130 | 131 | 132 | // ===================================================================== 133 | // Object for reading genotypes variant by variant 134 | 135 | CApply_Variant_Geno::CApply_Variant_Geno(): CApply_Variant() 136 | { 137 | fVarType = ctGenotype; 138 | SiteCount = CellCount = 0; SampNum = 0; Ploidy = 0; 139 | VarIntGeno = VarNode = NULL; 140 | } 141 | 142 | CApply_Variant_Geno::CApply_Variant_Geno(CFileInfo &File): 143 | CApply_Variant() 144 | { 145 | fVarType = ctGenotype; 146 | SiteCount = CellCount = 0; SampNum = 0; Ploidy = 0; 147 | VarIntGeno = VarNode = NULL; 148 | Init(File); 149 | } 150 | 151 | CApply_Variant_Geno::~CApply_Variant_Geno() 152 | { 153 | if (VarIntGeno) Py_DECREF(VarIntGeno); 154 | } 155 | 156 | void CApply_Variant_Geno::Init(CFileInfo &File) 157 | { 158 | static const char *VAR_NAME = "genotype/data"; 159 | 160 | // initialize 161 | Node = File.GetObj(VAR_NAME, TRUE); 162 | 163 | // check 164 | if (GDS_Array_DimCnt(Node) != 3) 165 | throw ErrSeqArray(ERR_DIM, VAR_NAME); 166 | C_Int32 DLen[3]; 167 | GDS_Array_GetDim(Node, DLen, 3); 168 | if ((DLen[0] < File.VariantNum()) || (DLen[1] != File.SampleNum())) 169 | throw ErrSeqArray(ERR_DIM, VAR_NAME); 170 | 171 | // initialize 172 | MarginalSize = File.VariantNum(); 173 | 
MarginalSelect = File.Selection().pVariant(); 174 | GenoIndex = &File.GenoIndex(); 175 | SiteCount = ssize_t(DLen[1]) * DLen[2]; 176 | SampNum = File.SampleSelNum(); 177 | CellCount = SampNum * DLen[2]; 178 | Ploidy = File.Ploidy(); 179 | 180 | // initialize selection 181 | Selection.resize(SiteCount); 182 | C_BOOL *p = &Selection[0]; 183 | memset(p, TRUE, SiteCount); 184 | C_BOOL *s = File.Selection().pSample(); 185 | for (int n=DLen[1]; n > 0; n--) 186 | { 187 | if (*s++ == FALSE) 188 | { 189 | for (int m=DLen[2]; m > 0; m--) *p ++ = FALSE; 190 | } else { 191 | p += DLen[2]; 192 | } 193 | } 194 | 195 | ExtPtr.reset(SiteCount); 196 | VarIntGeno = VarNode = NULL; 197 | Reset(); 198 | } 199 | 200 | int CApply_Variant_Geno::_ReadGenoData(int *Base) 201 | { 202 | C_UInt8 NumIndexRaw; 203 | C_Int64 Index; 204 | GenoIndex->GetInfo(Position, Index, NumIndexRaw); 205 | 206 | if (NumIndexRaw >= 1) 207 | { 208 | CdIterator it; 209 | GDS_Iter_Position(Node, &it, Index*SiteCount); 210 | GDS_Iter_RDataEx(&it, Base, SiteCount, svInt32, &Selection[0]); 211 | 212 | const int bit_mask = 0x03; 213 | int missing = bit_mask; 214 | for (C_UInt8 i=1; i < NumIndexRaw; i++) 215 | { 216 | GDS_Iter_RDataEx(&it, ExtPtr.get(), SiteCount, svUInt8, &Selection[0]); 217 | 218 | C_UInt8 shift = i * 2; 219 | C_UInt8 *s = (C_UInt8*)ExtPtr.get(); 220 | int *p = Base; 221 | for (ssize_t n=CellCount; n > 0; n--) 222 | *p++ |= int(*s++) << shift; 223 | 224 | missing = (missing << 2) | bit_mask; 225 | } 226 | 227 | return missing; 228 | } else { 229 | memset(Base, 0, sizeof(int)*CellCount); 230 | return 0; 231 | } 232 | } 233 | 234 | C_UInt8 CApply_Variant_Geno::_ReadGenoData(C_UInt8 *Base) 235 | { 236 | C_UInt8 NumIndexRaw; 237 | C_Int64 Index; 238 | GenoIndex->GetInfo(Position, Index, NumIndexRaw); 239 | 240 | if (NumIndexRaw >= 1) 241 | { 242 | CdIterator it; 243 | GDS_Iter_Position(Node, &it, Index*SiteCount); 244 | GDS_Iter_RDataEx(&it, Base, SiteCount, svUInt8, &Selection[0]); 245 | 246 | const 
C_UInt8 bit_mask = 0x03; 247 | C_UInt8 missing = bit_mask; 248 | if (NumIndexRaw > 4) NumIndexRaw = 4; 249 | 250 | for (C_UInt8 i=1; i < NumIndexRaw; i++) 251 | { 252 | GDS_Iter_RDataEx(&it, ExtPtr.get(), SiteCount, svUInt8, &Selection[0]); 253 | 254 | C_UInt8 shift = i * 2; 255 | C_UInt8 *s = (C_UInt8*)ExtPtr.get(); 256 | C_UInt8 *p = Base; 257 | for (ssize_t n=CellCount; n > 0; n--) 258 | *p++ |= (*s++) << shift; 259 | 260 | missing = (missing << 2) | bit_mask; 261 | } 262 | 263 | return missing; 264 | } else { 265 | memset(Base, 0, CellCount); 266 | return 0; 267 | } 268 | } 269 | 270 | void CApply_Variant_Geno::ReadGenoData(int *Base) 271 | { 272 | int missing = _ReadGenoData(Base); 273 | vec_i32_replace(Base, CellCount, missing, NA_INTEGER); 274 | } 275 | 276 | void CApply_Variant_Geno::ReadGenoData(C_UInt8 *Base) 277 | { 278 | C_UInt8 missing = _ReadGenoData(Base); 279 | vec_i8_replace((C_Int8*)Base, CellCount, missing, NA_UINT8); 280 | } 281 | 282 | PyObject* CApply_Variant_Geno::NeedArray() 283 | { 284 | C_UInt8 NumIndexRaw; 285 | C_Int64 Index; 286 | GenoIndex->GetInfo(Position, Index, NumIndexRaw); 287 | if (NumIndexRaw > 4) 288 | { 289 | if (!VarIntGeno) 290 | VarIntGeno = numpy_new_uint8_mat(SampNum, Ploidy); 291 | return VarIntGeno; 292 | } else { 293 | if (!VarNode) 294 | VarNode = numpy_new_int32_mat(SampNum, Ploidy); 295 | return VarNode; 296 | } 297 | } 298 | 299 | void CApply_Variant_Geno::ReadData(PyObject *val) 300 | { 301 | void *ptr = numpy_getptr(val); 302 | if (numpy_is_uint8(val)) 303 | ReadGenoData((C_UInt8*)val); 304 | else 305 | ReadGenoData((int*)ptr); 306 | } 307 | 308 | 309 | 310 | // ===================================================================== 311 | // Object for reading genotypes variant by variant 312 | 313 | CApply_Variant_Dosage::CApply_Variant_Dosage(CFileInfo &File): 314 | CApply_Variant_Geno(File) 315 | { 316 | fVarType = ctDosage; 317 | ExtPtr2.reset(sizeof(int)*CellCount); 318 | } 319 | 320 | PyObject* 
CApply_Variant_Dosage::NeedArray() 321 | { 322 | if (!VarNode) VarNode = numpy_new_uint8(SampNum); 323 | return VarNode; 324 | } 325 | 326 | void CApply_Variant_Dosage::ReadData(PyObject *val) 327 | { 328 | void *ptr = numpy_getptr(val); 329 | if (numpy_is_uint8(val)) 330 | ReadDosage((C_UInt8*)val); 331 | else 332 | ReadDosage((int*)ptr); 333 | } 334 | 335 | void CApply_Variant_Dosage::ReadDosage(int *Base) 336 | { 337 | int *p = (int *)ExtPtr2.get(); 338 | int missing = _ReadGenoData(p); 339 | 340 | // count the number of reference allele 341 | if (Ploidy == 2) // diploid 342 | { 343 | vec_i32_cnt_dosage2(p, Base, SampNum, 0, missing, NA_INTEGER); 344 | } else { 345 | for (int n=SampNum; n > 0; n--) 346 | { 347 | int cnt = 0; 348 | for (int m=Ploidy; m > 0; m--, p++) 349 | { 350 | if (*p == 0) 351 | { 352 | if (cnt != NA_INTEGER) 353 | cnt ++; 354 | } else if (*p == missing) 355 | cnt = NA_INTEGER; 356 | } 357 | *Base ++ = cnt; 358 | } 359 | } 360 | } 361 | 362 | void CApply_Variant_Dosage::ReadDosage(C_UInt8 *Base) 363 | { 364 | C_UInt8 *p = (C_UInt8 *)ExtPtr2.get(); 365 | C_UInt8 missing = _ReadGenoData(p); 366 | 367 | // count the number of reference allele 368 | if (Ploidy == 2) // diploid 369 | { 370 | vec_i8_cnt_dosage2((int8_t *)p, (int8_t *)Base, SampNum, 0, 371 | missing, NA_UINT8); 372 | } else { 373 | C_UInt8 *p = (C_UInt8 *)ExtPtr.get(); 374 | for (int n=SampNum; n > 0; n--) 375 | { 376 | C_UInt8 cnt = 0; 377 | for (int m=Ploidy; m > 0; m--, p++) 378 | { 379 | if (*p == 0) 380 | { 381 | if (cnt != NA_UINT8) 382 | cnt ++; 383 | } else if (*p == missing) 384 | cnt = NA_UINT8; 385 | } 386 | *Base ++ = cnt; 387 | } 388 | } 389 | } 390 | 391 | 392 | /* 393 | // ===================================================================== 394 | // Object for reading phasing information variant by variant 395 | 396 | CApply_Variant_Phase::CApply_Variant_Phase(): 397 | CApply_Variant() 398 | { 399 | fVarType = ctPhase; 400 | SiteCount = CellCount = 0; 401 | SampNum = 
0; Ploidy = 0; 402 | UseRaw = FALSE; 403 | VarPhase = NULL; 404 | } 405 | 406 | CApply_Variant_Phase::CApply_Variant_Phase(CFileInfo &File, bool use_raw): 407 | CApply_Variant() 408 | { 409 | fVarType = ctPhase; 410 | Init(File, use_raw); 411 | } 412 | 413 | void CApply_Variant_Phase::Init(CFileInfo &File, bool use_raw) 414 | { 415 | static const char *VAR_NAME = "phase/data"; 416 | 417 | // initialize 418 | Node = File.GetObj(VAR_NAME, TRUE); 419 | 420 | // check 421 | int DimCnt = GDS_Array_DimCnt(Node); 422 | if ((DimCnt != 2) && (DimCnt != 3)) 423 | throw ErrSeqArray(ERR_DIM, VAR_NAME); 424 | C_Int32 DLen[3] = { 0, 0, 1 }; 425 | GDS_Array_GetDim(Node, DLen, 3); 426 | if ((DLen[0] != File.VariantNum()) || (DLen[1] != File.SampleNum())) 427 | throw ErrSeqArray(ERR_DIM, VAR_NAME); 428 | 429 | // initialize 430 | MarginalSize = File.VariantNum(); 431 | MarginalSelect = File.Selection().pVariant(); 432 | SiteCount = ssize_t(DLen[1]) * DLen[2]; 433 | SampNum = File.SampleSelNum(); 434 | CellCount = SampNum * DLen[2]; 435 | Ploidy = File.Ploidy(); 436 | UseRaw = use_raw; 437 | 438 | // initialize selection 439 | Selection.resize(SiteCount); 440 | C_BOOL *p = &Selection[0]; 441 | memset(p, TRUE, SiteCount); 442 | C_BOOL *s = File.Selection().pSample(); 443 | for (int n=DLen[1]; n > 0; n--) 444 | { 445 | if (*s++ == FALSE) 446 | { 447 | for (int m=DLen[2]; m > 0; m--) *p ++ = FALSE; 448 | } else { 449 | p += DLen[2]; 450 | } 451 | } 452 | 453 | VarPhase = NULL; 454 | Reset(); 455 | } 456 | 457 | void CApply_Variant_Phase::ReadData(PyObject* val) 458 | { 459 | CdIterator it; 460 | GDS_Iter_Position(Node, &it, ssize_t(Position)*SiteCount); 461 | if (UseRaw) 462 | GDS_Iter_RDataEx(&it, RAW(val), SiteCount, svInt8, &Selection[0]); 463 | else 464 | GDS_Iter_RDataEx(&it, INTEGER(val), SiteCount, svInt32, &Selection[0]); 465 | } 466 | 467 | PyObject* CApply_Variant_Phase::NeedArray(int &nProtected) 468 | { 469 | if (VarPhase == NULL) 470 | { 471 | VarPhase = UseRaw ? 
NEW_RAW(CellCount) : NEW_INTEGER(CellCount); 472 | PROTECT(VarPhase); 473 | nProtected ++; 474 | if (Ploidy > 2) 475 | { 476 | PyObject* dim = NEW_INTEGER(2); 477 | int *p = INTEGER(dim); 478 | p[0] = Ploidy-1; p[1] = SampNum; 479 | SET_DIM(VarPhase, dim); 480 | } 481 | } 482 | return VarPhase; 483 | } 484 | 485 | 486 | 487 | // ===================================================================== 488 | // Object for reading info variables variant by variant 489 | 490 | CApply_Variant_Info::CApply_Variant_Info(CFileInfo &File, 491 | const char *var_name): CApply_Variant(File) 492 | { 493 | // initialize 494 | fVarType = ctInfo; 495 | Node = File.GetObj(var_name, TRUE); 496 | 497 | // check 498 | int DimCnt = GDS_Array_DimCnt(Node); 499 | if ((DimCnt != 1) && (DimCnt != 2)) 500 | throw ErrSeqArray(ERR_DIM, var_name); 501 | 502 | // initialize 503 | C_Int32 DLen[2]; 504 | GDS_Array_GetDim(Node, DLen, 2); 505 | BaseNum = (DimCnt == 2) ? DLen[1] : 1; 506 | VarIndex = &File.VarIndex(GDS_PATH_PREFIX(var_name, '@')); 507 | SVType = GDS_Array_GetSVType(Node); 508 | 509 | Reset(); 510 | } 511 | 512 | void CApply_Variant_Info::ReadData(PyObject* val) 513 | { 514 | C_Int64 IndexRaw; 515 | int NumIndexRaw; 516 | VarIndex->GetInfo(Position, IndexRaw, NumIndexRaw); 517 | 518 | if (NumIndexRaw > 0) 519 | { 520 | C_Int32 st[2] = { (C_Int32)IndexRaw, 0 }; 521 | C_Int32 cnt[2] = { NumIndexRaw, BaseNum }; 522 | 523 | if (COREARRAY_SV_INTEGER(SVType)) 524 | { 525 | GDS_Array_ReadData(Node, st, cnt, INTEGER(val), svInt32); 526 | } else if (COREARRAY_SV_FLOAT(SVType)) 527 | { 528 | GDS_Array_ReadData(Node, st, cnt, REAL(val), svFloat64); 529 | } else if (COREARRAY_SV_STRING(SVType)) 530 | { 531 | vector buffer(XLENGTH(val)); 532 | GDS_Array_ReadData(Node, st, cnt, &buffer[0], svStrUTF8); 533 | for (size_t i=0; i < buffer.size(); i++) 534 | SET_STRING_ELT(val, i, mkChar(buffer[i].c_str())); 535 | } 536 | } 537 | } 538 | 539 | PyObject* CApply_Variant_Info::NeedArray(int &nProtected) 540 
| { 541 | C_Int64 IndexRaw; 542 | int NumIndexRaw; 543 | VarIndex->GetInfo(Position, IndexRaw, NumIndexRaw); 544 | if (NumIndexRaw <= 0) return R_NilValue; 545 | 546 | map::iterator it = VarList.find(NumIndexRaw); 547 | if (it == VarList.end()) 548 | { 549 | PyObject* ans = RObject_GDS(Node, BaseNum*NumIndexRaw, nProtected, true); 550 | if (BaseNum > 1) 551 | { 552 | PyObject* dim = NEW_INTEGER(2); 553 | int *p = INTEGER(dim); 554 | p[0] = BaseNum; p[1] = NumIndexRaw; 555 | SET_DIM(ans, dim); 556 | } 557 | 558 | VarList.insert(pair(NumIndexRaw, ans)); 559 | return ans; 560 | } else 561 | return it->second; 562 | } 563 | 564 | 565 | 566 | // ===================================================================== 567 | // Object for reading format variables variant by variant 568 | 569 | CApply_Variant_Format::CApply_Variant_Format(): CApply_Variant() 570 | { 571 | fVarType = ctFormat; 572 | } 573 | 574 | CApply_Variant_Format::CApply_Variant_Format(CFileInfo &File, 575 | const char *var_name): CApply_Variant() 576 | { 577 | fVarType = ctFormat; 578 | Init(File, var_name); 579 | } 580 | 581 | void CApply_Variant_Format::Init(CFileInfo &File, const char *var_name) 582 | { 583 | // initialize 584 | Node = File.GetObj(var_name, TRUE); 585 | 586 | // check 587 | int DimCnt = GDS_Array_DimCnt(Node); 588 | if (DimCnt != 2) 589 | { 590 | if (DimCnt == 3) 591 | throw ErrSeqArray(ERR_DIM_EX, var_name, 592 | "3-dim format variable is not a formal variable, please rerun 'seqVCF2GDs()'"); 593 | else 594 | throw ErrSeqArray(ERR_DIM, var_name); 595 | } 596 | C_Int32 DLen[2]; 597 | GDS_Array_GetDim(Node, DLen, 2); 598 | if (DLen[1] != File.SampleNum()) 599 | throw ErrSeqArray(ERR_DIM, var_name); 600 | 601 | // initialize 602 | SVType = GDS_Array_GetSVType(Node); 603 | MarginalSize = File.VariantNum(); 604 | MarginalSelect = File.Selection().pVariant(); 605 | VarIndex = &File.VarIndex(GDS_PATH_PREFIX(var_name, '@')); 606 | SampNum = File.SampleSelNum(); 607 | _TotalSampNum = 
File.SampleNum(); 608 | 609 | // initialize selection 610 | SelPtr[0] = NULL; 611 | SelPtr[1] = File.Selection().pSample(); 612 | 613 | Reset(); 614 | } 615 | 616 | void CApply_Variant_Format::ReadData(PyObject* val) 617 | { 618 | C_Int64 IndexRaw; 619 | int NumIndexRaw; 620 | VarIndex->GetInfo(Position, IndexRaw, NumIndexRaw); 621 | 622 | if (NumIndexRaw > 0) 623 | { 624 | C_Int32 st[2] = { (C_Int32)IndexRaw, 0 }; 625 | C_Int32 cnt[2] = { NumIndexRaw, (C_Int32)_TotalSampNum }; 626 | SelPtr[0] = NeedTRUEs(NumIndexRaw); 627 | 628 | if (COREARRAY_SV_INTEGER(SVType)) 629 | { 630 | GDS_Array_ReadDataEx(Node, st, cnt, SelPtr, INTEGER(val), svInt32); 631 | } else if (COREARRAY_SV_FLOAT(SVType)) 632 | { 633 | GDS_Array_ReadDataEx(Node, st, cnt, SelPtr, REAL(val), svFloat64); 634 | } else if (COREARRAY_SV_STRING(SVType)) 635 | { 636 | vector buffer(XLENGTH(val)); 637 | GDS_Array_ReadDataEx(Node, st, cnt, SelPtr, &buffer[0], svStrUTF8); 638 | for (size_t i=0; i < buffer.size(); i++) 639 | SET_STRING_ELT(val, i, mkChar(buffer[i].c_str())); 640 | } 641 | } 642 | } 643 | 644 | PyObject* CApply_Variant_Format::NeedArray(int &nProtected) 645 | { 646 | C_Int64 IndexRaw; 647 | int NumIndexRaw; 648 | VarIndex->GetInfo(Position, IndexRaw, NumIndexRaw); 649 | if (NumIndexRaw <= 0) return R_NilValue; 650 | 651 | map::iterator it = VarList.find(NumIndexRaw); 652 | if (it == VarList.end()) 653 | { 654 | PyObject* ans = RObject_GDS(Node, SampNum*NumIndexRaw, nProtected, false); 655 | PyObject* dim = NEW_INTEGER(2); 656 | int *p = INTEGER(dim); 657 | p[0] = SampNum; p[1] = NumIndexRaw; 658 | SET_DIM(ans, dim); 659 | 660 | PyObject* name_list = PROTECT(NEW_LIST(2)); 661 | PyObject* tmp = PROTECT(NEW_CHARACTER(2)); 662 | SET_STRING_ELT(tmp, 0, mkChar("sample")); 663 | SET_STRING_ELT(tmp, 1, mkChar("index")); 664 | SET_NAMES(name_list, tmp); 665 | SET_DIMNAMES(ans, name_list); 666 | UNPROTECT(2); 667 | 668 | VarList.insert(pair(NumIndexRaw, ans)); 669 | return ans; 670 | } else 671 | return 
it->second; 672 | } 673 | */ 674 | 675 | 676 | // ===================================================================== 677 | // Object for reading format variables variant by variant 678 | 679 | CApply_Variant_NumAllele::CApply_Variant_NumAllele(CFileInfo &File): 680 | CApply_Variant(File) 681 | { 682 | strbuf.reserve(128); 683 | fVarType = ctBasic; 684 | Node = File.GetObj("allele", TRUE); 685 | Reset(); 686 | } 687 | 688 | PyObject* CApply_Variant_NumAllele::NeedArray() 689 | { 690 | if (!VarNode) VarNode = numpy_new_int32(1); 691 | return VarNode; 692 | } 693 | 694 | void CApply_Variant_NumAllele::ReadData(PyObject *val) 695 | { 696 | int *p = (int*)numpy_getptr(val); 697 | *p = GetNumAllele(); 698 | } 699 | 700 | int CApply_Variant_NumAllele::GetNumAllele() 701 | { 702 | C_Int32 st = Position, one = 1; 703 | GDS_Array_ReadData(Node, &st, &one, &strbuf, svStrUTF8); 704 | return GetNumOfAllele(strbuf.c_str()); 705 | } 706 | 707 | } 708 | 709 | 710 | extern "C" 711 | { 712 | 713 | /* 714 | using namespace PySeqArray; 715 | 716 | // =========================================================== 717 | // Apply functions over margins on a working space 718 | // =========================================================== 719 | 720 | COREARRAY_DLL_LOCAL const char *Txt_Apply_AsIs[] = 721 | { 722 | "none", "list", "integer", "double", "character", "logical", 723 | "raw", NULL 724 | }; 725 | 726 | COREARRAY_DLL_LOCAL const char *Txt_Apply_VarIdx[] = 727 | { 728 | "none", "relative", "absolute", NULL 729 | }; 730 | 731 | 732 | 733 | /// Apply functions over margins on a working space 734 | COREARRAY_DLL_EXPORT PyObject* SEQ_Apply_Variant(PyObject* gdsfile, PyObject* var_name, 735 | PyObject* FUN, PyObject* as_is, PyObject* var_index, PyObject* param, PyObject* rho) 736 | { 737 | PyObject* pam_use_raw = RGetListElement(param, "useraw"); 738 | if (!Rf_isLogical(pam_use_raw)) 739 | error("'.useraw' must be TRUE, FALSE or NA."); 740 | int use_raw_flag = 
Rf_asLogical(pam_use_raw); 741 | 742 | int prog_flag = Rf_asLogical(RGetListElement(param, "progress")); 743 | if (prog_flag == NA_LOGICAL) 744 | error("'.progress' must be TRUE or FALSE."); 745 | 746 | int dup_flag = Rf_asLogical(RGetListElement(param, "list_dup")); 747 | if (dup_flag == NA_LOGICAL) 748 | error("'.list_dup' must be TRUE or FALSE."); 749 | 750 | COREARRAY_TRY 751 | 752 | // the selection 753 | CFileInfo &File = GetFileInfo(gdsfile); 754 | 755 | // the number of calling PROTECT 756 | int nProtected = 0; 757 | 758 | // the number of selected variants 759 | int nVariant = File.VariantSelNum(); 760 | if (nVariant <= 0) 761 | throw ErrSeqArray("There is no selected variant."); 762 | 763 | 764 | // =========================================================== 765 | // initialize the GDS Node list 766 | 767 | CVarApplyList NodeList; 768 | 769 | // for-loop 770 | for (int i=0; i < Rf_length(var_name); i++) 771 | { 772 | // the path of GDS variable 773 | string s = CHAR(STRING_ELT(var_name, i)); 774 | 775 | if (s=="variant.id" || s=="allele" || s=="annotation/id" || 776 | s=="annotation/qual" || s=="annotation/filter") 777 | { 778 | NodeList.push_back( 779 | new CApply_Variant_Basic(File, s.c_str())); 780 | } else if (s == "position") 781 | { 782 | NodeList.push_back(new CApply_Variant_Pos(File)); 783 | } else if (s == "chromosome") 784 | { 785 | NodeList.push_back(new CApply_Variant_Chrom(File)); 786 | } else if (s == "genotype") 787 | { 788 | NodeList.push_back( 789 | new CApply_Variant_Geno(File, use_raw_flag)); 790 | } else if (s == "phase") 791 | { 792 | NodeList.push_back( 793 | new CApply_Variant_Phase(File, use_raw_flag!=FALSE)); 794 | } else if (strncmp(s.c_str(), "annotation/info/", 16) == 0) 795 | { 796 | NodeList.push_back( 797 | new CApply_Variant_Info(File, s.c_str())); 798 | } else if (strncmp(s.c_str(), "annotation/format/", 18) == 0) 799 | { 800 | s.append("/data"); 801 | NodeList.push_back( 802 | new CApply_Variant_Format(File, s.c_str())); 
803 | } else if (s == "$dosage") 804 | { 805 | NodeList.push_back( 806 | new CApply_Variant_Dosage(File, use_raw_flag)); 807 | } else if (s == "$num_allele") 808 | { 809 | NodeList.push_back(new CApply_Variant_NumAllele(File)); 810 | } else { 811 | throw ErrSeqArray( 812 | "'%s' is not a standard variable name, and the standard format:\n" 813 | " variant.id, position, chromosome, allele, annotation/id, annotation/qual, annotation/filter\n" 814 | " annotation/info/VARIABLE_NAME', annotation/format/VARIABLE_NAME", 815 | s.c_str()); 816 | } 817 | } 818 | 819 | 820 | // =========================================================== 821 | // as.is 822 | 823 | Rconnection OutputConn = NULL; 824 | PdGDSObj OutputGDS = NULL; 825 | int DatType; 826 | if (Rf_inherits(as_is, "connection")) 827 | { 828 | OutputConn = R_GetConnection(as_is); 829 | DatType = 7; 830 | } else if (Rf_inherits(as_is, "gdsn.class")) 831 | { 832 | OutputGDS = GDS_R_SEXP2Obj(as_is, FALSE); 833 | DatType = 8; 834 | } else { 835 | DatType = MatchText(CHAR(STRING_ELT(as_is, 0)), Txt_Apply_AsIs); 836 | if (DatType < 0) 837 | throw ErrSeqArray("'as.is' is not valid!"); 838 | } 839 | 840 | C_Int8 *R_rv_ptr = NULL; 841 | switch (DatType) 842 | { 843 | case 1: 844 | rv_ans = PROTECT(NEW_LIST(nVariant)); nProtected ++; 845 | break; 846 | case 2: 847 | rv_ans = PROTECT(NEW_INTEGER(nVariant)); nProtected ++; 848 | R_rv_ptr = (C_Int8 *)INTEGER(rv_ans); 849 | break; 850 | case 3: 851 | rv_ans = PROTECT(NEW_NUMERIC(nVariant)); nProtected ++; 852 | R_rv_ptr = (C_Int8 *)REAL(rv_ans); 853 | break; 854 | case 4: 855 | rv_ans = PROTECT(NEW_CHARACTER(nVariant)); nProtected ++; 856 | break; 857 | case 5: 858 | rv_ans = PROTECT(NEW_LOGICAL(nVariant)); nProtected ++; 859 | R_rv_ptr = (C_Int8 *)LOGICAL(rv_ans); 860 | break; 861 | case 6: 862 | rv_ans = PROTECT(NEW_RAW(nVariant)); nProtected ++; 863 | R_rv_ptr = (C_Int8 *)RAW(rv_ans); 864 | break; 865 | } 866 | 867 | // =========================================================== 
868 | // rho 869 | if (!isEnvironment(rho)) 870 | throw ErrSeqArray("'rho' should be an environment"); 871 | 872 | 873 | // =========================================================== 874 | // initialize calling 875 | 876 | PyObject* R_call_param = R_NilValue; 877 | if (NodeList.size() > 1) 878 | { 879 | PROTECT(R_call_param = NEW_LIST(NodeList.size())); 880 | nProtected ++; 881 | // set name to R_call_param 882 | SET_NAMES(R_call_param, GET_NAMES(var_name)); 883 | } 884 | 885 | // =============================================================== 886 | // var.index 887 | int VarIdx = MatchText(CHAR(STRING_ELT(var_index, 0)), Txt_Apply_VarIdx); 888 | if (VarIdx < 0) 889 | throw ErrSeqArray("'var.index' is not valid!"); 890 | 891 | PyObject* R_fcall; 892 | PyObject* R_Index = NULL; 893 | if (VarIdx > 0) 894 | { 895 | PROTECT(R_Index = NEW_INTEGER(1)); 896 | nProtected ++; 897 | PROTECT(R_fcall = LCONS(FUN, LCONS(R_Index, 898 | LCONS(R_call_param, LCONS(R_DotsSymbol, R_NilValue))))); 899 | nProtected ++; 900 | } else { 901 | PROTECT(R_fcall = LCONS(FUN, 902 | LCONS(R_call_param, LCONS(R_DotsSymbol, R_NilValue)))); 903 | nProtected ++; 904 | } 905 | 906 | map R_fcall_map; 907 | R_fcall_map[R_call_param] = R_fcall; 908 | 909 | 910 | // =========================================================== 911 | // for-loop calling 912 | 913 | CProgressStdOut progress(nVariant, prog_flag); 914 | 915 | int ans_index = 0; 916 | do { 917 | switch (VarIdx) 918 | { 919 | case 1: // relative 920 | INTEGER(R_Index)[0] = ans_index + 1; break; 921 | case 2: // absolute 922 | INTEGER(R_Index)[0] = NodeList[0]->Position + 1; break; 923 | } 924 | 925 | if (NodeList.size() <= 1) 926 | { 927 | R_call_param = NodeList[0]->NeedArray(nProtected); 928 | map::iterator it = R_fcall_map.find(R_call_param); 929 | if (it == R_fcall_map.end()) 930 | { 931 | if (VarIdx > 0) 932 | { 933 | PROTECT(R_fcall = LCONS(FUN, LCONS(R_Index, 934 | LCONS(R_call_param, LCONS(R_DotsSymbol, R_NilValue))))); 935 | } else { 
936 | PROTECT(R_fcall = LCONS(FUN, 937 | LCONS(R_call_param, LCONS(R_DotsSymbol, R_NilValue)))); 938 | } 939 | nProtected ++; 940 | R_fcall_map[R_call_param] = R_fcall; 941 | } else 942 | R_fcall = it->second; 943 | 944 | NodeList[0]->ReadData(R_call_param); 945 | 946 | } else { 947 | CVarApply **p = &NodeList[0]; 948 | size_t n = NodeList.size(); 949 | for (size_t i=0; i < n; i++, p++) 950 | { 951 | PyObject* tmp = (*p)->NeedArray(nProtected); 952 | (*p)->ReadData(tmp); 953 | SET_ELEMENT(R_call_param, i, tmp); 954 | } 955 | } 956 | 957 | // call R function 958 | PyObject* val = eval(R_fcall, rho); 959 | 960 | // store data 961 | switch (DatType) 962 | { 963 | case 1: // list 964 | if (dup_flag) val = duplicate(val); 965 | SET_ELEMENT(rv_ans, ans_index, val); 966 | break; 967 | case 2: // integer 968 | *((int*)R_rv_ptr) = Rf_asInteger(val); 969 | R_rv_ptr += sizeof(int); 970 | break; 971 | case 3: // double 972 | *((double*)R_rv_ptr) = Rf_asReal(val); 973 | R_rv_ptr += sizeof(double); 974 | break; 975 | case 4: // character 976 | SET_STRING_ELT(rv_ans, ans_index, Rf_asChar(val)); 977 | break; 978 | case 5: // logical 979 | *((int*)R_rv_ptr) = Rf_asLogical(val); 980 | R_rv_ptr += sizeof(int); 981 | break; 982 | case 6: // raw 983 | *R_rv_ptr = Rf_asInteger(val); 984 | R_rv_ptr ++; 985 | break; 986 | case 7: // connection 987 | if (OutputConn->text) 988 | { 989 | if (Rf_isList(val)) 990 | { 991 | throw ErrSeqArray("the user-defined function should return a character vector."); 992 | } else if (!Rf_isString(val)) 993 | { 994 | val = AS_CHARACTER(val); 995 | } 996 | size_t n = XLENGTH(val); 997 | for (size_t i=0; i < n; i++) 998 | { 999 | ConnPutText(OutputConn, "%s\n", CHAR(STRING_ELT(val, i))); 1000 | } 1001 | } else { 1002 | if (TYPEOF(val) != RAWSXP) 1003 | throw ErrSeqArray("the user-defined function should return a RAW vector."); 1004 | size_t n = XLENGTH(val); 1005 | size_t m = R_WriteConnection(OutputConn, RAW(val), n); 1006 | if (n != m) 1007 | throw 
ErrSeqArray("error in writing to a connection."); 1008 | } 1009 | break; 1010 | case 8: // gdsn.class 1011 | RAppendGDS(OutputGDS, val); 1012 | break; 1013 | } 1014 | ans_index ++; 1015 | 1016 | progress.Forward(); 1017 | 1018 | // check the end 1019 | } while (NodeList.CallNext()); 1020 | 1021 | // finally 1022 | UNPROTECT(nProtected); 1023 | 1024 | COREARRAY_CATCH 1025 | } 1026 | */ 1027 | 1028 | } // extern "C" 1029 | -------------------------------------------------------------------------------- /src/PySeqArray.cpp: -------------------------------------------------------------------------------- 1 | // =========================================================== 2 | // 3 | // PySeqArray.cpp: the C/C++ codes for the PySeqArray package 4 | // 5 | // Copyright (C) 2017 Xiuwen Zheng 6 | // 7 | // This file is part of PySeqArray. 8 | // 9 | // PySeqArray is free software: you can redistribute it and/or modify it 10 | // under the terms of the GNU General Public License Version 3 as 11 | // published by the Free Software Foundation. 12 | // 13 | // PySeqArray is distributed in the hope that it will be useful, but 14 | // WITHOUT ANY WARRANTY; without even the implied warranty of 15 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | // GNU General Public License for more details. 17 | // 18 | // You should have received a copy of the GNU General Public License 19 | // along with PySeqArray. 20 | // If not, see . 
21 | 22 | #include "Index.h" 23 | 24 | #include 25 | #include 26 | 27 | #include "ReadByVariant.h" 28 | // #include "ReadBySample.h" 29 | #include 30 | 31 | 32 | #define PY_EXPORT static 33 | 34 | 35 | // =========================================================== 36 | // Library Functions 37 | // =========================================================== 38 | 39 | extern "C" 40 | { 41 | 42 | using namespace CoreArray; 43 | using namespace PySeqArray; 44 | 45 | 46 | // =========================================================== 47 | // Open a GDS file 48 | // =========================================================== 49 | 50 | /// initialize a SeqArray file 51 | PY_EXPORT PyObject* SEQ_File_Init(PyObject *self, PyObject *args) 52 | { 53 | int file_id; 54 | if (!PyArg_ParseTuple(args, "i", &file_id)) 55 | return NULL; 56 | 57 | COREARRAY_TRY 58 | CFileInfo &file = GetFileInfo(file_id); 59 | file.Selection(); // force to initialize selection 60 | COREARRAY_CATCH_NONE 61 | } 62 | 63 | /// finalize a SeqArray file 64 | PY_EXPORT PyObject* SEQ_File_Done(PyObject *self, PyObject *args) 65 | { 66 | int file_id; 67 | if (!PyArg_ParseTuple(args, "i", &file_id)) 68 | return NULL; 69 | 70 | COREARRAY_TRY 71 | map::iterator p = GDSFile_ID_Info.find(file_id); 72 | if (p != GDSFile_ID_Info.end()) 73 | GDSFile_ID_Info.erase(p); 74 | COREARRAY_CATCH_NONE 75 | } 76 | 77 | 78 | 79 | // =========================================================== 80 | // Set a working space 81 | // =========================================================== 82 | 83 | /// push the current filter to the stack 84 | PY_EXPORT PyObject* SEQ_FilterPush(PyObject *self, PyObject *args) 85 | { 86 | int file_id; 87 | int new_flag; 88 | if (!PyArg_ParseTuple(args, "i" BSTR, &file_id, &new_flag)) return NULL; 89 | 90 | COREARRAY_TRY 91 | map::iterator it = GDSFile_ID_Info.find(file_id); 92 | if (it != GDSFile_ID_Info.end()) 93 | { 94 | if (new_flag || it->second.SelList.empty()) 95 | 
it->second.SelList.push_back(TSelection()); 96 | else 97 | it->second.SelList.push_back(it->second.SelList.back()); 98 | } else 99 | throw ErrSeqArray("The GDS file is closed or invalid."); 100 | COREARRAY_CATCH_NONE 101 | } 102 | 103 | 104 | /// pop up the previous filter from the stack 105 | PY_EXPORT PyObject* SEQ_FilterPop(PyObject *self, PyObject *args) 106 | { 107 | int file_id; 108 | if (!PyArg_ParseTuple(args, "i", &file_id)) return NULL; 109 | 110 | COREARRAY_TRY 111 | map::iterator it = GDSFile_ID_Info.find(file_id); 112 | if (it != GDSFile_ID_Info.end()) 113 | { 114 | if (it->second.SelList.size() <= 1) 115 | throw ErrSeqArray("No filter can be pop up."); 116 | it->second.SelList.pop_back(); 117 | } else 118 | throw ErrSeqArray("The GDS file is closed or invalid."); 119 | COREARRAY_CATCH_NONE 120 | } 121 | 122 | 123 | /// set a working space with selected sample id 124 | PY_EXPORT PyObject* SEQ_SetSpaceSample(PyObject *self, PyObject *args) 125 | { 126 | int file_id; 127 | PyObject *samp_id; 128 | int intersect, verbose; 129 | if (!PyArg_ParseTuple(args, "iO" BSTR BSTR, &file_id, &samp_id, &intersect, &verbose)) 130 | return NULL; 131 | 132 | COREARRAY_TRY 133 | 134 | CFileInfo &File = GetFileInfo(file_id); 135 | TSelection &Sel = File.Selection(); 136 | C_BOOL *pArray = Sel.pSample(); 137 | int Count = File.SampleNum(); 138 | PdAbstractArray varSamp = File.GetObj("sample.id", TRUE); 139 | 140 | if (samp_id == Py_None) 141 | { 142 | memset(pArray, TRUE, Count); 143 | } else if (numpy_is_array_or_list(samp_id)) 144 | { 145 | if (numpy_is_array_int(samp_id)) 146 | { 147 | // initialize 148 | set set_id; 149 | { 150 | vector ary; 151 | numpy_to_int32(samp_id, ary); 152 | set_id.insert(ary.begin(), ary.end()); 153 | } 154 | 155 | // sample id 156 | vector sample_id(Count); 157 | C_Int32 _st=0, _cnt=Count; 158 | GDS_Array_ReadData(varSamp, &_st, &_cnt, &sample_id[0], svInt32); 159 | 160 | // set selection 161 | if (!intersect) 162 | { 163 | for (int i=0; i < 
Count; i++) 164 | *pArray++ = (set_id.find(sample_id[i]) != set_id.end()); 165 | } else { 166 | for (int i=0; i < Count; i++, pArray++) 167 | { 168 | if (*pArray) 169 | *pArray = (set_id.find(sample_id[i]) != set_id.end()); 170 | } 171 | } 172 | } else { 173 | // initialize 174 | set set_id; 175 | { 176 | vector ary; 177 | numpy_to_string(samp_id, ary); 178 | set_id.insert(ary.begin(), ary.end()); 179 | } 180 | 181 | // sample id 182 | vector sample_id(Count); 183 | C_Int32 _st=0, _cnt=Count; 184 | GDS_Array_ReadData(varSamp, &_st, &_cnt, &sample_id[0], svStrUTF8); 185 | 186 | // set selection 187 | if (!intersect) 188 | { 189 | for (int i=0; i < Count; i++) 190 | *pArray++ = (set_id.find(sample_id[i]) != set_id.end()); 191 | } else { 192 | for (int i=0; i < Count; i++, pArray++) 193 | { 194 | if (*pArray) 195 | *pArray = (set_id.find(sample_id[i]) != set_id.end()); 196 | } 197 | } 198 | } 199 | } else 200 | throw ErrSeqArray("Invalid type of 'sample.id'."); 201 | 202 | if (verbose) 203 | { 204 | int n = File.SampleSelNum(); 205 | printf("# of selected samples: %s\n", PrettyInt(n)); 206 | } 207 | 208 | COREARRAY_CATCH_NONE 209 | } 210 | 211 | 212 | /// set a working space with selected sample id (bool vector or index) 213 | PY_EXPORT PyObject* SEQ_SetSpaceSample2(PyObject *self, PyObject *args) 214 | { 215 | int file_id; 216 | PyObject *samp_sel; 217 | int intersect, verbose; 218 | if (!PyArg_ParseTuple(args, "iO" BSTR BSTR, &file_id, &samp_sel, &intersect, &verbose)) 219 | return NULL; 220 | 221 | COREARRAY_TRY 222 | 223 | CFileInfo &File = GetFileInfo(file_id); 224 | TSelection &Sel = File.Selection(); 225 | C_BOOL *pArray = Sel.pSample(); 226 | int Count = File.SampleNum(); 227 | 228 | if (numpy_is_bool(samp_sel)) 229 | { 230 | // a logical vector for selected samples 231 | if (!intersect) 232 | { 233 | if (numpy_size(samp_sel) != (size_t)Count) 234 | throw ErrSeqArray("Invalid length of 'sample'."); 235 | memcpy(pArray, numpy_getptr(samp_sel), Count); 236 | } 
else { 237 | if (numpy_size(samp_sel) != (size_t)File.SampleSelNum()) 238 | { 239 | throw ErrSeqArray( 240 | "Invalid length of 'sample' (should be equal to the number of selected samples)."); 241 | } 242 | C_BOOL *base = (C_BOOL*)numpy_getptr(samp_sel); 243 | for (int i=0; i < Count; i++) 244 | { 245 | if (*pArray) 246 | *pArray = ((*base++) != 0); 247 | } 248 | } 249 | } else if (numpy_is_int(samp_sel)) 250 | { 251 | vector idx; 252 | numpy_to_int32(samp_sel, idx); 253 | 254 | if (!intersect) 255 | { 256 | int *pI = &idx[0]; 257 | size_t N = idx.size(); 258 | // check 259 | for (size_t i=0; i < N; i++) 260 | { 261 | int I = *pI ++; 262 | if ((I < 0) || (I >= Count)) 263 | throw ErrSeqArray("Out of range 'sample'."); 264 | } 265 | // set values 266 | memset((void*)pArray, 0, Count); 267 | pI = &idx[0]; 268 | for (size_t i=0; i < N; i++) 269 | pArray[*pI++] = TRUE; 270 | } else { 271 | int Cnt = File.SampleSelNum(); 272 | int *pI = &idx[0]; 273 | size_t N = idx.size(); 274 | // check 275 | for (size_t i=0; i < N; i++) 276 | { 277 | int I = *pI ++; 278 | if ((I < 0) || (I >= Cnt)) 279 | throw ErrSeqArray("Out of range 'sample'."); 280 | } 281 | // get the current index 282 | vector Idx; 283 | Idx.reserve(Cnt); 284 | for (int i=0; i < Count; i++) 285 | { 286 | if (pArray[i]) Idx.push_back(i); 287 | } 288 | // set values 289 | memset((void*)pArray, 0, Count); 290 | pI = &idx[0]; 291 | for (size_t i=0; i < N; i++) 292 | pArray[Idx[*pI++]] = TRUE; 293 | } 294 | } else if (samp_sel == Py_None) 295 | { 296 | memset(pArray, TRUE, Count); 297 | } else 298 | throw ErrSeqArray("Invalid type of 'sample'."); 299 | 300 | if (verbose) 301 | { 302 | int n = File.SampleSelNum(); 303 | printf("# of selected samples: %s\n", PrettyInt(n)); 304 | } 305 | 306 | COREARRAY_CATCH_NONE 307 | } 308 | 309 | 310 | /// set a working space with selected variant id 311 | PY_EXPORT PyObject* SEQ_SetSpaceVariant(PyObject *self, PyObject *args) 312 | { 313 | int file_id; 314 | PyObject *variant_id; 
315 | int intersect, verbose; 316 | if (!PyArg_ParseTuple(args, "iO" BSTR BSTR, &file_id, &variant_id, &intersect, &verbose)) 317 | return NULL; 318 | 319 | COREARRAY_TRY 320 | 321 | CFileInfo &File = GetFileInfo(file_id); 322 | TSelection &Sel = File.Selection(); 323 | C_BOOL *pArray = Sel.pVariant(); 324 | int Count = File.VariantNum(); 325 | PdAbstractArray varVariant = File.GetObj("variant.id", TRUE); 326 | 327 | if (variant_id == Py_None) 328 | { 329 | memset(pArray, TRUE, Count); 330 | } else if (numpy_is_array_or_list(variant_id)) 331 | { 332 | if (numpy_is_array_int(variant_id)) 333 | { 334 | // initialize 335 | set set_id; 336 | { 337 | vector ary; 338 | numpy_to_int32(variant_id, ary); 339 | set_id.insert(ary.begin(), ary.end()); 340 | } 341 | 342 | // variant id 343 | vector var_id(Count); 344 | C_Int32 _st=0, _cnt=Count; 345 | GDS_Array_ReadData(varVariant, &_st, &_cnt, &var_id[0], svInt32); 346 | 347 | // set selection 348 | if (!intersect) 349 | { 350 | for (int i=0; i < Count; i++) 351 | *pArray++ = (set_id.find(var_id[i]) != set_id.end()); 352 | } else { 353 | for (int i=0; i < Count; i++, pArray++) 354 | { 355 | if (*pArray) 356 | *pArray = (set_id.find(var_id[i]) != set_id.end()); 357 | } 358 | } 359 | } else { 360 | // initialize 361 | set set_id; 362 | { 363 | vector ary; 364 | numpy_to_string(variant_id, ary); 365 | set_id.insert(ary.begin(), ary.end()); 366 | } 367 | 368 | // variant id 369 | vector var_id(Count); 370 | C_Int32 _st=0, _cnt=Count; 371 | GDS_Array_ReadData(varVariant, &_st, &_cnt, &var_id[0], svStrUTF8); 372 | 373 | // set selection 374 | if (!intersect) 375 | { 376 | for (int i=0; i < Count; i++) 377 | *pArray++ = (set_id.find(var_id[i]) != set_id.end()); 378 | } else { 379 | for (int i=0; i < Count; i++, pArray++) 380 | { 381 | if (*pArray) 382 | *pArray = (set_id.find(var_id[i]) != set_id.end()); 383 | } 384 | } 385 | } 386 | } else 387 | throw ErrSeqArray("Invalid type of 'variant.id'."); 388 | 389 | if (verbose) 390 | { 391 
| int n = File.VariantSelNum(); 392 | printf("# of selected variants: %s\n", PrettyInt(n)); 393 | } 394 | 395 | COREARRAY_CATCH_NONE 396 | } 397 | 398 | 399 | /// set a working space with selected variant (bool vector or index) 400 | PY_EXPORT PyObject* SEQ_SetSpaceVariant2(PyObject *self, PyObject *args) 401 | { 402 | int file_id; 403 | PyObject *var_sel; 404 | int intersect, verbose; 405 | if (!PyArg_ParseTuple(args, "iO" BSTR BSTR, &file_id, &var_sel, &intersect, &verbose)) 406 | return NULL; 407 | 408 | COREARRAY_TRY 409 | 410 | CFileInfo &File = GetFileInfo(file_id); 411 | TSelection &Sel = File.Selection(); 412 | C_BOOL *pArray = Sel.pVariant(); 413 | int Count = File.VariantNum(); 414 | 415 | if (numpy_is_bool(var_sel)) 416 | { 417 | // a logical vector for selected samples 418 | if (!intersect) 419 | { 420 | if (numpy_size(var_sel) != (size_t)Count) 421 | throw ErrSeqArray("Invalid length of 'variant.sel'."); 422 | memcpy(pArray, numpy_getptr(var_sel), Count); 423 | } else { 424 | if (numpy_size(var_sel) != (size_t)File.VariantSelNum()) 425 | { 426 | throw ErrSeqArray( 427 | "Invalid length of 'variant' (should be equal to the number of selected variants)."); 428 | } 429 | // set selection 430 | C_BOOL *base = (C_BOOL*)numpy_getptr(var_sel); 431 | for (int i=0; i < Count; i++) 432 | { 433 | if (*pArray) 434 | *pArray = ((*base++) != 0); 435 | } 436 | } 437 | } else if (numpy_is_int(var_sel)) 438 | { 439 | vector idx; 440 | numpy_to_int32(var_sel, idx); 441 | 442 | if (!intersect) 443 | { 444 | int *pI = &idx[0]; 445 | size_t N = idx.size(); 446 | // check 447 | for (size_t i=0; i < N; i++) 448 | { 449 | int I = *pI ++; 450 | if ((I < 0) || (I >= Count)) 451 | throw ErrSeqArray("Out of range 'variant'."); 452 | } 453 | // set values 454 | memset((void*)pArray, 0, Count); 455 | pI = &idx[0]; 456 | for (size_t i=0; i < N; i++) 457 | pArray[*pI++] = TRUE; 458 | } else { 459 | int Cnt = File.VariantSelNum(); 460 | int *pI = &idx[0]; 461 | size_t N = idx.size(); 
462 | // check 463 | for (size_t i=0; i < N; i++) 464 | { 465 | int I = *pI ++; 466 | if ((I < 0) || (I >= Cnt)) 467 | throw ErrSeqArray("Out of range 'variant'."); 468 | } 469 | // get the current index 470 | vector Idx; 471 | Idx.reserve(Cnt); 472 | for (int i=0; i < Count; i++) 473 | { 474 | if (pArray[i]) Idx.push_back(i); 475 | } 476 | // set values 477 | memset((void*)pArray, 0, Count); 478 | pI = &idx[0]; 479 | for (size_t i=0; i < N; i++) 480 | pArray[Idx[*pI++]] = TRUE; 481 | } 482 | } else if (var_sel == Py_None) 483 | { 484 | memset(pArray, TRUE, Count); 485 | } else 486 | throw ErrSeqArray("Invalid type of 'variant'."); 487 | 488 | if (verbose) 489 | { 490 | int n = File.VariantSelNum(); 491 | printf("# of selected variants: %s\n", PrettyInt(n)); 492 | } 493 | 494 | COREARRAY_CATCH_NONE 495 | } 496 | 497 | /* 498 | // ================================================================ 499 | 500 | static bool is_numeric(const string &txt) 501 | { 502 | char *endptr = (char*)(txt.c_str()); 503 | strtol(txt.c_str(), &endptr, 10); 504 | return (endptr != txt.c_str()) && (*endptr == 0); 505 | } 506 | 507 | /// set a working space flag with selected chromosome(s) 508 | PY_EXPORT PyObject* SEQ_SetChrom(PyObject* gdsfile, PyObject* include, 509 | PyObject* is_num, PyObject* frombp, PyObject* tobp, PyObject* intersect, PyObject* verbose) 510 | { 511 | int nProtected = 0; 512 | int *pFrom=NULL, *pTo=NULL; 513 | 514 | int IsNum = Rf_asLogical(is_num); 515 | int IsIntersect = Rf_asLogical(intersect); 516 | if (IsIntersect == NA_INTEGER) 517 | error("'intersect' should be either FALSE or TRUE."); 518 | 519 | if (Rf_isNull(include)) 520 | { 521 | if (!Rf_isNull(frombp)) 522 | error("'from.bp' should be NULL."); 523 | if (!Rf_isNull(tobp)) 524 | error("'to.bp' should be NULL."); 525 | } else { 526 | include = PROTECT(AS_CHARACTER(include)); 527 | nProtected ++; 528 | if (!Rf_isNull(frombp) || !Rf_isNull(tobp)) 529 | { 530 | if (RLength(include) != RLength(frombp)) 531 | 
error("'from.bp' should have the same length as 'include'."); 532 | if (RLength(include) != RLength(tobp)) 533 | error("'to.bp' should have the same length as 'include'."); 534 | frombp = PROTECT(AS_INTEGER(frombp)); 535 | tobp = PROTECT(AS_INTEGER(tobp)); 536 | pFrom = INTEGER(frombp); pTo = INTEGER(tobp); 537 | nProtected += 2; 538 | } 539 | } 540 | 541 | COREARRAY_TRY 542 | 543 | CFileInfo &File = GetFileInfo(gdsfile); 544 | TSelection &Sel = File.Selection(); 545 | 546 | vector &sel_array = Sel.Variant; 547 | vector tmp_array; 548 | if (IsIntersect) tmp_array.resize(sel_array.size()); 549 | 550 | vector &array = IsIntersect ? tmp_array : sel_array; 551 | memset(&array[0], FALSE, array.size()); 552 | 553 | if (Rf_isNull(include)) 554 | { 555 | // include = NULL 556 | if (IsNum == NA_INTEGER) 557 | { 558 | memset(&array[0], TRUE, array.size()); 559 | } else { 560 | CChromIndex &Chrom = File.Chromosome(); 561 | map::iterator it; 562 | for (it=Chrom.Map.begin(); it != Chrom.Map.end(); it++) 563 | { 564 | bool flag = is_numeric(it->first); 565 | if (((IsNum==TRUE) && flag) || ((IsNum==FALSE) && !flag)) 566 | { 567 | CChromIndex::TRangeList &rng = it->second; 568 | vector::iterator it; 569 | for (it=rng.begin(); it != rng.end(); it++) 570 | { 571 | memset(&array[it->Start], TRUE, it->Length); 572 | } 573 | } 574 | } 575 | } 576 | 577 | } else { 578 | // include != NULL 579 | vector *varPos = NULL; 580 | if (pFrom && pTo) 581 | varPos = &File.Position(); 582 | 583 | CChromIndex &Chrom = File.Chromosome(); 584 | map RngSets; 585 | 586 | size_t n = XLENGTH(include); 587 | for (size_t idx=0; idx < n; idx++) 588 | { 589 | string s = CHAR(STRING_ELT(include, idx)); 590 | 591 | if (IsNum == TRUE) 592 | { 593 | if (!is_numeric(s)) continue; 594 | } else if (IsNum == FALSE) 595 | { 596 | if (is_numeric(s)) continue; 597 | } 598 | 599 | map::iterator it = 600 | Chrom.Map.find(s); 601 | if (it != Chrom.Map.end()) 602 | { 603 | if (varPos) 604 | { 605 | // if from.bp and to.bp 
606 | int from = pFrom[idx], to = pTo[idx]; 607 | if (from == NA_INTEGER) from = 0; 608 | if (to == NA_INTEGER) to = 2147483647; 609 | RngSets[s].AddRange(from, to); 610 | } else { 611 | // no from.bp and to.bp 612 | CChromIndex::TRangeList &rng = it->second; 613 | vector::iterator p; 614 | for (p=rng.begin(); p != rng.end(); p++) 615 | { 616 | memset(&array[p->Start], TRUE, p->Length); 617 | } 618 | } 619 | } 620 | } 621 | 622 | if (varPos) 623 | { 624 | map::iterator it; 625 | for (it=RngSets.begin(); it != RngSets.end(); it++) 626 | { 627 | CChromIndex::TRangeList &rng = Chrom.Map[it->first]; 628 | CRangeSet &RngSet = it->second; 629 | vector::const_iterator p; 630 | for (p=rng.begin(); p != rng.end(); p++) 631 | { 632 | size_t i = p->Start; 633 | size_t n = p->Length; 634 | C_Int32 *s = &((*varPos)[0]) + i; 635 | if (!IsIntersect) 636 | { 637 | for (; n > 0; n--, i++) 638 | if (RngSet.IsIncluded(*s++)) array[i] = TRUE; 639 | } else { 640 | C_BOOL *b = &sel_array[i]; 641 | for (; n > 0; n--, i++, s++) 642 | { 643 | if (*b++) 644 | if (RngSet.IsIncluded(*s)) array[i] = TRUE; 645 | } 646 | } 647 | } 648 | } 649 | } 650 | } 651 | 652 | if (IsIntersect) 653 | { 654 | C_BOOL *p = &sel_array[0]; 655 | C_BOOL *s = &array[0]; 656 | for (size_t n=sel_array.size(); n > 0; n--) 657 | (*p++) &= (*s++); 658 | } 659 | 660 | if (Rf_asLogical(verbose) == TRUE) 661 | { 662 | int n = GetNumOfTRUE(&sel_array[0], sel_array.size()); 663 | Rprintf("# of selected variants: %s\n", PrettyInt(n)); 664 | } 665 | 666 | UNPROTECT(nProtected); 667 | 668 | COREARRAY_CATCH 669 | } 670 | */ 671 | 672 | 673 | // ================================================================ 674 | 675 | /// set a working space flag with selected variant id 676 | PY_EXPORT PyObject* SEQ_GetSpace(PyObject *self, PyObject *args) 677 | { 678 | int file_id; 679 | int sample; 680 | if (!PyArg_ParseTuple(args, "i" BSTR, &file_id, &sample)) 681 | return NULL; 682 | 683 | COREARRAY_TRY 684 | 685 | CFileInfo &File = 
GetFileInfo(file_id); 686 | TSelection &Sel = File.Selection(); 687 | 688 | // output 689 | PyObject *rv_ans; 690 | if (sample) 691 | { 692 | size_t n = File.SampleNum(); 693 | rv_ans = numpy_new_bool(n); 694 | memcpy(numpy_getptr(rv_ans), Sel.pSample(), n); 695 | } else { 696 | size_t n = File.VariantNum(); 697 | rv_ans = numpy_new_bool(n); 698 | memcpy(numpy_getptr(rv_ans), Sel.pVariant(), n); 699 | } 700 | return rv_ans; 701 | 702 | COREARRAY_CATCH_NONE 703 | } 704 | 705 | 706 | 707 | // =========================================================== 708 | 709 | inline static C_BOOL *CLEAR_SELECTION(size_t num, C_BOOL *p) 710 | { 711 | while (num > 0) 712 | { 713 | if (*p != FALSE) { num--; *p = FALSE; } 714 | p ++; 715 | } 716 | return p; 717 | } 718 | inline static C_BOOL *SKIP_SELECTION(size_t num, C_BOOL *p) 719 | { 720 | while (num > 0) 721 | { 722 | if (*p != FALSE) num--; 723 | p ++; 724 | } 725 | return p; 726 | } 727 | 728 | /// split the selected variants according to multiple processes 729 | PY_EXPORT PyObject* SEQ_SplitSelection(PyObject *self, PyObject *args) 730 | { 731 | int file_id, proc_idx, proc_ncpu; 732 | const char *split; 733 | if (!PyArg_ParseTuple(args, "iiis", &file_id, &proc_idx, &proc_ncpu, &split)) 734 | return NULL; 735 | 736 | COREARRAY_TRY 737 | 738 | // selection object 739 | CFileInfo &File = GetFileInfo(file_id); 740 | TSelection &s = File.Selection(); 741 | 742 | // the total number of selected elements 743 | int SelectCount; 744 | C_BOOL *sel; 745 | if (strcmp(split, "by.variant") == 0) 746 | { 747 | sel = &s.Variant[0]; 748 | SelectCount = GetNumOfTRUE(sel, s.Variant.size()); 749 | } else if (strcmp(split, "by.sample") == 0) 750 | { 751 | sel = &s.Sample[0]; 752 | SelectCount = GetNumOfTRUE(sel, s.Sample.size()); 753 | } else if (strcmp(split, "none") == 0) 754 | { 755 | Py_RETURN_NONE; 756 | } else { 757 | throw ErrSeqArray("'split' should be 'by.variant', 'by.sample' or 'none'."); 758 | } 759 | 760 | // split a list 761 | 
vector split(proc_ncpu); 762 | double avg = (double)SelectCount / proc_ncpu; 763 | double start = 0; 764 | for (int i=0; i < proc_ncpu; i++) 765 | { 766 | start += avg; 767 | split[i] = (int)(start + 0.5); 768 | } 769 | 770 | // --------------------------------------------------- 771 | int st = 0; 772 | for (int i=0; i < proc_idx; i++) 773 | { 774 | sel = CLEAR_SELECTION(split[i] - st, sel); 775 | st = split[i]; 776 | } 777 | int ans_n = split[proc_idx] - st; 778 | sel = SKIP_SELECTION(ans_n, sel); 779 | st = split[proc_idx]; 780 | for (int i=proc_idx+1; i < proc_ncpu; i++) 781 | { 782 | sel = CLEAR_SELECTION(split[i] - st, sel); 783 | st = split[i]; 784 | } 785 | 786 | /* 787 | // --------------------------------------------------- 788 | // output 789 | if (SelFlag == TRUE) 790 | { 791 | rv_ans = NEW_LOGICAL(SelectCount); 792 | int *p = INTEGER(rv_ans); 793 | memset((void*)p, 0, sizeof(int) * size_t(SelectCount)); 794 | if (proc_idx > 0) 795 | p += split[proc_idx-1]; 796 | for (; ans_n > 0; ans_n--) *p++ = TRUE; 797 | } else { 798 | rv_ans = ScalarInteger(ans_n); 799 | } 800 | */ 801 | 802 | COREARRAY_CATCH_NONE 803 | } 804 | 805 | /* 806 | /// set a working space with selected variant id 807 | PY_EXPORT PyObject* SEQ_Summary(PyObject* gdsfile, PyObject* varname) 808 | { 809 | COREARRAY_TRY 810 | 811 | // the selection 812 | CFileInfo &File = GetFileInfo(gdsfile); 813 | TSelection &Sel = File.Selection(); 814 | // the GDS root node 815 | PdGDSFolder Root = GDS_R_SEXP2FileRoot(gdsfile); 816 | // the variable name 817 | string vn = CHAR(STRING_ELT(varname, 0)); 818 | 819 | if ((vn=="genotype") || (vn=="phase")) 820 | { 821 | PdGDSObj vGeno = GDS_Node_Path(Root, "genotype/data", TRUE); 822 | if (vGeno == NULL) 823 | { 824 | vGeno = GDS_Node_Path(Root, "genotype/~data", FALSE); 825 | if (vGeno == NULL) 826 | { 827 | throw ErrSeqArray( 828 | "There is no 'genotype/data' or 'genotype/~data'."); 829 | } 830 | } 831 | 832 | PROTECT(rv_ans = NEW_LIST(2)); 833 | 834 | 
PyObject* I32 = PROTECT(NEW_INTEGER(3)); 835 | SET_ELEMENT(rv_ans, 0, I32); 836 | C_Int32 Buf[4]; 837 | GDS_Array_GetDim(vGeno, Buf, 3); 838 | INTEGER(I32)[0] = Buf[2]; 839 | INTEGER(I32)[1] = Sel.Sample.size(); 840 | INTEGER(I32)[2] = Sel.Variant.size(); 841 | 842 | PyObject* S32 = PROTECT(NEW_INTEGER(3)); 843 | SET_ELEMENT(rv_ans, 1, S32); 844 | INTEGER(S32)[0] = Buf[2]; 845 | INTEGER(S32)[1] = GetNumOfTRUE(&Sel.Sample[0], Sel.Sample.size()); 846 | INTEGER(S32)[2] = GetNumOfTRUE(&Sel.Variant[0], Sel.Variant.size()); 847 | 848 | PyObject* tmp = PROTECT(NEW_CHARACTER(2)); 849 | SET_STRING_ELT(tmp, 0, mkChar("dim")); 850 | SET_STRING_ELT(tmp, 1, mkChar("seldim")); 851 | SET_NAMES(rv_ans, tmp); 852 | UNPROTECT(4); 853 | 854 | } else { 855 | PdGDSObj var = GDS_Node_Path(Root, vn.c_str(), TRUE); 856 | rv_ans = ScalarInteger(GDS_Array_GetTotalCount(var)); 857 | } 858 | 859 | COREARRAY_CATCH 860 | } 861 | 862 | 863 | /// get a logical vector with selection 864 | PY_EXPORT PyObject* SEQ_SelectFlag(PyObject* select, PyObject* len) 865 | { 866 | R_len_t n = XLENGTH(select); 867 | if (XLENGTH(len) != n) 868 | error("Index variable error."); 869 | 870 | int *p = INTEGER(len); 871 | R_len_t m = 0; 872 | for (R_len_t k=n; k > 0; k--, p++) 873 | { 874 | if (*p > 0) m += *p; 875 | } 876 | 877 | PyObject* rv_ans = NEW_LOGICAL(m); 878 | int *r = INTEGER(rv_ans), *s = INTEGER(select); 879 | p = INTEGER(len); 880 | for (; n > 0; n--, s++, p++) 881 | { 882 | for (int k=*p; k > 0; k--) 883 | *r++ = *s; 884 | } 885 | 886 | return rv_ans; 887 | } 888 | 889 | 890 | // =========================================================== 891 | // get system configuration 892 | // =========================================================== 893 | 894 | PY_EXPORT PyObject* SEQ_IntAssign(PyObject* Dst, PyObject* Src) 895 | { 896 | INTEGER(Dst)[0] = Rf_asInteger(Src); 897 | return R_NilValue; 898 | } 899 | 900 | 901 | inline static void CvtDNAString(char *p) 902 | { 903 | char c; 904 | while ((c = *p)) 
905 | { 906 | c = toupper(c); 907 | if (c!='A' && c!='C' && c!='G' && c!='T' && c!='M' && c!='R' && 908 | c!='W' && c!='S' && c!='Y' && c!='K' && c!='V' && c!='H' && 909 | c!='D' && c!='B' && c!='N' && c!='-' && c!='+' && c!='.') 910 | { 911 | c = '.'; 912 | } 913 | *p++ = c; 914 | } 915 | } 916 | 917 | PY_EXPORT PyObject* SEQ_DNAStrSet(PyObject* x) 918 | { 919 | if (Rf_isVectorList(x)) 920 | { 921 | size_t nlen = XLENGTH(x); 922 | for (size_t i=0; i < nlen; i++) 923 | { 924 | PyObject* s = VECTOR_ELT(x, i); 925 | if (Rf_isString(s)) 926 | { 927 | size_t n = XLENGTH(s); 928 | for (size_t j=0; j < n; j++) 929 | CvtDNAString((char*)CHAR(STRING_ELT(s, j))); 930 | } 931 | } 932 | } else if (Rf_isString(x)) 933 | { 934 | size_t n = XLENGTH(x); 935 | for (size_t i=0; i < n; i++) 936 | CvtDNAString((char*)CHAR(STRING_ELT(x, i))); 937 | } 938 | 939 | return x; 940 | } 941 | 942 | 943 | 944 | // =========================================================== 945 | // get system configuration 946 | // =========================================================== 947 | 948 | /// the number of alleles per site 949 | PY_EXPORT PyObject* SEQ_System() 950 | { 951 | COREARRAY_TRY 952 | 953 | int nProtect = 0; 954 | rv_ans = PROTECT(NEW_LIST(2)); 955 | PyObject* nm = PROTECT(NEW_CHARACTER(2)); 956 | nProtect += 2; 957 | SET_NAMES(rv_ans, nm); 958 | 959 | // the number of logical cores 960 | SET_ELEMENT(rv_ans, 0, ScalarInteger(GDS_Mach_GetNumOfCores())); 961 | SET_STRING_ELT(nm, 0, mkChar("num.logical.core")); 962 | 963 | // compiler flags 964 | vector ss; 965 | 966 | #ifdef COREARRAY_SIMD_SSE 967 | ss.push_back("SSE"); 968 | #endif 969 | #ifdef COREARRAY_SIMD_SSE2 970 | ss.push_back("SSE2"); 971 | #endif 972 | #ifdef COREARRAY_SIMD_SSE3 973 | ss.push_back("SSE3"); 974 | #endif 975 | #ifdef COREARRAY_SIMD_SSSE3 976 | ss.push_back("SSSE3"); 977 | #endif 978 | #ifdef COREARRAY_SIMD_SSE4_1 979 | ss.push_back("SSE4.1"); 980 | #endif 981 | #ifdef COREARRAY_SIMD_SSE4_2 982 | 
ss.push_back("SSE4.2"); 983 | #endif 984 | #ifdef COREARRAY_SIMD_AVX 985 | ss.push_back("AVX"); 986 | #endif 987 | #ifdef COREARRAY_SIMD_AVX2 988 | ss.push_back("AVX2"); 989 | #endif 990 | #ifdef COREARRAY_SIMD_FMA 991 | ss.push_back("FMA"); 992 | #endif 993 | #ifdef COREARRAY_SIMD_FMA4 994 | ss.push_back("FMA4"); 995 | #endif 996 | PyObject* SIMD = PROTECT(NEW_CHARACTER(ss.size())); 997 | nProtect ++; 998 | SET_ELEMENT(rv_ans, 1, SIMD); 999 | SET_STRING_ELT(nm, 1, mkChar("compiler.flag")); 1000 | for (int i=0; i < (int)ss.size(); i++) 1001 | SET_STRING_ELT(SIMD, i, mkChar(ss[i].c_str())); 1002 | 1003 | UNPROTECT(nProtect); 1004 | 1005 | COREARRAY_CATCH 1006 | } 1007 | */


// ===========================================================
// the initial function when the package is loaded
// ===========================================================

// Routines that are referenced here but not defined in this part of the file

extern PyObject* SEQ_GetData(PyObject *self, PyObject *args);
extern PyObject* SEQ_BApply_Variant(PyObject *self, PyObject *args);

extern PyObject* FC_CalcAF(PyObject *self, PyObject *args);


/// Method table of the 'PySeqArray.ccall' module: each entry maps the name
/// visible from Python to its C entry point; all of them take positional
/// arguments (METH_VARARGS) and have no docstring
static PyMethodDef module_methods[] = {
	// file operations
	{ "file_init", (PyCFunction)SEQ_File_Init, METH_VARARGS, NULL },
	{ "file_done", (PyCFunction)SEQ_File_Done, METH_VARARGS, NULL },

	// filter stack
	{ "flt_push", (PyCFunction)SEQ_FilterPush, METH_VARARGS, NULL },
	{ "flt_pop", (PyCFunction)SEQ_FilterPop, METH_VARARGS, NULL },
	{ "flt_split", (PyCFunction)SEQ_SplitSelection, METH_VARARGS, NULL },

	// set the sample / variant selection
	{ "set_sample", (PyCFunction)SEQ_SetSpaceSample, METH_VARARGS, NULL },
	{ "set_sample2", (PyCFunction)SEQ_SetSpaceSample2, METH_VARARGS, NULL },
	{ "set_variant", (PyCFunction)SEQ_SetSpaceVariant, METH_VARARGS, NULL },
	{ "set_variant2", (PyCFunction)SEQ_SetSpaceVariant2, METH_VARARGS, NULL },

	// get the sample / variant selection
	{ "get_filter", (PyCFunction)SEQ_GetSpace, METH_VARARGS, NULL },

	// get data
	{ "get_data", (PyCFunction)SEQ_GetData, METH_VARARGS, NULL },
	{ "apply", (PyCFunction)SEQ_BApply_Variant, METH_VARARGS, NULL },

	// calculation (not registered)
	// { "calc_af", (PyCFunction)FC_CalcAF, METH_VARARGS, NULL },

	// end of the table
	{ NULL, NULL, 0, NULL }
};


// Module entry point Python

#if PY_MAJOR_VERSION >= 3

/// Module definition used by the Python 3 entry point
static struct PyModuleDef ModStruct =
{
	PyModuleDef_HEAD_INIT,
	"PySeqArray.ccall",  // name of module
	"C functions for data manipulation",  // module documentation
	-1,  // size of per-interpreter state of the module, or -1 if the module keeps state in global variables
	module_methods
};

/// Python 3 entry point: initialize numpy and the GDS routines, then create
/// the module; returns NULL when either initialization step fails
PyMODINIT_FUNC PyInit_ccall()
{
	if (!numpy_init()) return NULL;
	if (Init_GDS_Routines() < 0) return NULL;
	// create the module and add the functions
	return PyModule_Create(&ModStruct);
}

#else

/// Python 2 entry point: initialize numpy and the GDS routines, then register
/// the module; returns early when either initialization step fails
PyMODINIT_FUNC initccall()
{
	if (!numpy_init()) return;
	if (Init_GDS_Routines() < 0) return;
	// create the module and add the functions
	Py_InitModule("PySeqArray.ccall", module_methods);
}

#endif

} // extern "C"
1085 | -------------------------------------------------------------------------------- /src/Index.cpp: -------------------------------------------------------------------------------- 1 | // =========================================================== 2 | // 3 | // Index.cpp: Indexing Objects 4 | // 5 | // Copyright (C) 2017 Xiuwen Zheng 6 | // 7 | // This file is part of PySeqArray. 8 | // 9 | // PySeqArray is free software: you can redistribute it and/or modify it 10 | // under the terms of the GNU General Public License Version 3 as 11 | // published by the Free Software Foundation. 
12 | // 13 | // PySeqArray is distributed in the hope that it will be useful, but 14 | // WITHOUT ANY WARRANTY; without even the implied warranty of 15 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | // GNU General Public License for more details. 17 | // 18 | // You should have received a copy of the GNU General Public License 19 | // along with PySeqArray. 20 | // If not, see . 21 | 22 | #include "Index.h" 23 | #include 24 | 25 | using namespace std; 26 | 27 | 28 | namespace PySeqArray 29 | { 30 | 31 | static double NaN = 0.0/0.0; 32 | 33 | // =========================================================== 34 | // Indexing object 35 | // =========================================================== 36 | 37 | CIndex::CIndex() 38 | { 39 | TotalLength = 0; 40 | Position = 0; 41 | AccSum = 0; 42 | AccIndex = AccOffset = 0; 43 | } 44 | 45 | void CIndex::Init(PdContainer Obj) 46 | { 47 | Values.clear(); 48 | Lengths.clear(); 49 | int Buffer[65536]; 50 | C_Int64 n = GDS_Array_GetTotalCount(Obj); 51 | if (n > INT_MAX) 52 | throw ErrSeqArray("Invalid dimension in CIndex."); 53 | 54 | CdIterator it; 55 | GDS_Iter_GetStart(Obj, &it); 56 | TotalLength = n; 57 | int last = -1; 58 | C_UInt32 repeat = 0; 59 | 60 | while (n > 0) 61 | { 62 | ssize_t m = (n <= 65536) ? 
n : 65536; 63 | GDS_Iter_RData(&it, Buffer, m, svInt32); 64 | n -= m; 65 | for (int *p = Buffer; m > 0; m--) 66 | { 67 | int v = *p++; 68 | if (v < 0) v = 0; 69 | if (v == last) 70 | { 71 | repeat ++; 72 | } else { 73 | if (repeat > 0) 74 | { 75 | Values.push_back(last); 76 | Lengths.push_back(repeat); 77 | } 78 | last = v; repeat = 1; 79 | } 80 | } 81 | } 82 | 83 | if (repeat > 0) 84 | { 85 | Values.push_back(last); 86 | Lengths.push_back(repeat); 87 | } 88 | 89 | Position = 0; 90 | AccSum = 0; 91 | AccIndex = AccOffset = 0; 92 | } 93 | 94 | void CIndex::InitOne(int num) 95 | { 96 | Values.clear(); 97 | Values.push_back(1); 98 | Lengths.clear(); 99 | Lengths.push_back(num); 100 | TotalLength = num; 101 | Position = 0; 102 | AccSum = 0; 103 | AccIndex = AccOffset = 0; 104 | } 105 | 106 | void CIndex::GetInfo(size_t pos, C_Int64 &Sum, int &Value) 107 | { 108 | if (pos >= TotalLength) 109 | throw ErrSeqArray("Invalid position in CIndex."); 110 | if (pos < Position) 111 | { 112 | Position = 0; 113 | AccSum = 0; 114 | AccIndex = AccOffset = 0; 115 | } 116 | for (; Position < pos; ) 117 | { 118 | size_t L = Lengths[AccIndex]; 119 | size_t n = L - AccOffset; 120 | if ((Position + n) <= pos) 121 | { 122 | AccSum += (Values[AccIndex] * n); 123 | AccIndex ++; AccOffset = 0; 124 | } else { 125 | n = pos - Position; 126 | AccSum += (Values[AccIndex] * n); 127 | AccOffset += n; 128 | } 129 | Position += n; 130 | } 131 | Sum = AccSum; 132 | Value = Values[AccIndex]; 133 | } 134 | 135 | PyObject* CIndex::GetLen_Sel(const C_BOOL sel[]) 136 | { 137 | size_t n; 138 | const C_BOOL *p = (C_BOOL *)vec_i8_cnt_nonzero_ptr((const int8_t *)sel, 139 | TotalLength, &n); 140 | // create a numpy array object 141 | npy_intp dims[1] = { (npy_intp)n }; 142 | PyObject *ans = PyArray_SimpleNew(1, dims, NPY_INT32); 143 | if (n > 0) 144 | { 145 | int *pV = &Values[0]; 146 | C_UInt32 *pL = &Lengths[0]; 147 | size_t L = *pL; 148 | // skip non-selection 149 | for (size_t m=p-sel; m > 0; ) 150 | { 151 | 
if (L == 0) 152 | { 153 | L = *(++pL); pV ++; 154 | continue; // in case, L = 0 155 | } 156 | if (L <= m) 157 | { 158 | m -= L; L = 0; 159 | } else { 160 | L -= m; m = 0; 161 | } 162 | } 163 | // get lengths 164 | int *pAns = (int*)PyArray_DATA(ans); 165 | while (n > 0) 166 | { 167 | if (L == 0) 168 | { 169 | L = *(++pL); pV ++; 170 | continue; // in case, L = 0 171 | } 172 | L--; 173 | if (*p++) 174 | { 175 | *pAns++ = *pV; 176 | n --; 177 | } 178 | } 179 | } 180 | return ans; 181 | } 182 | 183 | PyObject* CIndex::GetLen_Sel(const C_BOOL sel[], int &out_var_start, 184 | int &out_var_count, vector &out_var_sel) 185 | { 186 | size_t n; 187 | const C_BOOL *p = (C_BOOL *)vec_i8_cnt_nonzero_ptr((const int8_t *)sel, 188 | TotalLength, &n); 189 | // create a numpy array object 190 | npy_intp dims[1] = { (npy_intp)n }; 191 | PyObject *ans = PyArray_SimpleNew(1, dims, NPY_INT32); 192 | out_var_start = 0; 193 | out_var_count = 0; 194 | 195 | if (n > 0) 196 | { 197 | int *pV = &Values[0]; 198 | C_UInt32 *pL = &Lengths[0]; 199 | size_t L = *pL; 200 | // skip non-selection 201 | for (size_t m=p-sel; m > 0; ) 202 | { 203 | if (L == 0) 204 | { 205 | L = *(++pL); pV ++; 206 | continue; // in case, L = 0 207 | } 208 | if (L <= m) 209 | { 210 | m -= L; out_var_start += L * (*pV); L = 0; 211 | } else { 212 | L -= m; out_var_start += m * (*pV); m = 0; 213 | } 214 | } 215 | sel = p; 216 | // get the total length 217 | int *pVV = pV; 218 | C_UInt32 *pLL = pL; 219 | size_t LL = L; 220 | int *pAns = (int*)PyArray_DATA(ans); 221 | for (size_t m=n; m > 0; ) 222 | { 223 | if (L == 0) 224 | { 225 | L = *(++pL); pV ++; 226 | continue; // in case, L = 0 227 | } 228 | L--; 229 | out_var_count += (*pV); 230 | if (*p++) 231 | { 232 | *pAns++ = *pV; 233 | m --; 234 | } 235 | } 236 | // set bool selection 237 | out_var_sel.resize(out_var_count, TRUE); 238 | C_BOOL *pB = &out_var_sel[0]; 239 | p = sel; pV = pVV; pL = pLL; L = LL; 240 | while (n > 0) 241 | { 242 | if (L == 0) 243 | { 244 | L = 
*(++pL); pV ++; 245 | continue; // in case, L = 0 246 | } 247 | L--; 248 | if (*p++) 249 | { 250 | pB += *pV; n --; 251 | } else { 252 | for (size_t m=*pV; m > 0; m--) *pB++ = FALSE; 253 | } 254 | } 255 | } else { 256 | out_var_sel.clear(); 257 | } 258 | 259 | return ans; 260 | } 261 | 262 | 263 | 264 | // =========================================================== 265 | 266 | CGenoIndex::CGenoIndex() 267 | { 268 | TotalLength = 0; 269 | Position = 0; 270 | AccSum = 0; 271 | AccIndex = AccOffset = 0; 272 | } 273 | 274 | void CGenoIndex::Init(PdContainer Obj) 275 | { 276 | Values.clear(); 277 | Lengths.clear(); 278 | C_UInt16 Buffer[65536]; 279 | C_Int64 n = GDS_Array_GetTotalCount(Obj); 280 | if (n > INT_MAX) 281 | throw ErrSeqArray("Invalid dimension in CIndex."); 282 | 283 | CdIterator it; 284 | GDS_Iter_GetStart(Obj, &it); 285 | TotalLength = n; 286 | C_UInt16 last = 0xFFFF; 287 | C_UInt32 repeat = 0; 288 | 289 | while (n > 0) 290 | { 291 | ssize_t m = (n <= 65536) ? n : 65536; 292 | GDS_Iter_RData(&it, Buffer, m, svUInt16); 293 | n -= m; 294 | for (C_UInt16 *p = Buffer; m > 0; m--) 295 | { 296 | C_UInt16 v = *p++; 297 | if (v < 0) v = 0; 298 | if (v == last) 299 | { 300 | repeat ++; 301 | } else { 302 | if (repeat > 0) 303 | { 304 | Values.push_back(last); 305 | Lengths.push_back(repeat); 306 | } 307 | last = v; repeat = 1; 308 | } 309 | } 310 | } 311 | 312 | if (repeat > 0) 313 | { 314 | Values.push_back(last); 315 | Lengths.push_back(repeat); 316 | } 317 | 318 | Position = 0; 319 | AccSum = 0; 320 | AccIndex = AccOffset = 0; 321 | } 322 | 323 | void CGenoIndex::GetInfo(size_t pos, C_Int64 &Sum, C_UInt8 &Value) 324 | { 325 | if (pos >= TotalLength) 326 | throw ErrSeqArray("Invalid position in CIndex."); 327 | if (pos < Position) 328 | { 329 | Position = 0; 330 | AccSum = 0; 331 | AccIndex = AccOffset = 0; 332 | } 333 | for (; Position < pos; ) 334 | { 335 | size_t L = Lengths[AccIndex]; 336 | size_t n = L - AccOffset; 337 | if ((Position + n) <= pos) 338 | { 
339 | AccSum += (Values[AccIndex] * n); 340 | AccIndex ++; AccOffset = 0; 341 | } else { 342 | n = pos - Position; 343 | AccSum += (Values[AccIndex] * n); 344 | AccOffset += n; 345 | } 346 | Position += n; 347 | } 348 | Sum = AccSum; 349 | Value = Values[AccIndex] & 0x0F; 350 | } 351 | 352 | 353 | 354 | // =========================================================== 355 | // Chromosome Indexing 356 | // =========================================================== 357 | 358 | CChromIndex::CChromIndex() { } 359 | 360 | void CChromIndex::AddChrom(PdGDSFolder Root) 361 | { 362 | PdAbstractArray varVariant = GDS_Node_Path(Root, "variant.id", TRUE); 363 | C_Int32 NumVariant = GDS_Array_GetTotalCount(varVariant); 364 | 365 | PdAbstractArray varChrom = GDS_Node_Path(Root, "chromosome", TRUE); 366 | C_Int32 NumChrom = GDS_Array_GetTotalCount(varChrom); 367 | 368 | if ((GDS_Array_DimCnt(varChrom) != 1) || (NumVariant != NumChrom)) 369 | throw ErrSeqArray("Invalid dimension of 'chromosome'."); 370 | if (NumChrom <= 0) return; 371 | 372 | C_Int32 idx=0, len=1; 373 | string last; 374 | GDS_Array_ReadData(varChrom, &idx, &len, &last, svStrUTF8); 375 | idx ++; 376 | 377 | TRange rng; 378 | rng.Start = 0; 379 | rng.Length = 1; 380 | 381 | Map.clear(); 382 | PosToChr.Clear(); 383 | 384 | const C_Int32 NMAX = 4096; 385 | string txt[NMAX]; 386 | 387 | while (idx < NumChrom) 388 | { 389 | len = NumChrom - idx; 390 | if (len > NMAX) len = NMAX; 391 | GDS_Array_ReadData(varChrom, &idx, &len, &txt, svStrUTF8); 392 | for (int i=0; i < len; i++) 393 | { 394 | if (txt[i] == last) 395 | { 396 | rng.Length ++; 397 | } else { 398 | Map[last].push_back(rng); 399 | PosToChr.Add(last, rng.Length); 400 | last = string(txt[i].begin(), txt[i].end()); 401 | rng.Start = idx + i; 402 | rng.Length = 1; 403 | } 404 | } 405 | idx += len; 406 | } 407 | 408 | Map[last].push_back(rng); 409 | PosToChr.Add(last, rng.Length); 410 | PosToChr.Init(); 411 | } 412 | 413 | void CChromIndex::Clear() 414 | { 415 | 
Map.clear(); 416 | } 417 | 418 | size_t CChromIndex::RangeTotalLength(const TRangeList &RngList) 419 | { 420 | size_t ans = 0; 421 | vector::const_iterator it; 422 | for (it=RngList.begin(); it != RngList.end(); it ++) 423 | ans += it->Length; 424 | return ans; 425 | } 426 | 427 | 428 | 429 | // =========================================================== 430 | // Genomic Range Set 431 | // =========================================================== 432 | 433 | bool CRangeSet::less_range::operator()(const TRange &lhs, const TRange &rhs) const 434 | { 435 | // -1 for two possible adjacent regions 436 | return (lhs.End < rhs.Start-1); 437 | } 438 | 439 | void CRangeSet::Clear() 440 | { 441 | _RangeSet.clear(); 442 | } 443 | 444 | void CRangeSet::AddRange(int start, int end) 445 | { 446 | if (end < start) end = start; 447 | TRange rng; 448 | rng.Start = start; rng.End = end; 449 | 450 | do { 451 | set::iterator it = _RangeSet.find(rng); 452 | if (it != _RangeSet.end()) 453 | { 454 | if ((rng.Start < it->Start) || (rng.End > it->End)) 455 | { 456 | if (rng.Start > it->Start) rng.Start = it->Start; 457 | if (rng.End < it->End) rng.End = it->End; 458 | _RangeSet.erase(it); 459 | } else 460 | break; 461 | } else { 462 | _RangeSet.insert(rng); 463 | break; 464 | } 465 | } while (1); 466 | } 467 | 468 | bool CRangeSet::IsIncluded(int point) 469 | { 470 | TRange rng; 471 | rng.Start = rng.End = point; 472 | set::iterator it = _RangeSet.find(rng); 473 | return it != _RangeSet.end(); 474 | } 475 | 476 | 477 | 478 | // =========================================================== 479 | // SeqArray GDS file information 480 | // =========================================================== 481 | 482 | static const char *ERR_DIM = "Invalid dimension of '%s'."; 483 | static const char *ERR_FILE_ROOT = "CFileInfo::FileRoot should be initialized."; 484 | 485 | CFileInfo::CFileInfo(PdGDSFolder root) 486 | { 487 | _Root = NULL; 488 | _SampleNum = _VariantNum = 0; 489 | ResetRoot(root); 490 
| } 491 | 492 | CFileInfo::~CFileInfo() 493 | { 494 | _Root = NULL; 495 | _SampleNum = _VariantNum = 0; 496 | } 497 | 498 | void CFileInfo::ResetRoot(PdGDSFolder root) 499 | { 500 | if (_Root != root) 501 | { 502 | // initialize 503 | _Root = root; 504 | SelList.clear(); 505 | _Chrom.Clear(); 506 | _Position.clear(); 507 | 508 | // sample.id 509 | PdAbstractArray Node = GDS_Node_Path(root, "sample.id", TRUE); 510 | C_Int64 n = GDS_Array_GetTotalCount(Node); 511 | if ((n < 0) || (n > 2147483647)) 512 | throw ErrSeqArray(ERR_DIM, "sample.id"); 513 | _SampleNum = n; 514 | 515 | // variant.id 516 | Node = GDS_Node_Path(root, "variant.id", TRUE); 517 | n = GDS_Array_GetTotalCount(Node); 518 | if ((n < 0) || (n > 2147483647)) 519 | throw ErrSeqArray(ERR_DIM, "variant.id"); 520 | _VariantNum = n; 521 | 522 | // genotypes 523 | _Ploidy = -1; 524 | Node = GDS_Node_Path(root, "genotype/data", FALSE); 525 | if (Node != NULL) 526 | { 527 | if (GDS_Array_DimCnt(Node) == 3) 528 | { 529 | C_Int32 DLen[3]; 530 | GDS_Array_GetDim(Node, DLen, 3); 531 | _Ploidy = DLen[2]; 532 | } 533 | } 534 | } 535 | } 536 | 537 | TSelection &CFileInfo::Selection() 538 | { 539 | if (!_Root) 540 | throw ErrSeqArray(ERR_FILE_ROOT); 541 | if (SelList.empty()) 542 | SelList.push_back(TSelection()); 543 | 544 | TSelection &s = SelList.back(); 545 | if (s.Sample.empty()) 546 | s.Sample.resize(_SampleNum, TRUE); 547 | if (s.Variant.empty()) 548 | s.Variant.resize(_VariantNum, TRUE); 549 | 550 | return s; 551 | } 552 | 553 | CChromIndex &CFileInfo::Chromosome() 554 | { 555 | if (!_Root) 556 | throw ErrSeqArray(ERR_FILE_ROOT); 557 | if (_Chrom.Empty()) 558 | _Chrom.AddChrom(_Root); 559 | return _Chrom; 560 | } 561 | 562 | vector &CFileInfo::Position() 563 | { 564 | if (!_Root) 565 | throw ErrSeqArray(ERR_FILE_ROOT); 566 | if (_Position.empty()) 567 | { 568 | PdAbstractArray N = GetObj("position", TRUE); 569 | // check 570 | if ((GDS_Array_DimCnt(N) != 1) || 571 | (GDS_Array_GetTotalCount(N) != _VariantNum)) 
572 | throw ErrSeqArray(ERR_DIM, "position"); 573 | // read 574 | _Position.resize(_VariantNum); 575 | GDS_Array_ReadData(N, NULL, NULL, &_Position[0], svInt32); 576 | } 577 | return _Position; 578 | } 579 | 580 | CGenoIndex &CFileInfo::GenoIndex() 581 | { 582 | if (_GenoIndex.Empty()) 583 | { 584 | PdAbstractArray I = GetObj("genotype/@data", TRUE); 585 | _GenoIndex.Init(I); 586 | } 587 | return _GenoIndex; 588 | } 589 | 590 | CIndex &CFileInfo::VarIndex(const string &varname) 591 | { 592 | CIndex &I = _VarIndex[varname]; 593 | if (I.Empty()) 594 | { 595 | PdAbstractArray N = GDS_Node_Path(_Root, varname.c_str(), FALSE); 596 | if (N == NULL) 597 | I.InitOne(_VariantNum); 598 | else 599 | I.Init(N); 600 | } 601 | return I; 602 | } 603 | 604 | PdAbstractArray CFileInfo::GetObj(const char *name, C_BOOL MustExist) 605 | { 606 | if (!_Root) 607 | throw ErrSeqArray(ERR_FILE_ROOT); 608 | return GDS_Node_Path(_Root, name, MustExist); 609 | } 610 | 611 | int CFileInfo::SampleSelNum() 612 | { 613 | TSelection &sel = Selection(); 614 | return vec_i8_cnt_nonzero((C_Int8*)&sel.Sample[0], _SampleNum); 615 | } 616 | 617 | int CFileInfo::VariantSelNum() 618 | { 619 | TSelection &sel = Selection(); 620 | return vec_i8_cnt_nonzero((C_Int8*)&sel.Variant[0], _VariantNum); 621 | } 622 | 623 | 624 | // =========================================================== 625 | 626 | /// File info list 627 | std::map COREARRAY_DLL_LOCAL GDSFile_ID_Info; 628 | 629 | /// get the associated CFileInfo 630 | COREARRAY_DLL_LOCAL CFileInfo &GetFileInfo(int file_id) 631 | { 632 | if (file_id < 0) 633 | throw ErrSeqArray("Invalid gdsfile object."); 634 | 635 | PdGDSFolder root = GDS_ID2FileRoot(file_id); 636 | map::iterator p = GDSFile_ID_Info.find(file_id); 637 | if (p == GDSFile_ID_Info.end()) 638 | { 639 | GDSFile_ID_Info[file_id].ResetRoot(root); 640 | p = GDSFile_ID_Info.find(file_id); 641 | } else { 642 | if (p->second.Root() != root) 643 | p->second.ResetRoot(root); 644 | } 645 | 646 | return 
p->second; 647 | } 648 | 649 | 650 | 651 | // =========================================================== 652 | // GDS Variable Type 653 | // =========================================================== 654 | 655 | static C_BOOL ArrayTRUEs[64] = { 656 | 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 657 | 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 658 | 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1, 659 | 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1 660 | }; 661 | 662 | CVarApply::CVarApply() 663 | { 664 | fVarType = ctNone; 665 | MarginalSize = 0; 666 | MarginalSelect = NULL; 667 | Node = NULL; 668 | Position = 0; 669 | } 670 | 671 | CVarApply::~CVarApply() 672 | { } 673 | 674 | void CVarApply::Reset() 675 | { 676 | Position = 0; 677 | if (MarginalSize > 0) 678 | if (!MarginalSelect[0]) Next(); 679 | } 680 | 681 | bool CVarApply::Next() 682 | { 683 | C_BOOL *p = MarginalSelect + Position; 684 | while (Position < MarginalSize) 685 | { 686 | Position ++; 687 | if (*(++p)) break; 688 | } 689 | return (Position < MarginalSize); 690 | } 691 | 692 | C_BOOL *CVarApply::NeedTRUEs(size_t size) 693 | { 694 | if (size <= sizeof(ArrayTRUEs)) 695 | { 696 | return ArrayTRUEs; 697 | } else if (size > _TRUE.size()) 698 | { 699 | _TRUE.resize(size, TRUE); 700 | } 701 | return &_TRUE[0]; 702 | } 703 | 704 | 705 | CApply_Variant::CApply_Variant(): CVarApply() 706 | { 707 | VarNode = NULL; 708 | } 709 | 710 | CApply_Variant::CApply_Variant(CFileInfo &File): CVarApply() 711 | { 712 | MarginalSize = File.VariantNum(); 713 | MarginalSelect = File.Selection().pVariant(); 714 | VarNode = NULL; 715 | } 716 | 717 | CApply_Variant::~CApply_Variant() 718 | { 719 | if (VarNode) Py_DECREF(VarNode); 720 | } 721 | 722 | 723 | CVarApplyList::~CVarApplyList() 724 | { 725 | for (iterator p = begin(); p != end(); p++) 726 | { 727 | CVarApply *v = (*p); 728 | *p = NULL; 729 | delete v; 730 | } 731 | } 732 | 733 | bool CVarApplyList::CallNext() 734 | { 735 | bool has_next = true; 736 | for (iterator p = begin(); p != end(); p++) 737 | { 738 
| if (!(*p)->Next()) 739 | has_next = false; 740 | } 741 | return has_next; 742 | } 743 | 744 | 745 | 746 | // =========================================================== 747 | // Progress object 748 | // =========================================================== 749 | 750 | static const int PROGRESS_BAR_CHAR_NUM = 50; 751 | static const int PROGRESS_LINE_NUM = 100000; 752 | 753 | static const double S_MIN = 60; 754 | static const double S_HOUR = 60 * S_MIN; 755 | static const double S_DAY = 24 * S_HOUR; 756 | static const double S_YEAR = 365 * S_DAY; 757 | 758 | static const char *time_str(double s) 759 | { 760 | if (GDS_Mach_Finite(s)) 761 | { 762 | static char buffer[64]; 763 | if (s < S_MIN) 764 | sprintf(buffer, "%.0fs", s); 765 | else if (s < S_HOUR) 766 | sprintf(buffer, "%.1fm", s/S_MIN); 767 | else if (s < S_DAY) 768 | sprintf(buffer, "%.1fh", s/S_HOUR); 769 | else if (s < S_YEAR) 770 | sprintf(buffer, "%.1fd", s/S_DAY); 771 | else 772 | sprintf(buffer, "%.1f years", s/S_YEAR); 773 | return buffer; 774 | } else 775 | return "---"; 776 | } 777 | 778 | 779 | CProgress::CProgress(C_Int64 start, C_Int64 count, FILE *conn, bool newline) 780 | { 781 | TotalCount = count; 782 | Counter = (start >= 0) ? 
start : 0; 783 | double percent; 784 | File = conn; 785 | NewLine = newline; 786 | 787 | if (count > 0) 788 | { 789 | int n = 100; 790 | if (n > count) n = count; 791 | if (n < 1) n = 1; 792 | _start = _step = (double)count / n; 793 | _hit = (C_Int64)(_start); 794 | if (Counter > count) Counter = count; 795 | percent = (double)Counter / count; 796 | } else { 797 | _start = _step = 0; 798 | _hit = PROGRESS_LINE_NUM; 799 | percent = 0; 800 | } 801 | 802 | time_t s; time(&s); 803 | _start_time = s; 804 | _timer.reserve(128); 805 | _timer.push_back(pair(percent, s)); 806 | 807 | ShowProgress(); 808 | } 809 | 810 | CProgress::~CProgress() 811 | { } 812 | 813 | void CProgress::Forward() 814 | { 815 | Counter ++; 816 | if (Counter >= _hit) 817 | { 818 | if (TotalCount > 0) 819 | { 820 | _start += _step; 821 | _hit = (C_Int64)(_start); 822 | if (_hit > TotalCount) _hit = TotalCount; 823 | } else { 824 | _hit += PROGRESS_LINE_NUM; 825 | } 826 | ShowProgress(); 827 | } 828 | } 829 | 830 | void CProgress::ShowProgress() 831 | { 832 | if (File) 833 | { 834 | if (TotalCount > 0) 835 | { 836 | char bar[PROGRESS_BAR_CHAR_NUM + 1]; 837 | double p = (double)Counter / TotalCount; 838 | int n = (int)round(p * PROGRESS_BAR_CHAR_NUM); 839 | memset(bar, '.', sizeof(bar)); 840 | memset(bar, '=', n); 841 | if ((Counter > 0) && (n < PROGRESS_BAR_CHAR_NUM)) 842 | bar[n] = '>'; 843 | bar[PROGRESS_BAR_CHAR_NUM] = 0; 844 | 845 | // ETC: estimated time to complete 846 | n = (int)_timer.size() - 20; // 20% as a sliding window size 847 | if (n < 0) n = 0; 848 | time_t now; time(&now); 849 | _timer.push_back(pair(p, now)); 850 | 851 | // in seconds 852 | double s = difftime(now, _timer[n].second); 853 | double diff = p - _timer[n].first; 854 | if (diff > 0) 855 | s = s / diff * (1 - p); 856 | else 857 | s = NaN; 858 | p *= 100; 859 | 860 | // show 861 | if (NewLine) 862 | { 863 | fprintf(File, "[%s] %2.0f%%, ETC: %s\n", bar, p, time_str(s)); 864 | } else { 865 | fprintf(File, "\r[%s] %2.0f%%, ETC: 
%s ", bar, p, time_str(s)); 866 | if (Counter >= TotalCount) fprintf(File, "\n"); 867 | } 868 | } else { 869 | int n = Counter / PROGRESS_LINE_NUM; 870 | string s(n, '.'); 871 | if (NewLine) 872 | { 873 | if (Counter > 0) 874 | fprintf(File, "[:%s (%lldk lines)]\n", s.c_str(), Counter/1000); 875 | else 876 | fprintf(File, "[: (0 line)]\n"); 877 | } else { 878 | if (Counter > 0) 879 | fprintf(File, "\r[:%s (%lldk lines)]", s.c_str(), Counter/1000); 880 | else 881 | fprintf(File, "\r[: (0 line)]"); 882 | } 883 | } 884 | fflush(File); 885 | } 886 | } 887 | 888 | 889 | CProgressStdOut::CProgressStdOut(C_Int64 count, bool verbose): 890 | CProgress(0, count, NULL, false) 891 | { 892 | if (count < 0) 893 | throw ErrSeqArray("%s, 'count' should be greater than zero.", __func__); 894 | _last_time = _timer.back().second; 895 | Verbose = verbose; 896 | ShowProgress(); 897 | } 898 | 899 | void CProgressStdOut::ShowProgress() 900 | { 901 | if (Verbose && (TotalCount > 0)) 902 | { 903 | char bar[PROGRESS_BAR_CHAR_NUM + 1]; 904 | double p = (double)Counter / TotalCount; 905 | int n = (int)round(p * PROGRESS_BAR_CHAR_NUM); 906 | memset(bar, '.', sizeof(bar)); 907 | memset(bar, '=', n); 908 | if ((Counter > 0) && (n < PROGRESS_BAR_CHAR_NUM)) 909 | bar[n] = '>'; 910 | bar[PROGRESS_BAR_CHAR_NUM] = 0; 911 | 912 | // ETC: estimated time to complete 913 | n = (int)_timer.size() - 20; // 20% as a sliding window size 914 | if (n < 0) n = 0; 915 | time_t now; time(&now); 916 | _timer.push_back(pair(p, now)); 917 | 918 | // in seconds 919 | double interval = difftime(now, _last_time); 920 | double s = difftime(now, _timer[n].second); 921 | double diff = p - _timer[n].first; 922 | if (diff > 0) 923 | s = s / diff * (1 - p); 924 | else 925 | s = NaN; 926 | p *= 100; 927 | 928 | // show 929 | if (Counter >= TotalCount) 930 | { 931 | s = difftime(_last_time, _start_time); 932 | printf("\r[%s] 100%%, completed in %s\n", bar, time_str(s)); 933 | } else if ((interval >= 5) || (Counter <= 0)) 934 | 
{ 935 | _last_time = now; 936 | printf("\r[%s] %2.0f%%, ETC: %s ", bar, p, time_str(s)); 937 | } 938 | } 939 | } 940 | 941 | 942 | 943 | // =========================================================== 944 | // Define Functions 945 | // =========================================================== 946 | 947 | // the buffer of ArrayTRUEs 948 | static vector TrueBuffer; 949 | 950 | COREARRAY_DLL_LOCAL C_BOOL *NeedArrayTRUEs(size_t len) 951 | { 952 | if (len <= sizeof(ArrayTRUEs)) 953 | return ArrayTRUEs; 954 | else if (len > TrueBuffer.size()) 955 | TrueBuffer.resize(len, TRUE); 956 | return &TrueBuffer[0]; 957 | } 958 | 959 | 960 | static char pretty_num_buffer[32]; 961 | 962 | /// Get pretty text for an integer with comma 963 | COREARRAY_DLL_LOCAL const char *PrettyInt(int val) 964 | { 965 | char *p = pretty_num_buffer + sizeof(pretty_num_buffer); 966 | *(--p) = 0; 967 | 968 | bool sign = (val < 0); 969 | if (sign) val = -val; 970 | 971 | int digit = 0; 972 | do { 973 | *(--p) = (val % 10) + '0'; 974 | val /= 10; 975 | if (((++digit) >= 3) && (val > 0)) 976 | { 977 | *(--p) = ','; 978 | digit = 0; 979 | } 980 | } while (val > 0); 981 | 982 | if (sign) *(--p) = '-'; 983 | return p; 984 | } 985 | 986 | 987 | /// Text matching, return -1 when no maching 988 | COREARRAY_DLL_LOCAL int MatchText(const char *txt, const char *list[]) 989 | { 990 | for (int i=0; *list; list++, i++) 991 | { 992 | if (strcmp(txt, *list) == 0) 993 | return i; 994 | } 995 | return -1; 996 | } 997 | 998 | 999 | /// Get the number of alleles 1000 | COREARRAY_DLL_LOCAL int GetNumOfAllele(const char *allele_list) 1001 | { 1002 | int n = 0; 1003 | while (*allele_list) 1004 | { 1005 | if (*allele_list != ',') 1006 | { 1007 | n ++; 1008 | while ((*allele_list != ',') && (*allele_list != 0)) 1009 | allele_list ++; 1010 | if (*allele_list == ',') 1011 | { 1012 | allele_list ++; 1013 | if (*allele_list == 0) 1014 | { 1015 | n ++; 1016 | break; 1017 | } 1018 | } 1019 | } 1020 | } 1021 | return n; 1022 | } 
1023 | 1024 | 1025 | /// Get the index in an allele list 1026 | COREARRAY_DLL_LOCAL int GetIndexOfAllele(const char *allele, const char *allele_list) 1027 | { 1028 | const size_t len = strlen(allele); 1029 | const char *st = allele_list; 1030 | int idx = 0; 1031 | while (*allele_list) 1032 | { 1033 | while ((*allele_list != ',') && (*allele_list != 0)) 1034 | allele_list ++; 1035 | size_t n = allele_list - st; 1036 | if ((len==n) && (strncmp(allele, st, n)==0)) 1037 | return idx; 1038 | if (*allele_list == ',') 1039 | { 1040 | idx ++; 1041 | allele_list ++; 1042 | st = allele_list; 1043 | } 1044 | } 1045 | return -1; 1046 | } 1047 | 1048 | 1049 | /// Get strings split by comma 1050 | COREARRAY_DLL_LOCAL void GetAlleles(const char *alleles, vector &out) 1051 | { 1052 | out.clear(); 1053 | const char *p, *s; 1054 | p = s = alleles; 1055 | do { 1056 | if ((*p == 0) || (*p == ',')) 1057 | { 1058 | out.push_back(string(s, p)); 1059 | if (*p == ',') p ++; 1060 | s = p; 1061 | if (*p == 0) break; 1062 | } 1063 | p ++; 1064 | } while (1); 1065 | } 1066 | 1067 | 1068 | /// get PdGDSObj from a SEXP object 1069 | COREARRAY_DLL_LOCAL void GDS_PATH_PREFIX_CHECK(const char *path) 1070 | { 1071 | for (; *path != 0; path++) 1072 | { 1073 | if ((*path == '~') || (*path == '@')) 1074 | { 1075 | throw PySeqArray::ErrSeqArray( 1076 | "the variable name contains an invalid prefix '%c'.", 1077 | *path); 1078 | } 1079 | } 1080 | } 1081 | 1082 | 1083 | COREARRAY_DLL_LOCAL void GDS_VARIABLE_NAME_CHECK(const char *p) 1084 | { 1085 | for (; *p != 0; p++) 1086 | { 1087 | if ((*p == '~') || (*p == '@') || (*p == '/')) 1088 | { 1089 | throw ErrSeqArray( 1090 | "the variable name contains an invalid prefix '%c'.", *p); 1091 | } 1092 | } 1093 | } 1094 | 1095 | 1096 | /// get PdGDSObj from a SEXP object 1097 | COREARRAY_DLL_LOCAL string GDS_PATH_PREFIX(const string &path, char prefix) 1098 | { 1099 | string s = path; 1100 | for (int i=s.size()-1; i >= 0; i--) 1101 | { 1102 | if (s[i] == '/') 1103 
| { 1104 | if (((int)s.size() > i+1) && (s[i+1] == '~')) 1105 | s[i+1] = prefix; 1106 | else 1107 | s.insert(i+1, &prefix, 1); 1108 | return s; 1109 | } 1110 | } 1111 | 1112 | if ((s.size() > 0) && (s[0] == '~')) 1113 | s[0] = prefix; 1114 | else 1115 | s.insert(s.begin(), prefix); 1116 | 1117 | return s; 1118 | } 1119 | 1120 | 1121 | 1122 | // =========================================================== 1123 | // Import the NumPy Package 1124 | // =========================================================== 1125 | 1126 | // import numpy functions 1127 | #if (PY_MAJOR_VERSION >= 3) 1128 | static PyObject* _init_() { import_array(); return Py_None; } 1129 | #else 1130 | static void _init_() { import_array(); } 1131 | #endif 1132 | 1133 | COREARRAY_DLL_LOCAL bool numpy_init() 1134 | { 1135 | #if (PY_MAJOR_VERSION >= 3) 1136 | if (_init_() == NUMPY_IMPORT_ARRAY_RETVAL) return false; 1137 | #else 1138 | _init_(); 1139 | #endif 1140 | return true; 1141 | } 1142 | 1143 | 1144 | static const char *err_new_array = "Fails to allocate a new numpy array object."; 1145 | 1146 | static PyObject* new_array(size_t n, NPY_TYPES type) 1147 | { 1148 | npy_intp dims[1] = { (npy_intp)n }; 1149 | PyObject *rv = PyArray_SimpleNew(1, dims, type); 1150 | if (rv == NULL) throw ErrSeqArray(err_new_array); 1151 | return rv; 1152 | } 1153 | 1154 | 1155 | COREARRAY_DLL_LOCAL PyObject* numpy_new_bool(size_t n) 1156 | { 1157 | return new_array(n, NPY_BOOL); 1158 | } 1159 | 1160 | COREARRAY_DLL_LOCAL PyObject* numpy_new_uint8(size_t n) 1161 | { 1162 | return new_array(n, NPY_UINT8); 1163 | } 1164 | 1165 | COREARRAY_DLL_LOCAL PyObject* numpy_new_uint8_mat(size_t n1, size_t n2) 1166 | { 1167 | npy_intp dims[2] = { (npy_intp)n1, (npy_intp)n2 }; 1168 | PyObject *rv = PyArray_SimpleNew(2, dims, NPY_UINT8); 1169 | if (rv == NULL) throw ErrSeqArray(err_new_array); 1170 | return rv; 1171 | } 1172 | 1173 | COREARRAY_DLL_LOCAL PyObject* numpy_new_uint8_dim3(size_t n1, size_t n2, size_t n3) 1174 | { 1175 | 
npy_intp dims[3] = { (npy_intp)n1, (npy_intp)n2, (npy_intp)n3 }; 1176 | PyObject *rv = PyArray_SimpleNew(3, dims, NPY_UINT8); 1177 | if (rv == NULL) throw ErrSeqArray(err_new_array); 1178 | return rv; 1179 | } 1180 | 1181 | 1182 | COREARRAY_DLL_LOCAL PyObject* numpy_new_int32(size_t n) 1183 | { 1184 | return new_array(n, NPY_INT32); 1185 | } 1186 | 1187 | COREARRAY_DLL_LOCAL PyObject* numpy_new_int32_mat(size_t n1, size_t n2) 1188 | { 1189 | npy_intp dims[2] = { (npy_intp)n1, (npy_intp)n2 }; 1190 | PyObject *rv = PyArray_SimpleNew(2, dims, NPY_INT32); 1191 | if (rv == NULL) throw ErrSeqArray(err_new_array); 1192 | return rv; 1193 | } 1194 | 1195 | COREARRAY_DLL_LOCAL PyObject* numpy_new_int32_dim3(size_t n1, size_t n2, size_t n3) 1196 | { 1197 | npy_intp dims[3] = { (npy_intp)n1, (npy_intp)n2, (npy_intp)n3 }; 1198 | PyObject *rv = PyArray_SimpleNew(3, dims, NPY_INT32); 1199 | if (rv == NULL) throw ErrSeqArray(err_new_array); 1200 | return rv; 1201 | } 1202 | 1203 | 1204 | COREARRAY_DLL_LOCAL PyObject* numpy_new_string(size_t n) 1205 | { 1206 | return new_array(n, NPY_OBJECT); 1207 | } 1208 | 1209 | 1210 | COREARRAY_DLL_LOCAL PyObject* numpy_new_list(size_t n) 1211 | { 1212 | return new_array(n, NPY_OBJECT); 1213 | } 1214 | 1215 | 1216 | COREARRAY_DLL_LOCAL bool numpy_is_array(PyObject *obj) 1217 | { 1218 | return PyArray_Check(obj) != 0; 1219 | } 1220 | 1221 | COREARRAY_DLL_LOCAL bool numpy_is_array_or_list(PyObject *obj) 1222 | { 1223 | return PyList_Check(obj) || PyArray_Check(obj); 1224 | } 1225 | 1226 | COREARRAY_DLL_LOCAL bool numpy_is_array_int(PyObject *obj) 1227 | { 1228 | if (PyArray_Check(obj)) 1229 | { 1230 | int i = PyArray_TYPE(obj); 1231 | return (i==NPY_INT8 || i==NPY_UINT8 || i==NPY_INT16 || i==NPY_UINT16 || 1232 | i==NPY_INT32 || i==NPY_UINT32 || i==NPY_INT64 || i==NPY_UINT64); 1233 | } else 1234 | return false; 1235 | } 1236 | 1237 | COREARRAY_DLL_LOCAL bool numpy_is_bool(PyObject *obj) 1238 | { 1239 | return (PyArray_Check(obj) != 0) && 
(PyArray_TYPE(obj) == NPY_BOOL); 1240 | } 1241 | 1242 | COREARRAY_DLL_LOCAL bool numpy_is_uint8(PyObject *obj) 1243 | { 1244 | return (PyArray_Check(obj) != 0) && (PyArray_TYPE(obj) == NPY_UINT8); 1245 | } 1246 | 1247 | COREARRAY_DLL_LOCAL bool numpy_is_int(PyObject *obj) 1248 | { 1249 | if (PyArray_Check(obj) != 0) 1250 | { 1251 | int np = PyArray_TYPE(obj); 1252 | return (np==NPY_INT8) || (np==NPY_UINT8) || (np==NPY_INT16) || 1253 | (np==NPY_UINT16) || (np==NPY_INT32) || (np==NPY_UINT32) || 1254 | (np==NPY_INT64) || (np==NPY_UINT64); 1255 | } else 1256 | return false; 1257 | } 1258 | 1259 | COREARRAY_DLL_LOCAL bool numpy_is_string(PyObject *obj) 1260 | { 1261 | return (PyArray_Check(obj) != 0) && (PyArray_TYPE(obj) == NPY_OBJECT); 1262 | } 1263 | 1264 | 1265 | COREARRAY_DLL_LOCAL size_t numpy_size(PyObject *obj) 1266 | { 1267 | return PyArray_SIZE(obj); 1268 | } 1269 | 1270 | COREARRAY_DLL_LOCAL void* numpy_getptr(PyObject *obj) 1271 | { 1272 | if (obj) 1273 | return PyArray_DATA(obj); 1274 | else 1275 | return NULL; 1276 | } 1277 | 1278 | COREARRAY_DLL_LOCAL void numpy_setval(PyObject *obj, void *ptr, PyObject *val) 1279 | { 1280 | PyArray_SETITEM(obj, ptr, val); 1281 | } 1282 | 1283 | 1284 | COREARRAY_DLL_LOCAL void numpy_to_int32(PyObject *obj, vector &out) 1285 | { 1286 | if (PyArray_Check(obj)) 1287 | { 1288 | void *ptr = PyArray_DATA(obj); 1289 | size_t n = PyArray_SIZE(obj); 1290 | out.resize(n); 1291 | int *p = &out[0]; 1292 | switch (PyArray_TYPE(obj)) 1293 | { 1294 | case NPY_INT8: 1295 | for (C_Int8 *s=(C_Int8*)ptr; n > 0; n--) *p++ = *s++; 1296 | return; 1297 | case NPY_UINT8: 1298 | for (C_UInt8 *s=(C_UInt8*)ptr; n > 0; n--) *p++ = *s++; 1299 | return; 1300 | case NPY_INT16: 1301 | for (C_Int16 *s=(C_Int16*)ptr; n > 0; n--) *p++ = *s++; 1302 | return; 1303 | case NPY_UINT16: 1304 | for (C_UInt16 *s=(C_UInt16*)ptr; n > 0; n--) *p++ = *s++; 1305 | return; 1306 | case NPY_INT32: 1307 | for (C_Int32 *s=(C_Int32*)ptr; n > 0; n--) *p++ = *s++; 1308 | 
return; 1309 | case NPY_UINT32: 1310 | for (C_UInt32 *s=(C_UInt32*)ptr; n > 0; n--) *p++ = *s++; 1311 | return; 1312 | case NPY_INT64: 1313 | for (C_Int64 *s=(C_Int64*)ptr; n > 0; n--) *p++ = *s++; 1314 | return; 1315 | case NPY_UINT64: 1316 | for (C_UInt64 *s=(C_UInt64*)ptr; n > 0; n--) *p++ = *s++; 1317 | return; 1318 | } 1319 | } 1320 | throw ErrSeqArray("Fails to convert a numpty object to an integer vector."); 1321 | } 1322 | 1323 | COREARRAY_DLL_LOCAL void numpy_to_string(PyObject *obj, vector &out) 1324 | { 1325 | if (PYSTR_IS(obj)) 1326 | { 1327 | out.resize(1); 1328 | out[0] = PYSTR_CHAR(obj); 1329 | } else if (PyArray_Check(obj)) 1330 | { 1331 | PyObject **p = (PyObject**)PyArray_DATA(obj); 1332 | size_t n = PyArray_SIZE(obj); 1333 | out.resize(n); 1334 | for (size_t i=0; i < n; i++) 1335 | { 1336 | #if (PY_MAJOR_VERSION >= 3) 1337 | out[i] = PyUnicode_AsUTF8(*p++); 1338 | #else 1339 | out[i] = PyString_AsString(*p++); 1340 | #endif 1341 | } 1342 | } else if (PyList_Check(obj)) 1343 | { 1344 | size_t n = PyList_Size(obj); 1345 | out.resize(n); 1346 | for(size_t i=0; i < n; i++) 1347 | { 1348 | PyObject *p = PyList_GetItem(obj, i); 1349 | #if (PY_MAJOR_VERSION >= 3) 1350 | out[i] = PyUnicode_AsUTF8(p); 1351 | #else 1352 | out[i] = PyString_AsString(p); 1353 | #endif 1354 | } 1355 | } else 1356 | throw ErrSeqArray("Fails to convert a list or a numpty object to a string vector."); 1357 | } 1358 | 1359 | } 1360 | --------------------------------------------------------------------------------