├── example ├── target_spiked_simple.vcf.gz ├── 1KG_cftr_background.recode.vep.vcf.gz └── 1KG_cftr_background.recode.vep.vcf.gz.tbi ├── aa_weight.h ├── score_variant.h ├── Makefile ├── Makefile~ ├── background_max_scores.h ├── vvp_lookup.h ├── search_binary_bkgrnd.h ├── vvp_headers.h ├── parse_vcf.h ├── vvp_lookup.c ├── kvec.h ├── README.md ├── sds.h ├── score_variant.c ├── background_max_scores.c ├── search_binary_bkgrnd.c ├── bit_macros.h ├── aa_weight.c ├── score_variants.c ├── parse_vcf.c ├── bit_array.h ├── khash.h └── sds.c /example/target_spiked_simple.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yandell-Lab/VVP-pub/HEAD/example/target_spiked_simple.vcf.gz -------------------------------------------------------------------------------- /example/1KG_cftr_background.recode.vep.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yandell-Lab/VVP-pub/HEAD/example/1KG_cftr_background.recode.vep.vcf.gz -------------------------------------------------------------------------------- /example/1KG_cftr_background.recode.vep.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yandell-Lab/VVP-pub/HEAD/example/1KG_cftr_background.recode.vep.vcf.gz.tbi -------------------------------------------------------------------------------- /aa_weight.h: -------------------------------------------------------------------------------- 1 | // 2 | // aa_weight.h 3 | // VVP_dev_xcode 4 | // 5 | // Created by STEVEN FLYGARE on 10/11/16. 6 | // Copyright © 2016 IDbyDNA. All rights reserved. 
7 | // 8 | 9 | #ifndef aa_weight_h 10 | #define aa_weight_h 11 | 12 | #include "vvp_headers.h" 13 | #include "parse_vcf.h" 14 | 15 | struct aa_matrix { 16 | char aa_change[20]; 17 | float score; 18 | float cons; 19 | float uncons; 20 | UT_hash_handle hh; 21 | }; 22 | 23 | void init_aa_score(); 24 | void get_aaw(struct transcript_anno_info ** ttai, sds ref, sds var, float phast); 25 | 26 | #endif /* aa_weight_h */ 27 | -------------------------------------------------------------------------------- /score_variant.h: -------------------------------------------------------------------------------- 1 | // 2 | // score_variant.h 3 | // vcf_parser 4 | // 5 | // Created by steven on 6/25/15. 6 | // Copyright (c) 2015 yandell lab. All rights reserved. 7 | // 8 | 9 | #ifndef __vcf_parser__score_variant__ 10 | #define __vcf_parser__score_variant__ 11 | 12 | #include "vvp_headers.h" 13 | #include "parse_vcf.h" 14 | 15 | void score_variant_b(struct variant * v, int no_allele_frequency); 16 | void score_variant_t_b(struct variant * v, int nb, int xu, int no_allele_frequency); //nb is with nocalls taken into account, xu is the background allele count 17 | 18 | #endif /* defined(__vcf_parser__score_variant__) */ 19 | 20 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | CFLAGS = -lz -lm -O3 -lgsl -lgslcblas -fopenmp #-Wall 3 | TARGETS = build_background VVP 4 | 5 | all: $(TARGETS) 6 | 7 | build_background: aa_weight.o bit_array.o score_variant.o parse_vcf.o sds.o build_background.o 8 | $(CC) aa_weight.o bit_array.o score_variant.o parse_vcf.o sds.o build_background.o -o $@ $(CFLAGS) 9 | 10 | VVP: aa_weight.o bit_array.o score_variant.o parse_vcf.o sds.o vvp_lookup.o search_binary_bkgrnd.o score_variants.o 11 | $(CC) aa_weight.o bit_array.o score_variant.o parse_vcf.o sds.o vvp_lookup.o search_binary_bkgrnd.o score_variants.o -o $@ $(CFLAGS) 
12 | 13 | .c.o: 14 | $(CC) -c $< $(CFLAGS) 15 | 16 | clean: 17 | rm -f *.o 18 | rm -f $(TARGETS) 19 | 20 | -------------------------------------------------------------------------------- /Makefile~: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | CFLAGS = -lz -lm -O3 -lgsl -lgslcblas -fopenmp #-Wall 3 | TARGETS = build_background VVP 4 | 5 | all: $(TARGETS) 6 | 7 | build_background: aa_weight.o bit_array.o score_variant.o parse_vcf.o sds.o build_background.o 8 | $(CC) $(CFLAGS) aa_weight.o bit_array.o score_variant.o parse_vcf.o sds.o build_background.o -o $@ 9 | 10 | VVP: aa_weight.o bit_array.o score_variant.o parse_vcf.o sds.o vvp_lookup.o search_binary_bkgrnd.o score_variants.o 11 | $(CC) $(CFLAGS) aa_weight.o bit_array.o score_variant.o parse_vcf.o sds.o vvp_lookup.o search_binary_bkgrnd.o score_variants.o -o $@ 12 | 13 | .c.o: 14 | $(CC) $(CFLAGS) -c $< 15 | 16 | clean: 17 | rm -f *.o 18 | rm -f $(TARGETS) 19 | 20 | -------------------------------------------------------------------------------- /background_max_scores.h: -------------------------------------------------------------------------------- 1 | // 2 | // background_max_scores.h 3 | // VVP_C 4 | // 5 | // Created by steven on 8/13/15. 6 | // Copyright (c) 2015 yandell lab. All rights reserved. 
7 | // 8 | 9 | #ifndef __VVP_C__background_max_scores__ 10 | #define __VVP_C__background_max_scores__ 11 | 12 | #include "vvp_headers.h" 13 | 14 | #define BKRND_GENE_NAME_LEN 50 15 | 16 | struct bkgrnd_max_scores { 17 | char gene[BKRND_GENE_NAME_LEN]; 18 | float * max_scores; 19 | UT_hash_handle hh; 20 | }; 21 | 22 | void init_bkgrnd_max(char * bkgrnd_max, int nb, char iht); 23 | 24 | void init_bkgrnd_max_b(char * bkgrnd_max, int nb, char iht); 25 | 26 | struct bkgrnd_max_scores * get_gene_max(char * gene); 27 | 28 | void cleanup_bkgrnd_max(); 29 | 30 | #endif /* defined(__VVP_C__background_max_scores__) */ 31 | 32 | -------------------------------------------------------------------------------- /vvp_lookup.h: -------------------------------------------------------------------------------- 1 | // 2 | // vvp_lookup.h 3 | // VVP_C 4 | // 5 | // Created by steven on 8/12/15. 6 | // Copyright (c) 2015 yandell lab. All rights reserved. 7 | // 8 | 9 | #ifndef __VVP_C__vvp_lookup__ 10 | #define __VVP_C__vvp_lookup__ 11 | 12 | #include "vvp_headers.h" 13 | 14 | #define NPERCENTILES 100 15 | 16 | struct feature_lookups { 17 | char feature_name[FEATURE_NAME_LEN]; 18 | float coding_vals[NPERCENTILES]; 19 | float noncoding_vals[NPERCENTILES]; 20 | int n_coding; 21 | int n_noncoding; 22 | UT_hash_handle hh; 23 | }; 24 | 25 | void load_feature_lookups_b(sds lookup_file); 26 | 27 | int score_lookup(char * feature_name, float score, int coding); 28 | 29 | int score_lookup_b(char * feature_name, float score, int coding); 30 | 31 | void destroy_feature_lookups(); 32 | 33 | 34 | #endif /* defined(__VVP_C__vvp_lookup__) */ 35 | -------------------------------------------------------------------------------- /search_binary_bkgrnd.h: -------------------------------------------------------------------------------- 1 | // 2 | // search_binary_bkgrnd.h 3 | // VVP_C 4 | // 5 | // Created by steven on 8/11/15. 6 | // Copyright (c) 2015 yandell lab. All rights reserved. 
7 | // 8 | 9 | #ifndef __VVP_C__search_binary_bkgrnd__ 10 | #define __VVP_C__search_binary_bkgrnd__ 11 | 12 | #include "vvp_headers.h" 13 | 14 | struct chr_offsets { 15 | char chr[3]; 16 | //char * chr; 17 | uint64_t byte_start; 18 | uint64_t byte_end; 19 | int n_entries; 20 | UT_hash_handle hh; 21 | }; 22 | 23 | struct var_info { 24 | char var_type; 25 | int length; 26 | int nhet; 27 | int nhom; 28 | int nhemi; 29 | int nocall; 30 | uint64_t bit_offset; 31 | }; 32 | 33 | struct m_var_info { 34 | struct var_info ** vi; 35 | int nv; 36 | }; 37 | 38 | unsigned char * load_bin_db(sds file_prefix, int * n_background); 39 | 40 | unsigned char * load_bit_db(sds file_prefix); 41 | 42 | struct chr_offsets * load_offsets(sds file_prefix); 43 | 44 | struct m_var_info * search_binary_bkgrnd(char * chr, size_t pos, unsigned char * mm_bin, struct chr_offsets * chro); 45 | 46 | void destroy_chr_offsets(struct chr_offsets * chro); 47 | 48 | #endif /* defined(__VVP_C__search_binary_bkgrnd__) */ 49 | -------------------------------------------------------------------------------- /vvp_headers.h: -------------------------------------------------------------------------------- 1 | // 2 | // vvp_headers.h 3 | // VVP_C 4 | // 5 | // Created by steven on 8/12/15. 6 | // Copyright (c) 2015 yandell lab. All rights reserved. 
7 | // 8 | 9 | #ifndef VVP_C_vvp_headers_h 10 | #define VVP_C_vvp_headers_h 11 | 12 | #include "sds.h" 13 | #include "uthash.h" 14 | #include "kvec.h" 15 | #include "khash.h" 16 | #include "bit_array.h" 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | struct config { 30 | sds target_vcf; 31 | sds background_vcf; 32 | sds db_prefix; 33 | sds vvp_formatted; 34 | char inheritance_filters; 35 | char penetrance; 36 | int mother_sample_index; 37 | int father_sample_index; 38 | int proband_sample_index; 39 | int sibling_sample_index; 40 | int sibling_affected; 41 | int nb; 42 | int nts; 43 | int nt; 44 | sds anno_tag; 45 | int variant_pos; 46 | int gene_pos; 47 | int aa_pos; 48 | int so_pos; 49 | char iht; 50 | int format_output; 51 | int np; 52 | int only_coding; 53 | int only_snv; 54 | size_t n_permutations; 55 | size_t mat_rows; 56 | }; 57 | 58 | #ifdef _OPENMP 59 | #include 60 | #endif 61 | 62 | #define BUF_SIZE 5000000 63 | #define MAX_SCORES 100000 64 | #define LINE_BYTE_SIZE 35 65 | #define NPERCENTILES 100 66 | #define MAX_VARS 20 67 | #define FEATURE_NAME_LEN 50 68 | #define FEATURE_NAME_LENGTH 50 69 | 70 | #endif 71 | -------------------------------------------------------------------------------- /parse_vcf.h: -------------------------------------------------------------------------------- 1 | // 2 | // parse_vcf.h 3 | // VVP_dev_xcode 4 | // 5 | // Created by STEVEN FLYGARE on 10/10/16. 6 | // Copyright © 2016 IDbyDNA. All rights reserved. 
7 | // 8 | 9 | #ifndef parse_vcf_h 10 | #define parse_vcf_h 11 | 12 | #include "vvp_headers.h" 13 | #include "aa_weight.h" 14 | #include 15 | 16 | struct vep_field_info { 17 | uint8_t gene_index; 18 | uint8_t transcript_index; 19 | uint8_t seq_ontology_tag_index; 20 | uint8_t amino_acid_change_index; 21 | sds annotation_tag_name; 22 | int ll_weight_index; 23 | }; 24 | 25 | struct transcript_anno_info { 26 | char transcript_name[FEATURE_NAME_LENGTH]; 27 | float aaw; 28 | float het_score; 29 | int het_vvp; 30 | float hom_score; 31 | int hom_vvp; 32 | float hemi_score; 33 | int hemi_vvp; 34 | int coding; 35 | float llw; //likelihood weight 36 | sds pref; 37 | sds pvar; 38 | kvec_t(sds) anno_tags; 39 | UT_hash_handle hh; 40 | }; 41 | 42 | struct gene_transcript { 43 | char gene_name[FEATURE_NAME_LENGTH]; 44 | struct transcript_anno_info * tai; 45 | UT_hash_handle hh; 46 | }; 47 | 48 | struct variant { 49 | sds chr; 50 | sds vid; 51 | size_t pos; 52 | sds ref; 53 | sds var; 54 | int indel; 55 | struct gene_transcript * gt; 56 | float phast; 57 | int nref; //total number of ref alleles 58 | int ni; //total number of individuals 59 | int b_nhet; 60 | int b_nhom; 61 | int b_nhemi; 62 | int b_nocall; 63 | uint64_t bit_offset; 64 | sds hemi_indv; //hemizygous individuals (comma separated list) 65 | sds het_indv; //heterozygous indivdiuals (comma separated list) 66 | sds hom_indv; //homozygous individuals (comma separated list) 67 | kvec_t(int) hemi; 68 | kvec_t(int) hets; 69 | kvec_t(int) homs; 70 | kvec_t(int) het_nocalls; //for heterozygous nocalls 71 | kvec_t(int) hom_nocalls; //for homozygous nocalls 72 | kvec_t(int) hemi_nocalls; //for hemizygous nocalls 73 | }; 74 | 75 | 76 | void initialize_parse_vcf(uint8_t gene_index, uint8_t transcript_index, uint8_t seq_ontology_tag_index, uint8_t amino_acid_change_index, sds annotation_tag_name, int ll_weight_index); 77 | 78 | struct variant * parse_vcf_line(sds line, int no_aa_weight); 79 | 80 | struct variant * 
parse_allele_frequency_line(sds line, int no_aa_weight); 81 | 82 | void destroy_variant(struct variant * v); 83 | 84 | void print_variant(struct variant * v); 85 | 86 | #endif /* parse_vcf_h */ 87 | -------------------------------------------------------------------------------- /vvp_lookup.c: -------------------------------------------------------------------------------- 1 | // 2 | // vvp_lookup.c 3 | // VVP_C 4 | // 5 | // Created by steven on 8/12/15. 6 | // Copyright (c) 2015 yandell lab. All rights reserved. 7 | // 8 | 9 | #include "vvp_lookup.h" 10 | 11 | static struct feature_lookups * lookups; 12 | 13 | static unsigned char * mm_dist; 14 | static uint64_t mm_dist_size; 15 | static int dist_line_size; 16 | 17 | void load_feature_lookups_b(sds lookup_file) { 18 | 19 | int fdSrc = open(lookup_file, O_RDWR, 0); 20 | struct stat st; 21 | fstat(fdSrc, &st); 22 | mm_dist = (unsigned char *)mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, fdSrc, 0); 23 | if(mm_dist == MAP_FAILED){ 24 | fprintf(stderr, "FATAL: could not create mmap from %s\n", lookup_file); 25 | exit(1); 26 | } 27 | 28 | mm_dist_size = st.st_size; 29 | 30 | dist_line_size = sizeof(char)*FEATURE_NAME_LEN + sizeof(float)*NPERCENTILES + sizeof(float)*NPERCENTILES + sizeof(size_t) + sizeof(size_t); 31 | 32 | } 33 | 34 | int score_lookup_b(char * feature_name, float score, int coding){ 35 | 36 | int min = 0; 37 | int max = mm_dist_size / dist_line_size; //place pointer at start of final line 38 | float * percentiles = NULL; 39 | while (max >= min) { 40 | //uint mid = min + ((max - min) >> 1); //floor average 41 | int mid = (min + max) / 2; 42 | uint64_t tmp_offset = mid*dist_line_size; 43 | int cmp = strcmp((char *)(mm_dist+tmp_offset), feature_name); 44 | if ( cmp == 0 ) { 45 | if (coding == 1) { 46 | percentiles = (float *)(mm_dist + tmp_offset + sizeof(char)*FEATURE_NAME_LEN); 47 | } 48 | else if(coding == 0) { 49 | percentiles = (float *)(mm_dist + tmp_offset + 
sizeof(char)*FEATURE_NAME_LEN + sizeof(float)*NPERCENTILES); 50 | } 51 | else { 52 | return -1; 53 | } 54 | int i=0; 55 | for (i = 0; i < NPERCENTILES; i++) { 56 | if ( score < percentiles[i] || fabsf(score - percentiles[i]) < .01 ) { //return if within .01 57 | return i; 58 | } 59 | } 60 | return 100; 61 | } 62 | else if (cmp < 0){ 63 | min = mid + 1; 64 | } 65 | else { 66 | max = mid - 1; 67 | } 68 | } 69 | 70 | return -1; 71 | 72 | } 73 | 74 | void destroy_feature_lookups(){ 75 | struct feature_lookups *s, *tmp; 76 | HASH_ITER(hh, lookups, s, tmp){ 77 | HASH_DEL(lookups, s); 78 | free(s); 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /kvec.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008, by Attractive Chaos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | */ 25 | 26 | /* 27 | An example: 28 | 29 | #include "kvec.h" 30 | int main() { 31 | kvec_t(int) array; 32 | kv_init(array); 33 | kv_push(int, array, 10); // append 34 | kv_a(int, array, 20) = 5; // dynamic 35 | kv_A(array, 20) = 4; // static 36 | kv_destroy(array); 37 | return 0; 38 | } 39 | */ 40 | 41 | /* 42 | 2008-09-22 (0.1.0): 43 | 44 | * The initial version. 45 | 46 | */ 47 | 48 | #ifndef AC_KVEC_H 49 | #define AC_KVEC_H 50 | 51 | #include 52 | 53 | #ifdef USE_MALLOC_WRAPPERS 54 | # include "malloc_wrap.h" 55 | #endif 56 | 57 | #define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) 58 | 59 | #define kvec_t(type) struct { size_t n, m; type *a; } 60 | #define kv_init(v) ((v).n = (v).m = 0, (v).a = 0) 61 | #define kv_destroy(v) free((v).a) 62 | #define kv_A(v, i) ((v).a[(i)]) 63 | #define kv_pop(v) ((v).a[--(v).n]) 64 | #define kv_size(v) ((v).n) 65 | #define kv_max(v) ((v).m) 66 | 67 | #define kv_resize(type, v, s) ((v).m = (s), (v).a = (type*)realloc((v).a, sizeof(type) * (v).m)) 68 | 69 | #define kv_copy(type, v1, v0) do { \ 70 | if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \ 71 | (v1).n = (v0).n; \ 72 | memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \ 73 | } while (0) \ 74 | 75 | #define kv_push(type, v, x) do { \ 76 | if ((v).n == (v).m) { \ 77 | (v).m = (v).m? (v).m<<1 : 2; \ 78 | (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ 79 | } \ 80 | (v).a[(v).n++] = (x); \ 81 | } while (0) 82 | 83 | #define kv_pushp(type, v) ((((v).n == (v).m)? \ 84 | ((v).m = ((v).m? (v).m<<1 : 2), \ 85 | (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ 86 | : 0), &(v).a[(v).n++]) 87 | 88 | #define kv_a(type, v, i) (((v).m <= (size_t)(i)? \ 89 | ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \ 90 | (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ 91 | : (v).n <= (size_t)(i)? 
(v).n = (i) + 1 \ 92 | : 0), (v).a[(i)]) 93 | 94 | #endif 95 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VVP 2 | Variant prioritization / burden test. Version 1.5 3 | 4 | ## INSTALL 5 | ### DEPENDENCIES 6 | 7 | 1. GNU Scientific Library (https://www.gnu.org/software/gsl/) 8 | 2. An OpenMP-compatible version of gcc. If your compiler (e.g. clang) is not OpenMP-compatible, you can remove the -fopenmp flag in the Makefile. Change the line that looks like: CFLAGS = -lz -lm -O3 -lgsl -lgslcblas -fopenmp #-Wall to CFLAGS = -lz -lm -O3 -lgsl -lgslcblas #-fopenmp #-Wall 9 | 3. zlib (https://zlib.net) 10 | 4. make 11 | 12 | ### BUILD 13 | 14 | In the VVP directory: 15 | 16 | `make` 17 | 18 | Make will build 2 executables: build_background and VVP 19 | 20 | Note: This has been built and run on Mac laptops and Linux servers. 21 | 22 | ## EXAMPLE RUNNING VVP 23 | 24 | To see available parameters of the executables, run with the -h option. 25 | 26 | Before running VVP, a background must be built. From the VVP directory: 27 | 28 | `cd example` 29 | 30 | `../build_background -i 1KG_cftr_background.recode.vep.vcf.gz -o 1KG.build -b 2500 -v CSQ,4,6,1,15` 31 | 32 | The build_background step produces output to stdout for each of the variants in the background vcf file. It also creates several output files with the extensions .bin, .chr_offsets.txt, and .dist. These files contain information used by VVP. 33 | 34 | To prioritize variants using VVP (in the example folder): 35 | 36 | `../VVP -i target_spiked_simple.vcf.gz -d 1KG.build -v CSQ,4,6,1,15 1> target.spiked.vvp.out` 37 | 38 | target.spiked.vvp.out contains the vvp output. 39 | 40 | ### PREPARE VCF FILE FOR ANALYSIS 41 | 42 | The VVP pipeline does not support multiallelic lines; these must first be decomposed. We recommend using vt decompose to accomplish this task (http://genome.sph.umich.edu/wiki/Vt). 
43 | 44 | **Mandatory** preprocessing of a vcf file includes **multiallelic decomposition and VEP annotation**. It is important to decompose **BEFORE** annotating because of potential annotation collisions. Our recommended steps are to use vt to decompose and normalize variants followed by VEP annotation. No special options in VEP are required for the variant annotation. Testing has been done with VEP v82. 45 | 46 | ## VVP BACKGROUND 47 | A prebuilt background based on gnomAD (http://gnomad.broadinstitute.org/) for use with VVP can be downloaded here (2.5GB): https://s3-us-west-2.amazonaws.com/gnomad-vvp-background/gnomad.062717.build.tar.gz 48 | 49 | ## VVP OUTPUT 50 | VVP outputs a tab-delimited file with 31 columns. The columns are the following: 51 | 52 | |column name|description| 53 | |-----------|-----------| 54 | |chr| chromosome | 55 | |start| variant start coord| 56 | |ref| reference allele| 57 | |var| variant allele | 58 | |gene| gene id | 59 | |transcript| transcript id | 60 | |hemi_score| raw variant score for hemizygous genotype | 61 | |hemi_vvp| vvp score for hemizygous genotype | 62 | |nhemi| number of hemizygous individuals | 63 | |hemi_indvs| list of hemizygous individuals | 64 | |hemi_nocall| number of hemizygous nocalls | 65 | |het_score| raw variant score for heterozygous genotype| 66 | |het_vvp| vvp score for heterozygous genotype | 67 | |nhet| number of heterozygous individuals | 68 | |het_indvs| list of heterozygous individuals | 69 | |het_nocall| number of heterozygous nocalls | 70 | |hom_score| raw variant score for homozygous genotype | 71 | |hom_vvp| vvp score for homozygous genotype| 72 | |nhom| number of homozygous individuals | 73 | |hom_indvs| list of homozygous individuals | 74 | |hom_nocall| number of homozygous nocalls | 75 | |coding_ind| 1 if variant is coding, 0 otherwise | 76 | |indel_ind| 1 if variant is an indel, 0 otherwise | 77 | |aa_score| amino acid weight | 78 | |n_bhemi| number of hemizygous background individuals | 79 | 
|n_bhet| number of heterozygous background individuals | 80 | |n_bhom| number of homozygous background individuals | 81 | |n_bnocall| number of alleles nocalled in background | 82 | |bit_offset| byte offset to background | 83 | |vid| variant id | 84 | |ll_weight| optional extra weight | 85 | 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /sds.h: -------------------------------------------------------------------------------- 1 | /* SDS (Simple Dynamic Strings), A C dynamic strings library. 2 | * 3 | * Copyright (c) 2006-2014, Salvatore Sanfilippo 4 | * All rights reserved. 5 | * 6 | * Redistribution and use in source and binary forms, with or without 7 | * modification, are permitted provided that the following conditions are met: 8 | * 9 | * * Redistributions of source code must retain the above copyright notice, 10 | * this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * * Neither the name of Redis nor the names of its contributors may be used 15 | * to endorse or promote products derived from this software without 16 | * specific prior written permission. 17 | * 18 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 | * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 22 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 | * POSSIBILITY OF SUCH DAMAGE. 29 | */ 30 | 31 | #ifndef __SDS_H 32 | #define __SDS_H 33 | 34 | #define SDS_MAX_PREALLOC (1024*1024) 35 | 36 | #include 37 | #include 38 | 39 | typedef char *sds; 40 | 41 | struct sdshdr { 42 | int len; 43 | int free; 44 | char buf[]; 45 | }; 46 | 47 | static inline size_t sdslen(const sds s) { 48 | struct sdshdr *sh = (void*)(s-sizeof *sh); 49 | return sh->len; 50 | } 51 | 52 | static inline size_t sdsavail(const sds s) { 53 | struct sdshdr *sh = (void*)(s-sizeof *sh); 54 | return sh->free; 55 | } 56 | 57 | sds sdsnewlen(const void *init, size_t initlen); 58 | sds sdsnew(const char *init); 59 | sds sdsempty(void); 60 | size_t sdslen(const sds s); 61 | sds sdsdup(const sds s); 62 | void sdsfree(sds s); 63 | size_t sdsavail(const sds s); 64 | sds sdsgrowzero(sds s, size_t len); 65 | sds sdscatlen(sds s, const void *t, size_t len); 66 | sds sdscat(sds s, const char *t); 67 | sds sdscatsds(sds s, const sds t); 68 | sds sdscpylen(sds s, const char *t, size_t len); 69 | sds sdscpy(sds s, const char *t); 70 | 71 | sds sdscatvprintf(sds s, const char *fmt, va_list ap); 72 | #ifdef __GNUC__ 73 | sds sdscatprintf(sds s, const char *fmt, ...) 
74 | __attribute__((format(printf, 2, 3))); 75 | #else 76 | sds sdscatprintf(sds s, const char *fmt, ...); 77 | #endif 78 | 79 | void sdstrim(sds s, const char *cset); 80 | void sdsrange(sds s, int start, int end); 81 | void sdsupdatelen(sds s); 82 | void sdsclear(sds s); 83 | int sdscmp(const sds s1, const sds s2); 84 | sds *sdssplitlen(const char *s, int len, const char *sep, int seplen, int *count); 85 | void sdsfreesplitres(sds *tokens, int count); 86 | void sdstolower(sds s); 87 | void sdstoupper(sds s); 88 | sds sdsfromlonglong(long long value); 89 | sds sdscatrepr(sds s, const char *p, size_t len); 90 | sds *sdssplitargs(const char *line, int *argc); 91 | sds sdsmapchars(sds s, const char *from, const char *to, size_t setlen); 92 | sds sdsjoin(char **argv, int argc, char *sep, size_t seplen); 93 | sds sdsjoinsds(sds *argv, int argc, const char *sep, size_t seplen); 94 | 95 | /* Low level functions exposed to the user API */ 96 | sds sdsMakeRoomFor(sds s, size_t addlen); 97 | void sdsIncrLen(sds s, int incr); 98 | sds sdsRemoveFreeSpace(sds s); 99 | size_t sdsAllocSize(sds s); 100 | 101 | #endif 102 | -------------------------------------------------------------------------------- /score_variant.c: -------------------------------------------------------------------------------- 1 | // 2 | // score_variant.c 3 | // vcf_parser 4 | // 5 | // Created by steven on 6/25/15. 6 | // Copyright (c) 2015 yandell lab. All rights reserved. 7 | // 8 | 9 | #include "score_variant.h" 10 | 11 | float compute_score(int nb, int nt, int xa, int xu, float aaw, float llw, int no_allele_frequency) { 12 | 13 | //nb = nb - nt >= 0 ? nb - nt : 0; 14 | //xu = xu - xa >= 0 ? 
xu - xa : 0; 15 | 16 | int x = xa + xu; 17 | int n = nb + nt; 18 | 19 | float p = (float)x / (float) n; 20 | 21 | if (p < 1e-10 || (1.0 - p) < 1e-10) { //if p is essentially 0 or 1, the score is 0 22 | return 0.0; 23 | } 24 | 25 | float pu = 0.0; 26 | if (nb > 0) { 27 | pu = (float)xu/(float)nb; 28 | if (pu >= 1.0) { //if everyone in the background has the allele, the score is 0 29 | return 0.0; 30 | } 31 | else if (pu < 1e-10){ 32 | pu = 1e-6; 33 | } 34 | } 35 | if (pu < 1e-10) { //in case everyone in the background is nocalled 36 | pu = 1e-6; 37 | } 38 | 39 | float pa = (float)xa / (float)nt; 40 | if (pa >= 1.0) { 41 | pa = 1.0 - 1e-6; 42 | } 43 | else if (pa <= 1e-10) { //error if pa is 0 or negative -- no affecteds with allele 44 | return -1.0; 45 | } 46 | 47 | double plog = log(p); 48 | /*if (errno == EDOM || errno == ERANGE) { 49 | fprintf(stderr, "log(p) failed, p is %f\n", p); 50 | free(vac); 51 | return -1.0; //return -1 to mean an error 52 | }*/ 53 | 54 | double iplog = log(1.0 - p); 55 | /*if (errno == EDOM || errno == ERANGE) { 56 | fprintf(stderr, "log(1.0 - p) failed, p is %f\n", p); 57 | free(vac); 58 | return -1.0; //return -1 to mean an error 59 | }*/ 60 | 61 | double pulog = log(pu); 62 | /*if (errno == EDOM || errno == ERANGE) { 63 | fprintf(stderr, "log(pu) failed, pu is %f\n", pu); 64 | free(vac); 65 | return -1.0; //return -1 to mean an error 66 | }*/ 67 | 68 | double ipulog = log(1.0 - pu); 69 | /*if (errno == EDOM || errno == ERANGE) { 70 | fprintf(stderr, "log(1.0 - pu) failed, pu is %f\n", pu); 71 | free(vac); 72 | return -1.0; //return -1 to mean an error 73 | }*/ 74 | 75 | double palog = log(pa); 76 | /*if (errno == EDOM || errno == ERANGE) { 77 | fprintf(stderr, "log(pa) failed, pa is %f\n", pa); 78 | free(vac); 79 | return -1.0; //return -1 to mean an error 80 | }*/ 81 | 82 | double ipalog = log(1.0 - pa); 83 | /*if (errno == EDOM || errno == ERANGE) { 84 | fprintf(stderr, "log(1.0 - pa) failed, pa is %f\n", pu); 85 | free(vac); 86 | 
return -1.0; //return -1 to mean an error 87 | }*/ 88 | 89 | double aalog = log(aaw); 90 | /*if (errno == EDOM || errno == ERANGE) { 91 | fprintf(stderr, "log(tv->aaw) failed, tv->aaw is %f\n", tv->aaw); 92 | free(vac); 93 | return -1.0; //return -1 to mean an error 94 | }*/ 95 | 96 | if (llw < 0) { 97 | llw = 1.0; 98 | } 99 | else if (llw <= 1e-10) { 100 | llw = 1e-6; 101 | } 102 | float log_llw = log(llw); 103 | 104 | float numerator = x*plog + (n-x)*iplog; 105 | float denominator = xu*pulog + (nb - xu)*ipulog + xa*palog + (nt - xa)*ipalog; 106 | float diff = no_allele_frequency == 0 ? (numerator - denominator) : 0.0; 107 | float score = -2.0*(log_llw + aalog + diff); 108 | if (score <= 0.0) { 109 | return 0.0; 110 | } 111 | 112 | return score; 113 | 114 | } 115 | 116 | void score_variant_b(struct variant * v, int no_allele_frequency){ 117 | 118 | int nb = v->nref + v->hemi.n + v->hets.n + 2*(v->homs.n); 119 | int xu = nb - v->nref; 120 | 121 | struct gene_transcript * c, * t; 122 | HASH_ITER(hh, v->gt, c, t) { 123 | struct transcript_anno_info * current, * tmp; 124 | HASH_ITER(hh, c->tai, current, tmp) { 125 | current->hemi_score = compute_score(nb, 1, 1, xu, current->aaw, current->llw, no_allele_frequency); 126 | current->het_score = compute_score(nb, 2, 1, xu, current->aaw, current->llw, no_allele_frequency); 127 | current->hom_score = compute_score(nb, 2, 2, xu, current->aaw, current->llw, no_allele_frequency); 128 | } 129 | } 130 | } 131 | 132 | void score_variant_t_b(struct variant * v, int nb, int xu, int no_allele_frequency){ 133 | 134 | struct gene_transcript * c, * t; 135 | HASH_ITER(hh, v->gt, c, t) { 136 | struct transcript_anno_info * current, * tmp; 137 | HASH_ITER(hh, c->tai, current, tmp) { 138 | current->hemi_score = compute_score(nb, 1, 1, xu, current->aaw, current->llw, no_allele_frequency); 139 | current->het_score = compute_score(nb, 2, 1, xu, current->aaw, current->llw, no_allele_frequency); 140 | current->hom_score = compute_score(nb, 2, 2, 
xu, current->aaw, current->llw, no_allele_frequency); 141 | } 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /background_max_scores.c: -------------------------------------------------------------------------------- 1 | // 2 | // background_max_scores.c 3 | // VVP_C 4 | // 5 | // Created by steven on 8/13/15. 6 | // Copyright (c) 2015 yandell lab. All rights reserved. 7 | // 8 | 9 | #include "background_max_scores.h" 10 | 11 | static struct bkgrnd_max_scores * bms; 12 | 13 | void get_indv_scores(struct bkgrnd_max_scores ** tbms, sds indv_scores, char iht){ 14 | 15 | int count = 0; 16 | sds * data = sdssplitlen(indv_scores, (int)sdslen(indv_scores), ";", 1, &count); 17 | int i=0; 18 | int indv = 0; 19 | float hom = 0; 20 | float het1 = 0; 21 | float het2 = 0; 22 | float sum = 0; 23 | for (i=0; i < count; i++) { 24 | sscanf(data[i], "%d:%f,%f,%f,%f", &indv, &hom, &het1, &het2, &sum); 25 | if (iht == 'r') { //recessive 26 | if ((hom > (het1 + het2)) || (het1 <= 0 || het2 <= 0)) { 27 | (*tbms)->max_scores[indv] = hom; 28 | } 29 | else { 30 | (*tbms)->max_scores[indv] = (het1 + het2); 31 | } 32 | } 33 | else if (iht == 'd') { //dominant 34 | if (het1 > het2) { 35 | (*tbms)->max_scores[indv] = het1; 36 | } 37 | else { 38 | (*tbms)->max_scores[indv] = het2; 39 | } 40 | } 41 | else { //no inheritance, choose sum 42 | (*tbms)->max_scores[indv] = sum; 43 | } 44 | } 45 | 46 | sdsfreesplitres(data, count); 47 | } 48 | 49 | void get_indv_scores_b(struct bkgrnd_max_scores ** tbms, float * scores, int nb, char iht){ 50 | 51 | int i=0; 52 | int j=0; 53 | float hom, het1, het2, sum; 54 | for (i=0; i < 4*nb; i+=4) { 55 | hom = scores[i]; 56 | het1 = scores[i+1]; 57 | het2 = scores[i+2]; 58 | sum = scores[i+3]; 59 | 60 | if (iht == 'r' || iht == 'x') { //recessive 61 | if (hom > (het1 + het2) || (het1 <= 0 || het2 <= 0)) { 62 | (*tbms)->max_scores[j] = hom; 63 | } 64 | else { 65 | (*tbms)->max_scores[j] = (het1 + het2); 66 | } 
67 | } 68 | else if (iht == 'd') { //dominant 69 | /*if (hom > het1 && hom > het2) { 70 | (*tbms)->max_scores[j] = hom; 71 | }*/ 72 | if (het1 > het2) { 73 | (*tbms)->max_scores[j] = het1; 74 | } 75 | else { 76 | (*tbms)->max_scores[j] = het2; 77 | } 78 | } 79 | else { //no inheritance, choose sum 80 | (*tbms)->max_scores[j] = sum; 81 | } 82 | j++; 83 | } 84 | } 85 | 86 | 87 | 88 | #define MAX_BUF 1000000 89 | 90 | void init_bkgrnd_max(char * bkgrnd_max, int nb, char iht){ 91 | bms = NULL; 92 | 93 | FILE * max_in = fopen(bkgrnd_max, "r"); 94 | if (! max_in) { 95 | fprintf(stderr, "FATAL: could not open %s for loading\n", bkgrnd_max); 96 | exit(1); 97 | } 98 | 99 | int line_count = 0; 100 | 101 | char * buffer = malloc(sizeof(char)*MAX_BUF); 102 | while ( fgets(buffer, MAX_BUF, max_in) != NULL) { 103 | line_count += 1; 104 | if (line_count % 1000 == 0) { 105 | fprintf(stderr, "%d,", line_count); 106 | } 107 | sds tmpl = sdsnew(buffer); 108 | sdstrim(tmpl, "\n"); 109 | int count = 0; 110 | sds * data = sdssplitlen(tmpl, (int)sdslen(tmpl), "\t", 1, &count); 111 | if (count != 2) { 112 | fprintf(stderr, "WARNING: line in max_score wrong format, will be skipped: %s", tmpl); 113 | sdsfreesplitres(data, count); 114 | sdsfree(tmpl); 115 | continue; 116 | } 117 | 118 | struct bkgrnd_max_scores * tbms = (struct bkgrnd_max_scores *)calloc(1, sizeof(struct bkgrnd_max_scores)); 119 | strcpy(tbms->gene, data[0]); 120 | tbms->max_scores = (float *)calloc(nb, sizeof(float)); 121 | get_indv_scores(&tbms, data[1], iht); 122 | 123 | sdsfree(tmpl); 124 | sdsfreesplitres(data, count); 125 | 126 | HASH_ADD_STR(bms, gene, tbms); 127 | } 128 | free(buffer); 129 | } 130 | 131 | void init_bkgrnd_max_b(char * bkgrnd_max, int nb, char iht){ 132 | bms = NULL; 133 | 134 | FILE * max_in = fopen(bkgrnd_max, "rb"); 135 | if (! 
max_in) { 136 | fprintf(stderr, "FATAL: could not open %s for loading\n", bkgrnd_max); 137 | exit(1); 138 | } 139 | 140 | char feature[BKRND_GENE_NAME_LEN]; 141 | float * scores = (float *)malloc(sizeof(float)*4*nb); 142 | int line_count = 0; 143 | 144 | while ( fread(&feature, sizeof(char), BKRND_GENE_NAME_LEN, max_in) ) { 145 | line_count += 1; 146 | if (line_count % 10000 == 0) { 147 | fprintf(stderr, "%d,", line_count); 148 | } 149 | //read in float data 150 | memset(scores, '\0', sizeof(float)*4*nb); 151 | fread(scores, sizeof(float), 4*nb, max_in); 152 | 153 | struct bkgrnd_max_scores * tbms = (struct bkgrnd_max_scores *)calloc(1, sizeof(struct bkgrnd_max_scores)); 154 | strcpy(tbms->gene, feature); 155 | tbms->max_scores = (float *)calloc(nb, sizeof(float)); 156 | get_indv_scores_b(&tbms, scores, nb, iht); 157 | 158 | /* debug info 159 | if (strcmp(feature, "ENSG00000130283") == 0){ 160 | fprintf(stderr, "\nPRINTING BACKGROUND SCORES\n\n%s\n", feature); 161 | int scores_index = 0; 162 | int i=0; 163 | for (i=0; i < nb; i++) { 164 | if (tbms->max_scores[i] > 0) { 165 | int j = 0; 166 | fprintf(stderr, "\t%d", i); 167 | for (j = 0; j < 4; j++) { 168 | fprintf(stderr, "\t%f", scores[scores_index+j]); 169 | } 170 | fprintf(stderr, "\t%f", tbms->max_scores[i]); 171 | fprintf(stderr, "\n"); 172 | } 173 | scores_index += 4; 174 | } 175 | } 176 | */ 177 | 178 | 179 | 180 | HASH_ADD_STR(bms, gene, tbms); 181 | 182 | } 183 | 184 | free(scores); 185 | 186 | } 187 | 188 | 189 | struct bkgrnd_max_scores * get_gene_max(char * gene){ 190 | 191 | struct bkgrnd_max_scores * t = NULL; 192 | HASH_FIND_STR(bms, gene, t); 193 | return t; 194 | 195 | } 196 | 197 | void cleanup_bkgrnd_max(){ 198 | 199 | struct bkgrnd_max_scores *s, *tmp; 200 | HASH_ITER(hh, bms, s, tmp){ 201 | HASH_DEL(bms, s); 202 | free(s); 203 | } 204 | 205 | } 206 | 207 | 208 | -------------------------------------------------------------------------------- /search_binary_bkgrnd.c: 
-------------------------------------------------------------------------------- 1 | // 2 | // search_binary_bkgrnd.c 3 | // VVP_C 4 | // 5 | // Created by steven on 8/11/15. 6 | // Copyright (c) 2015 yandell lab. All rights reserved. 7 | // 8 | 9 | #include "search_binary_bkgrnd.h" 10 | 11 | static uint64_t mm_size; 12 | 13 | unsigned char * load_bin_db(sds file_prefix, int * n_background){ 14 | 15 | unsigned char * mm_bin = NULL; 16 | 17 | sds bin_file = sdsdup(file_prefix); 18 | bin_file = sdscat(bin_file, ".bin"); 19 | 20 | int fdSrc = open(bin_file, O_RDWR, 0); 21 | struct stat st; 22 | fstat(fdSrc, &st); 23 | 24 | mm_bin = (unsigned char *)mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, fdSrc, 0); 25 | if(mm_bin == MAP_FAILED){ 26 | fprintf(stderr, "FATAL: could not create mmap from %s\n", bin_file); 27 | exit(1); 28 | } 29 | 30 | //load memmap into memory 31 | /* 32 | size_t filesize = st.st_size; 33 | size_t page_size = getpagesize(); 34 | unsigned char * buf[page_size]; 35 | size_t pos=0; 36 | for (pos=0; pos < filesize; pos += page_size) { 37 | size_t this_page_size = filesize - pos; 38 | if (this_page_size > page_size){ 39 | this_page_size = page_size; 40 | } 41 | memcpy(buf, mm_bin + pos, this_page_size); 42 | } 43 | */ 44 | 45 | mm_size = st.st_size; 46 | fprintf(stderr, "MMAP size for .bin: %llu\n\n", mm_size); 47 | 48 | sdsfree(bin_file); 49 | 50 | *n_background = *((int *)mm_bin); 51 | 52 | return mm_bin; 53 | } 54 | 55 | unsigned char * load_bit_db(sds file_prefix){ 56 | unsigned char * mm_bits = NULL; 57 | 58 | sds bit_file = sdsdup(file_prefix); 59 | bit_file = sdscat(bit_file, ".bit"); 60 | 61 | int fdSrc = open(bit_file, O_RDWR, 0); 62 | struct stat st; 63 | fstat(fdSrc, &st); 64 | mm_bits = (unsigned char *)mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, fdSrc, 0); 65 | if(mm_bits == MAP_FAILED){ 66 | fprintf(stderr, "FATAL: could not create mmap from %s\n", bit_file); 67 | exit(1); 68 | } 69 | 70 | sdsfree(bit_file); 71 
| 72 | return mm_bits; 73 | 74 | } 75 | 76 | struct chr_offsets * load_offsets(sds file_prefix){ 77 | 78 | sds offset_file = sdsdup(file_prefix); 79 | offset_file = sdscat(offset_file, ".chr_offsets.txt"); 80 | 81 | FILE * offsets = fopen(offset_file, "r"); 82 | if (! offsets) { 83 | fprintf(stderr, "FATAL: could not open offsets file %s\n", offset_file); 84 | exit(1); 85 | } 86 | 87 | char chr[3]; 88 | memset(chr, '\0', 3); 89 | uint64_t start; 90 | uint64_t end; 91 | int n; 92 | struct chr_offsets * chro = NULL; 93 | 94 | 95 | while (fscanf(offsets, "%s\t%llu\t%llu\t%d", chr, &start, &end, &n) == 4) { 96 | 97 | struct chr_offsets * tc = (struct chr_offsets *)calloc(1, sizeof(struct chr_offsets)); 98 | //tc->chr = (char *)calloc(3, sizeof(char)); 99 | strcpy(tc->chr, chr); 100 | tc->byte_start = start; 101 | tc->byte_end = end; 102 | tc->n_entries = n; 103 | memset(chr, '\0', 3); 104 | 105 | struct chr_offsets * ttc = NULL; 106 | HASH_FIND_STR(chro, tc->chr, ttc); 107 | //HASH_FIND(hh, chro, &tc->chr, 3*sizeof(char), ttc); 108 | if (ttc != NULL) { 109 | fprintf(stderr, "FATAL: chromosome %s already seen in offsets", chr); 110 | exit(1); 111 | } 112 | else { 113 | HASH_ADD_STR(chro, chr, tc); 114 | //HASH_ADD(hh, chro, chr, 3*sizeof(char), tc); 115 | } 116 | 117 | } 118 | 119 | sdsfree(offset_file); 120 | fclose(offsets); 121 | 122 | return chro; 123 | 124 | } 125 | 126 | void get_variants_S(struct m_var_info ** vi, unsigned char * mm_bin, uint64_t mm_offset){ // "_S" means side effects 127 | 128 | (*vi)->vi[(*vi)->nv] = (struct var_info *)malloc(sizeof(struct var_info)); 129 | mm_offset += 4; //skip start position 130 | (*vi)->vi[(*vi)->nv]->var_type = *( (char *)(mm_bin + mm_offset) ); 131 | mm_offset += 1; 132 | (*vi)->vi[(*vi)->nv]->length = *( (int *)(mm_bin + mm_offset) ); 133 | mm_offset += 4; 134 | (*vi)->vi[(*vi)->nv]->nhet = *( (int *)(mm_bin + mm_offset) ); 135 | mm_offset += 4; 136 | (*vi)->vi[(*vi)->nv]->nhom = *( (int *)(mm_bin + mm_offset) ); 137 | 
mm_offset += 4; 138 | (*vi)->vi[(*vi)->nv]->nhemi = *( (int *)(mm_bin + mm_offset) ); 139 | mm_offset += 4; 140 | (*vi)->vi[(*vi)->nv]->nocall = *( (int *)(mm_bin + mm_offset) ); 141 | mm_offset += 4; 142 | (*vi)->vi[(*vi)->nv]->bit_offset = *( (uint64_t *)(mm_bin + mm_offset) ); 143 | (*vi)->nv++; //increment number of variants 144 | } 145 | 146 | 147 | struct m_var_info * search_binary_bkgrnd(char * chr, size_t pos, unsigned char * mm_bin, struct chr_offsets * chro){ 148 | 149 | struct m_var_info * vi = (struct m_var_info *)malloc(sizeof(struct m_var_info *)); 150 | vi->vi = (struct var_info **)malloc(sizeof(struct var_info *)*MAX_VARS); 151 | vi->nv = 0; 152 | 153 | struct chr_offsets * ttc = NULL; 154 | HASH_FIND_STR(chro, chr, ttc); 155 | //HASH_FIND(hh, chro, &chr, 3*sizeof(char), ttc); 156 | if (ttc == NULL) { 157 | fprintf(stderr, "WARNING: chromosome %s not in offsets\n", chr); 158 | return vi; 159 | } 160 | else { 161 | 162 | int min = 0; 163 | int max = ttc->n_entries - 1; //place pointer at start of last entry 164 | while (max >= min) { //binary search 165 | int mid = (min + max) / 2; 166 | uint64_t tmp_offset = ttc->byte_start + mid*LINE_BYTE_SIZE + 2; //+2 because only two chars were written for chromosome placeholder 167 | int * mm_pos = (int *)(mm_bin + tmp_offset); 168 | if ( (*mm_pos) == pos) { 169 | get_variants_S(&vi, mm_bin, tmp_offset); 170 | 171 | //look 'down' from found position 172 | uint64_t d_offset = tmp_offset; 173 | while (d_offset >= LINE_BYTE_SIZE && *( (int *)(mm_bin + (d_offset - LINE_BYTE_SIZE)) ) == pos) { 174 | d_offset -= LINE_BYTE_SIZE; 175 | get_variants_S(&vi, mm_bin, d_offset); 176 | } 177 | 178 | //look 'up' from found position 179 | uint64_t u_offset = tmp_offset + LINE_BYTE_SIZE; 180 | while (u_offset < mm_size && *( (int *)(mm_bin+u_offset) ) == pos) { 181 | get_variants_S(&vi, mm_bin, u_offset); 182 | u_offset += LINE_BYTE_SIZE; 183 | } 184 | 185 | return vi; 186 | 187 | } 188 | else if (*mm_pos < pos) { 189 | min = 
mid + 1; 190 | } 191 | else { 192 | max = mid - 1; 193 | } 194 | } 195 | } 196 | 197 | 198 | return vi; 199 | } 200 | 201 | void destroy_chr_offsets(struct chr_offsets * chro){ 202 | struct chr_offsets *s, *tmp; 203 | HASH_ITER(hh, chro, s, tmp){ 204 | HASH_DEL(chro, s); 205 | free(s); 206 | } 207 | } 208 | 209 | -------------------------------------------------------------------------------- /bit_macros.h: -------------------------------------------------------------------------------- 1 | /* 2 | bit_macros.h 3 | project: bit array C library 4 | url: https://github.com/noporpoise/BitArray/ 5 | author: Isaac Turner 6 | license: Public Domain, no warranty 7 | date: Dec 2013 8 | */ 9 | 10 | #ifndef BITSET_H_ 11 | #define BITSET_H_ 12 | 13 | #include 14 | #include 15 | 16 | // trailing_zeros is number of least significant zeros 17 | // leading_zeros is number of most significant zeros 18 | #if defined(_WIN32) 19 | #define trailing_zeros(x) ({ __typeof(x) _r; _BitScanReverse64(&_r, x); _r; }) 20 | #define leading_zeros(x) ({ __typeof(x) _r; _BitScanForward64(&_r, x); _r; }) 21 | #else 22 | #define trailing_zeros(x) ((x) ? (__typeof(x))__builtin_ctzll(x) : (__typeof(x))sizeof(x)*8) 23 | #define leading_zeros(x) ((x) ? (__typeof(x))__builtin_clzll(x) : (__typeof(x))sizeof(x)*8) 24 | #endif 25 | 26 | // Get index of top set bit. If x is 0 return nbits 27 | #define top_set_bit(x) ((x) ? 
// Get index of top set bit. If x is 0 return nbits
#define top_set_bit(x) ((x) ? sizeof(x)*8-leading_zeros(x)-1 : sizeof(x)*8)

// Number of bytes / 32-bit words / 64-bit words needed to hold `bits` bits
#define roundup_bits2bytes(bits)   (((bits)+7)/8)
#define roundup_bits2words32(bits) (((bits)+31)/32)
#define roundup_bits2words64(bits) (((bits)+63)/64)

// Round a number up to a power of two.
// As written this yields 1 << (index of top set bit + 1), i.e. the next power
// of two strictly above x, even when x already is a power of two.
#define roundup2pow(x) (1UL << (64 - leading_zeros(x)))

// Rotate left by r bits. The right-shift count is masked so that r == 0 does
// not shift by the full width (undefined behaviour); results for
// 0 < r < width are unchanged.
#define rot32(x,r) (((x)<<(r)) | ((x)>>((32-(r))&31)))
#define rot64(x,r) (((x)<<(r)) | ((x)>>((64-(r))&63)))

// need to check for length == 0, undefined behaviour if uint64_t >> 64 etc
#define bitmask(nbits,type) ((nbits) ? ~(type)0 >> (sizeof(type)*8-(nbits)): (type)0)
#define bitmask32(nbits) bitmask(nbits,uint32_t)
#define bitmask64(nbits) bitmask(nbits,uint64_t)

// A possibly faster way to combine two words with a mask:
// takes bits of a where abits is set and bits of b elsewhere,
// equivalent to ((a & abits) | (b & ~abits)).
// Arguments are parenthesised so expression arguments expand correctly.
#define bitmask_merge(a,b,abits) ((b) ^ (((a) ^ (b)) & (abits)))

// Swap lowest four bits. A nibble is 4 bits (i.e. half a byte)
#define rev_nibble(x) ((((x)&1)<<3)|(((x)&2)<<1)|(((x)&4)>>1)|(((x)&8)>>3))

//
// Bit array (bitset)
//
// bitsetX_wrd(): get word for a given position
// bitsetX_idx(): get index within word for a given position
#define _VOLPTR(x) ((volatile __typeof(x) *)(&(x)))
#define _VOLVALUE(x) (*_VOLPTR(x))

// Shift `word` left by `shift`, with operand and result cast to the element type of arr
#define _TYPESHIFT(arr,word,shift) \
        ((__typeof(*(arr)))((__typeof(*(arr)))(word) << (shift)))

#define bitsetX_wrd(wrdbits,pos) ((pos) / (wrdbits))
#define bitsetX_idx(wrdbits,pos) ((pos) % (wrdbits))

#define bitset32_wrd(pos) ((pos) >> 5)
#define bitset32_idx(pos) ((pos) & 31)

#define bitset64_wrd(pos) ((pos) >> 6)
#define bitset64_idx(pos) ((pos) & 63)

//
// Bit functions on arrays
//
#define bitset2_get(arr,wrd,idx)     (((arr)[wrd] >> (idx)) & 0x1)
#define bitset2_set(arr,wrd,idx)     ((arr)[wrd] |=  _TYPESHIFT(arr,1,idx))
#define bitset2_del(arr,wrd,idx)     ((arr)[wrd] &=~ _TYPESHIFT(arr,1,idx))
#define bitset2_tgl(arr,wrd,idx)     ((arr)[wrd] ^=  _TYPESHIFT(arr,1,idx))
#define bitset2_or(arr,wrd,idx,bit)  ((arr)[wrd] |=  _TYPESHIFT(arr,bit,idx))
#define bitset2_xor(arr,wrd,idx,bit) ((arr)[wrd]  = ~((arr)[wrd] ^ (~_TYPESHIFT(arr,bit,idx))))
#define bitset2_and(arr,wrd,idx,bit) ((arr)[wrd] &= (_TYPESHIFT(arr,bit,idx) | ~_TYPESHIFT(arr,1,idx)))
#define bitset2_cpy(arr,wrd,idx,bit) ((arr)[wrd]  = ((arr)[wrd] &~ _TYPESHIFT(arr,1,idx)) | _TYPESHIFT(arr,bit,idx))

//
// Thread safe versions
//
// They return the value of the bit (0 or 1) before it was updated
#define bitset2_get_mt(arr,wrd,idx) bitset2_get(_VOLPTR(*(arr)),wrd,idx)
#define bitset2_set_mt(arr,wrd,idx) ((__sync_fetch_and_or (_VOLPTR((arr)[wrd]),  _TYPESHIFT(arr,1,idx)) >> (idx))&1)
#define bitset2_del_mt(arr,wrd,idx) ((__sync_fetch_and_and(_VOLPTR((arr)[wrd]), ~_TYPESHIFT(arr,1,idx)) >> (idx))&1)
#define bitset2_tgl_mt(arr,wrd,idx) ((__sync_fetch_and_xor(_VOLPTR((arr)[wrd]),  _TYPESHIFT(arr,1,idx)) >> (idx))&1)
#define bitset2_or_mt(arr,wrd,idx,bit)  ((__sync_fetch_and_or (_VOLPTR((arr)[wrd]), _TYPESHIFT(arr,bit,idx)) >> (idx))&1)
#define bitset2_xor_mt(arr,wrd,idx,bit) ((__sync_fetch_and_xor(_VOLPTR((arr)[wrd]), _TYPESHIFT(arr,bit,idx)) >> (idx))&1)
#define bitset2_and_mt(arr,wrd,idx,bit) ((__sync_fetch_and_and(_VOLPTR((arr)[wrd]), (_TYPESHIFT(arr,bit,idx) | ~_TYPESHIFT(arr,1,idx))) >> (idx))&1)
#define bitset2_cpy_mt(arr,wrd,idx,bit) ((bit) ? bitset2_set_mt(arr,wrd,idx) : bitset2_del_mt(arr,wrd,idx))

//
// Auto detect size of type from pointer
//
#define bitset_wrd(arr,pos) bitsetX_wrd(sizeof(*(arr))*8,pos)
#define bitset_idx(arr,pos) bitsetX_idx(sizeof(*(arr))*8,pos)
#define bitset_op(func,arr,pos)      func(arr, bitset_wrd(arr,pos), bitset_idx(arr,pos))
#define bitset_op2(func,arr,pos,bit) func(arr, bitset_wrd(arr,pos), bitset_idx(arr,pos), bit)

// Auto-detect type size: bit functions
#define bitset_get(arr,pos)     bitset_op(bitset2_get, arr, pos)
#define bitset_set(arr,pos)     bitset_op(bitset2_set, arr, pos)
#define bitset_del(arr,pos)     bitset_op(bitset2_del, arr, pos)
#define bitset_tgl(arr,pos)     bitset_op(bitset2_tgl, arr, pos)
#define bitset_or(arr,pos,bit)  bitset_op2(bitset2_or, arr, pos, bit)
#define bitset_xor(arr,pos,bit) bitset_op2(bitset2_xor, arr, pos, bit)
#define bitset_and(arr,pos,bit) bitset_op2(bitset2_and, arr, pos, bit)
#define bitset_cpy(arr,pos,bit) bitset_op2(bitset2_cpy, arr, pos, bit)

// Auto-detect type size: thread safe bit functions
// They return the value of the bit (0 or 1) before it was updated
#define bitset_get_mt(arr,pos)     bitset_op(bitset2_get_mt, arr, pos)
#define bitset_set_mt(arr,pos)     bitset_op(bitset2_set_mt, arr, pos)
#define bitset_del_mt(arr,pos)     bitset_op(bitset2_del_mt, arr, pos)
#define bitset_tgl_mt(arr,pos)     bitset_op(bitset2_tgl_mt, arr, pos)
#define bitset_or_mt(arr,pos,bit)  bitset_op2(bitset2_or_mt, arr, pos, bit)
#define bitset_xor_mt(arr,pos,bit) bitset_op2(bitset2_xor_mt, arr, pos, bit)
#define bitset_and_mt(arr,pos,bit) bitset_op2(bitset2_and_mt, arr, pos, bit)
#define bitset_cpy_mt(arr,pos,bit) bitset_op2(bitset2_cpy_mt, arr, pos, bit)

// Clearing a word does not return a meaningful value
#define bitset_clear_word(arr,pos) ((arr)[bitset_wrd(arr,pos)] = 0)
#define bitset_clear_word_mt(arr,pos) (_VOLVALUE((arr)[bitset_wrd(arr,pos)]) = 0)

//
// Compact bit array of spin locks
// These are most efficient when arr is of type: volatile char*
//
// Acquire a lock
// `wait` is expanded inside the loop that runs while the lock bit is set;
// `abandon` is expanded once after that loop, before the compare-and-swap.
#define bitlock_acquire_block(arr,pos,wait,abandon) do {                       \
    size_t _w = bitset_wrd(arr,pos);                                           \
    __typeof(*(arr)) _o, _n, _b = _TYPESHIFT(arr, 1, bitset_idx(arr,pos));     \
    do {                                                                       \
      while((_o = _VOLVALUE((arr)[_w])) & _b) { wait }                         \
      abandon                                                                  \
      _n = _o | _b;                                                            \
    } while(!__sync_bool_compare_and_swap(_VOLPTR((arr)[_w]), _o, _n));        \
    __sync_synchronize(); /* Must not move commands to before acquiring lock */ \
  } while(0)

// Undefined behaviour if you do not already hold the lock
#define bitlock_release(arr,pos) do {                                          \
    size_t _w = bitset_wrd(arr,pos);                                           \
    __typeof(*(arr)) _mask = ~_TYPESHIFT(arr, 1, bitset_idx(arr,pos));         \
    __sync_synchronize(); /* Must get the lock before releasing it */          \
    __sync_and_and_fetch(_VOLPTR((arr)[_w]), _mask);                           \
  } while(0)

#define bitlock_acquire(arr,pos) bitlock_acquire_block(arr,pos,{},{})

// calls yield if cannot acquire the lock
#define bitlock_yield_acquire(arr,pos) bitlock_acquire_block(arr,pos,sched_yield();,{})
/*
 * Byteswapping
 *
 * byteswap64/32/16 reverse the byte order of a 64/32/16-bit value.
 * The previous builtin branch mapped all three onto __builtin_bswap64, so
 * byteswap32 and byteswap16 returned the swapped bytes in the top of a 64-bit
 * result instead of a 32/16-bit swap.
 */

/* clang uses these to check for features */
#ifndef __has_feature
#define __has_feature(x) 0
#endif

#ifndef __has_builtin
#define __has_builtin(x) 0
#endif

/* GCC versions < 4.3 do not have __builtin_bswapX() */
#if ( defined(__clang__) && !__has_builtin(__builtin_bswap64) ) ||             \
    ( !defined(__clang__) && defined(__GNUC__) && defined(__GNUC_MINOR__) &&   \
      ( (__GNUC__ < 4) || (__GNUC__ == 4 && __GNUC_MINOR__ < 3)) )
  #define byteswap64(x) ( (((uint64_t)(x) << 56))                       | \
                          (((uint64_t)(x) << 40) & 0xff000000000000ULL) | \
                          (((uint64_t)(x) << 24) & 0xff0000000000ULL)   | \
                          (((uint64_t)(x) <<  8) & 0xff00000000ULL)     | \
                          (((uint64_t)(x) >>  8) & 0xff000000ULL)       | \
                          (((uint64_t)(x) >> 24) & 0xff0000ULL)         | \
                          (((uint64_t)(x) >> 40) & 0xff00ULL)           | \
                          (((uint64_t)(x) >> 56)) )

  #define byteswap32(x) ( (((uint32_t)(x) << 24))                       | \
                          (((uint32_t)(x) <<  8) & 0xff0000U)           | \
                          (((uint32_t)(x) >>  8) & 0xff00U)             | \
                          (((uint32_t)(x) >> 24)) )
#else
  #define byteswap64(x) __builtin_bswap64(x)
  #define byteswap32(x) __builtin_bswap32(x)
#endif

/* The 16-bit swap is always done by hand: a 16-bit builtin is not provided by
 * every compiler that has the 32/64-bit ones.
 * uint16_t type might be bigger than 2 bytes, so need to mask */
#define byteswap16(x) ( (uint16_t)( (((uint16_t)(x) & 0xff) << 8) | \
                                    (((uint16_t)(x) >> 8) & 0xff) ) )
| // Created by STEVEN FLYGARE on 10/11/16. 6 | // Copyright © 2016 IDbyDNA. All rights reserved. 7 | // 8 | 9 | #include "aa_weight.h" 10 | 11 | #define NUM_KEYS 231 12 | 13 | #define CONS 1.4121 14 | #define UNCONS 0.3022 15 | 16 | static struct aa_matrix * aam; 17 | 18 | void init_aa_score(){ 19 | 20 | //initialize amino acid substitution scores. based on matrix0.13.log 21 | char * keys[NUM_KEYS] = { "*0X","A0D","A0E","A0G","A0M","A0N","A0P","A0R","A0S","A0T","A0V","C0F","C0G","C0H","C0L","C0R","C0S","C0W","C0Y","D0A","D0E","D0G","D0H","D0N","D0V","D0Y","E0A","E0D","E0G","E0K","E0L","E0N","E0Q","E0V","F0C","F0G","F0I","F0L","F0S","F0V","F0Y","G0A","G0C","G0D","G0E","G0K","G0M","G0R","G0S","G0T","G0V","G0W","H0C","H0D","H0L","H0N","H0P","H0Q","H0R","H0Y","I0F","I0K","I0L","I0M","I0N","I0R","I0S","I0T","I0V","K0A","K0D","K0E","K0I","K0M","K0N","K0Q","K0R","K0S","K0T","L0A","L0F","L0H","L0I","L0M","L0P","L0Q","L0R","L0S","L0V","L0W","M0I","M0K","M0L","M0R","M0T","M0V","N0D","N0E","N0H","N0I","N0K","N0S","N0T","N0Y","P0A","P0F","P0H","P0L","P0Q","P0R","P0S","P0T","Q0E","Q0H","Q0K","Q0L","Q0P","Q0R","Q0W","R0C","R0D","R0G","R0H","R0I","R0K","R0L","R0M","R0P","R0Q","R0S","R0T","R0W","S0A","S0C","S0F","S0G","S0I","S0L","S0N","S0P","S0R","S0T","S0W","S0Y","T0A","T0I","T0K","T0L","T0M","T0N","T0P","T0R","T0S","T0V","V0A","V0D","V0E","V0F","V0G","V0I","V0K","V0L","V0M","V0P","W0C","W0G","W0L","W0Q","W0R","W0S","X0*","Y0C","Y0D","Y0F","Y0H","Y0L","Y0N","Y0Q","Y0R","Y0S","del-12","del-15","del-18","del-21","del-24","del-27","del-3","del-30","del-33","del-36","del-39","del-42","del-45","del-48","del-51","del-54","del-57","del-6","del-60","del-63","del-66","del-69","del-72","del-75","del-78","del-9","ins-12","ins-15","ins-18","ins-21","ins-24","ins-27","ins-3","ins-30","ins-33","ins-36","ins-39","ins-6","ins-9","splice-a1-A","splice-a1-C","splice-a1-T","splice-a2-A","splice-a2-C","splice-a2-G","splice-b1-C","splice-b1-G","splice-b1-T","splice-b2-A","splice-b2-C","splice-b2-T" 
}; 22 | float scores[NUM_KEYS] = { 0.91359891142,1.84077794036,1.01098024651,0.350309375974,0.268054671093,3.32067726877,0.900077768437,3.15582095046,0.514060482919,0.540349794826,0.628795059769,2.09327639657,1.22000954491,5.7047532566,29.2743917115,0.595969916709,1.17699832069,0.986631383625,1.65452817547,0.684044202943,0.284371787196,0.93951194795,2.20525153421,1.4316783812,5.58167027113,3.02524793911,0.708654740721,0.330933515822,0.45467038898,2.40831376054,3.39154538121,16.8976235702,0.320902013172,1.53326882958,3.05351440285,2.03182992701,0.702047948101,0.65576453956,0.97621698662,1.1006862319,0.384406382773,0.400694060346,5.1693641127,2.57718986732,1.58030315672,43.6245837269,3.5884738227,2.45518665108,1.13587458176,3.85256064082,3.07938238072,2.35019764444,1.30309822248,0.633139945951,2.13287359624,0.39439020963,0.569328327198,0.378744562026,0.314781476958,0.716827634337,1.96573425468,2.86052627581,0.426846207364,0.50089173886,2.00103437015,1.89833939426,1.14017104718,0.660969407995,0.124905573926,1.35661815248,5.95459646492,0.501852900841,1.78523873226,0.937057916238,0.865792905897,0.448947142995,0.259542241486,12.7134501147,0.631500094397,19.3465545224,0.544537306052,0.699011507653,0.262519618888,0.309163611304,1.20350937887,1.1077402915,2.35289503476,0.30675297571,0.374405222367,3.04311305937,0.579552164832,3.12521265361,0.252632903491,6.19289193732,0.478336569493,0.298365415566,0.252141042725,9.14323467153,0.627308393818,1.16637157016,1.27824346828,0.525281931208,0.447002583942,1.15928116547,0.329023036095,4.36245837269,0.619017974516,1.104099356,0.952536782294,1.36239714355,0.57534565594,0.756575508116,0.408673148047,0.323726616874,0.488357458773,0.666849903837,1.33327792137,0.19037761025,12.3602987226,1.97731301247,38.3595477599,0.547403664788,1.20604622311,1.12591127563,0.18436317034,2.48919977427,0.27724034518,1.69508363222,1.01542566911,1.27602467553,0.670858341181,2.94010613985,0.0764948863701,0.683120797587,1.95994506599,0.141911296461,0.7815645562
32,1.62328386658,0.176043695858,0.379283579151,0.896810696342,0.193165598642,1.43143165354,1.24343124874,0.172074786805,0.641748865424,0.461683704103,0.841154544451,1.04483261198,0.43243027601,0.438703560296,1.21030361191,0.162096909993,13.905336063,0.182237547843,2.77551618023,2.86412689247,1.91502937479,0.609672706708,0.133753806045,24.7205974453,0.403390502245,0.852928702808,3.07583470517,2.61747502362,2.21746222931,0.764554560163,15.0327957437,0.618329826321,4.1469781362,27.6429358646,2.0572278011,1.60639261377,0.418993177038,0.653887955936,21.1890835245,4.47411178625,49.4411948905,42.9357745102,0.712918388241,11.9298578,4.252337046,9.9527436,10.7235,11.6106,12.4977,3.941480407,13.3848,14.2719,15.159,16.0461,16.9332,17.8203,18.7074,19.5945,20.4816,21.3687,4.033682552,22.2558,23.1429,24.03,24.9171,25.8042,26.6913,27.5784,11.59869346,6.7228767,5.192735791,17.91493848,15.7824,17.7768,19.7712,2.016674044,21.7656,23.76,25.7544,27.7488,5.507683676,15.46012184,19.20532627,7.890694571,27.59342384,13.41412178,8.094265101,7.136069027,10.05920958,13.70509544,7.857813164,7.107669134,15.21498419,11.54494088 }; 23 | float conserved[NUM_KEYS] = { 
2.77861236573,3.69595112473,2.78158289687,0.650172837121,0.268054671093,3.32067726877,2.94366286187,3.15582095046,0.801800546955,0.996750910768,1.17936028122,5.28861213069,3.51612494445,5.7047532566,29.2743917115,1.50011111282,2.6192361698,1.74958858425,4.43830835802,1.57645466633,0.547517775352,1.70012194995,4.05676423293,2.28496774448,10.4578666008,6.36556517102,1.15988760881,0.539812097288,0.781150046362,4.52562300025,3.39154538121,16.8976235702,0.83872418366,4.57995969396,4.24410362144,2.03182992701,1.87862836538,1.44337630759,2.27561120922,2.89599141958,0.598952603607,0.785166678153,11.5248261708,5.07971425289,3.47003887511,43.6245837269,3.5884738227,6.27864605808,2.33286352514,3.85256064082,5.88858772202,4.08457573855,1.30309822248,1.99366375903,4.27453456609,0.793257819723,1.04526264091,0.684796430406,0.765466968588,1.73951873277,2.10305541659,2.86052627581,0.558609035575,0.842946508963,4.18451830221,3.30803100929,5.76840579829,1.34484620972,0.216933893948,1.35661815248,5.95459646492,0.959635405612,11.3263137705,1.79808858652,1.31027456561,0.798991869633,0.372756919483,12.7134501147,1.21079572772,19.3465545224,0.911324995336,1.36707373523,0.420177441363,0.404388114374,3.36859789134,2.23004033896,9.02777667655,0.727321349074,0.822596417388,10.6728140247,1.03529186134,6.20224892768,0.48968359006,11.4929802538,1.39124528104,0.636172501084,0.57484859255,9.14323467153,0.698911371496,2.59927831437,2.55393837991,0.801242646765,0.80015118965,5.28051417656,0.671230287277,4.36245837269,1.40169277902,2.37751396019,2.18064236834,3.3635339451,1.07747697859,1.30535739741,0.677419532467,0.545735892564,0.569376723886,2.09857486554,4.87592776893,0.332295194373,12.3602987226,3.97373539982,38.3595477599,1.76860483365,2.55051363,5.42844359549,0.305771298896,5.46796797706,0.27724034518,4.3981302265,1.73842692187,3.19768032726,1.40173472301,7.35966034138,0.147951304983,1.11108682406,4.66422394496,0.324122205778,1.17254681829,3.10503111267,0.270778652712,0.652075058202,2.0670264358
,0.46743997153,2.79501156481,1.95778903806,0.373368631869,1.1592740375,1.02464492758,0.841154544451,2.06724076149,0.631308798028,1.0054055368,2.75016788198,0.282033600176,13.905336063,0.423548044207,5.8255335952,10.7434504572,5.23293943858,0.780226626075,0.185328912249,24.7205974453,0.6887903473,1.62309356602,3.00364520084,9.1069758573,4.00530975742,0.764554560163,15.0327957437,1.65123745094,17.1436329842,67.3013534151,6.16661163522,3.60134962738,0.658934662783,1.17161469105,21.1890835245,25.9883106502,49.4411948905,42.9357745102,1.55378185295,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 }; 24 | float unconserved[NUM_KEYS] = { 0.390219892002,0.812354782563,0.449286828113,0.183510042772,0.268054671093,3.32067726877,0.290536148385,3.15582095046,0.340439484732,0.275372722663,0.323569561891,0.708975287349,0.422714655564,5.7047532566,29.2743917115,0.22538690233,0.517322307406,0.394804713904,0.532621986266,0.31449593193,0.119266157772,0.388794499587,0.910505376573,0.865863160753,1.87832674331,1.27876960364,0.410204960454,0.166926769783,0.18642367978,1.21735790145,3.39154538121,16.8976235702,0.0911765864685,0.678079059572,1.578662795,2.03182992701,0.210313270251,0.184968941502,0.331568447668,0.353162445208,0.143651750525,0.136036478297,2.69860533082,1.02353162002,0.619292405291,43.6245837269,3.5884738227,0.979433338108,0.473256910936,3.85256064082,1.00364229821,0.948397953033,1.30309822248,0.237783373244,0.92814544074,0.187308458967,0.249322121843,0.136406961359,0.122784637955,0.266861011778,1.83082880332,2.86052627581,0.288037334269,0.165037084637,0.406503002964,1.02117973335,0.295791761021,0.253096076313,0.0582045529163,1.35661815248,5.95459646492,0.249539811041,0.832524786768,0.268632678977,0.418493902095,0.259713536399,0.125625685969,12.7134501147,0.276515832231,19.3465545224,0.210324274926,0.392462633442,0.18726
0114159,0.20657083957,0.408825534425,0.307800775835,0.899060319949,0.133787289883,0.164875388753,0.455947915141,0.245787934668,0.700601807416,0.0966252358557,1.55736324119,0.133534201284,0.088448145875,0.110750572984,9.14323467153,0.561742068702,0.270374767125,0.559169839753,0.247855443989,0.0969398877746,0.257397197433,0.125016641884,4.36245837269,0.27163063577,0.541802447888,0.425975847444,0.617146779204,0.262365993093,0.368982891804,0.205297857359,0.146289369473,0.368624684222,0.125170966658,0.310910943188,0.0913996058811,12.3602987226,0.736741174529,38.3595477599,0.156134012168,0.470085908769,0.328116905075,0.0666672201166,0.707678204804,0.27724034518,0.705475906971,0.425702085516,0.40261165342,0.233143554121,1.04701020896,0.0290437561938,0.335318683611,0.469242492337,0.0590635233874,0.483411059634,0.691042486655,0.0928225665578,0.176188413213,0.353220366567,0.0494178294865,0.585235802956,0.496258152798,0.0820030090174,0.314612477149,0.104240921658,0.841154544451,0.535053673856,0.28516973357,0.18638319452,0.468461345863,0.0663344666187,13.905336063,0.0806901979333,0.894868489532,0.737606912985,0.585102079237,0.413716387905,0.0870871717132,24.7205974453,0.200274410234,0.422973772401,3.2831698866,0.432116314185,0.623759854226,0.764554560163,15.0327957437,0.196710984652,0.494583125784,10.1449016233,0.70516791894,0.432951936813,0.195740296452,0.262507425604,21.1890835245,1.56917173613,49.4411948905,42.9357745102,0.188188039933,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 }; 25 | 26 | aam = NULL; 27 | int i = 0; 28 | for (i = 0; i < NUM_KEYS; i++) { 29 | struct aa_matrix * new_aa = (struct aa_matrix *)malloc(sizeof(struct aa_matrix)); 30 | memset(new_aa->aa_change,'\0',20); 31 | strcpy(new_aa->aa_change, keys[i]); 32 | new_aa->score = scores[i]; 33 | new_aa->cons = conserved[i]; 34 | new_aa->uncons = 
unconserved[i]; 35 | HASH_ADD_STR(aam, aa_change, new_aa); 36 | } 37 | 38 | } 39 | 40 | void get_aaw(struct transcript_anno_info ** ttai, sds ref, sds var, float phast){ 41 | 42 | (*ttai)->aaw = 1.0; //make sure aa is set to 1.0 before finding appropriate weight 43 | //check for splice terms (sequence ontology terms); most severe substitution score is assigned 44 | struct aa_matrix * taa = NULL; 45 | 46 | int i; 47 | for (i=0; i < (*ttai)->anno_tags.n; i++) { 48 | if (strcmp(kv_A((*ttai)->anno_tags, i), "splice_donor_variant") == 0) { 49 | HASH_FIND_STR(aam, "splice-a1-T", taa); 50 | if (taa != NULL) { 51 | (*ttai)->aaw = 1.0 / taa->score; 52 | (*ttai)->coding = 1; 53 | } 54 | return; 55 | } 56 | } 57 | 58 | for (i=0; i < (*ttai)->anno_tags.n; i++) { 59 | if (strcmp(kv_A((*ttai)->anno_tags, i), "splice_acceptor_variant") == 0) { 60 | HASH_FIND_STR(aam, "splice-b2-T", taa); 61 | if (taa != NULL) { 62 | (*ttai)->aaw = 1.0 / taa->score; 63 | (*ttai)->coding = 1; 64 | } 65 | return; 66 | } 67 | } 68 | 69 | if ((*ttai)->pref == NULL) { 70 | return; 71 | } 72 | 73 | //check for non-synonymous change 74 | size_t tmpl_pref = sdslen((*ttai)->pref); 75 | size_t tmpl_pvar = sdslen((*ttai)->pvar); 76 | if (strncmp((*ttai)->pref, (*ttai)->pvar, tmpl_pref < tmpl_pvar ? 
tmpl_pref : tmpl_pvar) != 0 || tmpl_pref != tmpl_pvar) { 77 | 78 | (*ttai)->coding = 1; 79 | int diff = abs((int)sdslen(ref) - (int)sdslen(var)); 80 | 81 | if (diff > 0) { //get aa score for indel 82 | char pchange[20]; 83 | memset(pchange, '\0', 20*sizeof(char)); 84 | if (sdslen(ref) < sdslen(var)) { //insertion 85 | if (diff%3 == 0 && diff <= 39) { 86 | sprintf(pchange, "ins-%d", diff); 87 | } 88 | else { 89 | sprintf(pchange, "ins-39"); 90 | } 91 | HASH_FIND_STR(aam, pchange, taa); 92 | if (taa != NULL) { 93 | (*ttai)->aaw = 1.0 / taa->score; 94 | } 95 | } 96 | else { //deletion 97 | if (diff%3 == 0 && diff <= 78) { 98 | sprintf(pchange, "del-%d", diff); 99 | } 100 | else { 101 | sprintf(pchange, "del-78"); 102 | } 103 | HASH_FIND_STR(aam, pchange, taa); 104 | if (taa != NULL) { 105 | (*ttai)->aaw = 1.0 / taa->score; 106 | } 107 | } 108 | } 109 | 110 | else { //get aa score for snv 111 | if (sdslen((*ttai)->pref) > 1 || sdslen((*ttai)->pvar) > 1) { //skip if amino acid is longer than 1, can happen on multiallelic site 112 | fprintf(stderr, "WARNING: AAW set to 1.0 because annotated protein variant > 1\n"); 113 | return; 114 | } 115 | char pchange[5]; 116 | memset(pchange, '\0', 5*sizeof(char)); 117 | if (strcmp((*ttai)->pref, "*") == 0) { 118 | sprintf(pchange, "*0X"); 119 | } 120 | else if (strcmp((*ttai)->pvar, "*") == 0){ 121 | sprintf(pchange, "X0*"); 122 | } 123 | else { 124 | sprintf(pchange, "%s0%s", (*ttai)->pref, (*ttai)->pvar); 125 | } 126 | HASH_FIND_STR(aam, pchange, taa); 127 | if (taa != NULL) { 128 | if (phast >= 0) { 129 | (*ttai)->aaw = 1.0 / ( ((1.0 - phast) * taa->uncons) + (phast * taa->cons) ); 130 | } 131 | else { 132 | (*ttai)->aaw = 1.0 / taa->score; 133 | } 134 | } 135 | } 136 | 137 | return; 138 | } 139 | 140 | //use phastcons score if synonymous or noncoding 141 | if (phast >= 0) { 142 | (*ttai)->aaw = 1.0 / ( ((1.0 - phast) * UNCONS) + (phast * CONS) ); 143 | return; 144 | } 145 | 146 | } 147 | 
-------------------------------------------------------------------------------- /score_variants.c: -------------------------------------------------------------------------------- 1 | // 2 | // main.c 3 | // vvp_score 4 | // 5 | // Created by steven on 8/11/15. 6 | // Copyright (c) 2015 yandell lab. All rights reserved. 7 | // 8 | 9 | #include "vvp_headers.h" 10 | #include "search_binary_bkgrnd.h" 11 | #include "parse_vcf.h" 12 | #include "vvp_lookup.h" 13 | #include "score_variant.h" 14 | 15 | #define WORK_SIZE 100000 16 | 17 | static sds input_vcf; 18 | static sds db_prefix; 19 | static sds output; 20 | static int ncpus; 21 | static int snv_only; 22 | static int coding_only; 23 | static int no_aa_weights; 24 | static int no_allele_frequency; 25 | static sds anno_tag_name; 26 | static uint8_t gene_index; 27 | static uint8_t transcript_index; 28 | static uint8_t so_tag_index; 29 | static uint8_t aa_index; 30 | static int ll_weight_index; 31 | 32 | static int n_background; 33 | static unsigned char * mm_bin; 34 | static struct chr_offsets * chro; 35 | 36 | void usage(int exit_code) { 37 | fprintf(stderr, "Usage: VVP [options] -i -o \n\n"); 38 | fprintf(stderr, "Options: (*mandatory)\n"); 39 | fprintf(stderr, "* -i filename Input vcf file. Can be zipped or unzipped. 
Can be 'stdin'\n"); 40 | fprintf(stderr, "* -d filename database prefix\n"); 41 | fprintf(stderr, "* -v string string with comma separated annotation components in info field\n"); 42 | fprintf(stderr, " Format: ,,,,\n"); 43 | fprintf(stderr, " Example: CSQ,4,6,1,15\n"); 44 | fprintf(stderr, "-o filename fomatted output file name (for use in burden permutation)\n"); 45 | fprintf(stderr, "-n # Number of threads to use, default = 1\n"); 46 | fprintf(stderr, "-w int Column index (zero based) in annotation tag as extra likelihood weight\n"); 47 | fprintf(stderr, "-x None Set to turn off AA scoring -- all AA weights will be set to 1.0\n"); 48 | fprintf(stderr, "-f None Set to not use allele frequency when scoring (Only AA weights will be used)\n"); 49 | fprintf(stderr, "-l None Set to ignore indels. Default is to score indels\n"); 50 | fprintf(stderr, "-c None Set to ignore non-coding variants. Default is to score non-coding variants.\n\n"); 51 | exit(exit_code); 52 | } 53 | 54 | void parse_command_line(int argc, const char * argv[]) { 55 | int opt; 56 | int sig; 57 | sds * tmp_info; 58 | int tmp_count; 59 | if (argc > 1 && strcmp(argv[1], "-h") == 0) 60 | usage(0); 61 | while ((opt = getopt(argc, argv, "i:d:v:o:n:w:cxlf")) != -1) { 62 | switch (opt) { 63 | case 'i' : 64 | input_vcf = sdsnew(optarg); 65 | break; 66 | case 'd' : 67 | db_prefix = sdsnew(optarg); 68 | break; 69 | case 'v' : 70 | tmp_info = sdssplitlen(optarg, (int)strlen(optarg), ",", 1, &tmp_count); 71 | if (tmp_count != 5) { 72 | fprintf(stderr, "ARGUMENT ERROR:\tmust assign five annotation components, here only %d in %s\n", tmp_count, optarg); 73 | usage(1); 74 | } 75 | anno_tag_name = sdsdup(tmp_info[0]); 76 | gene_index = atoi(tmp_info[1]); 77 | transcript_index = atoi(tmp_info[2]); 78 | so_tag_index = atoi(tmp_info[3]); 79 | aa_index = atoi(tmp_info[4]); 80 | sdsfreesplitres(tmp_info, tmp_count); 81 | break; 82 | case 'w': 83 | sig = atoi(optarg); 84 | if (sig < 0) { 85 | fprintf(stderr, "ARGUMENT 
ERROR:\textra weight index must be >= 0\n"); 86 | usage(1); 87 | } 88 | ll_weight_index = sig; 89 | break; 90 | case 'l' : 91 | snv_only = 1; 92 | break; 93 | case 'x' : 94 | no_aa_weights = 1; 95 | break; 96 | case 'f' : 97 | no_allele_frequency = 1; 98 | break; 99 | case 'c' : 100 | coding_only = 1; 101 | break; 102 | case 'n' : 103 | sig = atoi(optarg); 104 | if (sig < 1){ 105 | fprintf(stderr, "ARGUMENT ERROR:\tnumber of cpus must be set to an integer > 0\n"); 106 | usage(1); 107 | } 108 | ncpus = (int)sig; 109 | break; 110 | case 'o' : 111 | output = sdsnew(optarg); 112 | break; 113 | default: 114 | usage(0); 115 | break; 116 | } 117 | } 118 | 119 | if (input_vcf == NULL || sdslen(input_vcf) < 2) { 120 | fprintf(stderr, "Missing mandatory option -i\n"); 121 | usage(1); 122 | } 123 | if (db_prefix == NULL || sdslen(db_prefix) < 2) { 124 | fprintf(stderr, "Missing mandatory option -d\n"); 125 | usage(1); 126 | } 127 | if (anno_tag_name == NULL || sdslen(anno_tag_name) < 2) { 128 | fprintf(stderr, "Missing mandatory option -v\n"); 129 | usage(1); 130 | } 131 | 132 | } 133 | 134 | struct var_info * check_background_allele(struct m_var_info * bvs, struct variant * v){ 135 | 136 | int i = 0; 137 | for (i = 0; i < bvs->nv; i++) { 138 | int diff = abs((int)sdslen(v->ref) - (int)sdslen(v->var)); 139 | if (diff == 0) { //SNV or MNP 140 | 141 | if (sdslen(v->var) == 1) { //SNV 142 | if (bvs->vi[i]->var_type == v->var[0]) { //check to see if same SNV 143 | return bvs->vi[i]; 144 | } 145 | } 146 | 147 | else if (sdslen(v->var) > 1) { //MNP 148 | if (bvs->vi[i]->length == sdslen(v->var)) { 149 | return bvs->vi[i]; 150 | } 151 | } 152 | } 153 | else { //INDEL 154 | if (bvs->vi[i]->length == diff) { 155 | 156 | if ( ( (int)sdslen(v->ref) > (int)sdslen(v->var) ) && bvs->vi[i]->var_type == 'D') { 157 | return bvs->vi[i]; //deletion 158 | } 159 | else if( ( (int)sdslen(v->ref) < (int)sdslen(v->var) ) && bvs->vi[i]->var_type == 'I' ) { 160 | return bvs->vi[i]; //insertion 161 | } 
162 | } 163 | } 164 | 165 | } 166 | return NULL; 167 | } 168 | 169 | void id_variant_to_string(struct variant * v){ 170 | 171 | size_t i; 172 | size_t n_indv = 0; 173 | if (v->hemi.n > n_indv) { 174 | n_indv = v->hemi.n; 175 | } 176 | if (v->hets.n > n_indv) { 177 | n_indv = v->hets.n; 178 | } 179 | if (v->homs.n > n_indv) { 180 | n_indv = v->homs.n; 181 | } 182 | 183 | 184 | 185 | for (i = 0; i < n_indv; i++) { 186 | if (i < v->hemi.n) { 187 | if (i < (v->hemi.n - 1)) { 188 | v->hemi_indv = sdscatprintf(v->hemi_indv, "%d,", kv_A(v->hemi, i)); 189 | } 190 | else { 191 | v->hemi_indv = sdscatprintf(v->hemi_indv, "%d", kv_A(v->hemi, i)); 192 | } 193 | 194 | } 195 | if (i < v->hets.n) { 196 | if (i < (v->hets.n - 1)) { 197 | v->het_indv = sdscatprintf(v->het_indv, "%d,", kv_A(v->hets, i)); 198 | } 199 | else { 200 | v->het_indv = sdscatprintf(v->het_indv, "%d", kv_A(v->hets, i)); 201 | } 202 | } 203 | if (i < v->homs.n) { 204 | if (i < (v->homs.n - 1)) { 205 | v->hom_indv = sdscatprintf(v->hom_indv, "%d,", kv_A(v->homs, i)); 206 | } 207 | else { 208 | v->hom_indv = sdscatprintf(v->hom_indv, "%d", kv_A(v->homs, i)); 209 | } 210 | } 211 | } 212 | 213 | if(sdslen(v->hemi_indv) < 1){ 214 | v->hemi_indv = sdscat(v->hemi_indv, "."); 215 | } 216 | if(sdslen(v->het_indv) < 1){ 217 | v->het_indv = sdscat(v->het_indv, "."); 218 | } 219 | if(sdslen(v->hom_indv) < 1){ 220 | v->hom_indv = sdscat(v->hom_indv, "."); 221 | } 222 | 223 | } 224 | 225 | 226 | struct variant * parse_score(sds vcf_line){ 227 | 228 | struct variant * v = parse_vcf_line(vcf_line, no_aa_weights); 229 | struct m_var_info * bvs = search_binary_bkgrnd(v->chr, v->pos, mm_bin, chro); 230 | struct var_info * bvi = check_background_allele(bvs, v); 231 | if (bvi != NULL) { 232 | v->b_nhemi = bvi->nhemi; 233 | v->b_nhet = bvi->nhet; 234 | v->b_nhom = bvi->nhom; 235 | v->b_nocall = bvi->nocall; 236 | v->bit_offset = bvi->bit_offset; 237 | score_variant_t_b(v, n_background*2 - bvi->nocall, bvi->nhet + 2*bvi->nhom + 
bvi->nhemi, no_allele_frequency); 238 | } 239 | else { 240 | v->b_nhemi = 0; 241 | v->b_nhet = 0; 242 | v->b_nhom = 0; 243 | v->b_nocall = 0; 244 | v->bit_offset = 0; 245 | score_variant_t_b(v, n_background*2, 0, no_allele_frequency); 246 | } 247 | 248 | id_variant_to_string(v); 249 | 250 | struct gene_transcript * c, * t; 251 | HASH_ITER(hh, v->gt, c, t) { 252 | struct transcript_anno_info * current, * tmp; 253 | HASH_ITER(hh, c->tai, current, tmp) { 254 | 255 | current->hemi_vvp = score_lookup_b(current->transcript_name, current->hemi_score, current->coding); 256 | current->het_vvp = score_lookup_b(current->transcript_name, current->het_score, current->coding); 257 | current->hom_vvp = score_lookup_b(current->transcript_name, current->hom_score, current->coding); 258 | } 259 | } 260 | 261 | size_t i; 262 | for (i=0; i < bvs->nv; i++) { 263 | free(bvs->vi[i]); 264 | } 265 | free(bvs->vi); 266 | free(bvs); 267 | 268 | return v; 269 | } 270 | 271 | void process_vcf_lines(kvec_t(sds) * vcf_lines, struct variant *** variants){ 272 | 273 | size_t n_lines = kv_size(*vcf_lines); 274 | size_t i; 275 | 276 | #pragma omp parallel for schedule(static) 277 | for (i = 0; i < n_lines; i++) { 278 | (*variants)[i] = parse_score(kv_A(*vcf_lines, i)); 279 | } 280 | 281 | } 282 | 283 | /* 284 | int scale_het(int x){ 285 | float b = 0.055; 286 | return (int)100.0*(1.0 / (1.0 + exp(b*(10.0 - x)))); 287 | }*/ 288 | 289 | 290 | int main(int argc, const char ** argv) { 291 | 292 | input_vcf = NULL; 293 | db_prefix = NULL; 294 | anno_tag_name = NULL; 295 | gene_index = 0; 296 | transcript_index = 0; 297 | so_tag_index = 0; 298 | aa_index = 0; 299 | output = NULL; 300 | ncpus = 1; 301 | no_aa_weights = 0; 302 | no_allele_frequency = 0; 303 | snv_only = 0; 304 | coding_only = 0; 305 | n_background = 0; 306 | mm_bin = NULL; 307 | chro = NULL; 308 | ll_weight_index = -1; 309 | 310 | parse_command_line(argc, argv); 311 | 312 | #ifdef _OPENMP 313 | omp_set_num_threads(ncpus); 314 | #endif 315 | 
316 | 317 | FILE * formatted_output = NULL; 318 | 319 | if (output != NULL) { 320 | formatted_output = fopen(output, "w"); 321 | } 322 | 323 | sds dist_output = sdsempty(); 324 | dist_output = sdscatprintf(dist_output, "%s.dist", db_prefix); 325 | load_feature_lookups_b(dist_output); 326 | sdsfree(dist_output); 327 | 328 | mm_bin = load_bin_db(db_prefix, &n_background); //create memory map of background 329 | chro = load_offsets(db_prefix); //load byte offsets in memory map 330 | 331 | initialize_parse_vcf(gene_index, transcript_index, so_tag_index, aa_index, anno_tag_name, ll_weight_index); 332 | 333 | gzFile * gf = NULL; 334 | 335 | if (strcmp(input_vcf, "stdin") == 0){ 336 | gf = gzdopen(STDIN_FILENO, "r"); 337 | } 338 | else { 339 | gf = gzopen(input_vcf, "r"); 340 | } 341 | 342 | if (! gf) { 343 | fprintf(stderr, "FATAL: vcf file %s cannot be read\n", input_vcf); 344 | exit(1); 345 | } 346 | 347 | fprintf(stdout, "#%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n","chr", "start", "ref", "var", "gene", "transcript", "hemi_score", "hemi_vvp", "nhemi", "hemi_indvs", "hemi_nocall", "het_score", "het_vvp", "nhet", "het_indvs", "het_nocall", "hom_score", "hom_vvp", "nhom", "hom_indvs", "hom_nocall", "coding_ind", "indel_ind", "aa_score", "n_bhemi", "n_bhet", "n_bhom", "n_bnocall", "bit_offset", "vid", "ll_weight"); 348 | 349 | char * buffer = (char *)malloc(sizeof(char)*BUF_SIZE); 350 | kvec_t(sds) vcf_lines; 351 | kv_init(vcf_lines); 352 | char * l = gzgets(gf, buffer, BUF_SIZE); 353 | if (buffer[0] != '#') { 354 | kv_push(sds, vcf_lines, sdsnew(buffer)); 355 | } 356 | 357 | while (l != NULL) { 358 | 359 | while (kv_size(vcf_lines) < WORK_SIZE) { 360 | l = gzgets(gf, buffer, BUF_SIZE); 361 | if (l == NULL) { 362 | break; 363 | } 364 | if (buffer[0] != '#') { 365 | kv_push(sds, vcf_lines, sdsnew(buffer)); 366 | } 367 | } 368 | 369 | 370 | if (kv_size(vcf_lines) < 1) { 371 | break; 372 | } 
373 | 374 | size_t i; 375 | size_t n_lines = kv_size(vcf_lines); 376 | struct variant ** variants = (struct variant **)calloc(n_lines, sizeof(struct variant *)); 377 | process_vcf_lines(&vcf_lines, &variants); 378 | 379 | for (i = 0; i < n_lines; i++) { 380 | struct variant * tv = variants[i]; 381 | if (tv != NULL) { 382 | int indel_ind = (int)sdslen(tv->var) - (int)sdslen(tv->ref) == 0 ? 0 : 1; 383 | if ((tv->hemi.n > 0 || tv->hets.n > 0 || tv->homs.n > 0) && (snv_only == 0 || snv_only > indel_ind)) { 384 | struct gene_transcript * c, * t; 385 | HASH_ITER(hh, tv->gt, c, t) { 386 | struct transcript_anno_info * current, * tmp; 387 | HASH_ITER(hh, c->tai, current, tmp) { 388 | if (coding_only <= current->coding) { 389 | fprintf(stdout, "%s\t%zu\t%s\t%s\t%s\t%s\t", tv->chr, tv->pos, tv->ref, tv->var, c->gene_name, current->transcript_name); 390 | fprintf(stdout, "%f\t%d\t%zu\t%s\t%zu\t", current->hemi_score, current->hemi_vvp, tv->hemi.n, tv->hemi_indv, tv->hemi_nocalls.n); 391 | fprintf(stdout, "%f\t%d\t%zu\t%s\t%zu\t", current->het_score, current->het_vvp, tv->hets.n, tv->het_indv, tv->het_nocalls.n); 392 | fprintf(stdout, "%f\t%d\t%zu\t%s\t%zu\t", current->hom_score, current->hom_vvp, tv->homs.n, tv->hom_indv, tv->hom_nocalls.n); 393 | fprintf(stdout, "%d\t%d\t%f\t", current->coding, indel_ind, current->aaw); 394 | fprintf(stdout, "%d\t%d\t%d\t%d\t", tv->b_nhemi, tv->b_nhet, tv->b_nhom, tv->b_nocall); 395 | fprintf(stdout, "%llu\t%s\t%f\n", tv->bit_offset, tv->vid, current->llw); 396 | 397 | if (output != NULL) { 398 | int hemi_ind = (tv->b_nhemi > 0) ? 1 : 0; 399 | int het_ind = (tv->b_nhet > 0) ? 1 : 0; 400 | int hom_ind = (tv->b_nhom > 0) ? 
1 : 0; 401 | fprintf(formatted_output, "%s\t%s\t%f\t%s\t%f\t%s\t%f\t%s\t%d\t%d\t%d\t%llu\t%s\n", tv->chr, current->transcript_name, current->hemi_score, tv->hemi_indv, current->het_score, tv->het_indv, current->hom_score, tv->hom_indv, hemi_ind, het_ind, hom_ind, tv->bit_offset, tv->vid); 402 | } 403 | 404 | } 405 | } 406 | } 407 | } 408 | 409 | destroy_variant(tv); 410 | } 411 | 412 | sdsfree(kv_pop(vcf_lines)); 413 | 414 | } 415 | 416 | } 417 | 418 | kv_destroy(vcf_lines); 419 | free(buffer); 420 | 421 | /*if (output != NULL) { 422 | fprintf(stderr, "\nsorting and prepping formatted output for burden calculations (stdout ready for processing)..."); 423 | //prep formatted output file for burden calculations; only works on unix systems 424 | sds sort_command = sdsnew("sort -k2,2 "); 425 | sort_command = sdscatprintf(sort_command, "%s > %s.sorted", output, output); 426 | system(sort_command); 427 | sdsfree(sort_command); 428 | 429 | sds mv_command = sdsnew("mv "); 430 | mv_command = sdscatprintf(mv_command, "%s.sorted %s", output, output); 431 | system(mv_command); 432 | fprintf(stderr, "done\n"); 433 | sdsfree(mv_command); 434 | }*/ 435 | 436 | 437 | 438 | 439 | return 0; 440 | } 441 | -------------------------------------------------------------------------------- /parse_vcf.c: -------------------------------------------------------------------------------- 1 | // 2 | // parse_vcf.c 3 | // VVP_dev_xcode 4 | // 5 | // Created by STEVEN FLYGARE on 10/10/16. 6 | // Copyright © 2016 IDbyDNA. All rights reserved. 
7 | // 8 | 9 | #include "parse_vcf.h" 10 | 11 | #define ERR_GENOTYPE "ERR GENOTYPE" 12 | #define VCF_PROB "VCF FORMAT PROBLEM" 13 | #define ANNO_PROB "ANNOTATION PROBLEM" 14 | 15 | static struct vep_field_info vfi; 16 | 17 | void initialize_parse_vcf(uint8_t gene_index, uint8_t transcript_index, uint8_t seq_ontology_tag_index, uint8_t amino_acid_change_index, sds annotation_tag_name, int ll_weight_index){ 18 | 19 | init_aa_score(); 20 | 21 | vfi.gene_index = gene_index; 22 | vfi.transcript_index = transcript_index; 23 | vfi.seq_ontology_tag_index = seq_ontology_tag_index; 24 | vfi.amino_acid_change_index = amino_acid_change_index; 25 | vfi.annotation_tag_name = sdsdup(annotation_tag_name); 26 | vfi.ll_weight_index = ll_weight_index; 27 | 28 | sdsfree(annotation_tag_name); 29 | } 30 | 31 | void load_gt_info(struct variant ** v, sds * data, int data_len){ 32 | 33 | int indv_index = 0; 34 | int i; 35 | for (i = 9; i < data_len; i++) { 36 | 37 | int tmp_count = 0; 38 | sds * tmp_data = sdssplitlen(data[i], (int)sdslen(data[i]), ":", 1, &tmp_count); 39 | int gd_count = 0; 40 | sds * genotype_data = NULL; 41 | genotype_data = sdssplitlen(tmp_data[0], (int)sdslen(tmp_data[0]), "|", 1, &gd_count); 42 | if (sdslen(tmp_data[0]) > 1 && gd_count < 2) { //if "|" didn't split, then try "/" 43 | sdsfreesplitres(genotype_data, gd_count); 44 | genotype_data = sdssplitlen(tmp_data[0], (int)sdslen(tmp_data[0]), "/", 1, &gd_count); 45 | } 46 | sdsfreesplitres(tmp_data, tmp_count); 47 | 48 | if (gd_count == 1) { //hemizygous situation 49 | (*v)->ni++; 50 | if (strcmp(genotype_data[0], ".") == 0) { 51 | kv_push(int, (*v)->hemi_nocalls, indv_index); 52 | } 53 | else if (strcmp(genotype_data[0], "0") == 0){ 54 | (*v)->nref+=1; 55 | } 56 | else if (strcmp(genotype_data[0], "1") == 0){ 57 | kv_push(int, (*v)->hemi, indv_index); 58 | } 59 | } 60 | else if (gd_count == 2){ //diploid call 61 | (*v)->ni++; 62 | if (strcmp(genotype_data[0], genotype_data[1]) == 0) { //homozygous call 63 | if 
(strcmp(genotype_data[0], ".") == 0) { 64 | kv_push(int, (*v)->hom_nocalls, indv_index); 65 | } 66 | else if (strcmp(genotype_data[0], "0") == 0){ 67 | (*v)->nref+=2; 68 | } 69 | else if (strcmp(genotype_data[0], "1") == 0){ 70 | kv_push(int, (*v)->homs, indv_index); 71 | } 72 | } 73 | else { //heterozygous call 74 | int j; 75 | for (j = 0; j < 2; j++) { 76 | if (strcmp(genotype_data[j], ".") == 0) { 77 | kv_push(int, (*v)->het_nocalls, indv_index); 78 | } 79 | else if (strcmp(genotype_data[j], "0") == 0){ 80 | (*v)->nref+=1; 81 | } 82 | else if (strcmp(genotype_data[j], "1") == 0){ 83 | kv_push(int, (*v)->hets, indv_index); 84 | } 85 | } 86 | 87 | } 88 | } 89 | else { 90 | fprintf(stderr, "WARNING:\t%s\tgenotype problem\nchr:%s\tpos:%zu\tcol:%d\n", VCF_PROB, (*v)->chr, (*v)->pos, i); 91 | //exit(0); 92 | } 93 | indv_index++; 94 | 95 | sdsfreesplitres(genotype_data, gd_count); 96 | } 97 | } 98 | 99 | void get_bcsq_aa_change(sds aa_tag, struct transcript_anno_info ** ttai){ 100 | 101 | if (sdslen(aa_tag) < 1) { 102 | return; 103 | } 104 | int aas = 0; 105 | sds * aa = sdssplitlen(aa_tag, (int)sdslen(aa_tag), ">", 1, &aas); 106 | if (aas > 1) { 107 | //(*ttai)->pref = sdsdup(aa[0]); 108 | //(*ttai)->pvar = sdsdup(aa[1]); 109 | size_t i; 110 | for (i = 0; i < sdslen(aa[0]); i++) { 111 | if (isdigit(aa[0][i]) == 0) { 112 | (*ttai)->pref = sdsnewlen(aa[0]+i, sdslen(aa[0]) - i); 113 | break; 114 | } 115 | } 116 | 117 | for (i = 0; i < sdslen(aa[1]); i++) { 118 | if (isdigit(aa[1][i]) == 0) { 119 | (*ttai)->pvar = sdsnewlen(aa[1]+i, sdslen(aa[1]) - i); 120 | break; 121 | } 122 | } 123 | } 124 | sdsfreesplitres(aa, aas); 125 | } 126 | 127 | 128 | void get_aa_change(sds aa_tag, struct transcript_anno_info ** ttai){ 129 | 130 | if (sdslen(aa_tag) < 1) { 131 | return; 132 | } 133 | int aas = 0; 134 | sds * aa = sdssplitlen(aa_tag, (int)sdslen(aa_tag), "/", 1, &aas); 135 | if (aas > 1) { 136 | (*ttai)->pref = sdsdup(aa[0]); 137 | (*ttai)->pvar = sdsdup(aa[1]); 138 | } 139 | 
sdsfreesplitres(aa, aas); 140 | } 141 | 142 | 143 | void check_add_gene_transcript_tags(sds gene_name, sds transcript_name, sds annotation_tags, sds aa_tag, float ll_weight, struct variant ** v) { 144 | 145 | struct gene_transcript * tgt = NULL; 146 | HASH_FIND_STR((*v)->gt, gene_name, tgt); //check for and add gene 147 | if (tgt == NULL) { 148 | tgt = (struct gene_transcript *)malloc(sizeof(struct gene_transcript)); 149 | memset(tgt->gene_name, '\0', sizeof(char)*FEATURE_NAME_LENGTH); 150 | if (sdslen(gene_name) < 1) { 151 | strncpy(tgt->gene_name, "NONE", 4); 152 | } 153 | else { 154 | strncpy(tgt->gene_name, gene_name, sdslen(gene_name)); 155 | } 156 | tgt->tai = NULL; 157 | HASH_ADD_STR((*v)->gt, gene_name, tgt); 158 | } 159 | 160 | struct transcript_anno_info * ttai = NULL; 161 | HASH_FIND_STR(tgt->tai, transcript_name, ttai); //check for and add transcript 162 | if (ttai == NULL) { 163 | ttai = (struct transcript_anno_info *)malloc(sizeof(struct transcript_anno_info)); 164 | memset(ttai->transcript_name, '\0', sizeof(char)*FEATURE_NAME_LENGTH); 165 | if (sdslen(transcript_name) < 1) { 166 | strncpy(ttai->transcript_name, "NONE", 4); 167 | } 168 | else { 169 | strncpy(ttai->transcript_name, transcript_name, sdslen(transcript_name)); 170 | } 171 | 172 | ttai->aaw = 1.0; 173 | ttai->llw = ll_weight; 174 | ttai->het_score = -1.0; 175 | ttai->hom_score = -1.0; 176 | ttai->hemi_score = -1.0; 177 | ttai->het_vvp = -1; 178 | ttai->hom_vvp = -1; 179 | ttai->hemi_vvp = -1; 180 | ttai->coding = 0; 181 | ttai->pref = NULL; 182 | ttai->pvar = NULL; 183 | if (strncmp(vfi.annotation_tag_name, "BCSQ", 4) == 0) { 184 | get_bcsq_aa_change(aa_tag, &ttai); 185 | } 186 | else { 187 | get_aa_change(aa_tag, &ttai); //get aa weight 188 | } 189 | 190 | kv_init(ttai->anno_tags); //initialize and add annotation tags 191 | if (annotation_tags != NULL) { 192 | int tags = 0; 193 | sds * anno_tags = sdssplitlen(annotation_tags, (int)sdslen(annotation_tags), "&", 1, &tags); 194 | int i; 195 
| for (i=0; i < tags; i++) { 196 | kv_push(sds, ttai->anno_tags, sdsdup(anno_tags[i])); 197 | } 198 | sdsfreesplitres(anno_tags, tags); 199 | } 200 | 201 | HASH_ADD_STR(tgt->tai, transcript_name, ttai); 202 | } 203 | else { 204 | int placeholder = 1; 205 | //fprintf(stderr, "WARNING:\t%s\nGene:\t%s\nTranscript:\t%s already added. Multiple annotations per transcript not allowed, will only use first. Variant location:\t%s\t%zu\n\n", ANNO_PROB, gene_name, transcript_name, (*v)->chr, (*v)->pos); 206 | } 207 | 208 | } 209 | 210 | void load_annotation_info(sds info_field, struct variant ** v){ 211 | 212 | int count = 0; 213 | sds * info_data = sdssplitlen(info_field, (int)sdslen(info_field), ";", 1, &count); 214 | int i = 0; 215 | for (i = 0; i < count; i++) { 216 | int tag_count = 0; 217 | sds * tag_data = sdssplitlen(info_data[i], (int)sdslen(info_data[i]), "=", 1, &tag_count); 218 | if (tag_count > 1) { 219 | if (strcmp(tag_data[0], vfi.annotation_tag_name) == 0) { 220 | int n_annotations; 221 | sds * annotations = sdssplitlen(tag_data[1], (int)sdslen(tag_data[1]), ",", 1, &n_annotations); 222 | int j; 223 | for (j = 0; j < n_annotations; j++) { 224 | //fprintf(stderr, "%s\t%d\t%s\n", (*v)->vid, j, annotations[j]); 225 | int pieces = 0; 226 | sds * anno_pieces = sdssplitlen(annotations[j], (int)sdslen(annotations[j]), "|", 1, &pieces); 227 | float ll_weight = -1.0; 228 | if (vfi.ll_weight_index >= 0) { 229 | ll_weight = atof(anno_pieces[vfi.ll_weight_index]); 230 | } 231 | //fprintf(stderr, "%d\n", tag_count); 232 | //assert(tag_count > vfi.gene_index && tag_count > vfi.transcript_index && tag_count > vfi.seq_ontology_tag_index); 233 | sds tmp_empty = sdsempty(); 234 | check_add_gene_transcript_tags(pieces > vfi.gene_index ? anno_pieces[vfi.gene_index] : tmp_empty, pieces > vfi.transcript_index ? anno_pieces[vfi.transcript_index] : tmp_empty, pieces > vfi.seq_ontology_tag_index ? 
anno_pieces[vfi.seq_ontology_tag_index] : tmp_empty, pieces > vfi.amino_acid_change_index ? anno_pieces[vfi.amino_acid_change_index] : tmp_empty, ll_weight, v); 235 | sdsfree(tmp_empty); 236 | sdsfreesplitres(anno_pieces, pieces); 237 | } 238 | sdsfreesplitres(annotations, n_annotations); 239 | } 240 | else if (strcmp(tag_data[0], "PHAST") == 0) { 241 | (*v)->phast = atof(tag_data[1]); 242 | } 243 | } 244 | sdsfreesplitres(tag_data, tag_count); 245 | } 246 | sdsfreesplitres(info_data, count); 247 | } 248 | 249 | 250 | struct variant * parse_vcf_line(sds line, int no_aa_weight){ 251 | 252 | //initialize variant struct values as line is parsed 253 | struct variant * v = (struct variant *)malloc(sizeof(struct variant)); 254 | v->chr = NULL; 255 | v->bit_offset = 0; 256 | sdstrim(line, "\n"); 257 | 258 | //first split line by tab 259 | int count = 0; 260 | sds * data = sdssplitlen(line, (int)sdslen(line), "\t", 1, &count); 261 | 262 | if (count < 10) { 263 | fprintf(stderr, "%s:\t vcf line has fewer than 10 columns, will skip:\t%s:%s, %s, %s\n", VCF_PROB, data[0], data[1], data[4], data[7]); 264 | sdsfreesplitres(data, count); 265 | free(v); 266 | return NULL; 267 | } 268 | 269 | v->chr = sdsdup(data[0]); 270 | v->pos = atoll(data[1]); 271 | v->vid = sdsdup(data[2]); 272 | v->ref = sdsdup(data[3]); 273 | 274 | //now find number of alternate alleles, if > 1, then throw error 275 | int num_va = 0; 276 | sds * variant_alleles = sdssplitlen(data[4], (int)sdslen(data[4]), ",", 1, &num_va); 277 | if (num_va > 1) { 278 | fprintf(stderr, "FATAL:\t vcf line has >1 alternate alleles. 
File needs to be decomposed before processing\n"); 279 | fprintf(stderr, "%s", line); 280 | sdsfreesplitres(variant_alleles, num_va); 281 | sdsfreesplitres(data, count); 282 | exit(0); 283 | } 284 | 285 | v->var = sdsdup(variant_alleles[0]); 286 | sdsfreesplitres(variant_alleles, num_va); 287 | //indel indicator 288 | if ((sdslen(v->ref) != sdslen(v->var)) || strcmp(v->ref, "-") == 0 || strcmp(v->var, "-") == 0) { 289 | v->indel = 1; 290 | } 291 | 292 | //load genotype information 293 | v->ni = 0; 294 | v->nref = 0; 295 | kv_init(v->hets); 296 | v->het_indv = sdsempty(); 297 | kv_init(v->homs); 298 | v->hom_indv = sdsempty(); 299 | kv_init(v->hemi); 300 | v->hemi_indv = sdsempty(); 301 | kv_init(v->het_nocalls); 302 | kv_init(v->hom_nocalls); 303 | kv_init(v->hemi_nocalls); 304 | load_gt_info(&v, data, count); 305 | 306 | //load annotation and protein information 307 | v->gt = NULL; 308 | v->phast = -1.0; 309 | load_annotation_info(data[7], &v); 310 | 311 | //add amino acid weights 312 | struct gene_transcript * c, * t; 313 | HASH_ITER(hh, v->gt, c, t) { 314 | struct transcript_anno_info * current, * tmp; 315 | HASH_ITER(hh, c->tai, current, tmp) { 316 | get_aaw(¤t, v->ref, v->var, v->phast); 317 | if (no_aa_weight == 1) { 318 | current->aaw = 1.0; 319 | } 320 | } 321 | } 322 | 323 | sdsfreesplitres(data, count); 324 | 325 | return v; 326 | } 327 | 328 | struct variant * parse_allele_frequency_line(sds line, int no_aa_weight){ 329 | 330 | //initialize variant struct values as line is parsed 331 | struct variant * v = (struct variant *)malloc(sizeof(struct variant)); 332 | v->chr = NULL; 333 | v->bit_offset = 0; 334 | sdstrim(line, "\n"); 335 | 336 | //first split line by tab 337 | int count = 0; 338 | sds * data = sdssplitlen(line, (int)sdslen(line), "\t", 1, &count); 339 | 340 | if (count < 10) { 341 | fprintf(stderr, "%s:\t vcf line has fewer than 10 columns, will skip:\t%s:%s, %s, %s\n", VCF_PROB, data[0], data[1], data[4], data[7]); 342 | sdsfreesplitres(data, 
count); 343 | free(v); 344 | return NULL; 345 | } 346 | 347 | v->chr = sdsdup(data[0]); 348 | v->pos = atoll(data[1]); 349 | v->vid = sdsdup(data[2]); 350 | v->ref = sdsdup(data[3]); 351 | 352 | //now find number of alternate alleles, if > 1, then throw error 353 | int num_va = 0; 354 | sds * variant_alleles = sdssplitlen(data[4], (int)sdslen(data[4]), ",", 1, &num_va); 355 | if (num_va > 1) { 356 | fprintf(stderr, "FATAL:\t vcf line has >1 alternate alleles. File needs to be decomposed before processing\n"); 357 | fprintf(stderr, "%s", line); 358 | sdsfreesplitres(variant_alleles, num_va); 359 | sdsfreesplitres(data, count); 360 | exit(0); 361 | } 362 | 363 | v->var = sdsdup(variant_alleles[0]); 364 | sdsfreesplitres(variant_alleles, num_va); 365 | //indel indicator 366 | if ((sdslen(v->ref) != sdslen(v->var)) || strcmp(v->ref, "-") == 0 || strcmp(v->var, "-") == 0) { 367 | v->indel = 1; 368 | } 369 | 370 | //load genotype information 371 | 372 | v->ni = 0; 373 | v->nref = 0; 374 | kv_init(v->hets); 375 | v->het_indv = sdsempty(); 376 | kv_init(v->homs); 377 | v->hom_indv = sdsempty(); 378 | kv_init(v->hemi); 379 | v->hemi_indv = sdsempty(); 380 | kv_init(v->het_nocalls); 381 | kv_init(v->hom_nocalls); 382 | kv_init(v->hemi_nocalls); 383 | 384 | //load_gt_info(&v, data, count); 385 | int total_called = atoi(data[5]); 386 | int n_alleles_called = atoi(data[6]); 387 | int nocalled_alleles = total_called - n_alleles_called; //number of nocall alleles 388 | int n_alleles_alt = atoi(data[7]); 389 | int n_hom_indvs = atoi(data[8]); 390 | 391 | int n_hemi = 0; 392 | int n_het = 0; 393 | if (strcmp(v->chr, "X") == 0) { 394 | n_hemi = atoi(data[9]); //hemizgyous will be the same as the number of male alleles 395 | int tmp = n_alleles_alt - ((n_hom_indvs * 2) + n_hemi); 396 | n_het = tmp >= 0 ? 
tmp : 0; //whatever is left will be het 397 | } 398 | else { 399 | n_het = n_alleles_alt - (n_hom_indvs *2); //number of heterozygous alleles -- same as het individuals 400 | } 401 | 402 | 403 | int n_hom_alt = n_alleles_alt - (n_het + n_hemi); //number of alleles homozygous for alt allele 404 | 405 | v->nref = n_alleles_called - (n_hom_alt + n_het + n_hemi); 406 | 407 | //generate random numbers for the individual indexes 408 | gsl_rng_env_setup(); 409 | gsl_rng * r = gsl_rng_alloc(gsl_rng_taus); 410 | 411 | int i; 412 | for (i = 0; i < (n_hom_alt / 2); i++) { 413 | int z = (int)gsl_rng_uniform_int(r, (total_called/2)+1); 414 | kv_push(int, v->homs, z); 415 | } 416 | for (i = 0; i < n_het; i++) { 417 | int z = (int)gsl_rng_uniform_int(r, (total_called/2)+1); 418 | kv_push(int, v->hets, z); 419 | } 420 | for (i = 0; i < n_hemi; i++) { 421 | int z = (int)gsl_rng_uniform_int(r, (total_called/2)+1); 422 | kv_push(int, v->hemi, z); 423 | } 424 | for (i = 0; i < nocalled_alleles; i++) { 425 | int z = (int)gsl_rng_uniform_int(r, (total_called/2)+1); 426 | kv_push(int, v->het_nocalls, z); 427 | } 428 | 429 | gsl_rng_free(r); 430 | 431 | //load annotation and protein information 432 | v->gt = NULL; 433 | v->phast = -1.0; 434 | load_annotation_info(data[10], &v); 435 | 436 | //add amino acid weights 437 | struct gene_transcript * c, * t; 438 | HASH_ITER(hh, v->gt, c, t) { 439 | struct transcript_anno_info * current, * tmp; 440 | HASH_ITER(hh, c->tai, current, tmp) { 441 | get_aaw(¤t, v->ref, v->var, v->phast); 442 | if (no_aa_weight == 1) { 443 | current->aaw = 1.0; 444 | } 445 | } 446 | } 447 | 448 | sdsfreesplitres(data, count); 449 | 450 | return v; 451 | 452 | } 453 | 454 | 455 | 456 | void destroy_variant(struct variant * v){ 457 | sdsfree(v->chr); 458 | sdsfree(v->vid); 459 | sdsfree(v->ref); 460 | sdsfree(v->var); 461 | kv_destroy(v->hets); 462 | sdsfree(v->het_indv); 463 | kv_destroy(v->homs); 464 | sdsfree(v->hom_indv); 465 | kv_destroy(v->hemi); 466 | 
sdsfree(v->hemi_indv); 467 | kv_destroy(v->het_nocalls); 468 | kv_destroy(v->hom_nocalls); 469 | kv_destroy(v->hemi_nocalls); 470 | 471 | struct gene_transcript * c, * t; 472 | HASH_ITER(hh, v->gt, c, t) { 473 | struct transcript_anno_info * current, * tmp; 474 | HASH_ITER(hh, c->tai, current, tmp) { 475 | int i; 476 | if(current->pref != NULL) sdsfree(current->pref); 477 | if(current->pvar != NULL) sdsfree(current->pvar); 478 | size_t tmpl = c->tai->anno_tags.n; 479 | for (i=0; i < tmpl; i++) { 480 | sdsfree(kv_pop(c->tai->anno_tags)); 481 | } 482 | kv_destroy(c->tai->anno_tags); 483 | HASH_DEL(c->tai, current); 484 | free(current); 485 | } 486 | HASH_DEL(v->gt, c); 487 | free(c); 488 | } 489 | free(v); 490 | v = NULL; 491 | } 492 | 493 | void print_vec_comma(kvec_t(int) * tmp){ 494 | int i; 495 | for (i = 0; i < (*tmp).n; i++) { 496 | fprintf(stdout, "%d,", kv_A(*tmp, i)); 497 | } 498 | } 499 | 500 | void print_variant(struct variant * v){ 501 | fprintf(stdout, "chr: %s, pos: %zu\n", v->chr, v->pos); 502 | fprintf(stdout, "id: %s\n", v->vid); 503 | fprintf(stdout, "ref: %s, var: %s\n", v->ref, v->var); 504 | fprintf(stdout, "number of individuals: %d\n", v->ni); 505 | fprintf(stdout, "number ref alleles: %d\n", v->nref); 506 | 507 | int i; 508 | if (v->hemi.n > 0) { 509 | fprintf(stdout, "HEMIZYGOUS individuals: %zu\n", v->hemi.n); 510 | //for (i = 0; i < v->hemi.n; i++) fprintf(stdout, "%d,", kv_A(v->hemi, i)); 511 | //fprintf(stdout, "\n\n"); 512 | } 513 | if (v->hets.n > 0) { 514 | fprintf(stdout, "HETEROZYGOUS individuals: %zu\n", v->hets.n); 515 | //for (i = 0; i < v->hets.n; i++) fprintf(stdout, "%d,", kv_A(v->hets, i)); 516 | //fprintf(stdout, "\n\n"); 517 | } 518 | if (v->homs.n > 0) { 519 | fprintf(stdout, "HOMOZYGOUS individuals: %zu\n", v->homs.n); 520 | //for (i = 0; i < v->homs.n; i++) fprintf(stdout, "%d,", kv_A(v->homs, i)); 521 | //fprintf(stdout, "\n\n"); 522 | } 523 | if (v->hemi_nocalls.n > 0) { 524 | fprintf(stdout, "HEMIZYGOUS nocalled 
individuals: %zu\n", v->hemi_nocalls.n); 525 | //for (i = 0; i < v->hemi_nocalls.n; i++) fprintf(stdout, "%d,", kv_A(v->hemi_nocalls, i)); 526 | //fprintf(stdout, "\n\n"); 527 | } 528 | if (v->het_nocalls.n > 0) { 529 | fprintf(stdout, "HETEROZYGOUS nocalled individuals: %zu\n", v->het_nocalls.n); 530 | //for (i = 0; i < v->het_nocalls.n; i++) fprintf(stdout, "%d,", kv_A(v->het_nocalls, i)); 531 | //fprintf(stdout, "\n\n"); 532 | } 533 | if (v->hom_nocalls.n > 0) { 534 | fprintf(stdout, "HOMOZYGOUS nocalled individuals: %zu\n", v->hom_nocalls.n); 535 | //for (i = 0; i < v->hom_nocalls.n; i++) fprintf(stdout, "%d,", kv_A(v->hom_nocalls, i)); 536 | //fprintf(stdout, "\n\n"); 537 | } 538 | 539 | struct gene_transcript * c, * t; 540 | HASH_ITER(hh, v->gt, c, t) { 541 | fprintf(stdout, "GENE: %s\n", c->gene_name); 542 | struct transcript_anno_info * current, * tmp; 543 | HASH_ITER(hh, c->tai, current, tmp) { 544 | fprintf(stdout, "\tTRANSCRIPT: %s\n", current->transcript_name); 545 | fprintf(stdout, "\t\taaw: %f, het_score: %f, hom_score: %f, coding: %d\n", current->aaw, current->het_score, current->hom_score, current->coding); 546 | size_t tmpl = current->anno_tags.n; 547 | for (i = 0; i < tmpl; i++) { 548 | fprintf(stdout, "\t\tSO TAG: %s\n", kv_A(current->anno_tags, i)); 549 | } 550 | } 551 | } 552 | 553 | } 554 | 555 | 556 | -------------------------------------------------------------------------------- /bit_array.h: -------------------------------------------------------------------------------- 1 | /* 2 | bit_array.h 3 | project: bit array C library 4 | url: https://github.com/noporpoise/BitArray/ 5 | maintainer: Isaac Turner 6 | license: Public Domain, no warranty 7 | date: Sep 2014 8 | */ 9 | 10 | #ifndef BIT_ARRAY_HEADER_SEEN 11 | #define BIT_ARRAY_HEADER_SEEN 12 | 13 | #include 14 | #include 15 | 16 | #include "bit_macros.h" 17 | 18 | typedef struct BIT_ARRAY BIT_ARRAY; 19 | 20 | // 64 bit words 21 | typedef uint64_t word_t, word_addr_t, bit_index_t; 22 | 
typedef uint8_t word_offset_t; // Offset within a 64 bit word 23 | 24 | #define BIT_INDEX_MIN 0 25 | #define BIT_INDEX_MAX (~(bit_index_t)0) 26 | 27 | #ifdef __cplusplus 28 | extern "C" { 29 | #endif 30 | 31 | // 32 | // Structs 33 | // 34 | 35 | struct BIT_ARRAY 36 | { 37 | word_t* words; 38 | bit_index_t num_of_bits; 39 | // Number of words used -- this is just round_up(num_of_bits / 64) 40 | // if num_of_bits == 0, this is 0 41 | word_addr_t num_of_words; 42 | // For more efficient allocation we use realloc only to double size -- 43 | // not for adding every word. Initial size is INIT_CAPACITY_WORDS. 44 | word_addr_t capacity_in_words; 45 | }; 46 | 47 | // 48 | // Basics: Constructor, destructor, get length, resize 49 | // 50 | 51 | // Constructor - create a new bit array of length nbits 52 | BIT_ARRAY* bit_array_create(bit_index_t nbits); 53 | 54 | // Destructor - free the memory used for a bit array 55 | void bit_array_free(BIT_ARRAY* bitarray); 56 | 57 | // Allocate using existing struct 58 | BIT_ARRAY* bit_array_alloc(BIT_ARRAY* bitarr, bit_index_t nbits); 59 | void bit_array_dealloc(BIT_ARRAY* bitarr); 60 | 61 | // Get length of bit array 62 | bit_index_t bit_array_length(const BIT_ARRAY* bit_arr); 63 | 64 | // Change the size of a bit array. Enlarging an array will add zeros 65 | // to the end of it. Returns 1 on success, 0 on failure (e.g. 
not enough memory) 66 | char bit_array_resize(BIT_ARRAY* bitarr, bit_index_t new_num_of_bits); 67 | 68 | // If bitarr length < num_bits, resizes to num_bits 69 | char bit_array_ensure_size(BIT_ARRAY* bitarr, bit_index_t ensure_num_of_bits); 70 | 71 | // Same as above but exit with an error message if out of memory 72 | void bit_array_resize_critical(BIT_ARRAY* bitarr, bit_index_t num_of_bits); 73 | void bit_array_ensure_size_critical(BIT_ARRAY* bitarr, bit_index_t num_of_bits); 74 | 75 | 76 | // 77 | // Macros 78 | // 79 | 80 | // 81 | // Get, set, clear, assign and toggle individual bits 82 | // Macros for fast access -- beware: no bounds checking 83 | // 84 | 85 | #define bit_array_get(arr,i) bitset_get((arr)->words, i) 86 | #define bit_array_set(arr,i) bitset_set((arr)->words, i) 87 | #define bit_array_clear(arr,i) bitset_del((arr)->words, i) 88 | #define bit_array_toggle(arr,i) bitset_tgl((arr)->words, i) 89 | // c must be 0 or 1 90 | #define bit_array_assign(arr,i,c) bitset_cpy((arr)->words,i,c) 91 | 92 | // 93 | // Get, set, clear, assign and toggle individual bits 94 | // "Safe": use assert() to check bounds 95 | // 96 | 97 | // Get the value of a bit (returns 0 or 1) 98 | char bit_array_get_bit(const BIT_ARRAY* bitarr, bit_index_t b); 99 | void bit_array_set_bit(BIT_ARRAY* bitarr, bit_index_t b); 100 | void bit_array_clear_bit(BIT_ARRAY* bitarr, bit_index_t b); 101 | void bit_array_toggle_bit(BIT_ARRAY* bitarr, bit_index_t b); 102 | // If char c != 0, set bit; otherwise clear bit 103 | void bit_array_assign_bit(BIT_ARRAY* bitarr, bit_index_t b, char c); 104 | 105 | // 106 | // "Resizing": enlarge array if needed 107 | // 108 | 109 | char bit_array_rget(BIT_ARRAY* bitarr, bit_index_t b); 110 | void bit_array_rset(BIT_ARRAY* bitarr, bit_index_t b); 111 | void bit_array_rclear(BIT_ARRAY* bitarr, bit_index_t b); 112 | void bit_array_rtoggle(BIT_ARRAY* bitarr, bit_index_t b); 113 | void bit_array_rassign(BIT_ARRAY* bitarr, bit_index_t b, char c); 114 | 115 | // 
116 | // Set, clear and toggle several bits at once 117 | // 118 | 119 | // Set multiple bits at once. 120 | // e.g. set bits 1, 20 & 31: bit_array_set_bits(bitarr, 3, 1,20,31); 121 | // Note: variable args are of type unsigned int 122 | void bit_array_set_bits(BIT_ARRAY* bitarr, size_t n, ...); 123 | 124 | // Clear multiple bits at once. 125 | // e.g. clear bits 1, 20 & 31: bit_array_clear_bits(bitarr, 3, 1,20,31); 126 | // Note: variable args are of type unsigned int 127 | void bit_array_clear_bits(BIT_ARRAY* bitarr, size_t n, ...); 128 | 129 | // Toggle multiple bits at once 130 | // e.g. toggle bits 1, 20 & 31: bit_array_toggle_bits(bitarr, 3, 1,20,31); 131 | // Note: variable args are of type unsigned int 132 | void bit_array_toggle_bits(BIT_ARRAY* bitarr, size_t n, ...); 133 | 134 | // 135 | // Set, clear and toggle all bits in a region 136 | // 137 | 138 | // Set all the bits in a region 139 | void bit_array_set_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len); 140 | 141 | // Clear all the bits in a region 142 | void bit_array_clear_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len); 143 | 144 | // Toggle all the bits in a region 145 | void bit_array_toggle_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len); 146 | 147 | // 148 | // Set, clear and toggle all bits at once 149 | // 150 | 151 | // Set all bits in this array to 1 152 | void bit_array_set_all(BIT_ARRAY* bitarr); 153 | 154 | // Set all bits in this array to 0 155 | void bit_array_clear_all(BIT_ARRAY* bitarr); 156 | 157 | // Set all 1 bits to 0, and all 0 bits to 1 158 | void bit_array_toggle_all(BIT_ARRAY* bitarr); 159 | 160 | // 161 | // Get / set a word of a given size 162 | // 163 | 164 | // First bit is in the least significant bit position 165 | // start index must be within the range of the bit array (0 <= x < length) 166 | uint64_t bit_array_get_word64(const BIT_ARRAY* bitarr, bit_index_t start); 167 | uint32_t bit_array_get_word32(const BIT_ARRAY* bitarr, 
bit_index_t start); 168 | uint16_t bit_array_get_word16(const BIT_ARRAY* bitarr, bit_index_t start); 169 | uint8_t bit_array_get_word8(const BIT_ARRAY* bitarr, bit_index_t start); 170 | uint64_t bit_array_get_wordn(const BIT_ARRAY* bitarr, bit_index_t start, int n); 171 | 172 | // Set 64 bits at once from a particular start position 173 | void bit_array_set_word64(BIT_ARRAY* bitarr, bit_index_t start, uint64_t word); 174 | void bit_array_set_word32(BIT_ARRAY* bitarr, bit_index_t start, uint32_t word); 175 | void bit_array_set_word16(BIT_ARRAY* bitarr, bit_index_t start, uint16_t word); 176 | void bit_array_set_word8(BIT_ARRAY* bitarr, bit_index_t start, uint8_t byte); 177 | void bit_array_set_wordn(BIT_ARRAY* bitarr, bit_index_t start, uint64_t word, int n); 178 | 179 | // 180 | // Number of bits set 181 | // 182 | 183 | // Get the number of bits set (hamming weight) 184 | bit_index_t bit_array_num_bits_set(const BIT_ARRAY* bitarr); 185 | 186 | // Get the number of bits not set (length - hamming weight) 187 | bit_index_t bit_array_num_bits_cleared(const BIT_ARRAY* bitarr); 188 | 189 | // Get the number of bits set in one array and not the other. This is equivalent 190 | // to hamming weight of the XOR when the two arrays are the same length. 191 | // e.g.
10101 vs 00111 => hamming distance 2 (XOR is 10010) 192 | bit_index_t bit_array_hamming_distance(const BIT_ARRAY* arr1, 193 | const BIT_ARRAY* arr2); 194 | 195 | // Parity - returns 1 if odd number of bits set, 0 if even 196 | char bit_array_parity(const BIT_ARRAY* bitarr); 197 | 198 | // 199 | // Find indices of set/clear bits 200 | // 201 | 202 | // Find the index of the next bit that is set, at or after `offset` 203 | // Returns 1 if a bit is set, otherwise 0 204 | // Index of next set bit is stored in the integer pointed to by result 205 | // If no next bit is set result is not changed 206 | char bit_array_find_next_set_bit(const BIT_ARRAY* bitarr, bit_index_t offset, 207 | bit_index_t* result); 208 | 209 | // Find the index of the next bit that is NOT set, at or after `offset` 210 | // Returns 1 if a bit is NOT set, otherwise 0 211 | // Index of next zero bit is stored in the integer pointed to by `result` 212 | // If no next bit is zero, value at `result` is not changed 213 | char bit_array_find_next_clear_bit(const BIT_ARRAY* bitarr, bit_index_t offset, 214 | bit_index_t* result); 215 | 216 | // Find the index of the previous bit that is set, before offset. 217 | // Returns 1 if a bit is set, otherwise 0 218 | // Index of previous set bit is stored in the integer pointed to by `result` 219 | // If no previous bit is set result is not changed 220 | char bit_array_find_prev_set_bit(const BIT_ARRAY* bitarr, bit_index_t offset, 221 | bit_index_t* result); 222 | 223 | // Find the index of the previous bit that is NOT set, before offset. 224 | // Returns 1 if a bit is clear, otherwise 0 225 | // Index of previous zero bit is stored in the integer pointed to by `result` 226 | // If no previous bit is zero result is not changed 227 | char bit_array_find_prev_clear_bit(const BIT_ARRAY* bitarr, bit_index_t offset, 228 | bit_index_t* result); 229 | 230 | // Find the index of the first bit that is set. 
231 | // Returns 1 if a bit is set, otherwise 0 232 | // Index of first set bit is stored in the integer pointed to by `result` 233 | // If no bit is set result is not changed 234 | char bit_array_find_first_set_bit(const BIT_ARRAY* bitarr, bit_index_t* result); 235 | 236 | // Find the index of the first bit that is NOT set. 237 | // Returns 1 if a bit is clear, otherwise 0 238 | // Index of first zero bit is stored in the integer pointed to by `result` 239 | // If no bit is zero result is not changed 240 | char bit_array_find_first_clear_bit(const BIT_ARRAY* bitarr, bit_index_t* result); 241 | 242 | // Find the index of the last bit that is set. 243 | // Returns 1 if a bit is set, otherwise 0 244 | // Index of last set bit is stored in the integer pointed to by `result` 245 | // If no bit is set result is not changed 246 | char bit_array_find_last_set_bit(const BIT_ARRAY* bitarr, bit_index_t* result); 247 | 248 | // Find the index of the last bit that is NOT set. 249 | // Returns 1 if a bit is clear, otherwise 0 250 | // Index of last zero bit is stored in the integer pointed to by `result` 251 | // If no bit is zero result is not changed 252 | char bit_array_find_last_clear_bit(const BIT_ARRAY* bitarr, bit_index_t* result); 253 | 254 | 255 | // 256 | // Sorting 257 | // 258 | 259 | // Put all the 0s before all the 1s 260 | void bit_array_sort_bits(BIT_ARRAY* bitarr); 261 | 262 | // Put all the 1s before all the 0s 263 | void bit_array_sort_bits_rev(BIT_ARRAY* bitarr); 264 | 265 | 266 | // 267 | // String and printing methods 268 | // 269 | 270 | // Construct a BIT_ARRAY from a string. 271 | void bit_array_from_str(BIT_ARRAY* bitarr, const char* bitstr); 272 | 273 | // Construct a BIT_ARRAY from a substring with given on and off characters. 274 | void bit_array_from_substr(BIT_ARRAY* bitarr, bit_index_t offset, 275 | const char* str, size_t len, 276 | const char *on, const char *off, char left_to_right); 277 | 278 | // Takes a char array to write to. 
`str` must be bitarr->num_of_bits+1 in 279 | // length. Terminates string with '\0' 280 | char* bit_array_to_str(const BIT_ARRAY* bitarr, char* str); 281 | char* bit_array_to_str_rev(const BIT_ARRAY* bitarr, char* str); 282 | 283 | // Get a string representations for a given region, using given on/off 284 | // characters. 285 | // Note: does not null-terminate 286 | void bit_array_to_substr(const BIT_ARRAY* bitarr, 287 | bit_index_t start, bit_index_t length, 288 | char* str, char on, char off, char left_to_right); 289 | 290 | // Print this array to a file stream. Prints '0's and '1'. Doesn't print 291 | // newline. 292 | void bit_array_print(const BIT_ARRAY* bitarr, FILE* fout); 293 | 294 | // Print a string representations for a given region, using given on/off 295 | // characters. Reverse prints from highest to lowest -- this is useful for 296 | // printing binary numbers 297 | void bit_array_print_substr(const BIT_ARRAY* bitarr, 298 | bit_index_t start, bit_index_t length, 299 | FILE* fout, char on, char off, char left_to_right); 300 | 301 | // 302 | // Decimal 303 | // 304 | 305 | // Get bit array as decimal str (e.g. 
0b1101 -> "13") 306 | size_t bit_array_to_decimal(const BIT_ARRAY *bitarr, char *str, size_t len); 307 | 308 | // Return number of characters used 309 | size_t bit_array_from_decimal(BIT_ARRAY *bitarr, const char* decimal); 310 | 311 | // 312 | // Hexadecimal 313 | // 314 | 315 | // Loads array from hex string 316 | // Returns the number of bits loaded (will be chars rounded up to multiple of 8) 317 | // (0 on failure) 318 | bit_index_t bit_array_from_hex(BIT_ARRAY* bitarr, bit_index_t offset, 319 | const char* str, size_t len); 320 | 321 | // Returns number of characters written 322 | size_t bit_array_to_hex(const BIT_ARRAY* bitarr, 323 | bit_index_t start, bit_index_t length, 324 | char* str, char uppercase); 325 | 326 | // Print bit array as hex 327 | size_t bit_array_print_hex(const BIT_ARRAY* bitarr, 328 | bit_index_t start, bit_index_t length, 329 | FILE* fout, char uppercase); 330 | 331 | // 332 | // Clone and copy 333 | // 334 | 335 | // Copy a BIT_ARRAY struct and the data it holds - returns pointer to new object 336 | #define bit_array_dup bit_array_clone 337 | BIT_ARRAY* bit_array_clone(const BIT_ARRAY* bitarr); 338 | 339 | // Copy bits from one array to another 340 | // Note: use MACRO bit_array_copy 341 | // Destination and source can be the same bit_array and 342 | // src/dst regions can overlap 343 | void bit_array_copy(BIT_ARRAY* dst, bit_index_t dstindx, 344 | const BIT_ARRAY* src, bit_index_t srcindx, 345 | bit_index_t length); 346 | 347 | // copy all of src to dst. dst is resized to match src.
348 | void bit_array_copy_all(BIT_ARRAY* dst, const BIT_ARRAY* src); 349 | 350 | // 351 | // Logic operators 352 | // 353 | 354 | // BIT_ARRAYs can all be different or the same object 355 | // dest array will be resized if it is too short 356 | // 357 | void bit_array_and(BIT_ARRAY* dest, const BIT_ARRAY* src1, const BIT_ARRAY* src2); 358 | void bit_array_or (BIT_ARRAY* dest, const BIT_ARRAY* src1, const BIT_ARRAY* src2); 359 | void bit_array_xor(BIT_ARRAY* dest, const BIT_ARRAY* src1, const BIT_ARRAY* src2); 360 | void bit_array_not(BIT_ARRAY* dest, const BIT_ARRAY* src); 361 | 362 | // 363 | // Comparisons 364 | // 365 | 366 | // Note: (bit_array_cmp(a,b) == 0) <=> (bit_array_cmp_big_endian(a,b) == 0) 367 | 368 | // comparison functions return: 369 | // 1 iff bitarr1 > bitarr2 370 | // 0 iff bitarr1 == bitarr2 371 | // -1 iff bitarr1 < bitarr2 372 | 373 | // Compare two bit arrays by value stored, with index 0 being the Least 374 | // Significant Bit (LSB). Arrays do not have to be the same length. 375 | // Example: ..0101 (5) > ...0011 (3) [index 0 is LSB at right hand side] 376 | int bit_array_cmp(const BIT_ARRAY* bitarr1, const BIT_ARRAY* bitarr2); 377 | 378 | // Compare two bit arrays by value stored, with index 0 being the Most 379 | // Significant Bit (MSB). Arrays do not have to be the same length. 380 | // Example: 10.. > 01.. [index 0 is MSB at left hand side] 381 | int bit_array_cmp_big_endian(const BIT_ARRAY* bitarr1, const BIT_ARRAY* bitarr2); 382 | 383 | // compare bitarr with (bitarr2 << pos) 384 | int bit_array_cmp_words(const BIT_ARRAY *bitarr, 385 | bit_index_t pos, const BIT_ARRAY *bitarr2); 386 | 387 | // 388 | // Shift, interleave, reverse 389 | // 390 | 391 | // Shift array left/right. 
If fill is zero, filled with 0, otherwise 1 392 | void bit_array_shift_right(BIT_ARRAY* bitarr, bit_index_t shift_dist, char fill); 393 | void bit_array_shift_left (BIT_ARRAY* bitarr, bit_index_t shift_dist, char fill); 394 | 395 | // shift left without losing any bits. Resizes bitarr. 396 | void bit_array_shift_left_extend(BIT_ARRAY* bitarr, bit_index_t shift_dist, 397 | char fill); 398 | 399 | // Cyclic shift 400 | void bit_array_cycle_right(BIT_ARRAY* bitarr, bit_index_t dist); 401 | void bit_array_cycle_left (BIT_ARRAY* bitarr, bit_index_t dist); 402 | 403 | // Interleave 404 | // dst cannot point to the same bit array as src1 or src2 405 | // src1, src2 may point to the same bit array 406 | // abcd 1234 -> a1b2c3d4 407 | // 0011 0000 -> 00001010 408 | // 1111 0000 -> 10101010 409 | // 0101 1010 -> 01100110 410 | // Extends dst if it is too short, but does not shrink it if it is too long 411 | // if dst is longer than length(src1)+length(src2), the end bits are not altered 412 | void bit_array_interleave(BIT_ARRAY* dst, 413 | const BIT_ARRAY* src1, 414 | const BIT_ARRAY* src2); 415 | 416 | // Reverse the whole array or part of it 417 | void bit_array_reverse(BIT_ARRAY* bitarr); 418 | void bit_array_reverse_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len); 419 | 420 | // 421 | // Numeric 422 | // 423 | 424 | // Returns 1 on success, 0 if value in array is too big 425 | char bit_array_as_num(const BIT_ARRAY* bitarr, uint64_t* result); 426 | 427 | // 1 iff bitarr > value 428 | // 0 iff bitarr == value 429 | // -1 iff bitarr < value 430 | int bit_array_cmp_uint64(const BIT_ARRAY* bitarr, uint64_t value); 431 | 432 | // 433 | // Arithmetic 434 | // 435 | 436 | // bitarr will be extended if needed 437 | void bit_array_add_uint64(BIT_ARRAY* bitarr, uint64_t value); 438 | 439 | // Add `add` to `bitarr` at `pos` -- same as: 440 | // bitarr + (add << pos) 441 | // where pos can be bigger than the length of the array (bitarr will be resized) 442 | void
bit_array_add_word(BIT_ARRAY *bitarr, bit_index_t pos, uint64_t add); 443 | 444 | // Add `add` to `bitarr` at `pos` 445 | void bit_array_add_words(BIT_ARRAY *bitarr, bit_index_t pos, const BIT_ARRAY *add); 446 | 447 | // If value is greater than bitarr, bitarr is not changed and 0 is returned 448 | // Returns 1 on success, 0 if value > bitarr 449 | char bit_array_sub_uint64(BIT_ARRAY* bitarr, uint64_t value); 450 | 451 | // Subtract `minus` from `bitarr` at `pos` -- same as: 452 | // bitarr - (minus << pos) 453 | // Returns 1 on success, 0 if value > bitarr 454 | char bit_array_sub_word(BIT_ARRAY *bitarr, bit_index_t pos, word_t minus); 455 | 456 | // Subtract `minus` from `bitarr` at `pos` 457 | // Returns 1 on success, 0 if value > bitarr 458 | char bit_array_sub_words(BIT_ARRAY* bitarr, bit_index_t pos, BIT_ARRAY* minus); 459 | 460 | // Multiply by some value 461 | void bit_array_mul_uint64(BIT_ARRAY *bitarr, uint64_t multiplier); 462 | 463 | // bitarr = round_down(bitarr / divisor) 464 | // rem = bitarr % divisor 465 | void bit_array_div_uint64(BIT_ARRAY *bitarr, uint64_t divisor, uint64_t *rem); 466 | 467 | // 468 | // Arithmetic between arrays 469 | // 470 | 471 | // dst = src1 + src2 472 | // src1, src2 and dst can all be the same BIT_ARRAY 473 | // If dst is shorter than either of src1, src2, it is enlarged 474 | void bit_array_add(BIT_ARRAY* dst, const BIT_ARRAY* src1, const BIT_ARRAY* src2); 475 | 476 | // dst = src1 - src2 477 | // src1, src2 and dst can all be the same BIT_ARRAY 478 | // If dst is shorter than src1, it will be extended to be as long as src1 479 | // src1 must be greater than or equal to src2 (src1 >= src2) 480 | void bit_array_subtract(BIT_ARRAY* dst, 481 | const BIT_ARRAY* src1, const BIT_ARRAY* src2); 482 | 483 | // dst = src1 * src2 484 | // Pointers cannot all point to the same BIT_ARRAY 485 | void bit_array_multiply(BIT_ARRAY *dst, BIT_ARRAY *src1, BIT_ARRAY *src2); 486 | 487 | // Results in: 488 | // quotient = dividend / divisor 489 |
// dividend = dividend % divisor 490 | // (dividend is used to return the remainder) 491 | void bit_array_divide(BIT_ARRAY *dividend, BIT_ARRAY *quotient, BIT_ARRAY *divisor); 492 | 493 | // 494 | // Read/Write bit_array to a file 495 | // 496 | // File format is [8 bytes: for number of elements in array][data] 497 | // Number of bytes of data is: (int)((num_of_bits + 7) / 8) 498 | // 499 | 500 | // Saves bit array to a file 501 | // returns the number of bytes written 502 | bit_index_t bit_array_save(const BIT_ARRAY* bitarr, FILE* f); 503 | 504 | // Reads bit array from a file. bitarr is resized and filled. 505 | // Returns 1 on success, 0 on failure 506 | char bit_array_load(BIT_ARRAY* bitarr, FILE* f); 507 | 508 | char bit_array_load_mm(BIT_ARRAY * bitarr, unsigned char * mm, uint64_t offset); 509 | 510 | 511 | // 512 | // Hash function 513 | // 514 | 515 | // Pass seed as 0 on first call, pass previous hash value if rehashing due 516 | // to a collision 517 | // Using Bob Jenkins' hash lookup3 518 | uint64_t bit_array_hash(const BIT_ARRAY* bitarr, uint64_t seed); 519 | 520 | // 521 | // Randomness 522 | // 523 | 524 | // Set bits randomly with probability prob : 0 <= prob <= 1 525 | void bit_array_random(BIT_ARRAY* bitarr, float prob); 526 | 527 | // Shuffle the bits in an array randomly 528 | void bit_array_shuffle(BIT_ARRAY* bitarr); 529 | 530 | // Get the next permutation of an array with a fixed size and given number of 531 | // bits set. Also known as next lexicographic permutation. 532 | // Given a bit array find the next lexicographic organisation of the bits 533 | // Number of possible combinations given by (size choose bits_set) i.e.
nCk 534 | // 00011 -> 00101 -> 00110 -> 01001 -> 01010 -> 535 | // 01100 -> 10001 -> 10010 -> 10100 -> 11000 -> 00011 (back to start) 536 | void bit_array_next_permutation(BIT_ARRAY* bitarr); 537 | 538 | // 539 | // Generally useful functions 540 | // 541 | 542 | // Generalised 'binary to string' function 543 | // Adds bits to the string in order of lsb to msb 544 | // e.g. 0b11010 (26 in decimal) would come out as "01011" 545 | char* bit_array_word2str(const void *ptr, size_t num_of_bits, char *str); 546 | 547 | // Same as above but in reverse 548 | char* bit_array_word2str_rev(const void *ptr, size_t num_of_bits, char *str); 549 | 550 | #ifdef __cplusplus 551 | } 552 | #endif 553 | 554 | #endif 555 | -------------------------------------------------------------------------------- /khash.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008, 2009, 2011 by Attractive Chaos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* 27 | An example: 28 | 29 | #include "khash.h" 30 | KHASH_MAP_INIT_INT(32, char) 31 | int main() { 32 | int ret, is_missing; 33 | khiter_t k; 34 | khash_t(32) *h = kh_init(32); 35 | k = kh_put(32, h, 5, &ret); 36 | kh_value(h, k) = 10; 37 | k = kh_get(32, h, 10); 38 | is_missing = (k == kh_end(h)); 39 | k = kh_get(32, h, 5); 40 | kh_del(32, h, k); 41 | for (k = kh_begin(h); k != kh_end(h); ++k) 42 | if (kh_exist(h, k)) kh_value(h, k) = 1; 43 | kh_destroy(32, h); 44 | return 0; 45 | } 46 | */ 47 | 48 | /* 49 | 2011-12-29 (0.2.7): 50 | 51 | * Minor code clean up; no actual effect. 52 | 53 | 2011-09-16 (0.2.6): 54 | 55 | * The capacity is a power of 2. This seems to dramatically improve the 56 | speed for simple keys. Thank Zilong Tan for the suggestion. Reference: 57 | 58 | - http://code.google.com/p/ulib/ 59 | - http://nothings.org/computer/judy/ 60 | 61 | * Allow to optionally use linear probing which usually has better 62 | performance for random input. Double hashing is still the default as it 63 | is more robust to certain non-random input. 64 | 65 | * Added Wang's integer hash function (not used by default). This hash 66 | function is more robust to certain non-random input. 67 | 68 | 2011-02-14 (0.2.5): 69 | 70 | * Allow to declare global functions. 71 | 72 | 2009-09-26 (0.2.4): 73 | 74 | * Improve portability 75 | 76 | 2008-09-19 (0.2.3): 77 | 78 | * Corrected the example 79 | * Improved interfaces 80 | 81 | 2008-09-11 (0.2.2): 82 | 83 | * Improved speed a little in kh_put() 84 | 85 | 2008-09-10 (0.2.1): 86 | 87 | * Added kh_clear() 88 | * Fixed a compiling error 89 | 90 | 2008-09-02 (0.2.0): 91 | 92 | * Changed to token concatenation which increases flexibility. 
93 | 94 | 2008-08-31 (0.1.2): 95 | 96 | * Fixed a bug in kh_get(), which has not been tested previously. 97 | 98 | 2008-08-31 (0.1.1): 99 | 100 | * Added destructor 101 | */ 102 | 103 | 104 | #ifndef __AC_KHASH_H 105 | #define __AC_KHASH_H 106 | 107 | /*! 108 | @header 109 | 110 | Generic hash table library. 111 | */ 112 | 113 | #define AC_VERSION_KHASH_H "0.2.6" 114 | 115 | #include 116 | #include 117 | #include 118 | 119 | #ifdef USE_MALLOC_WRAPPERS 120 | # include "malloc_wrap.h" 121 | #endif 122 | 123 | /* compipler specific configuration */ 124 | 125 | #if UINT_MAX == 0xffffffffu 126 | typedef unsigned int khint32_t; 127 | #elif ULONG_MAX == 0xffffffffu 128 | typedef unsigned long khint32_t; 129 | #endif 130 | 131 | #if ULONG_MAX == ULLONG_MAX 132 | typedef unsigned long khint64_t; 133 | #else 134 | typedef unsigned long long khint64_t; 135 | #endif 136 | 137 | #ifdef _MSC_VER 138 | #define kh_inline __inline 139 | #else 140 | #define kh_inline inline 141 | #endif 142 | 143 | typedef khint32_t khint_t; 144 | typedef khint_t khiter_t; 145 | 146 | #define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) 147 | #define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) 148 | #define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) 149 | #define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) 150 | #define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) 151 | #define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) 152 | #define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) 153 | 154 | #ifdef KHASH_LINEAR 155 | #define __ac_inc(k, m) 1 156 | #else 157 | #define __ac_inc(k, m) (((k)>>3 ^ (k)<<3) | 1) & (m) 158 | #endif 159 | 160 | #define __ac_fsize(m) ((m) < 16? 
1 : (m)>>4)

/* Round the 32-bit value x up to a power of two; x is modified in place. */
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif

/* Allocator hooks: define any of these before this point to override them. */
#ifndef kcalloc
#define kcalloc(N,Z) calloc(N,Z)
#endif
#ifndef kmalloc
#define kmalloc(Z) malloc(Z)
#endif
#ifndef krealloc
#define krealloc(P,Z) realloc(P,Z)
#endif
#ifndef kfree
#define kfree(P) free(P)
#endif

/* Load factor: upper_bound is computed as n_buckets * __ac_HASH_UPPER + 0.5. */
static const double __ac_HASH_UPPER = 0.77;

/* Declares the table struct kh_<name>_t: bucket counters plus the flag, key
 * and value arrays. */
#define __KHASH_TYPE(name, khkey_t, khval_t) \
    typedef struct { \
        khint_t n_buckets, size, n_occupied, upper_bound; \
        khint32_t *flags; \
        khkey_t *keys; \
        khval_t *vals; \
    } kh_##name##_t;

/* Declares (extern) the functions that __KHASH_IMPL defines. */
#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \
    extern kh_##name##_t *kh_init_##name(void); \
    extern void kh_destroy_##name(kh_##name##_t *h); \
    extern void kh_clear_##name(kh_##name##_t *h); \
    extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \
    extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
    extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
    extern void kh_del_##name(kh_##name##_t *h, khint_t x);

/* Defines the implementation of one hash table type.
 *
 *   kh_init_<name>     allocate a zeroed table (result of kcalloc, may be NULL)
 *   kh_destroy_<name>  free the arrays and the table; NULL is accepted
 *   kh_clear_<name>    mark every bucket empty without freeing memory
 *   kh_get_<name>      return the bucket of key, or n_buckets if absent
 *   kh_resize_<name>   rehash into new_n_buckets (rounded up to a power of two,
 *                      minimum 4); returns 0, or -1 if an allocation fails, in
 *                      which case the table is left usable and nothing leaks
 *   kh_put_<name>      insert key; *ret is -1 on failure, 0 if already present,
 *                      1 if an empty bucket was used, 2 if a deleted one was
 *   kh_del_<name>      mark bucket x deleted if it holds a live element
 */
#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
    SCOPE kh_##name##_t *kh_init_##name(void) { \
        return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \
    } \
    SCOPE void kh_destroy_##name(kh_##name##_t *h) \
    { \
        if (h) { \
            kfree((void *)h->keys); kfree(h->flags); \
            kfree((void *)h->vals); \
            kfree(h); \
        } \
    } \
    SCOPE void kh_clear_##name(kh_##name##_t *h) \
    { \
        if (h && h->flags) { \
            memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
            h->size = h->n_occupied = 0; \
        } \
    } \
    SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
    { \
        if (h->n_buckets) { \
            khint_t inc, k, i, last, mask; \
            mask = h->n_buckets - 1; \
            k = __hash_func(key); i = k & mask; \
            inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \
            while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
                i = (i + inc) & mask; \
                if (i == last) return h->n_buckets; \
            } \
            return __ac_iseither(h->flags, i)? h->n_buckets : i; \
        } else return 0; \
    } \
    SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
    { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
        khint32_t *new_flags = 0; \
        khint_t j = 1; \
        { \
            kroundup32(new_n_buckets); \
            if (new_n_buckets < 4) new_n_buckets = 4; \
            if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \
            else { /* hash table size to be changed (shrink or expand); rehash */ \
                new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
                if (!new_flags) return -1; \
                memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
                if (h->n_buckets < new_n_buckets) { /* expand */ \
                    khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
                    if (!new_keys) { kfree(new_flags); return -1; } /* do not leak the working space */ \
                    h->keys = new_keys; \
                    if (kh_is_map) { \
                        khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
                        if (!new_vals) { kfree(new_flags); return -1; } /* do not leak the working space */ \
                        h->vals = new_vals; \
                    } \
                } /* otherwise shrink */ \
            } \
        } \
        if (j) { /* rehashing is needed */ \
            for (j = 0; j != h->n_buckets; ++j) { \
                if (__ac_iseither(h->flags, j) == 0) { \
                    khkey_t key = h->keys[j]; \
                    khval_t val; \
                    khint_t new_mask; \
                    new_mask = new_n_buckets - 1; \
                    if (kh_is_map) val = h->vals[j]; \
                    __ac_set_isdel_true(h->flags, j); \
                    while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
                        khint_t inc, k, i; \
                        k = __hash_func(key); \
                        i = k & new_mask; \
                        inc = __ac_inc(k, new_mask); \
                        while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \
                        __ac_set_isempty_false(new_flags, i); \
                        if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
                            { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
                            if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
                            __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
                        } else { /* write the element and jump out of the loop */ \
                            h->keys[i] = key; \
                            if (kh_is_map) h->vals[i] = val; \
                            break; \
                        } \
                    } \
                } \
            } \
            if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
                /* A failed shrinking realloc leaves the old, larger array valid: keep it. */ \
                khkey_t *shrunk_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
                if (shrunk_keys) h->keys = shrunk_keys; \
                if (kh_is_map) { \
                    khval_t *shrunk_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
                    if (shrunk_vals) h->vals = shrunk_vals; \
                } \
            } \
            kfree(h->flags); /* free the working space */ \
            h->flags = new_flags; \
            h->n_buckets = new_n_buckets; \
            h->n_occupied = h->size; \
            h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
        } \
        return 0; \
    } \
    SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
    { \
        khint_t x; \
        if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
            if (h->n_buckets > (h->size<<1)) { \
                if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
                    *ret = -1; return h->n_buckets; \
                } \
            } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
                *ret = -1; return h->n_buckets; \
            } \
        } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
        { \
            khint_t inc, k, i, site, last, mask = h->n_buckets - 1; \
            x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
            if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \
            else { \
                inc = __ac_inc(k, mask); last = i; \
                while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
                    if (__ac_isdel(h->flags, i)) site = i; \
                    i = (i + inc) & mask; \
                    if (i == last) { x = site; break; } \
                } \
                if (x == h->n_buckets) { \
                    if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
                    else x = i; \
                } \
            } \
        } \
        if (__ac_isempty(h->flags, x)) { /* not present at all */ \
            h->keys[x] = key; \
            __ac_set_isboth_false(h->flags, x); \
            ++h->size; ++h->n_occupied; \
            *ret = 1; \
        } else if (__ac_isdel(h->flags, x)) { /* deleted */ \
            h->keys[x] = key; \
            __ac_set_isboth_false(h->flags, x); \
            ++h->size; \
            *ret = 2; \
        } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
        return x; \
    } \
    SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \
    { \
        if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
            __ac_set_isdel_true(h->flags, x); \
            --h->size; \
        } \
    }

/* Type plus extern prototypes only; the implementation lives elsewhere. */
#define KHASH_DECLARE(name, khkey_t, khval_t) \
    __KHASH_TYPE(name, khkey_t, khval_t) \
    __KHASH_PROTOTYPES(name, khkey_t, khval_t)

/* Type plus implementation, with caller-chosen linkage (SCOPE). */
#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
    __KHASH_TYPE(name, khkey_t, khval_t) \
    __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)

/* Type plus implementation as "static kh_inline" functions. */
#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
    KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)

| /* --- BEGIN OF HASH FUNCTIONS --- */ 357 | 358 | /*! @function 359 | @abstract Integer hash function 360 | @param key The integer [khint32_t] 361 | @return The hash value [khint_t] 362 | */ 363 | #define kh_int_hash_func(key) (khint32_t)(key) 364 | /*! @function 365 | @abstract Integer comparison function 366 | */ 367 | #define kh_int_hash_equal(a, b) ((a) == (b)) 368 | /*! @function 369 | @abstract 64-bit integer hash function 370 | @param key The integer [khint64_t] 371 | @return The hash value [khint_t] 372 | */ 373 | #define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) 374 | /*! @function 375 | @abstract 64-bit integer comparison function 376 | */ 377 | #define kh_int64_hash_equal(a, b) ((a) == (b)) 378 | /*! @function 379 | @abstract const char* hash function 380 | @param s Pointer to a null terminated string 381 | @return The hash value 382 | */ 383 | static kh_inline khint_t __ac_X31_hash_string(const char *s) 384 | { 385 | khint_t h = (khint_t)*s; 386 | if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; 387 | return h; 388 | } 389 | /*! @function 390 | @abstract Another interface to const char* hash function 391 | @param key Pointer to a null terminated string [const char*] 392 | @return The hash value [khint_t] 393 | */ 394 | #define kh_str_hash_func(key) __ac_X31_hash_string(key) 395 | /*! @function 396 | @abstract Const char* comparison function 397 | */ 398 | #define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) 399 | 400 | static kh_inline khint_t __ac_Wang_hash(khint_t key) 401 | { 402 | key += ~(key << 15); 403 | key ^= (key >> 10); 404 | key += (key << 3); 405 | key ^= (key >> 6); 406 | key += ~(key << 11); 407 | key ^= (key >> 16); 408 | return key; 409 | } 410 | #define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)key) 411 | 412 | /* --- END OF HASH FUNCTIONS --- */ 413 | 414 | /* Other convenient macros... */ 415 | 416 | /*! 417 | @abstract Type of the hash table. 
418 | @param name Name of the hash table [symbol] 419 | */ 420 | #define khash_t(name) kh_##name##_t 421 | 422 | /*! @function 423 | @abstract Initiate a hash table. 424 | @param name Name of the hash table [symbol] 425 | @return Pointer to the hash table, or NULL if the allocation fails [khash_t(name)*] 426 | */ 427 | #define kh_init(name) kh_init_##name() 428 | 429 | /*! @function 430 | @abstract Destroy a hash table. 431 | @param name Name of the hash table [symbol] 432 | @param h Pointer to the hash table; NULL is accepted and ignored [khash_t(name)*] 433 | */ 434 | #define kh_destroy(name, h) kh_destroy_##name(h) 435 | 436 | /*! @function 437 | @abstract Reset a hash table without deallocating memory. 438 | @param name Name of the hash table [symbol] 439 | @param h Pointer to the hash table [khash_t(name)*] 440 | */ 441 | #define kh_clear(name, h) kh_clear_##name(h) 442 | 443 | /*! @function 444 | @abstract Resize a hash table. 445 | @param name Name of the hash table [symbol] 446 | @param h Pointer to the hash table [khash_t(name)*] 447 | @param s New size; rounded up to a power of two, minimum 4 [khint_t] @return 0 on success or when the requested size is too small for the current elements; -1 if memory allocation fails [int] 448 | */ 449 | #define kh_resize(name, h, s) kh_resize_##name(h, s) 450 | 451 | /*! @function 452 | @abstract Insert a key to the hash table. 453 | @param name Name of the hash table [symbol] 454 | @param h Pointer to the hash table [khash_t(name)*] 455 | @param k Key [type of keys] 456 | @param r Extra return code: -1 if the operation failed; 0 if the key is present in the hash table; 457 | 1 if the bucket is empty (never used); 2 if the element in 458 | the bucket has been deleted [int*] 459 | @return Iterator to the inserted element, or kh_end(h) if the operation failed [khint_t] 460 | */ 461 | #define kh_put(name, h, k, r) kh_put_##name(h, k, r) 462 | 463 | /*! @function 464 | @abstract Retrieve a key from the hash table.
465 | @param name Name of the hash table [symbol] 466 | @param h Pointer to the hash table [khash_t(name)*] 467 | @param k Key [type of keys] 468 | @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t] 469 | */ 470 | #define kh_get(name, h, k) kh_get_##name(h, k) 471 | 472 | /*! @function 473 | @abstract Remove a key from the hash table. 474 | @param name Name of the hash table [symbol] 475 | @param h Pointer to the hash table [khash_t(name)*] 476 | @param k Iterator to the element to be deleted; nothing happens if it is kh_end(h) or the bucket holds no element [khint_t] 477 | */ 478 | #define kh_del(name, h, k) kh_del_##name(h, k) 479 | 480 | /*! @function 481 | @abstract Test whether a bucket contains data. 482 | @param h Pointer to the hash table [khash_t(name)*] 483 | @param x Iterator to the bucket [khint_t] 484 | @return 1 if containing data; 0 otherwise [int] 485 | */ 486 | #define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) 487 | 488 | /*! @function 489 | @abstract Get key given an iterator 490 | @param h Pointer to the hash table [khash_t(name)*] 491 | @param x Iterator to the bucket [khint_t] 492 | @return Key [type of keys] 493 | */ 494 | #define kh_key(h, x) ((h)->keys[x]) 495 | 496 | /*! @function 497 | @abstract Get value given an iterator 498 | @param h Pointer to the hash table [khash_t(name)*] 499 | @param x Iterator to the bucket [khint_t] 500 | @return Value [type of values] 501 | @discussion For hash sets, calling this results in segfault. 502 | */ 503 | #define kh_val(h, x) ((h)->vals[x]) 504 | 505 | /*! @function 506 | @abstract Alias of kh_val() 507 | */ 508 | #define kh_value(h, x) ((h)->vals[x]) 509 | 510 | /*! @function 511 | @abstract Get the start iterator 512 | @param h Pointer to the hash table [khash_t(name)*] 513 | @return The start iterator [khint_t] 514 | */ 515 | #define kh_begin(h) (khint_t)(0) 516 | 517 | /*!
@function 518 | @abstract Get the end iterator 519 | @param h Pointer to the hash table [khash_t(name)*] 520 | @return The end iterator [khint_t] 521 | */ 522 | #define kh_end(h) ((h)->n_buckets) 523 | 524 | /*! @function 525 | @abstract Get the number of elements in the hash table 526 | @param h Pointer to the hash table [khash_t(name)*] 527 | @return Number of elements in the hash table [khint_t] 528 | */ 529 | #define kh_size(h) ((h)->size) 530 | 531 | /*! @function 532 | @abstract Get the number of buckets in the hash table 533 | @param h Pointer to the hash table [khash_t(name)*] 534 | @return Number of buckets in the hash table [khint_t] 535 | */ 536 | #define kh_n_buckets(h) ((h)->n_buckets) 537 | 538 | /*! @function 539 | @abstract Iterate over the entries in the hash table; buckets without data are skipped 540 | @param h Pointer to the hash table [khash_t(name)*] 541 | @param kvar Variable to which key will be assigned 542 | @param vvar Variable to which value will be assigned 543 | @param code Block of code to execute 544 | */ 545 | #define kh_foreach(h, kvar, vvar, code) { khint_t __i; \ 546 | for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ 547 | if (!kh_exist(h,__i)) continue; \ 548 | (kvar) = kh_key(h,__i); \ 549 | (vvar) = kh_val(h,__i); \ 550 | code; \ 551 | } } 552 | 553 | /*! @function 554 | @abstract Iterate over the values in the hash table; buckets without data are skipped 555 | @param h Pointer to the hash table [khash_t(name)*] 556 | @param vvar Variable to which value will be assigned 557 | @param code Block of code to execute 558 | */ 559 | #define kh_foreach_value(h, vvar, code) { khint_t __i; \ 560 | for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ 561 | if (!kh_exist(h,__i)) continue; \ 562 | (vvar) = kh_val(h,__i); \ 563 | code; \ 564 | } } 565 | 566 | /* More convenient interfaces */ 567 | 568 | /*!
@function 569 | @abstract Instantiate a hash set containing integer keys 570 | @param name Name of the hash table [symbol] 571 | */ 572 | #define KHASH_SET_INIT_INT(name) \ 573 | KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) 574 | 575 | /*! @function 576 | @abstract Instantiate a hash map containing integer keys 577 | @param name Name of the hash table [symbol] 578 | @param khval_t Type of values [type] 579 | */ 580 | #define KHASH_MAP_INIT_INT(name, khval_t) \ 581 | KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) 582 | 583 | /*! @function 584 | @abstract Instantiate a hash set containing 64-bit integer keys 585 | @param name Name of the hash table [symbol] 586 | */ 587 | #define KHASH_SET_INIT_INT64(name) \ 588 | KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) 589 | 590 | /*! @function 591 | @abstract Instantiate a hash map containing 64-bit integer keys 592 | @param name Name of the hash table [symbol] 593 | @param khval_t Type of values [type] 594 | */ 595 | #define KHASH_MAP_INIT_INT64(name, khval_t) \ 596 | KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) 597 | 598 | typedef const char *kh_cstr_t; 599 | /*! @function 600 | @abstract Instantiate a hash set containing const char* keys 601 | @param name Name of the hash table [symbol] 602 | */ 603 | #define KHASH_SET_INIT_STR(name) \ 604 | KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) 605 | 606 | /*!
@function 607 | @abstract Instantiate a hash map containing const char* keys 608 | @param name Name of the hash table [symbol] 609 | @param khval_t Type of values [type] @discussion Only the key pointer is stored, never a copy of the string, so every key must stay valid for as long as its entry is in the table. 610 | */ 611 | #define KHASH_MAP_INIT_STR(name, khval_t) \ 612 | KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) 613 | 614 | #endif /* __AC_KHASH_H */ 615 | -------------------------------------------------------------------------------- /sds.c: -------------------------------------------------------------------------------- 1 | /* SDS (Simple Dynamic Strings), A C dynamic strings library. 2 | * 3 | * Copyright (c) 2006-2014, Salvatore Sanfilippo 4 | * All rights reserved. 5 | * 6 | * Redistribution and use in source and binary forms, with or without 7 | * modification, are permitted provided that the following conditions are met: 8 | * 9 | * * Redistributions of source code must retain the above copyright notice, 10 | * this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * * Neither the name of Redis nor the names of its contributors may be used 15 | * to endorse or promote products derived from this software without 16 | * specific prior written permission. 17 | * 18 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 | * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 22 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 | * POSSIBILITY OF SUCH DAMAGE. 29 | */ 30 | 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | #include "sds.h" 38 | 39 | /* Create a new sds string with the content specified by the 'init' pointer 40 | * and 'initlen'. 41 | * If NULL is used for 'init' the string is initialized with zero bytes. 42 | * 43 | * The string is always null-termined (all the sds strings are, always) so 44 | * even if you create an sds string with: 45 | * 46 | * mystring = sdsnewlen("abc",3"); 47 | * 48 | * You can print the string with printf() as there is an implicit \0 at the 49 | * end of the string. However the string is binary safe and can contain 50 | * \0 characters in the middle, as the length is stored in the sds header. */ 51 | sds sdsnewlen(const void *init, size_t initlen) { 52 | struct sdshdr *sh; 53 | 54 | if (init) { 55 | sh = malloc(sizeof *sh+initlen+1); 56 | } else { 57 | sh = calloc(sizeof *sh+initlen+1,1); 58 | } 59 | if (sh == NULL) return NULL; 60 | sh->len = initlen; 61 | sh->free = 0; 62 | if (initlen && init) 63 | memcpy(sh->buf, init, initlen); 64 | sh->buf[initlen] = '\0'; 65 | return (char*)sh->buf; 66 | } 67 | 68 | /* Create an empty (zero length) sds string. Even in this case the string 69 | * always has an implicit null term. */ 70 | sds sdsempty(void) { 71 | return sdsnewlen("",0); 72 | } 73 | 74 | /* Create a new sds string starting from a null termined C string. 
*/ 75 | sds sdsnew(const char *init) { 76 | size_t initlen = (init == NULL) ? 0 : strlen(init); 77 | return sdsnewlen(init, initlen); 78 | } 79 | 80 | /* Duplicate an sds string. */ 81 | sds sdsdup(const sds s) { 82 | return sdsnewlen(s, sdslen(s)); 83 | } 84 | 85 | /* Free an sds string. No operation is performed if 's' is NULL. */ 86 | void sdsfree(sds s) { 87 | if (s == NULL) return; 88 | free(s-sizeof(struct sdshdr)); 89 | } 90 | 91 | /* Set the sds string length to the length as obtained with strlen(), so 92 | * considering as content only up to the first null term character. 93 | * 94 | * This function is useful when the sds string is hacked manually in some 95 | * way, like in the following example: 96 | * 97 | * s = sdsnew("foobar"); 98 | * s[2] = '\0'; 99 | * sdsupdatelen(s); 100 | * printf("%d\n", sdslen(s)); 101 | * 102 | * The output will be "2", but if we comment out the call to sdsupdatelen() 103 | * the output will be "6" as the string was modified but the logical length 104 | * remains 6 bytes. */ 105 | void sdsupdatelen(sds s) { 106 | struct sdshdr *sh = (void*) (s-sizeof *sh);; 107 | int reallen = strlen(s); 108 | sh->free += (sh->len-reallen); 109 | sh->len = reallen; 110 | } 111 | 112 | /* Modify an sds string on-place to make it empty (zero length). 113 | * However all the existing buffer is not discarded but set as free space 114 | * so that next append operations will not require allocations up to the 115 | * number of bytes previously available. */ 116 | void sdsclear(sds s) { 117 | struct sdshdr *sh = (void*) (s-sizeof *sh);; 118 | sh->free += sh->len; 119 | sh->len = 0; 120 | sh->buf[0] = '\0'; 121 | } 122 | 123 | /* Enlarge the free space at the end of the sds string so that the caller 124 | * is sure that after calling this function can overwrite up to addlen 125 | * bytes after the end of the string, plus one more byte for nul term. 
126 | * 127 | * Note: this does not change the *length* of the sds string as returned 128 | * by sdslen(), but only the free buffer space we have. */ 129 | sds sdsMakeRoomFor(sds s, size_t addlen) { 130 | struct sdshdr *sh, *newsh; 131 | size_t free = sdsavail(s); 132 | size_t len, newlen; 133 | 134 | if (free >= addlen) return s; 135 | len = sdslen(s); 136 | sh = (void*) (s-sizeof *sh);; 137 | newlen = (len+addlen); 138 | if (newlen < SDS_MAX_PREALLOC) 139 | newlen *= 2; 140 | else 141 | newlen += SDS_MAX_PREALLOC; 142 | newsh = realloc(sh, sizeof *newsh+newlen+1); 143 | if (newsh == NULL) return NULL; 144 | 145 | newsh->free = newlen - len; 146 | return newsh->buf; 147 | } 148 | 149 | /* Reallocate the sds string so that it has no free space at the end. The 150 | * contained string remains not altered, but next concatenation operations 151 | * will require a reallocation. 152 | * 153 | * After the call, the passed sds string is no longer valid and all the 154 | * references must be substituted with the new pointer returned by the call. */ 155 | sds sdsRemoveFreeSpace(sds s) { 156 | struct sdshdr *sh; 157 | 158 | sh = (void*) (s-sizeof *sh);; 159 | sh = realloc(sh, sizeof *sh+sh->len+1); 160 | sh->free = 0; 161 | return sh->buf; 162 | } 163 | 164 | /* Return the total size of the allocation of the specifed sds string, 165 | * including: 166 | * 1) The sds header before the pointer. 167 | * 2) The string. 168 | * 3) The free buffer at the end if any. 169 | * 4) The implicit null term. 170 | */ 171 | size_t sdsAllocSize(sds s) { 172 | struct sdshdr *sh = (void*) (s-sizeof *sh);; 173 | 174 | return sizeof(*sh)+sh->len+sh->free+1; 175 | } 176 | 177 | /* Increment the sds length and decrements the left free space at the 178 | * end of the string according to 'incr'. Also set the null term 179 | * in the new end of the string. 
180 | * 181 | * This function is used in order to fix the string length after the 182 | * user calls sdsMakeRoomFor(), writes something after the end of 183 | * the current string, and finally needs to set the new length. 184 | * 185 | * Note: it is possible to use a negative increment in order to 186 | * right-trim the string. 187 | * 188 | * Usage example: 189 | * 190 | * Using sdsIncrLen() and sdsMakeRoomFor() it is possible to mount the 191 | * following schema, to cat bytes coming from the kernel to the end of an 192 | * sds string without copying into an intermediate buffer: 193 | * 194 | * oldlen = sdslen(s); 195 | * s = sdsMakeRoomFor(s, BUFFER_SIZE); 196 | * nread = read(fd, s+oldlen, BUFFER_SIZE); 197 | * ... check for nread <= 0 and handle it ... 198 | * sdsIncrLen(s, nread); 199 | */ 200 | void sdsIncrLen(sds s, int incr) { 201 | struct sdshdr *sh = (void*) (s-sizeof *sh);; 202 | 203 | assert(sh->free >= incr); 204 | sh->len += incr; 205 | sh->free -= incr; 206 | assert(sh->free >= 0); 207 | s[sh->len] = '\0'; 208 | } 209 | 210 | /* Grow the sds to have the specified length. Bytes that were not part of 211 | * the original length of the sds will be set to zero. 212 | * 213 | * if the specified length is smaller than the current length, no operation 214 | * is performed. */ 215 | sds sdsgrowzero(sds s, size_t len) { 216 | struct sdshdr *sh = (void*) (s-sizeof *sh); 217 | size_t totlen, curlen = sh->len; 218 | 219 | if (len <= curlen) return s; 220 | s = sdsMakeRoomFor(s,len-curlen); 221 | if (s == NULL) return NULL; 222 | 223 | /* Make sure added region doesn't contain garbage */ 224 | sh = (void*)(s-sizeof *sh); 225 | memset(s+curlen,0,(len-curlen+1)); /* also set trailing \0 byte */ 226 | totlen = sh->len+sh->free; 227 | sh->len = len; 228 | sh->free = totlen-sh->len; 229 | return s; 230 | } 231 | 232 | /* Append the specified binary-safe string pointed by 't' of 'len' bytes to the 233 | * end of the specified sds string 's'. 
234 | * 235 | * After the call, the passed sds string is no longer valid and all the 236 | * references must be substituted with the new pointer returned by the call. */ 237 | sds sdscatlen(sds s, const void *t, size_t len) { 238 | struct sdshdr *sh; 239 | size_t curlen = sdslen(s); 240 | 241 | s = sdsMakeRoomFor(s,len); 242 | if (s == NULL) return NULL; 243 | sh = (void*) (s-sizeof *sh);; 244 | memcpy(s+curlen, t, len); 245 | sh->len = curlen+len; 246 | sh->free = sh->free-len; 247 | s[curlen+len] = '\0'; 248 | return s; 249 | } 250 | 251 | /* Append the specified null termianted C string to the sds string 's'. 252 | * 253 | * After the call, the passed sds string is no longer valid and all the 254 | * references must be substituted with the new pointer returned by the call. */ 255 | sds sdscat(sds s, const char *t) { 256 | return sdscatlen(s, t, strlen(t)); 257 | } 258 | 259 | /* Append the specified sds 't' to the existing sds 's'. 260 | * 261 | * After the call, the modified sds string is no longer valid and all the 262 | * references must be substituted with the new pointer returned by the call. */ 263 | sds sdscatsds(sds s, const sds t) { 264 | return sdscatlen(s, t, sdslen(t)); 265 | } 266 | 267 | /* Destructively modify the sds string 's' to hold the specified binary 268 | * safe string pointed by 't' of length 'len' bytes. */ 269 | sds sdscpylen(sds s, const char *t, size_t len) { 270 | struct sdshdr *sh = (void*) (s-sizeof *sh);; 271 | size_t totlen = sh->free+sh->len; 272 | 273 | if (totlen < len) { 274 | s = sdsMakeRoomFor(s,len-sh->len); 275 | if (s == NULL) return NULL; 276 | sh = (void*) (s-sizeof *sh);; 277 | totlen = sh->free+sh->len; 278 | } 279 | memcpy(s, t, len); 280 | s[len] = '\0'; 281 | sh->len = len; 282 | sh->free = totlen-len; 283 | return s; 284 | } 285 | 286 | /* Like sdscpylen() but 't' must be a null-termined string so that the length 287 | * of the string is obtained with strlen(). 
*/ 288 | sds sdscpy(sds s, const char *t) { 289 | return sdscpylen(s, t, strlen(t)); 290 | } 291 | 292 | /* Like sdscatpritf() but gets va_list instead of being variadic. */ 293 | sds sdscatvprintf(sds s, const char *fmt, va_list ap) { 294 | va_list cpy; 295 | char *buf, *t; 296 | size_t buflen = 16; 297 | 298 | while(1) { 299 | buf = malloc(buflen); 300 | if (buf == NULL) return NULL; 301 | buf[buflen-2] = '\0'; 302 | va_copy(cpy,ap); 303 | vsnprintf(buf, buflen, fmt, cpy); 304 | if (buf[buflen-2] != '\0') { 305 | free(buf); 306 | buflen *= 2; 307 | continue; 308 | } 309 | break; 310 | } 311 | t = sdscat(s, buf); 312 | free(buf); 313 | return t; 314 | } 315 | 316 | /* Append to the sds string 's' a string obtained using printf-alike format 317 | * specifier. 318 | * 319 | * After the call, the modified sds string is no longer valid and all the 320 | * references must be substituted with the new pointer returned by the call. 321 | * 322 | * Example: 323 | * 324 | * s = sdsempty("Sum is: "); 325 | * s = sdscatprintf(s,"%d+%d = %d",a,b,a+b). 326 | * 327 | * Often you need to create a string from scratch with the printf-alike 328 | * format. When this is the need, just use sdsempty() as the target string: 329 | * 330 | * s = sdscatprintf(sdsempty(), "... your format ...", args); 331 | */ 332 | sds sdscatprintf(sds s, const char *fmt, ...) { 333 | va_list ap; 334 | char *t; 335 | va_start(ap, fmt); 336 | t = sdscatvprintf(s,fmt,ap); 337 | va_end(ap); 338 | return t; 339 | } 340 | 341 | /* Remove the part of the string from left and from right composed just of 342 | * contiguous characters found in 'cset', that is a null terminted C string. 343 | * 344 | * After the call, the modified sds string is no longer valid and all the 345 | * references must be substituted with the new pointer returned by the call. 346 | * 347 | * Example: 348 | * 349 | * s = sdsnew("AA...AA.a.aa.aHelloWorld :::"); 350 | * s = sdstrim(s,"A. 
:"); 351 | * printf("%s\n", s); 352 | * 353 | * Output will be just "Hello World". 354 | */ 355 | void sdstrim(sds s, const char *cset) { 356 | struct sdshdr *sh = (void*) (s-sizeof *sh);; 357 | char *start, *end, *sp, *ep; 358 | size_t len; 359 | 360 | sp = start = s; 361 | ep = end = s+sdslen(s)-1; 362 | while(sp <= end && strchr(cset, *sp)) sp++; 363 | while(ep > start && strchr(cset, *ep)) ep--; 364 | len = (sp > ep) ? 0 : ((ep-sp)+1); 365 | if (sh->buf != sp) memmove(sh->buf, sp, len); 366 | sh->buf[len] = '\0'; 367 | sh->free = sh->free+(sh->len-len); 368 | sh->len = len; 369 | } 370 | 371 | /* Turn the string into a smaller (or equal) string containing only the 372 | * substring specified by the 'start' and 'end' indexes. 373 | * 374 | * start and end can be negative, where -1 means the last character of the 375 | * string, -2 the penultimate character, and so forth. 376 | * 377 | * The interval is inclusive, so the start and end characters will be part 378 | * of the resulting string. 379 | * 380 | * The string is modified in-place. 381 | * 382 | * Example: 383 | * 384 | * s = sdsnew("Hello World"); 385 | * sdsrange(s,1,-1); => "ello World" 386 | */ 387 | void sdsrange(sds s, int start, int end) { 388 | struct sdshdr *sh = (void*) (s-sizeof *sh);; 389 | size_t newlen, len = sdslen(s); 390 | 391 | if (len == 0) return; 392 | if (start < 0) { 393 | start = len+start; 394 | if (start < 0) start = 0; 395 | } 396 | if (end < 0) { 397 | end = len+end; 398 | if (end < 0) end = 0; 399 | } 400 | newlen = (start > end) ? 0 : (end-start)+1; 401 | if (newlen != 0) { 402 | if (start >= (signed)len) { 403 | newlen = 0; 404 | } else if (end >= (signed)len) { 405 | end = len-1; 406 | newlen = (start > end) ? 
0 : (end-start)+1; 407 | } 408 | } else { 409 | start = 0; 410 | } 411 | if (start && newlen) memmove(sh->buf, sh->buf+start, newlen); 412 | sh->buf[newlen] = 0; 413 | sh->free = sh->free+(sh->len-newlen); 414 | sh->len = newlen; 415 | } 416 | 417 | /* Apply tolower() to every character of the sds string 's'. */ 418 | void sdstolower(sds s) { 419 | int len = sdslen(s), j; 420 | 421 | for (j = 0; j < len; j++) s[j] = tolower(s[j]); 422 | } 423 | 424 | /* Apply toupper() to every character of the sds string 's'. */ 425 | void sdstoupper(sds s) { 426 | int len = sdslen(s), j; 427 | 428 | for (j = 0; j < len; j++) s[j] = toupper(s[j]); 429 | } 430 | 431 | /* Compare two sds strings s1 and s2 with memcmp(). 432 | * 433 | * Return value: 434 | * 435 | * 1 if s1 > s2. 436 | * -1 if s1 < s2. 437 | * 0 if s1 and s2 are exactly the same binary string. 438 | * 439 | * If two strings share exactly the same prefix, but one of the two has 440 | * additional characters, the longer string is considered to be greater than 441 | * the smaller one. */ 442 | int sdscmp(const sds s1, const sds s2) { 443 | size_t l1, l2, minlen; 444 | int cmp; 445 | 446 | l1 = sdslen(s1); 447 | l2 = sdslen(s2); 448 | minlen = (l1 < l2) ? l1 : l2; 449 | cmp = memcmp(s1,s2,minlen); 450 | if (cmp == 0) return l1-l2; 451 | return cmp; 452 | } 453 | 454 | /* Split 's' with separator in 'sep'. An array 455 | * of sds strings is returned. *count will be set 456 | * by reference to the number of tokens returned. 457 | * 458 | * On out of memory, zero length string, zero length 459 | * separator, NULL is returned. 460 | * 461 | * Note that 'sep' is able to split a string using 462 | * a multi-character separator. For example 463 | * sdssplit("foo_-_bar","_-_"); will return two 464 | * elements "foo" and "bar". 465 | * 466 | * This version of the function is binary-safe but 467 | * requires length arguments. sdssplit() is just the 468 | * same function but for zero-terminated strings. 
469 | */ 470 | sds *sdssplitlen(const char *s, int len, const char *sep, int seplen, int *count) { 471 | int elements = 0, slots = 5, start = 0, j; 472 | sds *tokens; 473 | 474 | if (seplen < 1 || len < 0) return NULL; 475 | 476 | tokens = malloc(sizeof(sds)*slots); 477 | if (tokens == NULL) return NULL; 478 | 479 | if (len == 0) { 480 | *count = 0; 481 | return tokens; 482 | } 483 | for (j = 0; j < (len-(seplen-1)); j++) { 484 | /* make sure there is room for the next element and the final one */ 485 | if (slots < elements+2) { 486 | sds *newtokens; 487 | 488 | slots *= 2; 489 | newtokens = realloc(tokens,sizeof(sds)*slots); 490 | if (newtokens == NULL) goto cleanup; 491 | tokens = newtokens; 492 | } 493 | /* search the separator */ 494 | if ((seplen == 1 && *(s+j) == sep[0]) || (memcmp(s+j,sep,seplen) == 0)) { 495 | tokens[elements] = sdsnewlen(s+start,j-start); 496 | if (tokens[elements] == NULL) goto cleanup; 497 | elements++; 498 | start = j+seplen; 499 | j = j+seplen-1; /* skip the separator */ 500 | } 501 | } 502 | /* Add the final element. We are sure there is room in the tokens array. */ 503 | tokens[elements] = sdsnewlen(s+start,len-start); 504 | if (tokens[elements] == NULL) goto cleanup; 505 | elements++; 506 | *count = elements; 507 | return tokens; 508 | 509 | cleanup: 510 | { 511 | int i; 512 | for (i = 0; i < elements; i++) sdsfree(tokens[i]); 513 | free(tokens); 514 | *count = 0; 515 | return NULL; 516 | } 517 | } 518 | 519 | /* Free the result returned by sdssplitlen(), or do nothing if 'tokens' is NULL. */ 520 | void sdsfreesplitres(sds *tokens, int count) { 521 | if (!tokens) return; 522 | while(count--) 523 | sdsfree(tokens[count]); 524 | free(tokens); 525 | } 526 | 527 | /* Create an sds string from a long long value. It is much faster than: 528 | * 529 | * sdscatprintf(sdsempty(),"%lld\n", value); 530 | */ 531 | sds sdsfromlonglong(long long value) { 532 | char buf[32], *p; 533 | unsigned long long v; 534 | 535 | v = (value < 0) ? 
-value : value; 536 | p = buf+31; /* point to the last character */ 537 | do { 538 | *p-- = '0'+(v%10); 539 | v /= 10; 540 | } while(v); 541 | if (value < 0) *p-- = '-'; 542 | p++; 543 | return sdsnewlen(p,32-(p-buf)); 544 | } 545 | 546 | /* Append to the sds string "s" an escaped string representation where 547 | * all the non-printable characters (tested with isprint()) are turned into 548 | * escapes in the form "\n\r\a...." or "\x". 549 | * 550 | * After the call, the modified sds string is no longer valid and all the 551 | * references must be substituted with the new pointer returned by the call. */ 552 | sds sdscatrepr(sds s, const char *p, size_t len) { 553 | s = sdscatlen(s,"\"",1); 554 | while(len--) { 555 | switch(*p) { 556 | case '\\': 557 | case '"': 558 | s = sdscatprintf(s,"\\%c",*p); 559 | break; 560 | case '\n': s = sdscatlen(s,"\\n",2); break; 561 | case '\r': s = sdscatlen(s,"\\r",2); break; 562 | case '\t': s = sdscatlen(s,"\\t",2); break; 563 | case '\a': s = sdscatlen(s,"\\a",2); break; 564 | case '\b': s = sdscatlen(s,"\\b",2); break; 565 | default: 566 | if (isprint(*p)) 567 | s = sdscatprintf(s,"%c",*p); 568 | else 569 | s = sdscatprintf(s,"\\x%02x",(unsigned char)*p); 570 | break; 571 | } 572 | p++; 573 | } 574 | return sdscatlen(s,"\"",1); 575 | } 576 | 577 | /* Helper function for sdssplitargs() that returns non zero if 'c' 578 | * is a valid hex digit. 
/* Helper for sdssplitargs(): returns 1 when 'c' is one of the characters
 * 0-9, a-f or A-F, and 0 otherwise. */
int is_hex_digit(char c) {
    if (c >= '0' && c <= '9') return 1;
    if (c >= 'a' && c <= 'f') return 1;
    if (c >= 'A' && c <= 'F') return 1;
    return 0;
}

/* Helper for sdssplitargs(): converts the hex digit 'c' (either case) to its
 * value in the range 0-15. Any character that is not a hex digit maps to 0. */
int hex_digit_to_int(char c) {
    static const char lower[] = "0123456789abcdef";
    static const char upper[] = "0123456789ABCDEF";
    int value;

    /* The index of the matching digit in either table is its value. */
    for (value = 0; value < 16; value++) {
        if (c == lower[value] || c == upper[value]) return value;
    }
    return 0;
}
621 | * 622 | * The function returns the allocated tokens on success, even when the 623 | * input string is empty, or NULL if the input contains unbalanced 624 | * quotes or closed quotes followed by non space characters 625 | * as in: "foo"bar or "foo' 626 | */ 627 | sds *sdssplitargs(const char *line, int *argc) { 628 | const char *p = line; 629 | char *current = NULL; 630 | char **vector = NULL; 631 | 632 | *argc = 0; 633 | while(1) { 634 | /* skip blanks */ 635 | while(*p && isspace(*p)) p++; 636 | if (*p) { 637 | /* get a token */ 638 | int inq=0; /* set to 1 if we are in "quotes" */ 639 | int insq=0; /* set to 1 if we are in 'single quotes' */ 640 | int done=0; 641 | 642 | if (current == NULL) current = sdsempty(); 643 | while(!done) { 644 | if (inq) { 645 | if (*p == '\\' && *(p+1) == 'x' && 646 | is_hex_digit(*(p+2)) && 647 | is_hex_digit(*(p+3))) 648 | { 649 | unsigned char byte; 650 | 651 | byte = (hex_digit_to_int(*(p+2))*16)+ 652 | hex_digit_to_int(*(p+3)); 653 | current = sdscatlen(current,(char*)&byte,1); 654 | p += 3; 655 | } else if (*p == '\\' && *(p+1)) { 656 | char c; 657 | 658 | p++; 659 | switch(*p) { 660 | case 'n': c = '\n'; break; 661 | case 'r': c = '\r'; break; 662 | case 't': c = '\t'; break; 663 | case 'b': c = '\b'; break; 664 | case 'a': c = '\a'; break; 665 | default: c = *p; break; 666 | } 667 | current = sdscatlen(current,&c,1); 668 | } else if (*p == '"') { 669 | /* closing quote must be followed by a space or 670 | * nothing at all. */ 671 | if (*(p+1) && !isspace(*(p+1))) goto err; 672 | done=1; 673 | } else if (!*p) { 674 | /* unterminated quotes */ 675 | goto err; 676 | } else { 677 | current = sdscatlen(current,p,1); 678 | } 679 | } else if (insq) { 680 | if (*p == '\\' && *(p+1) == '\'') { 681 | p++; 682 | current = sdscatlen(current,"'",1); 683 | } else if (*p == '\'') { 684 | /* closing quote must be followed by a space or 685 | * nothing at all. 
*/ 686 | if (*(p+1) && !isspace(*(p+1))) goto err; 687 | done=1; 688 | } else if (!*p) { 689 | /* unterminated quotes */ 690 | goto err; 691 | } else { 692 | current = sdscatlen(current,p,1); 693 | } 694 | } else { 695 | switch(*p) { 696 | case ' ': 697 | case '\n': 698 | case '\r': 699 | case '\t': 700 | case '\0': 701 | done=1; 702 | break; 703 | case '"': 704 | inq=1; 705 | break; 706 | case '\'': 707 | insq=1; 708 | break; 709 | default: 710 | current = sdscatlen(current,p,1); 711 | break; 712 | } 713 | } 714 | if (*p) p++; 715 | } 716 | /* add the token to the vector */ 717 | vector = realloc(vector,((*argc)+1)*sizeof(char*)); 718 | vector[*argc] = current; 719 | (*argc)++; 720 | current = NULL; 721 | } else { 722 | /* Even on empty input string return something not NULL. */ 723 | if (vector == NULL) vector = malloc(sizeof(void*)); 724 | return vector; 725 | } 726 | } 727 | 728 | err: 729 | while((*argc)--) 730 | sdsfree(vector[*argc]); 731 | free(vector); 732 | if (current) sdsfree(current); 733 | *argc = 0; 734 | return NULL; 735 | } 736 | 737 | /* Modify the string substituting all the occurrences of the set of 738 | * characters specified in the 'from' string to the corresponding character 739 | * in the 'to' array. 740 | * 741 | * For instance: sdsmapchars(mystring, "ho", "01", 2) 742 | * will have the effect of turning the string "hello" into "0ell1". 743 | * 744 | * The function returns the sds string pointer, that is always the same 745 | * as the input pointer since no resize is needed. */ 746 | sds sdsmapchars(sds s, const char *from, const char *to, size_t setlen) { 747 | size_t j, i, l = sdslen(s); 748 | 749 | for (j = 0; j < l; j++) { 750 | for (i = 0; i < setlen; i++) { 751 | if (s[j] == from[i]) { 752 | s[j] = to[i]; 753 | break; 754 | } 755 | } 756 | } 757 | return s; 758 | } 759 | 760 | /* Join an array of C strings using the specified separator (also a C string). 761 | * Returns the result as an sds string. 
*/ 762 | sds sdsjoin(char **argv, int argc, char *sep, size_t seplen) { 763 | sds join = sdsempty(); 764 | int j; 765 | 766 | for (j = 0; j < argc; j++) { 767 | join = sdscat(join, argv[j]); 768 | if (j != argc-1) join = sdscatlen(join,sep,seplen); 769 | } 770 | return join; 771 | } 772 | 773 | /* Like sdsjoin, but joins an array of SDS strings. */ 774 | sds sdsjoinsds(sds *argv, int argc, const char *sep, size_t seplen) { 775 | sds join = sdsempty(); 776 | int j; 777 | 778 | for (j = 0; j < argc; j++) { 779 | join = sdscatsds(join, argv[j]); 780 | if (j != argc-1) join = sdscatlen(join,sep,seplen); 781 | } 782 | return join; 783 | } 784 | 785 | #ifdef SDS_TEST_MAIN 786 | #include 787 | #include "testhelp.h" 788 | 789 | int main(void) { 790 | { 791 | struct sdshdr *sh; 792 | sds x = sdsnew("foo"), y; 793 | 794 | test_cond("Create a string and obtain the length", 795 | sdslen(x) == 3 && memcmp(x,"foo\0",4) == 0) 796 | 797 | sdsfree(x); 798 | x = sdsnewlen("foo",2); 799 | test_cond("Create a string with specified length", 800 | sdslen(x) == 2 && memcmp(x,"fo\0",3) == 0) 801 | 802 | x = sdscat(x,"bar"); 803 | test_cond("Strings concatenation", 804 | sdslen(x) == 5 && memcmp(x,"fobar\0",6) == 0); 805 | 806 | x = sdscpy(x,"a"); 807 | test_cond("sdscpy() against an originally longer string", 808 | sdslen(x) == 1 && memcmp(x,"a\0",2) == 0) 809 | 810 | x = sdscpy(x,"xyzxxxxxxxxxxyyyyyyyyyykkkkkkkkkk"); 811 | test_cond("sdscpy() against an originally shorter string", 812 | sdslen(x) == 33 && 813 | memcmp(x,"xyzxxxxxxxxxxyyyyyyyyyykkkkkkkkkk\0",33) == 0) 814 | 815 | sdsfree(x); 816 | x = sdscatprintf(sdsempty(),"%d",123); 817 | test_cond("sdscatprintf() seems working in the base case", 818 | sdslen(x) == 3 && memcmp(x,"123\0",4) ==0) 819 | 820 | sdsfree(x); 821 | x = sdsnew("xxciaoyyy"); 822 | sdstrim(x,"xy"); 823 | test_cond("sdstrim() correctly trims characters", 824 | sdslen(x) == 4 && memcmp(x,"ciao\0",5) == 0) 825 | 826 | y = sdsdup(x); 827 | sdsrange(y,1,1); 828 | 
test_cond("sdsrange(...,1,1)", 829 | sdslen(y) == 1 && memcmp(y,"i\0",2) == 0) 830 | 831 | sdsfree(y); 832 | y = sdsdup(x); 833 | sdsrange(y,1,-1); 834 | test_cond("sdsrange(...,1,-1)", 835 | sdslen(y) == 3 && memcmp(y,"iao\0",4) == 0) 836 | 837 | sdsfree(y); 838 | y = sdsdup(x); 839 | sdsrange(y,-2,-1); 840 | test_cond("sdsrange(...,-2,-1)", 841 | sdslen(y) == 2 && memcmp(y,"ao\0",3) == 0) 842 | 843 | sdsfree(y); 844 | y = sdsdup(x); 845 | sdsrange(y,2,1); 846 | test_cond("sdsrange(...,2,1)", 847 | sdslen(y) == 0 && memcmp(y,"\0",1) == 0) 848 | 849 | sdsfree(y); 850 | y = sdsdup(x); 851 | sdsrange(y,1,100); 852 | test_cond("sdsrange(...,1,100)", 853 | sdslen(y) == 3 && memcmp(y,"iao\0",4) == 0) 854 | 855 | sdsfree(y); 856 | y = sdsdup(x); 857 | sdsrange(y,100,100); 858 | test_cond("sdsrange(...,100,100)", 859 | sdslen(y) == 0 && memcmp(y,"\0",1) == 0) 860 | 861 | sdsfree(y); 862 | sdsfree(x); 863 | x = sdsnew("foo"); 864 | y = sdsnew("foa"); 865 | test_cond("sdscmp(foo,foa)", sdscmp(x,y) > 0) 866 | 867 | sdsfree(y); 868 | sdsfree(x); 869 | x = sdsnew("bar"); 870 | y = sdsnew("bar"); 871 | test_cond("sdscmp(bar,bar)", sdscmp(x,y) == 0) 872 | 873 | sdsfree(y); 874 | sdsfree(x); 875 | x = sdsnew("aar"); 876 | y = sdsnew("bar"); 877 | test_cond("sdscmp(bar,bar)", sdscmp(x,y) < 0) 878 | 879 | sdsfree(y); 880 | sdsfree(x); 881 | x = sdsnewlen("\a\n\0foo\r",7); 882 | y = sdscatrepr(sdsempty(),x,sdslen(x)); 883 | test_cond("sdscatrepr(...data...)", 884 | memcmp(y,"\"\\a\\n\\x00foo\\r\"",15) == 0) 885 | 886 | { 887 | int oldfree; 888 | 889 | sdsfree(x); 890 | x = sdsnew("0"); 891 | sh = (void*) (x-(sizeof(struct sdshdr))); 892 | test_cond("sdsnew() free/len buffers", sh->len == 1 && sh->free == 0); 893 | x = sdsMakeRoomFor(x,1); 894 | sh = (void*) (x-(sizeof(struct sdshdr))); 895 | test_cond("sdsMakeRoomFor()", sh->len == 1 && sh->free > 0); 896 | oldfree = sh->free; 897 | x[1] = '1'; 898 | sdsIncrLen(x,1); 899 | test_cond("sdsIncrLen() -- content", x[0] == '0' && x[1] == 
'1'); 900 | test_cond("sdsIncrLen() -- len", sh->len == 2); 901 | test_cond("sdsIncrLen() -- free", sh->free == oldfree-1); 902 | } 903 | } 904 | test_report() 905 | return 0; 906 | } 907 | #endif 908 | --------------------------------------------------------------------------------