├── example
    ├── target_spiked_simple.vcf.gz
    ├── 1KG_cftr_background.recode.vep.vcf.gz
    └── 1KG_cftr_background.recode.vep.vcf.gz.tbi
├── aa_weight.h
├── score_variant.h
├── Makefile
├── Makefile~
├── background_max_scores.h
├── vvp_lookup.h
├── search_binary_bkgrnd.h
├── vvp_headers.h
├── parse_vcf.h
├── vvp_lookup.c
├── kvec.h
├── README.md
├── sds.h
├── score_variant.c
├── background_max_scores.c
├── search_binary_bkgrnd.c
├── bit_macros.h
├── aa_weight.c
├── score_variants.c
├── parse_vcf.c
├── bit_array.h
├── khash.h
└── sds.c


/example/target_spiked_simple.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yandell-Lab/VVP-pub/HEAD/example/target_spiked_simple.vcf.gz


--------------------------------------------------------------------------------
/example/1KG_cftr_background.recode.vep.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yandell-Lab/VVP-pub/HEAD/example/1KG_cftr_background.recode.vep.vcf.gz


--------------------------------------------------------------------------------
/example/1KG_cftr_background.recode.vep.vcf.gz.tbi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Yandell-Lab/VVP-pub/HEAD/example/1KG_cftr_background.recode.vep.vcf.gz.tbi


--------------------------------------------------------------------------------
/aa_weight.h:
--------------------------------------------------------------------------------
 1 | //
 2 | //  aa_weight.h
 3 | //  VVP_dev_xcode
 4 | //
 5 | //  Created by STEVEN FLYGARE on 10/11/16.
 6 | //  Copyright © 2016 IDbyDNA. All rights reserved.
 7 | //
 8 | 
 9 | #ifndef aa_weight_h
10 | #define aa_weight_h
11 | 
12 | #include "vvp_headers.h"
13 | #include "parse_vcf.h"
14 | 
15 | struct aa_matrix {
16 |     char aa_change[20];
17 |     float score;
18 |     float cons;
19 |     float uncons;
20 |     UT_hash_handle hh;
21 | };
22 | 
23 | void init_aa_score();
24 | void get_aaw(struct transcript_anno_info ** ttai, sds ref, sds var, float phast);
25 | 
26 | #endif /* aa_weight_h */
27 | 


--------------------------------------------------------------------------------
/score_variant.h:
--------------------------------------------------------------------------------
 1 | //
 2 | //  score_variant.h
 3 | //  vcf_parser
 4 | //
 5 | //  Created by steven on 6/25/15.
 6 | //  Copyright (c) 2015 yandell lab. All rights reserved.
 7 | //
 8 | 
 9 | #ifndef __vcf_parser__score_variant__
10 | #define __vcf_parser__score_variant__
11 | 
12 | #include "vvp_headers.h"
13 | #include "parse_vcf.h"
14 | 
15 | void score_variant_b(struct variant * v, int no_allele_frequency);
16 | void score_variant_t_b(struct variant * v, int nb, int xu, int no_allele_frequency); //nb is with nocalls taken into account, xu is the background allele count
17 | 
18 | #endif /* defined(__vcf_parser__score_variant__) */
19 | 
20 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | CC = gcc
 2 | CFLAGS = -lz -lm -O3 -lgsl -lgslcblas -fopenmp #-Wall
 3 | TARGETS = build_background VVP
 4 | 
 5 | all: $(TARGETS)
 6 | 
 7 | build_background: aa_weight.o bit_array.o score_variant.o parse_vcf.o sds.o build_background.o
 8 | 	$(CC) aa_weight.o bit_array.o score_variant.o parse_vcf.o sds.o build_background.o -o $@ $(CFLAGS)
 9 | 
10 | VVP: aa_weight.o bit_array.o score_variant.o parse_vcf.o sds.o vvp_lookup.o search_binary_bkgrnd.o score_variants.o
11 | 	$(CC) aa_weight.o bit_array.o score_variant.o parse_vcf.o sds.o vvp_lookup.o search_binary_bkgrnd.o score_variants.o -o $@ $(CFLAGS)
12 | 
13 | .c.o:
14 | 	$(CC) -c $< $(CFLAGS)
15 | 
16 | clean:
17 | 	rm -f *.o
18 | 	rm -f $(TARGETS)
19 | 
20 | 


--------------------------------------------------------------------------------
/Makefile~:
--------------------------------------------------------------------------------
 1 | CC = gcc
 2 | CFLAGS = -lz -lm -O3 -lgsl -lgslcblas -fopenmp #-Wall
 3 | TARGETS = build_background VVP
 4 | 
 5 | all: $(TARGETS)
 6 | 
 7 | build_background: aa_weight.o bit_array.o score_variant.o parse_vcf.o sds.o build_background.o
 8 | 	$(CC) $(CFLAGS) aa_weight.o bit_array.o score_variant.o parse_vcf.o sds.o build_background.o -o $@
 9 | 
10 | VVP: aa_weight.o bit_array.o score_variant.o parse_vcf.o sds.o vvp_lookup.o search_binary_bkgrnd.o score_variants.o
11 | 	$(CC) $(CFLAGS) aa_weight.o bit_array.o score_variant.o parse_vcf.o sds.o vvp_lookup.o search_binary_bkgrnd.o score_variants.o -o $@ 
12 | 
13 | .c.o:
14 | 	$(CC) $(CFLAGS) -c $<
15 | 
16 | clean:
17 | 	rm -f *.o
18 | 	rm -f $(TARGETS)
19 | 
20 | 


--------------------------------------------------------------------------------
/background_max_scores.h:
--------------------------------------------------------------------------------
 1 | //
 2 | //  background_max_scores.h
 3 | //  VVP_C
 4 | //
 5 | //  Created by steven on 8/13/15.
 6 | //  Copyright (c) 2015 yandell lab. All rights reserved.
 7 | //
 8 | 
 9 | #ifndef __VVP_C__background_max_scores__
10 | #define __VVP_C__background_max_scores__
11 | 
12 | #include "vvp_headers.h"
13 | 
14 | #define BKRND_GENE_NAME_LEN 50
15 | 
16 | struct bkgrnd_max_scores {
17 |     char gene[BKRND_GENE_NAME_LEN];
18 |     float * max_scores;
19 |     UT_hash_handle hh;
20 | };
21 | 
22 | void init_bkgrnd_max(char * bkgrnd_max, int nb, char iht);
23 | 
24 | void init_bkgrnd_max_b(char * bkgrnd_max, int nb, char iht);
25 | 
26 | struct bkgrnd_max_scores * get_gene_max(char * gene);
27 | 
28 | void cleanup_bkgrnd_max();
29 | 
30 | #endif /* defined(__VVP_C__background_max_scores__) */
31 | 
32 | 


--------------------------------------------------------------------------------
/vvp_lookup.h:
--------------------------------------------------------------------------------
 1 | //
 2 | //  vvp_lookup.h
 3 | //  VVP_C
 4 | //
 5 | //  Created by steven on 8/12/15.
 6 | //  Copyright (c) 2015 yandell lab. All rights reserved.
 7 | //
 8 | 
 9 | #ifndef __VVP_C__vvp_lookup__
10 | #define __VVP_C__vvp_lookup__
11 | 
12 | #include "vvp_headers.h"
13 | 
14 | #define NPERCENTILES 100
15 | 
16 | struct feature_lookups {
17 |     char feature_name[FEATURE_NAME_LEN];
18 |     float coding_vals[NPERCENTILES];
19 |     float noncoding_vals[NPERCENTILES];
20 |     int n_coding;
21 |     int n_noncoding;
22 |     UT_hash_handle hh;
23 | };
24 | 
25 | void load_feature_lookups_b(sds lookup_file);
26 | 
27 | int score_lookup(char * feature_name, float score, int coding);
28 | 
29 | int score_lookup_b(char * feature_name, float score, int coding);
30 | 
31 | void destroy_feature_lookups();
32 | 
33 | 
34 | #endif /* defined(__VVP_C__vvp_lookup__) */
35 | 


--------------------------------------------------------------------------------
/search_binary_bkgrnd.h:
--------------------------------------------------------------------------------
 1 | //
 2 | //  search_binary_bkgrnd.h
 3 | //  VVP_C
 4 | //
 5 | //  Created by steven on 8/11/15.
 6 | //  Copyright (c) 2015 yandell lab. All rights reserved.
 7 | //
 8 | 
 9 | #ifndef __VVP_C__search_binary_bkgrnd__
10 | #define __VVP_C__search_binary_bkgrnd__
11 | 
12 | #include "vvp_headers.h"
13 | 
/*
 * Per-chromosome index record, hashable through uthash (hh).
 * NOTE(review): byte_start/byte_end presumably delimit this chromosome's
 * records inside the binary background file and n_entries counts them --
 * confirm against load_offsets() in search_binary_bkgrnd.c.
 */
struct chr_offsets {
    char chr[3]; // NOTE(review): holds at most 2 characters plus the NUL terminator -- confirm names never carry a "chr" prefix
    //char * chr;
    uint64_t byte_start;
    uint64_t byte_end;
    int n_entries;
    UT_hash_handle hh; // uthash bookkeeping handle
};
22 | 
23 | struct var_info {
24 |     char var_type;
25 |     int length;
26 |     int nhet;
27 |     int nhom;
28 |     int nhemi;
29 |     int nocall;
30 |     uint64_t bit_offset;
31 | };
32 | 
33 | struct m_var_info {
34 |     struct var_info ** vi;
35 |     int nv;
36 | };
37 | 
38 | unsigned char * load_bin_db(sds file_prefix, int * n_background);
39 | 
40 | unsigned char * load_bit_db(sds file_prefix);
41 | 
42 | struct chr_offsets * load_offsets(sds file_prefix);
43 | 
44 | struct m_var_info * search_binary_bkgrnd(char * chr, size_t pos, unsigned char * mm_bin, struct chr_offsets * chro);
45 | 
46 | void destroy_chr_offsets(struct chr_offsets * chro);
47 | 
48 | #endif /* defined(__VVP_C__search_binary_bkgrnd__) */
49 | 


--------------------------------------------------------------------------------
/vvp_headers.h:
--------------------------------------------------------------------------------
 1 | //
 2 | //  vvp_headers.h
 3 | //  VVP_C
 4 | //
 5 | //  Created by steven on 8/12/15.
 6 | //  Copyright (c) 2015 yandell lab. All rights reserved.
 7 | //
 8 | 
 9 | #ifndef VVP_C_vvp_headers_h
10 | #define VVP_C_vvp_headers_h
11 | 
12 | #include "sds.h"
13 | #include "uthash.h"
14 | #include "kvec.h"
15 | #include "khash.h"
16 | #include "bit_array.h"
17 | #include <math.h>
18 | #include <stdio.h>
19 | #include <sys/mman.h>
20 | #include <sys/stat.h>
21 | #include <fcntl.h>
22 | #include <string.h>
23 | #include <unistd.h>
24 | #include <stdlib.h>
25 | #include <zlib.h>
26 | #include <time.h>
27 | #include <pthread.h>
28 | 
29 | struct config {
30 |     sds target_vcf;
31 |     sds background_vcf;
32 |     sds db_prefix;
33 |     sds vvp_formatted;
34 |     char inheritance_filters;
35 |     char penetrance;
36 |     int mother_sample_index;
37 |     int father_sample_index;
38 |     int proband_sample_index;
39 |     int sibling_sample_index;
40 |     int sibling_affected;
41 |     int nb;
42 |     int nts;
43 |     int nt;
44 |     sds anno_tag;
45 |     int variant_pos;
46 |     int gene_pos;
47 |     int aa_pos;
48 |     int so_pos;
49 |     char iht;
50 |     int format_output;
51 |     int np;
52 |     int only_coding;
53 |     int only_snv;
54 |     size_t n_permutations;
55 |     size_t mat_rows;
56 | };
57 | 
58 | #ifdef _OPENMP
59 | #include <omp.h>
60 | #endif
61 | 
62 | #define BUF_SIZE 5000000
63 | #define MAX_SCORES 100000
64 | #define LINE_BYTE_SIZE 35
65 | #define NPERCENTILES 100
66 | #define MAX_VARS 20
67 | #define FEATURE_NAME_LEN 50
68 | #define FEATURE_NAME_LENGTH 50
69 | 
70 | #endif
71 | 


--------------------------------------------------------------------------------
/parse_vcf.h:
--------------------------------------------------------------------------------
 1 | //
 2 | //  parse_vcf.h
 3 | //  VVP_dev_xcode
 4 | //
 5 | //  Created by STEVEN FLYGARE on 10/10/16.
 6 | //  Copyright © 2016 IDbyDNA. All rights reserved.
 7 | //
 8 | 
 9 | #ifndef parse_vcf_h
10 | #define parse_vcf_h
11 | 
12 | #include "vvp_headers.h"
13 | #include "aa_weight.h"
14 | #include <gsl/gsl_rng.h>
15 | 
16 | struct vep_field_info {
17 |     uint8_t gene_index;
18 |     uint8_t transcript_index;
19 |     uint8_t seq_ontology_tag_index;
20 |     uint8_t amino_acid_change_index;
21 |     sds annotation_tag_name;
22 |     int ll_weight_index;
23 | };
24 | 
25 | struct transcript_anno_info {
26 |     char transcript_name[FEATURE_NAME_LENGTH];
27 |     float aaw;
28 |     float het_score;
29 |     int het_vvp;
30 |     float hom_score;
31 |     int hom_vvp;
32 |     float hemi_score;
33 |     int hemi_vvp;
34 |     int coding;
35 |     float llw; //likelihood weight
36 |     sds pref;
37 |     sds pvar;
38 |     kvec_t(sds) anno_tags;
39 |     UT_hash_handle hh;
40 | };
41 | 
42 | struct gene_transcript {
43 |     char gene_name[FEATURE_NAME_LENGTH];
44 |     struct transcript_anno_info * tai;
45 |     UT_hash_handle hh;
46 | };
47 | 
/*
 * One variant record as produced by parse_vcf_line() /
 * parse_allele_frequency_line() and released with destroy_variant().
 * NOTE(review): the b_* fields presumably hold background genotype counts
 * (the README output columns n_bhemi/n_bhet/n_bhom/n_bnocall) -- confirm in
 * the code that fills them.
 */
struct variant {
    sds chr;
    sds vid;
    size_t pos;
    sds ref;
    sds var;
    int indel;
    struct gene_transcript * gt; //uthash table of genes, each holding its transcripts (iterated in score_variant.c)
    float phast;
    int nref;  //total number of ref alleles
    int ni; //total number of individuals
    int b_nhet;
    int b_nhom;
    int b_nhemi;
    int b_nocall;
    uint64_t bit_offset;
    sds hemi_indv; //hemizygous individuals (comma separated list)
    sds het_indv; //heterozygous individuals (comma separated list)
    sds hom_indv; //homozygous individuals (comma separated list)
    kvec_t(int) hemi;
    kvec_t(int) hets;
    kvec_t(int) homs;
    kvec_t(int) het_nocalls; //for heterozygous nocalls
    kvec_t(int) hom_nocalls; //for homozygous nocalls
    kvec_t(int) hemi_nocalls; //for hemizygous nocalls
};
74 | 
75 | 
76 | void initialize_parse_vcf(uint8_t gene_index, uint8_t transcript_index, uint8_t seq_ontology_tag_index, uint8_t amino_acid_change_index, sds annotation_tag_name, int ll_weight_index);
77 | 
78 | struct variant * parse_vcf_line(sds line, int no_aa_weight);
79 | 
80 | struct variant * parse_allele_frequency_line(sds line, int no_aa_weight);
81 | 
82 | void destroy_variant(struct variant * v);
83 | 
84 | void print_variant(struct variant * v);
85 | 
86 | #endif /* parse_vcf_h */
87 | 


--------------------------------------------------------------------------------
/vvp_lookup.c:
--------------------------------------------------------------------------------
 1 | //
 2 | //  vvp_lookup.c
 3 | //  VVP_C
 4 | //
 5 | //  Created by steven on 8/12/15.
 6 | //  Copyright (c) 2015 yandell lab. All rights reserved.
 7 | //
 8 | 
 9 | #include "vvp_lookup.h"
10 | 
11 | static struct feature_lookups * lookups;
12 | 
13 | static unsigned char * mm_dist;
14 | static uint64_t mm_dist_size;
15 | static int dist_line_size;
16 | 
17 | void load_feature_lookups_b(sds lookup_file) {
18 |     
19 |     int fdSrc = open(lookup_file, O_RDWR, 0);
20 |     struct stat st;
21 |     fstat(fdSrc, &st);
22 |     mm_dist = (unsigned char *)mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, fdSrc, 0);
23 |     if(mm_dist == MAP_FAILED){
24 |         fprintf(stderr, "FATAL:  could not create mmap from %s\n", lookup_file);
25 |         exit(1);
26 |     }
27 |     
28 |     mm_dist_size = st.st_size;
29 |     
30 |     dist_line_size = sizeof(char)*FEATURE_NAME_LEN + sizeof(float)*NPERCENTILES + sizeof(float)*NPERCENTILES + sizeof(size_t) + sizeof(size_t);
31 | 
32 | }
33 | 
34 | int score_lookup_b(char * feature_name, float score, int coding){
35 |     
36 |     int min = 0;
37 |     int max = mm_dist_size / dist_line_size; //place pointer at start of final line
38 |     float * percentiles = NULL;
39 |     while (max >= min) {
40 |         //uint mid = min + ((max - min) >> 1); //floor average
41 |         int mid = (min + max) / 2;
42 |         uint64_t tmp_offset = mid*dist_line_size;
43 |         int cmp = strcmp((char *)(mm_dist+tmp_offset), feature_name);
44 |         if ( cmp == 0 ) {
45 |             if (coding == 1) {
46 |                 percentiles = (float *)(mm_dist + tmp_offset + sizeof(char)*FEATURE_NAME_LEN);
47 |             }
48 |             else if(coding == 0) {
49 |                 percentiles = (float *)(mm_dist + tmp_offset + sizeof(char)*FEATURE_NAME_LEN + sizeof(float)*NPERCENTILES);
50 |             }
51 |             else {
52 |                 return -1;
53 |             }
54 |             int i=0;
55 |             for (i = 0; i < NPERCENTILES; i++) {
56 |                 if ( score < percentiles[i] || fabsf(score - percentiles[i]) < .01 ) { //return if within .01
57 |                     return i;
58 |                 }
59 |             }
60 |             return 100;
61 |         }
62 |         else if (cmp < 0){
63 |             min = mid + 1;
64 |         }
65 |         else {
66 |             max = mid - 1;
67 |         }
68 |     }
69 |     
70 |     return -1;
71 |     
72 | }
73 | 
74 | void destroy_feature_lookups(){
75 |     struct feature_lookups *s, *tmp;
76 |     HASH_ITER(hh, lookups, s, tmp){
77 |         HASH_DEL(lookups, s);
78 |         free(s);
79 |     }
80 | }
81 | 


--------------------------------------------------------------------------------
/kvec.h:
--------------------------------------------------------------------------------
 1 | /* The MIT License
 2 | 
 3 |    Copyright (c) 2008, by Attractive Chaos <attractor@live.co.uk>
 4 | 
 5 |    Permission is hereby granted, free of charge, to any person obtaining
 6 |    a copy of this software and associated documentation files (the
 7 |    "Software"), to deal in the Software without restriction, including
 8 |    without limitation the rights to use, copy, modify, merge, publish,
 9 |    distribute, sublicense, and/or sell copies of the Software, and to
10 |    permit persons to whom the Software is furnished to do so, subject to
11 |    the following conditions:
12 | 
13 |    The above copyright notice and this permission notice shall be
14 |    included in all copies or substantial portions of the Software.
15 | 
16 |    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 |    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 |    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 |    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 |    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 |    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 |    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 |    SOFTWARE.
24 | */
25 | 
26 | /*
27 |   An example:
28 | 
29 | #include "kvec.h"
30 | int main() {
31 | 	kvec_t(int) array;
32 | 	kv_init(array);
33 | 	kv_push(int, array, 10); // append
34 | 	kv_a(int, array, 20) = 5; // dynamic
35 | 	kv_A(array, 20) = 4; // static
36 | 	kv_destroy(array);
37 | 	return 0;
38 | }
39 | */
40 | 
41 | /*
42 |   2008-09-22 (0.1.0):
43 | 
44 | 	* The initial version.
45 | 
46 | */
47 | 
48 | #ifndef AC_KVEC_H
49 | #define AC_KVEC_H
50 | 
51 | #include <stdlib.h>
52 | 
53 | #ifdef USE_MALLOC_WRAPPERS
54 | #  include "malloc_wrap.h"
55 | #endif
56 | 
57 | #define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
58 | 
59 | #define kvec_t(type) struct { size_t n, m; type *a; }
60 | #define kv_init(v) ((v).n = (v).m = 0, (v).a = 0)
61 | #define kv_destroy(v) free((v).a)
62 | #define kv_A(v, i) ((v).a[(i)])
63 | #define kv_pop(v) ((v).a[--(v).n])
64 | #define kv_size(v) ((v).n)
65 | #define kv_max(v) ((v).m)
66 | 
67 | #define kv_resize(type, v, s)  ((v).m = (s), (v).a = (type*)realloc((v).a, sizeof(type) * (v).m))
68 | 
69 | #define kv_copy(type, v1, v0) do {							\
70 | 		if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n);	\
71 | 		(v1).n = (v0).n;									\
72 | 		memcpy((v1).a, (v0).a, sizeof(type) * (v0).n);		\
73 | 	} while (0)												\
74 | 
75 | #define kv_push(type, v, x) do {									\
76 | 		if ((v).n == (v).m) {										\
77 | 			(v).m = (v).m? (v).m<<1 : 2;							\
78 | 			(v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \
79 | 		}															\
80 | 		(v).a[(v).n++] = (x);										\
81 | 	} while (0)
82 | 
83 | #define kv_pushp(type, v) ((((v).n == (v).m)?							\
84 | 						   ((v).m = ((v).m? (v).m<<1 : 2),				\
85 | 							(v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0)	\
86 | 						   : 0), &(v).a[(v).n++])
87 | 
88 | #define kv_a(type, v, i) (((v).m <= (size_t)(i)? \
89 | 						  ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \
90 | 						   (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
91 | 						  : (v).n <= (size_t)(i)? (v).n = (i) + 1 \
92 | 						  : 0), (v).a[(i)])
93 | 
94 | #endif
95 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # VVP
 2 | Variant prioritization / burden test.  Version 1.5
 3 | 
 4 | ## INSTALL
 5 | ### DEPENDENCIES  
 6 | 
 7 | 1. Gnu scientific library (https://www.gnu.org/software/gsl/)  
 8 | 2. openmp compatible version of gcc.  If your compiler (clang) is not, you can remove the -fopenmp flag in the Makefile.  Change the line that looks like: CFLAGS = -lz -lm -O3 -lgsl -lgslcblas -fopenmp #-Wall to CFLAGS = -lz -lm -O3 -lgsl -lgslcblas #-fopenmp #-Wall
 9 | 3. zlib (https://zlib.net)
10 | 4. make
11 | 
12 | ### BUILD
13 | 
14 | In the VVP directory:
15 | 
16 | `make`
17 | 
18 | Make will build 2 executables:  build_background and VVP
19 | 
20 | Note:  This has been built and run on Mac laptops and Linux servers.  
21 | 
22 | ## EXAMPLE RUNNING VVP
23 | 
24 | To see available parameters of the executables, run with the -h option.  
25 | 
26 | Before running VVP, a background must be built.  From the VVP directory:
27 | 
28 | `cd example`
29 | 
30 | `../build_background -i 1KG_cftr_background.recode.vep.vcf.gz -o 1KG.build -b 2500 -v CSQ,4,6,1,15`
31 | 
 32 | The build_background step writes one line to stdout for each variant in the background vcf file.  It also creates several output files, with extensions including .bin, .chr_offsets.txt, and .dist.  These files contain information used by VVP.  
33 | 
 34 | To prioritize variants using VVP (in the example folder):
35 | 
36 | `../VVP -i target_spiked_simple.vcf.gz -d 1KG.build -v CSQ,4,6,1,15 1> target.spiked.vvp.out`
37 | 
 38 | target.spiked.vvp.out contains the vvp output.
39 | 
40 | ### PREPARE VCF FILE FOR ANALYSIS
41 | 
 42 | The VVP pipeline does not support multiallelic lines; these must first be decomposed.  We recommend using vt decompose to accomplish this task (http://genome.sph.umich.edu/wiki/Vt).
43 | 
 44 | **Mandatory** preprocessing of a vcf file includes **multiallelic decomposition and VEP annotation**.  It is important to decompose **BEFORE** annotating because of potential annotation collisions.  Our recommended steps are to use vt to decompose and normalize variants, followed by VEP annotation.  No special options in VEP are required for the variant annotation.  Testing has been done with VEP v82.    
45 | 
46 | ## VVP BACKGROUND
47 | A prebuilt background based on gnomAD (http://gnomad.broadinstitute.org/) for use with VVP can be downloaded here (2.5GB): https://s3-us-west-2.amazonaws.com/gnomad-vvp-background/gnomad.062717.build.tar.gz
48 | 
49 | ## VVP OUTPUT
50 | VVP outputs a tab delimited file with 31 columns.  The columns are the following: 
51 | 
52 | |column name|description|
53 | |-----------|-----------|
54 | |chr| chromosome |
55 | |start| variant start coord|
56 | |ref| reference allele|
57 | |var| variant allele |
58 | |gene| gene id |
59 | |transcript| transcript id |
60 | |hemi_score| raw variant score for hemizygous genotype |
61 | |hemi_vvp| vvp score for hemizygous genotype |
 62 | |nhemi| number of hemizygous individuals |
63 | |hemi_indvs| list of hemizygous individuals |
64 | |hemi_nocall| number of hemizygous nocalls |
65 | |het_score| raw variant score for heterozygous genotype|
66 | |het_vvp| vvp score for heterozygous genotype |
67 | |nhet| number of heterozygous individuals |
68 | |het_indvs| list of heterozygous individuals |
69 | |het_nocall| number of heterozygous nocalls |
70 | |hom_score| raw variant score for homozygous genotype |
71 | |hom_vvp| vvp score for homozygous genotype|
72 | |nhom| number of homozygous individuals |
73 | |hom_indvs| list of homozygous individuals |
74 | |hom_nocall| number of homozygous nocalls |
75 | |coding_ind| 1 if variant is coding, 0 otherwise |
76 | |indel_ind| 1 if variant is an indel, 0 otherwise |
77 | |aa_score| amino acid weight |
78 | |n_bhemi| number of hemizygous background individuals |
79 | |n_bhet| number of heterozygous background individuals |
80 | |n_bhom| number of homozygous background individuals |
81 | |n_bnocall| number of alleles nocalled in background |
82 | |bit_offset| byte offset to background |
83 | |vid| variant id |
84 | |ll_weight| optional extra weight |
85 | 
86 | 
87 | 
88 | 
89 | 
90 | 
91 | 


--------------------------------------------------------------------------------
/sds.h:
--------------------------------------------------------------------------------
  1 | /* SDS (Simple Dynamic Strings), A C dynamic strings library.
  2 |  *
  3 |  * Copyright (c) 2006-2014, Salvatore Sanfilippo <antirez at gmail dot com>
  4 |  * All rights reserved.
  5 |  *
  6 |  * Redistribution and use in source and binary forms, with or without
  7 |  * modification, are permitted provided that the following conditions are met:
  8 |  *
  9 |  *   * Redistributions of source code must retain the above copyright notice,
 10 |  *     this list of conditions and the following disclaimer.
 11 |  *   * Redistributions in binary form must reproduce the above copyright
 12 |  *     notice, this list of conditions and the following disclaimer in the
 13 |  *     documentation and/or other materials provided with the distribution.
 14 |  *   * Neither the name of Redis nor the names of its contributors may be used
 15 |  *     to endorse or promote products derived from this software without
 16 |  *     specific prior written permission.
 17 |  *
 18 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 19 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 20 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 21 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 22 |  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 23 |  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 24 |  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 25 |  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 26 |  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 27 |  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 28 |  * POSSIBILITY OF SUCH DAMAGE.
 29 |  */
 30 | 
 31 | #ifndef __SDS_H
 32 | #define __SDS_H
 33 | 
 34 | #define SDS_MAX_PREALLOC (1024*1024)
 35 | 
 36 | #include <sys/types.h>
 37 | #include <stdarg.h>
 38 | 
 39 | typedef char *sds;
 40 | 
 41 | struct sdshdr {
 42 |     int len;
 43 |     int free;
 44 |     char buf[];
 45 | };
 46 | 
 47 | static inline size_t sdslen(const sds s) {
 48 |     struct sdshdr *sh = (void*)(s-sizeof *sh);
 49 |     return sh->len;
 50 | }
 51 | 
 52 | static inline size_t sdsavail(const sds s) {
 53 |     struct sdshdr *sh = (void*)(s-sizeof *sh);
 54 |     return sh->free;
 55 | }
 56 | 
 57 | sds sdsnewlen(const void *init, size_t initlen);
 58 | sds sdsnew(const char *init);
 59 | sds sdsempty(void);
 60 | size_t sdslen(const sds s);
 61 | sds sdsdup(const sds s);
 62 | void sdsfree(sds s);
 63 | size_t sdsavail(const sds s);
 64 | sds sdsgrowzero(sds s, size_t len);
 65 | sds sdscatlen(sds s, const void *t, size_t len);
 66 | sds sdscat(sds s, const char *t);
 67 | sds sdscatsds(sds s, const sds t);
 68 | sds sdscpylen(sds s, const char *t, size_t len);
 69 | sds sdscpy(sds s, const char *t);
 70 | 
 71 | sds sdscatvprintf(sds s, const char *fmt, va_list ap);
 72 | #ifdef __GNUC__
 73 | sds sdscatprintf(sds s, const char *fmt, ...)
 74 |     __attribute__((format(printf, 2, 3)));
 75 | #else
 76 | sds sdscatprintf(sds s, const char *fmt, ...);
 77 | #endif
 78 | 
 79 | void sdstrim(sds s, const char *cset);
 80 | void sdsrange(sds s, int start, int end);
 81 | void sdsupdatelen(sds s);
 82 | void sdsclear(sds s);
 83 | int sdscmp(const sds s1, const sds s2);
 84 | sds *sdssplitlen(const char *s, int len, const char *sep, int seplen, int *count);
 85 | void sdsfreesplitres(sds *tokens, int count);
 86 | void sdstolower(sds s);
 87 | void sdstoupper(sds s);
 88 | sds sdsfromlonglong(long long value);
 89 | sds sdscatrepr(sds s, const char *p, size_t len);
 90 | sds *sdssplitargs(const char *line, int *argc);
 91 | sds sdsmapchars(sds s, const char *from, const char *to, size_t setlen);
 92 | sds sdsjoin(char **argv, int argc, char *sep, size_t seplen);
 93 | sds sdsjoinsds(sds *argv, int argc, const char *sep, size_t seplen);
 94 | 
 95 | /* Low level functions exposed to the user API */
 96 | sds sdsMakeRoomFor(sds s, size_t addlen);
 97 | void sdsIncrLen(sds s, int incr);
 98 | sds sdsRemoveFreeSpace(sds s);
 99 | size_t sdsAllocSize(sds s);
100 | 
101 | #endif
102 | 


--------------------------------------------------------------------------------
/score_variant.c:
--------------------------------------------------------------------------------
  1 | //
  2 | //  score_variant.c
  3 | //  vcf_parser
  4 | //
  5 | //  Created by steven on 6/25/15.
  6 | //  Copyright (c) 2015 yandell lab. All rights reserved.
  7 | //
  8 | 
  9 | #include "score_variant.h"
 10 | 
 11 | float compute_score(int nb, int nt, int xa, int xu, float aaw, float llw, int no_allele_frequency) {
 12 |     
 13 |     //nb = nb - nt >= 0 ? nb - nt : 0;
 14 |     //xu = xu - xa >= 0 ? xu - xa : 0;
 15 |     
 16 |     int x = xa + xu;
 17 |     int n = nb + nt;
 18 |     
 19 |     float p = (float)x / (float) n;
 20 |     
 21 |     if (p < 1e-10 || (1.0 - p) < 1e-10) { //if p is essentially 0 or 1, the score is 0
 22 |         return 0.0;
 23 |     }
 24 |     
 25 |     float pu = 0.0;
 26 |     if (nb > 0) {
 27 |         pu = (float)xu/(float)nb;
 28 |         if (pu >= 1.0) { //if everyone in the background has the allele, the score is 0
 29 |             return 0.0;
 30 |         }
 31 |         else if (pu < 1e-10){
 32 |             pu = 1e-6;
 33 |         }
 34 |     }
 35 |     if (pu < 1e-10) { //in case everyone in the background is nocalled
 36 |         pu = 1e-6;
 37 |     }
 38 |     
 39 |     float pa = (float)xa / (float)nt;
 40 |     if (pa >= 1.0) {
 41 |         pa = 1.0 - 1e-6;
 42 |     }
 43 |     else if (pa <= 1e-10) { //error if pa is 0 or negative -- no affecteds with allele
 44 |         return -1.0;
 45 |     }
 46 |     
 47 |     double plog = log(p);
 48 |     /*if (errno == EDOM || errno == ERANGE) {
 49 |      fprintf(stderr, "log(p) failed, p is %f\n", p);
 50 |      free(vac);
 51 |      return -1.0; //return -1 to mean an error
 52 |      }*/
 53 |     
 54 |     double iplog = log(1.0 - p);
 55 |     /*if (errno == EDOM || errno == ERANGE) {
 56 |      fprintf(stderr, "log(1.0 - p) failed, p is %f\n", p);
 57 |      free(vac);
 58 |      return -1.0; //return -1 to mean an error
 59 |      }*/
 60 |     
 61 |     double pulog = log(pu);
 62 |     /*if (errno == EDOM || errno == ERANGE) {
 63 |      fprintf(stderr, "log(pu) failed, pu is %f\n", pu);
 64 |      free(vac);
 65 |      return -1.0; //return -1 to mean an error
 66 |      }*/
 67 |     
 68 |     double ipulog = log(1.0 - pu);
 69 |     /*if (errno == EDOM || errno == ERANGE) {
 70 |      fprintf(stderr, "log(1.0 - pu) failed, pu is %f\n", pu);
 71 |      free(vac);
 72 |      return -1.0; //return -1 to mean an error
 73 |      }*/
 74 |     
 75 |     double palog = log(pa);
 76 |     /*if (errno == EDOM || errno == ERANGE) {
 77 |      fprintf(stderr, "log(pa) failed, pa is %f\n", pa);
 78 |      free(vac);
 79 |      return -1.0; //return -1 to mean an error
 80 |      }*/
 81 |     
 82 |     double ipalog = log(1.0 - pa);
 83 |     /*if (errno == EDOM || errno == ERANGE) {
 84 |      fprintf(stderr, "log(1.0 - pa) failed, pa is %f\n", pu);
 85 |      free(vac);
 86 |      return -1.0; //return -1 to mean an error
 87 |      }*/
 88 |     
 89 |     double aalog = log(aaw);
 90 |     /*if (errno == EDOM || errno == ERANGE) {
 91 |      fprintf(stderr, "log(tv->aaw) failed, tv->aaw is %f\n", tv->aaw);
 92 |      free(vac);
 93 |      return -1.0; //return -1 to mean an error
 94 |      }*/
 95 |     
 96 |     if (llw < 0) {
 97 |         llw = 1.0;
 98 |     }
 99 |     else if (llw <= 1e-10) {
100 |         llw = 1e-6;
101 |     }
102 |     float log_llw = log(llw);
103 |     
104 |     float numerator = x*plog + (n-x)*iplog;
105 |     float denominator = xu*pulog + (nb - xu)*ipulog + xa*palog + (nt - xa)*ipalog;
106 |     float diff = no_allele_frequency == 0 ? (numerator - denominator) : 0.0;
107 |     float score = -2.0*(log_llw + aalog + diff);
108 |     if (score <= 0.0) {
109 |         return 0.0;
110 |     }
111 |     
112 |     return score;
113 | 
114 | }
115 | 
116 | void score_variant_b(struct variant * v, int no_allele_frequency){
117 |     
118 |     int nb = v->nref + v->hemi.n + v->hets.n + 2*(v->homs.n);
119 |     int xu = nb - v->nref;
120 |     
121 |     struct gene_transcript * c, * t;
122 |     HASH_ITER(hh, v->gt, c, t) {
123 |         struct transcript_anno_info * current, * tmp;
124 |         HASH_ITER(hh, c->tai, current, tmp) {
125 |             current->hemi_score = compute_score(nb, 1, 1, xu, current->aaw, current->llw, no_allele_frequency);
126 |             current->het_score = compute_score(nb, 2, 1, xu, current->aaw, current->llw, no_allele_frequency);
127 |             current->hom_score = compute_score(nb, 2, 2, xu, current->aaw, current->llw, no_allele_frequency);
128 |         }
129 |     }
130 | }
131 | 
132 | void score_variant_t_b(struct variant * v, int nb, int xu, int no_allele_frequency){
133 |     
134 |     struct gene_transcript * c, * t;
135 |     HASH_ITER(hh, v->gt, c, t) {
136 |         struct transcript_anno_info * current, * tmp;
137 |         HASH_ITER(hh, c->tai, current, tmp) {
138 |             current->hemi_score = compute_score(nb, 1, 1, xu, current->aaw, current->llw, no_allele_frequency);
139 |             current->het_score = compute_score(nb, 2, 1, xu, current->aaw, current->llw, no_allele_frequency);
140 |             current->hom_score = compute_score(nb, 2, 2, xu, current->aaw, current->llw, no_allele_frequency);
141 |         }
142 |     }
143 | }
144 | 


--------------------------------------------------------------------------------
/background_max_scores.c:
--------------------------------------------------------------------------------
  1 | //
  2 | //  background_max_scores.c
  3 | //  VVP_C
  4 | //
  5 | //  Created by steven on 8/13/15.
  6 | //  Copyright (c) 2015 yandell lab. All rights reserved.
  7 | //
  8 | 
  9 | #include "background_max_scores.h"
 10 | 
/* Head of the file-private hash table of per-gene background scores.
 * Entries are keyed by their `gene` field; the table is filled by
 * init_bkgrnd_max()/init_bkgrnd_max_b(), queried by get_gene_max() and
 * emptied by cleanup_bkgrnd_max(). */
static struct bkgrnd_max_scores * bms;
 12 | 
 13 | void get_indv_scores(struct bkgrnd_max_scores ** tbms, sds indv_scores, char iht){
 14 |     
 15 |     int count = 0;
 16 |     sds * data = sdssplitlen(indv_scores, (int)sdslen(indv_scores), ";", 1, &count);
 17 |     int i=0;
 18 |     int indv = 0;
 19 |     float hom = 0;
 20 |     float het1 = 0;
 21 |     float het2 = 0;
 22 |     float sum = 0;
 23 |     for (i=0; i < count; i++) {
 24 |         sscanf(data[i], "%d:%f,%f,%f,%f", &indv, &hom, &het1, &het2, &sum);
 25 |         if (iht == 'r') { //recessive
 26 |             if ((hom > (het1 + het2)) || (het1 <= 0 || het2 <= 0)) {
 27 |                 (*tbms)->max_scores[indv] = hom;
 28 |             }
 29 |             else {
 30 |                 (*tbms)->max_scores[indv] = (het1 + het2);
 31 |             }
 32 |         }
 33 |         else if (iht == 'd') { //dominant
 34 |             if (het1 > het2) {
 35 |                 (*tbms)->max_scores[indv] = het1;
 36 |             }
 37 |             else {
 38 |                 (*tbms)->max_scores[indv] = het2;
 39 |             }
 40 |         }
 41 |         else { //no inheritance, choose sum
 42 |             (*tbms)->max_scores[indv] = sum;
 43 |         }
 44 |     }
 45 |     
 46 |     sdsfreesplitres(data, count);
 47 | }
 48 | 
 49 | void get_indv_scores_b(struct bkgrnd_max_scores ** tbms, float * scores, int nb, char iht){
 50 |     
 51 |     int i=0;
 52 |     int j=0;
 53 |     float hom, het1, het2, sum;
 54 |     for (i=0; i < 4*nb; i+=4) {
 55 |         hom = scores[i];
 56 |         het1 = scores[i+1];
 57 |         het2 = scores[i+2];
 58 |         sum = scores[i+3];
 59 |         
 60 |         if (iht == 'r' || iht == 'x') { //recessive
 61 |             if (hom > (het1 + het2) || (het1 <= 0 || het2 <= 0)) {
 62 |                 (*tbms)->max_scores[j] = hom;
 63 |             }
 64 |             else {
 65 |                 (*tbms)->max_scores[j] = (het1 + het2);
 66 |             }
 67 |         }
 68 |         else if (iht == 'd') { //dominant
 69 |             /*if (hom > het1 && hom > het2) {
 70 |                 (*tbms)->max_scores[j] = hom;
 71 |             }*/
 72 |             if (het1 > het2) {
 73 |                 (*tbms)->max_scores[j] = het1;
 74 |             }
 75 |             else {
 76 |                 (*tbms)->max_scores[j] = het2;
 77 |             }
 78 |         }
 79 |         else { //no inheritance, choose sum
 80 |             (*tbms)->max_scores[j] = sum;
 81 |         }
 82 |         j++;
 83 |     }
 84 | }
 85 | 
 86 | 
 87 | 
 88 | #define MAX_BUF 1000000
 89 | 
 90 | void init_bkgrnd_max(char * bkgrnd_max, int nb, char iht){
 91 |     bms = NULL;
 92 |     
 93 |     FILE * max_in = fopen(bkgrnd_max, "r");
 94 |     if (! max_in) {
 95 |         fprintf(stderr, "FATAL: could not open %s for loading\n", bkgrnd_max);
 96 |         exit(1);
 97 |     }
 98 |     
 99 |     int line_count = 0;
100 |     
101 |     char * buffer = malloc(sizeof(char)*MAX_BUF);
102 |     while ( fgets(buffer, MAX_BUF, max_in) != NULL)  {
103 |         line_count += 1;
104 |         if (line_count % 1000 == 0) {
105 |             fprintf(stderr, "%d,", line_count);
106 |         }
107 |         sds tmpl = sdsnew(buffer);
108 |         sdstrim(tmpl, "\n");
109 |         int count = 0;
110 |         sds * data = sdssplitlen(tmpl, (int)sdslen(tmpl), "\t", 1, &count);
111 |         if (count != 2) {
112 |             fprintf(stderr, "WARNING: line in max_score wrong format, will be skipped: %s", tmpl);
113 |             sdsfreesplitres(data, count);
114 |             sdsfree(tmpl);
115 |             continue;
116 |         }
117 |         
118 |         struct bkgrnd_max_scores * tbms = (struct bkgrnd_max_scores *)calloc(1, sizeof(struct bkgrnd_max_scores));
119 |         strcpy(tbms->gene, data[0]);
120 |         tbms->max_scores = (float *)calloc(nb, sizeof(float));
121 |         get_indv_scores(&tbms, data[1], iht);
122 |         
123 |         sdsfree(tmpl);
124 |         sdsfreesplitres(data, count);
125 |         
126 |         HASH_ADD_STR(bms, gene, tbms);
127 |     }
128 |     free(buffer);
129 | }
130 | 
131 | void init_bkgrnd_max_b(char * bkgrnd_max, int nb, char iht){
132 |     bms = NULL;
133 |     
134 |     FILE * max_in = fopen(bkgrnd_max, "rb");
135 |     if (! max_in) {
136 |         fprintf(stderr, "FATAL: could not open %s for loading\n", bkgrnd_max);
137 |         exit(1);
138 |     }
139 |     
140 |     char feature[BKRND_GENE_NAME_LEN];
141 |     float * scores = (float *)malloc(sizeof(float)*4*nb);
142 |     int line_count = 0;
143 |     
144 |     while ( fread(&feature, sizeof(char), BKRND_GENE_NAME_LEN, max_in) )  {
145 |         line_count += 1;
146 |         if (line_count % 10000 == 0) {
147 |             fprintf(stderr, "%d,", line_count);
148 |         }
149 |         //read in float data
150 |         memset(scores, '\0', sizeof(float)*4*nb);
151 |         fread(scores, sizeof(float), 4*nb, max_in);
152 |         
153 |         struct bkgrnd_max_scores * tbms = (struct bkgrnd_max_scores *)calloc(1, sizeof(struct bkgrnd_max_scores));
154 |         strcpy(tbms->gene, feature);
155 |         tbms->max_scores = (float *)calloc(nb, sizeof(float));
156 |         get_indv_scores_b(&tbms, scores, nb, iht);
157 |         
158 |         /* debug info
159 |         if (strcmp(feature, "ENSG00000130283") == 0){
160 |             fprintf(stderr, "\nPRINTING BACKGROUND SCORES\n\n%s\n", feature);
161 |             int scores_index = 0;
162 |             int i=0;
163 |             for (i=0; i < nb; i++) {
164 |                 if (tbms->max_scores[i] > 0) {
165 |                     int j = 0;
166 |                     fprintf(stderr, "\t%d", i);
167 |                     for (j = 0; j < 4; j++) {
168 |                         fprintf(stderr, "\t%f", scores[scores_index+j]);
169 |                     }
170 |                     fprintf(stderr, "\t%f", tbms->max_scores[i]);
171 |                     fprintf(stderr, "\n");
172 |                 }
173 |                 scores_index += 4;
174 |             }
175 |         }
176 |         */
177 |         
178 |         
179 |         
180 |         HASH_ADD_STR(bms, gene, tbms);
181 |         
182 |     }
183 |     
184 |     free(scores);
185 |     
186 | }
187 | 
188 | 
189 | struct bkgrnd_max_scores * get_gene_max(char * gene){
190 |     
191 |     struct bkgrnd_max_scores * t = NULL;
192 |     HASH_FIND_STR(bms, gene, t);
193 |     return t;
194 |     
195 | }
196 | 
197 | void cleanup_bkgrnd_max(){
198 |     
199 |     struct bkgrnd_max_scores *s, *tmp;
200 |     HASH_ITER(hh, bms, s, tmp){
201 |         HASH_DEL(bms, s);
202 |         free(s);
203 |     }
204 | 
205 | }
206 | 
207 | 
208 | 


--------------------------------------------------------------------------------
/search_binary_bkgrnd.c:
--------------------------------------------------------------------------------
  1 | //
  2 | //  search_binary_bkgrnd.c
  3 | //  VVP_C
  4 | //
  5 | //  Created by steven on 8/11/15.
  6 | //  Copyright (c) 2015 yandell lab. All rights reserved.
  7 | //
  8 | 
#include "search_binary_bkgrnd.h"

#include <unistd.h>
 10 | 
/* Size in bytes of the mapped ".bin" file; set by load_bin_db() and used
 * by search_binary_bkgrnd() as an upper bound on offsets it reads. */
static uint64_t mm_size;
 12 | 
 13 | unsigned char * load_bin_db(sds file_prefix, int * n_background){
 14 |     
 15 |     unsigned char * mm_bin = NULL;
 16 |     
 17 |     sds bin_file = sdsdup(file_prefix);
 18 |     bin_file = sdscat(bin_file, ".bin");
 19 |     
 20 |     int fdSrc = open(bin_file, O_RDWR, 0);
 21 |     struct stat st;
 22 |     fstat(fdSrc, &st);
 23 |     
 24 |     mm_bin = (unsigned char *)mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, fdSrc, 0);
 25 |     if(mm_bin == MAP_FAILED){
 26 |         fprintf(stderr, "FATAL:  could not create mmap from %s\n", bin_file);
 27 |         exit(1);
 28 |     }
 29 |     
 30 |     //load memmap into memory
 31 |     /*
 32 |     size_t filesize = st.st_size;
 33 |     size_t page_size = getpagesize();
 34 |     unsigned char * buf[page_size];
 35 |     size_t pos=0;
 36 |     for (pos=0; pos < filesize; pos += page_size) {
 37 |         size_t this_page_size = filesize - pos;
 38 |         if (this_page_size > page_size){
 39 |             this_page_size = page_size;
 40 |         }
 41 |         memcpy(buf, mm_bin + pos, this_page_size);
 42 |     }
 43 |     */
 44 |     
 45 |     mm_size = st.st_size;
 46 |     fprintf(stderr, "MMAP size for .bin: %llu\n\n", mm_size);
 47 |     
 48 |     sdsfree(bin_file);
 49 |     
 50 |     *n_background = *((int *)mm_bin);
 51 |     
 52 |     return mm_bin;
 53 | }
 54 | 
 55 | unsigned char * load_bit_db(sds file_prefix){
 56 |     unsigned char * mm_bits = NULL;
 57 |     
 58 |     sds bit_file = sdsdup(file_prefix);
 59 |     bit_file = sdscat(bit_file, ".bit");
 60 |     
 61 |     int fdSrc = open(bit_file, O_RDWR, 0);
 62 |     struct stat st;
 63 |     fstat(fdSrc, &st);
 64 |     mm_bits = (unsigned char *)mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, fdSrc, 0);
 65 |     if(mm_bits == MAP_FAILED){
 66 |         fprintf(stderr, "FATAL:  could not create mmap from %s\n", bit_file);
 67 |         exit(1);
 68 |     }
 69 |     
 70 |     sdsfree(bit_file);
 71 |     
 72 |     return mm_bits;
 73 | 
 74 | }
 75 | 
 76 | struct chr_offsets * load_offsets(sds file_prefix){
 77 |     
 78 |     sds offset_file = sdsdup(file_prefix);
 79 |     offset_file = sdscat(offset_file, ".chr_offsets.txt");
 80 |     
 81 |     FILE * offsets = fopen(offset_file, "r");
 82 |     if (! offsets) {
 83 |         fprintf(stderr, "FATAL: could not open offsets file %s\n", offset_file);
 84 |         exit(1);
 85 |     }
 86 |     
 87 |     char chr[3];
 88 |     memset(chr, '\0', 3);
 89 |     uint64_t start;
 90 |     uint64_t end;
 91 |     int n;
 92 |     struct chr_offsets * chro = NULL;
 93 | 
 94 |     
 95 |     while (fscanf(offsets, "%s\t%llu\t%llu\t%d", chr, &start, &end, &n) == 4) {
 96 |         
 97 |         struct chr_offsets * tc = (struct chr_offsets *)calloc(1, sizeof(struct chr_offsets));
 98 |         //tc->chr = (char *)calloc(3, sizeof(char));
 99 |         strcpy(tc->chr, chr);
100 |         tc->byte_start = start;
101 |         tc->byte_end = end;
102 |         tc->n_entries = n;
103 |         memset(chr, '\0', 3);
104 |         
105 |         struct chr_offsets * ttc = NULL;
106 |         HASH_FIND_STR(chro, tc->chr, ttc);
107 |         //HASH_FIND(hh, chro, &tc->chr, 3*sizeof(char), ttc);
108 |         if (ttc != NULL) {
109 |             fprintf(stderr, "FATAL: chromosome %s already seen in offsets", chr);
110 |             exit(1);
111 |         }
112 |         else {
113 |             HASH_ADD_STR(chro, chr, tc);
114 |             //HASH_ADD(hh, chro, chr, 3*sizeof(char), tc);
115 |         }
116 |         
117 |     }
118 |     
119 |     sdsfree(offset_file);
120 |     fclose(offsets);
121 |     
122 |     return chro;
123 |     
124 | }
125 | 
126 | void get_variants_S(struct m_var_info ** vi, unsigned char * mm_bin, uint64_t mm_offset){ // "_S" means side effects
127 |     
128 |     (*vi)->vi[(*vi)->nv] = (struct var_info *)malloc(sizeof(struct var_info));
129 |     mm_offset += 4; //skip start position
130 |     (*vi)->vi[(*vi)->nv]->var_type = *( (char *)(mm_bin + mm_offset) );
131 |     mm_offset += 1;
132 |     (*vi)->vi[(*vi)->nv]->length = *( (int *)(mm_bin + mm_offset) );
133 |     mm_offset += 4;
134 |     (*vi)->vi[(*vi)->nv]->nhet = *( (int *)(mm_bin + mm_offset) );
135 |     mm_offset += 4;
136 |     (*vi)->vi[(*vi)->nv]->nhom = *( (int *)(mm_bin + mm_offset) );
137 |     mm_offset += 4;
138 |     (*vi)->vi[(*vi)->nv]->nhemi = *( (int *)(mm_bin + mm_offset) );
139 |     mm_offset += 4;
140 |     (*vi)->vi[(*vi)->nv]->nocall = *( (int *)(mm_bin + mm_offset) );
141 |     mm_offset += 4;
142 |     (*vi)->vi[(*vi)->nv]->bit_offset = *( (uint64_t *)(mm_bin + mm_offset) );
143 |     (*vi)->nv++; //increment number of variants
144 | }
145 | 
146 | 
147 | struct m_var_info * search_binary_bkgrnd(char * chr, size_t pos, unsigned char * mm_bin, struct chr_offsets * chro){
148 |     
149 |     struct m_var_info * vi = (struct m_var_info *)malloc(sizeof(struct m_var_info *));
150 |     vi->vi = (struct var_info **)malloc(sizeof(struct var_info *)*MAX_VARS);
151 |     vi->nv = 0;
152 |     
153 |     struct chr_offsets * ttc = NULL;
154 |     HASH_FIND_STR(chro, chr, ttc);
155 |     //HASH_FIND(hh, chro, &chr, 3*sizeof(char), ttc);
156 |     if (ttc == NULL) {
157 |         fprintf(stderr, "WARNING: chromosome %s not in offsets\n", chr);
158 |         return vi;
159 |     }
160 |     else {
161 |         
162 |         int min = 0;
163 |         int max = ttc->n_entries - 1; //place pointer at start of last entry
164 |         while (max >= min) { //binary search
165 |             int mid = (min + max) / 2;
166 |             uint64_t tmp_offset = ttc->byte_start + mid*LINE_BYTE_SIZE + 2; //+2 because only two chars were written for chromosome placeholder
167 |             int * mm_pos = (int *)(mm_bin + tmp_offset);
168 |             if ( (*mm_pos) == pos) {
169 |                 get_variants_S(&vi, mm_bin, tmp_offset);
170 |                 
171 |                 //look 'down' from found position
172 |                 uint64_t d_offset = tmp_offset;
173 |                 while (d_offset >= LINE_BYTE_SIZE && *( (int *)(mm_bin + (d_offset - LINE_BYTE_SIZE)) ) == pos) {
174 |                     d_offset -= LINE_BYTE_SIZE;
175 |                     get_variants_S(&vi, mm_bin, d_offset);
176 |                 }
177 |                 
178 |                 //look 'up' from found position
179 |                 uint64_t u_offset = tmp_offset + LINE_BYTE_SIZE;
180 |                 while (u_offset < mm_size && *( (int *)(mm_bin+u_offset) ) == pos) {
181 |                     get_variants_S(&vi, mm_bin, u_offset);
182 |                     u_offset += LINE_BYTE_SIZE;
183 |                 }
184 |                 
185 |                 return vi;
186 |                 
187 |             }
188 |             else if (*mm_pos < pos) {
189 |                 min = mid + 1;
190 |             }
191 |             else {
192 |                 max = mid - 1;
193 |             }
194 |         }
195 |     }
196 |     
197 |     
198 |     return vi;
199 | }
200 | 
201 | void destroy_chr_offsets(struct chr_offsets * chro){
202 |     struct chr_offsets *s, *tmp;
203 |     HASH_ITER(hh, chro, s, tmp){
204 |         HASH_DEL(chro, s);
205 |         free(s);
206 |     }
207 | }
208 | 
209 | 


--------------------------------------------------------------------------------
/bit_macros.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  bit_macros.h
  3 |  project: bit array C library
  4 |  url: https://github.com/noporpoise/BitArray/
  5 |  author: Isaac Turner <turner.isaac@gmail.com>
  6 |  license: Public Domain, no warranty
  7 |  date: Dec 2013
  8 | */
  9 | 
 10 | #ifndef BITSET_H_
 11 | #define BITSET_H_
 12 | 
 13 | #include <inttypes.h>
 14 | #include <sched.h>
 15 | 
// trailing_zeros is number of least significant zeros
// leading_zeros is number of most significant zeros
// Both evaluate to sizeof(x)*8 when x is 0 in the non-Windows branch, which
// uses the 64-bit builtins regardless of the type of x.
// NOTE(review): the _WIN32 branch pairs trailing_zeros with
// _BitScanReverse64 and leading_zeros with _BitScanForward64, which looks
// swapped, and it has no special case for x == 0 -- confirm before relying
// on it.
#if defined(_WIN32)
  #define trailing_zeros(x) ({ __typeof(x) _r; _BitScanReverse64(&_r, x); _r; })
  #define leading_zeros(x) ({ __typeof(x) _r; _BitScanForward64(&_r, x); _r; })
#else
  #define trailing_zeros(x) ((x) ? (__typeof(x))__builtin_ctzll(x) : (__typeof(x))sizeof(x)*8)
  #define leading_zeros(x) ((x) ? (__typeof(x))__builtin_clzll(x) : (__typeof(x))sizeof(x)*8)
#endif

// Get index of top set bit. If x is 0 return nbits
// (nbits here is sizeof(x)*8; x is evaluated more than once)
#define top_set_bit(x) ((x) ? sizeof(x)*8-leading_zeros(x)-1 : sizeof(x)*8)
 28 | 
 29 | #define roundup_bits2bytes(bits)   (((bits)+7)/8)
 30 | #define roundup_bits2words32(bits) (((bits)+31)/32)
 31 | #define roundup_bits2words64(bits) (((bits)+63)/64)
 32 | 
 33 | // Round a number up to the nearest number that is a power of two
 34 | #define roundup2pow(x) (1UL << (64 - leading_zeros(x)))
 35 | 
 36 | #define rot32(x,r) (((x)<<(r)) | ((x)>>(32-(r))))
 37 | #define rot64(x,r) (((x)<<(r)) | ((x)>>(64-(r))))
 38 | 
 39 | // need to check for length == 0, undefined behaviour if uint64_t >> 64 etc
 40 | #define bitmask(nbits,type) ((nbits) ? ~(type)0 >> (sizeof(type)*8-(nbits)): (type)0)
 41 | #define bitmask32(nbits) bitmask(nbits,uint32_t)
 42 | #define bitmask64(nbits) bitmask(nbits,uint64_t)
 43 | 
 44 | // A possibly faster way to combine two words with a mask
 45 | //#define bitmask_merge(a,b,abits) ((a & abits) | (b & ~abits))
 46 | #define bitmask_merge(a,b,abits) (b ^ ((a ^ b) & abits))
 47 | 
 48 | // Swap lowest four bits. A nibble is 4 bits (i.e. half a byte)
 49 | #define rev_nibble(x) ((((x)&1)<<3)|(((x)&2)<<1)|(((x)&4)>>1)|(((x)&8)>>3))
 50 | 
//
// Bit array (bitset)
//
// bitsetX_wrd(): get word for a given position
// bitsetX_idx(): get index within word for a given position

// _VOLPTR(x): address of x as a pointer-to-volatile of x's type
// _VOLVALUE(x): x accessed through that volatile pointer
#define _VOLPTR(x) ((volatile __typeof(x) *)(&(x)))
#define _VOLVALUE(x) (*_VOLPTR(x))

// _TYPESHIFT(): `word` converted to the element type of `arr`, shifted
// left by `shift`, and converted to the element type again
#define _TYPESHIFT(arr,word,shift) \
        ((__typeof(*(arr)))((__typeof(*(arr)))(word) << (shift)))

#define bitsetX_wrd(wrdbits,pos) ((pos) / (wrdbits))
#define bitsetX_idx(wrdbits,pos) ((pos) % (wrdbits))

// Shift/mask forms of the above for 32-bit words
#define bitset32_wrd(pos) ((pos) >> 5)
#define bitset32_idx(pos) ((pos) & 31)

// Shift/mask forms of the above for 64-bit words
#define bitset64_wrd(pos) ((pos) >> 6)
#define bitset64_idx(pos) ((pos) & 63)

//
// Bit functions on arrays
//
// All take the array, a word index and a bit index within that word.
// get reads the bit; set/del/tgl set, clear and toggle it; or/xor/and
// combine it with `bit`; cpy overwrites it with `bit`.
#define bitset2_get(arr,wrd,idx)     (((arr)[wrd] >> (idx)) & 0x1)
#define bitset2_set(arr,wrd,idx)     ((arr)[wrd] |=  _TYPESHIFT(arr,1,idx))
#define bitset2_del(arr,wrd,idx)     ((arr)[wrd] &=~ _TYPESHIFT(arr,1,idx))
#define bitset2_tgl(arr,wrd,idx)     ((arr)[wrd] ^=  _TYPESHIFT(arr,1,idx))
#define bitset2_or(arr,wrd,idx,bit)  ((arr)[wrd] |=  _TYPESHIFT(arr,bit,idx))
#define bitset2_xor(arr,wrd,idx,bit) ((arr)[wrd]  = ~((arr)[wrd] ^ (~_TYPESHIFT(arr,bit,idx))))
#define bitset2_and(arr,wrd,idx,bit) ((arr)[wrd] &= (_TYPESHIFT(arr,bit,idx) | ~_TYPESHIFT(arr,1,idx)))
#define bitset2_cpy(arr,wrd,idx,bit) ((arr)[wrd]  = ((arr)[wrd] &~ _TYPESHIFT(arr,1,idx)) | _TYPESHIFT(arr,bit,idx))
 82 | 
//
// Thread safe versions
//
// They return the value of the bit (0 or 1) before it was updated
// The updating variants are built on the __sync_fetch_and_{or,and,xor}
// builtins applied to the containing word; get_mt only reads through a
// volatile pointer.
#define bitset2_get_mt(arr,wrd,idx)     bitset2_get(_VOLPTR(*(arr)),wrd,idx)
#define bitset2_set_mt(arr,wrd,idx)     ((__sync_fetch_and_or (_VOLPTR((arr)[wrd]),  _TYPESHIFT(arr,1,idx)) >> (idx))&1)
#define bitset2_del_mt(arr,wrd,idx)     ((__sync_fetch_and_and(_VOLPTR((arr)[wrd]), ~_TYPESHIFT(arr,1,idx)) >> (idx))&1)
#define bitset2_tgl_mt(arr,wrd,idx)     ((__sync_fetch_and_xor(_VOLPTR((arr)[wrd]),  _TYPESHIFT(arr,1,idx)) >> (idx))&1)
#define bitset2_or_mt(arr,wrd,idx,bit)  ((__sync_fetch_and_or (_VOLPTR((arr)[wrd]),  _TYPESHIFT(arr,bit,idx)) >> (idx))&1)
#define bitset2_xor_mt(arr,wrd,idx,bit) ((__sync_fetch_and_xor(_VOLPTR((arr)[wrd]),  _TYPESHIFT(arr,bit,idx)) >> (idx))&1)
#define bitset2_and_mt(arr,wrd,idx,bit) ((__sync_fetch_and_and(_VOLPTR((arr)[wrd]), (_TYPESHIFT(arr,bit,idx) | ~_TYPESHIFT(arr,1,idx))) >> (idx))&1)
// cpy_mt picks set_mt or del_mt depending on `bit`
#define bitset2_cpy_mt(arr,wrd,idx,bit) ((bit) ? bitset2_set_mt(arr,wrd,idx) : bitset2_del_mt(arr,wrd,idx))
 95 | 
//
// Auto detect size of type from pointer
//
// The word width is taken as sizeof(*(arr))*8; bitset_op/bitset_op2 split
// a bit position into (word, index) and forward to the given bitset2_*
// macro. `pos` is expanded twice.
#define bitset_wrd(arr,pos) bitsetX_wrd(sizeof(*(arr))*8,pos)
#define bitset_idx(arr,pos) bitsetX_idx(sizeof(*(arr))*8,pos)
#define bitset_op(func,arr,pos)      func(arr, bitset_wrd(arr,pos), bitset_idx(arr,pos))
#define bitset_op2(func,arr,pos,bit) func(arr, bitset_wrd(arr,pos), bitset_idx(arr,pos), bit)

// Auto-detect type size: bit functions
#define bitset_get(arr,pos)     bitset_op(bitset2_get, arr, pos)
#define bitset_set(arr,pos)     bitset_op(bitset2_set, arr, pos)
#define bitset_del(arr,pos)     bitset_op(bitset2_del, arr, pos)
#define bitset_tgl(arr,pos)     bitset_op(bitset2_tgl, arr, pos)
#define bitset_or(arr,pos,bit)  bitset_op2(bitset2_or, arr, pos, bit)
#define bitset_xor(arr,pos,bit) bitset_op2(bitset2_xor, arr, pos, bit)
#define bitset_and(arr,pos,bit) bitset_op2(bitset2_and, arr, pos, bit)
#define bitset_cpy(arr,pos,bit) bitset_op2(bitset2_cpy, arr, pos, bit)

// Auto-detect type size: thread safe bit functions
// They return the value of the bit (0 or 1) before it was updated
#define bitset_get_mt(arr,pos)     bitset_op(bitset2_get_mt,  arr, pos)
#define bitset_set_mt(arr,pos)     bitset_op(bitset2_set_mt,  arr, pos)
#define bitset_del_mt(arr,pos)     bitset_op(bitset2_del_mt,  arr, pos)
#define bitset_tgl_mt(arr,pos)     bitset_op(bitset2_tgl_mt,  arr, pos)
#define bitset_or_mt(arr,pos,bit)  bitset_op2(bitset2_or_mt,  arr, pos, bit)
#define bitset_xor_mt(arr,pos,bit) bitset_op2(bitset2_xor_mt, arr, pos, bit)
#define bitset_and_mt(arr,pos,bit) bitset_op2(bitset2_and_mt, arr, pos, bit)
#define bitset_cpy_mt(arr,pos,bit) bitset_op2(bitset2_cpy_mt, arr, pos, bit)

// Clearing a word does not return a meaningful value
// (the whole word containing bit `pos` is set to 0, not just that bit)
#define bitset_clear_word(arr,pos) ((arr)[bitset_wrd(arr,pos)] = 0)
#define bitset_clear_word_mt(arr,pos) (_VOLVALUE((arr)[bitset_wrd(arr,pos)]) = 0)
128 | 
//
// Compact bit array of spin locks
// These are most effecient when arr is of type: volatile char*
//
// Acquire a lock
// Spins while the lock bit is set (running `wait` on each spin), runs
// `abandon` once the bit is seen clear, then tries to set the bit with a
// compare-and-swap; the outer loop retries when the word changed in the
// meantime. `wait` and `abandon` are statement fragments pasted into the
// loop body.
#define bitlock_acquire_block(arr,pos,wait,abandon) do {                       \
  size_t _w = bitset_wrd(arr,pos);                                             \
  __typeof(*(arr)) _o, _n, _b = _TYPESHIFT(arr, 1, bitset_idx(arr,pos));       \
  do {                                                                         \
    while((_o = _VOLVALUE((arr)[_w])) & _b) { wait }                           \
    abandon                                                                    \
    _n = _o | _b;                                                              \
  } while(!__sync_bool_compare_and_swap(_VOLPTR((arr)[_w]), _o, _n));          \
  __sync_synchronize(); /* Must not move commands to before acquiring lock */  \
} while(0)

// Undefined behaviour if you do not already hold the lock
// Clears the lock bit with an atomic AND after a full barrier.
#define bitlock_release(arr,pos) do {                                          \
  size_t _w = bitset_wrd(arr,pos);                                             \
  __typeof(*(arr)) _mask = ~_TYPESHIFT(arr, 1, bitset_idx(arr,pos));           \
  __sync_synchronize(); /* Must get the lock before releasing it */            \
  __sync_and_and_fetch(_VOLPTR((arr)[_w]), _mask);                             \
} while(0)

// Busy-waits (empty wait body) until the lock is acquired
#define bitlock_acquire(arr,pos) bitlock_acquire_block(arr,pos,{},{})

// calls yield if cannot acquire the lock
#define bitlock_yield_acquire(arr,pos) bitlock_acquire_block(arr,pos,sched_yield();,{})

// Block until we get the lock or someone else does
// sets the memory pointed to by retptr to 1 if we got the lock, 0 otherwise
#define bitlock_try_acquire(arr,pos,retptr) do {                               \
  *retptr = 1; /* default to success, set to zero if locked */                 \
  bitlock_acquire_block(arr,pos,{*retptr=0;break;},if(!*retptr){break;});      \
} while(0)
164 | 
/*
 * Byteswapping
 *
 * byteswap64/32/16(x) reverse the byte order of a 64/32/16-bit value.
 * The compiler builtins are used for the 64- and 32-bit forms when they
 * are available; byteswap16 always uses the portable shift/mask form
 * because __builtin_bswap16 is missing from some compilers that do
 * provide the wider builtins.
 *
 * Previously byteswap32 and byteswap16 both expanded to
 * __builtin_bswap64 in the builtin branch, which swapped across 64 bits
 * and returned the wrong value.
 */

/* clang uses these to check for features */
#ifndef __has_feature
#define __has_feature(x) 0
#endif

#ifndef __has_builtin
#define __has_builtin(x) 0
#endif

/* GCC versions < 4.3 do not have __builtin_bswapX() */
#if ( defined(__clang__) && !__has_builtin(__builtin_bswap64) ) ||             \
    ( !defined(__clang__) && defined(__GNUC__) && defined(__GNUC_MINOR__) &&   \
      ( (__GNUC__ < 4)  || (__GNUC__ == 4 && __GNUC_MINOR__ < 3)) )
  #define byteswap64(x) ( (((uint64_t)(x) << 56))                       | \
                          (((uint64_t)(x) << 40) & 0xff000000000000ULL) | \
                          (((uint64_t)(x) << 24) & 0xff0000000000ULL)   | \
                          (((uint64_t)(x) <<  8) & 0xff00000000ULL)     | \
                          (((uint64_t)(x) >>  8) & 0xff000000ULL)       | \
                          (((uint64_t)(x) >> 24) & 0xff0000ULL)         | \
                          (((uint64_t)(x) >> 40) & 0xff00ULL)           | \
                          (((uint64_t)(x) >> 56)) )

  #define byteswap32(x) ( (((uint32_t)(x) << 24))                       | \
                          (((uint32_t)(x) <<  8) & 0xff0000U)           | \
                          (((uint32_t)(x) >>  8) & 0xff00U)             | \
                          (((uint32_t)(x) >> 24)) )
#else
  #define byteswap64(x) __builtin_bswap64(x)
  #define byteswap32(x) __builtin_bswap32(x)
#endif

/* uint16_t type might be bigger than 2 bytes, so need to mask */
#define byteswap16(x) ( (((uint16_t)(x) & 0xff) << 8) | \
                        (((uint16_t)(x) >> 8) & 0xff) )
204 | 
205 | #endif /* BITLOCK_H_ */
206 | 


--------------------------------------------------------------------------------
/aa_weight.c:
--------------------------------------------------------------------------------
  1 | //
  2 | //  aa_weight.c
  3 | //  VVP_dev_xcode
  4 | //
  5 | //  Created by STEVEN FLYGARE on 10/11/16.
  6 | //  Copyright © 2016 IDbyDNA. All rights reserved.
  7 | //
  8 | 
  9 | #include "aa_weight.h"
 10 | 
/* Length of the parallel keys/scores/conserved arrays declared in
 * init_aa_score(). */
#define NUM_KEYS 231

/* NOTE(review): presumably default weights for conserved / unconserved
 * positions -- their use is not visible in this part of the file; confirm. */
#define CONS 1.4121
#define UNCONS 0.3022

/* File-private pointer to the amino-acid weight data.
 * NOTE(review): presumably the head of the lookup table built by
 * init_aa_score() -- confirm against the rest of the file. */
static struct aa_matrix * aam;
 17 | 
 18 | void init_aa_score(){
 19 |     
 20 |     //initialize amino acid substitution scores.  based on matrix0.13.log
 21 |     char * keys[NUM_KEYS] = { "*0X","A0D","A0E","A0G","A0M","A0N","A0P","A0R","A0S","A0T","A0V","C0F","C0G","C0H","C0L","C0R","C0S","C0W","C0Y","D0A","D0E","D0G","D0H","D0N","D0V","D0Y","E0A","E0D","E0G","E0K","E0L","E0N","E0Q","E0V","F0C","F0G","F0I","F0L","F0S","F0V","F0Y","G0A","G0C","G0D","G0E","G0K","G0M","G0R","G0S","G0T","G0V","G0W","H0C","H0D","H0L","H0N","H0P","H0Q","H0R","H0Y","I0F","I0K","I0L","I0M","I0N","I0R","I0S","I0T","I0V","K0A","K0D","K0E","K0I","K0M","K0N","K0Q","K0R","K0S","K0T","L0A","L0F","L0H","L0I","L0M","L0P","L0Q","L0R","L0S","L0V","L0W","M0I","M0K","M0L","M0R","M0T","M0V","N0D","N0E","N0H","N0I","N0K","N0S","N0T","N0Y","P0A","P0F","P0H","P0L","P0Q","P0R","P0S","P0T","Q0E","Q0H","Q0K","Q0L","Q0P","Q0R","Q0W","R0C","R0D","R0G","R0H","R0I","R0K","R0L","R0M","R0P","R0Q","R0S","R0T","R0W","S0A","S0C","S0F","S0G","S0I","S0L","S0N","S0P","S0R","S0T","S0W","S0Y","T0A","T0I","T0K","T0L","T0M","T0N","T0P","T0R","T0S","T0V","V0A","V0D","V0E","V0F","V0G","V0I","V0K","V0L","V0M","V0P","W0C","W0G","W0L","W0Q","W0R","W0S","X0*","Y0C","Y0D","Y0F","Y0H","Y0L","Y0N","Y0Q","Y0R","Y0S","del-12","del-15","del-18","del-21","del-24","del-27","del-3","del-30","del-33","del-36","del-39","del-42","del-45","del-48","del-51","del-54","del-57","del-6","del-60","del-63","del-66","del-69","del-72","del-75","del-78","del-9","ins-12","ins-15","ins-18","ins-21","ins-24","ins-27","ins-3","ins-30","ins-33","ins-36","ins-39","ins-6","ins-9","splice-a1-A","splice-a1-C","splice-a1-T","splice-a2-A","splice-a2-C","splice-a2-G","splice-b1-C","splice-b1-G","splice-b1-T","splice-b2-A","splice-b2-C","splice-b2-T" };
 22 |     float scores[NUM_KEYS] = { 0.91359891142,1.84077794036,1.01098024651,0.350309375974,0.268054671093,3.32067726877,0.900077768437,3.15582095046,0.514060482919,0.540349794826,0.628795059769,2.09327639657,1.22000954491,5.7047532566,29.2743917115,0.595969916709,1.17699832069,0.986631383625,1.65452817547,0.684044202943,0.284371787196,0.93951194795,2.20525153421,1.4316783812,5.58167027113,3.02524793911,0.708654740721,0.330933515822,0.45467038898,2.40831376054,3.39154538121,16.8976235702,0.320902013172,1.53326882958,3.05351440285,2.03182992701,0.702047948101,0.65576453956,0.97621698662,1.1006862319,0.384406382773,0.400694060346,5.1693641127,2.57718986732,1.58030315672,43.6245837269,3.5884738227,2.45518665108,1.13587458176,3.85256064082,3.07938238072,2.35019764444,1.30309822248,0.633139945951,2.13287359624,0.39439020963,0.569328327198,0.378744562026,0.314781476958,0.716827634337,1.96573425468,2.86052627581,0.426846207364,0.50089173886,2.00103437015,1.89833939426,1.14017104718,0.660969407995,0.124905573926,1.35661815248,5.95459646492,0.501852900841,1.78523873226,0.937057916238,0.865792905897,0.448947142995,0.259542241486,12.7134501147,0.631500094397,19.3465545224,0.544537306052,0.699011507653,0.262519618888,0.309163611304,1.20350937887,1.1077402915,2.35289503476,0.30675297571,0.374405222367,3.04311305937,0.579552164832,3.12521265361,0.252632903491,6.19289193732,0.478336569493,0.298365415566,0.252141042725,9.14323467153,0.627308393818,1.16637157016,1.27824346828,0.525281931208,0.447002583942,1.15928116547,0.329023036095,4.36245837269,0.619017974516,1.104099356,0.952536782294,1.36239714355,0.57534565594,0.756575508116,0.408673148047,0.323726616874,0.488357458773,0.666849903837,1.33327792137,0.19037761025,12.3602987226,1.97731301247,38.3595477599,0.547403664788,1.20604622311,1.12591127563,0.18436317034,2.48919977427,0.27724034518,1.69508363222,1.01542566911,1.27602467553,0.670858341181,2.94010613985,0.0764948863701,0.683120797587,1.95994506599,0.141911296461,0.78156455
6232,1.62328386658,0.176043695858,0.379283579151,0.896810696342,0.193165598642,1.43143165354,1.24343124874,0.172074786805,0.641748865424,0.461683704103,0.841154544451,1.04483261198,0.43243027601,0.438703560296,1.21030361191,0.162096909993,13.905336063,0.182237547843,2.77551618023,2.86412689247,1.91502937479,0.609672706708,0.133753806045,24.7205974453,0.403390502245,0.852928702808,3.07583470517,2.61747502362,2.21746222931,0.764554560163,15.0327957437,0.618329826321,4.1469781362,27.6429358646,2.0572278011,1.60639261377,0.418993177038,0.653887955936,21.1890835245,4.47411178625,49.4411948905,42.9357745102,0.712918388241,11.9298578,4.252337046,9.9527436,10.7235,11.6106,12.4977,3.941480407,13.3848,14.2719,15.159,16.0461,16.9332,17.8203,18.7074,19.5945,20.4816,21.3687,4.033682552,22.2558,23.1429,24.03,24.9171,25.8042,26.6913,27.5784,11.59869346,6.7228767,5.192735791,17.91493848,15.7824,17.7768,19.7712,2.016674044,21.7656,23.76,25.7544,27.7488,5.507683676,15.46012184,19.20532627,7.890694571,27.59342384,13.41412178,8.094265101,7.136069027,10.05920958,13.70509544,7.857813164,7.107669134,15.21498419,11.54494088 };
 23 |     float conserved[NUM_KEYS] = { 2.77861236573,3.69595112473,2.78158289687,0.650172837121,0.268054671093,3.32067726877,2.94366286187,3.15582095046,0.801800546955,0.996750910768,1.17936028122,5.28861213069,3.51612494445,5.7047532566,29.2743917115,1.50011111282,2.6192361698,1.74958858425,4.43830835802,1.57645466633,0.547517775352,1.70012194995,4.05676423293,2.28496774448,10.4578666008,6.36556517102,1.15988760881,0.539812097288,0.781150046362,4.52562300025,3.39154538121,16.8976235702,0.83872418366,4.57995969396,4.24410362144,2.03182992701,1.87862836538,1.44337630759,2.27561120922,2.89599141958,0.598952603607,0.785166678153,11.5248261708,5.07971425289,3.47003887511,43.6245837269,3.5884738227,6.27864605808,2.33286352514,3.85256064082,5.88858772202,4.08457573855,1.30309822248,1.99366375903,4.27453456609,0.793257819723,1.04526264091,0.684796430406,0.765466968588,1.73951873277,2.10305541659,2.86052627581,0.558609035575,0.842946508963,4.18451830221,3.30803100929,5.76840579829,1.34484620972,0.216933893948,1.35661815248,5.95459646492,0.959635405612,11.3263137705,1.79808858652,1.31027456561,0.798991869633,0.372756919483,12.7134501147,1.21079572772,19.3465545224,0.911324995336,1.36707373523,0.420177441363,0.404388114374,3.36859789134,2.23004033896,9.02777667655,0.727321349074,0.822596417388,10.6728140247,1.03529186134,6.20224892768,0.48968359006,11.4929802538,1.39124528104,0.636172501084,0.57484859255,9.14323467153,0.698911371496,2.59927831437,2.55393837991,0.801242646765,0.80015118965,5.28051417656,0.671230287277,4.36245837269,1.40169277902,2.37751396019,2.18064236834,3.3635339451,1.07747697859,1.30535739741,0.677419532467,0.545735892564,0.569376723886,2.09857486554,4.87592776893,0.332295194373,12.3602987226,3.97373539982,38.3595477599,1.76860483365,2.55051363,5.42844359549,0.305771298896,5.46796797706,0.27724034518,4.3981302265,1.73842692187,3.19768032726,1.40173472301,7.35966034138,0.147951304983,1.11108682406,4.66422394496,0.324122205778,1.17254681829,3.10503111267,0.
270778652712,0.652075058202,2.0670264358,0.46743997153,2.79501156481,1.95778903806,0.373368631869,1.1592740375,1.02464492758,0.841154544451,2.06724076149,0.631308798028,1.0054055368,2.75016788198,0.282033600176,13.905336063,0.423548044207,5.8255335952,10.7434504572,5.23293943858,0.780226626075,0.185328912249,24.7205974453,0.6887903473,1.62309356602,3.00364520084,9.1069758573,4.00530975742,0.764554560163,15.0327957437,1.65123745094,17.1436329842,67.3013534151,6.16661163522,3.60134962738,0.658934662783,1.17161469105,21.1890835245,25.9883106502,49.4411948905,42.9357745102,1.55378185295,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 };
 24 |     float unconserved[NUM_KEYS] = { 0.390219892002,0.812354782563,0.449286828113,0.183510042772,0.268054671093,3.32067726877,0.290536148385,3.15582095046,0.340439484732,0.275372722663,0.323569561891,0.708975287349,0.422714655564,5.7047532566,29.2743917115,0.22538690233,0.517322307406,0.394804713904,0.532621986266,0.31449593193,0.119266157772,0.388794499587,0.910505376573,0.865863160753,1.87832674331,1.27876960364,0.410204960454,0.166926769783,0.18642367978,1.21735790145,3.39154538121,16.8976235702,0.0911765864685,0.678079059572,1.578662795,2.03182992701,0.210313270251,0.184968941502,0.331568447668,0.353162445208,0.143651750525,0.136036478297,2.69860533082,1.02353162002,0.619292405291,43.6245837269,3.5884738227,0.979433338108,0.473256910936,3.85256064082,1.00364229821,0.948397953033,1.30309822248,0.237783373244,0.92814544074,0.187308458967,0.249322121843,0.136406961359,0.122784637955,0.266861011778,1.83082880332,2.86052627581,0.288037334269,0.165037084637,0.406503002964,1.02117973335,0.295791761021,0.253096076313,0.0582045529163,1.35661815248,5.95459646492,0.249539811041,0.832524786768,0.268632678977,0.418493902095,0.259713536399,0.125625685969,12.7134501147,0.276515832231,19.3465545224,0.210324274926,0.392462633442,0.187260114159,0.20657083957,0.408825534425,0.307800775835,0.899060319949,0.133787289883,0.164875388753,0.455947915141,0.245787934668,0.700601807416,0.0966252358557,1.55736324119,0.133534201284,0.088448145875,0.110750572984,9.14323467153,0.561742068702,0.270374767125,0.559169839753,0.247855443989,0.0969398877746,0.257397197433,0.125016641884,4.36245837269,0.27163063577,0.541802447888,0.425975847444,0.617146779204,0.262365993093,0.368982891804,0.205297857359,0.146289369473,0.368624684222,0.125170966658,0.310910943188,0.0913996058811,12.3602987226,0.736741174529,38.3595477599,0.156134012168,0.470085908769,0.328116905075,0.0666672201166,0.707678204804,0.27724034518,0.705475906971,0.425702085516,0.40261165342,0.233143554121,1.04701020896,0.0290437561938
,0.335318683611,0.469242492337,0.0590635233874,0.483411059634,0.691042486655,0.0928225665578,0.176188413213,0.353220366567,0.0494178294865,0.585235802956,0.496258152798,0.0820030090174,0.314612477149,0.104240921658,0.841154544451,0.535053673856,0.28516973357,0.18638319452,0.468461345863,0.0663344666187,13.905336063,0.0806901979333,0.894868489532,0.737606912985,0.585102079237,0.413716387905,0.0870871717132,24.7205974453,0.200274410234,0.422973772401,3.2831698866,0.432116314185,0.623759854226,0.764554560163,15.0327957437,0.196710984652,0.494583125784,10.1449016233,0.70516791894,0.432951936813,0.195740296452,0.262507425604,21.1890835245,1.56917173613,49.4411948905,42.9357745102,0.188188039933,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 };
 25 |     
 26 |     aam = NULL;
 27 |     int i = 0;
 28 |     for (i = 0; i < NUM_KEYS; i++) {
 29 |         struct aa_matrix * new_aa = (struct aa_matrix *)malloc(sizeof(struct aa_matrix));
 30 |         memset(new_aa->aa_change,'\0',20);
 31 |         strcpy(new_aa->aa_change, keys[i]);
 32 |         new_aa->score = scores[i];
 33 |         new_aa->cons = conserved[i];
 34 |         new_aa->uncons = unconserved[i];
 35 |         HASH_ADD_STR(aam, aa_change, new_aa);
 36 |     }
 37 |     
 38 | }
 39 | 
 40 | /*
 41 |  * Look up `key` in the amino-acid weight table (aam). When the key exists,
 42 |  * tai->aaw becomes the reciprocal of its overall score and 1 is returned;
 43 |  * otherwise aaw is left untouched and 0 is returned.
 44 |  */
 45 | static int aaw_from_score(struct transcript_anno_info * tai, const char * key){
 46 |     struct aa_matrix * hit = NULL;
 47 |     HASH_FIND_STR(aam, key, hit);
 48 |     if (hit == NULL) {
 49 |         return 0;
 50 |     }
 51 |     tai->aaw = 1.0 / hit->score;
 52 |     return 1;
 53 | }
 54 | 
 55 | /*
 56 |  * Set the amino-acid weight (aaw) and the coding flag of one transcript annotation.
 57 |  *
 58 |  *   ttai     annotation to update; aaw is reset to 1.0 before anything else
 59 |  *   ref/var  reference and alternate allele, only their lengths are used
 60 |  *   phast    phastcons score; it is only used when it is >= 0
 61 |  *
 62 |  * Precedence:
 63 |  *   1. splice donor tag, then splice acceptor tag
 64 |  *   2. protein-changing indel
 65 |  *   3. protein-changing SNV (cons/uncons blended by phast when phast >= 0)
 66 |  *   4. unchanged protein: CONS/UNCONS blended by phast when phast >= 0
 67 |  * Annotations lacking either amino acid string keep aaw = 1.0.
 68 |  */
 69 | void get_aaw(struct transcript_anno_info ** ttai, sds ref, sds var, float phast){
 70 |     static const char * const splice_tags[2] = { "splice_donor_variant", "splice_acceptor_variant" };
 71 |     static const char * const splice_keys[2] = { "splice-a1-T", "splice-b2-T" };
 72 |     struct transcript_anno_info * tai = *ttai;
 73 |     size_t s, i;
 74 | 
 75 |     tai->aaw = 1.0; //make sure aa is set to 1.0 before finding appropriate weight
 76 | 
 77 |     //splice terms (sequence ontology terms) come first; all tags are tested for donor before acceptor
 78 |     for (s = 0; s < 2; s++) {
 79 |         for (i = 0; i < tai->anno_tags.n; i++) {
 80 |             if (strcmp(kv_A(tai->anno_tags, i), splice_tags[s]) == 0) {
 81 |                 if (aaw_from_score(tai, splice_keys[s])) {
 82 |                     tai->coding = 1;
 83 |                 }
 84 |                 return;
 85 |             }
 86 |         }
 87 |     }
 88 | 
 89 |     //pvar is tested too: sdslen() below must not be handed a NULL string
 90 |     if (tai->pref == NULL || tai->pvar == NULL) {
 91 |         return;
 92 |     }
 93 | 
 94 |     size_t pref_len = sdslen(tai->pref);
 95 |     size_t pvar_len = sdslen(tai->pvar);
 96 | 
 97 |     if (pref_len == pvar_len && strncmp(tai->pref, tai->pvar, pref_len) == 0) {
 98 |         //synonymous: only the phastcons score can move the weight away from 1.0
 99 |         if (phast >= 0) {
100 |             tai->aaw = 1.0 / ( ((1.0 - phast) * UNCONS) + (phast * CONS) );
101 |         }
102 |         return;
103 |     }
104 | 
105 |     //non-synonymous change
106 |     tai->coding = 1;
107 |     int ref_len = (int)sdslen(ref);
108 |     int var_len = (int)sdslen(var);
109 | 
110 |     if (ref_len != var_len) { //indel: in-frame lengths up to the cap get their own key, all others use the cap key
111 |         char pchange[20];
112 |         int diff = abs(ref_len - var_len);
113 |         int cap = (ref_len < var_len) ? 39 : 78;
114 |         snprintf(pchange, sizeof(pchange), "%s-%d", (ref_len < var_len) ? "ins" : "del", (diff % 3 == 0 && diff <= cap) ? diff : cap);
115 |         aaw_from_score(tai, pchange);
116 |         return;
117 |     }
118 | 
119 |     if (pref_len > 1 || pvar_len > 1) { //skip if amino acid is longer than 1, can happen on multiallelic site
120 |         fprintf(stderr, "WARNING: AAW set to 1.0 because annotated protein variant > 1\n");
121 |         return;
122 |     }
123 | 
124 |     char pchange[5]; //both strings hold at most one character here, so "<ref>0<var>" always fits
125 |     if (strcmp(tai->pref, "*") == 0) {
126 |         snprintf(pchange, sizeof(pchange), "*0X");
127 |     }
128 |     else if (strcmp(tai->pvar, "*") == 0) {
129 |         snprintf(pchange, sizeof(pchange), "X0*");
130 |     }
131 |     else {
132 |         snprintf(pchange, sizeof(pchange), "%s0%s", tai->pref, tai->pvar);
133 |     }
134 | 
135 |     struct aa_matrix * taa = NULL;
136 |     HASH_FIND_STR(aam, pchange, taa);
137 |     if (taa == NULL) {
138 |         return;
139 |     }
140 |     if (phast >= 0) {
141 |         tai->aaw = 1.0 / ( ((1.0 - phast) * taa->uncons) + (phast * taa->cons) );
142 |     }
143 |     else {
144 |         tai->aaw = 1.0 / taa->score;
145 |     }
146 | }
147 | 


--------------------------------------------------------------------------------
/score_variants.c:
--------------------------------------------------------------------------------
  1 | //
  2 | //  main.c
  3 | //  vvp_score
  4 | //
  5 | //  Created by steven on 8/11/15.
  6 | //  Copyright (c) 2015 yandell lab. All rights reserved.
  7 | //
  8 | 
  9 | #include "vvp_headers.h"
 10 | #include "search_binary_bkgrnd.h"
 11 | #include "parse_vcf.h"
 12 | #include "vvp_lookup.h"
 13 | #include "score_variant.h"
 14 | 
 15 | #define WORK_SIZE 100000
 16 | 
 17 | static sds input_vcf;
 18 | static sds db_prefix;
 19 | static sds output;
 20 | static int ncpus;
 21 | static int snv_only;
 22 | static int coding_only;
 23 | static int no_aa_weights;
 24 | static int no_allele_frequency;
 25 | static sds anno_tag_name;
 26 | static uint8_t gene_index;
 27 | static uint8_t transcript_index;
 28 | static uint8_t so_tag_index;
 29 | static uint8_t aa_index;
 30 | static int ll_weight_index;
 31 | 
 32 | static int n_background;
 33 | static unsigned char * mm_bin;
 34 | static struct chr_offsets * chro;
 35 | 
 36 | /* Print the command line help on stderr and terminate the program with exit_code. */
 37 | void usage(int exit_code) {
 38 |     fprintf(stderr, "Usage: VVP [options] -i <vcf file> -d <database prefix> -v <annotation spec>\n\nOptions: (*mandatory)\n");
 39 |     fprintf(stderr, "* -i    filename      Input vcf file. Can be zipped or unzipped.  Can be 'stdin'\n");
 40 |     fprintf(stderr, "* -d    filename      database prefix\n");
 41 |     fprintf(stderr, "* -v    string        string with comma separated annotation components in info field\n");
 42 |     fprintf(stderr, "                      Format: <csq>,<gene index>,<transcript index>,<so_tag_index>,<aa_index>\n");
 43 |     fprintf(stderr, "                      Example: CSQ,4,6,1,15\n");
 44 |     fprintf(stderr, "-o      filename      formatted output file name (for use in burden permutation)\n");
 45 |     fprintf(stderr, "-n      #             Number of threads to use, default = 1\n");
 46 |     fprintf(stderr, "-w      int           Column index (zero based) in annotation tag as extra likelihood weight\n");
 47 |     fprintf(stderr, "-x      None          Set to turn off AA scoring -- all AA weights will be set to 1.0\n");
 48 |     fprintf(stderr, "-f      None          Set to not use allele frequency when scoring (Only AA weights will be used)\n");
 49 |     fprintf(stderr, "-l      None          Set to ignore indels.  Default is to score indels\n");
 50 |     fprintf(stderr, "-c      None          Set to ignore non-coding variants.  Default is to score non-coding variants.\n\n");
 51 |     exit(exit_code);
 52 | }
 53 | 
 54 | /* Parse one -v field index: a plain base-10 number in 0..UINT8_MAX (the index globals are uint8_t).
 55 |  * Returns 1 and stores the value in *out on success, 0 otherwise. */
 56 | static int parse_field_index(const char * text, uint8_t * out){
 57 |     char * end = NULL;
 58 |     long value = strtol(text, &end, 10);
 59 |     if (end == text || *end != '\0' || value < 0 || value > UINT8_MAX) {
 60 |         return 0;
 61 |     }
 62 |     *out = (uint8_t)value;
 63 |     return 1;
 64 | }
 65 | 
 66 | /* Fill the file-scope option variables from the command line. Prints the usage text and exits
 67 |  * on -h, an unknown option, a malformed value or a missing mandatory option (-i, -d, -v). */
 68 | void parse_command_line(int argc, const char * argv[]) {
 69 |     int opt, sig;
 70 |     sds * tmp_info;
 71 |     int tmp_count = 0;
 72 |     if (argc > 1 && strcmp(argv[1], "-h") == 0)
 73 |         usage(0);
 74 |     //getopt() is declared with char * const argv[], hence the cast
 75 |     while ((opt = getopt(argc, (char * const *)argv, "i:d:v:o:n:w:cxlf")) != -1) {
 76 |         switch (opt) {
 77 |             case 'i' : input_vcf = sdsnew(optarg); break;
 78 |             case 'd' : db_prefix = sdsnew(optarg); break;
 79 |             case 'o' : output = sdsnew(optarg); break;
 80 |             case 'l' : snv_only = 1; break;
 81 |             case 'x' : no_aa_weights = 1; break;
 82 |             case 'f' : no_allele_frequency = 1; break;
 83 |             case 'c' : coding_only = 1; break;
 84 |             case 'v' :
 85 |                 tmp_info = sdssplitlen(optarg, (int)strlen(optarg), ",", 1, &tmp_count);
 86 |                 if (tmp_info == NULL || tmp_count != 5) {
 87 |                     fprintf(stderr, "ARGUMENT ERROR:\tmust assign five annotation components, here only %d in %s\n", tmp_count, optarg);
 88 |                     usage(1);
 89 |                 }
 90 |                 //atoi() would accept "x" as 0 and silently wrap 300 when narrowed to uint8_t
 91 |                 if (!parse_field_index(tmp_info[1], &gene_index) || !parse_field_index(tmp_info[2], &transcript_index) ||
 92 |                     !parse_field_index(tmp_info[3], &so_tag_index) || !parse_field_index(tmp_info[4], &aa_index)) {
 93 |                     fprintf(stderr, "ARGUMENT ERROR:\tannotation indexes must be integers between 0 and %d in %s\n", UINT8_MAX, optarg);
 94 |                     usage(1);
 95 |                 }
 96 |                 anno_tag_name = sdsdup(tmp_info[0]);
 97 |                 sdsfreesplitres(tmp_info, tmp_count);
 98 |                 break;
 99 |             case 'w':
100 |                 sig = atoi(optarg);
101 |                 if (sig < 0) {
102 |                     fprintf(stderr, "ARGUMENT ERROR:\textra weight index must be >= 0\n");
103 |                     usage(1);
104 |                 }
105 |                 ll_weight_index = sig;
106 |                 break;
107 |             case 'n' :
108 |                 sig = atoi(optarg);
109 |                 if (sig < 1){
110 |                     fprintf(stderr, "ARGUMENT ERROR:\tnumber of cpus must be set to an integer > 0\n");
111 |                     usage(1);
112 |                 }
113 |                 ncpus = (int)sig;
114 |                 break;
115 |             default: //an unknown option or a missing argument is an error, not a help request
116 |                 usage(1);
117 |                 break;
118 |         }
119 |     }
120 |     if (input_vcf == NULL || sdslen(input_vcf) < 2) {
121 |         fprintf(stderr, "Missing mandatory option -i\n");
122 |         usage(1);
123 |     }
124 |     if (db_prefix == NULL || sdslen(db_prefix) < 2) {
125 |         fprintf(stderr, "Missing mandatory option -d\n");
126 |         usage(1);
127 |     }
128 |     if (anno_tag_name == NULL || sdslen(anno_tag_name) < 2) {
129 |         fprintf(stderr, "Missing mandatory option -v\n");
130 |         usage(1);
131 |     }
132 | }
133 | 
134 | /*
135 |  * Return the background record in bvs that describes the same allele as v,
136 |  * or NULL when none of the bvs->nv records does.
137 |  *
138 |  *   ref and var of equal length, var one base : var_type has to be that base
139 |  *   ref and var of equal length, var longer   : only the length has to agree
140 |  *   ref and var of different length           : the length difference has to agree and
141 |  *                                               var_type has to be 'D' or 'I'
142 |  */
143 | struct var_info * check_background_allele(struct m_var_info * bvs, struct variant * v){
144 |     const size_t alt_len = sdslen(v->var);
145 |     const int size_delta = (int)sdslen(v->ref) - (int)alt_len;
146 |     const int diff = abs(size_delta);
147 |     int k;
148 | 
149 |     for (k = 0; k < bvs->nv; k++) {
150 |         struct var_info * cand = bvs->vi[k];
151 |         if (diff != 0) { //INDEL
152 |             const char wanted = (size_delta > 0) ? 'D' : 'I'; //ref longer than var is a deletion
153 |             if (cand->length == diff && cand->var_type == wanted) {
154 |                 return cand;
155 |             }
156 |         }
157 |         else if (alt_len == 1) { //SNV
158 |             if (cand->var_type == v->var[0]) { //check to see if same SNV
159 |                 return cand;
160 |             }
161 |         }
162 |         else if (alt_len > 1 && cand->length == alt_len) { //MNP
163 |             return cand;
164 |         }
165 |     }
166 |     return NULL;
167 | }
168 | 
169 | /*
170 |  * Append the entries idx[0..n-1] to list, separated by commas. A list that is
171 |  * still empty afterwards becomes ".". Returns the sds to keep using.
172 |  */
173 | static sds join_individuals(sds list, const int * idx, size_t n){
174 |     size_t k;
175 |     for (k = 0; k < n; k++) {
176 |         list = sdscatprintf(list, (k + 1 < n) ? "%d," : "%d", idx[k]);
177 |     }
178 |     if (sdslen(list) < 1) {
179 |         list = sdscat(list, ".");
180 |     }
181 |     return list;
182 | }
183 | 
184 | /*
185 |  * Render the three individual index vectors of v (hemi, hets, homs) into the
186 |  * strings v->hemi_indv, v->het_indv and v->hom_indv.
187 |  */
188 | void id_variant_to_string(struct variant * v){
189 |     v->hemi_indv = join_individuals(v->hemi_indv, v->hemi.a, v->hemi.n);
190 |     v->het_indv = join_individuals(v->het_indv, v->hets.a, v->hets.n);
191 |     v->hom_indv = join_individuals(v->hom_indv, v->homs.a, v->homs.n);
192 | }
224 | 
225 | 
226 | /* Parse one VCF line, score it against the background and convert the raw score of every
227 |  * transcript into its VVP score. Returns NULL instead of dereferencing the result when
228 |  * parse_vcf_line() yields no variant; main() already skips NULL entries. */
229 | struct variant * parse_score(sds vcf_line){
230 |     struct variant * v = parse_vcf_line(vcf_line, no_aa_weights);
231 |     if (v == NULL) {
232 |         return NULL;
233 |     }
234 |     struct m_var_info * bvs = search_binary_bkgrnd(v->chr, v->pos, mm_bin, chro);
235 |     struct var_info * bvi = check_background_allele(bvs, v);
236 |     if (bvi != NULL) {
237 |         v->b_nhemi = bvi->nhemi;
238 |         v->b_nhet = bvi->nhet;
239 |         v->b_nhom = bvi->nhom;
240 |         v->b_nocall = bvi->nocall;
241 |         v->bit_offset = bvi->bit_offset;
242 |         score_variant_t_b(v, n_background*2 - bvi->nocall, bvi->nhet + 2*bvi->nhom + bvi->nhemi, no_allele_frequency);
243 |     }
244 |     else { //allele absent from the background: all background counts are zero
245 |         v->b_nhemi = 0;
246 |         v->b_nhet = 0;
247 |         v->b_nhom = 0;
248 |         v->b_nocall = 0;
249 |         v->bit_offset = 0;
250 |         score_variant_t_b(v, n_background*2, 0, no_allele_frequency);
251 |     }
252 |     id_variant_to_string(v);
253 |     struct gene_transcript * c, * t;
254 |     HASH_ITER(hh, v->gt, c, t) {
255 |         struct transcript_anno_info * current, * tmp;
256 |         HASH_ITER(hh, c->tai, current, tmp) {
257 |             current->hemi_vvp = score_lookup_b(current->transcript_name, current->hemi_score, current->coding);
258 |             current->het_vvp = score_lookup_b(current->transcript_name, current->het_score, current->coding);
259 |             current->hom_vvp = score_lookup_b(current->transcript_name, current->hom_score, current->coding);
260 |         }
261 |     }
262 |     size_t i; //the background lookup result is released here, record by record
263 |     for (i = 0; i < bvs->nv; i++) {
264 |         free(bvs->vi[i]);
265 |     }
266 |     free(bvs->vi);
267 |     free(bvs);
268 |     return v;
269 | }
270 | 
271 | /* Run parse_score() on every buffered VCF line and store result k in (*variants)[k].
272 |  * The loop carries an OpenMP parallel-for pragma with a static schedule. */
273 | void process_vcf_lines(kvec_t(sds) * vcf_lines, struct variant *** variants){
274 |     const size_t total = kv_size(*vcf_lines);
275 |     struct variant ** results = *variants;
276 |     size_t k;
277 |     #pragma omp parallel for schedule(static)
278 |     for (k = 0; k < total; k++) {
279 |         results[k] = parse_score(kv_A(*vcf_lines, k));
280 |     }
281 | }
282 | 
283 | /*
284 | int scale_het(int x){
285 |     float b = 0.055;
286 |     return (int)100.0*(1.0 / (1.0 + exp(b*(10.0 - x))));
287 | }*/
288 | 
289 | 
290 | /*
291 |  * Print the result rows of one scored variant: one row per gene/transcript on
292 |  * stdout and, when formatted_output is not NULL, a condensed row for the burden
293 |  * permutation. Nothing is printed for a variant without carriers, for an indel
294 |  * when -l was given, or for a non-coding transcript when -c was given.
295 |  */
296 | static void report_variant(struct variant * tv, FILE * formatted_output){
297 |     int indel_ind = (int)sdslen(tv->var) - (int)sdslen(tv->ref) == 0 ? 0 : 1;
298 |     if (tv->hemi.n == 0 && tv->hets.n == 0 && tv->homs.n == 0) {
299 |         return;
300 |     }
301 |     if (snv_only != 0 && snv_only <= indel_ind) {
302 |         return;
303 |     }
304 |     struct gene_transcript * c, * t;
305 |     HASH_ITER(hh, tv->gt, c, t) {
306 |         struct transcript_anno_info * current, * tmp;
307 |         HASH_ITER(hh, c->tai, current, tmp) {
308 |             if (coding_only > current->coding) {
309 |                 continue;
310 |             }
311 |             fprintf(stdout, "%s\t%zu\t%s\t%s\t%s\t%s\t", tv->chr, tv->pos, tv->ref, tv->var, c->gene_name, current->transcript_name);
312 |             fprintf(stdout, "%f\t%d\t%zu\t%s\t%zu\t", current->hemi_score, current->hemi_vvp, tv->hemi.n, tv->hemi_indv, tv->hemi_nocalls.n);
313 |             fprintf(stdout, "%f\t%d\t%zu\t%s\t%zu\t", current->het_score, current->het_vvp, tv->hets.n, tv->het_indv, tv->het_nocalls.n);
314 |             fprintf(stdout, "%f\t%d\t%zu\t%s\t%zu\t", current->hom_score, current->hom_vvp, tv->homs.n, tv->hom_indv, tv->hom_nocalls.n);
315 |             fprintf(stdout, "%d\t%d\t%f\t", current->coding, indel_ind, current->aaw);
316 |             fprintf(stdout, "%d\t%d\t%d\t%d\t", tv->b_nhemi, tv->b_nhet, tv->b_nhom, tv->b_nocall);
317 |             fprintf(stdout, "%llu\t%s\t%f\n", tv->bit_offset, tv->vid, current->llw);
318 |             if (formatted_output != NULL) {
319 |                 int hemi_ind = (tv->b_nhemi > 0) ? 1 : 0;
320 |                 int het_ind = (tv->b_nhet > 0) ? 1 : 0;
321 |                 int hom_ind = (tv->b_nhom > 0) ? 1 : 0;
322 |                 fprintf(formatted_output, "%s\t%s\t%f\t%s\t%f\t%s\t%f\t%s\t%d\t%d\t%d\t%llu\t%s\n", tv->chr, current->transcript_name, current->hemi_score, tv->hemi_indv, current->het_score, tv->het_indv, current->hom_score, tv->hom_indv, hemi_ind, het_ind, hom_ind, tv->bit_offset, tv->vid);
323 |             }
324 |         }
325 |     }
326 | }
327 | 
328 | /*
329 |  * Entry point: read the options, load the background database, then score the
330 |  * input VCF in batches of up to WORK_SIZE lines and print one row per transcript.
331 |  */
332 | int main(int argc, const char ** argv) {
333 |     input_vcf = NULL;
334 |     db_prefix = NULL;
335 |     anno_tag_name = NULL;
336 |     gene_index = 0;
337 |     transcript_index = 0;
338 |     so_tag_index = 0;
339 |     aa_index = 0;
340 |     output = NULL;
341 |     ncpus = 1;
342 |     no_aa_weights = 0;
343 |     no_allele_frequency = 0;
344 |     snv_only = 0;
345 |     coding_only = 0;
346 |     n_background = 0;
347 |     mm_bin = NULL;
348 |     chro = NULL;
349 |     ll_weight_index = -1;
350 |     parse_command_line(argc, argv);
351 |     #ifdef _OPENMP
352 |     omp_set_num_threads(ncpus);
353 |     #endif
354 | 
355 |     FILE * formatted_output = NULL;
356 |     if (output != NULL) {
357 |         formatted_output = fopen(output, "w");
358 |         if (formatted_output == NULL) { //rows would otherwise be written through a NULL stream
359 |             fprintf(stderr, "FATAL: output file %s cannot be written\n", output);
360 |             exit(1);
361 |         }
362 |     }
363 | 
364 |     sds dist_output = sdsempty();
365 |     dist_output = sdscatprintf(dist_output, "%s.dist", db_prefix);
366 |     load_feature_lookups_b(dist_output);
367 |     sdsfree(dist_output);
368 | 
369 |     mm_bin = load_bin_db(db_prefix, &n_background); //create memory map of background
370 |     chro = load_offsets(db_prefix); //load byte offsets in memory map
371 | 
372 |     initialize_parse_vcf(gene_index, transcript_index, so_tag_index, aa_index, anno_tag_name, ll_weight_index);
373 | 
374 |     gzFile gf = NULL; //gzFile is the handle type the gz* functions take, not gzFile *
375 |     if (strcmp(input_vcf, "stdin") == 0){
376 |         gf = gzdopen(STDIN_FILENO, "r");
377 |     }
378 |     else {
379 |         gf = gzopen(input_vcf, "r");
380 |     }
381 |     if (! gf) {
382 |         fprintf(stderr, "FATAL: vcf file %s cannot be read\n", input_vcf);
383 |         exit(1);
384 |     }
385 | 
386 |     fprintf(stdout, "#%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n","chr", "start", "ref", "var", "gene", "transcript", "hemi_score", "hemi_vvp", "nhemi", "hemi_indvs", "hemi_nocall", "het_score", "het_vvp", "nhet", "het_indvs", "het_nocall", "hom_score", "hom_vvp", "nhom", "hom_indvs", "hom_nocall", "coding_ind", "indel_ind", "aa_score", "n_bhemi", "n_bhet", "n_bhom", "n_bnocall", "bit_offset", "vid", "ll_weight");
387 | 
388 |     char * buffer = (char *)malloc(sizeof(char)*BUF_SIZE);
389 |     if (buffer == NULL) {
390 |         fprintf(stderr, "FATAL: out of memory\n");
391 |         exit(1);
392 |     }
393 |     kvec_t(sds) vcf_lines;
394 |     kv_init(vcf_lines);
395 |     char * l = gzgets(gf, buffer, BUF_SIZE);
396 |     if (l != NULL && buffer[0] != '#') { //buffer holds nothing readable when the very first read fails
397 |         kv_push(sds, vcf_lines, sdsnew(buffer));
398 |     }
399 | 
400 |     while (l != NULL) {
401 |         //fill the batch up to WORK_SIZE lines, skipping header lines
402 |         while (kv_size(vcf_lines) < WORK_SIZE) {
403 |             l = gzgets(gf, buffer, BUF_SIZE);
404 |             if (l == NULL) {
405 |                 break;
406 |             }
407 |             if (buffer[0] != '#') {
408 |                 kv_push(sds, vcf_lines, sdsnew(buffer));
409 |             }
410 |         }
411 |         if (kv_size(vcf_lines) < 1) {
412 |             break;
413 |         }
414 | 
415 |         size_t i;
416 |         size_t n_lines = kv_size(vcf_lines);
417 |         struct variant ** variants = (struct variant **)calloc(n_lines, sizeof(struct variant *));
418 |         if (variants == NULL) {
419 |             fprintf(stderr, "FATAL: out of memory\n");
420 |             exit(1);
421 |         }
422 |         process_vcf_lines(&vcf_lines, &variants);
423 |         for (i = 0; i < n_lines; i++) {
424 |             if (variants[i] != NULL) {
425 |                 report_variant(variants[i], formatted_output);
426 |                 destroy_variant(variants[i]);
427 |             }
428 |             sdsfree(kv_pop(vcf_lines)); //one buffered line is released per processed entry
429 |         }
430 |         free(variants); //release the per-batch pointer array
431 |     }
432 | 
433 |     //NOTE: formatted output rows stay in input order; a former post-processing step that
434 |     //sorted the file on its second column with sort(1) is disabled.
435 |     kv_destroy(vcf_lines);
436 |     free(buffer);
437 |     gzclose(gf);
438 |     if (formatted_output != NULL) {
439 |         fclose(formatted_output);
440 |     }
441 |     return 0;
442 | }
441 | 


--------------------------------------------------------------------------------
/parse_vcf.c:
--------------------------------------------------------------------------------
  1 | //
  2 | //  parse_vcf.c
  3 | //  VVP_dev_xcode
  4 | //
  5 | //  Created by STEVEN FLYGARE on 10/10/16.
  6 | //  Copyright © 2016 IDbyDNA. All rights reserved.
  7 | //
  8 | 
  9 | #include "parse_vcf.h"
 10 | 
 11 | #define ERR_GENOTYPE "ERR GENOTYPE"
 12 | #define VCF_PROB "VCF FORMAT PROBLEM"
 13 | #define ANNO_PROB "ANNOTATION PROBLEM"
 14 | 
 15 | static struct vep_field_info vfi;
 16 | 
 17 | /* Call init_aa_score() and store the annotation layout in vfi. The tag name is duplicated and
 18 |  * the caller's sds is then freed, so annotation_tag_name must not be used after this call. */
 19 | void initialize_parse_vcf(uint8_t gene_index, uint8_t transcript_index, uint8_t seq_ontology_tag_index, uint8_t amino_acid_change_index, sds annotation_tag_name, int ll_weight_index){
 20 |     init_aa_score();
 21 | 
 22 |     vfi.annotation_tag_name = sdsdup(annotation_tag_name);
 23 |     sdsfree(annotation_tag_name);
 24 |     vfi.gene_index = gene_index;
 25 |     vfi.transcript_index = transcript_index;
 26 |     vfi.seq_ontology_tag_index = seq_ontology_tag_index;
 27 |     vfi.amino_acid_change_index = amino_acid_change_index;
 28 |     vfi.ll_weight_index = ll_weight_index;
 29 | }
 30 | 
 31 | /*
 32 |  * Tally the genotype of every sample column of a split VCF line (data[9] and up) in *v.
 33 |  * Alleles other than ".", "0" and "1" are ignored; a column that is empty or does not
 34 |  * hold one or two alleles is reported on stderr and contributes nothing.
 35 |  */
 36 | void load_gt_info(struct variant ** v, sds * data, int data_len){
 37 |     struct variant * var = *v;
 38 |     int i;
 39 |     for (i = 9; i < data_len; i++) {
 40 |         int indv_index = i - 9; //sample number, counted from the first sample column
 41 |         int tmp_count = 0;
 42 |         int gd_count = 0;
 43 |         sds * genotype_data = NULL;
 44 |         sds * tmp_data = sdssplitlen(data[i], (int)sdslen(data[i]), ":", 1, &tmp_count);
 45 |         if (tmp_data != NULL && tmp_count > 0) { //an empty column has no first token to read
 46 |             genotype_data = sdssplitlen(tmp_data[0], (int)sdslen(tmp_data[0]), "|", 1, &gd_count);
 47 |             if (sdslen(tmp_data[0]) > 1 && gd_count < 2) { //if "|" didn't split, then try "/"
 48 |                 sdsfreesplitres(genotype_data, gd_count);
 49 |                 genotype_data = sdssplitlen(tmp_data[0], (int)sdslen(tmp_data[0]), "/", 1, &gd_count);
 50 |             }
 51 |         }
 52 |         sdsfreesplitres(tmp_data, tmp_count);
 53 |         if (gd_count == 1) { //hemizygous situation
 54 |             var->ni++;
 55 |             if (strcmp(genotype_data[0], ".") == 0) {
 56 |                 kv_push(int, var->hemi_nocalls, indv_index);
 57 |             }
 58 |             else if (strcmp(genotype_data[0], "0") == 0){
 59 |                 var->nref+=1;
 60 |             }
 61 |             else if (strcmp(genotype_data[0], "1") == 0){
 62 |                 kv_push(int, var->hemi, indv_index);
 63 |             }
 64 |         }
 65 |         else if (gd_count == 2 && strcmp(genotype_data[0], genotype_data[1]) == 0) { //homozygous call
 66 |             var->ni++;
 67 |             if (strcmp(genotype_data[0], ".") == 0) {
 68 |                 kv_push(int, var->hom_nocalls, indv_index);
 69 |             }
 70 |             else if (strcmp(genotype_data[0], "0") == 0){
 71 |                 var->nref+=2;
 72 |             }
 73 |             else if (strcmp(genotype_data[0], "1") == 0){
 74 |                 kv_push(int, var->homs, indv_index);
 75 |             }
 76 |         }
 77 |         else if (gd_count == 2) { //heterozygous call: each allele is tallied on its own
 78 |             int j;
 79 |             var->ni++;
 80 |             for (j = 0; j < 2; j++) {
 81 |                 if (strcmp(genotype_data[j], ".") == 0) {
 82 |                     kv_push(int, var->het_nocalls, indv_index);
 83 |                 }
 84 |                 else if (strcmp(genotype_data[j], "0") == 0){
 85 |                     var->nref+=1;
 86 |                 }
 87 |                 else if (strcmp(genotype_data[j], "1") == 0){
 88 |                     kv_push(int, var->hets, indv_index);
 89 |                 }
 90 |             }
 91 |         }
 92 |         else {
 93 |             fprintf(stderr, "WARNING:\t%s\tgenotype problem\nchr:%s\tpos:%zu\tcol:%d\n", VCF_PROB, var->chr, var->pos, i);
 94 |         }
 95 |         sdsfreesplitres(genotype_data, gd_count);
 96 |     }
 97 | }
 98 | 
 99 | void get_bcsq_aa_change(sds aa_tag, struct transcript_anno_info ** ttai){
100 |     
101 |     if (sdslen(aa_tag) < 1) {
102 |         return;
103 |     }
104 |     int aas = 0;
105 |     sds * aa = sdssplitlen(aa_tag, (int)sdslen(aa_tag), ">", 1, &aas);
106 |     if (aas > 1) {
107 |         //(*ttai)->pref = sdsdup(aa[0]);
108 |         //(*ttai)->pvar = sdsdup(aa[1]);
109 |         size_t i;
110 |         for (i = 0; i < sdslen(aa[0]); i++) {
111 |             if (isdigit(aa[0][i]) == 0) {
112 |                 (*ttai)->pref = sdsnewlen(aa[0]+i, sdslen(aa[0]) - i);
113 |                 break;
114 |             }
115 |         }
116 |         
117 |         for (i = 0; i < sdslen(aa[1]); i++) {
118 |             if (isdigit(aa[1][i]) == 0) {
119 |                 (*ttai)->pvar = sdsnewlen(aa[1]+i, sdslen(aa[1]) - i);
120 |                 break;
121 |             }
122 |         }
123 |     }
124 |     sdsfreesplitres(aa, aas);
125 | }
126 | 
127 | 
128 | void get_aa_change(sds aa_tag, struct transcript_anno_info ** ttai){
129 |     
130 |     if (sdslen(aa_tag) < 1) {
131 |         return;
132 |     }
133 |     int aas = 0;
134 |     sds * aa = sdssplitlen(aa_tag, (int)sdslen(aa_tag), "/", 1, &aas);
135 |     if (aas > 1) {
136 |         (*ttai)->pref = sdsdup(aa[0]);
137 |         (*ttai)->pvar = sdsdup(aa[1]);
138 |     }
139 |     sdsfreesplitres(aa, aas);
140 | }
141 | 
142 | 
143 | void check_add_gene_transcript_tags(sds gene_name, sds transcript_name, sds annotation_tags, sds aa_tag, float ll_weight, struct variant ** v) {
144 |     
145 |     struct gene_transcript * tgt = NULL;
146 |     HASH_FIND_STR((*v)->gt, gene_name, tgt); //check for and add gene
147 |     if (tgt == NULL) {
148 |         tgt = (struct gene_transcript *)malloc(sizeof(struct gene_transcript));
149 |         memset(tgt->gene_name, '\0', sizeof(char)*FEATURE_NAME_LENGTH);
150 |         if (sdslen(gene_name) < 1) {
151 |             strncpy(tgt->gene_name, "NONE", 4);
152 |         }
153 |         else {
154 |             strncpy(tgt->gene_name, gene_name, sdslen(gene_name));
155 |         }
156 |         tgt->tai = NULL;
157 |         HASH_ADD_STR((*v)->gt, gene_name, tgt);
158 |     }
159 |     
160 |     struct transcript_anno_info * ttai = NULL;
161 |     HASH_FIND_STR(tgt->tai, transcript_name, ttai); //check for and add transcript
162 |     if (ttai == NULL) {
163 |         ttai = (struct transcript_anno_info *)malloc(sizeof(struct transcript_anno_info));
164 |         memset(ttai->transcript_name, '\0', sizeof(char)*FEATURE_NAME_LENGTH);
165 |         if (sdslen(transcript_name) < 1) {
166 |             strncpy(ttai->transcript_name, "NONE", 4);
167 |         }
168 |         else {
169 |             strncpy(ttai->transcript_name, transcript_name, sdslen(transcript_name));
170 |         }
171 |         
172 |         ttai->aaw = 1.0;
173 |         ttai->llw = ll_weight;
174 |         ttai->het_score = -1.0;
175 |         ttai->hom_score = -1.0;
176 |         ttai->hemi_score = -1.0;
177 |         ttai->het_vvp = -1;
178 |         ttai->hom_vvp = -1;
179 |         ttai->hemi_vvp = -1;
180 |         ttai->coding = 0;
181 |         ttai->pref = NULL;
182 |         ttai->pvar = NULL;
183 |         if (strncmp(vfi.annotation_tag_name, "BCSQ", 4) == 0) {
184 |             get_bcsq_aa_change(aa_tag, &ttai);
185 |         }
186 |         else {
187 |             get_aa_change(aa_tag, &ttai); //get aa weight
188 |         }
189 |         
190 |         kv_init(ttai->anno_tags); //initialize and add annotation tags
191 |         if (annotation_tags != NULL) {
192 |             int tags = 0;
193 |             sds * anno_tags = sdssplitlen(annotation_tags, (int)sdslen(annotation_tags), "&", 1, &tags);
194 |             int i;
195 |             for (i=0; i < tags; i++) {
196 |                 kv_push(sds, ttai->anno_tags, sdsdup(anno_tags[i]));
197 |             }
198 |             sdsfreesplitres(anno_tags, tags);
199 |         }
200 |         
201 |         HASH_ADD_STR(tgt->tai, transcript_name, ttai);
202 |     }
203 |     else {
204 |         int placeholder = 1;
205 |         //fprintf(stderr, "WARNING:\t%s\nGene:\t%s\nTranscript:\t%s already added.  Multiple annotations per transcript not allowed, will only use first.  Variant location:\t%s\t%zu\n\n", ANNO_PROB, gene_name, transcript_name, (*v)->chr, (*v)->pos);
206 |     }
207 |     
208 | }
209 | 
210 | void load_annotation_info(sds info_field, struct variant ** v){
211 |     
212 |     int count = 0;
213 |     sds * info_data = sdssplitlen(info_field, (int)sdslen(info_field), ";", 1, &count);
214 |     int i = 0;
215 |     for (i = 0; i < count; i++) {
216 |         int tag_count = 0;
217 |         sds * tag_data = sdssplitlen(info_data[i], (int)sdslen(info_data[i]), "=", 1, &tag_count);
218 |         if (tag_count > 1) {
219 |             if (strcmp(tag_data[0], vfi.annotation_tag_name) == 0) {
220 |                 int n_annotations;
221 |                 sds * annotations = sdssplitlen(tag_data[1], (int)sdslen(tag_data[1]), ",", 1, &n_annotations);
222 |                 int j;
223 |                 for (j = 0; j < n_annotations; j++) {
224 |                     //fprintf(stderr, "%s\t%d\t%s\n", (*v)->vid, j, annotations[j]);
225 |                     int pieces = 0;
226 |                     sds * anno_pieces = sdssplitlen(annotations[j], (int)sdslen(annotations[j]), "|", 1, &pieces);
227 |                     float ll_weight = -1.0;
228 |                     if (vfi.ll_weight_index >= 0) {
229 |                         ll_weight = atof(anno_pieces[vfi.ll_weight_index]);
230 |                     }
231 |                     //fprintf(stderr, "%d\n", tag_count);
232 |                     //assert(tag_count > vfi.gene_index && tag_count > vfi.transcript_index && tag_count > vfi.seq_ontology_tag_index);
233 |                     sds tmp_empty = sdsempty();
234 |                     check_add_gene_transcript_tags(pieces > vfi.gene_index ? anno_pieces[vfi.gene_index] : tmp_empty, pieces > vfi.transcript_index ? anno_pieces[vfi.transcript_index] : tmp_empty, pieces > vfi.seq_ontology_tag_index ? anno_pieces[vfi.seq_ontology_tag_index] : tmp_empty, pieces > vfi.amino_acid_change_index ? anno_pieces[vfi.amino_acid_change_index] : tmp_empty, ll_weight, v);
235 |                     sdsfree(tmp_empty);
236 |                     sdsfreesplitres(anno_pieces, pieces);
237 |                 }
238 |                 sdsfreesplitres(annotations, n_annotations);
239 |             }
240 |             else if (strcmp(tag_data[0], "PHAST") == 0) {
241 |                     (*v)->phast = atof(tag_data[1]);
242 |             }
243 |         }
244 |         sdsfreesplitres(tag_data, tag_count);
245 |     }
246 |     sdsfreesplitres(info_data, count);
247 | }
248 | 
249 | 
250 | struct variant * parse_vcf_line(sds line, int no_aa_weight){
251 |     
252 |     //initialize variant struct values as line is parsed
253 |     struct variant * v = (struct variant *)malloc(sizeof(struct variant));
254 |     v->chr = NULL;
255 |     v->bit_offset = 0;
256 |     sdstrim(line, "\n");
257 |     
258 |     //first split line by tab
259 |     int count = 0;
260 |     sds * data = sdssplitlen(line, (int)sdslen(line), "\t", 1, &count);
261 |     
262 |     if (count < 10) {
263 |         fprintf(stderr, "%s:\t vcf line has fewer than 10 columns, will skip:\t%s:%s, %s, %s\n", VCF_PROB, data[0], data[1], data[4], data[7]);
264 |         sdsfreesplitres(data, count);
265 |         free(v);
266 |         return NULL;
267 |     }
268 |     
269 |     v->chr = sdsdup(data[0]);
270 |     v->pos = atoll(data[1]);
271 |     v->vid = sdsdup(data[2]);
272 |     v->ref = sdsdup(data[3]);
273 |     
274 |     //now find number of alternate alleles, if > 1, then throw error
275 |     int num_va = 0;
276 |     sds * variant_alleles = sdssplitlen(data[4], (int)sdslen(data[4]), ",", 1, &num_va);
277 |     if (num_va > 1) {
278 |         fprintf(stderr, "FATAL:\t vcf line has >1 alternate alleles.  File needs to be decomposed before processing\n");
279 |         fprintf(stderr, "%s", line);
280 |         sdsfreesplitres(variant_alleles, num_va);
281 |         sdsfreesplitres(data, count);
282 |         exit(0);
283 |     }
284 |     
285 |     v->var = sdsdup(variant_alleles[0]);
286 |     sdsfreesplitres(variant_alleles, num_va);
287 |     //indel indicator
288 |     if ((sdslen(v->ref) != sdslen(v->var)) || strcmp(v->ref, "-") == 0 || strcmp(v->var, "-") == 0) {
289 |         v->indel = 1;
290 |     }
291 |     
292 |     //load genotype information
293 |     v->ni = 0;
294 |     v->nref = 0;
295 |     kv_init(v->hets);
296 |     v->het_indv = sdsempty();
297 |     kv_init(v->homs);
298 |     v->hom_indv = sdsempty();
299 |     kv_init(v->hemi);
300 |     v->hemi_indv = sdsempty();
301 |     kv_init(v->het_nocalls);
302 |     kv_init(v->hom_nocalls);
303 |     kv_init(v->hemi_nocalls);
304 |     load_gt_info(&v, data, count);
305 |     
306 |     //load annotation and protein information
307 |     v->gt = NULL;
308 |     v->phast = -1.0;
309 |     load_annotation_info(data[7], &v);
310 |     
311 |     //add amino acid weights
312 |     struct gene_transcript * c, * t;
313 |     HASH_ITER(hh, v->gt, c, t) {
314 |         struct transcript_anno_info * current, * tmp;
315 |         HASH_ITER(hh, c->tai, current, tmp) {
316 |             get_aaw(&current, v->ref, v->var, v->phast);
317 |             if (no_aa_weight == 1) {
318 |                 current->aaw = 1.0;
319 |             }
320 |         }
321 |     }
322 | 
323 |     sdsfreesplitres(data, count);
324 |     
325 |     return v;
326 | }
327 | 
328 | struct variant * parse_allele_frequency_line(sds line, int no_aa_weight){
329 | 
330 |     //initialize variant struct values as line is parsed
331 |     struct variant * v = (struct variant *)malloc(sizeof(struct variant));
332 |     v->chr = NULL;
333 |     v->bit_offset = 0;
334 |     sdstrim(line, "\n");
335 |     
336 |     //first split line by tab
337 |     int count = 0;
338 |     sds * data = sdssplitlen(line, (int)sdslen(line), "\t", 1, &count);
339 |     
340 |     if (count < 10) {
341 |         fprintf(stderr, "%s:\t vcf line has fewer than 10 columns, will skip:\t%s:%s, %s, %s\n", VCF_PROB, data[0], data[1], data[4], data[7]);
342 |         sdsfreesplitres(data, count);
343 |         free(v);
344 |         return NULL;
345 |     }
346 |     
347 |     v->chr = sdsdup(data[0]);
348 |     v->pos = atoll(data[1]);
349 |     v->vid = sdsdup(data[2]);
350 |     v->ref = sdsdup(data[3]);
351 |     
352 |     //now find number of alternate alleles, if > 1, then throw error
353 |     int num_va = 0;
354 |     sds * variant_alleles = sdssplitlen(data[4], (int)sdslen(data[4]), ",", 1, &num_va);
355 |     if (num_va > 1) {
356 |         fprintf(stderr, "FATAL:\t vcf line has >1 alternate alleles.  File needs to be decomposed before processing\n");
357 |         fprintf(stderr, "%s", line);
358 |         sdsfreesplitres(variant_alleles, num_va);
359 |         sdsfreesplitres(data, count);
360 |         exit(0);
361 |     }
362 |     
363 |     v->var = sdsdup(variant_alleles[0]);
364 |     sdsfreesplitres(variant_alleles, num_va);
365 |     //indel indicator
366 |     if ((sdslen(v->ref) != sdslen(v->var)) || strcmp(v->ref, "-") == 0 || strcmp(v->var, "-") == 0) {
367 |         v->indel = 1;
368 |     }
369 |     
370 |     //load genotype information
371 |     
372 |     v->ni = 0;
373 |     v->nref = 0;
374 |     kv_init(v->hets);
375 |     v->het_indv = sdsempty();
376 |     kv_init(v->homs);
377 |     v->hom_indv = sdsempty();
378 |     kv_init(v->hemi);
379 |     v->hemi_indv = sdsempty();
380 |     kv_init(v->het_nocalls);
381 |     kv_init(v->hom_nocalls);
382 |     kv_init(v->hemi_nocalls);
383 |     
384 |     //load_gt_info(&v, data, count);
385 |     int total_called = atoi(data[5]);
386 |     int n_alleles_called = atoi(data[6]);
387 |     int nocalled_alleles = total_called - n_alleles_called; //number of nocall alleles
388 |     int n_alleles_alt = atoi(data[7]);
389 |     int n_hom_indvs = atoi(data[8]);
390 |     
391 |     int n_hemi = 0;
392 |     int n_het = 0;
393 |     if (strcmp(v->chr, "X") == 0) {
394 |         n_hemi = atoi(data[9]); //hemizgyous will be the same as the number of male alleles
395 |         int tmp = n_alleles_alt - ((n_hom_indvs * 2) + n_hemi);
396 |         n_het = tmp >= 0 ? tmp : 0; //whatever is left will be het
397 |     }
398 |     else {
399 |         n_het = n_alleles_alt - (n_hom_indvs *2); //number of heterozygous alleles -- same as het individuals
400 |     }
401 |     
402 |     
403 |     int n_hom_alt = n_alleles_alt - (n_het + n_hemi); //number of alleles homozygous for alt allele
404 |     
405 |     v->nref = n_alleles_called - (n_hom_alt + n_het + n_hemi);
406 |     
407 |     //generate random numbers for the individual indexes
408 |     gsl_rng_env_setup();
409 |     gsl_rng * r = gsl_rng_alloc(gsl_rng_taus);
410 |     
411 |     int i;
412 |     for (i = 0; i < (n_hom_alt / 2); i++) {
413 |         int z = (int)gsl_rng_uniform_int(r, (total_called/2)+1);
414 |         kv_push(int, v->homs, z);
415 |     }
416 |     for (i = 0; i < n_het; i++) {
417 |         int z = (int)gsl_rng_uniform_int(r, (total_called/2)+1);
418 |         kv_push(int, v->hets, z);
419 |     }
420 |     for (i = 0; i < n_hemi; i++) {
421 |         int z = (int)gsl_rng_uniform_int(r, (total_called/2)+1);
422 |         kv_push(int, v->hemi, z);
423 |     }
424 |     for (i = 0; i < nocalled_alleles; i++) {
425 |         int z = (int)gsl_rng_uniform_int(r, (total_called/2)+1);
426 |         kv_push(int, v->het_nocalls, z);
427 |     }
428 |     
429 |     gsl_rng_free(r);
430 |     
431 |     //load annotation and protein information
432 |     v->gt = NULL;
433 |     v->phast = -1.0;
434 |     load_annotation_info(data[10], &v);
435 |     
436 |     //add amino acid weights
437 |     struct gene_transcript * c, * t;
438 |     HASH_ITER(hh, v->gt, c, t) {
439 |         struct transcript_anno_info * current, * tmp;
440 |         HASH_ITER(hh, c->tai, current, tmp) {
441 |             get_aaw(&current, v->ref, v->var, v->phast);
442 |             if (no_aa_weight == 1) {
443 |                 current->aaw = 1.0;
444 |             }
445 |         }
446 |     }
447 |     
448 |     sdsfreesplitres(data, count);
449 |     
450 |     return v;
451 | 
452 | }
453 | 
454 | 
455 | 
456 | void destroy_variant(struct variant * v){
457 |     sdsfree(v->chr);
458 |     sdsfree(v->vid);
459 |     sdsfree(v->ref);
460 |     sdsfree(v->var);
461 |     kv_destroy(v->hets);
462 |     sdsfree(v->het_indv);
463 |     kv_destroy(v->homs);
464 |     sdsfree(v->hom_indv);
465 |     kv_destroy(v->hemi);
466 |     sdsfree(v->hemi_indv);
467 |     kv_destroy(v->het_nocalls);
468 |     kv_destroy(v->hom_nocalls);
469 |     kv_destroy(v->hemi_nocalls);
470 |     
471 |     struct gene_transcript * c, * t;
472 |     HASH_ITER(hh, v->gt, c, t) {
473 |         struct transcript_anno_info * current, * tmp;
474 |         HASH_ITER(hh, c->tai, current, tmp) {
475 |             int i;
476 |             if(current->pref != NULL) sdsfree(current->pref);
477 |             if(current->pvar != NULL) sdsfree(current->pvar);
478 |             size_t tmpl = c->tai->anno_tags.n;
479 |             for (i=0; i < tmpl; i++) {
480 |                 sdsfree(kv_pop(c->tai->anno_tags));
481 |             }
482 |             kv_destroy(c->tai->anno_tags);
483 |             HASH_DEL(c->tai, current);
484 |             free(current);
485 |         }
486 |         HASH_DEL(v->gt, c);
487 |         free(c);
488 |     }
489 |     free(v);
490 |     v = NULL;
491 | }
492 | 
493 | void print_vec_comma(kvec_t(int) * tmp){
494 |     int i;
495 |     for (i = 0; i < (*tmp).n; i++) {
496 |         fprintf(stdout, "%d,", kv_A(*tmp, i));
497 |     }
498 | }
499 | 
500 | void print_variant(struct variant * v){
501 |     fprintf(stdout, "chr: %s, pos: %zu\n", v->chr, v->pos);
502 |     fprintf(stdout, "id: %s\n", v->vid);
503 |     fprintf(stdout, "ref: %s, var: %s\n", v->ref, v->var);
504 |     fprintf(stdout, "number of individuals: %d\n", v->ni);
505 |     fprintf(stdout, "number ref alleles: %d\n", v->nref);
506 |     
507 |     int i;
508 |     if (v->hemi.n > 0) {
509 |         fprintf(stdout, "HEMIZYGOUS individuals: %zu\n", v->hemi.n);
510 |         //for (i = 0; i < v->hemi.n; i++) fprintf(stdout, "%d,", kv_A(v->hemi, i));
511 |         //fprintf(stdout, "\n\n");
512 |     }
513 |     if (v->hets.n > 0) {
514 |         fprintf(stdout, "HETEROZYGOUS individuals: %zu\n", v->hets.n);
515 |         //for (i = 0; i < v->hets.n; i++) fprintf(stdout, "%d,", kv_A(v->hets, i));
516 |         //fprintf(stdout, "\n\n");
517 |     }
518 |     if (v->homs.n > 0) {
519 |         fprintf(stdout, "HOMOZYGOUS individuals: %zu\n", v->homs.n);
520 |         //for (i = 0; i < v->homs.n; i++) fprintf(stdout, "%d,", kv_A(v->homs, i));
521 |         //fprintf(stdout, "\n\n");
522 |     }
523 |     if (v->hemi_nocalls.n > 0) {
524 |         fprintf(stdout, "HEMIZYGOUS nocalled individuals: %zu\n", v->hemi_nocalls.n);
525 |         //for (i = 0; i < v->hemi_nocalls.n; i++) fprintf(stdout, "%d,", kv_A(v->hemi_nocalls, i));
526 |         //fprintf(stdout, "\n\n");
527 |     }
528 |     if (v->het_nocalls.n > 0) {
529 |         fprintf(stdout, "HETEROZYGOUS nocalled individuals: %zu\n", v->het_nocalls.n);
530 |         //for (i = 0; i < v->het_nocalls.n; i++) fprintf(stdout, "%d,", kv_A(v->het_nocalls, i));
531 |         //fprintf(stdout, "\n\n");
532 |     }
533 |     if (v->hom_nocalls.n > 0) {
534 |         fprintf(stdout, "HOMOZYGOUS nocalled individuals: %zu\n", v->hom_nocalls.n);
535 |         //for (i = 0; i < v->hom_nocalls.n; i++) fprintf(stdout, "%d,", kv_A(v->hom_nocalls, i));
536 |         //fprintf(stdout, "\n\n");
537 |     }
538 |     
539 |     struct gene_transcript * c, * t;
540 |     HASH_ITER(hh, v->gt, c, t) {
541 |         fprintf(stdout, "GENE: %s\n", c->gene_name);
542 |         struct transcript_anno_info * current, * tmp;
543 |         HASH_ITER(hh, c->tai, current, tmp) {
544 |             fprintf(stdout, "\tTRANSCRIPT: %s\n", current->transcript_name);
545 |             fprintf(stdout, "\t\taaw: %f, het_score: %f, hom_score: %f, coding: %d\n", current->aaw, current->het_score, current->hom_score, current->coding);
546 |             size_t tmpl = current->anno_tags.n;
547 |             for (i = 0; i < tmpl; i++) {
548 |                 fprintf(stdout, "\t\tSO TAG: %s\n", kv_A(current->anno_tags, i));
549 |             }
550 |         }
551 |     }
552 |     
553 | }
554 | 
555 | 
556 | 


--------------------------------------------------------------------------------
/bit_array.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  bit_array.h
  3 |  project: bit array C library
  4 |  url: https://github.com/noporpoise/BitArray/
  5 |  maintainer: Isaac Turner <turner.isaac@gmail.com>
  6 |  license: Public Domain, no warranty
  7 |  date: Sep 2014
  8 | */
  9 | 
 10 | #ifndef BIT_ARRAY_HEADER_SEEN
 11 | #define BIT_ARRAY_HEADER_SEEN
 12 | 
 13 | #include <stdio.h>
 14 | #include <inttypes.h>
 15 | 
 16 | #include "bit_macros.h"
 17 | 
 18 | typedef struct BIT_ARRAY BIT_ARRAY;
 19 | 
 20 | // 64 bit words
 21 | typedef uint64_t word_t, word_addr_t, bit_index_t;
 22 | typedef uint8_t word_offset_t; // Offset within a 64 bit word
 23 | 
 24 | #define BIT_INDEX_MIN 0
 25 | #define BIT_INDEX_MAX (~(bit_index_t)0)
 26 | 
 27 | #ifdef __cplusplus
 28 | extern "C" {
 29 | #endif
 30 | 
 31 | //
 32 | // Structs
 33 | //
 34 | 
 35 | struct BIT_ARRAY
 36 | {
 37 |   word_t* words;
 38 |   bit_index_t num_of_bits;
 39 |   // Number of words used -- this is just round_up(num_of_bits / 64)
 40 |   // if num_of_bits == 0, this is 0
 41 |   word_addr_t num_of_words;
 42 |   // For more efficient allocation we use realloc only to double size --
 43 |   // not for adding every word.  Initial size is INIT_CAPACITY_WORDS.
 44 |   word_addr_t capacity_in_words;
 45 | };
 46 | 
 47 | //
 48 | // Basics: Constructor, destructor, get length, resize
 49 | //
 50 | 
 51 | // Constructor - create a new bit array of length nbits
 52 | BIT_ARRAY* bit_array_create(bit_index_t nbits);
 53 | 
 54 | // Destructor - free the memory used for a bit array
 55 | void bit_array_free(BIT_ARRAY* bitarray);
 56 | 
 57 | // Allocate using existing struct
 58 | BIT_ARRAY* bit_array_alloc(BIT_ARRAY* bitarr, bit_index_t nbits);
 59 | void bit_array_dealloc(BIT_ARRAY* bitarr);
 60 | 
 61 | // Get length of bit array
 62 | bit_index_t bit_array_length(const BIT_ARRAY* bit_arr);
 63 | 
 64 | // Change the size of a bit array. Enlarging an array will add zeros
 65 | // to the end of it. Returns 1 on success, 0 on failure (e.g. not enough memory)
 66 | char bit_array_resize(BIT_ARRAY* bitarr, bit_index_t new_num_of_bits);
 67 | 
 68 | // If bitarr length < num_bits, resizes to num_bits
 69 | char bit_array_ensure_size(BIT_ARRAY* bitarr, bit_index_t ensure_num_of_bits);
 70 | 
 71 | // Same as above but exit with an error message if out of memory
 72 | void bit_array_resize_critical(BIT_ARRAY* bitarr, bit_index_t num_of_bits);
 73 | void bit_array_ensure_size_critical(BIT_ARRAY* bitarr, bit_index_t num_of_bits);
 74 | 
 75 | 
 76 | //
 77 | // Macros
 78 | //
 79 | 
 80 | //
 81 | // Get, set, clear, assign and toggle individual bits
 82 | // Macros for fast access -- beware: no bounds checking
 83 | //
 84 | 
 85 | #define bit_array_get(arr,i)      bitset_get((arr)->words, i)
 86 | #define bit_array_set(arr,i)      bitset_set((arr)->words, i)
 87 | #define bit_array_clear(arr,i)    bitset_del((arr)->words, i)
 88 | #define bit_array_toggle(arr,i)   bitset_tgl((arr)->words, i)
 89 | // c must be 0 or 1
 90 | #define bit_array_assign(arr,i,c) bitset_cpy((arr)->words,i,c)
 91 | 
 92 | //
 93 | // Get, set, clear, assign and toggle individual bits
 94 | // "Safe": use assert() to check bounds
 95 | //
 96 | 
 97 | // Get the value of a bit (returns 0 or 1)
 98 | char bit_array_get_bit(const BIT_ARRAY* bitarr, bit_index_t b);
 99 | void bit_array_set_bit(BIT_ARRAY* bitarr, bit_index_t b);
100 | void bit_array_clear_bit(BIT_ARRAY* bitarr, bit_index_t b);
101 | void bit_array_toggle_bit(BIT_ARRAY* bitarr, bit_index_t b);
102 | // If char c != 0, set bit; otherwise clear bit
103 | void bit_array_assign_bit(BIT_ARRAY* bitarr, bit_index_t b, char c);
104 | 
105 | //
106 | // "Resizing": enlarge array if needed
107 | //
108 | 
109 | char bit_array_rget(BIT_ARRAY* bitarr, bit_index_t b);
110 | void bit_array_rset(BIT_ARRAY* bitarr, bit_index_t b);
111 | void bit_array_rclear(BIT_ARRAY* bitarr, bit_index_t b);
112 | void bit_array_rtoggle(BIT_ARRAY* bitarr, bit_index_t b);
113 | void bit_array_rassign(BIT_ARRAY* bitarr, bit_index_t b, char c);
114 | 
115 | //
116 | // Set, clear and toggle several bits at once
117 | //
118 | 
119 | // Set multiple bits at once.
120 | // e.g. set bits 1, 20 & 31: bit_array_set_bits(bitarr, 3, 1,20,31);
121 | // Note: variable args are of type unsigned int
122 | void bit_array_set_bits(BIT_ARRAY* bitarr, size_t n, ...);
123 | 
124 | // Clear multiple bits at once.
125 | // e.g. clear bits 1, 20 & 31: bit_array_clear_bits(bitarr, 3, 1,20,31);
126 | // Note: variable args are of type unsigned int
127 | void bit_array_clear_bits(BIT_ARRAY* bitarr, size_t n, ...);
128 | 
129 | // Toggle multiple bits at once
130 | // e.g. toggle bits 1, 20 & 31: bit_array_toggle_bits(bitarr, 3, 1,20,31);
131 | // Note: variable args are of type unsigned int
132 | void bit_array_toggle_bits(BIT_ARRAY* bitarr, size_t n, ...);
133 | 
134 | //
135 | // Set, clear and toggle all bits in a region
136 | //
137 | 
138 | // Set all the bits in a region
139 | void bit_array_set_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len);
140 | 
141 | // Clear all the bits in a region
142 | void bit_array_clear_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len);
143 | 
144 | // Toggle all the bits in a region
145 | void bit_array_toggle_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len);
146 | 
147 | //
148 | // Set, clear and toggle all bits at once
149 | //
150 | 
151 | // Set all bits in this array to 1
152 | void bit_array_set_all(BIT_ARRAY* bitarr);
153 | 
154 | // Set all bits in this array to 0
155 | void bit_array_clear_all(BIT_ARRAY* bitarr);
156 | 
157 | // Set all 1 bits to 0, and all 0 bits to 1
158 | void bit_array_toggle_all(BIT_ARRAY* bitarr);
159 | 
160 | //
161 | // Get / set a word of a given size
162 | //
163 | 
164 | // First bit is in the least significant bit position
165 | // start index must be within the range of the bit array (0 <= x < length)
166 | uint64_t bit_array_get_word64(const BIT_ARRAY* bitarr, bit_index_t start);
167 | uint32_t bit_array_get_word32(const BIT_ARRAY* bitarr, bit_index_t start);
168 | uint16_t bit_array_get_word16(const BIT_ARRAY* bitarr, bit_index_t start);
169 | uint8_t  bit_array_get_word8(const BIT_ARRAY* bitarr, bit_index_t start);
170 | uint64_t bit_array_get_wordn(const BIT_ARRAY* bitarr, bit_index_t start, int n);
171 | 
172 | // Set 64 bits at once from a particular start position
173 | void bit_array_set_word64(BIT_ARRAY* bitarr, bit_index_t start, uint64_t word);
174 | void bit_array_set_word32(BIT_ARRAY* bitarr, bit_index_t start, uint32_t word);
175 | void bit_array_set_word16(BIT_ARRAY* bitarr, bit_index_t start, uint16_t word);
176 | void bit_array_set_word8(BIT_ARRAY* bitarr, bit_index_t start, uint8_t byte);
177 | void bit_array_set_wordn(BIT_ARRAY* bitarr, bit_index_t start, uint64_t word, int n);
178 | 
179 | //
180 | // Number of bits set
181 | //
182 | 
183 | // Get the number of bits set (hamming weight)
184 | bit_index_t bit_array_num_bits_set(const BIT_ARRAY* bitarr);
185 | 
186 | // Get the number of bits not set (length - hamming weight)
187 | bit_index_t bit_array_num_bits_cleared(const BIT_ARRAY* bitarr);
188 | 
189 | // Get the number of bits set in one array and not the other.  This is equivalent
190 | // to hamming weight of the XOR when the two arrays are the same length.
191 | // e.g. 10101 vs 00111 => hamming distance 2 (XOR is 10010)
192 | bit_index_t bit_array_hamming_distance(const BIT_ARRAY* arr1,
193 |                                        const BIT_ARRAY* arr2);
194 | 
195 | // Parity - returns 1 if odd number of bits set, 0 if even
196 | char bit_array_parity(const BIT_ARRAY* bitarr);
197 | 
198 | //
199 | // Find indices of set/clear bits
200 | //
201 | 
202 | // Find the index of the next bit that is set, at or after `offset`
203 | // Returns 1 if a bit is set, otherwise 0
204 | // Index of next set bit is stored in the integer pointed to by result
205 | // If no next bit is set result is not changed
206 | char bit_array_find_next_set_bit(const BIT_ARRAY* bitarr, bit_index_t offset,
207 |                                  bit_index_t* result);
208 | 
209 | // Find the index of the next bit that is NOT set, at or after `offset`
210 | // Returns 1 if a bit is NOT set, otherwise 0
211 | // Index of next zero bit is stored in the integer pointed to by `result`
212 | // If no next bit is zero, value at `result` is not changed
213 | char bit_array_find_next_clear_bit(const BIT_ARRAY* bitarr, bit_index_t offset,
214 |                                  bit_index_t* result);
215 | 
216 | // Find the index of the previous bit that is set, before offset.
217 | // Returns 1 if a bit is set, otherwise 0
218 | // Index of previous set bit is stored in the integer pointed to by `result`
219 | // If no previous bit is set result is not changed
220 | char bit_array_find_prev_set_bit(const BIT_ARRAY* bitarr, bit_index_t offset,
221 |                                  bit_index_t* result);
222 | 
223 | // Find the index of the previous bit that is NOT set, before offset.
224 | // Returns 1 if a bit is clear, otherwise 0
225 | // Index of previous zero bit is stored in the integer pointed to by `result`
226 | // If no previous bit is zero result is not changed
227 | char bit_array_find_prev_clear_bit(const BIT_ARRAY* bitarr, bit_index_t offset,
228 |                                    bit_index_t* result);
229 | 
230 | // Find the index of the first bit that is set.
231 | // Returns 1 if a bit is set, otherwise 0
232 | // Index of first set bit is stored in the integer pointed to by `result`
233 | // If no bit is set result is not changed
234 | char bit_array_find_first_set_bit(const BIT_ARRAY* bitarr, bit_index_t* result);
235 | 
236 | // Find the index of the first bit that is NOT set.
237 | // Returns 1 if a bit is clear, otherwise 0
238 | // Index of first zero bit is stored in the integer pointed to by `result`
239 | // If no bit is zero result is not changed
240 | char bit_array_find_first_clear_bit(const BIT_ARRAY* bitarr, bit_index_t* result);
241 | 
242 | // Find the index of the last bit that is set.
243 | // Returns 1 if a bit is set, otherwise 0
244 | // Index of last set bit is stored in the integer pointed to by `result`
245 | // If no bit is set result is not changed
246 | char bit_array_find_last_set_bit(const BIT_ARRAY* bitarr, bit_index_t* result);
247 | 
248 | // Find the index of the last bit that is NOT set.
249 | // Returns 1 if a bit is clear, otherwise 0
250 | // Index of last zero bit is stored in the integer pointed to by `result`
251 | // If no bit is zero result is not changed
252 | char bit_array_find_last_clear_bit(const BIT_ARRAY* bitarr, bit_index_t* result);
253 | 
254 | 
255 | //
256 | // Sorting
257 | //
258 | 
259 | // Put all the 0s before all the 1s
260 | void bit_array_sort_bits(BIT_ARRAY* bitarr);
261 | 
262 | // Put all the 1s before all the 0s
263 | void bit_array_sort_bits_rev(BIT_ARRAY* bitarr);
264 | 
265 | 
266 | //
267 | // String and printing methods
268 | //
269 | 
270 | // Construct a BIT_ARRAY from a string.
271 | void bit_array_from_str(BIT_ARRAY* bitarr, const char* bitstr);
272 | 
273 | // Construct a BIT_ARRAY from a substring with given on and off characters.
274 | void bit_array_from_substr(BIT_ARRAY* bitarr, bit_index_t offset,
275 |                            const char* str, size_t len,
276 |                            const char *on, const char *off, char left_to_right);
277 | 
278 | // Takes a char array to write to.  `str` must be bitarr->num_of_bits+1 in
279 | // length. Terminates string with '\0'
280 | char* bit_array_to_str(const BIT_ARRAY* bitarr, char* str);
281 | char* bit_array_to_str_rev(const BIT_ARRAY* bitarr, char* str);
282 | 
283 | // Get a string representation for a given region, using given on/off
284 | // characters.
285 | // Note: does not null-terminate
286 | void bit_array_to_substr(const BIT_ARRAY* bitarr,
287 |                          bit_index_t start, bit_index_t length,
288 |                          char* str, char on, char off, char left_to_right);
289 | 
290 | // Print this array to a file stream.  Prints '0's and '1'.  Doesn't print
291 | // newline.
292 | void bit_array_print(const BIT_ARRAY* bitarr, FILE* fout);
293 | 
294 | // Print a string representations for a given region, using given on/off
295 | // characters. Reverse prints from highest to lowest -- this is useful for
296 | // printing binary numbers
297 | void bit_array_print_substr(const BIT_ARRAY* bitarr,
298 |                             bit_index_t start, bit_index_t length,
299 |                             FILE* fout, char on, char off, char left_to_right);
300 | 
301 | //
302 | // Decimal
303 | //
304 | 
305 | // Get bit array as decimal str (e.g. 0b1101 -> "13")
306 | size_t bit_array_to_decimal(const BIT_ARRAY *bitarr, char *str, size_t len);
307 | 
308 | // Return number of characters used
309 | size_t bit_array_from_decimal(BIT_ARRAY *bitarr, const char* decimal);
310 | 
311 | //
312 | // Hexadecimal
313 | //
314 | 
315 | // Loads array from hex string
316 | // Returns the number of bits loaded (will be chars rounded up to multiple of 8)
317 | // (0 on failure)
318 | bit_index_t bit_array_from_hex(BIT_ARRAY* bitarr, bit_index_t offset,
319 |                                const char* str, size_t len);
320 | 
321 | // Returns number of characters written
322 | size_t bit_array_to_hex(const BIT_ARRAY* bitarr,
323 |                         bit_index_t start, bit_index_t length,
324 |                         char* str, char uppercase);
325 | 
326 | // Print bit array as hex
327 | size_t bit_array_print_hex(const BIT_ARRAY* bitarr,
328 |                            bit_index_t start, bit_index_t length,
329 |                            FILE* fout, char uppercase);
330 | 
331 | //
332 | // Clone and copy
333 | //
334 | 
335 | // Copy a BIT_ARRAY struct and the data it holds - returns pointer to new object
336 | #define bit_array_dup	bit_array_clone
337 | BIT_ARRAY* bit_array_clone(const BIT_ARRAY* bitarr);
338 | 
339 | // Copy bits from one array to another
340 | // Note: use MACRO bit_array_copy
341 | // Destination and source can be the same bit_array and
342 | // src/dst regions can overlap
343 | void bit_array_copy(BIT_ARRAY* dst, bit_index_t dstindx,
344 |                     const BIT_ARRAY* src, bit_index_t srcindx,
345 |                     bit_index_t length);
346 | 
347 | // copy all of src to dst. dst is resized to match src.
348 | void bit_array_copy_all(BIT_ARRAY* dst, const BIT_ARRAY* src);
349 | 
350 | //
351 | // Logic operators
352 | //
353 | 
354 | // BIT_ARRAYs can all be different or the same object
355 | // dest array will be resized if it is too short
356 | //
357 | void bit_array_and(BIT_ARRAY* dest, const BIT_ARRAY* src1, const BIT_ARRAY* src2);
358 | void bit_array_or (BIT_ARRAY* dest, const BIT_ARRAY* src1, const BIT_ARRAY* src2);
359 | void bit_array_xor(BIT_ARRAY* dest, const BIT_ARRAY* src1, const BIT_ARRAY* src2);
360 | void bit_array_not(BIT_ARRAY* dest, const BIT_ARRAY* src);
361 | 
362 | //
363 | // Comparisons
364 | //
365 | 
366 | // Note: (bit_array_cmp(a,b) == 0) <=> (bit_array_cmp_big_endian(a,b) == 0)
367 | 
368 | // comparison functions return:
369 | //   1 iff bitarr1 > bitarr2
370 | //   0 iff bitarr1 == bitarr2
371 | //  -1 iff bitarr1 < bitarr2
372 | 
373 | // Compare two bit arrays by value stored, with index 0 being the Least
374 | // Significant Bit (LSB). Arrays do not have to be the same length.
375 | // Example: ..0101 (5) > ...0011 (3) [index 0 is LSB at right hand side]
376 | int bit_array_cmp(const BIT_ARRAY* bitarr1, const BIT_ARRAY* bitarr2);
377 | 
378 | // Compare two bit arrays by value stored, with index 0 being the Most
379 | // Significant Bit (MSB). Arrays do not have to be the same length.
380 | // Example: 10.. > 01.. [index 0 is MSB at left hand side]
381 | int bit_array_cmp_big_endian(const BIT_ARRAY* bitarr1, const BIT_ARRAY* bitarr2);
382 | 
383 | // compare bitarr with (bitarr2 << pos)
384 | int bit_array_cmp_words(const BIT_ARRAY *bitarr,
385 |                         bit_index_t pos, const BIT_ARRAY *bitarr2);
386 | 
387 | //
388 | // Shift, interleave, reverse
389 | //
390 | 
391 | // Shift array left/right.  If fill is zero, filled with 0, otherwise 1
392 | void bit_array_shift_right(BIT_ARRAY* bitarr, bit_index_t shift_dist, char fill);
393 | void bit_array_shift_left (BIT_ARRAY* bitarr, bit_index_t shift_dist, char fill);
394 | 
395 | // shift left without losing any bits. Resizes bitarr.
396 | void bit_array_shift_left_extend(BIT_ARRAY* bitarr, bit_index_t shift_dist,
397 |                                  char fill);
398 | 
399 | // Cyclic shift
400 | void bit_array_cycle_right(BIT_ARRAY* bitarr, bit_index_t dist);
401 | void bit_array_cycle_left (BIT_ARRAY* bitarr, bit_index_t dist);
402 | 
403 | // Interleave
404 | // dst cannot point to the same bit array as src1 or src2
405 | // src1, src2 may point to the same bit array
406 | // abcd 1234 -> a1b2c3d4
407 | // 0011 0000 -> 00001010
408 | // 1111 0000 -> 10101010
409 | // 0101 1010 -> 01100110
410 | // Extends dst if it is too short, but does not shrink it if it is too long
411 | // if dst is longer than length(src1)+length(src2), the end bits are not altered
412 | void bit_array_interleave(BIT_ARRAY* dst,
413 |                           const BIT_ARRAY* src1,
414 |                           const BIT_ARRAY* src2);
415 | 
416 | // Reverse the whole array or part of it
417 | void bit_array_reverse(BIT_ARRAY* bitarr);
418 | void bit_array_reverse_region(BIT_ARRAY* bitarr, bit_index_t start, bit_index_t len);
419 | 
420 | //
421 | // Numeric
422 | //
423 | 
424 | // Returns 1 on success, 0 if value in array is too big
425 | char bit_array_as_num(const BIT_ARRAY* bitarr, uint64_t* result);
426 | 
427 | // 1 iff bitarr > value
428 | // 0 iff bitarr == value
429 | // -1 iff bitarr < value
430 | int bit_array_cmp_uint64(const BIT_ARRAY* bitarr, uint64_t value);
431 | 
432 | //
433 | // Arithmetic
434 | //
435 | 
436 | // bitarr will be extended if needed
437 | void bit_array_add_uint64(BIT_ARRAY* bitarr, uint64_t value);
438 | 
439 | // Add `add` to `bitarr` at `pos` -- same as:
440 | //   bitarr + (add << pos)
441 | // where pos can be bigger than the length of the array (bitarr will be resized)
442 | void bit_array_add_word(BIT_ARRAY *bitarr, bit_index_t pos, uint64_t add);
443 | 
444 | // Add `add` to `bitarr` at `pos`
445 | void bit_array_add_words(BIT_ARRAY *bitarr, bit_index_t pos, const BIT_ARRAY *add);
446 | 
447 | // If value is greater than bitarr, bitarr is not changed and 0 is returned
448 | // Returns 1 on success, 0 if value > bitarr
449 | char bit_array_sub_uint64(BIT_ARRAY* bitarr, uint64_t value);
450 | 
451 | // Subtract `minus` from `bitarr` at `pos` -- same as:
452 | //   bitarr - (minus << pos)
453 | // Returns 1 on success, 0 if value > bitarr
454 | char bit_array_sub_word(BIT_ARRAY *bitarr, bit_index_t pos, word_t minus);
455 | 
456 | // Subtract `minus` from `bitarr` at `pos`
457 | // Returns 1 on success, 0 if value > bitarr
458 | char bit_array_sub_words(BIT_ARRAY* bitarr, bit_index_t pos, BIT_ARRAY* minus);
459 | 
460 | // Multiply by some value
461 | void bit_array_mul_uint64(BIT_ARRAY *bitarr, uint64_t multiplier);
462 | 
463 | // bitarr = round_down(bitarr / divisor)
464 | // rem = bitarr % divisor
465 | void bit_array_div_uint64(BIT_ARRAY *bitarr, uint64_t divisor, uint64_t *rem);
466 | 
467 | //
468 | // Arithmetic between arrays
469 | //
470 | 
471 | // dst = src1 + src2
472 | // src1, src2 and dst can all be the same BIT_ARRAY
473 | // If dst is shorter than either of src1, src2, it is enlarged
474 | void bit_array_add(BIT_ARRAY* dst, const BIT_ARRAY* src1, const BIT_ARRAY* src2);
475 | 
476 | // dst = src1 - src2
477 | // src1, src2 and dst can all be the same BIT_ARRAY
478 | // If dst is shorter than src1, it will be extended to be as long as src1
479 | // src1 must be greater than or equal to src2 (src1 >= src2)
480 | void bit_array_subtract(BIT_ARRAY* dst,
481 |                         const BIT_ARRAY* src1, const BIT_ARRAY* src2);
482 | 
483 | // dst = src1 * src2
484 | // Pointers cannot all point to the same BIT_ARRAY
485 | void bit_array_multiply(BIT_ARRAY *dst, BIT_ARRAY *src1, BIT_ARRAY *src2);
486 | 
487 | // Results in:
488 | //   quotient = dividend / divisor
489 | //   dividend = dividend % divisor
490 | // (dividend is used to return the remainder)
491 | void bit_array_divide(BIT_ARRAY *dividend, BIT_ARRAY *quotient, BIT_ARRAY *divisor);
492 | 
493 | //
494 | // Read/Write bit_array to a file
495 | //
496 | // File format is [8 bytes: for number of elements in array][data]
497 | // Number of bytes of data is: (int)((num_of_bits + 7) / 8)
498 | //
499 | 
500 | // Saves bit array to a file
501 | // returns the number of bytes written
502 | bit_index_t bit_array_save(const BIT_ARRAY* bitarr, FILE* f);
503 | 
504 | // Reads bit array from a file. bitarr is resized and filled.
505 | // Returns 1 on success, 0 on failure
506 | char bit_array_load(BIT_ARRAY* bitarr, FILE* f);
507 | 
508 | char bit_array_load_mm(BIT_ARRAY * bitarr, unsigned char * mm, uint64_t offset);
509 | 
510 | 
511 | //
512 | // Hash function
513 | //
514 | 
515 | // Pass seed as 0 on first call, pass previous hash value if rehashing due
516 | // to a collision
517 | // Using bob jenkins hash lookup3
518 | uint64_t bit_array_hash(const BIT_ARRAY* bitarr, uint64_t seed);
519 | 
520 | //
521 | // Randomness
522 | //
523 | 
524 | // Set bits randomly with probability prob : 0 <= prob <= 1
525 | void bit_array_random(BIT_ARRAY* bitarr, float prob);
526 | 
527 | // Shuffle the bits in an array randomly
528 | void bit_array_shuffle(BIT_ARRAY* bitarr);
529 | 
530 | // Get the next permutation of an array with a fixed size and given number of
531 | // bits set.  Also known as next lexicographic permutation.
532 | // Given a bit array find the next lexicographic organisation of the bits
533 | // Number of possible combinations given by (size choose bits_set) i.e. nCk
534 | // 00011 -> 00101 -> 00110 -> 01001 -> 01010 ->
535 | // 01100 -> 10001 -> 10010 -> 10100 -> 11000 -> 00011 (back to start)
536 | void bit_array_next_permutation(BIT_ARRAY* bitarr);
537 | 
538 | //
539 | // Generally useful functions
540 | //
541 | 
542 | // Generalised 'binary to string' function
543 | // Adds bits to the string in order of lsb to msb
544 | // e.g. 0b11010 (26 in decimal) would come out as "01011"
545 | char* bit_array_word2str(const void *ptr, size_t num_of_bits, char *str);
546 | 
547 | // Same as above but in reverse
548 | char* bit_array_word2str_rev(const void *ptr, size_t num_of_bits, char *str);
549 | 
550 | #ifdef __cplusplus
551 | }
552 | #endif
553 | 
554 | #endif
555 | 


--------------------------------------------------------------------------------
/khash.h:
--------------------------------------------------------------------------------
  1 | /* The MIT License
  2 | 
  3 |    Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>
  4 | 
  5 |    Permission is hereby granted, free of charge, to any person obtaining
  6 |    a copy of this software and associated documentation files (the
  7 |    "Software"), to deal in the Software without restriction, including
  8 |    without limitation the rights to use, copy, modify, merge, publish,
  9 |    distribute, sublicense, and/or sell copies of the Software, and to
 10 |    permit persons to whom the Software is furnished to do so, subject to
 11 |    the following conditions:
 12 | 
 13 |    The above copyright notice and this permission notice shall be
 14 |    included in all copies or substantial portions of the Software.
 15 | 
 16 |    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 17 |    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 18 |    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 19 |    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 20 |    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 21 |    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 22 |    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 |    SOFTWARE.
 24 | */
 25 | 
 26 | /*
 27 |   An example:
 28 | 
 29 | #include "khash.h"
 30 | KHASH_MAP_INIT_INT(32, char)
 31 | int main() {
 32 | 	int ret, is_missing;
 33 | 	khiter_t k;
 34 | 	khash_t(32) *h = kh_init(32);
 35 | 	k = kh_put(32, h, 5, &ret);
 36 | 	kh_value(h, k) = 10;
 37 | 	k = kh_get(32, h, 10);
 38 | 	is_missing = (k == kh_end(h));
 39 | 	k = kh_get(32, h, 5);
 40 | 	kh_del(32, h, k);
 41 | 	for (k = kh_begin(h); k != kh_end(h); ++k)
 42 | 		if (kh_exist(h, k)) kh_value(h, k) = 1;
 43 | 	kh_destroy(32, h);
 44 | 	return 0;
 45 | }
 46 | */
 47 | 
 48 | /*
 49 |   2011-12-29 (0.2.7):
 50 | 
 51 |     * Minor code clean up; no actual effect.
 52 | 
 53 |   2011-09-16 (0.2.6):
 54 | 
 55 | 	* The capacity is a power of 2. This seems to dramatically improve the
 56 | 	  speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
 57 | 
 58 | 	   - http://code.google.com/p/ulib/
 59 | 	   - http://nothings.org/computer/judy/
 60 | 
 61 | 	* Allow to optionally use linear probing which usually has better
 62 | 	  performance for random input. Double hashing is still the default as it
 63 | 	  is more robust to certain non-random input.
 64 | 
 65 | 	* Added Wang's integer hash function (not used by default). This hash
 66 | 	  function is more robust to certain non-random input.
 67 | 
 68 |   2011-02-14 (0.2.5):
 69 | 
 70 |     * Allow to declare global functions.
 71 | 
 72 |   2009-09-26 (0.2.4):
 73 | 
 74 |     * Improve portability
 75 | 
 76 |   2008-09-19 (0.2.3):
 77 | 
 78 | 	* Corrected the example
 79 | 	* Improved interfaces
 80 | 
 81 |   2008-09-11 (0.2.2):
 82 | 
 83 | 	* Improved speed a little in kh_put()
 84 | 
 85 |   2008-09-10 (0.2.1):
 86 | 
 87 | 	* Added kh_clear()
 88 | 	* Fixed a compiling error
 89 | 
 90 |   2008-09-02 (0.2.0):
 91 | 
 92 | 	* Changed to token concatenation which increases flexibility.
 93 | 
 94 |   2008-08-31 (0.1.2):
 95 | 
 96 | 	* Fixed a bug in kh_get(), which has not been tested previously.
 97 | 
 98 |   2008-08-31 (0.1.1):
 99 | 
100 | 	* Added destructor
101 | */
102 | 
103 | 
104 | #ifndef __AC_KHASH_H
105 | #define __AC_KHASH_H
106 | 
107 | /*!
108 |   @header
109 | 
110 |   Generic hash table library.
111 |  */
112 | 
113 | #define AC_VERSION_KHASH_H "0.2.6"
114 | 
115 | #include <stdlib.h>
116 | #include <string.h>
117 | #include <limits.h>
118 | 
119 | #ifdef USE_MALLOC_WRAPPERS
120 | #  include "malloc_wrap.h"
121 | #endif
122 | 
123 | /* compiler specific configuration */
124 | 
125 | #if UINT_MAX == 0xffffffffu
126 | typedef unsigned int khint32_t;
127 | #elif ULONG_MAX == 0xffffffffu
128 | typedef unsigned long khint32_t;
129 | #endif
130 | 
131 | #if ULONG_MAX == ULLONG_MAX
132 | typedef unsigned long khint64_t;
133 | #else
134 | typedef unsigned long long khint64_t;
135 | #endif
136 | 
137 | #ifdef _MSC_VER
138 | #define kh_inline __inline
139 | #else
140 | #define kh_inline inline
141 | #endif
142 | 
143 | typedef khint32_t khint_t;
144 | typedef khint_t khiter_t;
145 | 
146 | #define __ac_isempty(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&2) /* 2 flag bits per bucket, 16 buckets per 32-bit word; bit 1 = empty */
147 | #define __ac_isdel(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&1) /* bit 0 = deleted */
148 | #define __ac_iseither(flag, i) (((flag)[(i)>>4]>>(((i)&0xfU)<<1))&3) /* non-zero if empty or deleted */
149 | #define __ac_set_isdel_false(flag, i) ((flag)[(i)>>4]&=~(1ul<<(((i)&0xfU)<<1))) /* clear the deleted bit */
150 | #define __ac_set_isempty_false(flag, i) ((flag)[(i)>>4]&=~(2ul<<(((i)&0xfU)<<1))) /* clear the empty bit */
151 | #define __ac_set_isboth_false(flag, i) ((flag)[(i)>>4]&=~(3ul<<(((i)&0xfU)<<1))) /* clear both bits: bucket holds live data */
152 | #define __ac_set_isdel_true(flag, i) ((flag)[(i)>>4]|=1ul<<(((i)&0xfU)<<1)) /* set the deleted bit */
153 | 
154 | #ifdef KHASH_LINEAR
155 | #define __ac_inc(k, m) 1 /* linear probing: constant step of one bucket */
156 | #else
157 | #define __ac_inc(k, m) ((((k)>>3 ^ (k)<<3) | 1) & (m)) /* double hashing: odd step derived from hash k, masked by m; fully parenthesized so it is safe inside larger expressions */
158 | #endif
159 | 
160 | #define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
161 | 
162 | #ifndef kroundup32
163 | #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
164 | #endif
165 | 
166 | #ifndef kcalloc
167 | #define kcalloc(N,Z) calloc(N,Z)
168 | #endif
169 | #ifndef kmalloc
170 | #define kmalloc(Z) malloc(Z)
171 | #endif
172 | #ifndef krealloc
173 | #define krealloc(P,Z) realloc(P,Z)
174 | #endif
175 | #ifndef kfree
176 | #define kfree(P) free(P)
177 | #endif
178 | 
179 | static const double __ac_HASH_UPPER = 0.77;
180 | 
181 | #define __KHASH_TYPE(name, khkey_t, khval_t) \
182 | 	typedef struct { \
183 | 		khint_t n_buckets, size, n_occupied, upper_bound; \
184 | 		khint32_t *flags; \
185 | 		khkey_t *keys; \
186 | 		khval_t *vals; \
187 | 	} kh_##name##_t;
188 | 
189 | #define __KHASH_PROTOTYPES(name, khkey_t, khval_t)	 					\
190 | 	extern kh_##name##_t *kh_init_##name(void);							\
191 | 	extern void kh_destroy_##name(kh_##name##_t *h);					\
192 | 	extern void kh_clear_##name(kh_##name##_t *h);						\
193 | 	extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); 	\
194 | 	extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
195 | 	extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
196 | 	extern void kh_del_##name(kh_##name##_t *h, khint_t x);
197 | 
198 | #define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
199 | 	SCOPE kh_##name##_t *kh_init_##name(void) {							\
200 | 		return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t));		\
201 | 	}																	\
202 | 	SCOPE void kh_destroy_##name(kh_##name##_t *h)						\
203 | 	{																	\
204 | 		if (h) {														\
205 | 			kfree((void *)h->keys); kfree(h->flags);					\
206 | 			kfree((void *)h->vals);										\
207 | 			kfree(h);													\
208 | 		}																\
209 | 	}																	\
210 | 	SCOPE void kh_clear_##name(kh_##name##_t *h)						\
211 | 	{																	\
212 | 		if (h && h->flags) {											\
213 | 			memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); /* 0xaa marks every bucket empty */ \
214 | 			h->size = h->n_occupied = 0;								\
215 | 		}																\
216 | 	}																	\
217 | 	SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) 	\
218 | 	{																	\
219 | 		if (h->n_buckets) {												\
220 | 			khint_t inc, k, i, last, mask;								\
221 | 			mask = h->n_buckets - 1;									\
222 | 			k = __hash_func(key); i = k & mask;							\
223 | 			inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \
224 | 			while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
225 | 				i = (i + inc) & mask; 									\
226 | 				if (i == last) return h->n_buckets;						\
227 | 			}															\
228 | 			return __ac_iseither(h->flags, i)? h->n_buckets : i;		\
229 | 		} else return 0; /* no buckets allocated: 0 equals n_buckets, i.e. "not found" */ \
230 | 	}																	\
231 | 	SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
232 | 	{ /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
233 | 		khint32_t *new_flags = 0;										\
234 | 		khint_t j = 1;													\
235 | 		{																\
236 | 			kroundup32(new_n_buckets); 									\
237 | 			if (new_n_buckets < 4) new_n_buckets = 4;					\
238 | 			if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0;	/* requested size is too small */ \
239 | 			else { /* hash table size to be changed (shrink or expand); rehash */ \
240 | 				new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t));	\
241 | 				if (!new_flags) return -1;								\
242 | 				memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
243 | 				if (h->n_buckets < new_n_buckets) {	/* expand */		\
244 | 					khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
245 | 					if (!new_keys) { kfree(new_flags); return -1; } /* do not leak the new flag array */ \
246 | 					h->keys = new_keys;									\
247 | 					if (kh_is_map) {									\
248 | 						khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
249 | 						if (!new_vals) { kfree(new_flags); return -1; } /* do not leak the new flag array */ \
250 | 						h->vals = new_vals;								\
251 | 					}													\
252 | 				} /* otherwise shrink */								\
253 | 			}															\
254 | 		}																\
255 | 		if (j) { /* rehashing is needed */								\
256 | 			for (j = 0; j != h->n_buckets; ++j) {						\
257 | 				if (__ac_iseither(h->flags, j) == 0) {					\
258 | 					khkey_t key = h->keys[j];							\
259 | 					khval_t val;										\
260 | 					khint_t new_mask;									\
261 | 					new_mask = new_n_buckets - 1; 						\
262 | 					if (kh_is_map) val = h->vals[j];					\
263 | 					__ac_set_isdel_true(h->flags, j);					\
264 | 					while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
265 | 						khint_t inc, k, i;								\
266 | 						k = __hash_func(key);							\
267 | 						i = k & new_mask;								\
268 | 						inc = __ac_inc(k, new_mask);					\
269 | 						while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \
270 | 						__ac_set_isempty_false(new_flags, i);			\
271 | 						if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
272 | 							{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
273 | 							if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
274 | 							__ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
275 | 						} else { /* write the element and jump out of the loop */ \
276 | 							h->keys[i] = key;							\
277 | 							if (kh_is_map) h->vals[i] = val;			\
278 | 							break;										\
279 | 						}												\
280 | 					}													\
281 | 				}														\
282 | 			}															\
283 | 			if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
284 | 				h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
285 | 				if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
286 | 			}															\
287 | 			kfree(h->flags); /* free the working space */				\
288 | 			h->flags = new_flags;										\
289 | 			h->n_buckets = new_n_buckets;								\
290 | 			h->n_occupied = h->size;									\
291 | 			h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
292 | 		}																\
293 | 		return 0;														\
294 | 	}																	\
295 | 	SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
296 | 	{																	\
297 | 		khint_t x;														\
298 | 		if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
299 | 			if (h->n_buckets > (h->size<<1)) {							\
300 | 				if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
301 | 					*ret = -1; return h->n_buckets;						\
302 | 				}														\
303 | 			} else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
304 | 				*ret = -1; return h->n_buckets;							\
305 | 			}															\
306 | 		} /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
307 | 		{																\
308 | 			khint_t inc, k, i, site, last, mask = h->n_buckets - 1;		\
309 | 			x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
310 | 			if (__ac_isempty(h->flags, i)) x = i; /* for speed up */	\
311 | 			else {														\
312 | 				inc = __ac_inc(k, mask); last = i;						\
313 | 				while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
314 | 					if (__ac_isdel(h->flags, i)) site = i;				\
315 | 					i = (i + inc) & mask; 								\
316 | 					if (i == last) { x = site; break; }					\
317 | 				}														\
318 | 				if (x == h->n_buckets) {								\
319 | 					if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
320 | 					else x = i;											\
321 | 				}														\
322 | 			}															\
323 | 		}																\
324 | 		if (__ac_isempty(h->flags, x)) { /* not present at all */		\
325 | 			h->keys[x] = key;											\
326 | 			__ac_set_isboth_false(h->flags, x);							\
327 | 			++h->size; ++h->n_occupied;									\
328 | 			*ret = 1;													\
329 | 		} else if (__ac_isdel(h->flags, x)) { /* deleted */				\
330 | 			h->keys[x] = key;											\
331 | 			__ac_set_isboth_false(h->flags, x);							\
332 | 			++h->size;													\
333 | 			*ret = 2;													\
334 | 		} else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
335 | 		return x;														\
336 | 	}																	\
337 | 	SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x)				\
338 | 	{																	\
339 | 		if (x != h->n_buckets && !__ac_iseither(h->flags, x)) {			\
340 | 			__ac_set_isdel_true(h->flags, x);							\
341 | 			--h->size;													\
342 | 		}																\
343 | 	}
344 | 
345 | #define KHASH_DECLARE(name, khkey_t, khval_t)		 					\
346 | 	__KHASH_TYPE(name, khkey_t, khval_t) 								\
347 | 	__KHASH_PROTOTYPES(name, khkey_t, khval_t)
348 | 
349 | #define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
350 | 	__KHASH_TYPE(name, khkey_t, khval_t) 								\
351 | 	__KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
352 | 
353 | #define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
354 | 	KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
355 | 
356 | /* --- BEGIN OF HASH FUNCTIONS --- */
357 | 
358 | /*! @function
359 |   @abstract     Integer hash function
360 |   @param  key   The integer [khint32_t]
361 |   @return       The hash value [khint_t]
362 |  */
363 | #define kh_int_hash_func(key) (khint32_t)(key)
364 | /*! @function
365 |   @abstract     Integer comparison function
366 |  */
367 | #define kh_int_hash_equal(a, b) ((a) == (b))
368 | /*! @function
369 |   @abstract     64-bit integer hash function
370 |   @param  key   The integer [khint64_t]
371 |   @return       The hash value [khint_t]
372 |  */
373 | #define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
374 | /*! @function
375 |   @abstract     64-bit integer comparison function
376 |  */
377 | #define kh_int64_hash_equal(a, b) ((a) == (b))
378 | /*! @function
379 |   @abstract     const char* hash function
380 |   @param  s     Pointer to a null terminated string
381 |   @return       The hash value
382 |  */
383 | static kh_inline khint_t __ac_X31_hash_string(const char *s)
384 | {
385 | 	khint_t hash = (khint_t)*s; /* seed with the first character; an empty string hashes to 0 */
386 | 	if (hash != 0) { const char *p; for (p = s + 1; *p != '\0'; ++p) hash = hash * 31u + (khint_t)*p; } /* X31: h = 31*h + c */
387 | 	return hash;
388 | }
389 | /*! @function
390 |   @abstract     Another interface to const char* hash function
391 |   @param  key   Pointer to a null terminated string [const char*]
392 |   @return       The hash value [khint_t]
393 |  */
394 | #define kh_str_hash_func(key) __ac_X31_hash_string(key)
395 | /*! @function
396 |   @abstract     Const char* comparison function
397 |  */
398 | #define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
399 | 
400 | static kh_inline khint_t __ac_Wang_hash(khint_t key)
401 | {
402 |     khint_t h = key + ~(key << 15); /* integer mixing rounds, applied in a fixed order */
403 |     h = h ^ (h >> 10);
404 |     h = h + (h << 3);
405 |     h = h ^ (h >> 6);
406 |     h = h + ~(h << 11);
407 |     h = h ^ (h >> 16);
408 |     return h;
409 | }
410 | #define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)(key)) /* alternative integer hash: applies __ac_Wang_hash to the macro argument */
411 | 
412 | /* --- END OF HASH FUNCTIONS --- */
413 | 
414 | /* Other convenient macros... */
415 | 
416 | /*!
417 |   @abstract Type of the hash table.
418 |   @param  name  Name of the hash table [symbol]
419 |  */
420 | #define khash_t(name) kh_##name##_t
421 | 
422 | /*! @function
423 |   @abstract     Initiate a hash table.
424 |   @param  name  Name of the hash table [symbol]
425 |   @return       Pointer to the hash table [khash_t(name)*]
426 |  */
427 | #define kh_init(name) kh_init_##name()
428 | 
429 | /*! @function
430 |   @abstract     Destroy a hash table.
431 |   @param  name  Name of the hash table [symbol]
432 |   @param  h     Pointer to the hash table [khash_t(name)*]
433 |  */
434 | #define kh_destroy(name, h) kh_destroy_##name(h)
435 | 
436 | /*! @function
437 |   @abstract     Reset a hash table without deallocating memory.
438 |   @param  name  Name of the hash table [symbol]
439 |   @param  h     Pointer to the hash table [khash_t(name)*]
440 |  */
441 | #define kh_clear(name, h) kh_clear_##name(h)
442 | 
443 | /*! @function
444 |   @abstract     Resize a hash table.
445 |   @param  name  Name of the hash table [symbol]
446 |   @param  h     Pointer to the hash table [khash_t(name)*]
447 |   @param  s     New size [khint_t]
448 |  */
449 | #define kh_resize(name, h, s) kh_resize_##name(h, s)
450 | 
451 | /*! @function
452 |   @abstract     Insert a key to the hash table.
453 |   @param  name  Name of the hash table [symbol]
454 |   @param  h     Pointer to the hash table [khash_t(name)*]
455 |   @param  k     Key [type of keys]
456 |   @param  r     Extra return code: 0 if the key is present in the hash table;
457 |                 1 if the bucket is empty (never used); 2 if the element in
458 | 				the bucket has been deleted [int*]
459 |   @return       Iterator to the inserted element [khint_t]
460 |  */
461 | #define kh_put(name, h, k, r) kh_put_##name(h, k, r)
462 | 
463 | /*! @function
464 |   @abstract     Retrieve a key from the hash table.
465 |   @param  name  Name of the hash table [symbol]
466 |   @param  h     Pointer to the hash table [khash_t(name)*]
467 |   @param  k     Key [type of keys]
468 |   @return       Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
469 |  */
470 | #define kh_get(name, h, k) kh_get_##name(h, k)
471 | 
472 | /*! @function
473 |   @abstract     Remove a key from the hash table.
474 |   @param  name  Name of the hash table [symbol]
475 |   @param  h     Pointer to the hash table [khash_t(name)*]
476 |   @param  k     Iterator to the element to be deleted [khint_t]
477 |  */
478 | #define kh_del(name, h, k) kh_del_##name(h, k)
479 | 
480 | /*! @function
481 |   @abstract     Test whether a bucket contains data.
482 |   @param  h     Pointer to the hash table [khash_t(name)*]
483 |   @param  x     Iterator to the bucket [khint_t]
484 |   @return       1 if the bucket contains data; 0 otherwise [int]
485 |  */
486 | #define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
487 | 
488 | /*! @function
489 |   @abstract     Get the key stored in the bucket an iterator refers to
490 |   @param  h     Pointer to the hash table [khash_t(name)*]
491 |   @param  x     Iterator to the bucket [khint_t]
492 |   @return       Key [type of keys]
493 |  */
494 | #define kh_key(h, x) ((h)->keys[x])
495 | 
496 | /*! @function
497 |   @abstract     Get the value stored in the bucket an iterator refers to
498 |   @param  h     Pointer to the hash table [khash_t(name)*]
499 |   @param  x     Iterator to the bucket [khint_t]
500 |   @return       Value [type of values]
501 |   @discussion   For hash sets, calling this results in a segfault.
502 |  */
503 | #define kh_val(h, x) ((h)->vals[x])
504 | 
505 | /*! @function
506 |   @abstract     Alias of kh_val(); expands to the same expression
507 |  */
508 | #define kh_value(h, x) ((h)->vals[x])
509 | 
510 | /*! @function
511 |   @abstract     Get the start iterator
512 |   @param  h     Pointer to the hash table [khash_t(name)*]
513 |   @return       The start iterator, always 0 [khint_t]
514 |  */
515 | #define kh_begin(h) (khint_t)(0)
516 | 
517 | /*! @function
518 |   @abstract     Get the end iterator
519 |   @param  h     Pointer to the hash table [khash_t(name)*]
520 |   @return       The end iterator, i.e. the number of buckets [khint_t]
521 |  */
522 | #define kh_end(h) ((h)->n_buckets)
523 | 
524 | /*! @function
525 |   @abstract     Get the number of elements in the hash table
526 |   @param  h     Pointer to the hash table [khash_t(name)*]
527 |   @return       Number of elements in the hash table [khint_t]
528 |  */
529 | #define kh_size(h) ((h)->size)
530 | 
531 | /*! @function
532 |   @abstract     Get the number of buckets in the hash table
533 |   @param  h     Pointer to the hash table [khash_t(name)*]
534 |   @return       Number of buckets in the hash table [khint_t]
535 |  */
536 | #define kh_n_buckets(h) ((h)->n_buckets)
537 | 
538 | /*! @function
539 |   @abstract     Iterate over the existing entries in the hash table
540 |   @param  h     Pointer to the hash table [khash_t(name)*]
541 |   @param  kvar  Variable to which each key is assigned before 'code' runs
542 |   @param  vvar  Variable to which each value is assigned before 'code' runs
543 |   @param  code  Block of code to execute once per existing entry
544 |  */
545 | #define kh_foreach(h, kvar, vvar, code) { khint_t __i;		\
546 | 	for (__i = kh_begin(h); __i != kh_end(h); ++__i) {		\
547 | 		if (!kh_exist(h,__i)) continue;						\
548 | 		(kvar) = kh_key(h,__i);								\
549 | 		(vvar) = kh_val(h,__i);								\
550 | 		code;												\
551 | 	} }
552 | 
553 | /*! @function
554 |   @abstract     Iterate over the values of the existing entries in the hash table
555 |   @param  h     Pointer to the hash table [khash_t(name)*]
556 |   @param  vvar  Variable to which each value is assigned before 'code' runs
557 |   @param  code  Block of code to execute once per existing entry
558 |  */
559 | #define kh_foreach_value(h, vvar, code) { khint_t __i;		\
560 | 	for (__i = kh_begin(h); __i != kh_end(h); ++__i) {		\
561 | 		if (!kh_exist(h,__i)) continue;						\
562 | 		(vvar) = kh_val(h,__i);								\
563 | 		code;												\
564 | 	} }
565 | 
566 | /* More convenient interfaces */
567 | 
568 | /*! @function
569 |   @abstract     Instantiate a hash set containing 32-bit integer keys
570 |   @param  name  Name of the hash table [symbol]
571 |  */
572 | #define KHASH_SET_INIT_INT(name)										\
573 | 	KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
574 | 
575 | /*! @function
576 |   @abstract     Instantiate a hash map containing 32-bit integer keys
577 |   @param  name  Name of the hash table [symbol]
578 |   @param  khval_t  Type of values [type]
579 |  */
580 | #define KHASH_MAP_INIT_INT(name, khval_t)								\
581 | 	KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
582 | 
583 | /*! @function
584 |   @abstract     Instantiate a hash set containing 64-bit integer keys
585 |   @param  name  Name of the hash table [symbol]
586 |  */
587 | #define KHASH_SET_INIT_INT64(name)										\
588 | 	KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
589 | 
590 | /*! @function
591 |   @abstract     Instantiate a hash map containing 64-bit integer keys
592 |   @param  name  Name of the hash table [symbol]
593 |   @param  khval_t  Type of values [type]
594 |  */
595 | #define KHASH_MAP_INIT_INT64(name, khval_t)								\
596 | 	KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
597 | 
598 | typedef const char *kh_cstr_t;
599 | /*! @function
600 |   @abstract     Instantiate a hash set containing const char* keys
601 |   @param  name  Name of the hash table [symbol]
602 |  */
603 | #define KHASH_SET_INIT_STR(name)										\
604 | 	KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
605 | 
606 | /*! @function
607 |   @abstract     Instantiate a hash map containing const char* keys
608 |   @param  name  Name of the hash table [symbol]
609 |   @param  khval_t  Type of values [type]
610 |  */
611 | #define KHASH_MAP_INIT_STR(name, khval_t)								\
612 | 	KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
613 | 
614 | #endif /* __AC_KHASH_H */
615 | 


--------------------------------------------------------------------------------
/sds.c:
--------------------------------------------------------------------------------
  1 | /* SDS (Simple Dynamic Strings), A C dynamic strings library.
  2 |  *
  3 |  * Copyright (c) 2006-2014, Salvatore Sanfilippo <antirez at gmail dot com>
  4 |  * All rights reserved.
  5 |  *
  6 |  * Redistribution and use in source and binary forms, with or without
  7 |  * modification, are permitted provided that the following conditions are met:
  8 |  *
  9 |  *   * Redistributions of source code must retain the above copyright notice,
 10 |  *     this list of conditions and the following disclaimer.
 11 |  *   * Redistributions in binary form must reproduce the above copyright
 12 |  *     notice, this list of conditions and the following disclaimer in the
 13 |  *     documentation and/or other materials provided with the distribution.
 14 |  *   * Neither the name of Redis nor the names of its contributors may be used
 15 |  *     to endorse or promote products derived from this software without
 16 |  *     specific prior written permission.
 17 |  *
 18 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 19 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 20 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 21 |  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 22 |  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 23 |  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 24 |  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 25 |  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 26 |  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 27 |  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 28 |  * POSSIBILITY OF SUCH DAMAGE.
 29 |  */
 30 | 
 31 | #include <stdio.h>
 32 | #include <stdlib.h>
 33 | #include <string.h>
 34 | #include <ctype.h>
 35 | #include <assert.h>
 36 | 
 37 | #include "sds.h"
 38 | 
 39 | /* Create a new sds string with the content specified by the 'init' pointer
 40 |  * and 'initlen'.
 41 |  * If NULL is used for 'init' the string is initialized with zero bytes.
 42 |  *
 43 |  * The string is always null-termined (all the sds strings are, always) so
 44 |  * even if you create an sds string with:
 45 |  *
 46 |  * mystring = sdsnewlen("abc",3");
 47 |  *
 48 |  * You can print the string with printf() as there is an implicit \0 at the
 49 |  * end of the string. However the string is binary safe and can contain
 50 |  * \0 characters in the middle, as the length is stored in the sds header. */
 51 | sds sdsnewlen(const void *init, size_t initlen) {
 52 |     struct sdshdr *sh;
 53 | 
 54 |     if (init) {
 55 |         sh = malloc(sizeof *sh+initlen+1);
 56 |     } else {
 57 |         sh = calloc(sizeof *sh+initlen+1,1);
 58 |     }
 59 |     if (sh == NULL) return NULL;
 60 |     sh->len = initlen;
 61 |     sh->free = 0;
 62 |     if (initlen && init)
 63 |         memcpy(sh->buf, init, initlen);
 64 |     sh->buf[initlen] = '\0';
 65 |     return (char*)sh->buf;
 66 | }
 67 | 
 68 | /* Create an empty (zero length) sds string. Even in this case the string
 69 |  * always has an implicit null term. */
 70 | sds sdsempty(void) {
 71 |     return sdsnewlen("",0);
 72 | }
 73 | 
 74 | /* Create a new sds string starting from a null termined C string. */
 75 | sds sdsnew(const char *init) {
 76 |     size_t initlen = (init == NULL) ? 0 : strlen(init);
 77 |     return sdsnewlen(init, initlen);
 78 | }
 79 | 
 80 | /* Duplicate an sds string. */
 81 | sds sdsdup(const sds s) {
 82 |     return sdsnewlen(s, sdslen(s));
 83 | }
 84 | 
 85 | /* Free an sds string. No operation is performed if 's' is NULL. */
 86 | void sdsfree(sds s) {
 87 |     if (s == NULL) return;
 88 |     free(s-sizeof(struct sdshdr));
 89 | }
 90 | 
 91 | /* Set the sds string length to the length as obtained with strlen(), so
 92 |  * considering as content only up to the first null term character.
 93 |  *
 94 |  * This function is useful when the sds string is hacked manually in some
 95 |  * way, like in the following example:
 96 |  *
 97 |  * s = sdsnew("foobar");
 98 |  * s[2] = '\0';
 99 |  * sdsupdatelen(s);
100 |  * printf("%d\n", sdslen(s));
101 |  *
102 |  * The output will be "2", but if we comment out the call to sdsupdatelen()
103 |  * the output will be "6" as the string was modified but the logical length
104 |  * remains 6 bytes. */
105 | void sdsupdatelen(sds s) {
106 |     struct sdshdr *sh = (void*) (s-sizeof *sh);;
107 |     int reallen = strlen(s);
108 |     sh->free += (sh->len-reallen);
109 |     sh->len = reallen;
110 | }
111 | 
112 | /* Modify an sds string on-place to make it empty (zero length).
113 |  * However all the existing buffer is not discarded but set as free space
114 |  * so that next append operations will not require allocations up to the
115 |  * number of bytes previously available. */
116 | void sdsclear(sds s) {
117 |     struct sdshdr *sh = (void*) (s-sizeof *sh);;
118 |     sh->free += sh->len;
119 |     sh->len = 0;
120 |     sh->buf[0] = '\0';
121 | }
122 | 
123 | /* Enlarge the free space at the end of the sds string so that the caller
124 |  * is sure that after calling this function can overwrite up to addlen
125 |  * bytes after the end of the string, plus one more byte for nul term.
126 |  * 
127 |  * Note: this does not change the *length* of the sds string as returned
128 |  * by sdslen(), but only the free buffer space we have. */
129 | sds sdsMakeRoomFor(sds s, size_t addlen) {
130 |     struct sdshdr *sh, *newsh;
131 |     size_t free = sdsavail(s);
132 |     size_t len, newlen;
133 | 
134 |     if (free >= addlen) return s;
135 |     len = sdslen(s);
136 |     sh = (void*) (s-sizeof *sh);;
137 |     newlen = (len+addlen);
138 |     if (newlen < SDS_MAX_PREALLOC)
139 |         newlen *= 2;
140 |     else
141 |         newlen += SDS_MAX_PREALLOC;
142 |     newsh = realloc(sh, sizeof *newsh+newlen+1);
143 |     if (newsh == NULL) return NULL;
144 | 
145 |     newsh->free = newlen - len;
146 |     return newsh->buf;
147 | }
148 | 
149 | /* Reallocate the sds string so that it has no free space at the end. The
150 |  * contained string remains not altered, but next concatenation operations
151 |  * will require a reallocation.
152 |  *
153 |  * After the call, the passed sds string is no longer valid and all the
154 |  * references must be substituted with the new pointer returned by the call. */
155 | sds sdsRemoveFreeSpace(sds s) {
156 |     struct sdshdr *sh;
157 | 
158 |     sh = (void*) (s-sizeof *sh);;
159 |     sh = realloc(sh, sizeof *sh+sh->len+1);
160 |     sh->free = 0;
161 |     return sh->buf;
162 | }
163 | 
164 | /* Return the total size of the allocation of the specifed sds string,
165 |  * including:
166 |  * 1) The sds header before the pointer.
167 |  * 2) The string.
168 |  * 3) The free buffer at the end if any.
169 |  * 4) The implicit null term.
170 |  */
171 | size_t sdsAllocSize(sds s) {
172 |     struct sdshdr *sh = (void*) (s-sizeof *sh);;
173 | 
174 |     return sizeof(*sh)+sh->len+sh->free+1;
175 | }
176 | 
177 | /* Increment the sds length and decrements the left free space at the
178 |  * end of the string according to 'incr'. Also set the null term
179 |  * in the new end of the string.
180 |  *
181 |  * This function is used in order to fix the string length after the
182 |  * user calls sdsMakeRoomFor(), writes something after the end of
183 |  * the current string, and finally needs to set the new length.
184 |  *
185 |  * Note: it is possible to use a negative increment in order to
186 |  * right-trim the string.
187 |  *
188 |  * Usage example:
189 |  *
190 |  * Using sdsIncrLen() and sdsMakeRoomFor() it is possible to mount the
191 |  * following schema, to cat bytes coming from the kernel to the end of an
192 |  * sds string without copying into an intermediate buffer:
193 |  *
194 |  * oldlen = sdslen(s);
195 |  * s = sdsMakeRoomFor(s, BUFFER_SIZE);
196 |  * nread = read(fd, s+oldlen, BUFFER_SIZE);
197 |  * ... check for nread <= 0 and handle it ...
198 |  * sdsIncrLen(s, nread);
199 |  */
200 | void sdsIncrLen(sds s, int incr) {
201 |     struct sdshdr *sh = (void*) (s-sizeof *sh);;
202 | 
203 |     assert(sh->free >= incr);
204 |     sh->len += incr;
205 |     sh->free -= incr;
206 |     assert(sh->free >= 0);
207 |     s[sh->len] = '\0';
208 | }
209 | 
210 | /* Grow the sds to have the specified length. Bytes that were not part of
211 |  * the original length of the sds will be set to zero.
212 |  *
213 |  * if the specified length is smaller than the current length, no operation
214 |  * is performed. */
215 | sds sdsgrowzero(sds s, size_t len) {
216 |     struct sdshdr *sh = (void*) (s-sizeof *sh);
217 |     size_t totlen, curlen = sh->len;
218 | 
219 |     if (len <= curlen) return s;
220 |     s = sdsMakeRoomFor(s,len-curlen);
221 |     if (s == NULL) return NULL;
222 | 
223 |     /* Make sure added region doesn't contain garbage */
224 |     sh = (void*)(s-sizeof *sh);
225 |     memset(s+curlen,0,(len-curlen+1)); /* also set trailing \0 byte */
226 |     totlen = sh->len+sh->free;
227 |     sh->len = len;
228 |     sh->free = totlen-sh->len;
229 |     return s;
230 | }
231 | 
232 | /* Append the specified binary-safe string pointed by 't' of 'len' bytes to the
233 |  * end of the specified sds string 's'.
234 |  *
235 |  * After the call, the passed sds string is no longer valid and all the
236 |  * references must be substituted with the new pointer returned by the call. */
237 | sds sdscatlen(sds s, const void *t, size_t len) {
238 |     struct sdshdr *sh;
239 |     size_t curlen = sdslen(s);
240 | 
241 |     s = sdsMakeRoomFor(s,len);
242 |     if (s == NULL) return NULL;
243 |     sh = (void*) (s-sizeof *sh);;
244 |     memcpy(s+curlen, t, len);
245 |     sh->len = curlen+len;
246 |     sh->free = sh->free-len;
247 |     s[curlen+len] = '\0';
248 |     return s;
249 | }
250 | 
251 | /* Append the specified null termianted C string to the sds string 's'.
252 |  *
253 |  * After the call, the passed sds string is no longer valid and all the
254 |  * references must be substituted with the new pointer returned by the call. */
255 | sds sdscat(sds s, const char *t) {
256 |     return sdscatlen(s, t, strlen(t));
257 | }
258 | 
259 | /* Append the specified sds 't' to the existing sds 's'.
260 |  *
261 |  * After the call, the modified sds string is no longer valid and all the
262 |  * references must be substituted with the new pointer returned by the call. */
263 | sds sdscatsds(sds s, const sds t) {
264 |     return sdscatlen(s, t, sdslen(t));
265 | }
266 | 
267 | /* Destructively modify the sds string 's' to hold the specified binary
268 |  * safe string pointed by 't' of length 'len' bytes. */
269 | sds sdscpylen(sds s, const char *t, size_t len) {
270 |     struct sdshdr *sh = (void*) (s-sizeof *sh);;
271 |     size_t totlen = sh->free+sh->len;
272 | 
273 |     if (totlen < len) {
274 |         s = sdsMakeRoomFor(s,len-sh->len);
275 |         if (s == NULL) return NULL;
276 |         sh = (void*) (s-sizeof *sh);;
277 |         totlen = sh->free+sh->len;
278 |     }
279 |     memcpy(s, t, len);
280 |     s[len] = '\0';
281 |     sh->len = len;
282 |     sh->free = totlen-len;
283 |     return s;
284 | }
285 | 
286 | /* Like sdscpylen() but 't' must be a null-termined string so that the length
287 |  * of the string is obtained with strlen(). */
288 | sds sdscpy(sds s, const char *t) {
289 |     return sdscpylen(s, t, strlen(t));
290 | }
291 | 
292 | /* Like sdscatpritf() but gets va_list instead of being variadic. */
293 | sds sdscatvprintf(sds s, const char *fmt, va_list ap) {
294 |     va_list cpy;
295 |     char *buf, *t;
296 |     size_t buflen = 16;
297 | 
298 |     while(1) {
299 |         buf = malloc(buflen);
300 |         if (buf == NULL) return NULL;
301 |         buf[buflen-2] = '\0';
302 |         va_copy(cpy,ap);
303 |         vsnprintf(buf, buflen, fmt, cpy);
304 |         if (buf[buflen-2] != '\0') {
305 |             free(buf);
306 |             buflen *= 2;
307 |             continue;
308 |         }
309 |         break;
310 |     }
311 |     t = sdscat(s, buf);
312 |     free(buf);
313 |     return t;
314 | }
315 | 
316 | /* Append to the sds string 's' a string obtained using printf-alike format
317 |  * specifier.
318 |  *
319 |  * After the call, the modified sds string is no longer valid and all the
320 |  * references must be substituted with the new pointer returned by the call.
321 |  *
322 |  * Example:
323 |  *
324 |  * s = sdsempty("Sum is: ");
325 |  * s = sdscatprintf(s,"%d+%d = %d",a,b,a+b).
326 |  *
327 |  * Often you need to create a string from scratch with the printf-alike
328 |  * format. When this is the need, just use sdsempty() as the target string:
329 |  *
330 |  * s = sdscatprintf(sdsempty(), "... your format ...", args);
331 |  */
332 | sds sdscatprintf(sds s, const char *fmt, ...) {
333 |     va_list ap;
334 |     char *t;
335 |     va_start(ap, fmt);
336 |     t = sdscatvprintf(s,fmt,ap);
337 |     va_end(ap);
338 |     return t;
339 | }
340 | 
341 | /* Remove the part of the string from left and from right composed just of
342 |  * contiguous characters found in 'cset', that is a null terminted C string.
343 |  *
344 |  * After the call, the modified sds string is no longer valid and all the
345 |  * references must be substituted with the new pointer returned by the call.
346 |  *
347 |  * Example:
348 |  *
349 |  * s = sdsnew("AA...AA.a.aa.aHelloWorld     :::");
350 |  * s = sdstrim(s,"A. :");
351 |  * printf("%s\n", s);
352 |  *
353 |  * Output will be just "Hello World".
354 |  */
355 | void sdstrim(sds s, const char *cset) {
356 |     struct sdshdr *sh = (void*) (s-sizeof *sh);;
357 |     char *start, *end, *sp, *ep;
358 |     size_t len;
359 | 
360 |     sp = start = s;
361 |     ep = end = s+sdslen(s)-1;
362 |     while(sp <= end && strchr(cset, *sp)) sp++;
363 |     while(ep > start && strchr(cset, *ep)) ep--;
364 |     len = (sp > ep) ? 0 : ((ep-sp)+1);
365 |     if (sh->buf != sp) memmove(sh->buf, sp, len);
366 |     sh->buf[len] = '\0';
367 |     sh->free = sh->free+(sh->len-len);
368 |     sh->len = len;
369 | }
370 | 
371 | /* Turn the string into a smaller (or equal) string containing only the
372 |  * substring specified by the 'start' and 'end' indexes.
373 |  *
374 |  * start and end can be negative, where -1 means the last character of the
375 |  * string, -2 the penultimate character, and so forth.
376 |  *
377 |  * The interval is inclusive, so the start and end characters will be part
378 |  * of the resulting string.
379 |  *
380 |  * The string is modified in-place.
381 |  *
382 |  * Example:
383 |  *
384 |  * s = sdsnew("Hello World");
385 |  * sdsrange(s,1,-1); => "ello World"
386 |  */
387 | void sdsrange(sds s, int start, int end) {
388 |     struct sdshdr *sh = (void*) (s-sizeof *sh);;
389 |     size_t newlen, len = sdslen(s);
390 | 
391 |     if (len == 0) return;
392 |     if (start < 0) {
393 |         start = len+start;
394 |         if (start < 0) start = 0;
395 |     }
396 |     if (end < 0) {
397 |         end = len+end;
398 |         if (end < 0) end = 0;
399 |     }
400 |     newlen = (start > end) ? 0 : (end-start)+1;
401 |     if (newlen != 0) {
402 |         if (start >= (signed)len) {
403 |             newlen = 0;
404 |         } else if (end >= (signed)len) {
405 |             end = len-1;
406 |             newlen = (start > end) ? 0 : (end-start)+1;
407 |         }
408 |     } else {
409 |         start = 0;
410 |     }
411 |     if (start && newlen) memmove(sh->buf, sh->buf+start, newlen);
412 |     sh->buf[newlen] = 0;
413 |     sh->free = sh->free+(sh->len-newlen);
414 |     sh->len = newlen;
415 | }
416 | 
417 | /* Apply tolower() to every character of the sds string 's'. */
418 | void sdstolower(sds s) {
419 |     int len = sdslen(s), j;
420 | 
421 |     for (j = 0; j < len; j++) s[j] = tolower(s[j]);
422 | }
423 | 
424 | /* Apply toupper() to every character of the sds string 's'. */
425 | void sdstoupper(sds s) {
426 |     int len = sdslen(s), j;
427 | 
428 |     for (j = 0; j < len; j++) s[j] = toupper(s[j]);
429 | }
430 | 
431 | /* Compare two sds strings s1 and s2 with memcmp().
432 |  *
433 |  * Return value:
434 |  *
435 |  *     1 if s1 > s2.
436 |  *    -1 if s1 < s2.
437 |  *     0 if s1 and s2 are exactly the same binary string.
438 |  *
439 |  * If two strings share exactly the same prefix, but one of the two has
440 |  * additional characters, the longer string is considered to be greater than
441 |  * the smaller one. */
442 | int sdscmp(const sds s1, const sds s2) {
443 |     size_t l1, l2, minlen;
444 |     int cmp;
445 | 
446 |     l1 = sdslen(s1);
447 |     l2 = sdslen(s2);
448 |     minlen = (l1 < l2) ? l1 : l2;
449 |     cmp = memcmp(s1,s2,minlen);
450 |     if (cmp == 0) return l1-l2;
451 |     return cmp;
452 | }
453 | 
454 | /* Split 's' with separator in 'sep'. An array
455 |  * of sds strings is returned. *count will be set
456 |  * by reference to the number of tokens returned.
457 |  *
458 |  * On out of memory, zero length string, zero length
459 |  * separator, NULL is returned.
460 |  *
461 |  * Note that 'sep' is able to split a string using
462 |  * a multi-character separator. For example
463 |  * sdssplit("foo_-_bar","_-_"); will return two
464 |  * elements "foo" and "bar".
465 |  *
466 |  * This version of the function is binary-safe but
467 |  * requires length arguments. sdssplit() is just the
468 |  * same function but for zero-terminated strings.
469 |  */
470 | sds *sdssplitlen(const char *s, int len, const char *sep, int seplen, int *count) {
471 |     int elements = 0, slots = 5, start = 0, j;
472 |     sds *tokens;
473 | 
474 |     if (seplen < 1 || len < 0) return NULL;
475 | 
476 |     tokens = malloc(sizeof(sds)*slots);
477 |     if (tokens == NULL) return NULL;
478 | 
479 |     if (len == 0) {
480 |         *count = 0;
481 |         return tokens;
482 |     }
483 |     for (j = 0; j < (len-(seplen-1)); j++) {
484 |         /* make sure there is room for the next element and the final one */
485 |         if (slots < elements+2) {
486 |             sds *newtokens;
487 | 
488 |             slots *= 2;
489 |             newtokens = realloc(tokens,sizeof(sds)*slots);
490 |             if (newtokens == NULL) goto cleanup;
491 |             tokens = newtokens;
492 |         }
493 |         /* search the separator */
494 |         if ((seplen == 1 && *(s+j) == sep[0]) || (memcmp(s+j,sep,seplen) == 0)) {
495 |             tokens[elements] = sdsnewlen(s+start,j-start);
496 |             if (tokens[elements] == NULL) goto cleanup;
497 |             elements++;
498 |             start = j+seplen;
499 |             j = j+seplen-1; /* skip the separator */
500 |         }
501 |     }
502 |     /* Add the final element. We are sure there is room in the tokens array. */
503 |     tokens[elements] = sdsnewlen(s+start,len-start);
504 |     if (tokens[elements] == NULL) goto cleanup;
505 |     elements++;
506 |     *count = elements;
507 |     return tokens;
508 | 
509 | cleanup:
510 |     {
511 |         int i;
512 |         for (i = 0; i < elements; i++) sdsfree(tokens[i]);
513 |         free(tokens);
514 |         *count = 0;
515 |         return NULL;
516 |     }
517 | }
518 | 
519 | /* Free the result returned by sdssplitlen(), or do nothing if 'tokens' is NULL. */
520 | void sdsfreesplitres(sds *tokens, int count) {
521 |     if (!tokens) return;
522 |     while(count--)
523 |         sdsfree(tokens[count]);
524 |     free(tokens);
525 | }
526 | 
527 | /* Create an sds string from a long long value. It is much faster than:
528 |  *
529 |  * sdscatprintf(sdsempty(),"%lld\n", value);
530 |  */
531 | sds sdsfromlonglong(long long value) {
532 |     char buf[32], *p;
533 |     unsigned long long v;
534 | 
535 |     v = (value < 0) ? -value : value;
536 |     p = buf+31; /* point to the last character */
537 |     do {
538 |         *p-- = '0'+(v%10);
539 |         v /= 10;
540 |     } while(v);
541 |     if (value < 0) *p-- = '-';
542 |     p++;
543 |     return sdsnewlen(p,32-(p-buf));
544 | }
545 | 
546 | /* Append to the sds string "s" an escaped string representation where
547 |  * all the non-printable characters (tested with isprint()) are turned into
548 |  * escapes in the form "\n\r\a...." or "\x<hex-number>".
549 |  *
550 |  * After the call, the modified sds string is no longer valid and all the
551 |  * references must be substituted with the new pointer returned by the call. */
552 | sds sdscatrepr(sds s, const char *p, size_t len) {
553 |     s = sdscatlen(s,"\"",1);
554 |     while(len--) {
555 |         switch(*p) {
556 |         case '\\':
557 |         case '"':
558 |             s = sdscatprintf(s,"\\%c",*p);
559 |             break;
560 |         case '\n': s = sdscatlen(s,"\\n",2); break;
561 |         case '\r': s = sdscatlen(s,"\\r",2); break;
562 |         case '\t': s = sdscatlen(s,"\\t",2); break;
563 |         case '\a': s = sdscatlen(s,"\\a",2); break;
564 |         case '\b': s = sdscatlen(s,"\\b",2); break;
565 |         default:
566 |             if (isprint(*p))
567 |                 s = sdscatprintf(s,"%c",*p);
568 |             else
569 |                 s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
570 |             break;
571 |         }
572 |         p++;
573 |     }
574 |     return sdscatlen(s,"\"",1);
575 | }
576 | 
/* Helper function for sdssplitargs(): returns 1 when 'c' is one of
 * 0-9, a-f or A-F, and 0 for any other character. */
int is_hex_digit(char c) {
    if (c >= '0' && c <= '9') return 1;
    if (c >= 'a' && c <= 'f') return 1;
    if (c >= 'A' && c <= 'F') return 1;
    return 0;
}
583 | 
/* Helper function for sdssplitargs(): converts the hex digit 'c' (either
 * case) into its value, from 0 to 15. Any character that is not a hex
 * digit yields 0. */
int hex_digit_to_int(char c) {
    if (c >= '0' && c <= '9') return c - '0';
    if (c >= 'a' && c <= 'f') return 10 + (c - 'a');
    if (c >= 'A' && c <= 'F') return 10 + (c - 'A');
    return 0;
}
607 | 
608 | /* Split a line into arguments, where every argument can be in the
609 |  * following programming-language REPL-alike form:
610 |  *
611 |  * foo bar "newline are supported\n" and "\xff\x00otherstuff"
612 |  *
613 |  * The number of arguments is stored into *argc, and an array
614 |  * of sds is returned.
615 |  *
616 |  * The caller should free the resulting array of sds strings with
617 |  * sdsfreesplitres().
618 |  *
619 |  * Note that sdscatrepr() is able to convert back a string into
620 |  * a quoted string in the same format sdssplitargs() is able to parse.
621 |  *
622 |  * The function returns the allocated tokens on success, even when the
623 |  * input string is empty, or NULL if the input contains unbalanced
624 |  * quotes or closed quotes followed by non space characters
625 |  * as in: "foo"bar or "foo'
626 |  */
627 | sds *sdssplitargs(const char *line, int *argc) {
628 |     const char *p = line;
629 |     char *current = NULL;
630 |     char **vector = NULL;
631 | 
632 |     *argc = 0;
633 |     while(1) {
634 |         /* skip blanks */
635 |         while(*p && isspace(*p)) p++;
636 |         if (*p) {
637 |             /* get a token */
638 |             int inq=0;  /* set to 1 if we are in "quotes" */
639 |             int insq=0; /* set to 1 if we are in 'single quotes' */
640 |             int done=0;
641 | 
642 |             if (current == NULL) current = sdsempty();
643 |             while(!done) {
644 |                 if (inq) {
645 |                     if (*p == '\\' && *(p+1) == 'x' &&
646 |                                              is_hex_digit(*(p+2)) &&
647 |                                              is_hex_digit(*(p+3)))
648 |                     {
649 |                         unsigned char byte;
650 | 
651 |                         byte = (hex_digit_to_int(*(p+2))*16)+
652 |                                 hex_digit_to_int(*(p+3));
653 |                         current = sdscatlen(current,(char*)&byte,1);
654 |                         p += 3;
655 |                     } else if (*p == '\\' && *(p+1)) {
656 |                         char c;
657 | 
658 |                         p++;
659 |                         switch(*p) {
660 |                         case 'n': c = '\n'; break;
661 |                         case 'r': c = '\r'; break;
662 |                         case 't': c = '\t'; break;
663 |                         case 'b': c = '\b'; break;
664 |                         case 'a': c = '\a'; break;
665 |                         default: c = *p; break;
666 |                         }
667 |                         current = sdscatlen(current,&c,1);
668 |                     } else if (*p == '"') {
669 |                         /* closing quote must be followed by a space or
670 |                          * nothing at all. */
671 |                         if (*(p+1) && !isspace(*(p+1))) goto err;
672 |                         done=1;
673 |                     } else if (!*p) {
674 |                         /* unterminated quotes */
675 |                         goto err;
676 |                     } else {
677 |                         current = sdscatlen(current,p,1);
678 |                     }
679 |                 } else if (insq) {
680 |                     if (*p == '\\' && *(p+1) == '\'') {
681 |                         p++;
682 |                         current = sdscatlen(current,"'",1);
683 |                     } else if (*p == '\'') {
684 |                         /* closing quote must be followed by a space or
685 |                          * nothing at all. */
686 |                         if (*(p+1) && !isspace(*(p+1))) goto err;
687 |                         done=1;
688 |                     } else if (!*p) {
689 |                         /* unterminated quotes */
690 |                         goto err;
691 |                     } else {
692 |                         current = sdscatlen(current,p,1);
693 |                     }
694 |                 } else {
695 |                     switch(*p) {
696 |                     case ' ':
697 |                     case '\n':
698 |                     case '\r':
699 |                     case '\t':
700 |                     case '\0':
701 |                         done=1;
702 |                         break;
703 |                     case '"':
704 |                         inq=1;
705 |                         break;
706 |                     case '\'':
707 |                         insq=1;
708 |                         break;
709 |                     default:
710 |                         current = sdscatlen(current,p,1);
711 |                         break;
712 |                     }
713 |                 }
714 |                 if (*p) p++;
715 |             }
716 |             /* add the token to the vector */
717 |             vector = realloc(vector,((*argc)+1)*sizeof(char*));
718 |             vector[*argc] = current;
719 |             (*argc)++;
720 |             current = NULL;
721 |         } else {
722 |             /* Even on empty input string return something not NULL. */
723 |             if (vector == NULL) vector = malloc(sizeof(void*));
724 |             return vector;
725 |         }
726 |     }
727 | 
728 | err:
729 |     while((*argc)--)
730 |         sdsfree(vector[*argc]);
731 |     free(vector);
732 |     if (current) sdsfree(current);
733 |     *argc = 0;
734 |     return NULL;
735 | }
736 | 
737 | /* Modify the string substituting all the occurrences of the set of
738 |  * characters specified in the 'from' string to the corresponding character
739 |  * in the 'to' array.
740 |  *
741 |  * For instance: sdsmapchars(mystring, "ho", "01", 2)
742 |  * will have the effect of turning the string "hello" into "0ell1".
743 |  *
744 |  * The function returns the sds string pointer, that is always the same
745 |  * as the input pointer since no resize is needed. */
746 | sds sdsmapchars(sds s, const char *from, const char *to, size_t setlen) {
747 |     size_t j, i, l = sdslen(s);
748 | 
749 |     for (j = 0; j < l; j++) {
750 |         for (i = 0; i < setlen; i++) {
751 |             if (s[j] == from[i]) {
752 |                 s[j] = to[i];
753 |                 break;
754 |             }
755 |         }
756 |     }
757 |     return s;
758 | }
759 | 
760 | /* Join an array of C strings using the specified separator (also a C string).
761 |  * Returns the result as an sds string. */
762 | sds sdsjoin(char **argv, int argc, char *sep, size_t seplen) {
763 |     sds join = sdsempty();
764 |     int j;
765 | 
766 |     for (j = 0; j < argc; j++) {
767 |         join = sdscat(join, argv[j]);
768 |         if (j != argc-1) join = sdscatlen(join,sep,seplen);
769 |     }
770 |     return join;
771 | }
772 | 
773 | /* Like sdsjoin, but joins an array of SDS strings. */
774 | sds sdsjoinsds(sds *argv, int argc, const char *sep, size_t seplen) {
775 |     sds join = sdsempty();
776 |     int j;
777 | 
778 |     for (j = 0; j < argc; j++) {
779 |         join = sdscatsds(join, argv[j]);
780 |         if (j != argc-1) join = sdscatlen(join,sep,seplen);
781 |     }
782 |     return join;
783 | }
784 | 
#ifdef SDS_TEST_MAIN
#include <stdio.h>
#include "testhelp.h"

/* Self-test entry point, compiled only when SDS_TEST_MAIN is defined.
 *
 * Runs a sequence of checks against the sds API using the test_cond() /
 * test_report() macros from testhelp.h. Content checks compare len+1 bytes
 * so that the terminating NUL byte is verified together with the payload.
 * Every string created here is released before returning. */
int main(void) {
    {
        struct sdshdr *sh;
        sds x = sdsnew("foo"), y;

        /* --- creation, concatenation and copy --- */
        test_cond("Create a string and obtain the length",
            sdslen(x) == 3 && memcmp(x,"foo\0",4) == 0)

        sdsfree(x);
        x = sdsnewlen("foo",2);
        test_cond("Create a string with specified length",
            sdslen(x) == 2 && memcmp(x,"fo\0",3) == 0)

        x = sdscat(x,"bar");
        test_cond("Strings concatenation",
            sdslen(x) == 5 && memcmp(x,"fobar\0",6) == 0);

        x = sdscpy(x,"a");
        test_cond("sdscpy() against an originally longer string",
            sdslen(x) == 1 && memcmp(x,"a\0",2) == 0)

        /* Compare 34 bytes (33 characters plus the terminator) so this
         * check also covers the NUL byte, like the other checks do. */
        x = sdscpy(x,"xyzxxxxxxxxxxyyyyyyyyyykkkkkkkkkk");
        test_cond("sdscpy() against an originally shorter string",
            sdslen(x) == 33 &&
            memcmp(x,"xyzxxxxxxxxxxyyyyyyyyyykkkkkkkkkk\0",34) == 0)

        /* --- formatted append and trimming --- */
        sdsfree(x);
        x = sdscatprintf(sdsempty(),"%d",123);
        test_cond("sdscatprintf() seems working in the base case",
            sdslen(x) == 3 && memcmp(x,"123\0",4) ==0)

        sdsfree(x);
        x = sdsnew("xxciaoyyy");
        sdstrim(x,"xy");
        test_cond("sdstrim() correctly trims characters",
            sdslen(x) == 4 && memcmp(x,"ciao\0",5) == 0)

        /* --- sdsrange() on copies of "ciao" --- */
        y = sdsdup(x);
        sdsrange(y,1,1);
        test_cond("sdsrange(...,1,1)",
            sdslen(y) == 1 && memcmp(y,"i\0",2) == 0)

        sdsfree(y);
        y = sdsdup(x);
        sdsrange(y,1,-1);
        test_cond("sdsrange(...,1,-1)",
            sdslen(y) == 3 && memcmp(y,"iao\0",4) == 0)

        sdsfree(y);
        y = sdsdup(x);
        sdsrange(y,-2,-1);
        test_cond("sdsrange(...,-2,-1)",
            sdslen(y) == 2 && memcmp(y,"ao\0",3) == 0)

        sdsfree(y);
        y = sdsdup(x);
        sdsrange(y,2,1);
        test_cond("sdsrange(...,2,1)",
            sdslen(y) == 0 && memcmp(y,"\0",1) == 0)

        sdsfree(y);
        y = sdsdup(x);
        sdsrange(y,1,100);
        test_cond("sdsrange(...,1,100)",
            sdslen(y) == 3 && memcmp(y,"iao\0",4) == 0)

        sdsfree(y);
        y = sdsdup(x);
        sdsrange(y,100,100);
        test_cond("sdsrange(...,100,100)",
            sdslen(y) == 0 && memcmp(y,"\0",1) == 0)

        /* --- sdscmp() ordering: greater, equal, smaller --- */
        sdsfree(y);
        sdsfree(x);
        x = sdsnew("foo");
        y = sdsnew("foa");
        test_cond("sdscmp(foo,foa)", sdscmp(x,y) > 0)

        sdsfree(y);
        sdsfree(x);
        x = sdsnew("bar");
        y = sdsnew("bar");
        test_cond("sdscmp(bar,bar)", sdscmp(x,y) == 0)

        sdsfree(y);
        sdsfree(x);
        x = sdsnew("aar");
        y = sdsnew("bar");
        test_cond("sdscmp(aar,bar)", sdscmp(x,y) < 0)

        /* --- sdscatrepr() escaping of binary data --- */
        sdsfree(y);
        sdsfree(x);
        x = sdsnewlen("\a\n\0foo\r",7);
        y = sdscatrepr(sdsempty(),x,sdslen(x));
        test_cond("sdscatrepr(...data...)",
            memcmp(y,"\"\\a\\n\\x00foo\\r\"",15) == 0)

        /* --- low level header bookkeeping (len / free fields) --- */
        {
            int oldfree;

            sdsfree(x);
            x = sdsnew("0");
            /* The header is read from just before the string buffer. */
            sh = (void*) (x-(sizeof(struct sdshdr)));
            test_cond("sdsnew() free/len buffers", sh->len == 1 && sh->free == 0);
            x = sdsMakeRoomFor(x,1);
            /* x may have changed: recompute the header pointer. */
            sh = (void*) (x-(sizeof(struct sdshdr)));
            test_cond("sdsMakeRoomFor()", sh->len == 1 && sh->free > 0);
            oldfree = sh->free;
            x[1] = '1';
            sdsIncrLen(x,1);
            test_cond("sdsIncrLen() -- content", x[0] == '0' && x[1] == '1');
            test_cond("sdsIncrLen() -- len", sh->len == 2);
            test_cond("sdsIncrLen() -- free", sh->free == oldfree-1);
        }

        /* Release the strings still alive so the test run is leak free. */
        sdsfree(x);
        sdsfree(y);
    }
    test_report()
    return 0;
}
#endif
908 | 


--------------------------------------------------------------------------------