├── src └── Doc.md ├── bin ├── kmeria ├── kbin2bgzf └── retrieve_kmer ├── lib ├── libhts.a ├── libhts.so ├── libhts.so.3 ├── libhts.so.1.20-15-gb204d55 └── pkgconfig │ └── htslib.pc ├── external_tools ├── kmc ├── gemma └── kmc_tools ├── images ├── image1.png └── kmeria_image2 │ └── kmeria_image2.jpg ├── bimbamAsso ├── bimbamAsso ├── bimbamKin └── bimbamAssoDoc.md ├── scripts ├── plot_manhattan.R ├── calc_gwas_threshold.R ├── calc_gwas_threshold_new.R ├── pheno_simulation.pl └── sample_order_manager.pl ├── kmeria_env.yaml ├── LICENSE ├── examples ├── example.md ├── sample_sra.list ├── simulated_phenotype1.tsv └── sample_depth.tsv ├── include ├── util.h ├── hts_os.h ├── kfunc.h ├── kroundup.h ├── knetfile.h ├── hts_log.h ├── khash_str2int.h ├── hts_defs.h ├── tbx.h ├── klist.h ├── kbitset.h ├── hts_expr.h ├── regidx.h ├── kseq.h ├── ksort.h ├── hts_endian.h ├── kstring.h ├── thread_pool.h └── hfile.h ├── Makefile └── README.md /src/Doc.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /bin/kmeria: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sh1ne111/KMERIA/HEAD/bin/kmeria -------------------------------------------------------------------------------- /bin/kbin2bgzf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sh1ne111/KMERIA/HEAD/bin/kbin2bgzf -------------------------------------------------------------------------------- /lib/libhts.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sh1ne111/KMERIA/HEAD/lib/libhts.a -------------------------------------------------------------------------------- /lib/libhts.so: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Sh1ne111/KMERIA/HEAD/lib/libhts.so -------------------------------------------------------------------------------- /lib/libhts.so.3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sh1ne111/KMERIA/HEAD/lib/libhts.so.3 -------------------------------------------------------------------------------- /bin/retrieve_kmer: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sh1ne111/KMERIA/HEAD/bin/retrieve_kmer -------------------------------------------------------------------------------- /external_tools/kmc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sh1ne111/KMERIA/HEAD/external_tools/kmc -------------------------------------------------------------------------------- /images/image1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sh1ne111/KMERIA/HEAD/images/image1.png -------------------------------------------------------------------------------- /bimbamAsso/bimbamAsso: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sh1ne111/KMERIA/HEAD/bimbamAsso/bimbamAsso -------------------------------------------------------------------------------- /bimbamAsso/bimbamKin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sh1ne111/KMERIA/HEAD/bimbamAsso/bimbamKin -------------------------------------------------------------------------------- /external_tools/gemma: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sh1ne111/KMERIA/HEAD/external_tools/gemma -------------------------------------------------------------------------------- /external_tools/kmc_tools: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sh1ne111/KMERIA/HEAD/external_tools/kmc_tools -------------------------------------------------------------------------------- /lib/libhts.so.1.20-15-gb204d55: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sh1ne111/KMERIA/HEAD/lib/libhts.so.1.20-15-gb204d55 -------------------------------------------------------------------------------- /images/kmeria_image2/kmeria_image2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sh1ne111/KMERIA/HEAD/images/kmeria_image2/kmeria_image2.jpg -------------------------------------------------------------------------------- /scripts/plot_manhattan.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | 4 | 5 | # Usage: Rscript plot_manhattan.R sigkmer.plot.txt sigkmer 1e-7 6 | 7 | 8 | argv<-commandArgs(TRUE) 9 | 10 | library('CMplot') 11 | 12 | a <- read.table(argv[1], header=F) 13 | 14 | ## rgb(129,196,240) skyblue 15 | ## rgb(235,70,144 pink 16 | 17 | CMplot(a,plot.type = 'm',file='png',file.name=argv[2],col=c(rgb(192,192,192,max=255),rgb(0,0,0,max=255)),threshold = as.numeric(argv[3]),threshold.col='red',threshold.lty = 2,threshold.lwd = 1, amplify = TRUE, signal.cex = c(1,1), signal.pch = c(20,20),signal.col = NULL, dpi=326,cex=0.6,conf.int=T,conf.int.col='grey',box=F,main='',ylab=expression(-log[10](italic(p))), width=12, height = 4) 18 | -------------------------------------------------------------------------------- /lib/pkgconfig/htslib.pc: -------------------------------------------------------------------------------- 1 | includedir=/public/home/agis_chenshuai/software/htslib/include 2 | libdir=/public/home/agis_chenshuai/software/htslib/lib 3 | 4 | # Flags and libraries needed when linking against a static 
libhts.a 5 | # (used by manual and semi-manual pkg-config(1)-style enquiries). 6 | static_ldflags=-L/public/software/compiler/gcc-12.2.0/lib 7 | static_libs=-lpthread -lz -lm -lbz2 -llzma 8 | 9 | Name: htslib 10 | Description: C library for high-throughput sequencing data formats 11 | Version: 1.20-15-gb204d55 12 | Cflags: -I${includedir} 13 | Libs: -L${libdir} -lhts 14 | Libs.private: -L${libdir} -L/public/software/compiler/gcc-12.2.0/lib -lbz2 -lhts -lm -lpthread 15 | Requires.private: zlib liblzma 16 | -------------------------------------------------------------------------------- /kmeria_env.yaml: -------------------------------------------------------------------------------- 1 | name: kmeriaenv 2 | channels: 3 | - https://conda.anaconda.org/conda-forge 4 | dependencies: 5 | - _libgcc_mutex=0.1=conda_forge 6 | - _openmp_mutex=4.5=2_gnu 7 | - binutils_impl_linux-64=2.44=h9d8b0ac_3 8 | - bzip2=1.0.8=hda65f42_8 9 | - conda-gcc-specs=15.2.0=h56430cd_7 10 | - gcc=15.2.0=hc115cf6_7 11 | - gcc_impl_linux-64=15.2.0=hcacfade_7 12 | - gxx=15.2.0=h834e499_7 13 | - gxx_impl_linux-64=15.2.0=h54ccb8d_7 14 | - kernel-headers_linux-64=3.10.0=he073ed8_18 15 | - lapack=3.6.1=ha44fe06_2 16 | - ld_impl_linux-64=2.44=h1aa0949_3 17 | - libgcc=15.2.0=h767d61c_7 18 | - libgcc-devel_linux-64=15.2.0=h73f6952_107 19 | - libgcc-ng=15.2.0=h69a702a_7 20 | - libgfortran=15.2.0=h69a702a_7 21 | - libgfortran5=15.2.0=hcd61629_7 22 | - libgomp=15.2.0=h767d61c_7 23 | - libopenblas=0.3.30=pthreads_h94d23a6_2 24 | - libsanitizer=15.2.0=hb13aed2_7 25 | - libstdcxx=15.2.0=h8f9b012_7 26 | - libstdcxx-devel_linux-64=15.2.0=h73f6952_107 27 | - libzlib=1.3.1=hb9d3cd8_2 28 | - openblas=0.3.30=pthreads_h6ec200e_2 29 | - sysroot_linux-64=2.17=h0157908_18 30 | - tzdata=2025b=h78e105d_0 31 | - zlib=1.3.1=hb9d3cd8_2 32 | - zstd=1.5.7=hb8e6e7a_2 33 | 34 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 - Chen Shuai 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /scripts/calc_gwas_threshold.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | 3 | args <- commandArgs(trailingOnly = TRUE) 4 | infile <- args[1] # Input file name 5 | outfile <- args[2] # Output prefix 6 | threads <- as.numeric(args[3]) # Number of threads for fread 7 | thres <- as.numeric(args[4]) # P-value significance threshold. 
i.e: 0.05 8 | 9 | # Load only the 12th column (p_wald) from the input file using multiple threads 10 | kmers <- fread( 11 | infile, 12 | nThread = threads, 13 | select = 12, # Load only the 12th column 14 | col.names = "p_wald" # Assign column name 15 | ) 16 | 17 | # Ensure no missing or invalid values in p_wald 18 | kmers <- kmers[!is.na(p_wald) & p_wald > 0 & p_wald <= 1] 19 | 20 | # Number of markers (rows) 21 | n <- nrow(kmers) 22 | 23 | # Compute Benjamini-Hochberg (BH) adjusted P-values 24 | kmers[, p_wald_bh := p.adjust(p_wald, method = "BH")] 25 | 26 | # Calculate -log10(BH cutoff) 27 | bhCutoff <- -log10(max(kmers[p_wald_bh < thres, p_wald], na.rm = TRUE)) 28 | 29 | # Calculate -log10(Bonferroni cutoff) 30 | bfCutoff <- -log10(thres / n) 31 | 32 | # Print results 33 | cat("-log10(BH cutoff):", bhCutoff, "\n") 34 | cat("-log10(BF cutoff):", bfCutoff, "\n") 35 | 36 | # Optional: Save results to output file (if needed) 37 | outfile <- paste0(outfile, "_results.txt") 38 | fwrite( 39 | data.table(BH_Cutoff = bhCutoff, BF_Cutoff = bfCutoff), 40 | file = outfile, 41 | sep = "\t", 42 | col.names = TRUE 43 | ) 44 | -------------------------------------------------------------------------------- /examples/example.md: -------------------------------------------------------------------------------- 1 | 2 | ### This script is an example for downloading sweet potato SRA data and simulating phenotypes for KMERIA analyses 3 | 4 | ```bash 5 | 6 | Usage: run_example_pipe.sh -s <sra_list> -p <phenotype> -d <depth> [options] 7 | 8 | Required Arguments: 9 | -s, --sra-list File containing SRA accession numbers (one per line) 10 | Example format: 11 | SRR28578193 12 | SRR28578210 13 | ... 14 | 15 | -p, --phenotype File containing phenotype data (sample_name phenotype_value) 16 | Example format: 17 | SRR28578193 10 18 | SRR28578210 20 19 | ...
20 | 21 | -d, --depth File containing sequencing depth information (sample_name depth) 22 | Example format: 23 | SRR28578193 25 24 | SRR28578210 30 25 | SRR28578407 28 26 | ... 27 | 28 | Optional Arguments: 29 | -t, --threads Number of threads to use (default: 8) 30 | -m, --memory Memory allocation (default: 32G) 31 | -k, --kmer K-mer size (default: 31) 32 | --batch-size Batch size for processing (default: 1) 33 | --ploidy Genome ploidy (default: 6) 34 | --pheno-col Phenotype column number (default: 2) 35 | -h, --help Show this help message 36 | 37 | Example Usage: 38 | run_example_pipe.sh -s sra_list.txt -p phenotypes.txt -d sample_depths.txt -t 16 -m 16G 39 | 40 | Steps: 41 | 1. Download SRA files from NCBI 42 | 2. Convert SRA files to FASTQ format 43 | 3. Prepare input files for KMERIA analysis 44 | 4. Generate KMERIA job scripts 45 | 46 | Note: This pipeline generates job scripts that need to be submitted manually to your cluster system. 47 | ``` 48 | ### sweet potato data 49 | 50 | ```bash 51 | # command 52 | run_example_pipe.sh -s sample_sra.list -p simulated_phenotype1.tsv -d sample_depth.tsv 53 | 54 | ``` 55 | -------------------------------------------------------------------------------- /scripts/calc_gwas_threshold_new.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | args <- commandArgs(trailingOnly = TRUE) 3 | infile <- args[1] # Input file name 4 | outfile <- args[2] # Output prefix 5 | threads <- as.numeric(args[3]) # Number of threads for fread 6 | thres <- as.numeric(args[4]) # Base P-value threshold (e.g., 0.05) 7 | k_size <- as.numeric(args[5]) # k-mer size (e.g., 31) 8 | 9 | # Load only the 12th column (p_wald) from the input file using multiple threads 10 | kmers <- fread( 11 | infile, 12 | nThread = threads, 13 | select = 12, 14 | col.names = "p_wald" 15 | ) 16 | 17 | # Ensure no missing or invalid values in p_wald 18 | kmers <- kmers[!is.na(p_wald) & p_wald > 0 & p_wald <= 1]
19 | 20 | # Total number of k-mers 21 | n_total <- nrow(kmers) 22 | 23 | # Calculate effective number of tests based on k-mer size 24 | n_effective <- n_total / k_size 25 | 26 | # Calculate adjusted thresholds 27 | bh_cutoff <- thres / n_effective # More relaxed threshold 28 | bf_cutoff <- thres / n_total # Traditional Bonferroni 29 | 30 | # Compute Benjamini-Hochberg (BH) adjusted P-values 31 | kmers[, p_wald_bh := p.adjust(p_wald, method = "BH")] 32 | 33 | # Calculate cutoffs for Manhattan plots 34 | log_bh_relaxed <- -log10(bh_cutoff) 35 | log_bf_standard <- -log10(bf_cutoff) 36 | log_sig_kmers <- -log10(max(kmers[p_wald_bh < thres, p_wald], na.rm = TRUE)) 37 | 38 | # Print results 39 | cat("Total k-mers:", n_total, "\n") 40 | cat("Effective number of tests:", n_effective, "\n") 41 | cat("Relaxed threshold (α/effective tests):", bh_cutoff, "\n") 42 | cat("Standard Bonferroni threshold (α/total tests):", bf_cutoff, "\n") 43 | cat("-log10(Relaxed threshold):", log_bh_relaxed, "\n") 44 | cat("-log10(Bonferroni threshold):", log_bf_standard, "\n") 45 | cat("-log10(BH significant k-mers threshold):", log_sig_kmers, "\n") 46 | 47 | # Save results 48 | fwrite( 49 | data.table( 50 | Total_Kmers = n_total, 51 | Effective_Tests = n_effective, 52 | Relaxed_Threshold = bh_cutoff, 53 | Bonferroni_Threshold = bf_cutoff, 54 | Log10_Relaxed = log_bh_relaxed, 55 | Log10_Bonferroni = log_bf_standard, 56 | Log10_Sig_BH = log_sig_kmers 57 | ), 58 | file = paste0(outfile, "_thresholds.txt"), 59 | sep = "\t", 60 | col.names = TRUE 61 | ) 62 | -------------------------------------------------------------------------------- /include/util.h: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_H 2 | #define UTIL_H 3 | 4 | #define CALLOC(ptr, len) ((ptr) = (__typeof__(ptr))calloc((len), sizeof(*(ptr)))) 5 | #define MALLOC(ptr, len) ((ptr) = (__typeof__(ptr))malloc((len) * sizeof(*(ptr)))) 6 | #define REALLOC(ptr, len) ((ptr) = 
(__typeof__(ptr))realloc((ptr), (len) * sizeof(*(ptr)))) 7 | 8 | #include <stdint.h> 9 | 10 | static unsigned char seq_nt4_table[256] = { // translate ACGT to 0123 11 | 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 12 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 13 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 14 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 15 | 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 16 | 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 17 | 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 18 | 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 19 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 20 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 21 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 22 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 23 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 24 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 25 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 26 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 27 | }; 28 | 29 | /* Pack the first k bases of seq at 2 bits per base (so k <= 32) and return the smaller of the forward and reverse-complement encodings. */ 30 | static inline uint64_t seq2kmer(char *seq, int k) 31 | { 32 | uint64_t kmer = 0, krev = 0; 33 | const uint64_t shift = (k - 1) * 2; 34 | for(int i = 0; i < k; i++) { 35 | int c = seq_nt4_table[(uint8_t)seq[i]] & 0x3 ; //^ACGT->A 36 | kmer = (kmer << 2) | c; 37 | krev = krev >> 2 | (uint64_t)(3 - c) << shift; // 3 - c is the 2-bit complement; ~c sign-extends and sets every bit above the k-mer when k < 32 38 | } 39 | return kmer < krev ?
kmer : krev; 40 | } 41 | 42 | static inline void kmer2seq(char * seq, int ksize, uint64_t kmer) 43 | { 44 | for(int i = 0; i < ksize; i++) { 45 | seq[i] = "ACGTN"[(kmer >> ((ksize - 1 - i) * 2)) & 0x03]; 46 | } 47 | } 48 | 49 | static inline uint64_t hash(uint64_t key, uint64_t mask) // invertible integer hash function 50 | { 51 | //return key&mask; 52 | key = (~key + (key << 21)) & mask; // key = (key << 21) - key - 1; 53 | key = key ^ key >> 24; 54 | key = ((key + (key << 3)) + (key << 8)) & mask; // key * 265 55 | key = key ^ key >> 14; 56 | key = ((key + (key << 2)) + (key << 4)) & mask; // key * 21 57 | key = key ^ key >> 28; 58 | key = (key + (key << 31)) & mask; 59 | return key; 60 | } 61 | 62 | //usage 63 | void print_usage(); 64 | 65 | 66 | #endif 67 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | BIN_DIR := ./bin 2 | SRC_DIR := src 3 | OBJ_DIR := obj 4 | INCLUDE_DIR := ./include 5 | LIB_DIR := ./lib 6 | 7 | #compiler 8 | CC := gcc 9 | CXX := g++ 10 | CFLAGS := -Wall -O3 -I $(INCLUDE_DIR) 11 | CXXFLAGS := $(CFLAGS) -std=c++14 -fPIE 12 | LDFLAGS := -L $(LIB_DIR) -pie 13 | LIBS := -lz -lhts -lpthread -lbz2 14 | PROG := kmeria 15 | 16 | ifneq ($(asan),) 17 | CFLAGS += -fsanitize=address 18 | CXXFLAGS += -fsanitize=address 19 | LDFLAGS += -fsanitize=address 20 | endif 21 | 22 | C_SOURCES := $(wildcard $(SRC_DIR)/*.c) 23 | CPP_SOURCES := $(wildcard $(SRC_DIR)/*.cpp) 24 | SOURCES := $(C_SOURCES) $(CPP_SOURCES) 25 | 26 | C_OBJS := $(patsubst $(SRC_DIR)/%.c,$(OBJ_DIR)/%.o,$(C_SOURCES)) 27 | CPP_OBJS := $(patsubst $(SRC_DIR)/%.cpp,$(OBJ_DIR)/%.o,$(CPP_SOURCES)) 28 | OBJS := $(C_OBJS) $(CPP_OBJS) 29 | 30 | $(shell mkdir -p $(OBJ_DIR)) 31 | $(shell mkdir -p $(BIN_DIR)) 32 | 33 | .PHONY: all clean install package 34 | 35 | all: $(PROG) 36 | 37 | # rule for linking 38 | $(PROG): $(OBJS) 39 | $(CXX) $(OBJS) -o $@ $(LDFLAGS) $(LIBS) 40 | 41 | 
$(OBJ_DIR)/%.o: $(SRC_DIR)/%.cpp 42 | $(CXX) $(CXXFLAGS) -c $< -o $@ 43 | 44 | $(OBJ_DIR)/%.o: $(SRC_DIR)/%.c 45 | $(CC) $(CFLAGS) -DUSE_BGZF -c $< -o $@ 46 | 47 | $(OBJ_DIR)/main.o: $(SRC_DIR)/main.c 48 | $(OBJ_DIR)/kcount.o: $(SRC_DIR)/kcount.c $(SRC_DIR)/kstring.h $(SRC_DIR)/kthread.h $(SRC_DIR)/ketopt.h $(SRC_DIR)/kseq.h $(SRC_DIR)/kvec.h $(SRC_DIR)/ksort.h $(SRC_DIR)/util.h 49 | $(OBJ_DIR)/kdump.o: $(SRC_DIR)/kdump.c $(SRC_DIR)/kdump.h $(SRC_DIR)/kstring.h 50 | $(OBJ_DIR)/kmatrix.o: $(SRC_DIR)/kmatrix.cpp $(SRC_DIR)/kmatrix.h 51 | $(OBJ_DIR)/kfilter.o: $(SRC_DIR)/kfilter.cpp $(SRC_DIR)/kfilter.h 52 | $(OBJ_DIR)/kmtob.o: $(SRC_DIR)/kmtob.cpp $(SRC_DIR)/kmtob.h 53 | $(OBJ_DIR)/kbtog.o: $(SRC_DIR)/kbtog.cpp $(SRC_DIR)/kbtog.h 54 | $(OBJ_DIR)/ksketch.o: $(SRC_DIR)/ksketch.c 55 | $(OBJ_DIR)/kassoc.o: $(SRC_DIR)/kassoc.cpp $(SRC_DIR)/kassoc.h 56 | $(OBJ_DIR)/fkr.o: $(SRC_DIR)/fkr.c 57 | $(OBJ_DIR)/fkrtgs.o: $(SRC_DIR)/fkrtgs.c 58 | $(OBJ_DIR)/kmc_file.o: $(SRC_DIR)/kmc_file.cpp $(SRC_DIR)/kmc_file.h $(SRC_DIR)/kmer_defs.h 59 | $(OBJ_DIR)/mmer.o: $(SRC_DIR)/mmer.cpp $(SRC_DIR)/mmer.h $(SRC_DIR)/kmer_defs.h 60 | $(OBJ_DIR)/kmer_api.o: $(SRC_DIR)/kmer_api.cpp $(SRC_DIR)/kmer_api.h $(SRC_DIR)/kmer_defs.h 61 | $(OBJ_DIR)/kstring.o: $(SRC_DIR)/kstring.c $(SRC_DIR)/kstring.h 62 | $(OBJ_DIR)/kthread.o: $(SRC_DIR)/kthread.c $(SRC_DIR)/kthread.h 63 | $(OBJ_DIR)/kbam.o: $(SRC_DIR)/kbam.c 64 | $(OBJ_DIR)/kaddp.o: $(SRC_DIR)/kaddp.c 65 | $(OBJ_DIR)/functions.o: $(SRC_DIR)/functions.c 66 | 67 | DEPFILES := $(OBJS:.o=.d) 68 | -include $(DEPFILES) 69 | 70 | $(OBJ_DIR)/%.d: $(SRC_DIR)/%.c 71 | @$(CC) $(CFLAGS) -MM -MT $(@:.d=.o) -MF $@ $< 72 | 73 | $(OBJ_DIR)/%.d: $(SRC_DIR)/%.cpp 74 | @$(CXX) $(CXXFLAGS) -MM -MT $(@:.d=.o) -MF $@ $< 75 | 76 | install: $(PROG) 77 | cp $(PROG) $(BIN_DIR) 78 | 79 | clean: 80 | rm -rf $(PROG) $(OBJ_DIR)/*.o $(OBJ_DIR)/*.d $(SRC_DIR)/*.dSYM KMERIA.tar.gz 81 | 82 | distclean: clean 83 | rm -rf $(OBJ_DIR) 84 | 85 | package: clean 86 | tar -zcf 
KMERIA.tar.gz Makefile bin/ src/ include/ lib/ scripts/ external_tools/ kmeria_env.yaml 87 | -------------------------------------------------------------------------------- /include/hts_os.h: -------------------------------------------------------------------------------- 1 | /// @file hts_os.h 2 | /// Operating System specific tweaks, for compatibility with POSIX. 3 | /* 4 | Copyright (C) 2017, 2019-2020 Genome Research Ltd. 5 | 6 | Author: James Bonfield 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy 9 | of this software and associated documentation files (the "Software"), to deal 10 | in the Software without restriction, including without limitation the rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | copies of the Software, and to permit persons to whom the Software is 13 | furnished to do so, subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be included in 16 | all copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 21 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | DEALINGS IN THE SOFTWARE. */ 25 | 26 | #ifndef HTSLIB_HTS_OS_H 27 | #define HTSLIB_HTS_OS_H 28 | 29 | #include "hts_defs.h" 30 | 31 | #ifdef __cplusplus 32 | extern "C" { 33 | #endif 34 | 35 | /* This is srand48_deterministic() on platforms that provide it, or srand48() 36 | otherwise (or our own POSIX srand48() on platforms that provide neither).
37 | Hence calling hts_srand48() will always set up the same POSIX-determined 38 | sequence of pseudo-random numbers on any platform, while calling srand48() 39 | may (e.g., on OpenBSD) set up a different non-deterministic sequence. */ 40 | HTSLIB_EXPORT 41 | void hts_srand48(long seed); 42 | 43 | HTSLIB_EXPORT 44 | double hts_erand48(unsigned short xseed[3]); 45 | 46 | HTSLIB_EXPORT 47 | double hts_drand48(void); 48 | 49 | HTSLIB_EXPORT 50 | long hts_lrand48(void); 51 | 52 | #if defined(_WIN32) && !defined(__CYGWIN__) 53 | // Windows usually lacks *rand48(), but cygwin provides them. 54 | #define srand48(S) hts_srand48((S)) 55 | #define erand48(X) hts_erand48((X)) 56 | #define drand48() hts_drand48() 57 | #define lrand48() hts_lrand48() 58 | #endif 59 | 60 | #if 0 /* def _WIN32 - disabled for now, not currently used */ 61 | /* Check if the fd is a cygwin/msys's pty. */ 62 | extern int is_cygpty(int fd); 63 | #endif 64 | 65 | #ifdef __cplusplus 66 | } 67 | #endif 68 | 69 | #if defined(__MINGW32__) 70 | #include 71 | #define mkdir(filename,mode) mkdir((filename)) 72 | #endif 73 | 74 | #ifdef _WIN32 75 | #include 76 | #define srandom srand 77 | #define random rand 78 | #endif 79 | 80 | /* MSVC does not provide ssize_t in its . This ensures the type 81 | is available (unless suppressed by defining HTS_NO_SSIZE_T first). 
*/ 82 | #if defined _MSC_VER && defined _INTPTR_T_DEFINED && !defined _SSIZE_T_DEFINED && !defined HTS_NO_SSIZE_T && !defined ssize_t 83 | #define ssize_t intptr_t 84 | #endif 85 | 86 | #endif // HTSLIB_HTS_OS_H 87 | -------------------------------------------------------------------------------- /bimbamAsso/bimbamAssoDoc.md: -------------------------------------------------------------------------------- 1 | ## bimbamKin usage 2 | ``` 3 | Usage: bimbamKin [input_file] [output_file] 4 | Required parameters 5 | [input_file] : bimbam dosage file (support .gz) 6 | [output_file]: output file name (required for bimbam) 7 | -b : input is bimbam dosage format 8 | Optional parameters 9 | -g : input bimbam file is gzip compressed 10 | -d [# digits]: precision of the kinship values (default : 10) 11 | -s : compute IBS matrix instead of BN 12 | -r : randomly fill missing genotypes 13 | -h : use hetero division for missing data 14 | -v : turn on verbose mode 15 | ``` 16 | 17 | ## bimbamAsso usage 18 | ``` 19 | Usage: bimbamAsso [options] 20 | 21 | Required parameters: 22 | -p [phenotype_file] : Phenotype file with columns (FAMID INDID PHENO) 23 | -o [out_prefix] : Output file prefix 24 | -t [genotype_file] : Genotype file (TPED or Bimbam format) 25 | Use -b flag for Bimbam format 26 | 27 | Bimbam format options: 28 | -b : Input is in Bimbam dosage format 29 | -s [sample_file] : Sample information file (FAMID INDID) 30 | Required when using -b flag 31 | -g : Input genotype file is gzip compressed 32 | Works with both TPED and Bimbam formats 33 | 34 | Bimbam file format: 35 | Column 1: Marker ID (e.g., k-mer sequence) 36 | Column 2: Chromosome (can be placeholder like 'X') 37 | Column 3: Position (can be placeholder like 'Y') 38 | Column 4+: Dosage values for each sample (0-2 or dosages) 39 | Delimiter: comma (,) 40 | Example: AACCGAAA,X,Y,0.27,0.22,0.29,0.33,... 
41 | 42 | Sample file format (for -s): 43 | Column 1: Family ID 44 | Column 2: Individual ID 45 | Delimiter: space or tab 46 | 47 | Kinship options: 48 | -k [kinship_file] : Pre-computed kinship matrix (n×n) 49 | -K [method] : Generate kinship matrix 50 | 1 = IBS with mean fill-in 51 | 2 = IBS with random fill-in 52 | 3 = Balding-Nichols 53 | 54 | Optional parameters: 55 | -c [covar_file] : Covariate file (FAMID INDID COV1 COV2 ...) 56 | -i [in_prefix] : Input prefix for pre-computed eigenvectors 57 | -d [digits] : Output precision (default: 5) 58 | -S [start_index] : Start marker index (default: 0) 59 | -E [end_index] : End marker index (default: all markers) 60 | -v : Verbose mode 61 | -w : Write eigenvalue/eigenvector files 62 | -N : Disable GLS (use OLS instead) 63 | 64 | Output files: 65 | [prefix].ps : Association results 66 | Format: MARKER_ID BETA P_VALUE 67 | [prefix].reml : REML estimates 68 | [prefix].log : Log file 69 | [prefix].kinf : Kinship matrix (if -K or -w used) 70 | [prefix].eLvals : Eigenvalues (if -w used) 71 | [prefix].eLvecs : Eigenvectors (if -w used) 72 | 73 | Example usage: 74 | # Bimbam format with gzip 75 | bimbamAsso -b -g -t kmers.bimbam.gz -s samples.txt -p pheno.txt -k kinship.txt -o results 76 | ``` 77 | -------------------------------------------------------------------------------- /include/kfunc.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (C) 2010, 2013-2014 Genome Research Ltd. 
4 | Copyright (C) 2011 Attractive Chaos 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining 7 | a copy of this software and associated documentation files (the 8 | "Software"), to deal in the Software without restriction, including 9 | without limitation the rights to use, copy, modify, merge, publish, 10 | distribute, sublicense, and/or sell copies of the Software, and to 11 | permit persons to whom the Software is furnished to do so, subject to 12 | the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be 15 | included in all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 21 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 22 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 23 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | SOFTWARE. 25 | */ 26 | 27 | #ifndef HTSLIB_KFUNC_H 28 | #define HTSLIB_KFUNC_H 29 | 30 | #include "hts_defs.h" 31 | 32 | #ifdef __cplusplus 33 | extern "C" { 34 | #endif 35 | 36 | /* Log gamma function 37 | * \log{\Gamma(z)} 38 | * AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245 39 | */ 40 | HTSLIB_EXPORT 41 | double kf_lgamma(double z); 42 | 43 | /* complementary error function 44 | * \frac{2}{\sqrt{\pi}} \int_x^{\infty} e^{-t^2} dt 45 | * AS66, 2nd algorithm, http://lib.stat.cmu.edu/apstat/66 46 | */ 47 | HTSLIB_EXPORT 48 | double kf_erfc(double x); 49 | 50 | /* The following computes regularized incomplete gamma functions. 51 | * Formulas are taken from Wiki, with additional input from Numerical 52 | * Recipes in C (for modified Lentz's algorithm) and AS245 53 | * (http://lib.stat.cmu.edu/apstat/245). 
54 | * 55 | * A good online calculator is available at: 56 | * 57 | * http://www.danielsoper.com/statcalc/calc23.aspx 58 | * 59 | * It calculates upper incomplete gamma function, which equals 60 | * kf_gammaq(s,z)*tgamma(s). 61 | */ 62 | 63 | HTSLIB_EXPORT 64 | double kf_gammap(double s, double z); 65 | HTSLIB_EXPORT 66 | double kf_gammaq(double s, double z); 67 | 68 | /* Regularized incomplete beta function. The method is taken from 69 | * Numerical Recipe in C, 2nd edition, section 6.4. The following web 70 | * page calculates the incomplete beta function, which equals 71 | * kf_betai(a,b,x) * gamma(a) * gamma(b) / gamma(a+b): 72 | * 73 | * http://www.danielsoper.com/statcalc/calc36.aspx 74 | */ 75 | HTSLIB_EXPORT 76 | double kf_betai(double a, double b, double x); 77 | 78 | /* 79 | * n11 n12 | n1_ 80 | * n21 n22 | n2_ 81 | * -----------+---- 82 | * n_1 n_2 | n 83 | */ 84 | HTSLIB_EXPORT 85 | double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two); 86 | 87 | #ifdef __cplusplus 88 | } 89 | #endif 90 | 91 | #endif 92 | -------------------------------------------------------------------------------- /include/kroundup.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (C) 2020 Genome Research Ltd. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 
/*
   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

#ifndef KROUNDUP_H
#define KROUNDUP_H

/*
  Value of this macro is 1 if x is a signed type; 0 if unsigned.

  The arithmetic test -((x) * 0 + 1) > 0 is applied to x after the usual
  integer promotions, so on its own it reports every unsigned type that is
  narrower than int (uint8_t, uint16_t, ...) as signed.  k_high_bit_set()
  would then look one bit too low and kroundup64() would stop at 2^n - 1
  instead of 2^n for such values.  Where C11 _Generic is available the
  narrow types are therefore classified from the unpromoted type of x;
  older C dialects and C++ keep the arithmetic test on its own.
*/
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__cplusplus)
#define k_signed_type(x) _Generic((x),                      \
        char:           ((char)-1 < 0),                     \
        unsigned char:  0,                                  \
        unsigned short: 0,                                  \
        default:        (!(-((x) * 0 + 1) > 0)))
#else
#define k_signed_type(x) (!(-((x) * 0 + 1) > 0))
#endif

/*
  Macro with value 1 if the highest bit in x is set for any integer type

  This is written avoiding conditionals (?: operator) to reduce the likelihood
  of gcc attempting jump thread optimisations for code paths where (x) is
  large.  These optimisations can cause gcc to issue warnings about excessively
  large memory allocations when the kroundup64() macro below is used with
  malloc().  Such warnings can be misleading as they imply only the large
  allocation happens when it's actually working fine for normal values of (x).

  See https://developers.redhat.com/blog/2019/03/13/understanding-gcc-warnings-part-2/

  For signed types the sign bit is skipped, i.e. the bit tested is the
  highest value bit.
*/
#define k_high_bit_set(x) ((((x) >> (sizeof(x) * 8 - 1 - k_signed_type(x))) & 1))

/*! @hideinitializer
  @abstract  Round up to next power of two
  @discussion
  This macro will work for unsigned types up to uint64_t.

  If the next power of two does not fit in the given type, it will set
  the largest value that does.

  x must be a modifiable lvalue: it is evaluated several times and is
  updated in place.  The value of the whole expression is the updated x,
  or 0 (leaving x untouched) when x is not greater than zero.
 */
#define kroundup64(x) ((x) > 0 ?                \
                       (--(x),                  \
                        (x)|=(x)>>(sizeof(x)/8), \
                        (x)|=(x)>>(sizeof(x)/4), \
                        (x)|=(x)>>(sizeof(x)/2), \
                        (x)|=(x)>>(sizeof(x)),   \
                        (x)|=(x)>>(sizeof(x)*2), \
                        (x)|=(x)>>(sizeof(x)*4), \
                        (x) += !k_high_bit_set(x), \
                        (x))                    \
                       : 0)

// Historic interfaces for 32-bit and size_t values.  The macro above
// works for both (as long as size_t is no more than 64 bits).

#ifndef kroundup32
#define kroundup32(x) kroundup64(x)
#endif
#ifndef kroundup_size_t
#define kroundup_size_t(x) kroundup64(x)
#endif

#endif
/*
--------------------------------------------------------------------------------
/include/knetfile.h:
--------------------------------------------------------------------------------
*/
/* The MIT License

   Copyright (c) 2008, 2012, 2014, 2021-2022 Genome Research Ltd (GRL).
                 2010 by Attractive Chaos

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT.
*/
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 21 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 22 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 23 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | SOFTWARE. 25 | */ 26 | 27 | #ifndef KNETFILE_H 28 | #define KNETFILE_H 29 | 30 | #include 31 | #include 32 | #include 33 | 34 | #include "hts_defs.h" 35 | 36 | #ifndef _WIN32 37 | #define netread(fd, ptr, len) read(fd, ptr, len) 38 | #define netwrite(fd, ptr, len) write(fd, ptr, len) 39 | #define netclose(fd) close(fd) 40 | #else 41 | #include 42 | #define netread(fd, ptr, len) recv(fd, ptr, len, 0) 43 | #define netwrite(fd, ptr, len) send(fd, ptr, len, 0) 44 | #define netclose(fd) closesocket(fd) 45 | #endif 46 | 47 | // Ensure ssize_t exists within this header. All #includes must precede this, 48 | // and ssize_t must be undefined again at the end of this header. 49 | #if defined _MSC_VER && defined _INTPTR_T_DEFINED && !defined _SSIZE_T_DEFINED && !defined ssize_t 50 | #define HTSLIB_SSIZE_T 51 | #define ssize_t intptr_t 52 | #endif 53 | 54 | // FIXME: currently I/O is unbuffered 55 | 56 | #define KNF_TYPE_LOCAL 1 57 | #define KNF_TYPE_FTP 2 58 | #define KNF_TYPE_HTTP 3 59 | 60 | // Kept for API/ABI compatibility only. Do not use directly!
61 | typedef struct knetFile_s { 62 | int type, fd; 63 | int64_t offset; 64 | char *host, *port; 65 | 66 | // the following are for FTP only 67 | int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; 68 | char *response, *retr, *size_cmd; 69 | int64_t seek_offset; // for lazy seek 70 | int64_t file_size; 71 | 72 | // the following are for HTTP only 73 | char *path, *http_host; 74 | } knetFile; 75 | 76 | #define knet_tell(fp) ((fp)->offset) 77 | #define knet_fileno(fp) ((fp)->fd) 78 | 79 | #ifdef __cplusplus 80 | extern "C" { 81 | #endif 82 | 83 | HTSLIB_EXPORT 84 | knetFile *knet_open(const char *fn, const char *mode) HTS_DEPRECATED("Please use hopen instead"); 85 | 86 | /* 87 | This only works with local files. 88 | */ 89 | HTSLIB_EXPORT 90 | knetFile *knet_dopen(int fd, const char *mode) HTS_DEPRECATED("Please use hdopen instead"); 91 | 92 | /* 93 | If ->is_ready==0, this routine updates ->fd; otherwise, it simply 94 | reads from ->fd. 95 | */ 96 | HTSLIB_EXPORT 97 | ssize_t knet_read(knetFile *fp, void *buf, size_t len) HTS_DEPRECATED("Please use hread instead"); 98 | 99 | /* 100 | This routine only sets ->offset and ->is_ready=0. It does not 101 | communicate with the FTP server. 102 | */ 103 | HTSLIB_EXPORT 104 | off_t knet_seek(knetFile *fp, off_t off, int whence) HTS_DEPRECATED("Please use hseek instead"); 105 | HTSLIB_EXPORT 106 | int knet_close(knetFile *fp) HTS_DEPRECATED("Please use hclose instead"); 107 | 108 | #ifdef __cplusplus 109 | } 110 | #endif 111 | 112 | #ifdef HTSLIB_SSIZE_T 113 | #undef HTSLIB_SSIZE_T 114 | #undef ssize_t 115 | #endif 116 | 117 | #endif 118 | -------------------------------------------------------------------------------- /include/hts_log.h: -------------------------------------------------------------------------------- 1 | /// \file htslib/hts_log.h 2 | /// Configuration of log levels. 3 | /* The MIT License 4 | Copyright (C) 2017 Genome Research Ltd. 
5 | 6 | Author: Anders Kaplan 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining 9 | a copy of this software and associated documentation files (the 10 | "Software"), to deal in the Software without restriction, including 11 | without limitation the rights to use, copy, modify, merge, publish, 12 | distribute, sublicense, and/or sell copies of the Software, and to 13 | permit persons to whom the Software is furnished to do so, subject to 14 | the following conditions: 15 | 16 | The above copyright notice and this permission notice shall be 17 | included in all copies or substantial portions of the Software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 20 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 21 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 22 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 23 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 24 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 25 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 26 | SOFTWARE. 27 | */ 28 | 29 | #ifndef HTS_LOG_H 30 | #define HTS_LOG_H 31 | 32 | #include "hts_defs.h" 33 | 34 | #ifdef __cplusplus 35 | extern "C" { 36 | #endif 37 | 38 | /// Log levels. 39 | enum htsLogLevel { 40 | HTS_LOG_OFF, ///< All logging disabled. 41 | HTS_LOG_ERROR, ///< Logging of errors only. 42 | HTS_LOG_WARNING = 3, ///< Logging of errors and warnings. 43 | HTS_LOG_INFO, ///< Logging of errors, warnings, and normal but significant events. 44 | HTS_LOG_DEBUG, ///< Logging of all except the most detailed debug events. 45 | HTS_LOG_TRACE ///< All logging enabled. 46 | }; 47 | 48 | /// Sets the selected log level. 49 | HTSLIB_EXPORT 50 | void hts_set_log_level(enum htsLogLevel level); 51 | 52 | /// Gets the selected log level. 53 | HTSLIB_EXPORT 54 | enum htsLogLevel hts_get_log_level(void); 55 | 56 | /// Selected log level. 57 | /*! 
58 | * One of the HTS_LOG_* values. The default is HTS_LOG_WARNING. 59 | * \note Avoid direct use of this variable. Use hts_set_log_level and hts_get_log_level instead. 60 | */ 61 | HTSLIB_EXPORT 62 | extern int hts_verbose; 63 | 64 | /*! Logs an event. 65 | * \param severity Severity of the event: 66 | * - HTS_LOG_ERROR means that something went wrong so that a task could not be completed. 67 | * - HTS_LOG_WARNING means that something unexpected happened, but that execution can continue, perhaps in a degraded mode. 68 | * - HTS_LOG_INFO means that something normal but significant happened. 69 | * - HTS_LOG_DEBUG means that something normal and insignificant happened. 70 | * - HTS_LOG_TRACE means that something happened that might be of interest when troubleshooting. 71 | * \param context Context where the event occurred. Typically set to "__func__". 72 | * \param format Format string with placeholders, like printf. 73 | */ 74 | HTSLIB_EXPORT 75 | void hts_log(enum htsLogLevel severity, const char *context, const char *format, ...) 76 | HTS_FORMAT(HTS_PRINTF_FMT, 3, 4); 77 | 78 | /*! Logs an event with severity HTS_LOG_ERROR and default context. Parameters: format, ... */ 79 | #define hts_log_error(...) hts_log(HTS_LOG_ERROR, __func__, __VA_ARGS__) 80 | 81 | /*! Logs an event with severity HTS_LOG_WARNING and default context. Parameters: format, ... */ 82 | #define hts_log_warning(...) hts_log(HTS_LOG_WARNING, __func__, __VA_ARGS__) 83 | 84 | /*! Logs an event with severity HTS_LOG_INFO and default context. Parameters: format, ... */ 85 | #define hts_log_info(...) hts_log(HTS_LOG_INFO, __func__, __VA_ARGS__) 86 | 87 | /*! Logs an event with severity HTS_LOG_DEBUG and default context. Parameters: format, ... */ 88 | #define hts_log_debug(...) hts_log(HTS_LOG_DEBUG, __func__, __VA_ARGS__) 89 | 90 | /*! Logs an event with severity HTS_LOG_TRACE and default context. Parameters: format, ... */ 91 | #define hts_log_trace(...) 
hts_log(HTS_LOG_TRACE, __func__, __VA_ARGS__) 92 | 93 | #ifdef __cplusplus 94 | } 95 | #endif 96 | 97 | #endif // #ifndef HTS_LOG_H 98 | -------------------------------------------------------------------------------- /include/khash_str2int.h: -------------------------------------------------------------------------------- 1 | /* khash_str2int.h -- C-string to integer hash table. 2 | 3 | Copyright (C) 2013-2014,2020 Genome Research Ltd. 4 | 5 | Author: Petr Danecek 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in 15 | all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 22 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. */ 24 | 25 | #ifndef HTSLIB_KHASH_STR2INT_H 26 | #define HTSLIB_KHASH_STR2INT_H 27 | 28 | #include "khash.h" 29 | 30 | KHASH_MAP_INIT_STR(str2int, int) 31 | 32 | /* 33 | * Wrappers for khash dictionaries used by mpileup. 
34 | */ 35 | 36 | static inline void *khash_str2int_init(void) 37 | { 38 | return kh_init(str2int); 39 | } 40 | 41 | /* 42 | * Destroy the hash structure, but not the keys 43 | */ 44 | static inline void khash_str2int_destroy(void *_hash) 45 | { 46 | khash_t(str2int) *hash = (khash_t(str2int)*)_hash; 47 | if (hash) kh_destroy(str2int, hash); // Note that strings are not freed. 48 | } 49 | 50 | /* 51 | * Destroys both the hash structure and the keys 52 | */ 53 | static inline void khash_str2int_destroy_free(void *_hash) 54 | { 55 | khash_t(str2int) *hash = (khash_t(str2int)*)_hash; 56 | khint_t k; 57 | if (hash == 0) return; 58 | for (k = 0; k < kh_end(hash); ++k) 59 | if (kh_exist(hash, k)) free((char*)kh_key(hash, k)); 60 | kh_destroy(str2int, hash); 61 | } 62 | 63 | /* 64 | * Returns 1 if key exists or 0 if not 65 | */ 66 | static inline int khash_str2int_has_key(void *_hash, const char *str) 67 | { 68 | khash_t(str2int) *hash = (khash_t(str2int)*)_hash; 69 | khint_t k = kh_get(str2int, hash, str); 70 | if ( k == kh_end(hash) ) return 0; 71 | return 1; 72 | } 73 | 74 | /* 75 | * Returns 0 on success and -1 when the key is not present. On success, 76 | * *value is set, unless NULL is passed. 77 | */ 78 | static inline int khash_str2int_get(void *_hash, const char *str, int *value) 79 | { 80 | khash_t(str2int) *hash = (khash_t(str2int)*)_hash; 81 | khint_t k; 82 | if ( !hash ) return -1; 83 | k = kh_get(str2int, hash, str); 84 | if ( k == kh_end(hash) ) return -1; 85 | if ( !value ) return 0; 86 | *value = kh_val(hash, k); 87 | return 0; 88 | } 89 | 90 | /* 91 | * Add a new string to the dictionary, auto-incrementing the value. 92 | * On success returns the newly inserted integer id, on error -1 93 | * is returned. Note that the key must continue to exist throughout 94 | * the whole life of _hash. 
95 | */ 96 | static inline int khash_str2int_inc(void *_hash, const char *str) 97 | { 98 | khint_t k; 99 | int ret; 100 | khash_t(str2int) *hash = (khash_t(str2int)*)_hash; 101 | if ( !hash ) return -1; 102 | k = kh_put(str2int, hash, str, &ret); 103 | if (ret < 0) return -1; 104 | if (ret == 0) return kh_val(hash, k); 105 | kh_val(hash, k) = kh_size(hash) - 1; 106 | return kh_val(hash, k); 107 | } 108 | 109 | /* 110 | * Set a new key,value pair. On success returns the bin index, on 111 | * error -1 is returned. Note that the key must continue to exist 112 | * throughout the whole life of _hash. 113 | */ 114 | static inline int khash_str2int_set(void *_hash, const char *str, int value) 115 | { 116 | khint_t k; 117 | int ret; 118 | khash_t(str2int) *hash = (khash_t(str2int)*)_hash; 119 | if ( !hash ) return -1; 120 | k = kh_put(str2int, hash, str, &ret); 121 | if (ret < 0) return -1; 122 | kh_val(hash,k) = value; 123 | return k; 124 | } 125 | 126 | /* 127 | * Return the number of keys in the hash table. 
128 | */ 129 | static inline int khash_str2int_size(void *_hash) 130 | { 131 | khash_t(str2int) *hash = (khash_t(str2int)*)_hash; 132 | return kh_size(hash); 133 | } 134 | 135 | #endif 136 | -------------------------------------------------------------------------------- /examples/sample_sra.list: -------------------------------------------------------------------------------- 1 | SRR28578485 2 | SRR28578484 3 | SRR28578303 4 | SRR28578326 5 | SRR28578283 6 | SRR28578272 7 | SRR28578261 8 | SRR28578218 9 | SRR28578207 10 | SRR28578196 11 | SRR28578483 12 | SRR28578472 13 | SRR28578461 14 | SRR28578450 15 | SRR28578439 16 | SRR28578428 17 | SRR28578379 18 | SRR28578368 19 | SRR28578357 20 | SRR28578314 21 | SRR28578302 22 | SRR28578291 23 | SRR28578248 24 | SRR28578237 25 | SRR28578226 26 | SRR28578413 27 | SRR28578402 28 | SRR28578391 29 | SRR28578342 30 | SRR28578331 31 | SRR28578325 32 | SRR28578324 33 | SRR28578323 34 | SRR28578322 35 | SRR28578321 36 | SRR28578320 37 | SRR28578287 38 | SRR28578286 39 | SRR28578285 40 | SRR28578284 41 | SRR28578282 42 | SRR28578281 43 | SRR28578280 44 | SRR28578279 45 | SRR28578278 46 | SRR28578277 47 | SRR28578276 48 | SRR28578275 49 | SRR28578274 50 | SRR28578273 51 | SRR28578271 52 | SRR28578270 53 | SRR28578269 54 | SRR28578268 55 | SRR28578267 56 | SRR28578266 57 | SRR28578265 58 | SRR28578264 59 | SRR28578263 60 | SRR28578262 61 | SRR28578260 62 | SRR28578259 63 | SRR28578258 64 | SRR28578257 65 | SRR28578256 66 | SRR28578223 67 | SRR28578222 68 | SRR28578221 69 | SRR28578220 70 | SRR28578219 71 | SRR28578217 72 | SRR28578216 73 | SRR28578215 74 | SRR28578214 75 | SRR28578213 76 | SRR28578212 77 | SRR28578211 78 | SRR28578210 79 | SRR28578209 80 | SRR28578208 81 | SRR28578206 82 | SRR28578205 83 | SRR28578204 84 | SRR28578203 85 | SRR28578202 86 | SRR28578201 87 | SRR28578200 88 | SRR28578199 89 | SRR28578198 90 | SRR28578197 91 | SRR28578195 92 | SRR28578194 93 | SRR28578193 94 | SRR28578192 95 | SRR28578389 96 | 
SRR28578388 97 | SRR28578387 98 | SRR28578386 99 | SRR28578385 100 | SRR28578384 101 | SRR28578482 102 | SRR28578481 103 | SRR28578480 104 | SRR28578479 105 | SRR28578478 106 | SRR28578477 107 | SRR28578476 108 | SRR28578475 109 | SRR28578474 110 | SRR28578473 111 | SRR28578471 112 | SRR28578470 113 | SRR28578469 114 | SRR28578468 115 | SRR28578467 116 | SRR28578466 117 | SRR28578465 118 | SRR28578464 119 | SRR28578463 120 | SRR28578462 121 | SRR28578460 122 | SRR28578459 123 | SRR28578458 124 | SRR28578457 125 | SRR28578456 126 | SRR28578455 127 | SRR28578454 128 | SRR28578453 129 | SRR28578452 130 | SRR28578451 131 | SRR28578449 132 | SRR28578448 133 | SRR28578447 134 | SRR28578446 135 | SRR28578445 136 | SRR28578444 137 | SRR28578443 138 | SRR28578442 139 | SRR28578441 140 | SRR28578440 141 | SRR28578438 142 | SRR28578437 143 | SRR28578436 144 | SRR28578435 145 | SRR28578434 146 | SRR28578433 147 | SRR28578432 148 | SRR28578431 149 | SRR28578430 150 | SRR28578429 151 | SRR28578427 152 | SRR28578426 153 | SRR28578425 154 | SRR28578424 155 | SRR28578423 156 | SRR28578422 157 | SRR28578383 158 | SRR28578382 159 | SRR28578381 160 | SRR28578380 161 | SRR28578378 162 | SRR28578377 163 | SRR28578376 164 | SRR28578375 165 | SRR28578374 166 | SRR28578373 167 | SRR28578372 168 | SRR28578371 169 | SRR28578370 170 | SRR28578369 171 | SRR28578367 172 | SRR28578366 173 | SRR28578365 174 | SRR28578364 175 | SRR28578363 176 | SRR28578362 177 | SRR28578361 178 | SRR28578360 179 | SRR28578359 180 | SRR28578358 181 | SRR28578356 182 | SRR28578355 183 | SRR28578354 184 | SRR28578353 185 | SRR28578352 186 | SRR28578319 187 | SRR28578318 188 | SRR28578317 189 | SRR28578316 190 | SRR28578315 191 | SRR28578313 192 | SRR28578312 193 | SRR28578311 194 | SRR28578310 195 | SRR28578309 196 | SRR28578308 197 | SRR28578307 198 | SRR28578306 199 | SRR28578305 200 | SRR28578304 201 | SRR28578301 202 | SRR28578300 203 | SRR28578299 204 | SRR28578298 205 | SRR28578297 206 | SRR28578296 207 | 
SRR28578295 208 | SRR28578294 209 | SRR28578293 210 | SRR28578292 211 | SRR28578290 212 | SRR28578289 213 | SRR28578288 214 | SRR28578255 215 | SRR28578254 216 | SRR28578253 217 | SRR28578252 218 | SRR28578251 219 | SRR28578250 220 | SRR28578249 221 | SRR28578247 222 | SRR28578246 223 | SRR28578245 224 | SRR28578244 225 | SRR28578243 226 | SRR28578242 227 | SRR28578241 228 | SRR28578240 229 | SRR28578239 230 | SRR28578238 231 | SRR28578236 232 | SRR28578235 233 | SRR28578234 234 | SRR28578233 235 | SRR28578232 236 | SRR28578231 237 | SRR28578230 238 | SRR28578229 239 | SRR28578228 240 | SRR28578227 241 | SRR28578225 242 | SRR28578224 243 | SRR28578421 244 | SRR28578420 245 | SRR28578419 246 | SRR28578418 247 | SRR28578417 248 | SRR28578416 249 | SRR28578415 250 | SRR28578414 251 | SRR28578412 252 | SRR28578411 253 | SRR28578410 254 | SRR28578409 255 | SRR28578408 256 | SRR28578407 257 | SRR28578406 258 | SRR28578405 259 | SRR28578404 260 | SRR28578403 261 | SRR28578401 262 | SRR28578400 263 | SRR28578399 264 | SRR28578398 265 | SRR28578397 266 | SRR28578396 267 | SRR28578395 268 | SRR28578394 269 | SRR28578393 270 | SRR28578392 271 | SRR28578390 272 | SRR28578351 273 | SRR28578350 274 | SRR28578349 275 | SRR28578348 276 | SRR28578347 277 | SRR28578346 278 | SRR28578345 279 | SRR28578344 280 | SRR28578343 281 | SRR28578341 282 | SRR28578340 283 | SRR28578339 284 | SRR28578338 285 | SRR28578337 286 | SRR28578336 287 | SRR28578335 288 | SRR28578334 289 | SRR28578333 290 | SRR28578332 291 | SRR28578330 292 | SRR28578329 293 | SRR28578328 294 | SRR28578327 295 | -------------------------------------------------------------------------------- /include/hts_defs.h: -------------------------------------------------------------------------------- 1 | /* hts_defs.h -- Miscellaneous definitions. 2 | 3 | Copyright (C) 2013-2015,2017, 2019-2020 Genome Research Ltd. 
4 | 5 | Author: John Marshall 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in 15 | all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 22 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. 
*/ 24 | 25 | #ifndef HTSLIB_HTS_DEFS_H 26 | #define HTSLIB_HTS_DEFS_H 27 | 28 | #if defined __MINGW32__ 29 | #include // For __MINGW_PRINTF_FORMAT macro 30 | #endif 31 | 32 | #ifdef __clang__ 33 | #ifdef __has_attribute 34 | #define HTS_COMPILER_HAS(attribute) __has_attribute(attribute) 35 | #endif 36 | 37 | #ifdef __has_builtin 38 | #define HTS_COMPILER_HAS_BUILTIN(function) __has_builtin(function) 39 | #endif 40 | 41 | #elif defined __GNUC__ 42 | #define HTS_GCC_AT_LEAST(major, minor) \ 43 | (__GNUC__ > (major) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor))) 44 | #endif 45 | 46 | #ifndef HTS_COMPILER_HAS 47 | #define HTS_COMPILER_HAS(attribute) 0 48 | #endif 49 | #ifndef HTS_COMPILER_HAS_BUILTIN 50 | #define HTS_COMPILER_HAS_BUILTIN(function) 0 51 | #endif 52 | 53 | #ifndef HTS_GCC_AT_LEAST 54 | #define HTS_GCC_AT_LEAST(major, minor) 0 55 | #endif 56 | 57 | #if HTS_COMPILER_HAS(__nonstring__) || HTS_GCC_AT_LEAST(8,1) 58 | #define HTS_NONSTRING __attribute__ ((__nonstring__)) 59 | #else 60 | #define HTS_NONSTRING 61 | #endif 62 | 63 | #if HTS_COMPILER_HAS(__noreturn__) || HTS_GCC_AT_LEAST(3,0) 64 | #define HTS_NORETURN __attribute__ ((__noreturn__)) 65 | #else 66 | #define HTS_NORETURN 67 | #endif 68 | 69 | // Enable optimisation level 3, especially for gcc. To be used 70 | // where we want to force vectorisation in hot loops and the default -O2 71 | // just doesn't cut it. 
72 | #if HTS_COMPILER_HAS(optimize) || HTS_GCC_AT_LEAST(4,4) 73 | #define HTS_OPT3 __attribute__((optimize("O3"))) 74 | #else 75 | #define HTS_OPT3 76 | #endif 77 | 78 | #if HTS_COMPILER_HAS(aligned) || HTS_GCC_AT_LEAST(4,3) 79 | #define HTS_ALIGN32 __attribute__((aligned(32))) 80 | #else 81 | #define HTS_ALIGN32 82 | #endif 83 | 84 | // GCC introduced warn_unused_result in 3.4 but added -Wno-unused-result later 85 | #if HTS_COMPILER_HAS(__warn_unused_result__) || HTS_GCC_AT_LEAST(4,5) 86 | #define HTS_RESULT_USED __attribute__ ((__warn_unused_result__)) 87 | #else 88 | #define HTS_RESULT_USED 89 | #endif 90 | 91 | #if HTS_COMPILER_HAS(__unused__) || HTS_GCC_AT_LEAST(3,0) 92 | #define HTS_UNUSED __attribute__ ((__unused__)) 93 | #else 94 | #define HTS_UNUSED 95 | #endif 96 | 97 | #if HTS_COMPILER_HAS(__deprecated__) || HTS_GCC_AT_LEAST(4,5) 98 | #define HTS_DEPRECATED(message) __attribute__ ((__deprecated__ (message))) 99 | #elif HTS_GCC_AT_LEAST(3,1) 100 | #define HTS_DEPRECATED(message) __attribute__ ((__deprecated__)) 101 | #else 102 | #define HTS_DEPRECATED(message) 103 | #endif 104 | 105 | #if (HTS_COMPILER_HAS(__deprecated__) || HTS_GCC_AT_LEAST(6,4)) && !defined(__ICC) 106 | #define HTS_DEPRECATED_ENUM(message) __attribute__ ((__deprecated__ (message))) 107 | #else 108 | #define HTS_DEPRECATED_ENUM(message) 109 | #endif 110 | 111 | // On mingw the "printf" format type doesn't work. It needs "gnu_printf" 112 | // in order to check %lld and %z, otherwise it defaults to checking against 113 | // the Microsoft library printf format options despite linking against the 114 | // GNU posix implementation of printf. 
The __MINGW_PRINTF_FORMAT macro 115 | // expands to printf or gnu_printf as required, but obviously may not 116 | // exist 117 | #ifdef __MINGW_PRINTF_FORMAT 118 | #define HTS_PRINTF_FMT __MINGW_PRINTF_FORMAT 119 | #else 120 | #define HTS_PRINTF_FMT printf 121 | #endif 122 | 123 | #if HTS_COMPILER_HAS(__format__) || HTS_GCC_AT_LEAST(3,0) 124 | #define HTS_FORMAT(type, idx, first) __attribute__((__format__ (type, idx, first))) 125 | #else 126 | #define HTS_FORMAT(type, idx, first) 127 | #endif 128 | 129 | #define HTS_COMPILER_HAS_TARGET_AND_BUILTIN_CPU_SUPPORTS \ 130 | ((HTS_COMPILER_HAS(target) && HTS_COMPILER_HAS_BUILTIN(__builtin_cpu_supports)) \ 131 | || HTS_GCC_AT_LEAST(4, 8)) 132 | 133 | #if (defined(__x86_64__) || defined(_M_X64)) 134 | #define HTS_BUILD_IS_X86_64 1 135 | #else 136 | #define HTS_BUILD_IS_X86_64 0 137 | #endif 138 | 139 | #if defined(_WIN32) || defined(__CYGWIN__) 140 | #if defined(HTS_BUILDING_LIBRARY) 141 | #define HTSLIB_EXPORT __declspec(dllexport) 142 | #else 143 | #define HTSLIB_EXPORT 144 | #endif 145 | #elif HTS_COMPILER_HAS(__visibility__) || HTS_GCC_AT_LEAST(4,0) 146 | #define HTSLIB_EXPORT __attribute__((__visibility__("default"))) 147 | #elif defined(__SUNPRO_C) && __SUNPRO_C >= 0x550 148 | #define HTSLIB_EXPORT __global 149 | #else 150 | #define HTSLIB_EXPORT 151 | #endif 152 | 153 | #endif 154 | -------------------------------------------------------------------------------- /include/tbx.h: -------------------------------------------------------------------------------- 1 | /// @file htslib/tbx.h 2 | /// Tabix API functions. 3 | /* 4 | Copyright (C) 2009, 2012-2015, 2019 Genome Research Ltd. 5 | Copyright (C) 2010, 2012 Broad Institute. 
6 | 7 | Author: Heng Li 8 | 9 | Permission is hereby granted, free of charge, to any person obtaining a copy 10 | of this software and associated documentation files (the "Software"), to deal 11 | in the Software without restriction, including without limitation the rights 12 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | copies of the Software, and to permit persons to whom the Software is 14 | furnished to do so, subject to the following conditions: 15 | 16 | The above copyright notice and this permission notice shall be included in 17 | all copies or substantial portions of the Software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | DEALINGS IN THE SOFTWARE. */ 26 | 27 | #ifndef HTSLIB_TBX_H 28 | #define HTSLIB_TBX_H 29 | 30 | #include "hts.h" 31 | 32 | #ifdef __cplusplus 33 | extern "C" { 34 | #endif 35 | 36 | #define TBX_MAX_SHIFT 31 37 | 38 | #define TBX_GENERIC 0 39 | #define TBX_SAM 1 40 | #define TBX_VCF 2 41 | #define TBX_GAF 3 42 | #define TBX_UCSC 0x10000 43 | 44 | typedef struct tbx_conf_t { 45 | int32_t preset; 46 | int32_t sc, bc, ec; // seq col., beg col. and end col. 
47 | int32_t meta_char, line_skip; 48 | } tbx_conf_t; 49 | 50 | typedef struct tbx_t { 51 | tbx_conf_t conf; 52 | hts_idx_t *idx; 53 | void *dict; 54 | } tbx_t; 55 | 56 | HTSLIB_EXPORT 57 | extern const tbx_conf_t tbx_conf_gff, tbx_conf_bed, tbx_conf_psltbl, tbx_conf_sam, tbx_conf_vcf, tbx_conf_gaf; 58 | 59 | #define tbx_itr_destroy(iter) hts_itr_destroy(iter) 60 | #define tbx_itr_queryi(tbx, tid, beg, end) hts_itr_query((tbx)->idx, (tid), (beg), (end), tbx_readrec) 61 | #define tbx_itr_querys(tbx, s) hts_itr_querys((tbx)->idx, (s), (hts_name2id_f)(tbx_name2id), (tbx), hts_itr_query, tbx_readrec) 62 | #define tbx_itr_next(htsfp, tbx, itr, r) hts_itr_next(hts_get_bgzfp(htsfp), (itr), (r), (tbx)) 63 | #define tbx_bgzf_itr_next(bgzfp, tbx, itr, r) hts_itr_next((bgzfp), (itr), (r), (tbx)) 64 | 65 | HTSLIB_EXPORT 66 | int tbx_name2id(tbx_t *tbx, const char *ss); 67 | 68 | /* Internal helper function used by tbx_itr_next() */ 69 | HTSLIB_EXPORT 70 | BGZF *hts_get_bgzfp(htsFile *fp); 71 | 72 | HTSLIB_EXPORT 73 | int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, hts_pos_t *beg, hts_pos_t *end); 74 | 75 | /// Build an index of the lines in a BGZF-compressed file 76 | /** The index struct returned by a successful call should be freed 77 | via tbx_destroy() when it is no longer needed. 
78 | */ 79 | HTSLIB_EXPORT 80 | tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf); 81 | /* 82 | * All tbx_index_build* methods return: 0 (success), -1 (general failure) or -2 (compression not BGZF) 83 | */ 84 | HTSLIB_EXPORT 85 | int tbx_index_build(const char *fn, int min_shift, const tbx_conf_t *conf); 86 | 87 | HTSLIB_EXPORT 88 | int tbx_index_build2(const char *fn, const char *fnidx, int min_shift, const tbx_conf_t *conf); 89 | 90 | HTSLIB_EXPORT 91 | int tbx_index_build3(const char *fn, const char *fnidx, int min_shift, int n_threads, const tbx_conf_t *conf); 92 | 93 | 94 | /// Load or stream a .tbi or .csi index 95 | /** @param fn Name of the data file corresponding to the index 96 | 97 | Equivalent to tbx_index_load3(fn, NULL, HTS_IDX_SAVE_REMOTE); 98 | */ 99 | HTSLIB_EXPORT 100 | tbx_t *tbx_index_load(const char *fn); 101 | 102 | /// Load or stream a .tbi or .csi index 103 | /** @param fn Name of the data file corresponding to the index 104 | @param fnidx Name of the index file 105 | @return The index, or NULL if an error occurred 106 | 107 | If @p fnidx is NULL, the index name will be derived from @p fn. 108 | 109 | Equivalent to tbx_index_load3(fn, fnidx, HTS_IDX_SAVE_REMOTE); 110 | */ 111 | HTSLIB_EXPORT 112 | tbx_t *tbx_index_load2(const char *fn, const char *fnidx); 113 | 114 | /// Load or stream a .tbi or .csi index 115 | /** @param fn Name of the data file corresponding to the index 116 | @param fnidx Name of the index file 117 | @param flags Flags to alter behaviour (see description) 118 | @return The index, or NULL if an error occurred 119 | 120 | If @p fnidx is NULL, the index name will be derived from @p fn.
121 | 122 | The @p flags parameter can be set to a combination of the following 123 | values: 124 | 125 | HTS_IDX_SAVE_REMOTE Save a local copy of any remote indexes 126 | HTS_IDX_SILENT_FAIL Fail silently if the index is not present 127 | 128 | The index struct returned by a successful call should be freed 129 | via tbx_destroy() when it is no longer needed. 130 | */ 131 | HTSLIB_EXPORT 132 | tbx_t *tbx_index_load3(const char *fn, const char *fnidx, int flags); 133 | 134 | HTSLIB_EXPORT 135 | const char **tbx_seqnames(tbx_t *tbx, int *n); // free the array but not the values 136 | 137 | HTSLIB_EXPORT 138 | void tbx_destroy(tbx_t *tbx); 139 | 140 | #ifdef __cplusplus 141 | } 142 | #endif 143 | 144 | #endif 145 | -------------------------------------------------------------------------------- /include/klist.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008-2009, by Attractive Chaos 4 | Copyright (C) 2013, 2015 Genome Research Ltd. 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining 7 | a copy of this software and associated documentation files (the 8 | "Software"), to deal in the Software without restriction, including 9 | without limitation the rights to use, copy, modify, merge, publish, 10 | distribute, sublicense, and/or sell copies of the Software, and to 11 | permit persons to whom the Software is furnished to do so, subject to 12 | the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be 15 | included in all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 21 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 22 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 23 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | SOFTWARE. 25 | */ 26 | 27 | #ifndef _AC_KLIST_H 28 | #define _AC_KLIST_H 29 | 30 | #include 31 | 32 | #ifndef klib_unused 33 | #if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) 34 | #define klib_unused __attribute__ ((__unused__)) 35 | #else 36 | #define klib_unused 37 | #endif 38 | #endif /* klib_unused */ 39 | 40 | #define KMEMPOOL_INIT2(SCOPE, name, kmptype_t, kmpfree_f) \ 41 | typedef struct { \ 42 | size_t cnt, n, max; \ 43 | kmptype_t **buf; \ 44 | } kmp_##name##_t; \ 45 | SCOPE kmp_##name##_t *kmp_init_##name(void) { \ 46 | return (kmp_##name##_t *)calloc(1, sizeof(kmp_##name##_t)); \ 47 | } \ 48 | SCOPE void kmp_destroy_##name(kmp_##name##_t *mp) { \ 49 | size_t k; \ 50 | for (k = 0; k < mp->n; ++k) { \ 51 | kmpfree_f(mp->buf[k]); free(mp->buf[k]); \ 52 | } \ 53 | free(mp->buf); free(mp); \ 54 | } \ 55 | SCOPE kmptype_t *kmp_alloc_##name(kmp_##name##_t *mp) { \ 56 | ++mp->cnt; \ 57 | if (mp->n == 0) return (kmptype_t *)calloc(1, sizeof(kmptype_t)); \ 58 | return mp->buf[--mp->n]; \ 59 | } \ 60 | SCOPE void kmp_free_##name(kmp_##name##_t *mp, kmptype_t *p) { \ 61 | --mp->cnt; \ 62 | if (mp->n == mp->max) { \ 63 | mp->max = mp->max? 
mp->max<<1 : 16; \ 64 | mp->buf = (kmptype_t **)realloc(mp->buf, sizeof(kmptype_t *) * mp->max); \ 65 | } \ 66 | mp->buf[mp->n++] = p; \ 67 | } 68 | 69 | #define KMEMPOOL_INIT(name, kmptype_t, kmpfree_f) \ 70 | KMEMPOOL_INIT2(static inline klib_unused, name, kmptype_t, kmpfree_f) 71 | 72 | #define kmempool_t(name) kmp_##name##_t 73 | #define kmp_init(name) kmp_init_##name() 74 | #define kmp_destroy(name, mp) kmp_destroy_##name(mp) 75 | #define kmp_alloc(name, mp) kmp_alloc_##name(mp) 76 | #define kmp_free(name, mp, p) kmp_free_##name(mp, p) 77 | 78 | #define KLIST_INIT2(SCOPE, name, kltype_t, kmpfree_t) \ 79 | struct __kl1_##name { \ 80 | kltype_t data; \ 81 | struct __kl1_##name *next; \ 82 | }; \ 83 | typedef struct __kl1_##name kl1_##name; \ 84 | KMEMPOOL_INIT2(SCOPE, name, kl1_##name, kmpfree_t) \ 85 | typedef struct { \ 86 | kl1_##name *head, *tail; \ 87 | kmp_##name##_t *mp; \ 88 | size_t size; \ 89 | } kl_##name##_t; \ 90 | SCOPE kl_##name##_t *kl_init_##name(void) { \ 91 | kl_##name##_t *kl = (kl_##name##_t *)calloc(1, sizeof(kl_##name##_t)); \ 92 | kl->mp = kmp_init(name); \ 93 | kl->head = kl->tail = kmp_alloc(name, kl->mp); \ 94 | kl->head->next = 0; \ 95 | return kl; \ 96 | } \ 97 | SCOPE void kl_destroy_##name(kl_##name##_t *kl) { \ 98 | kl1_##name *p; \ 99 | for (p = kl->head; p != kl->tail; p = p->next) \ 100 | kmp_free(name, kl->mp, p); \ 101 | kmp_free(name, kl->mp, p); \ 102 | kmp_destroy(name, kl->mp); \ 103 | free(kl); \ 104 | } \ 105 | SCOPE kltype_t *kl_pushp_##name(kl_##name##_t *kl) { \ 106 | kl1_##name *q, *p = kmp_alloc(name, kl->mp); \ 107 | q = kl->tail; p->next = 0; kl->tail->next = p; kl->tail = p; \ 108 | ++kl->size; \ 109 | return &q->data; \ 110 | } \ 111 | SCOPE int kl_shift_##name(kl_##name##_t *kl, kltype_t *d) { \ 112 | kl1_##name *p; \ 113 | if (kl->head->next == 0) return -1; \ 114 | --kl->size; \ 115 | p = kl->head; kl->head = kl->head->next; \ 116 | if (d) *d = p->data; \ 117 | kmp_free(name, kl->mp, p); \ 118 | return 0; \ 
119 | } 120 | 121 | #define KLIST_INIT(name, kltype_t, kmpfree_t) \ 122 | KLIST_INIT2(static inline klib_unused, name, kltype_t, kmpfree_t) 123 | 124 | #define kliter_t(name) kl1_##name 125 | #define klist_t(name) kl_##name##_t 126 | #define kl_val(iter) ((iter)->data) 127 | #define kl_next(iter) ((iter)->next) 128 | #define kl_begin(kl) ((kl)->head) 129 | #define kl_end(kl) ((kl)->tail) 130 | 131 | #define kl_init(name) kl_init_##name() 132 | #define kl_destroy(name, kl) kl_destroy_##name(kl) 133 | #define kl_pushp(name, kl) kl_pushp_##name(kl) 134 | #define kl_shift(name, kl, d) kl_shift_##name(kl, d) 135 | 136 | #endif 137 | -------------------------------------------------------------------------------- /include/kbitset.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (C) 2015, 2018 Genome Research Ltd. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | #ifndef KBITSET_H 27 | #define KBITSET_H 28 | 29 | /* Example of using kbitset_t, which represents a subset of {0,..., N-1}, 30 | where N is the size specified in kbs_init(). 31 | 32 | kbitset_t *bset = kbs_init(100); 33 | kbs_insert(bset, 5); 34 | kbs_insert(bset, 68); 35 | kbs_delete(bset, 37); 36 | // ... 37 | 38 | if (kbs_exists(bset, 68)) printf("68 present\n"); 39 | 40 | kbitset_iter_t itr; 41 | int i; 42 | kbs_start(&itr); 43 | while ((i = kbs_next(bset, &itr)) >= 0) 44 | printf("%d present\n", i); 45 | 46 | kbs_destroy(bset); 47 | 48 | Example of declaring a kbitset_t-using function in a header file, so that 49 | only source files that actually use process() need to include : 50 | 51 | struct kbitset_t; 52 | void process(struct kbitset_t *bset); 53 | */ 54 | 55 | #include 56 | #include 57 | #include 58 | 59 | #define KBS_ELTBITS (CHAR_BIT * sizeof (unsigned long)) 60 | #define KBS_ELT(i) ((i) / KBS_ELTBITS) 61 | #define KBS_MASK(i) (1UL << ((i) % KBS_ELTBITS)) 62 | 63 | typedef struct kbitset_t { 64 | size_t n, n_max; 65 | unsigned long b[1]; 66 | } kbitset_t; 67 | 68 | // (For internal use only.) Returns a mask (like 00011111) showing 69 | // which bits are in use in the last slot (for the given ni) set. 70 | static inline unsigned long kbs_last_mask(size_t ni) 71 | { 72 | unsigned long mask = KBS_MASK(ni) - 1; 73 | return mask? mask : ~0UL; 74 | } 75 | 76 | // Initialise a bit set capable of holding ni integers, 0 <= i < ni. 77 | // The set returned is empty if fill == 0, or all of [0,ni) otherwise. 
78 | static inline kbitset_t *kbs_init2(size_t ni, int fill) 79 | { 80 | size_t n = (ni + KBS_ELTBITS-1) / KBS_ELTBITS; 81 | kbitset_t *bs = 82 | (kbitset_t *) malloc(sizeof(kbitset_t) + n * sizeof(unsigned long)); 83 | if (bs == NULL) return NULL; 84 | bs->n = bs->n_max = n; 85 | memset(bs->b, fill? ~0 : 0, n * sizeof (unsigned long)); 86 | // b[n] is always non-zero (a fact used by kbs_next()). 87 | bs->b[n] = kbs_last_mask(ni); 88 | if (fill) bs->b[n-1] &= bs->b[n]; 89 | return bs; 90 | } 91 | 92 | // Initialise an empty bit set capable of holding ni integers, 0 <= i < ni. 93 | static inline kbitset_t *kbs_init(size_t ni) 94 | { 95 | return kbs_init2(ni, 0); 96 | } 97 | 98 | // Resize an existing bit set to be capable of holding ni_new integers. 99 | // Elements in [ni_old,ni_new) are added to the set if fill != 0. 100 | static inline int kbs_resize2(kbitset_t **bsp, size_t ni_new, int fill) 101 | { 102 | kbitset_t *bs = *bsp; 103 | size_t n = bs? bs->n : 0; 104 | size_t n_new = (ni_new + KBS_ELTBITS-1) / KBS_ELTBITS; 105 | if (bs == NULL || n_new > bs->n_max) { 106 | bs = (kbitset_t *) 107 | realloc(*bsp, sizeof(kbitset_t) + n_new * sizeof(unsigned long)); 108 | if (bs == NULL) return -1; 109 | 110 | bs->n_max = n_new; 111 | *bsp = bs; 112 | } 113 | 114 | bs->n = n_new; 115 | if (n_new >= n) 116 | memset(&bs->b[n], fill? ~0 : 0, (n_new - n) * sizeof (unsigned long)); 117 | bs->b[n_new] = kbs_last_mask(ni_new); 118 | // Need to clear excess bits when fill!=0 or n_newb[n_new-1] &= bs->b[n_new]; 120 | return 0; 121 | } 122 | 123 | // Resize an existing bit set to be capable of holding ni_new integers. 124 | // Returns negative on error. 125 | static inline int kbs_resize(kbitset_t **bsp, size_t ni_new) 126 | { 127 | return kbs_resize2(bsp, ni_new, 0); 128 | } 129 | 130 | // Destroy a bit set. 131 | static inline void kbs_destroy(kbitset_t *bs) 132 | { 133 | free(bs); 134 | } 135 | 136 | // Reset the bit set to empty. 
137 | static inline void kbs_clear(kbitset_t *bs) 138 | { 139 | memset(bs->b, 0, bs->n * sizeof (unsigned long)); 140 | } 141 | 142 | // Reset the bit set to all of [0,ni). 143 | static inline void kbs_insert_all(kbitset_t *bs) 144 | { 145 | memset(bs->b, ~0, bs->n * sizeof (unsigned long)); 146 | bs->b[bs->n-1] &= bs->b[bs->n]; 147 | } 148 | 149 | // Insert an element into the bit set. 150 | static inline void kbs_insert(kbitset_t *bs, int i) 151 | { 152 | bs->b[KBS_ELT(i)] |= KBS_MASK(i); 153 | } 154 | 155 | // Remove an element from the bit set. 156 | static inline void kbs_delete(kbitset_t *bs, int i) 157 | { 158 | bs->b[KBS_ELT(i)] &= ~KBS_MASK(i); 159 | } 160 | 161 | // Test whether the bit set contains the element. 162 | static inline int kbs_exists(const kbitset_t *bs, int i) 163 | { 164 | return (bs->b[KBS_ELT(i)] & KBS_MASK(i)) != 0; 165 | } 166 | 167 | typedef struct kbitset_iter_t { 168 | unsigned long mask; 169 | size_t elt; 170 | int i; 171 | } kbitset_iter_t; 172 | 173 | // Initialise or reset a bit set iterator. 174 | static inline void kbs_start(kbitset_iter_t *itr) 175 | { 176 | itr->mask = 1; 177 | itr->elt = 0; 178 | itr->i = 0; 179 | } 180 | 181 | // Return the next element contained in the bit set, or -1 if there are no more. 182 | static inline int kbs_next(const kbitset_t *bs, kbitset_iter_t *itr) 183 | { 184 | unsigned long b = bs->b[itr->elt]; 185 | 186 | for (;;) { 187 | if (itr->mask == 0) { 188 | while ((b = bs->b[++itr->elt]) == 0) itr->i += KBS_ELTBITS; 189 | if (itr->elt == bs->n) return -1; 190 | itr->mask = 1; 191 | } 192 | 193 | if (b & itr->mask) break; 194 | 195 | itr->i++; 196 | itr->mask <<= 1; 197 | } 198 | 199 | itr->mask <<= 1; 200 | return itr->i++; 201 | } 202 | 203 | #endif 204 | -------------------------------------------------------------------------------- /include/hts_expr.h: -------------------------------------------------------------------------------- 1 | /* expr.c -- filter expression parsing and processing. 
2 | 3 | Copyright (C) 2020, 2022 Genome Research Ltd. 4 | 5 | Author: James Bonfield 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notices and this permission notice shall be included in 15 | all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 22 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. */ 24 | 25 | #ifndef HTS_EXPR_H 26 | #define HTS_EXPR_H 27 | 28 | #include 29 | #include "kstring.h" 30 | #include "hts_defs.h" 31 | 32 | /// Holds a filter variable. This is also used to return the results. 33 | /** 34 | * The expression language has 3-states of string, numeric, and unknown. 35 | * The unknown state is either a NaN numeric or a null string, with both 36 | * internally considered to have the same "unknown" meaning. 37 | * 38 | * These largely match the IEE 754 semantics for NaN comparisons: <, >, ==, 39 | * != all fail, (even NaN == NaN). Similarly arithmetic (+,-,/,*,%) with 40 | * unknown values are still unknown (and false). 41 | * 42 | * The departure from NaN semantics though is that our unknown/null state is 43 | * considered to be false while NaN in C is true. 
Similarly the false nature 44 | * of our unknown state meants !val becomes true, !!val is once again false, 45 | * val && 1 is false, val || 0 is false, and val || 1 is true along with 46 | * !val || 0 and !val && 1. 47 | * 48 | * Note it is possible for empty strings and zero numbers to also be true. 49 | * An example of this is the aux string '[NM]' which returns true if the 50 | * NM tag is found, regardless of whether it is also zero. However the 51 | * better approach added in 1.16 is 'exists([NM])'. 52 | */ 53 | typedef struct hts_expr_val_t { 54 | char is_str; // Use .s vs .d 55 | char is_true; // Force true if even zero 56 | kstring_t s; // is_str and empty s permitted (eval as false) 57 | double d; // otherwise this 58 | } hts_expr_val_t; 59 | 60 | /// Returns true if an hts_expr_val_t is defined. 61 | /* An example usage of this is in the SAM expression filter where an 62 | * [X0] aux tag will be the value of X0 (string or numeric) if set, or 63 | * a false nul-string (not the same as an empty one) when not set. 64 | */ 65 | static inline int hts_expr_val_exists(hts_expr_val_t *v) { 66 | return v && !(v->is_str == 1 && v->s.s == NULL) 67 | && !(v->is_str == 0 && isnan(v->d)); 68 | } 69 | 70 | /// Returns true if an hts_expr_val_t is defined or is undef-but-true 71 | static inline int hts_expr_val_existsT(hts_expr_val_t *v) { 72 | return (v && v->is_true) || hts_expr_val_exists(v); 73 | } 74 | 75 | /// Set a value to be undefined (nan). 76 | static inline void hts_expr_val_undef(hts_expr_val_t *v) { 77 | ks_clear(&v->s); 78 | v->is_true = 0; 79 | v->is_str = 0; 80 | v->d = NAN; 81 | } 82 | 83 | /// Frees a hts_expr_val_t type. 84 | static inline void hts_expr_val_free(hts_expr_val_t *f) { 85 | ks_free(&f->s); 86 | } 87 | 88 | /// Opaque hts_filter_t type. 
/// Definition in hts_expr.c
typedef struct hts_filter_t hts_filter_t;

/// For static initialisation of hts_expr_val_t values
#define HTS_EXPR_VAL_INIT {0, 0, KS_INITIALIZE, 0}

/// Creates a filter for expression "str".
/** @param str    The filter expression
 *  @return       A pointer on success, NULL on failure
 */
HTSLIB_EXPORT
hts_filter_t *hts_filter_init(const char *str);

/// Frees an hts_filter_t created via hts_filter_init
/** @param filt    The filter pointer.
 */
HTSLIB_EXPORT
void hts_filter_free(hts_filter_t *filt);

/// Type for expression symbol lookups; name -> value.
typedef int (hts_expr_sym_func)(void *data, char *str, char **end,
                                hts_expr_val_t *res);

/// Evaluates a filter expression and returns the value
/** @param filt      The filter, produced by hts_filter_init
 *  @param data      Arbitrary caller data, passed into sym_func
 *  @param sym_func  Callback function to lookup variables.
 *  @param res       Filled out with the result of the filter evaluation
 *  @return          Returns 0 on success, -1 on failure
 *
 * sym_func and data may be NULL if the caller does not need its own data
 * pointer or if it has no variables to lookup.
 *
 * The type of the returned result may be numeric or string, as defined by
 * the is_str member.  It can also be explicitly defined to be true even
 * for a null value.  This may be used to check for the existence of
 * something, irrespective of whether that something evaluates to zero.
 *
 * @p res must be initialized using HTS_EXPR_VAL_INIT before passing it
 * to this function for the first time.
128 | */ 129 | HTSLIB_EXPORT 130 | int hts_filter_eval2(hts_filter_t *filt, 131 | void *data, hts_expr_sym_func *sym_func, 132 | hts_expr_val_t *res); 133 | 134 | /// Evaluate a filter expression (derecated API) 135 | /** 136 | * @copydetails hts_filter_eval2() 137 | * 138 | * If calling this function more than once with the same @p res 139 | * parameter, hts_expr_val_free(res) must be used between invocations 140 | * to clear any allocated memory prior to reuse. 141 | * 142 | * @deprecated This function has been replaced by hts_filter_eval2(), 143 | * which clears @p res properly itself. 144 | */ 145 | HTSLIB_EXPORT 146 | int hts_filter_eval(hts_filter_t *filt, 147 | void *data, hts_expr_sym_func *sym_func, 148 | hts_expr_val_t *res) 149 | HTS_DEPRECATED("Please use hts_filter_eval2 instead"); 150 | 151 | 152 | #endif /* HTS_EXPR_H */ 153 | -------------------------------------------------------------------------------- /examples/simulated_phenotype1.tsv: -------------------------------------------------------------------------------- 1 | SRR28578485 0.015350 2 | SRR28578484 -0.152834 3 | SRR28578303 0.375435 4 | SRR28578326 1.756401 5 | SRR28578283 -0.186121 6 | SRR28578272 -0.558678 7 | SRR28578261 -0.182508 8 | SRR28578218 0.746364 9 | SRR28578207 2.460086 10 | SRR28578196 -0.898288 11 | SRR28578483 0.104167 12 | SRR28578472 0.238934 13 | SRR28578461 2.620617 14 | SRR28578450 -0.266697 15 | SRR28578439 0.909209 16 | SRR28578428 1.094817 17 | SRR28578379 0.934180 18 | SRR28578368 1.531665 19 | SRR28578357 0.916640 20 | SRR28578314 0.162970 21 | SRR28578302 0.111847 22 | SRR28578291 -0.180896 23 | SRR28578248 -0.133922 24 | SRR28578237 2.122005 25 | SRR28578226 0.372227 26 | SRR28578413 1.121698 27 | SRR28578402 0.890496 28 | SRR28578391 3.474425 29 | SRR28578342 0.642844 30 | SRR28578331 0.407824 31 | SRR28578325 0.373156 32 | SRR28578324 0.927806 33 | SRR28578323 0.729470 34 | SRR28578322 0.338377 35 | SRR28578321 0.197840 36 | SRR28578320 1.624201 37 | 
SRR28578287 1.752924 38 | SRR28578286 -0.107463 39 | SRR28578285 0.153756 40 | SRR28578284 1.157363 41 | SRR28578282 0.195271 42 | SRR28578281 -0.800984 43 | SRR28578280 2.397096 44 | SRR28578279 0.026908 45 | SRR28578278 0.737097 46 | SRR28578277 0.057859 47 | SRR28578276 0.129869 48 | SRR28578275 0.428957 49 | SRR28578274 1.413348 50 | SRR28578273 0.611546 51 | SRR28578271 0.627865 52 | SRR28578270 -0.322823 53 | SRR28578269 0.758957 54 | SRR28578268 0.357164 55 | SRR28578267 -0.733142 56 | SRR28578266 1.235542 57 | SRR28578265 -0.056784 58 | SRR28578264 0.106163 59 | SRR28578263 1.051545 60 | SRR28578262 0.917527 61 | SRR28578260 1.130876 62 | SRR28578259 0.714086 63 | SRR28578258 -0.236837 64 | SRR28578257 0.540427 65 | SRR28578256 0.869497 66 | SRR28578223 1.313608 67 | SRR28578222 1.517876 68 | SRR28578221 0.283099 69 | SRR28578220 1.079678 70 | SRR28578219 0.825395 71 | SRR28578217 -0.243878 72 | SRR28578216 0.857042 73 | SRR28578215 1.310794 74 | SRR28578214 0.903592 75 | SRR28578213 -0.052143 76 | SRR28578212 0.940835 77 | SRR28578211 1.669975 78 | SRR28578210 1.514653 79 | SRR28578209 1.089527 80 | SRR28578208 0.918759 81 | SRR28578206 -0.036818 82 | SRR28578205 -0.581671 83 | SRR28578204 2.168435 84 | SRR28578203 0.040138 85 | SRR28578202 -0.115942 86 | SRR28578201 0.399634 87 | SRR28578200 0.193508 88 | SRR28578199 -0.526991 89 | SRR28578198 -0.022578 90 | SRR28578197 0.903969 91 | SRR28578195 1.643454 92 | SRR28578194 -0.389796 93 | SRR28578193 -0.526194 94 | SRR28578192 1.129792 95 | SRR28578389 0.380636 96 | SRR28578388 1.179258 97 | SRR28578387 0.088578 98 | SRR28578386 -0.034430 99 | SRR28578385 -0.354654 100 | SRR28578384 -0.055777 101 | SRR28578482 0.919460 102 | SRR28578481 1.589098 103 | SRR28578480 0.556462 104 | SRR28578479 0.051680 105 | SRR28578478 0.033078 106 | SRR28578477 1.623619 107 | SRR28578476 0.075092 108 | SRR28578475 0.447291 109 | SRR28578474 0.652891 110 | SRR28578473 0.401198 111 | SRR28578471 -0.091506 112 | SRR28578470 
0.718587 113 | SRR28578469 0.633212 114 | SRR28578468 1.740935 115 | SRR28578467 0.373563 116 | SRR28578466 0.260882 117 | SRR28578465 0.237778 118 | SRR28578464 -0.097856 119 | SRR28578463 0.484402 120 | SRR28578462 0.333819 121 | SRR28578460 0.975734 122 | SRR28578459 1.406464 123 | SRR28578458 1.190055 124 | SRR28578457 0.402485 125 | SRR28578456 1.667156 126 | SRR28578455 0.231174 127 | SRR28578454 0.863157 128 | SRR28578453 0.697110 129 | SRR28578452 0.885809 130 | SRR28578451 -0.603129 131 | SRR28578449 -0.668723 132 | SRR28578448 0.400409 133 | SRR28578447 0.409803 134 | SRR28578446 0.232150 135 | SRR28578445 0.754964 136 | SRR28578444 0.336216 137 | SRR28578443 0.963353 138 | SRR28578442 1.694846 139 | SRR28578441 0.043868 140 | SRR28578440 1.137920 141 | SRR28578438 0.639858 142 | SRR28578437 0.488518 143 | SRR28578436 0.717449 144 | SRR28578435 0.812639 145 | SRR28578434 -0.273782 146 | SRR28578433 0.506274 147 | SRR28578432 0.979713 148 | SRR28578431 0.149510 149 | SRR28578430 0.727051 150 | SRR28578429 1.808918 151 | SRR28578427 1.758718 152 | SRR28578426 1.924294 153 | SRR28578425 0.370651 154 | SRR28578424 -0.450223 155 | SRR28578423 1.326266 156 | SRR28578422 -0.028246 157 | SRR28578383 0.231055 158 | SRR28578382 0.318712 159 | SRR28578381 1.451005 160 | SRR28578380 0.548501 161 | SRR28578378 2.014849 162 | SRR28578377 0.317900 163 | SRR28578376 0.359471 164 | SRR28578375 -0.563661 165 | SRR28578374 1.024085 166 | SRR28578373 1.906276 167 | SRR28578372 0.036577 168 | SRR28578371 0.644046 169 | SRR28578370 0.481375 170 | SRR28578369 0.475594 171 | SRR28578367 0.010078 172 | SRR28578366 0.468675 173 | SRR28578365 0.575471 174 | SRR28578364 0.138366 175 | SRR28578363 -0.847662 176 | SRR28578362 -0.193327 177 | SRR28578361 -0.326889 178 | SRR28578360 2.527612 179 | SRR28578359 0.738793 180 | SRR28578358 -0.080099 181 | SRR28578356 0.974457 182 | SRR28578355 1.781260 183 | SRR28578354 0.773621 184 | SRR28578353 -0.249875 185 | SRR28578352 0.376294 186 | 
SRR28578319 1.117174 187 | SRR28578318 1.094563 188 | SRR28578317 0.915879 189 | SRR28578316 0.841098 190 | SRR28578315 0.686435 191 | SRR28578313 0.799282 192 | SRR28578312 1.395672 193 | SRR28578311 0.027989 194 | SRR28578310 3.256524 195 | SRR28578309 1.503412 196 | SRR28578308 1.224483 197 | SRR28578307 0.384261 198 | SRR28578306 1.389031 199 | SRR28578305 1.020493 200 | SRR28578304 0.881160 201 | SRR28578301 2.429629 202 | SRR28578300 0.354412 203 | SRR28578299 0.142545 204 | SRR28578298 0.652719 205 | SRR28578297 0.522463 206 | SRR28578296 1.154955 207 | SRR28578295 0.906529 208 | SRR28578294 0.328401 209 | SRR28578293 0.576214 210 | SRR28578292 0.187118 211 | SRR28578290 -0.851691 212 | SRR28578289 0.904824 213 | SRR28578288 0.146348 214 | SRR28578255 0.121312 215 | SRR28578254 0.796293 216 | SRR28578253 0.382000 217 | SRR28578252 1.401101 218 | SRR28578251 0.742941 219 | SRR28578250 0.099013 220 | SRR28578249 0.355679 221 | SRR28578247 2.028911 222 | SRR28578246 -0.811441 223 | SRR28578245 0.640982 224 | SRR28578244 1.648073 225 | SRR28578243 0.281964 226 | SRR28578242 -0.211762 227 | SRR28578241 0.980499 228 | SRR28578240 1.780250 229 | SRR28578239 0.370553 230 | SRR28578238 0.528591 231 | SRR28578236 -0.343525 232 | SRR28578235 0.148609 233 | SRR28578234 -0.313076 234 | SRR28578233 1.134845 235 | SRR28578232 0.285616 236 | SRR28578231 0.358191 237 | SRR28578230 0.567615 238 | SRR28578229 1.046904 239 | SRR28578228 1.596277 240 | SRR28578227 0.719800 241 | SRR28578225 1.534264 242 | SRR28578224 0.392886 243 | SRR28578421 0.522781 244 | SRR28578420 -1.041363 245 | SRR28578419 0.418558 246 | SRR28578418 -0.302946 247 | SRR28578417 -0.316913 248 | SRR28578416 1.590539 249 | SRR28578415 0.579787 250 | SRR28578414 0.792669 251 | SRR28578412 -1.019902 252 | SRR28578411 1.064175 253 | SRR28578410 0.209092 254 | SRR28578409 -0.059478 255 | SRR28578408 0.971317 256 | SRR28578407 0.399264 257 | SRR28578406 0.546429 258 | SRR28578405 0.226951 259 | SRR28578404 
0.686096 260 | SRR28578403 0.385972 261 | SRR28578401 0.231461 262 | SRR28578400 0.938005 263 | SRR28578399 1.102970 264 | SRR28578398 -0.298917 265 | SRR28578397 0.828115 266 | SRR28578396 0.517408 267 | SRR28578395 1.038483 268 | SRR28578394 2.860168 269 | SRR28578393 0.103245 270 | SRR28578392 0.405810 271 | SRR28578390 0.901980 272 | SRR28578351 1.150240 273 | SRR28578350 0.961269 274 | SRR28578349 0.481837 275 | SRR28578348 1.912020 276 | SRR28578347 0.101633 277 | SRR28578346 1.392756 278 | SRR28578345 0.397884 279 | SRR28578344 0.853388 280 | SRR28578343 1.887585 281 | SRR28578341 0.455888 282 | SRR28578340 0.401392 283 | SRR28578339 -0.704465 284 | SRR28578338 1.645842 285 | SRR28578337 3.138635 286 | SRR28578336 0.130843 287 | SRR28578335 -0.200439 288 | SRR28578334 0.222784 289 | SRR28578333 1.084317 290 | SRR28578332 1.154798 291 | SRR28578330 1.916702 292 | SRR28578329 2.502245 293 | SRR28578328 1.712107 294 | SRR28578327 1.376835 295 | -------------------------------------------------------------------------------- /examples/sample_depth.tsv: -------------------------------------------------------------------------------- 1 | SRR28578485 143.2531311 2 | SRR28578484 151.7451801 3 | SRR28578303 130.8840004 4 | SRR28578326 145.6732238 5 | SRR28578283 129.6164269 6 | SRR28578272 133.0766887 7 | SRR28578261 130.8015448 8 | SRR28578218 150.7535854 9 | SRR28578207 142.0480967 10 | SRR28578196 145.9352826 11 | SRR28578483 170.0392565 12 | SRR28578472 145.0320155 13 | SRR28578461 145.2868437 14 | SRR28578450 129.8445563 15 | SRR28578439 169.7566636 16 | SRR28578428 143.6554543 17 | SRR28578379 134.6438728 18 | SRR28578368 164.0949687 19 | SRR28578357 150.2528057 20 | SRR28578314 143.5934031 21 | SRR28578302 145.6878861 22 | SRR28578291 139.2583647 23 | SRR28578248 143.1810411 24 | SRR28578237 152.7471523 25 | SRR28578226 144.2557342 26 | SRR28578413 187.5248313 27 | SRR28578402 197.8440574 28 | SRR28578391 138.3807307 29 | SRR28578342 131.7956768 30 | 
SRR28578331 135.2141832 31 | SRR28578325 147.1701382 32 | SRR28578324 191.0845245 33 | SRR28578323 148.7398269 34 | SRR28578322 142.7089263 35 | SRR28578321 142.5868203 36 | SRR28578320 137.969766 37 | SRR28578287 150.0220773 38 | SRR28578286 128.8518503 39 | SRR28578285 155.911366 40 | SRR28578284 143.5118468 41 | SRR28578282 127.4889947 42 | SRR28578281 139.2510428 43 | SRR28578280 138.1051722 44 | SRR28578279 151.0340088 45 | SRR28578278 140.9271126 46 | SRR28578277 141.785147 47 | SRR28578276 138.927264 48 | SRR28578275 152.1813333 49 | SRR28578274 145.7968208 50 | SRR28578273 128.5262358 51 | SRR28578271 146.2282614 52 | SRR28578270 149.0494574 53 | SRR28578269 157.4817642 54 | SRR28578268 142.4607845 55 | SRR28578267 141.8328976 56 | SRR28578266 131.2414512 57 | SRR28578265 140.4025497 58 | SRR28578264 155.0227024 59 | SRR28578263 136.3023413 60 | SRR28578262 167.9989417 61 | SRR28578260 144.7588587 62 | SRR28578259 159.3491258 63 | SRR28578258 148.457272 64 | SRR28578257 169.2237788 65 | SRR28578256 180.0942892 66 | SRR28578223 170.7408817 67 | SRR28578222 147.8890689 68 | SRR28578221 158.6294185 69 | SRR28578220 196.9508623 70 | SRR28578219 178.3365735 71 | SRR28578217 198.0742402 72 | SRR28578216 146.0505051 73 | SRR28578215 179.0663316 74 | SRR28578214 171.8797744 75 | SRR28578213 149.6450265 76 | SRR28578212 173.687766 77 | SRR28578211 162.4697338 78 | SRR28578210 193.8358375 79 | SRR28578209 181.857272 80 | SRR28578208 136.0210918 81 | SRR28578206 145.0601276 82 | SRR28578205 185.6434976 83 | SRR28578204 173.4668799 84 | SRR28578203 184.3963519 85 | SRR28578202 169.1448318 86 | SRR28578201 171.7011722 87 | SRR28578200 184.1282909 88 | SRR28578199 182.797815 89 | SRR28578198 176.8433797 90 | SRR28578197 172.6886172 91 | SRR28578195 157.6159408 92 | SRR28578194 171.155453 93 | SRR28578193 147.21286 94 | SRR28578192 194.9149082 95 | SRR28578389 191.9837135 96 | SRR28578388 210.1314534 97 | SRR28578387 133.2083404 98 | SRR28578386 165.7496049 99 | 
SRR28578385 174.7243894 100 | SRR28578384 166.9999501 101 | SRR28578482 165.9079091 102 | SRR28578481 178.3916737 103 | SRR28578480 165.6316971 104 | SRR28578479 161.9896128 105 | SRR28578478 155.3865704 106 | SRR28578477 193.9456313 107 | SRR28578476 178.8110923 108 | SRR28578475 166.3202274 109 | SRR28578474 164.8694932 110 | SRR28578473 141.0720534 111 | SRR28578471 165.761528 112 | SRR28578470 160.3248269 113 | SRR28578469 174.2795152 114 | SRR28578468 153.9012411 115 | SRR28578467 174.4042393 116 | SRR28578466 156.7300018 117 | SRR28578465 161.8366442 118 | SRR28578464 161.1937395 119 | SRR28578463 165.9830481 120 | SRR28578462 170.2429267 121 | SRR28578460 152.7796773 122 | SRR28578459 165.3567563 123 | SRR28578458 151.4256901 124 | SRR28578457 175.4446031 125 | SRR28578456 146.7291316 126 | SRR28578455 145.603374 127 | SRR28578454 156.0551192 128 | SRR28578453 148.202815 129 | SRR28578452 146.004657 130 | SRR28578451 186.5897068 131 | SRR28578449 160.9098653 132 | SRR28578448 151.4828653 133 | SRR28578447 148.0202905 134 | SRR28578446 134.2446715 135 | SRR28578445 160.2249188 136 | SRR28578444 167.1644865 137 | SRR28578443 150.6382331 138 | SRR28578442 159.2051143 139 | SRR28578441 152.3748751 140 | SRR28578440 138.2622486 141 | SRR28578438 162.1715466 142 | SRR28578437 146.0231678 143 | SRR28578436 129.457706 144 | SRR28578435 150.2099086 145 | SRR28578434 169.5042742 146 | SRR28578433 162.5236119 147 | SRR28578432 142.0784075 148 | SRR28578431 176.2003766 149 | SRR28578430 158.6165024 150 | SRR28578429 153.35126 151 | SRR28578427 137.2430062 152 | SRR28578426 136.066155 153 | SRR28578425 150.4098923 154 | SRR28578424 132.1660194 155 | SRR28578423 119.8675497 156 | SRR28578422 175.5891863 157 | SRR28578383 159.6194896 158 | SRR28578382 198.0260428 159 | SRR28578381 138.7655276 160 | SRR28578380 175.1074574 161 | SRR28578378 137.2876901 162 | SRR28578377 137.3332627 163 | SRR28578376 135.3282587 164 | SRR28578375 164.0677907 165 | SRR28578374 152.5337095 166 
| SRR28578373 151.5252618 167 | SRR28578372 161.2778309 168 | SRR28578371 152.7106181 169 | SRR28578370 151.0698808 170 | SRR28578369 160.6362313 171 | SRR28578367 151.9023475 172 | SRR28578366 129.3007205 173 | SRR28578365 162.4620636 174 | SRR28578364 157.3524344 175 | SRR28578363 163.0489064 176 | SRR28578362 142.1415245 177 | SRR28578361 127.1245236 178 | SRR28578360 157.0732203 179 | SRR28578359 155.6583567 180 | SRR28578358 136.5695784 181 | SRR28578356 129.7704481 182 | SRR28578355 159.1627201 183 | SRR28578354 99.18067373 184 | SRR28578353 149.194687 185 | SRR28578352 133.0468296 186 | SRR28578319 128.0859033 187 | SRR28578318 141.1068159 188 | SRR28578317 343.1520821 189 | SRR28578316 127.0914168 190 | SRR28578315 131.6930923 191 | SRR28578313 147.6000521 192 | SRR28578312 152.5254168 193 | SRR28578311 149.1965099 194 | SRR28578310 171.1481907 195 | SRR28578309 135.073774 196 | SRR28578308 241.7305996 197 | SRR28578307 217.7852693 198 | SRR28578306 213.0327347 199 | SRR28578305 170.3054742 200 | SRR28578304 190.3147616 201 | SRR28578301 162.6870455 202 | SRR28578300 225.5027854 203 | SRR28578299 187.8744623 204 | SRR28578298 202.6601647 205 | SRR28578297 154.2234932 206 | SRR28578296 153.5015488 207 | SRR28578295 174.9999934 208 | SRR28578294 201.0378962 209 | SRR28578293 145.8807673 210 | SRR28578292 141.7794433 211 | SRR28578290 146.109781 212 | SRR28578289 146.9162954 213 | SRR28578288 175.65617 214 | SRR28578255 139.4443316 215 | SRR28578254 136.7770375 216 | SRR28578253 370.5687483 217 | SRR28578252 176.8643876 218 | SRR28578251 133.7986605 219 | SRR28578250 171.5117272 220 | SRR28578249 166.6167912 221 | SRR28578247 131.6842208 222 | SRR28578246 178.9852675 223 | SRR28578245 167.595819 224 | SRR28578244 144.1452834 225 | SRR28578243 172.9373903 226 | SRR28578242 148.9094415 227 | SRR28578241 146.6088375 228 | SRR28578240 148.0394097 229 | SRR28578239 146.7996221 230 | SRR28578238 149.3603196 231 | SRR28578236 143.2360013 232 | SRR28578235 140.8784331 
233 | SRR28578234 138.5926556 234 | SRR28578233 149.1481951 235 | SRR28578232 146.8668857 236 | SRR28578231 148.8185501 237 | SRR28578230 153.4541801 238 | SRR28578229 143.8012252 239 | SRR28578228 150.0290728 240 | SRR28578227 161.785728 241 | SRR28578225 144.9497616 242 | SRR28578224 136.3997307 243 | SRR28578421 146.7556499 244 | SRR28578420 143.6487166 245 | SRR28578419 158.6755682 246 | SRR28578418 164.485064 247 | SRR28578417 154.6393682 248 | SRR28578416 162.6985386 249 | SRR28578415 159.4969642 250 | SRR28578414 150.6021563 251 | SRR28578412 145.4088614 252 | SRR28578411 134.9963801 253 | SRR28578410 156.1433395 254 | SRR28578409 155.460226 255 | SRR28578408 142.5423647 256 | SRR28578407 143.7154675 257 | SRR28578406 128.8869347 258 | SRR28578405 150.6782901 259 | SRR28578404 151.5545029 260 | SRR28578403 153.3908949 261 | SRR28578401 145.1757404 262 | SRR28578400 154.7573395 263 | SRR28578399 162.6256742 264 | SRR28578398 154.4668419 265 | SRR28578397 184.2081532 266 | SRR28578396 182.1584799 267 | SRR28578395 156.9410693 268 | SRR28578394 162.5399506 269 | SRR28578393 180.1123501 270 | SRR28578392 145.0870949 271 | SRR28578390 142.2964826 272 | SRR28578351 164.6562349 273 | SRR28578350 148.0059046 274 | SRR28578349 165.4628848 275 | SRR28578348 143.8568675 276 | SRR28578347 174.9672649 277 | SRR28578346 133.5743461 278 | SRR28578345 147.1783241 279 | SRR28578344 154.2741894 280 | SRR28578343 131.2188075 281 | SRR28578341 148.5623082 282 | SRR28578340 160.5201294 283 | SRR28578339 150.0300627 284 | SRR28578338 142.750268 285 | SRR28578337 166.2543219 286 | SRR28578336 162.7860442 287 | SRR28578335 152.7214305 288 | SRR28578334 149.8982976 289 | SRR28578333 142.7828163 290 | SRR28578332 158.3161391 291 | SRR28578330 192.9998283 292 | SRR28578329 142.268366 293 | SRR28578328 144.3896583 294 | SRR28578327 132.4213863 295 | -------------------------------------------------------------------------------- /scripts/pheno_simulation.pl: 
--------------------------------------------------------------------------------
#!/usr/bin/perl
#
# pheno_simulation.pl - simulate quantitative phenotypes from a marker dosage table.
#
# Input (-g): whitespace-delimited table with a header row
#     Marker Chrom Position REF ALT SAMPLE_1 SAMPLE_2 ...
# followed by one row per marker holding the integer dosage of every sample.
# Missing ('NA' / empty) and non-numeric dosages are treated as 0.
#
# Procedure:
#   1. pick -q markers at random as QTL and draw a standard-normal effect for each;
#   2. rescale the effects so that the sample variance of the genetic values
#      (sum over QTL of effect * dosage / ploidy) equals the target heritability (-h);
#   3. for each of the -n replicates add N(0, 1 - h2) environmental noise.
#
# Output: PREFIX_QTL_effects.txt and PREFIX_phenotypes.txt (PREFIX given by -o),
# plus a summary and the realized heritability on STDOUT.
use strict;
use warnings;
use Getopt::Long;

my $genotype_file  = "";
my $heritability   = 0.5;
my $num_qtl        = 100;
my $ploidy         = 6;
my $num_phenotypes = 100;
my $output_prefix  = "simulation";

my $usage = "Usage: $0 -g GENOTYPE_FILE [-h HERITABILITY] [-q NUM_QTL] [-p PLOIDY] [-n NUM_PHENOTYPES] [-o OUTPUT_PREFIX]\n";

GetOptions(
    "genotype|g=s"     => \$genotype_file,
    "heritability|h=f" => \$heritability,
    "qtl|q=i"          => \$num_qtl,
    "ploidy|p=i"       => \$ploidy,
    "phenotypes|n=i"   => \$num_phenotypes,
    "output|o=s"       => \$output_prefix
) or die "Error!\n$usage";

die "Please define the genotype file (-g)\n" unless $genotype_file;
die "genotype file not found!: $genotype_file\n" unless -e $genotype_file;

# Reject values that would later cause sqrt() of a negative number,
# a division by zero, or an empty QTL / replicate set.
die "Heritability (-h) must be between 0 and 1\n"
    if $heritability < 0 || $heritability > 1;
die "Number of QTL (-q) must be at least 1\n"        if $num_qtl < 1;
die "Ploidy (-p) must be at least 1\n"               if $ploidy < 1;
die "Number of phenotypes (-n) must be at least 1\n" if $num_phenotypes < 1;

print "Loading genotype file...\n";

my @individuals = ();
my @snp_ids     = ();
my %genotypes   = ();
my %marker_info = ();

open(my $fh, '<', $genotype_file) or die "file cannot be opened $genotype_file: $!\n";

my $header = <$fh>;
die "Error: genotype file is empty: $genotype_file\n" unless defined $header;
chomp $header;
# split ' ' also skips leading whitespace, so an indented row keeps its column count
my @header_parts = split(' ', $header);

# Check the first 5 cols: Marker, Chrom, Position, REF, ALT
if (scalar(@header_parts) < 6) {
    die "Error: File format incorrect,(Marker, Chrom, Position, REF, ALT, Sample ...)\n";
}

my @expected_headers = ("Marker", "Chrom", "Position", "REF", "ALT");
for my $i (0..4) {
    if ($header_parts[$i] ne $expected_headers[$i]) {
        print "Warnning: " . ($i+1) . "Col title '$header_parts[$i]',Expected '$expected_headers[$i]'\n";
    }
}

@individuals = @header_parts[5..$#header_parts];
print "Checked " . scalar(@individuals) . " 个个体\n";
print "Individual: " . join(", ", @individuals[0..min(4, $#individuals)]);
print "..." if scalar(@individuals) > 5;
print "\n";

# Read one marker per row. The loop header and the counter declaration were
# missing, which left the body outside any loop and broke compilation.
my $snp_count = 0;
while (my $line = <$fh>) {
    chomp $line;
    next if $line =~ /^\s*$/;

    my @parts = split(' ', $line);

    if (scalar(@parts) != scalar(@header_parts)) {
        # $. is the physical line number, so the message stays right after skipped rows
        print "Warning: The" . $. . "rows and cols not matched,next line\n";
        next;
    }

    my $marker_id  = $parts[0];
    my $chrom      = $parts[1];
    my $position   = $parts[2];
    my $ref_allele = $parts[3];
    my $alt_allele = $parts[4];

    $marker_info{$marker_id} = {
        chrom    => $chrom,
        position => $position,
        ref      => $ref_allele,
        alt      => $alt_allele
    };

    push @snp_ids, $marker_id;

    for my $i (0..$#individuals) {
        my $genotype = $parts[$i + 5];

        if (!defined $genotype || $genotype eq 'NA' || $genotype eq '') {
            $genotype = 0;
        } elsif ($genotype !~ /^\d+$/) {
            print "Warning: marker $marker_id individual $individuals[$i] genotype value '$genotype' not number,set 0\n";
            $genotype = 0;
        }

        $genotypes{$marker_id}{$individuals[$i]} = $genotype;
    }
    $snp_count++;
}
close($fh);

print "Loading $snp_count SNPs\n";

# Tabulate the observed dosages and raise the ploidy if the data exceed it.
my $max_dosage = 0;
my $dosage_distribution = {};

for my $snp_id (@snp_ids) {
    for my $ind (@individuals) {
        my $dosage = $genotypes{$snp_id}{$ind};
        $max_dosage = $dosage if $dosage > $max_dosage;
        $dosage_distribution->{$dosage}++;
    }
}

print "gene dosage distribution: ";
for my $dosage (sort {$a <=> $b} keys %$dosage_distribution) {
    print "$dosage(" . $dosage_distribution->{$dosage} . ") ";
}
print "\n";

if ($max_dosage > $ploidy) {
    print "Warning: 检测到的最大基因剂量 ($max_dosage) 超过设定的倍性 ($ploidy)\n";
    print "将倍性调整为 $max_dosage\n";
    $ploidy = $max_dosage;
}

print "使用倍性: $ploidy\n";


print "随机选择 $num_qtl 个QTL...\n";
die "QTL数量不能超过SNP总数\n" if $num_qtl > $snp_count;

# Fisher-Yates select QTLs
my @snp_indices = (0..$#snp_ids);
for my $i (reverse 0..$#snp_indices) {
    my $j = int(rand($i + 1));
    ($snp_indices[$i], $snp_indices[$j]) = ($snp_indices[$j], $snp_indices[$i]);
}

my @selected_qtl_indices = @snp_indices[0..($num_qtl-1)];
my @qtl_snps = @snp_ids[@selected_qtl_indices];

print "已选择QTL位点\n";
print "示例QTL: " . join(", ", @qtl_snps[0..min(4, $#qtl_snps)]) . "\n";

print "为QTL分配效应值...\n";
my %qtl_effects = ();
$qtl_effects{$_} = rand_normal() for @qtl_snps;

# Scale the effects so that Var(genetic value) equals the target heritability.
# The variance is centred (sample variance): genetic values built from
# non-negative dosages do not have zero mean, so the raw mean square would
# overstate the variance and push the realized h2 below the target.
my $raw_values  = genetic_values(\%qtl_effects, \@qtl_snps, \@individuals, \%genotypes, $ploidy);
my $genetic_var = sample_variance(@{$raw_values}{@individuals});
my $scaling_factor = 1;
if ($genetic_var > 0) {
    $scaling_factor = sqrt($heritability / $genetic_var);
} else {
    print "Warning: selected QTL show no genetic variance; effects left unscaled\n";
}

for my $qtl (@qtl_snps) {
    $qtl_effects{$qtl} *= $scaling_factor;
}

print "QTL效应值已标准化\n";

print "生成 $num_phenotypes 组表型数据...\n";
my @all_phenotypes = ();

# Genetic values do not change between replicates: compute them once.
my $genetic_value_of = genetic_values(\%qtl_effects, \@qtl_snps, \@individuals, \%genotypes, $ploidy);
my $error_sd = sqrt(1 - $heritability);

for my $rep (1..$num_phenotypes) {
    my %phenotypes = ();

    for my $ind (@individuals) {
        $phenotypes{$ind} = $genetic_value_of->{$ind} + rand_normal() * $error_sd;
    }

    push @all_phenotypes, \%phenotypes;
}


my $qtl_output = "${output_prefix}_QTL_effects.txt";
print "Output QTL effects: $qtl_output\n";

open(my $qtl_fh, '>', $qtl_output) or die "cannot create file $qtl_output: $!\n";
print $qtl_fh "QTL_ID\tChrom\tPosition\tREF\tALT\tEffect\n";
for my $qtl (@qtl_snps) {
    printf $qtl_fh "%s\t%s\t%s\t%s\t%s\t%.6f\n",
        $qtl,
        $marker_info{$qtl}{chrom},
        $marker_info{$qtl}{position},
        $marker_info{$qtl}{ref},
        $marker_info{$qtl}{alt},
        $qtl_effects{$qtl};
}
# close() flushes the buffer; a failure here means the file is incomplete
close($qtl_fh) or die "cannot write file $qtl_output: $!\n";

my $phenotype_output = "${output_prefix}_phenotypes.txt";
print "Ouput phenotype: $phenotype_output\n";

open(my $phen_fh, '>', $phenotype_output) or die "cannot create file $phenotype_output: $!\n";

print $phen_fh "Individual";
for my $rep (1..$num_phenotypes) {
    print $phen_fh "\tPhenotype_$rep";
}
print $phen_fh "\n";
for my $ind (@individuals) {
    print $phen_fh $ind;
    for my $rep (0..($num_phenotypes-1)) {
        printf $phen_fh "\t%.6f", $all_phenotypes[$rep]{$ind};
    }
    print $phen_fh "\n";
}
close($phen_fh) or die "cannot write file $phenotype_output: $!\n";

print "Individual number: " . scalar(@individuals) . "\n";
print "SNP number: $snp_count\n";
print "QTL number: $num_qtl\n";
print "h2: $heritability\n";
print "ploidy: $ploidy\n";
print "Pheno number: $num_phenotypes\n";
print "QTL effect file: $qtl_output\n";
print "Pheno output: $phenotype_output\n";

# Realized heritability: genetic variance over the phenotypic variance
# averaged across replicates.
my $realized_genetic_var = sample_variance(@{$genetic_value_of}{@individuals});
my $total_phenotypic_var = 0;
for my $phenotypes (@all_phenotypes) {
    $total_phenotypic_var += sample_variance(@{$phenotypes}{@individuals});
}

my $avg_phenotypic_var = $total_phenotypic_var / $num_phenotypes;
my $realized_heritability = $avg_phenotypic_var > 0 ? $realized_genetic_var / $avg_phenotypic_var : 0;

printf "h2: %.4f (target: %.4f)\n", $realized_heritability, $heritability;

# Smaller of two numbers.
sub min {
    my ($x, $y) = @_;
    return $x < $y ? $x : $y;
}

# One standard-normal deviate via the Box-Muller transform.
sub rand_normal {
    my $u1 = 1 - rand();    # rand() is in [0, 1); use (0, 1] so log() never receives 0
    my $u2 = rand();
    return sqrt(-2 * log($u1)) * cos(8 * atan2(1, 1) * $u2);    # 8 * atan2(1, 1) == 2 * pi
}

# Unbiased sample variance of a list of numbers; 0 when fewer than two values.
sub sample_variance {
    my @values = @_;
    my $n = scalar(@values);
    return 0 if $n < 2;

    my $mean = 0;
    $mean += $_ for @values;
    $mean /= $n;

    my $sum_sq = 0;
    $sum_sq += ($_ - $mean) ** 2 for @values;
    return $sum_sq / ($n - 1);
}

# Genetic value of every individual: sum over QTL of effect * dosage / ploidy.
# Returns a hash reference keyed by individual name.
sub genetic_values {
    my ($effects, $qtls, $inds, $geno, $ploidy_level) = @_;
    my %value_of;
    for my $ind (@$inds) {
        my $value = 0;
        for my $qtl (@$qtls) {
            $value += $effects->{$qtl} * $geno->{$qtl}{$ind} / $ploidy_level;
        }
        $value_of{$ind} = $value;
    }
    return \%value_of;
}
-------------------------------------------------------------------------------- /include/regidx.h: -------------------------------------------------------------------------------- 1 | /// @file htslib/regidx.h 2 | /// Region indexing. 3 | /* 4 | Copyright (C) 2014-2019 Genome Research Ltd. 5 | 6 | Author: Petr Danecek 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy 9 | of this software and associated documentation files (the "Software"), to deal 10 | in the Software without restriction, including without limitation the rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | copies of the Software, and to permit persons to whom the Software is 13 | furnished to do so, subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be included in 16 | all copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 24 | THE SOFTWARE. 25 | */ 26 | 27 | /* 28 | Region indexing with an optional payload. 29 | 30 | Example of usage: 31 | 32 | // Init the parser and print regions. In this example the payload is a 33 | // pointer to a string. 
For the description of parse_custom and 34 | // free_custom functions, see regidx_parse_f and regidx_free_f below, 35 | // and for working example see test/test-regidx.c. 36 | regidx_t *idx = regidx_init(in_fname,parse_custom,free_custom,sizeof(char*),NULL); 37 | 38 | // Query overlap with chr:beg-end (beg,end are 1-based coordinates) 39 | regitr_t *itr = regitr_init(idx); 40 | if ( regidx_overlap(idx, chr,beg-1,end-1, itr) ) printf("There is an overlap!\n"); 41 | 42 | while ( regitr_overlap(itr) ) 43 | { 44 | printf("[%"PRIhts_pos",%"PRIhts_pos"] overlaps with [%"PRIhts_pos",%"PRIhts_pos"], payload=%s\n", 45 | beg, end, itr->beg+1, itr->end+1, regitr_payload(itr,char*)); 46 | } 47 | 48 | regidx_destroy(idx); 49 | regitr_destroy(itr); 50 | 51 | 52 | Another example, loop over all regions: 53 | 54 | regidx_t *idx = regidx_init(in_fname,NULL,NULL,0,NULL); 55 | regitr_t *itr = regitr_init(idx); 56 | 57 | while ( regitr_loop(itr) ) 58 | printf("chr=%s beg=%d end=%d\n", itr->seq, itr->beg+1, itr->end+1); 59 | 60 | regidx_destroy(idx); 61 | regitr_destroy(itr); 62 | */ 63 | 64 | #ifndef HTSLIB_REGIDX_H 65 | #define HTSLIB_REGIDX_H 66 | 67 | #include "hts.h" 68 | 69 | #ifdef __cplusplus 70 | extern "C" { 71 | #endif 72 | 73 | // maximum regidx position (0-based). Used to represent the end point of 74 | // regions which do not explicitly set one. regidx_push() also limits 75 | // positions passed to it to be no bigger than this. 76 | 77 | // Limit is set to ensure some internal values used by regidx keep within 32 78 | // bits and to stop the index from getting too big. 
79 | 80 | #define REGIDX_MAX (1ULL << 35) 81 | 82 | typedef struct regidx_t regidx_t; 83 | typedef struct regitr_t 84 | { 85 | hts_pos_t beg,end; 86 | void *payload; 87 | char *seq; 88 | void *itr; 89 | } 90 | regitr_t; 91 | 92 | #define regitr_payload(itr,type_t) (*((type_t*)(itr)->payload)) 93 | 94 | // Old API for backwards compatibility 95 | #define REGITR_START(itr) (itr).beg 96 | #define REGITR_END(itr) (itr).end 97 | #define REGITR_PAYLOAD(itr,type_t) ((type_t*)(itr).payload) 98 | #define REGITR_OVERLAP(itr,from,to) regidx_overlap((itr)); /* NOTE(review): ignores `from`/`to` and expands to a one-argument call with a trailing ';', while regidx_overlap() is declared with five parameters - looks unusable as written; confirm before relying on it */ 99 | 100 | /* 101 | * regidx_parse_f - Function to parse one input line, such as regidx_parse_bed 102 | * or regidx_parse_tab below. The function is expected to set `chr_beg` and 103 | * `chr_end` to point to first and last character of chromosome name and set 104 | * coordinates `beg` and `end` (0-based, inclusive). If regidx_init() was 105 | * called with non-zero payload_size, the `payload` points to a memory 106 | * location of the payload_size and `usr` is the data passed to regidx_init(). 107 | * Any memory allocated by the function will be freed by regidx_free_f called 108 | * by regidx_destroy(). 109 | * 110 | * Return value: 0 on success, -1 to skip a record, -2 on fatal error. 
111 | */ 112 | typedef int (*regidx_parse_f)(const char *line, char **chr_beg, char **chr_end, hts_pos_t *beg, hts_pos_t *end, void *payload, void *usr); 113 | typedef void (*regidx_free_f)(void *payload); 114 | 115 | /* 116 | * A note about the parsers: 117 | * - leading spaces are ignored 118 | * - lines starting with "#" are ignored 119 | */ 120 | HTSLIB_EXPORT 121 | int regidx_parse_bed(const char*,char**,char**,hts_pos_t*,hts_pos_t*,void*,void*); // CHROM or whitespace-separated CHROM,FROM,TO (0-based,right-open) 122 | HTSLIB_EXPORT 123 | int regidx_parse_tab(const char*,char**,char**,hts_pos_t*,hts_pos_t*,void*,void*); // CHROM or whitespace-separated CHROM,POS (1-based, inclusive) 124 | HTSLIB_EXPORT 125 | int regidx_parse_reg(const char*,char**,char**,hts_pos_t*,hts_pos_t*,void*,void*); // CHROM, CHROM:POS, CHROM:FROM-TO, CHROM:FROM- (1-based, inclusive) 126 | HTSLIB_EXPORT 127 | int regidx_parse_vcf(const char*,char**,char**,hts_pos_t*,hts_pos_t*,void*,void*); 128 | 129 | /* 130 | * regidx_init() - creates new index 131 | * regidx_init_string() - creates new index, from a string rather than from a file 132 | * 133 | * @param fname: input file name or NULL if regions will be added one-by-one via regidx_insert() 134 | * @param parsef: regidx_parse_bed, regidx_parse_tab or see description of regidx_parse_f. If NULL, 135 | * the format will be autodetected, currently either regidx_parse_tab (the default) or 136 | * regidx_parse_bed (file must be named 'bed' or 'bed.gz') will be used. Note that 137 | * the exact autodetection algorithm will change. 138 | * @param freef: NULL or see description of regidx_parse_f 139 | * @param payload_size: 0 with regidx_parse_bed, regidx_parse_tab or see regidx_parse_f 140 | * @param usr: optional user data passed to regidx_parse_f 141 | * 142 | * Returns index on success or NULL on error. 
143 | * 144 | * The regidx_t index struct returned by a successful call should be freed 145 | * via regidx_destroy() when it is no longer needed. 146 | */ 147 | HTSLIB_EXPORT 148 | regidx_t *regidx_init(const char *fname, regidx_parse_f parsef, regidx_free_f freef, size_t payload_size, void *usr); 149 | HTSLIB_EXPORT 150 | regidx_t *regidx_init_string(const char *string, regidx_parse_f parsef, regidx_free_f freef, size_t payload_size, void *usr); 151 | 152 | /* 153 | * regidx_destroy() - free memory allocated by regidx_init 154 | */ 155 | HTSLIB_EXPORT 156 | void regidx_destroy(regidx_t *idx); 157 | 158 | /* 159 | * regidx_overlap() - check overlap of the location chr:from-to with regions 160 | * @param beg,end: 0-based start, end coordinate (inclusive) 161 | * @param itr: pointer to iterator, can be NULL if regidx_loop not needed 162 | * 163 | * Returns 0 if there is no overlap or 1 if overlap is found. The overlapping 164 | * regions can be iterated as shown in the example above. 165 | */ 166 | HTSLIB_EXPORT 167 | int regidx_overlap(regidx_t *idx, const char *chr, hts_pos_t beg, hts_pos_t end, regitr_t *itr); 168 | 169 | /* 170 | * regidx_insert() - add a new region. 171 | * regidx_insert_list() - add new regions from a list 172 | * regidx_push() - low level insertion of a new region 173 | * 174 | * Returns 0 on success or -1 on error. 
175 | */ 176 | HTSLIB_EXPORT 177 | int regidx_insert(regidx_t *idx, char *line); 178 | HTSLIB_EXPORT 179 | int regidx_insert_list(regidx_t *idx, char *line, char delim); 180 | HTSLIB_EXPORT 181 | int regidx_push(regidx_t *idx, char *chr_beg, char *chr_end, hts_pos_t beg, hts_pos_t end, void *payload); 182 | 183 | /* 184 | * regidx_seq_names() - return list of all sequence names 185 | */ 186 | HTSLIB_EXPORT 187 | char **regidx_seq_names(regidx_t *idx, int *n); 188 | 189 | /* 190 | * regidx_seq_nregs() - number of regions 191 | * regidx_nregs() - total number of regions 192 | */ 193 | HTSLIB_EXPORT 194 | int regidx_seq_nregs(regidx_t *idx, const char *seq); 195 | 196 | HTSLIB_EXPORT 197 | int regidx_nregs(regidx_t *idx); 198 | 199 | /* 200 | * regitr_init() - initialize an iterator. The idx parameter is required only 201 | * with regitr_loop. If only regitr_overlap is called, NULL 202 | * can be given. 203 | * 204 | * The regitr_t struct returned by a successful regitr_init() 205 | * call should be freed via regitr_destroy() when it is no 206 | * longer needed. 207 | * 208 | * regitr_reset() - initialize an iterator for a repeated regitr_loop cycle. 209 | * Not required with regitr_overlap. 
210 | */ 211 | HTSLIB_EXPORT 212 | regitr_t *regitr_init(regidx_t *idx); 213 | HTSLIB_EXPORT 214 | void regitr_destroy(regitr_t *itr); 215 | HTSLIB_EXPORT 216 | void regitr_reset(regidx_t *idx, regitr_t *itr); 217 | 218 | /* 219 | * regitr_overlap() - next overlapping region 220 | * Returns 0 when done or 1 when itr is set to next region 221 | */ 222 | HTSLIB_EXPORT 223 | int regitr_overlap(regitr_t *itr); 224 | 225 | /* 226 | * regitr_loop() - loop over all regions 227 | * Returns 0 when done or 1 when itr is set to next region 228 | */ 229 | HTSLIB_EXPORT 230 | int regitr_loop(regitr_t *itr); 231 | 232 | /* 233 | * regitr_copy() - create a copy of an iterator for a repeated iteration with regitr_loop 234 | */ 235 | HTSLIB_EXPORT 236 | void regitr_copy(regitr_t *dst, regitr_t *src); 237 | 238 | #ifdef __cplusplus 239 | } 240 | #endif 241 | 242 | #endif 243 | -------------------------------------------------------------------------------- /include/kseq.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008, 2009, 2011 Attractive Chaos 4 | Copyright (C) 2013, 2018, 2020, 2023 Genome Research Ltd. 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining 7 | a copy of this software and associated documentation files (the 8 | "Software"), to deal in the Software without restriction, including 9 | without limitation the rights to use, copy, modify, merge, publish, 10 | distribute, sublicense, and/or sell copies of the Software, and to 11 | permit persons to whom the Software is furnished to do so, subject to 12 | the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be 15 | included in all copies or substantial portions of the Software. 
16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 21 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 22 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 23 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | SOFTWARE. 25 | */ 26 | 27 | #ifndef AC_KSEQ_H 28 | #define AC_KSEQ_H 29 | 30 | #include 31 | #include 32 | #include 33 | 34 | #include "kstring.h" 35 | 36 | #ifndef klib_unused 37 | #if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) 38 | #define klib_unused __attribute__ ((__unused__)) 39 | #else 40 | #define klib_unused 41 | #endif 42 | #endif /* klib_unused */ 43 | 44 | #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r 45 | #define KS_SEP_TAB 1 // isspace() && !' 
' 46 | #define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) 47 | #define KS_SEP_MAX 2 48 | 49 | #define __KS_TYPE(type_t) \ 50 | typedef struct __kstream_t { \ 51 | int begin, end; \ 52 | int is_eof:2, bufsize:30; \ 53 | uint64_t seek_pos; \ 54 | type_t f; \ 55 | unsigned char *buf; \ 56 | } kstream_t; 57 | /* ks_err: true once the reader callback has returned -1 (the value is kept in `end`). ks_eof: EOF flag set and every buffered byte consumed. ks_rewind: clears the EOF flag and both buffer cursors; seek_pos is left as it is. */ 58 | #define ks_err(ks) ((ks)->end == -1) 59 | #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) 60 | #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) 61 | 62 | #define __KS_BASIC(SCOPE, type_t, __bufsize) \ 63 | SCOPE kstream_t *ks_init(type_t f) \ 64 | { \ 65 | kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); /* NOTE(review): neither this calloc nor the malloc below is checked for NULL before use */ \ 66 | ks->f = f; ks->bufsize = __bufsize; \ 67 | ks->buf = (unsigned char*)malloc(__bufsize); \ 68 | return ks; \ 69 | } \ 70 | SCOPE void ks_destroy(kstream_t *ks) \ 71 | { \ 72 | if (!ks) return; \ 73 | free(ks->buf); \ 74 | free(ks); \ 75 | } 76 | /* ks_getc: returns the next byte as a non-negative int, -1 at end of input, -3 after a read error; refills the buffer through __read when it is exhausted. */ 77 | #define __KS_INLINED(__read) \ 78 | static inline klib_unused int ks_getc(kstream_t *ks) \ 79 | { \ 80 | if (ks_err(ks)) return -3; \ 81 | if (ks->is_eof && ks->begin >= ks->end) return -1; \ 82 | if (ks->begin >= ks->end) { \ 83 | ks->begin = 0; \ 84 | ks->end = __read(ks->f, ks->buf, ks->bufsize); \ 85 | if (ks->end == 0) { ks->is_eof = 1; return -1; } \ 86 | if (ks->end == -1) { ks->is_eof = 1; return -3; } \ 87 | } \ 88 | ks->seek_pos++; \ 89 | return (int)ks->buf[ks->begin++]; \ 90 | } \ 91 | static inline klib_unused int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ 92 | { return ks_getuntil2(ks, delimiter, str, dret, 0); } 93 | 94 | #define __KS_GETUNTIL(SCOPE, __read) \ 95 | SCOPE int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ 96 | { \ 97 | int gotany = 0; \ 98 | if (dret) *dret = 0; \ 99 | str->l = append? 
str->l : 0; \ 100 | uint64_t seek_pos = str->l; \ 101 | for (;;) { \ 102 | int i; \ 103 | if (ks_err(ks)) return -3; \ 104 | if (ks->begin >= ks->end) { \ 105 | if (!ks->is_eof) { \ 106 | ks->begin = 0; \ 107 | ks->end = __read(ks->f, ks->buf, ks->bufsize); \ 108 | if (ks->end == 0) { ks->is_eof = 1; break; } \ 109 | if (ks->end == -1) { ks->is_eof = 1; return -3; } \ 110 | } else break; \ 111 | } \ 112 | if (delimiter == KS_SEP_LINE) { \ 113 | unsigned char *sep = (unsigned char *)memchr(ks->buf + ks->begin, '\n', ks->end - ks->begin); \ 114 | i = sep != NULL ? sep - ks->buf : ks->end; \ 115 | } else if (delimiter > KS_SEP_MAX) { \ 116 | unsigned char *sep = (unsigned char *)memchr(ks->buf + ks->begin, delimiter, ks->end - ks->begin); \ 117 | i = sep != NULL ? sep - ks->buf : ks->end; \ 118 | } else if (delimiter == KS_SEP_SPACE) { \ 119 | for (i = ks->begin; i < ks->end; ++i) \ 120 | if (isspace(ks->buf[i])) break; \ 121 | } else if (delimiter == KS_SEP_TAB) { \ 122 | for (i = ks->begin; i < ks->end; ++i) \ 123 | if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ 124 | } else i = 0; /* never come to here! 
*/ \ 125 | (void) ks_expand(str, i - ks->begin + 1); \ 126 | seek_pos += i - ks->begin; if ( i < ks->end ) seek_pos++; \ 127 | gotany = 1; \ 128 | memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ 129 | str->l = str->l + (i - ks->begin); \ 130 | ks->begin = i + 1; \ 131 | if (i < ks->end) { \ 132 | if (dret) *dret = ks->buf[i]; \ 133 | break; \ 134 | } \ 135 | } \ 136 | if (!gotany && ks_eof(ks)) return -1; \ 137 | ks->seek_pos += seek_pos; \ 138 | if (str->s == 0) { \ 139 | str->m = 1; \ 140 | str->s = (char*)calloc(1, 1); \ 141 | } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ 142 | str->s[str->l] = '\0'; \ 143 | return str->l; \ 144 | } 145 | 146 | #define KSTREAM_INIT2(SCOPE, type_t, __read, __bufsize) \ 147 | __KS_TYPE(type_t) \ 148 | __KS_BASIC(SCOPE, type_t, __bufsize) \ 149 | __KS_GETUNTIL(SCOPE, __read) \ 150 | __KS_INLINED(__read) 151 | 152 | #define KSTREAM_INIT(type_t, __read, __bufsize) KSTREAM_INIT2(static, type_t, __read, __bufsize) 153 | 154 | #define KSTREAM_DECLARE(type_t, __read) \ 155 | __KS_TYPE(type_t) \ 156 | extern int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append); \ 157 | extern kstream_t *ks_init(type_t f); \ 158 | extern void ks_destroy(kstream_t *ks); \ 159 | __KS_INLINED(__read) 160 | 161 | /****************** 162 | * FASTA/Q parser * 163 | ******************/ 164 | 165 | #define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) 166 | 167 | #define __KSEQ_BASIC(SCOPE, type_t) \ 168 | SCOPE kseq_t *kseq_init(type_t fd) \ 169 | { \ 170 | kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ 171 | s->f = ks_init(fd); \ 172 | return s; \ 173 | } \ 174 | SCOPE void kseq_destroy(kseq_t *ks) \ 175 | { \ 176 | if (!ks) return; \ 177 | free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ 178 | ks_destroy(ks->f); \ 179 | free(ks); \ 180 | } 181 | 182 | /* Return value: 183 | >=0 length of the 
sequence (normal) 184 | -1 end-of-file 185 | -2 truncated quality string 186 | -3 error reading stream 187 | -4 overflow error 188 | */ 189 | #define __KSEQ_READ(SCOPE) \ 190 | SCOPE int kseq_read(kseq_t *seq) \ 191 | { \ 192 | int c,r; \ 193 | kstream_t *ks = seq->f; \ 194 | if (seq->last_char == 0) { /* then jump to the next header line */ \ 195 | while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '@'); \ 196 | if (c < 0) return c; /* end of file or error */ \ 197 | seq->last_char = c; \ 198 | } /* else: the first header char has been read in the previous call */ \ 199 | seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ 200 | if ((r=ks_getuntil(ks, 0, &seq->name, &c)) < 0) return r; /* normal exit: EOF or error */ \ 201 | if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ 202 | if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ 203 | seq->seq.m = 256; \ 204 | seq->seq.s = (char*)malloc(seq->seq.m); \ 205 | } \ 206 | while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '+' && c != '@') { \ 207 | if (c == '\n') continue; /* skip empty lines */ \ 208 | seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ 209 | ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ 210 | } \ 211 | if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ 212 | if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ 213 | seq->seq.m = seq->seq.l + 2; \ 214 | kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ 215 | if (seq->seq.l + 1 >= seq->seq.m) return -4; /* error: adjusting m overflowed */ \ 216 | seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ 217 | } \ 218 | seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ 219 | if (c != '+') return seq->seq.l; /* FASTA */ \ 220 | if (seq->qual.m < seq->seq.m) { /* allocate memory for qual 
in case insufficient */ \ 221 | seq->qual.m = seq->seq.m; \ 222 | seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ 223 | } \ 224 | while ((c = ks_getc(ks)) >= 0 && c != '\n'); /* skip the rest of '+' line */ \ 225 | if (c == -1) return -2; /* error: no quality string */ \ 226 | while ((c = ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1)) >= 0 && seq->qual.l < seq->seq.l); \ 227 | if (c == -3) return -3; /* stream error */ \ 228 | seq->last_char = 0; /* we have not come to the next header line */ \ 229 | if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ 230 | return seq->seq.l; \ 231 | } 232 | 233 | #define __KSEQ_TYPE(type_t) \ 234 | typedef struct { \ 235 | kstring_t name, comment, seq, qual; \ 236 | int last_char; \ 237 | kstream_t *f; \ 238 | } kseq_t; 239 | 240 | #define KSEQ_INIT2(SCOPE, type_t, __read) \ 241 | KSTREAM_INIT(type_t, __read, 16384) \ 242 | __KSEQ_TYPE(type_t) \ 243 | __KSEQ_BASIC(SCOPE, type_t) \ 244 | __KSEQ_READ(SCOPE) 245 | 246 | #define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) 247 | 248 | #define KSEQ_DECLARE(type_t) \ 249 | __KS_TYPE(type_t) \ 250 | __KSEQ_TYPE(type_t) \ 251 | extern kseq_t *kseq_init(type_t fd); \ 252 | void kseq_destroy(kseq_t *ks); \ 253 | int kseq_read(kseq_t *seq); 254 | 255 | #endif 256 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## KMERIA 2 | 3 | ### A KMER-based genome-wIde Association testing approach on polyploids 4 | 5 | 6 | 7 | 8 | ## Table of Contents 9 | 10 | - [Introduction](#intro) 11 | - [Features](#features) 12 | - [Prerequisites](#prere) 13 | - [Installation](#install) 14 | - [Getting Started](#started) 15 | - [KMERIA pipeline (easy mode)](https://github.com/Sh1ne111/KMERIA/wiki/Pipeline-(Easy-Mode)) 16 | - [KMERIA overview (detailed 
steps)](https://github.com/Sh1ne111/KMERIA/wiki/Detailed-Step-by-Step-Tutorial) 17 | - [Miscellaneous](#misc) 18 | - [Contact](#contact) 19 | - [Citation](#citing) 20 | 21 | 22 | ## Introduction 23 | 24 | This repository contains an implementation of a k-mer-based method for Genome-Wide Association Studies (GWAS) in complex polyploid organisms (e.g., sugarcane, potato, sweetpotato, alfalfa). The approach is equally applicable to diploid species. By leveraging k-mer abundance profiles and statistical modeling, the method identifies associations between genetic variants and phenotypic traits. 25 | 26 | ## Features 27 | 28 | - **Enhanced Genetic Variability Detection:** KMERIA can capture a wider range of genetic variants, including structural variations and copy number variations, which are often overlooked in traditional GWAS. 29 | 30 | - **Independent of Reference Genomes:** KMERIA does not rely on a reference genome to identify genotypes, making it suitable for organisms with complex and variable genomic architectures, such as auto-polyploids. 31 | 32 | - **Improved Additive Effect Estimation:** The analysis of k-mer copy number can provide more efficient estimates of additive effects in auto-polyploid species, allowing for better interpretation of genotype-phenotype relationships. 33 | 34 | - **Facilitated Genotype Identification:** KMERIA reduces the complexity of identifying genotypes in polyploids, facilitating faster and more efficient association analyses. 
35 | 36 | ## Recent updates 37 | 38 | * KMERIA Version 2.0.1 (2025.10.30): 39 | - K-mer matrix construction is now more efficient and consumes fewer resources; 40 | - Updated filter step to use new compressed output format; 41 | - Enhanced m2b step with BGZF compression and statistics; 42 | - Updated the association step to use our newly implemented association tool **bimbamAsso** 43 | 44 | * KMERIA Version 0.0.1 (2024.10.14) is no longer maintained 45 | 46 | ## Prerequisites 47 | 48 | - C/C++ compiler 49 | - GNU make 50 | - Linux system 51 | 52 | ## Installation 53 | ```bash 54 | 55 | # Clone the KMERIA repository: 56 | git clone https://github.com/Sh1ne111/KMERIA.git 57 | 58 | # To avoid GNU C++ Runtime Library conflicts, you can create a conda virtual environment to ensure all dependent libraries are installed correctly. 59 | conda env create -f kmeria_env.yaml 60 | conda activate kmeriaenv 61 | 62 | # htslib 63 | export LD_LIBRARY_PATH=/your_path/KMERIA/lib:$LD_LIBRARY_PATH 64 | 65 | # Change Permissions 66 | chmod 755 /your_path/KMERIA/bin/* 67 | chmod 755 /your_path/KMERIA/external_tools/* 68 | chmod 755 /your_path/KMERIA/bimbamAsso/* 69 | 70 | #Add PATH environment 71 | export PATH=/your_path/KMERIA/bin:/your_path/KMERIA/bimbamAsso:/your_path/KMERIA/external_tools:$PATH 72 | 73 | 74 | # For source code installations 75 | # cd /your_path/KMERIA/ 76 | # make && make install 77 | # make clean 78 | ``` 79 | 80 | 81 | ## Quick Start 82 | KMERIA provides a wrapper script, kmeria_wrapper.pl, designed to generate job scripts for the entire analysis pipeline, with built-in support for SLURM, SGE, and PBS schedulers. To facilitate the execution of a complete KMERIA analysis, we strongly recommend using this script as the entry point for workflow management. 
83 | 84 | ```bash 85 | perl /KMERIA/scripts/kmeria_wrapper.pl --step all \ 86 | --input /path/to/fastq_files \ 87 | --output /path/to/kmeria_results \ 88 | --samples sample.list \ 89 | --threads 32 \ 90 | --kmer 31 \ 91 | --min-abund 5 \ 92 | --max-abund 1000 \ 93 | --batch-size 2 \ 94 | --use-kmc \ 95 | --kmc-memory 32 \ 96 | --ploidy 4 \ 97 | --depth-file /path/to/sample_depths.txt \ 98 | --pheno /path/to/phenotypes.txt \ 99 | --pheno-col 1 \ 100 | --use-bimbam-tools \ 101 | --scheduler slurm \ 102 | --queue hebhcnormal01 103 | ``` 104 | Both `--use-kmc` (default counter: `kmeria count`) and `--use-bimbam-tools` (use the built-in 'bimbamAsso' instead of 'gemma') are optional. Do not append `# ...` comments after a trailing `\`: the shell would end the command at that point. 105 | ### ➡️ **Full Pipeline and Documentation** 106 | 107 | For detailed, step-by-step instructions, parameter explanations, and advanced usage, please visit our comprehensive [**KMERIA Wiki**](https://github.com/Sh1ne111/KMERIA/wiki). 108 | 109 | - **[Pipeline (Easy Mode)](https://github.com/Sh1ne111/KMERIA/wiki/Pipeline-(Easy-Mode))**: Detailed breakdown of the `kmeria_wrapper.pl` parameters. 110 | - **[Detailed Step-by-Step Tutorial](https://github.com/Sh1ne111/KMERIA/wiki/Detailed-Step-by-Step-Tutorial)**: A complete walkthrough of the entire KMERIA workflow, from raw reads to association results. 111 | - **[Post-GWAS Analysis](https://github.com/Sh1ne111/KMERIA/wiki/Post-GWAS-Analysis)**: Guides on mapping associated k-mers and reads. 112 | - **[Retrieve k-mer dosage](https://github.com/Sh1ne111/KMERIA/wiki/Retrieve-k%E2%80%90mer-dosage-from-the-k%E2%80%90mer-counting-matrices)**: Retrieve k‐mer dosage from the k‐mer counting matrices. 113 | 114 | ## Command Overview 115 | ``` 116 | #===============================================================================# 117 | # # 118 | # _ ____ __ ______ _____ _____ # 119 | # | |/ / \/ | ____| __ \|_ _| /\ # 120 | # | ' /| \ / | |__ | |__) | | | / \ # 121 | # | < | |\/| | __| | _ / | | / /\ \ # 122 | # | .
\| | | | |____| | \ \ _| |_ / ____ \ # 123 | # |_|\_|_| |_|______|_| \_\_____/_/ \_\ # 124 | # # 125 | #===============================================================================# 126 | 127 | Program: KMERIA - A KMER-based genome-wIde Association testing approach 128 | for polyploids 129 | 130 | Version: v2.0.1 (2025-10-14) 131 | Author: Chen Shuai 132 | GitHub: https://github.com/Sh1ne111/KMERIA 133 | 134 | Usage: kmeria [options] 135 | 136 | Commands: 137 | 138 | Data Processing: 139 | count Count k-mers from FASTA/FASTQ files 140 | dump Convert binary k-mer file to plain text 141 | kctm Build population k-mer counting matrix 142 | filter Filter k-mer matrix by frequency and quality 143 | 144 | Format Conversion: 145 | m2b Convert k-mer matrix to BIMBAM dosage format 146 | b2g Convert BIMBAM format to genotype format 147 | 148 | Analysis: 149 | sketch Random sampling for PCA and kinship calculation 150 | asso Conduct k-mer genome-wide association study 151 | 152 | Utilities: 153 | fkr Fetch reads associated k-mers from FASTQ files 154 | fkrtgs Fetch reads associated k-mers from TGS FASTQ files 155 | kbam Extract reads associated k-mers from BAM files 156 | addp Annotate BAM with association p-values 157 | 158 | Additional Help: 159 | kmeria -h Show detailed help for specific command 160 | Visit https://github.com/Sh1ne111/KMERIA for documentation 161 | 162 | #===========================================================================# 163 | # Citation: If you use KMERIA, please cite our paper at [Journal/DOI] # 164 | #===========================================================================# 165 | ``` 166 | 167 | ## Miscellaneous Tools 168 | 169 | KMERIA also includes several utility scripts located in the `/bin` and `/scripts` directories: 170 | 171 | - `/bin/retrieve_kmer`: Get k-mer dosage from filtered k-mer counting matrices. 172 | - `/scripts/calc_gwas_threshold_new.R`: Calculate the GWAS significance threshold. 
173 | - `/scripts/plot_manhattan.R`: Helper script for plotting Manhattan plots. 174 | 175 | Usage instructions are available on the [**Wiki**](https://github.com/Sh1ne111/KMERIA/wiki). 176 | 177 | ## Contact 178 | 179 | For questions or feedback, please contact [Chen Shuai] at [chensss1209@gmail.com]. 180 | 181 | ## FAQs 182 | ``` 183 | Should I use kmeria count or KMC? 184 | Use kmeria count (default) for: 185 | - Most standard analyses 186 | - Direct KMERIA pipeline integration 187 | Use KMC (--use-kmc) for: 188 | - Very large datasets (>100GB per sample) 189 | - When you need strict abundance filtering 190 | - Compatibility with other KMC-based workflows 191 | - Faster 192 | Consider: 193 | - Shorter k-mers: More sensitive, more false positives, less memory 194 | - Longer k-mers: More specific, fewer false positives, more memory 195 | 196 | How do I process paired-end reads? 197 | Both methods automatically detect and process paired-end files: 198 | - Files matching: sample_R1.fq.gz and sample_R2.fq.gz 199 | - Or: sample_1.fq.gz and sample_2.fq.gz 200 | 201 | Can I restart a failed pipeline? 202 | Yes! Since each step generates independent job scripts: 203 | 1. Identify which step failed (check log files) 204 | 2. Fix the issue (add memory, correct input files, etc.) 205 | 3. Re-run only that specific step: --step count|kctm|filter|m2b|asso 206 | 4. Continue with subsequent steps 207 | 208 | How do I speed up association analysis? 209 | The association step handles internal parallelism: 210 | - Use --threads to set concurrency (e.g., 64) 211 | - Ensure fast I/O (SSD storage) 212 | - Pre-compute kinship and covariates 213 | Choose tool mode with --use-bimbam-tools for bimbamAsso mode. 214 | ``` 215 | 216 | ## Citation 217 | 218 | If you have used KMERIA in your research, please cite below: 219 | 220 | > https://github.com/Sh1ne111/KMERIA 221 | > 222 | > Shuai Chen et al. 
A k-mer-based GWAS approach empowering gene mining in polyploids, 05 November 2025, PREPRINT (Version 1) available at Research Square [https://doi.org/10.21203/rs.3.rs-7347406/v1] 223 | -------------------------------------------------------------------------------- /include/ksort.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008, 2012-2013, 2017-2019 Genome Research Ltd (GRL). 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* Contact: Heng Li */ 27 | 28 | /* 29 | 2012-12-11 (0.1.4): 30 | 31 | * Defined __ks_insertsort_##name as static to compile with C99. 32 | 33 | 2008-11-16 (0.1.4): 34 | 35 | * Fixed a bug in introsort() that happens in rare cases. 36 | 37 | 2008-11-05 (0.1.3): 38 | 39 | * Fixed a bug in introsort() for complex comparisons. 40 | 41 | * Fixed a bug in mergesort(). The previous version is not stable. 
42 | 43 | 2008-09-15 (0.1.2): 44 | 45 | * Accelerated introsort. On my Mac (not on another Linux machine), 46 | my implementation is as fast as the C++ standard library's sort() 47 | on random input. 48 | 49 | * Added combsort and in introsort, switch to combsort if the 50 | recursion is too deep. 51 | 52 | 2008-09-13 (0.1.1): 53 | 54 | * Added k-small algorithm 55 | 56 | 2008-09-05 (0.1.0): 57 | 58 | * Initial version 59 | 60 | */ 61 | 62 | #ifndef AC_KSORT_H 63 | #define AC_KSORT_H 64 | 65 | #include 66 | #include 67 | #include "hts_defs.h" 68 | 69 | #ifndef klib_unused 70 | #if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) 71 | #define klib_unused __attribute__ ((__unused__)) 72 | #else 73 | #define klib_unused 74 | #endif 75 | #endif /* klib_unused */ 76 | 77 | #ifdef __cplusplus 78 | extern "C" { 79 | #endif 80 | 81 | // Use our own drand48() symbol (used by ks_shuffle) to avoid portability 82 | // problems on Windows. Don't include htslib/hts_os.h for this as it 83 | // may not get on with older attempts to fix this in code that includes 84 | // this file. 85 | HTSLIB_EXPORT 86 | extern double hts_drand48(void); 87 | 88 | typedef struct { 89 | void *left, *right; 90 | int depth; 91 | } ks_isort_stack_t; 92 | 93 | #define KSORT_SWAP(type_t, a, b) { type_t t=(a); (a)=(b); (b)=t; } 94 | 95 | #define KSORT_INIT(name, type_t, __sort_lt) KSORT_INIT_(_ ## name, , type_t, __sort_lt) 96 | #define KSORT_INIT_STATIC(name, type_t, __sort_lt) KSORT_INIT_(_ ## name, static klib_unused, type_t, __sort_lt) 97 | #define KSORT_INIT2(name, SCOPE, type_t, __sort_lt) KSORT_INIT_(_ ## name, SCOPE, type_t, __sort_lt) 98 | 99 | #define KSORT_INIT_(name, SCOPE, type_t, __sort_lt) \ 100 | SCOPE int ks_mergesort##name(size_t n, type_t array[], type_t temp[]) \ 101 | { \ 102 | type_t *a2[2], *a, *b; \ 103 | int curr, shift; \ 104 | \ 105 | a2[0] = array; \ 106 | a2[1] = temp? 
temp : (type_t*)malloc(sizeof(type_t) * n); \ 107 | for (curr = 0, shift = 0; (1ul<> 1) - 1; i != (size_t)(-1); --i) \ 164 | ks_heapadjust##name(i, lsize, l); \ 165 | } \ 166 | SCOPE void ks_heapsort##name(size_t lsize, type_t l[]) \ 167 | { \ 168 | size_t i; \ 169 | for (i = lsize - 1; i > 0; --i) { \ 170 | type_t tmp; \ 171 | tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust##name(0, i, l); \ 172 | } \ 173 | } \ 174 | static inline void __ks_insertsort##name(type_t *s, type_t *t) \ 175 | { \ 176 | type_t *i, *j, swap_tmp; \ 177 | for (i = s + 1; i < t; ++i) \ 178 | for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \ 179 | swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \ 180 | } \ 181 | } \ 182 | SCOPE void ks_combsort##name(size_t n, type_t a[]) \ 183 | { \ 184 | const double shrink_factor = 1.2473309501039786540366528676643; \ 185 | int do_swap; \ 186 | size_t gap = n; \ 187 | type_t tmp, *i, *j; \ 188 | do { \ 189 | if (gap > 2) { \ 190 | gap = (size_t)(gap / shrink_factor); \ 191 | if (gap == 9 || gap == 10) gap = 11; \ 192 | } \ 193 | do_swap = 0; \ 194 | for (i = a; i < a + n - gap; ++i) { \ 195 | j = i + gap; \ 196 | if (__sort_lt(*j, *i)) { \ 197 | tmp = *i; *i = *j; *j = tmp; \ 198 | do_swap = 1; \ 199 | } \ 200 | } \ 201 | } while (do_swap || gap > 2); \ 202 | if (gap != 1) __ks_insertsort##name(a, a + n); \ 203 | } \ 204 | SCOPE int ks_introsort##name(size_t n, type_t a[]) \ 205 | { \ 206 | int d; \ 207 | ks_isort_stack_t *top, *stack; \ 208 | type_t rp, swap_tmp; \ 209 | type_t *s, *t, *i, *j, *k; \ 210 | \ 211 | if (n < 1) return 0; \ 212 | else if (n == 2) { \ 213 | if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \ 214 | return 0; \ 215 | } \ 216 | for (d = 2; 1ul<>1) + 1; \ 227 | if (__sort_lt(*k, *i)) { \ 228 | if (__sort_lt(*k, *j)) k = j; \ 229 | } else k = __sort_lt(*j, *i)? 
i : j; \ 230 | rp = *k; \ 231 | if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \ 232 | for (;;) { \ 233 | do ++i; while (__sort_lt(*i, rp)); \ 234 | do --j; while (i <= j && __sort_lt(rp, *j)); \ 235 | if (j <= i) break; \ 236 | swap_tmp = *i; *i = *j; *j = swap_tmp; \ 237 | } \ 238 | swap_tmp = *i; *i = *t; *t = swap_tmp; \ 239 | if (i-s > t-i) { \ 240 | if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \ 241 | s = t-i > 16? i+1 : t; \ 242 | } else { \ 243 | if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \ 244 | t = i-s > 16? i-1 : s; \ 245 | } \ 246 | } else { \ 247 | if (top == stack) { \ 248 | free(stack); \ 249 | __ks_insertsort##name(a, a+n); \ 250 | return 0; \ 251 | } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \ 252 | } \ 253 | } \ 254 | return 0; \ 255 | } \ 256 | /* This function is adapted from: http://ndevilla.free.fr/median/ */ \ 257 | /* 0 <= kk < n */ \ 258 | SCOPE type_t ks_ksmall##name(size_t n, type_t arr[], size_t kk) \ 259 | { \ 260 | type_t *low, *high, *k, *ll, *hh, *mid; \ 261 | low = arr; high = arr + n - 1; k = arr + kk; \ 262 | for (;;) { \ 263 | if (high <= low) return *k; \ 264 | if (high == low + 1) { \ 265 | if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ 266 | return *k; \ 267 | } \ 268 | mid = low + (high - low) / 2; \ 269 | if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \ 270 | if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ 271 | if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \ 272 | KSORT_SWAP(type_t, *mid, *(low+1)); \ 273 | ll = low + 1; hh = high; \ 274 | for (;;) { \ 275 | do ++ll; while (__sort_lt(*ll, *low)); \ 276 | do --hh; while (__sort_lt(*low, *hh)); \ 277 | if (hh < ll) break; \ 278 | KSORT_SWAP(type_t, *ll, *hh); \ 279 | } \ 280 | KSORT_SWAP(type_t, *low, *hh); \ 281 | if (hh <= k) low = ll; \ 282 | if (hh >= k) high = hh - 1; \ 283 | } \ 284 | } \ 285 | SCOPE void 
ks_shuffle##name(size_t n, type_t a[]) \ 286 | { \ 287 | int i, j; \ 288 | for (i = n; i > 1; --i) { \ 289 | type_t tmp; \ 290 | j = (int)(hts_drand48() * i); \ 291 | tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; \ 292 | } \ 293 | } 294 | 295 | #define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t) 296 | #define ks_introsort(name, n, a) ks_introsort_##name(n, a) 297 | #define ks_combsort(name, n, a) ks_combsort_##name(n, a) 298 | #define ks_heapsort(name, n, a) ks_heapsort_##name(n, a) 299 | #define ks_heapmake(name, n, a) ks_heapmake_##name(n, a) 300 | #define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a) 301 | #define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k) 302 | #define ks_shuffle(name, n, a) ks_shuffle_##name(n, a) 303 | 304 | #define ks_lt_generic(a, b) ((a) < (b)) 305 | #define ks_lt_str(a, b) (strcmp((a), (b)) < 0) 306 | 307 | typedef const char *ksstr_t; 308 | 309 | #define KSORT_INIT_GENERIC(type_t) KSORT_INIT_(_ ## type_t, , type_t, ks_lt_generic) 310 | #define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) 311 | 312 | #define KSORT_INIT_STATIC_GENERIC(type_t) KSORT_INIT_(_ ## type_t, static klib_unused, type_t, ks_lt_generic) 313 | #define KSORT_INIT_STATIC_STR KSORT_INIT_STATIC(str, ksstr_t, ks_lt_str) 314 | 315 | #define KSORT_INIT2_GENERIC(type_t, SCOPE) KSORT_INIT_(_ ## type_t, SCOPE, type_t, ks_lt_generic) 316 | #define KSORT_INIT2_STR KSORT_INIT2(str, SCOPE, ksstr_t, ks_lt_str) 317 | 318 | #ifdef __cplusplus 319 | } 320 | #endif 321 | 322 | #endif 323 | -------------------------------------------------------------------------------- /include/hts_endian.h: -------------------------------------------------------------------------------- 1 | /// @file hts_endian.h 2 | /// Byte swapping and unaligned access functions. 3 | /* 4 | Copyright (C) 2017 Genome Research Ltd. 
5 | 6 | Author: Rob Davies 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy 9 | of this software and associated documentation files (the "Software"), to deal 10 | in the Software without restriction, including without limitation the rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | copies of the Software, and to permit persons to whom the Software is 13 | furnished to do so, subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be included in 16 | all copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 21 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | DEALINGS IN THE SOFTWARE. */ 25 | 26 | #ifndef HTS_ENDIAN_H 27 | #define HTS_ENDIAN_H 28 | 29 | #include <stdint.h> 30 | 31 | /* 32 | * Compile-time endianness tests. 33 | * 34 | * Note that these tests may fail. They should only be used to enable 35 | * faster versions of endian-neutral implementations. The endian-neutral 36 | * version should always be available as a fall-back.
37 | * 38 | * See https://sourceforge.net/p/predef/wiki/Endianness/ 39 | */ 40 | 41 | /* Save typing as both endian and unaligned tests want to know about x86 */ 42 | #if (defined(__i386__) || defined(__i386) || defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(__i686__) || defined(__i686)) && !defined(HTS_x86) 43 | # define HTS_x86 /* x86 and x86_64 platform */ 44 | #endif 45 | 46 | /** @def HTS_LITTLE_ENDIAN 47 | * @brief Defined if platform is known to be little-endian 48 | */ 49 | 50 | #ifndef HTS_LITTLE_ENDIAN 51 | # if (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ 52 | || defined(__LITTLE_ENDIAN__) \ 53 | || defined(HTS_x86) \ 54 | || defined(__ARMEL__) || defined(__THUMBEL__) || defined(__AARCH64EL__) \ 55 | || defined(_MIPSEL) || defined(__MIPSEL) || defined(__MIPSEL__) 56 | # define HTS_LITTLE_ENDIAN 57 | # endif 58 | #endif 59 | 60 | /** @def HTS_BIG_ENDIAN 61 | * @brief Defined if platform is known to be big-endian 62 | */ 63 | 64 | #ifndef HTS_BIG_ENDIAN 65 | # if (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) \ 66 | || defined(__BIG_ENDIAN__) \ 67 | || defined(__ARMEB__) || defined(__THUMBEB__) || defined(__AARCH64EB__) \ 68 | || defined(_MIPSEB) || defined(__MIPSEB) || defined(__MIPSEB__) 69 | # define HTS_BIG_ENDIAN 70 | # endif 71 | #endif 72 | 73 | /** @def HTS_ENDIAN_NEUTRAL 74 | * @brief Define this to disable any endian-specific optimizations 75 | */ 76 | 77 | #if defined(HTS_ENDIAN_NEUTRAL) || (defined(HTS_LITTLE_ENDIAN) && defined(HTS_BIG_ENDIAN)) 78 | /* Disable all endian-specific code. */ 79 | # undef HTS_LITTLE_ENDIAN 80 | # undef HTS_BIG_ENDIAN 81 | #endif 82 | 83 | /** @def HTS_ALLOW_UNALIGNED 84 | * @brief Control use of unaligned memory access. 85 | * 86 | * Defining HTS_ALLOW_UNALIGNED=1 converts shift-and-or to simple casts on 87 | * little-endian platforms that can tolerate unaligned access (notably Intel 88 | * x86).
89 | * 90 | * Defining HTS_ALLOW_UNALIGNED=0 forces shift-and-or. 91 | */ 92 | 93 | // Consider using AX_CHECK_ALIGNED_ACCESS_REQUIRED in autoconf. 94 | #ifndef HTS_ALLOW_UNALIGNED 95 | # if defined(HTS_x86) 96 | # define HTS_ALLOW_UNALIGNED 1 97 | # else 98 | # define HTS_ALLOW_UNALIGNED 0 99 | # endif 100 | #endif 101 | 102 | #if HTS_ALLOW_UNALIGNED != 0 103 | # if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) || defined(__clang__) 104 | // This prevents problems with gcc's vectoriser generating the wrong 105 | // instructions for unaligned data. 106 | typedef uint16_t uint16_u __attribute__ ((__aligned__ (1))); 107 | typedef uint32_t uint32_u __attribute__ ((__aligned__ (1))); 108 | typedef uint64_t uint64_u __attribute__ ((__aligned__ (1))); 109 | #else 110 | typedef uint16_t uint16_u; 111 | typedef uint32_t uint32_u; 112 | typedef uint64_t uint64_u; 113 | # endif 114 | #endif 115 | 116 | /// Get a uint8_t value from an unsigned byte array 117 | /** @param buf Pointer to source byte, may be unaligned 118 | * @return An 8-bit unsigned integer 119 | */ 120 | static inline uint8_t le_to_u8(const uint8_t *buf) { 121 | return *buf; 122 | } 123 | 124 | /// Get a uint16_t value from an unsigned byte array 125 | /** @param buf Pointer to source byte, may be unaligned 126 | * @return A 16 bit unsigned integer 127 | * The input is read in little-endian byte order. 128 | */ 129 | static inline uint16_t le_to_u16(const uint8_t *buf) { 130 | #if defined(HTS_LITTLE_ENDIAN) && HTS_ALLOW_UNALIGNED != 0 131 | return *((uint16_u *) buf); 132 | #else 133 | return (uint16_t) buf[0] | ((uint16_t) buf[1] << 8); 134 | #endif 135 | } 136 | 137 | /// Get a uint32_t value from an unsigned byte array 138 | /** @param buf Pointer to source byte array, may be unaligned 139 | * @return A 32 bit unsigned integer 140 | * The input is read in little-endian byte order. 
141 | */ 142 | static inline uint32_t le_to_u32(const uint8_t *buf) { 143 | #if defined(HTS_LITTLE_ENDIAN) && HTS_ALLOW_UNALIGNED != 0 144 | return *((uint32_u *) buf); 145 | #else 146 | return ((uint32_t) buf[0] | 147 | ((uint32_t) buf[1] << 8) | 148 | ((uint32_t) buf[2] << 16) | 149 | ((uint32_t) buf[3] << 24)); 150 | #endif 151 | } 152 | 153 | /// Get a uint64_t value from an unsigned byte array 154 | /** @param buf Pointer to source byte array, may be unaligned 155 | * @return A 64 bit unsigned integer 156 | * The input is read in little-endian byte order. 157 | */ 158 | static inline uint64_t le_to_u64(const uint8_t *buf) { 159 | #if defined(HTS_LITTLE_ENDIAN) && HTS_ALLOW_UNALIGNED != 0 160 | return *((uint64_u *) buf); 161 | #else 162 | return ((uint64_t) buf[0] | 163 | ((uint64_t) buf[1] << 8) | 164 | ((uint64_t) buf[2] << 16) | 165 | ((uint64_t) buf[3] << 24) | 166 | ((uint64_t) buf[4] << 32) | 167 | ((uint64_t) buf[5] << 40) | 168 | ((uint64_t) buf[6] << 48) | 169 | ((uint64_t) buf[7] << 56)); 170 | #endif 171 | } 172 | 173 | /// Store a uint16_t value in little-endian byte order 174 | /** @param val The value to store 175 | * @param buf Where to store it (may be unaligned) 176 | */ 177 | static inline void u16_to_le(uint16_t val, uint8_t *buf) { 178 | #if defined(HTS_LITTLE_ENDIAN) && HTS_ALLOW_UNALIGNED != 0 179 | *((uint16_u *) buf) = val; 180 | #else 181 | buf[0] = val & 0xff; 182 | buf[1] = (val >> 8) & 0xff; 183 | #endif 184 | } 185 | 186 | /// Store a uint32_t value in little-endian byte order 187 | /** @param val The value to store 188 | * @param buf Where to store it (may be unaligned) 189 | */ 190 | static inline void u32_to_le(uint32_t val, uint8_t *buf) { 191 | #if defined(HTS_LITTLE_ENDIAN) && HTS_ALLOW_UNALIGNED != 0 192 | *((uint32_u *) buf) = val; 193 | #else 194 | buf[0] = val & 0xff; 195 | buf[1] = (val >> 8) & 0xff; 196 | buf[2] = (val >> 16) & 0xff; 197 | buf[3] = (val >> 24) & 0xff; 198 | #endif 199 | } 200 | 201 | /// Store a 
uint64_t value in little-endian byte order 202 | /** @param val The value to store 203 | * @param buf Where to store it (may be unaligned) 204 | */ 205 | static inline void u64_to_le(uint64_t val, uint8_t *buf) { 206 | #if defined(HTS_LITTLE_ENDIAN) && HTS_ALLOW_UNALIGNED != 0 207 | *((uint64_u *) buf) = val; 208 | #else 209 | buf[0] = val & 0xff; 210 | buf[1] = (val >> 8) & 0xff; 211 | buf[2] = (val >> 16) & 0xff; 212 | buf[3] = (val >> 24) & 0xff; 213 | buf[4] = (val >> 32) & 0xff; 214 | buf[5] = (val >> 40) & 0xff; 215 | buf[6] = (val >> 48) & 0xff; 216 | buf[7] = (val >> 56) & 0xff; 217 | #endif 218 | } 219 | 220 | /* Signed values. Grab the data as unsigned, then convert to signed without 221 | * triggering undefined behaviour. On any sensible platform, the conversion 222 | * should optimise away to nothing. 223 | */ 224 | 225 | /// Get an int8_t value from an unsigned byte array 226 | /** @param buf Pointer to source byte array, may be unaligned 227 | * @return A 8 bit signed integer 228 | * The input data is interpreted as 2's complement representation. 229 | */ 230 | static inline int8_t le_to_i8(const uint8_t *buf) { 231 | return *buf < 0x80 ? (int8_t) *buf : -((int8_t) (0xff - *buf)) - 1; 232 | } 233 | 234 | /// Get an int16_t value from an unsigned byte array 235 | /** @param buf Pointer to source byte array, may be unaligned 236 | * @return A 16 bit signed integer 237 | * The input data is interpreted as 2's complement representation in 238 | * little-endian byte order. 239 | */ 240 | static inline int16_t le_to_i16(const uint8_t *buf) { 241 | uint16_t v = le_to_u16(buf); 242 | return v < 0x8000 ? (int16_t) v : -((int16_t) (0xffff - v)) - 1; 243 | } 244 | 245 | /// Get an int32_t value from an unsigned byte array 246 | /** @param buf Pointer to source byte array, may be unaligned 247 | * @return A 32 bit signed integer 248 | * The input data is interpreted as 2's complement representation in 249 | * little-endian byte order. 
250 | */ 251 | static inline int32_t le_to_i32(const uint8_t *buf) { 252 | uint32_t v = le_to_u32(buf); 253 | return v < 0x80000000U ? (int32_t) v : -((int32_t) (0xffffffffU - v)) - 1; 254 | } 255 | 256 | /// Get an int64_t value from an unsigned byte array 257 | /** @param buf Pointer to source byte array, may be unaligned 258 | * @return A 64 bit signed integer 259 | * The input data is interpreted as 2's complement representation in 260 | * little-endian byte order. 261 | */ 262 | static inline int64_t le_to_i64(const uint8_t *buf) { 263 | uint64_t v = le_to_u64(buf); 264 | return (v < 0x8000000000000000ULL 265 | ? (int64_t) v : -((int64_t) (0xffffffffffffffffULL - v)) - 1); 266 | } 267 | 268 | // Converting the other way is easier as signed -> unsigned is well defined. 269 | 270 | /// Store an int16_t value in little-endian byte order 271 | /** @param val The value to store 272 | * @param buf Where to store it (may be unaligned) 273 | */ 274 | static inline void i16_to_le(int16_t val, uint8_t *buf) { 275 | u16_to_le(val, buf); 276 | } 277 | 278 | /// Store an int32_t value in little-endian byte order 279 | /** @param val The value to store 280 | * @param buf Where to store it (may be unaligned) 281 | */ 282 | static inline void i32_to_le(int32_t val, uint8_t *buf) { 283 | u32_to_le(val, buf); 284 | } 285 | 286 | /// Store an int64_t value in little-endian byte order 287 | /** @param val The value to store 288 | * @param buf Where to store it (may be unaligned) 289 | */ 290 | static inline void i64_to_le(int64_t val, uint8_t *buf) { 291 | u64_to_le(val, buf); 292 | } 293 | 294 | /* Floating point.
Assumptions: 295 | * Platform uses IEEE 754 format 296 | * sizeof(float) == sizeof(uint32_t) 297 | * sizeof(double) == sizeof(uint64_t) 298 | * Endian-ness is the same for both floating point and integer 299 | * Type-punning via a union is allowed 300 | */ 301 | 302 | /// Get a float value from an unsigned byte array 303 | /** @param buf Pointer to source byte array, may be unaligned 304 | * @return A 32 bit floating point value 305 | * The input is interpreted as an IEEE 754 format float in little-endian 306 | * byte order. 307 | */ 308 | static inline float le_to_float(const uint8_t *buf) { 309 | union { 310 | uint32_t u; 311 | float f; 312 | } convert; 313 | 314 | convert.u = le_to_u32(buf); 315 | return convert.f; 316 | } 317 | 318 | /// Get a double value from an unsigned byte array 319 | /** @param buf Pointer to source byte array, may be unaligned 320 | * @return A 64 bit floating point value 321 | * The input is interpreted as an IEEE 754 format double in little-endian 322 | * byte order. 
323 | */ 324 | static inline double le_to_double(const uint8_t *buf) { 325 | union { 326 | uint64_t u; 327 | double f; 328 | } convert; 329 | 330 | convert.u = le_to_u64(buf); 331 | return convert.f; 332 | } 333 | 334 | /// Store a float value in little-endian byte order 335 | /** @param val The value to store 336 | * @param buf Where to store it (may be unaligned) 337 | */ 338 | static inline void float_to_le(float val, uint8_t *buf) { 339 | union { 340 | uint32_t u; 341 | float f; 342 | } convert; 343 | 344 | convert.f = val; 345 | u32_to_le(convert.u, buf); 346 | } 347 | 348 | /// Store a double value in little-endian byte order 349 | /** @param val The value to store 350 | * @param buf Where to store it (may be unaligned) 351 | */ 352 | static inline void double_to_le(double val, uint8_t *buf) { 353 | union { 354 | uint64_t u; 355 | double f; 356 | } convert; 357 | 358 | convert.f = val; 359 | u64_to_le(convert.u, buf); 360 | } 361 | 362 | #endif /* HTS_ENDIAN_H */ 363 | -------------------------------------------------------------------------------- /include/kstring.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (C) 2011 by Attractive Chaos 4 | Copyright (C) 2013-2014, 2016, 2018-2020, 2022 Genome Research Ltd. 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining 7 | a copy of this software and associated documentation files (the 8 | "Software"), to deal in the Software without restriction, including 9 | without limitation the rights to use, copy, modify, merge, publish, 10 | distribute, sublicense, and/or sell copies of the Software, and to 11 | permit persons to whom the Software is furnished to do so, subject to 12 | the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be 15 | included in all copies or substantial portions of the Software. 
16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 21 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 22 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 23 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | SOFTWARE. 25 | */ 26 | 27 | #ifndef KSTRING_H 28 | #define KSTRING_H 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | #include "hts_defs.h" 40 | #include "kroundup.h" 41 | 42 | #if defined __GNUC__ && (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 4)) 43 | #ifdef __MINGW_PRINTF_FORMAT 44 | #define KS_ATTR_PRINTF(fmt, arg) __attribute__((__format__ (__MINGW_PRINTF_FORMAT, fmt, arg))) 45 | #else 46 | #define KS_ATTR_PRINTF(fmt, arg) __attribute__((__format__ (__printf__, fmt, arg))) 47 | #endif // __MINGW_PRINTF_FORMAT 48 | #else 49 | #define KS_ATTR_PRINTF(fmt, arg) 50 | #endif 51 | 52 | #ifndef HAVE___BUILTIN_CLZ 53 | #if defined __GNUC__ && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) 54 | #define HAVE___BUILTIN_CLZ 1 55 | #endif 56 | #endif 57 | 58 | // Ensure ssize_t exists within this header. All #includes must precede this, 59 | // and ssize_t must be undefined again at the end of this header. 60 | #if defined _MSC_VER && defined _INTPTR_T_DEFINED && !defined _SSIZE_T_DEFINED && !defined ssize_t 61 | #define HTSLIB_SSIZE_T 62 | #define ssize_t intptr_t 63 | #endif 64 | 65 | /* kstring_t is a simple non-opaque type whose fields are likely to be 66 | * used directly by user code (but see also ks_str() and ks_len() below). 
67 | * A kstring_t object is initialised by either of 68 | * kstring_t str = KS_INITIALIZE; 69 | * kstring_t str; ...; ks_initialize(&str); 70 | * and either ownership of the underlying buffer should be given away before 71 | * the object disappears (see ks_release() below) or the kstring_t should be 72 | * destroyed with ks_free(&str) or free(str.s) */ 73 | #ifndef KSTRING_T 74 | #define KSTRING_T kstring_t 75 | typedef struct kstring_t { 76 | size_t l, m; 77 | char *s; 78 | } kstring_t; 79 | #endif 80 | 81 | typedef struct ks_tokaux_t { 82 | uint64_t tab[4]; 83 | int sep, finished; 84 | const char *p; // end of the current token 85 | } ks_tokaux_t; 86 | 87 | #ifdef __cplusplus 88 | extern "C" { 89 | #endif 90 | 91 | HTSLIB_EXPORT 92 | int kvsprintf(kstring_t *s, const char *fmt, va_list ap) KS_ATTR_PRINTF(2,0); 93 | 94 | HTSLIB_EXPORT 95 | int ksprintf(kstring_t *s, const char *fmt, ...) KS_ATTR_PRINTF(2,3); 96 | 97 | HTSLIB_EXPORT 98 | int kputd(double d, kstring_t *s); // custom %g only handler 99 | 100 | HTSLIB_EXPORT 101 | int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); 102 | 103 | HTSLIB_EXPORT 104 | char *kstrstr(const char *str, const char *pat, int **_prep); 105 | 106 | HTSLIB_EXPORT 107 | char *kstrnstr(const char *str, const char *pat, int n, int **_prep); 108 | 109 | HTSLIB_EXPORT 110 | void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep); 111 | 112 | /* kstrtok() is similar to strtok_r() except that str is not 113 | * modified and both str and sep can be NULL. For efficiency, it is 114 | * actually recommended to set both to NULL in the subsequent calls 115 | * if sep is not changed. */ 116 | HTSLIB_EXPORT 117 | char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux); 118 | 119 | /* kgetline() uses the supplied fgets()-like function to read a "\n"- 120 | * or "\r\n"-terminated line from fp. 
The line read is appended to the
 * kstring without its terminator and 0 is returned; EOF is returned at
 * EOF or on error (determined by querying fp, as per fgets()). */
typedef char *kgets_func(char *, int, void *);
HTSLIB_EXPORT
int kgetline(kstring_t *s, kgets_func *fgets_fn, void *fp);

/* kgetline2() uses the supplied hgetln()-like function to read a "\n"-
 * or "\r\n"-terminated line from fp.  The line read is appended to the
 * kstring without its terminator and 0 is returned; EOF is returned at
 * EOF or on error (determined by querying fp, as per fgets()). */
typedef ssize_t kgets_func2(char *, size_t, void *);
HTSLIB_EXPORT
int kgetline2(kstring_t *s, kgets_func2 *fgets_fn, void *fp);

#ifdef __cplusplus
}
#endif

/// kstring initializer for structure assignment
#define KS_INITIALIZE { 0, 0, NULL }

/// kstring initializer for pointers
/**
   @note Not to be used if the buffer has been allocated.  Use ks_release()
   or ks_clear() instead.
*/

static inline void ks_initialize(kstring_t *s)
{
    s->l = s->m = 0;
    s->s = NULL;
}

/// Resize a kstring to a given capacity
/**
   Does nothing if the current capacity (s->m) is already at least @p size.
   Otherwise the buffer is reallocated; requests up to SIZE_MAX/4 are padded
   by 50% to reduce the number of future reallocations.

   @return 0 on success; -1 if realloc() fails, in which case the kstring
           is left unchanged.
*/
static inline int ks_resize(kstring_t *s, size_t size)
{
    if (s->m < size) {
        char *tmp;
        // Only over-allocate while size + size/2 cannot overflow size_t.
        size = (size > (SIZE_MAX>>2)) ? size : size + (size >> 1);
        tmp = (char*)realloc(s->s, size);
        if (!tmp)
            return -1;
        s->s = tmp;
        s->m = size;
    }
    return 0;
}

/// Increase kstring capacity by a given number of bytes
/**
   Ensures room for @p expansion bytes beyond the current length (s->l).

   @return 0 on success; -1 on size overflow or allocation failure.
*/
static inline int ks_expand(kstring_t *s, size_t expansion)
{
    size_t new_size = s->l + expansion;

    if (new_size < s->l) // Overflow check
        return -1;
    return ks_resize(s, new_size);
}

/// Returns the kstring buffer
static inline char *ks_str(kstring_t *s)
{
    return s->s;
}

/// Returns the kstring buffer, or an empty string if l == 0
/**
 * Unlike ks_str(), this function will never return NULL.  If the kstring is
 * empty it will return a read-only empty string.  As the returned value
 * may be read-only, the caller should not attempt to modify it.
 */
static inline const char *ks_c_str(kstring_t *s)
{
    return s->l && s->s ? s->s : "";
}

/// Returns the kstring length (the l field)
static inline size_t ks_len(kstring_t *s)
{
    return s->l;
}

/// Reset kstring length to zero
/**
   The buffer and its capacity are kept; only the length is reset.

   @return The kstring itself

   Example use: kputsn(string, len, ks_clear(s))
*/
static inline kstring_t *ks_clear(kstring_t *s)
{
    s->l = 0;
    return s;
}

// Give ownership of the underlying buffer away to something else (making
// that something else responsible for freeing it), leaving the kstring_t
// empty and ready to be used again, or ready to go out of scope without
// needing  free(str.s)  to prevent a memory leak.
static inline char *ks_release(kstring_t *s)
{
    char *ss = s->s;
    s->l = s->m = 0;
    s->s = NULL;
    return ss;
}

/// Safely free the underlying buffer in a kstring.
/**
   Accepts a NULL kstring pointer.  The kstring is reset to its initial
   (empty) state afterwards, so it can be reused.
*/
static inline void ks_free(kstring_t *s)
{
    if (s) {
        free(s->s);
        ks_initialize(s);
    }
}

/// Append l bytes from p and NUL-terminate the kstring
/**
   @return l (converted to int) on success; EOF on size overflow or
           allocation failure.
*/
static inline int kputsn(const char *p, size_t l, kstring_t *s)
{
    size_t new_sz = s->l + l + 2;
    // new_sz <= s->l means the addition above wrapped around.
    if (new_sz <= s->l || ks_resize(s, new_sz) < 0)
        return EOF;
    memcpy(s->s + s->l, p, l);
    s->l += l;
    s->s[s->l] = 0;
    return l;
}

/// Append a NUL-terminated C string
/**
   @return The result of kputsn(); -1 with errno set to EFAULT if p is NULL.
*/
static inline int kputs(const char *p, kstring_t *s)
{
    if (!p) { errno = EFAULT; return -1; }
    return kputsn(p, strlen(p), s);
}

/// Append a single character and NUL-terminate the kstring
/**
   @return c converted to unsigned char on success; EOF on allocation failure.
*/
static inline int kputc(int c, kstring_t *s)
{
    if (ks_resize(s, s->l + 2) < 0)
        return EOF;
    s->s[s->l++] = c;
    s->s[s->l] = 0;
    return (unsigned char)c;
}

/// Append a single character without writing a NUL terminator
/**
   @return 1 on success; EOF on allocation failure.
*/
static inline int kputc_(int c, kstring_t *s)
{
    if (ks_resize(s, s->l + 1) < 0)
        return EOF;
    s->s[s->l++] = c;
    return 1;
}

/// Append l bytes from p without writing a NUL terminator
/**
   @return l (converted to int) on success; EOF on size overflow or
           allocation failure.
*/
static inline int kputsn_(const void *p, size_t l, kstring_t *s)
{
    size_t new_sz = s->l + l;
    // Request at least one byte so that s->s is allocated even when l == 0.
    if (new_sz < s->l || ks_resize(s, new_sz ? new_sz : 1) < 0)
        return EOF;
    memcpy(s->s + s->l, p, l);
    s->l += l;
    return l;
}

/// Append the decimal representation of an unsigned integer
/**
   The kstring is NUL-terminated afterwards.

   @return 0 on success; EOF on allocation failure.
*/
static inline int kputuw(unsigned x, kstring_t *s)
{
#if HAVE___BUILTIN_CLZ && UINT_MAX == 4294967295U
    // Indexed by the number of leading zero bits in x: the largest number of
    // decimal digits a value with that many leading zeros can have.
    static const unsigned int kputuw_num_digits[32] = {
        10, 10, 10,  9,  9,  9,  8,  8,
        8,   7,  7,  7,  7,  6,  6,  6,
        5,   5,  5,  4,  4,  4,  4,  3,
        3,   3,  2,  2,  2,  1,  1,  1
    };
    // Same index: values below the (non-zero) threshold need one digit fewer
    // than kputuw_num_digits says.
    static const unsigned int kputuw_thresholds[32] = {
        0,        0, 1000000000U, 0,       0, 100000000U,   0,      0,
        10000000, 0,          0,  0, 1000000,          0,   0, 100000,
        0,        0,      10000,  0,       0,          0, 1000,     0,
        0,      100,          0,  0,      10,          0,   0,      0
    };
#else
    uint64_t m;
#endif
    // Two-character decimal rendering of every value from 0 to 99.
    static const char kputuw_dig2r[] =
        "00010203040506070809"
        "10111213141516171819"
        "20212223242526272829"
        "30313233343536373839"
        "40414243444546474849"
        "50515253545556575859"
        "60616263646566676869"
        "70717273747576777879"
        "80818283848586878889"
        "90919293949596979899";
    unsigned int l, j;
    char *cp;

    // Trivial case - also prevents __builtin_clz(0), which is undefined
    if (x < 10) {
        if (ks_resize(s, s->l + 2) < 0)
            return EOF;
        s->s[s->l++] = '0'+x;
        s->s[s->l] = 0;
        return 0;
    }

    // Find out how many digits are to be printed.
#if HAVE___BUILTIN_CLZ && UINT_MAX == 4294967295U
    /*
     * Table method - should be quick if clz can be done in hardware.
     * Find the most significant bit of the value to print and look
     * up in a table to find out how many decimal digits are needed.
     * This number needs to be adjusted by 1 for cases where the decimal
     * length could vary for a given number of bits (for example,
     * a four bit number could be between 8 and 15).
     */

    l = __builtin_clz(x);
    l = kputuw_num_digits[l] - (x < kputuw_thresholds[l]);
#else
    // Fallback for when clz is not available
    m = 1;
    l = 0;
    do {
        l++;
        m *= 10;
    } while (x >= m);
#endif

    if (ks_resize(s, s->l + l + 2) < 0)
        return EOF;

    // Add digits two at a time
    j = l;
    cp = s->s + s->l;
    while (x >= 10) {
        const char *d = &kputuw_dig2r[2*(x%100)];
        x /= 100;
        memcpy(&cp[j-=2], d, 2);
    }

    // Last one (if necessary).  We know that x < 10 by now.
    if (j == 1)
        cp[0] = x + '0';

    s->l += l;
    s->s[s->l] = 0;
    return 0;
}

/// Append the decimal representation of a signed integer
/**
   Writes a leading '-' for negative values and then defers to kputuw().

   @return 0 on success; EOF on allocation failure.
*/
static inline int kputw(int c, kstring_t *s)
{
    unsigned int x = c;
    if (c < 0) {
        // Negate in unsigned arithmetic so that INT_MIN is handled.
        x = -x;
        if (ks_resize(s, s->l + 3) < 0)
            return EOF;
        s->s[s->l++] = '-';
    }

    return kputuw(x, s);
}

/// Append the decimal representation of a long long
/**
   The kstring is NUL-terminated afterwards.

   @return 0 on success; EOF on allocation failure.
*/
static inline int kputll(long long c, kstring_t *s)
{
    char buf[32];
    int i, l = 0;
    unsigned long long x = c;
    if (c < 0) x = -x;
    // Digits are produced least-significant first, then copied in reverse.
    do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0);
    if (c < 0) buf[l++] = '-';
    if (ks_resize(s, s->l + l + 2) < 0)
        return EOF;
    for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
    s->s[s->l] = 0;
    return 0;
}

/// Append the decimal representation of a long (wrapper around kputll())
static inline int kputl(long c, kstring_t *s) {
    return kputll(c, s);
}

/*
 * Returns 's' split by delimiter, with *n being the number of components;
 *         NULL on failure.
*/
static inline int *ksplit(kstring_t *s, int delimiter, int *n)
{
    int max = 0, *offsets = 0;
    *n = ksplit_core(s->s, delimiter, &max, &offsets);
    return offsets;
}

#ifdef HTSLIB_SSIZE_T
#undef HTSLIB_SSIZE_T
#undef ssize_t
#endif

#endif

--------------------------------------------------------------------------------
/scripts/sample_order_manager.pl:
--------------------------------------------------------------------------------
#!/usr/bin/env perl

use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;

#Copyright to Chen Shuai (chensss1209@gmail.com)
#Date: 2025-10-25

our $VERSION = "1.0.0";

# Maximum number of sample names shown in any preview listing.
use constant PREVIEW_LIMIT => 5;

my $action = "";
my $sample_list = "";
my $depth_file = "";
my $pheno_file = "";
my $output_depth = "";
my $output_pheno = "";
my $help = 0;

GetOptions(
    "action=s"    => \$action,
    "samples=s"   => \$sample_list,
    "depth=s"     => \$depth_file,
    "pheno=s"     => \$pheno_file,
    "out-depth=s" => \$output_depth,
    "out-pheno=s" => \$output_pheno,
    "help|h"      => \$help,
) or pod2usage(2);

pod2usage(-verbose => 2) if $help;

if ($action eq "check") {
    check_sample_order();
} elsif ($action eq "reorder") {
    reorder_files();
} elsif ($action eq "extract") {
    extract_from_depth();
} else {
    print "Error: Unknown action '$action'. Valid actions are: check, reorder, extract\n";
    pod2usage(1);
}

# Return at most PREVIEW_LIMIT leading elements of the referenced list.
# A plain @list[0..4] slice pads short lists with undef, which triggers
# "uninitialized value" warnings and prints dangling commas.
sub preview {
    my ($list) = @_;
    my $last = $#{$list} < PREVIEW_LIMIT - 1 ? $#{$list} : PREVIEW_LIMIT - 1;
    return @{$list}[0 .. $last];
}

# 'check' action: compare the sample order of the depth and phenotype files
# against the reference sample list.  Exits with status 1 when any file that
# was given is missing or inconsistent.
sub check_sample_order {
    unless ($sample_list) {
        die "Error: --samples is required for 'check' action\n";
    }

    print "=" x 70 . "\n";
    print "Checking Sample Order Consistency\n";
    print "=" x 70 . "\n\n";

    # Read sample list
    my @samples = read_file_samples($sample_list);
    print "Reference sample list: $sample_list\n";
    print " Total samples: " . scalar(@samples) . "\n";
    print " First 5: " . join(", ", preview(\@samples)) . "\n\n";

    my $all_consistent = 1;

    # Check depth file
    if ($depth_file && -f $depth_file) {
        print "Checking depth file: $depth_file\n";
        my @depth_samples = read_file_samples($depth_file);
        my $consistent = compare_sample_order(\@samples, \@depth_samples, "depth file");
        $all_consistent = 0 unless $consistent;
        print "\n";
    } elsif ($depth_file) {
        # A path was given but does not exist: never report success for it.
        print "✗ ERROR: Depth file not found: $depth_file\n\n";
        $all_consistent = 0;
    } else {
        print "Depth file not provided or not found\n\n";
    }

    # Check phenotype file
    if ($pheno_file && -f $pheno_file) {
        print "Checking phenotype file: $pheno_file\n";
        my @pheno_samples = read_file_samples($pheno_file);
        my $consistent = compare_sample_order(\@samples, \@pheno_samples, "phenotype file");
        $all_consistent = 0 unless $consistent;
        print "\n";
    } elsif ($pheno_file) {
        print "✗ ERROR: Phenotype file not found: $pheno_file\n\n";
        $all_consistent = 0;
    } else {
        print "Phenotype file not provided or not found\n\n";
    }

    print "=" x 70 . "\n";
    if ($all_consistent) {
        print "✓ All files have CONSISTENT sample order!\n";
        print "=" x 70 . "\n";
    } else {
        print "✗ WARNING: Sample order INCONSISTENCY detected!\n";
        print " Use 'reorder' action to fix the order\n";
        print "=" x 70 . "\n";
        exit 1;
    }
}

# 'reorder' action: rewrite the depth and/or phenotype file so that their rows
# follow the order of the reference sample list.  Samples missing from an
# input file are written with the value "NA".
sub reorder_files {
    unless ($sample_list) {
        die "Error: --samples is required for 'reorder' action\n";
    }

    # Read reference sample order
    my @samples = read_file_samples($sample_list);
    print "Reference sample order: $sample_list (" . scalar(@samples) . " samples)\n\n";

    # Reorder depth file
    if ($depth_file && -f $depth_file) {
        unless ($output_depth) {
            $output_depth = $depth_file;
            $output_depth =~ s/\.tsv$/_reordered.tsv/;
            # No .tsv suffix to rewrite: append one, so the derived name can
            # never be the input path itself.
            $output_depth .= "_reordered.tsv" if $output_depth eq $depth_file;
        }

        print "Reordering depth file...\n";
        print " Input: $depth_file\n";
        print " Output: $output_depth\n";

        my %depth_data = read_depth_file($depth_file);
        write_depth_file($output_depth, \@samples, \%depth_data);

        print " ✓ Depth file reordered successfully\n\n";
    } elsif ($depth_file) {
        warn "WARNING: Depth file not found, skipping: $depth_file\n";
    }

    # Reorder phenotype file
    if ($pheno_file && -f $pheno_file) {
        unless ($output_pheno) {
            $output_pheno = $pheno_file;
            $output_pheno =~ s/\.tsv$/_reordered.tsv/;
            $output_pheno =~ s/\.txt$/_reordered.txt/;
            $output_pheno .= "_reordered.txt" if $output_pheno eq $pheno_file;
        }

        print "Reordering phenotype file...\n";
        print " Input: $pheno_file\n";
        print " Output: $output_pheno\n";

        my %pheno_data = read_pheno_file($pheno_file);
        write_pheno_file($output_pheno, \@samples, \%pheno_data);

        print " ✓ Phenotype file reordered successfully\n\n";
    } elsif ($pheno_file) {
        warn "WARNING: Phenotype file not found, skipping: $pheno_file\n";
    }

    print "Reordering complete!\n";
    print "Please verify the output files before using them in the pipeline.\n";
}

# 'extract' action: write the first column of the depth file, one sample per
# line, to --samples (default: sample_list_from_depth.txt).
sub extract_from_depth {
    unless ($depth_file && -f $depth_file) {
        die "Error: --depth file is required and must exist for 'extract' action\n";
    }

    unless ($sample_list) {
        $sample_list = "sample_list_from_depth.txt";
    }

    print "Extracting sample list from depth file...\n";
    print " Input: $depth_file\n";
    print " Output: $sample_list\n\n";

    my @samples = read_file_samples($depth_file);

    open(my $fh, '>', $sample_list) or die "Cannot create $sample_list: $!";
    foreach my $sample (@samples) {
        print $fh "$sample\n";
    }
    close($fh) or die "Cannot write $sample_list: $!";

    print "✓ Extracted " . scalar(@samples) . " samples\n";
    print "✓ Sample list saved to: $sample_list\n";
}


# Return the first whitespace-separated field of every data line in $file.
# Blank lines and lines starting with '#' are skipped.
sub read_file_samples {
    my ($file) = @_;
    my @samples;

    open(my $fh, '<', $file) or die "Cannot open $file: $!";
    while (my $line = <$fh>) {
        $line =~ s/\r?\n\z//;        # Strip LF or CRLF line endings
        next if $line =~ /^\s*$/;    # Skip empty lines
        next if $line =~ /^#/;       # Skip comments

        # Extract first column (sample name).  split ' ' ignores leading
        # whitespace, whereas /\s+/ would yield an empty first field.
        my ($name) = split(' ', $line);
        push @samples, $name;
    }
    close($fh);

    return @samples;
}

# Compare two sample lists position by position and print a report.
# Returns 1 when both lists are identical, 0 otherwise.
sub compare_sample_order {
    my ($ref_samples, $test_samples, $label) = @_;

    my $ref_count = scalar(@$ref_samples);
    my $test_count = scalar(@$test_samples);

    print " Total samples in $label: $test_count\n";

    if ($ref_count != $test_count) {
        print " ✗ ERROR: Sample count mismatch!\n";
        print " Reference: $ref_count samples\n";
        print " $label: $test_count samples\n";

        my %ref_hash = map { $_ => 1 } @$ref_samples;
        my %test_hash = map { $_ => 1 } @$test_samples;

        my @missing_in_test = grep { !exists $test_hash{$_} } @$ref_samples;
        my @extra_in_test = grep { !exists $ref_hash{$_} } @$test_samples;

        if (@missing_in_test) {
            print " Missing in $label: " . join(", ", preview(\@missing_in_test)) . "\n";
        }
        if (@extra_in_test) {
            print " Extra in $label: " . join(", ", preview(\@extra_in_test)) . "\n";
        }

        return 0;
    }

    my $order_consistent = 1;
    my @mismatches;

    for (my $i = 0; $i < $ref_count; $i++) {
        if ($ref_samples->[$i] ne $test_samples->[$i]) {
            $order_consistent = 0;
            push @mismatches, {
                pos => $i,
                ref => $ref_samples->[$i],
                test => $test_samples->[$i]
            };
            last if scalar(@mismatches) >= 5; # Only show first 5 mismatches
        }
    }

    if ($order_consistent) {
        print " ✓ Sample order is CONSISTENT\n";
        return 1;
    } else {
        print " ✗ WARNING: Sample order is INCONSISTENT\n";
        print " First mismatches:\n";
        foreach my $mm (@mismatches) {
            printf(" Position %d: '%s' (reference) vs '%s' ($label)\n",
                   $mm->{pos}, $mm->{ref}, $mm->{test});
        }
        return 0;
    }
}

# Read a depth file into a hash of sample name => depth (second column).
# Columns are split the same way as in read_file_samples(), so both readers
# agree on the sample names found in the same file.
sub read_depth_file {
    my ($file) = @_;
    my %data;

    open(my $fh, '<', $file) or die "Cannot open $file: $!";
    while (my $line = <$fh>) {
        $line =~ s/\r?\n\z//;
        next if $line =~ /^\s*$/;
        next if $line =~ /^#/;

        my @fields = split(' ', $line);
        if (scalar(@fields) >= 2) {
            $data{$fields[0]} = $fields[1];
        }
    }
    close($fh);

    return %data;
}

# Write "sample<TAB>depth" lines in the order given by $samples.
sub write_depth_file {
    my ($file, $samples, $data) = @_;

    open(my $fh, '>', $file) or die "Cannot create $file: $!";
    foreach my $sample (@$samples) {
        if (exists $data->{$sample}) {
            print $fh "$sample\t$data->{$sample}\n";
        } else {
            warn "WARNING: No depth data found for sample: $sample\n";
            print $fh "$sample\tNA\n";
        }
    }
    # Buffered write errors (e.g. disk full) only surface at close().
    close($fh) or die "Cannot write $file: $!";
}

# Read a phenotype file into a hash of sample name => remaining columns,
# re-joined with tabs.
sub read_pheno_file {
    my ($file) = @_;
    my %data;

    open(my $fh, '<', $file) or die "Cannot open $file: $!";
    while (my $line = <$fh>) {
        $line =~ s/\r?\n\z//;
        next if $line =~ /^\s*$/;
        next if $line =~ /^#/;

        my @fields = split(' ', $line);
        if (scalar(@fields) >= 1) {
            my $sample = shift @fields;
            $data{$sample} = join("\t", @fields);
        }
    }
    close($fh);

    return %data;
}

# Write "sample<TAB>phenotypes" lines in the order given by $samples.
sub write_pheno_file {
    my ($file, $samples, $data) = @_;

    open(my $fh, '>', $file) or die "Cannot create $file: $!";
    foreach my $sample (@$samples) {
        if (exists $data->{$sample}) {
            print $fh "$sample\t$data->{$sample}\n";
        } else {
            warn "WARNING: No phenotype data found for sample: $sample\n";
            print $fh "$sample\tNA\n";
        }
    }
    close($fh) or die "Cannot write $file: $!";
}

__END__

=head1 NAME

sample_order_manager.pl - Manage and verify sample order consistency

=head1 SYNOPSIS

sample_order_manager.pl --action ACTION [options]

 Actions:
   check       Check if sample order is consistent across files
   reorder     Reorder depth/phenotype files to match reference sample list
   extract     Extract sample list from depth file

 Options:
   --samples FILE      Reference sample list file (required for most actions)
   --depth FILE        Sample depth file (sample_depth.tsv)
   --pheno FILE        Phenotype file (sample_pheno.tsv)
   --out-depth FILE    Output reordered depth file
   --out-pheno FILE    Output reordered phenotype file
   --help|-h           Show this help message

=head1 DESCRIPTION

This script helps ensure sample order consistency across all input files
used in the KMERIA pipeline. Sample order inconsistency is a common source
of errors in genotype-phenotype association studies.
347 | 348 | =head1 EXAMPLES 349 | 350 | =head2 Check sample order consistency 351 | 352 | perl sample_order_manager.pl --action check \ 353 | --samples sample.list \ 354 | --depth sample_depth.tsv \ 355 | --pheno sample_pheno.tsv 356 | 357 | =head2 Reorder files to match reference sample list 358 | 359 | perl sample_order_manager.pl --action reorder \ 360 | --samples sample.list \ 361 | --depth sample_depth.tsv \ 362 | --pheno sample_pheno.tsv \ 363 | --out-depth sample_depth_reordered.tsv \ 364 | --out-pheno sample_pheno_reordered.tsv 365 | 366 | =head2 Extract sample list from depth file 367 | 368 | perl sample_order_manager.pl --action extract \ 369 | --depth sample_depth.tsv \ 370 | --samples output_sample.list 371 | 372 | =head1 WORKFLOW RECOMMENDATION 373 | 374 | Before running KMERIA pipeline: 375 | 376 | 1. Decide on your reference sample order (usually from sample.list) 377 | 2. Check consistency: 378 | perl sample_order_manager.pl --action check --samples sample.list \ 379 | --depth sample_depth.tsv --pheno sample_pheno.tsv 380 | 381 | 3. If inconsistent, reorder files: 382 | perl sample_order_manager.pl --action reorder --samples sample.list \ 383 | --depth sample_depth.tsv --pheno sample_pheno.tsv 384 | 385 | 4. Verify the reordered files: 386 | perl sample_order_manager.pl --action check --samples sample.list \ 387 | --depth sample_depth_reordered.tsv --pheno sample_pheno_reordered.tsv 388 | 389 | 5. 
Run KMERIA pipeline with reordered files 390 | 391 | =head1 FILE FORMATS 392 | 393 | =head2 Sample list (--samples) 394 | 395 | Plain text, one sample per line: 396 | sample1 397 | sample2 398 | sample3 399 | 400 | =head2 Depth file (--depth) 401 | 402 | Tab-separated, no header: 403 | sample1 45.2 404 | sample2 52.8 405 | sample3 38.9 406 | 407 | =head2 Phenotype file (--pheno) 408 | 409 | Tab or space-separated, no header: 410 | sample1 1.5 0 411 | sample2 2.3 1 412 | sample3 1.8 1 413 | 414 | =head1 AUTHOR 415 | 416 | Version 1.0.0 - Sample order management tool for KMERIA pipeline 417 | 418 | =cut 419 | -------------------------------------------------------------------------------- /include/thread_pool.h: -------------------------------------------------------------------------------- 1 | /// @file htslib/thread_pool.h 2 | /// Thread pool for multi-threading applications. 3 | /* 4 | Copyright (c) 2013-2017, 2019, 2020 Genome Research Ltd. 5 | 6 | Author: James Bonfield 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy 9 | of this software and associated documentation files (the "Software"), to deal 10 | in the Software without restriction, including without limitation the rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | copies of the Software, and to permit persons to whom the Software is 13 | furnished to do so, subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be included in 16 | all copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL 21 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | DEALINGS IN THE SOFTWARE. */ 25 | 26 | /* 27 | * This file implements a thread pool for multi-threading applications. It 28 | * consists of two distinct interfaces: thread pools and thread process 29 | * queues (a queue of both jobs to-do and of the results of completed jobs). 30 | * Do not confuse "process" here with a unix PID; rather it is analogous to a 31 | * program reading a stream of data blocks, processing them in some manner, 32 | * and outputting a stream of new data blocks. 33 | * 34 | * The pool of threads is given a function pointer and void* data to pass in. 35 | * This means the pool can run jobs of multiple types, albeit first come 36 | * first served with no job scheduling except to pick tasks for the 37 | * processes that have room to store the result. 38 | * 39 | * Upon completion, the return value from the function pointer is 40 | * added to back to the process result queue if required. We may have 41 | * multiple "processes" in use for the one pool. 42 | * 43 | * To see example usage, please look at the #ifdef TEST_MAIN code in 44 | * thread_pool.c. 45 | */ 46 | 47 | #ifndef HTSLIB_THREAD_POOL_H 48 | #define HTSLIB_THREAD_POOL_H 49 | 50 | #include "hts_defs.h" 51 | 52 | #ifdef __cplusplus 53 | extern "C" { 54 | #endif 55 | 56 | /*----------------------------------------------------------------------------- 57 | * Opaque data types. 58 | * 59 | * Actual definitions are in thread_pool_internal.h, but these should only 60 | * be used by thread_pool.c itself. 61 | */ 62 | 63 | /* 64 | * An hts_tpool_process implements a queue of input jobs to process and a 65 | * queue of resulting output post-processing. 
Internally it consists of two 66 | * buffered queues, analogous to the pipes in a unix pipeline: 67 | * ...input | process | output... 68 | * 69 | * Both input and output queues have size limits to prevent either queue from 70 | * growing too large and serial numbers to ensure sequential consumption of 71 | * the output. 72 | * 73 | * The thread pool may have many heterogeneous tasks, each using its own 74 | * process mixed into the same thread pool. 75 | */ 76 | typedef struct hts_tpool_process hts_tpool_process; 77 | 78 | /* 79 | * The single pool structure itself. 80 | * 81 | * This knows nothing about the nature of the jobs or where their output is 82 | * going, but it maintains a list of process-queues associated with this pool 83 | * from which the jobs are taken. 84 | */ 85 | typedef struct hts_tpool hts_tpool; 86 | 87 | /* 88 | * An output, after job has executed. 89 | */ 90 | typedef struct hts_tpool_result hts_tpool_result; 91 | 92 | 93 | /*----------------------------------------------------------------------------- 94 | * Thread pool external functions 95 | */ 96 | 97 | 98 | /* 99 | * Creates a worker pool with n worker threads. 100 | * 101 | * Returns pool pointer on success; 102 | * NULL on failure 103 | * 104 | * The hts_tpool struct returned by a successful call should be freed 105 | * via hts_tpool_destroy() when it is no longer needed. 106 | */ 107 | HTSLIB_EXPORT 108 | hts_tpool *hts_tpool_init(int n); 109 | 110 | 111 | /* 112 | * Returns the number of requested threads for a pool. 113 | */ 114 | HTSLIB_EXPORT 115 | int hts_tpool_size(hts_tpool *p); 116 | 117 | 118 | /// Add an item to the work pool. 119 | /** 120 | * @param p Thread pool 121 | * @param q Process queue 122 | * @param func Function run by the thread pool 123 | * @param arg Data for use by func() 124 | * @return 0 on success 125 | * -1 on failure 126 | */ 127 | // FIXME: should this drop the hts_tpool*p argument? 
It's just q->p 128 | HTSLIB_EXPORT 129 | int hts_tpool_dispatch(hts_tpool *p, hts_tpool_process *q, 130 | void *(*func)(void *arg), void *arg); 131 | 132 | /// Add an item to the work pool, with nonblocking option. 133 | /** 134 | * @param p Thread pool 135 | * @param q Process queue 136 | * @param func Function run by the thread pool 137 | * @param arg Data for use by func() 138 | * @param nonblock Non-blocking flag (see description) 139 | * @return 0 on success 140 | * -1 on failure 141 | * 142 | * The @p nonblock parameter can take one of the following values: 143 | * 0 => block if input queue is full 144 | * +1 => don't block if input queue is full, but do not add task 145 | * -1 => add task regardless of whether queue is full (over-size) 146 | * 147 | * If @p nonblock is +1 and the queue is full, -1 will be returned and 148 | * `errno` is set to `EAGAIN`. 149 | */ 150 | HTSLIB_EXPORT 151 | int hts_tpool_dispatch2(hts_tpool *p, hts_tpool_process *q, 152 | void *(*func)(void *arg), void *arg, int nonblock); 153 | 154 | /// Add an item to the work pool, with nonblocking and cleanup callbacks. 155 | /** 156 | * @param p Thread pool 157 | * @param q Process queue 158 | * @param exec_func Function run by the thread pool 159 | * @param arg Data for use by func() 160 | * @param job_cleanup Callback to clean up when discarding jobs 161 | * @param result_cleanup Callback to clean up when discarding result data 162 | * @param nonblock Non-blocking flag (see description) 163 | * @return 0 on success 164 | * -1 on failure 165 | * 166 | * The @p nonblock parameter can take one of the following values: 167 | * 0 => block if input queue is full 168 | * +1 => don't block if input queue is full, but do not add task 169 | * -1 => add task regardless of whether queue is full (over-size) 170 | * 171 | * If @p nonblock is +1 and the queue is full, -1 will be returned and 172 | * `errno` is set to `EAGAIN`. 
173 | * 174 | * The job_cleanup() and result_cleanup() callbacks are used when discarding 175 | * data from a queue, for example when calling hts_tpool_process_reset() 176 | * or hts_tpool_process_destroy(). 177 | * 178 | * If not NULL, job_cleanup() will be called for each pending job with the 179 | * value of @p arg that was set for that job. This can be used to free 180 | * any data associated with @p arg, and also @p arg itself. 181 | * 182 | * Similarly, result_cleanup() can be used to free any results left by 183 | * jobs that had started before hts_tpool_process_reset() was called. 184 | * The argument passed to result_cleanup() is the pointer that would 185 | * have been returned by calling hts_tpool_result_data() on the result 186 | * when pulled from the queue. 187 | * 188 | * job_cleanup() and result_cleanup() are only called when discarding jobs. 189 | * For jobs that are processed normally, it is the responsibility of 190 | * exec_func() and / or consumers of any results to do any cleaning up 191 | * necessary. 192 | */ 193 | HTSLIB_EXPORT 194 | int hts_tpool_dispatch3(hts_tpool *p, hts_tpool_process *q, 195 | void *(*exec_func)(void *arg), void *arg, 196 | void (*job_cleanup)(void *arg), 197 | void (*result_cleanup)(void *data), 198 | int nonblock); 199 | 200 | /* 201 | * Wakes up a single thread stuck in dispatch and make it return with 202 | * errno EAGAIN. 203 | */ 204 | HTSLIB_EXPORT 205 | void hts_tpool_wake_dispatch(hts_tpool_process *q); 206 | 207 | /* 208 | * Flushes the process-queue, but doesn't exit. This simply drains the queue 209 | * and ensures all worker threads have finished their current tasks 210 | * associated with this process. 211 | * 212 | * NOT: This does not mean the worker threads are not executing jobs in 213 | * another process-queue. 
214 | * 215 | * Returns 0 on success; 216 | * -1 on failure 217 | */ 218 | HTSLIB_EXPORT 219 | int hts_tpool_process_flush(hts_tpool_process *q); 220 | 221 | /* 222 | * Resets a process to the initial state. 223 | * 224 | * This removes any queued up input jobs, disables any notification of 225 | * new results/output, flushes what is left and then discards any 226 | * queued output. Anything consumer stuck in a wait on results to 227 | * appear should stay stuck and will only wake up when new data is 228 | * pushed through the queue. 229 | * 230 | * Returns 0 on success; 231 | * -1 on failure 232 | */ 233 | HTSLIB_EXPORT 234 | int hts_tpool_process_reset(hts_tpool_process *q, int free_results); 235 | 236 | /* Returns the process queue size */ 237 | HTSLIB_EXPORT 238 | int hts_tpool_process_qsize(hts_tpool_process *q); 239 | 240 | 241 | /* 242 | * Destroys a thread pool. The threads are joined into the main 243 | * thread so they will finish their current work load. 244 | */ 245 | HTSLIB_EXPORT 246 | void hts_tpool_destroy(hts_tpool *p); 247 | 248 | /* 249 | * Destroys a thread pool without waiting on jobs to complete. 250 | * Use hts_tpool_kill(p) to quickly exit after a fatal error. 251 | */ 252 | HTSLIB_EXPORT 253 | void hts_tpool_kill(hts_tpool *p); 254 | 255 | /* 256 | * Pulls the next item off the process result queue. The caller should free 257 | * it (and any internals as appropriate) after use. This doesn't wait for a 258 | * result to be present. 259 | * 260 | * Results will be returned in strict order. 261 | * 262 | * Returns hts_tpool_result pointer if a result is ready. 263 | * NULL if not. 264 | */ 265 | HTSLIB_EXPORT 266 | hts_tpool_result *hts_tpool_next_result(hts_tpool_process *q); 267 | 268 | /* 269 | * Pulls the next item off the process result queue. The caller should free 270 | * it (and any internals as appropriate) after use. This will wait for 271 | * a result to be present if none are currently available. 
272 | * 273 | * Results will be returned in strict order. 274 | * 275 | * Returns hts_tpool_result pointer if a result is ready. 276 | * NULL on error or during shutdown. 277 | */ 278 | HTSLIB_EXPORT 279 | hts_tpool_result *hts_tpool_next_result_wait(hts_tpool_process *q); 280 | 281 | /* 282 | * Frees a result 'r' and if free_data is true also frees 283 | * the internal r->data result too. 284 | */ 285 | HTSLIB_EXPORT 286 | void hts_tpool_delete_result(hts_tpool_result *r, int free_data); 287 | 288 | /* 289 | * Returns the data portion of a hts_tpool_result, corresponding 290 | * to the actual "result" itself. 291 | */ 292 | HTSLIB_EXPORT 293 | void *hts_tpool_result_data(hts_tpool_result *r); 294 | 295 | /* 296 | * Initialises a thread process-queue. 297 | * 298 | * In_only, if true, indicates that the process generates does not need to 299 | * hold any output. Otherwise an output queue is used to store the results 300 | * of processing each input job. 301 | * 302 | * Results hts_tpool_process pointer on success; 303 | * NULL on failure 304 | * 305 | * The hts_tpool_process struct returned by a successful call should be freed 306 | * via hts_tpool_process_destroy() when it is no longer needed. 307 | */ 308 | HTSLIB_EXPORT 309 | hts_tpool_process *hts_tpool_process_init(hts_tpool *p, int qsize, int in_only); 310 | 311 | 312 | /* Deallocates memory for a thread process-queue. 313 | * Must be called before the thread pool is destroyed. 314 | */ 315 | HTSLIB_EXPORT 316 | void hts_tpool_process_destroy(hts_tpool_process *q); 317 | 318 | /* 319 | * Returns true if there are no items in the process results queue and 320 | * also none still pending. 321 | */ 322 | HTSLIB_EXPORT 323 | int hts_tpool_process_empty(hts_tpool_process *q); 324 | 325 | /* 326 | * Returns the number of completed jobs in the process results queue. 
327 | */ 328 | HTSLIB_EXPORT 329 | int hts_tpool_process_len(hts_tpool_process *q); 330 | 331 | /* 332 | * Returns the number of completed jobs in the process results queue plus the 333 | * number running and queued up to run. 334 | */ 335 | HTSLIB_EXPORT 336 | int hts_tpool_process_sz(hts_tpool_process *q); 337 | 338 | /* 339 | * Shuts down a process. 340 | * 341 | * This sets the shutdown flag and wakes any threads waiting on process 342 | * condition variables. 343 | */ 344 | HTSLIB_EXPORT 345 | void hts_tpool_process_shutdown(hts_tpool_process *q); 346 | 347 | /* 348 | * Returns whether this process queue has been shut down. 349 | * Return value of 1 signifies normal shutdown while >1 signifies it 350 | * was shut down due to an error condition. 351 | */ 352 | HTSLIB_EXPORT 353 | int hts_tpool_process_is_shutdown(hts_tpool_process *q); 354 | 355 | /* 356 | * Attach and detach a thread process-queue with / from the thread pool 357 | * scheduler. 358 | * 359 | * We need to attach after making a thread process, but may also wish 360 | * to temporarily detach if we wish to stop running jobs on a specific 361 | * process while permitting other processes to continue. 362 | */ 363 | HTSLIB_EXPORT 364 | void hts_tpool_process_attach(hts_tpool *p, hts_tpool_process *q); 365 | 366 | HTSLIB_EXPORT 367 | void hts_tpool_process_detach(hts_tpool *p, hts_tpool_process *q); 368 | 369 | /* 370 | * Increment and decrement the reference count in a process-queue. 371 | * If the queue is being driven from two external (non thread-pool) 372 | * threads, e.g. "main" and a "reader", this permits each end to 373 | * decrement its use of the process-queue independently.
374 | */ 375 | HTSLIB_EXPORT 376 | void hts_tpool_process_ref_incr(hts_tpool_process *q); 377 | 378 | HTSLIB_EXPORT 379 | void hts_tpool_process_ref_decr(hts_tpool_process *q); 380 | 381 | #ifdef __cplusplus 382 | } 383 | #endif 384 | 385 | #endif 386 | -------------------------------------------------------------------------------- /include/hfile.h: -------------------------------------------------------------------------------- 1 | /// @file htslib/hfile.h 2 | /// Buffered low-level input/output streams. 3 | /* 4 | Copyright (C) 2013-2022 Genome Research Ltd. 5 | 6 | Author: John Marshall 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy 9 | of this software and associated documentation files (the "Software"), to deal 10 | in the Software without restriction, including without limitation the rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | copies of the Software, and to permit persons to whom the Software is 13 | furnished to do so, subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be included in 16 | all copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 21 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | DEALINGS IN THE SOFTWARE. */ 25 | 26 | #ifndef HTSLIB_HFILE_H 27 | #define HTSLIB_HFILE_H 28 | 29 | #include <string.h> 30 | 31 | #include <sys/types.h> 32 | 33 | #include "hts_defs.h" 34 | 35 | // Ensure ssize_t exists within this header. All #includes must precede this, 36 | // and ssize_t must be undefined again at the end of this header.
37 | #if defined _MSC_VER && defined _INTPTR_T_DEFINED && !defined _SSIZE_T_DEFINED && !defined ssize_t 38 | #define HTSLIB_SSIZE_T 39 | #define ssize_t intptr_t 40 | #endif 41 | 42 | #ifdef __cplusplus 43 | extern "C" { 44 | #endif 45 | 46 | struct hFILE_backend; 47 | struct kstring_t; 48 | 49 | /// Low-level input/output stream handle 50 | /** The fields of this structure are declared here solely for the benefit 51 | of the hFILE-related inline functions. They may change in future releases. 52 | User code should not use them directly; you should imagine that hFILE is an 53 | opaque incomplete type. 54 | */ 55 | typedef struct hFILE { 56 | // @cond internal 57 | char *buffer, *begin, *end, *limit; /* begin: current position, advanced by the inline readers and writers; end: end of buffered input (hgetc/hread consume [begin, end)); limit: end of writable space (hputc/hputs/hwrite store at begin while begin < limit) */ 58 | const struct hFILE_backend *backend; 59 | off_t offset; /* stream offset corresponding to buffer[0]; htell() adds (begin - buffer) to it */ 60 | unsigned at_eof:1, mobile:1, readonly:1, preserve:1; 61 | int has_errno; /* value returned by herrno(); reset to 0 by hclearerr() */ 62 | // @endcond 63 | } hFILE; 64 | 65 | /// Open the named file or URL as a stream 66 | /** @return An hFILE pointer, or `NULL` (with _errno_ set) if an error occurred. 67 | 68 | The usual `fopen(3)` _mode_ letters are supported: one of 69 | `r` (read), `w` (write), `a` (append), optionally followed by any of 70 | `+` (update), `e` (close on `exec(2)`), `x` (create exclusively), 71 | `:` (indicates scheme-specific variable arguments follow). 72 | */ 73 | HTSLIB_EXPORT 74 | hFILE *hopen(const char *filename, const char *mode, ...) HTS_RESULT_USED; 75 | 76 | /// Associate a stream with an existing open file descriptor 77 | /** @return An hFILE pointer, or `NULL` (with _errno_ set) if an error occurred. 78 | 79 | Note that the file must be opened in binary mode, or else 80 | there will be problems on platforms that make a difference 81 | between text and binary mode. 82 | 83 | By default, the returned hFILE "takes ownership" of the file descriptor 84 | and _fd_ will be closed by hclose(). When _mode_ contains `S` (shared fd), 85 | hclose() will destroy the hFILE but not close the underlying _fd_.
86 | 87 | For socket descriptors (on Windows), _mode_ should contain `s`. 88 | */ 89 | HTSLIB_EXPORT 90 | hFILE *hdopen(int fd, const char *mode) HTS_RESULT_USED; 91 | 92 | /// Report whether the file name or URL denotes remote storage 93 | /** @return 0 if local, 1 if remote. 94 | 95 | "Remote" means involving e.g. explicit network access, with the implication 96 | that callers may wish to cache such files' contents locally. 97 | */ 98 | HTSLIB_EXPORT 99 | int hisremote(const char *filename) HTS_RESULT_USED; 100 | 101 | /// Append an extension or replace an existing extension 102 | /** @param buffer The kstring to be used to store the modified filename 103 | @param filename The filename to be (copied and) adjusted 104 | @param replace If non-zero, one extension (if any) is removed first 105 | @param extension The extension to be added (e.g. ".csi") 106 | @return The modified filename (i.e., `buffer->s`), or NULL on error. 107 | @since 1.10 108 | 109 | If _filename_ is a URL, alters extensions at the end of the `hier-part`, 110 | leaving any trailing `?query` or `#fragment` unchanged. 111 | */ 112 | HTSLIB_EXPORT 113 | char *haddextension(struct kstring_t *buffer, const char *filename, 114 | int replace, const char *extension) HTS_RESULT_USED; 115 | 116 | /// Flush (for output streams) and close the stream 117 | /** @return 0 if successful, or `EOF` (with _errno_ set) if an error occurred. 118 | */ 119 | HTSLIB_EXPORT 120 | int hclose(hFILE *fp) HTS_RESULT_USED; 121 | 122 | /// Close the stream, without flushing or propagating errors 123 | /** For use while cleaning up after an error only. Preserves _errno_. 124 | */ 125 | HTSLIB_EXPORT 126 | void hclose_abruptly(hFILE *fp); 127 | 128 | /// Return the stream's error indicator 129 | /** @return Non-zero (in fact, an _errno_ value) if an error has occurred. 130 | 131 | This would be called `herror()` and return true/false to parallel `ferror(3)`, 132 | but a networking-related `herror(3)` function already exists.
133 | */ 134 | static inline int herrno(hFILE *fp) 135 | { 136 | return fp->has_errno; 137 | } 138 | 139 | /// Clear the stream's error indicator 140 | static inline void hclearerr(hFILE *fp) 141 | { 142 | fp->has_errno = 0; 143 | } 144 | 145 | /// Reposition the read/write stream offset 146 | /** @return The resulting offset within the stream (as per `lseek(2)`), 147 | or negative if an error occurred. 148 | */ 149 | HTSLIB_EXPORT 150 | off_t hseek(hFILE *fp, off_t offset, int whence) HTS_RESULT_USED; 151 | 152 | /// Report the current stream offset 153 | /** @return The offset within the stream, starting from zero. 154 | */ 155 | static inline off_t htell(hFILE *fp) 156 | { 157 | return fp->offset + (fp->begin - fp->buffer); /* fp->offset plus the current position within the buffer */ 158 | } 159 | 160 | /// Read one character from the stream 161 | /** @return The character read, or `EOF` on end-of-file or error. 162 | */ 163 | static inline int hgetc(hFILE *fp) 164 | { 165 | HTSLIB_EXPORT 166 | extern int hgetc2(hFILE *); 167 | return (fp->end > fp->begin)? (unsigned char) *(fp->begin++) : hgetc2(fp); /* take a byte from the buffer when one is available (the unsigned char cast keeps it non-negative, so it cannot collide with EOF); otherwise defer to hgetc2() */ 168 | } 169 | 170 | /// Read from the stream until the delimiter, up to a maximum length 171 | /** @param buffer The buffer into which bytes will be written 172 | @param size The size of the buffer 173 | @param delim The delimiter (interpreted as an `unsigned char`) 174 | @param fp The file stream 175 | @return The number of bytes read, or negative on error. 176 | @since 1.4 177 | 178 | Bytes will be read into the buffer up to and including a delimiter, until 179 | EOF is reached, or _size-1_ bytes have been written, whichever comes first. 180 | The string will then be terminated with a NUL byte (`\0`).
181 | */ 182 | HTSLIB_EXPORT 183 | ssize_t hgetdelim(char *buffer, size_t size, int delim, hFILE *fp) 184 | HTS_RESULT_USED; 185 | 186 | /// Read a line from the stream, up to a maximum length 187 | /** @param buffer The buffer into which bytes will be written 188 | @param size The size of the buffer 189 | @param fp The file stream 190 | @return The number of bytes read, or negative on error. 191 | @since 1.4 192 | 193 | Specialization of hgetdelim() for a `\n` delimiter; buffer filling and termination are as described for hgetdelim(). 194 | */ 195 | static inline ssize_t HTS_RESULT_USED 196 | hgetln(char *buffer, size_t size, hFILE *fp) 197 | { 198 | return hgetdelim(buffer, size, '\n', fp); 199 | } 200 | 201 | /// Read a line from the stream, up to a maximum length (`fgets(3)`-style interface) 202 | /** @param buffer The buffer into which bytes will be written 203 | @param size The size of the buffer (must be > 1 to be useful) 204 | @param fp The file stream 205 | @return _buffer_ on success, or `NULL` if an error occurred. 206 | @since 1.4 207 | 208 | This function can be used as a replacement for `fgets(3)`, or together with 209 | kstring's `kgetline()` to read arbitrarily-long lines into a _kstring_t_. 210 | */ 211 | HTSLIB_EXPORT 212 | char *hgets(char *buffer, int size, hFILE *fp) HTS_RESULT_USED; 213 | 214 | /// Peek at characters to be read without removing them from buffers 215 | /** @param fp The file stream 216 | @param buffer The buffer to which the peeked bytes will be written 217 | @param nbytes The number of bytes to peek at; limited by the size of the 218 | internal buffer, which could be as small as 4K. 219 | @return The number of bytes peeked, which may be less than _nbytes_ 220 | if EOF is encountered; or negative, if there was an I/O error. 221 | 222 | The characters peeked at remain in the stream's internal buffer, and will be 223 | returned by later hread() etc. calls.
224 | */ 225 | HTSLIB_EXPORT 226 | ssize_t hpeek(hFILE *fp, void *buffer, size_t nbytes) HTS_RESULT_USED; 227 | 228 | /// Read a block of characters from the file 229 | /** @return The number of bytes read, or negative if an error occurred. 230 | 231 | The full _nbytes_ requested will be returned, except as limited by EOF 232 | or I/O errors. 233 | */ 234 | static inline ssize_t HTS_RESULT_USED 235 | hread(hFILE *fp, void *buffer, size_t nbytes) 236 | { 237 | HTSLIB_EXPORT 238 | extern ssize_t hread2(hFILE *, void *, size_t, size_t); 239 | 240 | size_t n = fp->end - fp->begin; /* bytes already buffered */ 241 | if (n > nbytes) n = nbytes; 242 | memcpy(buffer, fp->begin, n); 243 | fp->begin += n; /* consume the bytes just copied out */ 244 | return (n == nbytes || !fp->mobile)? (ssize_t) n : hread2(fp, buffer, nbytes, n); /* finished if the request was met from the buffer or the stream is not mobile (in which case only the n buffered bytes are returned); otherwise hread2() is called with n, the count already copied */ 245 | } 246 | 247 | /// Write a character to the stream 248 | /** @return The character written, or `EOF` if an error occurred. 249 | */ 250 | static inline int hputc(int c, hFILE *fp) 251 | { 252 | HTSLIB_EXPORT 253 | extern int hputc2(int, hFILE *); 254 | if (fp->begin < fp->limit) *(fp->begin++) = c; 255 | else c = hputc2(c, fp); /* no room left before limit: hand the character to hputc2() and return its result */ 256 | return c; 257 | } 258 | 259 | /// Write a string to the stream 260 | /** @return 0 if successful, or `EOF` if an error occurred. 261 | */ 262 | static inline int hputs(const char *text, hFILE *fp) 263 | { 264 | HTSLIB_EXPORT 265 | extern int hputs2(const char *, size_t, size_t, hFILE *); 266 | 267 | size_t nbytes = strlen(text), n = fp->limit - fp->begin; 268 | if (n > nbytes) n = nbytes; 269 | memcpy(fp->begin, text, n); 270 | fp->begin += n; 271 | return (n == nbytes)? 0 : hputs2(text, nbytes, n, fp); /* when the string did not fit, hputs2() receives the total length and n, the count already buffered */ 272 | } 273 | 274 | /// Write a block of characters to the file 275 | /** @return Either _nbytes_, or negative if an error occurred. 276 | 277 | In the absence of I/O errors, the full _nbytes_ will be written.
278 | */ 279 | static inline ssize_t HTS_RESULT_USED 280 | hwrite(hFILE *fp, const void *buffer, size_t nbytes) 281 | { 282 | HTSLIB_EXPORT 283 | extern ssize_t hwrite2(hFILE *, const void *, size_t, size_t); 284 | HTSLIB_EXPORT 285 | extern int hfile_set_blksize(hFILE *fp, size_t bufsiz); 286 | 287 | if (!fp->mobile) { /* non-mobile stream: when the write does not fit, request a block size of the current capacity plus nbytes */ 288 | size_t n = fp->limit - fp->begin; 289 | if (n < nbytes) { 290 | hfile_set_blksize(fp, fp->limit - fp->buffer + nbytes); /* NOTE(review): the return value is not checked here - confirm failure is handled elsewhere */ 291 | fp->end = fp->limit; 292 | } 293 | } 294 | 295 | size_t n = fp->limit - fp->begin; 296 | if (nbytes >= n && fp->begin == fp->buffer) { 297 | // Go straight to hwrite2 if the buffer is empty and the request 298 | // won't fit. 299 | return hwrite2(fp, buffer, nbytes, 0); 300 | } 301 | 302 | if (n > nbytes) n = nbytes; 303 | memcpy(fp->begin, buffer, n); 304 | fp->begin += n; 305 | return (n==nbytes)? (ssize_t) n : hwrite2(fp, buffer, nbytes, n); 306 | } 307 | 308 | /// For writing streams, flush buffered output to the underlying stream 309 | /** @return 0 if successful, or `EOF` if an error occurred. 310 | 311 | This includes low-level flushing such as via `fdatasync(2)`. 312 | */ 313 | HTSLIB_EXPORT 314 | int hflush(hFILE *fp) HTS_RESULT_USED; 315 | 316 | /// For hfile_mem: get the internal buffer and its size from a hfile 317 | /** @return buffer if successful, or NULL if an error occurred 318 | 319 | The buffer returned should not be freed as this will happen when the 320 | hFILE is closed. 321 | */ 322 | HTSLIB_EXPORT 323 | char *hfile_mem_get_buffer(hFILE *file, size_t *length); 324 | 325 | /// For hfile_mem: get the internal buffer and its size from a hfile. 326 | /** @return buffer if successful, or NULL if an error occurred 327 | 328 | This is similar to hfile_mem_get_buffer except that ownership of the 329 | buffer is granted to the caller, who now has responsibility for freeing 330 | it. From this point onwards, the hFILE should not be used for any 331 | purpose other than closing.
332 | */ 333 | HTSLIB_EXPORT 334 | char *hfile_mem_steal_buffer(hFILE *file, size_t *length); 335 | 336 | /// Fills out sc_list[] with the list of known URL schemes. 337 | /** 338 | * @param plugin [in] Restricts schemes to only those from 'plugin'. 339 | * @param sc_list [out] Filled out with the scheme names 340 | * @param nschemes [in/out] Size of sc_list (in) and number returned (out) 341 | * 342 | * Plugin may be passed in as NULL in which case all schemes are returned. 343 | * Use plugin "built-in" to list the built-in schemes. 344 | * The size of sc_list is determined by the input value of *nschemes. 345 | * This is updated to return the output size. It is up to the caller to 346 | * determine whether to call again with a larger number if this is too small. 347 | * 348 | * The return value represents the total number found matching plugin, which 349 | * may be larger than *nschemes if too small a value was specified. 350 | * 351 | * @return the number of schemes found on success. 352 | * -1 on failure 353 | */ 354 | HTSLIB_EXPORT 355 | int hfile_list_schemes(const char *plugin, const char *sc_list[], int *nschemes); 356 | 357 | /// Fills out plist[] with the list of known hFILE plugins. 358 | /** 359 | * @param plist [out] Filled out with the plugin names 360 | * @param nplugins [in/out] Size of plist (in) and number returned (out) 361 | * 362 | * The size of plist is determined by the input value of *nplugins. 363 | * This is updated to return the output size. It is up to the caller to 364 | * determine whether to call again with a larger number if this is too small. 365 | * 366 | * The return value represents the total number found, which may be 367 | * larger than *nplugins if too small a value was specified. 368 | * 369 | * @return the number of plugins found on success. 370 | * -1 on failure 371 | */ 372 | HTSLIB_EXPORT 373 | int hfile_list_plugins(const char *plist[], int *nplugins); 374 | 375 | /// Tests for the presence of a specific hFILE plugin.
376 | /** 377 | * @param name The name of the plugin to query. 378 | * 379 | * @return 1 if found, 0 otherwise. 380 | */ 381 | HTSLIB_EXPORT 382 | int hfile_has_plugin(const char *name); 383 | 384 | #ifdef __cplusplus 385 | } 386 | #endif 387 | 388 | #ifdef HTSLIB_SSIZE_T 389 | #undef HTSLIB_SSIZE_T 390 | #undef ssize_t 391 | #endif 392 | 393 | #endif 394 | --------------------------------------------------------------------------------