├── src ├── seqdiff.h ├── pullseq.h ├── file_read.h ├── pull_by_size.h ├── pull_by_re.h ├── pull_by_name.h ├── size_filter.h ├── search_header.h ├── output.h ├── hash.h ├── CMakeLists.txt ├── linked_list.h ├── seqdiff_results.h ├── bst.h ├── global.h ├── cmpseq.h ├── seqdiff_results.c ├── test_linked_list.c ├── search_header.c ├── hash.c ├── size_filter.c ├── pull_by_size.c ├── linked_list.c ├── pull_by_name.c ├── pull_by_re.c ├── file_read.c ├── output.c ├── cmpseq.c ├── bst.c ├── seqdiff.c ├── kseq.h ├── pullseq.c └── uthash.h ├── AUTHORS ├── NEWS ├── test ├── utest_d.fa ├── utest_c.fa ├── test.txt ├── pull_header.rb ├── test.fa ├── utest_b.fa └── utest_a.fa ├── CMakeLists.txt ├── .gitignore ├── cmake └── FindPCRE2.cmake ├── COPYING ├── ChangeLog ├── README └── INSTALL /src/seqdiff.h: -------------------------------------------------------------------------------- 1 | #ifndef SEQDIFF_H 2 | #define SEQDIFF_H 3 | 4 | 5 | #endif 6 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | pullseq/seqdiff was written by Brian C. 
Thomas (bct.x42@gmail.com) 2 | Copyright 2015 3 | -------------------------------------------------------------------------------- /src/pullseq.h: -------------------------------------------------------------------------------- 1 | #ifndef PULLSEQ_H 2 | #define PULLSEQ_H 3 | 4 | #define PULLSEQ_SORTMETHOD "UTHASH" 5 | 6 | #endif 7 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | 03/23/2023: convert pullseq to use cmake 2 | 02/27/2013: converted pullseq to use autotools for 3 | configuration/building/installation 4 | -------------------------------------------------------------------------------- /src/file_read.h: -------------------------------------------------------------------------------- 1 | #ifndef FILE_READ_H 2 | #define FILE_READ_H 3 | 4 | int getl(char **line, FILE *fp); 5 | char *parse_name(char *line); 6 | 7 | #endif 8 | -------------------------------------------------------------------------------- /test/utest_d.fa: -------------------------------------------------------------------------------- 1 | >test 2 | MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPS 3 | >testa 4 | MAFSADVLKERRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPS 5 | -------------------------------------------------------------------------------- /test/utest_c.fa: -------------------------------------------------------------------------------- 1 | >test 2 | MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPS 3 | >testa 4 | MAFSADVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPS 5 | -------------------------------------------------------------------------------- /src/pull_by_size.h: -------------------------------------------------------------------------------- 1 | #ifndef PULL_BY_SIZE_H 2 | #define PULL_BY_SIZE_H 3 | 4 | int pull_by_size(char *input_file, int min, int max, int length, int convert, int just_count); 5 | 6 | #endif 7 | 
-------------------------------------------------------------------------------- /src/pull_by_re.h: -------------------------------------------------------------------------------- 1 | #ifndef PULL_BY_RE_H 2 | #define PULL_BY_RE_H 3 | 4 | int pull_by_re(char *input_file, char *aStrRegex, int min, int max, int length, int exclude, int convert, int just_count); 5 | 6 | #endif 7 | -------------------------------------------------------------------------------- /src/pull_by_name.h: -------------------------------------------------------------------------------- 1 | #ifndef PULL_BY_NAME_H 2 | #define PULL_BY_NAME_H 3 | 4 | int pull_by_name(char *input_fasta, FILE *names_fp, int min, int max, int length, int exclude, int convert, int just_count); 5 | 6 | #endif 7 | -------------------------------------------------------------------------------- /src/size_filter.h: -------------------------------------------------------------------------------- 1 | #ifndef SIZE_FILTER_H 2 | #define SIZE_FILTER_H 3 | 4 | #include "global.h" 5 | 6 | int size_filter(kseq_t *seq, int is_fasta, int min, int max, int length, int convert, int just_count); 7 | 8 | #endif 9 | -------------------------------------------------------------------------------- /src/search_header.h: -------------------------------------------------------------------------------- 1 | #ifndef SEARCH_HEADER_H 2 | #define SEARCH_HEADER_H 3 | 4 | #define PCRE2_CODE_UNIT_WIDTH 8 5 | 6 | #include 7 | 8 | #define MAX_CAPTURE_COUNT 30 9 | 10 | int search_header(pcre2_code *re, char *str); 11 | 12 | #endif 13 | -------------------------------------------------------------------------------- /test/test.txt: -------------------------------------------------------------------------------- 1 | A 2 | AA 3 | BB 4 | BBB C 5 | DD E FF 6 | XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX:wqXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX:wqXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX:wqXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX:wq XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX:wq 
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX:wq 7 | -------------------------------------------------------------------------------- /src/output.h: -------------------------------------------------------------------------------- 1 | #ifndef OUTPUT_H 2 | #define OUTPUT_H 3 | 4 | #include 5 | #include "global.h" 6 | 7 | void print_fasta_seq(kseq_t *seq,int n); 8 | void print_fastq_seq(kseq_t *seq); 9 | void print_fasta(FILE *fp,char *name, char *comment, char *seq, size_t colwidth); 10 | #endif 11 | -------------------------------------------------------------------------------- /src/hash.h: -------------------------------------------------------------------------------- 1 | #ifndef HASH_H 2 | #define HASH_H 3 | 4 | #include "global.h" 5 | 6 | typedef struct lookup { 7 | char *name; 8 | UT_hash_handle hh; 9 | } lookup_t; 10 | 11 | 12 | void add_name(char *name); 13 | lookup_t *find_name(char *name); 14 | void delete_name(lookup_t *s); 15 | void delete_hash(void); 16 | void print_hash(void); 17 | int hash_key_count(void); 18 | 19 | #endif 20 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | message(STATUS "Building src/pullseq") 2 | add_executable(pullseq pullseq.c 3 | global.h 4 | hash.c 5 | output.c 6 | search_header.c 7 | size_filter.c 8 | file_read.c 9 | pull_by_name.c 10 | pull_by_re.c 11 | pull_by_size.c 12 | ) 13 | 14 | message(STATUS "Building src/seqdiff") 15 | add_executable(seqdiff seqdiff.c 16 | global.h 17 | cmpseq.c 18 | file_read.c 19 | hash.c 20 | output.c 21 | seqdiff.c 22 | seqdiff_results.c 23 | ) 24 | -------------------------------------------------------------------------------- /src/linked_list.h: -------------------------------------------------------------------------------- 1 | #ifndef LINKED_LIST_H 2 | #define LINKED_LIST_H 3 | typedef struct _node { 4 | char *word; 5 | struct _node *next; 6 | } node ; 7 | 8 | 
typedef struct _list_t { 9 | node *head; 10 | node *end; 11 | } list_t; 12 | 13 | void initialize_list(list_t *list); 14 | node * initnode(char *word); 15 | void add_to_list(list_t *list, node *n); 16 | node *search_list(list_t *list, char *word); 17 | void delete_list(list_t *list,node *n); 18 | #endif 19 | -------------------------------------------------------------------------------- /test/pull_header.rb: -------------------------------------------------------------------------------- 1 | require 'nubio' 2 | file = ARGV[0] 3 | n = ARGV[1].to_i 4 | range = ARGV[2].to_i 5 | hits = Hash.new 6 | $stderr.puts "file: #{file}; n: #{n}; range: #{range}" 7 | while (n > 0) 8 | r = rand(range) 9 | unless hits.has_key?(r) 10 | n -= 1 11 | hits[r] = 1 12 | end 13 | end 14 | 15 | count = 0 16 | NuBio::Parser::Fastq.new(file).each do |f| 17 | count += 1 18 | if hits.has_key?(count) 19 | temp = f.header.split 20 | puts temp[0] 21 | end 22 | end 23 | -------------------------------------------------------------------------------- /src/seqdiff_results.h: -------------------------------------------------------------------------------- 1 | #ifndef SEQDIFF_RESULTS_H 2 | #define SEQDIFF_RESULTS_H 3 | 4 | #include 5 | 6 | typedef struct _seqdiff_results_t { 7 | int first_file_total; 8 | int first_file_uniq; 9 | int second_file_total; 10 | int second_file_uniq; 11 | int common; 12 | char *first_file; 13 | char *second_file; 14 | FILE *a_output_fp; 15 | FILE *b_output_fp; 16 | FILE *c_output_fp; 17 | int use_header; 18 | int only_summarize; 19 | } seqdiff_results_t; 20 | 21 | seqdiff_results_t *seqdiff_results_init(void); 22 | void seqdiff_results_destroy(seqdiff_results_t *results); 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /src/bst.h: -------------------------------------------------------------------------------- 1 | #ifndef BST_H 2 | #define BST_H 3 | typedef struct _node { 4 | char *name; 5 | struct _node *left; 6 | 
struct _node *right; 7 | struct _node *parent; 8 | } node_t ; 9 | 10 | typedef struct _tree { 11 | node_t *root; 12 | } tree_t; 13 | 14 | node_t *initnode(char *word); 15 | int insertnode(tree_t *tree, char *name); 16 | node_t *searchtree(tree_t *tree, char *word); 17 | int deletenode(tree_t *tree, char *name); 18 | void deletetree(tree_t *tree); 19 | int compare(char *left, char *right); 20 | void print_inorder(node_t *node); 21 | void print_preorder(node_t *node); 22 | void print_postorder(node_t *node); 23 | #endif 24 | -------------------------------------------------------------------------------- /src/global.h: -------------------------------------------------------------------------------- 1 | #ifndef GLOBAL_H 2 | #define GLOBAL_H 3 | 4 | #define PULLSEQ_VERSION "1.0.2" 5 | 6 | #define _POSIX_C_SOURCE 200809L 7 | /* 8 | #ifdef DEBUG 9 | #define DEBUGP(x, args...) fprintf(stderr, " [%s(), %s:%u]\n" x, __FUNCTION__, __FILE__,__LINE__, ## args) 10 | #else 11 | #define DEBUGP(x, args...) 
12 | #endif 13 | */ 14 | 15 | #include "zlib.h" 16 | #include "kseq.h" 17 | #include "uthash.h" 18 | 19 | #define BUFFER_SIZE 65535 20 | 21 | __KS_TYPE(gzFile) 22 | __KS_BASIC(gzFile, BUFFER_SIZE) 23 | __KSEQ_TYPE(gzFile) 24 | __KSEQ_BASIC(static, gzFile) 25 | 26 | extern char const *progname; 27 | extern int QUALITY_SCORE; 28 | extern int verbose_flag; 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /src/cmpseq.h: -------------------------------------------------------------------------------- 1 | #ifndef CMPSEQ_H 2 | #define CMPSEQ_H 3 | 4 | #include "global.h" 5 | #include "seqdiff_results.h" 6 | 7 | typedef struct _sd_lookup { 8 | char *name; /* header_name */ 9 | char *comment; /* header_description */ 10 | char *seq; 11 | int count; 12 | int in_a; 13 | int in_b; 14 | UT_hash_handle hh; 15 | } sd_lookup_t; 16 | 17 | /* hash-related methods */ 18 | void sd_add_seq(kseq_t *seq, int file, int use_header); 19 | sd_lookup_t *sd_find_seq(char *str, int use_header); 20 | void sd_delete_seq(sd_lookup_t *s); 21 | void sd_delete_hash(void); 22 | void sd_print_hash(void); 23 | int sd_hash_key_count(void); 24 | 25 | void cmpseq(seqdiff_results_t *results); 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.7...3.26) 2 | 3 | if(${CMAKE_VERSION} VERSION_LESS 3.12) 4 | cmake_policy(VERSION ${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}) 5 | endif() 6 | 7 | project(Pullseq VERSION 1.0 8 | DESCRIPTION "Extract & Manipulate Sequence Files" 9 | LANGUAGES C) 10 | 11 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake) 12 | #include(cmake/FindPCRE2.cmake) 13 | 14 | add_subdirectory(src) 15 | 16 | find_package(PCRE2 REQUIRED) 17 | find_package(ZLIB REQUIRED) 18 | 19 | if(PCRE2_FOUND) 20 | 
target_include_directories(pullseq PUBLIC ${PCRE2_INCLUDE_DIRS}) 21 | target_link_libraries(pullseq ${PCRE2_LIBRARIES}) 22 | endif(PCRE2_FOUND) 23 | 24 | target_link_libraries(pullseq z) 25 | target_link_libraries(seqdiff z) 26 | 27 | -------------------------------------------------------------------------------- /src/seqdiff_results.c: -------------------------------------------------------------------------------- 1 | #include "seqdiff_results.h" 2 | #include 3 | #include 4 | 5 | seqdiff_results_t *seqdiff_results_init(void) { 6 | seqdiff_results_t *r; 7 | r = (seqdiff_results_t *)malloc(sizeof(seqdiff_results_t)); 8 | if (!r) { 9 | fprintf(stderr, "ERROR: could not allocate memory for seqdiff_results_t structure\n"); 10 | exit(1); 11 | } 12 | 13 | r->first_file_total = 0; 14 | r->first_file_uniq = 0; 15 | r->second_file_total = 0; 16 | r->second_file_uniq = 0; 17 | r->common = 0; 18 | r->first_file = NULL; 19 | r->second_file = NULL; 20 | r->a_output_fp = NULL; 21 | r->b_output_fp = NULL; 22 | r->c_output_fp = NULL; 23 | r->use_header = 0; 24 | r->only_summarize = 0; 25 | return r; 26 | } 27 | 28 | void seqdiff_results_destroy(seqdiff_results_t *results) { 29 | free(results); 30 | } 31 | 32 | -------------------------------------------------------------------------------- /src/test_linked_list.c: -------------------------------------------------------------------------------- 1 | #include "pullseq.h" 2 | 3 | int main(int argc, char *argv[]) { 4 | int i; 5 | node *n; 6 | list_t *list; 7 | char *words[3] = {"one","two","three"}; 8 | 9 | list = (list_t *) malloc(sizeof(list_t)); 10 | initialize_list(list); 11 | n = (node *) NULL; 12 | 13 | 14 | for(i=0; i<3; i++) { 15 | fprintf(stderr,"creating node for word %s\n",words[i]); 16 | n = initnode(words[i]); 17 | add_to_list(list,n); 18 | } 19 | 20 | n = list->head; 21 | while(n != NULL) { 22 | fprintf(stderr,"%p: %s\n",n,n->word); 23 | n = n->next; 24 | } 25 | 26 | n = search_list(list,"two"); 27 | 
fprintf(stderr,"found %s (%p)\n", n->word,n); 28 | n = search_list(list,"wo"); 29 | if (n == NULL) 30 | fprintf(stderr,"did not find \n"); 31 | 32 | delete_list(list,list->head); 33 | return EXIT_SUCCESS; 34 | } 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.o 3 | *.gch 4 | src/pullseq 5 | src/seqdiff 6 | src/test_hash.c 7 | test/t100000.fq 8 | test/t100000.txt 9 | test/t1000000.fq 10 | test/t1000000.txt 11 | test/t10000000.fq 12 | test/t10000000.txt 13 | test/big_test.fastq 14 | test/big_test.txt 15 | .deps 16 | Makefile 17 | valgrind_ex.sh 18 | real_test.sh 19 | Makefile.in 20 | aclocal.m4 21 | *.log 22 | stamp-h1 23 | config.* 24 | /src/config.* 25 | /autom4te.cache 26 | /compile 27 | /configure 28 | /depcomp 29 | /install-sh 30 | /missing 31 | test/RifleCSP2_O2Inj_2_contigs.fa 32 | test/RifleCSP2_O2Inj_2_feature_locations.txt 33 | test/all_contigs_01142015.fa 34 | test/all_contigs_01142015.fa.summary.txt 35 | test/bct_feature_locations_01142015.txt 36 | test/rpL6_bact_arch_euk_curated.fasta 37 | test/list 38 | test/listlist2 39 | test/test.txt.2 40 | test/test_gene.fna 41 | test/test_locations.txt 42 | test/test_locations_gene.txt 43 | test/test_locations_noname.txt 44 | test/test_noname.fa 45 | build/ 46 | -------------------------------------------------------------------------------- /cmake/FindPCRE2.cmake: -------------------------------------------------------------------------------- 1 | # - Find pcre 2 | # Find the native PCRE2 headers and libraries. 3 | # 4 | # PCRE2_INCLUDE_DIRS - where to find pcre.h, etc. 5 | # PCRE2_LIBRARIES - List of libraries when using pcre. 6 | # PCRE2_FOUND - True if pcre found. 7 | 8 | # Look for the header file. 9 | FIND_PATH(PCRE2_INCLUDE_DIR pcre2.h) 10 | 11 | # Look for the library. 
12 | FIND_LIBRARY(PCRE2_LIBRARY NAMES libpcre2.a pcre2-8) 13 | 14 | # Handle the QUIETLY and REQUIRED arguments and set PCRE_FOUND to TRUE if all listed variables are TRUE. 15 | INCLUDE(FindPackageHandleStandardArgs) 16 | FIND_PACKAGE_HANDLE_STANDARD_ARGS(PCRE2 DEFAULT_MSG PCRE2_LIBRARY PCRE2_INCLUDE_DIR) 17 | 18 | # Copy the results to the output variables. 19 | IF(PCRE2_FOUND) 20 | SET(PCRE2_LIBRARIES ${PCRE2_LIBRARY}) 21 | SET(PCRE2_INCLUDE_DIRS ${PCRE2_INCLUDE_DIR}) 22 | message(STATUS "${PCRE2_INCLUDE_DIRS}") 23 | message(STATUS "${PCRE2_LIBRARIES}") 24 | ELSE(PCRE2_FOUND) 25 | SET(PCRE_LIBRARIES) 26 | SET(PCRE_INCLUDE_DIRS) 27 | ENDIF(PCRE2_FOUND) 28 | 29 | MARK_AS_ADVANCED(PCRE2_INCLUDE_DIRS PCRE2_LIBRARIES) 30 | -------------------------------------------------------------------------------- /src/search_header.c: -------------------------------------------------------------------------------- 1 | #define PCRE2_CODE_UNIT_WIDTH 8 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "search_header.h" 8 | #include "global.h" 9 | 10 | 11 | /* re is a compiled pcre2 regex */ 12 | int search_header(pcre2_code *re, char *str) { 13 | int pcreExecRet; 14 | pcre2_match_data *match_data; 15 | 16 | if (str == NULL) { 17 | return 0; 18 | } 19 | match_data = pcre2_match_data_create_from_pattern(re, NULL); // init structure for result 20 | 21 | /* run the match */ 22 | pcreExecRet = pcre2_match(re, 23 | str, 24 | strlen(str), // length of header string 25 | 0, // Start looking at this point 26 | 0, // pcre exec OPTIONS 27 | match_data, // pcre2_match_data 28 | NULL); // default match context 29 | 30 | pcre2_match_data_free(match_data); /* Release memory used for the match */ 31 | 32 | if (pcreExecRet < 0) { 33 | if (pcreExecRet == PCRE2_ERROR_NOMATCH) 34 | return 0; 35 | else 36 | fprintf(stderr, "Problem with your regex (%d)\n", pcreExecRet); 37 | } else 38 | return 1; 39 | } 40 | -------------------------------------------------------------------------------- 
/COPYING: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2013 Brian C. Thomas . 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | */ 25 | -------------------------------------------------------------------------------- /test/test.fa: -------------------------------------------------------------------------------- 1 | >AA test1 2 | AAAAAAAAAAAAACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTA 3 | >BB test2 4 | CCCCCCCCCCCCCCGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTA 5 | AAAAAAAAAAAAACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTA 6 | >BB2 7 | ACCGTGCAGTCGACGACGTAATTAAAAAAAAACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTA 8 | >CC test3 9 | GGGGAGGGGGGGGCGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTA 10 | AAAAAAAAAAAAACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTA 11 | ACCGTGCAGTCGACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTA 12 | >DD test4 13 | TTTTTTTTTTTTTCGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTA 14 | ACCGTGCAGTCGACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTAACCGTGCAGTCGACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTAACCGTGCAGTCGACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTA 15 | >Eukaryota_Amoebozoa_Mycetozoa_Myxogastria_Myxogastromycetidae_Physariida_Physaraceae_Physarum_polycephalum 16 | ACCGTGCAGTCGACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTAACCGTGCAGTCGACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTAACCGTGCAGTCGACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTA 17 | >Eukaryota_Haptophyceae_Pavlovales_Pavlovaceae_Pavlova_lutheri 18 | ACCGTGCAGTCGACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTAACCGTGCAGTCGACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTAACCGTGCAGTCGACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTAACCGTGCAGTCGACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTAACCGTGCAGTCGACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTAACCGTGCAGTCGACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTA 19 | -------------------------------------------------------------------------------- /src/hash.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "global.h" 6 | #include "hash.h" 7 | 8 | lookup_t *lookup = NULL; 9 | 10 | void add_name(char *name) 11 | { 12 | lookup_t *s; 13 | s = (lookup_t *)malloc(sizeof(lookup_t)); 14 | if (s == NULL) { 15 | fprintf(stderr,"couldn't get memory for 
lookup_t\n"); 16 | exit(EXIT_FAILURE); 17 | } else { 18 | s->name = NULL; 19 | } 20 | s->name = (char *)malloc(sizeof(char *) * (strlen(name)+1)); 21 | if (s->name == NULL) { 22 | fprintf(stderr,"couldn't get memory for name string\n"); 23 | exit(EXIT_FAILURE); 24 | } 25 | strncpy(s->name,name,strlen(name)+1); 26 | 27 | HASH_ADD_KEYPTR( hh, lookup, s->name, strlen(s->name), s ); 28 | } 29 | 30 | lookup_t *find_name(char *name) 31 | { 32 | lookup_t *s; 33 | HASH_FIND_STR(lookup, name, s); 34 | if (s) 35 | return s; 36 | else 37 | return (lookup_t *)NULL; 38 | } 39 | 40 | void delete_name(lookup_t *s) 41 | { 42 | HASH_DEL(lookup, s); 43 | free(s->name); 44 | free(s); 45 | } 46 | 47 | void delete_hash() 48 | { 49 | lookup_t *current_name, *tmp; 50 | HASH_ITER(hh,lookup,current_name,tmp) { 51 | delete_name(current_name); 52 | } 53 | } 54 | 55 | void print_hash(void) 56 | { 57 | lookup_t *s; 58 | for(s=lookup;s!=NULL;s=s->hh.next) 59 | fprintf(stderr,"name %s\n",s->name); 60 | } 61 | 62 | int hash_key_count(void) 63 | { 64 | lookup_t *s; 65 | int count = 0; 66 | for(s=lookup;s!=NULL;s=s->hh.next) 67 | count++; 68 | return(count); 69 | } 70 | -------------------------------------------------------------------------------- /src/size_filter.c: -------------------------------------------------------------------------------- 1 | #include "global.h" 2 | #include "output.h" 3 | #include 4 | 5 | int size_filter(kseq_t *seq, int is_fasta, int min, int max, int length, int convert, int just_count) { 6 | int count=0; 7 | if (min > 0 && max > 0) { /* got a min and max */ 8 | if (seq->seq.l >= min && seq->seq.l <= max) { 9 | count++; 10 | if (!just_count) { 11 | if (convert) 12 | is_fasta ? print_fastq_seq(seq) : print_fasta_seq(seq,length); 13 | else 14 | is_fasta ? 
print_fasta_seq(seq,length) : print_fastq_seq(seq); 15 | } 16 | } 17 | } else if (min > 0 || max > 0) { /* either min or max is 0 */ 18 | if (min > 0 && seq->seq.l >= min) { 19 | count++; 20 | if (!just_count) { 21 | if (convert) 22 | is_fasta ? print_fastq_seq(seq) : print_fasta_seq(seq,length); 23 | else 24 | is_fasta ? print_fasta_seq(seq,length) : print_fastq_seq(seq); 25 | } 26 | } else if (max > 0 && seq->seq.l <= max) { 27 | count++; 28 | if (!just_count) { 29 | if (convert) 30 | is_fasta ? print_fastq_seq(seq) : print_fasta_seq(seq,length); 31 | else 32 | is_fasta ? print_fasta_seq(seq,length) : print_fastq_seq(seq); 33 | } 34 | } 35 | } else { 36 | /* neither min nor max was > 0, so we print this sequence */ 37 | count++; 38 | if (!just_count) { 39 | if (convert) 40 | is_fasta ? print_fastq_seq(seq) : print_fasta_seq(seq,length); 41 | else 42 | is_fasta ? print_fasta_seq(seq,length) : print_fastq_seq(seq); 43 | } 44 | } 45 | return count; 46 | } 47 | -------------------------------------------------------------------------------- /src/pull_by_size.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "pull_by_size.h" 8 | #include "file_read.h" 9 | #include "global.h" 10 | #include "size_filter.h" 11 | 12 | 13 | __KS_GETC(gzread, BUFFER_SIZE) 14 | __KS_GETUNTIL(gzread, BUFFER_SIZE) 15 | __KSEQ_READ(static) 16 | 17 | /* 18 | extern char const *progname; 19 | extern int verbose_flag; 20 | */ 21 | 22 | int pull_by_size(char *input_file, int min, int max,int length, int convert, int just_count) { 23 | gzFile fp; 24 | int count=0,l; 25 | int hit = 0; 26 | int excluded = 0; 27 | int is_fasta = 0; /* assume fastq */ 28 | kseq_t *seq; 29 | 30 | /* open fasta file */ 31 | fp = gzopen(input_file,"r"); 32 | if (!fp) { 33 | fprintf(stderr,"%s - Couldn't open fasta file %s\n",progname,input_file); 34 | exit(EXIT_FAILURE); 35 | } 36 | 37 | seq = kseq_init(fp); 
38 | 39 | /* determine file type */ 40 | l = kseq_read(seq); /* read the first sequence */ 41 | is_fasta = seq->qual.s == NULL ? 1 : 0; 42 | gzrewind(fp); 43 | kseq_rewind(seq); /* rewind to beginning for main loop */ 44 | 45 | if (verbose_flag) { 46 | if (is_fasta) 47 | fprintf(stderr, "Input is FASTA format\n"); 48 | else 49 | fprintf(stderr, "Input is FASTQ format\n"); 50 | } 51 | 52 | /* search through list and see if this header matches */ 53 | while((l = kseq_read(seq)) >= 0) { 54 | hit = size_filter(seq, is_fasta, min, max, length, convert, just_count); 55 | if (hit) 56 | count++; 57 | else 58 | excluded++; 59 | } 60 | kseq_destroy(seq); 61 | gzclose(fp); /* done reading file */ 62 | 63 | if (just_count) { 64 | fprintf(stdout, "Total output: %i\n", count); 65 | fprintf(stdout, "Total excluded: %i\n", excluded); 66 | } 67 | return count; 68 | } 69 | -------------------------------------------------------------------------------- /src/linked_list.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "linked_list.h" 6 | #include "global.h" 7 | 8 | /* initialize the list */ 9 | void initialize_list(list_t *list) 10 | { 11 | list->head = NULL; 12 | list->end = NULL; 13 | } 14 | 15 | /* create a new node and add the word */ 16 | /* return l to newnode */ 17 | node *initnode(char *word) 18 | { 19 | node *newnode; 20 | newnode = (node *)malloc(sizeof(node)); 21 | if (newnode == NULL) 22 | return (node *) NULL; /* OOM */ 23 | else { 24 | char *w; 25 | w = (char *)malloc(strlen(word)+1); 26 | if (!w) 27 | return NULL; 28 | newnode->word = w; 29 | strcpy(newnode->word, word); 30 | /*newnode->word = strndup(word,strlen(word)+1);*/ 31 | newnode->next = NULL; 32 | return newnode; 33 | } 34 | } 35 | 36 | 37 | /* adds a node to given list */ 38 | /* returns l to "current" node */ 39 | void add_to_list(list_t *list, node *n) 40 | { 41 | if (list->head == NULL) { 42 | list->head = n; /* 
first in list */ 43 | list->end = n; 44 | } else { 45 | list->end->next = n; 46 | list->end = n; 47 | } 48 | /*fprintf(stderr,"n is %p; head is %p; end is %p\n",n,list->head,list->end);*/ 49 | } 50 | 51 | /* find first node with this word */ 52 | /* returns node with word */ 53 | node *search_list(list_t *list, char *word) 54 | { 55 | node *l = list->head; 56 | while(l != NULL) { 57 | if (strcmp(word, l->word) == 0) 58 | return l; 59 | l = l->next; 60 | if( l == NULL ) 61 | break; 62 | } 63 | return NULL; 64 | } 65 | 66 | /* chuck the whole list */ 67 | void delete_list(list_t *list, node *n) 68 | { 69 | node *temp; 70 | 71 | if ( list->head == NULL ) return; /* dont try to delete an empty list */ 72 | 73 | if (n == list->head) { /* if we are deleting the entire list */ 74 | list->head = NULL; /* then reset head and end to signify empty */ 75 | list->end = NULL; /* list */ 76 | } else { 77 | temp = list->head; /* if its not the entire list, readjust end */ 78 | while( temp->next != n ) /* locate previous node to l */ 79 | temp = temp->next; 80 | list->end = temp; /* set end to node before l */ 81 | } 82 | 83 | while( n != NULL ) { /* whilst there are still nodes to delete */ 84 | temp = n->next; /* record address of next node */ 85 | free(n->word); /* free this node */ 86 | free(n); /* free this node */ 87 | n = temp; /* point to next node to be deleted */ 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/pull_by_name.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "global.h" 8 | #include "pull_by_name.h" 9 | #include "hash.h" 10 | #include "file_read.h" 11 | #include "size_filter.h" 12 | 13 | __KS_GETC(gzread, BUFFER_SIZE) 14 | __KS_GETUNTIL(gzread, BUFFER_SIZE) 15 | __KSEQ_READ(static) 16 | 17 | /* 18 | extern char const *progname; 19 | extern int verbose_flag; 20 | */ 21 | 22 | int 
pull_by_name(char *input_file, FILE *names_fp, int min, int max, int length, int exclude, int convert, int just_count) { 23 | gzFile fp; 24 | int i,l,capacity=80; 25 | int count=0,excluded=0; 26 | int is_fasta = -1; 27 | char *fasta_name; 28 | char *line; 29 | kseq_t *seq; 30 | 31 | /* get some space for the line */ 32 | line = malloc(sizeof(char) * capacity); /* get memory allocated */ 33 | if (!line) { 34 | fprintf(stderr, "%s - line malloc: %s\n",progname, strerror(errno)); 35 | exit(EXIT_FAILURE); 36 | } 37 | 38 | while((i = getl(&line, names_fp)) != -1) { 39 | fasta_name = parse_name(line); 40 | if (fasta_name) { 41 | add_name(fasta_name); /* add fasta_name to hash */ 42 | } 43 | } 44 | 45 | free(line); /* free up line */ 46 | 47 | if (verbose_flag) { 48 | fprintf(stderr,"\n"); 49 | fprintf(stderr,"done reading from input (%d entries)\n", hash_key_count()); 50 | } 51 | /*print_hash();*/ 52 | 53 | 54 | /* open fasta file */ 55 | fp = gzopen(input_file,"r"); 56 | if (!fp) { 57 | fprintf(stderr,"%s - Couldn't open fasta file %s\n",progname,input_file); 58 | exit(EXIT_FAILURE); 59 | } 60 | 61 | seq = kseq_init(fp); /* initialize kseq */ 62 | 63 | /* determine file type */ 64 | l = kseq_read(seq); /* read the first sequence */ 65 | is_fasta = seq->qual.s == NULL ? 
1 : 0; 66 | gzrewind(fp); /* rewind to beginning for main loop */ 67 | kseq_rewind(seq); 68 | 69 | if (verbose_flag) { 70 | if (is_fasta) 71 | fprintf(stderr, "Input is FASTA format\n"); 72 | else 73 | fprintf(stderr, "Input is FASTQ format\n"); 74 | } 75 | 76 | /* search through list and see if this header matches */ 77 | while((l = kseq_read(seq)) >= 0) { 78 | if (exclude == 0) { /* INCLUDE names from names file */ 79 | if (find_name(seq->name.s)) /* found name in list */ 80 | count += size_filter(seq, is_fasta, min, max, length, convert, just_count); 81 | else 82 | excluded++; 83 | } else { /* EXCLUDE names from names file */ 84 | if (find_name(seq->name.s)) /* found name in list */ 85 | excluded++; 86 | else 87 | count += size_filter(seq, is_fasta, min, max, length, convert, just_count); 88 | } 89 | } 90 | kseq_destroy(seq); 91 | gzclose(fp); /* done reading file */ 92 | 93 | delete_hash(); /* free the list nodes */ 94 | 95 | if (just_count) { 96 | fprintf(stdout, "Total output: %i\n", count); 97 | if (exclude) 98 | fprintf(stdout, "Total excluded: %i\n", excluded); 99 | } 100 | 101 | if (verbose_flag) { 102 | fprintf(stderr,"Processed %i entries\n",count); 103 | if (exclude) 104 | fprintf(stderr,"Excluded %i entries\n",excluded); 105 | } 106 | return count; 107 | } 108 | -------------------------------------------------------------------------------- /src/pull_by_re.c: -------------------------------------------------------------------------------- 1 | #define PCRE2_CODE_UNIT_WIDTH 8 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "pull_by_re.h" 11 | #include "file_read.h" 12 | #include "global.h" 13 | #include "size_filter.h" 14 | #include "search_header.h" 15 | 16 | 17 | __KS_GETC(gzread, BUFFER_SIZE) 18 | __KS_GETUNTIL(gzread, BUFFER_SIZE) 19 | __KSEQ_READ(static) 20 | 21 | /* 22 | extern char const *progname; 23 | extern int verbose_flag; 24 | */ 25 | 26 | int pull_by_re(char *input_file, char *aStrRegex, 
int min, int max, int length, int exclude, int convert, int just_count) { 27 | gzFile fp; 28 | int count=0,l; 29 | int excluded = 0; 30 | int is_fasta = 0; /* assume fastq */ 31 | kseq_t *seq; 32 | 33 | /* pcre2 variables */ 34 | pcre2_code *re; // the regex object 35 | PCRE2_SIZE erroroffset; 36 | int errornumber; 37 | 38 | /* open fasta file */ 39 | fp = gzopen(input_file,"r"); 40 | if (!fp) { 41 | fprintf(stderr,"%s - Couldn't open fasta file %s\n",progname,input_file); 42 | exit(EXIT_FAILURE); 43 | } 44 | 45 | seq = kseq_init(fp); 46 | 47 | /* determine file type */ 48 | l = kseq_read(seq); /* read the first sequence */ 49 | is_fasta = seq->qual.s == NULL ? 1 : 0; 50 | gzrewind(fp); 51 | kseq_rewind(seq); /* rewind to beginning for main loop */ 52 | 53 | if (verbose_flag) { 54 | if (is_fasta) 55 | fprintf(stderr, "Input is FASTA format\n"); 56 | else 57 | fprintf(stderr, "Input is FASTQ format\n"); 58 | } 59 | 60 | /* initialize the re */ 61 | re = pcre2_compile( 62 | aStrRegex, /* the pattern */ 63 | PCRE2_ZERO_TERMINATED, /* indicates pattern is zero-terminated */ 64 | 0, /* default options */ 65 | &errornumber, /* for error num */ 66 | &erroroffset, /* err offset */ 67 | NULL); /* default compile context */ 68 | 69 | if (re == NULL) { 70 | PCRE2_UCHAR buffer[256]; 71 | pcre2_get_error_message(errornumber, buffer, sizeof(buffer)); 72 | fprintf(stderr, "PCRE2 compilation failed at offset %d: %s\n", (int)erroroffset, buffer); 73 | exit(EXIT_FAILURE); 74 | } 75 | 76 | /* search through list and see if this header matches */ 77 | while((l = kseq_read(seq)) >= 0) { 78 | if (exclude) { 79 | if (search_header(re, seq->name.s) || search_header(re, seq->comment.s)) 80 | excluded++; 81 | else { 82 | /* regex doesn't match, so check size/print */ 83 | count += size_filter(seq, is_fasta, min, max, length, convert, just_count); 84 | } 85 | } else { 86 | if (search_header(re, seq->name.s) || search_header(re, seq->comment.s)) { 87 | /* regex matches so check size/print */ 
88 | count += size_filter(seq, is_fasta, min, max, length, convert, just_count); 89 | } else 90 | excluded++; 91 | } 92 | } /* end of seq traversal */ 93 | 94 | /* tear down re */ 95 | pcre2_code_free(re); /* free up the re */ 96 | 97 | kseq_destroy(seq); 98 | gzclose(fp); /* done reading file so close */ 99 | 100 | if (just_count) { 101 | fprintf(stdout, "Total output: %i\n", count); 102 | fprintf(stdout, "Total excluded: %i\n", excluded); 103 | } 104 | return count; 105 | } 106 | -------------------------------------------------------------------------------- /ChangeLog: -------------------------------------------------------------------------------- 1 | 2015-08-17 12:40:39 -0700 Brian C. Thomas 2 | 3 | * Merge branch 'master' of github.com:bcthomas/pullseq (HEAD -> master, origin/master) 4 | 5 | 2015-08-17 12:36:36 -0700 Brian C. Thomas 6 | 7 | * fixed fasta comment bug 8 | 9 | 2015-01-08 08:29:58 -0800 Brian C. Thomas 10 | 11 | * closed names_fp file pointer 12 | 13 | 2014-09-17 12:04:28 -0700 Brian C. Thomas 14 | 15 | * update uthash to 1.9.9.1 16 | 17 | 2014-08-15 16:57:44 -0700 Brian C. Thomas 18 | 19 | * update to 1.0.1 20 | 21 | 2014-08-15 16:56:54 -0700 Brian C. Thomas 22 | 23 | * fixed bug in search_header() 24 | 25 | 2014-08-13 10:15:05 -0700 Brian C. Thomas 26 | 27 | * updated README 28 | 29 | 2014-08-13 09:17:09 -0700 Brian C. Thomas 30 | 31 | * changed param name to "regex" and short to "-g" 32 | 33 | 2014-08-13 09:01:04 -0700 Brian C. Thomas 34 | 35 | * updated regex matching to be case-insensitive 36 | 37 | 2014-08-11 12:05:27 -0700 Brian C. Thomas 38 | 39 | * Regex searching and some refactoring 40 | 41 | 2014-07-16 09:18:12 -0700 Brian C. Thomas 42 | 43 | * allow '>' or '@' in the names files 44 | 45 | 2013-12-16 08:42:23 -0800 Brian C. Thomas 46 | 47 | * added ability to get names from STDIN 48 | 49 | 2013-12-13 07:21:01 -0800 Brian C. Thomas 50 | 51 | * fixed but in FASTQ header output 52 | 53 | 2013-10-25 11:59:14 -0700 Brian C. 
Thomas 54 | 55 | * typo 56 | 57 | 2013-10-25 11:57:50 -0700 Brian C. Thomas 58 | 59 | * updated README 60 | 61 | 2013-10-25 11:55:08 -0700 Brian C. Thomas 62 | 63 | * added sequence counting, version, help 64 | 65 | 2013-03-13 13:39:00 -0700 Brian C. Thomas 66 | 67 | * Merge branch 'seqdiff' 68 | 69 | 2013-03-13 13:37:59 -0700 Brian C. Thomas 70 | 71 | * updated docs 72 | 73 | 2013-03-13 13:26:11 -0700 Brian C. Thomas 74 | 75 | * finished seqdiff 76 | 77 | 2013-03-09 14:25:41 -0800 Brian C. Thomas 78 | 79 | * Completed cmpseq() 80 | 81 | 2013-02-28 09:44:44 -0800 Brian C. Thomas 82 | 83 | * updated license to be accurate 84 | 85 | 2013-02-28 09:44:44 -0800 Brian C. Thomas 86 | 87 | * updated license to be accurate 88 | 89 | 2013-02-27 15:10:50 -0800 Brian C. Thomas 90 | 91 | * updated AC_CONFIG_SRCDIR 92 | 93 | 2013-02-27 15:02:24 -0800 Brian C. Thomas 94 | 95 | * modernized for automake 1.13+ 96 | 97 | 2013-02-27 14:49:34 -0800 Brian C. Thomas 98 | 99 | * updated ChangeLog 100 | 101 | 2013-02-27 14:48:29 -0800 Brian C. Thomas 102 | 103 | * removed leftover junk 104 | 105 | 2013-02-27 14:46:20 -0800 Brian C. Thomas 106 | 107 | * convert to autotools! 108 | 109 | 2013-02-26 12:39:31 -0800 Brian C. Thomas 110 | 111 | * updated headers and readme 112 | 113 | 2012-11-01 08:23:16 -0700 Brian C. Thomas 114 | 115 | * more typos 116 | 117 | 2012-11-01 08:22:05 -0700 Brian C. Thomas 118 | 119 | * more readme touchups 120 | 121 | 2012-11-01 08:19:58 -0700 Brian C. Thomas 122 | 123 | * typo in readme 124 | 125 | 2012-11-01 08:17:05 -0700 Brian C. Thomas 126 | 127 | * updated readme 128 | 129 | 2012-11-01 07:59:00 -0700 Brian C. Thomas 130 | 131 | * fix rewind bug and clean up verbose messages 132 | 133 | 2012-10-29 13:06:33 -0700 Brian C. Thomas 134 | 135 | * updated uthash branch to current 136 | 137 | 2012-04-27 16:39:16 -0700 Brian C. Thomas 138 | 139 | * more test dir cleanup for github 140 | 141 | 2012-04-27 15:57:19 -0700 Brian C. 
Thomas 142 | 143 | * removed big testing files from repo 144 | 145 | 2012-04-27 14:28:18 -0700 Brian C. Thomas 146 | 147 | * convert to uthash for names file 148 | 149 | 2012-04-25 10:25:29 -0700 Brian C. Thomas 150 | 151 | * completed binary tree implementation 152 | 153 | 2012-04-18 19:52:08 -0700 Brian C. Thomas 154 | 155 | * restructured header files (origin/linklist, linklist) 156 | 157 | 2012-04-18 16:44:51 -0700 Brian C. Thomas 158 | 159 | * updated arg parsing 160 | 161 | 2012-04-18 15:32:53 -0700 Brian C. Thomas 162 | 163 | * see previous commit - forgot this file 164 | 165 | 2012-04-18 15:31:02 -0700 Brian C. Thomas 166 | 167 | * Completed linked-list version 168 | 169 | 2012-04-18 09:07:08 -0700 Brian C. Thomas 170 | 171 | * updated Makefile - added uthash.h 172 | 173 | 2012-04-18 09:04:06 -0700 Brian C. Thomas 174 | 175 | * initial commit 176 | 177 | -------------------------------------------------------------------------------- /src/file_read.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "file_read.h" 8 | #include "global.h" 9 | 10 | char *parse_name(char *line) 11 | { 12 | char *word; 13 | char *delims = " \t\n"; /* space, tab, newline */ 14 | word = strtok(line, delims); 15 | /* check if the name begins with '>' or '@' and drop it */ 16 | if ( *word == '>' || *word == '@' ) 17 | word++; /* move the word up one char */ 18 | return word; 19 | } 20 | 21 | #define BUFSIZE 80 22 | int getl(char **lineptr, FILE *fp) { 23 | int ch; 24 | ssize_t buf_pos = 0; 25 | ssize_t count = 2; /* Always buf_pos + 2 (see below). 
*/ 26 | size_t new_length = 0; 27 | size_t n = BUFSIZE; 28 | char *temp; 29 | 30 | if ((lineptr == NULL) || (fp == NULL)) { 31 | errno = EINVAL; 32 | return -1; 33 | } 34 | 35 | if (errno != 0) 36 | errno = 0; 37 | 38 | if (*lineptr == NULL) { 39 | *lineptr = malloc(n * sizeof(char)); 40 | 41 | if (*lineptr == NULL) { 42 | return -1; /* Out of memory. */ 43 | } 44 | } 45 | 46 | /* 47 | * There are buf_pos characters in the buffer. When we read another 48 | * character, we want to store it, and we also need enough 49 | * room for a nul string. So we need to realloc as soon as our capacity 50 | * becomes less than buf_pos + 2. 51 | * Hence the variable "count" which always equals buf_pos + 2. 52 | */ 53 | 54 | while ((ch = getc(fp)) != EOF) { 55 | if (errno != 0) 56 | return -1; 57 | 58 | if (count > n) { /* current chars read is going to blow our buffer - add more */ 59 | new_length = n * 2; /* double the current buffer size */ 60 | if (new_length <= n) { /* Overflow. */ 61 | errno = ENOMEM; 62 | /* We couldn't store the character, */ 63 | /* so put it back on the stream. */ 64 | ungetc(ch, fp); 65 | return -1; 66 | } 67 | temp = (char *)realloc(*lineptr, new_length * sizeof(char)); /* realloc to a temp */ 68 | if (temp == NULL) { 69 | ungetc(ch, fp); 70 | return -1; 71 | } 72 | n = new_length; /* set n to the new length we were able to get from system */ 73 | *lineptr = temp; /* set line to this new temp string */ 74 | } 75 | 76 | (*lineptr)[buf_pos++] = (char)ch; /* set this char in the string at buf_pos and THEN increment buf_pos */ 77 | 78 | if (ch == '\n') /* eol */ 79 | break; 80 | 81 | if (count == SSIZE_MAX) { /* SSIZE_MAX is 32767 - posix def */ 82 | /* We'll overflow ssize_t on the next round, since the return 83 | * type is SSIZE_T */ 84 | errno = ENOMEM; 85 | return -1; 86 | } 87 | count++; /* increment ch count */ 88 | } 89 | 90 | (*lineptr)[buf_pos] = '\0'; /* set last position to \0 */ 91 | 92 | if (buf_pos == 0) { /* nothing in the file? 
*/ 93 | buf_pos = -1; 94 | } 95 | return buf_pos; 96 | } 97 | 98 | int getlx(char **iline,FILE *fp) 99 | { 100 | char *line = *iline; 101 | char *newline = NULL; 102 | char *buf = NULL; 103 | char *eol = NULL; 104 | size_t capacity = sizeof(line); /* reasonable starting point for line length */ 105 | size_t remaining = capacity; 106 | size_t used = 0; 107 | 108 | buf = line; /* point buf -> line */ 109 | line[0] = '\0'; 110 | 111 | /* read file into buf */ 112 | while (fgets(buf, remaining, fp)) { 113 | eol = strchr(buf, '\n'); /* locate first occurrence of '\n' */ 114 | if (eol) { /* found a newline in the string */ 115 | *eol = '\0'; /* replace the newline with the null character */ 116 | break; 117 | } else { 118 | /* buffer was too small - enlarge it */ 119 | used = buf + remaining - line; 120 | 121 | newline = realloc(line, capacity * 2); 122 | if (!newline) { 123 | fprintf(stderr, "getl - realloc: %s\n", strerror(errno)); 124 | return -1; 125 | } else { 126 | line = newline; 127 | } 128 | 129 | buf = line + used - 1; 130 | capacity *= 2; 131 | remaining = capacity - used; 132 | } 133 | } 134 | 135 | if (errno) { 136 | fprintf(stderr, "getl - fgets: %s\n", strerror(errno)); 137 | } else if (line[0]) { 138 | char *eol = strchr(buf, '\n'); 139 | if (eol) 140 | *eol = '\0'; 141 | /*buf = line;*/ 142 | return strlen(line); 143 | } 144 | return -1; 145 | } 146 | -------------------------------------------------------------------------------- /src/output.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "global.h" 6 | 7 | /* 8 | extern char const *progname; 9 | extern int verbose_flag; 10 | */ 11 | 12 | int QUALITY_SCORE; 13 | 14 | void print_fastq_seq(kseq_t *seq) 15 | { 16 | char *qual_str = NULL; 17 | int i=0; 18 | int l=strlen(seq->seq.s) + 1; /* sequence length */ 19 | if (seq->qual.s == NULL) { /* just use a default value for the quality code */ 20 | qual_str = (char 
*)malloc(sizeof(char) * l); 21 | for (i=0;icomment.l == 0) 27 | printf("@%s\n%s\n+\n%s\n", seq->name.s, seq->seq.s, qual_str); 28 | else 29 | printf("@%s %s\n%s\n+\n%s\n", seq->name.s, seq->comment.s, seq->seq.s, qual_str); 30 | 31 | free(qual_str); 32 | } else { 33 | if (seq->comment.l == 0) 34 | printf("@%s\n%s\n+\n%s\n",seq->name.s,seq->seq.s,seq->qual.s); 35 | else 36 | printf("@%s %s\n%s\n+\n%s\n",seq->name.s,seq->comment.s,seq->seq.s,seq->qual.s); 37 | } 38 | } 39 | 40 | void print_fasta_seq(kseq_t *seq, int n) 41 | { 42 | int l = seq->seq.l; /* sequence length */ 43 | int x,i=0; 44 | char *seqbuf = NULL; 45 | seqbuf = (char *)malloc(sizeof(char) * (n + 1)); 46 | if (seqbuf == NULL) { 47 | fprintf(stderr,"print_seq: out of memory for seqbuf!\n"); 48 | exit(EXIT_FAILURE); 49 | } 50 | 51 | if (n <= 0) 52 | n = 50; 53 | 54 | if (l > n) { /* seqlength is > column length - split sequence */ 55 | if (seq->comment.s == NULL) { 56 | printf(">%s\n",seq->name.s); 57 | } else { 58 | if (seq->comment.l == 0) 59 | printf(">%s\n",seq->name.s); 60 | else 61 | printf(">%s %s\n",seq->name.s, seq->comment.s); 62 | } 63 | 64 | for (x=0; xseq.s[x]; 67 | i++; 68 | } else { /* i is >= column width, so print this line */ 69 | seqbuf[i] = '\0'; /* set last position in string to null */ 70 | printf("%s\n",seqbuf); /* print this line */ 71 | i = 0; /* reset i */ 72 | seqbuf[0] = '\0'; /* reset buffer */ 73 | seqbuf[i] = seq->seq.s[x]; /* set this buffer line to current sequence char */ 74 | i++; 75 | } 76 | } 77 | if (i 0) 80 | printf("%s\n",seqbuf); 81 | } else { /* seqlength < column length, so just print the full sequence */ 82 | if (seq->comment.l == 0) 83 | printf(">%s\n%s\n",seq->name.s,seq->seq.s); 84 | else 85 | printf(">%s %s\n%s\n",seq->name.s,seq->comment.s,seq->seq.s); 86 | } 87 | free(seqbuf); 88 | } 89 | 90 | void print_fasta(FILE *fp, char *name, char *comment, char *seq, size_t colwidth) 91 | { 92 | int l = strlen(seq); /* sequence length */ 93 | int x,i=0; 94 | char 
*seqbuf = NULL; 95 | seqbuf = (char *)malloc(sizeof(char) * (colwidth + 1)); 96 | if (seqbuf == NULL) { 97 | fprintf(stderr,"print_seq: out of memory for seqbuf!\n"); 98 | exit(EXIT_FAILURE); 99 | } 100 | 101 | if (l > colwidth) { /* seqlength is > column length - split sequence */ 102 | if (comment == NULL) 103 | fprintf(fp, ">%s\n",name); 104 | else 105 | fprintf(fp, ">%s %s\n",name,comment); 106 | 107 | for (x=0; x= column width, so print this line */ 112 | seqbuf[i] = '\0'; /* set last position in string to null */ 113 | fprintf(fp, "%s\n",seqbuf); /* print this line */ 114 | i = 0; /* reset i */ 115 | seqbuf[0] = '\0'; /* reset buffer */ 116 | seqbuf[i] = seq[x]; /* set this buffer line to current sequence char */ 117 | i++; 118 | } 119 | } 120 | if (i 0) 123 | fprintf(fp, "%s\n",seqbuf); 124 | } else { /* seqlength < column length, so just print the full sequence */ 125 | if (comment == NULL) 126 | fprintf(fp, ">%s\n%s\n",name,seq); 127 | else 128 | fprintf(fp, ">%s %s\n%s\n",name,comment,seq); 129 | } 130 | free(seqbuf); 131 | } 132 | -------------------------------------------------------------------------------- /test/utest_b.fa: -------------------------------------------------------------------------------- 1 | >test 2 | MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPS 3 | >testa 4 | MAFSADVLKERRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPS 5 | >UniRef90_Q6GZX4 Putative transcription factor 001R n=8 Tax=Ranavirus RepID=001R_FRG3G 6 | MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPS 7 | EKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLD 8 | AKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHL 9 | EKDLVKDFKALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDD 10 | SFRKIYTDLGWKFTPL 11 | >UniRef90_Q6GZX3 Uncharacterized protein 002L n=5 Tax=Ranavirus RepID=002L_FRG3G 12 | MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQTCASGFCTSQPLCAR 13 | IKKTQVCGLRYSSKGKDPLVSAEWDSRGAPYVRCTYDADLIDTQAQVDQFVSMFGESPSL 14 | 
AERYCMRGVKNTAGELVSRVSSDADPAGGWCRKWYSAHRGPDQDAALGSFCIKNPGAADC 15 | KCINRASDPVYQKVKTLHAYPDQCWYVPCAADVGELKMGTQRDTPTNCPTQVCQIVFNML 16 | DDGSVTMDDVKNTINCDFSKYVPPPPPPKPTPPTPPTPPTPPTPPTPPTPPTPRPVHNRK 17 | VMFFVAGAVLVAILISTVRW 18 | >UniRef90_Q197F8 Uncharacterized protein 002R n=1 Tax=Invertebrate iridescent virus 3 RepID=002R_IIV3 19 | MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWKMNREQALAERYPEL 20 | QTSEPSEDYSGPVESLELLPLEIKLDIMQYLSWEQISWCKHPWLWTRWYKDNVVRVSAIT 21 | FEDFQREYAFPEKIQEIHFTDTRAEEIKAILETTPNVTRLVIRRIDDMNYNTHGDLGLDD 22 | LEFLTHLMVEDACGFTDFWAPSLTHLTIKNLDMHPRWFGPVMDGIKSMQSTLKYLYIFET 23 | YGVNKPFVQWCTDNIETFYCTNSYRYENVPRPIYVWVLFQEDEWHGYRVEDNKFHRRYMY 24 | STILHKRDTDWVENNPLKTPAQVEMYKFLLRISQLNRDGTGYESDSDPENEHFDDESFSS 25 | GEEDSSDEDDPTWAPDSDDSDWETETEEEPSVAARILEKGKLTITNLMKSLGFKPKPKKI 26 | QSIDRYFCSLDSNYNSEDEDFEYDSDSEDDDSDSEDDC 27 | >UniRef90_Q197F7 Uncharacterized protein 003L n=1 Tax=Invertebrate iridescent virus 3 RepID=003L_IIV3 28 | MYQAINPCPQSWYGSPQLEREIVCKMSGAPHYPNYYPVHPNALGGAWFDTSLNARSLTTT 29 | PSLTTCTPPSLAACTPPTSLGMVDSPPHINPPRRIGTLCFDFGSAKSPQRCECVASDRPS 30 | TTSNTAPDTYRLLITNSKTRKNNYGTCRLEPLTYGI 31 | >UniRef90_Q6GZX2 Uncharacterized protein 3R n=8 Tax=Ranavirus RepID=003R_FRG3G 32 | MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVYQMSNILLTERRQVD 33 | RAMGGSDDDGVMVVALSPSDFKTVLGSALLAVERDMVHVVPKYLQTPGILHDMLVLLTPI 34 | FGEALSVDMSGATDVMVQQIATAGFVDVDPLHSSVSWKDNVSCPVALLAVSNAVRTMMGQ 35 | PCQVTLIIDVGTQNILRDLVNLPVEMSGDLQVMAYTKDPLGKVPAVGVSVFDSGSVQKGD 36 | AHSVGAPDGLVSFHTHPVSSAVELNYHAGWPSNVDMSSLLTMKNLMHVVVAEEGLWTMAR 37 | TLSMQRLTKVLTDAEKDVMRAAAFNLFLPLNELRVMGTKDSNNKSLKTYFEVFETFTIGA 38 | LMKHSGVTPTAFVDRRWLDNTIYHMGFIPWGRDMRFVVEYDLDGTNPFLNTVPTLMSVKR 39 | KAKIQEMFDNMVSRMVTS 40 | >UniRef90_Q6GZX1 Uncharacterized protein 004R n=8 Tax=Ranavirus RepID=004R_FRG3G 41 | MNAKYDTDQGVGRMLFLGTIGLAVVVGGLMAYGYYYDGKTPSSGTSFHTASPSFSSRYRY 42 | >UniRef90_Q197F5 Uncharacterized protein 005L n=1 Tax=Invertebrate iridescent virus 3 RepID=005L_IIV3 43 | 
MRYTVLIALQGALLLLLLIDDGQGQSPYPYPGMPCNSSRQCGLGTCVHSRCAHCSSDGTL 44 | CSPEDPTMVWPCCPESSCQLVVGLPSLVNHYNCLPNQCTDSSQCPGGFGCMTRRSKCELC 45 | KADGEACNSPYLDWRKDKECCSGYCHTEARGLEGVCIDPKKIFCTPKNPWQLAPYPPSYH 46 | QPTTLRPPTSLYDSWLMSGFLVKSTTAPSTQEEEDDY 47 | >UniRef90_Q6GZX0 Uncharacterized protein 005R n=4 Tax=Frog virus 3 RepID=005R_FRG3G 48 | MQNPLPEVMSPEHDKRTTTPMSKEANKFIRELDKKPGDLAVVSDFVKRNTGKRLPIGKRS 49 | NLYVRICDLSGTIYMGETFILESWEELYLPEPTKMEVLGTLESCCGIPPFPEWIVMVGED 50 | QCVYAYGDEEILLFAYSVKQLVEEGIQETGISYKYPDDISDVDEEVLQQDEEIQKIRKKT 51 | REFVDKDAQEFQDFLNSLDASLLS 52 | >UniRef90_Q91G88 Putative KilA-N domain-containing protein 006L n=1 Tax=Invertebrate iridescent virus 6 RepID=006L_IIV6 53 | MDSLNEVCYEQIKGTFYKGLFGDFPLIVDKKTGCFNATKLCVLGGKRFVDWNKTLRSKKL 54 | IQYYETRCDIKTESLLYEIKGDNNDEITKQITGTYLPKEFILDIASWISVEFYDKCNNII 55 | INYFVNEYKTMDKKTLQSKINEVEEKMQKLLNEKEEELQEKNDKIDELILFSKRMEEDRK 56 | KDREMMIKQEKMLRELGIHLEDVSSQNNELIEKVDEQVEQNAVLNFKIDNIQNKLEIAVE 57 | DRAPQPKQNLKRERFILLKRNDDYYPYYTIRAQDINARSALKRQKNLYNEVSVLLDLTCH 58 | PNSKTLYVRVKDELKQKGVVFNLCKVSISNSKINEEELIKAMETINDEKRDV 59 | >UniRef90_Q6GZW9 Uncharacterized protein 006R n=3 Tax=Frog virus 3 RepID=006R_FRG3G 60 | MYKMYFLKDQKFSLSGTIRINDKTQSEYGSVWCPGLSITGLHHDAIDHNMFEEMETEIIE 61 | YLGPWVQAEYRRIKG 62 | >UniRef90_Q6GZW8 Uncharacterized protein 007R n=2 Tax=Frog virus 3 RepID=007R_FRG3G 63 | MRSIKPLRCCNAHGRHVSQEYGRCTLLLFREKLFLQTGLVCNKQCNAPNNDGAESKHHGI 64 | HHGSRGALALRGAGVHLLASAALGPRVLAGLVPTGRSVQGSVGQCGRVAQIGRARDVAAR 65 | KQESYCEK 66 | >UniRef90_Q197F3 Uncharacterized protein 007R n=1 Tax=Invertebrate iridescent virus 3 RepID=007R_IIV3 67 | MEAKNITIDNTTYNFFKFYNINQPLTNLKYLNSERLCFSNAVMGKIVDDASTITITYHRV 68 | YFGISGPKPRQVADLGEYYDVNELLNYDTYTKTQEFAQKYNSLVKPTIDAKNWSGNELVL 69 | LVGNEWYCKTFGKAGSKNVFLYNMIPTIYRDEPQHQEQILKKFMFFNATKNVEQNPNFLD 70 | NVPEEYYHLLLPKSWVEKNLSDKYRKIMETEHKPLVFSCEPAFSFGLCRNTQDKNESYQL 71 | SLCLYEREKPRDAEIVWAAKYDELAAMVRDYLKKTPEFKKYRSFISCMKGLSWKNNEIGD 72 | KDGPKLYPKVIFNRKKGEFVTIFTKDDDVEPETIEDPRTILDRRCVVQAALRLESVFVHN 73 | 
KVAIQLRINDVLISEWKEASSKPQPLILRRHRFTKPSSSVAKSTSPSLRNSGSDESDLNQ 74 | SDSDKEDERVVPVPKTKRIVKTVKLPN 75 | >UniRef90_Q197F2 Uncharacterized protein 008L n=1 Tax=Invertebrate iridescent virus 3 RepID=008L_IIV3 76 | MSFKVYDPIAELIATQFPTSNPDLQIINNDVLVVSPHKITLPMGPQNAGDVTNKAYVDQA 77 | VMSAAVPVASSTTVGTIQMAGDLEGSSGTNPIIAANKITLNKLQKIGPKMVIGNPNSDWN 78 | NTQEIELDSSFRIVDNRLNAGIVPISSTDPNKSNTVIPAPQQNGLFYLDSSGRVWVWAEH 79 | YYKCITPSRYISKWMGVGDFQELTVGQSVMWDSGRPSIETVSTQGLEVEWISSTNFTLSS 80 | LYLIPIVVKVTICIPLLGQPDQMAKFVLYSVSSAQQPRTGIVLTTDSSRSSAPIVSEYIT 81 | VNWFEPKSYSVQLKEVNSDSGTTVTICSDKWLANPFLDCWITIEEVG 82 | >UniRef90_Q91G85 Uncharacterized protein 009R n=1 Tax=Invertebrate iridescent virus 6 RepID=009R_IIV6 83 | MIKLFCVLAAFISINSACQSSHQQREEFTVATYHSSSICTTYCYSNCVVASQHKGLNVES 84 | YTCDKPDPYGRETVCKCTLIKCHDI 85 | >UniRef90_UPI00029CD601 LOW QUALITY PROTEIN: hypothetical protein OPAG_08414, partial n=1 Tax=Rhodococcus opacus PD630 RepID=UPI00029CD601 86 | MSRATAGRIRSVDQLRPMPNPSEFTPLLVMNPRDRRSPHTVHIALGGVECFARNRSSHIM 87 | KLTTAHNVLDKLTTAHNVLGKLT 88 | -------------------------------------------------------------------------------- /src/cmpseq.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "global.h" 8 | #include "cmpseq.h" 9 | #include "output.h" 10 | #include "seqdiff_results.h" 11 | 12 | __KS_GETC(gzread, BUFFER_SIZE) 13 | __KS_GETUNTIL(gzread, BUFFER_SIZE) 14 | __KSEQ_READ(static) 15 | 16 | sd_lookup_t *sd_lookup = NULL; 17 | 18 | /** 19 | * cmpseq 20 | * 21 | * Compare two files of sequences and determine which sequences are 22 | * uniq or common to each file. 
23 | * 24 | **/ 25 | void cmpseq(seqdiff_results_t *results) { 26 | gzFile fp; 27 | int l; 28 | kseq_t *seq; 29 | sd_lookup_t *s,*temp; 30 | 31 | /* open first sequence file */ 32 | fp = gzopen(results->first_file,"r"); 33 | if (!fp) { 34 | fprintf(stderr,"%s - Couldn't open fasta file %s\n",progname,results->first_file); 35 | exit(EXIT_FAILURE); 36 | } 37 | 38 | seq = kseq_init(fp); /* initialize kseq */ 39 | 40 | while((l = kseq_read(seq)) >= 0) { 41 | results->first_file_total++; /* increment first_file_total */ 42 | sd_add_seq(seq,1,results->use_header); 43 | } 44 | 45 | kseq_destroy(seq); 46 | gzclose(fp); /* done reading file */ 47 | 48 | /* process second_file */ 49 | fp = gzopen(results->second_file,"r"); 50 | if (!fp) { 51 | fprintf(stderr,"%s - Couldn't open fasta file %s\n",progname,results->second_file); 52 | exit(EXIT_FAILURE); 53 | } 54 | 55 | seq = kseq_init(fp); /* initialize kseq */ 56 | 57 | while((l = kseq_read(seq)) >= 0) { 58 | results->second_file_total++; /* increment second_file_total */ 59 | sd_add_seq(seq,2,results->use_header); 60 | } 61 | 62 | kseq_destroy(seq); /* free kseq struct */ 63 | gzclose(fp); /* done reading file */ 64 | 65 | HASH_ITER(hh, sd_lookup, s, temp) { 66 | if (s->in_a == 1 && s->in_b == 0) { 67 | results->first_file_uniq++; 68 | if (!results->only_summarize) 69 | fprintf(stdout,"%s\t\n",s->name); 70 | /* printing out fasta or fastq????? 
*/ 71 | if (results->a_output_fp != NULL) 72 | print_fasta(results->a_output_fp, s->name, s->comment, s->seq, 50); 73 | } else if (s->in_a == 0 && s->in_b == 1) { 74 | results->second_file_uniq++; 75 | if (!results->only_summarize) 76 | fprintf(stdout,"\t%s\n",s->name); 77 | if (results->b_output_fp != NULL) 78 | print_fasta(results->b_output_fp, s->name, s->comment, s->seq, 50); 79 | } else if (s->in_a == 1 && s->in_b == 1) { 80 | results->common++; 81 | if (!results->only_summarize) 82 | fprintf(stdout,"%s\t%s\n",s->name,s->name); 83 | if (results->c_output_fp != NULL) { 84 | print_fasta(results->c_output_fp, s->name, s->comment, s->seq, 50); 85 | } 86 | } 87 | } 88 | 89 | sd_delete_hash(); /* free the hash nodes */ 90 | } 91 | 92 | /* hash functions */ 93 | /** 94 | * sd_add_seq 95 | * checks if the key is in the hash. If yes, just increment count; if 96 | * no, add the new entry to the hash. The key is the header name when use_header is non-zero, otherwise the sequence string. file (1 or 2) selects which of in_a / in_b is set on the entry. The strings stored in a new entry are heap copies; the process exits if an allocation fails. 97 | */ 98 | void sd_add_seq(kseq_t *seq, int file, int use_header) 99 | { 100 | sd_lookup_t *s; 101 | 102 | if (use_header) 103 | HASH_FIND_STR(sd_lookup,seq->name.s,s); 104 | else 105 | HASH_FIND_STR(sd_lookup,seq->seq.s,s); 106 | 107 | if (s==NULL) { /* key is not in hash */ 108 | s = (sd_lookup_t *)malloc(sizeof(sd_lookup_t)); 109 | if (s == NULL) { 110 | fprintf(stderr,"couldn't get memory for sd_lookup_t\n"); 111 | exit(EXIT_FAILURE); 112 | } else { 113 | /* initialize struct */ 114 | s->seq = NULL; 115 | s->name = NULL; 116 | s->count = 1; 117 | s->in_a = 0; 118 | s->in_b = 0; 119 | } 120 | /* string buffers are arrays of char, so size them with sizeof(char), not sizeof(char*) */ 121 | s->seq = (char *)malloc((sizeof(char) * (strlen(seq->seq.s)+1))); 122 | if (s->seq == NULL) { 123 | fprintf(stderr,"couldn't get memory for seq string\n"); 124 | exit(EXIT_FAILURE); 125 | } else 126 | strncpy(s->seq,seq->seq.s,strlen(seq->seq.s)+1); 127 | 128 | s->name = (char *)malloc((sizeof(char) * (strlen(seq->name.s)+1))); 129 | if (s->name == NULL) { 130 | fprintf(stderr,"couldn't get memory for name string\n"); 131 | exit(EXIT_FAILURE); 132 | } else 133 |
strncpy(s->name,seq->name.s,strlen(seq->name.s)+1); 134 | /* only copy a comment the current record actually has (comment.l > 0) */ 135 | if (seq->comment.s != NULL && seq->comment.l > 0) { 136 | s->comment = (char *)malloc((sizeof(char) * (strlen(seq->comment.s)+1))); 137 | if (s->comment == NULL) { 138 | fprintf(stderr,"couldn't get memory for comment string\n"); 139 | exit(EXIT_FAILURE); 140 | } else 141 | strncpy(s->comment,seq->comment.s,strlen(seq->comment.s)+1); 142 | } else 143 | s->comment = NULL; 144 | 145 | if (file == 1) 146 | s->in_a = 1; 147 | else if (file == 2) 148 | s->in_b = 1; 149 | if (use_header) 150 | HASH_ADD_KEYPTR( hh, sd_lookup, s->name, strlen(s->name), s ); 151 | else 152 | HASH_ADD_KEYPTR( hh, sd_lookup, s->seq, strlen(s->seq), s ); 153 | } else { 154 | s->count++; /* key is already in the hash, just incr count */ 155 | if (file == 1) 156 | s->in_a = 1; 157 | else if (file == 2) 158 | s->in_b = 1; 159 | } 160 | } 161 | /* sd_find_seq: look str up in sd_lookup; returns the entry or NULL. use_header is not used. */ 162 | sd_lookup_t *sd_find_seq(char *str, int use_header) 163 | { 164 | sd_lookup_t *s; 165 | HASH_FIND_STR(sd_lookup, str, s); 166 | if (s) 167 | return s; 168 | else 169 | return (sd_lookup_t *)NULL; 170 | } 171 | /* sd_delete_seq: remove s from sd_lookup and free the entry together with its seq, name and comment strings. */ 172 | void sd_delete_seq(sd_lookup_t *s) 173 | { 174 | HASH_DEL(sd_lookup, s); 175 | free(s->seq); 176 | free(s->name); free(s->comment); /* comment is NULL or a heap copy made by sd_add_seq(); free(NULL) is a no-op */ 177 | free(s); 178 | } 179 | /* sd_delete_hash: delete every entry in sd_lookup. */ 180 | void sd_delete_hash() 181 | { 182 | sd_lookup_t *current_seq, *tmp; 183 | HASH_ITER(hh,sd_lookup,current_seq,tmp) { 184 | sd_delete_seq(current_seq); 185 | } 186 | } 187 | /* sd_hash_key_count: number of entries in sd_lookup. */ 188 | int sd_hash_key_count(void) 189 | { 190 | sd_lookup_t *s; 191 | int count = 0; 192 | for(s=sd_lookup;s!=NULL;s=s->hh.next) 193 | count++; 194 | return(count); 195 | } 196 | /* sd_print_hash: write the name of every entry to stderr. */ 197 | void sd_print_hash(void) 198 | { 199 | sd_lookup_t *s; 200 | for(s=sd_lookup;s!=NULL;s=s->hh.next) 201 | fprintf(stderr,"name %s\n",s->name); 202 | } 203 | 204 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Summary: 2 | 3 | Software to extract sequence from a fasta or
fastq. Also filter 4 | sequences by a minimum length or maximum length. Fast, written in C, 5 | using the kseq.h library. 6 | 7 | 8 | Pullseq Summary: 9 | 10 | pullseq - extract sequences from a fasta/fastq file. This program is 11 | fast, and can be useful in a variety of situations. You can use it to 12 | extract sequences from one fasta/fastq file into a new file, given 13 | either a list of header ids to include or a regular expression 14 | pattern to match. Results can be included (default) or excluded, 15 | and they can additionally be filtered with minimum / maximum sequence 16 | lengths. 17 | 18 | Additionally, it can convert from fastq to fasta or vice versa and 19 | can change the length of the output sequence lines. 20 | 21 | NOTE: pullseq prints to standard out, so you need to use redirection 22 | (e.g. pullseq -i input.fasta -m 10 > output.fasta) to create output files. 23 | 24 | Synopsis: 25 | 26 | pullseq -i <input fasta/fastq file> -n <file of header id names>
27 | 28 | pullseq -i <input fasta/fastq file> -m <minimum sequence length> 29 | 30 | pullseq -i <input fasta/fastq file> -g <regular expression to match> 31 | 32 | pullseq -i <input fasta/fastq file> -m <minimum sequence length> -a <maximum sequence length> 33 | 34 | pullseq -i <input fasta/fastq file> -t 35 | 36 | cat <file of header id names> | pullseq -i <input fasta/fastq file> -N 37 | 38 | Options: 39 | -i, --input, Input fasta/fastq file (required) 40 | -n, --names, File of header id names to search for 41 | -N, --names_stdin, Use STDIN for header id names 42 | -g, --regex, Regular expression to match (PERL compatible; always case-insensitive) 43 | -m, --min, Minimum sequence length 44 | -a, --max, Maximum sequence length 45 | -l, --length, Sequence characters per line (default 50) 46 | -c, --convert, Convert input to fastq/fasta (e.g. if input is fastq, output will be fasta) 47 | -q, --quality, ASCII code to use for fasta->fastq quality conversions 48 | -e, --excluded, Exclude the header id names in the list (-n) 49 | -t, --count, Just count the possible output, but don't write it 50 | -h, --help, Display this help and exit 51 | -v, --verbose, Print extra details during the run 52 | --version, Output version information and exit 53 | 54 | =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- 55 | 56 | Seqdiff Summary: 57 | seqdiff - compare two fasta (or fastq) files to determine overlap of 58 | sequences. This overlap can be at the sequence level (are two 59 | sequences exactly the same in both files?) or at the header name 60 | level (do two sequences contain the same header name between the two 61 | files?). 62 | 63 | Synopsis: 64 | seqdiff -1 first_file.fa -2 second_file.fa 65 | 66 | Usage: 67 | seqdiff -1 <first sequence file> -2 <second sequence file> 68 | 69 | Options: 70 | -1, --first, First sequence file (required) 71 | -2, --second, Second sequence file (required) 72 | -a, --a_output, File name for uniques from first file 73 | -b, --b_output, File name for uniques from second file 74 | -c, --c_output, File name for common entries 75 | -d, --headers, Compare headers instead of sequences (default: false) 76 | -s, --summary, Just show summary stats?
(default: false) 77 | -h, --help, Display this help and exit 78 | -v, --verbose, Print extra details during the run 79 | --version, Output version information and exit 80 | 81 | REQUIREMENTS: 82 | Pullseq/Seqdiff require a C compiler and have been tested to work with 83 | either GCC or clang. They also require (and include) kseq.h (Heng 84 | Li) and uthash.h (http://troydhanson.github.com/uthash/). 85 | 86 | kseq.h also requires Zlib (so your linker should be able to handle 87 | the '-lz' option). You can obtain zlib from http://www.zlib.net/ 88 | or commonly from your OS package manager (e.g. apt-get zlib or 89 | emerge zlib). 90 | 91 | NEW INSTALL: 92 | Pullseq uses CMake, so you must have CMake installed on your system. 93 | 94 | git clone https://github.com/bcthomas/pullseq.git 95 | cd pullseq 96 | mkdir build 97 | cd build 98 | cmake .. 99 | make 100 | This will build binaries in build/src/ 101 | > build/src/pullseq 102 | > build/src/seqdiff 103 | 104 | 105 | 106 | OLD INSTALL: 107 | To install, do the following in a shell on your system... 108 | 109 | From Git: 110 | git clone https://github.com/bcthomas/pullseq.git # checkout the code using git 111 | cd pullseq 112 | ./bootstrap # get set up for config/build after cloning 113 | ./configure # configure the application based on your system 114 | make # will build the application 115 | make install # will install in /usr/local by default 116 | 117 | From a Release file (tar or zip): 118 | tar xvf pullseq_version.tar.gz 119 | cd pullseq_version 120 | ./autoconf # make sure configuration is set 121 | ./configure # configure the application based on your system 122 | make # will build the application 123 | make install # will install in /usr/local by default 124 | 125 | NOTE: If you have PCRE (perl-compatible regular expression library) 126 | installed in a non-standard location (e.g. on a mac using brew), the 127 | ./configure script will fail.
You'll need to update your CFLAGS and 128 | LDFLAGS env settings to define where your PCRE library files were 129 | installed. 130 | 131 | For example, on a mac with pcre installed by brew, you can do this: 132 | 133 | pcre-config --cflags 134 | -I/usr/local/Cellar/pcre/8.39/include 135 | 136 | Then you can just add this to a env CFLAGS variable and run the 137 | configure command, like so... 138 | 139 | export CFLAGS="-I/usr/local/Cellar/pcre/8.39/include" 140 | ./configure 141 | 142 | If your pcre library is installed somewhere else, you just update 143 | the CFLAGS env variable accordingly. 144 | -------------------------------------------------------------------------------- /src/bst.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "global.h" 6 | #include "bst.h" 7 | 8 | /* create a new node holding a heap copy of name, with parent/left/right set to NULL */ 9 | /* return pointer to new node, or NULL if either allocation fails */ 10 | node_t *initnode(char *name) 11 | { 12 | node_t *newnode; 13 | newnode = (node_t *)malloc(sizeof(node_t)); 14 | if (newnode == NULL) 15 | return (node_t *)NULL; /* OOM */ 16 | else { 17 | char *newname; 18 | newname = (char *)malloc(strlen(name)+1); /* alloc space for name */ 19 | if (!newname) { 20 | free(newnode); return (node_t *)NULL; } /* oom: release the node allocated above so it is not leaked */ 21 | newnode->name = newname; 22 | strcpy(newnode->name, name); 23 | 24 | newnode->parent = newnode->left = newnode->right = (node_t *)NULL; 25 | return newnode; 26 | } 27 | } 28 | 29 | /* simple alphabetic comparison: strcmp() result normalised to -1, 0 or 1 */ 30 | int compare(char *left, char *right) 31 | { 32 | int result = 0; 33 | result = strcmp(left,right); 34 | if (result == 0) 35 | return 0; /* equal */ 36 | else if (result > 0) 37 | return 1; /* gt */ 38 | else if (result < 0) 39 | return -1; /* lt */ 40 | return result; 41 | } 42 | 43 | /* find node with this name */ 44 | /* returns node or NULL if not found */ 45 | node_t *searchtree(tree_t *tree, char *name) 46 | { 47 | int result; 48 | node_t *node = tree->root; 49 | 50 | while(node !=
NULL) { 51 | result = compare(name,node->name); 52 | if (result == 0) { 53 | return node; 54 | } else if (result < 0) { 55 | node = node->left; 56 | } else if (result > 0) { 57 | node = node->right; 58 | } else 59 | break; 60 | } 61 | return (node_t *)NULL; 62 | } 63 | 64 | /* adds a node to tree */ 65 | /* returns 0 or 1 depending on success/fail */ 66 | int insertnode(tree_t *tree, char *name) 67 | { 68 | int result; 69 | node_t *node; 70 | 71 | if (tree->root == NULL) { 72 | tree->root = initnode(name); 73 | } else { 74 | node = tree->root; 75 | while(1) { 76 | result = compare(name,node->name); 77 | 78 | if (result == 0) 79 | return 0; /* return fail, since data is already in tree */ 80 | else if (result < 1) { /*left*/ 81 | if (node->left == NULL) 82 | break; 83 | else 84 | node = node->left; 85 | } else { /*right*/ 86 | if (node->right == NULL) 87 | break; 88 | else 89 | node = node->right; 90 | } 91 | } /* while(1) */ 92 | 93 | /* we've broken from the loop, so we have a NULL leaf*/ 94 | if (result < 1) { 95 | node->left = initnode(name); 96 | node->left->parent = node; 97 | } else { 98 | node->right = initnode(name); 99 | node->right->parent = node; 100 | } 101 | } 102 | return 1; 103 | } 104 | 105 | /* delete node from tree */ 106 | int deletenode(tree_t *tree, char *name) 107 | { 108 | if (tree->root != NULL) { 109 | node_t head = {NULL,NULL,NULL,NULL}; 110 | node_t *node = &head; 111 | node_t *t = NULL; 112 | int result; 113 | 114 | node->right = tree->root; /* point the head node at the tree top */ 115 | tree->root->parent = &head; /* also point the tree root's head */ 116 | 117 | /* walk the tree, looking for data to delete */ 118 | while(1) { 119 | result = compare(name,node->name); 120 | 121 | if (result == 0) /* found the node to delete */ 122 | break; 123 | else if (result < 1) { /*left*/ 124 | if (node->left == NULL) 125 | return 0; /* not found */ 126 | else 127 | node = node->left; 128 | } else { /*right*/ 129 | if (node->right == NULL) 130 | 
return 0; /* not found */ 131 | else 132 | node = node->right; 133 | } 134 | } /* while(1) */ 135 | 136 | 137 | /* if we found matching name, f is pointing to the matching 138 | * node */ 139 | if (node != NULL) { 140 | if (node->left != NULL && node->right != NULL) { /* two children */ 141 | t = node->right; 142 | while (t->left != NULL) { 143 | t = t->left; 144 | } 145 | t->parent = node->parent; 146 | if (node->parent->right == node) 147 | node->parent->right = t; 148 | else 149 | node->parent->left = t; 150 | free(node); 151 | } else if (node->left == NULL && node->right == NULL) { 152 | /* leaf */ 153 | free(node); 154 | } else if (node->left == NULL) { 155 | /* set right */ 156 | t = node->right; /* temp copy of right node */ 157 | node->right->parent = node->parent; 158 | node->parent->right = t; 159 | free(node); 160 | } else { 161 | /* set set left */ 162 | t = node->left; /* temp copy of left node */ 163 | node->left->parent = node->parent; 164 | node->parent->left = t; 165 | free(node); 166 | } 167 | } 168 | } 169 | /* tree->root is NULL, so just return */ 170 | return 1; 171 | } 172 | 173 | /* print from node down, inorder */ 174 | void print_inorder(node_t *node) 175 | { 176 | if (node == NULL) 177 | return; 178 | print_inorder(node->left); 179 | fprintf(stderr,"%s\n",node->name); 180 | print_inorder(node->right); 181 | return; 182 | } 183 | 184 | /* print from node down, preorder */ 185 | void print_preorder(node_t *node) 186 | { 187 | if (node == NULL) 188 | return; 189 | fprintf(stderr,"%s\n",node->name); 190 | print_preorder(node->left); 191 | print_preorder(node->right); 192 | return; 193 | } 194 | 195 | /* print from node down, postorder */ 196 | void print_postorder(node_t *node) 197 | { 198 | if (node == NULL) 199 | return; 200 | print_postorder(node->left); 201 | print_postorder(node->right); 202 | fprintf(stderr,"%s\n",node->name); 203 | return; 204 | } 205 | 206 | void deletetreenode(node_t *node) 207 | { 208 | while(1) { 209 | if (node == 
NULL) 210 | break; 211 | else if (node->left != NULL) { 212 | deletetreenode(node->left); 213 | node->left = NULL; 214 | } else if (node->right != NULL) { 215 | deletetreenode(node->right); 216 | node->right = NULL; 217 | } else { 218 | free(node->name); 219 | free(node); 220 | return; 221 | } 222 | } 223 | return; 224 | } 225 | 226 | /* chuck the whole list */ 227 | void deletetree(tree_t *tree) 228 | { 229 | if (tree->root == NULL) { 230 | return; 231 | } else { 232 | deletetreenode(tree->root); 233 | } 234 | free(tree); 235 | return; 236 | } 237 | -------------------------------------------------------------------------------- /test/utest_a.fa: -------------------------------------------------------------------------------- 1 | >test 2 | MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPS 3 | >testa 4 | MAFSADVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPS 5 | >UniRef90_Q6GZX4 Putative transcription factor 001R n=8 Tax=Ranavirus RepID=001R_FRG3G 6 | MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPS 7 | EKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLD 8 | AKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHL 9 | EKDLVKDFKALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDD 10 | SFRKIYTDLGWKFTPL 11 | >UniRef90_Q6GZX3 Uncharacterized protein 002L n=5 Tax=Ranavirus RepID=002L_FRG3G 12 | MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQTCASGFCTSQPLCAR 13 | IKKTQVCGLRYSSKGKDPLVSAEWDSRGAPYVRCTYDADLIDTQAQVDQFVSMFGESPSL 14 | AERYCMRGVKNTAGELVSRVSSDADPAGGWCRKWYSAHRGPDQDAALGSFCIKNPGAADC 15 | KCINRASDPVYQKVKTLHAYPDQCWYVPCAADVGELKMGTQRDTPTNCPTQVCQIVFNML 16 | DDGSVTMDDVKNTINCDFSKYVPPPPPPKPTPPTPPTPPTPPTPPTPPTPPTPRPVHNRK 17 | VMFFVAGAVLVAILISTVRW 18 | >UniRef90_Q197F8 Uncharacterized protein 002R n=1 Tax=Invertebrate iridescent virus 3 RepID=002R_IIV3 19 | MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWKMNREQALAERYPEL 20 | QTSEPSEDYSGPVESLELLPLEIKLDIMQYLSWEQISWCKHPWLWTRWYKDNVVRVSAIT 21 | 
FEDFQREYAFPEKIQEIHFTDTRAEEIKAILETTPNVTRLVIRRIDDMNYNTHGDLGLDD 22 | LEFLTHLMVEDACGFTDFWAPSLTHLTIKNLDMHPRWFGPVMDGIKSMQSTLKYLYIFET 23 | YGVNKPFVQWCTDNIETFYCTNSYRYENVPRPIYVWVLFQEDEWHGYRVEDNKFHRRYMY 24 | STILHKRDTDWVENNPLKTPAQVEMYKFLLRISQLNRDGTGYESDSDPENEHFDDESFSS 25 | GEEDSSDEDDPTWAPDSDDSDWETETEEEPSVAARILEKGKLTITNLMKSLGFKPKPKKI 26 | QSIDRYFCSLDSNYNSEDEDFEYDSDSEDDDSDSEDDC 27 | >UniRef90_Q197F7 Uncharacterized protein 003L n=1 Tax=Invertebrate iridescent virus 3 RepID=003L_IIV3 28 | MYQAINPCPQSWYGSPQLEREIVCKMSGAPHYPNYYPVHPNALGGAWFDTSLNARSLTTT 29 | PSLTTCTPPSLAACTPPTSLGMVDSPPHINPPRRIGTLCFDFGSAKSPQRCECVASDRPS 30 | TTSNTAPDTYRLLITNSKTRKNNYGTCRLEPLTYGI 31 | >UniRef90_Q6GZX2 Uncharacterized protein 3R n=8 Tax=Ranavirus RepID=003R_FRG3G 32 | MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVYQMSNILLTERRQVD 33 | RAMGGSDDDGVMVVALSPSDFKTVLGSALLAVERDMVHVVPKYLQTPGILHDMLVLLTPI 34 | FGEALSVDMSGATDVMVQQIATAGFVDVDPLHSSVSWKDNVSCPVALLAVSNAVRTMMGQ 35 | PCQVTLIIDVGTQNILRDLVNLPVEMSGDLQVMAYTKDPLGKVPAVGVSVFDSGSVQKGD 36 | AHSVGAPDGLVSFHTHPVSSAVELNYHAGWPSNVDMSSLLTMKNLMHVVVAEEGLWTMAR 37 | TLSMQRLTKVLTDAEKDVMRAAAFNLFLPLNELRVMGTKDSNNKSLKTYFEVFETFTIGA 38 | LMKHSGVTPTAFVDRRWLDNTIYHMGFIPWGRDMRFVVEYDLDGTNPFLNTVPTLMSVKR 39 | KAKIQEMFDNMVSRMVTS 40 | >UniRef90_Q6GZX1 Uncharacterized protein 004R n=8 Tax=Ranavirus RepID=004R_FRG3G 41 | MNAKYDTDQGVGRMLFLGTIGLAVVVGGLMAYGYYYDGKTPSSGTSFHTASPSFSSRYRY 42 | >UniRef90_Q197F5 Uncharacterized protein 005L n=1 Tax=Invertebrate iridescent virus 3 RepID=005L_IIV3 43 | MRYTVLIALQGALLLLLLIDDGQGQSPYPYPGMPCNSSRQCGLGTCVHSRCAHCSSDGTL 44 | CSPEDPTMVWPCCPESSCQLVVGLPSLVNHYNCLPNQCTDSSQCPGGFGCMTRRSKCELC 45 | KADGEACNSPYLDWRKDKECCSGYCHTEARGLEGVCIDPKKIFCTPKNPWQLAPYPPSYH 46 | QPTTLRPPTSLYDSWLMSGFLVKSTTAPSTQEEEDDY 47 | >UniRef90_Q6GZX0 Uncharacterized protein 005R n=4 Tax=Frog virus 3 RepID=005R_FRG3G 48 | MQNPLPEVMSPEHDKRTTTPMSKEANKFIRELDKKPGDLAVVSDFVKRNTGKRLPIGKRS 49 | NLYVRICDLSGTIYMGETFILESWEELYLPEPTKMEVLGTLESCCGIPPFPEWIVMVGED 50 | 
QCVYAYGDEEILLFAYSVKQLVEEGIQETGISYKYPDDISDVDEEVLQQDEEIQKIRKKT 51 | REFVDKDAQEFQDFLNSLDASLLS 52 | >UniRef90_Q91G88 Putative KilA-N domain-containing protein 006L n=1 Tax=Invertebrate iridescent virus 6 RepID=006L_IIV6 53 | MDSLNEVCYEQIKGTFYKGLFGDFPLIVDKKTGCFNATKLCVLGGKRFVDWNKTLRSKKL 54 | IQYYETRCDIKTESLLYEIKGDNNDEITKQITGTYLPKEFILDIASWISVEFYDKCNNII 55 | INYFVNEYKTMDKKTLQSKINEVEEKMQKLLNEKEEELQEKNDKIDELILFSKRMEEDRK 56 | KDREMMIKQEKMLRELGIHLEDVSSQNNELIEKVDEQVEQNAVLNFKIDNIQNKLEIAVE 57 | DRAPQPKQNLKRERFILLKRNDDYYPYYTIRAQDINARSALKRQKNLYNEVSVLLDLTCH 58 | PNSKTLYVRVKDELKQKGVVFNLCKVSISNSKINEEELIKAMETINDEKRDV 59 | >UniRef90_Q6GZW9 Uncharacterized protein 006R n=3 Tax=Frog virus 3 RepID=006R_FRG3G 60 | MYKMYFLKDQKFSLSGTIRINDKTQSEYGSVWCPGLSITGLHHDAIDHNMFEEMETEIIE 61 | YLGPWVQAEYRRIKG 62 | >UniRef90_Q6GZW8 Uncharacterized protein 007R n=2 Tax=Frog virus 3 RepID=007R_FRG3G 63 | MRSIKPLRCCNAHGRHVSQEYGRCTLLLFREKLFLQTGLVCNKQCNAPNNDGAESKHHGI 64 | HHGSRGALALRGAGVHLLASAALGPRVLAGLVPTGRSVQGSVGQCGRVAQIGRARDVAAR 65 | KQESYCEK 66 | >UniRef90_Q197F3 Uncharacterized protein 007R n=1 Tax=Invertebrate iridescent virus 3 RepID=007R_IIV3 67 | MEAKNITIDNTTYNFFKFYNINQPLTNLKYLNSERLCFSNAVMGKIVDDASTITITYHRV 68 | YFGISGPKPRQVADLGEYYDVNELLNYDTYTKTQEFAQKYNSLVKPTIDAKNWSGNELVL 69 | LVGNEWYCKTFGKAGSKNVFLYNMIPTIYRDEPQHQEQILKKFMFFNATKNVEQNPNFLD 70 | NVPEEYYHLLLPKSWVEKNLSDKYRKIMETEHKPLVFSCEPAFSFGLCRNTQDKNESYQL 71 | SLCLYEREKPRDAEIVWAAKYDELAAMVRDYLKKTPEFKKYRSFISCMKGLSWKNNEIGD 72 | KDGPKLYPKVIFNRKKGEFVTIFTKDDDVEPETIEDPRTILDRRCVVQAALRLESVFVHN 73 | KVAIQLRINDVLISEWKEASSKPQPLILRRHRFTKPSSSVAKSTSPSLRNSGSDESDLNQ 74 | SDSDKEDERVVPVPKTKRIVKTVKLPN 75 | >UniRef90_Q197F2 Uncharacterized protein 008L n=1 Tax=Invertebrate iridescent virus 3 RepID=008L_IIV3 76 | MSFKVYDPIAELIATQFPTSNPDLQIINNDVLVVSPHKITLPMGPQNAGDVTNKAYVDQA 77 | VMSAAVPVASSTTVGTIQMAGDLEGSSGTNPIIAANKITLNKLQKIGPKMVIGNPNSDWN 78 | NTQEIELDSSFRIVDNRLNAGIVPISSTDPNKSNTVIPAPQQNGLFYLDSSGRVWVWAEH 79 | YYKCITPSRYISKWMGVGDFQELTVGQSVMWDSGRPSIETVSTQGLEVEWISSTNFTLSS 80 | 
LYLIPIVVKVTICIPLLGQPDQMAKFVLYSVSSAQQPRTGIVLTTDSSRSSAPIVSEYIT 81 | VNWFEPKSYSVQLKEVNSDSGTTVTICSDKWLANPFLDCWITIEEVG 82 | >UniRef90_Q6GZW6 Putative helicase 009L n=9 Tax=Ranavirus RepID=009L_FRG3G 83 | MDTSPYDFLKLYPWLSRGEADKGTLLDAFPGETFEQSLASDVAMRRAVQDDPAFGHQKLV 84 | ETFLSEDTPYRELLLFHAPGTGKTCTVVSVAERAKEKGLTRGCIVLARGAALLRNFLHEL 85 | VFNCGTGGRYIPEGYADMGDQERTRKMRKAVSSYYQFRTYETFAKSVATMSAEAIRARYD 86 | RFVIVMDEVHHLRSVQAEGVNTYSAISRFLRTVRGCVKMLLTGTPMTNEPGELADVLNLI 87 | LPQDKTIRPEDGIFSNSGDLLKPDELAERVRGRVSYLKAARPDAGLTFAGEVLGGTGMTH 88 | LRLVRLEMSAFQSDAYASAWDQDAGDRNIFSNSRQCSLAVMPDRRWGSAAEARNPSQVRR 89 | MAGQNLAEYSVKYDYLVRVASSSPKTFAYCEYVNGSGLSLLSDILLANGWRRATGRETTP 90 | GKRFALLTASQKNIHKIVQRFNHEDNVDGAYISLLLGSRVVAEGLTFKEVRHTVILTPHW 91 | NYTETAQAIARSWRAGSHDRLKARGEAVAVTVHRLVAVPRGRDTPRSIDSDMYAVSEVKD 92 | KRIKAVERILMTSAADCSLLRSRNLYPSEFDGSRECEYGRCAYRCSNVSVEPGPLPALLG 93 | ASAAEAVAQVRLDGGGDPAIMKVDMSTLWAEVTAGRRYVNRWGDGAVLRAEGGRLELSAP 94 | YGSSEEGRWGDFYKTRNLCYAKMDQDHLRADDLRDSLPQEVEELLTVSPVETIGETASAM 95 | PQEVATAILMACVQARADGKTLNVVRRDALLDFYKGFYAMGPSGWTVWLHARGANAKVYD 96 | GRRWNPADEDTLEFLAARSAKFTDTRIGYYGLYNPNLKDFCIRDVTQGKRDKVDLRKLTV 97 | GRRCVDWDQRTLVHIVARLMKIDGRRDFMPHATLREMRELAEQDPLHEPSDLTSKEACRR 98 | FLFWTQKGDNKFRRQDICKAMEKWFIENDLMEDNFDCGHQHKRRGKFA 99 | >UniRef90_Q91G85 Uncharacterized protein 009R n=1 Tax=Invertebrate iridescent virus 6 RepID=009R_IIV6 100 | MIKLFCVLAAFISINSACQSSHQQREEFTVATYHSSSICTTYCYSNCVVASQHKGLNVES 101 | YTCDKPDPYGRETVCKCTLIKCHDI 102 | -------------------------------------------------------------------------------- /src/seqdiff.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "global.h" 9 | #include "cmpseq.h" 10 | #include "seqdiff.h" 11 | 12 | int verbose_flag; 13 | char const *progname; 14 | 15 | #define VERSION 0.1 16 | 17 | void show_usage(int status) { 18 | fprintf(stderr, "seqdiff - a bioinformatics tool for comparing sequences in two 
files\n"); 19 | fprintf(stderr, "\n(Written by bct - 2013) "); 20 | /* fprintf(stderr, "search method: %s",PULLSEQ_SORTMETHOD); */ 21 | fprintf(stderr, "\nUsage:\n" 22 | "%s -1 -2 \n\n", progname); 23 | fprintf(stderr, " Options:\n" 24 | " -1, --first, First sequence file (required)\n" 25 | " -2, --second, Second sequence file (required)\n"); 26 | fprintf(stderr, " -a, --a_output, File name for uniques from first file\n" 27 | " -b, --b_output, File name for uniques from second file\n" 28 | " -c, --c_output, File name for common entries\n"); 29 | 30 | fprintf(stderr, " -d, --headers, Compare headers instead of sequences (default: false)\n" 31 | " -s, --summary, Just show summary stats? (default: false)\n"); 32 | fprintf(stderr, " -h, --help, Display this help and exit\n" 33 | " -v, --verbose, Print extra details during the run\n" 34 | " --version, Output version information and exit\n\n"); 35 | exit(status); 36 | } 37 | 38 | int main(int argc, char *argv[]) { 39 | int c; /* character for getopt processing */ 40 | /* command argument variables */ 41 | char *first_file = NULL; 42 | char *second_file = NULL; 43 | char *a_output_file = NULL; 44 | char *b_output_file = NULL; 45 | char *c_output_file = NULL; 46 | FILE *a_output_fp = NULL; 47 | FILE *b_output_fp = NULL; 48 | FILE *c_output_fp = NULL; 49 | int use_header = 0; 50 | int only_summarize = 0; 51 | 52 | /* internal variables */ 53 | seqdiff_results_t *results; 54 | results = seqdiff_results_init(); 55 | 56 | extern char *optarg; /* external from getopt */ 57 | 58 | verbose_flag = 0; /* assume not verbose */ 59 | progname = argv[0]; /* capture the program name */ 60 | if (argc < 2) { 61 | show_usage(EXIT_FAILURE); 62 | } 63 | 64 | while(1) { 65 | static struct option long_options[] = 66 | { 67 | {"verbose", no_argument, 0, 'v'}, 68 | {"version", no_argument, 0, 'V'}, 69 | {"help", no_argument, 0, 'h'}, 70 | {"summary", no_argument, 0, 's'}, 71 | {"headers", no_argument, 0, 'd'}, 72 | {"first", required_argument, 
0, '1'}, 73 | {"second", required_argument, 0, '2'}, 74 | {"a_output", required_argument, 0, 'a'}, 75 | {"b_output", required_argument, 0, 'b'}, 76 | {"c_output", required_argument, 0, 'c'}, 77 | {0, 0, 0, 0} 78 | }; 79 | 80 | /* getopt_long stores the option index here. */ 81 | int option_index = 0; 82 | 83 | c = getopt_long(argc, argv, "vVh?sd1:2:a:b:c:", long_options, &option_index); 84 | 85 | /* Detect the end of the options. */ 86 | if (c == -1) 87 | break; 88 | 89 | switch (c) { 90 | case 'v': 91 | verbose_flag = 1; 92 | break; 93 | 94 | case 'V': 95 | /* version */ 96 | printf("Version is %f\n",VERSION); 97 | break; 98 | 99 | case 'h': 100 | show_usage(EXIT_FAILURE); 101 | break; 102 | 103 | case '?': 104 | /* getopt_long already printed an error message. */ 105 | break; 106 | 107 | case 's': 108 | only_summarize = 1; 109 | break; 110 | 111 | case 'd': 112 | use_header = 1; 113 | break; 114 | 115 | case '1': 116 | first_file = (char*) malloc(strlen(optarg)+1); 117 | strcpy(first_file,optarg); 118 | break; 119 | 120 | case '2': 121 | second_file = (char*) malloc(strlen(optarg)+1); 122 | strcpy(second_file,optarg); 123 | break; 124 | 125 | case 'a': 126 | a_output_file = (char*) malloc(strlen(optarg)+1); 127 | strcpy(a_output_file,optarg); 128 | break; 129 | 130 | case 'b': 131 | b_output_file = (char*) malloc(strlen(optarg)+1); 132 | strcpy(b_output_file,optarg); 133 | break; 134 | 135 | case 'c': 136 | c_output_file = (char*) malloc(strlen(optarg)+1); 137 | strcpy(c_output_file,optarg); 138 | break; 139 | 140 | default: 141 | abort (); 142 | } 143 | } 144 | 145 | /* Instead of reporting '--verbose' 146 | and '--brief' as they are encountered, 147 | we report the final status resulting from them. 
*/ 148 | if (verbose_flag) { 149 | fprintf(stderr, "verbose flag is set\n"); 150 | fprintf(stderr,"First file is %s\n", first_file); 151 | fprintf(stderr,"Second file is %s\n", second_file); 152 | if (a_output_file != NULL && a_output_file != NULL && a_output_file != NULL) { 153 | fprintf(stderr, "Output will be written to files:\n"); 154 | fprintf(stderr, " first file uniques: %s\n", a_output_file); 155 | fprintf(stderr, " second file uniques: %s\n", b_output_file); 156 | fprintf(stderr, " common to both input files: %s\n", c_output_file); 157 | } else 158 | fprintf(stderr,"No output files will be generated\n"); 159 | 160 | if (use_header) 161 | fprintf(stderr,"Processing will be done using headers, not sequences\n"); 162 | else 163 | fprintf(stderr,"Processing will be done using sequences\n"); 164 | if (only_summarize) 165 | fprintf(stderr,"Only showing summary information\n"); 166 | } 167 | 168 | /* check validity of given argument combination */ 169 | if (! first_file) { 170 | fprintf (stderr, "Error: First sequence file is required.\n"); 171 | return EXIT_FAILURE; 172 | } 173 | 174 | if (! 
second_file) { 175 | fprintf (stderr, "Error: First sequence file is required.\n"); 176 | return EXIT_FAILURE; 177 | } 178 | 179 | results->first_file = first_file; 180 | results->second_file = second_file; 181 | results->use_header = use_header; 182 | results->only_summarize = only_summarize; 183 | if (a_output_file != NULL && b_output_file != NULL && c_output_file != NULL) { 184 | results->a_output_fp = fopen(a_output_file,"w+"); 185 | if (!results->a_output_fp) { 186 | fprintf(stderr,"%s - failed to open file %s\n",progname,a_output_file); 187 | exit(EXIT_FAILURE); 188 | } 189 | results->b_output_fp = fopen(b_output_file,"w+"); 190 | if (!results->b_output_fp) { 191 | fprintf(stderr,"%s - failed to open file %s\n",progname,b_output_file); 192 | exit(EXIT_FAILURE); 193 | } 194 | results->c_output_fp = fopen(c_output_file,"w+"); 195 | if (!results->c_output_fp) { 196 | fprintf(stderr,"%s - failed to open file %s\n",progname,c_output_file); 197 | exit(EXIT_FAILURE); 198 | } 199 | } 200 | 201 | /* do the comparison */ 202 | cmpseq(results); 203 | 204 | if (a_output_file != NULL && b_output_file != NULL && c_output_file != NULL) { 205 | fclose(results->a_output_fp); 206 | fclose(results->b_output_fp); 207 | fclose(results->c_output_fp); 208 | } 209 | 210 | /* report results */ 211 | fprintf(stderr, " first_file_total = %d\n", results->first_file_total); 212 | fprintf(stderr, " first_file_uniq = %d\n", results->first_file_uniq); 213 | fprintf(stderr, "second_file_total = %d\n", results->second_file_total); 214 | fprintf(stderr, " second_file_uniq = %d\n", results->second_file_uniq); 215 | fprintf(stderr, " common = %d\n", results->common); 216 | 217 | free(first_file); 218 | free(second_file); 219 | if (a_output_file != NULL) 220 | free(a_output_file); 221 | if (b_output_file != NULL) 222 | free(b_output_file); 223 | if (c_output_file != NULL) 224 | free(c_output_file); 225 | 226 | /* clean up */ 227 | seqdiff_results_destroy(results); 228 | fclose(stderr); 229 | 
fclose(stdout); 230 | fclose(stdin); 231 | return EXIT_SUCCESS; 232 | } 233 | -------------------------------------------------------------------------------- /src/kseq.h: -------------------------------------------------------------------------------- 1 | /* The MIT License 2 | 3 | Copyright (c) 2008, 2009, 2011 Attractive Chaos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | */ 25 | 26 | /* Last Modified: 05MAR2012 */ 27 | 28 | #ifndef AC_KSEQ_H 29 | #define AC_KSEQ_H 30 | 31 | #include 32 | #include 33 | #include 34 | 35 | #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r 36 | #define KS_SEP_TAB 1 // isspace() && !' 
/* Delimiter codes understood by ks_getuntil2().  Any delimiter value
 * greater than KS_SEP_MAX is treated as a literal byte to stop at. */
#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
#define KS_SEP_MAX 2

/* Declares kstream_t, a buffered reader over a handle of type `type_t`:
 * `buf` is the read buffer, [begin, end) the unread window inside it,
 * `is_eof` is set once the read function has returned 0, and `f` is the
 * underlying handle. */
#define __KS_TYPE(type_t) \
	typedef struct __kstream_t { \
		unsigned char *buf; \
		int begin, end, is_eof; \
		type_t f; \
	} kstream_t;

/* ks_eof: true once EOF was seen AND the buffer has been fully consumed.
 * ks_rewind: resets the flag and buffer indices only; it does not touch
 * the underlying handle. */
#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)

/* Emits ks_init()/ks_destroy().  ks_init allocates a zeroed stream plus a
 * buffer of __bufsize bytes; ks_destroy frees both and accepts NULL.
 * NOTE(review): neither allocation in ks_init is checked for NULL, and
 * ks_destroy does not close `f`. */
#define __KS_BASIC(type_t, __bufsize) \
	static inline kstream_t *ks_init(type_t f) \
	{ \
		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
		ks->f = f; \
		ks->buf = (unsigned char*)malloc(__bufsize); \
		return ks; \
	} \
	static inline void ks_destroy(kstream_t *ks) \
	{ \
		if (ks) { \
			free(ks->buf); \
			free(ks); \
		} \
	}

/* Emits ks_getc(): returns the next byte (0..255) or -1 at end of input,
 * refilling the buffer through __read(f, buf, __bufsize) when it is empty.
 * NOTE(review): only a return of exactly 0 from __read is treated as EOF;
 * a negative (error) return is not checked. */
#define __KS_GETC(__read, __bufsize) \
	static inline int ks_getc(kstream_t *ks) \
	{ \
		if (ks->is_eof && ks->begin >= ks->end) return -1; \
		if (ks->begin >= ks->end) { \
			ks->begin = 0; \
			ks->end = __read(ks->f, ks->buf, __bufsize); \
			if (ks->end == 0) { ks->is_eof = 1; return -1;} \
		} \
		return (int)ks->buf[ks->begin++]; \
	}

/* Growable string used by the readers below: `s` is the buffer, `l` the
 * number of bytes in use and `m` the allocated size (see how
 * ks_getuntil2 compares and grows them). */
#ifndef KSTRING_T
#define KSTRING_T kstring_t
typedef struct __kstring_t {
	size_t l, m;
	char *s;
} kstring_t;
#endif

/* Rounds x up, in place, to a power of two by smearing the top set bit
 * across 32 bits.  x is evaluated and modified several times, so it must
 * be a plain lvalue without side effects. */
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif

/* Emits ks_getuntil2() and the non-appending wrapper ks_getuntil().
 *
 * ks_getuntil2 copies bytes from the stream into `str` up to, but not
 * including, the next delimiter (one of the KS_SEP_* codes, or a literal
 * byte when delimiter > KS_SEP_MAX).  With append == 0 the string is
 * reset first; otherwise new bytes are added after str->l.  If `dret` is
 * non-NULL it receives the delimiter byte that ended the read, or 0 when
 * the read ended at EOF.  Returns the resulting str->l, or -1 when nothing
 * was read and the stream is at EOF.  For KS_SEP_LINE a trailing '\r' is
 * dropped (only when str->l > 1).  The result is always NUL-terminated.
 * NOTE(review): the realloc/calloc results are used unchecked. */
#define __KS_GETUNTIL(__read, __bufsize) \
	static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
	{ \
		int gotany = 0; \
		if (dret) *dret = 0; \
		str->l = append? str->l : 0; \
		for (;;) { \
			int i; \
			if (ks->begin >= ks->end) { \
				if (!ks->is_eof) { \
					ks->begin = 0; \
					ks->end = __read(ks->f, ks->buf, __bufsize); \
					if (ks->end == 0) { ks->is_eof = 1; break; } \
				} else break; \
			} \
			if (delimiter == KS_SEP_LINE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == '\n') break; \
			} else if (delimiter > KS_SEP_MAX) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == delimiter) break; \
			} else if (delimiter == KS_SEP_SPACE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i])) break; \
			} else if (delimiter == KS_SEP_TAB) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
			} else i = 0; /* never come to here! */ \
			if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
				str->m = str->l + (i - ks->begin) + 1; \
				kroundup32(str->m); \
				str->s = (char*)realloc(str->s, str->m); \
			} \
			gotany = 1; \
			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
			str->l = str->l + (i - ks->begin); \
			ks->begin = i + 1; \
			if (i < ks->end) { \
				if (dret) *dret = ks->buf[i]; \
				break; \
			} \
		} \
		if (!gotany && ks_eof(ks)) return -1; \
		if (str->s == 0) { \
			str->m = 1; \
			str->s = (char*)calloc(1, 1); \
		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
		str->s[str->l] = '\0'; \
		return str->l; \
	} \
	static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }

/* Instantiates the whole stream layer (type, init/destroy, getc,
 * getuntil) for one handle type / read function / buffer size. */
#define KSTREAM_INIT(type_t, __read, __bufsize) \
	__KS_TYPE(type_t) \
	__KS_BASIC(type_t, __bufsize) \
	__KS_GETC(__read, __bufsize) \
	__KS_GETUNTIL(__read, __bufsize)

/* Resets the record parser state and the stream buffer indices to 0;
 * like ks_rewind it does not reposition the underlying handle. */
#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)

/* Emits kseq_init()/kseq_destroy() with linkage `SCOPE`.  kseq_init
 * allocates a zeroed kseq_t wrapping a new stream on `fd`; kseq_destroy
 * frees the four string buffers, the stream and the kseq_t, and accepts
 * NULL.  NOTE(review): the calloc in kseq_init is not checked. */
#define __KSEQ_BASIC(SCOPE, type_t) \
	SCOPE kseq_t *kseq_init(type_t fd) \
	{ \
		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
		s->f = ks_init(fd); \
		return s; \
	} \
	SCOPE void kseq_destroy(kseq_t *ks) \
	{ \
		if (!ks) return; \
		free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
		ks_destroy(ks->f); \
		free(ks); \
	}

/* Emits kseq_read(), which parses the next record ('>' or '@' header,
 * optional comment, sequence lines, and - after a '+' line - a quality
 * string) into `seq`, reusing its buffers between calls.
 */
/* Return value:
   >=0  length of the sequence (normal)
   -1   end-of-file
   -2   truncated quality string
 */
#define __KSEQ_READ(SCOPE) \
	SCOPE int kseq_read(kseq_t *seq) \
	{ \
		int c; \
		kstream_t *ks = seq->f; \
		if (seq->last_char == 0) { /* then jump to the next header line */ \
			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
			if (c == -1) return -1; /* end of file */ \
			seq->last_char = c; \
		} /* else: the first header char has been read in the previous call */ \
		seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
		if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
		if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
			seq->seq.m = 256; \
			seq->seq.s = (char*)malloc(seq->seq.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
			if (c == '\n') continue; /* skip empty lines */ \
			seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
			ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
		} \
		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
		if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
			seq->seq.m = seq->seq.l + 2; \
			kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
		} \
		seq->seq.s[seq->seq.l] = 0;	/* null terminated string */ \
		if (c != '+') return seq->seq.l; /* FASTA */ \
		if (seq->qual.m < seq->seq.m) {	/* allocate memory for qual in case insufficient */ \
			seq->qual.m = seq->seq.m; \
			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
		if (c == -1) return -2; /* error: no quality string */ \
		while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
		seq->last_char = 0;	/* we have not come to the next header line */ \
		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
		return seq->seq.l; \
	}

/* Declares kseq_t: the four record strings, `last_char` (the header
 * character already consumed for the next record, or 0) and the
 * underlying stream `f`. */
#define __KSEQ_TYPE(type_t) \
	typedef struct { \
		kstring_t name, comment, seq, qual; \
		int last_char; \
		kstream_t *f; \
	} kseq_t;

/* Instantiates the full reader (stream layer with a 16384-byte buffer,
 * kseq_t, init/destroy and kseq_read) with the given linkage. */
#define KSEQ_INIT2(SCOPE, type_t, __read) \
	KSTREAM_INIT(type_t, __read, 16384) \
	__KSEQ_TYPE(type_t) \
	__KSEQ_BASIC(SCOPE, type_t) \
	__KSEQ_READ(SCOPE)

/* Same as KSEQ_INIT2 with `static` linkage. */
#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)

/* Emits only the type definitions and the three prototypes, without any
 * function bodies. */
#define KSEQ_DECLARE(type_t) \
	__KS_TYPE(type_t) \
	__KSEQ_TYPE(type_t) \
	extern kseq_t *kseq_init(type_t fd); \
	void kseq_destroy(kseq_t *ks); \
	int kseq_read(kseq_t *seq);

#endif
"pullseq.h" 10 | #include "pull_by_name.h" 11 | #include "pull_by_size.h" 12 | #include "pull_by_re.h" 13 | 14 | int verbose_flag; 15 | char const *progname; 16 | 17 | void show_usage(int status) { 18 | fprintf(stderr, "pullseq - a bioinformatics tool for manipulating fasta and fastq files\n"); 19 | fprintf(stderr, "\nVersion: %s Name lookup method: %s", PULLSEQ_VERSION, PULLSEQ_SORTMETHOD); 20 | fprintf(stderr, "\n(Written by bct - copyright 2012-2015)\n"); 21 | fprintf(stderr, "\nUsage:\n"); 22 | fprintf(stderr, " %s -i -n
\n\n", progname); 23 | fprintf(stderr, " %s -i -m \n\n", progname); 24 | fprintf(stderr, " %s -i -g \n\n", progname); 25 | fprintf(stderr, " %s -i -m -a \n\n", progname); 26 | fprintf(stderr, " %s -i -t\n\n", progname); 27 | fprintf(stderr, " cat | %s -i -N\n\n", progname); 28 | 29 | fprintf(stderr, " Options:\n"); 30 | fprintf(stderr, " -i, --input, Input fasta/fastq file (required)\n"); 31 | fprintf(stderr, " -n, --names, File of header id names to search for\n"); 32 | fprintf(stderr, " -N, --names_stdin, Use STDIN for header id names\n"); 33 | fprintf(stderr, " -g, --regex, Regular expression to match (PERL compatible; always case-insensitive)\n"); 34 | fprintf(stderr, " -m, --min, Minimum sequence length\n"); 35 | fprintf(stderr, " -a, --max, Maximum sequence length\n"); 36 | fprintf(stderr, " -l, --length, Sequence characters per line (default 50)\n"); 37 | fprintf(stderr, " -c, --convert, Convert input to fastq/fasta (e.g. if input is fastq, output will be fasta)\n"); 38 | fprintf(stderr, " -q, --quality, ASCII code to use for fasta->fastq quality conversions\n"); 39 | fprintf(stderr, " -e, --excluded, Exclude the header id names in the list (-n)\n"); 40 | fprintf(stderr, " -t, --count, Just count the possible output, but don't write it\n"); 41 | fprintf(stderr, " -h, --help, Display this help and exit\n"); 42 | fprintf(stderr, " -v, --verbose, Print extra details during the run\n"); 43 | fprintf(stderr, " --version, Output version information and exit\n\n"); 44 | 45 | exit(status); 46 | } 47 | 48 | int main(int argc, char *argv[]) { 49 | int c; 50 | char *in = NULL,*names = NULL; 51 | FILE *names_fp = NULL; 52 | int min = -1, max = -1; 53 | int names_from_stdin = 0; 54 | int exclude = 0; 55 | int count = 0; 56 | int just_count = 0; /* flag for just counting the output */ 57 | int convert = 0; 58 | int length = 50; 59 | long value; 60 | char *end; 61 | char *aStrRegex = NULL; 62 | 63 | extern char *optarg; /* external from getopt */ 64 | 65 | verbose_flag = 
0; /* assume not verbose */ 66 | 67 | progname = argv[0]; 68 | if (argc < 4) { /* progname + at least 3 other args */ 69 | show_usage(EXIT_FAILURE); 70 | } 71 | 72 | while(1) { 73 | static struct option long_options[] = 74 | { 75 | {"verbose", no_argument, 0, 'v'}, 76 | {"convert", no_argument, 0, 'c'}, 77 | {"exclude", no_argument, 0, 'e'}, 78 | {"count", no_argument, 0, 't'}, 79 | {"version", no_argument, 0, 'V'}, 80 | {"help", no_argument, 0, 'h'}, 81 | {"input", required_argument, 0, 'i'}, 82 | {"regex", required_argument, 0, 'g'}, 83 | {"names", required_argument, 0, 'n'}, 84 | {"names_stdin", no_argument, 0, 'N'}, 85 | {"min", required_argument, 0, 'm'}, 86 | {"max", required_argument, 0, 'a'}, 87 | {"length", required_argument, 0, 'l'}, 88 | {"quality", required_argument, 0, 'q'}, 89 | {0, 0, 0, 0} 90 | }; 91 | 92 | /* getopt_long stores the option index here. */ 93 | int option_index = 0; 94 | 95 | c = getopt_long (argc, argv, "Vvh?cetq:i:g:Nn:m:a:l:", long_options, &option_index); 96 | 97 | /* Detect the end of the options. 
*/ 98 | if (c == -1) 99 | break; 100 | 101 | switch (c) { 102 | case 'v': 103 | verbose_flag = 1; 104 | break; 105 | 106 | case 'i': 107 | in = (char*) malloc(strlen(optarg)+1); 108 | strcpy(in,optarg); 109 | break; 110 | 111 | case 'g': 112 | aStrRegex = (char*) malloc(strlen(optarg)+1); 113 | strcpy(aStrRegex, optarg); 114 | break; 115 | 116 | case 'n': 117 | names = (char*) malloc(strlen(optarg)+1); 118 | strcpy(names, optarg); 119 | break; 120 | 121 | case 'N': 122 | names_from_stdin = 1; 123 | break; 124 | 125 | case 'm': 126 | value = strtol(optarg, &end, 0); 127 | if (*end == '\0' && errno == 0) { 128 | min = atoi(optarg); 129 | } else { 130 | fprintf(stderr, "Maximum value (-m) argument '%s' is not an integer\n", optarg); 131 | return EXIT_FAILURE; 132 | } 133 | break; 134 | 135 | case 'a': 136 | value = strtol(optarg, &end, 0); 137 | if (*end == '\0' && errno == 0) { 138 | max = atoi(optarg); 139 | } else { 140 | fprintf(stderr, "Maximum value (-a) argument '%s' is not an integer\n", optarg); 141 | return EXIT_FAILURE; 142 | } 143 | break; 144 | 145 | case 'c': 146 | convert = 1; 147 | QUALITY_SCORE = 61; 148 | break; 149 | 150 | case 't': 151 | just_count = 1; 152 | break; 153 | 154 | case 'q': 155 | value = strtol(optarg, &end, 0); 156 | if (*end == 0 && errno == 0) { 157 | QUALITY_SCORE = atoi(optarg); 158 | } else { 159 | fprintf(stderr, "Quality ASCII value (-q) is invalid - must be an ASCII code (e.g. 
73, which is 'I')\n"); 160 | return EXIT_FAILURE; 161 | } 162 | break; 163 | 164 | case 'e': 165 | exclude = 1; 166 | break; 167 | 168 | case 'l': 169 | value = strtol(optarg, &end,0); 170 | if (*end == '\0' && errno == 0) { 171 | length = atoi(optarg); 172 | } else { 173 | fprintf(stderr, "Sequence length value (-l) argument '%s' is not an integer\n", optarg); 174 | return EXIT_FAILURE; 175 | } 176 | break; 177 | 178 | case 'V': 179 | /* version */ 180 | printf("Version is %s\n", PULLSEQ_VERSION); 181 | break; 182 | 183 | case 'h': 184 | show_usage(EXIT_FAILURE); 185 | break; 186 | 187 | case '?': 188 | /* getopt_long already printed an error message. */ 189 | break; 190 | 191 | default: 192 | abort (); 193 | } 194 | } 195 | 196 | /* Instead of reporting '--verbose' 197 | and '--brief' as they are encountered, 198 | we report the final status resulting from them. */ 199 | if (verbose_flag) { 200 | fprintf(stderr, "verbose flag is set\n"); 201 | fprintf(stderr,"Input is %s\n", in); 202 | if (convert) 203 | fprintf(stderr,"Input will be converted between FASTQ and FASTA\n"); 204 | if (names_from_stdin) { 205 | if (exclude) 206 | fprintf(stderr,"Names in STDIN will be excluded\n"); 207 | else 208 | fprintf(stderr,"Names in STDIN will be included\n"); 209 | 210 | } else if (names != NULL) { 211 | if (exclude) { 212 | fprintf(stderr,"Names in %s will be excluded\n", names); 213 | } 214 | else { 215 | fprintf(stderr,"Names in %s will be included\n", names); 216 | } 217 | } 218 | if (aStrRegex) 219 | if (exclude) { 220 | fprintf(stderr,"Only sequences not matching %s will be output\n", aStrRegex); 221 | } 222 | else { 223 | fprintf(stderr,"Only sequences matching %s will be output\n", aStrRegex); 224 | } 225 | if (max > 0) 226 | fprintf(stderr,"Only sequences less than %i will be output\n", max); 227 | if (min > 0) 228 | fprintf(stderr,"Only sequences greater than %i will be output\n", min); 229 | if (length > 0) 230 | fprintf(stderr,"Output will be %i columns long\n", 
length); 231 | if (just_count > 0) 232 | fprintf(stderr,"Output will be counted only\n"); 233 | } 234 | 235 | /* check validity of given argument set */ 236 | if (!in) { 237 | fprintf (stderr, "Error: Input file is required.\n"); 238 | return EXIT_FAILURE; 239 | } 240 | 241 | if (names) { 242 | if (!strcmp(in, names)) { 243 | fprintf (stderr, "Error: Input file is same as names file.\n"); 244 | return EXIT_FAILURE; 245 | } 246 | } 247 | 248 | if (names && names_from_stdin) { 249 | fprintf (stderr, "Error: Cannot use names from STDIN *and* names from a file.\n"); 250 | return EXIT_FAILURE; 251 | } 252 | 253 | if (aStrRegex) { 254 | if (names || names_from_stdin) { 255 | fprintf (stderr, "Error: You can't use a names file or names from STDIN and a regex match.\n"); 256 | return EXIT_FAILURE; 257 | } 258 | } 259 | 260 | if (min > 0 && max > 0) { 261 | if (max <= min) { 262 | fprintf (stderr, "Error: Max is less than or equal to min.\n"); 263 | return EXIT_FAILURE; 264 | } 265 | } 266 | 267 | if (names || names_from_stdin) { 268 | if (names) { 269 | names_fp = fopen(names,"r"); 270 | if (!names_fp) { 271 | fprintf(stderr,"%s - failed to open names file %s\n",progname, names); 272 | exit(EXIT_FAILURE); 273 | } 274 | } else { 275 | names_fp = stdin; 276 | } 277 | count = pull_by_name(in, names_fp, min, max, length, exclude, convert, just_count); 278 | } else if (aStrRegex) { 279 | count = pull_by_re(in, aStrRegex, min, max, length, exclude, convert, just_count); 280 | } else { 281 | count = pull_by_size(in, min, max, length, convert, just_count); 282 | } 283 | 284 | /* free up memory */ 285 | free(in); 286 | 287 | if (names) 288 | free(names); 289 | 290 | if (names_fp) 291 | fclose(names_fp); 292 | 293 | if (aStrRegex) 294 | free(aStrRegex); 295 | 296 | if (verbose_flag) 297 | fprintf(stderr,"Pulled %i entries\n",count); 298 | 299 | /* close streams */ 300 | fclose(stderr); 301 | fclose(stdout); 302 | fclose(stdin); 303 | return EXIT_SUCCESS; 304 | } 305 | 
-------------------------------------------------------------------------------- /INSTALL: -------------------------------------------------------------------------------- 1 | Installation Instructions 2 | ************************* 3 | 4 | NOTE: 5 | 6 | Basic Installation 7 | ================== 8 | 9 | Briefly, the shell commands `./configure; make; make install' should 10 | configure, build, and install this package. The following 11 | more-detailed instructions are generic; see the `README' file for 12 | instructions specific to this package. Some packages provide this 13 | `INSTALL' file but do not implement all of the features documented 14 | below. The lack of an optional feature in a given package is not 15 | necessarily a bug. More recommendations for GNU packages can be found 16 | in *note Makefile Conventions: (standards)Makefile Conventions. 17 | 18 | The `configure' shell script attempts to guess correct values for 19 | various system-dependent variables used during compilation. It uses 20 | those values to create a `Makefile' in each directory of the package. 21 | It may also create one or more `.h' files containing system-dependent 22 | definitions. Finally, it creates a shell script `config.status' that 23 | you can run in the future to recreate the current configuration, and a 24 | file `config.log' containing compiler output (useful mainly for 25 | debugging `configure'). 26 | 27 | It can also use an optional file (typically called `config.cache' 28 | and enabled with `--cache-file=config.cache' or simply `-C') that saves 29 | the results of its tests to speed up reconfiguring. Caching is 30 | disabled by default to prevent problems with accidental use of stale 31 | cache files. 32 | 33 | If you need to do unusual things to compile the package, please try 34 | to figure out how `configure' could check whether to do them, and mail 35 | diffs or instructions to the address given in the `README' so they can 36 | be considered for the next release. 
If you are using the cache, and at 37 | some point `config.cache' contains results you don't want to keep, you 38 | may remove or edit it. 39 | 40 | The file `configure.ac' (or `configure.in') is used to create 41 | `configure' by a program called `autoconf'. You need `configure.ac' if 42 | you want to change it or regenerate `configure' using a newer version 43 | of `autoconf'. 44 | 45 | The simplest way to compile this package is: 46 | 47 | 1. `cd' to the directory containing the package's source code and type 48 | `./configure' to configure the package for your system. 49 | 50 | Running `configure' might take a while. While running, it prints 51 | some messages telling which features it is checking for. 52 | 53 | 2. Type `make' to compile the package. 54 | 55 | 3. Optionally, type `make check' to run any self-tests that come with 56 | the package, generally using the just-built uninstalled binaries. 57 | 58 | 4. Type `make install' to install the programs and any data files and 59 | documentation. When installing into a prefix owned by root, it is 60 | recommended that the package be configured and built as a regular 61 | user, and only the `make install' phase executed with root 62 | privileges. 63 | 64 | 5. Optionally, type `make installcheck' to repeat any self-tests, but 65 | this time using the binaries in their final installed location. 66 | This target does not install anything. Running this target as a 67 | regular user, particularly if the prior `make install' required 68 | root privileges, verifies that the installation completed 69 | correctly. 70 | 71 | 6. You can remove the program binaries and object files from the 72 | source code directory by typing `make clean'. To also remove the 73 | files that `configure' created (so you can compile the package for 74 | a different kind of computer), type `make distclean'. There is 75 | also a `make maintainer-clean' target, but that is intended mainly 76 | for the package's developers. 
If you use it, you may have to get 77 | all sorts of other programs in order to regenerate files that came 78 | with the distribution. 79 | 80 | 7. Often, you can also type `make uninstall' to remove the installed 81 | files again. In practice, not all packages have tested that 82 | uninstallation works correctly, even though it is required by the 83 | GNU Coding Standards. 84 | 85 | 8. Some packages, particularly those that use Automake, provide `make 86 | distcheck', which can be used by developers to test that all other 87 | targets like `make install' and `make uninstall' work correctly. 88 | This target is generally not run by end users. 89 | 90 | Compilers and Options 91 | ===================== 92 | 93 | Some systems require unusual options for compilation or linking that 94 | the `configure' script does not know about. Run `./configure --help' 95 | for details on some of the pertinent environment variables. 96 | 97 | You can give `configure' initial values for configuration parameters 98 | by setting variables in the command line or in the environment. Here 99 | is an example: 100 | 101 | ./configure CC=c99 CFLAGS=-g LIBS=-lposix 102 | 103 | *Note Defining Variables::, for more details. 104 | 105 | Compiling For Multiple Architectures 106 | ==================================== 107 | 108 | You can compile the package for more than one kind of computer at the 109 | same time, by placing the object files for each architecture in their 110 | own directory. To do this, you can use GNU `make'. `cd' to the 111 | directory where you want the object files and executables to go and run 112 | the `configure' script. `configure' automatically checks for the 113 | source code in the directory that `configure' is in and in `..'. This 114 | is known as a "VPATH" build. 115 | 116 | With a non-GNU `make', it is safer to compile the package for one 117 | architecture at a time in the source code directory.
After you have 118 | installed the package for one architecture, use `make distclean' before 119 | reconfiguring for another architecture. 120 | 121 | On MacOS X 10.5 and later systems, you can create libraries and 122 | executables that work on multiple system types--known as "fat" or 123 | "universal" binaries--by specifying multiple `-arch' options to the 124 | compiler but only a single `-arch' option to the preprocessor. Like 125 | this: 126 | 127 | ./configure CC="gcc -arch i386 -arch x86_64 -arch ppc -arch ppc64" \ 128 | CXX="g++ -arch i386 -arch x86_64 -arch ppc -arch ppc64" \ 129 | CPP="gcc -E" CXXCPP="g++ -E" 130 | 131 | This is not guaranteed to produce working output in all cases, you 132 | may have to build one architecture at a time and combine the results 133 | using the `lipo' tool if you have problems. 134 | 135 | Installation Names 136 | ================== 137 | 138 | By default, `make install' installs the package's commands under 139 | `/usr/local/bin', include files under `/usr/local/include', etc. You 140 | can specify an installation prefix other than `/usr/local' by giving 141 | `configure' the option `--prefix=PREFIX', where PREFIX must be an 142 | absolute file name. 143 | 144 | You can specify separate installation prefixes for 145 | architecture-specific files and architecture-independent files. If you 146 | pass the option `--exec-prefix=PREFIX' to `configure', the package uses 147 | PREFIX as the prefix for installing programs and libraries. 148 | Documentation and other data files still use the regular prefix. 149 | 150 | In addition, if you use an unusual directory layout you can give 151 | options like `--bindir=DIR' to specify different values for particular 152 | kinds of files. Run `configure --help' for a list of the directories 153 | you can set and what kinds of files go in them. 
In general, the 154 | default for these options is expressed in terms of `${prefix}', so that 155 | specifying just `--prefix' will affect all of the other directory 156 | specifications that were not explicitly provided. 157 | 158 | The most portable way to affect installation locations is to pass the 159 | correct locations to `configure'; however, many packages provide one or 160 | both of the following shortcuts of passing variable assignments to the 161 | `make install' command line to change installation locations without 162 | having to reconfigure or recompile. 163 | 164 | The first method involves providing an override variable for each 165 | affected directory. For example, `make install 166 | prefix=/alternate/directory' will choose an alternate location for all 167 | directory configuration variables that were expressed in terms of 168 | `${prefix}'. Any directories that were specified during `configure', 169 | but not in terms of `${prefix}', must each be overridden at install 170 | time for the entire installation to be relocated. The approach of 171 | makefile variable overrides for each directory variable is required by 172 | the GNU Coding Standards, and ideally causes no recompilation. 173 | However, some platforms have known limitations with the semantics of 174 | shared libraries that end up requiring recompilation when using this 175 | method, particularly noticeable in packages that use GNU Libtool. 176 | 177 | The second method involves providing the `DESTDIR' variable. For 178 | example, `make install DESTDIR=/alternate/directory' will prepend 179 | `/alternate/directory' before all installation names. The approach of 180 | `DESTDIR' overrides is not required by the GNU Coding Standards, and 181 | does not work on platforms that have drive letters. 
On the other hand, 182 | it does better at avoiding recompilation issues, and works well even 183 | when some directory options were not specified in terms of `${prefix}' 184 | at `configure' time. 185 | 186 | Optional Features 187 | ================= 188 | 189 | If the package supports it, you can cause programs to be installed 190 | with an extra prefix or suffix on their names by giving `configure' the 191 | option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'. 192 | 193 | Some packages pay attention to `--enable-FEATURE' options to 194 | `configure', where FEATURE indicates an optional part of the package. 195 | They may also pay attention to `--with-PACKAGE' options, where PACKAGE 196 | is something like `gnu-as' or `x' (for the X Window System). The 197 | `README' should mention any `--enable-' and `--with-' options that the 198 | package recognizes. 199 | 200 | For packages that use the X Window System, `configure' can usually 201 | find the X include and library files automatically, but if it doesn't, 202 | you can use the `configure' options `--x-includes=DIR' and 203 | `--x-libraries=DIR' to specify their locations. 204 | 205 | Some packages offer the ability to configure how verbose the 206 | execution of `make' will be. For these packages, running `./configure 207 | --enable-silent-rules' sets the default to minimal output, which can be 208 | overridden with `make V=1'; while running `./configure 209 | --disable-silent-rules' sets the default to verbose, which can be 210 | overridden with `make V=0'. 211 | 212 | Particular systems 213 | ================== 214 | 215 | On HP-UX, the default C compiler is not ANSI C compatible. If GNU 216 | CC is not installed, it is recommended to use the following options in 217 | order to use an ANSI C compiler: 218 | 219 | ./configure CC="cc -Ae -D_XOPEN_SOURCE=500" 220 | 221 | and if that doesn't work, install pre-built binaries of GCC for HP-UX. 222 | 223 | On OSF/1 a.k.a. 
Tru64, some versions of the default C compiler cannot 224 | parse its `<wchar.h>' header file. The option `-nodtk' can be used as 225 | a workaround. If GNU CC is not installed, it is therefore recommended 226 | to try 227 | 228 | ./configure CC="cc" 229 | 230 | and if that doesn't work, try 231 | 232 | ./configure CC="cc -nodtk" 233 | 234 | On Solaris, don't put `/usr/ucb' early in your `PATH'. This 235 | directory contains several dysfunctional programs; working variants of 236 | these programs are available in `/usr/bin'. So, if you need `/usr/ucb' 237 | in your `PATH', put it _after_ `/usr/bin'. 238 | 239 | On Haiku, software installed for all users goes in `/boot/common', 240 | not `/usr/local'. It is recommended to use the following options: 241 | 242 | ./configure --prefix=/boot/common 243 | 244 | Specifying the System Type 245 | ========================== 246 | 247 | There may be some features `configure' cannot figure out 248 | automatically, but needs to determine by the type of machine the package 249 | will run on. Usually, assuming the package is built to be run on the 250 | _same_ architectures, `configure' can figure that out, but if it prints 251 | a message saying it cannot guess the machine type, give it the 252 | `--build=TYPE' option. TYPE can either be a short name for the system 253 | type, such as `sun4', or a canonical name which has the form: 254 | 255 | CPU-COMPANY-SYSTEM 256 | 257 | where SYSTEM can have one of these forms: 258 | 259 | OS 260 | KERNEL-OS 261 | 262 | See the file `config.sub' for the possible values of each field. If 263 | `config.sub' isn't included in this package, then this package doesn't 264 | need to know the machine type. 265 | 266 | If you are _building_ compiler tools for cross-compiling, you should 267 | use the option `--target=TYPE' to select the type of system they will 268 | produce code for.
269 | 270 | If you want to _use_ a cross compiler, that generates code for a 271 | platform different from the build platform, you should specify the 272 | "host" platform (i.e., that on which the generated programs will 273 | eventually be run) with `--host=TYPE'. 274 | 275 | Sharing Defaults 276 | ================ 277 | 278 | If you want to set default values for `configure' scripts to share, 279 | you can create a site shell script called `config.site' that gives 280 | default values for variables like `CC', `cache_file', and `prefix'. 281 | `configure' looks for `PREFIX/share/config.site' if it exists, then 282 | `PREFIX/etc/config.site' if it exists. Or, you can set the 283 | `CONFIG_SITE' environment variable to the location of the site script. 284 | A warning: not all `configure' scripts look for a site script. 285 | 286 | Defining Variables 287 | ================== 288 | 289 | Variables not defined in a site shell script can be set in the 290 | environment passed to `configure'. However, some packages may run 291 | configure again during the build, and the customized values of these 292 | variables may be lost. In order to avoid this problem, you should set 293 | them in the `configure' command line, using `VAR=value'. For example: 294 | 295 | ./configure CC=/usr/local2/bin/gcc 296 | 297 | causes the specified `gcc' to be used as the C compiler (unless it is 298 | overridden in the site shell script). 299 | 300 | Unfortunately, this technique does not work for `CONFIG_SHELL' due to 301 | an Autoconf bug. Until the bug is fixed you can use this workaround: 302 | 303 | CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash 304 | 305 | `configure' Invocation 306 | ====================== 307 | 308 | `configure' recognizes the following options to control how it 309 | operates. 310 | 311 | `--help' 312 | `-h' 313 | Print a summary of all of the options to `configure', and exit. 
314 | 315 | `--help=short' 316 | `--help=recursive' 317 | Print a summary of the options unique to this package's 318 | `configure', and exit. The `short' variant lists options used 319 | only in the top level, while the `recursive' variant lists options 320 | also present in any nested packages. 321 | 322 | `--version' 323 | `-V' 324 | Print the version of Autoconf used to generate the `configure' 325 | script, and exit. 326 | 327 | `--cache-file=FILE' 328 | Enable the cache: use and save the results of the tests in FILE, 329 | traditionally `config.cache'. FILE defaults to `/dev/null' to 330 | disable caching. 331 | 332 | `--config-cache' 333 | `-C' 334 | Alias for `--cache-file=config.cache'. 335 | 336 | `--quiet' 337 | `--silent' 338 | `-q' 339 | Do not print messages saying which checks are being made. To 340 | suppress all normal output, redirect it to `/dev/null' (any error 341 | messages will still be shown). 342 | 343 | `--srcdir=DIR' 344 | Look for the package's source code in directory DIR. Usually 345 | `configure' can determine that directory automatically. 346 | 347 | `--prefix=DIR' 348 | Use DIR as the installation prefix. *note Installation Names:: 349 | for more details, including other options available for fine-tuning 350 | the installation locations. 351 | 352 | `--no-create' 353 | `-n' 354 | Run the configure checks, but stop before creating any output 355 | files. 356 | 357 | `configure' also accepts some other, not widely useful, options. Run 358 | `configure --help' for more details. 359 | 360 | -------------------------------------------------------------------------------- /src/uthash.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2003-2014, Troy D. Hanson http://troydhanson.github.com/uthash/ 3 | All rights reserved. 
4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 12 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 13 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 14 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 15 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 16 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 17 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 18 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 19 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 20 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 21 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 22 | */ 23 | 24 | #ifndef UTHASH_H 25 | #define UTHASH_H 26 | 27 | #include /* memcmp,strlen */ 28 | #include /* ptrdiff_t */ 29 | #include /* exit() */ 30 | 31 | /* These macros use decltype or the earlier __typeof GNU extension. 32 | As decltype is only available in newer compilers (VS2010 or gcc 4.3+ 33 | when compiling c++ source) this code uses whatever method is needed 34 | or, for VS2008 where neither is available, uses casting workarounds. 
*/ 35 | #if defined(_MSC_VER) /* MS compiler */ 36 | #if _MSC_VER >= 1600 && defined(__cplusplus) /* VS2010 or newer in C++ mode */ 37 | #define DECLTYPE(x) (decltype(x)) 38 | #else /* VS2008 or older (or VS2010 in C mode) */ 39 | #define NO_DECLTYPE 40 | #define DECLTYPE(x) 41 | #endif 42 | #elif defined(__BORLANDC__) || defined(__LCC__) || defined(__WATCOMC__) 43 | #define NO_DECLTYPE 44 | #define DECLTYPE(x) 45 | #else /* GNU, Sun and other compilers */ 46 | #define DECLTYPE(x) (__typeof(x)) 47 | #endif 48 | 49 | #ifdef NO_DECLTYPE 50 | #define DECLTYPE_ASSIGN(dst,src) \ 51 | do { \ 52 | char **_da_dst = (char**)(&(dst)); \ 53 | *_da_dst = (char*)(src); \ 54 | } while(0) 55 | #else 56 | #define DECLTYPE_ASSIGN(dst,src) \ 57 | do { \ 58 | (dst) = DECLTYPE(dst)(src); \ 59 | } while(0) 60 | #endif 61 | 62 | /* a number of the hash function use uint32_t which isn't defined on Pre VS2010 */ 63 | #if defined (_WIN32) 64 | #if defined(_MSC_VER) && _MSC_VER >= 1600 65 | #include 66 | #elif defined(__WATCOMC__) 67 | #include 68 | #else 69 | typedef unsigned int uint32_t; 70 | typedef unsigned char uint8_t; 71 | #endif 72 | #else 73 | #include 74 | #endif 75 | 76 | #define UTHASH_VERSION 1.9.9 77 | 78 | #ifndef uthash_fatal 79 | #define uthash_fatal(msg) exit(-1) /* fatal error (out of memory,etc) */ 80 | #endif 81 | #ifndef uthash_malloc 82 | #define uthash_malloc(sz) malloc(sz) /* malloc fcn */ 83 | #endif 84 | #ifndef uthash_free 85 | #define uthash_free(ptr,sz) free(ptr) /* free fcn */ 86 | #endif 87 | 88 | #ifndef uthash_noexpand_fyi 89 | #define uthash_noexpand_fyi(tbl) /* can be defined to log noexpand */ 90 | #endif 91 | #ifndef uthash_expand_fyi 92 | #define uthash_expand_fyi(tbl) /* can be defined to log expands */ 93 | #endif 94 | 95 | /* initial number of buckets */ 96 | #define HASH_INITIAL_NUM_BUCKETS 32 /* initial number of buckets */ 97 | #define HASH_INITIAL_NUM_BUCKETS_LOG2 5 /* lg2 of initial number of buckets */ 98 | #define HASH_BKT_CAPACITY_THRESH 10 
/* expand when bucket count reaches */ 99 | 100 | /* calculate the element whose hash handle address is hhe */ 101 | #define ELMT_FROM_HH(tbl,hhp) ((void*)(((char*)(hhp)) - ((tbl)->hho))) 102 | 103 | #define HASH_FIND(hh,head,keyptr,keylen,out) \ 104 | do { \ 105 | unsigned _hf_bkt,_hf_hashv; \ 106 | out=NULL; \ 107 | if (head) { \ 108 | HASH_FCN(keyptr,keylen, (head)->hh.tbl->num_buckets, _hf_hashv, _hf_bkt); \ 109 | if (HASH_BLOOM_TEST((head)->hh.tbl, _hf_hashv)) { \ 110 | HASH_FIND_IN_BKT((head)->hh.tbl, hh, (head)->hh.tbl->buckets[ _hf_bkt ], \ 111 | keyptr,keylen,out); \ 112 | } \ 113 | } \ 114 | } while (0) 115 | 116 | #ifdef HASH_BLOOM 117 | #define HASH_BLOOM_BITLEN (1ULL << HASH_BLOOM) 118 | #define HASH_BLOOM_BYTELEN (HASH_BLOOM_BITLEN/8) + ((HASH_BLOOM_BITLEN%8) ? 1:0) 119 | #define HASH_BLOOM_MAKE(tbl) \ 120 | do { \ 121 | (tbl)->bloom_nbits = HASH_BLOOM; \ 122 | (tbl)->bloom_bv = (uint8_t*)uthash_malloc(HASH_BLOOM_BYTELEN); \ 123 | if (!((tbl)->bloom_bv)) { uthash_fatal( "out of memory"); } \ 124 | memset((tbl)->bloom_bv, 0, HASH_BLOOM_BYTELEN); \ 125 | (tbl)->bloom_sig = HASH_BLOOM_SIGNATURE; \ 126 | } while (0) 127 | 128 | #define HASH_BLOOM_FREE(tbl) \ 129 | do { \ 130 | uthash_free((tbl)->bloom_bv, HASH_BLOOM_BYTELEN); \ 131 | } while (0) 132 | 133 | #define HASH_BLOOM_BITSET(bv,idx) (bv[(idx)/8] |= (1U << ((idx)%8))) 134 | #define HASH_BLOOM_BITTEST(bv,idx) (bv[(idx)/8] & (1U << ((idx)%8))) 135 | 136 | #define HASH_BLOOM_ADD(tbl,hashv) \ 137 | HASH_BLOOM_BITSET((tbl)->bloom_bv, (hashv & (uint32_t)((1ULL << (tbl)->bloom_nbits) - 1))) 138 | 139 | #define HASH_BLOOM_TEST(tbl,hashv) \ 140 | HASH_BLOOM_BITTEST((tbl)->bloom_bv, (hashv & (uint32_t)((1ULL << (tbl)->bloom_nbits) - 1))) 141 | 142 | #else 143 | #define HASH_BLOOM_MAKE(tbl) 144 | #define HASH_BLOOM_FREE(tbl) 145 | #define HASH_BLOOM_ADD(tbl,hashv) 146 | #define HASH_BLOOM_TEST(tbl,hashv) (1) 147 | #define HASH_BLOOM_BYTELEN 0 148 | #endif 149 | 150 | #define HASH_MAKE_TABLE(hh,head) \ 151 | 
do { \ 152 | (head)->hh.tbl = (UT_hash_table*)uthash_malloc( \ 153 | sizeof(UT_hash_table)); \ 154 | if (!((head)->hh.tbl)) { uthash_fatal( "out of memory"); } \ 155 | memset((head)->hh.tbl, 0, sizeof(UT_hash_table)); \ 156 | (head)->hh.tbl->tail = &((head)->hh); \ 157 | (head)->hh.tbl->num_buckets = HASH_INITIAL_NUM_BUCKETS; \ 158 | (head)->hh.tbl->log2_num_buckets = HASH_INITIAL_NUM_BUCKETS_LOG2; \ 159 | (head)->hh.tbl->hho = (char*)(&(head)->hh) - (char*)(head); \ 160 | (head)->hh.tbl->buckets = (UT_hash_bucket*)uthash_malloc( \ 161 | HASH_INITIAL_NUM_BUCKETS*sizeof(struct UT_hash_bucket)); \ 162 | if (! (head)->hh.tbl->buckets) { uthash_fatal( "out of memory"); } \ 163 | memset((head)->hh.tbl->buckets, 0, \ 164 | HASH_INITIAL_NUM_BUCKETS*sizeof(struct UT_hash_bucket)); \ 165 | HASH_BLOOM_MAKE((head)->hh.tbl); \ 166 | (head)->hh.tbl->signature = HASH_SIGNATURE; \ 167 | } while(0) 168 | 169 | #define HASH_ADD(hh,head,fieldname,keylen_in,add) \ 170 | HASH_ADD_KEYPTR(hh,head,&((add)->fieldname),keylen_in,add) 171 | 172 | #define HASH_REPLACE(hh,head,fieldname,keylen_in,add,replaced) \ 173 | do { \ 174 | replaced=NULL; \ 175 | HASH_FIND(hh,head,&((add)->fieldname),keylen_in,replaced); \ 176 | if (replaced!=NULL) { \ 177 | HASH_DELETE(hh,head,replaced); \ 178 | }; \ 179 | HASH_ADD(hh,head,fieldname,keylen_in,add); \ 180 | } while(0) 181 | 182 | #define HASH_ADD_KEYPTR(hh,head,keyptr,keylen_in,add) \ 183 | do { \ 184 | unsigned _ha_bkt; \ 185 | (add)->hh.next = NULL; \ 186 | (add)->hh.key = (char*)(keyptr); \ 187 | (add)->hh.keylen = (unsigned)(keylen_in); \ 188 | if (!(head)) { \ 189 | head = (add); \ 190 | (head)->hh.prev = NULL; \ 191 | HASH_MAKE_TABLE(hh,head); \ 192 | } else { \ 193 | (head)->hh.tbl->tail->next = (add); \ 194 | (add)->hh.prev = ELMT_FROM_HH((head)->hh.tbl, (head)->hh.tbl->tail); \ 195 | (head)->hh.tbl->tail = &((add)->hh); \ 196 | } \ 197 | (head)->hh.tbl->num_items++; \ 198 | (add)->hh.tbl = (head)->hh.tbl; \ 199 | HASH_FCN(keyptr,keylen_in, 
(head)->hh.tbl->num_buckets, \ 200 | (add)->hh.hashv, _ha_bkt); \ 201 | HASH_ADD_TO_BKT((head)->hh.tbl->buckets[_ha_bkt],&(add)->hh); \ 202 | HASH_BLOOM_ADD((head)->hh.tbl,(add)->hh.hashv); \ 203 | HASH_EMIT_KEY(hh,head,keyptr,keylen_in); \ 204 | HASH_FSCK(hh,head); \ 205 | } while(0) 206 | 207 | #define HASH_TO_BKT( hashv, num_bkts, bkt ) \ 208 | do { \ 209 | bkt = ((hashv) & ((num_bkts) - 1)); \ 210 | } while(0) 211 | 212 | /* delete "delptr" from the hash table. 213 | * "the usual" patch-up process for the app-order doubly-linked-list. 214 | * The use of _hd_hh_del below deserves special explanation. 215 | * These used to be expressed using (delptr) but that led to a bug 216 | * if someone used the same symbol for the head and deletee, like 217 | * HASH_DELETE(hh,users,users); 218 | * We want that to work, but by changing the head (users) below 219 | * we were forfeiting our ability to further refer to the deletee (users) 220 | * in the patch-up process. Solution: use scratch space to 221 | * copy the deletee pointer, then the latter references are via that 222 | * scratch pointer rather than through the repointed (users) symbol. 
223 | */ 224 | #define HASH_DELETE(hh,head,delptr) \ 225 | do { \ 226 | unsigned _hd_bkt; \ 227 | struct UT_hash_handle *_hd_hh_del; \ 228 | if ( ((delptr)->hh.prev == NULL) && ((delptr)->hh.next == NULL) ) { \ 229 | uthash_free((head)->hh.tbl->buckets, \ 230 | (head)->hh.tbl->num_buckets*sizeof(struct UT_hash_bucket) ); \ 231 | HASH_BLOOM_FREE((head)->hh.tbl); \ 232 | uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ 233 | head = NULL; \ 234 | } else { \ 235 | _hd_hh_del = &((delptr)->hh); \ 236 | if ((delptr) == ELMT_FROM_HH((head)->hh.tbl,(head)->hh.tbl->tail)) { \ 237 | (head)->hh.tbl->tail = \ 238 | (UT_hash_handle*)((ptrdiff_t)((delptr)->hh.prev) + \ 239 | (head)->hh.tbl->hho); \ 240 | } \ 241 | if ((delptr)->hh.prev) { \ 242 | ((UT_hash_handle*)((ptrdiff_t)((delptr)->hh.prev) + \ 243 | (head)->hh.tbl->hho))->next = (delptr)->hh.next; \ 244 | } else { \ 245 | DECLTYPE_ASSIGN(head,(delptr)->hh.next); \ 246 | } \ 247 | if (_hd_hh_del->next) { \ 248 | ((UT_hash_handle*)((ptrdiff_t)_hd_hh_del->next + \ 249 | (head)->hh.tbl->hho))->prev = \ 250 | _hd_hh_del->prev; \ 251 | } \ 252 | HASH_TO_BKT( _hd_hh_del->hashv, (head)->hh.tbl->num_buckets, _hd_bkt); \ 253 | HASH_DEL_IN_BKT(hh,(head)->hh.tbl->buckets[_hd_bkt], _hd_hh_del); \ 254 | (head)->hh.tbl->num_items--; \ 255 | } \ 256 | HASH_FSCK(hh,head); \ 257 | } while (0) 258 | 259 | 260 | /* convenience forms of HASH_FIND/HASH_ADD/HASH_DEL */ 261 | #define HASH_FIND_STR(head,findstr,out) \ 262 | HASH_FIND(hh,head,findstr,strlen(findstr),out) 263 | #define HASH_ADD_STR(head,strfield,add) \ 264 | HASH_ADD(hh,head,strfield[0],strlen(add->strfield),add) 265 | #define HASH_REPLACE_STR(head,strfield,add,replaced) \ 266 | HASH_REPLACE(hh,head,strfield[0],strlen(add->strfield),add,replaced) 267 | #define HASH_FIND_INT(head,findint,out) \ 268 | HASH_FIND(hh,head,findint,sizeof(int),out) 269 | #define HASH_ADD_INT(head,intfield,add) \ 270 | HASH_ADD(hh,head,intfield,sizeof(int),add) 271 | #define 
HASH_REPLACE_INT(head,intfield,add,replaced) \ 272 | HASH_REPLACE(hh,head,intfield,sizeof(int),add,replaced) 273 | #define HASH_FIND_PTR(head,findptr,out) \ 274 | HASH_FIND(hh,head,findptr,sizeof(void *),out) 275 | #define HASH_ADD_PTR(head,ptrfield,add) \ 276 | HASH_ADD(hh,head,ptrfield,sizeof(void *),add) 277 | #define HASH_REPLACE_PTR(head,ptrfield,add,replaced) \ 278 | HASH_REPLACE(hh,head,ptrfield,sizeof(void *),add,replaced) 279 | #define HASH_DEL(head,delptr) \ 280 | HASH_DELETE(hh,head,delptr) 281 | 282 | /* HASH_FSCK checks hash integrity on every add/delete when HASH_DEBUG is defined. 283 | * This is for uthash developers only; it compiles away if HASH_DEBUG isn't defined. 284 | * It walks every bucket chain, then the app-order list, comparing link pointers and item counts; the first mismatch is reported through HASH_OOPS, which prints to stderr and calls exit(-1). Counts are unsigned and are printed with %u; pointers handed to %p are cast to void*. */ 285 | #ifdef HASH_DEBUG 286 | #define HASH_OOPS(...) do { fprintf(stderr,__VA_ARGS__); exit(-1); } while (0) 287 | #define HASH_FSCK(hh,head) \ 288 | do { \ 289 | unsigned _bkt_i; \ 290 | unsigned _count, _bkt_count; \ 291 | char *_prev; \ 292 | struct UT_hash_handle *_thh; \ 293 | if (head) { \ 294 | _count = 0; \ 295 | for( _bkt_i = 0; _bkt_i < (head)->hh.tbl->num_buckets; _bkt_i++) { \ 296 | _bkt_count = 0; \ 297 | _thh = (head)->hh.tbl->buckets[_bkt_i].hh_head; \ 298 | _prev = NULL; \ 299 | while (_thh) { \ 300 | if (_prev != (char*)(_thh->hh_prev)) { \ 301 | HASH_OOPS("invalid hh_prev %p, actual %p\n", \ 302 | (void*)_thh->hh_prev, (void*)_prev ); \ 303 | } \ 304 | _bkt_count++; \ 305 | _prev = (char*)(_thh); \ 306 | _thh = _thh->hh_next; \ 307 | } \ 308 | _count += _bkt_count; \ 309 | if ((head)->hh.tbl->buckets[_bkt_i].count != _bkt_count) { \ 310 | HASH_OOPS("invalid bucket count %u, actual %u\n", \ 311 | (head)->hh.tbl->buckets[_bkt_i].count, _bkt_count); \ 312 | } \ 313 | } \ 314 | if (_count != (head)->hh.tbl->num_items) { \ 315 | HASH_OOPS("invalid hh item count %u, actual %u\n", \ 316 | (head)->hh.tbl->num_items, _count ); \ 317 | } \ 318 | /* traverse hh in app order; check next/prev integrity, count */ \ 319 | _count = 0; \ 320 | _prev = NULL; \ 321 | _thh = &(head)->hh; \ 322
| while (_thh) { \ 323 | _count++; \ 324 | if (_prev !=(char*)(_thh->prev)) { \ 325 | HASH_OOPS("invalid prev %p, actual %p\n", \ 326 | _thh->prev, (void*)_prev ); \ 327 | } \ 328 | _prev = (char*)ELMT_FROM_HH((head)->hh.tbl, _thh); \ 329 | _thh = ( _thh->next ? (UT_hash_handle*)((char*)(_thh->next) + \ 330 | (head)->hh.tbl->hho) : NULL ); \ 331 | } \ 332 | if (_count != (head)->hh.tbl->num_items) { \ 333 | HASH_OOPS("invalid app item count %u, actual %u\n", \ 334 | (head)->hh.tbl->num_items, _count ); \ 335 | } \ 336 | } \ 337 | } while (0) 338 | #else 339 | #define HASH_FSCK(hh,head) 340 | #endif 341 | 342 | /* When compiled with -DHASH_EMIT_KEYS, length-prefixed keys are emitted to 343 | * the descriptor to which this macro is defined for tuning the hash function. 344 | * The app can #include <unistd.h> to get the prototype for write(2). */ 345 | #ifdef HASH_EMIT_KEYS 346 | #define HASH_EMIT_KEY(hh,head,keyptr,fieldlen) \ 347 | do { \ 348 | unsigned _klen = fieldlen; \ 349 | write(HASH_EMIT_KEYS, &_klen, sizeof(_klen)); \ 350 | write(HASH_EMIT_KEYS, keyptr, fieldlen); \ 351 | } while (0) 352 | #else 353 | #define HASH_EMIT_KEY(hh,head,keyptr,fieldlen) 354 | #endif 355 | 356 | /* default to Jenkins' hash unless overridden, e.g. -DHASH_FUNCTION=HASH_SAX */ 357 | #ifdef HASH_FUNCTION 358 | #define HASH_FCN HASH_FUNCTION 359 | #else 360 | #define HASH_FCN HASH_JEN 361 | #endif 362 | 363 | /* The Bernstein hash function, used in Perl prior to v5.6. Note ((x<<5)+x)=x*33.
*/ 364 | #define HASH_BER(key,keylen,num_bkts,hashv,bkt) \ 365 | do { \ 366 | unsigned _hb_keylen=keylen; \ 367 | char *_hb_key=(char*)(key); \ 368 | (hashv) = 0; \ 369 | while (_hb_keylen--) { (hashv) = (((hashv) << 5) + (hashv)) + *_hb_key++; } \ 370 | bkt = (hashv) & (num_bkts-1); \ 371 | } while (0) 372 | 373 | 374 | /* SAX/FNV/OAT/JEN hash functions are macro variants of those listed at 375 | * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx */ 376 | #define HASH_SAX(key,keylen,num_bkts,hashv,bkt) \ 377 | do { \ 378 | unsigned _sx_i; \ 379 | char *_hs_key=(char*)(key); \ 380 | hashv = 0; \ 381 | for(_sx_i=0; _sx_i < keylen; _sx_i++) \ 382 | hashv ^= (hashv << 5) + (hashv >> 2) + _hs_key[_sx_i]; \ 383 | bkt = hashv & (num_bkts-1); \ 384 | } while (0) 385 | /* FNV-1a variation: starting from 2166136261, each key byte is XORed into hashv and hashv is then multiplied by 16777619. Both steps run once per byte, so the loop body is braced; without the braces the multiply ran only once, after the loop. bkt is hashv masked by num_bkts-1. */ 386 | #define HASH_FNV(key,keylen,num_bkts,hashv,bkt) \ 387 | do { \ 388 | unsigned _fn_i; \ 389 | char *_hf_key=(char*)(key); \ 390 | hashv = 2166136261UL; \ 391 | for(_fn_i=0; _fn_i < keylen; _fn_i++) { \ 392 | hashv = hashv ^ _hf_key[_fn_i]; \ 393 | hashv = hashv * 16777619; } \ 394 | bkt = hashv & (num_bkts-1); \ 395 | } while(0) 396 | 397 | #define HASH_OAT(key,keylen,num_bkts,hashv,bkt) \ 398 | do { \ 399 | unsigned _ho_i; \ 400 | char *_ho_key=(char*)(key); \ 401 | hashv = 0; \ 402 | for(_ho_i=0; _ho_i < keylen; _ho_i++) { \ 403 | hashv += _ho_key[_ho_i]; \ 404 | hashv += (hashv << 10); \ 405 | hashv ^= (hashv >> 6); \ 406 | } \ 407 | hashv += (hashv << 3); \ 408 | hashv ^= (hashv >> 11); \ 409 | hashv += (hashv << 15); \ 410 | bkt = hashv & (num_bkts-1); \ 411 | } while(0) 412 | 413 | #define HASH_JEN_MIX(a,b,c) \ 414 | do { \ 415 | a -= b; a -= c; a ^= ( c >> 13 ); \ 416 | b -= c; b -= a; b ^= ( a << 8 ); \ 417 | c -= a; c -= b; c ^= ( b >> 13 ); \ 418 | a -= b; a -= c; a ^= ( c >> 12 ); \ 419 | b -= c; b -= a; b ^= ( a << 16 ); \ 420 | c -= a; c -= b; c ^= ( b >> 5 ); \ 421 | a -= b; a -= c; a ^= ( c >> 3 ); \ 422 | b -= c; b -= a; b ^= ( a << 10 ); \ 423 | c
-= a; c -= b; c ^= ( b >> 15 ); \ 424 | } while (0) 425 | /* Jenkins hash: mixes the key 12 bytes per round with HASH_JEN_MIX, then adds the key length and the 0-11 leftover bytes before one final mix. bkt is hashv masked by num_bkts-1. */ 426 | #define HASH_JEN(key,keylen,num_bkts,hashv,bkt) \ 427 | do { \ 428 | unsigned _hj_i,_hj_j,_hj_k; \ 429 | unsigned char *_hj_key=(unsigned char*)(key); \ 430 | hashv = 0xfeedbeef; \ 431 | _hj_i = _hj_j = 0x9e3779b9; \ 432 | _hj_k = (unsigned)(keylen); \ 433 | while (_hj_k >= 12) { \ 434 | _hj_i += (_hj_key[0] + ( (unsigned)_hj_key[1] << 8 ) \ 435 | + ( (unsigned)_hj_key[2] << 16 ) \ 436 | + ( (unsigned)_hj_key[3] << 24 ) ); \ 437 | _hj_j += (_hj_key[4] + ( (unsigned)_hj_key[5] << 8 ) \ 438 | + ( (unsigned)_hj_key[6] << 16 ) \ 439 | + ( (unsigned)_hj_key[7] << 24 ) ); \ 440 | hashv += (_hj_key[8] + ( (unsigned)_hj_key[9] << 8 ) \ 441 | + ( (unsigned)_hj_key[10] << 16 ) \ 442 | + ( (unsigned)_hj_key[11] << 24 ) ); \ 443 | \ 444 | HASH_JEN_MIX(_hj_i, _hj_j, hashv); \ 445 | \ 446 | _hj_key += 12; \ 447 | _hj_k -= 12; \ 448 | } \ 449 | hashv += keylen; \ 450 | switch ( _hj_k ) { /* every case deliberately falls through to the cases below it */ \ 451 | case 11: hashv += ( (unsigned)_hj_key[10] << 24 ); \ 452 | case 10: hashv += ( (unsigned)_hj_key[9] << 16 ); \ 453 | case 9: hashv += ( (unsigned)_hj_key[8] << 8 ); \ 454 | case 8: _hj_j += ( (unsigned)_hj_key[7] << 24 ); \ 455 | case 7: _hj_j += ( (unsigned)_hj_key[6] << 16 ); \ 456 | case 6: _hj_j += ( (unsigned)_hj_key[5] << 8 ); \ 457 | case 5: _hj_j += _hj_key[4]; \ 458 | case 4: _hj_i += ( (unsigned)_hj_key[3] << 24 ); \ 459 | case 3: _hj_i += ( (unsigned)_hj_key[2] << 16 ); \ 460 | case 2: _hj_i += ( (unsigned)_hj_key[1] << 8 ); \ 461 | case 1: _hj_i += _hj_key[0]; \ 462 | } \ 463 | HASH_JEN_MIX(_hj_i, _hj_j, hashv); \ 464 | bkt = hashv & (num_bkts-1); \ 465 | } while(0) 466 | 467 | /* The Paul Hsieh hash function. get16bits(d) reads the two bytes at d as one 16-bit value; on the compilers/targets tested here it is a direct uint16_t load. */ 468 | #undef get16bits 469 | #if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \ 470 | || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__) 471 | #define get16bits(d) (*((const uint16_t *) (d))) 472 | #endif 473 | 474 | #if !defined (get16bits) 475 | #define
get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8) \ 476 | +(uint32_t)(((const uint8_t *)(d))[0]) ) 477 | #endif /* !defined (get16bits): byte-wise form, low byte taken from d[0] */ 478 | #define HASH_SFH(key,keylen,num_bkts,hashv,bkt) \ 479 | do { \ 480 | unsigned char *_sfh_key=(unsigned char*)(key); \ 481 | uint32_t _sfh_tmp, _sfh_len = keylen; \ 482 | \ 483 | int _sfh_rem = _sfh_len & 3; \ 484 | _sfh_len >>= 2; \ 485 | hashv = 0xcafebabe; \ 486 | \ 487 | /* Main loop: consumes four key bytes per iteration */ \ 488 | for (;_sfh_len > 0; _sfh_len--) { \ 489 | hashv += get16bits (_sfh_key); \ 490 | _sfh_tmp = (uint32_t)(get16bits (_sfh_key+2)) << 11 ^ hashv; \ 491 | hashv = (hashv << 16) ^ _sfh_tmp; \ 492 | _sfh_key += 2*sizeof (uint16_t); \ 493 | hashv += hashv >> 11; \ 494 | } \ 495 | \ 496 | /* Handle end cases: the 1-3 bytes (_sfh_rem) left after the four-byte blocks */ \ 497 | switch (_sfh_rem) { \ 498 | case 3: hashv += get16bits (_sfh_key); \ 499 | hashv ^= hashv << 16; \ 500 | hashv ^= (uint32_t)(_sfh_key[sizeof (uint16_t)] << 18); \ 501 | hashv += hashv >> 11; \ 502 | break; \ 503 | case 2: hashv += get16bits (_sfh_key); \ 504 | hashv ^= hashv << 11; \ 505 | hashv += hashv >> 17; \ 506 | break; \ 507 | case 1: hashv += *_sfh_key; \ 508 | hashv ^= hashv << 10; \ 509 | hashv += hashv >> 1; \ 510 | } \ 511 | \ 512 | /* Force "avalanching" of final 127 bits */ \ 513 | hashv ^= hashv << 3; \ 514 | hashv += hashv >> 5; \ 515 | hashv ^= hashv << 4; \ 516 | hashv += hashv >> 17; \ 517 | hashv ^= hashv << 25; \ 518 | hashv += hashv >> 6; \ 519 | bkt = hashv & (num_bkts-1); \ 520 | } while(0) 521 | 522 | #ifdef HASH_USING_NO_STRICT_ALIASING 523 | /* The MurmurHash exploits some CPUs' (x86, x86_64) tolerance for unaligned reads. 524 | * For other types of CPUs (e.g. Sparc) an unaligned read causes a bus error. 525 | * MurmurHash uses the faster approach only on CPUs where we know it's safe.
526 | * 527 | * Note the preprocessor built-in defines can be emitted using: 528 | * 529 | * gcc -m64 -dM -E - < /dev/null (on gcc) 530 | * cc -## a.c (where a.c is a simple test file) (Sun Studio) 531 | */ 532 | #if (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86)) 533 | #define MUR_GETBLOCK(p,i) p[i] 534 | #else /* non intel */ 535 | #define MUR_PLUS0_ALIGNED(p) (((unsigned long)p & 0x3) == 0) 536 | #define MUR_PLUS1_ALIGNED(p) (((unsigned long)p & 0x3) == 1) 537 | #define MUR_PLUS2_ALIGNED(p) (((unsigned long)p & 0x3) == 2) 538 | #define MUR_PLUS3_ALIGNED(p) (((unsigned long)p & 0x3) == 3) 539 | #define WP(p) ((uint32_t*)((unsigned long)(p) & ~3UL)) 540 | #if (defined(__BIG_ENDIAN__) || defined(SPARC) || defined(__ppc__) || defined(__ppc64__)) 541 | #define MUR_THREE_ONE(p) ((((*WP(p))&0x00ffffff) << 8) | (((*(WP(p)+1))&0xff000000) >> 24)) 542 | #define MUR_TWO_TWO(p) ((((*WP(p))&0x0000ffff) <<16) | (((*(WP(p)+1))&0xffff0000) >> 16)) 543 | #define MUR_ONE_THREE(p) ((((*WP(p))&0x000000ff) <<24) | (((*(WP(p)+1))&0xffffff00) >> 8)) 544 | #else /* assume little endian non-intel */ 545 | #define MUR_THREE_ONE(p) ((((*WP(p))&0xffffff00) >> 8) | (((*(WP(p)+1))&0x000000ff) << 24)) 546 | #define MUR_TWO_TWO(p) ((((*WP(p))&0xffff0000) >>16) | (((*(WP(p)+1))&0x0000ffff) << 16)) 547 | #define MUR_ONE_THREE(p) ((((*WP(p))&0xff000000) >>24) | (((*(WP(p)+1))&0x00ffffff) << 8)) 548 | #endif /* WP(p) rounds p down to a 4-byte boundary; the MUR_x_y macros splice one 32-bit value out of the two aligned words around p. NOTE(review): only the aligned branch of MUR_GETBLOCK below uses i; the three unaligned branches are expanded with p alone - confirm they are meant to ignore the block index. */ 549 | #define MUR_GETBLOCK(p,i) (MUR_PLUS0_ALIGNED(p) ? ((p)[i]) : \ 550 | (MUR_PLUS1_ALIGNED(p) ? MUR_THREE_ONE(p) : \ 551 | (MUR_PLUS2_ALIGNED(p) ?
MUR_TWO_TWO(p) : \ 552 | MUR_ONE_THREE(p)))) 553 | #endif 554 | #define MUR_ROTL32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) 555 | #define MUR_FMIX(_h) \ 556 | do { \ 557 | _h ^= _h >> 16; \ 558 | _h *= 0x85ebca6b; \ 559 | _h ^= _h >> 13; \ 560 | _h *= 0xc2b2ae35l; \ 561 | _h ^= _h >> 16; \ 562 | } while(0) 563 | /* MurmurHash of the key: whole 4-byte blocks are read through MUR_GETBLOCK, then the 1-3 tail bytes and the key length are mixed in and MUR_FMIX finishes the value. _mur_blocks points just past the last whole block, so the loop indexes it with a negative _mur_i that counts up to 0. */ 564 | #define HASH_MUR(key,keylen,num_bkts,hashv,bkt) \ 565 | do { \ 566 | const uint8_t *_mur_data = (const uint8_t*)(key); \ 567 | const int _mur_nblocks = (keylen) / 4; \ 568 | uint32_t _mur_h1 = 0xf88D5353; \ 569 | uint32_t _mur_c1 = 0xcc9e2d51; \ 570 | uint32_t _mur_c2 = 0x1b873593; \ 571 | uint32_t _mur_k1 = 0; \ 572 | const uint8_t *_mur_tail; \ 573 | const uint32_t *_mur_blocks = (const uint32_t*)(_mur_data+_mur_nblocks*4); \ 574 | int _mur_i; \ 575 | for(_mur_i = -_mur_nblocks; _mur_i; _mur_i++) { \ 576 | _mur_k1 = MUR_GETBLOCK(_mur_blocks,_mur_i); \ 577 | _mur_k1 *= _mur_c1; \ 578 | _mur_k1 = MUR_ROTL32(_mur_k1,15); \ 579 | _mur_k1 *= _mur_c2; \ 580 | \ 581 | _mur_h1 ^= _mur_k1; \ 582 | _mur_h1 = MUR_ROTL32(_mur_h1,13); \ 583 | _mur_h1 = _mur_h1*5+0xe6546b64; \ 584 | } \ 585 | _mur_tail = (const uint8_t*)(_mur_data + _mur_nblocks*4); \ 586 | _mur_k1=0; \ 587 | switch((keylen) & 3) { /* cases 3 and 2 deliberately fall through; nothing is mixed when no tail bytes remain */ \ 588 | case 3: _mur_k1 ^= _mur_tail[2] << 16; \ 589 | case 2: _mur_k1 ^= _mur_tail[1] << 8; \ 590 | case 1: _mur_k1 ^= _mur_tail[0]; \ 591 | _mur_k1 *= _mur_c1; \ 592 | _mur_k1 = MUR_ROTL32(_mur_k1,15); \ 593 | _mur_k1 *= _mur_c2; \ 594 | _mur_h1 ^= _mur_k1; \ 595 | } \ 596 | _mur_h1 ^= (keylen); \ 597 | MUR_FMIX(_mur_h1); \ 598 | hashv = _mur_h1; \ 599 | bkt = hashv & (num_bkts-1); \ 600 | } while(0) 601 | #endif /* HASH_USING_NO_STRICT_ALIASING */ 602 | 603 | /* key comparison function; return 0 if keys equal (plain byte-wise memcmp over len bytes) */ 604 | #define HASH_KEYCMP(a,b,len) memcmp(a,b,len) 605 | 606 | /* iterate over items in a known bucket to find desired item; out is left NULL when no item in the chain has the same key length and key bytes */ 607 | #define HASH_FIND_IN_BKT(tbl,hh,head,keyptr,keylen_in,out) \ 608 | do { \ 609 | if (head.hh_head)
DECLTYPE_ASSIGN(out,ELMT_FROM_HH(tbl,head.hh_head)); \ 610 | else out=NULL; \ 611 | while (out) { \ 612 | if ((out)->hh.keylen == keylen_in) { \ 613 | if ((HASH_KEYCMP((out)->hh.key,keyptr,keylen_in)) == 0) break; \ 614 | } \ 615 | if ((out)->hh.hh_next) DECLTYPE_ASSIGN(out,ELMT_FROM_HH(tbl,(out)->hh.hh_next)); \ 616 | else out = NULL; \ 617 | } \ 618 | } while(0) 619 | 620 | /* add an item to the front of a bucket's chain; once the chain reaches (expand_mult+1) * HASH_BKT_CAPACITY_THRESH items the bucket array is expanded, unless the table's noexpand flag is 1 */ 621 | #define HASH_ADD_TO_BKT(head,addhh) \ 622 | do { \ 623 | head.count++; \ 624 | (addhh)->hh_next = head.hh_head; \ 625 | (addhh)->hh_prev = NULL; \ 626 | if (head.hh_head) { (head).hh_head->hh_prev = (addhh); } \ 627 | (head).hh_head=addhh; \ 628 | if (head.count >= ((head.expand_mult+1) * HASH_BKT_CAPACITY_THRESH) \ 629 | && (addhh)->tbl->noexpand != 1) { \ 630 | HASH_EXPAND_BUCKETS((addhh)->tbl); \ 631 | } \ 632 | } while(0) 633 | 634 | /* remove an item from a given bucket: decrements the bucket count and unlinks hh_del from the chain. NOTE(review): unlike the neighbouring macros this expands to several bare statements with no do { } while (0) wrapper, so it is only safe where a statement sequence is valid (e.g. inside a braced block) - confirm at the call sites. */ 635 | #define HASH_DEL_IN_BKT(hh,head,hh_del) \ 636 | (head).count--; \ 637 | if ((head).hh_head == hh_del) { \ 638 | (head).hh_head = hh_del->hh_next; \ 639 | } \ 640 | if (hh_del->hh_prev) { \ 641 | hh_del->hh_prev->hh_next = hh_del->hh_next; \ 642 | } \ 643 | if (hh_del->hh_next) { \ 644 | hh_del->hh_next->hh_prev = hh_del->hh_prev; \ 645 | } 646 | 647 | /* Bucket expansion has the effect of doubling the number of buckets 648 | * and redistributing the items into the new buckets. Ideally the 649 | * items will distribute more or less evenly into the new buckets 650 | * (the extent to which this is true is a measure of the quality of 651 | * the hash function as it applies to the key domain). 652 | * 653 | * With the items distributed into more buckets, the chain length 654 | * (item count) in each bucket is reduced. Thus by expanding buckets 655 | * the hash keeps a bound on the chain length. This bounded chain 656 | * length is the essence of how a hash provides constant time lookup.
657 | * 658 | * The calculation of tbl->ideal_chain_maxlen below deserves some 659 | * explanation. First, keep in mind that we're calculating the ideal 660 | * maximum chain length based on the *new* (doubled) bucket count. 661 | * In fractions this is just n/b (n=number of items,b=new num buckets). 662 | * Since the ideal chain length is an integer, we want to calculate 663 | * ceil(n/b). We don't depend on floating point arithmetic in this 664 | * hash, so to calculate ceil(n/b) with integers we could write 665 | * 666 | * ceil(n/b) = (n/b) + ((n%b)?1:0) 667 | * 668 | * and in fact a previous version of this hash did just that. 669 | * But now we have improved things a bit by recognizing that b is 670 | * always a power of two. We keep its base 2 log handy (call it lb), 671 | * so now we can write this with a bit shift and logical AND: 672 | * 673 | * ceil(n/b) = (n>>lb) + ( (n & (b-1)) ? 1:0) 674 | * 675 | */ 676 | #define HASH_EXPAND_BUCKETS(tbl) \ 677 | do { \ 678 | unsigned _he_bkt; \ 679 | unsigned _he_bkt_i; \ 680 | struct UT_hash_handle *_he_thh, *_he_hh_nxt; \ 681 | UT_hash_bucket *_he_new_buckets, *_he_newbkt; /* the new array holds twice the current bucket count and is zero-filled before use */ \ 682 | _he_new_buckets = (UT_hash_bucket*)uthash_malloc( \ 683 | 2 * tbl->num_buckets * sizeof(struct UT_hash_bucket)); \ 684 | if (!_he_new_buckets) { uthash_fatal( "out of memory"); } \ 685 | memset(_he_new_buckets, 0, \ 686 | 2 * tbl->num_buckets * sizeof(struct UT_hash_bucket)); /* next: ceil(num_items / doubled bucket count), using the shift-and-mask form described above */ \ 687 | tbl->ideal_chain_maxlen = \ 688 | (tbl->num_items >> (tbl->log2_num_buckets+1)) + \ 689 | ((tbl->num_items & ((tbl->num_buckets*2)-1)) ?
1 : 0); \ 690 | tbl->nonideal_items = 0; \ 691 | for(_he_bkt_i = 0; _he_bkt_i < tbl->num_buckets; _he_bkt_i++) \ 692 | { \ 693 | _he_thh = tbl->buckets[ _he_bkt_i ].hh_head; \ 694 | while (_he_thh) { \ 695 | _he_hh_nxt = _he_thh->hh_next; /* saved first: hh_next is overwritten when the handle is relinked below */ \ 696 | HASH_TO_BKT( _he_thh->hashv, tbl->num_buckets*2, _he_bkt); \ 697 | _he_newbkt = &(_he_new_buckets[ _he_bkt ]); \ 698 | if (++(_he_newbkt->count) > tbl->ideal_chain_maxlen) { \ 699 | tbl->nonideal_items++; \ 700 | _he_newbkt->expand_mult = _he_newbkt->count / \ 701 | tbl->ideal_chain_maxlen; \ 702 | } \ 703 | _he_thh->hh_prev = NULL; /* push the handle onto the head of its new bucket's chain */ \ 704 | _he_thh->hh_next = _he_newbkt->hh_head; \ 705 | if (_he_newbkt->hh_head) _he_newbkt->hh_head->hh_prev = \ 706 | _he_thh; \ 707 | _he_newbkt->hh_head = _he_thh; \ 708 | _he_thh = _he_hh_nxt; \ 709 | } \ 710 | } \ 711 | uthash_free( tbl->buckets, tbl->num_buckets*sizeof(struct UT_hash_bucket) ); \ 712 | tbl->num_buckets *= 2; \ 713 | tbl->log2_num_buckets++; \ 714 | tbl->buckets = _he_new_buckets; /* next: an expand that leaves more than half the items nonideal bumps ineff_expands, any other expand resets it to 0; once it exceeds 1 further expansion is switched off */ \ 715 | tbl->ineff_expands = (tbl->nonideal_items > (tbl->num_items >> 1)) ? \ 716 | (tbl->ineff_expands+1) : 0; \ 717 | if (tbl->ineff_expands > 1) { \ 718 | tbl->noexpand=1; \ 719 | uthash_noexpand_fyi(tbl); \ 720 | } \ 721 | uthash_expand_fyi(tbl); \ 722 | } while(0) 723 | 724 | 725 | /* This is an adaptation of Simon Tatham's O(n log(n)) mergesort */ 726 | /* Note that HASH_SORT assumes the hash handle name to be hh. 727 | * HASH_SRT was added to allow the hash handle name to be passed in.
*/ 728 | #define HASH_SORT(head,cmpfcn) HASH_SRT(hh,head,cmpfcn) 729 | #define HASH_SRT(hh,head,cmpfcn) \ 730 | do { \ 731 | unsigned _hs_i; \ 732 | unsigned _hs_looping,_hs_nmerges,_hs_insize,_hs_psize,_hs_qsize; \ 733 | struct UT_hash_handle *_hs_p, *_hs_q, *_hs_e, *_hs_list, *_hs_tail; \ 734 | if (head) { \ 735 | _hs_insize = 1; /* length of the runs merged in the current pass */ \ 736 | _hs_looping = 1; \ 737 | _hs_list = &((head)->hh); \ 738 | while (_hs_looping) { \ 739 | _hs_p = _hs_list; \ 740 | _hs_list = NULL; \ 741 | _hs_tail = NULL; \ 742 | _hs_nmerges = 0; \ 743 | while (_hs_p) { \ 744 | _hs_nmerges++; \ 745 | _hs_q = _hs_p; \ 746 | _hs_psize = 0; /* advance q up to _hs_insize handles past p, counting the p run as we go */ \ 747 | for ( _hs_i = 0; _hs_i < _hs_insize; _hs_i++ ) { \ 748 | _hs_psize++; \ 749 | _hs_q = (UT_hash_handle*)((_hs_q->next) ? \ 750 | ((void*)((char*)(_hs_q->next) + \ 751 | (head)->hh.tbl->hho)) : NULL); \ 752 | if (! (_hs_q) ) break; \ 753 | } \ 754 | _hs_qsize = _hs_insize; \ 755 | while ((_hs_psize > 0) || ((_hs_qsize > 0) && _hs_q )) { \ 756 | if (_hs_psize == 0) { /* p run used up: take the next handle from q */ \ 757 | _hs_e = _hs_q; \ 758 | _hs_q = (UT_hash_handle*)((_hs_q->next) ? \ 759 | ((void*)((char*)(_hs_q->next) + \ 760 | (head)->hh.tbl->hho)) : NULL); \ 761 | _hs_qsize--; \ 762 | } else if ( (_hs_qsize == 0) || !(_hs_q) ) { /* q run used up: take from p */ \ 763 | _hs_e = _hs_p; \ 764 | if (_hs_p){ \ 765 | _hs_p = (UT_hash_handle*)((_hs_p->next) ? \ 766 | ((void*)((char*)(_hs_p->next) + \ 767 | (head)->hh.tbl->hho)) : NULL); \ 768 | } \ 769 | _hs_psize--; \ 770 | } else if (( \ 771 | cmpfcn(DECLTYPE(head)(ELMT_FROM_HH((head)->hh.tbl,_hs_p)), \ 772 | DECLTYPE(head)(ELMT_FROM_HH((head)->hh.tbl,_hs_q))) \ 773 | ) <= 0) { /* cmpfcn(p,q) <= 0: p goes first, ties included */ \ 774 | _hs_e = _hs_p; \ 775 | if (_hs_p){ \ 776 | _hs_p = (UT_hash_handle*)((_hs_p->next) ? \ 777 | ((void*)((char*)(_hs_p->next) + \ 778 | (head)->hh.tbl->hho)) : NULL); \ 779 | } \ 780 | _hs_psize--; \ 781 | } else { /* otherwise q goes first */ \ 782 | _hs_e = _hs_q; \ 783 | _hs_q = (UT_hash_handle*)((_hs_q->next) ?
\ 784 | ((void*)((char*)(_hs_q->next) + \ 785 | (head)->hh.tbl->hho)) : NULL); \ 786 | _hs_qsize--; \ 787 | } \ 788 | if ( _hs_tail ) { /* append the chosen handle _hs_e to the merged list */ \ 789 | _hs_tail->next = ((_hs_e) ? \ 790 | ELMT_FROM_HH((head)->hh.tbl,_hs_e) : NULL); \ 791 | } else { \ 792 | _hs_list = _hs_e; \ 793 | } \ 794 | if (_hs_e) { \ 795 | _hs_e->prev = ((_hs_tail) ? \ 796 | ELMT_FROM_HH((head)->hh.tbl,_hs_tail) : NULL); \ 797 | } \ 798 | _hs_tail = _hs_e; \ 799 | } \ 800 | _hs_p = _hs_q; \ 801 | } \ 802 | if (_hs_tail){ \ 803 | _hs_tail->next = NULL; \ 804 | } \ 805 | if ( _hs_nmerges <= 1 ) { /* at most one merge in this pass: the list is sorted, so publish the new tail and head */ \ 806 | _hs_looping=0; \ 807 | (head)->hh.tbl->tail = _hs_tail; \ 808 | DECLTYPE_ASSIGN(head,ELMT_FROM_HH((head)->hh.tbl, _hs_list)); \ 809 | } \ 810 | _hs_insize *= 2; \ 811 | } \ 812 | HASH_FSCK(hh,head); \ 813 | } \ 814 | } while (0) 815 | 816 | /* This macro selects items from one hash into another hash. 817 | * The end result is that the selected items have dual presence 818 | * in both hashes. There is no copy of the items made; rather 819 | * they are added into the new hash through a secondary 820 | * hash handle that must be present in the structure.
*/ 821 | #define HASH_SELECT(hh_dst, dst, hh_src, src, cond) \ 822 | do { \ 823 | unsigned _src_bkt, _dst_bkt; \ 824 | void *_last_elt=NULL, *_elt; \ 825 | UT_hash_handle *_src_hh, *_dst_hh, *_last_elt_hh=NULL; \ 826 | ptrdiff_t _dst_hho = ((char*)(&(dst)->hh_dst) - (char*)(dst)); /* byte offset of the hh_dst handle inside an element */ \ 827 | if (src) { \ 828 | for(_src_bkt=0; _src_bkt < (src)->hh_src.tbl->num_buckets; _src_bkt++) { \ 829 | for(_src_hh = (src)->hh_src.tbl->buckets[_src_bkt].hh_head; \ 830 | _src_hh; \ 831 | _src_hh = _src_hh->hh_next) { \ 832 | _elt = ELMT_FROM_HH((src)->hh_src.tbl, _src_hh); \ 833 | if (cond(_elt)) { \ 834 | _dst_hh = (UT_hash_handle*)(((char*)_elt) + _dst_hho); \ 835 | _dst_hh->key = _src_hh->key; /* key pointer, length and hash value are reused from the source handle, not recomputed */ \ 836 | _dst_hh->keylen = _src_hh->keylen; \ 837 | _dst_hh->hashv = _src_hh->hashv; \ 838 | _dst_hh->prev = _last_elt; \ 839 | _dst_hh->next = NULL; \ 840 | if (_last_elt_hh) { _last_elt_hh->next = _elt; } \ 841 | if (!dst) { /* first selected element becomes the head and gets a fresh table */ \ 842 | DECLTYPE_ASSIGN(dst,_elt); \ 843 | HASH_MAKE_TABLE(hh_dst,dst); \ 844 | } else { \ 845 | _dst_hh->tbl = (dst)->hh_dst.tbl; \ 846 | } \ 847 | HASH_TO_BKT(_dst_hh->hashv, _dst_hh->tbl->num_buckets, _dst_bkt); \ 848 | HASH_ADD_TO_BKT(_dst_hh->tbl->buckets[_dst_bkt],_dst_hh); \ 849 | (dst)->hh_dst.tbl->num_items++; \ 850 | _last_elt = _elt; \ 851 | _last_elt_hh = _dst_hh; \ 852 | } \ 853 | } \ 854 | } \ 855 | } \ 856 | HASH_FSCK(hh_dst,dst); \ 857 | } while (0) 858 | /* HASH_CLEAR releases the bucket array, the bloom data (via HASH_BLOOM_FREE) and the table itself, then sets head to NULL; the elements themselves are not freed here. */ 859 | #define HASH_CLEAR(hh,head) \ 860 | do { \ 861 | if (head) { \ 862 | uthash_free((head)->hh.tbl->buckets, \ 863 | (head)->hh.tbl->num_buckets*sizeof(struct UT_hash_bucket)); \ 864 | HASH_BLOOM_FREE((head)->hh.tbl); \ 865 | uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ 866 | (head)=NULL; \ 867 | } \ 868 | } while(0) 869 | /* HASH_OVERHEAD sums the bookkeeping bytes: one hash handle per item, the bucket array, the table struct and HASH_BLOOM_BYTELEN. head is dereferenced without a NULL check. */ 870 | #define HASH_OVERHEAD(hh,head) \ 871 | (size_t)((((head)->hh.tbl->num_items * sizeof(UT_hash_handle)) + \ 872 | ((head)->hh.tbl->num_buckets * sizeof(UT_hash_bucket)) + \ 873 | (sizeof(UT_hash_table)) + \ 874 | (HASH_BLOOM_BYTELEN))) 875 | 876 | #ifdef NO_DECLTYPE
877 | #define HASH_ITER(hh,head,el,tmp) \ 878 | for((el)=(head), (*(char**)(&(tmp)))=(char*)((head)?(head)->hh.next:NULL); \ 879 | el; (el)=(tmp),(*(char**)(&(tmp)))=(char*)((tmp)?(tmp)->hh.next:NULL)) 880 | #else /* DECLTYPE available: same loop, with tmp assigned through a DECLTYPE cast instead of a char** alias */ 881 | #define HASH_ITER(hh,head,el,tmp) \ 882 | for((el)=(head),(tmp)=DECLTYPE(el)((head)?(head)->hh.next:NULL); \ 883 | el; (el)=(tmp),(tmp)=DECLTYPE(el)((tmp)?(tmp)->hh.next:NULL)) 884 | #endif /* NO_DECLTYPE */ 885 | /* HASH_ITER walks the elements in app order; tmp always holds the element after el and is fetched before the loop body runs. */ 886 | /* obtain a count of items in the hash (0 for a NULL head) */ 887 | #define HASH_COUNT(head) HASH_CNT(hh,head) 888 | #define HASH_CNT(hh,head) ((head)?((head)->hh.tbl->num_items):0) 889 | 890 | typedef struct UT_hash_bucket { 891 | struct UT_hash_handle *hh_head; /* first hash handle in this bucket's chain, NULL when empty */ 892 | unsigned count; /* number of handles in the chain */ 893 | 894 | /* expand_mult is normally set to 0. In this situation, the max chain length 895 | * threshold is enforced at its default value, HASH_BKT_CAPACITY_THRESH. (If 896 | * the bucket's chain exceeds this length, bucket expansion is triggered). 897 | * However, setting expand_mult to a non-zero value delays bucket expansion 898 | * (that would be triggered by additions to this particular bucket) 899 | * until its chain length reaches a *multiple* of HASH_BKT_CAPACITY_THRESH. 900 | * (The multiplier is simply expand_mult+1). The whole idea of this 901 | * multiplier is to reduce bucket expansions, since they are expensive, in 902 | * situations where we know that a particular bucket tends to be overused. 903 | * It is better to let its chain length grow to a longer yet-still-bounded 904 | * value, than to do an O(n) bucket expansion too often.
905 | */ 906 | unsigned expand_mult; 907 | 908 | } UT_hash_bucket; 909 | 910 | /* random signature used only to find hash tables in external analysis */ 911 | #define HASH_SIGNATURE 0xa0111fe1 912 | #define HASH_BLOOM_SIGNATURE 0xb12220f2 913 | 914 | typedef struct UT_hash_table { 915 | UT_hash_bucket *buckets; /* bucket array, num_buckets entries */ 916 | unsigned num_buckets, log2_num_buckets; /* bucket count (doubled on each expansion) and its base-2 log */ 917 | unsigned num_items; /* items currently in the hash */ 918 | struct UT_hash_handle *tail; /* tail hh in app order, for fast append */ 919 | ptrdiff_t hho; /* hash handle offset (byte pos of hash handle in element) */ 920 | 921 | /* in an ideal situation (all buckets used equally), no bucket would have 922 | * more than ceil(#items/#buckets) items. that's the ideal chain length. */ 923 | unsigned ideal_chain_maxlen; 924 | 925 | /* nonideal_items is the number of items in the hash whose chain position 926 | * exceeds the ideal chain maxlen. these items pay the penalty for an uneven 927 | * hash distribution; reaching them in a chain traversal takes >ideal steps */ 928 | unsigned nonideal_items; 929 | 930 | /* ineffective expands occur when a bucket doubling was performed, but 931 | * afterward, more than half the items in the hash had nonideal chain 932 | * positions. If this happens on two consecutive expansions we inhibit any 933 | * further expansion, as it's not helping; this happens when the hash 934 | * function isn't a good fit for the key domain. When expansion is inhibited 935 | * the hash will still work, albeit no longer in constant time.
*/ 936 | unsigned ineff_expands, noexpand; /* running count of consecutive ineffective expands; noexpand is set to 1 to stop further bucket expansion */ 937 | 938 | uint32_t signature; /* used only to find hash tables in external analysis */ 939 | #ifdef HASH_BLOOM 940 | uint32_t bloom_sig; /* used only to test bloom exists in external analysis */ 941 | uint8_t *bloom_bv; /* presumably the bloom filter's bit vector - it is managed outside this part of the file; confirm */ 942 | char bloom_nbits; /* presumably the filter size expressed as a bit count exponent - confirm against the HASH_BLOOM macros */ 943 | #endif /* HASH_BLOOM */ 944 | 945 | } UT_hash_table; 946 | /* One UT_hash_handle is embedded in every element; it links the element into both the app-order list (prev/next, element pointers) and its bucket chain (hh_prev/hh_next, handle pointers). */ 947 | typedef struct UT_hash_handle { 948 | struct UT_hash_table *tbl; /* table this handle belongs to */ 949 | void *prev; /* prev element in app order */ 950 | void *next; /* next element in app order */ 951 | struct UT_hash_handle *hh_prev; /* previous hh in bucket order */ 952 | struct UT_hash_handle *hh_next; /* next hh in bucket order */ 953 | void *key; /* ptr to enclosing struct's key */ 954 | unsigned keylen; /* enclosing struct's key len */ 955 | unsigned hashv; /* result of hash-fcn(key) */ 956 | } UT_hash_handle; 957 | 958 | #endif /* UTHASH_H */ 959 | --------------------------------------------------------------------------------