├── src
    ├── seqdiff.h
    ├── pullseq.h
    ├── file_read.h
    ├── pull_by_size.h
    ├── pull_by_re.h
    ├── pull_by_name.h
    ├── size_filter.h
    ├── search_header.h
    ├── output.h
    ├── hash.h
    ├── CMakeLists.txt
    ├── linked_list.h
    ├── seqdiff_results.h
    ├── bst.h
    ├── global.h
    ├── cmpseq.h
    ├── seqdiff_results.c
    ├── test_linked_list.c
    ├── search_header.c
    ├── hash.c
    ├── size_filter.c
    ├── pull_by_size.c
    ├── linked_list.c
    ├── pull_by_name.c
    ├── pull_by_re.c
    ├── file_read.c
    ├── output.c
    ├── cmpseq.c
    ├── bst.c
    ├── seqdiff.c
    ├── kseq.h
    ├── pullseq.c
    └── uthash.h
├── AUTHORS
├── NEWS
├── test
    ├── utest_d.fa
    ├── utest_c.fa
    ├── test.txt
    ├── pull_header.rb
    ├── test.fa
    ├── utest_b.fa
    └── utest_a.fa
├── CMakeLists.txt
├── .gitignore
├── cmake
    └── FindPCRE2.cmake
├── COPYING
├── ChangeLog
├── README
└── INSTALL


/src/seqdiff.h:
--------------------------------------------------------------------------------
1 | #ifndef SEQDIFF_H
2 | #define SEQDIFF_H
3 | 
4 | 
5 | #endif
6 | 


--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
1 | pullseq/seqdiff was written by Brian C. Thomas (bct.x42@gmail.com)
2 | Copyright 2015
3 | 


--------------------------------------------------------------------------------
/src/pullseq.h:
--------------------------------------------------------------------------------
1 | #ifndef PULLSEQ_H
2 | #define PULLSEQ_H
3 | 
4 | #define PULLSEQ_SORTMETHOD "UTHASH"
5 | 
6 | #endif
7 | 


--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
1 | 03/23/2023: convert pullseq to use cmake
2 | 02/27/2013: converted pullseq to use autotools for
3 |             configuration/building/installation
4 | 


--------------------------------------------------------------------------------
/src/file_read.h:
--------------------------------------------------------------------------------
1 | #ifndef FILE_READ_H
2 | #define FILE_READ_H
3 | 
4 | #include <stdio.h> /* FILE, used by getl() below; keeps this header self-contained */
5 | 
6 | int getl(char **line, FILE *fp); /* callers loop until it returns -1; *line may be replaced — TODO confirm in file_read.c */
7 | char *parse_name(char *line);    /* callers treat a NULL result as "no name on this line" */
8 | #endif
9 | 


--------------------------------------------------------------------------------
/test/utest_d.fa:
--------------------------------------------------------------------------------
1 | >test
2 | MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPS
3 | >testa
4 | MAFSADVLKERRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPS
5 | 


--------------------------------------------------------------------------------
/test/utest_c.fa:
--------------------------------------------------------------------------------
1 | >test
2 | MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPS
3 | >testa
4 | MAFSADVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPS
5 | 


--------------------------------------------------------------------------------
/src/pull_by_size.h:
--------------------------------------------------------------------------------
1 | #ifndef PULL_BY_SIZE_H
2 | #define PULL_BY_SIZE_H
3 | /* Runs size_filter() over every record of input_file and returns how many passed; with just_count set the totals are written to stdout. */
4 | int pull_by_size(char *input_file, int min, int max, int length, int convert, int just_count);
5 | 
6 | #endif
7 | 


--------------------------------------------------------------------------------
/src/pull_by_re.h:
--------------------------------------------------------------------------------
1 | #ifndef PULL_BY_RE_H
2 | #define PULL_BY_RE_H
3 | /* Regex-driven counterpart of pull_by_name(); defined in pull_by_re.c. NOTE(review): parameters presumably mirror pull_by_name()/pull_by_size() — confirm against the definition. */
4 | int pull_by_re(char *input_file, char *aStrRegex, int min, int max, int length, int exclude, int convert, int just_count);
5 | 
6 | #endif
7 | 


--------------------------------------------------------------------------------
/src/pull_by_name.h:
--------------------------------------------------------------------------------
1 | #ifndef PULL_BY_NAME_H
2 | #define PULL_BY_NAME_H
3 | 
4 | #include <stdio.h> /* FILE, used by the prototype below; keeps this header self-contained */
5 | 
6 | /* Keeps (or, with exclude set, drops) the records of input_fasta whose names are listed in names_fp; returns how many records passed. */
7 | int pull_by_name(char *input_fasta, FILE *names_fp, int min, int max, int length, int exclude, int convert, int just_count);
8 | #endif
9 | 


--------------------------------------------------------------------------------
/src/size_filter.h:
--------------------------------------------------------------------------------
1 | #ifndef SIZE_FILTER_H
2 | #define SIZE_FILTER_H
3 | 
4 | #include "global.h"
5 | /* Returns 1 when seq passes the min/max length test (a bound <= 0 is ignored), else 0; unless just_count is set a passing record is handed to print_fasta_seq()/print_fastq_seq(), with the format swapped when convert is set. */
6 | int size_filter(kseq_t *seq, int is_fasta, int min, int max, int length, int convert, int just_count);
7 | 
8 | #endif
9 | 


--------------------------------------------------------------------------------
/src/search_header.h:
--------------------------------------------------------------------------------
 1 | #ifndef SEARCH_HEADER_H
 2 | #define SEARCH_HEADER_H
 3 | /* PCRE2 requires the code unit width to be defined before <pcre2.h> is included. */
 4 | #define PCRE2_CODE_UNIT_WIDTH 8
 5 | 
 6 | #include <pcre2.h>
 7 | /* NOTE(review): MAX_CAPTURE_COUNT is not referenced by search_header.c — confirm whether anything still uses it. */
 8 | #define MAX_CAPTURE_COUNT 30
 9 | /* Returns 1 when str matches the compiled pattern re; 0 when it does not match or str is NULL. */
10 | int search_header(pcre2_code *re, char *str);
11 | 
12 | #endif
13 | 


--------------------------------------------------------------------------------
/test/test.txt:
--------------------------------------------------------------------------------
1 | A
2 | AA
3 | BB
4 | BBB C
5 | DD E FF
6 | XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX:wqXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX:wqXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX:wqXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX:wq XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX:wq XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX:wq
7 | 


--------------------------------------------------------------------------------
/src/output.h:
--------------------------------------------------------------------------------
 1 | #ifndef OUTPUT_H
 2 | #define OUTPUT_H
 3 | /* Record writers; the definitions are in output.c. */
 4 | #include <stdio.h>
 5 | #include "global.h"
 6 | /* size_filter() calls the first two; it passes its own `length` argument as n — presumably an output line width, TODO confirm in output.c. */
 7 | void print_fasta_seq(kseq_t *seq,int n);
 8 | void print_fastq_seq(kseq_t *seq);
 9 | void print_fasta(FILE *fp,char *name, char *comment, char *seq, size_t colwidth);
10 | #endif
11 | 


--------------------------------------------------------------------------------
/src/hash.h:
--------------------------------------------------------------------------------
 1 | #ifndef HASH_H
 2 | #define HASH_H
 3 | /* Name lookup table built on uthash; all functions operate on one global hash defined in hash.c. */
 4 | #include "global.h"
 5 | 
 6 | typedef struct lookup {   /* one entry of the name hash */
 7 | 	char *name;           /* hash key; heap copy made by add_name() */
 8 | 	UT_hash_handle hh;    /* uthash bookkeeping handle */
 9 | } lookup_t;
10 | 
11 | 
12 | void add_name(char *name);         /* insert a copy of name; exits on allocation failure */
13 | lookup_t *find_name(char *name);   /* entry for name, or NULL when absent */
14 | void delete_name(lookup_t *s);     /* unlink s from the hash and free it */
15 | void delete_hash(void);            /* delete_name() every entry */
16 | void print_hash(void);             /* write each stored name to stderr */
17 | int hash_key_count(void);          /* number of entries currently stored */
18 | 
19 | #endif
20 | 


--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | message(STATUS "Building src/pullseq")
 2 | add_executable(pullseq pullseq.c
 3 | 	global.h
 4 | 	hash.c
 5 | 	output.c
 6 | 	search_header.c
 7 | 	size_filter.c
 8 | 	file_read.c
 9 | 	pull_by_name.c
10 | 	pull_by_re.c
11 | 	pull_by_size.c
12 | )
13 | 
14 | message(STATUS "Building src/seqdiff")
15 | # seqdiff.c is listed once, as the first source of the target
16 | add_executable(seqdiff seqdiff.c
17 | 	global.h
18 | 	cmpseq.c
19 | 	file_read.c
20 | 	hash.c
21 | 	output.c
22 | 	seqdiff_results.c
23 | )
24 | 


--------------------------------------------------------------------------------
/src/linked_list.h:
--------------------------------------------------------------------------------
 1 | #ifndef LINKED_LIST_H
 2 | #define LINKED_LIST_H
 3 | typedef struct _node {   /* NOTE(review): bst.h declares another struct _node and initnode(); the two headers cannot be included together */
 4 |   char *word;            /* heap copy made by initnode(), freed by delete_list() */
 5 |   struct _node *next;    /* following node, NULL at the tail */
 6 | } node ;
 7 | 
 8 | typedef struct _list_t {
 9 |   node *head;            /* first node, NULL when the list is empty */
10 |   node *end;             /* last node, NULL when the list is empty */
11 | } list_t;
12 | 
13 | void initialize_list(list_t *list);            /* set head and end to NULL */
14 | node * initnode(char *word);                   /* new unlinked node holding a copy of word; NULL on allocation failure */
15 | void add_to_list(list_t *list, node *n);       /* append n after the current end */
16 | node *search_list(list_t *list, char *word);   /* first node whose word equals word, or NULL */
17 | void delete_list(list_t *list,node *n);        /* free n and every node after it */
18 | #endif
19 | 


--------------------------------------------------------------------------------
/test/pull_header.rb:
--------------------------------------------------------------------------------
 1 | require 'nubio'
 2 | file = ARGV[0]
 3 | n = ARGV[1].to_i
 4 | range = ARGV[2].to_i
 5 | hits = Hash.new
 6 | $stderr.puts "file: #{file}; n: #{n}; range: #{range}"
 7 | while (n > 0)
 8 |   r = rand(range)
 9 |   unless hits.has_key?(r)
10 |     n -= 1
11 |     hits[r] = 1
12 |   end
13 | end
14 | 
15 | count = 0
16 | NuBio::Parser::Fastq.new(file).each do |f|
17 |   count += 1
18 |   if hits.has_key?(count)
19 |     temp = f.header.split
20 |     puts temp[0]
21 |   end
22 | end
23 | 


--------------------------------------------------------------------------------
/src/seqdiff_results.h:
--------------------------------------------------------------------------------
 1 | #ifndef SEQDIFF_RESULTS_H
 2 | #define SEQDIFF_RESULTS_H
 3 | 
 4 | #include <stdio.h>
 5 | /* Counters, file names, output streams and option flags for one seqdiff run; the exact meaning of each field is set by its users (e.g. cmpseq.c) — TODO confirm there. */
 6 | typedef struct _seqdiff_results_t {
 7 | 	int first_file_total;
 8 | 	int first_file_uniq;
 9 | 	int second_file_total;
10 | 	int second_file_uniq;
11 | 	int common;
12 | 	char *first_file;
13 | 	char *second_file;
14 | 	FILE *a_output_fp;
15 | 	FILE *b_output_fp;
16 | 	FILE *c_output_fp;
17 | 	int use_header;
18 | 	int only_summarize;
19 | } seqdiff_results_t;
20 | /* init: heap-allocates a record with every field 0/NULL and exits on allocation failure; destroy: free()s the record itself only. */
21 | seqdiff_results_t *seqdiff_results_init(void);
22 | void seqdiff_results_destroy(seqdiff_results_t *results);
23 | 
24 | #endif
25 | 


--------------------------------------------------------------------------------
/src/bst.h:
--------------------------------------------------------------------------------
 1 | #ifndef BST_H
 2 | #define BST_H
 3 | typedef struct _node {   /* NOTE(review): linked_list.h declares a different struct _node and initnode(); do not include both headers */
 4 |   char *name;
 5 |   struct _node *left;
 6 |   struct _node *right;
 7 |   struct _node *parent;
 8 | } node_t ;
 9 | 
10 | typedef struct _tree {
11 | 	node_t *root;
12 | } tree_t;
13 | /* The definitions are in bst.c; return-value conventions are not visible from this header — TODO confirm there. */
14 | node_t *initnode(char *word);
15 | int insertnode(tree_t *tree, char *name);
16 | node_t *searchtree(tree_t *tree, char *word);
17 | int deletenode(tree_t *tree, char *name);
18 | void deletetree(tree_t *tree);
19 | int compare(char *left, char *right);
20 | void print_inorder(node_t *node);
21 | void print_preorder(node_t *node);
22 | void print_postorder(node_t *node);
23 | #endif
24 | 


--------------------------------------------------------------------------------
/src/global.h:
--------------------------------------------------------------------------------
 1 | #ifndef GLOBAL_H
 2 | #define GLOBAL_H
 3 | /* Shared definitions: version string, kseq reader types for gzFile, and the program-wide globals. */
 4 | #define PULLSEQ_VERSION "1.0.2"
 5 | /* NOTE(review): a feature-test macro only takes effect when defined before the first system header; several .c files include <stdio.h> etc. ahead of this header. */
 6 | #define _POSIX_C_SOURCE 200809L
 7 | /*
 8 | #ifdef DEBUG
 9 | #define DEBUGP(x, args...) fprintf(stderr, " [%s(), %s:%u]\n" x, __FUNCTION__, __FILE__,__LINE__, ## args)
10 | #else
11 | #define DEBUGP(x, args...)
12 | #endif
13 | */
14 | 
15 | #include "zlib.h"
16 | #include "kseq.h"
17 | #include "uthash.h"
18 | /* Buffer size handed to the kseq stream macros (here and in the .c files that instantiate the readers). */
19 | #define BUFFER_SIZE 65535
20 | /* kseq type and basic-helper instantiation for gzFile; the reading functions (__KS_GETC, __KS_GETUNTIL, __KSEQ_READ) are instantiated in the individual .c files. */
21 | __KS_TYPE(gzFile)
22 | __KS_BASIC(gzFile, BUFFER_SIZE)
23 | __KSEQ_TYPE(gzFile)
24 | __KSEQ_BASIC(static, gzFile)
25 | /* Declared here, defined elsewhere (not in this header). */
26 | extern char const *progname;
27 | extern int QUALITY_SCORE;
28 | extern int verbose_flag;
29 | 
30 | #endif
31 | 


--------------------------------------------------------------------------------
/src/cmpseq.h:
--------------------------------------------------------------------------------
 1 | #ifndef CMPSEQ_H
 2 | #define CMPSEQ_H
 3 | /* seqdiff comparison interface; the definitions are in cmpseq.c. */
 4 | #include "global.h"
 5 | #include "seqdiff_results.h"
 6 | /* uthash entry for one sequence; count/in_a/in_b presumably track how often and in which input file it was seen — TODO confirm in cmpseq.c. */
 7 | typedef struct _sd_lookup {
 8 | 	char *name;    /* header_name */
 9 | 	char *comment; /* header_description */
10 | 	char *seq;
11 | 	int count;
12 | 	int in_a;
13 | 	int in_b;
14 | 	UT_hash_handle hh;
15 | } sd_lookup_t;
16 | 
17 | /* hash-related methods */
18 | void sd_add_seq(kseq_t *seq, int file, int use_header);
19 | sd_lookup_t *sd_find_seq(char *str, int use_header);
20 | void sd_delete_seq(sd_lookup_t *s);
21 | void sd_delete_hash(void);
22 | void sd_print_hash(void);
23 | int sd_hash_key_count(void);
24 | /* Entry point of the comparison; takes the record created by seqdiff_results_init(). */
25 | void cmpseq(seqdiff_results_t *results);
26 | 
27 | #endif
28 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.7...3.26)
 2 | 
 3 | if(${CMAKE_VERSION} VERSION_LESS 3.12)
 4 |     cmake_policy(VERSION ${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION})
 5 | endif()
 6 | 
 7 | project(Pullseq VERSION 1.0
 8 | 		DESCRIPTION "Extract & Manipulate Sequence Files"
 9 | 		LANGUAGES C)
10 | 
11 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
12 | #include(cmake/FindPCRE2.cmake)
13 | 
14 | add_subdirectory(src)
15 | 
16 | find_package(PCRE2 REQUIRED)
17 | find_package(ZLIB REQUIRED)
18 | 
19 | if(PCRE2_FOUND)
20 | 	target_include_directories(pullseq PUBLIC ${PCRE2_INCLUDE_DIRS})
21 | 	target_link_libraries(pullseq ${PCRE2_LIBRARIES})
22 | endif(PCRE2_FOUND)
23 | 
24 | target_link_libraries(pullseq ZLIB::ZLIB) # imported target from find_package(ZLIB): carries the library and its include path
25 | target_link_libraries(seqdiff ZLIB::ZLIB)
26 | 
27 | 


--------------------------------------------------------------------------------
/src/seqdiff_results.c:
--------------------------------------------------------------------------------
 1 | #include "seqdiff_results.h"
 2 | #include <stdlib.h>
 3 | #include <stdio.h>
 4 | 
 5 | /* Allocate a results record whose counters are 0, whose file names and
 6 |  * output streams are NULL, and whose option flags are cleared.
 7 |  * Prints an error and exits with status 1 if the allocation fails.
 8 |  * The caller releases the record with seqdiff_results_destroy(). */
 9 | seqdiff_results_t *seqdiff_results_init(void) {
10 | 	seqdiff_results_t *fresh = malloc(sizeof *fresh);
11 | 
12 | 	if (fresh == NULL) {
13 | 		fprintf(stderr, "ERROR: could not allocate memory for seqdiff_results_t structure\n");
14 | 		exit(1);
15 | 	}
16 | 
17 | 	/* members not named in the initializer are zeroed as well */
18 | 	*fresh = (seqdiff_results_t){
19 | 		.first_file = NULL,
20 | 		.second_file = NULL,
21 | 		.a_output_fp = NULL,
22 | 		.b_output_fp = NULL,
23 | 		.c_output_fp = NULL,
24 | 	};
25 | 	return fresh;
26 | }
27 | 
28 | void seqdiff_results_destroy(seqdiff_results_t *results) { /* release a record obtained from seqdiff_results_init() */
29 | 	free(results); /* only the struct itself: file names and FILE pointers are not freed or closed here */
30 | }
31 | 
32 | 


--------------------------------------------------------------------------------
/src/test_linked_list.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | 
 4 | #include "pullseq.h"
 5 | #include "linked_list.h"
 6 | 
 7 | /* Smoke test for the linked list: build it, print it, search it, tear it down. */
 8 | int main(int argc, char *argv[]) {
 9 |   int i;
10 |   node *n;
11 |   list_t *list;
12 |   char *words[3] = {"one","two","three"};
13 | 
14 |   (void)argc;
15 |   (void)argv;
16 | 
17 |   list = malloc(sizeof *list);
18 |   if (list == NULL) {
19 |     fprintf(stderr,"could not allocate list\n");
20 |     return EXIT_FAILURE;
21 |   }
22 |   initialize_list(list);
23 | 
24 |   for(i=0; i<3; i++) {
25 |     fprintf(stderr,"creating node for word %s\n",words[i]);
26 |     n = initnode(words[i]);
27 |     if (n == NULL) { /* initnode reports allocation failure with NULL */
28 |       fprintf(stderr,"could not allocate node\n");
29 |       delete_list(list,list->head);
30 |       free(list);
31 |       return EXIT_FAILURE;
32 |     }
33 |     add_to_list(list,n);
34 |   }
35 | 
36 |   for (n = list->head; n != NULL; n = n->next)
37 |     fprintf(stderr,"%p: %s\n",(void *)n,n->word); /* %p expects a void pointer */
38 | 
39 |   n = search_list(list,"two");
40 |   if (n != NULL)
41 |     fprintf(stderr,"found %s (%p)\n", n->word,(void *)n);
42 |   n = search_list(list,"wo");
43 |   if (n == NULL)
44 |     fprintf(stderr,"did not find \n");
45 | 
46 |   delete_list(list,list->head);
47 |   free(list); /* the list header itself was malloc'd above */
48 |   return EXIT_SUCCESS;
49 | }
50 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.swp
 2 | *.o
 3 | *.gch
 4 | src/pullseq
 5 | src/seqdiff
 6 | src/test_hash.c
 7 | test/t100000.fq
 8 | test/t100000.txt
 9 | test/t1000000.fq
10 | test/t1000000.txt
11 | test/t10000000.fq
12 | test/t10000000.txt
13 | test/big_test.fastq
14 | test/big_test.txt
15 | .deps
16 | Makefile
17 | valgrind_ex.sh
18 | real_test.sh
19 | Makefile.in
20 | aclocal.m4
21 | *.log
22 | stamp-h1
23 | config.*
24 | /src/config.*
25 | /autom4te.cache
26 | /compile
27 | /configure
28 | /depcomp
29 | /install-sh
30 | /missing
31 | test/RifleCSP2_O2Inj_2_contigs.fa
32 | test/RifleCSP2_O2Inj_2_feature_locations.txt
33 | test/all_contigs_01142015.fa
34 | test/all_contigs_01142015.fa.summary.txt
35 | test/bct_feature_locations_01142015.txt
36 | test/rpL6_bact_arch_euk_curated.fasta
37 | test/list
38 | test/listlist2
39 | test/test.txt.2
40 | test/test_gene.fna
41 | test/test_locations.txt
42 | test/test_locations_gene.txt
43 | test/test_locations_noname.txt
44 | test/test_noname.fa
45 | build/
46 | 


--------------------------------------------------------------------------------
/cmake/FindPCRE2.cmake:
--------------------------------------------------------------------------------
 1 | # - Find PCRE2
 2 | # Find the native PCRE2 headers and libraries.
 3 | #
 4 | # PCRE2_INCLUDE_DIRS	- where to find pcre2.h, etc.
 5 | # PCRE2_LIBRARIES	- List of libraries when using PCRE2.
 6 | # PCRE2_FOUND	- True if PCRE2 found.
 7 | 
 8 | # Look for the header file.
 9 | FIND_PATH(PCRE2_INCLUDE_DIR pcre2.h)
10 | 
11 | # Look for the library.
12 | FIND_LIBRARY(PCRE2_LIBRARY NAMES libpcre2.a pcre2-8)
13 | 
14 | # Handle the QUIETLY and REQUIRED arguments and set PCRE2_FOUND to TRUE if all listed variables are TRUE.
15 | INCLUDE(FindPackageHandleStandardArgs)
16 | FIND_PACKAGE_HANDLE_STANDARD_ARGS(PCRE2 DEFAULT_MSG PCRE2_LIBRARY PCRE2_INCLUDE_DIR)
17 | 
18 | # Copy the results to the output variables.
19 | IF(PCRE2_FOUND)
20 | 	SET(PCRE2_LIBRARIES ${PCRE2_LIBRARY})
21 | 	SET(PCRE2_INCLUDE_DIRS ${PCRE2_INCLUDE_DIR})
22 | 	message(STATUS "${PCRE2_INCLUDE_DIRS}")
23 | 	message(STATUS "${PCRE2_LIBRARIES}")
24 | ELSE(PCRE2_FOUND)
25 | 	SET(PCRE2_LIBRARIES)
26 | 	SET(PCRE2_INCLUDE_DIRS)
27 | ENDIF(PCRE2_FOUND)
28 | 
29 | MARK_AS_ADVANCED(PCRE2_INCLUDE_DIR PCRE2_LIBRARY)
30 | 


--------------------------------------------------------------------------------
/src/search_header.c:
--------------------------------------------------------------------------------
 1 | #define PCRE2_CODE_UNIT_WIDTH 8
 2 | 
 3 | #include <stdio.h>
 4 | #include <string.h>
 5 | #include <pcre2.h>
 6 | 
 7 | #include "search_header.h"
 8 | #include "global.h"
 9 | 
10 | 
11 | /*
12 |  * Test whether a header string matches a compiled PCRE2 pattern.
13 |  *
14 |  * re:  compiled pcre2 regex
15 |  * str: NUL-terminated header text; NULL is treated as "no match"
16 |  *
17 |  * Returns 1 on a match and 0 otherwise. Match failures other than
18 |  * PCRE2_ERROR_NOMATCH are reported on stderr and also return 0.
19 |  */
20 | int search_header(pcre2_code *re, char *str) {
21 | 	int rc;
22 | 	pcre2_match_data *match_data;
23 | 
24 | 	if (re == NULL || str == NULL) {
25 | 		return 0;
26 | 	}
27 | 
28 | 	match_data = pcre2_match_data_create_from_pattern(re, NULL);
29 | 	if (match_data == NULL) {
30 | 		fprintf(stderr, "Could not allocate PCRE2 match data\n");
31 | 		return 0;
32 | 	}
33 | 
34 | 	rc = pcre2_match(re, (PCRE2_SPTR)str, strlen(str), 0, 0, match_data, NULL);
35 | 	pcre2_match_data_free(match_data); /* only the return code is needed */
36 | 
37 | 	if (rc >= 0) {
38 | 		return 1;
39 | 	}
40 | 	if (rc != PCRE2_ERROR_NOMATCH) {
41 | 		fprintf(stderr, "Problem with your regex (%d)\n", rc);
42 | 	}
43 | 	return 0; /* every failure path now returns a defined value */
44 | }
45 | 


--------------------------------------------------------------------------------
/COPYING:
--------------------------------------------------------------------------------
 1 | /* The MIT License
 2 | 
 3 |    Copyright (c) 2013 Brian C. Thomas <bct.x42@gmail.com>.
 4 | 
 5 |    Permission is hereby granted, free of charge, to any person obtaining
 6 |    a copy of this software and associated documentation files (the
 7 |    "Software"), to deal in the Software without restriction, including
 8 |    without limitation the rights to use, copy, modify, merge, publish,
 9 |    distribute, sublicense, and/or sell copies of the Software, and to
10 |    permit persons to whom the Software is furnished to do so, subject to
11 |    the following conditions:
12 | 
13 |    The above copyright notice and this permission notice shall be
14 |    included in all copies or substantial portions of the Software.
15 | 
16 |    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 |    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 |    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 |    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 |    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 |    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 |    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 |    SOFTWARE.
24 | */
25 | 


--------------------------------------------------------------------------------
/test/test.fa:
--------------------------------------------------------------------------------
 1 | >AA test1
 2 | AAAAAAAAAAAAACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTA
 3 | >BB test2
 4 | CCCCCCCCCCCCCCGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTA
 5 | AAAAAAAAAAAAACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTA
 6 | >BB2
 7 | ACCGTGCAGTCGACGACGTAATTAAAAAAAAACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTA
 8 | >CC test3
 9 | GGGGAGGGGGGGGCGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTA
10 | AAAAAAAAAAAAACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTA
11 | ACCGTGCAGTCGACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTA
12 | >DD test4
13 | TTTTTTTTTTTTTCGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTA
14 | ACCGTGCAGTCGACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTAACCGTGCAGTCGACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTAACCGTGCAGTCGACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTA
15 | >Eukaryota_Amoebozoa_Mycetozoa_Myxogastria_Myxogastromycetidae_Physariida_Physaraceae_Physarum_polycephalum
16 | ACCGTGCAGTCGACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTAACCGTGCAGTCGACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTAACCGTGCAGTCGACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTA
17 | >Eukaryota_Haptophyceae_Pavlovales_Pavlovaceae_Pavlova_lutheri
18 | ACCGTGCAGTCGACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTAACCGTGCAGTCGACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTAACCGTGCAGTCGACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTAACCGTGCAGTCGACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTAACCGTGCAGTCGACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTAACCGTGCAGTCGACGACCGTGCAGTCGACGACCGTGCAGTCGACGACGTA
19 | 


--------------------------------------------------------------------------------
/src/hash.c:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | #include <stdio.h>
 3 | #include <string.h>
 4 | 
 5 | #include "global.h"
 6 | #include "hash.h"
 7 | 
 8 | lookup_t *lookup = NULL;
 9 | 
10 | /* Copy `name` into a new lookup_t and insert it into the global hash.
11 |  * Exits the program if memory cannot be allocated. */
12 | void add_name(char *name)
13 | {
14 | 	size_t len = strlen(name);
15 | 	lookup_t *s = malloc(sizeof *s);
16 | 	if (s == NULL) {
17 | 		fprintf(stderr,"couldn't get memory for lookup_t\n");
18 | 		exit(EXIT_FAILURE);
19 | 	}
20 | 	s->name = malloc(len + 1); /* one byte per character plus the NUL */
21 | 	if (s->name == NULL) {
22 | 		fprintf(stderr,"couldn't get memory for name string\n");
23 | 		exit(EXIT_FAILURE);
24 | 	}
25 | 	memcpy(s->name, name, len + 1);
26 | 
27 | 	HASH_ADD_KEYPTR( hh, lookup, s->name, len, s );
28 | }
29 | 
30 | /* Look up `name` in the global hash; yields NULL when it is absent. */
31 | lookup_t *find_name(char *name)
32 | {
33 | 	lookup_t *entry = NULL;
34 | 
35 | 	HASH_FIND_STR(lookup, name, entry);
36 | 	/* entry is the matching record, or still NULL if nothing matched */
37 | 	return entry;
38 | }
39 | 
40 | void delete_name(lookup_t *s) /* unlink s from the global hash, then release it */
41 | {
42 | 	HASH_DEL(lookup, s);
43 | 	free(s->name); /* the name buffer was allocated by add_name() */
44 | 	free(s);
45 | }
46 | 
47 | void delete_hash(void) /* remove and free every entry of the global hash */
48 | {
49 | 	lookup_t *entry, *next;
50 | 	HASH_ITER(hh, lookup, entry, next) { /* deletion-safe: the successor is saved first */
51 | 		delete_name(entry);
52 | 	}
53 | }
54 | 
55 | void print_hash(void) /* write every stored name to stderr, one per line */
56 | {
57 | 	lookup_t *entry;
58 | 	for (entry = lookup; entry; entry = entry->hh.next)
59 | 		fprintf(stderr,"name %s\n",entry->name);
60 | }
61 | 
62 | /* Number of names currently stored in the global hash. */
63 | int hash_key_count(void)
64 | {
65 | 	/* uthash keeps a running item count for the table, so ask it
66 | 	 * directly instead of walking every entry. HASH_COUNT is 0 for
67 | 	 * an empty (NULL) hash. */
68 | 	return (int)HASH_COUNT(lookup);
69 | }
70 | 


--------------------------------------------------------------------------------
/src/size_filter.c:
--------------------------------------------------------------------------------
 1 | #include "global.h"
 2 | #include "output.h"
 3 | #include <stdio.h>
 4 | 
 5 | /*
 6 |  * Decide whether seq passes the length bounds and, unless just_count is
 7 |  * set, print it.
 8 |  *
 9 |  * A bound that is <= 0 is treated as "no limit". With convert set the
10 |  * record goes to the other writer (FASTA input -> print_fastq_seq and
11 |  * vice versa); length is forwarded to print_fasta_seq().
12 |  *
13 |  * Returns 1 when the record passes, 0 when it is filtered out.
14 |  */
15 | int size_filter(kseq_t *seq, int is_fasta, int min, int max, int length, int convert, int just_count) {
16 | 	int long_enough = (min <= 0) || (seq->seq.l >= min);
17 | 	int short_enough = (max <= 0) || (seq->seq.l <= max);
18 | 	int write_fasta;
19 | 
20 | 	if (!long_enough || !short_enough)
21 | 		return 0;
22 | 	if (just_count)
23 | 		return 1;
24 | 
25 | 	/* keep the input format unless a conversion was requested */
26 | 	write_fasta = convert ? !is_fasta : is_fasta;
27 | 	if (write_fasta)
28 | 		print_fasta_seq(seq,length);
29 | 	else
30 | 		print_fastq_seq(seq);
31 | 	return 1;
32 | }
33 | 


--------------------------------------------------------------------------------
/src/pull_by_size.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <string.h>
 4 | #include <zlib.h>
 5 | #include <errno.h>
 6 | 
 7 | #include "pull_by_size.h"
 8 | #include "file_read.h"
 9 | #include "global.h"
10 | #include "size_filter.h"
11 | 
12 | 
13 | __KS_GETC(gzread, BUFFER_SIZE)
14 | __KS_GETUNTIL(gzread, BUFFER_SIZE)
15 | __KSEQ_READ(static)
16 | 
17 | /*
18 | extern char const *progname;
19 | extern int verbose_flag;
20 | */
21 | 
22 | /*
23 |  * Run every record of input_file through size_filter().
24 |  *
25 |  * The file is opened with gzopen(); its format is taken from the first
26 |  * record (no quality string means FASTA, otherwise FASTQ). When
27 |  * just_count is set the totals are written to stdout.
28 |  *
29 |  * Returns the number of records that passed the filter. Exits the
30 |  * program if the file cannot be opened.
31 |  */
32 | int pull_by_size(char *input_file, int min, int max,int length, int convert, int just_count) {
33 | 	gzFile fp;
34 | 	kseq_t *seq;
35 | 	int is_fasta;
36 | 	int kept = 0;
37 | 	int excluded = 0;
38 | 
39 | 	fp = gzopen(input_file,"r");
40 | 	if (!fp) {
41 | 		fprintf(stderr,"%s - Couldn't open fasta file %s\n",progname,input_file);
42 | 		exit(EXIT_FAILURE);
43 | 	}
44 | 	seq = kseq_init(fp);
45 | 
46 | 	/* peek at the first record to learn the format, then start over */
47 | 	kseq_read(seq);
48 | 	is_fasta = (seq->qual.s == NULL);
49 | 	gzrewind(fp);
50 | 	kseq_rewind(seq);
51 | 
52 | 	if (verbose_flag)
53 | 		fputs(is_fasta ? "Input is FASTA format\n" : "Input is FASTQ format\n", stderr);
54 | 
55 | 	while (kseq_read(seq) >= 0) {
56 | 		if (size_filter(seq, is_fasta, min, max, length, convert, just_count))
57 | 			kept++;
58 | 		else
59 | 			excluded++;
60 | 	}
61 | 	kseq_destroy(seq);
62 | 	gzclose(fp);
63 | 
64 | 	if (just_count) {
65 | 		fprintf(stdout, "Total output: %i\n", kept);
66 | 		fprintf(stdout, "Total excluded: %i\n", excluded);
67 | 	}
68 | 	return kept;
69 | }
70 | 


--------------------------------------------------------------------------------
/src/linked_list.c:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | #include <stdio.h>
 3 | #include <string.h>
 4 | 
 5 | #include "linked_list.h"
 6 | #include "global.h"
 7 | 
 8 | /* reset list to the empty state: no head node and no end node */
 9 | void initialize_list(list_t *list)
10 | {
11 |   list->head =  NULL;
12 |   list->end =  NULL;
13 | }
14 | 
15 | /* Allocate an unlinked node that holds a private copy of word. */
16 | /* Returns the new node (next == NULL), or NULL when memory cannot be */
17 | /* obtained; nothing stays allocated in the failure case. */
18 | node *initnode(char *word)
19 | {
20 |   size_t size = strlen(word) + 1;
21 |   node *newnode = malloc(sizeof *newnode);
22 | 
23 |   if (newnode == NULL)
24 |     return NULL; /* OOM */
25 | 
26 |   newnode->word = malloc(size);
27 |   if (newnode->word == NULL) {
28 |     free(newnode); /* do not leak the node when the copy cannot be made */
29 |     return NULL;
30 |   }
31 |   memcpy(newnode->word, word, size);
32 |   newnode->next = NULL;
33 |   return newnode;
34 | }
35 | 
36 | 
37 | /* Append node n at the end of list. */
38 | /* A NULL n (e.g. a failed initnode) is ignored so the list stays valid. */
39 | void add_to_list(list_t *list, node *n)
40 | {
41 |   if (n == NULL)
42 |     return;
43 | 
44 |   if (list->head == NULL)
45 |     list->head = n;      /* first in list */
46 |   else
47 |     list->end->next = n; /* link after the current end */
48 |   list->end = n;
49 | }
50 | 
51 | /* Linear search for the first node whose word equals `word`. */
52 | /* Returns that node, or NULL when no node matches. */
53 | node *search_list(list_t *list, char *word)
54 | {
55 |   node *cur;
56 | 
57 |   for (cur = list->head; cur != NULL; cur = cur->next) {
58 |     if (strcmp(word, cur->word) != 0)
59 |       continue;
60 |     return cur;
61 |   }
62 | 
63 |   return NULL;
64 | }
65 | 
66 | /* Free node n and every node after it, truncating list just before n. */
67 | /* Passing list->head empties the list. If n is NULL or is not part of */
68 | /* the list, nothing is freed and the list is left untouched. */
69 | void delete_list(list_t *list, node *n)
70 | {
71 |   node *temp;
72 | 
73 |   if (list->head == NULL || n == NULL) return;
74 | 
75 |   if (n == list->head) {       /* deleting the entire list */
76 |     list->head = NULL;
77 |     list->end = NULL;
78 |   } else {
79 |     temp = list->head;         /* locate the node just before n */
80 |     while (temp != NULL && temp->next != n)
81 |       temp = temp->next;
82 |     if (temp == NULL) return;  /* n is not in this list */
83 |     temp->next = NULL;         /* detach the part that is about to be freed */
84 |     list->end = temp;
85 |   }
86 | 
87 |   while (n != NULL) {
88 |     temp = n->next;            /* remember the successor before freeing n */
89 |     free(n->word);
90 |     free(n);
91 |     n = temp;
92 |   }
93 | }
94 | 


--------------------------------------------------------------------------------
/src/pull_by_name.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include <zlib.h>
  5 | #include <errno.h>
  6 | 
  7 | #include "global.h"
  8 | #include "pull_by_name.h"
  9 | #include "hash.h"
 10 | #include "file_read.h"
 11 | #include "size_filter.h"
 12 | 
 13 | __KS_GETC(gzread, BUFFER_SIZE)
 14 | __KS_GETUNTIL(gzread, BUFFER_SIZE)
 15 | __KSEQ_READ(static)
 16 | 
 17 | /*
 18 | extern char const *progname;
 19 | extern int verbose_flag;
 20 | */
 21 | 
 22 | /*
 23 |  * Filter input_file by the sequence names listed in names_fp.
 24 |  *
 25 |  * Names are read with getl()/parse_name() into the lookup hash. Each
 26 |  * record whose name is listed (or, with exclude set, is NOT listed) is
 27 |  * handed to size_filter(); all other records count as excluded.
 28 |  *
 29 |  * Returns the sum of the size_filter() results. Exits the program on
 30 |  * allocation failure or if input_file cannot be opened.
 31 |  */
 32 | int pull_by_name(char *input_file, FILE *names_fp, int min, int max, int length, int exclude, int convert, int just_count) {
 33 | 	gzFile fp;
 34 | 	kseq_t *seq;
 35 | 	char *line;
 36 | 	char *fasta_name;
 37 | 	int capacity = 80;
 38 | 	int is_fasta;
 39 | 	int count = 0;
 40 | 	int excluded = 0;
 41 | 
 42 | 	/* initial buffer for getl(), which is given its address */
 43 | 	line = malloc(sizeof(char) * capacity);
 44 | 	if (!line) {
 45 | 		fprintf(stderr, "%s - line malloc: %s\n",progname, strerror(errno));
 46 | 		exit(EXIT_FAILURE);
 47 | 	}
 48 | 
 49 | 	/* load every parsed name into the hash */
 50 | 	while (getl(&line, names_fp) != -1) {
 51 | 		fasta_name = parse_name(line);
 52 | 		if (fasta_name)
 53 | 			add_name(fasta_name);
 54 | 	}
 55 | 	free(line);
 56 | 
 57 | 	if (verbose_flag) {
 58 | 		fprintf(stderr,"\n");
 59 | 		fprintf(stderr,"done reading from input (%d entries)\n", hash_key_count());
 60 | 	}
 61 | 
 62 | 	fp = gzopen(input_file,"r");
 63 | 	if (!fp) {
 64 | 		fprintf(stderr,"%s - Couldn't open fasta file %s\n",progname,input_file);
 65 | 		exit(EXIT_FAILURE);
 66 | 	}
 67 | 	seq = kseq_init(fp);
 68 | 
 69 | 	/* the first record reveals the format; rewind afterwards */
 70 | 	kseq_read(seq);
 71 | 	is_fasta = (seq->qual.s == NULL);
 72 | 	gzrewind(fp);
 73 | 	kseq_rewind(seq);
 74 | 
 75 | 	if (verbose_flag)
 76 | 		fputs(is_fasta ? "Input is FASTA format\n" : "Input is FASTQ format\n", stderr);
 77 | 
 78 | 	while (kseq_read(seq) >= 0) {
 79 | 		int listed = (find_name(seq->name.s) != NULL);
 80 | 		int wanted = exclude ? !listed : listed;
 81 | 
 82 | 		if (wanted)
 83 | 			count += size_filter(seq, is_fasta, min, max, length, convert, just_count);
 84 | 		else
 85 | 			excluded++;
 86 | 	}
 87 | 	kseq_destroy(seq);
 88 | 	gzclose(fp);
 89 | 
 90 | 	delete_hash(); /* the names are no longer needed */
 91 | 
 92 | 	if (just_count) {
 93 | 		fprintf(stdout, "Total output: %i\n", count);
 94 | 		if (exclude)
 95 | 			fprintf(stdout, "Total excluded: %i\n", excluded);
 96 | 	}
 97 | 	if (verbose_flag) {
 98 | 		fprintf(stderr,"Processed %i entries\n",count);
 99 | 		if (exclude)
100 | 			fprintf(stderr,"Excluded %i entries\n",excluded);
101 | 	}
102 | 	return count;
103 | }
104 | 


--------------------------------------------------------------------------------
/src/pull_by_re.c:
--------------------------------------------------------------------------------
  1 | #define PCRE2_CODE_UNIT_WIDTH 8
  2 | 
  3 | #include <stdio.h>
  4 | #include <stdlib.h>
  5 | #include <string.h>
  6 | #include <zlib.h>
  7 | #include <errno.h>
  8 | #include <pcre2.h>
  9 | 
 10 | #include "pull_by_re.h"
 11 | #include "file_read.h"
 12 | #include "global.h"
 13 | #include "size_filter.h"
 14 | #include "search_header.h"
 15 | 
 16 | 
 17 | __KS_GETC(gzread, BUFFER_SIZE)
 18 | __KS_GETUNTIL(gzread, BUFFER_SIZE)
 19 | __KSEQ_READ(static)
 20 | 
 21 | /*
 22 | extern char const *progname;
 23 | extern int verbose_flag;
 24 | */
 25 | 
 26 | int pull_by_re(char *input_file, char *aStrRegex, int min, int max, int length, int exclude, int convert, int just_count) {
 27 | 	gzFile fp;
 28 | 	int count=0,l;
 29 | 	int excluded = 0;
 30 | 	int is_fasta = 0; /* assume fastq */
 31 | 	kseq_t *seq;
 32 | 
 33 | 	/* pcre2 variables */
 34 | 	pcre2_code *re; // the regex object
 35 | 	PCRE2_SIZE erroroffset;
 36 | 	int errornumber;
 37 | 
 38 | 	/* open fasta file */
 39 | 	fp = gzopen(input_file,"r");
 40 | 	if (!fp) {
 41 | 		fprintf(stderr,"%s - Couldn't open fasta file %s\n",progname,input_file);
 42 | 		exit(EXIT_FAILURE);
 43 | 	}
 44 | 
 45 | 	seq = kseq_init(fp);
 46 | 
 47 | 	/* determine file type */
 48 | 	l = kseq_read(seq); /* read the first sequence */
 49 | 	is_fasta = seq->qual.s == NULL ? 1 : 0;
 50 | 	gzrewind(fp); 
 51 | 	kseq_rewind(seq); /* rewind to beginning for main loop */
 52 | 
 53 |         if (verbose_flag) {
 54 |             if (is_fasta)
 55 |                 fprintf(stderr, "Input is FASTA format\n");
 56 |             else
 57 |                 fprintf(stderr, "Input is FASTQ format\n");
 58 |         }
 59 | 
 60 | 	/* initialize the re */
 61 | 	re = pcre2_compile(
 62 | 			aStrRegex,             /* the pattern */
 63 | 			PCRE2_ZERO_TERMINATED, /* indicates pattern is zero-terminated */
 64 | 			0,                     /* default options */
 65 | 			&errornumber,          /* for error num */
 66 | 			&erroroffset,          /* err offset */
 67 | 			NULL);                 /* default compile context */
 68 | 
 69 | 	if (re == NULL) {
 70 | 		  PCRE2_UCHAR buffer[256];
 71 | 		  pcre2_get_error_message(errornumber, buffer, sizeof(buffer));
 72 | 		  fprintf(stderr, "PCRE2 compilation failed at offset %d: %s\n", (int)erroroffset, buffer);
 73 | 		  exit(EXIT_FAILURE);
 74 | 	}
 75 | 
 76 | 	/* search through list and see if this header matches */
 77 | 	while((l = kseq_read(seq)) >= 0) {
 78 | 		if (exclude) {
 79 | 			if (search_header(re, seq->name.s) || search_header(re, seq->comment.s))
 80 | 				excluded++;
 81 | 			else {
 82 | 				/* regex doesn't match, so check size/print */
 83 | 				count += size_filter(seq, is_fasta, min, max, length, convert, just_count);
 84 | 			}
 85 | 		} else {
 86 | 			if (search_header(re, seq->name.s) || search_header(re, seq->comment.s)) {
 87 | 				/* regex matches so check size/print */
 88 | 				count += size_filter(seq, is_fasta, min, max, length, convert, just_count);
 89 | 			} else
 90 | 				excluded++;
 91 | 		}
 92 | 	} /* end of seq traversal */
 93 | 
 94 | 	/* tear down re */
 95 | 	pcre2_code_free(re); /* free up the re */
 96 | 
 97 | 	kseq_destroy(seq);
 98 | 	gzclose(fp); /* done reading file so close */
 99 | 
100 | 	if (just_count) {
101 | 		fprintf(stdout, "Total output: %i\n", count);
102 | 		fprintf(stdout, "Total excluded: %i\n", excluded);
103 | 	}
104 | 	return count;
105 | }
106 | 


--------------------------------------------------------------------------------
/ChangeLog:
--------------------------------------------------------------------------------
  1 | 2015-08-17 12:40:39 -0700 Brian C. Thomas 
  2 | 
  3 | 	* Merge branch 'master' of github.com:bcthomas/pullseq (HEAD -> master, origin/master)
  4 | 
  5 | 2015-08-17 12:36:36 -0700 Brian C. Thomas 
  6 | 
  7 | 	* fixed fasta comment bug
  8 | 
  9 | 2015-01-08 08:29:58 -0800 Brian C. Thomas 
 10 | 
 11 | 	* closed names_fp file pointer
 12 | 
 13 | 2014-09-17 12:04:28 -0700 Brian C. Thomas 
 14 | 
 15 | 	* update uthash to 1.9.9.1
 16 | 
 17 | 2014-08-15 16:57:44 -0700 Brian C. Thomas 
 18 | 
 19 | 	* update to 1.0.1
 20 | 
 21 | 2014-08-15 16:56:54 -0700 Brian C. Thomas 
 22 | 
 23 | 	* fixed bug in search_header()
 24 | 
 25 | 2014-08-13 10:15:05 -0700 Brian C. Thomas 
 26 | 
 27 | 	* updated README
 28 | 
 29 | 2014-08-13 09:17:09 -0700 Brian C. Thomas 
 30 | 
 31 | 	* changed param name to "regex" and short to "-g"
 32 | 
 33 | 2014-08-13 09:01:04 -0700 Brian C. Thomas 
 34 | 
 35 | 	* updated regex matching to be case-insensitive
 36 | 
 37 | 2014-08-11 12:05:27 -0700 Brian C. Thomas 
 38 | 
 39 | 	* Regex searching and some refactoring
 40 | 
 41 | 2014-07-16 09:18:12 -0700 Brian C. Thomas 
 42 | 
 43 | 	* allow '>' or '@' in the names files
 44 | 
 45 | 2013-12-16 08:42:23 -0800 Brian C. Thomas 
 46 | 
 47 | 	* added ability to get names from STDIN
 48 | 
 49 | 2013-12-13 07:21:01 -0800 Brian C. Thomas 
 50 | 
 51 | 	* fixed bug in FASTQ header output
 52 | 
 53 | 2013-10-25 11:59:14 -0700 Brian C. Thomas 
 54 | 
 55 | 	* typo
 56 | 
 57 | 2013-10-25 11:57:50 -0700 Brian C. Thomas 
 58 | 
 59 | 	* updated README
 60 | 
 61 | 2013-10-25 11:55:08 -0700 Brian C. Thomas 
 62 | 
 63 | 	* added sequence counting, version, help
 64 | 
 65 | 2013-03-13 13:39:00 -0700 Brian C. Thomas 
 66 | 
 67 | 	* Merge branch 'seqdiff'
 68 | 
 69 | 2013-03-13 13:37:59 -0700 Brian C. Thomas 
 70 | 
 71 | 	* updated docs
 72 | 
 73 | 2013-03-13 13:26:11 -0700 Brian C. Thomas 
 74 | 
 75 | 	* finished seqdiff
 76 | 
 77 | 2013-03-09 14:25:41 -0800 Brian C. Thomas 
 78 | 
 79 | 	* Completed cmpseq()
 80 | 
 81 | 2013-02-28 09:44:44 -0800 Brian C. Thomas 
 82 | 
 83 | 	* updated license to be accurate
 84 | 
 85 | 2013-02-28 09:44:44 -0800 Brian C. Thomas 
 86 | 
 87 | 	* updated license to be accurate
 88 | 
 89 | 2013-02-27 15:10:50 -0800 Brian C. Thomas 
 90 | 
 91 | 	* updated AC_CONFIG_SRCDIR
 92 | 
 93 | 2013-02-27 15:02:24 -0800 Brian C. Thomas 
 94 | 
 95 | 	* modernized for automake 1.13+
 96 | 
 97 | 2013-02-27 14:49:34 -0800 Brian C. Thomas 
 98 | 
 99 | 	* updated ChangeLog
100 | 
101 | 2013-02-27 14:48:29 -0800 Brian C. Thomas 
102 | 
103 | 	* removed leftover junk
104 | 
105 | 2013-02-27 14:46:20 -0800 Brian C. Thomas 
106 | 
107 | 	* convert to autotools!
108 | 
109 | 2013-02-26 12:39:31 -0800 Brian C. Thomas 
110 | 
111 | 	* updated headers and readme
112 | 
113 | 2012-11-01 08:23:16 -0700 Brian C. Thomas 
114 | 
115 | 	* more typos
116 | 
117 | 2012-11-01 08:22:05 -0700 Brian C. Thomas 
118 | 
119 | 	* more readme touchups
120 | 
121 | 2012-11-01 08:19:58 -0700 Brian C. Thomas 
122 | 
123 | 	* typo in readme
124 | 
125 | 2012-11-01 08:17:05 -0700 Brian C. Thomas 
126 | 
127 | 	* updated readme
128 | 
129 | 2012-11-01 07:59:00 -0700 Brian C. Thomas 
130 | 
131 | 	* fix rewind bug and clean up verbose messages
132 | 
133 | 2012-10-29 13:06:33 -0700 Brian C. Thomas 
134 | 
135 | 	* updated uthash branch to current
136 | 
137 | 2012-04-27 16:39:16 -0700 Brian C. Thomas 
138 | 
139 | 	* more test dir cleanup for github
140 | 
141 | 2012-04-27 15:57:19 -0700 Brian C. Thomas 
142 | 
143 | 	* removed big testing files from repo
144 | 
145 | 2012-04-27 14:28:18 -0700 Brian C. Thomas 
146 | 
147 | 	* convert to uthash for names file
148 | 
149 | 2012-04-25 10:25:29 -0700 Brian C. Thomas 
150 | 
151 | 	* completed binary tree implementation
152 | 
153 | 2012-04-18 19:52:08 -0700 Brian C. Thomas 
154 | 
155 | 	* restructured header files (origin/linklist, linklist)
156 | 
157 | 2012-04-18 16:44:51 -0700 Brian C. Thomas 
158 | 
159 | 	* updated arg parsing
160 | 
161 | 2012-04-18 15:32:53 -0700 Brian C. Thomas 
162 | 
163 | 	* see previous commit - forgot this file
164 | 
165 | 2012-04-18 15:31:02 -0700 Brian C. Thomas 
166 | 
167 | 	* Completed linked-list version
168 | 
169 | 2012-04-18 09:07:08 -0700 Brian C. Thomas 
170 | 
171 | 	* updated Makefile - added uthash.h
172 | 
173 | 2012-04-18 09:04:06 -0700 Brian C. Thomas 
174 | 
175 | 	* initial commit
176 | 
177 | 


--------------------------------------------------------------------------------
/src/file_read.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include <errno.h>
  5 | #include <limits.h>
  6 | 
  7 | #include "file_read.h"
  8 | #include "global.h"
  9 | 
 10 | char *parse_name(char *line)
 11 | {
 12 | 	char *word;
 13 | 	char *delims = " \t\n"; /* space, tab, newline */
 14 | 	word = strtok(line, delims);
 15 | 	/* check if the name begins with '>' or '@' and drop it */
 16 | 	if ( *word == '>' || *word == '@' )
 17 | 		word++; /* move the word up one char */
 18 | 	return word;
 19 | }
 20 | 
 21 | #define BUFSIZE 80
 22 | 	int getl(char **lineptr, FILE *fp) {
 23 | 		int ch;
 24 | 		ssize_t buf_pos = 0;
 25 | 		ssize_t count = 2; /* Always buf_pos + 2 (see below). */
 26 | 		size_t new_length = 0;
 27 | 		size_t n = BUFSIZE;
 28 | 		char *temp;
 29 | 
 30 | 		if ((lineptr == NULL) || (fp == NULL)) {
 31 | 			errno = EINVAL;
 32 | 			return -1;
 33 | 		}
 34 | 
 35 | 		if (errno != 0)
 36 | 			errno = 0;
 37 | 
 38 | 		if (*lineptr == NULL) {
 39 | 			*lineptr = malloc(n * sizeof(char));
 40 | 
 41 | 			if (*lineptr == NULL) {
 42 | 				return -1; /* Out of memory. */
 43 | 			}
 44 | 		}
 45 | 
 46 | 		/*
 47 | 		 * There are buf_pos characters in the buffer.  When we read another
 48 | 		 * character, we want to store it, and we also need enough
 49 | 		 * room for a nul string. So we need to realloc as soon as our capacity
 50 | 		 * becomes less than buf_pos + 2.
 51 | 		 * Hence the variable "count" which always equals buf_pos + 2.
 52 | 		 */
 53 | 
 54 | 		while ((ch = getc(fp)) != EOF) {
 55 | 			if (errno != 0)
 56 | 				return -1;
 57 | 
 58 | 			if (count > n) { /* current chars read is going to blow our buffer - add more */
 59 | 				new_length = n * 2; /* double the current buffer size */
 60 | 				if (new_length <= n) { /* Overflow. */
 61 | 					errno = ENOMEM;
 62 | 					/* We couldn't store the character, */
 63 | 					/* so put it back on the stream. */
 64 | 					ungetc(ch, fp);
 65 | 					return -1;
 66 | 				}
 67 | 				temp = (char *)realloc(*lineptr, new_length * sizeof(char)); /* realloc to a temp */
 68 | 				if (temp == NULL) {
 69 | 					ungetc(ch, fp);
 70 | 					return -1;
 71 | 				}
 72 | 				n = new_length; /* set n to the new length we were able to get from system */
 73 | 				*lineptr = temp; /* set line to this new temp string */
 74 | 			}
 75 | 
 76 | 			(*lineptr)[buf_pos++] = (char)ch; /* set this char in the string at buf_pos and THEN increment buf_pos */
 77 | 
 78 | 			if (ch == '\n') /* eol */
 79 | 				break;
 80 | 
 81 | 			if (count == SSIZE_MAX) { /* SSIZE_MAX is 32767 - posix def */
 82 | 				/* We'll overflow ssize_t on the next round, since the return
 83 | 				 * type is SSIZE_T */
 84 | 				errno = ENOMEM;
 85 | 				return -1;
 86 | 			}
 87 | 			count++; /* increment ch count */
 88 | 		}
 89 | 
 90 | 		(*lineptr)[buf_pos] = '\0'; /* set last position to \0 */
 91 | 
 92 | 		if (buf_pos == 0) { /* nothing in the file? */
 93 | 			buf_pos = -1;
 94 | 		}
 95 | 		return buf_pos;
 96 | 	}
 97 | 
	/**
	 * getlx
	 *
	 * fgets()-based line reader: reads one line from fp into *iline and
	 * strips the trailing newline.
	 *
	 * *iline must point to a heap buffer of at least sizeof(char *)
	 * bytes. The real capacity is not known here, so that small value is
	 * used as the starting assumption and the buffer is doubled with
	 * realloc() until the line fits; *iline is updated whenever the
	 * buffer moves. The caller owns and frees the buffer.
	 *
	 * Returns the length of the line without its newline (0 for a blank
	 * line), or -1 at end of file, on a read error, on allocation
	 * failure or when an argument is NULL.
	 **/
	int getlx(char **iline,FILE *fp)
	{
		char *line;
		char *newline;
		size_t capacity = sizeof(char *); /* conservative lower bound */
		size_t len = 0;                   /* characters stored so far */
		int got_data = 0;                 /* did fgets deliver anything? */

		if (iline == NULL || *iline == NULL || fp == NULL) {
			errno = EINVAL;
			return -1;
		}

		line = *iline;
		line[0] = '\0';

		/* append to what is already stored until a newline shows up */
		while (fgets(line + len, (int)(capacity - len), fp)) {
			got_data = 1;
			len += strlen(line + len);

			if (len > 0 && line[len - 1] == '\n') {
				line[--len] = '\0'; /* drop the newline */
				break;
			}

			/* no newline yet: buffer full (or last line) - enlarge */
			newline = realloc(line, capacity * 2);
			if (!newline) {
				fprintf(stderr, "getl - realloc: %s\n", strerror(errno));
				return -1;
			}
			line = newline;
			*iline = line; /* the caller must see the moved buffer */
			capacity *= 2;
		}

		if (ferror(fp)) {
			fprintf(stderr, "getl - fgets: %s\n", strerror(errno));
			return -1;
		}
		if (!got_data) /* end of file */
			return -1;

		return (int)len;
	}
146 | 


--------------------------------------------------------------------------------
/src/output.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | 
  5 | #include "global.h"
  6 | 
  7 | /*
  8 | extern char const *progname;
  9 | extern int verbose_flag;
 10 | */
 11 | 
 12 | int QUALITY_SCORE;
 13 | 
 14 | void print_fastq_seq(kseq_t *seq)
 15 | {
 16 | 	char *qual_str = NULL;
 17 | 	int i=0;
 18 | 	int l=strlen(seq->seq.s) + 1;     /* sequence length */
 19 | 	if (seq->qual.s == NULL) {        /* just use a default value for the quality code */
 20 | 		qual_str = (char *)malloc(sizeof(char) * l);
 21 | 		for (i=0;i<l;i++) {
 22 | 			qual_str[i] = (char)QUALITY_SCORE; /* set the value to char '=', ASCII 61 */
 23 | 		}
 24 | 		qual_str[l - 1] = '\0'; /* terminate the string */
 25 | 
 26 | 		if (seq->comment.l == 0)
 27 | 			printf("@%s\n%s\n+\n%s\n", seq->name.s, seq->seq.s, qual_str);
 28 | 		else
 29 | 			printf("@%s %s\n%s\n+\n%s\n", seq->name.s, seq->comment.s, seq->seq.s, qual_str);
 30 | 
 31 | 		free(qual_str);
 32 | 	} else {
 33 | 		if (seq->comment.l == 0)
 34 | 			printf("@%s\n%s\n+\n%s\n",seq->name.s,seq->seq.s,seq->qual.s);
 35 | 		else
 36 | 			printf("@%s %s\n%s\n+\n%s\n",seq->name.s,seq->comment.s,seq->seq.s,seq->qual.s);
 37 | 	}
 38 | }
 39 | 
 40 | void print_fasta_seq(kseq_t *seq, int n)
 41 | {
 42 | 	int l = seq->seq.l;   /* sequence length */
 43 | 	int x,i=0;
 44 | 	char *seqbuf = NULL;
 45 | 	seqbuf = (char *)malloc(sizeof(char) * (n + 1));
 46 | 	if (seqbuf == NULL) {
 47 | 		fprintf(stderr,"print_seq: out of memory for seqbuf!\n");
 48 | 		exit(EXIT_FAILURE);
 49 | 	}
 50 | 
 51 | 	if (n <= 0)
 52 | 		n = 50;
 53 | 
 54 | 	if (l > n) {                  /* seqlength is > column length - split sequence */
 55 | 		if (seq->comment.s == NULL) {
 56 | 			printf(">%s\n",seq->name.s);
 57 | 		} else {
 58 | 			if (seq->comment.l == 0)
 59 | 				printf(">%s\n",seq->name.s);
 60 | 			else
 61 | 				printf(">%s %s\n",seq->name.s, seq->comment.s);
 62 | 		}
 63 | 
 64 | 		for (x=0; x<l;x++) {
 65 | 			if (i < n) {                     /* there's less sequence than the column width */
 66 | 				seqbuf[i] = seq->seq.s[x];
 67 | 				i++;
 68 | 			} else {                         /* i is >= column width, so print this line */
 69 | 				seqbuf[i] = '\0';            /* set last position in string to null */
 70 | 				printf("%s\n",seqbuf);       /* print this line */
 71 | 				i = 0;                       /* reset i */
 72 | 				seqbuf[0] = '\0';            /* reset buffer */
 73 | 				seqbuf[i] = seq->seq.s[x];   /* set this buffer line to current sequence char */
 74 | 				i++;
 75 | 			}
 76 | 		}
 77 | 		if (i<n)
 78 | 			seqbuf[i] = '\0';
 79 | 		if (strlen(seqbuf) > 0)
 80 | 			printf("%s\n",seqbuf);
 81 | 	} else {                     /* seqlength < column length, so just print the full sequence */
 82 | 		if (seq->comment.l == 0)
 83 | 			printf(">%s\n%s\n",seq->name.s,seq->seq.s);
 84 | 		else
 85 | 			printf(">%s %s\n%s\n",seq->name.s,seq->comment.s,seq->seq.s);
 86 | 	}
 87 | 	free(seqbuf);
 88 | }
 89 | 
 90 | void print_fasta(FILE *fp, char *name, char *comment, char *seq, size_t colwidth)
 91 | {
 92 | 	int l = strlen(seq);   /* sequence length */
 93 | 	int x,i=0;
 94 | 	char *seqbuf = NULL;
 95 | 	seqbuf = (char *)malloc(sizeof(char) * (colwidth + 1));
 96 | 	if (seqbuf == NULL) {
 97 | 		fprintf(stderr,"print_seq: out of memory for seqbuf!\n");
 98 | 		exit(EXIT_FAILURE);
 99 | 	}
100 | 
101 | 	if (l > colwidth) {                  /* seqlength is > column length - split sequence */
102 | 		if (comment == NULL)
103 | 			fprintf(fp, ">%s\n",name);
104 | 		else
105 | 			fprintf(fp, ">%s %s\n",name,comment);
106 | 
107 | 		for (x=0; x<l;x++) {
108 | 			if (i < colwidth) {                     /* there's less sequence than the column width */
109 | 				seqbuf[i] = seq[x];
110 | 				i++;
111 | 			} else {                         /* i is >= column width, so print this line */
112 | 				seqbuf[i] = '\0';            /* set last position in string to null */
113 | 				fprintf(fp, "%s\n",seqbuf);       /* print this line */
114 | 				i = 0;                       /* reset i */
115 | 				seqbuf[0] = '\0';            /* reset buffer */
116 | 				seqbuf[i] = seq[x];   /* set this buffer line to current sequence char */
117 | 				i++;
118 | 			}
119 | 		}
120 | 		if (i<colwidth)
121 | 			seqbuf[i] = '\0';
122 | 		if (strlen(seqbuf) > 0)
123 | 			fprintf(fp, "%s\n",seqbuf);
124 | 	} else {                     /* seqlength < column length, so just print the full sequence */
125 | 		if (comment == NULL)
126 | 			fprintf(fp, ">%s\n%s\n",name,seq);
127 | 		else
128 | 			fprintf(fp, ">%s %s\n%s\n",name,comment,seq);
129 | 	}
130 | 	free(seqbuf);
131 | }
132 | 


--------------------------------------------------------------------------------
/test/utest_b.fa:
--------------------------------------------------------------------------------
 1 | >test
 2 | MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPS
 3 | >testa
 4 | MAFSADVLKERRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPS
 5 | >UniRef90_Q6GZX4 Putative transcription factor 001R n=8 Tax=Ranavirus RepID=001R_FRG3G
 6 | MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPS
 7 | EKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLD
 8 | AKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHL
 9 | EKDLVKDFKALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDD
10 | SFRKIYTDLGWKFTPL
11 | >UniRef90_Q6GZX3 Uncharacterized protein 002L n=5 Tax=Ranavirus RepID=002L_FRG3G
12 | MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQTCASGFCTSQPLCAR
13 | IKKTQVCGLRYSSKGKDPLVSAEWDSRGAPYVRCTYDADLIDTQAQVDQFVSMFGESPSL
14 | AERYCMRGVKNTAGELVSRVSSDADPAGGWCRKWYSAHRGPDQDAALGSFCIKNPGAADC
15 | KCINRASDPVYQKVKTLHAYPDQCWYVPCAADVGELKMGTQRDTPTNCPTQVCQIVFNML
16 | DDGSVTMDDVKNTINCDFSKYVPPPPPPKPTPPTPPTPPTPPTPPTPPTPPTPRPVHNRK
17 | VMFFVAGAVLVAILISTVRW
18 | >UniRef90_Q197F8 Uncharacterized protein 002R n=1 Tax=Invertebrate iridescent virus 3 RepID=002R_IIV3
19 | MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWKMNREQALAERYPEL
20 | QTSEPSEDYSGPVESLELLPLEIKLDIMQYLSWEQISWCKHPWLWTRWYKDNVVRVSAIT
21 | FEDFQREYAFPEKIQEIHFTDTRAEEIKAILETTPNVTRLVIRRIDDMNYNTHGDLGLDD
22 | LEFLTHLMVEDACGFTDFWAPSLTHLTIKNLDMHPRWFGPVMDGIKSMQSTLKYLYIFET
23 | YGVNKPFVQWCTDNIETFYCTNSYRYENVPRPIYVWVLFQEDEWHGYRVEDNKFHRRYMY
24 | STILHKRDTDWVENNPLKTPAQVEMYKFLLRISQLNRDGTGYESDSDPENEHFDDESFSS
25 | GEEDSSDEDDPTWAPDSDDSDWETETEEEPSVAARILEKGKLTITNLMKSLGFKPKPKKI
26 | QSIDRYFCSLDSNYNSEDEDFEYDSDSEDDDSDSEDDC
27 | >UniRef90_Q197F7 Uncharacterized protein 003L n=1 Tax=Invertebrate iridescent virus 3 RepID=003L_IIV3
28 | MYQAINPCPQSWYGSPQLEREIVCKMSGAPHYPNYYPVHPNALGGAWFDTSLNARSLTTT
29 | PSLTTCTPPSLAACTPPTSLGMVDSPPHINPPRRIGTLCFDFGSAKSPQRCECVASDRPS
30 | TTSNTAPDTYRLLITNSKTRKNNYGTCRLEPLTYGI
31 | >UniRef90_Q6GZX2 Uncharacterized protein 3R n=8 Tax=Ranavirus RepID=003R_FRG3G
32 | MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVYQMSNILLTERRQVD
33 | RAMGGSDDDGVMVVALSPSDFKTVLGSALLAVERDMVHVVPKYLQTPGILHDMLVLLTPI
34 | FGEALSVDMSGATDVMVQQIATAGFVDVDPLHSSVSWKDNVSCPVALLAVSNAVRTMMGQ
35 | PCQVTLIIDVGTQNILRDLVNLPVEMSGDLQVMAYTKDPLGKVPAVGVSVFDSGSVQKGD
36 | AHSVGAPDGLVSFHTHPVSSAVELNYHAGWPSNVDMSSLLTMKNLMHVVVAEEGLWTMAR
37 | TLSMQRLTKVLTDAEKDVMRAAAFNLFLPLNELRVMGTKDSNNKSLKTYFEVFETFTIGA
38 | LMKHSGVTPTAFVDRRWLDNTIYHMGFIPWGRDMRFVVEYDLDGTNPFLNTVPTLMSVKR
39 | KAKIQEMFDNMVSRMVTS
40 | >UniRef90_Q6GZX1 Uncharacterized protein 004R n=8 Tax=Ranavirus RepID=004R_FRG3G
41 | MNAKYDTDQGVGRMLFLGTIGLAVVVGGLMAYGYYYDGKTPSSGTSFHTASPSFSSRYRY
42 | >UniRef90_Q197F5 Uncharacterized protein 005L n=1 Tax=Invertebrate iridescent virus 3 RepID=005L_IIV3
43 | MRYTVLIALQGALLLLLLIDDGQGQSPYPYPGMPCNSSRQCGLGTCVHSRCAHCSSDGTL
44 | CSPEDPTMVWPCCPESSCQLVVGLPSLVNHYNCLPNQCTDSSQCPGGFGCMTRRSKCELC
45 | KADGEACNSPYLDWRKDKECCSGYCHTEARGLEGVCIDPKKIFCTPKNPWQLAPYPPSYH
46 | QPTTLRPPTSLYDSWLMSGFLVKSTTAPSTQEEEDDY
47 | >UniRef90_Q6GZX0 Uncharacterized protein 005R n=4 Tax=Frog virus 3 RepID=005R_FRG3G
48 | MQNPLPEVMSPEHDKRTTTPMSKEANKFIRELDKKPGDLAVVSDFVKRNTGKRLPIGKRS
49 | NLYVRICDLSGTIYMGETFILESWEELYLPEPTKMEVLGTLESCCGIPPFPEWIVMVGED
50 | QCVYAYGDEEILLFAYSVKQLVEEGIQETGISYKYPDDISDVDEEVLQQDEEIQKIRKKT
51 | REFVDKDAQEFQDFLNSLDASLLS
52 | >UniRef90_Q91G88 Putative KilA-N domain-containing protein 006L n=1 Tax=Invertebrate iridescent virus 6 RepID=006L_IIV6
53 | MDSLNEVCYEQIKGTFYKGLFGDFPLIVDKKTGCFNATKLCVLGGKRFVDWNKTLRSKKL
54 | IQYYETRCDIKTESLLYEIKGDNNDEITKQITGTYLPKEFILDIASWISVEFYDKCNNII
55 | INYFVNEYKTMDKKTLQSKINEVEEKMQKLLNEKEEELQEKNDKIDELILFSKRMEEDRK
56 | KDREMMIKQEKMLRELGIHLEDVSSQNNELIEKVDEQVEQNAVLNFKIDNIQNKLEIAVE
57 | DRAPQPKQNLKRERFILLKRNDDYYPYYTIRAQDINARSALKRQKNLYNEVSVLLDLTCH
58 | PNSKTLYVRVKDELKQKGVVFNLCKVSISNSKINEEELIKAMETINDEKRDV
59 | >UniRef90_Q6GZW9 Uncharacterized protein 006R n=3 Tax=Frog virus 3 RepID=006R_FRG3G
60 | MYKMYFLKDQKFSLSGTIRINDKTQSEYGSVWCPGLSITGLHHDAIDHNMFEEMETEIIE
61 | YLGPWVQAEYRRIKG
62 | >UniRef90_Q6GZW8 Uncharacterized protein 007R n=2 Tax=Frog virus 3 RepID=007R_FRG3G
63 | MRSIKPLRCCNAHGRHVSQEYGRCTLLLFREKLFLQTGLVCNKQCNAPNNDGAESKHHGI
64 | HHGSRGALALRGAGVHLLASAALGPRVLAGLVPTGRSVQGSVGQCGRVAQIGRARDVAAR
65 | KQESYCEK
66 | >UniRef90_Q197F3 Uncharacterized protein 007R n=1 Tax=Invertebrate iridescent virus 3 RepID=007R_IIV3
67 | MEAKNITIDNTTYNFFKFYNINQPLTNLKYLNSERLCFSNAVMGKIVDDASTITITYHRV
68 | YFGISGPKPRQVADLGEYYDVNELLNYDTYTKTQEFAQKYNSLVKPTIDAKNWSGNELVL
69 | LVGNEWYCKTFGKAGSKNVFLYNMIPTIYRDEPQHQEQILKKFMFFNATKNVEQNPNFLD
70 | NVPEEYYHLLLPKSWVEKNLSDKYRKIMETEHKPLVFSCEPAFSFGLCRNTQDKNESYQL
71 | SLCLYEREKPRDAEIVWAAKYDELAAMVRDYLKKTPEFKKYRSFISCMKGLSWKNNEIGD
72 | KDGPKLYPKVIFNRKKGEFVTIFTKDDDVEPETIEDPRTILDRRCVVQAALRLESVFVHN
73 | KVAIQLRINDVLISEWKEASSKPQPLILRRHRFTKPSSSVAKSTSPSLRNSGSDESDLNQ
74 | SDSDKEDERVVPVPKTKRIVKTVKLPN
75 | >UniRef90_Q197F2 Uncharacterized protein 008L n=1 Tax=Invertebrate iridescent virus 3 RepID=008L_IIV3
76 | MSFKVYDPIAELIATQFPTSNPDLQIINNDVLVVSPHKITLPMGPQNAGDVTNKAYVDQA
77 | VMSAAVPVASSTTVGTIQMAGDLEGSSGTNPIIAANKITLNKLQKIGPKMVIGNPNSDWN
78 | NTQEIELDSSFRIVDNRLNAGIVPISSTDPNKSNTVIPAPQQNGLFYLDSSGRVWVWAEH
79 | YYKCITPSRYISKWMGVGDFQELTVGQSVMWDSGRPSIETVSTQGLEVEWISSTNFTLSS
80 | LYLIPIVVKVTICIPLLGQPDQMAKFVLYSVSSAQQPRTGIVLTTDSSRSSAPIVSEYIT
81 | VNWFEPKSYSVQLKEVNSDSGTTVTICSDKWLANPFLDCWITIEEVG
82 | >UniRef90_Q91G85 Uncharacterized protein 009R n=1 Tax=Invertebrate iridescent virus 6 RepID=009R_IIV6
83 | MIKLFCVLAAFISINSACQSSHQQREEFTVATYHSSSICTTYCYSNCVVASQHKGLNVES
84 | YTCDKPDPYGRETVCKCTLIKCHDI
85 | >UniRef90_UPI00029CD601 LOW QUALITY PROTEIN: hypothetical protein OPAG_08414, partial n=1 Tax=Rhodococcus opacus PD630 RepID=UPI00029CD601
86 | MSRATAGRIRSVDQLRPMPNPSEFTPLLVMNPRDRRSPHTVHIALGGVECFARNRSSHIM
87 | KLTTAHNVLDKLTTAHNVLGKLT
88 | 


--------------------------------------------------------------------------------
/src/cmpseq.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include <zlib.h>
  5 | #include <errno.h>
  6 | 
  7 | #include "global.h"
  8 | #include "cmpseq.h"
  9 | #include "output.h"
 10 | #include "seqdiff_results.h"
 11 | 
 12 | __KS_GETC(gzread, BUFFER_SIZE)
 13 | __KS_GETUNTIL(gzread, BUFFER_SIZE)
 14 | __KSEQ_READ(static)
 15 | 
 16 | sd_lookup_t *sd_lookup = NULL;
 17 | 
 18 | /**
 19 |  * cmpseq
 20 |  *
 21 |  * Compare two files of sequences and determine which sequences are
 22 |  * uniq or common to each file.
 23 |  *
 24 |  **/
 25 | void cmpseq(seqdiff_results_t *results) {
 26 | 	gzFile fp;
 27 | 	int l;
 28 | 	kseq_t *seq;
 29 | 	sd_lookup_t *s,*temp;
 30 | 
 31 | 	/* open first sequence file */
 32 | 	fp = gzopen(results->first_file,"r");
 33 | 	if (!fp) {
 34 | 		fprintf(stderr,"%s - Couldn't open fasta file %s\n",progname,results->first_file);
 35 | 		exit(EXIT_FAILURE);
 36 | 	}
 37 | 
 38 | 	seq = kseq_init(fp); /* initialize kseq */
 39 | 	
 40 | 	while((l = kseq_read(seq)) >= 0) {
 41 | 		results->first_file_total++; /* increment first_file_total */
 42 | 		sd_add_seq(seq,1,results->use_header);
 43 | 	}
 44 | 
 45 | 	kseq_destroy(seq);
 46 | 	gzclose(fp); /* done reading file */
 47 | 
 48 | 	/* process second_file */
 49 | 	fp = gzopen(results->second_file,"r");
 50 | 	if (!fp) {
 51 | 		fprintf(stderr,"%s - Couldn't open fasta file %s\n",progname,results->second_file);
 52 | 		exit(EXIT_FAILURE);
 53 | 	}
 54 | 
 55 | 	seq = kseq_init(fp); /* initialize kseq */
 56 | 	
 57 | 	while((l = kseq_read(seq)) >= 0) {
 58 | 		results->second_file_total++; /* increment second_file_total */
 59 | 		sd_add_seq(seq,2,results->use_header);
 60 | 	}
 61 | 
 62 | 	kseq_destroy(seq); /* free kseq struct */
 63 | 	gzclose(fp); /* done reading file */
 64 | 
 65 | 	HASH_ITER(hh, sd_lookup, s, temp) {
 66 | 		if (s->in_a == 1 && s->in_b == 0) {
 67 | 			results->first_file_uniq++;
 68 | 			if (!results->only_summarize)
 69 | 				fprintf(stdout,"%s\t\n",s->name);
 70 | 			/* printing out fasta or fastq????? */
 71 | 			if (results->a_output_fp != NULL)
 72 | 				print_fasta(results->a_output_fp, s->name, s->comment, s->seq, 50);
 73 | 		} else if (s->in_a == 0 && s->in_b == 1) {
 74 | 			results->second_file_uniq++;
 75 | 			if (!results->only_summarize)
 76 | 				fprintf(stdout,"\t%s\n",s->name);
 77 | 			if (results->b_output_fp != NULL)
 78 | 				print_fasta(results->b_output_fp, s->name, s->comment, s->seq, 50);
 79 | 		} else if (s->in_a == 1 && s->in_b == 1) {
 80 | 			results->common++;
 81 | 			if (!results->only_summarize)
 82 | 				fprintf(stdout,"%s\t%s\n",s->name,s->name);
 83 | 			if (results->c_output_fp != NULL) {
 84 | 				print_fasta(results->c_output_fp, s->name, s->comment, s->seq, 50);
 85 | 			}
 86 | 		}
 87 | 	}
 88 | 
 89 | 	sd_delete_hash(); /* free the hash nodes */
 90 | }
 91 | 
 92 | /* hash functions */
 93 | /**
 94 |  * sd_add_seq
 95 |  * checks if the key is in the hash. If yes, just increment count; if
 96 |  * no, add the new entry to the hash.
 97 |  */
 98 | void sd_add_seq(kseq_t *seq, int file, int use_header)
 99 | {
100 | 	sd_lookup_t *s;
101 | 
102 | 	if (use_header)
103 | 		HASH_FIND_STR(sd_lookup,seq->name.s,s);
104 | 	else
105 | 		HASH_FIND_STR(sd_lookup,seq->seq.s,s);
106 | 
107 | 	if (s==NULL) { /* key is not in hash */
108 | 		s = (sd_lookup_t *)malloc(sizeof(sd_lookup_t));
109 | 		if (s == NULL) {
110 | 			fprintf(stderr,"couldn't get memory for sd_lookup_t\n");
111 | 			exit(EXIT_FAILURE);
112 | 		} else {
113 | 			/* initialize struct */
114 | 			s->seq = NULL;
115 | 			s->name = NULL;
116 | 			s->count = 1;
117 | 			s->in_a = 0;
118 | 			s->in_b = 0;
119 | 		}
120 | 
121 | 		s->seq = (char *)malloc((sizeof(char*) * (strlen(seq->seq.s)+1)));
122 | 		if (s->seq == NULL) {
123 | 			fprintf(stderr,"couldn't get memory for seq string\n");
124 | 			exit(EXIT_FAILURE);
125 | 		} else
126 | 			strncpy(s->seq,seq->seq.s,strlen(seq->seq.s)+1);
127 | 
128 | 		s->name = (char *)malloc((sizeof(char*) * (strlen(seq->name.s)+1)));
129 | 		if (s->name == NULL) {
130 | 			fprintf(stderr,"couldn't get memory for name string\n");
131 | 			exit(EXIT_FAILURE);
132 | 		} else
133 | 			strncpy(s->name,seq->name.s,strlen(seq->name.s)+1);
134 | 
135 | 		if (seq->comment.s != NULL) {
136 | 			s->comment = (char *)malloc((sizeof(char*) * (strlen(seq->comment.s)+1)));
137 | 			if (s->comment == NULL) {
138 | 				fprintf(stderr,"couldn't get memory for comment string\n");
139 | 				exit(EXIT_FAILURE);
140 | 			} else
141 | 				strncpy(s->comment,seq->comment.s,strlen(seq->comment.s)+1);
142 | 		} else
143 | 			s->comment = NULL;
144 | 
145 | 		if (file == 1)
146 | 			s->in_a = 1;
147 | 		else if (file == 2)
148 | 			s->in_b = 1;
149 | 		if (use_header)
150 | 			HASH_ADD_KEYPTR( hh, sd_lookup, s->name, strlen(s->name), s );
151 | 		else
152 | 			HASH_ADD_KEYPTR( hh, sd_lookup, s->seq, strlen(s->seq), s );
153 | 	} else {
154 | 		s->count++; /* key is already in the hash, just incr count */
155 | 		if (file == 1)
156 | 			s->in_a = 1;
157 | 		else if (file == 2)
158 | 			s->in_b = 1;
159 | 	}
160 | }
161 | 
162 | sd_lookup_t *sd_find_seq(char *str, int use_header)
163 | {
164 | 	sd_lookup_t *s;
165 | 	HASH_FIND_STR(sd_lookup, str, s);
166 | 	if (s)
167 | 		return s;
168 | 	else
169 | 		return (sd_lookup_t *)NULL;
170 | }
171 | 
172 | void sd_delete_seq(sd_lookup_t *s)
173 | {
174 | 	HASH_DEL(sd_lookup, s);
175 | 	free(s->seq);
176 | 	free(s->name);
177 | 	free(s);
178 | }
179 | 
180 | void sd_delete_hash()
181 | {
182 | 	sd_lookup_t *current_seq, *tmp;
183 | 	HASH_ITER(hh,sd_lookup,current_seq,tmp) {
184 | 		sd_delete_seq(current_seq);
185 | 	}
186 | }
187 | 
188 | int sd_hash_key_count(void)
189 | {
190 | 	sd_lookup_t *s;
191 | 	int count = 0;
192 | 	for(s=sd_lookup;s!=NULL;s=s->hh.next)
193 | 		count++;
194 | 	return(count);
195 | }
196 | 
197 | void sd_print_hash(void)
198 | {
199 | 	sd_lookup_t *s;
200 | 	for(s=sd_lookup;s!=NULL;s=s->hh.next)
201 | 		fprintf(stderr,"name %s\n",s->name);
202 | }
203 | 
204 | 


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
  1 | Summary:
  2 | 
  3 | Software to extract sequences from a fasta or fastq file. It can also
  4 | filter sequences by a minimum or maximum length. Fast, written in C,
  5 | using the kseq.h library.
  6 | 
  7 | 
  8 | Pullseq Summary:
  9 | 
 10 |   pullseq - extract sequences from a fasta/fastq file.  This program is
 11 |   fast, and can be useful in a variety of situations.  You can use it to
 12 |   extract sequences from one fasta/fastq file into a new file, given
 13 |   either a list of header ids to include or a regular expression
 14 |   pattern to match.  Results can be included (default) or excluded,
 15 |   and they can additionally be filtered with minimum / maximum sequence
 16 |   lengths.
 17 | 
 18 |   Additionally, it can convert from fastq to fasta or vice versa and
 19 |   can change the length of the output sequence lines.
 20 | 
 21 |   NOTE: pullseq prints to standard out, so you need to use redirection
 22 |   (e.g. pullseq -i input.fasta -m 10 > output.fasta) to create output files.
 23 | 
 24 | Synopsis:
 25 | 
 26 |  pullseq -i <input fasta/fastq file> -n <header names to select>
 27 | 
 28 |  pullseq -i <input fasta/fastq file> -m <minimum sequence length>
 29 | 
 30 |  pullseq -i <input fasta/fastq file> -g <regex name to match>
 31 | 
 32 |  pullseq -i <input fasta/fastq file> -m <minimum sequence length> -a <max sequence length>
 33 | 
 34 |  pullseq -i <input fasta/fastq file> -t
 35 | 
 36 |  cat <names to select from STDIN> | pullseq -i <input fasta/fastq file> -N
 37 | 
 38 |   Options:
 39 |     -i, --input,       Input fasta/fastq file (required)
 40 |     -n, --names,       File of header id names to search for
 41 |     -N, --names_stdin, Use STDIN for header id names
 42 |     -g, --regex,       Regular expression to match (PERL compatible; always case-insensitive)
 43 |     -m, --min,         Minimum sequence length
 44 |     -a, --max,         Maximum sequence length
 45 |     -l, --length,      Sequence characters per line (default 50)
 46 |     -c, --convert,     Convert input to fastq/fasta (e.g. if input is fastq, output will be fasta)
 47 |     -q, --quality,     ASCII code to use for fasta->fastq quality conversions
 48 |     -e, --excluded,    Exclude the header id names in the list (-n)
 49 |     -t, --count,       Just count the possible output, but don't write it
 50 |     -h, --help,        Display this help and exit
 51 |     -v, --verbose,     Print extra details during the run
 52 |     --version,         Output version information and exit
 53 | 
 54 | =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
 55 | 
 56 | Seqdiff Summary:
 57 |   seqdiff - compare two fasta (or fastq) files to determine overlap of
 58 |   sequences.  This overlap can be at the sequence level (are two
 59 |   sequences exactly the same in both files?) or at the header name
 60 |   level (do two sequences contain the same header name between the two
 61 |   files?).
 62 | 
 63 | Synopsis:
 64 |   seqdiff -1 first_file.fa -2 second_file.fa
 65 | 
 66 | Usage:
 67 |   seqdiff -1 <first input fasta/fastq file> -2 <second fasta/fastq file>
 68 | 
 69 |   Options:
 70 |     -1, --first,      First sequence file (required)
 71 |     -2, --second,     Second sequence file (required)
 72 |     -a, --a_output,   File name for uniques from first file
 73 |     -b, --b_output,   File name for uniques from second file
 74 |     -c, --c_output,   File name for common entries
 75 |     -d, --headers,    Compare headers instead of sequences (default: false)
 76 |     -s, --summary,    Just show summary stats? (default: false)
 77 |     -h, --help,       Display this help and exit
 78 |     -v, --verbose,    Print extra details during the run
 79 |     --version,        Output version information and exit
 80 | 
 81 | REQUIREMENTS:
 82 |   Pullseq/Seqdiff require a C compiler and have been tested to work with
 83 |   either GCC or clang. They also require (and include) kseq.h (Heng
 84 |   Li) and uthash.h (http://troydhanson.github.com/uthash/).
 85 | 
 86 |   kseq.h also requires Zlib (so your linker should be able to handle
 87 |   the '-lz' option).  You can obtain zlib from http://www.zlib.net/
 88 |   or commonly from your OS package manager (e.g. apt-get install
 89 |   zlib1g-dev or emerge zlib).
 90 | 
 91 | NEW INSTALL:
 92 |   Pullseq uses CMake, so you must have CMake installed on your system.
 93 | 
 94 |   git clone https://github.com/bcthomas/pullseq.git
 95 |   cd pullseq
 96 |   mkdir build
 97 |   cd build
 98 |   cmake .. && make
 99 | 
100 |   This will build binaries in build/src/
101 |   > build/src/pullseq
102 |   > build/src/seqdiff
103 | 
104 | 
105 | 
106 | OLD INSTALL:
107 |   To install, do the following in a shell on your system...
108 | 
109 |   From Git:
110 |   git clone https://github.com/bcthomas/pullseq.git # checkout the code using git
111 |   cd pullseq
112 |   ./bootstrap  # get set up for config/build after cloning
113 |   ./configure  # configure the application based on your system
114 |   make         # will build the application
115 |   make install # will install in /usr/local by default
116 | 
117 |   From a Release file (tar or zip):
118 |   tar xvf pullseq_version.tar.gz
119 |   cd pullseq_version
120 |   ./autoconf   # make sure configuration is set
121 |   ./configure  # configure the application based on your system
122 |   make         # will build the application
123 |   make install # will install in /usr/local by default
124 | 
125 |   NOTE: If you have PCRE (perl-compatible regular expression library)
126 |   installed in a non-standard location (e.g. on a mac using brew), the
127 |   ./configure script will fail. You'll need to update your CFLAGS and
128 |   LDFLAGS env settings to define where your PCRE library files were
129 |   installed.
130 | 
131 |   For example, on a mac with pcre installed by brew, you can do this:
132 | 
133 |   pcre-config --cflags
134 |   -I/usr/local/Cellar/pcre/8.39/include
135 |   
136 |   Then you can just add this to an env CFLAGS variable and run the
137 |   configure command, like so...
138 | 
139 |   export CFLAGS="-I/usr/local/Cellar/pcre/8.39/include"
140 |   ./configure
141 | 
142 |   If your pcre library is installed somewhere else, you just update
143 |   the CFLAGS env variable accordingly.
144 | 


--------------------------------------------------------------------------------
/src/bst.c:
--------------------------------------------------------------------------------
  1 | #include <stdlib.h>
  2 | #include <stdio.h>
  3 | #include <string.h>
  4 | 
  5 | #include "global.h"
  6 | #include "bst.h"
  7 | 
  8 | /* create a new node */
  9 | /* return pointer to new node */
 10 | node_t *initnode(char *name)
 11 | {
 12 | 	node_t *newnode;
 13 | 	newnode = (node_t *)malloc(sizeof(node_t));
 14 | 	if (newnode == NULL)
 15 | 		return (node_t *)NULL; /* OOM */
 16 | 	else {
 17 | 		char *newname;
 18 | 		newname = (char *)malloc(strlen(name)+1); /* alloc space for name */
 19 | 		if (!newname)
 20 | 			return (node_t *)NULL; /* oom */
 21 | 		newnode->name = newname;
 22 | 		strcpy(newnode->name, name);
 23 | 
 24 | 		newnode->parent = newnode->left = newnode->right = (node_t *)NULL;
 25 | 		return newnode;
 26 | 	}
 27 | }
 28 | 
 29 | /* simple alphabetic comparison */
 30 | int compare(char *left, char *right)
 31 | {
 32 | 	int result = 0;
 33 | 	result = strcmp(left,right);
 34 | 	if (result == 0)
 35 | 		return 0; /* equal */
 36 | 	else if (result > 0)
 37 | 		return 1; /* gt */
 38 | 	else if (result < 0)
 39 | 		return -1; /* lt */
 40 | 	return result;
 41 | }
 42 | 
 43 | /* find node with this name */
 44 | /* returns node or NULL if not found */
 45 | node_t *searchtree(tree_t *tree, char *name)
 46 | {
 47 | 	int result;
 48 | 	node_t *node = tree->root;
 49 | 
 50 | 	while(node != NULL) {
 51 | 		result = compare(name,node->name);
 52 | 		if (result == 0) {
 53 | 			return node;
 54 | 		} else if (result < 0) {
 55 | 			node = node->left;
 56 | 		} else if (result > 0) {
 57 | 			node = node->right;
 58 | 		} else
 59 | 			break;
 60 | 	}
 61 | 	return (node_t *)NULL;
 62 | }
 63 | 
 64 | /* adds a node to tree */
 65 | /* returns 0 or 1 depending on success/fail */
 66 | int insertnode(tree_t *tree, char *name)
 67 | {
 68 | 	int result;
 69 | 	node_t *node;
 70 | 
 71 | 	if (tree->root == NULL) {
 72 | 		tree->root = initnode(name);
 73 | 	} else {
 74 | 		node = tree->root;
 75 | 		while(1) {
 76 | 			result = compare(name,node->name);
 77 | 
 78 | 			if (result == 0)
 79 | 				return 0; /* return fail, since data is already in tree */
 80 | 			else if (result < 1) { /*left*/
 81 | 				if (node->left == NULL)
 82 | 					break;
 83 | 				else
 84 | 					node = node->left;
 85 | 			} else { /*right*/
 86 | 				if (node->right == NULL)
 87 | 					break;
 88 | 				else
 89 | 					node = node->right;
 90 | 			}
 91 | 		} /* while(1) */
 92 | 
 93 | 		/* we've broken from the loop, so we have a NULL leaf*/
 94 | 		if (result < 1) {
 95 | 			node->left = initnode(name);
 96 | 			node->left->parent = node;
 97 | 		} else {
 98 | 			node->right = initnode(name);
 99 | 			node->right->parent = node;
100 | 		}
101 | 	}
102 | 	return 1;
103 | }
104 | 
105 | /* delete node from tree */
106 | int deletenode(tree_t *tree, char *name)
107 | {
108 | 	if (tree->root != NULL) {
109 | 		node_t head = {NULL,NULL,NULL,NULL};
110 | 		node_t *node = &head;
111 | 		node_t *t = NULL;
112 | 		int result;
113 | 
114 | 		node->right = tree->root; /* point the head node at the tree top */
115 | 		tree->root->parent = &head; /* also point the tree root's head */
116 | 	
117 | 		/* walk the tree, looking for data to delete */
118 | 		while(1) {
119 | 			result = compare(name,node->name);
120 | 
121 | 			if (result == 0) /* found the node to delete */
122 | 				break;
123 | 			else if (result < 1) { /*left*/
124 | 				if (node->left == NULL)
125 | 					return 0; /* not found */
126 | 				else
127 | 					node = node->left;
128 | 			} else { /*right*/
129 | 				if (node->right == NULL)
130 | 					return 0; /* not found */
131 | 				else
132 | 					node = node->right;
133 | 			}
134 | 		} /* while(1) */
135 | 
136 | 
137 | 		/* if we found matching name, f is pointing to the matching
138 | 		 * node */
139 | 		if (node != NULL) {
140 | 			if (node->left != NULL && node->right != NULL) { /* two children */
141 | 				t = node->right;
142 | 				while (t->left != NULL) {
143 | 					t = t->left;
144 | 				}
145 | 				t->parent = node->parent;
146 | 				if (node->parent->right == node)
147 | 					node->parent->right = t;
148 | 				else
149 | 					node->parent->left = t;
150 | 				free(node);
151 | 			} else if (node->left == NULL && node->right == NULL) {
152 | 				/* leaf */
153 | 				free(node);
154 | 			} else if (node->left == NULL) {
155 | 				/* set right */
156 | 				t = node->right; /* temp copy of right node */
157 | 				node->right->parent = node->parent;
158 | 				node->parent->right = t;
159 | 				free(node);
160 | 			} else {
161 | 				/* set set left */
162 | 				t = node->left; /* temp copy of left node */
163 | 				node->left->parent = node->parent;
164 | 				node->parent->left = t;
165 | 				free(node);
166 | 			}
167 | 		}
168 | 	}
169 | 	/* tree->root is NULL, so just return */
170 | 	return 1;
171 | }
172 | 
173 | /* print from node down, inorder */
174 | void print_inorder(node_t *node)
175 | {
176 | 	if (node == NULL)
177 | 		return;
178 | 	print_inorder(node->left);
179 | 	fprintf(stderr,"%s\n",node->name);
180 | 	print_inorder(node->right);
181 | 	return;
182 | }
183 | 
184 | /* print from node down, preorder */
185 | void print_preorder(node_t *node)
186 | {
187 | 	if (node == NULL)
188 | 		return;
189 | 	fprintf(stderr,"%s\n",node->name);
190 | 	print_preorder(node->left);
191 | 	print_preorder(node->right);
192 | 	return;
193 | }
194 | 
195 | /* print from node down, postorder */
196 | void print_postorder(node_t *node)
197 | {
198 | 	if (node == NULL)
199 | 		return;
200 | 	print_postorder(node->left);
201 | 	print_postorder(node->right);
202 | 	fprintf(stderr,"%s\n",node->name);
203 | 	return;
204 | }
205 | 
206 | void deletetreenode(node_t *node)
207 | {
208 | 	while(1) {
209 | 		if (node == NULL)
210 | 			break;
211 | 		else if (node->left != NULL) {
212 | 			deletetreenode(node->left);
213 | 			node->left = NULL;
214 | 		} else if (node->right != NULL) {
215 | 			deletetreenode(node->right);
216 | 			node->right = NULL;
217 | 		} else {
218 | 			free(node->name);
219 | 			free(node);
220 | 			return;
221 | 		}
222 | 	}
223 | 	return;
224 | }
225 | 
226 | /* chuck the whole list */
227 | void deletetree(tree_t *tree)
228 | {
229 | 	if (tree->root == NULL) {
230 | 		return;
231 | 	} else {
232 | 		deletetreenode(tree->root);
233 | 	}
234 | 	free(tree);
235 | 	return;
236 | }
237 | 


--------------------------------------------------------------------------------
/test/utest_a.fa:
--------------------------------------------------------------------------------
  1 | >test
  2 | MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPS
  3 | >testa
  4 | MAFSADVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPS
  5 | >UniRef90_Q6GZX4 Putative transcription factor 001R n=8 Tax=Ranavirus RepID=001R_FRG3G
  6 | MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPS
  7 | EKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLD
  8 | AKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHL
  9 | EKDLVKDFKALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDD
 10 | SFRKIYTDLGWKFTPL
 11 | >UniRef90_Q6GZX3 Uncharacterized protein 002L n=5 Tax=Ranavirus RepID=002L_FRG3G
 12 | MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQTCASGFCTSQPLCAR
 13 | IKKTQVCGLRYSSKGKDPLVSAEWDSRGAPYVRCTYDADLIDTQAQVDQFVSMFGESPSL
 14 | AERYCMRGVKNTAGELVSRVSSDADPAGGWCRKWYSAHRGPDQDAALGSFCIKNPGAADC
 15 | KCINRASDPVYQKVKTLHAYPDQCWYVPCAADVGELKMGTQRDTPTNCPTQVCQIVFNML
 16 | DDGSVTMDDVKNTINCDFSKYVPPPPPPKPTPPTPPTPPTPPTPPTPPTPPTPRPVHNRK
 17 | VMFFVAGAVLVAILISTVRW
 18 | >UniRef90_Q197F8 Uncharacterized protein 002R n=1 Tax=Invertebrate iridescent virus 3 RepID=002R_IIV3
 19 | MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWKMNREQALAERYPEL
 20 | QTSEPSEDYSGPVESLELLPLEIKLDIMQYLSWEQISWCKHPWLWTRWYKDNVVRVSAIT
 21 | FEDFQREYAFPEKIQEIHFTDTRAEEIKAILETTPNVTRLVIRRIDDMNYNTHGDLGLDD
 22 | LEFLTHLMVEDACGFTDFWAPSLTHLTIKNLDMHPRWFGPVMDGIKSMQSTLKYLYIFET
 23 | YGVNKPFVQWCTDNIETFYCTNSYRYENVPRPIYVWVLFQEDEWHGYRVEDNKFHRRYMY
 24 | STILHKRDTDWVENNPLKTPAQVEMYKFLLRISQLNRDGTGYESDSDPENEHFDDESFSS
 25 | GEEDSSDEDDPTWAPDSDDSDWETETEEEPSVAARILEKGKLTITNLMKSLGFKPKPKKI
 26 | QSIDRYFCSLDSNYNSEDEDFEYDSDSEDDDSDSEDDC
 27 | >UniRef90_Q197F7 Uncharacterized protein 003L n=1 Tax=Invertebrate iridescent virus 3 RepID=003L_IIV3
 28 | MYQAINPCPQSWYGSPQLEREIVCKMSGAPHYPNYYPVHPNALGGAWFDTSLNARSLTTT
 29 | PSLTTCTPPSLAACTPPTSLGMVDSPPHINPPRRIGTLCFDFGSAKSPQRCECVASDRPS
 30 | TTSNTAPDTYRLLITNSKTRKNNYGTCRLEPLTYGI
 31 | >UniRef90_Q6GZX2 Uncharacterized protein 3R n=8 Tax=Ranavirus RepID=003R_FRG3G
 32 | MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVYQMSNILLTERRQVD
 33 | RAMGGSDDDGVMVVALSPSDFKTVLGSALLAVERDMVHVVPKYLQTPGILHDMLVLLTPI
 34 | FGEALSVDMSGATDVMVQQIATAGFVDVDPLHSSVSWKDNVSCPVALLAVSNAVRTMMGQ
 35 | PCQVTLIIDVGTQNILRDLVNLPVEMSGDLQVMAYTKDPLGKVPAVGVSVFDSGSVQKGD
 36 | AHSVGAPDGLVSFHTHPVSSAVELNYHAGWPSNVDMSSLLTMKNLMHVVVAEEGLWTMAR
 37 | TLSMQRLTKVLTDAEKDVMRAAAFNLFLPLNELRVMGTKDSNNKSLKTYFEVFETFTIGA
 38 | LMKHSGVTPTAFVDRRWLDNTIYHMGFIPWGRDMRFVVEYDLDGTNPFLNTVPTLMSVKR
 39 | KAKIQEMFDNMVSRMVTS
 40 | >UniRef90_Q6GZX1 Uncharacterized protein 004R n=8 Tax=Ranavirus RepID=004R_FRG3G
 41 | MNAKYDTDQGVGRMLFLGTIGLAVVVGGLMAYGYYYDGKTPSSGTSFHTASPSFSSRYRY
 42 | >UniRef90_Q197F5 Uncharacterized protein 005L n=1 Tax=Invertebrate iridescent virus 3 RepID=005L_IIV3
 43 | MRYTVLIALQGALLLLLLIDDGQGQSPYPYPGMPCNSSRQCGLGTCVHSRCAHCSSDGTL
 44 | CSPEDPTMVWPCCPESSCQLVVGLPSLVNHYNCLPNQCTDSSQCPGGFGCMTRRSKCELC
 45 | KADGEACNSPYLDWRKDKECCSGYCHTEARGLEGVCIDPKKIFCTPKNPWQLAPYPPSYH
 46 | QPTTLRPPTSLYDSWLMSGFLVKSTTAPSTQEEEDDY
 47 | >UniRef90_Q6GZX0 Uncharacterized protein 005R n=4 Tax=Frog virus 3 RepID=005R_FRG3G
 48 | MQNPLPEVMSPEHDKRTTTPMSKEANKFIRELDKKPGDLAVVSDFVKRNTGKRLPIGKRS
 49 | NLYVRICDLSGTIYMGETFILESWEELYLPEPTKMEVLGTLESCCGIPPFPEWIVMVGED
 50 | QCVYAYGDEEILLFAYSVKQLVEEGIQETGISYKYPDDISDVDEEVLQQDEEIQKIRKKT
 51 | REFVDKDAQEFQDFLNSLDASLLS
 52 | >UniRef90_Q91G88 Putative KilA-N domain-containing protein 006L n=1 Tax=Invertebrate iridescent virus 6 RepID=006L_IIV6
 53 | MDSLNEVCYEQIKGTFYKGLFGDFPLIVDKKTGCFNATKLCVLGGKRFVDWNKTLRSKKL
 54 | IQYYETRCDIKTESLLYEIKGDNNDEITKQITGTYLPKEFILDIASWISVEFYDKCNNII
 55 | INYFVNEYKTMDKKTLQSKINEVEEKMQKLLNEKEEELQEKNDKIDELILFSKRMEEDRK
 56 | KDREMMIKQEKMLRELGIHLEDVSSQNNELIEKVDEQVEQNAVLNFKIDNIQNKLEIAVE
 57 | DRAPQPKQNLKRERFILLKRNDDYYPYYTIRAQDINARSALKRQKNLYNEVSVLLDLTCH
 58 | PNSKTLYVRVKDELKQKGVVFNLCKVSISNSKINEEELIKAMETINDEKRDV
 59 | >UniRef90_Q6GZW9 Uncharacterized protein 006R n=3 Tax=Frog virus 3 RepID=006R_FRG3G
 60 | MYKMYFLKDQKFSLSGTIRINDKTQSEYGSVWCPGLSITGLHHDAIDHNMFEEMETEIIE
 61 | YLGPWVQAEYRRIKG
 62 | >UniRef90_Q6GZW8 Uncharacterized protein 007R n=2 Tax=Frog virus 3 RepID=007R_FRG3G
 63 | MRSIKPLRCCNAHGRHVSQEYGRCTLLLFREKLFLQTGLVCNKQCNAPNNDGAESKHHGI
 64 | HHGSRGALALRGAGVHLLASAALGPRVLAGLVPTGRSVQGSVGQCGRVAQIGRARDVAAR
 65 | KQESYCEK
 66 | >UniRef90_Q197F3 Uncharacterized protein 007R n=1 Tax=Invertebrate iridescent virus 3 RepID=007R_IIV3
 67 | MEAKNITIDNTTYNFFKFYNINQPLTNLKYLNSERLCFSNAVMGKIVDDASTITITYHRV
 68 | YFGISGPKPRQVADLGEYYDVNELLNYDTYTKTQEFAQKYNSLVKPTIDAKNWSGNELVL
 69 | LVGNEWYCKTFGKAGSKNVFLYNMIPTIYRDEPQHQEQILKKFMFFNATKNVEQNPNFLD
 70 | NVPEEYYHLLLPKSWVEKNLSDKYRKIMETEHKPLVFSCEPAFSFGLCRNTQDKNESYQL
 71 | SLCLYEREKPRDAEIVWAAKYDELAAMVRDYLKKTPEFKKYRSFISCMKGLSWKNNEIGD
 72 | KDGPKLYPKVIFNRKKGEFVTIFTKDDDVEPETIEDPRTILDRRCVVQAALRLESVFVHN
 73 | KVAIQLRINDVLISEWKEASSKPQPLILRRHRFTKPSSSVAKSTSPSLRNSGSDESDLNQ
 74 | SDSDKEDERVVPVPKTKRIVKTVKLPN
 75 | >UniRef90_Q197F2 Uncharacterized protein 008L n=1 Tax=Invertebrate iridescent virus 3 RepID=008L_IIV3
 76 | MSFKVYDPIAELIATQFPTSNPDLQIINNDVLVVSPHKITLPMGPQNAGDVTNKAYVDQA
 77 | VMSAAVPVASSTTVGTIQMAGDLEGSSGTNPIIAANKITLNKLQKIGPKMVIGNPNSDWN
 78 | NTQEIELDSSFRIVDNRLNAGIVPISSTDPNKSNTVIPAPQQNGLFYLDSSGRVWVWAEH
 79 | YYKCITPSRYISKWMGVGDFQELTVGQSVMWDSGRPSIETVSTQGLEVEWISSTNFTLSS
 80 | LYLIPIVVKVTICIPLLGQPDQMAKFVLYSVSSAQQPRTGIVLTTDSSRSSAPIVSEYIT
 81 | VNWFEPKSYSVQLKEVNSDSGTTVTICSDKWLANPFLDCWITIEEVG
 82 | >UniRef90_Q6GZW6 Putative helicase 009L n=9 Tax=Ranavirus RepID=009L_FRG3G
 83 | MDTSPYDFLKLYPWLSRGEADKGTLLDAFPGETFEQSLASDVAMRRAVQDDPAFGHQKLV
 84 | ETFLSEDTPYRELLLFHAPGTGKTCTVVSVAERAKEKGLTRGCIVLARGAALLRNFLHEL
 85 | VFNCGTGGRYIPEGYADMGDQERTRKMRKAVSSYYQFRTYETFAKSVATMSAEAIRARYD
 86 | RFVIVMDEVHHLRSVQAEGVNTYSAISRFLRTVRGCVKMLLTGTPMTNEPGELADVLNLI
 87 | LPQDKTIRPEDGIFSNSGDLLKPDELAERVRGRVSYLKAARPDAGLTFAGEVLGGTGMTH
 88 | LRLVRLEMSAFQSDAYASAWDQDAGDRNIFSNSRQCSLAVMPDRRWGSAAEARNPSQVRR
 89 | MAGQNLAEYSVKYDYLVRVASSSPKTFAYCEYVNGSGLSLLSDILLANGWRRATGRETTP
 90 | GKRFALLTASQKNIHKIVQRFNHEDNVDGAYISLLLGSRVVAEGLTFKEVRHTVILTPHW
 91 | NYTETAQAIARSWRAGSHDRLKARGEAVAVTVHRLVAVPRGRDTPRSIDSDMYAVSEVKD
 92 | KRIKAVERILMTSAADCSLLRSRNLYPSEFDGSRECEYGRCAYRCSNVSVEPGPLPALLG
 93 | ASAAEAVAQVRLDGGGDPAIMKVDMSTLWAEVTAGRRYVNRWGDGAVLRAEGGRLELSAP
 94 | YGSSEEGRWGDFYKTRNLCYAKMDQDHLRADDLRDSLPQEVEELLTVSPVETIGETASAM
 95 | PQEVATAILMACVQARADGKTLNVVRRDALLDFYKGFYAMGPSGWTVWLHARGANAKVYD
 96 | GRRWNPADEDTLEFLAARSAKFTDTRIGYYGLYNPNLKDFCIRDVTQGKRDKVDLRKLTV
 97 | GRRCVDWDQRTLVHIVARLMKIDGRRDFMPHATLREMRELAEQDPLHEPSDLTSKEACRR
 98 | FLFWTQKGDNKFRRQDICKAMEKWFIENDLMEDNFDCGHQHKRRGKFA
 99 | >UniRef90_Q91G85 Uncharacterized protein 009R n=1 Tax=Invertebrate iridescent virus 6 RepID=009R_IIV6
100 | MIKLFCVLAAFISINSACQSSHQQREEFTVATYHSSSICTTYCYSNCVVASQHKGLNVES
101 | YTCDKPDPYGRETVCKCTLIKCHDI
102 | 


--------------------------------------------------------------------------------
/src/seqdiff.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include <zlib.h>
  5 | #include <errno.h>
  6 | #include <getopt.h>
  7 | 
  8 | #include "global.h"
  9 | #include "cmpseq.h"
 10 | #include "seqdiff.h"
 11 | 
 12 | int verbose_flag;
 13 | char const *progname;
 14 | 
 15 | #define VERSION 0.1
 16 | 
 17 | void show_usage(int status) {
 18 | 	fprintf(stderr, "seqdiff - a bioinformatics tool for comparing sequences in two files\n");
 19 | 	fprintf(stderr, "\n(Written by bct - 2013)     ");
 20 | 	/*  fprintf(stderr, "search method: %s",PULLSEQ_SORTMETHOD); */
 21 | 	fprintf(stderr, "\nUsage:\n"
 22 | 			"%s -1 <first input fasta/fastq file> -2 <second fasta/fastq file>\n\n", progname);
 23 | 	fprintf(stderr, "  Options:\n"
 24 | 			"    -1, --first,      First sequence file (required)\n"
 25 | 			"    -2, --second,     Second sequence file (required)\n");
 26 | 	fprintf(stderr, "    -a, --a_output,   File name for uniques from first file\n"
 27 | 			"    -b, --b_output,   File name for uniques from second file\n"
 28 | 			"    -c, --c_output,   File name for common entries\n");
 29 | 
 30 | 	fprintf(stderr, "    -d, --headers,    Compare headers instead of sequences (default: false)\n"
 31 | 			"    -s, --summary, Just show summary stats? (default: false)\n");
 32 | 	fprintf(stderr, "    -h, --help,       Display this help and exit\n"
 33 | 			"    -v, --verbose,    Print extra details during the run\n"
 34 | 			"    --version,        Output version information and exit\n\n");
 35 | 	exit(status);
 36 | }
 37 | 
 38 | int main(int argc, char *argv[]) {
 39 | 	int c;                          /* character for getopt processing */
 40 | 	/* command argument variables */
 41 | 	char *first_file = NULL;
 42 | 	char *second_file = NULL;
 43 | 	char *a_output_file = NULL;
 44 | 	char *b_output_file = NULL;
 45 | 	char *c_output_file = NULL;
 46 | 	FILE *a_output_fp = NULL;
 47 | 	FILE *b_output_fp = NULL;
 48 | 	FILE *c_output_fp = NULL;
 49 | 	int use_header = 0;
 50 | 	int only_summarize = 0;
 51 | 
 52 | 	/* internal variables */
 53 | 	seqdiff_results_t *results;
 54 | 	results = seqdiff_results_init();
 55 | 
 56 | 	extern char *optarg; /* external from getopt */
 57 | 
 58 | 	verbose_flag = 0; /* assume not verbose */
 59 | 	progname = argv[0]; /* capture the program name */
 60 | 	if (argc < 2) {
 61 | 		show_usage(EXIT_FAILURE);
 62 | 	}
 63 | 
 64 | 	while(1) {
 65 | 		static struct option long_options[] =
 66 | 		{
 67 | 			{"verbose",    no_argument, 0, 'v'},
 68 | 			{"version",    no_argument, 0, 'V'},
 69 | 			{"help",       no_argument, 0, 'h'},
 70 | 			{"summary",    no_argument, 0, 's'},
 71 | 			{"headers",    no_argument, 0, 'd'},
 72 | 			{"first",      required_argument, 0, '1'},
 73 | 			{"second",     required_argument, 0, '2'},
 74 | 			{"a_output",   required_argument, 0, 'a'},
 75 | 			{"b_output",   required_argument, 0, 'b'},
 76 | 			{"c_output",   required_argument, 0, 'c'},
 77 | 			{0, 0, 0, 0}
 78 | 		};
 79 | 
 80 | 		/* getopt_long stores the option index here. */
 81 | 		int option_index = 0;
 82 | 
 83 | 		c = getopt_long(argc, argv, "vVh?sd1:2:a:b:c:", long_options, &option_index);
 84 | 
 85 | 		/* Detect the end of the options. */
 86 | 		if (c == -1)
 87 | 			break;
 88 | 
 89 | 		switch (c) {
 90 | 			case 'v':
 91 | 				verbose_flag = 1;
 92 | 				break;
 93 | 
 94 | 			case 'V':
 95 | 				/* version */
 96 | 				printf("Version is %f\n",VERSION);
 97 | 				break;
 98 | 
 99 | 			case 'h':
100 | 				show_usage(EXIT_FAILURE);
101 | 				break;
102 | 
103 | 			case '?':
104 | 				/* getopt_long already printed an error message. */
105 | 				break;
106 | 
107 | 			case 's':
108 | 				only_summarize = 1;
109 | 				break;
110 | 
111 | 			case 'd':
112 | 				use_header = 1;
113 | 				break;
114 | 
115 | 			case '1':
116 | 				first_file = (char*) malloc(strlen(optarg)+1);
117 | 				strcpy(first_file,optarg);
118 | 				break;
119 | 
120 | 			case '2':
121 | 				second_file = (char*) malloc(strlen(optarg)+1);
122 | 				strcpy(second_file,optarg);
123 | 				break;
124 | 
125 | 			case 'a':
126 | 				a_output_file = (char*) malloc(strlen(optarg)+1);
127 | 				strcpy(a_output_file,optarg);
128 | 				break;
129 | 
130 | 			case 'b':
131 | 				b_output_file = (char*) malloc(strlen(optarg)+1);
132 | 				strcpy(b_output_file,optarg);
133 | 				break;
134 | 
135 | 			case 'c':
136 | 				c_output_file = (char*) malloc(strlen(optarg)+1);
137 | 				strcpy(c_output_file,optarg);
138 | 				break;
139 | 
140 | 			default:
141 | 				abort ();
142 | 		}
143 | 	}
144 | 
145 | 	/* Instead of reporting '--verbose'
146 | 	   and '--brief' as they are encountered,
147 | 	   we report the final status resulting from them. */
148 | 	if (verbose_flag) {
149 | 		fprintf(stderr, "verbose flag is set\n");
150 | 		fprintf(stderr,"First file is %s\n", first_file);
151 | 		fprintf(stderr,"Second file is %s\n", second_file);
152 | 		if (a_output_file != NULL && a_output_file != NULL && a_output_file != NULL) {
153 | 			fprintf(stderr, "Output will be written to files:\n");
154 | 			fprintf(stderr, "  first file uniques: %s\n", a_output_file);
155 | 			fprintf(stderr, "  second file uniques: %s\n", b_output_file);
156 | 			fprintf(stderr, "  common to both input files: %s\n", c_output_file);
157 | 		} else
158 | 			fprintf(stderr,"No output files will be generated\n");
159 | 
160 | 		if (use_header)
161 | 			fprintf(stderr,"Processing will be done using headers, not sequences\n");
162 | 		else
163 | 			fprintf(stderr,"Processing will be done using sequences\n");
164 | 		if (only_summarize)
165 | 			fprintf(stderr,"Only showing summary information\n");
166 | 	}
167 | 
168 | 	/* check validity of given argument combination */
169 | 	if (! first_file) {
170 | 		fprintf (stderr, "Error: First sequence file is required.\n");
171 | 		return EXIT_FAILURE;
172 | 	}
173 | 
174 | 	if (! second_file) {
175 | 		fprintf (stderr, "Error: First sequence file is required.\n");
176 | 		return EXIT_FAILURE;
177 | 	}
178 | 
179 | 	results->first_file = first_file;
180 | 	results->second_file = second_file;
181 | 	results->use_header = use_header;
182 | 	results->only_summarize = only_summarize;
183 | 	if (a_output_file != NULL && b_output_file != NULL && c_output_file != NULL) {
184 | 		results->a_output_fp = fopen(a_output_file,"w+");
185 | 		if (!results->a_output_fp) {
186 | 			fprintf(stderr,"%s - failed to open file %s\n",progname,a_output_file);
187 | 			exit(EXIT_FAILURE);
188 | 		}
189 | 		results->b_output_fp = fopen(b_output_file,"w+");
190 | 		if (!results->b_output_fp) {
191 | 			fprintf(stderr,"%s - failed to open file %s\n",progname,b_output_file);
192 | 			exit(EXIT_FAILURE);
193 | 		}
194 | 		results->c_output_fp = fopen(c_output_file,"w+");
195 | 		if (!results->c_output_fp) {
196 | 			fprintf(stderr,"%s - failed to open file %s\n",progname,c_output_file);
197 | 			exit(EXIT_FAILURE);
198 | 		}
199 | 	}
200 | 
201 | 	/* do the comparison */
202 | 	cmpseq(results);
203 | 
204 | 	if (a_output_file != NULL && b_output_file != NULL && c_output_file != NULL) {
205 | 		fclose(results->a_output_fp);
206 | 		fclose(results->b_output_fp);
207 | 		fclose(results->c_output_fp);
208 | 	}
209 | 
210 | 	/* report results */
211 | 	fprintf(stderr, " first_file_total = %d\n", results->first_file_total);
212 | 	fprintf(stderr, "  first_file_uniq = %d\n", results->first_file_uniq);
213 | 	fprintf(stderr, "second_file_total = %d\n", results->second_file_total);
214 | 	fprintf(stderr, " second_file_uniq = %d\n", results->second_file_uniq);
215 | 	fprintf(stderr, "           common = %d\n",           results->common);
216 | 
217 | 	free(first_file);
218 | 	free(second_file);
219 | 	if (a_output_file != NULL)
220 | 		free(a_output_file);
221 | 	if (b_output_file != NULL)
222 | 		free(b_output_file);
223 | 	if (c_output_file != NULL)
224 | 		free(c_output_file);
225 | 
226 | 	/* clean up */
227 | 	seqdiff_results_destroy(results);
228 | 	fclose(stderr);
229 | 	fclose(stdout);
230 | 	fclose(stdin);
231 | 	return EXIT_SUCCESS;
232 | }
233 | 


--------------------------------------------------------------------------------
/src/kseq.h:
--------------------------------------------------------------------------------
  1 | /* The MIT License
  2 | 
  3 |    Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
  4 | 
  5 |    Permission is hereby granted, free of charge, to any person obtaining
  6 |    a copy of this software and associated documentation files (the
  7 |    "Software"), to deal in the Software without restriction, including
  8 |    without limitation the rights to use, copy, modify, merge, publish,
  9 |    distribute, sublicense, and/or sell copies of the Software, and to
 10 |    permit persons to whom the Software is furnished to do so, subject to
 11 |    the following conditions:
 12 | 
 13 |    The above copyright notice and this permission notice shall be
 14 |    included in all copies or substantial portions of the Software.
 15 | 
 16 |    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 17 |    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 18 |    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 19 |    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 20 |    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 21 |    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 22 |    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 |    SOFTWARE.
 24 | */
 25 | 
 26 | /* Last Modified: 05MAR2012 */
 27 | 
 28 | #ifndef AC_KSEQ_H
 29 | #define AC_KSEQ_H
 30 | 
 31 | #include <ctype.h>
 32 | #include <string.h>
 33 | #include <stdlib.h>
 34 | 
 35 | #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
 36 | #define KS_SEP_TAB   1 // isspace() && !' '
 37 | #define KS_SEP_LINE  2 // line separator: "\n" (Unix) or "\r\n" (Windows)
 38 | #define KS_SEP_MAX   2
 39 | 
/*
 * Declares kstream_t, a buffered reader over a handle of type `type_t`.
 *   buf        - read buffer (allocated in ks_init)
 *   begin, end - index of the next unread byte and end of valid data in buf
 *   is_eof     - set once the read callback has reported no more data
 *   f          - the underlying handle passed to the read callback
 */
#define __KS_TYPE(type_t)						\
	typedef struct __kstream_t {				\
		unsigned char *buf;						\
		int begin, end, is_eof;					\
		type_t f;								\
	} kstream_t;
 46 | 
/* ks_eof: true once the source is exhausted AND the buffer is fully consumed. */
#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
/* ks_rewind: reset the stream's buffer state and EOF flag to zero
 * (does not reposition the underlying handle). */
#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
 49 | 
 50 | #define __KS_BASIC(type_t, __bufsize)								\
 51 | 	static inline kstream_t *ks_init(type_t f)						\
 52 | 	{																\
 53 | 		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t));	\
 54 | 		ks->f = f;													\
 55 | 		ks->buf = (unsigned char*)malloc(__bufsize);				\
 56 | 		return ks;													\
 57 | 	}																\
 58 | 	static inline void ks_destroy(kstream_t *ks)					\
 59 | 	{																\
 60 | 		if (ks) {													\
 61 | 			free(ks->buf);											\
 62 | 			free(ks);												\
 63 | 		}															\
 64 | 	}
 65 | 
 66 | #define __KS_GETC(__read, __bufsize)						\
 67 | 	static inline int ks_getc(kstream_t *ks)				\
 68 | 	{														\
 69 | 		if (ks->is_eof && ks->begin >= ks->end) return -1;	\
 70 | 		if (ks->begin >= ks->end) {							\
 71 | 			ks->begin = 0;									\
 72 | 			ks->end = __read(ks->f, ks->buf, __bufsize);	\
 73 | 			if (ks->end == 0) { ks->is_eof = 1; return -1;}	\
 74 | 		}													\
 75 | 		return (int)ks->buf[ks->begin++];					\
 76 | 	}
 77 | 
/*
 * kstring_t: growable character buffer.
 *   l - number of bytes currently stored (excluding the terminating NUL)
 *   m - number of bytes allocated for s
 *   s - the buffer itself (NULL until first use)
 * Guarded so another header may provide its own definition first.
 */
#ifndef KSTRING_T
#define KSTRING_T kstring_t
typedef struct __kstring_t {
	size_t l, m;
	char *s;
} kstring_t;
#endif
 85 | 
/*
 * kroundup32(x): round x up, in place, to the next power of two by
 * smearing the highest set bit of (x - 1) into all lower bits and adding
 * one.  The shifts stop at 16, so this covers 32 bits' worth of value.
 * x is evaluated and modified several times: pass a plain lvalue only.
 */
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
 89 | 
 90 | #define __KS_GETUNTIL(__read, __bufsize)								\
 91 | 	static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
 92 | 	{																	\
 93 | 		int gotany = 0;													\
 94 | 		if (dret) *dret = 0;											\
 95 | 		str->l = append? str->l : 0;									\
 96 | 		for (;;) {														\
 97 | 			int i;														\
 98 | 			if (ks->begin >= ks->end) {									\
 99 | 				if (!ks->is_eof) {										\
100 | 					ks->begin = 0;										\
101 | 					ks->end = __read(ks->f, ks->buf, __bufsize);		\
102 | 					if (ks->end == 0) { ks->is_eof = 1; break; }		\
103 | 				} else break;											\
104 | 			}															\
105 | 			if (delimiter == KS_SEP_LINE) { \
106 | 				for (i = ks->begin; i < ks->end; ++i) \
107 | 					if (ks->buf[i] == '\n') break; \
108 | 			} else if (delimiter > KS_SEP_MAX) {						\
109 | 				for (i = ks->begin; i < ks->end; ++i)					\
110 | 					if (ks->buf[i] == delimiter) break;					\
111 | 			} else if (delimiter == KS_SEP_SPACE) {						\
112 | 				for (i = ks->begin; i < ks->end; ++i)					\
113 | 					if (isspace(ks->buf[i])) break;						\
114 | 			} else if (delimiter == KS_SEP_TAB) {						\
115 | 				for (i = ks->begin; i < ks->end; ++i)					\
116 | 					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
117 | 			} else i = 0; /* never come to here! */						\
118 | 			if (str->m - str->l < (size_t)(i - ks->begin + 1)) {		\
119 | 				str->m = str->l + (i - ks->begin) + 1;					\
120 | 				kroundup32(str->m);										\
121 | 				str->s = (char*)realloc(str->s, str->m);				\
122 | 			}															\
123 | 			gotany = 1;													\
124 | 			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
125 | 			str->l = str->l + (i - ks->begin);							\
126 | 			ks->begin = i + 1;											\
127 | 			if (i < ks->end) {											\
128 | 				if (dret) *dret = ks->buf[i];							\
129 | 				break;													\
130 | 			}															\
131 | 		}																\
132 | 		if (!gotany && ks_eof(ks)) return -1;							\
133 | 		if (str->s == 0) {												\
134 | 			str->m = 1;													\
135 | 			str->s = (char*)calloc(1, 1);								\
136 | 		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
137 | 		str->s[str->l] = '\0';											\
138 | 		return str->l;													\
139 | 	} \
140 | 	static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
141 | 	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
142 | 
/* KSTREAM_INIT(type_t, __read, __bufsize): instantiate the whole buffered
   stream layer for handle type `type_t` by expanding __KS_TYPE, __KS_BASIC
   (ks_init/ks_destroy), __KS_GETC (ks_getc) and __KS_GETUNTIL
   (ks_getuntil2/ks_getuntil). `__read` is invoked as
   __read(handle, buffer, __bufsize) and `__bufsize` is the size in bytes of
   the read buffer. */
#define KSTREAM_INIT(type_t, __read, __bufsize) \
	__KS_TYPE(type_t)							\
	__KS_BASIC(type_t, __bufsize)				\
	__KS_GETC(__read, __bufsize)				\
	__KS_GETUNTIL(__read, __bufsize)
148 | 
/* Reset a kseq_t's parser state: clears the buffer cursor, fill mark and EOF
   flag of the underlying stream, then the remembered header character.
   The expression evaluates to 0. */
#define kseq_rewind(ks) ((ks)->f->end = 0, (ks)->f->begin = 0, (ks)->f->is_eof = 0, (ks)->last_char = 0)
150 | 
/*
 * __KSEQ_BASIC(SCOPE, type_t): emits the constructor/destructor pair for
 * kseq_t with linkage `SCOPE`.
 *
 * kseq_init(fd)     allocates a zero-initialised kseq_t whose stream wraps
 *                   handle `fd`. Returns the new reader, or 0 if the reader
 *                   or its stream cannot be allocated.
 * kseq_destroy(ks)  frees the name/comment/seq/qual buffers, the stream and
 *                   the reader itself; a null `ks` is accepted. The wrapped
 *                   handle is not closed here.
 */
#define __KSEQ_BASIC(SCOPE, type_t)										\
	SCOPE kseq_t *kseq_init(type_t fd)									\
	{																	\
		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t));					\
		if (s == 0) return 0;											\
		s->f = ks_init(fd);												\
		if (s->f == 0) { /* never return a reader without a stream */	\
			free(s);													\
			return 0;													\
		}																\
		return s;														\
	}																	\
	SCOPE void kseq_destroy(kseq_t *ks)									\
	{																	\
		if (!ks) return;												\
		free(ks->name.s); free(ks->comment.s); free(ks->seq.s);	free(ks->qual.s); \
		ks_destroy(ks->f);												\
		free(ks);														\
	}
165 | 
/* __KSEQ_READ(SCOPE): emits kseq_read(seq), which parses the next FASTA or
   FASTQ record from seq->f into seq->name, seq->comment, seq->seq and
   seq->qual.

   Steps, in order:
     1. Unless a header character was already consumed by the previous call
        (seq->last_char != 0), skip input up to the next '>' or '@'.
     2. Read the name with ks_getuntil(ks, 0, ...) (delimiter code 0 is
        presumably KS_SEP_SPACE - confirm against the KS_SEP_* definitions);
        if the name was not ended by '\n', the rest of the header line is
        stored as the comment.
     3. Append sequence lines until a line starts with '>', '+' or '@', or
        the input ends. A '>' or '@' is remembered in seq->last_char for the
        next call.
     4. If the sequence was ended by '+', skip the rest of that line and read
        quality lines until the quality string is at least as long as the
        sequence.

   Return value:
   >=0  length of the sequence (normal)
   -1   end-of-file
   -2   truncated quality string
 */
#define __KSEQ_READ(SCOPE) \
	SCOPE int kseq_read(kseq_t *seq) \
	{ \
		int c; \
		kstream_t *ks = seq->f; \
		if (seq->last_char == 0) { /* then jump to the next header line */ \
			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
			if (c == -1) return -1; /* end of file */ \
			seq->last_char = c; \
		} /* else: the first header char has been read in the previous call */ \
		seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
		if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
		if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
			seq->seq.m = 256; \
			seq->seq.s = (char*)malloc(seq->seq.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
			if (c == '\n') continue; /* skip empty lines */ \
			seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
			ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
		} \
		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */	\
		if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
			seq->seq.m = seq->seq.l + 2; \
			kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
		} \
		seq->seq.s[seq->seq.l] = 0;	/* null terminated string */ \
		if (c != '+') return seq->seq.l; /* FASTA */ \
		if (seq->qual.m < seq->seq.m) {	/* allocate memory for qual in case insufficient */ \
			seq->qual.m = seq->seq.m; \
			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
		if (c == -1) return -2; /* error: no quality string */ \
		while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
		seq->last_char = 0;	/* we have not come to the next header line */ \
		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
		return seq->seq.l; \
	}
212 | 
/* __KSEQ_TYPE(type_t): defines kseq_t, the state of one FASTA/FASTQ reader.
     name, comment, seq, qual  buffers holding the most recently read record;
     last_char                 header character ('>' or '@') already consumed
                               by the previous kseq_read call, or 0 if none;
     f                         the buffered stream the records are read from.
   The `type_t` argument is not used in the expansion. */
#define __KSEQ_TYPE(type_t)						\
	typedef struct {							\
		kstring_t name, comment, seq, qual;		\
		int last_char;							\
		kstream_t *f;							\
	} kseq_t;
219 | 
/* KSEQ_INIT2(SCOPE, type_t, __read): instantiate the complete reader for
   handle type `type_t`: the stream layer with a 16384-byte read buffer, the
   kseq_t type, and kseq_init/kseq_destroy/kseq_read declared with linkage
   `SCOPE`. `__read` is the function used to refill the buffer. */
#define KSEQ_INIT2(SCOPE, type_t, __read)		\
	KSTREAM_INIT(type_t, __read, 16384)			\
	__KSEQ_TYPE(type_t)							\
	__KSEQ_BASIC(SCOPE, type_t)					\
	__KSEQ_READ(SCOPE)
225 | 
/* Convenience form of KSEQ_INIT2 that gives kseq_init, kseq_destroy and
   kseq_read static linkage. */
#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
227 | 
/* KSEQ_DECLARE(type_t): expand the stream and kseq_t type definitions and
   declare kseq_init, kseq_destroy and kseq_read without defining them.
   NOTE(review): presumably meant for translation units that link against
   definitions produced elsewhere by KSEQ_INIT2 with a non-static SCOPE -
   confirm against callers. */
#define KSEQ_DECLARE(type_t) \
	__KS_TYPE(type_t) \
	__KSEQ_TYPE(type_t) \
	extern kseq_t *kseq_init(type_t fd); \
	void kseq_destroy(kseq_t *ks); \
	int kseq_read(kseq_t *seq);
234 | 
235 | #endif
236 | 


--------------------------------------------------------------------------------
/src/pullseq.c:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <zlib.h>
#include <errno.h>
#include <limits.h>
#include <getopt.h>
  7 | 
  8 | #include "global.h"
  9 | #include "pullseq.h"
 10 | #include "pull_by_name.h"
 11 | #include "pull_by_size.h"
 12 | #include "pull_by_re.h"
 13 | 
 14 | int verbose_flag;
 15 | char const *progname;
 16 | 
 17 | void show_usage(int status) {
 18 | 	fprintf(stderr, "pullseq - a bioinformatics tool for manipulating fasta and fastq files\n");
 19 | 	fprintf(stderr, "\nVersion: %s              Name lookup method: %s", PULLSEQ_VERSION, PULLSEQ_SORTMETHOD);
 20 | 	fprintf(stderr, "\n(Written by bct - copyright 2012-2015)\n");
 21 | 	fprintf(stderr, "\nUsage:\n");
 22 | 	fprintf(stderr, " %s -i <input fasta/fastq file> -n <header names to select>\n\n", progname);
 23 | 	fprintf(stderr, " %s -i <input fasta/fastq file> -m <minimum sequence length>\n\n", progname);
 24 | 	fprintf(stderr, " %s -i <input fasta/fastq file> -g <regex name to match>\n\n", progname);
 25 | 	fprintf(stderr, " %s -i <input fasta/fastq file> -m <minimum sequence length> -a <max sequence length>\n\n", progname);
 26 | 	fprintf(stderr, " %s -i <input fasta/fastq file> -t\n\n", progname);
 27 | 	fprintf(stderr, " cat <names to select from STDIN> | %s -i <input fasta/fastq file> -N\n\n", progname);
 28 | 
 29 | 	fprintf(stderr, "  Options:\n");
 30 | 	fprintf(stderr, "    -i, --input,       Input fasta/fastq file (required)\n");
 31 | 	fprintf(stderr, "    -n, --names,       File of header id names to search for\n");
 32 | 	fprintf(stderr, "    -N, --names_stdin, Use STDIN for header id names\n");
 33 | 	fprintf(stderr, "    -g, --regex,       Regular expression to match (PERL compatible; always case-insensitive)\n");
 34 | 	fprintf(stderr, "    -m, --min,         Minimum sequence length\n");
 35 | 	fprintf(stderr, "    -a, --max,         Maximum sequence length\n");
 36 | 	fprintf(stderr, "    -l, --length,      Sequence characters per line (default 50)\n");
 37 | 	fprintf(stderr, "    -c, --convert,     Convert input to fastq/fasta (e.g. if input is fastq, output will be fasta)\n");
 38 | 	fprintf(stderr, "    -q, --quality,     ASCII code to use for fasta->fastq quality conversions\n");
 39 | 	fprintf(stderr, "    -e, --excluded,    Exclude the header id names in the list (-n)\n");
 40 | 	fprintf(stderr, "    -t, --count,       Just count the possible output, but don't write it\n");
 41 | 	fprintf(stderr, "    -h, --help,        Display this help and exit\n");
 42 | 	fprintf(stderr, "    -v, --verbose,     Print extra details during the run\n");
 43 | 	fprintf(stderr, "    --version,         Output version information and exit\n\n");
 44 | 
 45 | 	exit(status);
 46 | }
 47 | 
 48 | int main(int argc, char *argv[]) {
 49 | 	int c;
 50 | 	char *in = NULL,*names = NULL;
 51 | 	FILE *names_fp = NULL;
 52 | 	int min = -1, max = -1;
 53 | 	int names_from_stdin = 0;
 54 | 	int exclude = 0;
 55 | 	int count = 0;
 56 | 	int just_count = 0; /* flag for just counting the output */
 57 | 	int convert = 0;
 58 | 	int length = 50;
 59 | 	long value;
 60 | 	char *end;
 61 | 	char *aStrRegex = NULL;
 62 | 
 63 | 	extern char *optarg; /* external from getopt */
 64 | 
 65 | 	verbose_flag = 0; /* assume not verbose */
 66 | 
 67 | 	progname = argv[0];
 68 | 	if (argc < 4) { /* progname + at least 3 other args */
 69 | 		show_usage(EXIT_FAILURE);
 70 | 	}
 71 | 
 72 | 	while(1) {
 73 | 		static struct option long_options[] =
 74 | 		{
 75 | 			{"verbose",     no_argument,       0, 'v'},
 76 | 			{"convert",     no_argument,       0, 'c'},
 77 | 			{"exclude",     no_argument,       0, 'e'},
 78 | 			{"count",       no_argument,       0, 't'},
 79 | 			{"version",     no_argument,       0, 'V'},
 80 | 			{"help",        no_argument,       0, 'h'},
 81 | 			{"input",       required_argument, 0, 'i'},
 82 | 			{"regex",       required_argument, 0, 'g'},
 83 | 			{"names",       required_argument, 0, 'n'},
 84 | 			{"names_stdin", no_argument,       0, 'N'},
 85 | 			{"min",         required_argument, 0, 'm'},
 86 | 			{"max",         required_argument, 0, 'a'},
 87 | 			{"length",      required_argument, 0, 'l'},
 88 | 			{"quality",     required_argument, 0, 'q'},
 89 | 			{0, 0, 0, 0}
 90 | 		};
 91 | 
 92 | 		/* getopt_long stores the option index here. */
 93 | 		int option_index = 0;
 94 | 
 95 | 		c = getopt_long (argc, argv, "Vvh?cetq:i:g:Nn:m:a:l:", long_options, &option_index);
 96 | 
 97 | 		/* Detect the end of the options. */
 98 | 		if (c == -1)
 99 | 			break;
100 | 
101 | 		switch (c) {
102 | 			case 'v':
103 | 				verbose_flag = 1;
104 | 				break;
105 | 
106 | 			case 'i':
107 | 				in = (char*) malloc(strlen(optarg)+1);
108 | 				strcpy(in,optarg);
109 | 				break;
110 | 
111 | 			case 'g':
112 | 				aStrRegex = (char*) malloc(strlen(optarg)+1);
113 | 				strcpy(aStrRegex, optarg);
114 | 				break;
115 | 
116 | 			case 'n':
117 | 				names = (char*) malloc(strlen(optarg)+1);
118 | 				strcpy(names, optarg);
119 | 				break;
120 | 
121 | 			case 'N':
122 | 				names_from_stdin = 1;
123 | 				break;
124 | 
125 | 			case 'm':
126 | 				value = strtol(optarg, &end, 0);
127 | 				if (*end == '\0' && errno == 0) {
128 | 					min = atoi(optarg);
129 | 				} else {
130 | 					fprintf(stderr, "Maximum value (-m) argument '%s' is not an integer\n", optarg);
131 | 					return EXIT_FAILURE;
132 | 				}
133 | 				break;
134 | 
135 | 			case 'a':
136 | 				value = strtol(optarg, &end, 0);
137 | 				if (*end == '\0' && errno == 0) {
138 | 					max = atoi(optarg);
139 | 				} else {
140 | 					fprintf(stderr, "Maximum value (-a) argument '%s' is not an integer\n", optarg);
141 | 					return EXIT_FAILURE;
142 | 				}
143 | 				break;
144 | 
145 | 			case 'c':
146 | 				convert = 1;
147 | 				QUALITY_SCORE = 61;
148 | 				break;
149 | 
150 | 			case 't':
151 | 				just_count = 1;
152 | 				break;
153 | 
154 | 			case 'q':
155 | 				value = strtol(optarg, &end, 0);
156 | 				if (*end == 0 && errno == 0) {
157 | 					QUALITY_SCORE = atoi(optarg);
158 | 				} else {
159 | 					fprintf(stderr, "Quality ASCII value (-q) is invalid - must be an ASCII code (e.g. 73, which is 'I')\n");
160 | 					return EXIT_FAILURE;
161 | 				}
162 | 				break;
163 | 
164 | 			case 'e':
165 | 				exclude = 1;
166 | 				break;
167 | 
168 | 			case 'l':
169 | 				value = strtol(optarg, &end,0);
170 | 				if (*end == '\0' && errno == 0) {
171 | 					length = atoi(optarg);
172 | 				} else {
173 | 					fprintf(stderr, "Sequence length value (-l) argument '%s' is not an integer\n", optarg);
174 | 					return EXIT_FAILURE;
175 | 				}
176 | 				break;
177 | 
178 | 			case 'V':
179 | 				/* version */
180 | 				printf("Version is %s\n", PULLSEQ_VERSION);
181 | 				break;
182 | 
183 | 			case 'h':
184 | 				show_usage(EXIT_FAILURE);
185 | 				break;
186 | 
187 | 			case '?':
188 | 				/* getopt_long already printed an error message. */
189 | 				break;
190 | 
191 | 			default:
192 | 				abort ();
193 | 		}
194 | 	}
195 | 
196 | 	/* Instead of reporting '--verbose'
197 | 	   and '--brief' as they are encountered,
198 | 	   we report the final status resulting from them. */
199 | 	if (verbose_flag) {
200 | 		fprintf(stderr, "verbose flag is set\n");
201 | 		fprintf(stderr,"Input is %s\n", in);
202 | 		if (convert)
203 | 			fprintf(stderr,"Input will be converted between FASTQ and FASTA\n");
204 | 		if (names_from_stdin) {
205 | 			if (exclude)
206 | 				fprintf(stderr,"Names in STDIN will be excluded\n");
207 | 			else
208 | 				fprintf(stderr,"Names in STDIN will be included\n");
209 | 
210 | 		} else if (names != NULL) {
211 | 			if (exclude) {
212 | 				fprintf(stderr,"Names in %s will be excluded\n", names);
213 |             }
214 | 			else {
215 | 				fprintf(stderr,"Names in %s will be included\n", names);
216 |             }
217 | 		}
218 | 		if (aStrRegex)
219 | 			if (exclude) {
220 | 				fprintf(stderr,"Only sequences not matching %s will be output\n", aStrRegex);
221 |             }
222 | 			else {
223 | 				fprintf(stderr,"Only sequences matching %s will be output\n", aStrRegex);
224 |             }
225 | 		if (max > 0)
226 | 			fprintf(stderr,"Only sequences less than %i will be output\n", max);
227 | 		if (min > 0)
228 | 			fprintf(stderr,"Only sequences greater than %i will be output\n", min);
229 | 		if (length > 0)
230 | 			fprintf(stderr,"Output will be %i columns long\n", length);
231 | 		if (just_count > 0)
232 | 			fprintf(stderr,"Output will be counted only\n");
233 | 	}
234 | 
235 | 	/* check validity of given argument set */
236 | 	if (!in) {
237 | 		fprintf (stderr, "Error: Input file is required.\n");
238 | 		return EXIT_FAILURE;
239 | 	}
240 | 
241 | 	if (names) {
242 | 		if (!strcmp(in, names)) {
243 | 			fprintf (stderr, "Error: Input file is same as names file.\n");
244 | 			return EXIT_FAILURE;
245 | 		}
246 | 	}
247 | 
248 | 	if (names && names_from_stdin) {
249 | 		fprintf (stderr, "Error: Cannot use names from STDIN *and* names from a file.\n");
250 | 		return EXIT_FAILURE;
251 | 	}
252 | 
253 | 	if (aStrRegex) {
254 | 		if (names || names_from_stdin) {
255 | 			fprintf (stderr, "Error: You can't use a names file or names from STDIN and a regex match.\n");
256 | 			return EXIT_FAILURE;
257 | 		}
258 | 	}
259 | 
260 | 	if (min > 0 && max > 0) {
261 | 		if (max <= min) {
262 | 			fprintf (stderr, "Error: Max is less than or equal to min.\n");
263 | 			return EXIT_FAILURE;
264 | 		}
265 | 	}
266 | 
267 | 	if (names || names_from_stdin) {
268 | 		if (names) {
269 | 			names_fp = fopen(names,"r");
270 | 			if (!names_fp) {
271 | 				fprintf(stderr,"%s - failed to open names file %s\n",progname, names);
272 | 				exit(EXIT_FAILURE);
273 | 			}
274 | 		} else {
275 | 			names_fp = stdin;
276 | 		}
277 | 		count = pull_by_name(in, names_fp, min, max, length, exclude, convert, just_count);
278 | 	} else if (aStrRegex) {
279 | 		count = pull_by_re(in, aStrRegex, min, max, length, exclude, convert, just_count);
280 | 	} else {
281 | 		count = pull_by_size(in, min, max, length, convert, just_count);
282 | 	}
283 | 
284 | 	/* free up memory */
285 | 	free(in);
286 | 
287 | 	if (names)
288 | 		free(names);
289 | 
290 | 	if (names_fp)
291 | 		fclose(names_fp);
292 | 
293 | 	if (aStrRegex)
294 | 		free(aStrRegex);
295 | 
296 | 	if (verbose_flag)
297 | 		fprintf(stderr,"Pulled %i entries\n",count);
298 | 
299 | 	/* close streams */
300 | 	fclose(stderr);
301 | 	fclose(stdout);
302 | 	fclose(stdin);
303 | 	return EXIT_SUCCESS;
304 | }
305 | 


--------------------------------------------------------------------------------
/INSTALL:
--------------------------------------------------------------------------------
  1 | Installation Instructions
  2 | *************************
  3 | 
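  4 | NOTE: This package was converted to build with CMake (see the NEWS entry for 03/23/2023); the generic Autoconf instructions below predate that change.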
  5 | 
  6 | Basic Installation
  7 | ==================
  8 | 
  9 |    Briefly, the shell commands `./configure; make; make install' should
 10 | configure, build, and install this package.  The following
 11 | more-detailed instructions are generic; see the `README' file for
 12 | instructions specific to this package.  Some packages provide this
 13 | `INSTALL' file but do not implement all of the features documented
 14 | below.  The lack of an optional feature in a given package is not
 15 | necessarily a bug.  More recommendations for GNU packages can be found
 16 | in *note Makefile Conventions: (standards)Makefile Conventions.
 17 | 
 18 |    The `configure' shell script attempts to guess correct values for
 19 | various system-dependent variables used during compilation.  It uses
 20 | those values to create a `Makefile' in each directory of the package.
 21 | It may also create one or more `.h' files containing system-dependent
 22 | definitions.  Finally, it creates a shell script `config.status' that
 23 | you can run in the future to recreate the current configuration, and a
 24 | file `config.log' containing compiler output (useful mainly for
 25 | debugging `configure').
 26 | 
 27 |    It can also use an optional file (typically called `config.cache'
 28 | and enabled with `--cache-file=config.cache' or simply `-C') that saves
 29 | the results of its tests to speed up reconfiguring.  Caching is
 30 | disabled by default to prevent problems with accidental use of stale
 31 | cache files.
 32 | 
 33 |    If you need to do unusual things to compile the package, please try
 34 | to figure out how `configure' could check whether to do them, and mail
 35 | diffs or instructions to the address given in the `README' so they can
 36 | be considered for the next release.  If you are using the cache, and at
 37 | some point `config.cache' contains results you don't want to keep, you
 38 | may remove or edit it.
 39 | 
 40 |    The file `configure.ac' (or `configure.in') is used to create
 41 | `configure' by a program called `autoconf'.  You need `configure.ac' if
 42 | you want to change it or regenerate `configure' using a newer version
 43 | of `autoconf'.
 44 | 
 45 |    The simplest way to compile this package is:
 46 | 
 47 |   1. `cd' to the directory containing the package's source code and type
 48 |      `./configure' to configure the package for your system.
 49 | 
 50 |      Running `configure' might take a while.  While running, it prints
 51 |      some messages telling which features it is checking for.
 52 | 
 53 |   2. Type `make' to compile the package.
 54 | 
 55 |   3. Optionally, type `make check' to run any self-tests that come with
 56 |      the package, generally using the just-built uninstalled binaries.
 57 | 
 58 |   4. Type `make install' to install the programs and any data files and
 59 |      documentation.  When installing into a prefix owned by root, it is
 60 |      recommended that the package be configured and built as a regular
 61 |      user, and only the `make install' phase executed with root
 62 |      privileges.
 63 | 
 64 |   5. Optionally, type `make installcheck' to repeat any self-tests, but
 65 |      this time using the binaries in their final installed location.
 66 |      This target does not install anything.  Running this target as a
 67 |      regular user, particularly if the prior `make install' required
 68 |      root privileges, verifies that the installation completed
 69 |      correctly.
 70 | 
 71 |   6. You can remove the program binaries and object files from the
 72 |      source code directory by typing `make clean'.  To also remove the
 73 |      files that `configure' created (so you can compile the package for
 74 |      a different kind of computer), type `make distclean'.  There is
 75 |      also a `make maintainer-clean' target, but that is intended mainly
 76 |      for the package's developers.  If you use it, you may have to get
 77 |      all sorts of other programs in order to regenerate files that came
 78 |      with the distribution.
 79 | 
 80 |   7. Often, you can also type `make uninstall' to remove the installed
 81 |      files again.  In practice, not all packages have tested that
 82 |      uninstallation works correctly, even though it is required by the
 83 |      GNU Coding Standards.
 84 | 
 85 |   8. Some packages, particularly those that use Automake, provide `make
 86 |      distcheck', which can be used by developers to test that all other
 87 |      targets like `make install' and `make uninstall' work correctly.
 88 |      This target is generally not run by end users.
 89 | 
 90 | Compilers and Options
 91 | =====================
 92 | 
 93 |    Some systems require unusual options for compilation or linking that
 94 | the `configure' script does not know about.  Run `./configure --help'
 95 | for details on some of the pertinent environment variables.
 96 | 
 97 |    You can give `configure' initial values for configuration parameters
 98 | by setting variables in the command line or in the environment.  Here
 99 | is an example:
100 | 
101 |      ./configure CC=c99 CFLAGS=-g LIBS=-lposix
102 | 
103 |    *Note Defining Variables::, for more details.
104 | 
105 | Compiling For Multiple Architectures
106 | ====================================
107 | 
108 |    You can compile the package for more than one kind of computer at the
109 | same time, by placing the object files for each architecture in their
110 | own directory.  To do this, you can use GNU `make'.  `cd' to the
111 | directory where you want the object files and executables to go and run
112 | the `configure' script.  `configure' automatically checks for the
113 | source code in the directory that `configure' is in and in `..'.  This
114 | is known as a "VPATH" build.
115 | 
116 |    With a non-GNU `make', it is safer to compile the package for one
117 | architecture at a time in the source code directory.  After you have
118 | installed the package for one architecture, use `make distclean' before
119 | reconfiguring for another architecture.
120 | 
121 |    On MacOS X 10.5 and later systems, you can create libraries and
122 | executables that work on multiple system types--known as "fat" or
123 | "universal" binaries--by specifying multiple `-arch' options to the
124 | compiler but only a single `-arch' option to the preprocessor.  Like
125 | this:
126 | 
127 |      ./configure CC="gcc -arch i386 -arch x86_64 -arch ppc -arch ppc64" \
128 |                  CXX="g++ -arch i386 -arch x86_64 -arch ppc -arch ppc64" \
129 |                  CPP="gcc -E" CXXCPP="g++ -E"
130 | 
131 |    This is not guaranteed to produce working output in all cases; you
132 | may have to build one architecture at a time and combine the results
133 | using the `lipo' tool if you have problems.
134 | 
135 | Installation Names
136 | ==================
137 | 
138 |    By default, `make install' installs the package's commands under
139 | `/usr/local/bin', include files under `/usr/local/include', etc.  You
140 | can specify an installation prefix other than `/usr/local' by giving
141 | `configure' the option `--prefix=PREFIX', where PREFIX must be an
142 | absolute file name.
143 | 
144 |    You can specify separate installation prefixes for
145 | architecture-specific files and architecture-independent files.  If you
146 | pass the option `--exec-prefix=PREFIX' to `configure', the package uses
147 | PREFIX as the prefix for installing programs and libraries.
148 | Documentation and other data files still use the regular prefix.
149 | 
150 |    In addition, if you use an unusual directory layout you can give
151 | options like `--bindir=DIR' to specify different values for particular
152 | kinds of files.  Run `configure --help' for a list of the directories
153 | you can set and what kinds of files go in them.  In general, the
154 | default for these options is expressed in terms of `${prefix}', so that
155 | specifying just `--prefix' will affect all of the other directory
156 | specifications that were not explicitly provided.
157 | 
158 |    The most portable way to affect installation locations is to pass the
159 | correct locations to `configure'; however, many packages provide one or
160 | both of the following shortcuts of passing variable assignments to the
161 | `make install' command line to change installation locations without
162 | having to reconfigure or recompile.
163 | 
164 |    The first method involves providing an override variable for each
165 | affected directory.  For example, `make install
166 | prefix=/alternate/directory' will choose an alternate location for all
167 | directory configuration variables that were expressed in terms of
168 | `${prefix}'.  Any directories that were specified during `configure',
169 | but not in terms of `${prefix}', must each be overridden at install
170 | time for the entire installation to be relocated.  The approach of
171 | makefile variable overrides for each directory variable is required by
172 | the GNU Coding Standards, and ideally causes no recompilation.
173 | However, some platforms have known limitations with the semantics of
174 | shared libraries that end up requiring recompilation when using this
175 | method, particularly noticeable in packages that use GNU Libtool.
176 | 
177 |    The second method involves providing the `DESTDIR' variable.  For
178 | example, `make install DESTDIR=/alternate/directory' will prepend
179 | `/alternate/directory' before all installation names.  The approach of
180 | `DESTDIR' overrides is not required by the GNU Coding Standards, and
181 | does not work on platforms that have drive letters.  On the other hand,
182 | it does better at avoiding recompilation issues, and works well even
183 | when some directory options were not specified in terms of `${prefix}'
184 | at `configure' time.
185 | 
186 | Optional Features
187 | =================
188 | 
189 |    If the package supports it, you can cause programs to be installed
190 | with an extra prefix or suffix on their names by giving `configure' the
191 | option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'.
192 | 
193 |    Some packages pay attention to `--enable-FEATURE' options to
194 | `configure', where FEATURE indicates an optional part of the package.
195 | They may also pay attention to `--with-PACKAGE' options, where PACKAGE
196 | is something like `gnu-as' or `x' (for the X Window System).  The
197 | `README' should mention any `--enable-' and `--with-' options that the
198 | package recognizes.
199 | 
200 |    For packages that use the X Window System, `configure' can usually
201 | find the X include and library files automatically, but if it doesn't,
202 | you can use the `configure' options `--x-includes=DIR' and
203 | `--x-libraries=DIR' to specify their locations.
204 | 
205 |    Some packages offer the ability to configure how verbose the
206 | execution of `make' will be.  For these packages, running `./configure
207 | --enable-silent-rules' sets the default to minimal output, which can be
208 | overridden with `make V=1'; while running `./configure
209 | --disable-silent-rules' sets the default to verbose, which can be
210 | overridden with `make V=0'.
211 | 
212 | Particular systems
213 | ==================
214 | 
215 |    On HP-UX, the default C compiler is not ANSI C compatible.  If GNU
216 | CC is not installed, it is recommended to use the following options in
217 | order to use an ANSI C compiler:
218 | 
219 |      ./configure CC="cc -Ae -D_XOPEN_SOURCE=500"
220 | 
221 | and if that doesn't work, install pre-built binaries of GCC for HP-UX.
222 | 
223 |    On OSF/1 a.k.a. Tru64, some versions of the default C compiler cannot
224 | parse its `<wchar.h>' header file.  The option `-nodtk' can be used as
225 | a workaround.  If GNU CC is not installed, it is therefore recommended
226 | to try
227 | 
228 |      ./configure CC="cc"
229 | 
230 | and if that doesn't work, try
231 | 
232 |      ./configure CC="cc -nodtk"
233 | 
234 |    On Solaris, don't put `/usr/ucb' early in your `PATH'.  This
235 | directory contains several dysfunctional programs; working variants of
236 | these programs are available in `/usr/bin'.  So, if you need `/usr/ucb'
237 | in your `PATH', put it _after_ `/usr/bin'.
238 | 
239 |    On Haiku, software installed for all users goes in `/boot/common',
240 | not `/usr/local'.  It is recommended to use the following options:
241 | 
242 |      ./configure --prefix=/boot/common
243 | 
244 | Specifying the System Type
245 | ==========================
246 | 
247 |    There may be some features `configure' cannot figure out
248 | automatically, but needs to determine by the type of machine the package
249 | will run on.  Usually, assuming the package is built to be run on the
250 | _same_ architectures, `configure' can figure that out, but if it prints
251 | a message saying it cannot guess the machine type, give it the
252 | `--build=TYPE' option.  TYPE can either be a short name for the system
253 | type, such as `sun4', or a canonical name which has the form:
254 | 
255 |      CPU-COMPANY-SYSTEM
256 | 
257 | where SYSTEM can have one of these forms:
258 | 
259 |      OS
260 |      KERNEL-OS
261 | 
262 |    See the file `config.sub' for the possible values of each field.  If
263 | `config.sub' isn't included in this package, then this package doesn't
264 | need to know the machine type.
265 | 
266 |    If you are _building_ compiler tools for cross-compiling, you should
267 | use the option `--target=TYPE' to select the type of system they will
268 | produce code for.
269 | 
270 |    If you want to _use_ a cross compiler, that generates code for a
271 | platform different from the build platform, you should specify the
272 | "host" platform (i.e., that on which the generated programs will
273 | eventually be run) with `--host=TYPE'.
274 | 
275 | Sharing Defaults
276 | ================
277 | 
278 |    If you want to set default values for `configure' scripts to share,
279 | you can create a site shell script called `config.site' that gives
280 | default values for variables like `CC', `cache_file', and `prefix'.
281 | `configure' looks for `PREFIX/share/config.site' if it exists, then
282 | `PREFIX/etc/config.site' if it exists.  Or, you can set the
283 | `CONFIG_SITE' environment variable to the location of the site script.
284 | A warning: not all `configure' scripts look for a site script.
285 | 
286 | Defining Variables
287 | ==================
288 | 
289 |    Variables not defined in a site shell script can be set in the
290 | environment passed to `configure'.  However, some packages may run
291 | configure again during the build, and the customized values of these
292 | variables may be lost.  In order to avoid this problem, you should set
293 | them in the `configure' command line, using `VAR=value'.  For example:
294 | 
295 |      ./configure CC=/usr/local2/bin/gcc
296 | 
297 | causes the specified `gcc' to be used as the C compiler (unless it is
298 | overridden in the site shell script).
299 | 
300 | Unfortunately, this technique does not work for `CONFIG_SHELL' due to
301 | an Autoconf bug.  Until the bug is fixed you can use this workaround:
302 | 
303 |      CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash
304 | 
305 | `configure' Invocation
306 | ======================
307 | 
308 |    `configure' recognizes the following options to control how it
309 | operates.
310 | 
311 | `--help'
312 | `-h'
313 |      Print a summary of all of the options to `configure', and exit.
314 | 
315 | `--help=short'
316 | `--help=recursive'
317 |      Print a summary of the options unique to this package's
318 |      `configure', and exit.  The `short' variant lists options used
319 |      only in the top level, while the `recursive' variant lists options
320 |      also present in any nested packages.
321 | 
322 | `--version'
323 | `-V'
324 |      Print the version of Autoconf used to generate the `configure'
325 |      script, and exit.
326 | 
327 | `--cache-file=FILE'
328 |      Enable the cache: use and save the results of the tests in FILE,
329 |      traditionally `config.cache'.  FILE defaults to `/dev/null' to
330 |      disable caching.
331 | 
332 | `--config-cache'
333 | `-C'
334 |      Alias for `--cache-file=config.cache'.
335 | 
336 | `--quiet'
337 | `--silent'
338 | `-q'
339 |      Do not print messages saying which checks are being made.  To
340 |      suppress all normal output, redirect it to `/dev/null' (any error
341 |      messages will still be shown).
342 | 
343 | `--srcdir=DIR'
344 |      Look for the package's source code in directory DIR.  Usually
345 |      `configure' can determine that directory automatically.
346 | 
347 | `--prefix=DIR'
348 |      Use DIR as the installation prefix.  *note Installation Names::
349 |      for more details, including other options available for fine-tuning
350 |      the installation locations.
351 | 
352 | `--no-create'
353 | `-n'
354 |      Run the configure checks, but stop before creating any output
355 |      files.
356 | 
357 | `configure' also accepts some other, not widely useful, options.  Run
358 | `configure --help' for more details.
359 | 
360 | 


--------------------------------------------------------------------------------
/src/uthash.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2003-2014, Troy D. Hanson     http://troydhanson.github.com/uthash/
  3 | All rights reserved.
  4 | 
  5 | Redistribution and use in source and binary forms, with or without
  6 | modification, are permitted provided that the following conditions are met:
  7 | 
  8 |     * Redistributions of source code must retain the above copyright
  9 |       notice, this list of conditions and the following disclaimer.
 10 | 
 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 12 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 13 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 14 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
 15 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 16 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 17 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 18 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 19 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 20 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 21 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 22 | */
 23 | 
 24 | #ifndef UTHASH_H
 25 | #define UTHASH_H
 26 | 
 27 | #include <string.h>   /* memcmp,strlen */
 28 | #include <stddef.h>   /* ptrdiff_t */
 29 | #include <stdlib.h>   /* exit() */
 30 | 
 31 | /* These macros use decltype or the earlier __typeof GNU extension.
 32 |    As decltype is only available in newer compilers (VS2010 or gcc 4.3+
 33 |    when compiling c++ source) this code uses whatever method is needed
 34 |    or, for VS2008 where neither is available, uses casting workarounds. */
 35 | #if defined(_MSC_VER)   /* MS compiler */
 36 | #if _MSC_VER >= 1600 && defined(__cplusplus)  /* VS2010 or newer in C++ mode */
 37 | #define DECLTYPE(x) (decltype(x))  /* expands to a parenthesized cast to the type of x */
 38 | #else                   /* VS2008 or older (or VS2010 in C mode) */
 39 | #define NO_DECLTYPE  /* flag tested below to select the casting workaround in DECLTYPE_ASSIGN */
 40 | #define DECLTYPE(x)  /* expands to nothing, so no cast is emitted */
 41 | #endif
 42 | #elif defined(__BORLANDC__) || defined(__LCC__) || defined(__WATCOMC__)  /* treated the same as old MSVC */
 43 | #define NO_DECLTYPE
 44 | #define DECLTYPE(x)
 45 | #else                   /* GNU, Sun and other compilers */
 46 | #define DECLTYPE(x) (__typeof(x))  /* expands to a parenthesized cast to the type of x */
 47 | #endif
 48 | 
 49 | #ifdef NO_DECLTYPE  /* no way to name the type of dst: store through a char** alias instead */
 50 | #define DECLTYPE_ASSIGN(dst,src)   /* dst = src, where dst is a pointer lvalue */ \
 51 | do {                                                                             \
 52 |   char **_da_dst = (char**)(&(dst));   /* view the storage of dst as a char* */   \
 53 |   *_da_dst = (char*)(src);             /* overwrite it with src */                \
 54 | } while(0)
 55 | #else  /* typed variant: cast src to the type of dst, then assign */
 56 | #define DECLTYPE_ASSIGN(dst,src)                                                 \
 57 | do {                                                                             \
 58 |   (dst) = DECLTYPE(dst)(src);  /* DECLTYPE(dst) supplies the "(type)" cast prefix */ \
 59 | } while(0)
 60 | #endif
 61 | 
 62 | /* several of the hash functions use uint32_t, which isn't defined before VS2010 */
 63 | #if defined (_WIN32)
 64 | #if defined(_MSC_VER) && _MSC_VER >= 1600  /* VS2010 or newer ships <stdint.h> */
 65 | #include <stdint.h>
 66 | #elif defined(__WATCOMC__)
 67 | #include <stdint.h>
 68 | #else  /* any other Windows compiler: declare the two needed types by hand */
 69 | typedef unsigned int uint32_t;
 70 | typedef unsigned char uint8_t;
 71 | #endif
 72 | #else  /* non-Windows: use the standard header */
 73 | #include <stdint.h>
 74 | #endif
 75 | 
 76 | #define UTHASH_VERSION 1.9.9  /* bare preprocessing token, not a string literal */
 77 | 
 78 | #ifndef uthash_fatal  /* each hook below is only defined here if not already defined before this point */
 79 | #define uthash_fatal(msg) exit(-1)        /* fatal error (out of memory,etc); this default discards msg */
 80 | #endif
 81 | #ifndef uthash_malloc
 82 | #define uthash_malloc(sz) malloc(sz)      /* malloc fcn                      */
 83 | #endif
 84 | #ifndef uthash_free
 85 | #define uthash_free(ptr,sz) free(ptr)     /* free fcn; this default ignores sz */
 86 | #endif
 87 | 
 88 | #ifndef uthash_noexpand_fyi
 89 | #define uthash_noexpand_fyi(tbl)          /* can be defined to log noexpand; default expands to nothing */
 90 | #endif
 91 | #ifndef uthash_expand_fyi
 92 | #define uthash_expand_fyi(tbl)            /* can be defined to log expands; default expands to nothing */
 93 | #endif
 94 | 
 95 | /* table sizing parameters */
 96 | #define HASH_INITIAL_NUM_BUCKETS 32      /* initial number of buckets; bucket selection masks with (count-1), so this is a power of two */
 97 | #define HASH_INITIAL_NUM_BUCKETS_LOG2 5  /* lg2 of initial number of buckets (2^5 = 32) */
 98 | #define HASH_BKT_CAPACITY_THRESH 10      /* expand when bucket count reaches */
 99 | 
100 | /* calculate the element whose hash handle address is hhp, by stepping back (tbl)->hho bytes from the handle */
101 | #define ELMT_FROM_HH(tbl,hhp) ((void*)(((char*)(hhp)) - ((tbl)->hho)))
102 | 
103 | #define HASH_FIND(hh,head,keyptr,keylen,out)  /* look up the keylen bytes at keyptr; out is NULL unless the bucket search sets it */ \
104 | do {                                                                             \
105 |   unsigned _hf_bkt,_hf_hashv;                                                    \
106 |   out=NULL;   /* reset first, so an empty table (head == NULL) yields NULL */     \
107 |   if (head) {                                                                    \
108 |      HASH_FCN(keyptr,keylen, (head)->hh.tbl->num_buckets, _hf_hashv, _hf_bkt);   \
109 |      if (HASH_BLOOM_TEST((head)->hh.tbl, _hf_hashv)) {  /* constant 1 when HASH_BLOOM is not defined */ \
110 |        HASH_FIND_IN_BKT((head)->hh.tbl, hh, (head)->hh.tbl->buckets[ _hf_bkt ],  \
111 |                         keyptr,keylen,out);                                      \
112 |      }                                                                           \
113 |   }                                                                              \
114 | } while (0)
115 | 
116 | #ifdef HASH_BLOOM  /* optional bloom filter; HASH_BLOOM is the log2 of its size in bits */
117 | #define HASH_BLOOM_BITLEN (1ULL << HASH_BLOOM)
118 | #define HASH_BLOOM_BYTELEN ((HASH_BLOOM_BITLEN/8) + ((HASH_BLOOM_BITLEN%8) ? 1:0))  /* bits rounded up to whole bytes; outer parens keep the sum intact inside larger expressions */
119 | #define HASH_BLOOM_MAKE(tbl)   /* allocate and zero the filter bit vector of tbl */ \
120 | do {                                                                             \
121 |   (tbl)->bloom_nbits = HASH_BLOOM;                                               \
122 |   (tbl)->bloom_bv = (uint8_t*)uthash_malloc(HASH_BLOOM_BYTELEN);                 \
123 |   if (!((tbl)->bloom_bv))  { uthash_fatal( "out of memory"); }                   \
124 |   memset((tbl)->bloom_bv, 0, HASH_BLOOM_BYTELEN);                                \
125 |   (tbl)->bloom_sig = HASH_BLOOM_SIGNATURE;                                       \
126 | } while (0)
127 | 
128 | #define HASH_BLOOM_FREE(tbl)   /* release the bit vector allocated by HASH_BLOOM_MAKE */ \
129 | do {                                                                             \
130 |   uthash_free((tbl)->bloom_bv, HASH_BLOOM_BYTELEN);                              \
131 | } while (0)
132 | 
133 | #define HASH_BLOOM_BITSET(bv,idx) ((bv)[(idx)/8] |= (1U << ((idx)%8)))  /* set bit idx of byte array bv */
134 | #define HASH_BLOOM_BITTEST(bv,idx) ((bv)[(idx)/8] & (1U << ((idx)%8)))  /* nonzero if bit idx of bv is set */
135 | 
136 | #define HASH_BLOOM_ADD(tbl,hashv)   /* record hashv: set the bit selected by its low bloom_nbits bits */ \
137 |   HASH_BLOOM_BITSET((tbl)->bloom_bv, ((hashv) & (uint32_t)((1ULL << (tbl)->bloom_nbits) - 1)))
138 | 
139 | #define HASH_BLOOM_TEST(tbl,hashv)  /* zero means no item with this hashv was added */ \
140 |   HASH_BLOOM_BITTEST((tbl)->bloom_bv, ((hashv) & (uint32_t)((1ULL << (tbl)->bloom_nbits) - 1)))
141 | 
142 | #else  /* bloom filter disabled: the hooks expand to nothing and the test always passes */
143 | #define HASH_BLOOM_MAKE(tbl)
144 | #define HASH_BLOOM_FREE(tbl)
145 | #define HASH_BLOOM_ADD(tbl,hashv)
146 | #define HASH_BLOOM_TEST(tbl,hashv) (1)
147 | #define HASH_BLOOM_BYTELEN 0
148 | #endif
149 | 
150 | #define HASH_MAKE_TABLE(hh,head)   /* allocate and zero the table and initial bucket array for head, the first item */ \
151 | do {                                                                             \
152 |   (head)->hh.tbl = (UT_hash_table*)uthash_malloc(                                \
153 |                   sizeof(UT_hash_table));                                        \
154 |   if (!((head)->hh.tbl))  { uthash_fatal( "out of memory"); }                    \
155 |   memset((head)->hh.tbl, 0, sizeof(UT_hash_table));                              \
156 |   (head)->hh.tbl->tail = &((head)->hh);   /* the only item is also the tail */    \
157 |   (head)->hh.tbl->num_buckets = HASH_INITIAL_NUM_BUCKETS;                        \
158 |   (head)->hh.tbl->log2_num_buckets = HASH_INITIAL_NUM_BUCKETS_LOG2;              \
159 |   (head)->hh.tbl->hho = (char*)(&(head)->hh) - (char*)(head);  /* byte offset of the handle inside the element */ \
160 |   (head)->hh.tbl->buckets = (UT_hash_bucket*)uthash_malloc(                      \
161 |           HASH_INITIAL_NUM_BUCKETS*sizeof(struct UT_hash_bucket));               \
162 |   if (! (head)->hh.tbl->buckets) { uthash_fatal( "out of memory"); }             \
163 |   memset((head)->hh.tbl->buckets, 0,                                             \
164 |           HASH_INITIAL_NUM_BUCKETS*sizeof(struct UT_hash_bucket));               \
165 |   HASH_BLOOM_MAKE((head)->hh.tbl);   /* expands to nothing unless HASH_BLOOM is defined */ \
166 |   (head)->hh.tbl->signature = HASH_SIGNATURE;                                    \
167 | } while(0)
168 | 
169 | #define HASH_ADD(hh,head,fieldname,keylen_in,add)   /* add item whose key is stored in its own member fieldname */ \
170 |         HASH_ADD_KEYPTR(hh,head,&((add)->fieldname),keylen_in,add)
171 | 
172 | #define HASH_REPLACE(hh,head,fieldname,keylen_in,add,replaced)  /* add item; any existing item with the same key is removed first and returned in replaced */ \
173 | do {                                                                             \
174 |   replaced=NULL;                                                                 \
175 |   HASH_FIND(hh,head,&((add)->fieldname),keylen_in,replaced);                     \
176 |   if (replaced!=NULL) {                                                          \
177 |      HASH_DELETE(hh,head,replaced);   /* unlinks the old item; nothing here frees it */ \
178 |   };                                                                             \
179 |   HASH_ADD(hh,head,fieldname,keylen_in,add);                                     \
180 | } while(0)
181 | 
182 | #define HASH_ADD_KEYPTR(hh,head,keyptr,keylen_in,add)  /* append add to the app-order list and to its bucket; no lookup for an existing equal key is done here */ \
183 | do {                                                                             \
184 |  unsigned _ha_bkt;                                                               \
185 |  (add)->hh.next = NULL;                                                          \
186 |  (add)->hh.key = (char*)(keyptr);   /* the key pointer is stored; the key bytes are not copied */ \
187 |  (add)->hh.keylen = (unsigned)(keylen_in);                                       \
188 |  if (!(head)) {   /* first item: it becomes the head and gets a new table */      \
189 |     head = (add);                                                                \
190 |     (head)->hh.prev = NULL;                                                      \
191 |     HASH_MAKE_TABLE(hh,head);                                                    \
192 |  } else {         /* otherwise link it after the current tail */                  \
193 |     (head)->hh.tbl->tail->next = (add);                                          \
194 |     (add)->hh.prev = ELMT_FROM_HH((head)->hh.tbl, (head)->hh.tbl->tail);         \
195 |     (head)->hh.tbl->tail = &((add)->hh);                                         \
196 |  }                                                                               \
197 |  (head)->hh.tbl->num_items++;                                                    \
198 |  (add)->hh.tbl = (head)->hh.tbl;                                                 \
199 |  HASH_FCN(keyptr,keylen_in, (head)->hh.tbl->num_buckets,                         \
200 |          (add)->hh.hashv, _ha_bkt);   /* stores the hash in the item and yields the bucket index */ \
201 |  HASH_ADD_TO_BKT((head)->hh.tbl->buckets[_ha_bkt],&(add)->hh);                   \
202 |  HASH_BLOOM_ADD((head)->hh.tbl,(add)->hh.hashv);                                 \
203 |  HASH_EMIT_KEY(hh,head,keyptr,keylen_in);                                        \
204 |  HASH_FSCK(hh,head);                                                             \
205 | } while(0)
206 | 
207 | #define HASH_TO_BKT( hashv, num_bkts, bkt )   /* map a hash value to a bucket index by masking with num_bkts-1 */ \
208 | do {                                                                             \
209 |   bkt = ((hashv) & ((num_bkts) - 1));   /* equals hashv modulo num_bkts only when num_bkts is a power of two */ \
210 | } while(0)
211 | 
212 | /* delete "delptr" from the hash table.
213 |  * "the usual" patch-up process for the app-order doubly-linked-list.
214 |  * The use of _hd_hh_del below deserves special explanation.
215 |  * These used to be expressed using (delptr) but that led to a bug
216 |  * if someone used the same symbol for the head and deletee, like
217 |  *  HASH_DELETE(hh,users,users);
218 |  * We want that to work, but by changing the head (users) below
219 |  * we were forfeiting our ability to further refer to the deletee (users)
220 |  * in the patch-up process. Solution: use scratch space to
221 |  * copy the deletee pointer, then the latter references are via that
222 |  * scratch pointer rather than through the repointed (users) symbol.
223 |  */
224 | #define HASH_DELETE(hh,head,delptr)   /* unlink delptr from the table; the item itself is not freed here */ \
225 | do {                                                                             \
226 |     unsigned _hd_bkt;                                                            \
227 |     struct UT_hash_handle *_hd_hh_del;                                           \
228 |     if ( ((delptr)->hh.prev == NULL) && ((delptr)->hh.next == NULL) )  {  /* only item: tear the whole table down */ \
229 |         uthash_free((head)->hh.tbl->buckets,                                     \
230 |                     (head)->hh.tbl->num_buckets*sizeof(struct UT_hash_bucket) ); \
231 |         HASH_BLOOM_FREE((head)->hh.tbl);                                         \
232 |         uthash_free((head)->hh.tbl, sizeof(UT_hash_table));                      \
233 |         head = NULL;                                                             \
234 |     } else {                                                                     \
235 |         _hd_hh_del = &((delptr)->hh);   /* scratch copy, taken before head may be repointed below */ \
236 |         if ((delptr) == ELMT_FROM_HH((head)->hh.tbl,(head)->hh.tbl->tail)) {  /* deleting the tail: the previous item becomes the tail */ \
237 |             (head)->hh.tbl->tail =                                               \
238 |                 (UT_hash_handle*)((ptrdiff_t)((delptr)->hh.prev) +               \
239 |                 (head)->hh.tbl->hho);                                            \
240 |         }                                                                        \
241 |         if ((delptr)->hh.prev) {   /* has a predecessor: make it skip over the deletee */ \
242 |             ((UT_hash_handle*)((ptrdiff_t)((delptr)->hh.prev) +                  \
243 |                     (head)->hh.tbl->hho))->next = (delptr)->hh.next;             \
244 |         } else {                   /* deletee was first: head advances to the next item */ \
245 |             DECLTYPE_ASSIGN(head,(delptr)->hh.next);                             \
246 |         }                                                                        \
247 |         if (_hd_hh_del->next) {    /* has a successor: point its prev past the deletee */ \
248 |             ((UT_hash_handle*)((ptrdiff_t)_hd_hh_del->next +                     \
249 |                     (head)->hh.tbl->hho))->prev =                                \
250 |                     _hd_hh_del->prev;                                            \
251 |         }                                                                        \
252 |         HASH_TO_BKT( _hd_hh_del->hashv, (head)->hh.tbl->num_buckets, _hd_bkt);   \
253 |         HASH_DEL_IN_BKT(hh,(head)->hh.tbl->buckets[_hd_bkt], _hd_hh_del);        \
254 |         (head)->hh.tbl->num_items--;                                             \
255 |     }                                                                            \
256 |     HASH_FSCK(hh,head);                                                          \
257 | } while (0)
258 | 
259 | 
260 | /* convenience forms of HASH_FIND/HASH_ADD/HASH_DEL; all of them pass hh as the hash handle member name */
261 | #define HASH_FIND_STR(head,findstr,out)   /* key is the string findstr; strlen bytes are hashed, so the terminator is excluded */ \
262 |     HASH_FIND(hh,head,findstr,strlen(findstr),out)
263 | #define HASH_ADD_STR(head,strfield,add)   /* key is the string held in add->strfield */ \
264 |     HASH_ADD(hh,head,strfield[0],strlen(add->strfield),add)
265 | #define HASH_REPLACE_STR(head,strfield,add,replaced)                             \
266 |     HASH_REPLACE(hh,head,strfield[0],strlen(add->strfield),add,replaced)
267 | #define HASH_FIND_INT(head,findint,out)   /* findint is used as the key pointer, i.e. the address of an int */ \
268 |     HASH_FIND(hh,head,findint,sizeof(int),out)
269 | #define HASH_ADD_INT(head,intfield,add)   /* key is the int member intfield of add */ \
270 |     HASH_ADD(hh,head,intfield,sizeof(int),add)
271 | #define HASH_REPLACE_INT(head,intfield,add,replaced)                             \
272 |     HASH_REPLACE(hh,head,intfield,sizeof(int),add,replaced)
273 | #define HASH_FIND_PTR(head,findptr,out)   /* findptr is used as the key pointer; sizeof(void *) bytes are hashed */ \
274 |     HASH_FIND(hh,head,findptr,sizeof(void *),out)
275 | #define HASH_ADD_PTR(head,ptrfield,add)   /* key is the pointer value stored in member ptrfield of add */ \
276 |     HASH_ADD(hh,head,ptrfield,sizeof(void *),add)
277 | #define HASH_REPLACE_PTR(head,ptrfield,add,replaced)                             \
278 |     HASH_REPLACE(hh,head,ptrfield,sizeof(void *),add,replaced)
279 | #define HASH_DEL(head,delptr)                                                    \
280 |     HASH_DELETE(hh,head,delptr)
281 | 
282 | /* HASH_FSCK checks hash integrity on every add/delete when HASH_DEBUG is defined.
283 |  * This is for uthash developers only; it compiles away if HASH_DEBUG isn't defined.
284 |  * NOTE(review): HASH_OOPS calls fprintf, but the includes at the top of this header do not list <stdio.h> -- confirm the app includes it. */
285 | #ifdef HASH_DEBUG
286 | #define HASH_OOPS(...) do { fprintf(stderr,__VA_ARGS__); exit(-1); } while (0)  /* report the inconsistency and terminate */
287 | #define HASH_FSCK(hh,head)                                                       \
288 | do {                                                                             \
289 |     unsigned _bkt_i;                                                             \
290 |     unsigned _count, _bkt_count;                                                 \
291 |     char *_prev;                                                                 \
292 |     struct UT_hash_handle *_thh;                                                 \
293 |     if (head) {                                                                  \
294 |         _count = 0;                                                              \
295 |         for( _bkt_i = 0; _bkt_i < (head)->hh.tbl->num_buckets; _bkt_i++) {  /* pass 1: walk every bucket chain */ \
296 |             _bkt_count = 0;                                                      \
297 |             _thh = (head)->hh.tbl->buckets[_bkt_i].hh_head;                      \
298 |             _prev = NULL;                                                        \
299 |             while (_thh) {                                                       \
300 |                if (_prev != (char*)(_thh->hh_prev)) {  /* back link must name the handle just visited */ \
301 |                    HASH_OOPS("invalid hh_prev %p, actual %p\n",                  \
302 |                     _thh->hh_prev, _prev );                                      \
303 |                }                                                                 \
304 |                _bkt_count++;                                                     \
305 |                _prev = (char*)(_thh);                                            \
306 |                _thh = _thh->hh_next;                                             \
307 |             }                                                                    \
308 |             _count += _bkt_count;                                                \
309 |             if ((head)->hh.tbl->buckets[_bkt_i].count !=  _bkt_count) {  /* stored chain length must match the walk */ \
310 |                HASH_OOPS("invalid bucket count %d, actual %d\n",                 \
311 |                 (head)->hh.tbl->buckets[_bkt_i].count, _bkt_count);              \
312 |             }                                                                    \
313 |         }                                                                        \
314 |         if (_count != (head)->hh.tbl->num_items) {  /* sum over all buckets must equal the item count */ \
315 |             HASH_OOPS("invalid hh item count %d, actual %d\n",                   \
316 |                 (head)->hh.tbl->num_items, _count );                             \
317 |         }                                                                        \
318 |         /* traverse hh in app order; check next/prev integrity, count */         \
319 |         _count = 0;                                                              \
320 |         _prev = NULL;                                                            \
321 |         _thh =  &(head)->hh;                                                     \
322 |         while (_thh) {                                                           \
323 |            _count++;                                                             \
324 |            if (_prev !=(char*)(_thh->prev)) {                                    \
325 |               HASH_OOPS("invalid prev %p, actual %p\n",                          \
326 |                     _thh->prev, _prev );                                         \
327 |            }                                                                     \
328 |            _prev = (char*)ELMT_FROM_HH((head)->hh.tbl, _thh);  /* prev/next hold element addresses, not handle addresses */ \
329 |            _thh = ( _thh->next ?  (UT_hash_handle*)((char*)(_thh->next) +        \
330 |                                   (head)->hh.tbl->hho) : NULL );                 \
331 |         }                                                                        \
332 |         if (_count != (head)->hh.tbl->num_items) {                               \
333 |             HASH_OOPS("invalid app item count %d, actual %d\n",                  \
334 |                 (head)->hh.tbl->num_items, _count );                             \
335 |         }                                                                        \
336 |     }                                                                            \
337 | } while (0)
338 | #else
339 | #define HASH_FSCK(hh,head)
340 | #endif
341 | 
342 | /* When compiled with -DHASH_EMIT_KEYS=<fd>, each added key is written, length-prefixed,
343 |  * to that file descriptor, so the keys can be collected for tuning the hash function.
344 |  * The app can #include <unistd.h> to get the prototype for write(2). */
345 | #ifdef HASH_EMIT_KEYS
346 | #define HASH_EMIT_KEY(hh,head,keyptr,fieldlen)                                   \
347 | do {                                                                             \
348 |     unsigned _klen = fieldlen;                                                   \
349 |     write(HASH_EMIT_KEYS, &_klen, sizeof(_klen));   /* length prefix: sizeof(unsigned) raw bytes; return value not checked */ \
350 |     write(HASH_EMIT_KEYS, keyptr, fieldlen);        /* then the key bytes; return value not checked */ \
351 | } while (0)
352 | #else
353 | #define HASH_EMIT_KEY(hh,head,keyptr,fieldlen)
354 | #endif
355 | 
356 | /* default to the Jenkins hash (HASH_JEN) unless overridden, e.g. -DHASH_FUNCTION=HASH_SAX */
357 | #ifdef HASH_FUNCTION
358 | #define HASH_FCN HASH_FUNCTION
359 | #else
360 | #define HASH_FCN HASH_JEN
361 | #endif
362 | 
363 | /* The Bernstein hash function, used in Perl prior to v5.6. Note ((x<<5)+x) = x*33. */
364 | #define HASH_BER(key,keylen,num_bkts,hashv,bkt)   /* writes the hash to hashv and the bucket index to bkt */ \
365 | do {                                                                             \
366 |   unsigned _hb_keylen=keylen;                                                    \
367 |   char *_hb_key=(char*)(key);                                                    \
368 |   (hashv) = 0;                                                                   \
369 |   while (_hb_keylen--)  { (hashv) = (((hashv) << 5) + (hashv)) + *_hb_key++; }   \
370 |   bkt = (hashv) & (num_bkts-1);   /* NOTE(review): num_bkts is expanded unparenthesized */ \
371 | } while (0)
372 | 
373 | 
374 | /* SAX/FNV/OAT/JEN hash functions are macro variants of those listed at
375 |  * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx */
376 | #define HASH_SAX(key,keylen,num_bkts,hashv,bkt)   /* writes the hash to hashv and the bucket index to bkt */ \
377 | do {                                                                             \
378 |   unsigned _sx_i;                                                                \
379 |   char *_hs_key=(char*)(key);                                                    \
380 |   hashv = 0;                                                                     \
381 |   for(_sx_i=0; _sx_i < keylen; _sx_i++)   /* NOTE(review): keylen is expanded unparenthesized; the loop body is the single statement below */ \
382 |       hashv ^= (hashv << 5) + (hashv >> 2) + _hs_key[_sx_i];                     \
383 |   bkt = hashv & (num_bkts-1);                                                    \
384 | } while (0)
385 | /* FNV-1a variation: start from the offset basis 2166136261, then for every key byte XOR the byte in and multiply by the FNV prime 16777619; writes the hash to hashv and the bucket index to bkt */
386 | #define HASH_FNV(key,keylen,num_bkts,hashv,bkt)                                  \
387 | do {                                                                             \
388 |   unsigned _fn_i;                                                                \
389 |   const unsigned char *_hf_key=(const unsigned char*)(key);   /* unsigned: bytes >= 0x80 must not sign-extend into the XOR */ \
390 |   hashv = 2166136261U;                                                           \
391 |   for(_fn_i=0; _fn_i < (unsigned)(keylen); _fn_i++) {   /* braces: the multiply belongs inside the loop, once per byte */ \
392 |       hashv = (hashv ^ _hf_key[_fn_i]) * 16777619U;                              \
393 |   }                                                                              \
394 |   bkt = hashv & ((num_bkts)-1);                                                  \
395 | } while(0)
396 | 
397 | #define HASH_OAT(key,keylen,num_bkts,hashv,bkt)   /* writes the hash to hashv and the bucket index to bkt */ \
398 | do {                                                                             \
399 |   unsigned _ho_i;                                                                \
400 |   char *_ho_key=(char*)(key);                                                    \
401 |   hashv = 0;                                                                     \
402 |   for(_ho_i=0; _ho_i < keylen; _ho_i++) {   /* per byte: add it in, then two shift-mix steps */ \
403 |       hashv += _ho_key[_ho_i];                                                   \
404 |       hashv += (hashv << 10);                                                    \
405 |       hashv ^= (hashv >> 6);                                                     \
406 |   }                                                                              \
407 |   hashv += (hashv << 3);    /* three final mixing steps after all bytes are consumed */ \
408 |   hashv ^= (hashv >> 11);                                                        \
409 |   hashv += (hashv << 15);                                                        \
410 |   bkt = hashv & (num_bkts-1);                                                    \
411 | } while(0)
412 | 
413 | #define HASH_JEN_MIX(a,b,c)   /* mixes a, b and c in place; each argument is assigned to, so all three must be lvalues */ \
414 | do {                                                                             \
415 |   a -= b; a -= c; a ^= ( c >> 13 );                                              \
416 |   b -= c; b -= a; b ^= ( a << 8 );                                               \
417 |   c -= a; c -= b; c ^= ( b >> 13 );                                              \
418 |   a -= b; a -= c; a ^= ( c >> 12 );                                              \
419 |   b -= c; b -= a; b ^= ( a << 16 );                                              \
420 |   c -= a; c -= b; c ^= ( b >> 5 );                                               \
421 |   a -= b; a -= c; a ^= ( c >> 3 );                                               \
422 |   b -= c; b -= a; b ^= ( a << 10 );                                              \
423 |   c -= a; c -= b; c ^= ( b >> 15 );                                              \
424 | } while (0)
425 | 
426 | #define HASH_JEN(key,keylen,num_bkts,hashv,bkt)                                  \
427 | do { /* hash keylen bytes at key into hashv; bkt is hashv masked to a bucket */  \
428 |   unsigned _hj_i,_hj_j,_hj_k;                                                    \
429 |   unsigned char *_hj_key=(unsigned char*)(key);                                  \
430 |   hashv = 0xfeedbeef;                                                            \
431 |   _hj_i = _hj_j = 0x9e3779b9;                                                    \
432 |   _hj_k = (unsigned)(keylen); /* bytes not yet consumed */                       \
433 |   while (_hj_k >= 12) { /* full 12-byte rounds: three little-endian words */     \
434 |     _hj_i +=    (_hj_key[0] + ( (unsigned)_hj_key[1] << 8 )                      \
435 |         + ( (unsigned)_hj_key[2] << 16 )                                         \
436 |         + ( (unsigned)_hj_key[3] << 24 ) );                                      \
437 |     _hj_j +=    (_hj_key[4] + ( (unsigned)_hj_key[5] << 8 )                      \
438 |         + ( (unsigned)_hj_key[6] << 16 )                                         \
439 |         + ( (unsigned)_hj_key[7] << 24 ) );                                      \
440 |     hashv += (_hj_key[8] + ( (unsigned)_hj_key[9] << 8 )                         \
441 |         + ( (unsigned)_hj_key[10] << 16 )                                        \
442 |         + ( (unsigned)_hj_key[11] << 24 ) );                                     \
443 |                                                                                  \
444 |      HASH_JEN_MIX(_hj_i, _hj_j, hashv);                                          \
445 |                                                                                  \
446 |      _hj_key += 12;                                                              \
447 |      _hj_k -= 12;                                                                \
448 |   }                                                                              \
449 |   hashv += (unsigned)(keylen);                                                   \
450 |   switch ( _hj_k ) { /* 0..11 leftover bytes; each case falls through */         \
451 |      case 11: hashv += ( (unsigned)_hj_key[10] << 24 );                          \
452 |      case 10: hashv += ( (unsigned)_hj_key[9] << 16 );                           \
453 |      case 9:  hashv += ( (unsigned)_hj_key[8] << 8 );                            \
454 |      case 8:  _hj_j += ( (unsigned)_hj_key[7] << 24 );                           \
455 |      case 7:  _hj_j += ( (unsigned)_hj_key[6] << 16 );                           \
456 |      case 6:  _hj_j += ( (unsigned)_hj_key[5] << 8 );                            \
457 |      case 5:  _hj_j += _hj_key[4];                                               \
458 |      case 4:  _hj_i += ( (unsigned)_hj_key[3] << 24 );                           \
459 |      case 3:  _hj_i += ( (unsigned)_hj_key[2] << 16 );                           \
460 |      case 2:  _hj_i += ( (unsigned)_hj_key[1] << 8 );                            \
461 |      case 1:  _hj_i += _hj_key[0];                                               \
462 |   }                                                                              \
463 |   HASH_JEN_MIX(_hj_i, _hj_j, hashv);                                             \
464 |   bkt = hashv & ((num_bkts)-1); /* mask; presumes power-of-two num_bkts */       \
465 | } while(0)
466 | 
467 | /* The Paul Hsieh hash function (HASH_SFH); get16bits(d) loads two bytes at d as one 16-bit value */
468 | #undef get16bits /* drop any earlier definition before choosing one below */
469 | #if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__)             \
470 |   || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__)
471 | #define get16bits(d) (*((const uint16_t *) (d))) /* single 16-bit load through a cast pointer */
472 | #endif
473 | 
474 | #if !defined (get16bits) /* otherwise build the value from the two individual bytes */
475 | #define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)             \
476 |                        +(uint32_t)(((const uint8_t *)(d))[0]) ) /* d[0] + (d[1] << 8) */
477 | #endif
478 | #define HASH_SFH(key,keylen,num_bkts,hashv,bkt)                                  \
479 | do { /* hash keylen bytes at key into hashv; bkt is hashv masked to a bucket */  \
480 |   unsigned char *_sfh_key=(unsigned char*)(key);                                 \
481 |   uint32_t _sfh_tmp, _sfh_len = (keylen);                                        \
482 |                                                                                  \
483 |   int _sfh_rem = _sfh_len & 3; /* 0-3 bytes left after the 4-byte groups */      \
484 |   _sfh_len >>= 2; /* now the number of 4-byte groups */                          \
485 |   hashv = 0xcafebabe;                                                            \
486 |                                                                                  \
487 |   /* Main loop */                                                                \
488 |   for (;_sfh_len > 0; _sfh_len--) {                                              \
489 |     hashv    += get16bits (_sfh_key);                                            \
490 |     _sfh_tmp       = (uint32_t)(get16bits (_sfh_key+2)) << 11  ^ hashv;          \
491 |     hashv     = (hashv << 16) ^ _sfh_tmp;                                        \
492 |     _sfh_key += 2*sizeof (uint16_t);                                             \
493 |     hashv    += hashv >> 11;                                                     \
494 |   }                                                                              \
495 |                                                                                  \
496 |   /* Handle end cases */                                                         \
497 |   switch (_sfh_rem) {                                                            \
498 |     case 3: hashv += get16bits (_sfh_key);                                       \
499 |             hashv ^= hashv << 16;                                                \
500 |             hashv ^= (uint32_t)(_sfh_key[sizeof (uint16_t)] << 18);              \
501 |             hashv += hashv >> 11;                                                \
502 |             break;                                                               \
503 |     case 2: hashv += get16bits (_sfh_key);                                       \
504 |             hashv ^= hashv << 11;                                                \
505 |             hashv += hashv >> 17;                                                \
506 |             break;                                                               \
507 |     case 1: hashv += *_sfh_key;                                                  \
508 |             hashv ^= hashv << 10;                                                \
509 |             hashv += hashv >> 1;                                                 \
510 |   }                                                                              \
511 |                                                                                  \
512 |     /* Force "avalanching" of final 127 bits */                                  \
513 |     hashv ^= hashv << 3;                                                         \
514 |     hashv += hashv >> 5;                                                         \
515 |     hashv ^= hashv << 4;                                                         \
516 |     hashv += hashv >> 17;                                                        \
517 |     hashv ^= hashv << 25;                                                        \
518 |     hashv += hashv >> 6;                                                         \
519 |     bkt = hashv & ((num_bkts)-1); /* mask; presumes power-of-two num_bkts */     \
520 | } while(0)
521 | 
522 | #ifdef HASH_USING_NO_STRICT_ALIASING
523 | /* MurmurHash exploits the tolerance of some CPUs (x86, x86_64) for unaligned reads.
524 |  * For other types of CPUs (e.g. Sparc) an unaligned read causes a bus error.
525 |  * MurmurHash uses the faster approach only on CPUs where we know it's safe.
526 |  *
527 |  * Note the preprocessor built-in defines can be emitted using:
528 |  *
529 |  *   gcc -m64 -dM -E - < /dev/null                  (on gcc)
530 |  *   cc -## a.c (where a.c is a simple test file)   (Sun Studio)
531 |  */
532 | #if (defined(__i386__) || defined(__x86_64__)  || defined(_M_IX86))
533 | #define MUR_GETBLOCK(p,i) ((p)[i]) /* unaligned 32-bit reads tolerated: index directly */
534 | #else /* non intel */
535 | #define MUR_PLUS0_ALIGNED(p) (((unsigned long)(p) & 0x3) == 0)
536 | #define MUR_PLUS1_ALIGNED(p) (((unsigned long)(p) & 0x3) == 1)
537 | #define MUR_PLUS2_ALIGNED(p) (((unsigned long)(p) & 0x3) == 2)
538 | #define MUR_PLUS3_ALIGNED(p) (((unsigned long)(p) & 0x3) == 3)
539 | #define WP(p) ((uint32_t*)((unsigned long)(p) & ~3UL)) /* p rounded down to a 4-byte boundary */
540 | #if (defined(__BIG_ENDIAN__) || defined(SPARC) || defined(__ppc__) || defined(__ppc64__))
541 | #define MUR_THREE_ONE(p) ((((*WP(p))&0x00ffffff) << 8) | (((*(WP(p)+1))&0xff000000) >> 24))
542 | #define MUR_TWO_TWO(p)   ((((*WP(p))&0x0000ffff) <<16) | (((*(WP(p)+1))&0xffff0000) >> 16))
543 | #define MUR_ONE_THREE(p) ((((*WP(p))&0x000000ff) <<24) | (((*(WP(p)+1))&0xffffff00) >>  8))
544 | #else /* assume little endian non-intel */
545 | #define MUR_THREE_ONE(p) ((((*WP(p))&0xffffff00) >> 8) | (((*(WP(p)+1))&0x000000ff) << 24))
546 | #define MUR_TWO_TWO(p)   ((((*WP(p))&0xffff0000) >>16) | (((*(WP(p)+1))&0x0000ffff) << 16))
547 | #define MUR_ONE_THREE(p) ((((*WP(p))&0xff000000) >>24) | (((*(WP(p)+1))&0x00ffffff) <<  8))
548 | #endif
549 | #define MUR_GETBLOCK(p,i) (MUR_PLUS0_ALIGNED(p) ? ((p)[i]) :                 \
550 |                             (MUR_PLUS1_ALIGNED(p) ? MUR_THREE_ONE((p)+(i)) : \
551 |                              (MUR_PLUS2_ALIGNED(p) ? MUR_TWO_TWO((p)+(i)) :  \
552 |                                                       MUR_ONE_THREE((p)+(i))))) /* misaligned: splice block i from the two aligned words it straddles */
553 | #endif
554 | #define MUR_ROTL32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) /* rotate a 32-bit x left by r bits; r must be 1..31 */
555 | #define MUR_FMIX(_h)                                                  \
556 | do { /* final mix of the 32-bit lvalue _h, in place */                \
557 |   _h ^= _h >> 16;                                                     \
558 |   _h *= 0x85ebca6bu;                                                  \
559 |   _h ^= _h >> 13;                                                     \
560 |   _h *= 0xc2b2ae35u; /* unsigned constant: no signed-long multiply */ \
561 |   _h ^= _h >> 16;                                                     \
562 | } while(0)
563 | 
564 | #define HASH_MUR(key,keylen,num_bkts,hashv,bkt)                        \
565 | do { /* hash keylen bytes at key into hashv; bkt is masked hashv */    \
566 |   const uint8_t *_mur_data = (const uint8_t*)(key);                    \
567 |   const int _mur_nblocks = (keylen) / 4;                               \
568 |   uint32_t _mur_h1 = 0xf88D5353;                                       \
569 |   uint32_t _mur_c1 = 0xcc9e2d51;                                       \
570 |   uint32_t _mur_c2 = 0x1b873593;                                       \
571 |   uint32_t _mur_k1 = 0;                                                \
572 |   const uint8_t *_mur_tail;                                            \
573 |   const uint32_t *_mur_blocks = (const uint32_t*)(_mur_data+_mur_nblocks*4); \
574 |   int _mur_i;                                                          \
575 |   for(_mur_i = -_mur_nblocks; _mur_i; _mur_i++) { /* negative index */ \
576 |     _mur_k1 = MUR_GETBLOCK(_mur_blocks,_mur_i);                        \
577 |     _mur_k1 *= _mur_c1;                                                \
578 |     _mur_k1 = MUR_ROTL32(_mur_k1,15);                                  \
579 |     _mur_k1 *= _mur_c2;                                                \
580 |                                                                        \
581 |     _mur_h1 ^= _mur_k1;                                                \
582 |     _mur_h1 = MUR_ROTL32(_mur_h1,13);                                  \
583 |     _mur_h1 = _mur_h1*5+0xe6546b64;                                    \
584 |   }                                                                    \
585 |   _mur_tail = (const uint8_t*)(_mur_data + _mur_nblocks*4);            \
586 |   _mur_k1=0;                                                           \
587 |   switch((keylen) & 3) { /* tail bytes; cases fall through */          \
588 |     case 3: _mur_k1 ^= _mur_tail[2] << 16;                             \
589 |     case 2: _mur_k1 ^= _mur_tail[1] << 8;                              \
590 |     case 1: _mur_k1 ^= _mur_tail[0];                                   \
591 |     _mur_k1 *= _mur_c1;                                                \
592 |     _mur_k1 = MUR_ROTL32(_mur_k1,15);                                  \
593 |     _mur_k1 *= _mur_c2;                                                \
594 |     _mur_h1 ^= _mur_k1;                                                \
595 |   }                                                                    \
596 |   _mur_h1 ^= (keylen);                                                 \
597 |   MUR_FMIX(_mur_h1);                                                   \
598 |   hashv = _mur_h1;                                                     \
599 |   bkt = hashv & ((num_bkts)-1); /* presumes power-of-two */            \
600 | } while(0)
601 | #endif  /* HASH_USING_NO_STRICT_ALIASING */
602 | 
603 | /* key comparison function; return 0 if keys equal (memcmp over len bytes) */
604 | #define HASH_KEYCMP(a,b,len) memcmp(a,b,len)
605 | 
606 | /* iterate over items in a known bucket to find desired item; out is set to the match, or NULL if none */
607 | #define HASH_FIND_IN_BKT(tbl,hh,head,keyptr,keylen_in,out)                       \
608 | do {                                                                             \
609 |  if ((head).hh_head) DECLTYPE_ASSIGN(out,ELMT_FROM_HH(tbl,(head).hh_head));      \
610 |  else out=NULL;                                                                  \
611 |  while (out) {                                                                   \
612 |     if ((out)->hh.keylen == (keylen_in)) { /* cheap length test first */         \
613 |         if ((HASH_KEYCMP((out)->hh.key,keyptr,keylen_in)) == 0) break;           \
614 |     }                                                                            \
615 |     if ((out)->hh.hh_next) DECLTYPE_ASSIGN(out,ELMT_FROM_HH(tbl,(out)->hh.hh_next)); \
616 |     else out = NULL;                                                             \
617 |  }                                                                               \
618 | } while(0)
619 | 
620 | /* add an item to the front of a bucket chain; expands the buckets when the chain reaches its threshold */
621 | #define HASH_ADD_TO_BKT(head,addhh)                                              \
622 | do {                                                                             \
623 |  (head).count++;                                                                 \
624 |  (addhh)->hh_next = (head).hh_head;                                              \
625 |  (addhh)->hh_prev = NULL;                                                        \
626 |  if ((head).hh_head) { (head).hh_head->hh_prev = (addhh); }                      \
627 |  (head).hh_head = (addhh);                                                       \
628 |  if ((head).count >= (((head).expand_mult+1) * HASH_BKT_CAPACITY_THRESH)         \
629 |      && (addhh)->tbl->noexpand != 1) {                                           \
630 |        HASH_EXPAND_BUCKETS((addhh)->tbl);                                        \
631 |  }                                                                               \
632 | } while(0)
633 | 
634 | /* remove an item from a given bucket */
635 | #define HASH_DEL_IN_BKT(hh,head,hh_del)                                          \
636 |     (head).count--;                                                              \
637 |     if ((head).hh_head == hh_del) {                                              \
638 |       (head).hh_head = hh_del->hh_next;                                          \
639 |     }                                                                            \
640 |     if (hh_del->hh_prev) {                                                       \
641 |         hh_del->hh_prev->hh_next = hh_del->hh_next;                              \
642 |     }                                                                            \
643 |     if (hh_del->hh_next) {                                                       \
644 |         hh_del->hh_next->hh_prev = hh_del->hh_prev;                              \
645 |     }
646 | 
647 | /* Bucket expansion has the effect of doubling the number of buckets
648 |  * and redistributing the items into the new buckets. Ideally the
649 |  * items will distribute more or less evenly into the new buckets
650 |  * (the extent to which this is true is a measure of the quality of
651 |  * the hash function as it applies to the key domain).
652 |  *
653 |  * With the items distributed into more buckets, the chain length
654 |  * (item count) in each bucket is reduced. Thus by expanding buckets
655 |  * the hash keeps a bound on the chain length. This bounded chain
656 |  * length is the essence of how a hash provides constant time lookup.
657 |  *
658 |  * The calculation of tbl->ideal_chain_maxlen below deserves some
659 |  * explanation. First, keep in mind that we're calculating the ideal
660 |  * maximum chain length based on the *new* (doubled) bucket count.
661 |  * In fractions this is just n/b (n=number of items,b=new num buckets).
662 |  * Since the ideal chain length is an integer, we want to calculate
663 |  * ceil(n/b). We don't depend on floating point arithmetic in this
664 |  * hash, so to calculate ceil(n/b) with integers we could write
665 |  *
666 |  *      ceil(n/b) = (n/b) + ((n%b)?1:0)
667 |  *
668 |  * and in fact a previous version of this hash did just that.
669 |  * But now we have improved things a bit by recognizing that b is
670 |  * always a power of two. We keep its base 2 log handy (call it lb),
671 |  * so now we can write this with a bit shift and logical AND:
672 |  *
673 |  *      ceil(n/b) = (n>>lb) + ( (n & (b-1)) ? 1:0)
674 |  *
675 |  */
676 | #define HASH_EXPAND_BUCKETS(tbl)                                                 \
677 | do { /* double the bucket array and rehash every item into it */                 \
678 |     unsigned _he_bkt;                                                            \
679 |     unsigned _he_bkt_i;                                                          \
680 |     struct UT_hash_handle *_he_thh, *_he_hh_nxt;                                 \
681 |     UT_hash_bucket *_he_new_buckets, *_he_newbkt;                                \
682 |     _he_new_buckets = (UT_hash_bucket*)uthash_malloc(                            \
683 |              2 * tbl->num_buckets * sizeof(struct UT_hash_bucket));              \
684 |     if (!_he_new_buckets) { uthash_fatal( "out of memory"); }                    \
685 |     memset(_he_new_buckets, 0,                                                   \
686 |             2 * tbl->num_buckets * sizeof(struct UT_hash_bucket));               \
687 |     tbl->ideal_chain_maxlen =                                                    \
688 |        (tbl->num_items >> (tbl->log2_num_buckets+1)) +                           \
689 |        ((tbl->num_items & ((tbl->num_buckets*2)-1)) ? 1 : 0);                    \
690 |     tbl->nonideal_items = 0; /* recounted during the rehash below */             \
691 |     for(_he_bkt_i = 0; _he_bkt_i < tbl->num_buckets; _he_bkt_i++)                \
692 |     {                                                                            \
693 |         _he_thh = tbl->buckets[ _he_bkt_i ].hh_head;                             \
694 |         while (_he_thh) {                                                        \
695 |            _he_hh_nxt = _he_thh->hh_next; /* saved: relinking overwrites it */   \
696 |            HASH_TO_BKT( _he_thh->hashv, tbl->num_buckets*2, _he_bkt);            \
697 |            _he_newbkt = &(_he_new_buckets[ _he_bkt ]);                           \
698 |            if (++(_he_newbkt->count) > tbl->ideal_chain_maxlen) {                \
699 |              tbl->nonideal_items++;                                              \
700 |              _he_newbkt->expand_mult = _he_newbkt->count /                       \
701 |                                         tbl->ideal_chain_maxlen;                 \
702 |            }                                                                     \
703 |            _he_thh->hh_prev = NULL;                                              \
704 |            _he_thh->hh_next = _he_newbkt->hh_head;                               \
705 |            if (_he_newbkt->hh_head) _he_newbkt->hh_head->hh_prev =               \
706 |                 _he_thh;                                                         \
707 |            _he_newbkt->hh_head = _he_thh;                                        \
708 |            _he_thh = _he_hh_nxt;                                                 \
709 |         }                                                                        \
710 |     }                                                                            \
711 |     uthash_free( tbl->buckets, tbl->num_buckets*sizeof(struct UT_hash_bucket) ); \
712 |     tbl->num_buckets *= 2;                                                       \
713 |     tbl->log2_num_buckets++;                                                     \
714 |     tbl->buckets = _he_new_buckets;                                              \
715 |     tbl->ineff_expands = (tbl->nonideal_items > (tbl->num_items >> 1)) ?         \
716 |         (tbl->ineff_expands+1) : 0;                                              \
717 |     if (tbl->ineff_expands > 1) {                                                \
718 |         tbl->noexpand=1; /* two poor expansions in a row: stop expanding */      \
719 |         uthash_noexpand_fyi(tbl);                                                \
720 |     }                                                                            \
721 |     uthash_expand_fyi(tbl);                                                      \
722 | } while(0)
723 | 
724 | 
725 | /* This is an adaptation of Simon Tatham's O(n log(n)) mergesort */
726 | /* Note that HASH_SORT assumes the hash handle name to be hh.
727 |  * HASH_SRT was added to allow the hash handle name to be passed in. */
728 | #define HASH_SORT(head,cmpfcn) HASH_SRT(hh,head,cmpfcn) /* sort using the hash handle named hh */
729 | #define HASH_SRT(hh,head,cmpfcn)                                                 \
730 | do { /* bottom-up merge sort over the items' next/prev links */                  \
731 |   unsigned _hs_i;                                                                \
732 |   unsigned _hs_looping,_hs_nmerges,_hs_insize,_hs_psize,_hs_qsize;               \
733 |   struct UT_hash_handle *_hs_p, *_hs_q, *_hs_e, *_hs_list, *_hs_tail;            \
734 |   if (head) {                                                                    \
735 |       _hs_insize = 1; /* run length, doubled each pass */                        \
736 |       _hs_looping = 1;                                                           \
737 |       _hs_list = &((head)->hh);                                                  \
738 |       while (_hs_looping) {                                                      \
739 |           _hs_p = _hs_list;                                                      \
740 |           _hs_list = NULL;                                                       \
741 |           _hs_tail = NULL;                                                       \
742 |           _hs_nmerges = 0;                                                       \
743 |           while (_hs_p) {                                                        \
744 |               _hs_nmerges++;                                                     \
745 |               _hs_q = _hs_p;                                                     \
746 |               _hs_psize = 0;                                                     \
747 |               for ( _hs_i = 0; _hs_i  < _hs_insize; _hs_i++ ) {                  \
748 |                   _hs_psize++;                                                   \
749 |                   _hs_q = (UT_hash_handle*)((_hs_q->next) ?                      \
750 |                           ((void*)((char*)(_hs_q->next) +                        \
751 |                           (head)->hh.tbl->hho)) : NULL);                         \
752 |                   if (! (_hs_q) ) break;                                         \
753 |               }                                                                  \
754 |               _hs_qsize = _hs_insize;                                            \
755 |               while ((_hs_psize > 0) || ((_hs_qsize > 0) && _hs_q )) {           \
756 |                   if (_hs_psize == 0) {                                          \
757 |                       _hs_e = _hs_q;                                             \
758 |                       _hs_q = (UT_hash_handle*)((_hs_q->next) ?                  \
759 |                               ((void*)((char*)(_hs_q->next) +                    \
760 |                               (head)->hh.tbl->hho)) : NULL);                     \
761 |                       _hs_qsize--;                                               \
762 |                   } else if ( (_hs_qsize == 0) || !(_hs_q) ) {                   \
763 |                       _hs_e = _hs_p;                                             \
764 |                       if (_hs_p){                                                \
765 |                         _hs_p = (UT_hash_handle*)((_hs_p->next) ?                \
766 |                                 ((void*)((char*)(_hs_p->next) +                  \
767 |                                 (head)->hh.tbl->hho)) : NULL);                   \
768 |                        }                                                         \
769 |                       _hs_psize--;                                               \
770 |                   } else if ((                                                   \
771 |                       cmpfcn(DECLTYPE(head)(ELMT_FROM_HH((head)->hh.tbl,_hs_p)), \
772 |                              DECLTYPE(head)(ELMT_FROM_HH((head)->hh.tbl,_hs_q))) \
773 |                              ) <= 0) {                                           \
774 |                       _hs_e = _hs_p;                                             \
775 |                       if (_hs_p){                                                \
776 |                         _hs_p = (UT_hash_handle*)((_hs_p->next) ?                \
777 |                                ((void*)((char*)(_hs_p->next) +                   \
778 |                                (head)->hh.tbl->hho)) : NULL);                    \
779 |                        }                                                         \
780 |                       _hs_psize--;                                               \
781 |                   } else {                                                       \
782 |                       _hs_e = _hs_q;                                             \
783 |                       _hs_q = (UT_hash_handle*)((_hs_q->next) ?                  \
784 |                               ((void*)((char*)(_hs_q->next) +                    \
785 |                               (head)->hh.tbl->hho)) : NULL);                     \
786 |                       _hs_qsize--;                                               \
787 |                   }                                                              \
788 |                   if ( _hs_tail ) {                                              \
789 |                       _hs_tail->next = ((_hs_e) ?                                \
790 |                             ELMT_FROM_HH((head)->hh.tbl,_hs_e) : NULL);          \
791 |                   } else {                                                       \
792 |                       _hs_list = _hs_e;                                          \
793 |                   }                                                              \
794 |                   if (_hs_e) {                                                   \
795 |                   _hs_e->prev = ((_hs_tail) ?                                    \
796 |                      ELMT_FROM_HH((head)->hh.tbl,_hs_tail) : NULL);              \
797 |                   }                                                              \
798 |                   _hs_tail = _hs_e;                                              \
799 |               }                                                                  \
800 |               _hs_p = _hs_q;                                                     \
801 |           }                                                                      \
802 |           if (_hs_tail){                                                         \
803 |             _hs_tail->next = NULL;                                               \
804 |           }                                                                      \
805 |           if ( _hs_nmerges <= 1 ) { /* one merge covered the list: sorted */     \
806 |               _hs_looping=0;                                                     \
807 |               (head)->hh.tbl->tail = _hs_tail;                                   \
808 |               DECLTYPE_ASSIGN(head,ELMT_FROM_HH((head)->hh.tbl, _hs_list));      \
809 |           }                                                                      \
810 |           _hs_insize *= 2;                                                       \
811 |       }                                                                          \
812 |       HASH_FSCK(hh,head);                                                        \
813 |  }                                                                               \
814 | } while (0)
815 | 
816 | /* This function selects items from one hash into another hash.
817 |  * The end result is that the selected items have dual presence
818 |  * in both hashes. There is no copy of the items made; rather
819 |  * they are added into the new hash through a secondary
820 |  * hash handle that must be present in the structure. */
821 | #define HASH_SELECT(hh_dst, dst, hh_src, src, cond)                              \
822 | do { /* link every src item satisfying cond into dst via hh_dst */               \
823 |   unsigned _src_bkt, _dst_bkt;                                                   \
824 |   void *_last_elt=NULL, *_elt;                                                   \
825 |   UT_hash_handle *_src_hh, *_dst_hh, *_last_elt_hh=NULL;                         \
826 |   ptrdiff_t _dst_hho = ((char*)(&(dst)->hh_dst) - (char*)(dst));                 \
827 |   if (src) {                                                                     \
828 |     for(_src_bkt=0; _src_bkt < (src)->hh_src.tbl->num_buckets; _src_bkt++) {     \
829 |       for(_src_hh = (src)->hh_src.tbl->buckets[_src_bkt].hh_head;                \
830 |           _src_hh;                                                               \
831 |           _src_hh = _src_hh->hh_next) {                                          \
832 |           _elt = ELMT_FROM_HH((src)->hh_src.tbl, _src_hh);                       \
833 |           if (cond(_elt)) {                                                      \
834 |             _dst_hh = (UT_hash_handle*)(((char*)_elt) + _dst_hho);               \
835 |             _dst_hh->key = _src_hh->key;                                         \
836 |             _dst_hh->keylen = _src_hh->keylen;                                   \
837 |             _dst_hh->hashv = _src_hh->hashv;                                     \
838 |             _dst_hh->prev = _last_elt;                                           \
839 |             _dst_hh->next = NULL;                                                \
840 |             if (_last_elt_hh) { _last_elt_hh->next = _elt; }                     \
841 |             if (!dst) { /* first selected item creates the dst table */          \
842 |               DECLTYPE_ASSIGN(dst,_elt);                                         \
843 |               HASH_MAKE_TABLE(hh_dst,dst);                                       \
844 |             } else {                                                             \
845 |               _dst_hh->tbl = (dst)->hh_dst.tbl;                                  \
846 |             }                                                                    \
847 |             HASH_TO_BKT(_dst_hh->hashv, _dst_hh->tbl->num_buckets, _dst_bkt);    \
848 |             HASH_ADD_TO_BKT(_dst_hh->tbl->buckets[_dst_bkt],_dst_hh);            \
849 |             (dst)->hh_dst.tbl->num_items++;                                      \
850 |             _last_elt = _elt;                                                    \
851 |             _last_elt_hh = _dst_hh;                                              \
852 |           }                                                                      \
853 |       }                                                                          \
854 |     }                                                                            \
855 |   }                                                                              \
856 |   HASH_FSCK(hh_dst,dst);                                                         \
857 | } while (0)
858 | 
859 | #define HASH_CLEAR(hh,head)                                                      \
860 | do { /* drop all table bookkeeping for head; the items are not freed */          \
861 |   if ((head) != NULL) {                                                          \
862 |     uthash_free((head)->hh.tbl->buckets, /* the bucket array */                  \
863 |                 sizeof(struct UT_hash_bucket) * (head)->hh.tbl->num_buckets);    \
864 |     HASH_BLOOM_FREE((head)->hh.tbl);                                             \
865 |     uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); /* the table itself */   \
866 |     (head) = NULL; /* a NULL head denotes an empty hash */                       \
867 |   }                                                                              \
868 | } while (0)
869 | 
870 | #define HASH_OVERHEAD(hh,head) /* bookkeeping bytes: handles+buckets+table+bloom */ \
871 |  (((head) == NULL) ? (size_t)0 : /* NULL head = empty hash, no table to read */  \
872 |   (size_t)(((head)->hh.tbl->num_items   * sizeof(UT_hash_handle))   +            \
873 |            ((head)->hh.tbl->num_buckets * sizeof(UT_hash_bucket))   +            \
874 |             (sizeof(UT_hash_table)) + (HASH_BLOOM_BYTELEN)))
875 | 
876 | #ifdef NO_DECLTYPE
877 | #define HASH_ITER(hh,head,el,tmp)                                                \
878 | for((el)=(head), (*(char**)(&(tmp)))=(char*)((head)?(head)->hh.next:NULL);       \
879 |   el; (el)=(tmp),(*(char**)(&(tmp)))=(char*)((tmp)?(tmp)->hh.next:NULL))
880 | #else
881 | #define HASH_ITER(hh,head,el,tmp)                                                \
882 | for((el)=(head),(tmp)=DECLTYPE(el)((head)?(head)->hh.next:NULL);                 \
883 |   el; (el)=(tmp),(tmp)=DECLTYPE(el)((tmp)?(tmp)->hh.next:NULL))
884 | #endif
885 | 
886 | /* number of items currently in the hash; a NULL (empty) head counts as zero */
887 | #define HASH_COUNT(head) HASH_CNT(hh,head) /* uses the handle member named hh */
888 | #define HASH_CNT(hh,head) (((head) != NULL) ? ((head)->hh.tbl->num_items) : 0U)
889 | 
890 | typedef struct UT_hash_bucket { /* one slot of a table's buckets array */
891 |    struct UT_hash_handle *hh_head; /* presumably the first handle of this bucket's chain (linked by hh_prev/hh_next) */
892 |    unsigned count; /* presumably the current chain length; maintained outside this view -- TODO confirm */
893 | 
894 |    /* expand_mult is normally set to 0. In this situation, the max chain length
895 |     * threshold is enforced at its default value, HASH_BKT_CAPACITY_THRESH. (If
896 |     * the bucket's chain exceeds this length, bucket expansion is triggered).
897 |     * However, setting expand_mult to a non-zero value delays bucket expansion
898 |     * (that would be triggered by additions to this particular bucket)
899 |     * until its chain length reaches a *multiple* of HASH_BKT_CAPACITY_THRESH.
900 |     * (The multiplier is simply expand_mult+1). The whole idea of this
901 |     * multiplier is to reduce bucket expansions, since they are expensive, in
902 |     * situations where we know that a particular bucket tends to be overused.
903 |     * It is better to let its chain length grow to a longer yet-still-bounded
904 |     * value, than to do an O(n) bucket expansion too often.
905 |     */
906 |    unsigned expand_mult;
907 | 
908 | } UT_hash_bucket;
909 | 
910 | /* random signatures used only to find hash tables (and bloom filters) in external analysis */
911 | #define HASH_SIGNATURE 0xa0111fe1 /* presumably the value stored in UT_hash_table.signature */
912 | #define HASH_BLOOM_SIGNATURE 0xb12220f2 /* presumably the value stored in UT_hash_table.bloom_sig */
913 | 
914 | typedef struct UT_hash_table { /* per-hash bookkeeping, reached through each handle's tbl pointer */
915 |    UT_hash_bucket *buckets; /* bucket array; HASH_CLEAR frees it as num_buckets entries */
916 |    unsigned num_buckets, log2_num_buckets; /* log2_num_buckets: presumably log2(num_buckets) -- TODO confirm */
917 |    unsigned num_items; /* number of items in the hash; this is what HASH_CNT reports */
918 |    struct UT_hash_handle *tail; /* tail hh in app order, for fast append    */
919 |    ptrdiff_t hho; /* hash handle offset (byte pos of hash handle in element) */
920 | 
921 |    /* in an ideal situation (all buckets used equally), no bucket would have
922 |     * more than ceil(#items/#buckets) items. that's the ideal chain length. */
923 |    unsigned ideal_chain_maxlen;
924 | 
925 |    /* nonideal_items is the number of items in the hash whose chain position
926 |     * exceeds the ideal chain maxlen. these items pay the penalty for an uneven
927 |     * hash distribution; reaching them in a chain traversal takes >ideal steps */
928 |    unsigned nonideal_items;
929 | 
930 |    /* ineffective expands occur when a bucket doubling was performed, but
931 |     * afterward, more than half the items in the hash had nonideal chain
932 |     * positions. If this happens on two consecutive expansions we inhibit any
933 |     * further expansion, as it's not helping; this happens when the hash
934 |     * function isn't a good fit for the key domain. When expansion is inhibited
935 |     * the hash will still work, albeit no longer in constant time. */
936 |    unsigned ineff_expands, noexpand;
937 | 
938 |    uint32_t signature; /* used only to find hash tables in external analysis */
939 | #ifdef HASH_BLOOM
940 |    uint32_t bloom_sig; /* used only to test bloom exists in external analysis */
941 |    uint8_t *bloom_bv; /* presumably the bloom filter bit vector -- TODO confirm against the HASH_BLOOM macros */
942 |    char bloom_nbits; /* presumably log2 of the bloom filter size in bits -- TODO confirm */
943 | #endif
944 | 
945 | } UT_hash_table;
946 | 
947 | typedef struct UT_hash_handle { /* per-element bookkeeping, embedded in each hashed element */
948 |    struct UT_hash_table *tbl;        /* table this element is linked into */
949 |    void *prev;                       /* prev element in app order      */
950 |    void *next;                       /* next element in app order      */
951 |    struct UT_hash_handle *hh_prev;   /* previous hh in bucket order    */
952 |    struct UT_hash_handle *hh_next;   /* next hh in bucket order        */
953 |    void *key;                        /* ptr to enclosing struct's key  */
954 |    unsigned keylen;                  /* enclosing struct's key len     */
955 |    unsigned hashv;                   /* result of hash-fcn(key)        */
956 | } UT_hash_handle;
957 | 
958 | #endif /* UTHASH_H */
959 | 


--------------------------------------------------------------------------------