├── .gitignore
├── test.sh
├── Makefile
├── include
    ├── well.h
    ├── cluster.h
    ├── distortion.h
    ├── quantizer.h
    ├── util.h
    ├── lines.h
    ├── pmf.h
    ├── qv_compressor.h
    └── codebook.h
├── generate_rd.sh
├── src
    ├── Makefile
    ├── Makefile.apple
    ├── mse_check.c
    ├── well.c
    ├── util.c
    ├── os_stream.c
    ├── qv_stream.c
    ├── lines.c
    ├── distortion.c
    ├── arith.c
    ├── quantizer.c
    ├── cluster.c
    ├── qv_compressor.c
    ├── main.c
    ├── pmf.c
    └── codebook.c
├── README.md
└── COPYING


/.gitignore:
--------------------------------------------------------------------------------
1 | *.txt2
2 | *.swp
3 | .DS_Store
4 | *.o
5 | 


--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -v
 4 | 
 5 | make clean
 6 | make 
 7 | bin/qvz -u fref.txt -c 1 -f 0.5 -s test.in test.q > write
 8 | bin/qvz -x test.q test.dec > read
 9 | diff fref.txt test.dec
10 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | all:
 2 | 	$(MAKE) -C src
 3 | 	mkdir -p bin
 4 | 	mv src/qvz bin/qvz
 5 | 
 6 | debug:
 7 | 	$(MAKE) -C src debug
 8 | 	mkdir -p bin
 9 | 	mv src/qvz bin/qvz
10 | 
11 | clean:
12 | 	$(MAKE) -C src clean
13 | 	rm -f bin/qvz
14 | 


--------------------------------------------------------------------------------
/include/well.h:
--------------------------------------------------------------------------------
 1 | #ifndef _WELL_H_
 2 | #define _WELL_H_
 3 | /**
 4 |  * Declarations necessary for using the WELL-1024a PRNG in other code
 5 |  */
 6 | 
 7 | #include <stdint.h>
 8 | 
/**
 * State for the WELL-1024a PRNG, together with a one-word buffer that
 * well_1024a_bits() uses to hand out fewer than 32 bits at a time
 */
struct well_state_t {
	uint32_t state[32];		// Generator state words, indexed modulo 32 by well_1024a()
	uint32_t n;				// Index of the current state word; well_1024a() stores it masked to 0..31
	uint32_t bit_output;	// Last generated word, shifted right as well_1024a_bits() consumes bits from it
	uint32_t bits_left;		// Number of bits in bit_output that have not been handed out yet
};
15 | 
16 | 
17 | uint32_t well_1024a(struct well_state_t *state);
18 | uint32_t well_1024a_bits(struct well_state_t *state, uint8_t bits);
19 | 
20 | #endif
21 | 


--------------------------------------------------------------------------------
/generate_rd.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | idx=0
 4 | STATSFILE=`mktemp rd_statsXXXXXX`
 5 | while [ $idx -lt 20 ]; do
 6 | 	# If doing ratiod encoding
 7 | 	comp=$(echo "$idx*0.05" | bc -l)
 8 | 	bin/qvz -c 1 -f $comp -s $1 $2 | tee -a $STATSFILE
 9 | 
10 | 	# Fixed rate encoding
11 | 	#comp=$(echo "$idx*0.10" | bc -l)
12 | 	#bin/qvz -c 3 -r $comp -s $1 $2 | tee -a $STATSFILE
13 | 
14 | 	idx=$((idx+1))
15 | done
16 | awk '{print $2 $4 $6}' $STATSFILE > $3
17 | rm -f $STATSFILE
18 | 


--------------------------------------------------------------------------------
/src/Makefile:
--------------------------------------------------------------------------------
 1 | # Makefile for building C programs to do encoding, decoding, and clustering
 2 | 
 3 | SRC=well.c codebook.c main.c util.c lines.c quantizer.c pmf.c distortion.c qv_stream.c qv_compressor.c arith.c os_stream.c cluster.c
 4 | 
 5 | OBJ=$(SRC:.c=.o)
 6 | 
 7 | CC=gcc
 8 | RM=rm -f
 9 | 
10 | CFLAGS=-O3 -Wall -I../include -DLINUX
11 | LDFLAGS=-lc -lm -lrt
12 | 
13 | %.o : %.c
14 | 	$(CC) $(CFLAGS) -c $<
15 | 
16 | all : qvz
17 | 
18 | qvz : $(OBJ)
19 | 	$(CC) $(OBJ) -o qvz $(LDFLAGS)
20 | 
21 | debug : CFLAGS += -DDEBUG -ggdb -O0
22 | debug : qvz
23 | 
24 | clean :
25 | 	$(RM) *.o qvz
26 | 


--------------------------------------------------------------------------------
/src/Makefile.apple:
--------------------------------------------------------------------------------
 1 | # Makefile for building C programs to do encoding, decoding, and clustering
 2 | 
 3 | SRC=well.c codebook.c main.c util.c lines.c quantizer.c pmf.c distortion.c qv_stream.c qv_compressor.c arith.c os_stream.c cluster.c
 4 | 
 5 | OBJ=$(SRC:.c=.o)
 6 | 
 7 | CC=gcc
 8 | RM=rm -f
 9 | 
10 | CFLAGS=-O3 -Wall -I../include -D__APPLE__
11 | LDFLAGS=-lc -lm -lrt
12 | 
13 | %.o : %.c
14 | 	$(CC) $(CFLAGS) -c $<
15 | 
16 | all : qvz
17 | 
18 | qvz : $(OBJ)
19 | 	$(CC) $(OBJ) -o qvz $(LDFLAGS)
20 | 
21 | debug : CFLAGS += -DDEBUG -ggdb
22 | debug : qvz
23 | 
24 | clean :
25 | 	$(RM) *.o qvz
26 | 


--------------------------------------------------------------------------------
/src/mse_check.c:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | #include <stdio.h>
 3 | 
 4 | int main( int argc, char **argv){
 5 | 	
 6 | 	FILE *f1, *f2;
 7 | 	char *line1, *line2;
 8 | 	int error = 0, i = 0;
 9 | 	unsigned int num_lines = 0, columns = 36, num_lines_total = 500000;
10 | 	double distortion = 0.0;
11 | 
12 | 	line1 = (char*)calloc(4096, 1);
13 | 	line2 = (char*)calloc(4096, 1);
14 | 	f1 = fopen(argv[1], "r");
15 | 	f2 = fopen(argv[2], "r");
16 | 	
17 | 	fgets(line1, columns+2, f1);
18 | 	fgets(line2, columns+2, f2);
19 | 	do{
20 | 		num_lines++;
21 | 		error = 0;	
22 | 		if(num_lines%1000000 == 0) printf("Line: %dM\n", num_lines/1000000);
23 | 		for(i = 0; i<columns;i++){
24 | 			error += ((int)line1[i] - (int)line2[i])*((int)line1[i] - (int)line2[i]);
25 | 		}
26 | 		distortion += error / ( (double)columns);
27 | 		fgets(line1, columns+2, f1);
28 | 		fgets(line2, columns+2, f2);
29 | 	}while (num_lines < num_lines_total);
30 | 	distortion = distortion / (double)num_lines;
31 | 	printf ("MSE:%f\n", distortion);
32 | 
33 | }
34 | 


--------------------------------------------------------------------------------
/include/cluster.h:
--------------------------------------------------------------------------------
 1 | #ifndef _CLUSTER_H_
 2 | #define _CLUSTER_H_
 3 | 
 4 | // All of our structures are declared in lines.h
 5 | // Because otherwise it makes this into a definition nightmare
 6 | 
 7 | #include "lines.h"
 8 | 
 9 | #define MAX_KMEANS_ITERATIONS 1000
10 | 
11 | // Memory management
12 | struct cluster_list_t *alloc_cluster_list(struct quality_file_t *info);
13 | void free_cluster_list(struct cluster_list_t *);
14 | 
15 | // Clustering algorithm internals
16 | uint8_t cluster_lines(struct line_block_t *block, struct quality_file_t *info);
17 | double recalculate_means(struct quality_file_t *info);
18 | uint8_t do_cluster_assignment(struct line_t *line, struct quality_file_t *info);
19 | uint8_t assign_cluster(struct line_t *line, struct quality_file_t *info);
20 | void find_distance(struct line_t *line, struct cluster_t *cluster, struct quality_file_t *t);
21 | 
22 | // Clustering interface
23 | void initialize_kmeans_clustering(struct quality_file_t *info);
24 | void do_kmeans_clustering(struct quality_file_t *info);
25 | 
26 | #endif
27 | 


--------------------------------------------------------------------------------
/include/distortion.h:
--------------------------------------------------------------------------------
 1 | #ifndef _DISTORTION_H_
 2 | #define _DISTORTION_H_
 3 | 
 4 | #include <stdint.h>
 5 | 
 6 | // Constants for generating distortion matrices
 7 | #define DISTORTION_MANHATTAN		1
 8 | #define DISTORTION_MSE				2
 9 | #define DISTORTION_LORENTZ			3
10 | #define DISTORTION_CUSTOM			4
11 | 
/**
 * Used to store distortion matrix information that is used during quantizer generation
 */
struct distortion_t {
	// Distortion values; presumably a symbols x symbols matrix addressed through
	// get_distortion(dist, x, y) -- TODO confirm the layout in distortion.c
	double *distortion;
	// Symbol count; presumably the value passed to alloc_distortion_matrix() -- confirm
	uint8_t symbols;
};
19 | 
20 | // Memory management functions
21 | struct distortion_t *alloc_distortion_matrix(uint8_t symbols);
22 | void free_distortion_matrix(struct distortion_t *);
23 | 
24 | // Methods for generating distortion matrices of different types
25 | struct distortion_t *generate_distortion_matrix(uint8_t symbols, int type);
26 | struct distortion_t *gen_mse_distortion(uint8_t symbols);
27 | struct distortion_t *gen_manhattan_distortion(uint8_t symbols);
28 | struct distortion_t *gen_lorentzian_distortion(uint8_t symbols);
29 | struct distortion_t *gen_custom_distortion(uint8_t symbols, const char *filename);
30 | 
31 | // Accessors
32 | double get_distortion(struct distortion_t *dist, uint8_t x, uint8_t y);
33 | 
34 | void print_distortion(struct distortion_t *dist);
35 | 
36 | #endif
37 | 


--------------------------------------------------------------------------------
/include/quantizer.h:
--------------------------------------------------------------------------------
 1 | #ifndef _QUANTIZER_H_
 2 | #define _QUANTIZER_H_
 3 | 
 4 | #include <stdint.h>
 5 | 
 6 | #include "pmf.h"
 7 | #include "distortion.h"
 8 | #include "util.h"
 9 | 
10 | #define QUANTIZER_MAX_ITER		100
11 | 
/**
 * Structure holding information about a quantizer, which just maps input symbols
 * to output symbols for a specific alphabet
 */
struct quantizer_t {
	const struct alphabet_t *restrict alphabet;			// Input alphabet (not modifiable through this pointer)
	struct alphabet_t *restrict output_alphabet;		// Output alphabet; presumably filled in by find_output_alphabet() -- confirm
	symbol_t *restrict q;								// Mapping table; presumably one output symbol per input alphabet entry -- confirm in quantizer.c
    double ratio;										// NOTE(review): meaning not visible from this header; verify against quantizer.c
	double mse;											// presumably the expected MSE of this quantizer -- confirm
};
23 | 
24 | // Memory management
25 | struct quantizer_t *alloc_quantizer(const struct alphabet_t *);
26 | void free_quantizer(struct quantizer_t *);
27 | 
28 | // Generates a quantizer via optimization
29 | struct quantizer_t *generate_quantizer(struct pmf_t *restrict pmf, struct distortion_t *restrict dist, uint32_t states);
30 | 
31 | // Calculate the output pmf when the quantizer is applied to the input pmf
32 | struct pmf_t *apply_quantizer(struct quantizer_t *restrict q, struct pmf_t *restrict pmf, struct pmf_t *restrict output);
33 | 
34 | // Find the output alphabet of a quantizer
35 | void find_output_alphabet(struct quantizer_t *);
36 | 
37 | // Display/debugging
38 | void print_quantizer(struct quantizer_t *);
39 | 
40 | #endif
41 | 


--------------------------------------------------------------------------------
/src/well.c:
--------------------------------------------------------------------------------
 1 | #include "well.h"
 2 | 
 3 | /**
 4 |  * Implementation of WELL-1024a PRNG
 5 |  * @param well_state_t RNG state to use for generation
 6 |  * @return 32 bit int random number
 7 |  */
 8 | uint32_t well_1024a(struct well_state_t *state) {
 9 | 	uint32_t *s = state->state;
10 | 	uint32_t n = state->n;
11 | 
12 | 	uint32_t z0 = s[(n+31)&31];
13 | 	uint32_t v_m1 = s[(n+3)&31];
14 | 	uint32_t v_m2 = s[(n+24)&31];
15 | 	uint32_t v_m3 = s[(n+10)&31];
16 | 	uint32_t z1 = s[n] ^ (v_m1 ^ (v_m1 >> 8));
17 | 	uint32_t z2 = (v_m2 ^ (v_m2 << 19)) ^ (v_m3 ^ (v_m3 << 14));
18 | 
19 | 	s[n] = z1 ^ z2;
20 | 	n = (n + 31) & 31;
21 | 	s[n] = (z0 ^ (z0 << 11)) ^ (z1 ^ (z1 << 7)) ^ (z2 ^ (z2 << 13));
22 | 	state->n = n;
23 | 	return s[n];
24 | }
25 | 
26 | /**
27 |  * Produces a number of random bits generated by well, amoritizing the cost of
28 |  * running the PRNG if the number of bits is less than 32
29 |  * @param state RNG state to use for generation
30 |  * @param bits Number of bits to generate
31 |  * @return Random integer of the specified number of bits, contained in 32 bits
32 |  */
33 | uint32_t well_1024a_bits(struct well_state_t *state, uint8_t bits) {
34 | 	uint32_t mask = (1 << bits) - 1;
35 | 	uint32_t rtn;
36 | 
37 | 	if (state->bits_left < bits) {
38 | 		state->bit_output = well_1024a(state);
39 | 		state->bits_left = 32;
40 | 	}
41 | 
42 | 	rtn = state->bit_output & mask;
43 | 	state->bit_output = state->bit_output >> bits;
44 | 	state->bits_left -= bits;
45 | 	return rtn;
46 | }
47 | 


--------------------------------------------------------------------------------
/include/util.h:
--------------------------------------------------------------------------------
 1 | #ifndef _UTIL_H_
 2 | #define _UTIL_H_
 3 | /**
 4 |  * Utility functions to help do stuff and manage cross-platform issues
 5 |  */
 6 | 
 7 | #define _CRT_SECURE_NO_WARNINGS
 8 | 
 9 | #include <math.h>
10 | #include <stdlib.h>
11 | #include <float.h>
12 | #include <sys/types.h>
13 | #include <sys/stat.h>
14 | 
15 | #ifdef LINUX
16 | 	#include <time.h>
17 | 	#define _stat stat
18 | 	#define _alloca alloca
19 | 	#define restrict __restrict__
20 | #elif __APPLE__
21 |     #include <time.h>
22 |     #define _stat stat
23 |     #define _alloca alloca
24 | #else
25 |   #include <malloc.h>
26 | 	#include <windows.h>
27 | 	#define restrict __restrict
28 | #endif
29 | 
/**
 * Platform dependent storage for the timer interface below: start and stop are
 * written by start_timer() and stop_timer() and read by get_timer_interval()
 */
struct hrtimer_t {
#ifdef LINUX
	struct timespec start;
	struct timespec stop;
	struct timespec res;		// NOTE(review): not referenced by the timer functions in util.c
#elif __APPLE__
	struct timespec start;
	struct timespec stop;
	struct timespec res;		// NOTE(review): not referenced by the timer functions in util.c
#else
	// Windows performance counter values
	LARGE_INTEGER start;
	LARGE_INTEGER stop;
	LARGE_INTEGER freq;			// Counter frequency, captured by start_timer()
#endif
};
45 | 
46 | // Cross platform timer interface
47 | void start_timer(struct hrtimer_t *timer);
48 | void stop_timer(struct hrtimer_t *timer);
49 | double get_timer_interval(struct hrtimer_t *timer);
50 | 
51 | // ceiling(log2()) function used in bit calculations
52 | int cb_log2(int x);
53 | 
54 | // Missing log2 function
55 | #ifndef LINUX
56 | 	#define log2(x) (log(x)/log(2.0))
57 | #endif
58 | 
59 | // Missing math symbols
60 | #ifndef INFINITY
61 | 	#define INFINITY (DBL_MAX + DBL_MAX)
62 | #endif
63 | #ifndef NAN
64 | 	#define NAN (INFINITY - INFINITY)
65 | #endif
66 | 
67 | #endif
68 | 


--------------------------------------------------------------------------------
/include/lines.h:
--------------------------------------------------------------------------------
 1 | #ifndef _LINES_H_
 2 | #define _LINES_H_
 3 | 
 4 | 
 5 | #include <stdint.h>
 6 | 
 7 | #include "pmf.h"
 8 | #include "distortion.h"
 9 | #include "well.h"
10 | 
11 | // This limits us to chunks that aren't too big to fit into a modest amount of memory at a time
12 | #define MAX_LINES_PER_BLOCK			1000000
13 | #define MAX_READS_PER_LINE			1022
14 | #define READ_LINEBUF_LENGTH			(MAX_READS_PER_LINE+2)
15 | 
16 | // Error codes for reading a line block
17 | #define LF_ERROR_NONE				0
18 | #define LF_ERROR_NOT_FOUND			1
19 | #define LF_ERROR_NO_MEMORY			2
20 | #define LF_ERROR_TOO_LONG			4
21 | 
22 | /**
23 |  * Points to a single line, which may be a pointer to a file in memory
24 |  */
25 | struct line_t {
26 | 	uint8_t cluster;		// Assigned cluster ID
27 | 	const symbol_t *m_data;	// Pointer to part of mmap'd region, has no offsets applied, do not modify!
28 | };
29 | 
/**
 * Points to a block of lines for incremental processing
 */
struct line_block_t {
	uint32_t count;				// presumably the number of entries in lines -- confirm in lines.c
	struct line_t *lines;		// Array of line descriptors belonging to this block
};
37 | 
38 | /**
39 |  * Stores information about a specific cluster
40 |  */
41 | struct cluster_t {
42 | 	// Used to do clustering
43 | 	uint8_t id;					// Cluster ID
44 | 	uint32_t count;				// Number of lines in this cluster
45 | 	symbol_t *mean;				// Mean values for this cluster
46 | 	uint64_t *accumulator;		// Accumulator for finding a new cluster center
47 | 
48 | 	// Used after clustering is done
49 | 	struct cond_pmf_list_t *training_stats;
50 | 	struct cond_quantizer_list_t *qlist;
51 | };
52 | 
/**
 * Stores all clusters
 */
struct cluster_list_t {
	uint8_t count;				// presumably the number of entries in clusters -- confirm in cluster.c
	struct cluster_t *clusters;	// Array of cluster descriptors
	double *distances;			// Temporary storage for distances to each cluster center
};
61 | 
/**
 * Points to a file descriptor that includes important metadata about the file.
 * Also carries the clustering, distortion, option and PRNG state that the
 * cluster.h and qv_compressor.h interfaces receive through this structure.
 */
struct quality_file_t {
	struct alphabet_t *alphabet;		// Alphabet associated with the file
	char *path;							// presumably the path given to load_file() -- confirm in lines.c
	uint64_t lines;						// presumably the total number of lines loaded -- confirm
	uint32_t columns;					// presumably the number of symbols per line -- confirm
	uint32_t block_count;				// presumably the number of entries in blocks -- confirm
	struct line_block_t *blocks;		// Line blocks (see alloc_blocks/free_blocks)
	uint8_t cluster_count;				// presumably the number of clusters requested -- confirm
	struct cluster_list_t *clusters;	// Cluster list (see cluster.h)
	struct distortion_t *dist;			// Distortion matrix (see distortion.h)
	struct qv_options_t *opts;			// Compression options; the type is declared in codebook.h
	struct well_state_t well;			// Embedded WELL-1024a PRNG state (see well.h)
};
78 | 
79 | // Memory management
80 | uint32_t load_file(const char *path, struct quality_file_t *info, uint64_t max_lines);
81 | uint32_t alloc_blocks(struct quality_file_t *info);
82 | void free_blocks(struct quality_file_t *info);
83 | 
84 | #endif
85 | 


--------------------------------------------------------------------------------
/src/util.c:
--------------------------------------------------------------------------------
 1 | #include "util.h"
 2 | 
 3 | #ifdef __MACH__
 4 | #include <mach/clock.h>
 5 | #include <mach/mach.h>
 6 | #endif
 7 | 
/**
 * Starts the high resolution timer by storing the current time (or, on
 * Windows, the current performance counter value) in timer->start
 * @param timer Timer to start
 */
void start_timer(struct hrtimer_t *timer) {
#ifdef LINUX
	// NOTE(review): CLOCK_REALTIME is wall-clock time and can jump when the
	// system clock is adjusted; CLOCK_MONOTONIC may suit interval timing
	// better, but it would have to be changed in stop_timer() as well
	clock_gettime(CLOCK_REALTIME, &timer->start);
#elif __APPLE__
// OS X does not have clock_gettime, use clock_get_time
    clock_serv_t cclock;
    mach_timespec_t mts;
    host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
    clock_get_time(cclock, &mts);
    // Release the clock service port acquired above
    mach_port_deallocate(mach_task_self(), cclock);
    timer->start.tv_sec = mts.tv_sec;
    timer->start.tv_nsec = mts.tv_nsec;
#else
	// The counter frequency is captured here so that get_timer_interval()
	// can convert the tick difference to seconds
	QueryPerformanceFrequency(&timer->freq);
	QueryPerformanceCounter(&timer->start);
#endif
}
28 | 
/**
 * Stops the high resolution timer by storing the current time (or, on
 * Windows, the current performance counter value) in timer->stop
 * @param timer Timer to stop; it should have been started with start_timer()
 */
void stop_timer(struct hrtimer_t *timer) {
#ifdef LINUX
	// Must use the same clock as start_timer() for the interval to make sense
	clock_gettime(CLOCK_REALTIME, &timer->stop);
#elif __APPLE__
// OS X does not have clock_gettime, use clock_get_time
    clock_serv_t cclock;
    mach_timespec_t mts;
    host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
    clock_get_time(cclock, &mts);
    // Release the clock service port acquired above
    mach_port_deallocate(mach_task_self(), cclock);
    timer->stop.tv_sec = mts.tv_sec;
    timer->stop.tv_nsec = mts.tv_nsec;
#else
	QueryPerformanceCounter(&timer->stop);
#endif
}
48 | 
49 | /**
50 |  * Reads the high resolution timer in seconds
51 |  */
52 | double get_timer_interval(struct hrtimer_t *timer) {
53 | #ifdef LINUX
54 | 	long dnsec = timer->stop.tv_nsec - timer->start.tv_nsec;
55 | 	int dsec = timer->stop.tv_sec - timer->start.tv_sec;
56 | 
57 | 	if (dnsec < 0) {
58 | 		dnsec += 1e9;
59 | 		dsec -= 1;
60 | 	}
61 | 	
62 | 	return ((double)dsec) + dnsec * 1.e-9;
63 | #elif __APPLE__
64 |     long dnsec = timer->stop.tv_nsec - timer->start.tv_nsec;
65 | 	long dsec = timer->stop.tv_sec - timer->start.tv_sec;
66 |     
67 | 	if (dnsec < 0) {
68 | 		dnsec += 1e9;
69 | 		dsec -= 1;
70 | 	}
71 | 	
72 | 	return ((double)dsec) + dnsec * 1.e-9;
73 | #else
74 | 	return ((double)(timer->stop.QuadPart - timer->start.QuadPart))/timer->freq.QuadPart;
75 | #endif
76 | }
77 | 
/**
 * Computes ceil(log2(x)) with integer arithmetic only
 * @param x Value to take the logarithm of
 * @return Smallest r with (1 << r) >= x for x >= 1; 1 for any x below 1
 */
int cb_log2(int x) {
	int bits;
	int remaining;

	// Count how many times x can be halved before reaching 1: floor(log2(x))
	for (bits = 0, remaining = x; remaining > 1; remaining >>= 1) {
		++bits;
	}

	// Exact powers of two need no rounding up
	return ((1 << bits) == x) ? bits : bits + 1;
}
94 | 


--------------------------------------------------------------------------------
/include/pmf.h:
--------------------------------------------------------------------------------
 1 | #ifndef _PMF_H_
 2 | #define _PMF_H_
 3 | 
 4 | #include <stdint.h>
 5 | 
 6 | #include "util.h"
 7 | 
 8 | // Used to indicate a symbol not found during index lookup
 9 | #define ALPHABET_SYMBOL_NOT_FOUND			UINT32_MAX
10 | 
11 | #define ALPHABET_INDEX_SIZE_HINT			72
12 | 
13 | // Unfortunately this is a bit brittle so don't change it
14 | typedef uint8_t symbol_t;
15 | 
/**
 * Structure that stores information about an alphabet including
 * the number of symbols and a list of the symbols themselves
 */
struct alphabet_t {
	uint32_t size;			// Number of symbols in the alphabet
	symbol_t *symbols;		// The symbols themselves
	uint32_t *indexes;		// presumably a symbol -> index lookup built by alphabet_compute_index() -- confirm in pmf.c
};
25 | 
/**
 * Structure for defining and storing a single PMF in a manner that is
 * useful for computing empirical PMFs, but also allows PMFs to be manipulated
 * as a set of probabilities (not just as empirical counts)
 */
struct pmf_t {
	uint8_t pmf_ready;					// presumably nonzero once pmf reflects counts (see recalculate_pmf) -- confirm in pmf.c
	const struct alphabet_t *alphabet;	// Alphabet this PMF is defined over (not modifiable through this pointer)
	double *pmf;						// Probabilities; presumably one entry per alphabet symbol -- confirm
	uint32_t *counts;					// Empirical counts; presumably one entry per alphabet symbol -- confirm
	uint32_t total;						// presumably the sum of counts -- confirm
};
38 | 
/**
 * Stores a list of PMFs, used to track sets of conditional PMFs
 */
struct pmf_list_t {
	uint32_t size;			// presumably the number of entries in pmfs -- confirm in pmf.c
	struct pmf_t **pmfs;	// Array of pointers to the individual PMFs
};
46 | 
47 | // Memory management functions
48 | struct alphabet_t *alloc_alphabet(uint32_t size);
49 | struct alphabet_t *duplicate_alphabet(const struct alphabet_t *);
50 | struct pmf_t *alloc_pmf(const struct alphabet_t *);
51 | struct pmf_list_t *alloc_pmf_list(uint32_t size, const struct alphabet_t *alphabet);
52 | void free_alphabet(struct alphabet_t *);
53 | void free_pmf(struct pmf_t *);
54 | void free_pmf_list(struct pmf_list_t *);
55 | 
56 | // PMF access
57 | uint32_t is_pmf_valid(struct pmf_t *);
58 | uint32_t get_symbol_index(const struct alphabet_t *alphabet, symbol_t symbol);
59 | double get_probability(struct pmf_t *pmf, uint32_t idx);
60 | double get_symbol_probability(struct pmf_t *pmf, symbol_t symbol);
61 | double get_entropy(struct pmf_t *pmf);
62 | double get_kl_divergence(struct pmf_t *p, struct pmf_t *q);
63 | 
64 | // PMF Manipulation
65 | struct pmf_t *combine_pmfs(struct pmf_t *a, struct pmf_t *b, double weight_a, double weight_b, struct pmf_t *result);
66 | void pmf_increment(struct pmf_t *pmf, uint32_t index);
67 | void recalculate_pmf(struct pmf_t *);
68 | void renormalize_pmf(struct pmf_t *);
69 | void pmf_to_counts(struct pmf_t *pmf, uint32_t m);
70 | void clear_pmf(struct pmf_t *);
71 | void clear_pmf_list(struct pmf_list_t *);
72 | 
73 | // Alphabet search
74 | void alphabet_compute_index(struct alphabet_t *);
75 | uint32_t alphabet_contains(const struct alphabet_t *alphabet, symbol_t symbol);
76 | uint32_t get_symbol_index(const struct alphabet_t *alphabet, symbol_t symbol);
77 | 
78 | // Compute the union of two alphabets
79 | void alphabet_union(const struct alphabet_t *restrict a, const struct alphabet_t *restrict b, struct alphabet_t *result);
80 | 
81 | // Display routines
82 | void print_alphabet(const struct alphabet_t *);
83 | void print_pmf(struct pmf_t *);
84 | 
85 | #endif
86 | 


--------------------------------------------------------------------------------
/include/qv_compressor.h:
--------------------------------------------------------------------------------
 1 | #ifndef qv_compressor_h
 2 | #define qv_compressor_h
 3 | 
 4 | #include <stdlib.h>
 5 | #include <stdio.h>
 6 | #include <string.h>
 7 | #include <time.h>
 8 | #include <stdint.h>
 9 | #include <unistd.h>
10 | #include <sys/stat.h>
11 | #include <sys/types.h>
12 | #include <math.h>
13 | #include <stdio.h>
14 | #include <stdlib.h>
15 | #include <string.h>
16 | 
17 | #include "codebook.h"
18 | 
19 | #define m_arith  22
20 | 
21 | #define OS_STREAM_BUF_LEN		(4096*4096)
22 | 
23 | #define COMPRESSION 0
24 | #define DECOMPRESSION 1
25 | 
/**
 * State of the arithmetic coder; Arithmetic_code is a pointer to this struct.
 * NOTE(review): the fields are only declared in this header, so the notes
 * below are assumptions to check against arith.c
 */
typedef struct Arithmetic_code_t {
    int32_t scale3;		// presumably a counter of pending rescalings -- confirm
    
	uint32_t l;			// presumably the lower bound of the coding interval -- confirm
    uint32_t u;			// presumably the upper bound of the coding interval -- confirm
    uint32_t t;			// presumably the decoder tag value -- confirm

    uint32_t m;			// presumably the precision in bits (see initialize_arithmetic_encoder(m)) -- confirm
	uint32_t r;			// Rescaling condition
}*Arithmetic_code;
36 | 
/**
 * Buffered bit stream over an already opened file, used for both reading and
 * writing; osStream is a pointer to this struct
 */
typedef struct os_stream_t {
	FILE *fp;			// Underlying file; not owned, free_os_stream() does not close it
	uint8_t *buf;		// Buffer of OS_STREAM_BUF_LEN bytes
	uint32_t bufPos;	// Index in buf of the byte currently being read or written
	uint8_t bitPos;		// Bits already read from / written to the current byte; reset to 0 on reaching 8
	uint64_t written;	// Total bytes flushed to fp by stream_write_buffer()
} *osStream;
44 | 
/**
 * Symbol statistics used by the arithmetic coder steps; stream_stats_ptr_t is
 * a pointer to this struct.
 * NOTE(review): field meanings are assumptions to check against the
 * update_stats() implementation
 */
typedef struct stream_stats_t {
    uint32_t *counts;		// presumably per-symbol counts -- confirm
    uint32_t alphabetCard;	// presumably the alphabet cardinality, i.e. the number of entries in counts -- confirm
    uint32_t step;			// presumably the increment applied per observed symbol -- confirm
    uint32_t n;				// presumably the total of counts -- confirm
} *stream_stats_ptr_t;
51 | 
/**
 * Bundles the arithmetic coder state, its statistics and the bit stream it
 * works on; arithStream is a pointer to this struct
 */
typedef struct arithStream_t {
	stream_stats_ptr_t cluster_stats;	// presumably the stats used by qv_write_cluster()/qv_read_cluster() -- confirm
    stream_stats_ptr_t ***stats;		// presumably indexed by cluster, column and idx as in compress_qv() -- confirm
    Arithmetic_code a;					// Arithmetic coder state
    osStream os;						// Bit stream the coder reads from or writes to
}*arithStream;
58 | 
/**
 * Top level compressor object, wrapping the arithmetic stream used for quality
 * values; qv_compressor is a pointer to this struct
 */
typedef struct qv_compressor_t{
    arithStream Quals;		// Arithmetic stream for the quality values
}*qv_compressor;
62 | 
63 | 
64 | 
65 | 
66 | // Stream interface
67 | struct os_stream_t *alloc_os_stream(FILE *fp, uint8_t in);
68 | void free_os_stream(struct os_stream_t *);
69 | uint8_t stream_read_bit(struct os_stream_t *);
70 | uint32_t stream_read_bits(struct os_stream_t *os, uint8_t len);
71 | void stream_write_bit(struct os_stream_t *, uint8_t);
72 | void stream_write_bits(struct os_stream_t *os, uint32_t dw, uint8_t len);
73 | void stream_finish_byte(struct os_stream_t *);
74 | void stream_write_buffer(struct os_stream_t *);
75 | 
76 | // Arithmetic encoder interface
77 | Arithmetic_code initialize_arithmetic_encoder(uint32_t m);
78 | void arithmetic_encoder_step(Arithmetic_code a, stream_stats_ptr_t stats, int32_t x, osStream os);
79 | int encoder_last_step(Arithmetic_code a, osStream os);
80 | uint32_t arithmetic_decoder_step(Arithmetic_code a, stream_stats_ptr_t stats, osStream is);
81 | uint32_t decoder_last_step(Arithmetic_code a, stream_stats_ptr_t stats);
82 | 
83 | // Encoding stats management
84 | stream_stats_ptr_t **initialize_stream_stats(struct cond_quantizer_list_t *q_list);
85 | void update_stats(stream_stats_ptr_t stats, uint32_t x, uint32_t r);
86 | 
87 | // Quality value compression interface
88 | void compress_qv(arithStream as, uint32_t x, uint8_t cluster, uint32_t column, uint32_t idx);
89 | void qv_write_cluster(arithStream as, uint8_t cluster);
90 | uint32_t decompress_qv(arithStream as, uint8_t cluster, uint32_t column, uint32_t idx);
91 | uint8_t qv_read_cluster(arithStream as);
92 | 
93 | qv_compressor initialize_qv_compressor(FILE *fout, uint8_t streamDirection, struct quality_file_t *info);
94 | 
95 | uint32_t start_qv_compression(struct quality_file_t *info, FILE *fout, double *dis, FILE * funcompressed);
96 | void start_qv_decompression(FILE *fout, FILE *fin, struct quality_file_t *info);
97 | 
98 | #endif
99 | 


--------------------------------------------------------------------------------
/src/os_stream.c:
--------------------------------------------------------------------------------
  1 | #include "qv_compressor.h"
  2 | 
  3 | /**
  4 |  * Allocates a file stream wrapper for the arithmetic encoder, with a given
  5 |  * already opened file handle
  6 |  */
  7 | struct os_stream_t *alloc_os_stream(FILE *fp, uint8_t in) {
  8 | 	struct os_stream_t *rtn = (struct os_stream_t *) calloc(1, sizeof(struct os_stream_t));
  9 | 
 10 | 	rtn->fp = fp;
 11 | 	rtn->buf = (uint8_t *) calloc(OS_STREAM_BUF_LEN, sizeof(uint8_t));
 12 | 
 13 | 	if (in) {
 14 | 		fread(rtn->buf, sizeof(uint8_t), OS_STREAM_BUF_LEN, fp);
 15 | 	}
 16 | 	rtn->bufPos = 0;
 17 | 	rtn->bitPos = 0;
 18 | 	rtn->written = 0;
 19 | 
 20 | 	return rtn;
 21 | }
 22 | 
 23 | /**
 24 |  * Deallocate the output stream. Note that this doesn't close the file because
 25 |  * this stream doesn't own it
 26 |  */
 27 | void free_os_stream(struct os_stream_t *os) {
 28 | 	free(os->buf);
 29 | 	free(os);
 30 | }
 31 | 
 32 | /**
 33 |  * Reads a single bit from the stream
 34 |  */
 35 | uint8_t stream_read_bit(struct os_stream_t *os) {
 36 | 	uint8_t rtn = os->buf[os->bufPos] >> 7;
 37 | 
 38 | 	os->buf[os->bufPos] = os->buf[os->bufPos] << 1;
 39 | 	os->bitPos += 1;
 40 | 
 41 | 	if (os->bitPos == 8) {
 42 | 		os->bitPos = 0;
 43 | 		os->bufPos += 1;
 44 | 		if (os->bufPos == OS_STREAM_BUF_LEN) {
 45 | 			fread(os->buf, sizeof(uint8_t), OS_STREAM_BUF_LEN, os->fp);
 46 | 			os->bufPos = 0;
 47 | 		}
 48 | 	}
 49 | 
 50 | 	return rtn;
 51 | }
 52 | 
 53 | /**
 54 |  * Reads a grouping of bits to be interpreted as a single integer, regardless of length
 55 |  * Bits are implicitly written in bit endian order ONE AT A TIME elsewhere in the code,
 56 |  * so this must read that way too
 57 |  */
 58 | uint32_t stream_read_bits(struct os_stream_t *os, uint8_t len) {
 59 | 	uint32_t rtn = 0;
 60 | 	int8_t bit;
 61 | 
 62 | 	for (bit = len-1; bit >= 0; --bit) {
 63 | 		rtn |= stream_read_bit(os) << bit;
 64 | 	}
 65 | 
 66 | 	return rtn;
 67 | }
 68 | 
 69 | /**
 70 |  * Writes a single bit to the stream
 71 |  */
 72 | void stream_write_bit(struct os_stream_t *os, uint8_t bit) {
 73 | 	bit = (bit & 1);
 74 | 	os->buf[os->bufPos] |= bit;
 75 | 
 76 | 	os->bitPos += 1;
 77 | 
 78 | 	if (os->bitPos == 8) {
 79 | 		os->bitPos = 0;
 80 | 		os->bufPos += 1;
 81 | 		if (os->bufPos == OS_STREAM_BUF_LEN) {
 82 | 			stream_write_buffer(os);
 83 | 		}
 84 | 	}
 85 | 	else {
 86 | 		os->buf[os->bufPos] <<= 1;
 87 | 	}
 88 | }
 89 | 
 90 | /**
 91 |  * Writes a grouping of bits to be interpreted as a single integer and read back the
 92 |  * same way. Bits need to be written msb first
 93 |  */
 94 | void stream_write_bits(struct os_stream_t *os, uint32_t dw, uint8_t len) {
 95 | 	int8_t bit;
 96 | 
 97 | 	for (bit = len-1; bit >= 0; --bit) {
 98 | 		stream_write_bit(os, (uint8_t)(dw >> bit));
 99 | 	}
100 | }
101 | 
102 | /**
103 |  * Finishes the current byte in progress and writes the buffer out
104 |  */
105 | void stream_finish_byte(struct os_stream_t *os) {
106 | 	os->buf[os->bufPos] <<= (7 - os->bitPos);
107 | 	os->bitPos = 0;
108 | 	os->bufPos += 1;
109 | 	stream_write_buffer(os);
110 | }
111 | 
112 | /**
113 |  * Writes out the current stream buffer regardless of fill amount
114 |  */
115 | void stream_write_buffer(struct os_stream_t *os) {
116 | 	fwrite(os->buf, sizeof(uint8_t), os->bufPos, os->fp);
117 | 	memset(os->buf, 0, sizeof(uint8_t)*os->bufPos);
118 | 	os->written += os->bufPos;
119 | 	os->bufPos = 0;
120 | }
121 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # qvz
 2 | 
 3 | Quality Value Zip (qvz) is a lossy compression algorithm for storing quality values associated
 4 | with DNA sequencing. This software implements the qvz algorithm for both encoding and decoding.
 5 | 
 6 | ## Installing
 7 | 
 8 | qvz can be used on Windows, Linux, or Mac. Currently we only provide a source distribution. qvz
 9 | has no external dependencies, linking only against libc, libm, and librt.
10 | 
11 | The distribution is configured out of the box for Linux. To build on a Mac, copy src/Makefile.apple to
12 | replace src/Makefile. Build with `make` in the toplevel folder. You are responsible for installing the
13 | binary in an appropriate system-wide location (e.g. /usr/bin) if you wish.
14 | 
15 | There is currently no makefile for Windows. A Visual Studio project can be made (and testing has been
16 | done on Windows to verify compatibility) but the steps are beyond the scope of this guide, because you
17 | must take several additional steps to guarantee a sane environment on Windows (such as replacing stdint
18 | and inttypes with correct versions). For this reason we also currently do not distribute a Windows build
19 | script.
20 | 
21 | ## Usage
22 | 
23 | qvz is used from the command line. The general invocation is:
24 | 
25 | ```qvz (options) [input file] [output file]```
26 | 
27 | Input and output must be files; qvz currently does not process standard input or output. The input file
28 | must be a file consisting only of quality scores, with one read per line. Thus, the input would consist
29 | of every fourth line in a FASTQ file. The other three lines must be compressed separately.
30 | 
31 | Available options are:
32 | 
33 | ```
34 | Operating Mode:
35 | -q            Compress the quality score input file (default on)
36 | -x            Extract quality values from input file
37 | 
38 | Compression Parameters:
39 | -f [ratio]    Compress using a variable allocation of [ratio] bits per bit of input entropy per symbol
40 | -r [rate]     Compress using a fixed allocation of [rate] bits per symbol
41 | -d [M|L|A]    Compress while optimizing for MSE, Log(1+L1), or L1 distortions, respectively (default: MSE)
42 | 
43 | Clustering Parameters:
44 | -c [#]        Compress using # clusters. Going above 5 is not recommended due to computational complexity (default: 1)
45 | -T [#]        Use # as a threshold for cluster centroid movement distance before declaring an approximate clustering as "good enough"
46 | 
47 | Extra Options:
48 | -h            Print help summary
49 | -v            Enable verbose progress output
50 | -s            Print summary stats to STDOUT after compression (independent of -v)
51 | -u [file]     Write the quantized but not compressed values of [file] (default: off)
52 | ```
53 | 
54 | ## Algorithm
55 | 
56 | qvz uses the approach described in a paper submitted to Bioinformatics to perform lossy compression. Data is clustered
57 | to reduce global variability, then each cluster is compressed by calculating a set of quantization
 58 | matrices that performs optimally under the chosen distortion metric and the empirical statistics of
59 | the data, using a first order Markov prediction model.
60 | 
61 | ## License
62 | qvz is available under the terms of the GPLv3. See COPYING for more information.
63 | 
64 | ## Bugs and Feedback
65 | Please use GitHub issues to open bug reports or provide feedback about qvz.
66 | 
67 | ## Authors
68 | qvz was created by Greg Malysa, Mikel Hernaez, Idoia Ochoa, Milind Rao, and Karthik Ganesan at
69 | Stanford University.
70 | 


--------------------------------------------------------------------------------
/include/codebook.h:
--------------------------------------------------------------------------------
  1 | #ifndef _CODEBOOK_H_
  2 | #define _CODEBOOK_H_
  3 | /**
  4 |  * Functions and definitions relating to reading codebooks from files, used
  5 |  * for both the encoder and decoder code
  6 |  */
  7 | 
  8 | #include "util.h"
  9 | 
 10 | #include <stdio.h>
 11 | #include <stdlib.h>
 12 | #include <stdint.h>
 13 | #include <string.h>
 14 | 
 15 | #include "well.h"
 16 | #include "pmf.h"
 17 | #include "distortion.h"
 18 | #include "quantizer.h"
 19 | #include "lines.h"
 20 | 
 21 | #define MODE_RATIO		0	// Traditional implementation, output bitrate is scaled from input
 22 | #define MODE_FIXED		1	// Fixed rate per symbol
 23 | #define MODE_FIXED_MSE	2	// Fixed average MSE per column
 24 | 
/**
 * Options for the compression process
 *
 * NOTE(review): the per-field notes below are inferred from the field names,
 * the MODE_* constants defined above and the command line options described
 * in the README; confirm against the option parser.
 */
struct qv_options_t {
	uint8_t verbose;			// presumably set by -v (verbose progress output)
	uint8_t stats;				// presumably set by -s (print summary stats)
	uint8_t mode;				// presumably one of the MODE_* constants above
	uint8_t clusters;			// presumably the cluster count requested with -c
    uint8_t uncompressed;		// presumably non-zero when -u [file] was given
    uint8_t distortion;			// presumably the distortion metric selected with -d
	char *dist_file;			// presumably the path of a custom distortion matrix file -- TODO confirm
    char *uncompressed_name;	// presumably the [file] argument of -u
	double ratio;		// Used for parameter to all modes
	double e_dist;		// Expected distortion as calculated during optimization
	double cluster_threshold;	// presumably the centroid movement threshold given with -T
};
 41 | 
/**
 * Stores an array of conditional PMFs for the current column given the previous
 * column. PMF pointers are stored in a flat array so don't try to find the PMF you
 * want directly--use the accessor (get_cond_pmf, declared below)
 */
struct cond_pmf_list_t {
	uint32_t columns;						// Number of columns covered by the list
	const struct alphabet_t *alphabet;		// Alphabet handed to alloc_conditional_pmf_list(); not modified through this pointer
	struct pmf_t **pmfs;					// Flat array of conditional PMF pointers (layout is private to the accessor)
	struct pmf_list_t *marginal_pmfs;		// presumably one marginal PMF per column -- TODO confirm
};
 53 | 
/**
 * Stores an array of quantizer pointers for the column for all possible left context
 * values. Unused ones are left as null pointers. This is also stored as a flat array
 * so the accessor must be used to look up the correct quantizer
 * The dreaded triple pointer is used to store an array of (different length) arrays
 * of pointers to quantizers
 */
struct cond_quantizer_list_t {
	uint32_t columns;						// Number of columns the list was allocated for
	uint32_t lines;							// presumably the number of lines the statistics came from -- TODO confirm
	struct alphabet_t **input_alphabets;	// Per-column input alphabet; column i has 2*input_alphabets[i]->size quantizer slots (see initialize_stream_stats)
	struct quantizer_t ***q;				// q[column][slot] is a pointer to one quantizer
	double **ratio;				// Raw ratio
	uint8_t **qratio;			// Quantized ratio
	struct qv_options_t *options;			// Options associated with this codebook
};
 70 | 
 71 | // Memory management
 72 | struct cond_pmf_list_t *alloc_conditional_pmf_list(const struct alphabet_t *alphabet, uint32_t columns);
 73 | struct cond_quantizer_list_t *alloc_conditional_quantizer_list(uint32_t columns);
 74 | void free_conditional_pmf_list(struct cond_pmf_list_t *);
 75 | void free_cond_quantizer_list(struct cond_quantizer_list_t *);
 76 | 
 77 | // Per-column initializer for conditional quantizer list
 78 | void cond_quantizer_init_column(struct cond_quantizer_list_t *list, uint32_t column, const struct alphabet_t *input_union);
 79 | 
 80 | // Accessors
 81 | struct pmf_t *get_cond_pmf(struct cond_pmf_list_t *list, uint32_t column, symbol_t prev);
 82 | struct quantizer_t *get_cond_quantizer_indexed(struct cond_quantizer_list_t *list, uint32_t column, uint32_t index);
 83 | struct quantizer_t *get_cond_quantizer(struct cond_quantizer_list_t *list, uint32_t column, symbol_t prev);
 84 | void store_cond_quantizers(struct quantizer_t *restrict lo, struct quantizer_t *restrict hi, double ratio, struct cond_quantizer_list_t *list, uint32_t column, symbol_t prev);
 85 | void store_cond_quantizers_indexed(struct quantizer_t *restrict lo, struct quantizer_t *restrict hi, double ratio, struct cond_quantizer_list_t *list, uint32_t column, uint32_t index);
 86 | struct quantizer_t *choose_quantizer(struct cond_quantizer_list_t *list, struct well_state_t *well, uint32_t column, symbol_t prev, uint32_t *q_idx);
 87 | uint32_t find_state_encoding(struct quantizer_t *codebook, symbol_t value);
 88 | 
 89 | // Meat of the implementation
 90 | void calculate_statistics(struct quality_file_t *);
 91 | double optimize_for_entropy(struct pmf_t *pmf, struct distortion_t *dist, double target, struct quantizer_t **lo, struct quantizer_t **hi);
 92 | void generate_codebooks(struct quality_file_t *info);
 93 | 
 94 | // Master functions to handle codebooks in the output file
 95 | void write_codebooks(FILE *fp, struct quality_file_t *info);
 96 | void write_codebook(FILE *fp, struct cond_quantizer_list_t *quantizers);
 97 | void read_codebooks(FILE *fp, struct quality_file_t *info);
 98 | struct cond_quantizer_list_t *read_codebook(FILE *fp, struct quality_file_t *info);
 99 | 
// Size used for codebook text line buffers
// NOTE(review): the origin of the value 3366 is not visible here -- confirm before changing
#define MAX_CODEBOOK_LINE_LENGTH 3366
// Copy `size` entries of q into line, adding 33 to each (33 is ASCII '!', presumably to keep
// the stored bytes printable). Expands to a bare for statement: the caller supplies the index
// variable i, and the macro must not be used as the unbraced body of an if/else.
#define COPY_Q_TO_LINE(line, q, i, size) for (i = 0; i < size; ++i) { line[i] = q[i] + 33; }
// Inverse of COPY_Q_TO_LINE: copy `size` entries of line into q, subtracting 33 from each
#define COPY_Q_FROM_LINE(line, q, i, size) for (i = 0; i < size; ++i) { q[i] = line[i] - 33; }
103 | 
104 | void print_codebook(struct cond_quantizer_list_t *);
105 | 
106 | #endif
107 | 


--------------------------------------------------------------------------------
/src/qv_stream.c:
--------------------------------------------------------------------------------
  1 | #include "qv_compressor.h"
  2 | 
  3 | /**
  4 |  * Update stats structure used for adaptive arithmetic coding
  5 |  * @param stats Pointer to stats structure
  6 |  * @param x Symbol to update
  7 |  * @param r Rescaling condition (if n > r, rescale all stats)
  8 |  */
  9 | void update_stats(stream_stats_ptr_t stats, uint32_t x, uint32_t r) {
 10 |     uint32_t i = 0;
 11 | 
 12 | 	stats->counts[x] += stats->step;
 13 | 	stats->n += stats->step;
 14 | 
 15 | 	if (stats->n > r) {
 16 | 		stats->n = 0;
 17 | 		for (i = 0; i < stats->alphabetCard; ++i) {
 18 | 			if (stats->counts[i]) {
 19 | 				stats->counts[i] >>= 1;
 20 | 				stats->counts[i] += 1;
 21 | 				stats->n += stats->counts[i];
 22 | 			}
 23 | 		}
 24 | 	}
 25 | }
 26 | 
 27 | /**
 28 |  * Initialize stats structures used for adaptive arithmetic coding based on
 29 |  * the number of contexts required to handle the set of conditional quantizers
 30 |  * that we have (one context per quantizer)
 31 |  */
 32 | stream_stats_ptr_t **initialize_stream_stats(struct cond_quantizer_list_t *q_list) {
 33 |     stream_stats_ptr_t **s;
 34 |     uint32_t i = 0, j = 0, k = 0;
 35 |     
 36 |     s = (stream_stats_ptr_t **) calloc(q_list->columns, sizeof(stream_stats_ptr_t *));
 37 | 
 38 |     // Allocate jagged array, one set of stats per column
 39 |     for (i = 0; i < q_list->columns; ++i) {
 40 | 		// And for each column, one set of stats per low/high quantizer per previous context
 41 |         s[i] = (stream_stats_ptr_t *) calloc(2*q_list->input_alphabets[i]->size, sizeof(stream_stats_ptr_t));
 42 |         
 43 | 		// Finally each individual stat structure needs to be filled in uniformly
 44 |         for (j = 0; j < 2*q_list->input_alphabets[i]->size; ++j) {
 45 |             s[i][j] = (stream_stats_ptr_t) calloc(1, sizeof(struct stream_stats_t));
 46 |             s[i][j]->counts = (uint32_t *) calloc(q_list->q[i][j]->output_alphabet->size, sizeof(uint32_t));
 47 |             
 48 |             // Initialize the quantizer's stats uniformly
 49 |             for (k = 0; k < q_list->q[i][j]->output_alphabet->size; k++) {
 50 |                 s[i][j]->counts[k] = 1;
 51 |             }
 52 | 			s[i][j]->n = q_list->q[i][j]->output_alphabet->size;
 53 |             s[i][j]->alphabetCard = q_list->q[i][j]->output_alphabet->size;
 54 |             
 55 |             // Step size is 8 counts per symbol seen to speed convergence
 56 |             s[i][j]->step = 8;
 57 |         }
 58 |     }
 59 |     
 60 |     return s;
 61 | }
 62 | 
 63 | /**
 64 |  * @todo add cluster stats
 65 |  */
 66 | arithStream initialize_arithStream(FILE *fout, uint8_t decompressor_flag, struct quality_file_t *info) {
 67 |     arithStream as;
 68 | 	uint32_t i;
 69 | 
 70 | 	memset(&info->well, 0, sizeof(struct well_state_t));
 71 | 
 72 |     if (decompressor_flag) {
 73 |         fread(info->well.state, sizeof(uint32_t), 32, fout);
 74 |     }
 75 |     else {
 76 |         // Initialize WELL state vector with libc rand
 77 |         srand((uint32_t) time(0));
 78 |         for (i = 0; i < 32; ++i) {
 79 | #ifndef DEBUG
 80 |             info->well.state[i] = rand();
 81 | #else
 82 |             info->well.state[i] = 0x55555555;
 83 | #endif
 84 |         }
 85 |         
 86 |         // Write the initial WELL state vector to the file first (fixed size of 32 bytes)
 87 | 		// @todo strictly this needs to be stored in network order because we're interpreting it as a 32 bit int
 88 | 		// but I am a bit too lazy for that right now
 89 |         fwrite(info->well.state, sizeof(uint32_t), 32, fout);
 90 | 	}
 91 | 
 92 | 	// Must start at zero
 93 | 	info->well.n = 0;
 94 |     
 95 |     as = (arithStream) calloc(1, sizeof(struct arithStream_t));
 96 | 
 97 | 	as->cluster_stats = (stream_stats_ptr_t) calloc(1, sizeof(struct stream_stats_t));
 98 | 	as->cluster_stats->step = 8;
 99 | 	as->cluster_stats->counts = (uint32_t *) calloc(info->cluster_count, sizeof(uint32_t));
100 | 	as->cluster_stats->alphabetCard = info->cluster_count;
101 | 	as->cluster_stats->n = info->cluster_count;
102 | 
103 | 	as->stats = (stream_stats_ptr_t ***) calloc(info->cluster_count, sizeof(stream_stats_ptr_t **));
104 | 	for (i = 0; i < info->cluster_count; ++i) {
105 |     	as->stats[i] = initialize_stream_stats(info->clusters->clusters[i].qlist);
106 | 		as->cluster_stats->counts[i] = 1;
107 | 	}
108 |     
109 | 	as->a = initialize_arithmetic_encoder(m_arith);
110 | 	as->os = alloc_os_stream(fout, decompressor_flag);
111 | 
112 | 	if (decompressor_flag)
113 | 		as->a->t = stream_read_bits(as->os, as->a->m);
114 | 	else
115 | 		as->a->t = 0;
116 |     
117 |     return as;
118 | }
119 | 
120 | qv_compressor initialize_qv_compressor(FILE *fout, uint8_t streamDirection, struct quality_file_t *info) {
121 |     qv_compressor s;
122 |     s = calloc(1, sizeof(struct qv_compressor_t));
123 |     s->Quals = initialize_arithStream(fout, streamDirection, info);
124 |     return s;
125 | }
126 | 


--------------------------------------------------------------------------------
/src/lines.c:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Utility functions for manipulating the data from files, like reading it into memory
  3 |  * and converting between the different formats we use
  4 |  */
  5 | 
  6 | #include "util.h"
  7 | 
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
 12 | 
 13 | #include "lines.h"
 14 | 
 15 | /**
 16 |  * This reads data from the given file pointer into memory, breaking it into segments
 17 |  * of the given number of lines, to ease memory management issues at the cost of some
 18 |  * overhead. This assumes that the file consists entirely of quality scores with no
 19 |  * other lines in between
 20 |  * @param path Path of the file to read
 21 |  * @param info Information structure to store in, this must be a valid pointer already
 22 |  * @param max_lines Maximum number of lines to read, will override the actual number in the file if >0
 23 |  * @todo @xxx This assumes we have only newlines in the file despite some vague attempts to handle \r\n as well
 24 |  * @todo @xxx It WILL break the mapping if given a file with \r characters
 25 |  * @todo Implement windows analog to mmap to provide the same facility
 26 |  */
 27 | uint32_t load_file(const char *path, struct quality_file_t *info, uint64_t max_lines) {
 28 | 	uint32_t status, block_idx, line_idx;
 29 | 	char line[READ_LINEBUF_LENGTH];
 30 | 	FILE *fp;
 31 | 	int fd;
 32 | 	struct _stat finfo;
 33 | 	void *file_mmap;
 34 | 
 35 | 	// Load metadata into the info structure
 36 | 	info->path = strdup(path);
 37 | 	fp = fopen(path, "rt");
 38 | 	fd = open(path, O_RDONLY);
 39 | 	if (!fp || fd == -1) {
 40 | 		return LF_ERROR_NOT_FOUND;
 41 | 	}
 42 | 
 43 | 	// Use the first line to figure out how long the file is
 44 | 	fgets(line, READ_LINEBUF_LENGTH, fp);
 45 | 	info->columns = strlen(line) - 1;
 46 | 	if (info->columns > MAX_READS_PER_LINE) {
 47 | 		fclose(fp);
 48 | 		return LF_ERROR_TOO_LONG;
 49 | 	}
 50 | 	fclose(fp);
 51 | 
 52 | 	// Figure out how many lines we'll need depending on whether we were limited or not
 53 | 	_stat(path, &finfo);
 54 | 	info->lines = finfo.st_size / ((uint64_t) (info->columns+1));
 55 | 	if (max_lines > 0 && info->lines > max_lines) {
 56 | 		info->lines = max_lines;
 57 | 	}
 58 | 	
 59 | 	status = alloc_blocks(info);
 60 | 	if (status != LF_ERROR_NONE)
 61 | 		return status;
 62 | 
 63 | 	// mmap the file to set up constant pointers indexing it
 64 | 	file_mmap = mmap(NULL, finfo.st_size, PROT_READ, MAP_SHARED, fd, 0);
 65 | 
 66 | 	// Process the file
 67 | 	block_idx = 0;
 68 | 	line_idx = 0;
 69 | 	while ((block_idx * MAX_LINES_PER_BLOCK + line_idx) < info->lines) {
 70 | 		// Setting up mmap indexing assumes we have only one line ending!
 71 | 		info->blocks[block_idx].lines[line_idx].m_data = file_mmap + ((uint64_t) (block_idx * MAX_LINES_PER_BLOCK + line_idx)) * (info->columns+1);
 72 | 
 73 | 		// Increment line/block pointers as necesary
 74 | 		line_idx += 1;
 75 | 		if (line_idx == info->blocks[block_idx].count) {
 76 | 			line_idx = 0;
 77 | 			block_idx += 1;
 78 | 		}
 79 | 	}
 80 | 
 81 | 	return LF_ERROR_NONE;
 82 | }
 83 | 
 84 | /**
 85 |  * Allocate an array of line block pointers and the memory within each block, so that we can
 86 |  * use it to store the results of reading the file
 87 |  */
 88 | uint32_t alloc_blocks(struct quality_file_t *info) {
 89 | 	uint64_t lines_left = info->lines;
 90 | 	struct line_block_t *cblock;
 91 | 
 92 | 	// Figure out how many blocks we'll need to store this file
 93 | 	info->block_count = (uint32_t) (info->lines / (uint64_t)MAX_LINES_PER_BLOCK);
 94 | 	if (info->block_count * MAX_LINES_PER_BLOCK != info->lines) {
 95 | 		info->block_count += 1;
 96 | 	}
 97 | 
 98 | 	info->blocks = (struct line_block_t *) calloc(info->block_count, sizeof(struct line_block_t));
 99 | 	if (!info->blocks) {
100 | 		return LF_ERROR_NO_MEMORY;
101 | 	}
102 | 	cblock = info->blocks;
103 | 
104 | 	while (lines_left > 0) {
105 | 		// Figure out how many lines we'll have in this block
106 | 		if (lines_left > MAX_LINES_PER_BLOCK) {
107 | 			lines_left -= MAX_LINES_PER_BLOCK;
108 | 			cblock->count = MAX_LINES_PER_BLOCK;
109 | 		}
110 | 		else {
111 | 			cblock->count = (uint32_t) lines_left;
112 | 			lines_left = 0;
113 | 		}
114 | 
115 | 		// Allocate array of line info structs for the block
116 | 		cblock->lines = (struct line_t *) calloc(cblock->count, sizeof(struct line_t));
117 | 		if (!cblock->lines) {
118 | 			return LF_ERROR_NO_MEMORY;
119 | 		}
120 | 
121 | 		// Advance to the next line block
122 | 		cblock += 1;
123 | 	}
124 | 
125 | 	return LF_ERROR_NONE;
126 | }
127 | 
128 | /**
129 |  * Deallocates the memory used to store file information in blocks
130 |  */
131 | void free_blocks(struct quality_file_t *info) {
132 | 	// Array of block pointers is a single allocation
133 | 	// For each block, array of lines is a single allocations
134 | 	uint32_t i;
135 | 
136 | 	for (i = 0; i < info->block_count; ++i) {
137 | 		free(info->blocks[i].lines);
138 | 	}
139 | 	free(info->blocks);
140 | }
141 | 


--------------------------------------------------------------------------------
/src/distortion.c:
--------------------------------------------------------------------------------
  1 | #include <math.h>
  2 | #include <stdlib.h>
  3 | #include <stdio.h>
  4 | #include <string.h>
  5 | 
  6 | #include "distortion.h"
  7 | #include "util.h"
  8 | 
  9 | /**
 10 |  * Allocates memory for a distortion matrix
 11 |  */
 12 | struct distortion_t *alloc_distortion_matrix(uint8_t symbols) {
 13 | 	struct distortion_t *rtn = (struct distortion_t *) calloc(1, sizeof(struct distortion_t));
 14 | 	rtn->symbols = symbols;
 15 | 	rtn->distortion = (double *) calloc(symbols*symbols, sizeof(double));
 16 | 	return rtn;
 17 | }
 18 | 
 19 | /**
 20 |  * Deallocates memory from a distortion matrix
 21 |  */
 22 | void free_distortion_matrix(struct distortion_t *d) {
 23 | 	free(d->distortion);
 24 | 	free(d);
 25 | }
 26 | 
 27 | /**
 28 |  * Public facing method for allocating distortion matrices
 29 |  */
 30 | struct distortion_t *generate_distortion_matrix(uint8_t symbols, int type) {
 31 | 	switch (type) {
 32 | 		case DISTORTION_MANHATTAN:
 33 | 			return gen_manhattan_distortion(symbols);
 34 | 		case DISTORTION_MSE:
 35 | 			return gen_mse_distortion(symbols);
 36 | 		case DISTORTION_LORENTZ:
 37 | 			return gen_lorentzian_distortion(symbols);
 38 | 		case DISTORTION_CUSTOM:
 39 | 			printf("Custom distortion matrices should be allocated with gen_custom_distortion() instead.\n");
 40 | 			exit(1);
 41 | 		default:
 42 | 			printf("Invalid distortion type %d specified.\n", type);
 43 | 			exit(1);
 44 | 	}
 45 | }
 46 | 
 47 | /**
 48 |  * Generate a distortion matrix according to the Manhattan distance (L1) metric
 49 |  */
 50 | struct distortion_t *gen_manhattan_distortion(uint8_t symbols) {
 51 | 	struct distortion_t *rtn = alloc_distortion_matrix(symbols);
 52 | 	uint8_t x, y;
 53 | 
 54 | 	for (x = 0; x < symbols; ++x) {
 55 | 		for (y = 0; y < symbols; ++y) {
 56 | 			rtn->distortion[x + y*symbols] = abs(x - y);
 57 | 		}
 58 | 	}
 59 | 
 60 | 	return rtn;
 61 | }
 62 | 
 63 | /**
 64 |  * Generates a distortion matrix according to the MSE (L2) metric
 65 |  */
 66 | struct distortion_t *gen_mse_distortion(uint8_t symbols) {
 67 | 	struct distortion_t *rtn = alloc_distortion_matrix(symbols);
 68 | 	uint8_t x, y;
 69 | 
 70 | 	for (x = 0; x < symbols; ++x) {
 71 | 		for (y = 0; y < symbols; ++y) {
 72 | 			rtn->distortion[x + y*symbols] = (x - y)*(x - y);
 73 | 		}
 74 | 	}
 75 | 
 76 | 	return rtn;
 77 | }
 78 | 
 79 | /**
 80 |  * Generates a distortion matrix according to the lorentzian (log-L1) metric
 81 |  */
 82 | struct distortion_t *gen_lorentzian_distortion(uint8_t symbols) {
 83 | 	struct distortion_t *rtn = alloc_distortion_matrix(symbols);
 84 | 	uint8_t x, y;
 85 | 
 86 | 	for (x = 0; x < symbols; ++x) {
 87 | 		for (y = 0; y < symbols; ++y) {
 88 | 			rtn->distortion[x + y*symbols] = log2( 1.0 + (double)(abs(x - y)) );
 89 | 		}
 90 | 	}
 91 | 
 92 | 	return rtn;
 93 | }
 94 | 
 95 | /**
 96 |  * Reads in a custom distortion matrix specified in the given file
 97 |  * The file format is S rows of S columns containing double valued distortions
 98 |  * separated by commas. Lines beginning with a # are ignored as comments
 99 |  */
100 | struct distortion_t *gen_custom_distortion(uint8_t symbols, const char *filename) {
101 | 	struct distortion_t *dist = alloc_distortion_matrix(symbols);
102 | 	uint8_t x, y;
103 | 	FILE *fp;
104 | 	char line[1024];
105 | 	char *field;
106 | 	uint8_t missing;
107 | 
108 | 	fp = fopen(filename, "rt");
109 | 	if (!fp) {
110 | 		perror("Unable to open distortion definition file");
111 | 		exit(1);
112 | 	}
113 | 
114 | 	x = 0;
115 | 	while (x < symbols && fgets(line, 1024, fp) != NULL) {
116 | 		missing = 0;
117 | 		field = line - 1;
118 | 		y = 0;
119 | 
120 | 		if (line[0] == '#')
121 | 			continue;
122 | 
123 | 		while (y < symbols && field != NULL) {
124 | 			field += 1;
125 | 			dist->distortion[x + symbols*y] = atof(field);
126 | 			field = strchr(field, ',');
127 | 			y += 1;
128 | 		}
129 | 
130 | 		while (y < symbols) {
131 | 			missing = 1;
132 | 			dist->distortion[x + symbols*y] = 0.0;
133 | 		}
134 | 
135 | 		if (missing) {
136 | 			printf("Warning: one or more entries in the distortion matrix on line %d were missing", x);
137 | 			printf(" they have been filled with 0.0\n");
138 | 		}
139 | 
140 | 		x += 1;
141 | 	}
142 | 
143 | 	fclose(fp);
144 | 	return dist;
145 | }
146 | 
147 | /**
148 |  * Retrieve the distortion for a pair (x, y). Generally x is the true value and
149 |  * y is the reconstructed value. Handles the matrix->linear array indexing
150 |  */
151 | double get_distortion(struct distortion_t *dist, uint8_t x, uint8_t y) {
152 | 	return dist->distortion[x + dist->symbols*y];
153 | }
154 | 
155 | /**
156 |  * Print a distortion matrix to stdout for debuggin
157 |  */
158 | void print_distortion(struct distortion_t *dist) {
159 | 	uint8_t x, y;
160 | 
161 | 	printf("    |");
162 | 	for (y = 0; y < dist->symbols; ++y) {
163 | 		printf(" %2d |", y);
164 | 	}
165 | 	printf("\n");
166 | 
167 | 	printf("----+");
168 | 	for (y = 0; y < dist->symbols; ++y) {
169 | 		printf("----+");
170 | 	}
171 | 	printf("\n");
172 | 
173 | 	for (x = 0; x < dist->symbols; ++x) {
174 | 		printf(" %2d |", x);
175 | 		for (y = 0; y < dist->symbols; ++y) {
176 | 			printf("%2.2f|", dist->distortion[x + y*dist->symbols]);
177 | 		}
178 | 		printf("\n");
179 | 	}
180 | }
181 | 


--------------------------------------------------------------------------------
/src/arith.c:
--------------------------------------------------------------------------------
  1 | #include <assert.h>
  2 | #include <stdio.h>
  3 | #include "qv_compressor.h"
  4 | 
  5 | Arithmetic_code initialize_arithmetic_encoder(uint32_t m) {
  6 |     Arithmetic_code a_code;
  7 |     
  8 |     a_code = (Arithmetic_code) calloc(1, sizeof(struct Arithmetic_code_t));
  9 |     
 10 |     a_code->m = m;
 11 | 	a_code->r = 1 << (m - 3);
 12 |     a_code->l = 0;
 13 | 	a_code->u = (1 << m) - 1;
 14 |     
 15 |     return a_code;
 16 | }
 17 | 
/**
 * Encode one symbol: narrow the interval [l, u] to the slice belonging to x
 * under the current counts, then rescale the interval while it allows,
 * emitting bits to os as they become known.
 *
 * E1/E2 check for the MSB of the lower and upper regions being the same, indicating that a bit has
 * been determined and must be sent to the output stream
 * E3 checks for upper being 10xxxx... and lower being 01xxxx... indicating that after rescaling the
 * range we are still in the indetermined central region
 *
 * @param a Coder state; l, u and scale3 are updated
 * @param stats Model supplying counts[], n and alphabetCard; it is not modified here
 * @param x Symbol to encode, must be less than stats->alphabetCard
 * @param os Output bit stream
*/
void arithmetic_encoder_step(Arithmetic_code a, stream_stats_ptr_t stats, int32_t x, osStream os) {
    uint64_t range = 0;
    uint8_t msbU = 0, msbL = 0, E1_E2 = 0, E3 = 0, smsbL = 0, smsbU = 0;
    uint32_t cumCountX, cumCountX_1;
    int32_t i;

	// These are actually constants, need to lift a->m out of the struct because it is compile-time constant
	uint32_t msb_shift = a->m - 1;
	uint32_t smsb_shift = a->m - 2;
	uint32_t msb_clear_mask = (1 << msb_shift) - 1;
    
    range = a->u - a->l + 1;

	assert(x < stats->alphabetCard);
    
	// Cumulative counts just below x (cumCountX_1) and up to and including x (cumCountX)
	cumCountX_1 = 0;
	for (i = 0; i < x; ++i) {
		cumCountX_1 += stats->counts[i];
	}
	cumCountX = cumCountX_1 + stats->counts[x];

	// Fails if the symbol being coded has a zero count
	assert(cumCountX_1 < cumCountX);
    
	// u must be computed before l is overwritten because both are based on the old l
    a->u = a->l + (uint32_t)((range * cumCountX) / stats->n) - 1;
    a->l = a->l + (uint32_t)((range * cumCountX_1) / stats->n);
    
	assert(a->l <= a->u);
    
    // Check the rescaling conditions
    msbL = a->l >> msb_shift;
    msbU = a->u >> msb_shift;
    E1_E2 = (msbL == msbU);
	E3 = 0;
    
	// E3 is only tested when the MSBs differ; it compares the top two bits of each bound
    if (!E1_E2) {
		smsbL = a->l >> smsb_shift;
		smsbU = a->u >> smsb_shift;
		E3 = (smsbL == 0x01 && smsbU == 0x02);
    }
    
	// While the bounds need rescaling
    while (E1_E2 || E3) {
        if (E1_E2) {
			// We are in one half of the integer range so the next bit is fixed as the MSB
			stream_write_bit(os, msbL);

			// Clear the msb from both bounds and rescale them
			a->l = (a->l & msb_clear_mask) << 1;
			a->u = ((a->u & msb_clear_mask) << 1) + 1;
            
			// Write any extra bits based on the number of rescalings without an output before now
            while (a->scale3 > 0) {
				stream_write_bit(os, !msbL);
                a->scale3 -= 1;
            }
        }
		else { // E3 is true
			// No bit can be written yet; count the rescaling so the complement bits
			// are written once the next MSB is decided
            a->scale3 += 1;
			a->u = (((a->u << 1) & msb_clear_mask) | (1 << msb_shift)) + 1;
			a->l = (a->l << 1) & msb_clear_mask;
        }

		// Re-evaluate the conditions on the rescaled bounds
        msbL = a->l >> msb_shift;
        msbU = a->u >> msb_shift;
        E1_E2 = (msbL == msbU);
		E3 = 0;

        if (!E1_E2) {
			smsbL = a->l >> smsb_shift;
			smsbU = a->u >> smsb_shift;
			E3 = (smsbL == 0x01 && smsbU == 0x02);
        }
    }
}
 98 | 
 99 | int encoder_last_step(Arithmetic_code a, osStream os) {
100 |     uint8_t msbL = a->l >> (a->m - 1);
101 | 
102 |     // Write the msb of the tag (l)
103 | 	stream_write_bit(os, msbL);
104 |     
105 |     // write as many !msbL as scale3 left
106 |     while (a->scale3 > 0) {
107 | 		stream_write_bit(os, !msbL);
108 |         a->scale3 -= 1;
109 |     }
110 |     
111 |     // write the rest of the tag (l)
112 | 	stream_write_bits(os, a->l, a->m - 1);
113 | 	stream_finish_byte(os);
114 |     
115 |     return os->written;
116 | }
117 | 
/**
 * Decode one symbol: find the symbol whose slice of the interval [l, u]
 * contains the tag t, narrow the interval to that slice exactly as the encoder
 * does, then rescale l, u and t together, pulling one new bit from the input
 * stream for every rescaling.
 * @param a Coder state; l, u and t are updated
 * @param stats Model supplying counts[] and n; it is not modified here
 * @param is Input bit stream
 * @return Index of the decoded symbol
 */
uint32_t arithmetic_decoder_step(Arithmetic_code a, stream_stats_ptr_t stats, osStream is) {
    uint64_t range = 0, tagGap = 0;
    int32_t k = 0, x = -1, i;
    uint32_t subRange = 0, cumCountX = 0, cumCountX_1 = 0, cumCount = 0;
    
    uint8_t msbU = 0, msbL = 0, E1_E2 = 0, E3 = 0, smsbL = 0, smsbU = 0;
    
	// Again, these are actually constants
	uint32_t msb_shift = a->m - 1;
	uint32_t smsb_shift = a->m - 2;
	uint32_t msb_clear_mask = (1 << msb_shift) - 1;

    range = a->u - a->l + 1;
    tagGap = a->t - a->l + 1;
    
	// @todo figure this out
	// Scale the tag's position within the interval to a cumulative count, then scan
	// counts[] for the first symbol whose cumulative count exceeds it.
	// NOTE(review): the scan has no bound on k; it relies on subRange < stats->n
    subRange = (uint32_t)((tagGap * stats->n - 1) / range);
    while (subRange >= cumCount)
        cumCount += stats->counts[k++];
    x = --k;
  
	// Cumulative counts just below x and up to and including x, as in the encoder
	cumCountX_1 = 0;
	for (i = 0; i < x; ++i) {
		cumCountX_1 += stats->counts[i];
	}
	cumCountX = cumCountX_1 + stats->counts[x];
    
	// u must be computed before l is overwritten because both are based on the old l
    a->u = a->l + (uint32_t)((range * cumCountX) / stats->n) - 1;
    a->l = a->l + (uint32_t)((range * cumCountX_1) / stats->n);
    
    // Check the rescaling conditions.
    msbL = a->l >> msb_shift;
    msbU = a->u >> msb_shift;
    
    E1_E2 = (msbL == msbU);
	E3 = 0;
    
    // If E1 or E2 doesn't hold, check E3
    if (!E1_E2) {
		smsbL = a->l >> smsb_shift;
		smsbU = a->u >> smsb_shift;
		E3 = (smsbL == 0x01 && smsbU == 0x02);
    }
    
    // While any of E conditions hold
    while (E1_E2 || E3) {
        if (E1_E2) {
			// Drop the shared MSB from all three registers and shift in the next input bit
			a->l = (a->l & msb_clear_mask) << 1;
			a->u = ((a->u & msb_clear_mask) << 1) + 1;
			a->t = ((a->t & msb_clear_mask) << 1) + stream_read_bit(is);
        }
        else { // E3 is true
			// Same bound updates as the encoder's E3 branch; the tag is shifted and
			// its MSB flipped before the next input bit is added
			a->l = (a->l << 1) & msb_clear_mask;
			a->u = (((a->u << 1) & msb_clear_mask) | (1 << msb_shift)) + 1;
			a->t = (((a->t & msb_clear_mask) << 1) ^ (1 << msb_shift)) + stream_read_bit(is);
        }
        
		// Re-evaluate the conditions on the rescaled bounds
		msbL = a->l >> msb_shift;
        msbU = a->u >> msb_shift;
        E1_E2 = (msbL == msbU);
		E3 = 0;
            
        if (!E1_E2) {
			smsbL = a->l >> smsb_shift;
			smsbU = a->u >> smsb_shift;
			E3 = (smsbL == 0x01 && smsbU == 0x02);
        }
    }
    
    return x;
}
189 | 
190 | uint32_t decoder_last_step(Arithmetic_code a, stream_stats_ptr_t stats) {
191 |     uint64_t range, tagGap, subRange;
192 |     uint32_t k = 0, cumCount = 0, x;
193 |     
194 |     range = a->u - a->l + 1;
195 |     tagGap = a->t - a->l + 1;
196 |     
197 |     subRange = (tagGap * stats->n -1) / range;
198 |     
199 |     while (subRange >= cumCount)
200 |         cumCount += stats->counts[k++];
201 |     
202 |     x = --k;
203 |     
204 |     return x;
205 | }
206 | 
207 | 


--------------------------------------------------------------------------------
/src/quantizer.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <string.h>
  3 | 
  4 | #include "quantizer.h"
  5 | #include "util.h"
  6 | 
  7 | /**
  8 |  * Allocate enough room based on the size of the alphabet supplied
  9 |  */
 10 | struct quantizer_t *alloc_quantizer(const struct alphabet_t *alphabet) {
 11 | 	struct quantizer_t *rtn = (struct quantizer_t *) calloc(1, sizeof(struct quantizer_t));
 12 | 	rtn->alphabet = alphabet;
 13 | 	rtn->q = (symbol_t *) calloc(alphabet->size, sizeof(symbol_t));
 14 | 	return rtn;
 15 | }
 16 | 
 17 | /**
 18 |  * Free the quantizer itself but not the input alphabet
 19 |  * But do free the output alphabet
 20 |  */
 21 | void free_quantizer(struct quantizer_t *q) {
 22 | 	if (q->output_alphabet)
 23 | 		free_alphabet(q->output_alphabet);
 24 | 
 25 | 	free(q->q);
 26 | 	free(q);
 27 | }
 28 | 
 29 | /**
 30 |  * Produce a quantizer with the given number of states for the given pmf, and
 31 |  * optionally computes the expected distortion produced by this quantizer.
 32 |  * The bounds array here contains the left endpoint (inclusive) of each region
 33 |  */
 34 | struct quantizer_t *generate_quantizer(struct pmf_t *restrict pmf, struct distortion_t *restrict dist, uint32_t states) {
 35 | 	struct quantizer_t *q = alloc_quantizer(pmf->alphabet);
 36 | 	symbol_t *bounds = (symbol_t *) _alloca((states+1)*sizeof(symbol_t));
 37 | 	symbol_t *reconstruction = (symbol_t *) _alloca(states*sizeof(symbol_t));
 38 | 	uint32_t region, sym, candidate, best, size;
 39 | 	uint32_t pass = 0;
 40 | 	uint32_t changed = 1;
 41 | 	double cost, best_cost, cost_next;
 42 | 
 43 | 	// Seed the design with equal-width regions: bounds[k] is the first symbol
 44 | 	// of region k and bounds[states] is one past the last symbol of the alphabet
 45 | 	bounds[0] = 0;
 46 | 	bounds[states] = pmf->alphabet->size;
 47 | 	for (region = 1; region < states; ++region) {
 48 | 		bounds[region] = (region * pmf->alphabet->size) / states;
 49 | 	}
 50 | 
 51 | 	// Each region is initially represented by its midpoint
 52 | 	for (region = 0; region < states; ++region) {
 53 | 		reconstruction[region] = (bounds[region] + bounds[region+1] - 1) / 2;
 54 | 	}
 55 | 
 56 | 	// Lloyd-Max design: alternately re-pick the reconstruction points and
 57 | 	// re-draw the region bounds, until a pass leaves every reconstruction
 58 | 	// point where it was or QUANTIZER_MAX_ITER passes have been made
 59 | 	size = pmf->alphabet->size;
 60 | 	for (; changed && pass < QUANTIZER_MAX_ITER; ++pass) {
 61 | 		changed = 0;
 62 | 
 63 | 		// Step 1: with the bounds held fixed, pick for every region the member
 64 | 		// symbol that gives the lowest expected distortion over that region
 65 | 		for (region = 0; region < states; ++region) {
 66 | 			best = bounds[region];
 67 | 			best_cost = DBL_MAX;
 68 | 
 69 | 			for (candidate = bounds[region]; candidate < bounds[region+1]; ++candidate) {
 70 | 				// Expected distortion if candidate stood in for the whole region
 71 | 				cost = 0.0;
 72 | 				for (sym = bounds[region]; sym < bounds[region+1]; ++sym) {
 73 | 					cost += get_probability(pmf, sym) * get_distortion(dist, sym, candidate);
 74 | 				}
 75 | 
 76 | 				// Strict comparison: the earliest candidate wins a tie
 77 | 				if (cost < best_cost) {
 78 | 					best_cost = cost;
 79 | 					best = candidate;
 80 | 				}
 81 | 			}
 82 | 
 83 | 			// A moved reconstruction point means another pass is needed
 84 | 			if (reconstruction[region] != best) {
 85 | 				reconstruction[region] = best;
 86 | 				changed = 1;
 87 | 			}
 88 | 		}
 89 | 
 90 | 		// Step 2: with the reconstruction points held fixed, sweep the interior
 91 | 		// symbols from left to right. A symbol whose distortion against the next
 92 | 		// reconstruction point is strictly lower than against the current one
 93 | 		// becomes the left bound of the next region. The pmf is not needed:
 94 | 		// both distortions would be weighted by the same probability.
 95 | 		// NOTE(review): if the sweep ends before every region got a new left
 96 | 		// bound, the remaining bounds keep their values from the previous pass
 97 | 		// -- confirm that this is intended
 98 | 		region = 0;
 99 | 		for (sym = 1; sym < size-1 && region < states-1; ++sym) {
100 | 			cost = get_distortion(dist, sym, reconstruction[region]);
101 | 			cost_next = get_distortion(dist, sym, reconstruction[region+1]);
102 | 
103 | 			if (cost_next < cost) {
104 | 				region += 1;
105 | 				bounds[region] = sym;
106 | 			}
107 | 		}
108 | 	}
109 | 
110 | 	// Publish the mapping: every input symbol is sent to the reconstruction
111 | 	// point of the region that contains it
112 | 	for (region = 0; region < states; ++region) {
113 | 		for (sym = bounds[region]; sym < bounds[region+1]; ++sym) {
114 | 			q->q[sym] = reconstruction[region];
115 | 		}
116 | 	}
117 | 
118 | 	// The reconstruction points, in region order, form the output alphabet
119 | 	q->output_alphabet = alloc_alphabet(states);
120 | 	memcpy(q->output_alphabet->symbols, reconstruction, sizeof(symbol_t) * states);
121 | 	alphabet_compute_index(q->output_alphabet);
122 | 
123 | 	// Expected distortion of the final mapping under the input pmf
124 | 	q->mse = 0.0;
125 | 	for (region = 0; region < states; ++region) {
126 | 		for (sym = bounds[region]; sym < bounds[region+1]; ++sym) {
127 | 			q->mse += get_distortion(dist, sym, reconstruction[region]) * get_probability(pmf, sym);
128 | 		}
129 | 	}
130 | 
131 | 	return q;
132 | }
133 | 
134 | /**
135 |  * Push an input distribution through a quantizer: the probability of each output
136 |  * symbol is the sum of the probabilities of all input symbols mapped onto it.
137 |  * The pmf and output structures must be distinct objects.
138 |  * @param q Quantizer whose mapping table is applied
139 |  * @param pmf Input distribution; recalculated first if it is not marked ready
140 |  * @param output Destination pmf, cleared before use; if NULL one is allocated
141 |  * @return The pmf holding the output distribution (output, or the new allocation)
142 |  */
143 | struct pmf_t *apply_quantizer(struct quantizer_t *restrict q, struct pmf_t *restrict pmf, struct pmf_t *restrict output) {
144 | 	uint32_t sym;
145 | 
146 | 	if (!pmf->pmf_ready)
147 | 		recalculate_pmf(pmf);
148 | 
149 | 	// Either start from a fresh pmf or wipe the one supplied by the caller
150 | 	if (!output)
151 | 		output = alloc_pmf(pmf->alphabet);
152 | 	else
153 | 		memset(output->pmf, 0, output->alphabet->size * sizeof(double));
154 | 
155 | 	// Accumulate each input probability into the bin of its quantized value
156 | 	for (sym = 0; sym < pmf->alphabet->size; ++sym)
157 | 		output->pmf[q->q[sym]] += get_probability(pmf, sym);
158 | 
159 | 	output->pmf_ready = 1;
160 | 	return output;
161 | }
162 | 
163 | /**
164 |  * Rebuild the output alphabet of a quantizer from its mapping table, in case
165 |  * it is not already available. The table is scanned once and the value that
166 |  * starts each run of equal consecutive entries is collected, so a value that
167 |  * shows up again after a different one is recorded a second time.
168 |  */
169 | void find_output_alphabet(struct quantizer_t *q) {
170 | 	symbol_t *uniques = (symbol_t *) _alloca(q->alphabet->size * sizeof(symbol_t));
171 | 	symbol_t last;
172 | 	uint32_t found = 0;
173 | 	uint32_t pos;
174 | 
175 | 	// The first table entry always opens a run
176 | 	last = q->q[0];
177 | 	uniques[found++] = last;
178 | 
179 | 	// Every later entry that differs from its predecessor opens a new run
180 | 	for (pos = 1; pos < q->alphabet->size; ++pos) {
181 | 		if (q->q[pos] == last)
182 | 			continue;
183 | 		last = q->q[pos];
184 | 		uniques[found++] = last;
185 | 	}
186 | 
187 | 	// Turn the collected values into a proper alphabet with its index
188 | 	q->output_alphabet = alloc_alphabet(found);
189 | 	memcpy(q->output_alphabet->symbols, uniques, found*sizeof(symbol_t));
190 | 	alphabet_compute_index(q->output_alphabet);
191 | }
192 | 
193 | /**
194 |  * Print a quantizer to stdout: first the mapping table, then the output
195 |  * alphabet, each rendered as text by adding 33 to every symbol value
196 |  */
197 | void print_quantizer(struct quantizer_t *q) {
198 | 	char *text = (char *) _alloca(q->alphabet->size+1);
199 | 	uint32_t k;
200 | 
201 | 	for (k = 0; k < q->alphabet->size; ++k)
202 | 		text[k] = (char) (q->q[k] + 33);
203 | 	text[k] = 0;
204 | 	printf("Quantizer: %s\n", text);
205 | 
206 | 	// The same buffer is reused for the (presumably no larger) output alphabet
207 | 	for (k = 0; k < q->output_alphabet->size; ++k)
208 | 		text[k] = (char) (q->output_alphabet->symbols[k] + 33);
209 | 	text[k] = 0;
210 | 	printf("Unique alphabet: %s\n", text);
211 | }
212 | 


--------------------------------------------------------------------------------
/src/cluster.c:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * k-means clustering implementation in C
  3 |  * 
  4 |  * The approach here is parallelizable and should be migrated to a block-based implementation
  5 |  * using opencl in order to run faster, or possibly just multithreaded, but I have left it
  6 |  * in plain C for the time being to get it working quickly. Note that the means established
  7 |  * are discrete values, rather than continuous.
  8 |  */
  9 | 
 10 | #include "util.h"
 11 | 
 12 | #include <stdlib.h>
 13 | #include <string.h>
 14 | #include <stdio.h>
 15 | //#include <malloc.h>
 16 | 
 17 | #include "pmf.h"
 18 | #include "codebook.h"
 19 | #include "cluster.h"
 20 | 
 21 | /**
 22 |  * Build a cluster list for info->cluster_count clusters, giving every cluster
 23 |  * zeroed per-column mean and accumulator arrays plus its own training statistics
 24 |  */
 25 | struct cluster_list_t *alloc_cluster_list(struct quality_file_t *info) {
 26 | 	struct cluster_list_t *list = (struct cluster_list_t *) calloc(1, sizeof(struct cluster_list_t));
 27 | 	struct cluster_t *c;
 28 | 	uint8_t id;
 29 | 
 30 | 	list->count = info->cluster_count;
 31 | 	list->clusters = (struct cluster_t *) calloc(info->cluster_count, sizeof(struct cluster_t));
 32 | 	list->distances = (double *) calloc(info->cluster_count, sizeof(double));	// one slot per cluster
 33 | 
 34 | 	for (id = 0; id < info->cluster_count; ++id) {
 35 | 		c = &list->clusters[id];
 36 | 		c->id = id;
 37 | 		c->count = 0;	// no lines assigned yet
 38 | 		c->mean = (symbol_t *) calloc(info->columns, sizeof(symbol_t));
 39 | 		c->accumulator = (uint64_t *) calloc(info->columns, sizeof(uint64_t));
 40 | 		c->training_stats = alloc_conditional_pmf_list(info->alphabet, info->columns);
 41 | 	}
 42 | 	return list;
 43 | }
 44 | 
 45 | /**
 46 |  * Release a cluster list: per-cluster arrays and statistics first, then the shared arrays and the list
 47 |  */
 48 | void free_cluster_list(struct cluster_list_t *clusters) {
 49 | 	uint8_t k = 0;
 50 | 	while (k < clusters->count) {
 51 | 		struct cluster_t *c = &clusters->clusters[k++];
 52 | 		free(c->mean);
 53 | 		free(c->accumulator);
 54 | 		free_conditional_pmf_list(c->training_stats);
 55 | 	}
 56 | 	free(clusters->distances);
 57 | 	free(clusters->clusters);
 58 | 	free(clusters);
 59 | }
 60 | 
 61 | /**
 62 |  * Run the cluster assignment on every line of the given block.
 63 |  * @return Nonzero if at least one line ended up in a different cluster
 64 |  */
 65 | uint8_t cluster_lines(struct line_block_t *block, struct quality_file_t *info) {
 66 | 	uint32_t idx = 0;
 67 | 	uint8_t any_moved = 0;
 68 | 
 69 | 	while (idx < block->count) {
 70 | 		any_moved |= do_cluster_assignment(block->lines + idx, info);
 71 | 		idx += 1;
 72 | 	}
 73 | 	return any_moved;
 74 | }
 75 | 
 76 | /**
 77 |  * Recompute every cluster mean from the lines currently assigned to it. The
 78 |  * per-cluster line counts are only read here, they are not reset. A cluster
 79 |  * with no assigned lines keeps its previous mean and reports zero movement,
 80 |  * which avoids an integer division by a zero line count.
 81 |  * @return Largest squared distance moved by any cluster mean in this update
 82 |  */
 83 | double recalculate_means(struct quality_file_t *info) {
 84 | 	uint32_t block, line_idx, i, j;
 85 | 	struct line_t *line;
 86 | 	struct cluster_t *cluster;
 87 | 	uint8_t new_mean;
 88 | 	double dist, moved;
 89 | 	double move_max = 0.0;
 90 | 
 91 | 	// Reset cluster accumulators for new center calculation
 92 | 	for (i = 0; i < info->cluster_count; ++i) {
 93 | 		memset(info->clusters->clusters[i].accumulator, 0, info->columns*sizeof(uint64_t));
 94 | 	}
 95 | 
 96 | 	// Iterate linewise to accumulate into cluster centers
 97 | 	for (block = 0; block < info->block_count; ++block) {
 98 | 		for (line_idx = 0; line_idx < info->blocks[block].count; ++line_idx) {
 99 | 			line = &info->blocks[block].lines[line_idx];
100 | 			cluster = &info->clusters->clusters[line->cluster];
101 | 			for (i = 0; i < info->columns; ++i) {
102 | 				cluster->accumulator[i] += line->m_data[i];
103 | 			}
104 | 		}
105 | 	}
106 | 
107 | 	// Now find new cluster centers and compute motion
108 | 	for (i = 0; i < info->cluster_count; ++i) {
109 | 		cluster = &info->clusters->clusters[i];
110 | 		moved = 0.0;
111 | 		// An empty cluster has nothing to average: leave its mean untouched
112 | 		if (cluster->count > 0) {
113 | 			for (j = 0; j < info->columns; ++j) {
114 | 				// Integer division: the column mean, rounded down
115 | 				new_mean = (uint8_t) (cluster->accumulator[j] / cluster->count);
116 | 				// Add this column's squared movement, then store the new center
117 | 				dist = new_mean - cluster->mean[j];
118 | 				moved += dist*dist;
119 | 				cluster->mean[j] = new_mean;
120 | 			}
121 | 		}
122 | 
123 | 		if (moved > move_max)
124 | 			move_max = moved;
125 | 
126 | 		if (info->opts->verbose)
127 | 			printf("Cluster %d moved %f.\n", i, moved);
128 | 	}
129 | 
130 | 	return move_max;
131 | }
132 | 
133 | /**
134 |  * Measure one line against every cluster, then assign it to the closest.
135 |  * @return The result of assign_cluster for this line
136 |  */
137 | uint8_t do_cluster_assignment(struct line_t *line, struct quality_file_t *info) {
138 | 	uint8_t id = 0;
139 | 
140 | 	// find_distance stores each result in info->clusters->distances
141 | 	while (id < info->cluster_count)
142 | 		find_distance(line, &info->clusters->clusters[id++], info);
143 | 	return assign_cluster(line, info);
144 | }
145 | 
146 | /**
147 |  * Move a line into the cluster whose entry in the shared distance array is
148 |  * smallest (the lowest id wins ties) and count it as a member of that cluster.
149 |  * @return 1 if this differs from the line's previous cluster, 0 otherwise
150 |  */
151 | uint8_t assign_cluster(struct line_t *line, struct quality_file_t *info) {
152 | 	const uint8_t old_id = line->cluster;
153 | 	double *distances = info->clusters->distances;
154 | 	double best = distances[0];
155 | 	uint8_t best_id = 0;
156 | 	uint8_t k;
157 | 
158 | 	// Linear scan for the minimum; strict '<' keeps the first of equal values
159 | 	for (k = 1; k < info->cluster_count; ++k) {
160 | 		if (!(distances[k] < best))
161 | 			continue;
162 | 		best = distances[k];
163 | 		best_id = k;
164 | 	}
165 | 
166 | 	// Record the assignment and bump the winner's membership count
167 | 	line->cluster = best_id;
168 | 	info->clusters->clusters[best_id].count += 1;
169 | 
170 | 	return (old_id != best_id) ? 1 : 0;
171 | }
172 | 
173 | /**
174 |  * Compute the squared Euclidean distance between a line and a cluster mean
175 |  * over all columns and store it in the distance slot of that cluster
176 |  */
177 | void find_distance(struct line_t *line, struct cluster_t *cluster, struct quality_file_t *info) {
178 | 	double total = 0.0;
179 | 	uint32_t col, delta;
180 | 
181 | 	for (col = 0; col < info->columns; ++col) {
182 | 		// Unsigned wrap-around is harmless for small values: the product is still the square
183 | 		delta = (uint32_t) line->m_data[col] - (uint32_t) cluster->mean[col];
184 | 		total += delta * delta;
185 | 	}
186 | 	info->clusters->distances[cluster->id] = total;
187 | }
188 | 
189 | /**
190 |  * Seed every cluster mean with the data of a line chosen via rand() (a block first, then a line in it)
191 |  */
192 | void initialize_kmeans_clustering(struct quality_file_t *info) {
193 | 	struct cluster_t *target = info->clusters->clusters;
194 | 	struct line_block_t *chosen;
195 | 	uint32_t block_id, line_id;
196 | 	uint8_t k;
197 | 
198 | 	for (k = 0; k < info->cluster_count; ++k, ++target) {
199 | 		block_id = rand() % info->block_count;
200 | 		chosen = &info->blocks[block_id];
201 | 		line_id = rand() % chosen->count;
202 | 		memcpy(target->mean, chosen->lines[line_id].m_data, info->columns*sizeof(uint8_t));
203 | 		if (info->opts->verbose)
204 | 			printf("Chose block %d, line %d.\n", block_id, line_id);
205 | 	}
206 | }
207 | 
208 | /**
209 |  * Run k-means over all loaded blocks: seed the means, then alternate line
210 |  * assignment and mean recalculation until the largest squared mean movement
211 |  * no longer exceeds the threshold or MAX_KMEANS_ITERATIONS passes have run
212 |  */
213 | void do_kmeans_clustering(struct quality_file_t *info) {
214 | 	struct cluster_list_t *clusters = info->clusters;
215 | 	uint32_t passes = 0;
216 | 	uint32_t k;
217 | 	uint8_t unstable = 1;
218 | 	double moved;
219 | 
220 | 	initialize_kmeans_clustering(info);
221 | 
222 | 	while (unstable && passes < MAX_KMEANS_ITERATIONS) {
223 | 		// Membership counts are rebuilt from scratch by the assignment step
224 | 		for (k = 0; k < clusters->count; ++k) {
225 | 			clusters->clusters[k].count = 0;
226 | 		}
227 | 
228 | 		// Assign every line of every block to its closest cluster
229 | 		for (k = 0; k < info->block_count; ++k) {
230 | 			cluster_lines(&info->blocks[k], info);
231 | 		}
232 | 
233 | 		// Continue only while some mean still moves by more than the threshold
234 | 		moved = recalculate_means(info);
235 | 		unstable = (moved > info->opts->cluster_threshold) ? 1 : 0;
236 | 
237 | 		passes += 1;
238 | 		if (info->opts->verbose)
239 | 			printf("\n");
240 | 	}
241 | 
242 | 	if (info->opts->verbose)
243 | 		printf("\nTotal number of iterations: %d.\n", passes);
244 | }
245 | 


--------------------------------------------------------------------------------
/src/qv_compressor.c:
--------------------------------------------------------------------------------
  1 | #include <assert.h>
  2 | #include "qv_compressor.h"
  3 | 
  4 | /**
  5 |  * Encode the value x with the statistics selected by (cluster, column, idx) and
  6 |  * then update those same statistics with x, so they adapt to what has been coded
  7 |  */
  8 | void compress_qv(arithStream as, uint32_t x, uint8_t cluster, uint32_t column, uint32_t idx) {
  9 |     arithmetic_encoder_step(as->a, as->stats[cluster][column][idx], x, as->os);
 10 |     update_stats(as->stats[cluster][column][idx], x, as->a->r);
 11 | }
 12 | 
 13 | /**
 14 |  * Writes a cluster value to the arithmetic encoder using the dedicated cluster
 15 |  * statistics and updates them afterwards. Adaptive stats are not required here,
 16 |  * but they save a number of bytes on writing the number of lines in each cluster.
 17 |  * @todo Determine which has a lower bitrate (probably almost the same)
 18 |  */
 19 | void qv_write_cluster(arithStream as, uint8_t cluster) {
 20 | 	arithmetic_encoder_step(as->a, as->cluster_stats, cluster, as->os);
 21 | 	update_stats(as->cluster_stats, cluster, as->a->r);
 22 | }
 23 | 
 24 | /**
 25 |  * Decode one value from the arithmetic decoder input stream with the statistics
 26 |  * selected by (cluster, column, idx), then update those statistics with it
 27 |  */
 28 | uint32_t decompress_qv(arithStream as, uint8_t cluster, uint32_t column, uint32_t idx) {
 29 | 	uint32_t decoded = arithmetic_decoder_step(as->a, as->stats[cluster][column][idx], as->os);
 30 | 
 31 | 	// Same statistics update that compress_qv performs after encoding a value
 32 | 	update_stats(as->stats[cluster][column][idx], decoded, as->a->r);
 33 | 	return decoded;
 34 | }
 35 | 
 36 | /** Read a cluster value from the arithmetic decoder; counterpart of qv_write_cluster */
 37 | uint8_t qv_read_cluster(arithStream as) {
 38 | 	uint32_t id = arithmetic_decoder_step(as->a, as->cluster_stats, as->os);
 39 | 
 40 | 	// Same statistics update that qv_write_cluster performs after writing
 41 | 	update_stats(as->cluster_stats, id, as->a->r);
 42 | 	return (uint8_t) id;
 43 | }
 44 | 
 45 | /**
 46 |  * Quantize every quality value of the loaded file and send the result through
 47 |  * the arithmetic encoder. Each line is preceded by its cluster id, and each
 48 |  * column is coded with the quantizer selected by the previous column's output.
 49 |  * @param info Loaded quality file; line->cluster and every cluster's qlist are read
 50 |  * @param fout Stream handed to the compressor for the coded output
 51 |  * @param dis If not NULL, receives the mean per-column distortion averaged over info->lines
 52 |  * @param funcompressed If not NULL, receives the quantized values as text, one line per input line
 53 |  * @return The value returned by encoder_last_step
 54 |  */
 55 | uint32_t start_qv_compression(struct quality_file_t *info, FILE *fout, double *dis, FILE * funcompressed) {
 56 | 	unsigned int osSize = 0;
 57 | 	qv_compressor qvc;
 58 | 	struct cond_quantizer_list_t *qlist;
 59 | 	struct quantizer_t *q;
 60 | 	struct line_t *line;
 61 | 	uint32_t columns = info->columns;
 62 | 	uint32_t block_idx, line_idx, s;
 63 | 	uint32_t idx = 0, q_state = 0;
 64 | 	uint8_t cluster_id, qv, prev_qv;
 65 | 	symbol_t data;
 66 | 	double distortion = 0.0;
 67 | 	double error;
 68 | 
 69 | 	// Initialize the compressor
 70 | 	qvc = initialize_qv_compressor(fout, COMPRESSION, info);
 71 | 
 72 | 	// Plain nested loops (rather than a do/while over manually advanced
 73 | 	// indices) so that an input without blocks, or an empty block, is skipped
 74 | 	// instead of being dereferenced
 75 | 	for (block_idx = 0; block_idx < info->block_count; ++block_idx) {
 76 | 		if (info->opts->verbose && info->blocks[block_idx].count > 0) {
 77 | 			printf("Line: %dM\n", block_idx);
 78 | 		}
 79 | 
 80 | 		for (line_idx = 0; line_idx < info->blocks[block_idx].count; ++line_idx) {
 81 | 			line = &info->blocks[block_idx].lines[line_idx];
 82 | 
 83 | 			// Write clustering information and pull the correct codebook
 84 | 			cluster_id = line->cluster;
 85 | 			qlist = info->clusters->clusters[cluster_id].qlist;
 86 | 			qv_write_cluster(qvc->Quals, cluster_id);
 87 | 
 88 | 			// The first column has no left context, so it is coded as if the
 89 | 			// previous quantized value were 0
 90 | 			prev_qv = 0;
 91 | 			error = 0.0;
 92 | 			for (s = 0; s < columns; ++s) {
 93 | 				q = choose_quantizer(qlist, &info->well, s, prev_qv, &idx);
 94 | 
 95 | 				// Stored values carry a +33 offset; the quantizer works without it
 96 | 				data = line->m_data[s] - 33;
 97 | 				qv = q->q[data];
 98 | 				q_state = get_symbol_index(q->output_alphabet, qv);
 99 | 
100 | 				// @todo use buffer to speed up the writing
101 | 				if (funcompressed != NULL) {
102 | 					fputc(qv+33, funcompressed);
103 | 				}
104 | 
105 | 				compress_qv(qvc->Quals, q_state, cluster_id, s, idx);
106 | 				error += get_distortion(info->dist, data, qv);
107 | 				prev_qv = qv;
108 | 			}
109 | 
110 | 			// @todo use buffer to speed up the writing
111 | 			if (funcompressed != NULL) {
112 | 				fputc('\n', funcompressed);
113 | 			}
114 | 
115 | 			distortion += error / ((double) columns);
116 | 		}
117 | 	}
118 | 
119 | 	osSize = encoder_last_step(qvc->Quals->a, qvc->Quals->os);
120 | 
121 | 	if (dis)
122 | 		*dis = distortion / ((double) info->lines);
123 | 
124 | 	return osSize;
125 | }
144 | 
145 | /**
146 |  * Decode a compressed quality stream and write it to fout as one text line per
147 |  * coded line. Mirrors start_qv_compression: every line starts with its cluster
148 |  * id, and each column is decoded with the quantizer selected by the previous
149 |  * column's value (0 for the first column).
150 |  * The very last symbol of the stream is read with decoder_last_step instead of
151 |  * the regular decoding step. Handling that inside the general loop (rather than
152 |  * in a separate copy of the line code) also makes single-column files decode
153 |  * exactly one symbol per line, and makes a stream with zero lines a no-op.
154 |  * @param fout Text stream receiving the decoded quality lines
155 |  * @param fin Compressed input handed to the decompressor
156 |  * @param info File description; lines, columns, clusters and codebooks are read,
157 |  *             and info->lines is rewritten with the number of lines decoded
158 |  */
159 | void start_qv_decompression(FILE *fout, FILE *fin, struct quality_file_t *info) {
160 | 	qv_compressor qvc;
161 | 	struct cond_quantizer_list_t *qlist;
162 | 	struct quantizer_t *q;
163 | 	uint32_t columns = info->columns;
164 | 	uint32_t lines = info->lines;
165 | 	uint32_t s = 0, idx = 0, lineCtr = 0, q_state = 0;
166 | 	uint8_t prev_qv = 0, cluster_id;
167 | 
168 | 	// Output buffer: the decoded columns, a newline and a terminating NUL
169 | 	char *line = (char *) _alloca(columns+2);
170 | 	line[columns] = '\n';
171 | 	line[columns+1] = '\0';
172 | 
173 | 	// Initialize the compressor
174 | 	qvc = initialize_qv_compressor(fin, DECOMPRESSION, info);
175 | 
176 | 	for (lineCtr = 0; lineCtr < lines; ++lineCtr) {
177 | 		if (info->opts->verbose && lineCtr%1000000 == 0) {
178 | 			printf("Line: %dM\n", lineCtr/1000000);
179 | 		}
180 | 
181 | 		cluster_id = qv_read_cluster(qvc->Quals);
182 | 		assert(cluster_id < info->cluster_count);
183 | 		qlist = info->clusters->clusters[cluster_id].qlist;
184 | 
185 | 		// The first column has no left context: decode it as if the previous value were 0
186 | 		prev_qv = 0;
187 | 		for (s = 0; s < columns; ++s) {
188 | 			q = choose_quantizer(qlist, &info->well, s, prev_qv, &idx);
189 | 
190 | 			// The last symbol of the last line has to be handled separately
191 | 			// to clear the arithmetic decoder
192 | 			if (lineCtr == lines - 1 && s == columns - 1)
193 | 				q_state = decoder_last_step(qvc->Quals->a, qvc->Quals->stats[cluster_id][s][idx]);
194 | 			else
195 | 				q_state = decompress_qv(qvc->Quals, cluster_id, s, idx);
196 | 
197 | 			// Quantizer outputs are zero-based; the text form adds the +33 offset
198 | 			prev_qv = q->output_alphabet->symbols[q_state];
199 | 			line[s] = prev_qv + 33;
200 | 		}
201 | 
202 | 		// Write this line to the output file, note '\n' at the end of the line buffer to get the right length
203 | 		fwrite(line, columns+1, sizeof(uint8_t), fout);
204 | 	}
205 | 
206 | 	info->lines = lineCtr;
207 | }
232 | 


--------------------------------------------------------------------------------
/src/main.c:
--------------------------------------------------------------------------------
  1 | 
  2 | #include "util.h"
  3 | 
  4 | #include <stdlib.h>
  5 | #include <stdio.h>
  6 | #include <string.h>
  7 | #include <stdint.h>
  8 | 
  9 | #include "codebook.h"
 10 | #include "qv_compressor.h"
 11 | #include "cluster.h"
 12 | 
 13 | #define ALPHABET_SIZE 72
 14 | 
 15 | /**
 16 |  * Compress the quality values of input_name into output_name: load the file, cluster its lines,
 17 |  * build codebooks per cluster, then write the codebooks followed by the arithmetic-coded data.
 18 |  * Exits the process if the input cannot be loaded or an output file cannot be opened.
 19 |  */
 20 | void encode(char *input_name, char *output_name, struct qv_options_t *opts) {
 21 | 	struct quality_file_t qv_info;
 22 | 	struct distortion_t *dist;
 23 | 	struct alphabet_t *alphabet = alloc_alphabet(ALPHABET_SIZE);
 24 | 	uint32_t status;
 25 | 	struct hrtimer_t cluster_time, stats, encoding, total;
 26 | 	FILE *fout, *funcompressed = NULL;
 27 | 	uint64_t bytes_used;
 28 | 	double distortion;
 29 | 
 30 | 	start_timer(&total);
 31 | 
 32 | 	if (opts->distortion == DISTORTION_CUSTOM)
 33 | 		dist = gen_custom_distortion(ALPHABET_SIZE, opts->dist_file);
 34 | 	else
 35 | 		dist = generate_distortion_matrix(ALPHABET_SIZE, opts->distortion);
 36 | 
 37 | 	qv_info.alphabet = alphabet;
 38 | 	qv_info.dist = dist;
 39 | 	qv_info.cluster_count = opts->clusters;
 40 | 
 41 | 	// Load input file all at once
 42 | 	status = load_file(input_name, &qv_info, 0);
 43 | 	if (status != LF_ERROR_NONE) {
 44 | 		printf("load_file returned error: %d\n", status);
 45 | 		exit(1);
 46 | 	}
 47 | 
 48 | 	// Set up clustering data structures
 49 | 	qv_info.clusters = alloc_cluster_list(&qv_info);
 50 | 	qv_info.opts = opts;
 51 | 
 52 | 	// Do k-means clustering
 53 | 	start_timer(&cluster_time);
 54 | 	do_kmeans_clustering(&qv_info);
 55 | 	stop_timer(&cluster_time);
 56 | 	if (opts->verbose)
 57 | 		printf("Clustering took %.4f seconds\n", get_timer_interval(&cluster_time));
 58 | 
 59 | 	// Then find stats and generate codebooks for each cluster
 60 | 	start_timer(&stats);
 61 | 	calculate_statistics(&qv_info);
 62 | 	generate_codebooks(&qv_info);
 63 | 	stop_timer(&stats);
 64 | 
 65 | 	if (opts->verbose) {
 66 | 		printf("Stats and codebook generation took %.4f seconds\n", get_timer_interval(&stats));
 67 | 		// @todo expected distortion is inaccurate due to lack of pmf
 68 | 		//printf("Expected distortion: %f\n", opts->e_dist);
 69 | 	}
 70 | 
 71 | 	// Note that we want \r\n translation in the input
 72 | 	// but we do not want it in the output
 73 | 	fout = fopen(output_name, "wb");
 74 | 	if (!fout) {
 75 | 		perror("Unable to open output file");
 76 | 		exit(1);
 77 | 	}
 78 | 
 79 | 	if (opts->uncompressed) {
 80 | 		funcompressed = fopen(opts->uncompressed_name, "w");
 81 | 		if (!funcompressed) {
 82 | 			perror("Unable to open uncompressed file");
 83 | 			exit(1);
 84 | 		}
 85 | 	}
 86 | 
 87 | 	// @todo qv_compression should use quality_file structure with data in memory, now
 88 | 	start_timer(&encoding);
 89 | 	write_codebooks(fout, &qv_info);
 90 | 	bytes_used = start_qv_compression(&qv_info, fout, &distortion, funcompressed);
 91 | 	stop_timer(&encoding);
 92 | 	stop_timer(&total);
 93 | 
 94 | 	// Close both outputs; the optional lossy text dump was previously left open
 95 | 	fclose(fout);
 96 | 	if (funcompressed)
 97 | 		fclose(funcompressed);
 98 | 
 99 | 	// Verbose stats
100 | 	if (opts->verbose) {
101 | 		// @todo add cluster info here
102 | 		switch (opts->distortion) {
103 | 			case DISTORTION_MANHATTAN:
104 | 				printf("L1 distortion: %f\n", distortion);
105 | 				break;
106 | 			case DISTORTION_MSE:
107 | 				printf("MSE distortion: %f\n", distortion);
108 | 				break;
109 | 			case DISTORTION_LORENTZ:
110 | 				printf("log(1+L1) distortion: %f\n", distortion);
111 | 				break;
112 | 			case DISTORTION_CUSTOM:
113 | 				printf("Custom distortion: %f\n", distortion);
114 | 				break;
115 | 			default:
116 | 				break;
117 | 		}
118 | 		printf("Lines: %llu\n", qv_info.lines);
119 | 		printf("Columns: %u\n", qv_info.columns);
120 | 		printf("Total bytes used: %llu\n", (unsigned long long) bytes_used);
121 | 		// Report the encoding timer here; the total timer is printed on the next line
122 | 		printf("Encoding took %.4f seconds.\n", get_timer_interval(&encoding));
123 | 		printf("Total time elapsed: %.4f seconds.\n", get_timer_interval(&total));
124 | 	}
125 | 	// Parse-able stats
126 | 	if (opts->stats)
127 | 		printf("rate, %.4f, distortion, %.4f, time, %.4f, size, %llu \n", (bytes_used*8.)/((double)(qv_info.lines)*qv_info.columns), distortion, get_timer_interval(&total), (unsigned long long) bytes_used);
128 | }
128 | 
129 | /**
130 |  * Restore the quality values stored in input_file and write them to
131 |  * output_file as text. Exits the process if either file cannot be opened.
132 |  */
133 | void decode(char *input_file, char *output_file, struct qv_options_t *opts) {
134 | 	struct quality_file_t qv_info;
135 | 	struct hrtimer_t elapsed;
136 | 	FILE *compressed, *restored;
137 | 
138 | 	qv_info.alphabet = alloc_alphabet(ALPHABET_SIZE);
139 | 	qv_info.opts = opts;
140 | 
141 | 	start_timer(&elapsed);
142 | 	// Binary mode for the compressed stream, text mode for the restored lines
143 | 	compressed = fopen(input_file, "rb");
144 | 	restored = fopen(output_file, "wt");
145 | 	if (!(compressed && restored)) {
146 | 		perror("Unable to open input or output files");
147 | 		exit(1);
148 | 	}
149 | 
150 | 	// Codebooks are read first, then the coded values are decompressed
151 | 	read_codebooks(compressed, &qv_info);
152 | 	start_qv_decompression(restored, compressed, &qv_info);
153 | 
154 | 	fclose(restored);
155 | 	fclose(compressed);
156 | 	stop_timer(&elapsed);
157 | 
158 | 	if (opts->verbose)
159 | 		printf("Decoded %llu lines in %f seconds.\n", qv_info.lines, get_timer_interval(&elapsed));
160 | }
161 | 
162 | /**
163 |  * Print the command-line help to stdout
164 |  * @param name Program name string, shown in the first line of the help
165 |  */
166 | void usage(char *name) {
167 | 	printf("Usage: %s (options) [input file] [output file]\n"
168 | 		"Options are:\n"
169 | 		"   -q           : Store quality values in compressed file (default)\n"
170 | 		"   -x           : Extract quality values from compressed file\n"
171 | 		"   -f [ratio]   : Compress using [ratio] bits per bit of input entropy per symbol\n"
172 | 		"   -r [rate]    : Compress using fixed [rate] bits per symbol\n"
173 | 		"   -d [M|L|A]   : Optimize for MSE, Log(1+L1), L1 distortions, respectively (default: MSE)\n"
174 | 		"   -D [FILE]    : Optimize using the custom distortion matrix specified in FILE\n"
175 | 		"   -c [#]       : Compress using [#] clusters (default: 1)\n"
176 | 		"   -T [#]       : Use [#] as a threshold for cluster center movement (L2 norm) to declare a stable solution (default: 4).\n"
177 | 		"   -u [FILE]    : Write the uncompressed lossy values to FILE (default: off)\n"
178 | 		"   -h           : Print this help\n"
179 | 		"   -s           : Print summary stats\n"
180 | 		"   -v           : Enable verbose output\n"
181 | 		"\nFor custom distortion matrices, a 72x72 matrix of values must be provided as the cost of reconstructing\n"
182 | 		"the x-th row as the y-th column, where x and y range from 0 to 71 (inclusive) corresponding to the possible\n"
183 | 		"Phred scores.\n", name);
184 | }
185 | 
186 | /**
187 |  *
188 |  */
189 | int main(int argc, char **argv) {
190 |     char *input_name = 0;
191 | 	char *output_name = 0;
192 | 	struct qv_options_t opts;
193 | 	uint32_t i;
194 | 
195 | 	uint8_t extract = 0;
196 | 	uint8_t file_idx = 0;
197 | 
198 | 	opts.verbose = 0;
199 | 	opts.stats = 0;
200 | 	opts.ratio = 0.5;
201 | 	opts.clusters = 1;
202 |     opts.uncompressed = 0;
203 |     opts.distortion = DISTORTION_MSE;
204 | 	opts.cluster_threshold = 4;
205 | 
206 | 	// No dependency, cross-platform command line parsing means no getopt
207 | 	// So we need to settle for less than optimal flexibility (no combining short opts, maybe that will be added later)
208 | 	i = 1;
209 | 	while (i < argc) {
210 | 		// Handle file names and reject any other untagged arguments
211 | 		if (argv[i][0] != '-') {
212 | 			switch (file_idx) {
213 | 				case 0:
214 | 					input_name = argv[i];
215 | 					file_idx = 1;
216 | 					break;
217 | 				case 1:
218 | 					output_name = argv[i];
219 | 					file_idx = 2;
220 | 					break;
221 | 				default:
222 | 					printf("Garbage argument \"%s\" detected.\n", argv[i]);
223 | 					usage(argv[0]);
224 | 					exit(1);
225 | 			}
226 | 			i += 1;
227 | 			continue;
228 | 		}
229 | 
230 | 		// Flags for options
231 | 		switch(argv[i][1]) {
232 | 			case 'x':
233 | 				extract = 1;
234 | 				i += 1;
235 | 				break;
236 | 			case 'q':
237 | 				extract = 0;
238 | 				i += 1;
239 | 				break;
240 | 			case 'f':
241 | 				extract = 0;
242 | 				opts.ratio = atof(argv[i+1]);
243 | 				opts.mode = MODE_RATIO;
244 | 				i += 2;
245 | 				break;
246 | 			case 'r':
247 | 				extract = 0;
248 | 				opts.ratio = atof(argv[i+1]);
249 | 				opts.mode = MODE_FIXED;
250 | 				i += 2;
251 | 				break;
252 | 			case 'c':
253 | 				opts.clusters = atoi(argv[i+1]);
254 | 				i += 2;
255 | 				break;
256 | 			case 'v':
257 | 				opts.verbose = 1;
258 | 				i += 1;
259 | 				break;
260 | 			case 'h':
261 | 				usage(argv[0]);
262 | 				exit(0);
263 | 			case 's':
264 | 				opts.stats = 1;
265 | 				i += 1;
266 | 				break;
267 |             case 'u':
268 |                 opts.uncompressed = 1;
269 |                 opts.uncompressed_name = argv[i+1];
270 |                 i += 2;
271 |                 break;
272 | 			case 'T':
273 | 				opts.cluster_threshold = atoi(argv[i+1]);
274 | 				i += 2;
275 | 				break;
276 |             case 'd':
277 |                 switch (argv[i+1][0]) {
278 |                     case 'M':
279 |                         opts.distortion = DISTORTION_MSE;
280 |                         break;
281 |                     case 'L':
282 |                         opts.distortion = DISTORTION_LORENTZ;
283 |                         break;
284 |                     case 'A':
285 |                         opts.distortion = DISTORTION_MANHATTAN;
286 |                         break;
287 |                     default:
288 |                         printf("Distortion measure not supported, using MSE.\n");
289 |                         break;
290 |                 }
291 |                 i += 2;
292 |                 break;
293 | 			case 'D':
294 | 				opts.distortion = DISTORTION_CUSTOM;
295 | 				opts.dist_file = argv[i+1];
296 | 				i += 2;
297 | 				break;
298 | 			default:
299 | 				printf("Unrecognized option -%c.\n", argv[i][1]);
300 | 				usage(argv[0]);
301 | 				exit(1);
302 | 		}
303 | 	}
304 | 
305 | 	if (file_idx != 2) {
306 | 		printf("Missing required filenames.\n");
307 | 		usage(argv[0]);
308 | 		exit(1);
309 | 	}
310 | 
311 | 	if (opts.verbose) {
312 | 		if (extract) {
313 | 			printf("%s will be decoded to %s.\n", input_name, output_name);
314 | 		}
315 | 		else {
316 | 			printf("%s will be encoded as %s.\n", input_name, output_name);
317 | 			if (opts.mode == MODE_RATIO)
318 | 				printf("Ratio mode selected, targeting %f compression ratio.\n", opts.ratio);
319 | 			else if (opts.mode == MODE_FIXED)
320 | 				printf("Fixed-rate mode selected, targeting %f bits per symbol.\n", opts.ratio);
321 | 			else if (opts.mode == MODE_FIXED_MSE)
322 | 				printf("Fixed-MSE mode selected, targeting %f average distortion per context.\n", opts.ratio);
323 | 
324 | 			switch (opts.distortion) {
325 | 				case DISTORTION_MSE:
326 | 					printf("MSE will be used as a distortion metric.\n");
327 | 					break;
328 | 				case DISTORTION_LORENTZ:
329 | 					printf("log(1+L1) will be used as a distortion metric.\n");
330 | 					break;
331 | 				case DISTORTION_MANHATTAN:
332 | 					printf("L1 will be used as a distortion metric.\n");
333 | 					break;
334 | 				case DISTORTION_CUSTOM:
335 | 					printf("A custom distortion metric stored in %s will be used.\n", opts.dist_file);
336 | 					break;
337 | 			}
338 | 
339 | 			printf("Compression will use %d clusters, with a movement threshold of %.0f.\n", opts.clusters, opts.cluster_threshold);
340 | 		}
341 | 	}
342 | 
343 | 	if (extract) {
344 | 		decode(input_name, output_name, &opts);
345 | 	}
346 | 	else {
347 | 		encode(input_name, output_name, &opts);
348 | 	}
349 | 
350 | #ifdef _WIN32
351 | 	system("pause");
352 | #endif
353 | 
354 | 	return 0;
355 | }
356 | 
357 | 


--------------------------------------------------------------------------------
/src/pmf.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <string.h>
  3 | 
  4 | #include "pmf.h"
  5 | 
  6 | //#define log2(a) log(a)/log(2.0)
  7 | /**
  8 |  * Allocates the memory for an alphabet structure and fills the symbols
  9 |  * with a default list of 0 through size-1
 10 |  */
 11 | struct alphabet_t *alloc_alphabet(uint32_t size) {
 12 | 	symbol_t i;
 13 | 	struct alphabet_t *rtn = (struct alphabet_t *) calloc(1, sizeof(struct alphabet_t));
 14 | 	rtn->size = size;
 15 | 	rtn->symbols = (symbol_t *) calloc(size, sizeof(symbol_t));
 16 | 
 17 | 	for (i = 0; i < size; ++i) {
 18 | 		rtn->symbols[i] = i;
 19 | 	}
 20 | 	alphabet_compute_index(rtn);
 21 | 
 22 | 	return rtn;
 23 | }
 24 | 
 25 | /**
 26 |  * Makes a copy of the given alphabet
 27 |  */
 28 | struct alphabet_t *duplicate_alphabet(const struct alphabet_t *a) {
 29 | 	struct alphabet_t *rtn = (struct alphabet_t *) calloc(1, sizeof(struct alphabet_t));
 30 | 	rtn->size = a->size;
 31 | 	rtn->symbols = (symbol_t *) calloc(a->size, sizeof(symbol_t));
 32 | 
 33 | 	memcpy(rtn->symbols, a->symbols, a->size*sizeof(symbol_t));
 34 | 	alphabet_compute_index(rtn);
 35 | 
 36 | 	return rtn;
 37 | }
 38 | 
 39 | /**
 40 |  * Allocates a PMF structure for the given alphabet, but it does not copy the alphabet
 41 |  */
 42 | struct pmf_t *alloc_pmf(const struct alphabet_t *alphabet) {
 43 | 	struct pmf_t *rtn = (struct pmf_t *) calloc(1, sizeof(struct pmf_t));
 44 | 	rtn->alphabet = alphabet;
 45 | 	rtn->pmf = (double *) calloc(alphabet->size, sizeof(double));
 46 | 	rtn->counts = (uint32_t *) calloc(alphabet->size, sizeof(uint32_t));
 47 | 	return rtn;
 48 | }
 49 | 
 50 | /**
 51 |  * Allocates an array for tracking a list of PMFs along with the underlying PMFs
 52 |  */
 53 | struct pmf_list_t *alloc_pmf_list(uint32_t size, const struct alphabet_t *alphabet) {
 54 | 	uint32_t i;
 55 | 	struct pmf_list_t *rtn = (struct pmf_list_t *) calloc(1, sizeof(struct pmf_list_t));
 56 | 	rtn->size = size;
 57 | 	rtn->pmfs = (struct pmf_t **) calloc(size, sizeof(struct pmf_t *));
 58 | 	
 59 | 	for (i = 0; i < size; ++i) {
 60 | 		rtn->pmfs[i] = alloc_pmf(alphabet);
 61 | 	}
 62 | 
 63 | 	return rtn;
 64 | }
 65 | 
 66 | /**
 67 |  * Frees an alphabet
 68 |  */
 69 | void free_alphabet(struct alphabet_t *alphabet) {
 70 | 	free(alphabet->symbols);
 71 | 	free(alphabet->indexes);
 72 | 	free(alphabet);
 73 | }
 74 | 
 75 | /**
 76 |  * Frees a PMF
 77 |  */
 78 | void free_pmf(struct pmf_t *pmf) {
 79 | 	free(pmf->counts);
 80 | 	free(pmf);
 81 | }
 82 | 
 83 | /**
 84 |  * Frees a list of PMFs
 85 |  */
 86 | void free_pmf_list(struct pmf_list_t *pmfs) {
 87 | 	uint32_t i;
 88 | 	for (i = 0; i < pmfs->size; ++i) {
 89 | 		free_pmf(pmfs->pmfs[i]);
 90 | 	}
 91 | 	free(pmfs->pmfs);
 92 | 	free(pmfs);
 93 | }
 94 | 
 95 | /**
 96 |  * Determine if a pmf is valid (if it sums to 1, within some tolerance)
 97 |  */
 98 | uint32_t is_pmf_valid(struct pmf_t *pmf) {
 99 | 	double sum = 0;
100 | 	uint32_t i;
101 | 
102 | 	if (!pmf->pmf_ready)
103 | 		recalculate_pmf(pmf);
104 | 
105 | 	for (i = 0; i < pmf->alphabet->size; ++i) {
106 | 		sum += pmf->pmf[i];
107 | 	}
108 | 
109 | 	if (fabs(sum - 1.0) < 0.0001)
110 | 		return 1;
111 | 	return 0;
112 | }
113 | 
114 | /**
115 |  * Gets the probability for a specific location, triggering lazy re-eval if
116 |  * necessary
117 |  */
118 | double get_probability(struct pmf_t *pmf, uint32_t idx) {
119 | 	if (!pmf->pmf_ready)
120 | 		recalculate_pmf(pmf);
121 | 	return pmf->pmf[idx];
122 | }
123 | 
124 | /**
125 |  * Gets the probability for a specific symbol, triggering lazy re-eval if
126 |  * necessary
127 |  */
128 | double get_symbol_probability(struct pmf_t *pmf, symbol_t symbol) {
129 | 	uint32_t idx = get_symbol_index(pmf->alphabet, symbol);
130 | 
131 | 	if (!pmf->pmf_ready)
132 | 		recalculate_pmf(pmf);
133 | 	if (idx != ALPHABET_SYMBOL_NOT_FOUND)
134 | 		return pmf->pmf[idx];
135 | 	return 0.0;
136 | }
137 | 
138 | /**
139 |  * Calculate the entropy of this pmf in bits
140 |  */
141 | double get_entropy(struct pmf_t *pmf) {
142 | 	double entropy = 0.0;
143 | 	uint32_t i = 0;
144 | 
145 | 	if (!pmf->pmf_ready)
146 | 		recalculate_pmf(pmf);
147 | 
148 | 	for (i = 0; i < pmf->alphabet->size; ++i) {
149 | 		if (pmf->pmf[i] > 0.0) {
150 | 			entropy -= pmf->pmf[i] * log2(pmf->pmf[i]);
151 | 		}
152 | 	}
153 | 
154 | 	return entropy;
155 | }
156 | 
157 | /**
158 |  * Calculates the Kullbeck-Leibler Divergence between two PMFs, p and q, as D(p||q)
159 |  */
160 | double get_kl_divergence(struct pmf_t *p, struct pmf_t *q) {
161 | 	double d = 0.0;
162 | 	uint32_t i;
163 | 	
164 | 	if (p->alphabet != q->alphabet)
165 | 		return NAN;
166 | 
167 | 	if (!p->pmf_ready)
168 | 		recalculate_pmf(p);
169 | 	if (!q->pmf_ready)
170 | 		recalculate_pmf(q);
171 | 
172 | 	for (i = 0; i < p->alphabet->size; ++i) {
173 | 		if (q->pmf[i] > 0) {
174 | 			if (p->pmf[i] > 0) {
175 | 				d += p->pmf[i] * log2(p->pmf[i] / q->pmf[i]);
176 | 			}
177 | 		}
178 | 	}
179 | 
180 | 	return d;
181 | }
182 | 
183 | /**
184 |  * Combine two PMFs with two weight parameters to scale each before adding. This
185 |  * operates based on the probabilities, not the counts, so it is suitable for use
186 |  * in calculating the law of total probability: p(a)p(X|Y=a) + p(b)p(X|Y=b) when
187 |  * the empirical distributions do not contain the same number of observations
188 |  */
189 | struct pmf_t *combine_pmfs(struct pmf_t *a, struct pmf_t *b, double weight_a, double weight_b, struct pmf_t *result) {
190 | 	uint32_t i;
191 | 
192 | 	if (a->alphabet != b->alphabet || a->alphabet != result->alphabet)
193 | 		return NULL;
194 | 	
195 | 	if (!a->pmf_ready)
196 | 		recalculate_pmf(a);
197 | 	if (!b->pmf_ready)
198 | 		recalculate_pmf(b);
199 | 
200 | 	for (i = 0; i < a->alphabet->size; ++i) {
201 | 		result->pmf[i] = weight_a * a->pmf[i] + weight_b * b->pmf[i];
202 | 	}
203 | 	result->pmf_ready = 1;
204 | 	return result;
205 | }
206 | 
207 | /**
208 |  * When counting symbols, this handles incrementing everything for the given
209 |  * index
210 |  */
211 | void pmf_increment(struct pmf_t *pmf, uint32_t index) {
212 | 	pmf->counts[index] += 1;
213 | 	pmf->total += 1;
214 | }
215 | 
216 | /**
217 |  * Recalculates the PMF as a series of doubles from the empirical counts and total
218 |  */
219 | void recalculate_pmf(struct pmf_t *pmf) {
220 | 	uint32_t i;
221 | 	double total = (double) pmf->total;
222 | 
223 | 	pmf->pmf_ready = 1;
224 | 	if (pmf->total == 0)
225 | 		return;
226 | 	
227 | 	for (i = 0; i < pmf->alphabet->size; ++i) {
228 | 		pmf->pmf[i] = ((double) pmf->counts[i]) / total;
229 | 	}
230 | }
231 | 
232 | /**
233 |  * Renormalizes a PMF if it is nonzero
234 |  */
235 | void renormalize_pmf(struct pmf_t *pmf) {
236 | 	double total = 0;
237 | 	uint32_t i;
238 | 
239 | 	// PMFs still in counts form never need renormalization
240 | 	if (!pmf->pmf_ready)
241 | 		return;
242 | 
243 | 	// Find total
244 | 	for (i = 0; i < pmf->alphabet->size; ++i) {
245 | 		total += pmf->pmf[i];
246 | 	}
247 | 
248 | 	// If nonzero, scale every entry to ensure we sum to 1
249 | 	if (total > 0) {
250 | 		for (i = 0; i < pmf->alphabet->size; ++i) {
251 | 			pmf->pmf[i] = pmf->pmf[i] / total;
252 | 		}
253 | 	}
254 | }
255 | 
256 | /**
257 |  * Converts a PMF that is stored as a series of doubles back to the counts representation,
258 |  * or alternatively this can be viewed as quantizing it into a fixed point representation in
259 |  * 0.m format
260 |  */
261 | void pmf_to_counts(struct pmf_t *pmf, uint32_t m) {
262 | 	uint32_t i;
263 | 	double scale = ((1 << m) - 1);
264 | 
265 | 	pmf->total = 0;
266 | 	for (i = 0; i < pmf->alphabet->size; ++i) {
267 | 		pmf->counts[i] = (uint32_t) (pmf->counts[i] * scale);
268 | 		pmf->total += pmf->counts[i];
269 | 	}
270 | }
271 | 
272 | /**
273 |  * Zeros out the counts and probabilities for a PMF to let us reuse the same memory allocation
274 |  */
275 | void clear_pmf(struct pmf_t *pmf) {
276 | 	memset(pmf->counts, 0, pmf->alphabet->size * sizeof(uint32_t));
277 | 	memset(pmf->pmf, 0, pmf->alphabet->size * sizeof(double));
278 | 	pmf->pmf_ready = 0;
279 | 	pmf->total = 0;
280 | }
281 | 
282 | /**
283 |  * Zeros out every pmf in the given list, so we can reuse the entire pmf list without
284 |  * deallocating/reallocating memory
285 |  */
286 | void clear_pmf_list(struct pmf_list_t *list) {
287 | 	uint32_t i;
288 | 	for (i = 0; i < list->size; ++i) {
289 | 		clear_pmf(list->pmfs[i]);
290 | 	}
291 | }
292 | 
293 | /**
294 |  * Determines if the given alphabet contains the given symbol
295 |  */
296 | uint32_t alphabet_contains(const struct alphabet_t *alphabet, symbol_t symbol) {
297 | 	return alphabet->indexes[symbol] != ALPHABET_SYMBOL_NOT_FOUND ? 1 : 0;
298 | }
299 | 
300 | /**
301 |  * Looks up the index of a symbol in the given alphabet, which may be useful
302 |  * if the alphabet doesn't start at zero, has gaps, etc.
303 |  */
304 | uint32_t get_symbol_index(const struct alphabet_t *alphabet, symbol_t symbol) {
305 | 	return alphabet->indexes[symbol];
306 | }
307 | 
308 | /**
309 |  * Finds the unique set of symbols across both input alphabets and creates an
310 |  * output alphabet
311 |  */
312 | void alphabet_union(const struct alphabet_t *restrict a, const struct alphabet_t *restrict b, struct alphabet_t *result) {
313 | 	symbol_t *sym = (symbol_t *) _alloca((a->size+b->size)*sizeof(symbol_t));
314 | 	uint32_t i = 0;
315 | 	uint32_t j = 0;
316 | 	uint32_t k = 0;
317 | 
318 | 	// Combine with a merge algorithm since alphabets are required to be sorted
319 | 	while (i < a->size && j < b->size) {
320 | 		if (a->symbols[i] < b->symbols[j]) {
321 | 			sym[k] = a->symbols[i];
322 | 			i += 1;
323 | 		}
324 | 		else if (a->symbols[i] == b->symbols[j]) {
325 | 			sym[k] = a->symbols[i];
326 | 			i += 1;
327 | 			j += 1;
328 | 		}
329 | 		else {
330 | 			sym[k] = b->symbols[j];
331 | 			j += 1;
332 | 		}
333 | 		k += 1;
334 | 	}
335 | 
336 | 	// Tail of the merge
337 | 	while (i < a->size) {
338 | 		sym[k] = a->symbols[i];
339 | 		k += 1;
340 | 		i += 1;
341 | 	}
342 | 	while (j < b->size) {
343 | 		sym[k] = b->symbols[j];
344 | 		k += 1;
345 | 		j += 1;
346 | 	}
347 | 
348 | 	// If we already have an output array, replace it with a new one
349 | 	if (result->symbols)
350 | 		free(result->symbols);
351 | 	result->symbols = (symbol_t *) calloc(k, sizeof(symbol_t));
352 | 
353 | 	// Copy over temporary data
354 | 	memcpy(result->symbols, sym, k*sizeof(symbol_t));
355 | 	result->size = k;
356 | 	alphabet_compute_index(result);
357 | }
358 | 
359 | /**
360 |  * Computes the index table (reverse mapping of symbols in the alphabet)
361 |  * that is used to speed up searches for symbols). This isn't a proper
362 |  * hash table and it will consume exponential memory if symbol_t changes
363 |  * size, so be careful
364 |  */
365 | void alphabet_compute_index(struct alphabet_t *A) {
366 | 	uint32_t i;
367 | 
368 | 	if (A->indexes)
369 | 		free(A->indexes);
370 | 	
371 | 	// Cheating but whatever
372 | 	A->indexes = (uint32_t *) calloc(ALPHABET_INDEX_SIZE_HINT, sizeof(uint32_t));
373 | 
374 | 	// Fill gaps in the table with an appropriate index so we can use this for search too
375 | 	for (i = 0; i < ALPHABET_INDEX_SIZE_HINT; ++i) {
376 | 		A->indexes[i] = ALPHABET_SYMBOL_NOT_FOUND;
377 | 	}
378 | 
379 | 	for (i = 0; i < A->size; ++i) {
380 | 		A->indexes[A->symbols[i]] = i;
381 | 	}
382 | }
383 | 
384 | /**
385 |  * Displays an alphabet as "(index): 'character' <number>" one per line
386 |  */
387 | void print_alphabet(const struct alphabet_t *alphabet) {
388 | 	uint32_t i;
389 | 	for (i = 0; i < alphabet->size; ++i) {
390 | 		printf("(%d): '%c' <%d>\n", i, alphabet->symbols[i], alphabet->symbols[i]);
391 | 	}
392 | }
393 | 
394 | /**
395 |  * Displays a PMF
396 |  */
397 | void print_pmf(struct pmf_t *pmf) {
398 | 	uint32_t i;
399 | 
400 | 	if (!pmf->pmf_ready)
401 | 		recalculate_pmf(pmf);
402 | 
403 | 	for (i = 0; i < pmf->alphabet->size; ++i) {
404 | 		printf("<%d>: %.5f (%d/%d)\n", pmf->alphabet->symbols[i], pmf->pmf[i], pmf->counts[i], pmf->total);
405 | 	}
406 | }
407 | 


--------------------------------------------------------------------------------
/src/codebook.c:
--------------------------------------------------------------------------------
  1 | #include "codebook.h"
  2 | #include "lines.h"
  3 | #include "cluster.h"
  4 | 
  5 | #include <stdio.h>
  6 | #include <assert.h>
  7 | 
  8 | #if defined(LINUX) || defined(__APPLE__)
  9 | 	#include <arpa/inet.h>
 10 | #endif
 11 | 
 12 | /**
 13 |  * To compute stats for the training data, we will need a set of conditional PMFs, one
 14 |  * per column
 15 |  * @param alphabet The symbol alphabet for each column
 16 |  * @param columns The number of columns to allocate conditional PMFs for
 17 |  */
 18 | struct cond_pmf_list_t *alloc_conditional_pmf_list(const struct alphabet_t *alphabet, uint32_t columns) {
 19 | 	uint32_t count = 1 + alphabet->size*(columns-1);
 20 | 	uint32_t i;
 21 | 	struct cond_pmf_list_t *list = (struct cond_pmf_list_t *) calloc(1, sizeof(struct cond_pmf_list_t));
 22 | 
 23 | 	// We need one array of PMF pointers that will index into the buffer allocated above, for the columns
 24 | 	list->columns = columns;
 25 | 	list->alphabet = alphabet;
 26 | 	list->pmfs = (struct pmf_t **) calloc(count, sizeof(struct pmf_t *));
 27 | 
 28 | 	// All PMFs are stored in a flat array, the accessor function will resolve a PMF's address
 29 | 	for (i = 0; i < count; ++i) {
 30 | 		list->pmfs[i] = alloc_pmf(alphabet);
 31 | 	}
 32 | 
 33 | 	return list;
 34 | }
 35 | 
 36 | /**
 37 |  * Deallocate the PMF list given and unallocate the two allocated memory blocks
 38 |  * @param list The conditional pmf list to deallocate
 39 |  */
 40 | void free_conditional_pmf_list(struct cond_pmf_list_t *list) {
 41 | 	uint32_t count = 1 + list->alphabet->size * (list->columns - 1);
 42 | 	uint32_t i;
 43 | 
 44 | 	for (i = 0; i < count; ++i) {
 45 | 		free_pmf(list->pmfs[i]);
 46 | 	}
 47 | 	free(list);
 48 | 
 49 | 	free_pmf_list(list->marginal_pmfs);
 50 | }
 51 | 
 52 | /**
 53 |  * Allocate the quantizer list structure and the first level of array based on columns
 54 |  * @param columns The number of columns for which we have quantizers
 55 |  * @return Pointer to conditional quantizer list structure
 56 |  */
 57 | struct cond_quantizer_list_t *alloc_conditional_quantizer_list(uint32_t columns) {
 58 | 	struct cond_quantizer_list_t *rtn = (struct cond_quantizer_list_t *) calloc(1, sizeof(struct cond_quantizer_list_t));
 59 | 	rtn->columns = columns;
 60 | 	rtn->input_alphabets = (struct alphabet_t **) calloc(columns, sizeof(struct alphabet_t *));
 61 | 	rtn->q = (struct quantizer_t ***) calloc(columns, sizeof(struct quantizer_t **));
 62 |     rtn->ratio = (double **) calloc(columns, sizeof(double *));
 63 | 	rtn->qratio = (uint8_t **) calloc(columns, sizeof(uint8_t *));
 64 | 	return rtn;
 65 | }
 66 | 
 67 | /**
 68 |  * Deallocate the quantizer list as well as any alphabets or pmfs that are stored
 69 |  * @param list The conditional quantizer list to deallocate
 70 |  */
 71 | void free_cond_quantizer_list(struct cond_quantizer_list_t *list) {
 72 | 	uint32_t i, j;
 73 | 
 74 | 	for (i = 0; i < list->columns; ++i) {
 75 | 		if (list->q[i]) {
 76 | 			for (j = 0; j < list->input_alphabets[i]->size; ++j) {
 77 | 				if (list->q[i][j])
 78 | 					free_quantizer(list->q[i][j]);
 79 | 			}
 80 | 			free_alphabet(list->input_alphabets[i]);
 81 | 			free(list->q[i]);
 82 | 			free(list->ratio[i]);
 83 | 			free(list->qratio[i]);
 84 | 		}
 85 | 	}
 86 | 
 87 | 	free(list->qratio);
 88 | 	free(list->ratio);
 89 | 	free(list->q);
 90 | 	free(list->input_alphabets);
 91 | 	free(list);
 92 | }
 93 | 
 94 | /**
 95 |  * Initialize the information within a quantizer for the given column. This can't be done
 96 |  * at allocation time because we don't know everything about this column until we get here
 97 |  * during the optimization process
 98 |  * @param list The conditional quantizer list to update
 99 |  * @param column The column to initialize
100 |  * @param input_union The alphabet of all possible left context symbols
101 |  */
102 | void cond_quantizer_init_column(struct cond_quantizer_list_t *list, uint32_t column, const struct alphabet_t *input_union) {
103 | 	list->input_alphabets[column] = duplicate_alphabet(input_union);
104 | 
105 | 	// Low and high quantizer per element of the input union
106 | 	list->q[column] = (struct quantizer_t **) calloc(input_union->size*2, sizeof(struct quantizer_t *));
107 | 
108 | 	// One ratio per element of input union
109 | 	list->ratio[column] = (double *) calloc(input_union->size, sizeof(double));
110 | 	list->qratio[column] = (uint8_t *) calloc(input_union->size, sizeof(uint8_t));
111 | }
112 | 
113 | /**
114 |  * Find a PMF for a specific column with the specific previous value
115 |  */
116 | struct pmf_t *get_cond_pmf(struct cond_pmf_list_t *list, uint32_t column, symbol_t prev) {
117 | 	if (column == 0)
118 | 		return list->pmfs[0];
119 | 	return list->pmfs[1 + (column-1)*list->alphabet->size + prev];
120 | }
121 | 
122 | /**
123 |  * Get a quantizer by its indexed location within the quantizer list for a column
124 |  */
125 | struct quantizer_t *get_cond_quantizer_indexed(struct cond_quantizer_list_t *list, uint32_t column, uint32_t index) {
126 | 	return list->q[column][index];
127 | }
128 | 
129 | /**
130 |  * Get a quantizer by its left context symbol
131 |  */
132 | struct quantizer_t *get_cond_quantizer(struct cond_quantizer_list_t *list, uint32_t column, symbol_t prev) {
133 | 	uint32_t idx = get_symbol_index(list->input_alphabets[column], prev);
134 | 	if (idx != ALPHABET_SYMBOL_NOT_FOUND)
135 | 		return get_cond_quantizer_indexed(list, column, idx);
136 | 	return NULL;
137 | }
138 | 
139 | /**
140 |  * Stores the given quantizers at the appropriate index corresponding to the left context symbol given
141 |  * for the specific column
142 |  */
143 | void store_cond_quantizers(struct quantizer_t *restrict lo, struct quantizer_t *restrict hi, double ratio, struct cond_quantizer_list_t *list, uint32_t column, symbol_t prev) {
144 | 	uint32_t idx = get_symbol_index(list->input_alphabets[column], prev);
145 | 	store_cond_quantizers_indexed(lo, hi, ratio, list, column, idx);
146 | }
147 | 
148 | /**
149 |  * Stores the given quantizers directly at the given index based on the previous context symbol. Faster when
150 |  * we know what the previous index was in addition to what the previous symbol was
151 |  */
152 | void store_cond_quantizers_indexed(struct quantizer_t *restrict lo, struct quantizer_t *restrict hi, double ratio, struct cond_quantizer_list_t *list, uint32_t column, uint32_t idx) {
153 |     list->q[column][2*idx] = lo;
154 | 	list->q[column][2*idx + 1] = hi;
155 |     list->ratio[column][idx] = ratio;
156 | 	list->qratio[column][idx] = (uint8_t) (ratio * 128.);
157 | }
158 | 
159 | /**
160 |  * Selects a quantizer for the given column from the quantizer list with the appropriate ratio
161 |  */
162 | struct quantizer_t *choose_quantizer(struct cond_quantizer_list_t *list, struct well_state_t *well, uint32_t column, symbol_t prev, uint32_t *q_idx) {
163 | 	uint32_t idx = get_symbol_index(list->input_alphabets[column], prev);
164 | 	assert(idx != ALPHABET_SYMBOL_NOT_FOUND);
165 | 	if (well_1024a_bits(well, 7) >= list->qratio[column][idx]) {
166 |         *q_idx = 2*idx+1;
167 | 		return list->q[column][2*idx+1];
168 | 	}
169 |     *q_idx = 2*idx;
170 | 	return list->q[column][2*idx];
171 | }
172 | 
173 | /**
174 |  * Converts a quality score into a state encoded value, which is the same as doing a symbol index lookup
175 |  * in the output alphabet. This needs to be inlined.
176 |  */
177 | uint32_t find_state_encoding(struct quantizer_t *q, symbol_t value) {
178 | 	return get_symbol_index(q->output_alphabet, value);
179 | }
180 | 
181 | /**
182 |  * Calculates the statistics, producing a conditional pmf list per cluster and storing
183 |  * it directly inside the cluster in question
184 |  */
185 | void calculate_statistics(struct quality_file_t *info) {
186 | 	uint32_t block, line_idx, column;
187 | 	uint32_t j;
188 | 	uint8_t c;
189 | 	struct line_t *line;
190 | 	struct cluster_t *cluster;
191 | 	struct cond_pmf_list_t *pmf_list;
192 | 
193 | 	for (block = 0; block < info->block_count; ++block) {
194 | 		for (line_idx = 0; line_idx < info->blocks[block].count; ++line_idx) {
195 | 			line = &info->blocks[block].lines[line_idx];
196 | 			cluster = &info->clusters->clusters[line->cluster];
197 | 			pmf_list = cluster->training_stats;
198 | 
199 | 			// First, find conditional PMFs
200 | 			pmf_increment(get_cond_pmf(pmf_list, 0, 0), line->m_data[0] - 33);
201 | 			for (column = 1; column < info->columns; ++column) {
202 | 				pmf_increment(get_cond_pmf(pmf_list, column, line->m_data[column-1] - 33), line->m_data[column] - 33);
203 | 			}
204 | 		}
205 | 	}
206 | 
207 | 	// Then find unconditional PMFs for each cluster once the full conditional ones are ready
208 | 	for (c = 0; c < info->cluster_count; ++c) {
209 | 		cluster = &info->clusters->clusters[c];
210 | 		pmf_list = cluster->training_stats;
211 | 
212 | 		pmf_list->marginal_pmfs = alloc_pmf_list(info->columns, pmf_list->alphabet);
213 | 		combine_pmfs(get_cond_pmf(pmf_list, 0, 0), pmf_list->marginal_pmfs->pmfs[0], 1.0, 0.0, pmf_list->marginal_pmfs->pmfs[0]);
214 | 		for (column = 1; column < info->columns; ++column) {
215 | 			for (j = 0; j < pmf_list->alphabet->size; ++j) {
216 | 				combine_pmfs(pmf_list->marginal_pmfs->pmfs[column], get_cond_pmf(pmf_list, column, j), 1.0, get_probability(pmf_list->marginal_pmfs->pmfs[column-1], j), pmf_list->marginal_pmfs->pmfs[column]);
217 | 			}
218 | 		}
219 | 	}
220 | }
221 | 
222 | /**
223 |  * Searches (linearly) for the pair of quantizers that surround the target entropy by guessing and checking the number of states
224 |  * @param pmf The pmf that is to be quantized
225 |  * @param dist The distortion metric to quantize against
226 |  * @param lo Place to store the pointer to the low quantizer
227 |  * @param hi Place to store the pointer to the high quantizer
228 |  * @return double The ratio necessary to combine these two quantizers to achieve the target
229 |  */
230 | double optimize_for_entropy(struct pmf_t *pmf, struct distortion_t *dist, double target, struct quantizer_t **lo, struct quantizer_t **hi) {
231 | 	struct quantizer_t *q_temp;
232 | 	double lo_entropy, hi_entropy;
233 | 	struct pmf_t *pmf_temp = alloc_pmf(pmf->alphabet);
234 | 	uint32_t states = 1;
235 | 
236 | 	if (target == 0.0) {
237 | 		*lo = generate_quantizer(pmf, dist, 1);
238 | 		*hi = generate_quantizer(pmf, dist, 1);
239 | 		
240 | 		free_pmf(pmf_temp);
241 | 		return 1.0;
242 | 	}
243 | 	
244 | 	q_temp = generate_quantizer(pmf, dist, states);
245 | 	hi_entropy = get_entropy(apply_quantizer(q_temp, pmf, pmf_temp));
246 | 	*hi = q_temp;
247 | 	*lo = alloc_quantizer(pmf->alphabet);
248 | 
249 | 	do {
250 | 		free_quantizer(*lo);
251 | 		*lo = *hi;
252 | 		lo_entropy = hi_entropy;
253 | 		
254 | 		states += 1;
255 | 		q_temp = generate_quantizer(pmf, dist, states);
256 | 		hi_entropy = get_entropy(apply_quantizer(q_temp, pmf, pmf_temp));
257 | 		*hi = q_temp;
258 | 	} while (hi_entropy < target && states < pmf->alphabet->size);
259 | 
260 | 	free_pmf(pmf_temp);
261 | 
262 | 	// Assign ratio based on how we did against our entropy target
263 | 	if (hi_entropy < target)
264 | 		return 0.0;
265 | 	else if (lo_entropy >= target || hi_entropy == lo_entropy)
266 | 		return 1.0;
267 | 	else
268 | 		return (target - hi_entropy) / (lo_entropy - hi_entropy);
269 | }
270 | 
271 | /**
272 |  * 
273 |  */
274 | void compute_qpmf_quan_list(struct quantizer_t *q_lo, struct quantizer_t *q_hi, struct pmf_list_t *q_x_pmf, double ratio, struct alphabet_t *q_output_union) {
275 |     symbol_t x;
276 |     uint32_t q_symbol, idx;
277 |     
278 |     for (x = 0; x < q_lo->alphabet->size; x++) {
279 |         for (idx = 0; idx < q_output_union->size; idx++) {
280 |             q_symbol = q_output_union->symbols[idx];
281 |             
282 |             if (q_lo->q[x] == q_symbol)
283 |                 q_x_pmf->pmfs[x]->pmf[idx] += ratio;
284 |             
285 |             if (q_hi->q[x] == q_symbol)
286 |                 q_x_pmf->pmfs[x]->pmf[idx] += (1-ratio);
287 |         }
288 |     }
289 | }
290 | 
291 | void compute_qpmf_list(struct pmf_list_t *qpmf_list, struct cond_pmf_list_t *in_pmfs, uint32_t column, struct pmf_list_t *prev_qpmf_list, struct alphabet_t * q_alphabet_union, struct alphabet_t * prev_q_alphabet_union, struct cond_quantizer_list_t *q_list) {
292 |     symbol_t x;
293 |     double p_q_xq = 0.0, p_temp = 0.0;
294 |     uint32_t q_symbol, idx, k, j;
295 |     struct quantizer_t *q_hi, *q_lo;
296 |     
297 |     // compute P(Q_i | X_i)
298 |     for (k = 0; k < qpmf_list->size; k++) {
299 |         // compute P(Q_i | X_i = k)
300 |         for (idx = 0; idx < q_alphabet_union->size; idx++) {
301 |             q_symbol = q_alphabet_union->symbols[idx];
302 |             
303 |             // compute P(Q_i = q_symbol | X_i = k)
304 |             for (j = 0; j < prev_q_alphabet_union->size; j++) {
305 |                 p_q_xq = 0.0;
306 | 
307 |                 // extract the jth quantizers of X_i;
308 |                 q_lo = get_cond_quantizer_indexed(q_list, column-1, 2*j);
309 |                 q_hi = get_cond_quantizer_indexed(q_list, column-1, (2*j)+1);
310 |                 
311 |                 // Given the quantizers q_lo and q_hi, compute P(Q_i = q_symbol|X_i = k ,Q_{i-1} chooses the jth quantizer of X_i)
312 |                 if (q_lo->q[k] == q_symbol)
313 |                     p_q_xq += q_lo->ratio;
314 |                 
315 |                 if (q_hi->q[k] == q_symbol)
316 |                     p_q_xq += q_hi->ratio;
317 |                 
318 |                 p_temp = 0;
319 |                 for (x = 0; x < prev_qpmf_list->size; ++x) {
320 |                     p_temp += get_probability(prev_qpmf_list->pmfs[x], j) * get_probability(get_cond_pmf(in_pmfs, column-1, x), k) * get_probability(in_pmfs->marginal_pmfs->pmfs[column-2], x);
321 |                 }
322 |                 qpmf_list->pmfs[k]->pmf[idx] += p_q_xq * p_temp;
323 |             }
324 |         }
325 |         
326 |         // Normilize P(Q_i | X_i = k)
327 |         qpmf_list->pmfs[k]->pmf_ready = 1;
328 |         renormalize_pmf(qpmf_list->pmfs[k]);
329 |     }
330 | }
331 | 
332 | void compute_xpmf_list(struct pmf_list_t *qpmf_list, struct cond_pmf_list_t *in_pmfs, uint32_t column, struct pmf_list_t *xpmf_list, struct alphabet_t * q_alphabet_union){
333 |     symbol_t x;
334 |     uint32_t idx, k;
335 |     
336 |     // compute P(X_{i+1} | Q_i)
337 |     for (idx = 0; idx < q_alphabet_union->size; idx++) {
338 |         // compute P(X_{i+1} | Q_i = q)
339 |         for (k = 0; k < qpmf_list->size; k++) {
340 |             // compute P(X_{i+1} = k | Q_i = q)
341 |             for (x = 0; x < qpmf_list->size; ++x) {
342 |                 xpmf_list->pmfs[idx]->pmf[k] += get_probability(qpmf_list->pmfs[x], idx) * get_probability(get_cond_pmf(in_pmfs, column, x), k) * get_probability(in_pmfs->marginal_pmfs->pmfs[column-1], x);
343 |             }
344 |         }
345 |         // Normilize P(X_{i+1} | Q_i = q)
346 |         xpmf_list->pmfs[idx]->pmf_ready = 1;
347 |         renormalize_pmf(xpmf_list->pmfs[idx]);
348 |     }
349 | }
350 | 
351 | /**
352 |  * Builds the conditional quantizer codebook of every cluster of already
353 |  * clustered data and stores it in that cluster's qlist field. Column 0 gets a
354 |  * single low/high quantizer pair; every later column gets one pair per symbol
355 |  * in the union of the output alphabets of the previous column's quantizers.
356 |  * @param info Quality file state: alphabet, opts, dist, columns, cluster_count
357 |  *             and each cluster's training_stats are read; each qlist is set
358 |  */
359 | void generate_codebooks(struct quality_file_t *info) {
360 | 	// Weight of the low quantizer in the current pair (high gets 1-ratio)
361 | 	double ratio;
362 | 	// Distortion estimate; accumulated below but not consumed in this function
363 | 	double total_mse;
364 | 	uint32_t column, j;
365 | 	uint8_t cluster_id;
366 | 
367 | 	// Codebook being built for the current cluster
368 | 	struct cond_quantizer_list_t *q_list;
369 | 
370 | 	// Constant alphabet of all possible input symbols
371 | 	const struct alphabet_t *A = info->alphabet;
372 | 
373 | 	// Most recently optimized quantizer pair
374 | 	struct quantizer_t *q_lo;
375 | 	struct quantizer_t *q_hi;
376 | 
377 | 	// P(X_{i+1}|Q_i) for the column being processed
378 | 	struct pmf_list_t *xpmf_list;
379 | 
380 | 	// P(Q_i|X_i) for the current and the previous column
381 | 	struct pmf_list_t *qpmf_list;
382 | 	struct pmf_list_t *prev_qpmf_list;
383 | 
384 | 	// Union of quantizer output alphabets for the current and previous column
385 | 	struct alphabet_t *q_output_union;
386 | 	struct alphabet_t *q_prev_output_union;
387 | 	struct cond_pmf_list_t *in_pmfs;
388 | 	struct qv_options_t *opts = info->opts;
389 | 	struct distortion_t *dist = info->dist;
390 | 
391 | 	for (cluster_id = 0; cluster_id < info->cluster_count; ++cluster_id) {
392 | 		q_list = alloc_conditional_quantizer_list(info->columns);
393 | 		info->clusters->clusters[cluster_id].qlist = q_list;
394 | 		in_pmfs = info->clusters->clusters[cluster_id].training_stats;
395 | 
396 | 		// Column 0 is not conditional, so its context alphabet has one entry
397 | 		q_output_union = alloc_alphabet(1);
398 | 		cond_quantizer_init_column(q_list, 0, q_output_union);
399 | 		q_list->options = opts;
400 | 
401 | 		// Dummy PMF list; it is never read, only released later
402 | 		qpmf_list = alloc_pmf_list(A->size, q_output_union);
403 | 
404 | 		// Column zero: optimize directly against its unconditional PMF
405 | 		// @todo handle fixed mse target
406 | 		if (opts->mode == MODE_RATIO)
407 | 			ratio = optimize_for_entropy(get_cond_pmf(in_pmfs, 0, 0), dist, get_entropy(get_cond_pmf(in_pmfs, 0, 0))*opts->ratio, &q_lo, &q_hi);
408 | 		else
409 | 			ratio = optimize_for_entropy(get_cond_pmf(in_pmfs, 0, 0), dist, opts->ratio, &q_lo, &q_hi);
410 | 		q_lo->ratio = ratio;
411 | 		q_hi->ratio = 1-ratio;
412 | 		total_mse = ratio*q_lo->mse + (1-ratio)*q_hi->mse;
413 | 		store_cond_quantizers(q_lo, q_hi, ratio, q_list, 0, 0);
414 | 
415 | 		// Carry the column 0 alphabet and PMFs forward; nothing older to free yet
416 | 		q_prev_output_union = q_output_union;
417 | 		prev_qpmf_list = qpmf_list;
418 | 
419 | 		// Remaining columns are conditioned on the previous column's output
420 | 		for (column = 1; column < info->columns; column++) {
421 | 			// Union of the output alphabets of all quantizers of the previous column
422 | 			q_output_union = duplicate_alphabet(get_cond_quantizer_indexed(q_list, column-1, 0)->output_alphabet);
423 | 			for (j = 1; j < 2*q_prev_output_union->size; ++j) {
424 | 				alphabet_union(q_output_union, get_cond_quantizer_indexed(q_list, column-1, j)->output_alphabet, q_output_union);
425 | 			}
426 | 			cond_quantizer_init_column(q_list, column, q_output_union);
427 | 
428 | 			// Initialize the new pmfs
429 | 			qpmf_list = alloc_pmf_list(A->size, q_output_union);
430 | 			xpmf_list = alloc_pmf_list(q_output_union->size, A);
431 | 
432 | 			// Compute P(Q_i|X_i)
433 | 			if (column == 1)
434 | 				compute_qpmf_quan_list(q_lo, q_hi, qpmf_list, ratio, q_output_union);
435 | 			else
436 | 				compute_qpmf_list(qpmf_list, in_pmfs, column, prev_qpmf_list, q_output_union, q_prev_output_union, q_list);
437 | 
438 | 			// Compute P(X_{i+1}|Q_i)
439 | 			compute_xpmf_list(qpmf_list, in_pmfs, column, xpmf_list, q_output_union);
440 | 
441 | 			// One quantizer pair per possible previous output Q_i
442 | 			for (j = 0; j < q_output_union->size; ++j) {
443 | 				// @todo handle fixed mse target
444 | 				if (opts->mode == MODE_RATIO)
445 | 					ratio = optimize_for_entropy(xpmf_list->pmfs[j], dist, get_entropy(xpmf_list->pmfs[j])*opts->ratio, &q_lo, &q_hi);
446 | 				else
447 | 					ratio = optimize_for_entropy(xpmf_list->pmfs[j], dist, opts->ratio, &q_lo, &q_hi);
448 | 				q_lo->ratio = ratio;
449 | 				q_hi->ratio = 1-ratio;
450 | 				store_cond_quantizers_indexed(q_lo, q_hi, ratio, q_list, column, j);
451 | 
452 | 				// Uniform weighting is an approximation; an accurate value would weight by the probability of this pair being used
453 | 				total_mse += (ratio*q_lo->mse + (1-ratio)*q_hi->mse) / q_output_union->size;
454 | 			}
455 | 
456 | 			// Release the previous column's data; free_alphabet (as in read_codebook) rather than free() so the whole alphabet is released
457 | 			free_alphabet(q_prev_output_union);
458 | 			q_prev_output_union = q_output_union;
459 | 			free_pmf_list(prev_qpmf_list);
460 | 			prev_qpmf_list = qpmf_list;
461 | 			free_pmf_list(xpmf_list);
462 | 		}
463 | 
464 | 		// Final cleanup: the alphabet and PMF list kept by the last iteration (or by column 0)
465 | 		free_pmf_list(qpmf_list);
466 | 		free_alphabet(q_output_union);
467 | 	}
468 | }
469 | 
470 | /**
471 |  * Emits the header shared by all clusters, then every cluster's codebook.
472 |  * The header is 9 bytes: the cluster count (1 byte), followed by the column
473 |  * count and the line count as 32-bit integers passed through htonl (4 bytes
474 |  * each). No newline is written after the header.
475 |  */
476 | void write_codebooks(FILE *fp, struct quality_file_t *info) {
477 | 	uint32_t cluster;
478 | 	// Header values in the form they are written
479 | 	char count_byte = info->cluster_count;
480 | 	uint32_t net_columns = htonl(info->columns);
481 | 	uint32_t net_lines = htonl((uint32_t)info->lines);
482 | 
483 | 	// Header fields, written in the order listed above
484 | 	fwrite(&count_byte, sizeof(char), 1, fp);
485 | 	fwrite(&net_columns, sizeof(uint32_t), 1, fp);
486 | 	fwrite(&net_lines, sizeof(uint32_t), 1, fp);
487 | 
488 | 	// One codebook per cluster, in cluster index order
489 | 	for (cluster = 0; cluster < info->cluster_count; cluster++) {
490 | 		write_codebook(fp, info->clusters->clusters[cluster].qlist);
491 | 	}
492 | }
493 | 
494 | /**
495 |  * Writes one cluster's codebook so the decoder can rebuild the quantizers used
496 |  * during encoding. Each column contributes three newline-terminated lines:
497 |  *  1: one byte per context, holding that context's ratio plus 33
498 |  *  2: the low quantizer table of every context, in context index order
499 |  *  3: the high quantizer table of every context, in context index order
500 |  * Column 0 has one context, column i one per input_alphabets[i] symbol.
501 |  */
502 | void write_codebook(FILE *fp, struct cond_quantizer_list_t *quantizers) {
503 | 	uint32_t i, j, k, half;
504 | 	uint32_t columns = quantizers->columns;
505 | 	struct quantizer_t *q_temp = get_cond_quantizer_indexed(quantizers, 0, 0);
506 | 	uint32_t size = q_temp->alphabet->size;
507 | 	uint32_t contexts;
508 | 	uint32_t buflen = columns > size ? columns : size;
509 | 	char *eol = "\n";
510 | 	char *linebuf;
511 | 
512 | 	// linebuf stages the 2 byte first line, one table and every ratio line, so size it for the largest
513 | 	if (buflen < 2)
514 | 		buflen = 2;
515 | 	for (i = 1; i < columns; ++i)
516 | 		if (quantizers->input_alphabets[i]->size > buflen)
517 | 			buflen = quantizers->input_alphabets[i]->size;
518 | 	linebuf = (char *) _alloca(sizeof(char)*buflen);
519 | 
520 | 	// First line, ratio for zero context quantizer
521 | 	linebuf[0] = quantizers->qratio[0][0] + 33;
522 | 	linebuf[1] = eol[0];
523 | 	fwrite(linebuf, sizeof(char), 2, fp);
524 | 
525 | 	// Second line is low quantizer
526 | 	COPY_Q_TO_LINE(linebuf, q_temp->q, i, size);
527 | 	fwrite(linebuf, sizeof(char), size, fp);
528 | 	fwrite(eol, sizeof(char), 1, fp);
529 | 
530 | 	// Third line is high quantizer
531 | 	q_temp = get_cond_quantizer_indexed(quantizers, 0, 1);
532 | 	COPY_Q_TO_LINE(linebuf, q_temp->q, i, size);
533 | 	fwrite(linebuf, sizeof(char), size, fp);
534 | 	fwrite(eol, sizeof(char), 1, fp);
535 | 
536 | 	// Now for the rest of the columns, use the same format
537 | 	for (i = 1; i < columns; ++i) {
538 | 		// First a line containing the ratio of each previous context
539 | 		contexts = quantizers->input_alphabets[i]->size;
540 | 		for (j = 0; j < contexts; ++j)
541 | 			linebuf[j] = quantizers->qratio[i][j] + 33;
542 | 		fwrite(linebuf, sizeof(char), contexts, fp);
543 | 		fwrite(eol, sizeof(char), 1, fp);
544 | 
545 | 		// Then one line of low quantizers (half 0) and one of high quantizers (half 1), each in index order
546 | 		for (half = 0; half < 2; ++half) {
547 | 			for (j = 0; j < contexts; ++j) {
548 | 				q_temp = get_cond_quantizer_indexed(quantizers, i, 2*j + half);
549 | 				COPY_Q_TO_LINE(linebuf, q_temp->q, k, size);
550 | 				fwrite(linebuf, sizeof(char), size, fp);
551 | 			}
552 | 			fwrite(eol, sizeof(char), 1, fp);
553 | 		}
554 | 	}
555 | }
556 | 
557 | /**
558 |  * Reads the 9 byte header written by write_codebooks, allocates the cluster
559 |  * list and reads every cluster's codebook. On a short header an error is
560 |  * printed, cluster_count is zeroed and nothing else is read or allocated.
561 |  */
562 | void read_codebooks(FILE *fp, struct quality_file_t *info) {
563 | 	uint8_t j;
564 | 	unsigned char hdr[9];
565 | 
566 | 	if (fread(hdr, sizeof(unsigned char), 9, fp) != 9) {
567 | 		fprintf(stderr, "Unable to read codebook header\n");
568 | 		info->cluster_count = 0;
569 | 		return;
570 | 	}
571 | 
572 | 	// Unsigned bytes keep values above 127 from being sign extended. The 32 bit
573 | 	// fields were written via htonl, so the first byte of each is the most significant
574 | 	info->cluster_count = hdr[0];
575 | 	info->columns = ((uint32_t)hdr[1] << 24) | ((uint32_t)hdr[2] << 16) | ((uint32_t)hdr[3] << 8) | (uint32_t)hdr[4];
576 | 	info->lines = ((uint32_t)hdr[5] << 24) | ((uint32_t)hdr[6] << 16) | ((uint32_t)hdr[7] << 8) | (uint32_t)hdr[8];
577 | 	// Can't allocate clusters until we know how many columns there are
578 | 	info->clusters = alloc_cluster_list(info);
579 | 	// Read codebooks in order
580 | 	for (j = 0; j < info->cluster_count; ++j)
581 | 		info->clusters->clusters[j].qlist = read_codebook(fp, info);
582 | }
582 | 
583 | /**
584 |  * Parses one cluster's codebook from fp, in the layout produced by
585 |  * write_codebook, and returns a newly allocated quantizer list holding it.
586 |  * Column 0 is parsed with fgets; the tables of later columns with fread.
587 |  * None of the read calls is checked for failure.
588 |  *
589 |  * @param fp   Stream positioned at the first line of the codebook
590 |  * @param info Supplies the input alphabet and the number of columns
591 |  * @return The populated conditional quantizer list
592 |  */
593 | struct cond_quantizer_list_t *read_codebook(FILE *fp, struct quality_file_t *info) {
594 | 	uint32_t col, contexts;
595 | 	uint32_t ctx, j;
596 | 	uint32_t half;
597 | 	struct quantizer_t *first_lo, *first_hi, *quant;
598 | 	struct cond_quantizer_list_t *qlist;
599 | 	struct alphabet_t *outputs;
600 | 	char line[MAX_CODEBOOK_LINE_LENGTH];
601 | 	uint8_t qratio;
602 | 	struct alphabet_t *A = info->alphabet;
603 | 	uint32_t columns = info->columns;
604 | 
605 | 	// Column 0 is indexed by a one-entry context alphabet
606 | 	outputs = alloc_alphabet(1);
607 | 	qlist = alloc_conditional_quantizer_list(info->columns);
608 | 	cond_quantizer_init_column(qlist, 0, outputs);
609 | 	free_alphabet(outputs);
610 | 
611 | 	// Ratio of the column 0 pair, stored offset by 33
612 | 	fgets(line, MAX_CODEBOOK_LINE_LENGTH, fp);
613 | 	qratio = line[0] - 33;
614 | 
615 | 	// The next two lines are the low and the high table of column 0
616 | 	first_lo = alloc_quantizer(A);
617 | 	first_hi = alloc_quantizer(A);
618 | 	fgets(line, MAX_CODEBOOK_LINE_LENGTH, fp);
619 | 	COPY_Q_FROM_LINE(line, first_lo->q, j, A->size);
620 | 	fgets(line, MAX_CODEBOOK_LINE_LENGTH, fp);
621 | 	COPY_Q_FROM_LINE(line, first_hi->q, j, A->size);
622 | 
623 | 	// Derive the output alphabets; their union is the context alphabet of column 1
624 | 	find_output_alphabet(first_lo);
625 | 	find_output_alphabet(first_hi);
626 | 	outputs = alloc_alphabet(0);
627 | 	alphabet_union(first_lo->output_alphabet, first_hi->output_alphabet, outputs);
628 | 	store_cond_quantizers_indexed(first_lo, first_hi, 0.0, qlist, 0, 0);
629 | 	qlist->qratio[0][0] = qratio;
630 | 
631 | 	// Later columns: one ratio byte and one quantizer pair per context
632 | 	for (col = 1; col < columns; ++col) {
633 | 		// Register the contexts of this column, then start collecting the next union
634 | 		cond_quantizer_init_column(qlist, col, outputs);
635 | 		contexts = outputs->size;
636 | 		free_alphabet(outputs);
637 | 		outputs = alloc_alphabet(0);
638 | 
639 | 		// Ratio line: one byte per context, offset by 33
640 | 		fgets(line, MAX_CODEBOOK_LINE_LENGTH, fp);
641 | 		for (ctx = 0; ctx < contexts; ++ctx) {
642 | 			qlist->qratio[col][ctx] = line[ctx] - 33;
643 | 		}
644 | 
645 | 		// Two table lines follow: every low quantizer (half 0) on the first,
646 | 		// every high quantizer (half 1) on the second
647 | 		for (half = 0; half < 2; ++half) {
648 | 			for (ctx = 0; ctx < contexts; ++ctx) {
649 | 				quant = alloc_quantizer(A);
650 | 				// Tables are packed back to back, so read exactly one table's worth of bytes
651 | 				fread(line, A->size*sizeof(symbol_t), 1, fp);
652 | 				COPY_Q_FROM_LINE(line, quant->q, j, A->size);
653 | 
654 | 				find_output_alphabet(quant);
655 | 				qlist->q[col][2*ctx + half] = quant;
656 | 				alphabet_union(outputs, quant->output_alphabet, outputs);
657 | 			}
658 | 
659 | 			// Consume the line terminator. With a size of 2, fgets reads at most
660 | 			// one character, so only a single byte is skipped here
661 | 			(void) fgets(line, 2, fp);
662 | 		}
663 | 	}
664 | 
665 | 	// The union gathered for the last column is not needed
666 | 	free_alphabet(outputs);
667 | 
668 | 	return qlist;
669 | }
670 | 
671 | /**
672 |  * Dumps a codebook by handing every quantizer it contains to print_quantizer,
673 |  * column by column and in index order within a column.
674 |  */
675 | void print_codebook(struct cond_quantizer_list_t *q) {
676 | 	uint32_t col, idx, count;
677 | 
678 | 	for (col = 0; col < q->columns; ++col) {
679 | 		// Two quantizers are stored per symbol of the column's input alphabet
680 | 		count = 2 * q->input_alphabets[col]->size;
681 | 		for (idx = 0; idx < count; ++idx) {
682 | 			print_quantizer(q->q[col][idx]);
683 | 		}
684 | 	}
685 | }
686 | 
687 | 


--------------------------------------------------------------------------------
/COPYING:
--------------------------------------------------------------------------------
  1 |                     GNU GENERAL PUBLIC LICENSE
  2 |                        Version 3, 29 June 2007
  3 | 
  4 |  Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
  5 |  Everyone is permitted to copy and distribute verbatim copies
  6 |  of this license document, but changing it is not allowed.
  7 | 
  8 |                             Preamble
  9 | 
 10 |   The GNU General Public License is a free, copyleft license for
 11 | software and other kinds of works.
 12 | 
 13 |   The licenses for most software and other practical works are designed
 14 | to take away your freedom to share and change the works.  By contrast,
 15 | the GNU General Public License is intended to guarantee your freedom to
 16 | share and change all versions of a program--to make sure it remains free
 17 | software for all its users.  We, the Free Software Foundation, use the
 18 | GNU General Public License for most of our software; it applies also to
 19 | any other work released this way by its authors.  You can apply it to
 20 | your programs, too.
 21 | 
 22 |   When we speak of free software, we are referring to freedom, not
 23 | price.  Our General Public Licenses are designed to make sure that you
 24 | have the freedom to distribute copies of free software (and charge for
 25 | them if you wish), that you receive source code or can get it if you
 26 | want it, that you can change the software or use pieces of it in new
 27 | free programs, and that you know you can do these things.
 28 | 
 29 |   To protect your rights, we need to prevent others from denying you
 30 | these rights or asking you to surrender the rights.  Therefore, you have
 31 | certain responsibilities if you distribute copies of the software, or if
 32 | you modify it: responsibilities to respect the freedom of others.
 33 | 
 34 |   For example, if you distribute copies of such a program, whether
 35 | gratis or for a fee, you must pass on to the recipients the same
 36 | freedoms that you received.  You must make sure that they, too, receive
 37 | or can get the source code.  And you must show them these terms so they
 38 | know their rights.
 39 | 
 40 |   Developers that use the GNU GPL protect your rights with two steps:
 41 | (1) assert copyright on the software, and (2) offer you this License
 42 | giving you legal permission to copy, distribute and/or modify it.
 43 | 
 44 |   For the developers' and authors' protection, the GPL clearly explains
 45 | that there is no warranty for this free software.  For both users' and
 46 | authors' sake, the GPL requires that modified versions be marked as
 47 | changed, so that their problems will not be attributed erroneously to
 48 | authors of previous versions.
 49 | 
 50 |   Some devices are designed to deny users access to install or run
 51 | modified versions of the software inside them, although the manufacturer
 52 | can do so.  This is fundamentally incompatible with the aim of
 53 | protecting users' freedom to change the software.  The systematic
 54 | pattern of such abuse occurs in the area of products for individuals to
 55 | use, which is precisely where it is most unacceptable.  Therefore, we
 56 | have designed this version of the GPL to prohibit the practice for those
 57 | products.  If such problems arise substantially in other domains, we
 58 | stand ready to extend this provision to those domains in future versions
 59 | of the GPL, as needed to protect the freedom of users.
 60 | 
 61 |   Finally, every program is threatened constantly by software patents.
 62 | States should not allow patents to restrict development and use of
 63 | software on general-purpose computers, but in those that do, we wish to
 64 | avoid the special danger that patents applied to a free program could
 65 | make it effectively proprietary.  To prevent this, the GPL assures that
 66 | patents cannot be used to render the program non-free.
 67 | 
 68 |   The precise terms and conditions for copying, distribution and
 69 | modification follow.
 70 | 
 71 |                        TERMS AND CONDITIONS
 72 | 
 73 |   0. Definitions.
 74 | 
 75 |   "This License" refers to version 3 of the GNU General Public License.
 76 | 
 77 |   "Copyright" also means copyright-like laws that apply to other kinds of
 78 | works, such as semiconductor masks.
 79 | 
 80 |   "The Program" refers to any copyrightable work licensed under this
 81 | License.  Each licensee is addressed as "you".  "Licensees" and
 82 | "recipients" may be individuals or organizations.
 83 | 
 84 |   To "modify" a work means to copy from or adapt all or part of the work
 85 | in a fashion requiring copyright permission, other than the making of an
 86 | exact copy.  The resulting work is called a "modified version" of the
 87 | earlier work or a work "based on" the earlier work.
 88 | 
 89 |   A "covered work" means either the unmodified Program or a work based
 90 | on the Program.
 91 | 
 92 |   To "propagate" a work means to do anything with it that, without
 93 | permission, would make you directly or secondarily liable for
 94 | infringement under applicable copyright law, except executing it on a
 95 | computer or modifying a private copy.  Propagation includes copying,
 96 | distribution (with or without modification), making available to the
 97 | public, and in some countries other activities as well.
 98 | 
 99 |   To "convey" a work means any kind of propagation that enables other
100 | parties to make or receive copies.  Mere interaction with a user through
101 | a computer network, with no transfer of a copy, is not conveying.
102 | 
103 |   An interactive user interface displays "Appropriate Legal Notices"
104 | to the extent that it includes a convenient and prominently visible
105 | feature that (1) displays an appropriate copyright notice, and (2)
106 | tells the user that there is no warranty for the work (except to the
107 | extent that warranties are provided), that licensees may convey the
108 | work under this License, and how to view a copy of this License.  If
109 | the interface presents a list of user commands or options, such as a
110 | menu, a prominent item in the list meets this criterion.
111 | 
112 |   1. Source Code.
113 | 
114 |   The "source code" for a work means the preferred form of the work
115 | for making modifications to it.  "Object code" means any non-source
116 | form of a work.
117 | 
118 |   A "Standard Interface" means an interface that either is an official
119 | standard defined by a recognized standards body, or, in the case of
120 | interfaces specified for a particular programming language, one that
121 | is widely used among developers working in that language.
122 | 
123 |   The "System Libraries" of an executable work include anything, other
124 | than the work as a whole, that (a) is included in the normal form of
125 | packaging a Major Component, but which is not part of that Major
126 | Component, and (b) serves only to enable use of the work with that
127 | Major Component, or to implement a Standard Interface for which an
128 | implementation is available to the public in source code form.  A
129 | "Major Component", in this context, means a major essential component
130 | (kernel, window system, and so on) of the specific operating system
131 | (if any) on which the executable work runs, or a compiler used to
132 | produce the work, or an object code interpreter used to run it.
133 | 
134 |   The "Corresponding Source" for a work in object code form means all
135 | the source code needed to generate, install, and (for an executable
136 | work) run the object code and to modify the work, including scripts to
137 | control those activities.  However, it does not include the work's
138 | System Libraries, or general-purpose tools or generally available free
139 | programs which are used unmodified in performing those activities but
140 | which are not part of the work.  For example, Corresponding Source
141 | includes interface definition files associated with source files for
142 | the work, and the source code for shared libraries and dynamically
143 | linked subprograms that the work is specifically designed to require,
144 | such as by intimate data communication or control flow between those
145 | subprograms and other parts of the work.
146 | 
147 |   The Corresponding Source need not include anything that users
148 | can regenerate automatically from other parts of the Corresponding
149 | Source.
150 | 
151 |   The Corresponding Source for a work in source code form is that
152 | same work.
153 | 
154 |   2. Basic Permissions.
155 | 
156 |   All rights granted under this License are granted for the term of
157 | copyright on the Program, and are irrevocable provided the stated
158 | conditions are met.  This License explicitly affirms your unlimited
159 | permission to run the unmodified Program.  The output from running a
160 | covered work is covered by this License only if the output, given its
161 | content, constitutes a covered work.  This License acknowledges your
162 | rights of fair use or other equivalent, as provided by copyright law.
163 | 
164 |   You may make, run and propagate covered works that you do not
165 | convey, without conditions so long as your license otherwise remains
166 | in force.  You may convey covered works to others for the sole purpose
167 | of having them make modifications exclusively for you, or provide you
168 | with facilities for running those works, provided that you comply with
169 | the terms of this License in conveying all material for which you do
170 | not control copyright.  Those thus making or running the covered works
171 | for you must do so exclusively on your behalf, under your direction
172 | and control, on terms that prohibit them from making any copies of
173 | your copyrighted material outside their relationship with you.
174 | 
175 |   Conveying under any other circumstances is permitted solely under
176 | the conditions stated below.  Sublicensing is not allowed; section 10
177 | makes it unnecessary.
178 | 
179 |   3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180 | 
181 |   No covered work shall be deemed part of an effective technological
182 | measure under any applicable law fulfilling obligations under article
183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184 | similar laws prohibiting or restricting circumvention of such
185 | measures.
186 | 
187 |   When you convey a covered work, you waive any legal power to forbid
188 | circumvention of technological measures to the extent such circumvention
189 | is effected by exercising rights under this License with respect to
190 | the covered work, and you disclaim any intention to limit operation or
191 | modification of the work as a means of enforcing, against the work's
192 | users, your or third parties' legal rights to forbid circumvention of
193 | technological measures.
194 | 
195 |   4. Conveying Verbatim Copies.
196 | 
197 |   You may convey verbatim copies of the Program's source code as you
198 | receive it, in any medium, provided that you conspicuously and
199 | appropriately publish on each copy an appropriate copyright notice;
200 | keep intact all notices stating that this License and any
201 | non-permissive terms added in accord with section 7 apply to the code;
202 | keep intact all notices of the absence of any warranty; and give all
203 | recipients a copy of this License along with the Program.
204 | 
205 |   You may charge any price or no price for each copy that you convey,
206 | and you may offer support or warranty protection for a fee.
207 | 
208 |   5. Conveying Modified Source Versions.
209 | 
210 |   You may convey a work based on the Program, or the modifications to
211 | produce it from the Program, in the form of source code under the
212 | terms of section 4, provided that you also meet all of these conditions:
213 | 
214 |     a) The work must carry prominent notices stating that you modified
215 |     it, and giving a relevant date.
216 | 
217 |     b) The work must carry prominent notices stating that it is
218 |     released under this License and any conditions added under section
219 |     7.  This requirement modifies the requirement in section 4 to
220 |     "keep intact all notices".
221 | 
222 |     c) You must license the entire work, as a whole, under this
223 |     License to anyone who comes into possession of a copy.  This
224 |     License will therefore apply, along with any applicable section 7
225 |     additional terms, to the whole of the work, and all its parts,
226 |     regardless of how they are packaged.  This License gives no
227 |     permission to license the work in any other way, but it does not
228 |     invalidate such permission if you have separately received it.
229 | 
230 |     d) If the work has interactive user interfaces, each must display
231 |     Appropriate Legal Notices; however, if the Program has interactive
232 |     interfaces that do not display Appropriate Legal Notices, your
233 |     work need not make them do so.
234 | 
235 |   A compilation of a covered work with other separate and independent
236 | works, which are not by their nature extensions of the covered work,
237 | and which are not combined with it such as to form a larger program,
238 | in or on a volume of a storage or distribution medium, is called an
239 | "aggregate" if the compilation and its resulting copyright are not
240 | used to limit the access or legal rights of the compilation's users
241 | beyond what the individual works permit.  Inclusion of a covered work
242 | in an aggregate does not cause this License to apply to the other
243 | parts of the aggregate.
244 | 
245 |   6. Conveying Non-Source Forms.
246 | 
247 |   You may convey a covered work in object code form under the terms
248 | of sections 4 and 5, provided that you also convey the
249 | machine-readable Corresponding Source under the terms of this License,
250 | in one of these ways:
251 | 
252 |     a) Convey the object code in, or embodied in, a physical product
253 |     (including a physical distribution medium), accompanied by the
254 |     Corresponding Source fixed on a durable physical medium
255 |     customarily used for software interchange.
256 | 
257 |     b) Convey the object code in, or embodied in, a physical product
258 |     (including a physical distribution medium), accompanied by a
259 |     written offer, valid for at least three years and valid for as
260 |     long as you offer spare parts or customer support for that product
261 |     model, to give anyone who possesses the object code either (1) a
262 |     copy of the Corresponding Source for all the software in the
263 |     product that is covered by this License, on a durable physical
264 |     medium customarily used for software interchange, for a price no
265 |     more than your reasonable cost of physically performing this
266 |     conveying of source, or (2) access to copy the
267 |     Corresponding Source from a network server at no charge.
268 | 
269 |     c) Convey individual copies of the object code with a copy of the
270 |     written offer to provide the Corresponding Source.  This
271 |     alternative is allowed only occasionally and noncommercially, and
272 |     only if you received the object code with such an offer, in accord
273 |     with subsection 6b.
274 | 
275 |     d) Convey the object code by offering access from a designated
276 |     place (gratis or for a charge), and offer equivalent access to the
277 |     Corresponding Source in the same way through the same place at no
278 |     further charge.  You need not require recipients to copy the
279 |     Corresponding Source along with the object code.  If the place to
280 |     copy the object code is a network server, the Corresponding Source
281 |     may be on a different server (operated by you or a third party)
282 |     that supports equivalent copying facilities, provided you maintain
283 |     clear directions next to the object code saying where to find the
284 |     Corresponding Source.  Regardless of what server hosts the
285 |     Corresponding Source, you remain obligated to ensure that it is
286 |     available for as long as needed to satisfy these requirements.
287 | 
288 |     e) Convey the object code using peer-to-peer transmission, provided
289 |     you inform other peers where the object code and Corresponding
290 |     Source of the work are being offered to the general public at no
291 |     charge under subsection 6d.
292 | 
293 |   A separable portion of the object code, whose source code is excluded
294 | from the Corresponding Source as a System Library, need not be
295 | included in conveying the object code work.
296 | 
297 |   A "User Product" is either (1) a "consumer product", which means any
298 | tangible personal property which is normally used for personal, family,
299 | or household purposes, or (2) anything designed or sold for incorporation
300 | into a dwelling.  In determining whether a product is a consumer product,
301 | doubtful cases shall be resolved in favor of coverage.  For a particular
302 | product received by a particular user, "normally used" refers to a
303 | typical or common use of that class of product, regardless of the status
304 | of the particular user or of the way in which the particular user
305 | actually uses, or expects or is expected to use, the product.  A product
306 | is a consumer product regardless of whether the product has substantial
307 | commercial, industrial or non-consumer uses, unless such uses represent
308 | the only significant mode of use of the product.
309 | 
310 |   "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to install
312 | and execute modified versions of a covered work in that User Product from
313 | a modified version of its Corresponding Source.  The information must
314 | suffice to ensure that the continued functioning of the modified object
315 | code is in no case prevented or interfered with solely because
316 | modification has been made.
317 | 
318 |   If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information.  But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 | 
329 |   The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or updates
331 | for a work that has been modified or installed by the recipient, or for
332 | the User Product in which it has been modified or installed.  Access to a
333 | network may be denied when the modification itself materially and
334 | adversely affects the operation of the network or violates the rules and
335 | protocols for communication across the network.
336 | 
337 |   Corresponding Source conveyed, and Installation Information provided,
338 | in accord with this section must be in a format that is publicly
339 | documented (and with an implementation available to the public in
340 | source code form), and must require no special password or key for
341 | unpacking, reading or copying.
342 | 
343 |   7. Additional Terms.
344 | 
345 |   "Additional permissions" are terms that supplement the terms of this
346 | License by making exceptions from one or more of its conditions.
347 | Additional permissions that are applicable to the entire Program shall
348 | be treated as though they were included in this License, to the extent
349 | that they are valid under applicable law.  If additional permissions
350 | apply only to part of the Program, that part may be used separately
351 | under those permissions, but the entire Program remains governed by
352 | this License without regard to the additional permissions.
353 | 
354 |   When you convey a copy of a covered work, you may at your option
355 | remove any additional permissions from that copy, or from any part of
356 | it.  (Additional permissions may be written to require their own
357 | removal in certain cases when you modify the work.)  You may place
358 | additional permissions on material, added by you to a covered work,
359 | for which you have or can give appropriate copyright permission.
360 | 
361 |   Notwithstanding any other provision of this License, for material you
362 | add to a covered work, you may (if authorized by the copyright holders of
363 | that material) supplement the terms of this License with terms:
364 | 
365 |     a) Disclaiming warranty or limiting liability differently from the
366 |     terms of sections 15 and 16 of this License; or
367 | 
368 |     b) Requiring preservation of specified reasonable legal notices or
369 |     author attributions in that material or in the Appropriate Legal
370 |     Notices displayed by works containing it; or
371 | 
372 |     c) Prohibiting misrepresentation of the origin of that material, or
373 |     requiring that modified versions of such material be marked in
374 |     reasonable ways as different from the original version; or
375 | 
376 |     d) Limiting the use for publicity purposes of names of licensors or
377 |     authors of the material; or
378 | 
379 |     e) Declining to grant rights under trademark law for use of some
380 |     trade names, trademarks, or service marks; or
381 | 
382 |     f) Requiring indemnification of licensors and authors of that
383 |     material by anyone who conveys the material (or modified versions of
384 |     it) with contractual assumptions of liability to the recipient, for
385 |     any liability that these contractual assumptions directly impose on
386 |     those licensors and authors.
387 | 
388 |   All other non-permissive additional terms are considered "further
389 | restrictions" within the meaning of section 10.  If the Program as you
390 | received it, or any part of it, contains a notice stating that it is
391 | governed by this License along with a term that is a further
392 | restriction, you may remove that term.  If a license document contains
393 | a further restriction but permits relicensing or conveying under this
394 | License, you may add to a covered work material governed by the terms
395 | of that license document, provided that the further restriction does
396 | not survive such relicensing or conveying.
397 | 
398 |   If you add terms to a covered work in accord with this section, you
399 | must place, in the relevant source files, a statement of the
400 | additional terms that apply to those files, or a notice indicating
401 | where to find the applicable terms.
402 | 
403 |   Additional terms, permissive or non-permissive, may be stated in the
404 | form of a separately written license, or stated as exceptions;
405 | the above requirements apply either way.
406 | 
407 |   8. Termination.
408 | 
409 |   You may not propagate or modify a covered work except as expressly
410 | provided under this License.  Any attempt otherwise to propagate or
411 | modify it is void, and will automatically terminate your rights under
412 | this License (including any patent licenses granted under the third
413 | paragraph of section 11).
414 | 
415 |   However, if you cease all violation of this License, then your
416 | license from a particular copyright holder is reinstated (a)
417 | provisionally, unless and until the copyright holder explicitly and
418 | finally terminates your license, and (b) permanently, if the copyright
419 | holder fails to notify you of the violation by some reasonable means
420 | prior to 60 days after the cessation.
421 | 
422 |   Moreover, your license from a particular copyright holder is
423 | reinstated permanently if the copyright holder notifies you of the
424 | violation by some reasonable means, this is the first time you have
425 | received notice of violation of this License (for any work) from that
426 | copyright holder, and you cure the violation prior to 30 days after
427 | your receipt of the notice.
428 | 
429 |   Termination of your rights under this section does not terminate the
430 | licenses of parties who have received copies or rights from you under
431 | this License.  If your rights have been terminated and not permanently
432 | reinstated, you do not qualify to receive new licenses for the same
433 | material under section 10.
434 | 
435 |   9. Acceptance Not Required for Having Copies.
436 | 
437 |   You are not required to accept this License in order to receive or
438 | run a copy of the Program.  Ancillary propagation of a covered work
439 | occurring solely as a consequence of using peer-to-peer transmission
440 | to receive a copy likewise does not require acceptance.  However,
441 | nothing other than this License grants you permission to propagate or
442 | modify any covered work.  These actions infringe copyright if you do
443 | not accept this License.  Therefore, by modifying or propagating a
444 | covered work, you indicate your acceptance of this License to do so.
445 | 
446 |   10. Automatic Licensing of Downstream Recipients.
447 | 
448 |   Each time you convey a covered work, the recipient automatically
449 | receives a license from the original licensors, to run, modify and
450 | propagate that work, subject to this License.  You are not responsible
451 | for enforcing compliance by third parties with this License.
452 | 
453 |   An "entity transaction" is a transaction transferring control of an
454 | organization, or substantially all assets of one, or subdividing an
455 | organization, or merging organizations.  If propagation of a covered
456 | work results from an entity transaction, each party to that
457 | transaction who receives a copy of the work also receives whatever
458 | licenses to the work the party's predecessor in interest had or could
459 | give under the previous paragraph, plus a right to possession of the
460 | Corresponding Source of the work from the predecessor in interest, if
461 | the predecessor has it or can get it with reasonable efforts.
462 | 
463 |   You may not impose any further restrictions on the exercise of the
464 | rights granted or affirmed under this License.  For example, you may
465 | not impose a license fee, royalty, or other charge for exercise of
466 | rights granted under this License, and you may not initiate litigation
467 | (including a cross-claim or counterclaim in a lawsuit) alleging that
468 | any patent claim is infringed by making, using, selling, offering for
469 | sale, or importing the Program or any portion of it.
470 | 
471 |   11. Patents.
472 | 
473 |   A "contributor" is a copyright holder who authorizes use under this
474 | License of the Program or a work on which the Program is based.  The
475 | work thus licensed is called the contributor's "contributor version".
476 | 
477 |   A contributor's "essential patent claims" are all patent claims
478 | owned or controlled by the contributor, whether already acquired or
479 | hereafter acquired, that would be infringed by some manner, permitted
480 | by this License, of making, using, or selling its contributor version,
481 | but do not include claims that would be infringed only as a
482 | consequence of further modification of the contributor version.  For
483 | purposes of this definition, "control" includes the right to grant
484 | patent sublicenses in a manner consistent with the requirements of
485 | this License.
486 | 
487 |   Each contributor grants you a non-exclusive, worldwide, royalty-free
488 | patent license under the contributor's essential patent claims, to
489 | make, use, sell, offer for sale, import and otherwise run, modify and
490 | propagate the contents of its contributor version.
491 | 
492 |   In the following three paragraphs, a "patent license" is any express
493 | agreement or commitment, however denominated, not to enforce a patent
494 | (such as an express permission to practice a patent or covenant not to
495 | sue for patent infringement).  To "grant" such a patent license to a
496 | party means to make such an agreement or commitment not to enforce a
497 | patent against the party.
498 | 
499 |   If you convey a covered work, knowingly relying on a patent license,
500 | and the Corresponding Source of the work is not available for anyone
501 | to copy, free of charge and under the terms of this License, through a
502 | publicly available network server or other readily accessible means,
503 | then you must either (1) cause the Corresponding Source to be so
504 | available, or (2) arrange to deprive yourself of the benefit of the
505 | patent license for this particular work, or (3) arrange, in a manner
506 | consistent with the requirements of this License, to extend the patent
507 | license to downstream recipients.  "Knowingly relying" means you have
508 | actual knowledge that, but for the patent license, your conveying the
509 | covered work in a country, or your recipient's use of the covered work
510 | in a country, would infringe one or more identifiable patents in that
511 | country that you have reason to believe are valid.
512 | 
513 |   If, pursuant to or in connection with a single transaction or
514 | arrangement, you convey, or propagate by procuring conveyance of, a
515 | covered work, and grant a patent license to some of the parties
516 | receiving the covered work authorizing them to use, propagate, modify
517 | or convey a specific copy of the covered work, then the patent license
518 | you grant is automatically extended to all recipients of the covered
519 | work and works based on it.
520 | 
521 |   A patent license is "discriminatory" if it does not include within
522 | the scope of its coverage, prohibits the exercise of, or is
523 | conditioned on the non-exercise of one or more of the rights that are
524 | specifically granted under this License.  You may not convey a covered
525 | work if you are a party to an arrangement with a third party that is
526 | in the business of distributing software, under which you make payment
527 | to the third party based on the extent of your activity of conveying
528 | the work, and under which the third party grants, to any of the
529 | parties who would receive the covered work from you, a discriminatory
530 | patent license (a) in connection with copies of the covered work
531 | conveyed by you (or copies made from those copies), or (b) primarily
532 | for and in connection with specific products or compilations that
533 | contain the covered work, unless you entered into that arrangement,
534 | or that patent license was granted, prior to 28 March 2007.
535 | 
536 |   Nothing in this License shall be construed as excluding or limiting
537 | any implied license or other defenses to infringement that may
538 | otherwise be available to you under applicable patent law.
539 | 
540 |   12. No Surrender of Others' Freedom.
541 | 
542 |   If conditions are imposed on you (whether by court order, agreement or
543 | otherwise) that contradict the conditions of this License, they do not
544 | excuse you from the conditions of this License.  If you cannot convey a
545 | covered work so as to satisfy simultaneously your obligations under this
546 | License and any other pertinent obligations, then as a consequence you may
547 | not convey it at all.  For example, if you agree to terms that obligate you
548 | to collect a royalty for further conveying from those to whom you convey
549 | the Program, the only way you could satisfy both those terms and this
550 | License would be to refrain entirely from conveying the Program.
551 | 
552 |   13. Use with the GNU Affero General Public License.
553 | 
554 |   Notwithstanding any other provision of this License, you have
555 | permission to link or combine any covered work with a work licensed
556 | under version 3 of the GNU Affero General Public License into a single
557 | combined work, and to convey the resulting work.  The terms of this
558 | License will continue to apply to the part which is the covered work,
559 | but the special requirements of the GNU Affero General Public License,
560 | section 13, concerning interaction through a network will apply to the
561 | combination as such.
562 | 
563 |   14. Revised Versions of this License.
564 | 
565 |   The Free Software Foundation may publish revised and/or new versions of
566 | the GNU General Public License from time to time.  Such new versions will
567 | be similar in spirit to the present version, but may differ in detail to
568 | address new problems or concerns.
569 | 
570 |   Each version is given a distinguishing version number.  If the
571 | Program specifies that a certain numbered version of the GNU General
572 | Public License "or any later version" applies to it, you have the
573 | option of following the terms and conditions either of that numbered
574 | version or of any later version published by the Free Software
575 | Foundation.  If the Program does not specify a version number of the
576 | GNU General Public License, you may choose any version ever published
577 | by the Free Software Foundation.
578 | 
579 |   If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 | 
584 |   Later license versions may give you additional or different
585 | permissions.  However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 | 
589 |   15. Disclaimer of Warranty.
590 | 
591 |   THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 | 
600 |   16. Limitation of Liability.
601 | 
602 |   IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 | 
612 |   17. Interpretation of Sections 15 and 16.
613 | 
614 |   If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 | 
621 |                      END OF TERMS AND CONDITIONS
622 | 
623 |             How to Apply These Terms to Your New Programs
624 | 
625 |   If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 | 
629 |   To do so, attach the following notices to the program.  It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 | 
634 |     <one line to give the program's name and a brief idea of what it does.>
635 |     Copyright (C) <year>  <name of author>
636 | 
637 |     This program is free software: you can redistribute it and/or modify
638 |     it under the terms of the GNU General Public License as published by
639 |     the Free Software Foundation, either version 3 of the License, or
640 |     (at your option) any later version.
641 | 
642 |     This program is distributed in the hope that it will be useful,
643 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
644 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
645 |     GNU General Public License for more details.
646 | 
647 |     You should have received a copy of the GNU General Public License
648 |     along with this program.  If not, see <http://www.gnu.org/licenses/>.
649 | 
650 | Also add information on how to contact you by electronic and paper mail.
651 | 
652 |   If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 | 
655 |     <program>  Copyright (C) <year>  <name of author>
656 |     This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 |     This is free software, and you are welcome to redistribute it
658 |     under certain conditions; type `show c' for details.
659 | 
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License.  Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 | 
664 |   You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <http://www.gnu.org/licenses/>.
668 | 
669 |   The GNU General Public License does not permit incorporating your program
670 | into proprietary programs.  If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library.  If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License.  But first, please read
674 | <http://www.gnu.org/philosophy/why-not-lgpl.html>.
675 | 


--------------------------------------------------------------------------------