├── .gitignore ├── Makefile ├── README.md ├── main.c ├── sample.c └── sample.h /.gitignore: -------------------------------------------------------------------------------- 1 | sample 2 | sample.core 3 | -------------------------------------------------------------------------------- /Makefile: --------------------------------------------------------------------------------
PROJECT = sample
OPTIMIZE = -O3
WARN = -Wall -pedantic
#CDEFS +=
#CINCS +=

# -std=c99 hides the BSD/POSIX extensions this program uses (random,
# strsep, getopt, gettimeofday, err).  _BSD_SOURCE re-enables them on
# older C libraries; newer glibc deprecates it in favour of
# _DEFAULT_SOURCE, so both are defined to build warning-free everywhere.
CSTD += -std=c99 -D_BSD_SOURCE -D_DEFAULT_SOURCE #-D_POSIX_C_SOURCE=200112L

CFLAGS += ${CSTD} -g ${WARN} ${CDEFS} ${CINCS} ${OPTIMIZE}

.PHONY: all clean install uninstall

all: ${PROJECT}

OBJS= ${PROJECT}.o

# Basic targets

${PROJECT}: main.o ${OBJS}
	${CC} -o $@ main.o ${OBJS} ${LDFLAGS}

# Every object includes the shared header; rebuild when it changes.
main.o ${OBJS}: ${PROJECT}.h

clean:
	rm -f ${PROJECT} test_${PROJECT} *.o *.a *.core

# Installation
PREFIX ?= /usr/local
INSTALL ?= install
RM ?= rm

install: ${PROJECT}
	${INSTALL} -c ${PROJECT} ${PREFIX}/bin

uninstall:
	${RM} -f ${PREFIX}/bin/${PROJECT}
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | sample - filter for random sampling of input. 2 | 3 | Basic usage: 4 | 5 | sample [-h] [-d files] [-n count] [-p percent] [-s seed] [FILE ...] 
6 | 7 | Examples: 8 | 9 | sample FILE # randomly choose & print 4 lines from file, in order 10 | input | sample # same, from stream (file defaults to stdin) 11 | sample FILE FILE2 FILE3 # randomly choose 4 lines between multiple input files 12 | sample -n 10 FILE # choose 10 lines 13 | sample -p 10 FILE # 10% chance of choosing each line 14 | input | sample -p 5 # randomly print 5% of input lines 15 | input | sample -d a,b,c # append input to files a, b, and c, even odds 16 | input | sample -d a,b,c, # append input to files a, b, c, or /dev/null 17 | 18 | Options: 19 | 20 | -h - Print help 21 | -d FILES - randomly deal lines to multiple files (',' separated) 22 | -n COUNT - Set sample count (default: -n 4) 23 | -p PERC - Sample PERC percent for input(s) (',' separated) 24 | -s SEED - Set a specific random seed (default: seed based on time) 25 | 26 | 27 | Sampling by count (-n) works in O(n) space (but n=number of samples, not 28 | input size) and outputs samples when end of input is reached. The input order 29 | of the samples is preserved. 30 | 31 | Sampling by percentage (-p) works in constant space (no data is accumulated) 32 | and outputs as each group of lines is read. 
33 | 34 | If multiple files are used, they can have custom probability percentages: 35 | 36 | input | sample -d a,b,c,d -p 0.5,0.25,0.125,0.125 37 | 38 | If the last probability is omitted, it will take all remaining: 39 | 40 | input | sample -d a,b,c,d -p 0.5,0.25,0.125, 41 | 42 | If a file is omitted, it will behave like /dev/null: 43 | 44 | input | sample -d a,b,c, -p 0.25,0.25,0.25,0.25 45 | 46 | If probabilities are between 1 and 100, they will be treated as percentages: 47 | 48 | input | sample -d a,b,c,d -p 25,25,25,25 49 | 50 | 51 | -------------------------------------------------------------------------------- /main.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2011-14 Scott Vokes 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted, provided that the above 6 | * copyright notice and this permission notice appear in all copies. 7 | * 8 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 9 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 10 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 11 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 12 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 13 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 14 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 15 | */ 16 | 17 | #include 18 | 19 | #include "sample.h" 20 | 21 | /* Version 0.1.0. 
*/ 22 | #define SAMPLE_VERSION_MAJOR 0 23 | #define SAMPLE_VERSION_MINOR 1 24 | #define SAMPLE_VERSION_PATCH 0 25 | #define SAMPLE_AUTHOR "Scott Vokes " 26 | 27 | static void cleanup(config *cfg); 28 | static void parse_percent_settings(config *cfg, 29 | char *deal_arg, char *perc_arg); 30 | 31 | static void usage(const char *msg) { 32 | if (msg) { fprintf(stderr, "%s\n\n", msg); } 33 | fprintf(stderr, "sample v. %d.%d.%d by %s\n", 34 | SAMPLE_VERSION_MAJOR, SAMPLE_VERSION_MINOR, 35 | SAMPLE_VERSION_PATCH, SAMPLE_AUTHOR); 36 | fprintf(stderr, 37 | "Usage: sample [-h] [-d files] [-n count] [-p percent] [-s seed] [FILE ...]\n" 38 | " -h - Print help\n" 39 | " -d FILES - randomly deal lines to multiple files (',' separated)\n" 40 | " -n COUNT - Set sample count (default: -n 4)\n" 41 | " -p PERC - Sample PERC percent for input(s) (',' separated)\n" 42 | " If >= 1, argument is used as -p 5 => 0.05 (5%%).\n" 43 | " -s SEED - Set a specific random seed (default: seed based on time)\n" 44 | ); 45 | exit(1); 46 | } 47 | 48 | static void set_mode(config *cfg, filter_mode m) { 49 | if (cfg->mode == m) { return; } 50 | if (cfg->mode != M_UNDEF) { usage("Error: Mix of percent and count modes."); } 51 | cfg->mode = m; 52 | } 53 | 54 | static void handle_args(config *cfg, int argc, char **argv) { 55 | char *deal_arg = NULL; 56 | char *perc_arg = NULL; 57 | 58 | int fl, i; 59 | while ((fl = getopt(argc, argv, "hd:n:p:s:")) != -1) { 60 | switch (fl) { 61 | case 'h': /* help */ 62 | usage(NULL); 63 | break; /* NOTREACHED */ 64 | case 'd': /* deal out to N files */ 65 | set_mode(cfg, M_PERC); 66 | if (deal_arg) { usage("Error: Multiple -d arguments."); } 67 | deal_arg = optarg; 68 | break; 69 | case 'n': /* number of samples */ 70 | set_mode(cfg, M_COUNT); 71 | cfg->u.count.samples = atol(optarg); 72 | if (cfg->u.count.samples < 1) { 73 | fprintf(stderr, "Bad sample count.\n"); 74 | exit(1); 75 | } 76 | break; 77 | case 'p': /* percent(s) */ 78 | set_mode(cfg, M_PERC); 79 | perc_arg 
= optarg; 80 | break; 81 | case 's': /* seed */ 82 | cfg->seed = atol(optarg); 83 | break; 84 | case '?': 85 | default: 86 | usage(NULL); 87 | /* NOTREACHED */ 88 | } 89 | } 90 | 91 | if (cfg->mode == M_PERC) { 92 | parse_percent_settings(cfg, deal_arg, perc_arg); 93 | } else { /* default to sample count */ 94 | cfg->mode = M_COUNT; 95 | } 96 | 97 | argc -= (optind-1); 98 | argv += (optind-1); 99 | 100 | cfg->fn_count = argc - 1; 101 | if (cfg->fn_count == 0) { 102 | cfg->fn_count = 1; 103 | } 104 | 105 | cfg->fnames = calloc(cfg->fn_count, sizeof(cfg->fnames[0])); 106 | if (cfg->fnames == NULL) { err(1, "calloc"); } 107 | 108 | cfg->fnames[0] = "-"; /* default to stdin */ 109 | for (i = 1; i < argc; i++) { 110 | cfg->fnames[i - 1] = argv[i]; 111 | } 112 | } 113 | 114 | static void parse_percent_settings(config *cfg, 115 | char *deal_arg, char *perc_arg) { 116 | const int def_pairs = 1; 117 | struct out_pair *pairs = calloc(def_pairs, sizeof(*pairs)); 118 | if (pairs == NULL) { err(1, "calloc"); } 119 | 120 | assert(deal_arg != NULL || perc_arg != NULL); 121 | 122 | if (deal_arg) { 123 | /* Multiple output files: */ 124 | int pair_count = 0; 125 | int pair_max = def_pairs; 126 | 127 | /* Open files by name, split by ',' */ 128 | for (char *fn = strsep(&deal_arg, ","); 129 | fn; fn = strsep(&deal_arg, ",")) { 130 | FILE *f = NULL; 131 | if (fn[0] == '\0') { /* /dev/null */ 132 | } else if (0 == strcmp("-", fn)) { /* stdout */ 133 | f = stdout; 134 | } else { /* file */ 135 | f = fopen(fn, "ab"); 136 | if (f == NULL) { err(1, "fopen"); } 137 | } 138 | pairs[pair_count].out = f; 139 | pair_count++; 140 | if (pair_count == pair_max) { /* grow array */ 141 | int nmax = 2 * pair_max; 142 | struct out_pair *npairs = realloc(pairs, nmax * sizeof(*npairs)); 143 | if (npairs == NULL) { err(1, "realloc"); } 144 | pair_max = nmax; 145 | pairs = npairs; 146 | } 147 | } 148 | 149 | /* Deal uneven percentages to the files */ 150 | if (perc_arg) { 151 | double total = 0; 152 | int 
perc_count = 0; 153 | for (char *perc = strsep(&perc_arg, ","); 154 | perc; perc = strsep(&perc_arg, ",")) { 155 | double v = 0; 156 | if (perc[0] == '\0') { /* trailing "," -> remaining */ 157 | v = 1.0 - total; 158 | } else { 159 | v = strtod(perc, NULL); 160 | } 161 | if (v < 0 || v > 100) { usage("Error: Bad percentage."); } 162 | if (v >= 1.0 && v <= 100.0) { v /= 100.0; } 163 | total += v; 164 | pairs[perc_count].perc = total; 165 | perc_count++; 166 | } 167 | 168 | if (perc_count == 1) { 169 | for (int i = 1; i < pair_count; i++) { 170 | total += pairs[0].perc; 171 | pairs[i].perc = total; /* project */ 172 | } 173 | } else if (perc_count < pair_count) { 174 | usage("Error: Percent count does not match output count"); 175 | } 176 | if (total > 1.0) { usage("Error: Total is over 100%"); } 177 | } else { /* divide evenly */ 178 | double total = 0; 179 | for (int i = 0; i < pair_count; i++) { 180 | total += 1.0 / pair_count; 181 | pairs[i].perc = total; 182 | } 183 | } 184 | 185 | cfg->u.percent.pair_count = pair_count; 186 | cfg->u.percent.pairs = pairs; 187 | } else if (perc_arg) { 188 | /* -p % to stdout */ 189 | double v = strtod(perc_arg, NULL); 190 | if (v < 0 || v > 100.0) { usage("Error: Bad percentage."); } 191 | if (v >= 1.0 && v <= 100.0) { v /= 100.0; } 192 | pairs[0].perc = v; 193 | pairs[0].out = stdout; 194 | cfg->u.percent.pair_count = 1; 195 | cfg->u.percent.pairs = pairs; 196 | } 197 | } 198 | 199 | static void set_defaults(config *cfg) { 200 | cfg->mode = M_UNDEF; 201 | cfg->u.count.samples = 4; 202 | 203 | struct timeval tv; 204 | if (gettimeofday(&tv, NULL) != 0) { err(1, "gettimeofday"); } 205 | cfg->seed = (unsigned int)tv.tv_usec; 206 | } 207 | 208 | static bool open_next_file(config *cfg) { 209 | while (cfg->fn_index < cfg->fn_count) { 210 | const char *fn = cfg->fnames[cfg->fn_index]; 211 | cfg->fn_index++; 212 | if (0 == strcmp("-", fn)) { 213 | cfg->cur_file = stdin; 214 | return true; 215 | } else { 216 | FILE *f = fopen(fn, "r"); 
217 | if (f) { 218 | cfg->cur_file = f; 219 | return true; 220 | } else { 221 | warn("%s", fn); 222 | errno = 0; 223 | } 224 | } 225 | } 226 | return false; 227 | } 228 | 229 | static const char *line_iter(config *cfg, size_t *length) { 230 | size_t buf_used = 0; 231 | char *buf = cfg->buf; 232 | buf[cfg->buf_size - 1] = '\0'; 233 | for (;;) { 234 | if (cfg->cur_file == NULL) { 235 | if (!open_next_file(cfg)) { return NULL; } 236 | } 237 | 238 | char *line = fgets(&buf[buf_used], cfg->buf_size - buf_used - 1, cfg->cur_file); 239 | if (line) { 240 | size_t len = strcspn(line, "\n"); 241 | size_t nlen = buf_used + len; 242 | if (buf[nlen] == '\n') { 243 | buf[nlen] = '\0'; 244 | if (length) { *length = len; } 245 | return buf; 246 | } else { /* grow buf and read more */ 247 | buf_used += len; 248 | size_t nsize = 2 * cfg->buf_size; 249 | char *nbuf = realloc(buf, nsize); 250 | if (nbuf) { 251 | cfg->buf_size = nsize; 252 | cfg->buf = nbuf; 253 | buf = cfg->buf; 254 | } else { 255 | return NULL; 256 | } 257 | } 258 | } else { 259 | fclose(cfg->cur_file); 260 | cfg->cur_file = NULL; 261 | } 262 | } 263 | } 264 | 265 | static void cleanup(config *cfg) { 266 | free(cfg->fnames); 267 | free(cfg->buf); 268 | if (cfg->mode == M_PERC) { 269 | struct out_pair *pairs = cfg->u.percent.pairs; 270 | for (int i = 0; i < cfg->u.percent.pair_count; i++) { 271 | if (pairs[i].out) { fclose(pairs[i].out); } 272 | } 273 | free(pairs); 274 | } 275 | } 276 | 277 | int main(int argc, char **argv) { 278 | config cfg; 279 | memset(&cfg, 0, sizeof(cfg)); 280 | set_defaults(&cfg); 281 | handle_args(&cfg, argc, argv); 282 | 283 | cfg.buf = malloc(SAMPLE_DEF_BUF_SIZE); 284 | if (cfg.buf == 0) { err(1, "malloc"); } 285 | cfg.buf_size = SAMPLE_DEF_BUF_SIZE; 286 | 287 | srandom(cfg.seed); 288 | 289 | int res = 0; 290 | 291 | switch (cfg.mode) { 292 | case M_COUNT: 293 | res = sample_count(&cfg, line_iter); 294 | break; 295 | case M_PERC: 296 | res = sample_percent(&cfg, line_iter); 297 | break; 298 
| default: 299 | assert(0); 300 | } 301 | 302 | cleanup(&cfg); 303 | return res; 304 | } 305 | -------------------------------------------------------------------------------- /sample.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2011-14 Scott Vokes 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted, provided that the above 6 | * copyright notice and this permission notice appear in all copies. 7 | * 8 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 9 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 10 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 11 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 12 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 13 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 14 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 15 | */ 16 | 17 | #include "sample.h" 18 | 19 | typedef struct { 20 | char *buf; 21 | size_t buf_sz; 22 | size_t length; 23 | size_t line_number; 24 | } sample_info; 25 | 26 | static int si_cmp(const void *a, const void *b); 27 | static bool keep(sample_info *kept, const char *line, size_t len, size_t id); 28 | 29 | #define DEF_BUF_SIZE 8 /* Start small to exercise buffer growing */ 30 | 31 | /* Randomly sample N values from a stream of unknown length, with 32 | * unknown probability. Knuth 3.4.2, algorithm R (Reservoir sampling). 
*/ 33 | int sample_count(config *cfg, line_iter_cb *line_iter) { 34 | size_t samples = cfg->u.count.samples; 35 | sample_info *kept = calloc(samples, sizeof(*kept)); 36 | if (kept == NULL) { err(1, "calloc"); } 37 | 38 | size_t i = 0; 39 | for (i = 0; i < samples; i++) { 40 | sample_info *si = &kept[i]; 41 | si->buf = malloc(DEF_BUF_SIZE); 42 | if (si->buf == NULL) { err(1, "malloc"); } 43 | si->buf_sz = DEF_BUF_SIZE; 44 | } 45 | 46 | const char *line = NULL; 47 | size_t len = 0; 48 | 49 | /* Fill buffer with SAMPLES, then randomly replace them 50 | * with others with a samples/i chance. */ 51 | 52 | for (i = 0; i < samples; i++) { 53 | line = line_iter(cfg, &len); 54 | if (line == NULL) { break; } 55 | keep(&kept[i], line, len, i); 56 | } 57 | 58 | line = line_iter(cfg, &len); 59 | 60 | while (line) { 61 | long rv = (random() % i); 62 | if (rv < samples) { 63 | keep(&kept[rv], line, len, i); 64 | } 65 | i++; 66 | line = line_iter(cfg, &len); 67 | } 68 | 69 | /* Output samples, in input order. */ 70 | qsort(kept, samples, sizeof(sample_info), si_cmp); 71 | for (i = 0; i < samples; i++) { 72 | if (kept[i].length > 0) { printf("%s\n", kept[i].buf); } 73 | } 74 | 75 | /* free kept and buffers */ 76 | for (i = 0; i < samples; i++) { 77 | free(kept[i].buf); 78 | } 79 | free(kept); 80 | 81 | return 0; 82 | } 83 | 84 | static int si_cmp(const void *a, const void *b) { 85 | size_t ia = ((sample_info *)a)->line_number; 86 | size_t ib = ((sample_info *)b)->line_number; 87 | return ia < ib ? -1 : ia > ib ? 
1 : 0; 88 | } 89 | 90 | static bool keep(sample_info *si, const char *line, size_t len, size_t id) { 91 | if (si->buf_sz < len + 1) { 92 | size_t nsize = 2 * si->buf_sz; 93 | while (nsize < len + 1) { nsize *= 2; } 94 | char *nbuf = realloc(si->buf, nsize); 95 | if (nbuf) { 96 | si->buf_sz = nsize; 97 | si->buf = nbuf; 98 | } else { 99 | return false; 100 | } 101 | } 102 | 103 | memcpy(si->buf, line, len); 104 | si->buf[len] = '\0'; 105 | si->length = len; 106 | si->line_number = id; 107 | 108 | return true; 109 | } 110 | 111 | /* Just print each line, with (percent)% probability. */ 112 | int sample_percent(config *cfg, line_iter_cb *line_iter) { 113 | struct out_pair *pairs = cfg->u.percent.pairs; 114 | size_t pair_count = cfg->u.percent.pair_count; 115 | 116 | const char *line = line_iter(cfg, NULL); 117 | while (line) { 118 | double rv = (random() % RAND_MAX)/(double)RAND_MAX; 119 | for (int i = 0; i < pair_count; i++) { 120 | if (rv < pairs[i].perc) { 121 | if (pairs[i].out) { 122 | fprintf(pairs[i].out, "%s\n", line); 123 | } 124 | break; 125 | } 126 | } 127 | 128 | line = line_iter(cfg, NULL); 129 | } 130 | return 0; 131 | } 132 | -------------------------------------------------------------------------------- /sample.h: -------------------------------------------------------------------------------- 1 | #ifndef SAMPLE_H 2 | #define SAMPLE_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | 15 | typedef enum { 16 | M_UNDEF, /* undefined */ 17 | M_COUNT, /* get N samples */ 18 | M_PERC, /* get N% of input */ 19 | } filter_mode; 20 | 21 | enum { 22 | SAMPLE_DEF_BUF_SIZE = 2, 23 | }; 24 | 25 | typedef struct { 26 | unsigned int seed; 27 | 28 | /* Input stream(s) */ 29 | FILE *cur_file; 30 | const char **fnames; 31 | size_t fn_count; 32 | size_t fn_index; 33 | 34 | filter_mode mode; 35 | union { 36 | struct { 37 | size_t samples; 38 | } count; 39 | struct { 40 | size_t 
pair_count; 41 | struct out_pair *pairs; 42 | } percent; 43 | } u; 44 | 45 | size_t buf_size; 46 | char *buf; 47 | } config; 48 | 49 | struct out_pair { 50 | double perc; /* cumulative probability, <= 1.0 */ 51 | FILE *out; 52 | }; 53 | 54 | /* Get the next line from the input source(s), or NULL on EOS. */ 55 | typedef const char *(line_iter_cb)(config *cfg, size_t *length); 56 | 57 | int sample_count(config *cfg, line_iter_cb *line_iter); 58 | int sample_percent(config *cfg, line_iter_cb *line_iter); 59 | 60 | #endif 61 | --------------------------------------------------------------------------------