├── .gitignore
├── Makefile
├── README.md
├── main.c
├── sample.c
└── sample.h


/.gitignore:
--------------------------------------------------------------------------------
1 | sample
2 | sample.core
3 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | PROJECT =	sample
 2 | OPTIMIZE =	-O3
 3 | WARN =		-Wall -pedantic
 4 | #CDEFS +=
 5 | #CINCS +=
 6 | # _DEFAULT_SOURCE is the glibc >= 2.20 spelling of the deprecated _BSD_SOURCE;
 7 | # defining both keeps older and newer C libraries quiet.
 8 | CSTD +=		-std=c99 -D_BSD_SOURCE -D_DEFAULT_SOURCE #-D_POSIX_C_SOURCE=200112L
 9 | 
10 | CFLAGS +=	${CSTD} -g ${WARN} ${CDEFS} ${CINCS} ${OPTIMIZE}
11 | 
12 | # These targets name actions, not files.
13 | .PHONY: all clean install uninstall
14 | 
15 | all: ${PROJECT}
16 | 
17 | OBJS=	${PROJECT}.o
18 | 
19 | # Basic targets
20 | 
21 | # Every object includes sample.h, so rebuild them when it changes.
22 | main.o ${OBJS}: sample.h
23 | 
24 | ${PROJECT}: main.o ${OBJS}
25 | 	${CC} -o $@ main.o ${OBJS} ${LDFLAGS}
26 | 
27 | clean:
28 | 	rm -f ${PROJECT} test_${PROJECT} *.o *.a *.core
29 | 
30 | # Installation
31 | PREFIX ?=	/usr/local
32 | INSTALL ?=	install
33 | RM ?=		rm
34 | 
35 | install: ${PROJECT}
36 | 	${INSTALL} -c ${PROJECT} ${PREFIX}/bin
37 | 
38 | uninstall:
39 | 	${RM} -f ${PREFIX}/bin/${PROJECT}
40 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | sample - filter for random sampling of input.
 2 | 
 3 | Basic usage:
 4 | 
 5 |     sample [-h] [-d files] [-n count] [-p percent] [-s seed] [FILE ...]
 6 | 
 7 | Examples:
 8 | 
 9 |     sample FILE                 # randomly choose & print 4 lines from file, in order
10 |     input | sample              # same, from stream (file defaults to stdin)
11 |     sample FILE FILE2 FILE3     # randomly choose 4 lines between multiple input files
12 |     sample -n 10 FILE           # choose 10 lines
13 |     sample -p 10 FILE           # 10% chance of choosing each line
14 |     input | sample -p 5         # randomly print 5% of input lines
15 |     input | sample -d a,b,c     # append input to files a, b, and c, even odds
16 |     input | sample -d a,b,c,    # append input to files a, b, c, or /dev/null
17 | 
18 | Options:
19 | 
20 |     -h       - Print help
21 |     -d FILES - randomly deal lines to multiple files (',' separated)
22 |     -n COUNT - Set sample count (default: -n 4)
23 |     -p PERC  - Sample PERC percent for input(s) (',' separated)
24 |     -s SEED  - Set a specific random seed (default: seed based on time)
25 | 
26 | 
27 | Sampling by count (-n) uses O(n) space, where n is the number of samples
28 | (not the size of the input), and outputs the samples once the end of input
29 | is reached. The input order of the samples is preserved.
30 | 
31 | Sampling by percentage (-p) works in constant space (no data is accumulated)
32 | and writes each selected line as it is read.
33 | 
34 | If multiple files are used, they can have custom probability percentages:
35 | 
36 |     input | sample -d a,b,c,d -p 0.5,0.25,0.125,0.125
37 | 
38 | If the last probability is omitted, it will take all remaining:
39 | 
40 |     input | sample -d a,b,c,d -p 0.5,0.25,0.125,
41 | 
42 | If a file is omitted, it will behave like /dev/null:
43 | 
44 |     input | sample -d a,b,c, -p 0.25,0.25,0.25,0.25
45 | 
46 | If probabilities are between 1 and 100, they will be treated as percentages:
47 | 
48 |     input | sample -d a,b,c,d -p 25,25,25,25
49 | 
50 | 
51 | 


--------------------------------------------------------------------------------
/main.c:
--------------------------------------------------------------------------------
  1 | /* 
  2 |  * Copyright (c) 2011-14 Scott Vokes <vokes.s@gmail.com>
  3 |  *  
  4 |  * Permission to use, copy, modify, and/or distribute this software for any
  5 |  * purpose with or without fee is hereby granted, provided that the above
  6 |  * copyright notice and this permission notice appear in all copies.
  7 |  *  
  8 |  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  9 |  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 10 |  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 11 |  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 12 |  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 13 |  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 14 |  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 15 |  */
 16 | 
 17 | #include <getopt.h>
 18 | 
 19 | #include "sample.h"
 20 | 
 21 | /* Version 0.1.0. */
 22 | #define SAMPLE_VERSION_MAJOR 0
 23 | #define SAMPLE_VERSION_MINOR 1
 24 | #define SAMPLE_VERSION_PATCH 0
 25 | #define SAMPLE_AUTHOR "Scott Vokes <vokes.s@gmail.com>"
 26 | 
 27 | static void cleanup(config *cfg);
 28 | static void parse_percent_settings(config *cfg,
 29 |     char *deal_arg, char *perc_arg);
 30 | 
 31 | /* Print MSG (if any), the version banner and the option summary; exit 1. */
 32 | static void usage(const char *msg) {
 33 |     FILE *out = stderr;
 34 |     if (msg != NULL) { fprintf(out, "%s\n\n", msg); }
 35 |     fprintf(out, "sample v. %d.%d.%d by %s\n", SAMPLE_VERSION_MAJOR,
 36 |         SAMPLE_VERSION_MINOR, SAMPLE_VERSION_PATCH, SAMPLE_AUTHOR);
 37 |     fprintf(out,
 38 |         "Usage: sample [-h] [-d files] [-n count] [-p percent] [-s seed] [FILE ...]\n"
 39 |         "    -h       - Print help\n"
 40 |         "    -d FILES - randomly deal lines to multiple files (',' separated)\n"
 41 |         "    -n COUNT - Set sample count (default: -n 4)\n"
 42 |         "    -p PERC  - Sample PERC percent for input(s) (',' separated)\n"
 43 |         "               If >= 1, argument is used as -p 5 => 0.05 (5%%).\n"
 44 |         "    -s SEED  - Set a specific random seed (default: seed based on time)\n");
 45 |     exit(1);
 46 | }
 47 | 
 48 | static void set_mode(config *cfg, filter_mode m) {
 49 |     if (cfg->mode == m) { return; }
 50 |     if (cfg->mode != M_UNDEF) { usage("Error: Mix of percent and count modes."); }
 51 |     cfg->mode = m;
 52 | }
 53 | 
 54 | /* Parse the command line into CFG: options first, then the FILE operands,
 55 |  * which default to "-" (stdin) when none are given. -d/-p select percent
 56 |  * mode and -n count mode; bad input exits through usage() or err(). */
 57 | static void handle_args(config *cfg, int argc, char **argv) {
 58 |     char *deal_arg = NULL;
 59 |     char *perc_arg = NULL;
 60 |     char *end = NULL;
 61 |     long count = 0;
 62 |     int fl;
 63 | 
 64 |     while ((fl = getopt(argc, argv, "hd:n:p:s:")) != -1) {
 65 |         switch (fl) {
 66 |         case 'h':               /* help */
 67 |             usage(NULL);
 68 |             break;              /* NOTREACHED */
 69 |         case 'd':               /* deal out to N files */
 70 |             set_mode(cfg, M_PERC);
 71 |             if (deal_arg) { usage("Error: Multiple -d arguments."); }
 72 |             deal_arg = optarg;
 73 |             break;
 74 |         case 'n':               /* number of samples */
 75 |             set_mode(cfg, M_COUNT);
 76 |             /* samples is a size_t, so a negative count would wrap to a huge
 77 |              * value if stored unchecked; validate the whole argument first. */
 78 |             errno = 0;
 79 |             count = strtol(optarg, &end, 10);
 80 |             if (end == optarg || *end != '\0' || errno != 0 || count < 1) {
 81 |                 fprintf(stderr, "Bad sample count.\n");
 82 |                 exit(1);
 83 |             }
 84 |             cfg->u.count.samples = (size_t)count;
 85 |             break;
 86 |         case 'p':               /* percent(s) */
 87 |             set_mode(cfg, M_PERC);
 88 |             perc_arg = optarg;
 89 |             break;
 90 |         case 's':               /* seed */
 91 |             cfg->seed = atol(optarg);
 92 |             break;
 93 |         case '?':
 94 |         default:
 95 |             usage(NULL);
 96 |             /* NOTREACHED */
 97 |         }
 98 |     }
 99 | 
100 |     if (cfg->mode == M_PERC) {
101 |         parse_percent_settings(cfg, deal_arg, perc_arg);
102 |     } else {                    /* default to sample count */
103 |         cfg->mode = M_COUNT;
104 |     }
105 | 
106 |     int files = argc - optind;  /* FILE operands left after the options */
107 |     cfg->fn_count = (files > 0) ? (size_t)files : 1;
108 |     cfg->fnames = calloc(cfg->fn_count, sizeof(cfg->fnames[0]));
109 |     if (cfg->fnames == NULL) { err(1, "calloc"); }
110 |     cfg->fnames[0] = "-";        /* default to stdin */
111 |     for (int i = 0; i < files; i++) { cfg->fnames[i] = argv[optind + i]; }
112 | }
113 | 
114 | /* Parse one probability field: values in [1, 100] are percentages and are
115 |  * scaled down by 100, smaller ones are taken as fractions. Input that is
116 |  * non-numeric, negative, NaN or above 100 is a usage error. */
117 | static double parse_probability(const char *s) {
118 |     char *end = NULL;
119 |     double v = strtod(s, &end);
120 |     if (end == s || !(v >= 0 && v <= 100.0)) { usage("Error: Bad percentage."); }
121 |     return (v >= 1.0) ? v / 100.0 : v;
122 | }
123 | 
124 | /* Build cfg->u.percent from the -d file list and/or the -p probabilities.
125 |  * Each pair holds an output stream (NULL discards the line) and the
126 |  * cumulative probability up to and including that output. */
127 | static void parse_percent_settings(config *cfg,
128 |         char *deal_arg, char *perc_arg) {
129 |     const int def_pairs = 1;
130 |     struct out_pair *pairs = calloc(def_pairs, sizeof(*pairs));
131 |     if (pairs == NULL) { err(1, "calloc"); }
132 |     assert(deal_arg != NULL || perc_arg != NULL);
133 |     if (!deal_arg) {                /* -p % to stdout */
134 |         pairs[0].perc = parse_probability(perc_arg);
135 |         pairs[0].out = stdout;
136 |         cfg->u.percent.pair_count = 1;
137 |         cfg->u.percent.pairs = pairs;
138 |         return;
139 |     }
140 |     /* Multiple output files: open files by name, split by ',' */
141 |     int pair_count = 0;
142 |     int pair_max = def_pairs;
143 |     for (char *fn = strsep(&deal_arg, ","); fn; fn = strsep(&deal_arg, ",")) {
144 |         FILE *f = NULL;                     /* empty name: /dev/null */
145 |         if (0 == strcmp("-", fn)) {         /* stdout */
146 |             f = stdout;
147 |         } else if (fn[0] != '\0') {         /* file */
148 |             f = fopen(fn, "ab");
149 |             if (f == NULL) { err(1, "fopen"); }
150 |         }
151 |         pairs[pair_count].out = f;
152 |         pair_count++;
153 |         if (pair_count == pair_max) {       /* grow array */
154 |             int nmax = 2 * pair_max;
155 |             struct out_pair *npairs = realloc(pairs, nmax * sizeof(*npairs));
156 |             if (npairs == NULL) { err(1, "realloc"); }
157 |             pair_max = nmax;
158 |             pairs = npairs;
159 |         }
160 |     }
161 | 
162 |     double total = 0;
163 |     if (perc_arg) {                 /* deal uneven percentages to the files */
164 |         int perc_count = 0;
165 |         for (char *perc = strsep(&perc_arg, ","); perc;
166 |              perc = strsep(&perc_arg, ",")) {
167 |             /* One more percentage than outputs would write past pairs[]. */
168 |             if (perc_count == pair_count) {
169 |                 usage("Error: Percent count does not match output count");
170 |             }
171 |             /* An empty field takes the remainder; that is already a
172 |              * fraction, so it must bypass the percent scaling. */
173 |             double v = (perc[0] == '\0') ? 1.0 - total : parse_probability(perc);
174 |             if (v < 0) { usage("Error: Total is over 100%"); }
175 |             total += v;
176 |             pairs[perc_count].perc = total;
177 |             perc_count++;
178 |         }
179 |         if (perc_count == 1) {
180 |             for (int i = 1; i < pair_count; i++) {
181 |                 total += pairs[0].perc;
182 |                 pairs[i].perc = total;  /* project */
183 |             }
184 |         } else if (perc_count < pair_count) {
185 |             usage("Error: Percent count does not match output count");
186 |         }
187 |         /* Small tolerance: the running sum accumulates rounding error. */
188 |         if (total > 1.0 + 1e-9) { usage("Error: Total is over 100%"); }
189 |     } else {                        /* divide evenly */
190 |         for (int i = 0; i < pair_count; i++) {
191 |             total += 1.0 / pair_count;
192 |             pairs[i].perc = total;
193 |         }
194 |     }
195 |     cfg->u.percent.pair_count = pair_count;
196 |     cfg->u.percent.pairs = pairs;
197 | }
198 | 
199 | /* Defaults: no mode, 4 samples, seed = secs ^ usecs (usecs alone: 10^6 seeds). */
200 | static void set_defaults(config *cfg) {
201 |     struct timeval tv;
202 |     if (gettimeofday(&tv, NULL) != 0) { err(1, "gettimeofday"); }
203 |     cfg->mode = M_UNDEF;
204 |     cfg->u.count.samples = 4;
205 |     cfg->seed = (unsigned int)tv.tv_sec ^ (unsigned int)tv.tv_usec;
206 | }
207 | 
208 | /* Open the next readable input and store it in cfg->cur_file. The name
209 |  * "-" selects stdin; names that cannot be opened are reported with warn()
210 |  * and skipped. Returns false once no names are left to try. */
211 | static bool open_next_file(config *cfg) {
212 |     FILE *in = NULL;
213 | 
214 |     while (in == NULL && cfg->fn_index < cfg->fn_count) {
215 |         const char *name = cfg->fnames[cfg->fn_index++];
216 |         if (strcmp(name, "-") == 0) {
217 |             in = stdin;
218 |         } else if ((in = fopen(name, "r")) == NULL) {
219 |             warn("%s", name);
220 |             errno = 0;
221 |         }
222 |     }
223 | 
224 |     if (in == NULL) { return false; }
225 |     cfg->cur_file = in;
226 |     return true;
227 | }
228 | 
229 | /* Return the next input line, without its newline, or NULL once every
230 |  * input is exhausted. If LENGTH is non-NULL it receives the line's full
231 |  * length. The result lives in cfg->buf and is overwritten by the next
232 |  * call. A final line lacking a newline is returned like any other. */
233 | static const char *line_iter(config *cfg, size_t *length) {
234 |     size_t used = 0;            /* bytes of the current line already in buf */
235 |     for (;;) {
236 |         if (cfg->cur_file == NULL && !open_next_file(cfg)) { return NULL; }
237 |         char *chunk = fgets(&cfg->buf[used], (int)(cfg->buf_size - used),
238 |             cfg->cur_file);
239 |         if (chunk == NULL) {    /* EOF or read error: done with this input */
240 |             fclose(cfg->cur_file);
241 |             cfg->cur_file = NULL;
242 |             if (used == 0) { continue; }
243 |             /* Unterminated last line: hand it back rather than dropping
244 |              * it or gluing it onto the next file's first line. */
245 |             if (length) { *length = used; }
246 |             return cfg->buf;
247 |         }
248 |         used += strcspn(chunk, "\n");
249 |         if (cfg->buf[used] == '\n') {
250 |             cfg->buf[used] = '\0';
251 |             /* Report the whole line, not just the last chunk read. */
252 |             if (length) { *length = used; }
253 |             return cfg->buf;
254 |         }
255 | 
256 |         /* No newline yet: the line continues, so double the buffer. */
257 |         size_t nsize = 2 * cfg->buf_size;
258 |         char *nbuf = realloc(cfg->buf, nsize);
259 |         if (nbuf == NULL) { err(1, "realloc"); }
260 |         cfg->buf_size = nsize;
261 |         cfg->buf = nbuf;
262 |     }
263 | }
264 | 
265 | /* Free CFG's buffers; in percent mode close the outputs (stdout excepted). */
266 | static void cleanup(config *cfg) {
267 |     free(cfg->fnames);
268 |     free(cfg->buf);
269 |     if (cfg->mode != M_PERC) { return; }
270 |     for (size_t i = 0; i < cfg->u.percent.pair_count; i++) {
271 |         FILE *f = cfg->u.percent.pairs[i].out;
272 |         if (f && f != stdout) { fclose(f); }  /* "-" may repeat: no double close */
273 |     }
274 |     free(cfg->u.percent.pairs);
275 | }
276 | 
277 | /* Entry point: load defaults, parse arguments, allocate the line buffer,
278 |  * seed the generator, run the selected sampler, and clean up. Returns the
279 |  * sampler's result as the exit status. */
280 | int main(int argc, char **argv) {
281 |     config cfg;
282 |     int status = 0;
283 | 
284 |     memset(&cfg, 0, sizeof(cfg));
285 |     set_defaults(&cfg);
286 |     handle_args(&cfg, argc, argv);
287 | 
288 |     cfg.buf_size = SAMPLE_DEF_BUF_SIZE;
289 |     cfg.buf = malloc(cfg.buf_size);
290 |     if (cfg.buf == NULL) { err(1, "malloc"); }
291 | 
292 |     srandom(cfg.seed);
293 | 
294 |     if (cfg.mode == M_COUNT) {
295 |         status = sample_count(&cfg, line_iter);
296 |     } else if (cfg.mode == M_PERC) {
297 |         status = sample_percent(&cfg, line_iter);
298 |     } else {
299 |         assert(0);
300 |     }
301 | 
302 |     cleanup(&cfg);
303 |     return status;
304 | }
305 | 


--------------------------------------------------------------------------------
/sample.c:
--------------------------------------------------------------------------------
  1 | /* 
  2 |  * Copyright (c) 2011-14 Scott Vokes <vokes.s@gmail.com>
  3 |  *  
  4 |  * Permission to use, copy, modify, and/or distribute this software for any
  5 |  * purpose with or without fee is hereby granted, provided that the above
  6 |  * copyright notice and this permission notice appear in all copies.
  7 |  *  
  8 |  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  9 |  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 10 |  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 11 |  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 12 |  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 13 |  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 14 |  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 15 |  */
 16 | 
 17 | #include "sample.h"
 18 | 
 19 | typedef struct {                /* one reservoir slot used by sample_count() */
 20 |     char *buf;                  /* NUL-terminated copy of the kept line */
 21 |     size_t buf_sz;              /* bytes allocated for buf */
 22 |     size_t length;              /* length of the kept line, excluding the NUL */
 23 |     size_t line_number;         /* id given to keep(); si_cmp() sorts on it */
 24 | } sample_info;
 25 | 
 26 | static int si_cmp(const void *a, const void *b);
 27 | static bool keep(sample_info *kept, const char *line, size_t len, size_t id);
 28 | 
 29 | #define DEF_BUF_SIZE 8          /* Start small to exercise buffer growing */
 30 | 
 31 | /* Randomly sample N values from a stream of unknown length, with
 32 |  * unknown probability. Knuth 3.4.2, algorithm R (Reservoir sampling).
 33 |  * The chosen lines are printed to stdout in input order; fewer than N
 34 |  * are printed when the input is shorter than that. Always returns 0. */
 35 | int sample_count(config *cfg, line_iter_cb *line_iter) {
 36 |     size_t samples = cfg->u.count.samples;
 37 |     sample_info *kept = calloc(samples, sizeof(*kept));
 38 |     if (kept == NULL) { err(1, "calloc"); }
 39 | 
 40 |     for (size_t s = 0; s < samples; s++) {
 41 |         kept[s].buf = malloc(DEF_BUF_SIZE);
 42 |         if (kept[s].buf == NULL) { err(1, "malloc"); }
 43 |         kept[s].buf_sz = DEF_BUF_SIZE;
 44 |     }
 45 | 
 46 |     /* Fill the reservoir with the first SAMPLES lines. */
 47 |     size_t len = 0;
 48 |     size_t seen = 0;            /* input lines consumed so far */
 49 |     const char *line = line_iter(cfg, &len);
 50 |     while (line != NULL && seen < samples) {
 51 |         if (!keep(&kept[seen], line, len, seen)) { err(1, "realloc"); }
 52 |         seen++;
 53 |         line = line_iter(cfg, &len);
 54 |     }
 55 |     const size_t filled = seen; /* slots holding a line; < samples on short input */
 56 | 
 57 |     /* Algorithm R: line number SEEN (0-based) must survive with probability
 58 |      * samples / (seen + 1), so draw a slot from [0, seen] inclusive. Drawing
 59 |      * from [0, seen) instead would always keep line SAMPLES. */
 60 |     while (line != NULL) {
 61 |         size_t slot = (size_t)random() % (seen + 1);
 62 |         if (slot < samples) {
 63 |             if (!keep(&kept[slot], line, len, seen)) { err(1, "realloc"); }
 64 |         }
 65 |         seen++;
 66 |         line = line_iter(cfg, &len);
 67 |     }
 68 | 
 69 |     /* Output samples, in input order. Only the filled slots are sorted and
 70 |      * printed, so a sampled empty line is not mistaken for an unused slot. */
 71 |     qsort(kept, filled, sizeof(sample_info), si_cmp);
 72 |     for (size_t s = 0; s < filled; s++) {
 73 |         printf("%s\n", kept[s].buf);
 74 |     }
 75 | 
 76 |     /* free kept and buffers */
 77 |     for (size_t s = 0; s < samples; s++) {
 78 |         free(kept[s].buf);
 79 |     }
 80 |     free(kept);
 81 |     return 0;
 82 | }
 83 | 
 84 | /* qsort comparator: orders sample_info entries by ascending line_number. */
 85 | static int si_cmp(const void *a, const void *b) {
 86 |     const sample_info *lhs = a, *rhs = b;
 87 |     return (lhs->line_number > rhs->line_number) - (lhs->line_number < rhs->line_number);
 88 | }
 89 | 
 90 | /* Copy LINE (LEN bytes, plus a terminating NUL) into slot SI and tag it
 91 |  * with ID. The slot's buffer is doubled until the line fits. Returns
 92 |  * false, leaving the slot untouched, if the buffer cannot be grown. */
 93 | static bool keep(sample_info *si, const char *line, size_t len, size_t id) {
 94 |     const size_t need = len + 1;
 95 |     if (si->buf_sz < need) {
 96 |         size_t grown = si->buf_sz;
 97 |         do { grown *= 2; } while (grown < need);
 98 |         char *bigger = realloc(si->buf, grown);
 99 |         if (bigger == NULL) { return false; }
100 |         si->buf = bigger;
101 |         si->buf_sz = grown;
102 |     }
103 | 
104 |     si->line_number = id;
105 |     si->length = len;
106 |     memcpy(si->buf, line, len);
107 |     si->buf[len] = '\0';
108 |     return true;
109 | }
110 | 
111 | /* Deal each input line to at most one output: a uniform draw in [0, 1)
112 |  * selects the first pair whose cumulative probability exceeds it. A line
113 |  * that matches no pair, or a pair whose stream is NULL, is dropped.
114 |  * Always returns 0. */
115 | int sample_percent(config *cfg, line_iter_cb *line_iter) {
116 |     const struct out_pair *table = cfg->u.percent.pairs;
117 |     const size_t entries = cfg->u.percent.pair_count;
118 |     const char *line;
119 | 
120 |     while ((line = line_iter(cfg, NULL)) != NULL) {
121 |         const double draw = (random() % RAND_MAX) / (double)RAND_MAX;
122 |         /* Find the first entry whose cumulative share covers the draw. */
123 |         size_t hit = 0;
124 |         while (hit < entries && !(draw < table[hit].perc)) { hit++; }
125 | 
126 |         if (hit < entries && table[hit].out != NULL) {
127 |             fprintf(table[hit].out, "%s\n", line);
128 |         }
129 |     }
130 |     return 0;
131 | }
132 | 


--------------------------------------------------------------------------------
/sample.h:
--------------------------------------------------------------------------------
 1 | #ifndef SAMPLE_H
 2 | #define SAMPLE_H
 3 | 
 4 | #include <stdlib.h>
 5 | #include <unistd.h>
 6 | #include <stdio.h>
 7 | #include <string.h>
 8 | #include <err.h>
 9 | #include <assert.h>
10 | #include <stdbool.h>
11 | 
12 | #include <errno.h>
13 | #include <sys/time.h>
14 | 
15 | typedef enum {                  /* how input lines are selected */
16 |     M_UNDEF,                    /* undefined: no mode option seen yet */
17 |     M_COUNT,                    /* get N samples */
18 |     M_PERC,                     /* get N% of input */
19 | } filter_mode;
20 | 
21 | enum {
22 |     SAMPLE_DEF_BUF_SIZE = 2,    /* initial size of the line buffer, in bytes */
23 | };
24 | 
25 | typedef struct {                /* run-time state shared by main.c and sample.c */
26 |     unsigned int seed;          /* value passed to srandom() */
27 | 
28 |     /* Input stream(s) */
29 |     FILE *cur_file;             /* input being read; NULL when none is open */
30 |     const char **fnames;        /* input names; "-" means stdin */
31 |     size_t fn_count;            /* number of entries in fnames */
32 |     size_t fn_index;            /* index of the next name to open */
33 | 
34 |     filter_mode mode;           /* tells which member of u is in use */
35 |     union {
36 |         struct {
37 |             size_t samples;     /* number of lines to keep (-n) */
38 |         } count;
39 |         struct {
40 |             size_t pair_count;  /* number of entries in pairs */
41 |             struct out_pair *pairs; /* outputs with cumulative probabilities */
42 |         } percent;
43 |     } u;
44 | 
45 |     size_t buf_size;            /* bytes allocated for buf */
46 |     char *buf;                  /* line buffer holding the most recent line */
47 | } config;
48 | 
49 | struct out_pair {               /* one output of percent mode */
50 |     double perc;                /* cumulative probability, <= 1.0 */
51 |     FILE *out;                  /* destination stream; NULL drops the line */
52 | };
53 | 
54 | /* Get the next line from the input source(s), or NULL on EOS. */
55 | typedef const char *(line_iter_cb)(config *cfg, size_t *length);
56 | 
57 | int sample_count(config *cfg, line_iter_cb *line_iter);
58 | int sample_percent(config *cfg, line_iter_cb *line_iter);
59 | 
60 | #endif
61 | 


--------------------------------------------------------------------------------