├── .gitignore ├── ARCHITECTURE.md ├── LICENSE ├── Makefile ├── README.md ├── aardwarc.c ├── aardwarc.h ├── config.c ├── config.h ├── flint.lnt ├── getjob.c ├── gzip.c ├── header.c ├── ident.c ├── index.c ├── main.c ├── main_audit.c ├── main_byid.c ├── main_cgi.c ├── main_dumpindex.c ├── main_filter.c ├── main_get.c ├── main_housekeeping.c ├── main_info.c ├── main_mksilo.c ├── main_rebuild.c ├── main_reindex.c ├── main_stevedore.c ├── main_store.c ├── main_stow.c ├── main_testbytes.c ├── miniobj.h ├── proto.c ├── rsilo.c ├── segjob.c ├── silo.c ├── tests ├── alltest.sh ├── gcov_report.py ├── test.rc ├── test00.sh ├── test01.sh ├── test02.sh ├── test03.sh ├── test04.sh └── test05.sh ├── utilities └── select_silos.py ├── vas.c ├── vas.h ├── vdef.h ├── vlu.c ├── vlu.h ├── vnum.c ├── vqueue.h ├── vsb.c ├── vsb.h ├── warcinfo.c └── wsilo.c /.gitignore: -------------------------------------------------------------------------------- 1 | .depend* 2 | *.o 3 | *.gcno 4 | *.gcda 5 | *.gcov 6 | aardwarc 7 | aardwarc.debug 8 | aardwarc.full 9 | _.coverage.raw 10 | _.coverage.txt 11 | -------------------------------------------------------------------------------- /ARCHITECTURE.md: -------------------------------------------------------------------------------- 1 | # Architectural notes about AardWARC 2 | 3 | Some of the architectural decisions in AardWARC are very 4 | counter-intuitive, and therefore worthy of an explanation. 5 | 6 | ## Permanence 7 | 8 | Because AardWARC is written for digital collections in museums, 9 | permanence is the topmost priority. 10 | 11 | The WARC/ISO-28500 file format goes a long way in this respect, 12 | both in terms of documentation and built in integrity checks. 13 | 14 | What AardWARC brings to the table in this respect is mostly auditing 15 | and index rebuild facilities, so that the WARC files alone define 16 | the storage, with the index just being an adjunct access speed-up. 
17 | 18 | ## Scaling 19 | 20 | Museums never throw anything out and therefore AardWARC stores 21 | are unbounded in size, both in terms of items and bytes. 22 | 23 | As far as storing objects goes, WARC/ISO-28500 has that down pat, 24 | but retrieving objects is a different matter. 25 | 26 | One can, as a last resort, read all the WARC files sequentially 27 | to find something, but in day-to-day usage, an index is required. 28 | 29 | I decided against SQLite3 as the index engine because it is 273KLOC 30 | where AardWARC is barely 8KLOC; it would truly be the tail wagging 31 | the dog. 32 | 33 | Berkeley DB is a more reasonable 13KLOC, but Oracle owns the project 34 | which means that relying on it in the long term is not indicated. 35 | 36 | Technically even Berkeley DB would be overkill because each indexed 37 | object is born with a well behaved, and well distributed, by which 38 | I mean uniformly distributed, long random(-ish) unique key, and we 39 | never delete anything. 40 | 41 | The implemented AardWARC index consists of two parts, a sorted list 42 | of 32-byte entries, and an unsorted "appendix" containing the most 43 | recently written items. 44 | 45 | Because the keys are uniformly distributed, lookups in the sorted 46 | index can go almost directly to the entry from the WARC-ID (see 47 | comments at the top of index.c for the "almost" part) and if not 48 | found there, the "appendix" is small enough to read sequentially. 49 | 50 | When adding new items only an atomic and robust append write to 51 | the "index.appendix" file is required. 52 | 53 | Periodically, a "housekeeping" operation sorts the appendix and 54 | merges it with the sorted index to create a new sorted index. 55 | 56 | At some point, the sorted index file will grow too big, at which time 57 | it will be split into multiple files, based on a prefix of the 58 | WARC-ID bits, but this is not yet implemented. 
59 | 60 | *phk* 61 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2018, Poul-Henning Kamp 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SRCS += aardwarc.c 2 | SRCS += config.c 3 | SRCS += getjob.c 4 | SRCS += gzip.c 5 | SRCS += header.c 6 | SRCS += ident.c 7 | SRCS += index.c 8 | SRCS += main.c 9 | SRCS += main_audit.c 10 | SRCS += main_byid.c 11 | SRCS += main_cgi.c 12 | SRCS += main_dumpindex.c 13 | SRCS += main_filter.c 14 | SRCS += main_get.c 15 | SRCS += main_housekeeping.c 16 | SRCS += main_info.c 17 | SRCS += main_mksilo.c 18 | SRCS += main_rebuild.c 19 | SRCS += main_reindex.c 20 | SRCS += main_stevedore.c 21 | SRCS += main_store.c 22 | SRCS += main_stow.c 23 | SRCS += main_testbytes.c 24 | SRCS += proto.c 25 | SRCS += rsilo.c 26 | SRCS += segjob.c 27 | SRCS += silo.c 28 | SRCS += vas.c 29 | SRCS += vlu.c 30 | SRCS += vnum.c 31 | SRCS += vsb.c 32 | SRCS += warcinfo.c 33 | SRCS += wsilo.c 34 | 35 | PROG = aardwarc 36 | 37 | LDADD += -lmd 38 | LDADD += -lm 39 | LDADD += -lz 40 | 41 | CFLAGS += -DGITREV=`cd ${.CURDIR} && git log -n 1 '--format=format:"%h"'` 42 | CFLAGS += ${COVERAGE_FLAGS} 43 | 44 | COVFILES = *.gcov *.gcda *.gcno _.coverage.txt _.coverage.raw 45 | CLEANFILES += ${COVFILES} 46 | 47 | WARNS ?= 6 48 | 49 | MK_MAN = no 50 | 51 | DESTDIR ?= /usr/local/bin 52 | 53 | .include 54 | 55 | coverage: 56 | make cleandir 57 | rm -rf ${COVFILES} _.coverage 58 | make depend 59 | make COVERAGE_FLAGS="-O0 -g --coverage" 60 | make runtest 61 | llvm-cov gcov -f ${SRCS} | tee _.coverage.raw | \ 62 | python3 tests/gcov_report.py > _.coverage.txt 63 | mkdir -p _.coverage 64 | mv ${COVFILES} _.coverage 65 | make clean all 66 | tail -4 _.coverage/_.coverage.txt 67 | 68 | flint: 69 | cd ${.CURDIR} && flexelint \ 70 | -I. 
\ 71 | -I/usr/include \ 72 | flint.lnt \ 73 | ${SRCS} 74 | 75 | test: ${PROG} runtest 76 | 77 | runtest: 78 | cd ${.CURDIR}/tests && env AA=${.OBJDIR}/aardwarc sh alltest.sh 79 | 80 | t2: ${PROG} 81 | 82 | ./aardwarc -c mnt.conf audit /mnt/AA/0/00000000.warc.gz 83 | 84 | t3: 85 | ./aardwarc -c mnt.conf audit 86 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AardWARC 2 | 3 | ## Museum-quality bit-archive storage management 4 | 5 | This is a small, high-quality storage engine for saving files into, 6 | and retrieving files out of permanent ISO 28500 compliant WARC silos. 7 | 8 | This is only a storage engine, it supports little more than the 9 | two operations "store this" and "get that", all the other aspects 10 | of a proper bit-archive, access control, user interfaces, 11 | data validation and so on must be provided elsewhere. 12 | 13 | Each stored file gets assigned a WARC-Record-ID. WARC records of 14 | type "resource" use the SHA256 sum of the stored file, and WARC 15 | "metadata" records use the SHA256 of the "WARC-Refers-To:" header 16 | and the metadata file content. 17 | 18 | An auxiliary rebuildable index facilitates rapid access to individual 19 | stored objects keyed by the WARC-Record-ID. 20 | 21 | Almost the entire focus during development has been on correctness and 22 | robustness, and presently FreeBSD is the only supported platform, 23 | but porting it to other UNIX-like operating systems is expected to 24 | require a trivial effort. (Patches/Pulls are welcome). 25 | 26 | One "test-application" called "stow" is built in, it makes a very 27 | neat deduplicated permanent personal archive. In compliance with 28 | the "dogfood" principle, I have all 217 Gigabytes of my personal 29 | archive stored with "stow". 
30 | 31 | *phk* 32 | -------------------------------------------------------------------------------- /aardwarc.c: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2016 Poul-Henning Kamp 3 | * All rights reserved. 4 | * 5 | * Author: Poul-Henning Kamp 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 1. Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * 2. Redistributions in binary form must reproduce the above copyright 13 | * notice, this list of conditions and the following disclaimer in the 14 | * documentation and/or other materials provided with the distribution. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 | * SUCH DAMAGE. 
27 | * 28 | */ 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | #include "vdef.h" 39 | 40 | #include "miniobj.h" 41 | #include "vsb.h" 42 | #include "vas.h" 43 | 44 | #include "aardwarc.h" 45 | 46 | struct aardwarc * 47 | AardWARC_New(const char *config_file, struct vsb *err) 48 | { 49 | int e; 50 | struct aardwarc *aa; 51 | const char *p, *p2; 52 | uintmax_t um; 53 | 54 | ALLOC_OBJ(aa, AARDWARC_MAGIC); 55 | if (aa == NULL) 56 | return (aa); 57 | 58 | do { 59 | aa->cfg = Config_Read(config_file); 60 | if (aa->cfg == NULL) { 61 | VSB_printf(err, "Cannot open %s: %s\n", 62 | config_file, strerror(errno)); 63 | break; 64 | } 65 | 66 | if (Config_Get(aa->cfg, "WARC-Record-ID", &aa->prefix, &p)) { 67 | VSB_printf(err, 68 | "'WARC-Record-ID' not found in config.\n"); 69 | break; 70 | } 71 | if (aa->prefix[strlen(aa->prefix) - 1] != '/') { 72 | VSB_printf(err, 73 | "'WARC-Record-ID' must end in '/'\n"); 74 | break; 75 | } 76 | if (p != NULL) { 77 | aa->id_size = strtoul(p, NULL, 0); 78 | if (aa->id_size < 64 || aa->id_size > 256) { 79 | VSB_printf(err, 80 | "Illegal 'WARC-Record-ID' length %s.\n", 81 | "\tMust be [64...256] bits"); 82 | break; 83 | } 84 | if (aa->id_size & 3) { 85 | VSB_printf(err, 86 | "Illegal 'WARC-Record-ID' length %s.\n", 87 | "\tMust be divisible by 4 bits"); 88 | break; 89 | } 90 | aa->id_size >>= 2; 91 | } else 92 | aa->id_size = 32; 93 | assert(aa->id_size >= 16 && aa->id_size <= 64); 94 | 95 | if (Config_Get(aa->cfg, "silo.directory", &p, NULL)) { 96 | VSB_printf(err, 97 | "'silo.directory' not found in config.\n"); 98 | break; 99 | } 100 | aa->silo_dirname = p; 101 | if (aa->silo_dirname[strlen(aa->silo_dirname) - 1] != '/') { 102 | VSB_printf(err, 103 | "'silo.directory' must end in '/'\n"); 104 | break; 105 | } 106 | 107 | if (Config_Get(aa->cfg, "silo.max_size", &p, NULL)) 108 | p = "3.5G"; 109 | 110 | p2 = VNUM_2bytes(p, &um, 0); 111 | if (p2 != NULL) { 112 | VSB_printf(err, 
113 | "'silo.max_size' size \"%s\":\t%s\n", p, p2); 114 | break; 115 | } 116 | aa->silo_maxsize = (off_t)um; 117 | assert(um == (uintmax_t)aa->silo_maxsize); 118 | 119 | if (Config_Get(aa->cfg, "silo.basename", &p, NULL)) 120 | p = "%08u.warc.gz"; 121 | 122 | if (fmtcheck(p, "%u") != p) { 123 | VSB_printf(err, 124 | "'silo.basename' wrong format. %s\n", 125 | "Must have a single %u compatible printf-pattern"); 126 | break; 127 | } 128 | aa->silo_basename = p; 129 | if (strchr(p, '/') != NULL) { 130 | VSB_printf(err, 131 | "'silo.basename' Cannot contain '/'\n"); 132 | break; 133 | } 134 | 135 | if (Config_Get(aa->cfg, "index.sort_size", &p, NULL)) 136 | p = "10M"; 137 | p2 = VNUM_2bytes(p, &um, 0); 138 | if (p2 != NULL) { 139 | VSB_printf(err, 140 | "'index.sort_size' size \"%s\":\t%s\n", p, p2); 141 | break; 142 | } 143 | aa->index_sort_size = (off_t)um; 144 | aa->index_sort_size &= ~0x1f; 145 | if (aa->index_sort_size < 4096) { 146 | VSB_printf(err, 147 | "'index.sort_size' is too small (>= 4k)\n"); 148 | break; 149 | } 150 | 151 | aa->cache_first_non_silo = 0; 152 | aa->cache_first_space_silo = 0; 153 | 154 | return (aa); 155 | 156 | } while (0); 157 | e = errno; 158 | /* XXX: free aa->cfg */ 159 | FREE_OBJ(aa); 160 | errno = e; 161 | return (NULL); 162 | } 163 | 164 | void 165 | AardWARC_ReadCache(struct aardwarc *aa) 166 | { 167 | struct vsb *vsb; 168 | uint8_t buf[4 * 2]; 169 | int i; 170 | int fd; 171 | 172 | vsb = VSB_new_auto(); 173 | AN(vsb); 174 | VSB_printf(vsb, "%s/_.cache", aa->silo_dirname); 175 | AZ(VSB_finish(vsb)); 176 | fd = open(VSB_data(vsb), O_RDONLY); 177 | if (fd >= 0) { 178 | i = read(fd, buf, sizeof buf); 179 | if (i == sizeof buf) { 180 | aa->cache_first_non_silo = be32dec(buf); 181 | aa->cache_first_space_silo = be32dec(buf + 4); 182 | } 183 | AZ(close(fd)); 184 | } 185 | VSB_delete(vsb); 186 | } 187 | 188 | void 189 | AardWARC_WriteCache(const struct aardwarc *aa) 190 | { 191 | struct vsb *vsb; 192 | uint8_t buf[4 * 2]; 193 | int fd; 
194 | 195 | vsb = VSB_new_auto(); 196 | AN(vsb); 197 | VSB_printf(vsb, "%s/_.cache", aa->silo_dirname); 198 | AZ(VSB_finish(vsb)); 199 | be32enc(buf, aa->cache_first_non_silo); 200 | be32enc(buf + 4, aa->cache_first_space_silo); 201 | fd = open(VSB_data(vsb), O_WRONLY|O_CREAT, 0644); 202 | if (fd >= 0) { 203 | (void)write(fd, buf, sizeof buf); 204 | AZ(close(fd)); 205 | } 206 | VSB_delete(vsb); 207 | } 208 | -------------------------------------------------------------------------------- /aardwarc.h: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2016 Poul-Henning Kamp 3 | * All rights reserved. 4 | * 5 | * Author: Poul-Henning Kamp 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 1. Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * 2. Redistributions in binary form must reproduce the above copyright 13 | * notice, this list of conditions and the following disclaimer in the 14 | * documentation and/or other materials provided with the distribution. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | * ARE DISCLAIMED. 
#include "vqueue.h"

struct vsb;
struct getjob;
struct config;
struct rsilo;
struct wsilo;
struct header;
struct segjob;

/*
 * An instance of an aardwarc store
 *
 * Created by AardWARC_New(), which fills in the configuration fields.
 */

struct aardwarc {
	unsigned		magic;
#define AARDWARC_MAGIC		0x50136925

	/* Arguments */
	int			json;

	/* From Configuration */
	struct config		*cfg;
	/* "WARC-Record-ID" prefix, always ends in '/' */
	const char		*prefix;
	/* "silo.directory", always ends in '/' */
	const char		*silo_dirname;
	/* "silo.basename", a "%u" compatible pattern without '/' */
	const char		*silo_basename;
	/* "silo.max_size" in bytes */
	off_t			silo_maxsize;
	/* Not set by AardWARC_New() */
	const char		*mime_validator;
	/* ID length in 4-bit units, [16...64] */
	unsigned		id_size;

	/* "index.sort_size" in bytes, multiple of 32, at least 4096 */
	size_t			index_sort_size;

	/* Persisted by AardWARC_ReadCache() / AardWARC_WriteCache() */
	uint32_t		cache_first_non_silo;
	uint32_t		cache_first_space_silo;
};

/* A general iterator for walking over bytes */
typedef int byte_iter_f(void *priv, const void *ptr, ssize_t len);

/* aardwarc.c */
struct aardwarc *AardWARC_New(const char *config_file, struct vsb *err);
void AardWARC_ReadCache(struct aardwarc *aa);
void AardWARC_WriteCache(const struct aardwarc *aa);

/* config.c */

struct config *Config_Read(const char *fn);
int Config_Get(const struct config *, const char *section, const char **np,
    const char **ap);

/* Callback for Config_Iter(); a non-zero return stops the iteration */
typedef int config_f(void *priv, const char *name, const char *arg);

int Config_Iter(const struct config *, const char *section, void *priv,
    config_f func);
int Config_Find(const struct config *, const char *section, const char *name,
    const char **ap);

/* getjob.c */

struct getjob *GetJob_New(struct aardwarc *, const char *id, struct vsb *);
void GetJob_Delete(struct getjob **);
const struct header *GetJob_Header(const struct getjob *, int first);
void GetJob_Iter(const struct getjob *, byte_iter_f *func, void *priv,
    int gzip);
off_t GetJob_TotalLength(const struct getjob *, int gzip);
int GetJob_IsSegmented(const struct getjob *);
struct vsb *GetJob_Headers(const struct getjob *);

/* gzip.c */

void Gzip_Vsb(struct vsb **, int level);
extern const uint8_t Gzip_crnlcrnl[24];
/* NOTE(review): presumably requires <zlib.h> to be included first -- confirm */
#ifdef Z_OK
void Gzip_AddAa(z_stream *);
#define AA_COMPRESSION	Z_BEST_COMPRESSION
//#define AA_COMPRESSION	Z_NO_COMPRESSION
void Gzip_InitDeflate(z_stream *zs);
#endif
int64_t Gzip_ReadAa(const void *, size_t);
void Gzip_WriteAa(int, int64_t);

struct gzip_stitch;
struct gzip_stitch * gzip_stitch_new(byte_iter_f *func, void *priv);
int gzip_stitch_feed(void *priv, const void *ptr, ssize_t len);
int gzip_stitch_fini(struct gzip_stitch *gs);

/* header.c */

struct header *Header_New(const struct aardwarc *);
void Header_Delete(struct header *, const char *);
void Header_Destroy(struct header **hdp);
struct header *Header_Clone(const struct header *hd);
int Header_Len(const char *name, const char *val, ...)
    v_printflike_(2, 3);
void Header_Set(struct header *, const char *name, const char *val, ...)
    v_printflike_(3, 4);
struct vsb *Header_Serialize(const struct header *, int level);
const char *Header_Get_Id(const struct header *);
intmax_t Header_Get_Number(const struct header *, const char *);
void Header_Set_Id(struct header *, const char *);
void Header_Set_Date(struct header *);
void Header_Set_Ref(struct header *, const char *name, const char *ref);
struct header *Header_Parse(const struct aardwarc *, char *);
//off_t Header_Get_GZlen(const struct header *);
const char *Header_Get(const struct header *, const char *name);

/* ident.c */

void Ident_Create(const struct aardwarc *, const struct header *,
    const char *, char *ident);
void Ident_Set(const struct aardwarc *, struct header *,
    const char *, const char *);
char *Digest2Ident(const struct aardwarc *, const char *);

/* index.c */

void IDX_Insert(const struct aardwarc *aa, const char *key, uint32_t flags,
    uint32_t silo, uint64_t offset, const char *cont);

/* Callback for IDX_Iter(), called with the fields of one index entry */
typedef int idx_iter_f(void *priv, const char *key,
    uint32_t flag, uint32_t silo, int64_t offset, const char *cont);

int IDX_Iter(const struct aardwarc *aa, const char *key_part,
    idx_iter_f *func, void *priv);

void IDX_Resort(const struct aardwarc *aa);

const char *IDX_Valid_Id(const struct aardwarc *,
    const char *id, const char **nid);

/* Index entry flags, as passed to IDX_Insert() and idx_iter_f */

/* bottom nibble: type */
/* 0 -> continuation */
#define IDX_F_WARCINFO		(1 << 1)
#define IDX_F_RESOURCE		(1 << 2)
#define IDX_F_METADATA		(1 << 3)

/* next nibble: segmentation */
#define IDX_F_SEGMENTED		(1 << 4)
#define IDX_F_FIRSTSEG		(1 << 5)
#define IDX_F_LASTSEG		(1 << 6)

/* proto.c */

int proto_in(int fd, unsigned *cmd, unsigned *len);
void proto_out(int fd, unsigned cmd, const void *ptr, size_t len);
void proto_send_msg(int fd, const char *fmt, ...) v_printflike_(2,3);

typedef void proto_ev_func_f(int fd, void *priv, int revents);
uintptr_t proto_add_ev(int fd, short events, proto_ev_func_f *func, void *priv);
void proto_del_ev(uintptr_t *id);
void proto_ctl_ev(uintptr_t id, int enable);
void proto_dispatch_evs(void);

/* NOTE(review): presumably the 'cmd' values of proto_in()/proto_out() -- confirm */
#define PROTO_MSG		0
#define PROTO_FILTER		1
#define PROTO_DATA		2
#define PROTO_META		3

#define STOW_META		"application/json"

/* segjob.c */

struct segjob *SegJob_New(struct aardwarc *, const struct header *,
    const char *);
void SegJob_Feed(struct segjob *, const void *ptr, ssize_t len);
char *SegJob_Commit(struct segjob *);

/* silo.c */
struct vsb *Silo_Filename(const struct aardwarc *, unsigned number, int hold);
int Silo_Iter(const struct aardwarc *, byte_iter_f *func, void *priv);


/* rsilo.c */
struct rsilo *Rsilo_Open(struct aardwarc *, const char *fn, uint32_t nsilo,
    int64_t off);
void Rsilo_Close(struct rsilo **);
void Rsilo_NextHeader(struct rsilo *);
struct header *Rsilo_ReadHeader(struct rsilo *);
uintmax_t Rsilo_ReadChunk(struct rsilo *, byte_iter_f *, void *);
int64_t Rsilo_BodyLen(const struct rsilo *);
int Rsilo_ReadGZChunk(struct rsilo *, byte_iter_f *, void *);
off_t Rsilo_Tell(const struct rsilo *);
void Rsilo_SkipCRNL(struct rsilo *rs);

/* wsilo.c */
struct wsilo *Wsilo_New(struct aardwarc *aa, uint32_t silono);
struct wsilo *Wsilo_Next(struct aardwarc *);
void Wsilo_GetSpace(const struct wsilo *, void **ptr, ssize_t *len);
int Wsilo_Store(struct wsilo *, ssize_t len);
void Wsilo_Finish(struct wsilo *);
void Wsilo_Header(struct wsilo *, struct header *, int pad);
void Wsilo_Commit(struct wsilo **, int segd, const char *id, const char *rid);
void Wsilo_Install(struct wsilo **);
void Wsilo_Abandon(struct wsilo **);

/* vnum.c */
const char *VNUM_2bytes(const char *p, uintmax_t *r, uintmax_t rel);

/* warcinfo.c */
char *Warcinfo_New(const struct aardwarc *, struct wsilo *, uint32_t silono);

/* main*c */

void usage(const char *a0, const char *err);
int call_main(const char *a0, struct aardwarc *aa, int argc, char **argv);
/* Signature shared by all the main_*() sub-command entry points below */
typedef int main_f(const char *a0, struct aardwarc *,
    int argc, char **argv);
extern main_f main_audit;
extern main_f main_byid;
extern main_f main_cgi;
extern main_f main_dumpindex;
extern main_f main_filter;
extern main_f main_get;
extern main_f main_housekeeping;
extern main_f main_info;
extern main_f main_mksilo;
extern main_f main_rebuild;
extern main_f main_reindex;
extern main_f main_stevedore;
extern main_f main_store;
extern main_f main_stow;
extern main_f main__testbytes;
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 | * SUCH DAMAGE. 27 | * 28 | */ 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | #include "vdef.h" 40 | #include "miniobj.h" 41 | #include "vas.h" 42 | 43 | #include "aardwarc.h" 44 | 45 | struct entry { 46 | unsigned magic; 47 | #define ENTRY_MAGIC 0x4101dadc 48 | VTAILQ_ENTRY(entry) list; 49 | const char *name; 50 | const char *arg; 51 | }; 52 | 53 | struct section { 54 | unsigned magic; 55 | #define SECTION_MAGIC 0x18a00a8c 56 | unsigned nentry; 57 | VTAILQ_ENTRY(section) list; 58 | const char *name; 59 | VTAILQ_HEAD(,entry) entries; 60 | }; 61 | 62 | struct config { 63 | unsigned magic; 64 | #define CONFIG_MAGIC 0x31b7d969 65 | unsigned nsection; 66 | VTAILQ_HEAD(,section) sections; 67 | u_char *space; 68 | }; 69 | 70 | static void 71 | config_destroy(struct config *cfg) 72 | { 73 | 74 | REPLACE(cfg->space, NULL); 75 | FREE_OBJ(cfg); 76 | } 77 | 78 | static const char * 79 | config_parse(struct config *cfg) 80 | { 81 | char *p, *q, *r, *e; 82 | struct section *sc = NULL; 83 | struct entry *ent = NULL; 84 | 85 | CHECK_OBJ_NOTNULL(cfg, CONFIG_MAGIC); 86 | 87 | for (p = (char*)cfg->space; *p != '\0'; p 
= e) { 88 | e = strchr(p, '\n'); 89 | if (e != NULL) 90 | *e++ = '\0'; 91 | else 92 | e = strchr(p, '\0'); 93 | q = strchr(p, '#'); 94 | if (q != NULL) 95 | *q = '\0'; 96 | r = NULL; 97 | for (q = p; *q != '\0'; q++) 98 | if (!isspace(*q)) 99 | r = q; 100 | if (r == NULL) 101 | continue; 102 | r[1] = '\0'; 103 | if (!isspace(*p)) { 104 | r = strchr(p, ':'); 105 | if (r == NULL || r[1] != '\0') 106 | return ("Section colon trouble"); 107 | *r = '\0'; 108 | VTAILQ_FOREACH(sc, &cfg->sections, list) 109 | if (!strcmp(sc->name, p)) 110 | return ("Duplicate sections"); 111 | ALLOC_OBJ(sc, SECTION_MAGIC); 112 | AN(sc); 113 | VTAILQ_INIT(&sc->entries); 114 | VTAILQ_INSERT_TAIL(&cfg->sections, sc, list); 115 | cfg->nsection++; 116 | sc->name = p; 117 | } else { 118 | if (sc == NULL) 119 | return ("No section yet"); 120 | for (q = p; isspace(*q); q++) 121 | continue; 122 | ALLOC_OBJ(ent, ENTRY_MAGIC); 123 | AN(ent); 124 | VTAILQ_INSERT_TAIL(&sc->entries, ent, list); 125 | sc->nentry++; 126 | ent->name = q; 127 | for (; *q != '\0' && !isspace(*q) ; q++) 128 | continue; 129 | if (isspace(*q)) { 130 | *q++ = '\0'; 131 | while (isspace(*q)) 132 | q++; 133 | ent->arg = q; 134 | } 135 | } 136 | } 137 | return (NULL); 138 | } 139 | 140 | #if 0 141 | static void 142 | config_dump(const struct config *cfg) 143 | { 144 | struct section *sc; 145 | struct entry *ent; 146 | 147 | CHECK_OBJ_NOTNULL(cfg, CONFIG_MAGIC); 148 | VTAILQ_FOREACH(sc, &cfg->sections, list) { 149 | printf("%s:\n", sc->name); 150 | VTAILQ_FOREACH(ent, &sc->entries, list) { 151 | if (ent->arg != NULL) 152 | printf("\t%s\t%s\n", ent->name, ent->arg); 153 | else 154 | printf("\t%s\n", ent->name); 155 | } 156 | } 157 | } 158 | #endif 159 | 160 | struct config * 161 | Config_Read(const char *fn) 162 | { 163 | struct config *cfg; 164 | int fd; 165 | struct stat st; 166 | ssize_t ssz; 167 | u_char *p; 168 | const char *e; 169 | 170 | fd = open(fn, O_RDONLY); 171 | if (fd < 0) 172 | return (NULL); 173 | AZ(fstat(fd, &st)); 
174 | if ((st.st_mode & S_IFMT) != S_IFREG) { 175 | AZ(close(fd)); 176 | errno = EINVAL; 177 | return (NULL); 178 | } 179 | ALLOC_OBJ(cfg, CONFIG_MAGIC); 180 | AN(cfg); 181 | VTAILQ_INIT(&cfg->sections); 182 | cfg->space = malloc(st.st_size + 1); 183 | AN(cfg->space); 184 | ssz = read(fd, cfg->space, st.st_size); 185 | AZ(close(fd)); 186 | if (ssz != st.st_size) { 187 | config_destroy(cfg); 188 | errno = EIO; 189 | return (NULL); 190 | } 191 | 192 | /* Check file is not obviously bogus UTF-8 */ 193 | for (p = cfg->space; p < cfg->space + st.st_size; p++) { 194 | if (*p == 0x00 || *p == 0xc0 || *p == 0xc1 || *p > 0xf4) { 195 | config_destroy(cfg); 196 | errno = EBADF; 197 | return (NULL); 198 | } 199 | } 200 | cfg->space[st.st_size] = '\0'; 201 | e = config_parse(cfg); 202 | if (e != NULL) 203 | fprintf(stderr, "Config = %s\n", e); 204 | 205 | return (cfg); 206 | } 207 | 208 | int 209 | Config_Get(const struct config *cfg, const char *section, const char **np, 210 | const char **ap) 211 | { 212 | struct section *sc; 213 | struct entry *ent; 214 | 215 | CHECK_OBJ_NOTNULL(cfg, CONFIG_MAGIC); 216 | VTAILQ_FOREACH(sc, &cfg->sections, list) 217 | if (!strcasecmp(sc->name, section)) 218 | break; 219 | if (sc == NULL) 220 | return (errno = ENOENT); 221 | if (sc->nentry != 1) 222 | return (errno = E2BIG); 223 | ent = VTAILQ_FIRST(&sc->entries); 224 | AZ(VTAILQ_NEXT(ent, list)); 225 | if (np != NULL && ap == NULL && ent->arg != NULL) 226 | return (errno = E2BIG); 227 | if (np != NULL) 228 | *np = ent->name; 229 | if (ap != NULL) 230 | *ap = ent->arg; 231 | return (0); 232 | } 233 | 234 | int 235 | Config_Find(const struct config *cfg, const char *section, const char *name, 236 | const char **ap) 237 | { 238 | struct section *sc; 239 | struct entry *ent; 240 | 241 | CHECK_OBJ_NOTNULL(cfg, CONFIG_MAGIC); 242 | VTAILQ_FOREACH(sc, &cfg->sections, list) 243 | if (!strcasecmp(sc->name, section)) 244 | break; 245 | if (sc == NULL) 246 | return (ENOENT); 247 | VTAILQ_FOREACH(ent, 
&sc->entries, list) { 248 | if (strcmp(ent->name, "*") && strcasecmp(name, ent->name)) 249 | continue; 250 | if (ap != NULL) 251 | *ap = ent->arg; 252 | return (0); 253 | } 254 | return (ENOENT); 255 | } 256 | 257 | int 258 | Config_Iter(const struct config *cfg, const char *section, void *priv, 259 | config_f func) 260 | { 261 | struct section *sc; 262 | struct entry *ent; 263 | int i; 264 | 265 | CHECK_OBJ_NOTNULL(cfg, CONFIG_MAGIC); 266 | VTAILQ_FOREACH(sc, &cfg->sections, list) 267 | if (!strcasecmp(sc->name, section)) 268 | break; 269 | if (sc == NULL) 270 | return (errno = ENOENT); 271 | if (sc->nentry == 0) 272 | return (errno = ENOENT); 273 | VTAILQ_FOREACH(ent, &sc->entries, list) { 274 | i = func(priv, ent->name, ent->arg); 275 | if (i) 276 | return (i); 277 | } 278 | return (0); 279 | } 280 | -------------------------------------------------------------------------------- /config.h: -------------------------------------------------------------------------------- 1 | /* Empty file for varnish source tree compatibility */ 2 | -------------------------------------------------------------------------------- /flint.lnt: -------------------------------------------------------------------------------- 1 | -esym(755, VLIST_*) // Global macro not ref. 
2 | -esym(755, VSLIST_*) 3 | -esym(755, VSTAILQ_*) 4 | -esym(755, VTAILQ_*) 5 | -esym(755, CAST_OBJ*) 6 | -esym(755, CHECK_OBJ*) 7 | -esym(755, NEEDLESS_RETURN) 8 | -esym(755, VALID_OBJ) 9 | -esym(755, bprintf) 10 | -esym(755, XXXAN) 11 | -esym(755, XXXAZ) 12 | -esym(755, INCOMPL) 13 | -esym(755, INIT_OBJ) 14 | -esym(755, REPLACE) 15 | -esym(769, vas_e::*) 16 | 17 | -sem(VAS_Fail, r_no) // does not return 18 | -emacro(506, assert) // constant value boolean 19 | -emacro(827, assert) // loop not reachable 20 | -emacro(774, assert) // boolean always true 21 | -emacro(731, assert) // boolean arg to eq/non-eq 22 | -emacro(731, xxxassert) // arg to eq/non-eq 23 | -emacro(527, WRONG) // unreachable code 24 | -emacro(774, VALID_OBJ) // boolean always true 25 | 26 | -emacro(779, REPLACE) // string constant != 27 | -emacro(774, REPLACE) // if(bool) always true 28 | -emacro(506, REPLACE) // const bool 29 | 30 | -esym(534, printf) 31 | -esym(534, fflush) 32 | -esym(534, fprintf) 33 | -esym(534, memset) 34 | -esym(534, memcpy) 35 | -esym(534, memmove) 36 | -esym(534, strlcpy) 37 | -esym(534, strcat) 38 | -esym(534, strcpy) 39 | -esym(534, fputc) 40 | -esym(759, VSB_*) // Could be made static 41 | -esym(714, VSB_*) // Not referenced 42 | -esym(765, VSB_*) // could be made static 43 | 44 | -esym(534, VSB_bcat) 45 | -esym(534, VSB_putc) 46 | -esym(534, VSB_cat) 47 | -esym(534, VSB_printf) 48 | -esym(534, VSB_vprintf) 49 | 50 | -emacro((826), VTAILQ_PREV) // Suspicious pointer-to-pointer conversion 51 | -emacro((826), VTAILQ_LAST) // Suspicious pointer-to-pointer conversion 52 | // (area too small) 53 | -emacro(740, VTAILQ_PREV) // Unusual pointer cast 54 | // (incompatible indirect types) 55 | -emacro(740, VTAILQ_LAST) // Unusual pointer cast 56 | // (incompatible indirect types) 57 | -emacro(506, VTAILQ_FOREACH_SAFE) // Const boolean 58 | 59 | 60 | 61 | /////////////////////////////////////////////////////////////////////// 62 | -sem(config_destroy, custodial(1)) 63 | 
/////////////////////////////////////////////////////////////////////// 64 | 65 | -e747 // Significant prototype coercion (___) ___ to ___ 66 | -e712 // Loss of precision (___) (___ to ___) 67 | -e732 // Loss of sign (___) (___ to ___) 68 | -e726 // Extraneous comma ignored 69 | -e663 // Suspicious array to pointer conversion 70 | -e737 // Loss of sign in promotion from int to unsigned long 71 | -e716 // while(1) ... 72 | -e728 // (static var) not explicitly init 73 | -e703 // shift left signed quant (long) 74 | -------------------------------------------------------------------------------- /getjob.c: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2016 Poul-Henning Kamp 3 | * All rights reserved. 4 | * 5 | * Author: Poul-Henning Kamp 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 1. Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * 2. Redistributions in binary form must reproduce the above copyright 13 | * notice, this list of conditions and the following disclaimer in the 14 | * documentation and/or other materials provided with the distribution. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 | * SUCH DAMAGE. 27 | * 28 | */ 29 | 30 | 31 | #include 32 | #include 33 | #include 34 | 35 | #include "vdef.h" 36 | #include "vas.h" 37 | #include "vsb.h" 38 | #include "miniobj.h" 39 | 40 | #include "aardwarc.h" 41 | 42 | struct getjobseg { 43 | unsigned magic; 44 | #define GETJOBSEG_MAGIC 0xedd545dd 45 | 46 | VTAILQ_ENTRY(getjobseg) list; 47 | 48 | struct rsilo *rs; 49 | struct header *hdr; 50 | uint32_t idx_flag; 51 | char *idx_cont; 52 | 53 | unsigned segno; 54 | }; 55 | 56 | VTAILQ_HEAD(getjobseg_head, getjobseg); 57 | 58 | struct getjob { 59 | unsigned magic; 60 | #define GETJOB_MAGIC 0xd0848010 61 | const char *id; 62 | struct vsb *vsb; 63 | struct aardwarc *aa; 64 | const char *err; 65 | 66 | struct getjobseg_head segs; 67 | int nsegs; 68 | }; 69 | 70 | static int v_matchproto_(idx_iter_f) 71 | getjob_iter(void *priv, const char *key, 72 | uint32_t flag, uint32_t silo, int64_t offset, const char *cont) 73 | { 74 | struct getjob *gj; 75 | const char *p; 76 | struct getjobseg *gjs; 77 | struct rsilo *rs; 78 | struct header *hdr; 79 | unsigned segno = 1; 80 | intmax_t im; 81 | 82 | CAST_OBJ_NOTNULL(gj, priv, GETJOB_MAGIC); 83 | (void)key; 84 | 85 | rs = Rsilo_Open(gj->aa, NULL, silo, offset); 86 | AN(rs); 87 | hdr = Rsilo_ReadHeader(rs); 88 | AN(hdr); 89 | 90 | p = Header_Get_Id(hdr); 91 | if ((flag & IDX_F_WARCINFO) && !strcasecmp(p, gj->id)) { 92 | gj->err = "ID is warcinfo segment"; 93 | Header_Destroy(&hdr); 94 | Rsilo_Close(&rs); 
95 | return (-1); 96 | } 97 | gjs = VTAILQ_LAST(&gj->segs, getjobseg_head); 98 | if (gjs == NULL) { 99 | if (strcasecmp(p, gj->id)) { 100 | Header_Destroy(&hdr); 101 | Rsilo_Close(&rs); 102 | return (0); 103 | } 104 | if ((flag & IDX_F_SEGMENTED) && !(flag & IDX_F_FIRSTSEG)) { 105 | gj->err = "ID is continuation segment"; 106 | Header_Destroy(&hdr); 107 | Rsilo_Close(&rs); 108 | return (-1); 109 | } 110 | } else { 111 | p = Header_Get(hdr, "WARC-Segment-Origin-ID"); 112 | AN(p); 113 | assert(p[0] == '<'); 114 | p++; 115 | assert(!memcmp(p, gj->aa->prefix, strlen(gj->aa->prefix))); 116 | p += strlen(gj->aa->prefix); 117 | assert(p[gj->aa->id_size] == '>'); 118 | assert(p[gj->aa->id_size + 1L] == '\0'); 119 | if (strncasecmp(p, gj->id, gj->aa->id_size)) { 120 | /* 121 | * The next field in the index can be ambiguous 122 | * but we just ignore the other ones. 123 | */ 124 | Header_Destroy(&hdr); 125 | Rsilo_Close(&rs); 126 | return (0); 127 | } 128 | im = Header_Get_Number(hdr, "WARC-Segment-Number"); 129 | assert(im >= 0); 130 | segno = im; 131 | if (segno != gjs->segno + 1) { 132 | gj->err = 133 | "Index Inconsistency: " 134 | "Continuation out of order."; 135 | Header_Destroy(&hdr); 136 | Rsilo_Close(&rs); 137 | return (-1); 138 | } 139 | } 140 | 141 | ALLOC_OBJ(gjs, GETJOBSEG_MAGIC); 142 | AN(gjs); 143 | 144 | gjs->rs = rs; 145 | gjs->hdr = hdr; 146 | gjs->idx_flag = flag; 147 | gjs->idx_cont = strdup(cont); 148 | gjs->segno = segno; 149 | AN(gjs->idx_cont); 150 | VTAILQ_INSERT_TAIL(&gj->segs, gjs, list); 151 | gj->nsegs++; 152 | return(1); 153 | } 154 | 155 | void 156 | GetJob_Delete(struct getjob **pp) 157 | { 158 | struct getjob *gj; 159 | struct getjobseg *gjs; 160 | 161 | AN(pp); 162 | CAST_OBJ_NOTNULL(gj, *pp, GETJOB_MAGIC); 163 | *pp = NULL; 164 | 165 | while (1) { 166 | gjs = VTAILQ_FIRST(&gj->segs); 167 | if (gjs == NULL) 168 | break; 169 | VTAILQ_REMOVE(&gj->segs, gjs, list); 170 | if (gjs->hdr) 171 | Header_Destroy(&gjs->hdr); 172 | if (gjs->rs) 173 | 
Rsilo_Close(&gjs->rs); 174 | REPLACE(gjs->idx_cont, NULL); 175 | FREE_OBJ(gjs); 176 | } 177 | FREE_OBJ(gj); 178 | } 179 | 180 | struct getjob * 181 | GetJob_New(struct aardwarc *aa, const char *id, struct vsb *vsb) 182 | { 183 | int i; 184 | struct getjob *gj; 185 | struct getjobseg *gjs; 186 | const char *nid, *e; 187 | 188 | e = IDX_Valid_Id(aa, id, &nid); 189 | if (e != NULL) { 190 | VSB_printf(vsb, "%s", e); 191 | return (NULL); 192 | } 193 | 194 | ALLOC_OBJ(gj, GETJOB_MAGIC); 195 | AN(gj); 196 | VTAILQ_INIT(&gj->segs); 197 | gj->aa = aa; 198 | gj->id = nid; 199 | gj->vsb = vsb; 200 | gj->err = "ID not found"; 201 | 202 | while (1) { 203 | i = IDX_Iter(aa, nid, getjob_iter, gj); 204 | if (i <= 0) { 205 | AN (gj->err); 206 | VSB_printf(vsb, "%s", gj->err); 207 | GetJob_Delete(&gj); 208 | return (NULL); 209 | } 210 | gjs = VTAILQ_LAST(&gj->segs, getjobseg_head); 211 | CHECK_OBJ_NOTNULL(gjs, GETJOBSEG_MAGIC); 212 | 213 | AZ(gjs->idx_flag & IDX_F_WARCINFO); 214 | 215 | if (!(gjs->idx_flag & IDX_F_SEGMENTED)) { 216 | AZ(gjs->idx_flag & IDX_F_FIRSTSEG); 217 | AZ(gjs->idx_flag & IDX_F_LASTSEG); 218 | AZ(strcmp(gjs->idx_cont, "00000000")); 219 | assert(VTAILQ_FIRST(&gj->segs) == gjs); 220 | break; 221 | } 222 | if (gjs->idx_flag & IDX_F_LASTSEG) 223 | break; 224 | nid = gjs->idx_cont; 225 | } 226 | return (gj); 227 | } 228 | 229 | const struct header * 230 | GetJob_Header(const struct getjob *gj, int first) 231 | { 232 | struct getjobseg *gjs; 233 | 234 | CHECK_OBJ_NOTNULL(gj, GETJOB_MAGIC); 235 | if (first) 236 | gjs = VTAILQ_FIRST(&gj->segs); 237 | else 238 | gjs = VTAILQ_LAST(&gj->segs, getjobseg_head); 239 | CHECK_OBJ_NOTNULL(gjs, GETJOBSEG_MAGIC); 240 | return (gjs->hdr); 241 | } 242 | 243 | void 244 | GetJob_Iter(const struct getjob *gj, byte_iter_f *func, void *priv, int gzip) 245 | { 246 | struct getjobseg *gjs; 247 | struct gzip_stitch *gs; 248 | 249 | CHECK_OBJ_NOTNULL(gj, GETJOB_MAGIC); 250 | AN(func); 251 | if (!gzip) { 252 | VTAILQ_FOREACH(gjs, &gj->segs, 
list) 253 | if (Rsilo_ReadChunk(gjs->rs, func, priv) == 0) 254 | break; 255 | } else if (gj->nsegs == 1) { 256 | gjs = VTAILQ_FIRST(&gj->segs); 257 | (void)Rsilo_ReadGZChunk(gjs->rs, func, priv); 258 | } else { 259 | gs = gzip_stitch_new(func, priv); 260 | VTAILQ_FOREACH(gjs, &gj->segs, list) 261 | if (Rsilo_ReadGZChunk(gjs->rs, gzip_stitch_feed, gs) == 0) 262 | break; 263 | (void)gzip_stitch_fini(gs); 264 | } 265 | } 266 | 267 | off_t 268 | GetJob_TotalLength(const struct getjob *gj, int gzip) 269 | { 270 | struct getjobseg *gjs; 271 | off_t sum = 0; 272 | intmax_t im; 273 | 274 | CHECK_OBJ_NOTNULL(gj, GETJOB_MAGIC); 275 | 276 | // XXX: ... also available in headers in last segment. 277 | VTAILQ_FOREACH(gjs, &gj->segs, list) { 278 | if (gzip) 279 | im = Rsilo_BodyLen(gjs->rs); 280 | else 281 | im = Header_Get_Number(gjs->hdr, "Content-Length"); 282 | assert(im > 0); 283 | sum += im; 284 | } 285 | return (sum); 286 | } 287 | 288 | int 289 | GetJob_IsSegmented(const struct getjob *gj) 290 | { 291 | struct getjobseg *gjs; 292 | 293 | CHECK_OBJ_NOTNULL(gj, GETJOB_MAGIC); 294 | 295 | gjs = VTAILQ_FIRST(&gj->segs); 296 | if (VTAILQ_NEXT(gjs, list) == NULL) 297 | return (0); 298 | return (1); 299 | } 300 | 301 | struct vsb * 302 | GetJob_Headers(const struct getjob *gj) 303 | { 304 | struct getjobseg *gjs, *gjl; 305 | struct header *hdr; 306 | struct vsb *vsb; 307 | const char *p; 308 | 309 | CHECK_OBJ_NOTNULL(gj, GETJOB_MAGIC); 310 | 311 | gjs = VTAILQ_FIRST(&gj->segs); 312 | AN(gjs); 313 | gjl = VTAILQ_LAST(&gj->segs, getjobseg_head); 314 | AN(gjl); 315 | 316 | if (gjs == gjl) { 317 | vsb = Header_Serialize(gjs->hdr, -1); 318 | } else { 319 | // Move headers around to make segmentation less painful 320 | 321 | hdr = Header_Clone(gjs->hdr); 322 | AN(hdr); 323 | 324 | p = Header_Get(gjl->hdr, "WARC-Segment-Total-Length"); 325 | AN(p); 326 | Header_Set(hdr, "Content-Length", "%s", p); 327 | 328 | p = Header_Get(gjs->hdr, "WARC-Payload-Digest"); 329 | AN(p); 330 | 
Header_Set(hdr, "WARC-Block-Digest", "%s", p); 331 | 332 | vsb = Header_Serialize(hdr, -1); 333 | Header_Destroy(&hdr); 334 | } 335 | return (vsb); 336 | } 337 | -------------------------------------------------------------------------------- /gzip.c: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2016 Poul-Henning Kamp 3 | * All rights reserved. 4 | * 5 | * Author: Poul-Henning Kamp 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 1. Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * 2. Redistributions in binary form must reproduce the above copyright 13 | * notice, this list of conditions and the following disclaimer in the 14 | * documentation and/or other materials provided with the distribution. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 | * SUCH DAMAGE. 
27 | * 28 | */ 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | #define ZLIB_CONST 36 | #include 37 | 38 | #include "vdef.h" 39 | #include "miniobj.h" 40 | 41 | #include "vsb.h" 42 | #include "vas.h" 43 | 44 | #include "aardwarc.h" 45 | 46 | 47 | /********************************************************************** 48 | * This is the gzip header we expect to find 49 | */ 50 | 51 | static const uint8_t gzip_head[] = { 52 | 0x1f, // ID1 53 | 0x8b, // ID2 54 | 0x08, // CM 55 | 0x04, // FLAGS 56 | 0x00, 0x00, 0x00, 0x00, // MTIME 57 | 0x02, // XFL (can be 2 or 4, see code) 58 | 0x03, // OS 59 | 0x0c, 0x00, // XLEN 60 | 0x41, 0x61, // SI1, SI2 61 | 0x08, 0x00, // LEN 62 | }; 63 | 64 | static int 65 | Gzip_GoodAa(const void *p, size_t l) 66 | { 67 | uint8_t buf[sizeof gzip_head]; 68 | 69 | assert(l >= sizeof buf); 70 | memcpy(buf, p, sizeof buf); 71 | if (buf[8] != 0x02 && buf[8] != 0x04) 72 | return (0); 73 | buf[8] = 0x02; 74 | if (memcmp(buf, gzip_head, sizeof gzip_head)) 75 | return (0); 76 | return (1); 77 | } 78 | 79 | /********************************************************************** 80 | * Placeholder FEXTRA field for writing gzip files 81 | */ 82 | 83 | static uint8_t gzh_extra[] = { 84 | 'A', 'a', 85 | 8, 0, 86 | 0, 0, 0, 0, 0, 0, 0, 0 87 | }; 88 | 89 | static struct gz_header_s gzh = { 90 | .os = 3, // UNIX 91 | .extra = gzh_extra, 92 | .extra_len = sizeof gzh_extra, 93 | }; 94 | 95 | void 96 | Gzip_AddAa(z_stream *gz) 97 | { 98 | 99 | AZ(deflateSetHeader(gz, &gzh)); 100 | } 101 | 102 | /********************************************************************** 103 | * Update the length in an Aa field 104 | */ 105 | 106 | void 107 | Gzip_WriteAa(int fd, int64_t len) 108 | { 109 | ssize_t i; 110 | uint8_t buf[sizeof gzip_head]; 111 | 112 | assert(len > 0); 113 | 114 | /* Write the gzip'ed length to the 'Aa' extra header */ 115 | i = read(fd, buf, sizeof gzip_head); 116 | assert(i == sizeof gzip_head); 117 | assert(Gzip_GoodAa(buf, i)); 
118 | le64enc(buf, (uint64_t)len); 119 | i = write(fd, buf, 8); 120 | assert(i == 8); 121 | } 122 | 123 | /********************************************************************** 124 | * Read the length from an Aa field 125 | */ 126 | 127 | int64_t 128 | Gzip_ReadAa(const void *p, size_t l) 129 | { 130 | int64_t len; 131 | 132 | assert(Gzip_GoodAa(p, l)); 133 | xxxassert(l >= sizeof sizeof gzip_head + 8); 134 | len = (int64_t)le64dec((const uint8_t*)p + sizeof gzip_head); 135 | assert(len > 0); 136 | return (len); 137 | } 138 | 139 | /**********************************************************************/ 140 | 141 | const uint8_t Gzip_crnlcrnl[24] = { 142 | 0x1f, 0x8b, 0x08, 0x00, 0x20, 0x01, 0x19, 0x66, 143 | 0x02, 0x03, 0xe3, 0xe5, 0xe2, 0xe5, 0x02, 0x00, 144 | 0x44, 0x15, 0xc2, 0x8b, 0x04, 0x00, 0x00, 0x00 145 | }; 146 | 147 | /**********************************************************************/ 148 | 149 | void 150 | Gzip_Vsb(struct vsb **vsbp, int level) 151 | { 152 | struct vsb *input; 153 | struct vsb *output; 154 | char buf[1024]; 155 | int i; 156 | z_stream zs[1]; 157 | char *p; 158 | 159 | AN(vsbp); 160 | input = *vsbp;; 161 | *vsbp = NULL; 162 | AN(input); 163 | 164 | memset(zs, 0, sizeof zs); 165 | i = deflateInit2( 166 | zs, 167 | level, 168 | Z_DEFLATED, 169 | 16 + 15, 170 | 8, 171 | Z_DEFAULT_STRATEGY 172 | ); 173 | assert(i == Z_OK); 174 | Gzip_AddAa(zs); 175 | 176 | zs->avail_in = VSB_len(input); 177 | zs->next_in = (const void*) VSB_data(input); 178 | 179 | output = VSB_new_auto(); 180 | AN(output); 181 | do { 182 | zs->avail_out = sizeof buf; 183 | zs->next_out = (void*)buf; 184 | i = deflate(zs, Z_FINISH); 185 | VSB_bcat(output, buf, sizeof buf - zs->avail_out); 186 | } while (i != Z_STREAM_END); 187 | AZ(VSB_finish(output)); 188 | assert(deflateEnd(zs) == Z_OK); 189 | VSB_delete(input); 190 | p = VSB_data(output); 191 | assert(Gzip_GoodAa(p, VSB_len(output))); 192 | le64enc(p + sizeof gzip_head, VSB_len(output)); 193 | *vsbp = output; 194 | 
} 195 | 196 | /**********************************************************************/ 197 | 198 | void 199 | Gzip_InitDeflate(z_stream *zs) 200 | { 201 | int i; 202 | 203 | memset(zs, 0, sizeof *zs); 204 | i = deflateInit2( 205 | zs, 206 | AA_COMPRESSION, 207 | Z_DEFLATED, 208 | 16 + MAX_WBITS, 209 | 8, 210 | Z_DEFAULT_STRATEGY 211 | ); 212 | assert(i == Z_OK); 213 | } 214 | 215 | /********************************************************************** 216 | * Code to stitch multiple WARC segments, individually gzip'ed into 217 | * a single gzip object, because browser-people are morons who cannot 218 | * read. 219 | */ 220 | 221 | struct gzip_stitch { 222 | unsigned magic; 223 | #define GZIP_STITCH_MAGIC 0x62672ece 224 | byte_iter_f *func; 225 | void *priv; 226 | const char *state; 227 | ssize_t gzlen; 228 | int retval; 229 | uint32_t crc; 230 | uint32_t l_crc; 231 | uint8_t tailbuf[13]; 232 | }; 233 | 234 | static const char GZSTATE_OUTSIDE[] = "OUTSIDE GZIP"; 235 | static const char GZSTATE_INSIDE[] = "INSIDE GZIP"; 236 | static const char GZSTATE_TAIL[] = "TAIL GZIP"; 237 | 238 | static const uint8_t gzip_stitch_head[] = { 239 | 0x1f, // ID1 240 | 0x8b, // ID2 241 | 0x08, // CM 242 | 0x00, // FLAGS 243 | 0x00, 0x00, 0x00, 0x00, // MTIME 244 | 0x04, // XFL 245 | 0x03, // OS 246 | }; 247 | 248 | struct gzip_stitch * 249 | gzip_stitch_new(byte_iter_f *func, void *priv) 250 | { 251 | struct gzip_stitch *gs; 252 | 253 | AN(func); 254 | 255 | ALLOC_OBJ(gs, GZIP_STITCH_MAGIC); 256 | AN(gs); 257 | gs->func = func; 258 | gs->priv = priv; 259 | gs->state = GZSTATE_OUTSIDE; 260 | gs->crc = crc32(0L, NULL, 0); 261 | 262 | gs->retval = gs->func(gs->priv, 263 | gzip_stitch_head, sizeof gzip_stitch_head); 264 | 265 | return(gs); 266 | } 267 | 268 | int 269 | gzip_stitch_feed(void *priv, const void *ptr, ssize_t len) 270 | { 271 | struct gzip_stitch *gs; 272 | ssize_t skip; 273 | const uint8_t *p = ptr; 274 | uint32_t crc, bytes; 275 | AN(p); 276 | 277 | CAST_OBJ_NOTNULL(gs, 
priv, GZIP_STITCH_MAGIC); 278 | if (gs->retval) 279 | return (gs->retval); 280 | 281 | while (len > 0) { 282 | if (gs->state == GZSTATE_OUTSIDE) { 283 | /* Pick up gzlen from Aa extension and skip hdr */ 284 | xxxassert(len >= 24); 285 | assert(p[0] == 0x1f); 286 | assert(p[1] == 0x8b); 287 | assert(p[2] == 0x08); 288 | assert(p[3] == 0x04); 289 | assert(p[12] == 0x41); 290 | assert(p[13] == 0x61); 291 | assert(p[14] == 0x08); 292 | assert(p[15] == 0x00); 293 | gs->gzlen = le64dec(p + 16); 294 | gs->state = GZSTATE_INSIDE; 295 | p += 24; 296 | len -= 24; 297 | gs->gzlen -= 24; 298 | continue; 299 | } 300 | if (gs->state == GZSTATE_INSIDE) { 301 | /* Pass on all the boring stuff in the middle */ 302 | assert(len <= gs->gzlen); 303 | skip = len; 304 | if (skip > gs->gzlen - 13) 305 | skip = gs->gzlen - 13; 306 | gs->retval = gs->func(gs->priv, p, skip); 307 | if (gs->retval) 308 | return (gs->retval); 309 | p += skip; 310 | len -= skip; 311 | gs->gzlen -= skip; 312 | if (gs->gzlen == 13) 313 | gs->state = GZSTATE_TAIL; 314 | continue; 315 | } 316 | if (gs->state == GZSTATE_TAIL) { 317 | /* Strip from last stop-bit and accumulate CRC */ 318 | memcpy(gs->tailbuf + 13 - gs->gzlen, p, len); 319 | p += len; 320 | gs->gzlen -= len; 321 | len -= len; 322 | if (gs->gzlen) 323 | continue; 324 | p = gs->tailbuf; 325 | if (p[3] == 0x03 && p[4] == 0x00) { 326 | gs->retval = gs->func(gs->priv, p, 3); 327 | if (gs->retval) 328 | return (gs->retval); 329 | } else if (p[0] == 0x01 && 330 | p[1] == 0x00 && p[2] == 0x00 && 331 | p[3] == 0xff && p[4] == 0xff) { 332 | } else { 333 | WRONG("Z_FINISH stop bit not found"); 334 | } 335 | crc = le32dec(p + 5); 336 | bytes = le32dec(p + 9); 337 | gs->l_crc += bytes; 338 | gs->crc = crc32_combine(gs->crc, crc, bytes); 339 | gs->state = GZSTATE_OUTSIDE; 340 | len -= 13; 341 | } 342 | } 343 | return (gs->retval); 344 | } 345 | 346 | int 347 | gzip_stitch_fini(struct gzip_stitch *gs) 348 | { 349 | int retval; 350 | 351 | CHECK_OBJ_NOTNULL(gs, 
GZIP_STITCH_MAGIC); 352 | 353 | if (!gs->retval) { 354 | /* Emit a new stop-bit and a new CRC+LEN trailer */ 355 | gs->tailbuf[0] = 0x01; 356 | gs->tailbuf[1] = 0x00; 357 | gs->tailbuf[2] = 0x00; 358 | gs->tailbuf[3] = 0xff; 359 | gs->tailbuf[4] = 0xff; 360 | le32enc(gs->tailbuf + 5, gs->crc); 361 | le32enc(gs->tailbuf + 9, gs->l_crc); 362 | gs->retval = gs->func(gs->priv, gs->tailbuf, 13); 363 | } 364 | retval = gs->retval; 365 | FREE_OBJ(gs); 366 | return (retval); 367 | } 368 | -------------------------------------------------------------------------------- /header.c: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2016 Poul-Henning Kamp 3 | * All rights reserved. 4 | * 5 | * Author: Poul-Henning Kamp 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 1. Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * 2. Redistributions in binary form must reproduce the above copyright 13 | * notice, this list of conditions and the following disclaimer in the 14 | * documentation and/or other materials provided with the distribution. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 | * SUCH DAMAGE. 27 | * 28 | */ 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | #include "vdef.h" 39 | 40 | #include "miniobj.h" 41 | #include "vas.h" 42 | #include "vsb.h" 43 | 44 | #include "aardwarc.h" 45 | 46 | struct hfield { 47 | unsigned magic; 48 | #define HFIELD_MAGIC 0x10b9767d 49 | char *name; 50 | char *val; 51 | VTAILQ_ENTRY(hfield) list; 52 | }; 53 | 54 | struct header { 55 | unsigned magic; 56 | #define HEADER_MAGIC 0x5bf750ab 57 | const struct aardwarc *aa; 58 | VTAILQ_HEAD(, hfield) hfields; 59 | char *warc_record_id; 60 | }; 61 | 62 | struct header * 63 | Header_New(const struct aardwarc *aa) 64 | { 65 | struct header *hdr; 66 | 67 | CHECK_OBJ_NOTNULL(aa, AARDWARC_MAGIC); 68 | ALLOC_OBJ(hdr, HEADER_MAGIC); 69 | AN(hdr); 70 | VTAILQ_INIT(&hdr->hfields); 71 | hdr->aa = aa; 72 | hdr->warc_record_id = calloc(1, hdr->aa->id_size + 1L); 73 | AN(hdr->warc_record_id); 74 | memset(hdr->warc_record_id, '_', hdr->aa->id_size); 75 | hdr->warc_record_id[hdr->aa->id_size] = '\0'; 76 | return (hdr); 77 | } 78 | 79 | void 80 | Header_Destroy(struct header **hdp) 81 | { 82 | struct header *hdr; 83 | struct hfield *hf; 84 | 85 | AN(hdp); 86 | hdr = *hdp; 87 | *hdp = NULL; 88 | CHECK_OBJ_NOTNULL(hdr, HEADER_MAGIC); 89 | 90 | REPLACE(hdr->warc_record_id, NULL); 91 | while (1) { 92 | hf = VTAILQ_FIRST(&hdr->hfields); 93 | if (hf == NULL) 94 | break; 95 | REPLACE(hf->name, NULL); 96 
| REPLACE(hf->val, NULL); 97 | VTAILQ_REMOVE(&hdr->hfields, hf, list); 98 | FREE_OBJ(hf); 99 | } 100 | FREE_OBJ(hdr); 101 | } 102 | 103 | struct header * 104 | Header_Clone(const struct header *hd) 105 | { 106 | struct header *hdn; 107 | struct hfield *hf, *hfn; 108 | 109 | hdn = Header_New(hd->aa); 110 | AN(hdn); 111 | if (hd->warc_record_id != NULL) { 112 | hdn->warc_record_id = strdup(hd->warc_record_id); 113 | AN(hdn->warc_record_id); 114 | } 115 | VTAILQ_FOREACH(hf, &hd->hfields, list) { 116 | ALLOC_OBJ(hfn, HFIELD_MAGIC); 117 | AN(hfn); 118 | hfn->name = strdup(hf->name); 119 | AN(hfn->name); 120 | hfn->val = strdup(hf->val); 121 | AN(hfn->val); 122 | VTAILQ_INSERT_TAIL(&hdn->hfields, hfn, list); 123 | } 124 | return (hdn); 125 | } 126 | 127 | int 128 | Header_Len(const char *name, const char *val, ...) 129 | { 130 | va_list ap; 131 | char buf[1]; 132 | int l; 133 | 134 | l = strlen(name); 135 | l += strlen(": "); 136 | va_start(ap, val); 137 | l += vsnprintf(buf, 0, val, ap); 138 | va_end(ap); 139 | l += strlen("\r\n"); 140 | return (l); 141 | } 142 | 143 | void 144 | Header_Delete(struct header *hd, const char *name) 145 | { 146 | struct hfield *hf, *hf2; 147 | 148 | CHECK_OBJ_NOTNULL(hd, HEADER_MAGIC); 149 | AN(name); 150 | assert(strchr(name, ':') == NULL); 151 | AN(strcasecmp(name, "WARC-Record-ID")); 152 | 153 | VTAILQ_FOREACH_SAFE(hf, &hd->hfields, list, hf2) { 154 | CHECK_OBJ_NOTNULL(hf, HFIELD_MAGIC); 155 | if (strcasecmp(name, hf->name)) 156 | continue; 157 | VTAILQ_REMOVE(&hd->hfields, hf, list); 158 | REPLACE(hf->name, NULL); 159 | REPLACE(hf->val, NULL); 160 | free(hf); 161 | } 162 | } 163 | 164 | void 165 | Header_Set(struct header *hd, const char *name, const char *val, ...) 
166 | { 167 | struct hfield *hf, *hf2; 168 | int i; 169 | va_list ap; 170 | 171 | CHECK_OBJ_NOTNULL(hd, HEADER_MAGIC); 172 | AN(name); 173 | assert(strchr(name, ':') == NULL); 174 | AN(strcasecmp(name, "WARC-Record-ID")); 175 | AN(val); 176 | 177 | VTAILQ_FOREACH(hf, &hd->hfields, list) 178 | if (!strcasecmp(name, hf->name)) 179 | break; 180 | if (hf == NULL) { 181 | ALLOC_OBJ(hf, HFIELD_MAGIC); 182 | AN(hf); 183 | hf->name = strdup(name); 184 | AN(hf->name); 185 | } else { 186 | VTAILQ_REMOVE(&hd->hfields, hf, list); 187 | REPLACE(hf->val, NULL); 188 | } 189 | va_start(ap, val); 190 | (void)vasprintf(&hf->val, val, ap); 191 | va_end(ap); 192 | AN(hf->val); 193 | VTAILQ_FOREACH(hf2, &hd->hfields, list) { 194 | CHECK_OBJ_NOTNULL(hf2, HFIELD_MAGIC); 195 | i = strcasecmp(name, hf2->name); 196 | if (i < 0) { 197 | VTAILQ_INSERT_BEFORE(hf2, hf, list); 198 | return; 199 | } 200 | if (i == 0) 201 | WRONG("Multiple headers with same name"); 202 | } 203 | VTAILQ_INSERT_TAIL(&hd->hfields, hf, list); 204 | } 205 | 206 | const char * 207 | Header_Get(const struct header *hd, const char *name) 208 | { 209 | struct hfield *hf; 210 | 211 | CHECK_OBJ_NOTNULL(hd, HEADER_MAGIC); 212 | AN(name); 213 | VTAILQ_FOREACH(hf, &hd->hfields, list) 214 | if (!strcasecmp(name, hf->name)) 215 | return (hf->val); 216 | return (NULL); 217 | } 218 | 219 | intmax_t 220 | Header_Get_Number(const struct header *hd, const char *name) 221 | { 222 | intmax_t r = 0; 223 | const char *p; 224 | 225 | CHECK_OBJ_NOTNULL(hd, HEADER_MAGIC); 226 | AN(name); 227 | p = Header_Get(hd, name); 228 | if (p == NULL) 229 | return (-1); 230 | for (; *p != '\0'; p++) { 231 | if (*p < '0' || *p > '9') 232 | return (-1); 233 | r *= 10; 234 | r += *p - '0'; 235 | } 236 | return (r); 237 | } 238 | 239 | struct vsb * 240 | Header_Serialize(const struct header *hdr, int level) 241 | { 242 | struct vsb *vsb; 243 | struct hfield *hf; 244 | 245 | CHECK_OBJ_NOTNULL(hdr, HEADER_MAGIC); 246 | AN(hdr->warc_record_id); 247 | 248 | vsb 
= VSB_new_auto(); 249 | AN(vsb); 250 | 251 | VSB_cat(vsb, "WARC/1.1\r\n"); 252 | 253 | VSB_cat(vsb, "WARC-Record-ID: <"); 254 | VSB_cat(vsb, hdr->aa->prefix); 255 | VSB_cat(vsb, hdr->warc_record_id); 256 | VSB_cat(vsb, ">\r\n"); 257 | 258 | VTAILQ_FOREACH(hf, &hdr->hfields, list) { 259 | CHECK_OBJ_NOTNULL(hf, HFIELD_MAGIC); 260 | AN(hf->name); 261 | AN(hf->val); 262 | VSB_cat(vsb, hf->name); 263 | VSB_cat(vsb, ": "); 264 | VSB_cat(vsb, hf->val); 265 | VSB_cat(vsb, "\r\n"); 266 | } 267 | VSB_cat(vsb, "\r\n"); 268 | 269 | if (level == -1) 270 | return (vsb); // NB: No VSB_finish() call 271 | 272 | AZ(VSB_finish(vsb)); 273 | Gzip_Vsb(&vsb, level); 274 | return (vsb); 275 | } 276 | 277 | const char * 278 | Header_Get_Id(const struct header *hdr) 279 | { 280 | 281 | CHECK_OBJ_NOTNULL(hdr, HEADER_MAGIC); 282 | 283 | AN(hdr->warc_record_id); 284 | return (hdr->warc_record_id); 285 | } 286 | 287 | void 288 | Header_Set_Id(struct header *hdr, const char *id) 289 | { 290 | size_t i; 291 | 292 | CHECK_OBJ_NOTNULL(hdr, HEADER_MAGIC); 293 | for (i = 0; id[i] != '\0'; i++) 294 | assert(isgraph(id[i])); 295 | REPLACE(hdr->warc_record_id, id); 296 | assert(i >= hdr->aa->id_size); 297 | hdr->warc_record_id[hdr->aa->id_size] = '\0'; 298 | } 299 | 300 | void 301 | Header_Set_Date(struct header *hdr) 302 | { 303 | struct tm tm; 304 | time_t t; 305 | char buf[100]; 306 | 307 | CHECK_OBJ_NOTNULL(hdr, HEADER_MAGIC); 308 | 309 | (void)time(&t); 310 | 311 | AN(gmtime_r(&t, &tm)); 312 | assert(strftime(buf, sizeof buf, "%Y-%m-%dT%H:%M:%SZ", &tm) == 20); 313 | Header_Set(hdr, "WARC-Date", "%s", buf); 314 | } 315 | 316 | void 317 | Header_Set_Ref(struct header *hdr, const char *name, const char *ref) 318 | { 319 | 320 | CHECK_OBJ_NOTNULL(hdr, HEADER_MAGIC); 321 | AN(name); 322 | AN(ref); 323 | assert(strlen(ref) >= hdr->aa->id_size); 324 | 325 | Header_Set(hdr, name, "<%s%.*s>", 326 | hdr->aa->prefix, hdr->aa->id_size, ref); 327 | } 328 | 329 | /* Parse one of our own WARC headers 
---------------------------------- 330 | * 331 | * NB: This is *not* a general purpose WARC header parser. 332 | */ 333 | 334 | // Flexelint bug: 335 | //lint -efunc(818, Header_Parse) 336 | 337 | struct header * 338 | Header_Parse(const struct aardwarc *aa, char *p) 339 | { 340 | const char *q10 = "WARC/1.0\r\nWARC-Record-ID: <"; 341 | const char *q11 = "WARC/1.1\r\nWARC-Record-ID: <"; 342 | char *q, *r, *s; 343 | struct header *hdr; 344 | 345 | CHECK_OBJ_NOTNULL(aa, AARDWARC_MAGIC); 346 | AN(p); 347 | 348 | ALLOC_OBJ(hdr, HEADER_MAGIC); 349 | AN(hdr); 350 | VTAILQ_INIT(&hdr->hfields); 351 | hdr->aa = aa; 352 | 353 | assert(!memcmp(p, q10, strlen(q10)) || !memcmp(p, q11, strlen(q11))); 354 | p = strchr(p, '\n'); 355 | AN(p); 356 | for (p++; *p != '\0'; p = q) { 357 | q = strchr(p, '\r'); 358 | AN(q); 359 | *q++ = '\0'; 360 | assert(*q == '\n'); 361 | *q++ = '\0'; 362 | if (*p == '\0') { 363 | assert(*q == '\0'); 364 | break; 365 | } 366 | r = strchr(p, ':'); 367 | AN(r); 368 | *r++ = '\0'; 369 | assert(*r == ' '); 370 | *r++ = '\0'; 371 | if (strcmp(p, "WARC-Record-ID")) { 372 | Header_Set(hdr, p, "%s", r); 373 | continue; 374 | } 375 | assert(*r == '<'); 376 | r++; 377 | s = strchr(r, '>'); 378 | AN(s); 379 | AZ(s[1]); 380 | s[0] = '\0'; 381 | s = strrchr(r, '/'); 382 | AN(s); 383 | REPLACE(hdr->warc_record_id, s + 1); 384 | s[1] = '\0'; 385 | assert(!strcmp(r, aa->prefix)); 386 | } 387 | return (hdr); 388 | } 389 | -------------------------------------------------------------------------------- /ident.c: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2018 Poul-Henning Kamp 3 | * All rights reserved. 4 | * 5 | * Author: Poul-Henning Kamp 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 1. 
Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * 2. Redistributions in binary form must reproduce the above copyright 13 | * notice, this list of conditions and the following disclaimer in the 14 | * documentation and/or other materials provided with the distribution. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 | * SUCH DAMAGE. 
27 | * 28 | */ 29 | 30 | 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | #include "vdef.h" 37 | #include "miniobj.h" 38 | #include "vas.h" 39 | #include "vsb.h" 40 | #include "sha256.h" 41 | 42 | #include "aardwarc.h" 43 | 44 | void 45 | Ident_Create(const struct aardwarc *aa, const struct header *hdr, 46 | const char *payload_digest, char *ident) 47 | { 48 | const char *typ, *ref; 49 | struct SHA256Context sha256[1]; 50 | 51 | CHECK_OBJ_NOTNULL(aa, AARDWARC_MAGIC); 52 | AN(hdr); 53 | AN(payload_digest); 54 | typ = Header_Get(hdr, "WARC-Type"); 55 | AN(typ); 56 | if (!strcmp(typ, "resource")) { 57 | /* We use the payload digest as ID */ 58 | strcpy(ident, payload_digest); 59 | } else if (!strcmp(typ, "continuation")) { 60 | /* We use the payload digest as ID */ 61 | strcpy(ident, payload_digest); 62 | } else if (!strcmp(typ, "warcinfo")) { 63 | /* We use the payload digest as ID */ 64 | strcpy(ident, payload_digest); 65 | } else if (!strcmp(typ, "metadata")) { 66 | /* ID=SHA256(reference_id + "\n" + SHA256(body) + "\n") */ 67 | SHA256_Init(sha256); 68 | ref = Header_Get(hdr, "WARC-Refers-To"); 69 | AN(ref); 70 | SHA256_Update(sha256, ref, strlen(ref)); 71 | SHA256_Update(sha256, "\n", 1); 72 | SHA256_Update(sha256, payload_digest, strlen(payload_digest)); 73 | SHA256_Update(sha256, "\n", 1); 74 | AN(SHA256_End(sha256, ident)); 75 | } else { 76 | fprintf(stderr, "XXX %s\n", typ); 77 | WRONG("Unknown WARC-Type"); 78 | } 79 | 80 | ident[aa->id_size] = '\0'; 81 | } 82 | 83 | void 84 | Ident_Set(const struct aardwarc *aa, struct header *hdr, 85 | const char *digest, const char *input) 86 | { 87 | char id[SHA256_DIGEST_STRING_LENGTH]; 88 | 89 | assert(input == NULL || IDX_Valid_Id(aa, input, NULL) == NULL); 90 | 91 | if (input == NULL) { 92 | Ident_Create(aa, hdr, digest, id); 93 | Header_Set_Id(hdr, id); 94 | } else { 95 | Header_Set_Id(hdr, input); 96 | } 97 | } 98 | 99 | char * 100 | Digest2Ident(const struct aardwarc *aa, const char *digest) 101 | { 
102 | struct vsb *vsb; 103 | char *id; 104 | 105 | CHECK_OBJ_NOTNULL(aa, AARDWARC_MAGIC); 106 | AN(digest); 107 | 108 | vsb = VSB_new_auto(); 109 | AN(vsb); 110 | VSB_printf(vsb, "%s%s", aa->prefix, digest); 111 | AZ(VSB_finish(vsb)); 112 | AZ(IDX_Valid_Id(aa, VSB_data(vsb), NULL)); 113 | id = strdup(VSB_data(vsb)); 114 | AN(id); 115 | VSB_destroy(&vsb); 116 | return (id); 117 | } 118 | -------------------------------------------------------------------------------- /main.c: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2016 Poul-Henning Kamp 3 | * All rights reserved. 4 | * 5 | * Author: Poul-Henning Kamp 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 1. Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * 2. Redistributions in binary form must reproduce the above copyright 13 | * notice, this list of conditions and the following disclaimer in the 14 | * documentation and/or other materials provided with the distribution. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 | * SUCH DAMAGE. 27 | * 28 | */ 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | #include "vdef.h" 37 | 38 | #include "vas.h" 39 | #include "vsb.h" 40 | 41 | #include "aardwarc.h" 42 | 43 | /*---------------------------------------------------------------------*/ 44 | 45 | static const struct mains { 46 | const char *name; 47 | main_f *func; 48 | int json; 49 | const char *line1; 50 | } mains[] = { 51 | #define MAIN(l,j,d) { #l, main_##l, j, d} 52 | MAIN(audit, 0, "Audit silos"), 53 | MAIN(byid, 0, "List entries by ID"), 54 | MAIN(cgi, 0, "CGI service"), 55 | MAIN(dumpindex, 0, "Dump index"), 56 | MAIN(filter, 0, "Filter list of IDs"), 57 | MAIN(get, 0, "Get record"), 58 | MAIN(housekeeping, 0, "Do housekeeping"), 59 | MAIN(info, 1, "Information about the archive"), 60 | MAIN(mksilo, 0, "Build a new silo"), 61 | MAIN(rebuild, 0, "Rebuild silos"), 62 | MAIN(reindex, 0, "Rebuild index"), 63 | MAIN(stevedore, 0, "Act as server"), 64 | MAIN(store, 0, "Store data"), 65 | MAIN(stow, 0, "Stow data to remote server"), 66 | MAIN(_testbytes, 0, "Bytes for tests"), 67 | { NULL, NULL, 0, NULL} 68 | }; 69 | 70 | void 71 | usage(const char *a0, const char *err) 72 | { 73 | const struct mains *mp; 74 | 75 | if (err != NULL) 76 | fprintf(stderr, "%s\n", err); 77 | fprintf(stderr, "Usage:\n"); 78 | fprintf(stderr, "\t%s [global options] operation [arguments]\n", a0); 79 | fprintf(stderr, "Global options:\n"); 80 | 
fprintf(stderr, "\t-c config_file\n"); 81 | fprintf(stderr, "Operations:\n"); 82 | for(mp = mains; mp->name != NULL; mp++) 83 | if (mp->name[0] != '.') 84 | fprintf(stderr, "\t%-12s %s\n", mp->name, mp->line1); 85 | } 86 | 87 | int 88 | call_main(const char *a0, struct aardwarc *aa, int argc, char **argv) 89 | { 90 | const struct mains *mp; 91 | for(mp = mains; mp->name != NULL; mp++) 92 | if (!strcmp(mp->name, argv[0])) 93 | break; 94 | if (mp->name == NULL) { 95 | usage(a0, "Unknown operation"); 96 | return (1); 97 | } 98 | if (aa->json && !mp->json) { 99 | usage(a0, "This subcommand does not do JSON."); 100 | return (2); 101 | } 102 | return (mp->func(a0, aa, argc, argv)); 103 | } 104 | 105 | int 106 | main(int argc, char **argv) 107 | { 108 | int ch, json = 0; 109 | const char *cf = NULL; 110 | struct vsb *vsb1, *vsb2, *vsb3; 111 | struct aardwarc *aa = NULL; 112 | const char *a0; 113 | char *home; 114 | char buf[BUFSIZ]; 115 | 116 | /* Parse global option flags ----------------------------------*/ 117 | 118 | a0 = *argv; 119 | while ((ch = getopt(argc, argv, "c:hj")) != -1) { 120 | switch(ch) { 121 | case 'h': 122 | usage(a0, NULL); 123 | exit(1); 124 | break; 125 | case 'j': 126 | json = 1; 127 | break; 128 | case 'c': 129 | cf = optarg; 130 | break; 131 | default: 132 | usage(a0, "Unknown global option error"); 133 | exit(1); 134 | break; 135 | } 136 | } 137 | argc -= optind; 138 | argv += optind; 139 | 140 | optreset = 1; 141 | optind = 1; 142 | 143 | if (argc == 0) { 144 | usage(a0, "Need command argument"); 145 | exit (1); 146 | } 147 | 148 | /* Open and parse our configuration ---------------------------*/ 149 | 150 | vsb1 = VSB_new_auto(); 151 | AN(vsb1); 152 | vsb2 = VSB_new_auto(); 153 | AN(vsb2); 154 | vsb3 = VSB_new_auto(); 155 | AN(vsb3); 156 | buf[0] = '\0'; 157 | 158 | if (cf != NULL) { 159 | aa = AardWARC_New(cf, vsb1); 160 | if (aa == NULL) { 161 | AZ(VSB_finish(vsb1)); 162 | fprintf(stderr, "%s", VSB_data(vsb1)); 163 | exit(2); 164 | } 165 | } 
else { 166 | home = getenv("HOME"); 167 | if (home != NULL) { 168 | bprintf(buf, "%s/.aardwarc.conf", home); 169 | aa = AardWARC_New(buf, vsb1); 170 | AZ(VSB_finish(vsb1)); 171 | } 172 | if (aa == NULL) { 173 | aa = AardWARC_New("/etc/aardwarc.conf", vsb2); 174 | AZ(VSB_finish(vsb2)); 175 | } 176 | if (aa == NULL) { 177 | aa = AardWARC_New("/usr/local/etc/aardwarc.conf", vsb3); 178 | AZ(VSB_finish(vsb3)); 179 | } 180 | if (aa == NULL) { 181 | fprintf(stderr, "No config file found, tried:\n"); 182 | if (buf[0] != '\0') 183 | fprintf(stderr, " %s\n\t%s\n", 184 | buf, VSB_data(vsb1)); 185 | fprintf(stderr, " /etc/aardwarc.conf\n\t%s\n", 186 | VSB_data(vsb2)); 187 | fprintf(stderr, 188 | " /usr/local/etc/aardwarc.conf\n\t%s\n", 189 | VSB_data(vsb3)); 190 | exit(1); 191 | } 192 | } 193 | aa->json = json; 194 | 195 | VSB_delete(vsb1); 196 | VSB_delete(vsb2); 197 | VSB_delete(vsb3); 198 | 199 | return (call_main(a0, aa, argc, argv)); 200 | } 201 | -------------------------------------------------------------------------------- /main_byid.c: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2016 Poul-Henning Kamp 3 | * All rights reserved. 4 | * 5 | * Author: Poul-Henning Kamp 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 1. Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * 2. Redistributions in binary form must reproduce the above copyright 13 | * notice, this list of conditions and the following disclaimer in the 14 | * documentation and/or other materials provided with the distribution. 
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 | * SUCH DAMAGE. 27 | * 28 | */ 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | #include "vdef.h" 36 | 37 | #include "vas.h" 38 | #include "miniobj.h" 39 | 40 | #include "aardwarc.h" 41 | 42 | static 43 | void 44 | usage_byid(const char *a0, const char *a00, const char *err) 45 | { 46 | usage(a0, err); 47 | fprintf(stderr, "Usage for this operation:\n"); 48 | fprintf(stderr, "\t%s [global options] %s [options] [silo]...\n", 49 | a0, a00); 50 | fprintf(stderr, "Options:\n"); 51 | fprintf(stderr, "\t[-e]\tAlways exit zero\n"); 52 | } 53 | 54 | struct privs { 55 | unsigned magic; 56 | #define PRIVS_MAGIC 0x37ea975a 57 | struct aardwarc *aa; 58 | int retval; 59 | }; 60 | 61 | static int v_matchproto_(idx_iter_f) 62 | byid_iter(void *priv, const char *key, 63 | uint32_t flag, uint32_t silo, int64_t offset, const char *cont) 64 | { 65 | struct rsilo *rs; 66 | struct header *hdr; 67 | const char *p; 68 | struct privs *pp; 69 | 70 | CAST_OBJ_NOTNULL(pp, priv, PRIVS_MAGIC); 71 | (void)key; 72 | (void)flag; 73 | (void)cont; 74 | // printf("%s 0x%08x %8u %12ju %s\n", key, flag, silo, offset, cont); 75 | rs = Rsilo_Open(pp->aa, NULL, silo, offset); 76 | AN(rs); 77 | hdr 
= Rsilo_ReadHeader(rs); 78 | AN(hdr); 79 | p = Header_Get_Id(hdr); 80 | printf("id %s", p); 81 | p = Header_Get(hdr, "WARC-Type"); 82 | printf(" wt %s", p); 83 | printf("\n"); 84 | Header_Destroy(&hdr); 85 | Rsilo_Close(&rs); 86 | pp->retval++; 87 | return(0); 88 | } 89 | 90 | int v_matchproto_(main_f) 91 | main_byid(const char *a0, struct aardwarc *aa, int argc, char **argv) 92 | { 93 | int ch; 94 | const char *a00 = *argv; 95 | const char *nid; 96 | struct privs privs[1]; 97 | int ok = 0; 98 | 99 | CHECK_OBJ_NOTNULL(aa, AARDWARC_MAGIC); 100 | INIT_OBJ(privs, PRIVS_MAGIC); 101 | privs->aa = aa; 102 | 103 | while ((ch = getopt(argc, argv, "he")) != -1) { 104 | switch (ch) { 105 | case 'h': 106 | usage_byid(a0, a00, NULL); 107 | exit(1); 108 | case 'e': 109 | ok = 1; 110 | break; 111 | default: 112 | usage_byid(a0, a00, "Unknown option error."); 113 | exit(1); 114 | } 115 | } 116 | argc -= optind; 117 | argv += optind; 118 | 119 | for (;argc > 0; argc--, argv++) { 120 | nid = *argv; 121 | if (!strncasecmp(nid, aa->prefix, strlen(aa->prefix))) 122 | nid += strlen(aa->prefix); 123 | if (strspn(nid, "0123456789abcdefABCDEF") != strlen(nid)) { 124 | fprintf(stderr, "Invalid ID-fragment\n"); 125 | exit(1); 126 | } 127 | (void)IDX_Iter(aa, nid, byid_iter, privs); 128 | } 129 | if (ok) 130 | return(0); 131 | return (privs->retval < 256 ? privs->retval : 255); 132 | } 133 | -------------------------------------------------------------------------------- /main_cgi.c: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2016 Poul-Henning Kamp 3 | * All rights reserved. 4 | * 5 | * Author: Poul-Henning Kamp 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 1. Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * 2. 
Redistributions in binary form must reproduce the above copyright 13 | * notice, this list of conditions and the following disclaimer in the 14 | * documentation and/or other materials provided with the distribution. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 | * SUCH DAMAGE. 
27 | * 28 | */ 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | #include "vdef.h" 36 | 37 | #include "vas.h" 38 | #include "vsb.h" 39 | #include "miniobj.h" 40 | 41 | #include "aardwarc.h" 42 | 43 | static 44 | void 45 | usage_cgi(const char *a0, const char *a00, const char *err) 46 | { 47 | usage(a0, err); 48 | fprintf(stderr, "Usage for this operation:\n"); 49 | fprintf(stderr, "\t%s [global options] %s [options]\n", 50 | a0, a00); 51 | } 52 | 53 | static int v_matchproto_(byte_iter_f) 54 | get_iter(void *priv, const void *ptr, ssize_t len) 55 | { 56 | 57 | (void)priv; 58 | assert(len == (ssize_t)fwrite(ptr, 1, len, stdout)); 59 | return (0); 60 | } 61 | 62 | int v_matchproto_(main_f) 63 | main_cgi(const char *a0, struct aardwarc *aa, int argc, char **argv) 64 | { 65 | int ch; 66 | const char *a00 = *argv; 67 | struct vsb *vsb; 68 | struct getjob *gj; 69 | const struct header *hdr; 70 | const char *p; 71 | const char *id; 72 | const char *ct; 73 | int gzip = 0; 74 | off_t o; 75 | 76 | CHECK_OBJ_NOTNULL(aa, AARDWARC_MAGIC); 77 | 78 | while ((ch = getopt(argc, argv, "h")) != -1) { 79 | switch (ch) { 80 | case 'h': 81 | usage_cgi(a0, a00, NULL); 82 | exit(1); 83 | default: 84 | usage_cgi(a0, a00, "Unknown option error."); 85 | exit(1); 86 | } 87 | } 88 | argc -= optind; 89 | argv += optind; 90 | 91 | if (argc != 0) { 92 | usage_cgi(a0, a00, "Too many arguments."); 93 | exit (1); 94 | } 95 | AZ(*argv); 96 | 97 | p = getenv("GATEWAY_INTERFACE"); 98 | if (p == NULL || strcmp(p, "CGI/1.1")) { 99 | usage_cgi(a0, a00, "No (good) $GATEWAY_INTERFACE"); 100 | exit(1); 101 | } 102 | p = getenv("REQUEST_METHOD"); 103 | if (p == NULL || strcmp(p, "GET")) { 104 | usage_cgi(a0, a00, "No (good) $REQUEST_METHOD"); 105 | exit(1); 106 | } 107 | 108 | id = getenv("PATH_INFO"); 109 | if (id == NULL) { 110 | usage_cgi(a0, a00, "No $PATH_INFO"); 111 | exit (1); 112 | } 113 | 114 | p = getenv("HTTP_ACCEPT_ENCODING"); 115 | if (p != NULL && strstr(p, "gzip")) 116 | 
gzip = 1; 117 | 118 | if (*id == '/') 119 | id++; 120 | 121 | vsb = VSB_new_auto(); 122 | AN(vsb); 123 | 124 | gj = GetJob_New(aa, id, vsb); 125 | if (gj == NULL) { 126 | AZ(VSB_finish(vsb)); 127 | printf("Content-Type: text/html\n"); 128 | printf("Status: 501 Error\n"); 129 | printf("\n"); 130 | printf(""); 131 | printf("
");
132 | 		printf("%s\n", VSB_data(vsb));
133 | 		printf("
"); 134 | printf(""); 135 | exit (0); 136 | } 137 | VSB_delete(vsb); 138 | 139 | /* 140 | * We cannot do the gzip trick for segmented objects (exactly 141 | * the ones that need it most) because firefox and curl do not 142 | * properly handle concatenated gzip files. 143 | * XXX: we could stich them, doing the CRC editing dance... 144 | */ 145 | if (GetJob_IsSegmented(gj)) 146 | gzip = 0; 147 | 148 | hdr = GetJob_Header(gj, 1); 149 | AN(hdr); 150 | 151 | ct = Header_Get(hdr, "Content-Type"); 152 | if (ct == NULL) 153 | ct = "application/binary"; 154 | printf("Content-Type: %s\n", ct); 155 | 156 | if (gzip) 157 | printf("Content-Encoding: gzip\n"); 158 | 159 | o = GetJob_TotalLength(gj, gzip); 160 | printf("Content-Length: %jd\n", (intmax_t)o); 161 | printf("Status: 200\n"); 162 | printf("\n"); 163 | 164 | GetJob_Iter(gj, get_iter, NULL, gzip); 165 | 166 | GetJob_Delete(&gj); 167 | return (0); 168 | } 169 | -------------------------------------------------------------------------------- /main_dumpindex.c: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2016 Poul-Henning Kamp 3 | * All rights reserved. 4 | * 5 | * Author: Poul-Henning Kamp 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 1. Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * 2. Redistributions in binary form must reproduce the above copyright 13 | * notice, this list of conditions and the following disclaimer in the 14 | * documentation and/or other materials provided with the distribution. 
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 | * SUCH DAMAGE. 27 | * 28 | */ 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | #include "vdef.h" 36 | 37 | #include "vas.h" 38 | #include "miniobj.h" 39 | 40 | #include "aardwarc.h" 41 | 42 | static 43 | void 44 | usage_dumpindex(const char *a0, const char *a00, const char *err) 45 | { 46 | usage(a0, err); 47 | fprintf(stderr, "Usage for this operation:\n"); 48 | fprintf(stderr, "\t%s [global options] %s [options] [id-part]...\n", 49 | a0, a00); 50 | fprintf(stderr, "\t\t -t {metadata|resource|warcinfo}\n"); 51 | } 52 | 53 | static int v_matchproto_(idx_iter_f) 54 | dumpindex_iter(void *priv, const char *key, 55 | uint32_t flag, uint32_t silo, int64_t offset, const char *cont) 56 | { 57 | uint32_t *u; 58 | 59 | u = priv; 60 | if (*u != 0 && flag != *u) 61 | return (0); 62 | printf("%s 0x%08x %8u %12jd %s\n", key, flag, silo, offset, cont); 63 | return(0); 64 | } 65 | 66 | int v_matchproto_(main_f) 67 | main_dumpindex(const char *a0, struct aardwarc *aa, int argc, char **argv) 68 | { 69 | int ch; 70 | const char *a00 = *argv; 71 | uint32_t u = 0; 72 | 73 | CHECK_OBJ_NOTNULL(aa, AARDWARC_MAGIC); 74 | 75 | while ((ch = getopt(argc, argv, "ht:")) != -1) { 76 | 
switch (ch) { 77 | case 'h': 78 | usage_dumpindex(a0, a00, NULL); 79 | exit(1); 80 | case 't': 81 | if (!strcmp(optarg, "metadata")) 82 | u = IDX_F_METADATA; 83 | else if (!strcmp(optarg, "resource")) 84 | u = IDX_F_RESOURCE; 85 | else if (!strcmp(optarg, "warcinfo")) 86 | u = IDX_F_WARCINFO; 87 | else { 88 | usage_dumpindex(a0, a00, 89 | "Wrong type for -t."); 90 | exit (1); 91 | } 92 | break; 93 | default: 94 | usage_dumpindex(a0, a00, "Unknown option error."); 95 | exit(1); 96 | } 97 | } 98 | argc -= optind; 99 | argv += optind; 100 | 101 | if (argc == 0) 102 | (void)IDX_Iter(aa, NULL, dumpindex_iter, &u); 103 | else 104 | for (;argc > 0; argc--, argv++) 105 | (void)IDX_Iter(aa, *argv, dumpindex_iter, &u); 106 | return (0); 107 | } 108 | -------------------------------------------------------------------------------- /main_filter.c: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2016 Poul-Henning Kamp 3 | * All rights reserved. 4 | * 5 | * Author: Poul-Henning Kamp 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 1. Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * 2. Redistributions in binary form must reproduce the above copyright 13 | * notice, this list of conditions and the following disclaimer in the 14 | * documentation and/or other materials provided with the distribution. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 | * SUCH DAMAGE. 27 | * 28 | */ 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | #include "vdef.h" 37 | 38 | #include "vas.h" 39 | #include "miniobj.h" 40 | 41 | #include "aardwarc.h" 42 | 43 | struct cand { 44 | unsigned magic; 45 | #define CAND_MAGIC 0x882a5fd4 46 | int found; 47 | char *id; 48 | char *line; 49 | VTAILQ_ENTRY(cand) list; 50 | VTAILQ_ENTRY(cand) sortlist; 51 | }; 52 | 53 | VTAILQ_HEAD(candhead,cand); 54 | 55 | static struct candhead candidates = 56 | VTAILQ_HEAD_INITIALIZER(candidates); 57 | static struct candhead sorted = 58 | VTAILQ_HEAD_INITIALIZER(sorted); 59 | static int ncand; 60 | static struct cand *next_cand; 61 | 62 | static int s_flag; 63 | 64 | struct filt { 65 | unsigned magic; 66 | #define FILT_MAGIC 0xc4b794e6 67 | struct aardwarc *aa; 68 | char last[65]; 69 | }; 70 | 71 | static 72 | void 73 | usage_filter(const char *a0, const char *a00, const char *err) 74 | { 75 | usage(a0, err); 76 | fprintf(stderr, "Usage for this operation:\n"); 77 | fprintf(stderr, 78 | "\t%s [global options] %s [options] [id-list-file]...\n", 79 | a0, a00); 80 | fprintf(stderr, "Options:\n"); 81 | fprintf(stderr, "\t-s Check the silo headers\n"); 82 | fprintf(stderr, "\t-r Report found (rather than missing) objects\n"); 83 | fprintf(stderr, "\t-v Report precense status on each line of output\n"); 84 | } 85 | 86 | static int 87 | filter_s_check(const struct filt *fp, uint32_t silo, int64_t offset, 88 
| const char *key) 89 | { 90 | struct rsilo *rs; 91 | struct header *hdr; 92 | const char *p; 93 | int retval = 0; 94 | 95 | rs = Rsilo_Open(fp->aa, NULL, silo, offset); 96 | AN(rs); 97 | hdr = Rsilo_ReadHeader(rs); 98 | AN(hdr); 99 | p = Header_Get_Id(hdr); 100 | AN(p); 101 | if (strcasecmp(p, key)) 102 | retval = 1; 103 | Header_Destroy(&hdr); 104 | Rsilo_Close(&rs); 105 | return (retval); 106 | } 107 | 108 | static int v_matchproto_(idx_iter_f) 109 | filter_iter(void *priv, const char *key, 110 | uint32_t flag, uint32_t silo, int64_t offset, const char *cont) 111 | { 112 | struct cand *c; 113 | struct filt *fp; 114 | 115 | CAST_OBJ_NOTNULL(fp, priv, FILT_MAGIC); 116 | (void)flag; 117 | (void)cont; 118 | 119 | if (VTAILQ_EMPTY(&sorted)) 120 | return (1); 121 | if (next_cand == NULL || strcmp(key, fp->last) < 0) 122 | next_cand = VTAILQ_FIRST(&sorted); 123 | strlcpy(fp->last, key, sizeof fp->last); 124 | while (next_cand != NULL && strcmp(next_cand->id, key) < 0) 125 | next_cand = VTAILQ_NEXT(next_cand, sortlist); 126 | while (next_cand != NULL && 127 | !strncmp(key, next_cand->id, strlen(key))) { 128 | c = next_cand; 129 | next_cand = VTAILQ_NEXT(c, sortlist); 130 | if (!s_flag || !filter_s_check(fp, silo, offset, c->id)) { 131 | c->found = 1; 132 | VTAILQ_REMOVE(&sorted, c, sortlist); 133 | } 134 | } 135 | return(0); 136 | } 137 | 138 | static int 139 | read_file(const struct aardwarc *aa, FILE *fi) 140 | { 141 | char buf[BUFSIZ], *p; 142 | struct cand *c1, *c2; 143 | int retval = 0; 144 | size_t sl; 145 | 146 | while (fgets(buf, sizeof buf, fi) != NULL) { 147 | sl = strlen(buf); 148 | AN(sl); 149 | if (buf[sl - 1] != '\n') { 150 | fprintf(stderr, "Over long line \"%.40s...\"\n", 151 | buf); 152 | exit(1); 153 | } 154 | buf[--sl] = '\0'; 155 | if (sl == 0) 156 | continue; 157 | ALLOC_OBJ(c1, CAND_MAGIC); 158 | AN(c1); 159 | REPLACE(c1->line, buf); 160 | AN(c1->line); 161 | p = buf; 162 | if (!strncasecmp(p, aa->prefix, strlen(aa->prefix))) 163 | p += 
strlen(aa->prefix); 164 | if (strlen(p) < aa->id_size) { 165 | fprintf(stderr, "ID too short: \"%s\"\n", buf); 166 | exit(1); 167 | } 168 | p[aa->id_size] = '\0'; 169 | if (strspn(p, "0123456789abcdefABCDEF") != strlen(p)) { 170 | fprintf(stderr, "Non-hex characters in id: \"%s\"\n", 171 | buf); 172 | exit(1); 173 | } 174 | REPLACE(c1->id, p); 175 | AN(c1->id); 176 | c2 = VTAILQ_LAST(&candidates, candhead); 177 | ncand++; 178 | VTAILQ_INSERT_TAIL(&candidates, c1, list); 179 | VTAILQ_INSERT_TAIL(&sorted, c1, sortlist); 180 | if (c2 != NULL && strcmp(c2->id, c1->id) > 0) 181 | retval = 1; 182 | } 183 | return (retval); 184 | } 185 | 186 | static int 187 | cand_cmp(const void *p1, const void *p2) 188 | { 189 | const struct cand *c1, *c2; 190 | 191 | CAST_OBJ_NOTNULL(c1, *(const struct cand * const*)p1, CAND_MAGIC); 192 | CAST_OBJ_NOTNULL(c2, *(const struct cand * const*)p2, CAND_MAGIC); 193 | return (strcmp(c1->id, c2->id)); 194 | } 195 | 196 | int v_matchproto_(main_f) 197 | main_filter(const char *a0, struct aardwarc *aa, int argc, char **argv) 198 | { 199 | int ch; 200 | const char *a00 = *argv; 201 | FILE *fi, *fo = stdout; 202 | const char *ofile = NULL; 203 | int stdin_done = 0; 204 | int needs_sort = 0; 205 | int r_flag = 0; 206 | int v_flag = 0; 207 | struct cand *c; 208 | struct cand **cp; 209 | struct filt *fp; 210 | 211 | CHECK_OBJ_NOTNULL(aa, AARDWARC_MAGIC); 212 | 213 | ALLOC_OBJ(fp, FILT_MAGIC); 214 | AN(fp); 215 | fp->aa = aa; 216 | 217 | while ((ch = getopt(argc, argv, "ho:rsv")) != -1) { 218 | switch (ch) { 219 | case 'h': 220 | usage_filter(a0, a00, NULL); 221 | exit(1); 222 | case 'o': 223 | ofile = optarg; 224 | break; 225 | case 'r': 226 | r_flag = 1 - r_flag; 227 | break; 228 | case 's': 229 | s_flag = 1 - s_flag; 230 | break; 231 | case 'v': 232 | v_flag = 1 - v_flag; 233 | break; 234 | default: 235 | usage_filter(a0, a00, "Unknown option error."); 236 | exit(1); 237 | } 238 | } 239 | argc -= optind; 240 | argv += optind; 241 | 242 | if (argc == 
0) 243 | needs_sort |= read_file(aa, stdin); 244 | 245 | if (ofile != NULL) { 246 | fo = fopen(ofile, "w"); 247 | if (fo == NULL) { 248 | fprintf(stderr, "Cannot open %s: %s\n", 249 | ofile, strerror(errno)); 250 | exit (1); 251 | } 252 | } 253 | 254 | for (;argc > 0; argc--, argv++) { 255 | if (!strcmp(*argv, "-")) { 256 | if (stdin_done++) { 257 | fprintf(stderr, "STDIN already processed\n"); 258 | exit(1); 259 | } 260 | needs_sort |= read_file(aa, stdin); 261 | } else { 262 | fi = fopen(*argv, "r"); 263 | if (fi == NULL) { 264 | fprintf(stderr, "Cannot open %s: %s\n", 265 | *argv, strerror(errno)); 266 | exit(1); 267 | } 268 | needs_sort |= read_file(aa, fi); 269 | AZ(fclose(fi)); 270 | } 271 | } 272 | if (needs_sort) { 273 | cp = calloc(ncand, sizeof *cp); 274 | AN(cp); 275 | ch = 0; 276 | VTAILQ_FOREACH(c, &candidates, list) 277 | cp[ch++] = c; 278 | assert(ch == ncand); 279 | qsort(cp, ncand, sizeof *cp, cand_cmp); 280 | VTAILQ_INIT(&sorted); 281 | for(ch = 0; ch < ncand; ch++) 282 | VTAILQ_INSERT_TAIL(&sorted, cp[ch], sortlist); 283 | free(cp); 284 | } 285 | (void)IDX_Iter(aa, NULL, filter_iter, fp); 286 | while (1) { 287 | c = VTAILQ_FIRST(&candidates); 288 | if (c == NULL) 289 | break; 290 | VTAILQ_REMOVE(&candidates, c, list); 291 | if (v_flag) 292 | fprintf(fo, "%d %s\n", c->found, c->line); 293 | else if (r_flag == c->found) 294 | fprintf(fo, "%s\n", c->line); 295 | REPLACE(c->id, NULL); 296 | FREE_OBJ(c); 297 | } 298 | FREE_OBJ(fp); 299 | return (0); 300 | } 301 | -------------------------------------------------------------------------------- /main_get.c: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2016 Poul-Henning Kamp 3 | * All rights reserved. 4 | * 5 | * Author: Poul-Henning Kamp 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 1. 
Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * 2. Redistributions in binary form must reproduce the above copyright 13 | * notice, this list of conditions and the following disclaimer in the 14 | * documentation and/or other materials provided with the distribution. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 | * SUCH DAMAGE. 
27 | * 28 | */ 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | #include "vdef.h" 37 | 38 | #include "vas.h" 39 | #include "vsb.h" 40 | #include "miniobj.h" 41 | 42 | #include "aardwarc.h" 43 | 44 | static 45 | void 46 | usage_get(const char *a0, const char *a00, const char *err) 47 | { 48 | usage(a0, err); 49 | fprintf(stderr, "Usage for this operation:\n"); 50 | fprintf(stderr, "\t%s [global options] %s [options] [silo]...\n", 51 | a0, a00); 52 | fprintf(stderr, "\t-n Headers only\n"); 53 | fprintf(stderr, "\t-o file Output file\n"); 54 | fprintf(stderr, "\t-q Quiet (no headers)\n"); 55 | fprintf(stderr, "\t-z Gzip output\n"); 56 | } 57 | 58 | struct get { 59 | unsigned magic; 60 | #define GET_MAGIC 0xc6629054 61 | 62 | uintmax_t len; 63 | struct SHA256Context sha256[1]; 64 | FILE *dst; 65 | FILE *hdr; 66 | int zip; 67 | }; 68 | 69 | static int v_matchproto_(byte_iter_f) 70 | get_iter(void *priv, const void *ptr, ssize_t len) 71 | { 72 | struct get *gp; 73 | 74 | CAST_OBJ_NOTNULL(gp, priv, GET_MAGIC); 75 | assert(len == (ssize_t)fwrite(ptr, 1, len, gp->dst)); 76 | if (!gp->zip) 77 | SHA256_Update(gp->sha256, ptr, len); 78 | gp->len += len; 79 | return (0); 80 | } 81 | 82 | int v_matchproto_(main_f) 83 | main_get(const char *a0, struct aardwarc *aa, int argc, char **argv) 84 | { 85 | int ch; 86 | const char *a00 = *argv; 87 | struct vsb *vsb; 88 | struct getjob *gj; 89 | struct get *gp; 90 | const struct header *hdr1, *hdr9; 91 | char *dig; 92 | const char *p; 93 | char buf[32]; 94 | int quiet = 0; 95 | const char *of = NULL; 96 | int zip = 0, hdr_only = 0; 97 | 98 | CHECK_OBJ_NOTNULL(aa, AARDWARC_MAGIC); 99 | 100 | while ((ch = getopt(argc, argv, "ho:nqz")) != -1) { 101 | switch (ch) { 102 | case 'h': 103 | usage_get(a0, a00, NULL); 104 | exit(1); 105 | case 'n': 106 | hdr_only = !hdr_only; 107 | break; 108 | case 'o': 109 | of = optarg; 110 | break; 111 | case 'q': 112 | quiet = !quiet; 113 | break; 114 | case 'z': 115 | zip = 
!zip; 116 | break; 117 | default: 118 | usage_get(a0, a00, "Unknown option error."); 119 | exit(1); 120 | } 121 | } 122 | argc -= optind; 123 | argv += optind; 124 | 125 | if (argc != 1) { 126 | usage_get(a0, a00, "Too many IDs."); 127 | exit (1); 128 | } 129 | AN(*argv); 130 | 131 | ALLOC_OBJ(gp, GET_MAGIC); 132 | AN(gp); 133 | gp->zip = zip; 134 | SHA256_Init(gp->sha256); 135 | 136 | if (of != NULL) { 137 | gp->dst = fopen(of, "w"); 138 | gp->hdr = stdout; 139 | } else { 140 | gp->dst = stdout; 141 | gp->hdr = stderr; 142 | } 143 | 144 | vsb = VSB_new_auto(); 145 | AN(vsb); 146 | 147 | gj = GetJob_New(aa, *argv, vsb); 148 | if (gj == NULL) { 149 | AZ(VSB_finish(vsb)); 150 | fprintf(stderr, "%s\n", VSB_data(vsb)); 151 | exit (1); 152 | } 153 | VSB_delete(vsb); 154 | hdr1 = GetJob_Header(gj, 1); 155 | AN(hdr1); 156 | hdr9 = GetJob_Header(gj, 0); 157 | AN(hdr9); 158 | if (!quiet) { 159 | vsb = GetJob_Headers(gj); 160 | AZ(VSB_finish(vsb)); 161 | fprintf(gp->hdr, "%s", VSB_data(vsb)); 162 | } 163 | 164 | if (!hdr_only) { 165 | GetJob_Iter(gj, get_iter, gp, zip); 166 | 167 | dig = SHA256_End(gp->sha256, NULL); 168 | AN(dig); 169 | 170 | if (!zip) { 171 | p = Header_Get(hdr1, "WARC-Payload-Digest"); 172 | if (p == NULL) 173 | p = Header_Get(hdr1, "WARC-Block-Digest"); 174 | AN(p); 175 | assert(!memcmp(p, "sha256:", 7)); 176 | p += 7; 177 | assert(!strncmp(p, dig, aa->id_size)); 178 | 179 | p = Header_Get(hdr9, "WARC-Segment-Total-Length"); 180 | if (p == NULL) 181 | p = Header_Get(hdr9, "Content-Length"); 182 | AN(p); 183 | bprintf(buf, "%ju", (uintmax_t)gp->len); 184 | assert(!strcmp(p, buf)); 185 | } 186 | } 187 | 188 | FREE_OBJ(gp); 189 | GetJob_Delete(&gj); 190 | return (0); 191 | } 192 | -------------------------------------------------------------------------------- /main_housekeeping.c: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2016 Poul-Henning Kamp 3 | * All rights reserved. 
4 | * 5 | * Author: Poul-Henning Kamp 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 1. Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * 2. Redistributions in binary form must reproduce the above copyright 13 | * notice, this list of conditions and the following disclaimer in the 14 | * documentation and/or other materials provided with the distribution. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 | * SUCH DAMAGE. 
27 | * 28 | */ 29 | 30 | #include 31 | #include 32 | #include 33 | 34 | #include "vdef.h" 35 | 36 | #include "vas.h" 37 | #include "miniobj.h" 38 | 39 | #include "aardwarc.h" 40 | 41 | static 42 | void 43 | usage_housekeeping(const char *a0, const char *a00, const char *err) 44 | { 45 | usage(a0, err); 46 | fprintf(stderr, "Usage for this operation:\n"); 47 | fprintf(stderr, "\t%s [global options] %s [options] [silo]...\n", 48 | a0, a00); 49 | } 50 | 51 | int v_matchproto_(main_f) 52 | main_housekeeping(const char *a0, struct aardwarc *aa, int argc, char **argv) 53 | { 54 | int ch; 55 | const char *a00 = *argv; 56 | 57 | CHECK_OBJ_NOTNULL(aa, AARDWARC_MAGIC); 58 | 59 | while ((ch = getopt(argc, argv, "h")) != -1) { 60 | switch (ch) { 61 | case 'h': 62 | usage_housekeeping(a0, a00, NULL); 63 | exit(1); 64 | default: 65 | usage_housekeeping(a0, a00, "Unknown option error."); 66 | exit(1); 67 | } 68 | } 69 | argc -= optind; 70 | argv += optind; 71 | 72 | AZ(argc); 73 | AZ(*argv); 74 | 75 | IDX_Resort(aa); 76 | 77 | return (0); 78 | } 79 | -------------------------------------------------------------------------------- /main_info.c: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2016 Poul-Henning Kamp 3 | * All rights reserved. 4 | * 5 | * Author: Poul-Henning Kamp 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 1. Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * 2. Redistributions in binary form must reproduce the above copyright 13 | * notice, this list of conditions and the following disclaimer in the 14 | * documentation and/or other materials provided with the distribution. 
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 | * SUCH DAMAGE. 27 | * 28 | */ 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | #include "vdef.h" 36 | #include "vas.h" 37 | 38 | #include "miniobj.h" 39 | 40 | #include "aardwarc.h" 41 | 42 | static 43 | void 44 | usage_info(const char *a0, const char *a00, const char *err) 45 | { 46 | usage(a0, err); 47 | fprintf(stderr, "Usage for this operation:\n"); 48 | fprintf(stderr, "\t%s [global options] %s [options] [silo]...\n", 49 | a0, a00); 50 | } 51 | 52 | int v_matchproto_(main_f) 53 | main_info(const char *a0, struct aardwarc *aa, int argc, char **argv) 54 | { 55 | int ch; 56 | const char *a00 = *argv; 57 | 58 | CHECK_OBJ_NOTNULL(aa, AARDWARC_MAGIC); 59 | 60 | while ((ch = getopt(argc, argv, "h")) != -1) { 61 | switch (ch) { 62 | case 'h': 63 | usage_info(a0, a00, NULL); 64 | exit(1); 65 | default: 66 | usage_info(a0, a00, "Unknown option error."); 67 | exit(1); 68 | } 69 | } 70 | if (argc > optind) { 71 | usage_info(a0, a00, "No arguments allowed."); 72 | exit(1); 73 | } 74 | if (aa->json) { 75 | printf("[ \"AardWARC\", \"info\", \"1\", {\n"); 76 | printf(" \"id_size\": %u\n", aa->id_size); 77 | printf("} ]\n"); 78 | } else { 79 | printf("id_size: %u\n", 
aa->id_size); 80 | } 81 | return (0); 82 | } 83 | -------------------------------------------------------------------------------- /main_mksilo.c: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2016 Poul-Henning Kamp 3 | * All rights reserved. 4 | * 5 | * Author: Poul-Henning Kamp 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 1. Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * 2. Redistributions in binary form must reproduce the above copyright 13 | * notice, this list of conditions and the following disclaimer in the 14 | * documentation and/or other materials provided with the distribution. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 | * SUCH DAMAGE. 
27 | * 28 | */ 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | #include "vdef.h" 38 | #include "vas.h" 39 | #include "miniobj.h" 40 | #include "aardwarc.h" 41 | 42 | static void 43 | usage_mksilo(const char *a0, const char *a00, const char *err) 44 | { 45 | usage(a0, err); 46 | fprintf(stderr, "Usage for this operation:\n"); 47 | fprintf(stderr, "\t%s [global options] %s [options] silo#\n", 48 | a0, a00); 49 | } 50 | 51 | int v_matchproto_(main_f) 52 | main_mksilo(const char *a0, struct aardwarc *aa, int argc, char **argv) 53 | { 54 | int ch, retval = 0; 55 | const char *a00 = *argv; 56 | unsigned u; 57 | struct wsilo *ws; 58 | 59 | CHECK_OBJ_NOTNULL(aa, AARDWARC_MAGIC); 60 | setbuf(stdout, NULL); 61 | setbuf(stderr, NULL); 62 | 63 | while ((ch = getopt(argc, argv, "h")) != -1) { 64 | switch (ch) { 65 | case 'h': 66 | usage_mksilo(a0, a00, NULL); 67 | exit(1); 68 | default: 69 | usage_mksilo(a0, a00, "Unknown option error."); 70 | exit(1); 71 | } 72 | } 73 | argc -= optind; 74 | argv += optind; 75 | 76 | if (argc == 0) { 77 | usage_mksilo(a0, a00, "No silo#."); 78 | exit(1); 79 | } 80 | u = strtoul(argv[0], NULL, 0); 81 | ws = Wsilo_New(aa, u); 82 | if (ws == NULL) { 83 | usage_mksilo(a0, a00, "Could not."); 84 | exit(1); 85 | } 86 | Wsilo_Install(&ws); 87 | 88 | return (retval); 89 | } 90 | -------------------------------------------------------------------------------- /main_rebuild.c: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2016 Poul-Henning Kamp 3 | * All rights reserved. 4 | * 5 | * Author: Poul-Henning Kamp 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 1. Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * 2. 
Redistributions in binary form must reproduce the above copyright 13 | * notice, this list of conditions and the following disclaimer in the 14 | * documentation and/or other materials provided with the distribution. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 | * SUCH DAMAGE. 
27 | * 28 | */ 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | #define ZLIB_CONST 38 | #include 39 | 40 | #include "vdef.h" 41 | 42 | #include "vas.h" 43 | #include "vsb.h" 44 | #include "miniobj.h" 45 | 46 | #include "aardwarc.h" 47 | 48 | struct rebuild { 49 | unsigned magic; 50 | #define REBUILD_MAGIC 0xf972c416 51 | struct aardwarc *aa; 52 | int fdo; 53 | 54 | char hdrbuf[BUFSIZ]; 55 | unsigned char fixbuf[BUFSIZ]; 56 | size_t hdrlen; 57 | struct vsb *vsb; 58 | int state; 59 | intmax_t rlen; 60 | off_t body_start; 61 | z_stream zs[1]; 62 | unsigned char obuf[128 * 1024]; 63 | struct SHA256Context sha256[1]; 64 | struct header *h; 65 | intmax_t clen; 66 | }; 67 | 68 | static void 69 | rebuild_process(struct rebuild *rb, const unsigned char *ptr, ssize_t len) 70 | { 71 | ssize_t oz, wz; 72 | char *p; 73 | const char *q; 74 | off_t ll; 75 | int i; 76 | 77 | CHECK_OBJ_NOTNULL(rb, REBUILD_MAGIC); 78 | 79 | while (len > 0) { 80 | if (rb->state == 0) { 81 | AZ(rb->hdrlen); 82 | rb->state = 1; 83 | } 84 | 85 | if (rb->state == 1) { 86 | assert(rb->hdrlen + len + 1 < sizeof rb->hdrbuf); 87 | memcpy(rb->hdrbuf + rb->hdrlen, ptr, len); 88 | rb->hdrlen += len; 89 | len = 0; 90 | rb->hdrbuf[rb->hdrlen] = '\0'; 91 | p = strstr(rb->hdrbuf, "\r\n\r\n"); 92 | if (p == NULL) 93 | return; 94 | p += 4; 95 | xxxassert(p == rb->hdrbuf + rb->hdrlen); 96 | AZ(rb->h); 97 | rb->h = Header_Parse(rb->aa, rb->hdrbuf); 98 | AN(rb->h); 99 | rb->clen = Header_Get_Number(rb->h, "Content-Length"); 100 | 101 | Header_Delete(rb->h, "Content-Length-GZIP"); 102 | Header_Delete(rb->h, "z"); 103 | 104 | q = Header_Get(rb->h, "WARC-record-digest"); 105 | if (q != NULL) { 106 | rb->state = 3; 107 | rb->rlen = 0; 108 | SHA256_Init(rb->sha256); 109 | continue; 110 | } 111 | 112 | VSB_destroy(&rb->vsb); 113 | rb->vsb = Header_Serialize(rb->h, 0); 114 | wz = write(rb->fdo, 115 | VSB_data(rb->vsb), VSB_len(rb->vsb)); 116 | assert(wz == VSB_len(rb->vsb)); 
117 | Header_Destroy(&rb->h); 118 | rb->hdrlen = 0; 119 | rb->body_start = lseek(rb->fdo, 0, SEEK_CUR); 120 | 121 | Gzip_InitDeflate(rb->zs); 122 | Gzip_AddAa(rb->zs); 123 | 124 | rb->state = 10; 125 | rb->rlen = rb->clen; 126 | continue; 127 | } 128 | 129 | if (rb->state == 3) { 130 | VSB_destroy(&rb->vsb); 131 | rb->vsb = Header_Serialize(rb->h, -1); 132 | AZ(VSB_finish(rb->vsb)); 133 | printf("FIXUP from\n%s\n", VSB_data(rb->vsb)); 134 | 135 | Header_Delete(rb->h, "WARC-record-digest"); 136 | oz = rb->clen - rb->rlen; 137 | if (len < oz) 138 | oz = len; 139 | SHA256_Update(rb->sha256, ptr, oz); 140 | memcpy(rb->fixbuf + rb->rlen, ptr, oz); 141 | ptr += oz; 142 | len -= oz; 143 | rb->rlen += oz; 144 | if (rb->rlen < rb->clen) 145 | continue; 146 | p = SHA256_End(rb->sha256, NULL); 147 | Header_Set(rb->h, "WARC-Block-digest", 148 | "sha256:%s", p); 149 | Header_Set_Id(rb->h, p); 150 | free(p); 151 | 152 | VSB_destroy(&rb->vsb); 153 | rb->vsb = Header_Serialize(rb->h, -1); 154 | AZ(VSB_finish(rb->vsb)); 155 | printf("FIXUP to\n%s\n", VSB_data(rb->vsb)); 156 | 157 | VSB_destroy(&rb->vsb); 158 | rb->vsb = Header_Serialize(rb->h, 0); 159 | wz = write(rb->fdo, 160 | VSB_data(rb->vsb), VSB_len(rb->vsb)); 161 | assert(wz == VSB_len(rb->vsb)); 162 | Header_Destroy(&rb->h); 163 | rb->hdrlen = 0; 164 | 165 | rb->body_start = lseek(rb->fdo, 0, SEEK_CUR); 166 | Gzip_InitDeflate(rb->zs); 167 | Gzip_AddAa(rb->zs); 168 | rb->zs->avail_in = rb->clen; 169 | rb->zs->next_in = rb->fixbuf; 170 | rb->rlen = 0; 171 | rb->state = 11; 172 | continue; 173 | } 174 | if (rb->state == 10) { 175 | oz = rb->rlen; 176 | if (len < oz) 177 | oz = len; 178 | rb->zs->avail_in = oz; 179 | rb->zs->next_in = ptr; 180 | len -= oz; 181 | ptr += oz; 182 | rb->rlen -= oz; 183 | rb->state = 11; 184 | // NB: No continue here, have to do 11 before return 185 | } 186 | if (rb->state == 11) { 187 | rb->zs->avail_out = sizeof rb->obuf; 188 | rb->zs->next_out = rb->obuf; 189 | if (rb->rlen > 0) { 190 | i = 
deflate(rb->zs, 0); 191 | assert(i == Z_OK); 192 | } else { 193 | i = deflate(rb->zs, Z_SYNC_FLUSH); 194 | assert(i == Z_OK); 195 | i = deflate(rb->zs, Z_FINISH); 196 | assert(i == Z_STREAM_END); 197 | } 198 | oz = sizeof rb->obuf - rb->zs->avail_out; 199 | if (oz) { 200 | wz = write(rb->fdo, rb->obuf, oz); 201 | assert(wz == oz); 202 | } 203 | if (rb->rlen) { 204 | rb->state = 10; 205 | continue; 206 | } 207 | assert(deflateEnd(rb->zs) == Z_OK); 208 | ll = lseek(rb->fdo, 0, SEEK_CUR); 209 | (void)lseek(rb->fdo, rb->body_start, SEEK_SET); 210 | Gzip_WriteAa(rb->fdo, ll - rb->body_start); 211 | (void)lseek(rb->fdo, ll, SEEK_SET); 212 | wz = write(rb->fdo, 213 | Gzip_crnlcrnl, sizeof Gzip_crnlcrnl); 214 | assert(wz == sizeof Gzip_crnlcrnl); 215 | rb->state = 20; 216 | rb->rlen = 4; 217 | continue; 218 | } 219 | if (rb->state == 20) { 220 | oz = rb->rlen; 221 | if (len < oz) 222 | oz = len; 223 | len -= oz; 224 | ptr += oz; 225 | rb->rlen -= oz; 226 | if (!rb->rlen) 227 | rb->state = 0; 228 | continue; 229 | } 230 | } 231 | } 232 | 233 | 234 | static int v_matchproto_(byte_iter_f) 235 | rebuild_silo_iter(void *priv, const void *fn, ssize_t silono) 236 | { 237 | struct rebuild *rb; 238 | z_stream zs[1]; 239 | int ps = getpagesize(); 240 | unsigned char ibuf[ps * 16]; 241 | unsigned char obuf[ps * 16]; 242 | FILE *fi; 243 | size_t rz; 244 | ssize_t oz; 245 | int i; 246 | 247 | CAST_OBJ_NOTNULL(rb, priv, REBUILD_MAGIC); 248 | fi = fopen(fn, "rb"); 249 | if (fi == NULL) { 250 | fprintf(stderr, 251 | "Cannot open %s: %s\n", (const char*)fn, strerror(errno)); 252 | return (0); 253 | } 254 | fprintf(stderr, "SILO NO %zd FN %s\n", silono, (const char *)fn); 255 | VSB_clear(rb->vsb); 256 | VSB_cat(rb->vsb, fn); 257 | VSB_cat(rb->vsb, "_"); 258 | AZ(VSB_finish(rb->vsb)); 259 | rb->fdo = open(VSB_data(rb->vsb), O_RDWR | O_CREAT | O_TRUNC, 0600); 260 | assert(rb->fdo >= 0); 261 | 262 | memset(zs, 0, sizeof zs); 263 | zs->next_in = (void*)ibuf; 264 | i = inflateInit2(zs, 15 + 32); 
265 | assert(i == Z_OK); 266 | 267 | do { 268 | zs->next_out = (void*)obuf; 269 | zs->avail_out = sizeof obuf; 270 | if (!zs->avail_in) { 271 | rz = fread(ibuf, 1, sizeof ibuf, fi); 272 | if (!rz) 273 | break; 274 | zs->next_in = ibuf; 275 | zs->avail_in = rz; 276 | } 277 | i = inflate(zs, 0); 278 | oz = sizeof obuf - zs->avail_out; 279 | obuf[oz] = '\0'; 280 | rebuild_process(rb, obuf, oz); 281 | if (i == Z_STREAM_END) { 282 | assert(inflateEnd(zs) == Z_OK); 283 | i = inflateInit2(zs, 15 + 32); 284 | assert(i == Z_OK); 285 | } 286 | } while (i == Z_OK); 287 | 288 | AZ(close(rb->fdo)); 289 | 290 | return(0); 291 | } 292 | 293 | static void 294 | usage_rebuild(const char *a0, const char *a00, const char *err) 295 | { 296 | usage(a0, err); 297 | fprintf(stderr, "Usage for this operation:\n"); 298 | fprintf(stderr, "\t%s [global options] %s [options] [silo]...\n", 299 | a0, a00); 300 | } 301 | 302 | int v_matchproto_(main_f) 303 | main_rebuild(const char *a0, struct aardwarc *aa, int argc, char **argv) 304 | { 305 | int ch, retval = 0; 306 | const char *a00 = *argv; 307 | struct rebuild *rb; 308 | 309 | CHECK_OBJ_NOTNULL(aa, AARDWARC_MAGIC); 310 | setbuf(stdout, NULL); 311 | setbuf(stderr, NULL); 312 | 313 | while ((ch = getopt(argc, argv, "h")) != -1) { 314 | switch (ch) { 315 | case 'h': 316 | usage_rebuild(a0, a00, NULL); 317 | exit(1); 318 | default: 319 | usage_rebuild(a0, a00, "Unknown option error."); 320 | exit(1); 321 | } 322 | } 323 | argc -= optind; 324 | argv += optind; 325 | 326 | ALLOC_OBJ(rb, REBUILD_MAGIC); 327 | AN(rb); 328 | rb->vsb = VSB_new_auto(); 329 | AN(rb->vsb); 330 | rb->aa = aa; 331 | 332 | if (argc == 0) 333 | retval |= Silo_Iter(aa, rebuild_silo_iter, rb); 334 | while (argc-- > 0) 335 | retval |= rebuild_silo_iter(rb, *argv++, -1); 336 | 337 | VSB_destroy(&rb->vsb); 338 | FREE_OBJ(rb); 339 | return (retval); 340 | } 341 | -------------------------------------------------------------------------------- /main_reindex.c: 
-------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2016 Poul-Henning Kamp 3 | * All rights reserved. 4 | * 5 | * Author: Poul-Henning Kamp 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 1. Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * 2. Redistributions in binary form must reproduce the above copyright 13 | * notice, this list of conditions and the following disclaimer in the 14 | * documentation and/or other materials provided with the distribution. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 | * SUCH DAMAGE. 27 | * 28 | * TODO: 29 | * Check filename in warcinfo 30 | * Check if silo has correct filename based on silono (reindex2) 31 | * Don't explode on corrupt silos. 
(reindex2) 32 | * Handle duplicates (reindex2) 33 | */ 34 | 35 | #include 36 | #include 37 | #include 38 | #include 39 | 40 | #include "vdef.h" 41 | 42 | #include "vas.h" 43 | #include "miniobj.h" 44 | 45 | #include "aardwarc.h" 46 | 47 | struct seg { 48 | unsigned magic; 49 | #define SEG_MAGIC 0x0753a4c0 50 | char *id; 51 | uint32_t flg; 52 | char *parent; 53 | ssize_t silono; 54 | off_t off, segno; 55 | VTAILQ_ENTRY(seg) list; 56 | int used; 57 | int done; 58 | }; 59 | 60 | static VTAILQ_HEAD(seghead,seg) segs = VTAILQ_HEAD_INITIALIZER(segs); 61 | static unsigned nsegs = 0; 62 | 63 | static void 64 | dump(const struct aardwarc *aa, const char *pfx, struct seg *seg) 65 | { 66 | printf("%s%p %.*s %d%d %zd %s %jd %jd\n", 67 | pfx, seg, (int)aa->id_size, seg->parent, seg->used, seg->done, 68 | seg->silono, seg->id, seg->off, seg->segno); 69 | } 70 | 71 | static void 72 | dump_left(const struct aardwarc *aa) 73 | { 74 | struct seg *seg; 75 | 76 | printf("NSEGS %u\n", nsegs); 77 | VTAILQ_FOREACH(seg, &segs, list) 78 | dump(aa, "", seg); 79 | } 80 | 81 | static void 82 | drop_seg(struct seg *seg) 83 | { 84 | 85 | VTAILQ_REMOVE(&segs, seg, list); 86 | AN(seg->done); 87 | REPLACE(seg->id, NULL); 88 | REPLACE(seg->parent, NULL); 89 | FREE_OBJ(seg); 90 | nsegs--; 91 | } 92 | 93 | static void 94 | emit_seg(const struct aardwarc *aa, struct seg *seg, struct seg *seg2) 95 | { 96 | 97 | CHECK_OBJ_NOTNULL(aa, AARDWARC_MAGIC); 98 | CHECK_OBJ_NOTNULL(seg, SEG_MAGIC); 99 | CHECK_OBJ_NOTNULL(seg2, SEG_MAGIC); 100 | IDX_Insert(aa, seg->id, seg->flg, seg->silono, seg->off, seg2->id); 101 | AZ(seg->done); 102 | AZ(seg2->used); 103 | seg->done++; 104 | seg2->used++; 105 | if (seg->used && seg->done) 106 | drop_seg(seg); 107 | if (seg2->used && seg2->done) 108 | drop_seg(seg2); 109 | } 110 | 111 | static void 112 | try_seg(const struct aardwarc *aa, struct seg *seg) 113 | { 114 | struct seg *seg2; 115 | 116 | seg2 = VTAILQ_NEXT(seg, list); 117 | if (seg2 != NULL && !seg2->done && 
seg2->segno + 1 == seg->segno && 118 | !strncmp(seg2->parent, seg->parent, aa->id_size)) 119 | emit_seg(aa, seg2, seg); 120 | } 121 | 122 | static int v_matchproto_(idx_iter_f) 123 | reindex_iter(void *priv, const char *key, 124 | uint32_t flag, uint32_t silo, int64_t offset, const char *cont) 125 | { 126 | struct seg *seg, *seg2; 127 | 128 | (void)silo; 129 | (void)offset; 130 | if (!(flag & IDX_F_SEGMENTED)) 131 | return(0); 132 | VTAILQ_FOREACH_SAFE(seg, &segs, list, seg2) { 133 | if (!strncmp(key, seg->id, 16)) { 134 | IDX_Insert(priv, seg->id, seg->flg, seg->silono, 135 | seg->off, cont); 136 | seg->done++; 137 | drop_seg(seg); 138 | } 139 | } 140 | return (0); 141 | } 142 | 143 | static void 144 | got_seg(const struct aardwarc *aa, const struct header *hdr, 145 | uint32_t flg, off_t off, off_t segno, ssize_t silono) 146 | { 147 | const char *id; 148 | const char *parent; 149 | const char *tl; 150 | struct seg *seg, *seg2; 151 | 152 | CHECK_OBJ_NOTNULL(aa, AARDWARC_MAGIC); 153 | AN(hdr); 154 | id = Header_Get_Id(hdr); 155 | flg |= IDX_F_SEGMENTED; 156 | if (segno > 1) { 157 | parent = Header_Get(hdr, "WARC-Segment-Origin-ID"); 158 | AN(parent); 159 | parent = strrchr(parent, '/'); 160 | AN(parent); 161 | parent++; 162 | } else { 163 | flg |= IDX_F_FIRSTSEG; 164 | parent = id; 165 | } 166 | tl = Header_Get(hdr, "WARC-Segment-Total-Length"); 167 | if (tl != NULL) 168 | flg |= IDX_F_LASTSEG; 169 | 170 | ALLOC_OBJ(seg, SEG_MAGIC); 171 | AN(seg); 172 | nsegs++; 173 | REPLACE(seg->id, id); 174 | REPLACE(seg->parent, parent); 175 | seg->flg = flg; 176 | seg->off = off; 177 | seg->segno = segno; 178 | seg->silono = silono; 179 | 180 | if (segno == 1) 181 | seg->used = 1; 182 | if (tl != NULL) { 183 | IDX_Insert(aa, seg->id, seg->flg, seg->silono, seg->off, NULL); 184 | seg->done = 1; 185 | } 186 | 187 | VTAILQ_FOREACH(seg2, &segs, list) { 188 | if (strncmp(seg2->parent, seg->parent, aa->id_size) < 0) 189 | continue; 190 | if (strncmp(seg2->parent, seg->parent, 
aa->id_size) > 0) 191 | break; 192 | if (seg2->segno < seg->segno) 193 | break; 194 | } 195 | if (seg2 != NULL) 196 | VTAILQ_INSERT_BEFORE(seg2, seg, list); 197 | else 198 | VTAILQ_INSERT_TAIL(&segs, seg, list); 199 | 200 | seg2 = VTAILQ_PREV(seg, seghead, list); 201 | try_seg(aa, seg); 202 | if (seg2 != NULL) 203 | try_seg(aa, seg2); 204 | } 205 | 206 | static int v_matchproto_(byte_iter_f) 207 | silo_iter(void *priv, const void *fn, ssize_t silono) 208 | { 209 | struct aardwarc *aa; 210 | struct rsilo *rs; 211 | struct header *hdr; 212 | off_t off, segno; 213 | intmax_t im; 214 | uint32_t flg; 215 | const char *p, *wt; 216 | 217 | CAST_OBJ_NOTNULL(aa, priv, AARDWARC_MAGIC); 218 | 219 | rs = Rsilo_Open(aa, fn, silono, 0); 220 | if (rs == NULL) 221 | return (-1); 222 | 223 | if (silono < 0) { 224 | p = strrchr(fn, '/'); 225 | AN(p); 226 | p++; 227 | silono = (ssize_t)strtoul(p, NULL, 10); 228 | } 229 | 230 | while (1) { 231 | off = Rsilo_Tell(rs); 232 | hdr = Rsilo_ReadHeader(rs); 233 | if (hdr == NULL) 234 | break; 235 | 236 | flg = 0; 237 | wt = Header_Get(hdr, "WARC-Type"); 238 | AN(wt); 239 | if (!strcmp(wt, "warcinfo")) 240 | flg = IDX_F_WARCINFO; 241 | else if (!strcmp(wt, "metadata")) 242 | flg = IDX_F_METADATA; 243 | else if (!strcmp(wt, "resource")) 244 | flg = IDX_F_RESOURCE; 245 | 246 | im = Header_Get_Number(hdr, "WARC-Segment-Number"); 247 | if (im < 0) { 248 | segno = 0; 249 | IDX_Insert(aa, Header_Get_Id(hdr), 250 | flg, silono, off, NULL); 251 | } else { 252 | segno = (off_t)im; 253 | got_seg(aa, hdr, flg, off, segno, silono); 254 | } 255 | 256 | Header_Destroy(&hdr); 257 | Rsilo_NextHeader(rs); 258 | } 259 | Rsilo_Close(&rs); 260 | IDX_Resort(aa); 261 | return (0); 262 | } 263 | 264 | static void 265 | usage_reindex(const char *a0, const char *a00, const char *err) 266 | { 267 | usage(a0, err); 268 | fprintf(stderr, "Usage for this operation:\n"); 269 | fprintf(stderr, "\t%s [global options] %s [options] [silo]...\n", 270 | a0, a00); 271 | 
//fprintf(stderr, "Options:\n"); 272 | //fprintf(stderr, "\t-m mime_type\n"); 273 | //fprintf(stderr, "\t-t {metadata|resource}\n"); 274 | } 275 | 276 | int v_matchproto_(main_f) 277 | main_reindex(const char *a0, struct aardwarc *aa, int argc, char **argv) 278 | { 279 | int ch, retval = 0; 280 | const char *a00 = *argv; 281 | 282 | CHECK_OBJ_NOTNULL(aa, AARDWARC_MAGIC); 283 | 284 | while ((ch = getopt(argc, argv, "h")) != -1) { 285 | switch (ch) { 286 | case 'h': 287 | usage_reindex(a0, a00, NULL); 288 | exit(1); 289 | default: 290 | usage_reindex(a0, a00, "Unknown option error."); 291 | exit(1); 292 | } 293 | } 294 | argc -= optind; 295 | argv += optind; 296 | 297 | if (argc == 0) 298 | retval |= Silo_Iter(aa, silo_iter, aa); 299 | while (argc-- > 0) 300 | retval |= silo_iter(aa, *argv++, -1); 301 | if (nsegs > 0) { 302 | printf("Rematch (%u)\n", nsegs); 303 | (void)IDX_Iter(aa, NULL, reindex_iter, aa); 304 | } 305 | if (nsegs > 0) { 306 | printf("Leftovers\n"); 307 | dump_left(aa); 308 | } 309 | return (retval); 310 | } 311 | -------------------------------------------------------------------------------- /main_store.c: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2016 Poul-Henning Kamp 3 | * All rights reserved. 4 | * 5 | * Author: Poul-Henning Kamp 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 1. Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * 2. Redistributions in binary form must reproduce the above copyright 13 | * notice, this list of conditions and the following disclaimer in the 14 | * documentation and/or other materials provided with the distribution. 
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 | * SUCH DAMAGE. 27 | * 28 | */ 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | #include "vdef.h" 38 | 39 | #include "vas.h" 40 | #include "vsb.h" 41 | #include "miniobj.h" 42 | 43 | #include "aardwarc.h" 44 | 45 | static const char * const WT_RESOURCE = "resource"; 46 | static const char * const WT_METADATA = "metadata"; 47 | 48 | static int 49 | mime_print(void *priv, const char *name, const char *arg) 50 | { 51 | (void)priv; 52 | (void)arg; 53 | fprintf(stderr, "\t%s\n", name); 54 | return (0); 55 | } 56 | 57 | static int 58 | mime_type(struct aardwarc *aa, const char *wt, const char *mt) 59 | { 60 | int i; 61 | const char *p, *g; 62 | 63 | if (wt == WT_RESOURCE) 64 | g = "resource.mime-types"; 65 | else 66 | g = "metadata.mime-types"; 67 | 68 | i = Config_Find(aa->cfg, g, mt, &p); 69 | if (i) { 70 | fprintf(stderr, "Illegal mime-type for %s, pick one of:\n", wt); 71 | (void)Config_Iter(aa->cfg, g, NULL, mime_print); 72 | } else 73 | aa->mime_validator = p; 74 | return (i); 75 | } 76 | 77 | static 78 | void 79 | usage_store(const char *a0, const char *a00, const char *err) 80 | { 81 | usage(a0, err); 82 | fprintf(stderr, "Usage 
for this operation:\n"); 83 | fprintf(stderr, "\t%s [global options] %s [options] {filename|-}\n", 84 | a0, a00); 85 | fprintf(stderr, "Options:\n"); 86 | fprintf(stderr, "\t-i Forced identifier (metadata only)\n"); 87 | fprintf(stderr, "\t-m mime_type\n"); 88 | fprintf(stderr, "\t-r WARC-Refers-To: reference (metadata only)\n"); 89 | fprintf(stderr, "\t-t {metadata|resource}\n"); 90 | } 91 | 92 | int v_matchproto_(main_f) 93 | main_store(const char *a0, struct aardwarc *aa, int argc, char **argv) 94 | { 95 | int ch; 96 | int fd = -1; 97 | const char *wt = NULL; 98 | const char *mt = "application/octet-stream"; 99 | struct header *hdr; 100 | struct getjob *gj; 101 | struct segjob *sj; 102 | char *id; 103 | const char *a00 = *argv; 104 | char *ibuf_ptr; 105 | const char *r_arg = NULL; 106 | const char *i_arg = NULL; 107 | const char *ref = NULL; 108 | ssize_t ibuf_len, rlen; 109 | struct vsb *vsb; 110 | const char *e; 111 | 112 | CHECK_OBJ_NOTNULL(aa, AARDWARC_MAGIC); 113 | 114 | while ((ch = getopt(argc, argv, "hi:m:t:r:")) != -1) { 115 | switch (ch) { 116 | case 'h': 117 | usage_store(a0, a00, NULL); 118 | exit(1); 119 | case 'i': 120 | if (i_arg != NULL) { 121 | usage_store(a0, a00, 122 | "More than one -i argument."); 123 | exit(1); 124 | } 125 | i_arg = optarg; 126 | break; 127 | case 'm': 128 | mt = optarg; 129 | break; 130 | case 't': 131 | if (wt != NULL) { 132 | usage_store(a0, a00, 133 | "More than one -t argument."); 134 | exit(1); 135 | } else if (!strcasecmp(optarg, "resource")) 136 | wt = WT_RESOURCE; 137 | else if (!strcasecmp(optarg, "metadata")) 138 | wt = WT_METADATA; 139 | else { 140 | usage_store(a0, a00, "Illegal -t argument."); 141 | exit(1); 142 | } 143 | break; 144 | case 'r': 145 | if (r_arg != NULL) { 146 | usage_store(a0, a00, 147 | "More than one -r argument."); 148 | exit(1); 149 | } 150 | r_arg = optarg; 151 | break; 152 | default: 153 | usage_store(a0, a00, "Unknown option error."); 154 | exit(1); 155 | } 156 | } 157 | argc -= optind; 
158 | argv += optind; 159 | 160 | if (wt == NULL) 161 | wt = WT_RESOURCE; 162 | 163 | if (wt != WT_METADATA) { 164 | if (r_arg != NULL) { 165 | fprintf(stderr, 166 | "Can only specify -r ID for metadata\n"); 167 | exit(1); 168 | } 169 | if (i_arg != NULL) { 170 | fprintf(stderr, 171 | "Can only specify -i ID for metadata\n"); 172 | exit(1); 173 | } 174 | } 175 | 176 | if (wt == WT_METADATA) { 177 | if (r_arg == NULL) { 178 | fprintf(stderr, "Must specify -r ID for metadata\n"); 179 | exit(1); 180 | } 181 | if (i_arg != NULL) { 182 | e = IDX_Valid_Id(aa, i_arg, NULL); 183 | if (e != NULL) { 184 | fprintf(stderr, "Illegal id (-i): %s\n", e); 185 | exit(1); 186 | } 187 | } 188 | 189 | e = IDX_Valid_Id(aa, r_arg, NULL); 190 | if (e != NULL) { 191 | usage_store(a0, a00, e); 192 | exit(1); 193 | } 194 | if (strlen(r_arg) == aa->id_size) 195 | ref = Digest2Ident(aa, r_arg); 196 | else 197 | ref = r_arg; 198 | } 199 | 200 | /* Figure out the input file ----------------------------------*/ 201 | 202 | if (argc == 0) { 203 | fd = 0; 204 | } else if (argc != 1) { 205 | usage_store(a0, a00, "Too many input files"); 206 | exit(1); 207 | } else if (!strcmp(*argv, "-")) { 208 | fd = 0; 209 | } else { 210 | fd = open(*argv, O_RDONLY); 211 | if (fd < 0) { 212 | fprintf(stderr, 213 | "Cannot open %s: %s\n", *argv, strerror(errno)); 214 | exit(1); 215 | } 216 | } 217 | 218 | ibuf_len = 128 * 1024; 219 | ibuf_ptr = malloc(ibuf_len); 220 | AN(ibuf_ptr); 221 | 222 | rlen = read(fd, ibuf_ptr, ibuf_len); 223 | if (rlen < 0) { 224 | fprintf(stderr, "Input file read error: %s\n", strerror(errno)); 225 | exit(1); 226 | } 227 | if (rlen == 0) { 228 | fprintf(stderr, "Input file empty\n"); 229 | exit(1); 230 | } 231 | 232 | /* Check the mime type ----------------------------------------*/ 233 | 234 | if (mime_type(aa, wt, mt)) 235 | exit(1); 236 | 237 | /* Create headers ---------------------------------------------*/ 238 | 239 | hdr = Header_New(aa); 240 | AN(hdr); 241 | Header_Set_Date(hdr); 
242 | Header_Set(hdr, "Content-Type", "%s", mt); 243 | Header_Set(hdr, "WARC-Type", "%s", wt); 244 | 245 | if (ref != NULL) { 246 | assert(wt == WT_METADATA); 247 | vsb = VSB_new_auto(); 248 | AN(vsb); 249 | gj = GetJob_New(aa, ref, vsb); 250 | if (gj == NULL) { 251 | AZ(VSB_finish(vsb)); 252 | fprintf(stderr, "Referenced (-r) ID does not exist:\n"); 253 | fprintf(stderr, "\t%s\n", VSB_data(vsb)); 254 | exit(1); 255 | } 256 | VSB_destroy(&vsb); 257 | GetJob_Delete(&gj); 258 | Header_Set(hdr, "WARC-Refers-To", "<%s>", ref); 259 | } 260 | 261 | sj = SegJob_New(aa, hdr, i_arg); 262 | AN(sj); 263 | 264 | SegJob_Feed(sj, ibuf_ptr, rlen); 265 | do { 266 | rlen = read(fd, ibuf_ptr, ibuf_len); 267 | if (rlen > 0) 268 | SegJob_Feed(sj, ibuf_ptr, rlen); 269 | } while (rlen > 0); 270 | 271 | id = SegJob_Commit(sj); 272 | printf("%s\n", id); 273 | 274 | REPLACE(ibuf_ptr, NULL); 275 | Header_Destroy(&hdr); 276 | 277 | return (0); 278 | } 279 | -------------------------------------------------------------------------------- /main_testbytes.c: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2016 Poul-Henning Kamp 3 | * All rights reserved. 4 | * 5 | * Author: Poul-Henning Kamp 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 1. Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * 2. Redistributions in binary form must reproduce the above copyright 13 | * notice, this list of conditions and the following disclaimer in the 14 | * documentation and/or other materials provided with the distribution. 
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 | * SUCH DAMAGE. 27 | * 28 | */ 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | #include "vdef.h" 36 | 37 | #include "vas.h" 38 | #include "sha256.h" 39 | #include "miniobj.h" 40 | 41 | #include "aardwarc.h" 42 | 43 | static 44 | void 45 | usage_testbytes(const char *a0, const char *a00, const char *err) 46 | { 47 | usage(a0, err); 48 | fprintf(stderr, "Usage for this operation:\n"); 49 | fprintf(stderr, "\t%s [global options] %s [options] {filename|-}\n", 50 | a0, a00); 51 | fprintf(stderr, "Options:\n"); 52 | fprintf(stderr, "\t-n number of bytes\n"); 53 | } 54 | 55 | int v_matchproto_(main_f) 56 | main__testbytes(const char *a0, struct aardwarc *aa, int argc, char **argv) 57 | { 58 | int ch; 59 | const char *a00 = *argv; 60 | unsigned long nbytes = 0; 61 | struct SHA256Context sha256[1]; 62 | unsigned char dig[32]; 63 | 64 | CHECK_OBJ_NOTNULL(aa, AARDWARC_MAGIC); 65 | 66 | while ((ch = getopt(argc, argv, "hn:")) != -1) { 67 | switch (ch) { 68 | case 'h': 69 | usage_testbytes(a0, a00, NULL); 70 | exit(1); 71 | case 'n': 72 | nbytes = strtoul(optarg, NULL, 0); 73 | break; 74 | default: 75 | usage_testbytes(a0, a00, "Unknown option error."); 76 | exit(1); 77 
| } 78 | } 79 | argc -= optind; 80 | argv += optind; 81 | 82 | AZ(argc); 83 | (void)argv; 84 | 85 | memset(dig, 0, sizeof dig); 86 | while (nbytes > 0) { 87 | SHA256_Init(sha256); 88 | SHA256_Update(sha256, dig, sizeof dig); 89 | SHA256_Final(dig, sha256); 90 | for (ch = 0; ch < 32 && nbytes > 0; ch++, nbytes--) 91 | fputc(dig[ch], stdout); 92 | } 93 | return (0); 94 | } 95 | -------------------------------------------------------------------------------- /miniobj.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Written by Poul-Henning Kamp 3 | * 4 | * This file is in the public domain. 5 | * 6 | */ 7 | 8 | #define ZERO_OBJ(to, sz) \ 9 | do { \ 10 | void *(*volatile z_obj)(void *, int, size_t) = memset; \ 11 | (void)z_obj(to, 0, sz); \ 12 | } while (0) 13 | 14 | #define INIT_OBJ(to, type_magic) \ 15 | do { \ 16 | (void)memset(to, 0, sizeof *(to)); \ 17 | (to)->magic = (type_magic); \ 18 | } while (0) 19 | 20 | #define ALLOC_OBJ(to, type_magic) \ 21 | do { \ 22 | (to) = calloc(1, sizeof *(to)); \ 23 | if ((to) != NULL) \ 24 | (to)->magic = (type_magic); \ 25 | } while (0) 26 | 27 | #define FREE_OBJ(to) \ 28 | do { \ 29 | ZERO_OBJ(&(to)->magic, sizeof (to)->magic); \ 30 | free(to); \ 31 | to = NULL; \ 32 | } while (0) 33 | 34 | #define VALID_OBJ(ptr, type_magic) \ 35 | ((ptr) != NULL && (ptr)->magic == (type_magic)) 36 | 37 | #define CHECK_OBJ(ptr, type_magic) \ 38 | do { \ 39 | assert((ptr)->magic == type_magic); \ 40 | } while (0) 41 | 42 | #define CHECK_OBJ_NOTNULL(ptr, type_magic) \ 43 | do { \ 44 | assert((ptr) != NULL); \ 45 | assert((ptr)->magic == type_magic); \ 46 | } while (0) 47 | 48 | #define CHECK_OBJ_ORNULL(ptr, type_magic) \ 49 | do { \ 50 | if ((ptr) != NULL) \ 51 | assert((ptr)->magic == type_magic); \ 52 | } while (0) 53 | 54 | #define CAST_OBJ(to, from, type_magic) \ 55 | do { \ 56 | (to) = (from); \ 57 | if ((to) != NULL) \ 58 | CHECK_OBJ((to), (type_magic)); \ 59 | } while (0) 60 | 61 | #define 
CAST_OBJ_NOTNULL(to, from, type_magic) \ 62 | do { \ 63 | (to) = (from); \ 64 | AN((to)); \ 65 | CHECK_OBJ((to), (type_magic)); \ 66 | } while (0) 67 | 68 | #define TAKE_OBJ_NOTNULL(to, pfrom, type_magic) \ 69 | do { \ 70 | AN((pfrom)); \ 71 | (to) = *(pfrom); \ 72 | *(pfrom) = NULL; \ 73 | CHECK_OBJ_NOTNULL((to), (type_magic)); \ 74 | } while (0) 75 | 76 | #define REPLACE(ptr, val) \ 77 | do { \ 78 | const char *_vreplace = (val); \ 79 | free(ptr); \ 80 | if (_vreplace != NULL) { \ 81 | ptr = strdup(_vreplace); \ 82 | AN((ptr)); \ 83 | } else { \ 84 | ptr = NULL; \ 85 | } \ 86 | } while (0) 87 | -------------------------------------------------------------------------------- /proto.c: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2016 Poul-Henning Kamp 3 | * All rights reserved. 4 | * 5 | * Author: Poul-Henning Kamp 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 1. Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * 2. Redistributions in binary form must reproduce the above copyright 13 | * notice, this list of conditions and the following disclaimer in the 14 | * documentation and/or other materials provided with the distribution. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 | * SUCH DAMAGE. 27 | * 28 | * The stow/stevedore protocol is really simple: 29 | * 30 | * +--+--+--+--+--+--+--+--+ 31 | * |Size |Reserved|Command | 32 | * +--+--+--+--+--+--+--+--+ 33 | * 34 | * Size: 35 | * 0 zero bytes 36 | * 1 32 bytes 37 | * 2 be8 length 38 | * 3 be32 length 39 | * 40 | * Cmd: 41 | * 0 Debug message 42 | * 1 sha256 for filtering 43 | * 2 send file 44 | * 3 metadata 45 | * 46 | */ 47 | 48 | #include 49 | #include 50 | #include 51 | #include 52 | #include 53 | #include 54 | #include 55 | #include 56 | #include 57 | 58 | #include "vdef.h" 59 | 60 | #include "vas.h" 61 | #include "vsb.h" 62 | #include "miniobj.h" 63 | 64 | #include "aardwarc.h" 65 | 66 | struct ev { 67 | unsigned magic; 68 | #define EV_MAGIC 0xff6b684d 69 | int fd; 70 | int idx; 71 | int enable; 72 | short events; 73 | proto_ev_func_f *func; 74 | void *priv; 75 | VTAILQ_ENTRY(ev) list; 76 | }; 77 | 78 | static VTAILQ_HEAD(,ev) evs = VTAILQ_HEAD_INITIALIZER(evs); 79 | static int nevs; 80 | 81 | uintptr_t 82 | proto_add_ev(int fd, short events, proto_ev_func_f *func, void *priv) 83 | { 84 | struct ev *ev; 85 | 86 | ALLOC_OBJ(ev, EV_MAGIC); 87 | AN(ev); 88 | ev->fd = fd; 89 | ev->events = events; 90 | ev->func = func; 91 | ev->priv = priv; 92 | ev->enable = 1; 93 | ev->idx = -1; 94 | VTAILQ_INSERT_TAIL(&evs, ev, list); 95 | nevs++; 96 | return (uintptr_t)ev; 97 | } 98 | 99 | void 100 | proto_del_ev(uintptr_t *id) 101 | { 102 | struct ev *ev; 103 | 104 | 
VTAILQ_FOREACH(ev, &evs, list) { 105 | if (*id == (uintptr_t)ev) 106 | break; 107 | } 108 | CHECK_OBJ_NOTNULL(ev, EV_MAGIC); 109 | VTAILQ_REMOVE(&evs, ev, list); 110 | AZ(close(ev->fd)); 111 | FREE_OBJ(ev); 112 | *id = 0; 113 | } 114 | 115 | void 116 | proto_ctl_ev(uintptr_t id, int enable) 117 | { 118 | struct ev *ev; 119 | 120 | VTAILQ_FOREACH(ev, &evs, list) { 121 | if (id == (uintptr_t)ev) 122 | break; 123 | } 124 | CHECK_OBJ_NOTNULL(ev, EV_MAGIC); 125 | ev->enable = enable; 126 | } 127 | 128 | void 129 | proto_dispatch_evs(void) 130 | { 131 | struct pollfd *fds = NULL; 132 | int nfds = 0, idx, i; 133 | struct ev *ev, *ev2; 134 | 135 | while (!VTAILQ_EMPTY(&evs)) { 136 | if (nfds < nevs) { 137 | fds = realloc(fds, sizeof *fds * nevs); 138 | nfds = nevs; 139 | } 140 | AN(fds); 141 | memset(fds, 0, sizeof *fds * nfds); 142 | idx = 0; 143 | VTAILQ_FOREACH(ev, &evs, list) { 144 | if (!ev->enable) { 145 | ev->idx = -1; 146 | continue; 147 | } 148 | fds[idx].fd = ev->fd; 149 | fds[idx].events = ev->events; 150 | ev->idx = idx++; 151 | } 152 | AN(idx); 153 | i = poll(fds, idx, -1); 154 | assert (i > 0); 155 | VTAILQ_FOREACH_SAFE(ev, &evs, list, ev2) { 156 | if (ev->idx < 0 || !fds[ev->idx].revents) 157 | continue; 158 | ev->func(ev->fd, ev->priv, fds[ev->idx].revents); 159 | } 160 | } 161 | free(fds); 162 | } 163 | 164 | int 165 | proto_in(int fd, unsigned *cmd, unsigned *len) 166 | { 167 | uint8_t u[129]; 168 | ssize_t i; 169 | int j; 170 | 171 | assert (fd >= 0); 172 | AN(cmd); 173 | AN(len); 174 | 175 | i = read(fd, u, 1); 176 | if (i == 0) 177 | return (0); 178 | if (i != 1) 179 | return (-1); 180 | *cmd = u[0] & 7; 181 | switch(u[0] >> 6) { 182 | case 0: 183 | *len = 0; 184 | break; 185 | case 1: 186 | *len = 32; 187 | break; 188 | case 2: 189 | i = read(fd, u + 1, 1); 190 | if (i != 1) 191 | return (-1); 192 | *len = u[1]; 193 | break; 194 | case 3: 195 | /* 196 | * Reads can return short, but not empty, so this is 197 | * the most fool-proof way to receive 4 
bytes. 198 | */ 199 | for (j = 1; j < 5; j++) { 200 | i = read(fd, u + j, 1); 201 | if (i != 1) 202 | return (-1); 203 | } 204 | *len = be32dec(u + 1); 205 | break; 206 | default: 207 | WRONG("Cannot happen"); 208 | } 209 | return (1); 210 | } 211 | 212 | void 213 | proto_out(int fd, unsigned cmd, const void *ptr, size_t len) 214 | { 215 | uint8_t u[5]; 216 | struct iovec iov[2]; 217 | ssize_t sz; 218 | 219 | assert(fd >= 0); 220 | AZ(cmd & ~7); 221 | if (len > 0) 222 | AN(ptr); 223 | 224 | iov[0].iov_base = u; 225 | iov[0].iov_len = 1; 226 | iov[1].iov_base = (void*)(uintptr_t)ptr; 227 | iov[1].iov_len = len; 228 | 229 | u[0] = (uint8_t)cmd; 230 | if (len == 0) { 231 | } else if (len == 32) { 232 | u[0] |= 1 << 6; 233 | } else if (len < 256) { 234 | u[0] |= 2 << 6; 235 | u[1] = (uint8_t)len; 236 | iov[0].iov_len += 1; 237 | } else { 238 | u[0] |= 3 << 6; 239 | be32enc(u + 1, len); 240 | iov[0].iov_len += 4; 241 | } 242 | sz = writev(fd, iov, len == 0 ? 1 : 2); 243 | if ((size_t)sz != iov[0].iov_len + iov[1].iov_len) { 244 | fprintf(stderr, "Write error on connection: %s\n", 245 | strerror(errno)); 246 | } 247 | } 248 | 249 | void 250 | proto_send_msg(int fd, const char *fmt, ...) 251 | { 252 | va_list ap; 253 | struct vsb *vsb; 254 | 255 | vsb = VSB_new_auto(); 256 | AN(vsb); 257 | va_start(ap, fmt); 258 | VSB_vprintf(vsb, fmt, ap); 259 | va_end(ap); 260 | AZ(VSB_finish(vsb)); 261 | proto_out(fd, 0, VSB_data(vsb), VSB_len(vsb)); 262 | } 263 | 264 | 265 | -------------------------------------------------------------------------------- /rsilo.c: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2016 Poul-Henning Kamp 3 | * All rights reserved. 4 | * 5 | * Author: Poul-Henning Kamp 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 1. 
Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * 2. Redistributions in binary form must reproduce the above copyright 13 | * notice, this list of conditions and the following disclaimer in the 14 | * documentation and/or other materials provided with the distribution. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 | * SUCH DAMAGE. 
27 | * 28 | */ 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | #define ZLIB_CONST 36 | #include 37 | 38 | #include "vdef.h" 39 | 40 | #include "vas.h" 41 | #include "vsb.h" 42 | #include "miniobj.h" 43 | 44 | #include "aardwarc.h" 45 | 46 | struct rsilo { 47 | unsigned magic; 48 | #define RSILO_MAGIC 0x61dd094a 49 | 50 | uint32_t silo_no; 51 | struct aardwarc *aa; 52 | 53 | char *silo_fn; 54 | int silo_fd; 55 | 56 | int64_t silo_bodylen; 57 | 58 | enum { 59 | RS_HEAD, 60 | RS_BODY, 61 | RS_CRLF} silo_where; 62 | }; 63 | 64 | /*---------------------------------------------------------------------*/ 65 | 66 | static void 67 | rsilo_seek(const struct rsilo *rs, int64_t o) 68 | { 69 | off_t o2; 70 | 71 | CHECK_OBJ_NOTNULL(rs, RSILO_MAGIC); 72 | assert(o >= 0); 73 | o2 = lseek(rs->silo_fd, (off_t)o, SEEK_SET); 74 | assert((int64_t)o2 == o); 75 | } 76 | 77 | /* Open a silo for reading --------------------------------------------*/ 78 | 79 | static struct rsilo * 80 | rsilo_open_fn(const char *fn, struct aardwarc *aa, uint32_t silono) 81 | { 82 | struct rsilo *rs; 83 | int fd; 84 | 85 | AN(fn); 86 | 87 | fd = open(fn, O_RDONLY); 88 | if (fd < 0) 89 | return (NULL); 90 | 91 | ALLOC_OBJ(rs, RSILO_MAGIC); 92 | if (rs == NULL) { 93 | AZ(close(fd)); 94 | return (NULL); 95 | } 96 | rs->silo_no = silono; 97 | rs->aa = aa; 98 | 99 | rs->silo_fd = fd; 100 | REPLACE(rs->silo_fn, fn); 101 | 102 | return (rs); 103 | } 104 | 105 | struct rsilo * 106 | Rsilo_Open(struct aardwarc *aa, const char *fn, uint32_t nsilo, int64_t off) 107 | { 108 | struct vsb *vsb = NULL; 109 | struct rsilo *rs; 110 | 111 | CHECK_OBJ_NOTNULL(aa, AARDWARC_MAGIC); 112 | 113 | if (fn == NULL) { 114 | vsb = Silo_Filename(aa, nsilo, 0); 115 | AN(vsb); 116 | rs = rsilo_open_fn(VSB_data(vsb), aa, nsilo); 117 | VSB_delete(vsb); 118 | } else { 119 | rs = rsilo_open_fn(fn, aa, 0xffffffff); 120 | } 121 | if (rs != NULL) { 122 | rsilo_seek(rs, off); 123 | rs->silo_where = RS_HEAD; 124 | } 125 | 
return (rs); 126 | } 127 | 128 | /* Close silo ---------------------------------------------------------*/ 129 | 130 | void 131 | Rsilo_Close(struct rsilo **p) 132 | { 133 | 134 | struct rsilo *rs; 135 | 136 | AN(p); 137 | rs = *p; 138 | *p = NULL; 139 | CHECK_OBJ_NOTNULL(rs, RSILO_MAGIC); 140 | REPLACE(rs->silo_fn, NULL); 141 | AZ(close(rs->silo_fd)); 142 | FREE_OBJ(rs); 143 | } 144 | 145 | /* Seek/Tell functions ------------------------------------------------*/ 146 | 147 | off_t 148 | Rsilo_Tell(const struct rsilo *rs) 149 | { 150 | off_t o; 151 | 152 | CHECK_OBJ_NOTNULL(rs, RSILO_MAGIC); 153 | o = lseek(rs->silo_fd, 0, SEEK_CUR); 154 | assert(o >= 0); 155 | return (o); 156 | } 157 | 158 | 159 | /* Read a WARC header -------------------------------------------------*/ 160 | 161 | struct header * 162 | Rsilo_ReadHeader(struct rsilo *rs) 163 | { 164 | z_stream zs[1]; 165 | int ps = getpagesize(); 166 | char ibuf[ps]; 167 | char obuf[ps + 1]; 168 | int i; 169 | 170 | CHECK_OBJ_NOTNULL(rs, RSILO_MAGIC); 171 | 172 | assert(rs->silo_where == RS_HEAD); 173 | i = read(rs->silo_fd, ibuf, ps); 174 | assert(i >= 0); 175 | if (i == 0) 176 | return (NULL); 177 | 178 | memset(zs, 0, sizeof zs); 179 | zs->next_in = (void*)ibuf; 180 | zs->avail_in = i; 181 | zs->next_out = (void*)obuf; 182 | zs->avail_out = sizeof obuf - 1; 183 | i = inflateInit2(zs, 15 + 32); 184 | assert(i == Z_OK); 185 | 186 | i = inflate(zs, 0); 187 | xxxassert(i == Z_STREAM_END); // One page is enough for everybody... 
188 | 189 | obuf[ps - zs->avail_out] = '\0'; 190 | 191 | rs->silo_bodylen = Gzip_ReadAa(zs->next_in, zs->avail_in); 192 | 193 | (void)lseek(rs->silo_fd, -(off_t)zs->avail_in, SEEK_CUR); 194 | 195 | i = inflateEnd(zs); 196 | assert(i == Z_OK); 197 | 198 | rs->silo_where = RS_BODY; 199 | return (Header_Parse(rs->aa, obuf)); 200 | } 201 | 202 | int64_t 203 | Rsilo_BodyLen(const struct rsilo *rs) 204 | { 205 | 206 | CHECK_OBJ_NOTNULL(rs, RSILO_MAGIC); 207 | assert(rs->silo_where == RS_BODY); 208 | return(rs->silo_bodylen); 209 | } 210 | 211 | void 212 | Rsilo_NextHeader(struct rsilo *rs) 213 | { 214 | off_t o; 215 | 216 | CHECK_OBJ_NOTNULL(rs, RSILO_MAGIC); 217 | 218 | assert(rs->silo_where == RS_BODY); 219 | o = lseek(rs->silo_fd, 220 | rs->silo_bodylen + (off_t)sizeof Gzip_crnlcrnl, SEEK_CUR); 221 | assert(o > 0); 222 | rs->silo_where = RS_HEAD; 223 | } 224 | 225 | /* Read a WARC body ---------------------------------------------------*/ 226 | 227 | int 228 | Rsilo_ReadGZChunk(struct rsilo *rs, byte_iter_f *func, void *priv) 229 | { 230 | int ps = getpagesize(); 231 | ssize_t sz; 232 | char ibuf[ps * 32]; 233 | int j; 234 | off_t ll = 0; 235 | 236 | CHECK_OBJ_NOTNULL(rs, RSILO_MAGIC); 237 | AN(func); 238 | assert(rs->silo_where == RS_BODY); 239 | 240 | do { 241 | sz = sizeof ibuf; 242 | if (sz > rs->silo_bodylen) 243 | sz = rs->silo_bodylen; 244 | sz = read(rs->silo_fd, ibuf, sz); 245 | if (sz <= 0) 246 | return(0); 247 | ll += sz; 248 | j = func(priv, ibuf, sz); 249 | if (j) 250 | return(0); 251 | rs->silo_bodylen -= sz; 252 | } while (rs->silo_bodylen > 0); 253 | rs->silo_where = RS_CRLF; 254 | return (ll); 255 | } 256 | 257 | /* Read a WARC body ---------------------------------------------------*/ 258 | 259 | uintmax_t 260 | Rsilo_ReadChunk(struct rsilo *rs, byte_iter_f *func, void *priv) 261 | { 262 | z_stream zs[1]; 263 | int ps = getpagesize(); 264 | char ibuf[ps * 100]; 265 | char obuf[ps * 100]; 266 | int i, j; 267 | 268 | CHECK_OBJ_NOTNULL(rs, RSILO_MAGIC); 
269 | AN(func); 270 | assert(rs->silo_where == RS_BODY); 271 | 272 | memset(zs, 0, sizeof zs); 273 | i = inflateInit2(zs, 15 + 32); 274 | assert(i == Z_OK); 275 | 276 | do { 277 | if (zs->avail_in == 0) { 278 | i = read(rs->silo_fd, ibuf, ps); 279 | assert(i > 0); 280 | zs->next_in = (void*)ibuf; 281 | zs->avail_in = i; 282 | } 283 | 284 | zs->next_out = (void*)obuf; 285 | zs->avail_out = sizeof obuf; 286 | 287 | i = inflate(zs, 0); 288 | assert(i >= Z_OK); 289 | if (zs->avail_out < sizeof obuf) 290 | j = func(priv, obuf, sizeof obuf - zs->avail_out); 291 | else 292 | j = 0; 293 | } while (i >= Z_OK && i != Z_STREAM_END && j == 0); 294 | 295 | if (zs->avail_in > 0) 296 | (void)lseek(rs->silo_fd, -(off_t)zs->avail_in, SEEK_CUR); 297 | 298 | rs->silo_where = RS_CRLF; 299 | i = inflateEnd(zs); 300 | assert(i == Z_OK); 301 | if (j != 0) 302 | return(0); 303 | return(zs->total_in); 304 | } 305 | 306 | /* Read a CRNLCRNL separator ------------------------------------------*/ 307 | 308 | void 309 | Rsilo_SkipCRNL(struct rsilo *rs) 310 | { 311 | uint8_t buf[sizeof Gzip_crnlcrnl]; 312 | ssize_t i; 313 | 314 | assert(rs->silo_where == RS_CRLF); 315 | i = read(rs->silo_fd, buf, sizeof buf); 316 | assert(i == sizeof buf); 317 | assert(!memcmp(buf, Gzip_crnlcrnl, sizeof buf)); 318 | rs->silo_where = RS_HEAD; 319 | } 320 | -------------------------------------------------------------------------------- /segjob.c: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2016 Poul-Henning Kamp 3 | * All rights reserved. 4 | * 5 | * Author: Poul-Henning Kamp 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 1. Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * 2. 
Redistributions in binary form must reproduce the above copyright 13 | * notice, this list of conditions and the following disclaimer in the 14 | * documentation and/or other materials provided with the distribution. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 | * SUCH DAMAGE. 
27 | * 28 | */ 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | #define ZLIB_CONST 36 | #include 37 | 38 | #include "vdef.h" 39 | 40 | #include "vas.h" 41 | #include "vsb.h" 42 | #include "miniobj.h" 43 | 44 | #include "aardwarc.h" 45 | 46 | /* 47 | * A segment of an object 48 | */ 49 | 50 | struct segment { 51 | unsigned magic; 52 | #define SEGMENT_MAGIC 0x28a64188 53 | VTAILQ_ENTRY(segment) list; 54 | 55 | int segno; 56 | struct header *hdr; 57 | struct wsilo *silo; 58 | off_t size; 59 | }; 60 | 61 | struct segjob { 62 | unsigned magic; 63 | #define SEGJOB_MAGIC 0x61a52fde 64 | 65 | struct aardwarc *aa; 66 | const struct header *hdr; 67 | const char *ident; 68 | 69 | int nseg; 70 | VTAILQ_HEAD(,segment) segments; 71 | struct segment *cur_seg; 72 | struct SHA256Context sha256_payload[1]; 73 | struct SHA256Context sha256_segment[1]; 74 | 75 | off_t size; 76 | size_t obuflen; 77 | z_stream gz[1]; 78 | int gz_flag; 79 | }; 80 | 81 | static void 82 | segjob_destroy(struct segjob *sj) 83 | { 84 | struct segment *sg; 85 | 86 | CHECK_OBJ_NOTNULL(sj, SEGJOB_MAGIC); 87 | while (!VTAILQ_EMPTY(&sj->segments)) { 88 | sg = VTAILQ_FIRST(&sj->segments); 89 | VTAILQ_REMOVE(&sj->segments, sg, list); 90 | if (sg->silo != NULL) 91 | Wsilo_Abandon(&sg->silo); 92 | Header_Destroy(&sg->hdr); 93 | FREE_OBJ(sg); 94 | } 95 | FREE_OBJ(sj); 96 | } 97 | 98 | static void 99 | segjob_newseg(struct segjob *sj) 100 | { 101 | struct segment *sg; 102 | char *digest; 103 | int pad = 0; 104 | intmax_t im; 105 | 106 | CHECK_OBJ_NOTNULL(sj, SEGJOB_MAGIC); 107 | AZ(sj->cur_seg); 108 | 109 | ALLOC_OBJ(sg, SEGMENT_MAGIC); 110 | AN(sg); 111 | 112 | sg->segno = ++sj->nseg; 113 | 114 | digest = SHA256_Data("", 0, NULL); 115 | AN(digest); 116 | 117 | sg->hdr = Header_Clone(sj->hdr); 118 | 119 | /* 120 | * No matter how hard we try, there is no way to predict the headers 121 | * precisely so we must reserve a padding space for the stuff we will 122 | * only find out later as the size 
increases. 123 | */ 124 | 125 | Header_Set(sg->hdr, "WARC-Block-Digest", "sha256:%s", digest); 126 | 127 | /* 128 | * We reserve two extra digits to allow up to 99% compression. 129 | * This also covers the case where data is already gzip'ed and 130 | * the C-L-G is longer than the C-L 131 | */ 132 | im = sj->aa->silo_maxsize; 133 | Header_Set(sg->hdr, "Content-Length", "00%jd", im); 134 | 135 | if (sg->segno == 1) { 136 | pad += Header_Len("WARC-Segment-Number", "1"); 137 | pad += Header_Len("WARC-Payload-Digest", "sha256:%s", digest); 138 | } else { 139 | Header_Set(sg->hdr, "WARC-Segment-Number", "%d", sg->segno); 140 | Header_Set(sg->hdr, "WARC-Type", "continuation"); 141 | Header_Set_Ref(sg->hdr, "WARC-Segment-Origin-ID", digest); 142 | 143 | /* In case this becomes the last segment */ 144 | im = sj->size + sj->aa->silo_maxsize; 145 | pad += Header_Len("WARC-Segment-Total-Length", "%00jd", im); 146 | } 147 | 148 | REPLACE(digest, NULL); 149 | 150 | sg->silo = Wsilo_Next(sj->aa); 151 | AN(sg->silo); 152 | Wsilo_Header(sg->silo, sg->hdr, pad); 153 | 154 | VTAILQ_INSERT_TAIL(&sj->segments, sg, list); 155 | 156 | SHA256_Init(sj->sha256_segment); 157 | Gzip_InitDeflate(sj->gz); 158 | sj->gz_flag = 0; 159 | Gzip_AddAa(sj->gz); 160 | sj->cur_seg = sg; 161 | } 162 | 163 | static void 164 | segjob_finishseg(struct segjob *sj) 165 | { 166 | char *dig; 167 | struct segment *sg; 168 | 169 | CHECK_OBJ_NOTNULL(sj, SEGJOB_MAGIC); 170 | 171 | sg = sj->cur_seg; 172 | sj->cur_seg = NULL; 173 | CHECK_OBJ_NOTNULL(sg, SEGMENT_MAGIC); 174 | 175 | assert(deflateEnd(sj->gz) == Z_OK); 176 | dig = SHA256_End(sj->sha256_segment, NULL); 177 | AN(dig); 178 | Header_Set(sg->hdr, "WARC-Block-Digest", "sha256:%s", dig); 179 | Header_Set(sg->hdr, "Content-Length", "%jd", (intmax_t)sg->size); 180 | Ident_Set(sj->aa, sg->hdr, dig, sg->segno == 1 ? 
sj->ident : NULL); 181 | Wsilo_Finish(sg->silo); 182 | REPLACE(dig, NULL); 183 | } 184 | 185 | struct segjob * 186 | SegJob_New(struct aardwarc *aa, const struct header *hdr, const char *ident) 187 | { 188 | struct segjob *sj; 189 | 190 | CHECK_OBJ_NOTNULL(aa, AARDWARC_MAGIC); 191 | AN(hdr); 192 | AN(Header_Get(hdr, "Content-Type")); 193 | AN(Header_Get(hdr, "WARC-Type")); 194 | AN(Header_Get(hdr, "WARC-Date")); 195 | AZ(Header_Get(hdr, "WARC-Segment-Number")); 196 | AZ(Header_Get(hdr, "WARC-Payload-Digest")); 197 | AZ(Header_Get(hdr, "WARC-Segment-Origin-ID")); 198 | AZ(Header_Get(hdr, "WARC-Segment-Total-Length")); 199 | assert(ident == NULL || IDX_Valid_Id(aa, ident, NULL) == NULL); 200 | 201 | ALLOC_OBJ(sj, SEGJOB_MAGIC); 202 | AN(sj); 203 | 204 | VTAILQ_INIT(&sj->segments); 205 | SHA256_Init(sj->sha256_payload); 206 | sj->aa = aa; 207 | sj->hdr = hdr; 208 | sj->ident = ident; 209 | 210 | return (sj); 211 | } 212 | 213 | static void 214 | segjob_setup_outbuf(struct segjob *sj, const struct segment *sg) 215 | { 216 | void *obuf_ptr; 217 | 218 | Wsilo_GetSpace(sg->silo, &obuf_ptr, &sj->obuflen); 219 | assert(sj->obuflen > 0); 220 | sj->obuflen -= sizeof Gzip_crnlcrnl; 221 | sj->gz->avail_out = sj->obuflen; 222 | sj->gz->next_out = obuf_ptr; 223 | } 224 | 225 | static void 226 | segjob_deflate(struct segjob *sj, const struct segment *sg) 227 | { 228 | int i; 229 | ssize_t len; 230 | 231 | i = deflate(sj->gz, sj->gz_flag); 232 | len = sj->obuflen - sj->gz->avail_out; 233 | assert(i == Z_OK || (sj->gz_flag == Z_FINISH && i == Z_STREAM_END)); 234 | 235 | len = sj->obuflen - sj->gz->avail_out; 236 | if (len > 0) 237 | AZ(Wsilo_Store(sg->silo, len)); 238 | } 239 | 240 | void 241 | SegJob_Feed(struct segjob *sj, const void *iptr, ssize_t ilen) 242 | { 243 | struct segment *sg; 244 | const char *ip = iptr; 245 | ssize_t len; 246 | void *ptr; 247 | 248 | CHECK_OBJ_NOTNULL(sj, SEGJOB_MAGIC); 249 | 250 | do { 251 | /* Get current segment --------------------------------*/ 
252 | 253 | if (sj->cur_seg == NULL) 254 | segjob_newseg(sj); 255 | 256 | sg = sj->cur_seg; 257 | CHECK_OBJ_NOTNULL(sg, SEGMENT_MAGIC); 258 | 259 | segjob_setup_outbuf(sj, sg); 260 | 261 | /* Steer gzip to fill silo almost exactly -------------*/ 262 | 263 | if (sj->obuflen < 52 || ilen == 0) { 264 | /* 40 is found by experiment, 52 for safety */ 265 | 266 | AZ(sj->gz->avail_in); 267 | 268 | /* Flush to byte-boundary, so we can do CRC-tricks */ 269 | sj->gz_flag = Z_SYNC_FLUSH; 270 | segjob_deflate(sj, sg); 271 | 272 | sj->gz_flag = Z_FINISH; 273 | segjob_setup_outbuf(sj, sg); 274 | segjob_deflate(sj, sg); 275 | 276 | Wsilo_GetSpace(sg->silo, &ptr, &sj->obuflen); 277 | assert(sj->obuflen > (ssize_t)sizeof Gzip_crnlcrnl); 278 | memcpy(ptr, Gzip_crnlcrnl, sizeof Gzip_crnlcrnl); 279 | AZ(Wsilo_Store(sg->silo, sizeof Gzip_crnlcrnl)); 280 | 281 | segjob_finishseg(sj); 282 | continue; 283 | } 284 | 285 | if (sj->gz->avail_in == 0) { 286 | /* 287 | * At most we pass in half as much data as we have 288 | * output space for, measured in bytes so we avoid 289 | * Zeno's Paradox about Achilles and the Tortoise. 290 | */ 291 | len = sj->gz->avail_out >> 1; 292 | if (len > ilen) 293 | len = ilen; 294 | assert(len > 0); 295 | 296 | sj->gz->avail_in = len; 297 | sj->gz->next_in = (void*)(uintptr_t)ip; 298 | 299 | sj->size += len; 300 | sg->size += len; 301 | SHA256_Update(sj->sha256_segment, ip, len); 302 | SHA256_Update(sj->sha256_payload, ip, len); 303 | 304 | ilen -= len; 305 | ip += len; 306 | } 307 | 308 | if (sj->gz->avail_out < 128 * 1024) { 309 | /* 310 | * From here on we flush all gzip output in order 311 | * to not get surprised by a big lump later on. 312 | * Experiments indicate that the required limit 313 | * may be as low as 75K. 314 | * We play it safe with 128K at the cost of 0.02% 315 | * less efficient compression. 
316 | */ 317 | sj->gz_flag = Z_PARTIAL_FLUSH; 318 | } 319 | 320 | segjob_deflate(sj, sg); 321 | 322 | } while (sj->gz->avail_in > 0 || ilen > 0); 323 | } 324 | 325 | char * 326 | SegJob_Commit(struct segjob *sj) 327 | { 328 | char *id; 329 | struct segment *sg, *sgn; 330 | const char *fid, *rid; 331 | struct getjob *gj; 332 | struct vsb *vsb; 333 | 334 | CHECK_OBJ_NOTNULL(sj, SEGJOB_MAGIC); 335 | SegJob_Feed(sj, "", 0); 336 | AN(sj->size); 337 | 338 | sg = VTAILQ_FIRST(&sj->segments); 339 | AN(sg); 340 | 341 | if (sj->nseg > 1) { 342 | /* Update ID of first segment */ 343 | id = SHA256_End(sj->sha256_payload, NULL); 344 | AN(id); 345 | Header_Set(sg->hdr, "WARC-Payload-Digest", "sha256:%s", id); 346 | Ident_Set(sj->aa, sg->hdr, id, sj->ident); 347 | REPLACE(id, NULL); 348 | } 349 | 350 | fid = Header_Get_Id(sg->hdr); 351 | id = Digest2Ident(sj->aa, fid); 352 | 353 | vsb = VSB_new_auto(); 354 | AN(vsb); 355 | gj = GetJob_New(sj->aa, fid, vsb); 356 | if (gj != NULL) { 357 | GetJob_Delete(&gj); 358 | fprintf(stderr, "ID %s already in archive\n", fid); 359 | segjob_destroy(sj); 360 | return (id); 361 | } 362 | 363 | if (sj->nseg == 1) { 364 | Wsilo_Commit(&sg->silo, 0, fid, NULL); 365 | segjob_destroy(sj); 366 | return (id); 367 | } 368 | 369 | VTAILQ_FOREACH(sg, &sj->segments, list) { 370 | if (sg->segno == 1) 371 | Header_Set(sg->hdr, "WARC-Segment-Number", "1"); 372 | else 373 | Header_Set_Ref(sg->hdr, "WARC-Segment-Origin-ID", fid); 374 | 375 | sgn = VTAILQ_NEXT(sg, list); 376 | if (sgn == NULL) { 377 | Header_Set(sg->hdr, "WARC-Segment-Total-Length", 378 | "%jd", (intmax_t)sj->size); 379 | rid = NULL; 380 | } else { 381 | rid = Header_Get_Id(sgn->hdr); 382 | } 383 | 384 | Wsilo_Commit(&sg->silo, 1, Header_Get_Id(sg->hdr), rid); 385 | } 386 | return (id); 387 | } 388 | -------------------------------------------------------------------------------- /silo.c: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 
2016 Poul-Henning Kamp 3 | * All rights reserved. 4 | * 5 | * Author: Poul-Henning Kamp 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 1. Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * 2. Redistributions in binary form must reproduce the above copyright 13 | * notice, this list of conditions and the following disclaimer in the 14 | * documentation and/or other materials provided with the distribution. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 | * SUCH DAMAGE. 27 | * 28 | */ 29 | 30 | #include 31 | #include 32 | 33 | #include "vdef.h" 34 | 35 | #include "vas.h" 36 | #include "vsb.h" 37 | #include "miniobj.h" 38 | 39 | #include "aardwarc.h" 40 | 41 | /* Construct a silo filename ------------------------------------------ 42 | * 43 | * We want to limit the number of silos in each directory for any 44 | * number of reasons, but we don't want to commit to a particular 45 | * depth up front. 
46 | * 47 | * This code build an adaptive hierarchy: 48 | * prefix/0/{100 silos} 49 | * prefix/1/{100 subdirs}/{100 silos} 50 | * prefix/2/{100 subdirs}/{100 subdirs}/{100 silos} 51 | * ... 52 | * 53 | * A directory with 100 silos is approx 4K large. 54 | * 55 | */ 56 | 57 | static void 58 | numpart(struct vsb *vsb, int lvl, unsigned num) 59 | { 60 | if (num >= 100U) 61 | numpart(vsb, lvl + 1, num / 100U); 62 | else 63 | VSB_printf(vsb, "%d/", lvl); 64 | if (lvl > 0) 65 | VSB_printf(vsb, "%02u/", num % 100U); 66 | } 67 | 68 | struct vsb * 69 | Silo_Filename(const struct aardwarc *aa, unsigned number, int hold) 70 | { 71 | struct vsb *vsb; 72 | CHECK_OBJ_NOTNULL(aa, AARDWARC_MAGIC); 73 | 74 | vsb = VSB_new_auto(); 75 | AN(vsb); 76 | VSB_cat(vsb, aa->silo_dirname); 77 | numpart(vsb, 0, number); 78 | VSB_printf(vsb, aa->silo_basename, number); 79 | if (hold) 80 | VSB_cat(vsb, ".hold"); 81 | AZ(VSB_finish(vsb)); 82 | return (vsb); 83 | } 84 | 85 | int 86 | Silo_Iter(const struct aardwarc *aa, byte_iter_f *func, void *priv) 87 | { 88 | struct vsb *vsb; 89 | uint32_t u; 90 | struct stat st; 91 | int i, retval = 0; 92 | 93 | CHECK_OBJ_NOTNULL(aa, AARDWARC_MAGIC); 94 | AN(func); 95 | 96 | for (u = 0; retval == 0; u++) { 97 | vsb = VSB_new_auto(); 98 | AN(vsb); 99 | VSB_cat(vsb, aa->silo_dirname); 100 | numpart(vsb, 0, u); 101 | AZ(VSB_finish(vsb)); 102 | i = stat(VSB_data(vsb), &st); 103 | if (i && errno == ENOENT) 104 | break; 105 | VSB_delete(vsb); 106 | vsb = Silo_Filename(aa, u, 0); 107 | i = stat(VSB_data(vsb), &st); 108 | if (!i && S_ISREG(st.st_mode)) 109 | retval = func(priv, VSB_data(vsb), u); 110 | VSB_delete(vsb); 111 | } 112 | return (0); 113 | } 114 | -------------------------------------------------------------------------------- /tests/alltest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | for i in test??.sh 6 | do 7 | sh $i 8 | done 9 | 
-------------------------------------------------------------------------------- /tests/gcov_report.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import sys 4 | 5 | r1 = [] 6 | r2 = [] 7 | 8 | for i in sys.stdin: 9 | j = i.replace(':', ' ').split() 10 | if not j: 11 | continue 12 | if j[0] in ('File', 'Function'): 13 | subj = j 14 | elif j[0] == 'Lines': 15 | assert j[2][-1] == '%' 16 | j[2] = j[2][:-1] 17 | p = float(j[2]) 18 | l = int(j[4]) 19 | m = (100.0 - p) * l * .01 20 | if subj[0] == 'File': 21 | r1.append([m, p, l, subj[0], subj[1]]) 22 | else: 23 | r2.append([m, p, l, subj[0], subj[1]]) 24 | 25 | for r in (r1, r2): 26 | print('Percent Lines Missing') 27 | print('=' * 60) 28 | r.sort() 29 | tl = 0 30 | tm = 0 31 | for m, p,l,t,n in r: 32 | tl += l 33 | tm += m 34 | print("%6.2f" % p, " %4d" % l, " %4d" % m, " ", t, n) 35 | print("") 36 | 37 | print('Percent Lines Missing') 38 | print('-' * 60) 39 | print("%6.2f" % (100.0 * (tl-tm) / tl), " %4d" % tl, " %4d" % tm) 40 | print('-' * 60) 41 | 42 | -------------------------------------------------------------------------------- /tests/test.rc: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | if [ "x${AA}" == "x" ] ; then 6 | AA=`pwd`/../aardwarc 7 | fi 8 | ADIR=${TMPDIR-/tmp}/_test_aardward 9 | mkdir -p ${ADIR} 10 | AXEC="${AA} -c ${ADIR}/aardwarc.conf" 11 | 12 | new_aardwarc() ( 13 | rm -rf ${ADIR} 14 | mkdir -p ${ADIR} 15 | 16 | ( 17 | echo "WARC-Record-ID:" 18 | echo " file://`pwd`/$$/ 128" 19 | echo "" 20 | echo "warcinfo.body:" 21 | echo " description: http://github/bsdphk/Aardwarc testrun" 22 | echo "" 23 | echo "silo.directory:" 24 | echo " ${ADIR}/" 25 | echo "" 26 | echo "silo.max_size:" 27 | echo " 15k" 28 | echo "" 29 | echo "silo.basename:" 30 | echo " %08u.warc.gz" 31 | echo "" 32 | echo "resource.mime-types:" 33 | echo " application/octet-stream" 34 | echo " text/plain" 35 | echo 
"" 36 | echo "metadata.mime-types:" 37 | echo " text/plain" 38 | echo " application/json" 39 | echo "" 40 | ) > ${ADIR}/aardwarc.conf 41 | 42 | ) 43 | 44 | fail ( ) ( 45 | expect=$1 46 | shift 47 | pattern=$1 48 | shift 49 | echo "#### $0 '$*'" 50 | set +e 51 | $* > ${ADIR}/_ 2>&1 52 | status=$? 53 | set -e 54 | if [ $status -ne $expect ] ; then 55 | echo "#### $0 Expected status $expect got status $status" 56 | sed 's/^/ /' ${ADIR}/_ 57 | exit 1 58 | fi 59 | if ! egrep -q "$pattern" ${ADIR}/_ ; then 60 | echo "#### $0 pattern $pattern not found in" 61 | sed 's/^/ /' ${ADIR}/_ 62 | exit 1 63 | fi 64 | ) 65 | 66 | -------------------------------------------------------------------------------- /tests/test00.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # Basic store/get test 4 | 5 | set -e 6 | 7 | . test.rc 8 | 9 | new_aardwarc 10 | 11 | for i in ../* 12 | do 13 | if [ ! -f $i ] ; then 14 | continue 15 | fi 16 | echo "#### $0 $i" 17 | 18 | # Store the file 19 | cp $i _1 20 | ${AXEC} store -t resource -m application/octet-stream _1 > _2 21 | ${AXEC} audit 22 | 23 | # Store the metadata 24 | ls -l $i > _1m 25 | sha256 $i >> _1m 26 | ${AXEC} store -t metadata -m text/plain -r `cat _2` _1m > _2m 27 | ${AXEC} audit 28 | 29 | # Get them both back again 30 | ${AXEC} get -o _3 `cat _2` > _4 31 | ${AXEC} get -o _3m `cat _2m` > _4m 32 | ${AXEC} get -n -o _3mn `cat _2m` > _4mn 33 | 34 | # Get back also in gzip'ed format 35 | ${AXEC} get -z -o _5 `cat _2` > _6 36 | ${AXEC} get -z -o _5m `cat _2m` > _6m 37 | 38 | # Check payload is identical 39 | cmp _1 _3 40 | cmp _1m _3m 41 | 42 | # Check gzip'ed payload is identical 43 | zcat _5 | cmp - _1 44 | zcat _5m | cmp - _1m 45 | 46 | # Check headers are the same 47 | diff _4 _6 48 | diff _4m _6m 49 | diff _4mn _6m 50 | 51 | # Check ID headers 52 | fgrep -q "WARC-Record-ID: <`cat _2`>" _4 53 | fgrep -q "WARC-Record-ID: <`cat _2m`>" _4m 54 | fgrep -q "WARC-Refers-To: <`cat 
_2`>" _4m 55 | 56 | # Check other headers 57 | fgrep -q "WARC-Type: resource" _4 58 | fgrep -q "Content-Type: application/octet-stream" _4 59 | fgrep -q "WARC-Type: metadata" _4m 60 | fgrep -q "Content-Type: text/plain" _4m 61 | fgrep -q "WARC-Type: resource" _6 62 | fgrep -q "Content-Type: application/octet-stream" _6 63 | fgrep -q "WARC-Type: metadata" _6m 64 | fgrep -q "Content-Type: text/plain" _6m 65 | 66 | # Check Content-Length headers 67 | l3=`stat -f '%z' _3` 68 | fgrep -q "Content-Length: $l3" _4 69 | l3m=`stat -f '%z' _3m` 70 | fgrep -q "Content-Length: $l3m" _4m 71 | 72 | # Check Digest headers 73 | s3=`sha256 < _3` 74 | fgrep -q "WARC-Block-Digest: sha256:$s3" _4 75 | s3m=`sha256 < _3m` 76 | fgrep -q "WARC-Block-Digest: sha256:$s3m" _4m 77 | s5=`zcat _5 | sha256` 78 | fgrep -q "WARC-Block-Digest: sha256:$s5" _6 79 | s5m=`zcat _5m | sha256` 80 | fgrep -q "WARC-Block-Digest: sha256:$s5m" _6m 81 | done 82 | 83 | echo "## $0 DONE" 84 | rm -f _[1-6]* 85 | -------------------------------------------------------------------------------- /tests/test01.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # Audit 4 | 5 | set -ex 6 | 7 | . test.rc 8 | 9 | ${AXEC} audit 10 | -------------------------------------------------------------------------------- /tests/test02.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # Index housekeeping & rebuild 4 | 5 | set -e 6 | 7 | . 
test.rc 8 | 9 | LN_I1=`${AXEC} dumpindex | sort -u | wc -l` 10 | LN_M1=`${AXEC} dumpindex | sort -u | md5` 11 | 12 | echo "#### $0 Housekeeping" 13 | ${AXEC} housekeeping 14 | 15 | LN_I2=`${AXEC} dumpindex | sort -u | wc -l` 16 | LN_M2=`${AXEC} dumpindex | sort -u | md5` 17 | 18 | if [ ${LN_I1} != ${LN_I2} ] ; then 19 | echo "Index changed length on housekeeping" 20 | exit 1 21 | fi 22 | if [ ${LN_M1} != ${LN_M2} ] ; then 23 | echo "Index changed content on housekeeping" 24 | exit 1 25 | fi 26 | 27 | echo "#### $0 Reindex" 28 | rm -f ${ADIR}/index.sorted ${ADIR}/index.appendix 29 | ${AXEC} reindex 30 | 31 | LN_I3=`${AXEC} dumpindex | sort -u | wc -l` 32 | LN_M3=`${AXEC} dumpindex | sort -u | md5` 33 | 34 | if [ ${LN_I1} != ${LN_I3} ] ; then 35 | echo "Index changed length on reindex" 36 | exit 1 37 | fi 38 | if [ ${LN_M1} != ${LN_M3} ] ; then 39 | echo "Index changed content on reindex" 40 | exit 1 41 | fi 42 | -------------------------------------------------------------------------------- /tests/test03.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # Stow 4 | 5 | set -e 6 | 7 | . test.rc 8 | 9 | if [ -f ${ADIR}/aardwarc.conf ] ; then 10 | sed -i '' '/^stow.test03/,$d' ${ADIR}/aardwarc.conf 11 | else 12 | new_aardwarc 13 | fi 14 | 15 | ( 16 | echo '.git' 17 | echo '*.gcda' 18 | ) > ${ADIR}/_stow.exclude 19 | 20 | ( 21 | echo "stow.test03:" 22 | echo " directory `pwd`/.." 23 | echo " exclude ${ADIR}/_stow.exclude" 24 | echo " cmd ${AXEC} stevedore" 25 | echo "" 26 | echo "stow.test03a:" 27 | echo " directory `pwd`/.." 
28 | echo " cmd sleep 1 ; exit 2" 29 | echo "" 30 | echo "stow.test03b:" 31 | echo " directory ${ADIR}/test3b" 32 | echo " cmd ${AXEC} stevedore" 33 | echo "" 34 | ) >> ${ADIR}/aardwarc.conf 35 | 36 | echo "#### $0 Stow" 37 | #fail 0 'Done job test03' ${AXEC} stow test03 38 | #fail 2 '(Remote) command failed' ${AXEC} stow test03a 39 | 40 | # Test 3b is a file which changed underway 41 | 42 | rm -rf ${ADIR}/test3b 43 | mkdir ${ADIR}/test3b 44 | for s in d1 d2 d3 45 | do 46 | mkdir -p ${ADIR}/test3b/$s 47 | date > ${ADIR}/test3b/$s/file 48 | done 49 | 50 | echo 'abcdefghij' > ${ADIR}/test3b/_stow_canary1 51 | echo '0123456789' > ${ADIR}/test3b/_stow_canary2 52 | 53 | fail 0 'Done job test03b' ${AXEC} stow test03b 54 | 55 | echo 'IIIIIIIVV' > ${ADIR}/test3b/_stow_canary3 56 | echo 'yksikaksi' > ${ADIR}/test3b/_stow_canary4 57 | 58 | ( 59 | echo '#!' 60 | echo "ADIR=${ADIR}" 61 | 62 | echo ' 63 | 64 | /usr/sbin/mtree $* > ${ADIR}/_mt 65 | echo 'ABCDEfghij' > ${ADIR}/test3b/_stow_canary3 66 | 67 | # XXX: Fails with panic 68 | # rm ${ADIR}/test3b/_stow_canary4 69 | 70 | cat ${ADIR}/_mt 71 | ' 72 | ) > ${ADIR}/mtree 73 | 74 | chmod +x ${ADIR}/mtree 75 | 76 | P0=${PATH} 77 | export PATH=${ADIR}:${PATH} 78 | fail 0 'Done job test03b' ${AXEC} stow test03b 79 | export PATH=${P0} 80 | -------------------------------------------------------------------------------- /tests/test04.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # Coverage tests 4 | 5 | set -e 6 | 7 | . 
test.rc 8 | 9 | sed -i '' '/^stow.test04/,$d' ${ADIR}/aardwarc.conf 10 | 11 | ( 12 | echo "stow.test04_a:" 13 | echo " exclude `pwd`/stow.exclude" 14 | echo " cmd `pwd`/${AXEC} stevedore" 15 | echo "" 16 | echo "stow.test04_b:" 17 | echo " exclude `pwd`/stow.exclude" 18 | echo " directory /nonexistent" 19 | echo " cmd `pwd`/${AXEC} stevedore" 20 | echo "" 21 | echo "stow.test04_c:" 22 | echo " exclude `pwd`/stow.exclude" 23 | echo " mumble" 24 | echo " cmd `pwd`/${AXEC} stevedore" 25 | echo "" 26 | ) >> ${ADIR}/aardwarc.conf 27 | 28 | echo "#### $0 top usage" 29 | 30 | fail 1 'Usage' ${AXEC} -h 31 | fail 1 'Unknown global option' ${AXEC} $p -x 32 | fail 1 'Need command argument' ${AXEC} 33 | fail 1 'Unknown operation' ${AXEC} xyzzy 34 | 35 | echo "#### $0 util usage" 36 | for p in \ 37 | audit \ 38 | byid \ 39 | cgi \ 40 | dumpindex \ 41 | filter \ 42 | get \ 43 | housekeeping \ 44 | info \ 45 | reindex \ 46 | stevedore \ 47 | store \ 48 | stow \ 49 | _testbytes 50 | do 51 | fail 1 'Usage' ${AXEC} $p -h 52 | fail 1 'Unknown option' ${AXEC} $p -x 53 | done 54 | 55 | fail 2 'This subcommand does not do JSON' \ 56 | ${AXEC} -j reindex 57 | 58 | # This will fail in two different ways, depending on if 59 | # it finds a config file somewhere or not. 
60 | fail 1 '(No config file found, tried|Unknown operation)' \ 61 | env HOME=/nonexistent ${AA} xyzzy 62 | 63 | fail 2 'Cannot open' \ 64 | ${AA} -c /dev/null xyzzy 65 | 66 | echo "#### $0 stow Argument and Usage code" 67 | fail 1 'Must specify' ${AXEC} stow 68 | fail 1 'Cannot find stow.blah' ${AXEC} stow blah 69 | fail 1 'have no directory' ${AXEC} stow test04_a 70 | fail 1 'Cannot open target directory' ${AXEC} stow test04_b 71 | fail 1 'has unknown config' ${AXEC} stow test04_c 72 | 73 | echo "#### $0 stevedore Argument and Usage code" 74 | fail 1 'Usage' ${AXEC} stevedore xyz 75 | 76 | echo "#### $0 store Argument and Usage code" 77 | fail 1 'More than one -t argument' \ 78 | ${AXEC} store -t resource -t metadata 79 | fail 1 'Illegal -t argument' \ 80 | ${AXEC} store -t warcinfo 81 | fail 1 'Can only specify -r ID for metadata' \ 82 | ${AXEC} store -t resource -r 1 83 | fail 1 'Can only specify -i ID for metadata' \ 84 | ${AXEC} store -t resource -i 1 85 | fail 1 'More than one -r argument' \ 86 | ${AXEC} store -t metadata -r 1 -r 2 87 | fail 1 'More than one -i argument' \ 88 | ${AXEC} store -t metadata -i 1 -i 1 89 | fail 1 'Must specify -r ID for metadata' \ 90 | ${AXEC} store -t metadata -i 1 91 | fail 1 'Illegal id .-i.:' \ 92 | ${AXEC} store -t metadata -r foobar -i ____ 93 | fail 1 'ID is invalid .non-hex characters.' \ 94 | ${AXEC} store -t metadata -r foobar 95 | fail 1 'Too many input files' \ 96 | ${AXEC} store -t resource a b 97 | fail 1 'Cannot open /nonexistent' \ 98 | ${AXEC} store -t resource /nonexistent 99 | fail 1 'Input file empty' \ 100 | ${AXEC} store -t resource /dev/null 101 | 102 | ${AXEC} store -t metadata -m text/plain \ 103 | -r `sha256 < ../main_store.c | cut -c1-32` test.rc > /dev/null 104 | 105 | fail 1 'Illegal mime-type' \ 106 | ${AXEC} store -t metadata -m text/weird \ 107 | -r `sha256 < ../main_store.c | cut -c1-32` test.rc 108 | 109 | fail 1 'Referenced .-r. 
ID does not exist' \ 110 | ${AXEC} store -t metadata -m text/plain \ 111 | -i 00000000000000000000000000000000 \ 112 | -r 00000000000000000000000000000001 test.rc 113 | 114 | # Find ID of warcinfo record 115 | m=`${AXEC} dumpindex | awk '$2 == "0x00000002" {print $1 ; exit(0)}'` 116 | mm=`${AXEC} byid $m | awk '{print $2}'` 117 | 118 | fail 1 'Referenced .-r. ID does not exist' \ 119 | ${AXEC} store -t metadata -m text/plain \ 120 | -i 00000000000000000000000000000000 \ 121 | -r "$mm" test.rc 122 | 123 | ${AXEC} store -t metadata -m text/plain \ 124 | -i 00000000000000000000000000000000 \ 125 | -r `sha256 < ../main_store.c | cut -c1-32` test.rc > /dev/null 126 | 127 | -------------------------------------------------------------------------------- /tests/test05.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # Coverage tests 4 | 5 | set -e 6 | 7 | . test.rc 8 | 9 | unset GATEWAY_INTERFACE 10 | unset REQUEST_METHOD 11 | unset PATH_INFO 12 | unset HTTP_ACCEPT_ENCODING 13 | 14 | fail 1 'Too many arguments' \ 15 | ${AXEC} cgi foo 16 | 17 | fail 1 'No .good. [$]GATEWAY_INTERFACE' \ 18 | ${AXEC} cgi 19 | 20 | export GATEWAY_INTERFACE=CGI/1.0 21 | fail 1 'No .good. [$]GATEWAY_INTERFACE' \ 22 | ${AXEC} cgi 23 | 24 | export GATEWAY_INTERFACE=CGI/1.1 25 | fail 1 'No .good. [$]REQUEST_METHOD' \ 26 | ${AXEC} cgi 27 | 28 | export REQUEST_METHOD="PUT" 29 | fail 1 'No .good. [$]REQUEST_METHOD' \ 30 | ${AXEC} cgi 31 | 32 | export REQUEST_METHOD="GET" 33 | fail 1 'No [$]PATH_INFO' \ 34 | ${AXEC} cgi 35 | 36 | export PATH_INFO="/000000" 37 | fail 0 'ID is invalid .too short.' 
\ 38 | ${AXEC} cgi 39 | 40 | i=`sha256 < ../README.md | cut -c1-32` 41 | export PATH_INFO=$i 42 | fail 0 'Museum-quality bit-archive storage management' \ 43 | ${AXEC} cgi 44 | export HTTP_ACCEPT_ENCODING=gzip 45 | fail 0 'Content-Encoding: gzip' \ 46 | ${AXEC} cgi 47 | -------------------------------------------------------------------------------- /utilities/select_silos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- encoding: utf-8 -*- 3 | # 4 | # SPDX-License-Identifier: BSD-2-Clause 5 | # 6 | # Copyright (c) 2018, Poul-Henning Kamp 7 | # All rights reserved. 8 | # 9 | # Redistribution and use in source and binary forms, with or without 10 | # modification, are permitted provided that the following conditions are met: 11 | # 12 | # * Redistributions of source code must retain the above copyright notice, this 13 | # list of conditions and the following disclaimer. 14 | # 15 | # * Redistributions in binary form must reproduce the above copyright notice, 16 | # this list of conditions and the following disclaimer in the documentation 17 | # and/or other materials provided with the distribution. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | # DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | 30 | ''' 31 | AardWARC encrypted backup utility 32 | ================================= 33 | 34 | Depending what you put in your AardWARC silos, you may want to encrypt 35 | (off-site) backups. This python script does that, in a very specific 36 | way, focused on survivability of the collection. 37 | 38 | Overview 39 | -------- 40 | 41 | Every time this script is run, a number of silos will be read from 42 | `SILODIR`, encrypted, hashed, hmac'ed and written to the `STAGING` 43 | directory. 44 | 45 | For each silo three files are produced: 46 | 47 | The encrypted silo: 48 | 49 | ########.warc.gz.bin 50 | 51 | The SHA256 hash of the encrypted silo: 52 | (This can be used for offline integrity checks at remote sites) 53 | 54 | ########.warc.gz.bin.sha256 55 | 56 | The HMAC of the encrypted silo: 57 | (This can be used to verify integrity of files retrieved from remote sites) 58 | 59 | ########.warc.gz.bin.hmac 60 | 61 | The files from the `STAGING` directory can be copied across the net 62 | with rsync(1) in either push or pull mode - or with any other program 63 | or protocol for that matter. 64 | 65 | The `STAGING` directory is renamed into place, so rsync(1) will never 66 | see any partially produced files. 

Cryptography
------------

Encryption is done with Colin-Approved™ AES256-CTR:

	https://www.daemonology.net/blog/2009-06-24-encrypt-then-mac.html

Furthermore, the CTR mode has the important property that bit-errors
in the encrypted silo do not cascade to the rest of the silo, which
means that recovery from a damaged encrypted backup copy will contain
the damage to the directly hit WARC records.

The function `get_secrets` is responsible for returning the 256 bit
encryption key and the 128 bit secret from which the per-file IV and
the HMAC are derived, feel free to customize.  The default
is to read them directly from a file named `secrets.txt`, which
should contain (only!) two strings of 64 respectively 32 random
hexdigits, separated by whitespace.

One way to produce a `secrets.txt` is:

	dd if=/dev/random count=1 | sha256 > secrets.txt
	dd if=/dev/random count=1 | md5 >> secrets.txt

Scheduling
----------

A maximum of `QUOTA` bytes is scheduled for backup every time the
script is run.  This limits both the amount of disk space needed
for the `STAGING` directory (twice `QUOTA` !), and the amount of
data a remote backup site could have to download every day.

The silos to be staged are selected in order of bytes added since
last staged, with time since last staged as the secondary criterion.

This means that silos will be offered for backup periodically, so
that remote sites starting from scratch will eventually catch up.

The information about when a silo was last encrypted and what size
it had then is cached in the file `silodates.txt`

Security Considerations
-----------------------

Obviously: Protect the keys.

The suggested setup is:

	Silos are created with mode 0640, and their group can be used to
	grant read-only access to them.
117 | 118 | Make sure to set the directory group owner appropriately. 119 | 120 | user=stevedore, uid=700, gid=700, member of silo-read group. 121 | 122 | Also member of a group to get read access to the silos (if necessary). 123 | 124 | Runs this script, encrypted files end up with 700:700 in `STAGING` 125 | 126 | `secret.txt` is stored in ~stevedore, mode 400 127 | 128 | This script should be run from the stevedore users crontab 129 | 130 | user=backup, uid=701, gid=701, member of group 700 131 | 132 | Used for running rsync. 133 | 134 | Can read encrypted copies in `STAGING` but not the 'raw' silos. 135 | 136 | ''' 137 | 138 | import os 139 | import math 140 | import time 141 | import shutil 142 | 143 | from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes 144 | from cryptography.hazmat.primitives import hashes, hmac 145 | from cryptography.hazmat.backends import default_backend 146 | 147 | 148 | STAGING = os.environ.get("AA_STAGING") 149 | if not STAGING: 150 | STAGING = "/tmp/BackupStaging" # Where to stage the encrypted files 151 | 152 | SILODIR = os.environ.get("AA_SILODIR") 153 | if not SILODIR: 154 | SILODIR = "/bitstore/BitStore" # Where the *.warc.gz files live 155 | 156 | QUOTA = os.environ.get("AA_QUOTA") 157 | if not QUOTA: 158 | QUOTA = 10 << 30 # Max daily download quota in bytes 159 | else: 160 | QUOTA = int(QUOTA) 161 | 162 | SILODATES = "silodates.txt" 163 | 164 | SILOS = {} 165 | 166 | 167 | def get_secrets(_sfn, _dfn): 168 | ''' Retrieve the secrets for one particular file ''' 169 | pkey, skey = open("secrets.txt").read().split() 170 | assert len(pkey) == 64 171 | assert len(skey) == 32 172 | return bytearray.fromhex(pkey), bytearray.fromhex(skey) 173 | 174 | def Encrypt_File(sfn, dfn): 175 | ''' 176 | Encryption and Decryption is the same operation, but the IV 177 | is derived from the source filename, which must therefore be 178 | the same for both operations. 
179 | ''' 180 | 181 | assert sfn != dfn 182 | 183 | pkey, skey = get_secrets(sfn, dfn) 184 | 185 | h = hmac.HMAC( 186 | skey, 187 | hashes.SHA256(), 188 | backend=default_backend() 189 | ) 190 | x = sfn.replace(".bin", "") 191 | x = os.path.basename(x) 192 | h.update(x.encode("UTF-8")) 193 | iv = h.finalize() 194 | 195 | encryption = Cipher( 196 | algorithms.AES(pkey), 197 | modes.CTR(iv[:16]), 198 | backend=default_backend() 199 | ).encryptor() 200 | 201 | authentication = hmac.HMAC( 202 | skey, 203 | hashes.SHA256(), 204 | backend=default_backend() 205 | ) 206 | 207 | integrity = hashes.Hash( 208 | hashes.SHA256(), 209 | backend=default_backend() 210 | ) 211 | 212 | fi = open(sfn, "rb") 213 | fo = open(dfn, "wb") 214 | while True: 215 | a = fi.read(65536) 216 | if not a: 217 | break 218 | b = encryption.update(a) 219 | authentication.update(b) 220 | integrity.update(b) 221 | fo.write(b) 222 | 223 | b = encryption.finalize() 224 | assert len(b) == 0 # CTR mode 225 | # otherwise: 226 | # authentication.update(b) 227 | # integrity.update(b) 228 | # fo.write(b) 229 | 230 | open(dfn + ".hmac", "w").write(authentication.finalize().hex() + "\n") 231 | open(dfn + ".sha256", "w").write( 232 | "SHA256 (%s) = " % os.path.basename(dfn) + 233 | integrity.finalize().hex() + "\n" 234 | ) 235 | 236 | 237 | class Silo(): 238 | ''' An AardWARC silo ''' 239 | def __init__(self, fn): 240 | SILOS[fn] = self 241 | self.fn = fn 242 | self.now_sz = 0 243 | self.last_sz = 0 244 | self.last_tm = 0 245 | 246 | def __repr__(self): 247 | return "" % self.fn 248 | 249 | def __lt__(self, other): 250 | return self.sortkey() > other.sortkey() 251 | 252 | def sortkey(self): 253 | return [ 254 | math.fabs(self.now_sz - self.last_sz), 255 | time.time() - self.last_tm 256 | ] 257 | 258 | def set_last_size(self, sz): 259 | self.last_sz = sz 260 | 261 | def set_last_time(self, tm): 262 | self.last_tm = tm 263 | 264 | def probe(self, fpn): 265 | st = os.stat(fpn) 266 | self.now_sz = st.st_size 267 | 
self.fpn = fpn 268 | 269 | def silo(f): 270 | s = SILOS.get(f) 271 | if not s: 272 | s = Silo(f) 273 | return s 274 | 275 | def main(sdir): 276 | 277 | try: 278 | for i in open(SILODATES): 279 | j = i.split() 280 | s = silo(j[0]) 281 | s.set_last_time(float(j[1])) 282 | s.set_last_size(int(j[2])) 283 | except FileNotFoundError: 284 | pass 285 | 286 | for path, _b, files in os.walk(SILODIR): 287 | for f in files: 288 | if f[-8:] == ".warc.gz": 289 | silo(f).probe(os.path.join(path, f)) 290 | 291 | q = 0 292 | for s in sorted(list(SILOS.values())): 293 | if q + s.now_sz > QUOTA: 294 | break 295 | q += s.now_sz 296 | print("DO", "%14d" % q, s.fpn, s.sortkey()) 297 | if True: 298 | Encrypt_File( 299 | s.fpn, 300 | os.path.join(sdir, s.fn + ".bin") 301 | ) 302 | s.last_tm = time.time() 303 | s.last_sz = s.now_sz 304 | 305 | fo = open(SILODATES + ".new", "w") 306 | for _n, s in sorted(SILOS.items()): 307 | fo.write("%s %.3f %d\n" % (s.fn, s.last_tm, s.last_sz)) 308 | fo.close() 309 | 310 | os.rename(SILODATES + ".new", SILODATES) 311 | 312 | if __name__ == "__main__": 313 | 314 | snew = STAGING + ".new" 315 | sold = STAGING + ".old" 316 | 317 | os.umask(0o22) 318 | 319 | shutil.rmtree(snew, ignore_errors=True) 320 | shutil.rmtree(sold, ignore_errors=True) 321 | 322 | os.mkdir(snew, mode=0o755) 323 | 324 | main(snew) 325 | 326 | try: 327 | os.rename(STAGING, sold) 328 | except FileNotFoundError: 329 | pass 330 | 331 | os.rename(snew, STAGING) 332 | 333 | shutil.rmtree(sold, ignore_errors=True) 334 | -------------------------------------------------------------------------------- /vas.c: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2006 Verdens Gang AS 3 | * Copyright (c) 2006-2016 Varnish Software AS 4 | * All rights reserved. 
5 | * 6 | * Author: Poul-Henning Kamp 7 | * 8 | * SPDX-License-Identifier: BSD-2-Clause 9 | * 10 | * Redistribution and use in source and binary forms, with or without 11 | * modification, are permitted provided that the following conditions 12 | * are met: 13 | * 1. Redistributions of source code must retain the above copyright 14 | * notice, this list of conditions and the following disclaimer. 15 | * 2. Redistributions in binary form must reproduce the above copyright 16 | * notice, this list of conditions and the following disclaimer in the 17 | * documentation and/or other materials provided with the distribution. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 20 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 23 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 | * SUCH DAMAGE. 30 | * 31 | * This is the default backend function for libvarnish' assert facilities. 
32 | */ 33 | 34 | #include "config.h" 35 | 36 | #include 37 | #include 38 | #include 39 | 40 | #include "vdef.h" 41 | 42 | #include "vas.h" 43 | 44 | vas_f *VAS_Fail_Func v_noreturn_; 45 | 46 | void v_noreturn_ 47 | VAS_Fail(const char *func, const char *file, int line, 48 | const char *cond, enum vas_e kind) 49 | { 50 | int err = errno; 51 | 52 | if (VAS_Fail_Func != NULL) { 53 | VAS_Fail_Func(func, file, line, cond, kind); 54 | } else { 55 | if (kind == VAS_MISSING) { 56 | fprintf(stderr, 57 | "Missing error handling code in %s(), %s line %d:\n" 58 | " Condition(%s) not true.\n", 59 | func, file, line, cond); 60 | } else if (kind == VAS_INCOMPLETE) { 61 | fprintf(stderr, 62 | "Incomplete code in %s(), %s line %d:\n", 63 | func, file, line); 64 | } else if (kind == VAS_WRONG) { 65 | fprintf(stderr, 66 | "Wrong turn in %s(), %s line %d: %s\n", 67 | func, file, line, cond); 68 | } else { 69 | fprintf(stderr, 70 | "Assert error in %s(), %s line %d:\n" 71 | " Condition(%s) not true.\n", 72 | func, file, line, cond); 73 | } 74 | if (err) 75 | fprintf(stderr, 76 | " errno = %d (%s)\n", err, strerror(err)); 77 | } 78 | abort(); 79 | } 80 | -------------------------------------------------------------------------------- /vas.h: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2006 Verdens Gang AS 3 | * Copyright (c) 2006-2011 Varnish Software AS 4 | * All rights reserved. 5 | * 6 | * Author: Poul-Henning Kamp 7 | * 8 | * SPDX-License-Identifier: BSD-2-Clause 9 | * 10 | * Redistribution and use in source and binary forms, with or without 11 | * modification, are permitted provided that the following conditions 12 | * are met: 13 | * 1. Redistributions of source code must retain the above copyright 14 | * notice, this list of conditions and the following disclaimer. 15 | * 2. 
Redistributions in binary form must reproduce the above copyright 16 | * notice, this list of conditions and the following disclaimer in the 17 | * documentation and/or other materials provided with the distribution. 18 | * 19 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 20 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 23 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 | * SUCH DAMAGE. 30 | * 31 | * assert(), AN() and AZ() are static checks that should not happen. 32 | * In general asserts should be cheap, such as checking return 33 | * values and similar. 34 | * diagnostic() are asserts which are so expensive that we may want 35 | * to compile them out for performance at a later date. 36 | * xxxassert(), XXXAN() and XXXAZ() marks conditions we ought to 37 | * handle gracefully, such as malloc failure. 
38 | */ 39 | 40 | #ifndef VAS_H_INCLUDED 41 | #define VAS_H_INCLUDED 42 | 43 | #include 44 | 45 | enum vas_e { 46 | VAS_WRONG, 47 | VAS_MISSING, 48 | VAS_ASSERT, 49 | VAS_INCOMPLETE, 50 | VAS_VCL, 51 | }; 52 | 53 | typedef void vas_f(const char *, const char *, int, const char *, enum vas_e); 54 | 55 | extern vas_f *VAS_Fail_Func v_noreturn_; 56 | extern vas_f VAS_Fail v_noreturn_; 57 | 58 | #ifdef WITHOUT_ASSERTS 59 | #define assert(e) ((void)(e)) 60 | #else /* WITH_ASSERTS */ 61 | #define assert(e) \ 62 | do { \ 63 | if (!(e)) { \ 64 | VAS_Fail(__func__, __FILE__, __LINE__, \ 65 | #e, VAS_ASSERT); \ 66 | } \ 67 | } while (0) 68 | #endif 69 | 70 | #define xxxassert(e) \ 71 | do { \ 72 | if (!(e)) { \ 73 | VAS_Fail(__func__, __FILE__, __LINE__, \ 74 | #e, VAS_MISSING); \ 75 | } \ 76 | } while (0) 77 | 78 | /* Assert zero return value */ 79 | #define AZ(foo) do { assert((foo) == 0); } while (0) 80 | #define AN(foo) do { assert((foo) != 0); } while (0) 81 | #define XXXAZ(foo) do { xxxassert((foo) == 0); } while (0) 82 | #define XXXAN(foo) do { xxxassert((foo) != 0); } while (0) 83 | #define diagnostic(foo) assert(foo) 84 | #define WRONG(expl) \ 85 | do { \ 86 | VAS_Fail(__func__, __FILE__, __LINE__, expl, VAS_WRONG); \ 87 | } while (0) 88 | 89 | #define INCOMPL() \ 90 | do { \ 91 | VAS_Fail(__func__, __FILE__, __LINE__, \ 92 | "", VAS_INCOMPLETE); \ 93 | } while (0) 94 | 95 | /* 96 | * Most of this nightmare is stolen from FreeBSD's 97 | */ 98 | #ifndef __has_extension 99 | # define __has_extension(x) 0 100 | #endif 101 | 102 | #if __has_extension(c_static_assert) 103 | # define v_static_assert _Static_assert 104 | #elif __GNUC_PREREQ__(4,6) && !defined(__cplusplus) 105 | # define v_static_assert _Static_assert 106 | #else 107 | # if defined(__COUNTER__) 108 | # define v_static_assert(x, y) __v_static_assert(x, __COUNTER__) 109 | # else 110 | # define v_static_assert(x, y) __v_static_assert(x, __LINE__) 111 | # endif 112 | # define __v_static_assert(x, y) 
___v_static_assert(x, y) 113 | # define ___v_static_assert(x, y) \ 114 | typedef char __vassert_## y[(x) ? 1 : -1] v_unused_ 115 | #endif 116 | 117 | #endif 118 | -------------------------------------------------------------------------------- /vdef.h: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2006 Verdens Gang AS 3 | * Copyright (c) 2012 Fastly Inc 4 | * Copyright (c) 2006-2015 Varnish Software AS 5 | * All rights reserved. 6 | * 7 | * Author: Poul-Henning Kamp 8 | * Author: Rogier 'DocWilco' Mulhuijzen 9 | * 10 | * Inspired by FreeBSD's 11 | * 12 | * SPDX-License-Identifier: BSD-2-Clause 13 | * 14 | * Redistribution and use in source and binary forms, with or without 15 | * modification, are permitted provided that the following conditions 16 | * are met: 17 | * 1. Redistributions of source code must retain the above copyright 18 | * notice, this list of conditions and the following disclaimer. 19 | * 2. Redistributions in binary form must reproduce the above copyright 20 | * notice, this list of conditions and the following disclaimer in the 21 | * documentation and/or other materials provided with the distribution. 22 | * 23 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 24 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 | * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 27 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 | * SUCH DAMAGE. 34 | * 35 | * Names of the form "v_[a-z_]*_" is reserved for this file. 36 | * 37 | * This file should always be the first non <...> include in a .c file. 38 | */ 39 | 40 | #ifdef VDEF_H_INCLUDED 41 | # error "vdef.h included multiple times" 42 | #endif 43 | #define VDEF_H_INCLUDED 44 | 45 | /* Safe printf into a fixed-size buffer */ 46 | #define bprintf(buf, fmt, ...) \ 47 | do { \ 48 | int ibprintf; \ 49 | ibprintf = snprintf(buf, sizeof buf, fmt, __VA_ARGS__); \ 50 | assert(ibprintf >= 0 && ibprintf < (int)sizeof buf); \ 51 | } while (0) 52 | 53 | /* Safe printf into a fixed-size buffer */ 54 | #define vbprintf(buf, fmt, ap) \ 55 | do { \ 56 | int ivbprintf; \ 57 | ivbprintf = vsnprintf(buf, sizeof buf, fmt, ap); \ 58 | assert(ivbprintf >= 0 && ivbprintf < (int)sizeof buf); \ 59 | } while (0) 60 | 61 | /* Safe strcpy into a fixed-size buffer */ 62 | #define bstrcpy(dst, src) \ 63 | do { \ 64 | assert(strlen(src) + 1 <= sizeof (dst)); \ 65 | strcpy((dst), (src)); \ 66 | } while (0) 67 | 68 | // TODO #define strcpy BANNED 69 | // TODO then revert 0fa4baead49f0a45f68d3db0b7743c5e4e93ad4d 70 | // TODO and replace with flexelint exception 71 | 72 | /* Close and discard filedescriptor */ 73 | #define closefd(fdp) \ 74 | do { \ 75 | assert(*(fdp) >= 0); \ 76 | AZ(close(*(fdp))); \ 77 | *(fdp) = -1; \ 78 | } while (0) 79 | 80 | #ifndef __GNUC_PREREQ__ 81 | # if defined __GNUC__ && defined __GNUC_MINOR__ 82 | # define 
__GNUC_PREREQ__(maj, min) \ 83 | (__GNUC__ > (maj) || (__GNUC__ == (maj) && __GNUC_MINOR__ >= (min))) 84 | # else 85 | # define __GNUC_PREREQ__(maj, min) 0 86 | # endif 87 | #endif 88 | 89 | #if __GNUC_PREREQ__(2, 95) || defined(__INTEL_COMPILER) 90 | # define v_printflike_(f,a) __attribute__((format(printf, f, a))) 91 | #else 92 | # define v_printflike_(f,a) 93 | #endif 94 | 95 | #define v_noreturn_ __attribute__((__noreturn__)) 96 | 97 | #ifdef __GNUC__ 98 | # define v_deprecated_ __attribute__((deprecated)) 99 | #else 100 | # define v_deprecated_ 101 | #endif 102 | 103 | /********************************************************************* 104 | * Pointer alignment magic 105 | */ 106 | 107 | #if defined(__sparc__) 108 | /* NB: Overbroad test for 32bit userland on 64bit SPARC cpus. */ 109 | # define PALGN (sizeof(double) - 1) /* size of alignment */ 110 | #else 111 | # define PALGN (sizeof(void *) - 1) /* size of alignment */ 112 | #endif 113 | #define PAOK(p) (((uintptr_t)(p) & PALGN) == 0) /* is aligned */ 114 | #define PRNDDN(p) ((uintptr_t)(p) & ~PALGN) /* Round down */ 115 | #define PRNDUP(p) (((uintptr_t)(p) + PALGN) & ~PALGN) /* Round up */ 116 | 117 | /********************************************************************* 118 | * To be used as little as possible to wash off const/volatile etc. 
119 | */ 120 | #define TRUST_ME(ptr) ((void*)(uintptr_t)(ptr)) 121 | 122 | /********************************************************************** 123 | * Generic power-2 rounding macros 124 | */ 125 | 126 | #define PWR2(x) ((((x)-1UL)&(x))==0) /* Is a power of two */ 127 | #define RDN2(x, y) ((x)&(~((uintptr_t)(y)-1UL))) /* PWR2(y) true */ 128 | #define RUP2(x, y) (((x)+((y)-1))&(~((uintptr_t)(y)-1UL))) /* PWR2(y) true */ 129 | 130 | /********************************************************************** 131 | * FlexeLint and compiler shutuppery 132 | */ 133 | 134 | /* 135 | * In OO-light situations, functions have to match their prototype 136 | * even if that means not const'ing a const'able argument. 137 | * The typedef should be specified as argument to the macro. 138 | */ 139 | #define v_matchproto_(xxx) /*lint --e{818} */ 140 | 141 | /* 142 | * State variables may change value before we have considered the 143 | * previous value 144 | */ 145 | #define v_statevariable_(varname) varname /*lint -esym(838,varname) */ 146 | 147 | #ifdef __SUNPRO_C 148 | # define NEEDLESS(s) {} 149 | #else 150 | # define NEEDLESS(s) s 151 | #endif 152 | 153 | #if __GNUC_PREREQ__(2, 7) 154 | # define v_unused_ __attribute__((__unused__)) 155 | #else 156 | # define v_unused_ 157 | #endif 158 | 159 | /* VTIM API overhaul WIP */ 160 | typedef double vtim_mono; 161 | typedef double vtim_real; 162 | typedef double vtim_dur; 163 | -------------------------------------------------------------------------------- /vlu.c: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2005-2008 Poul-Henning Kamp 3 | * All rights reserved. 4 | * 5 | * SPDX-License-Identifier: BSD-2-Clause 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 1. 
Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * 2. Redistributions in binary form must reproduce the above copyright 13 | * notice, this list of conditions and the following disclaimer in the 14 | * documentation and/or other materials provided with the distribution. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 | * SUCH DAMAGE. 27 | * 28 | * Functions for assembling a bytestream into text-lines and calling 29 | * a function on each. 
30 | */ 31 | 32 | #include "config.h" 33 | 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | #include "vdef.h" 40 | 41 | #include "vas.h" // XXX Flexelint "not used" - but req'ed for assert() 42 | #include "miniobj.h" 43 | 44 | #include "vlu.h" 45 | 46 | struct vlu { 47 | unsigned magic; 48 | #define LINEUP_MAGIC 0x8286661 49 | char *buf; 50 | unsigned bufl; 51 | unsigned bufp; 52 | void *priv; 53 | vlu_f *func; 54 | }; 55 | 56 | struct vlu * 57 | VLU_New(vlu_f *func, void *priv, unsigned bufsize) 58 | { 59 | struct vlu *l; 60 | 61 | if (bufsize == 0) 62 | bufsize = BUFSIZ; 63 | ALLOC_OBJ(l, LINEUP_MAGIC); 64 | if (l != NULL) { 65 | l->func = func; 66 | l->priv = priv; 67 | l->bufl = bufsize - 1; 68 | l->buf = malloc(l->bufl + 1L); 69 | if (l->buf == NULL) { 70 | FREE_OBJ(l); 71 | l = NULL; 72 | } 73 | } 74 | return (l); 75 | } 76 | 77 | void 78 | VLU_Reset(struct vlu *l) 79 | { 80 | CHECK_OBJ_NOTNULL(l, LINEUP_MAGIC); 81 | l->bufp = 0; 82 | } 83 | 84 | void 85 | VLU_Destroy(struct vlu **lp) 86 | { 87 | struct vlu *l; 88 | 89 | TAKE_OBJ_NOTNULL(l, lp, LINEUP_MAGIC); 90 | free(l->buf); 91 | FREE_OBJ(l); 92 | } 93 | 94 | static int 95 | LineUpProcess(struct vlu *l) 96 | { 97 | char *p, *q; 98 | int i; 99 | 100 | l->buf[l->bufp] = '\0'; 101 | for (p = l->buf; *p != '\0'; p = q) { 102 | /* Find first CR or NL */ 103 | for (q = p; *q != '\0'; q++) { 104 | if (*q == '\n' || *q == '\r') 105 | break; 106 | } 107 | if (*q == '\0') 108 | break; 109 | *q++ = '\0'; 110 | i = l->func(l->priv, p); 111 | if (i != 0) 112 | return (i); 113 | } 114 | if (*p != '\0') { 115 | q = strchr(p, '\0'); 116 | assert(q != NULL); 117 | l->bufp = (unsigned)(q - p); 118 | memmove(l->buf, p, l->bufp); 119 | l->buf[l->bufp] = '\0'; 120 | } else 121 | l->bufp = 0; 122 | return (0); 123 | } 124 | 125 | int 126 | VLU_Fd(struct vlu *l, int fd) 127 | { 128 | int i; 129 | 130 | CHECK_OBJ_NOTNULL(l, LINEUP_MAGIC); 131 | i = read(fd, l->buf + l->bufp, l->bufl - l->bufp); 132 | if (i == 0) 
133 | return (-2); 134 | if (i < 0) 135 | return (-1); 136 | l->bufp += i; 137 | return (LineUpProcess(l)); 138 | } 139 | 140 | int 141 | VLU_File(int fd, vlu_f *func, void *priv, unsigned bufsize) 142 | { 143 | struct vlu *vlu; 144 | int i; 145 | 146 | vlu = VLU_New(func, priv, bufsize); 147 | AN(vlu); 148 | do { 149 | i = VLU_Fd(vlu, fd); 150 | } while (i == 0); 151 | VLU_Destroy(&vlu); 152 | return (i); 153 | } 154 | 155 | int 156 | VLU_Feed(struct vlu *l, const char *ptr, int len) 157 | { 158 | int i = 0; 159 | unsigned u; 160 | 161 | CHECK_OBJ_NOTNULL(l, LINEUP_MAGIC); 162 | AN(ptr); 163 | assert(len > 0); 164 | while (len > 0) { 165 | u = len; 166 | if (u > l->bufl - l->bufp) 167 | u = l->bufl - l->bufp; 168 | memcpy(l->buf + l->bufp, ptr, u); 169 | len -= u; 170 | ptr += u; 171 | l->bufp += u; 172 | i = LineUpProcess(l); 173 | if (i) 174 | return (i); 175 | } 176 | return (i); 177 | } 178 | -------------------------------------------------------------------------------- /vlu.h: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2005-2008 Poul-Henning Kamp 3 | * All rights reserved. 4 | * 5 | * SPDX-License-Identifier: BSD-2-Clause 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 1. Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * 2. Redistributions in binary form must reproduce the above copyright 13 | * notice, this list of conditions and the following disclaimer in the 14 | * documentation and/or other materials provided with the distribution. 
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 | * SUCH DAMAGE. 27 | * 28 | * Functions for assembling a bytestream into text-lines and calling 29 | * a function on each. 30 | */ 31 | 32 | #ifdef VLU_H_INCLUDED 33 | # error "vlu.h included multiple times" 34 | #endif 35 | #define VLU_H_INCLUDED 36 | 37 | typedef int (vlu_f)(void *, const char *); 38 | struct vlu *VLU_New(vlu_f *, void *, unsigned); 39 | void VLU_Reset(struct vlu *); 40 | int VLU_Fd(struct vlu *, int); 41 | void VLU_Destroy(struct vlu **); 42 | int VLU_File(int, vlu_f *, void *, unsigned); 43 | int VLU_Feed(struct vlu *, const char*, int); 44 | -------------------------------------------------------------------------------- /vnum.c: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2008-2009 Varnish Software AS 3 | * All rights reserved. 4 | * 5 | * Author: Poul-Henning Kamp 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 1. Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * 2. 
Redistributions in binary form must reproduce the above copyright 13 | * notice, this list of conditions and the following disclaimer in the 14 | * documentation and/or other materials provided with the distribution. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 | * SUCH DAMAGE. 27 | * 28 | * Deal with numbers with data storage suffix scaling 29 | */ 30 | 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | #include "vdef.h" 38 | 39 | #include "aardwarc.h" 40 | #include "vas.h" 41 | 42 | static const char err_miss_num[] = "Missing number"; 43 | static const char err_invalid_num[] = "Invalid number"; 44 | static const char err_abs_req[] = "Absolute number required"; 45 | static const char err_invalid_suff[] = "Invalid suffix"; 46 | 47 | /********************************************************************** 48 | * Convert (all of!) a string to a floating point number, and if we can 49 | * not, return NAN. 
50 | */ 51 | 52 | static double 53 | VNUMpfx(const char *p, const char **t) 54 | { 55 | double m = 0., ee = 0.; 56 | double ms = 1.0; 57 | double es = 1.0, e = 1.0, ne = 0.0; 58 | 59 | AN(p); 60 | AN(t); 61 | *t = NULL; 62 | while (isspace(*p)) 63 | p++; 64 | 65 | if (*p == '-' || *p == '+') 66 | ms = (*p++ == '-' ? -1.0 : 1.0); 67 | 68 | for (; *p != '\0'; p++) { 69 | if (isdigit(*p)) { 70 | m = m * 10. + *p - '0'; 71 | e = ne; 72 | if (e) 73 | ne = e - 1.0; 74 | } else if (*p == '.' && ne == 0.0) { 75 | ne = -1.0; 76 | } else 77 | break; 78 | } 79 | if (e > 0.0) 80 | return(nan("")); // No digits 81 | if (*p == 'e' || *p == 'E') { 82 | p++; 83 | if (*p == '-' || *p == '+') 84 | es = (*p++ == '-' ? -1.0 : 1.0); 85 | if (!isdigit(*p)) 86 | return (nan("")); 87 | for (; isdigit(*p); p++) 88 | ee = ee * 10. + *p - '0'; 89 | } 90 | while (isspace(*p)) 91 | p++; 92 | if (*p != '\0') 93 | *t = p; 94 | return (ms * m * pow(10., e + es * ee)); 95 | } 96 | 97 | /**********************************************************************/ 98 | 99 | const char * 100 | VNUM_2bytes(const char *p, uintmax_t *r, uintmax_t rel) 101 | { 102 | double fval; 103 | const char *end; 104 | 105 | if (p == NULL || *p == '\0') 106 | return (err_miss_num); 107 | 108 | fval = VNUMpfx(p, &end); 109 | if (isnan(fval)) 110 | return (err_invalid_num); 111 | 112 | if (end == NULL) { 113 | *r = (uintmax_t)fval; 114 | return (NULL); 115 | } 116 | 117 | if (end[0] == '%' && end[1] == '\0') { 118 | if (rel == 0) 119 | return (err_abs_req); 120 | fval *= rel / 100.0; 121 | } else { 122 | /* accept a space before the multiplier */ 123 | if (end[0] == ' ' && end[1] != '\0') 124 | ++end; 125 | 126 | switch (end[0]) { 127 | case 'k': case 'K': 128 | fval *= (uintmax_t)1 << 10; 129 | ++end; 130 | break; 131 | case 'm': case 'M': 132 | fval *= (uintmax_t)1 << 20; 133 | ++end; 134 | break; 135 | case 'g': case 'G': 136 | fval *= (uintmax_t)1 << 30; 137 | ++end; 138 | break; 139 | case 't': case 'T': 140 | fval *= 
(uintmax_t)1 << 40; 141 | ++end; 142 | break; 143 | case 'p': case 'P': 144 | fval *= (uintmax_t)1 << 50; 145 | ++end; 146 | break; 147 | default: 148 | break; 149 | } 150 | 151 | /* [bB] is a generic suffix of no effect */ 152 | if (end[0] == 'b' || end[0] == 'B') 153 | end++; 154 | 155 | if (end[0] != '\0') 156 | return (err_invalid_suff); 157 | } 158 | 159 | *r = (uintmax_t)round(fval); 160 | return (NULL); 161 | } 162 | -------------------------------------------------------------------------------- /vsb.h: -------------------------------------------------------------------------------- 1 | /*- 2 | * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 | * 4 | * Copyright (c) 2000-2011 Poul-Henning Kamp 5 | * Copyright (c) 2000-2008 Dag-Erling Coïdan Smørgrav 6 | * All rights reserved. 7 | * 8 | * Redistribution and use in source and binary forms, with or without 9 | * modification, are permitted provided that the following conditions 10 | * are met: 11 | * 1. Redistributions of source code must retain the above copyright 12 | * notice, this list of conditions and the following disclaimer 13 | * in this position and unchanged. 14 | * 2. Redistributions in binary form must reproduce the above copyright 15 | * notice, this list of conditions and the following disclaimer in the 16 | * documentation and/or other materials provided with the distribution. 17 | * 18 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 | * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 | * SUCH DAMAGE. 29 | * 30 | * $FreeBSD: head/sys/sys/vsb.h 221993 2011-05-16 16:18:40Z phk $ 31 | */ 32 | 33 | #ifndef VSB_H_INCLUDED 34 | #define VSB_H_INCLUDED 35 | 36 | /* 37 | * Structure definition 38 | */ 39 | struct vsb { 40 | unsigned magic; 41 | #define VSB_MAGIC 0x4a82dd8a 42 | int s_error; /* current error code */ 43 | char *s_buf; /* storage buffer */ 44 | ssize_t s_size; /* size of storage buffer */ 45 | ssize_t s_len; /* current length of string */ 46 | #define VSB_FIXEDLEN 0x00000000 /* fixed length buffer (default) */ 47 | #define VSB_AUTOEXTEND 0x00000001 /* automatically extend buffer */ 48 | #define VSB_USRFLAGMSK 0x0000ffff /* mask of flags the user may specify */ 49 | #define VSB_DYNAMIC 0x00010000 /* s_buf must be freed */ 50 | #define VSB_FINISHED 0x00020000 /* set by VSB_finish() */ 51 | #define VSB_DYNSTRUCT 0x00080000 /* vsb must be freed */ 52 | int s_flags; /* flags */ 53 | int s_indent; /* Ident level */ 54 | }; 55 | 56 | #ifdef __cplusplus 57 | extern "C" { 58 | #endif 59 | /* 60 | * API functions 61 | */ 62 | struct vsb *VSB_new(struct vsb *, char *, int, int); 63 | #define VSB_new_auto() \ 64 | VSB_new(NULL, NULL, 0, VSB_AUTOEXTEND) 65 | void VSB_clear(struct vsb *); 66 | int VSB_bcat(struct vsb *, const void *, ssize_t); 67 | int VSB_cat(struct vsb *, const char *); 68 | int VSB_printf(struct vsb *, const char *, ...) 
69 | v_printflike_(2, 3); 70 | #ifdef va_start 71 | int VSB_vprintf(struct vsb *, const char *, va_list) 72 | v_printflike_(2, 0); 73 | #endif 74 | int VSB_putc(struct vsb *, int); 75 | int VSB_error(const struct vsb *); 76 | int VSB_finish(struct vsb *); 77 | char *VSB_data(const struct vsb *); 78 | ssize_t VSB_len(const struct vsb *); 79 | void VSB_delete(struct vsb *); 80 | void VSB_destroy(struct vsb **); 81 | #define VSB_QUOTE_NONL 1 82 | #define VSB_QUOTE_JSON 2 83 | #define VSB_QUOTE_HEX 4 84 | #define VSB_QUOTE_CSTR 8 85 | #define VSB_QUOTE_UNSAFE 16 86 | #define VSB_QUOTE_ESCHEX 32 87 | void VSB_quote_pfx(struct vsb *, const char*, const void *, 88 | int len, int how); 89 | void VSB_quote(struct vsb *, const void *, int len, int how); 90 | void VSB_indent(struct vsb *, int); 91 | int VSB_tofile(int fd, const struct vsb *); 92 | #ifdef __cplusplus 93 | }; 94 | #endif 95 | 96 | #endif 97 | -------------------------------------------------------------------------------- /warcinfo.c: -------------------------------------------------------------------------------- 1 | /*- 2 | * Copyright (c) 2016 Poul-Henning Kamp 3 | * All rights reserved. 4 | * 5 | * Author: Poul-Henning Kamp 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 1. Redistributions of source code must retain the above copyright 11 | * notice, this list of conditions and the following disclaimer. 12 | * 2. Redistributions in binary form must reproduce the above copyright 13 | * notice, this list of conditions and the following disclaimer in the 14 | * documentation and/or other materials provided with the distribution. 
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 | * SUCH DAMAGE. 27 | * 28 | */ 29 | 30 | #include 31 | #include 32 | #include 33 | 34 | #include "vdef.h" 35 | 36 | #include "vas.h" 37 | #include "vsb.h" 38 | #include "miniobj.h" 39 | 40 | #include "aardwarc.h" 41 | 42 | #ifndef GITREV 43 | # define GITREV "unknown version" 44 | #endif 45 | 46 | struct warcinfo { 47 | unsigned magic; 48 | #define WARCINFO_MAGIC 0x9bef8242 49 | 50 | struct header *hdr; 51 | struct vsb *body; 52 | }; 53 | 54 | static int v_matchproto_(config_f) 55 | c_iter(void *priv, const char *name, const char *arg) 56 | { 57 | struct warcinfo *wi; 58 | 59 | CAST_OBJ_NOTNULL(wi, priv, WARCINFO_MAGIC); 60 | AN(wi->body); 61 | if (!strcasecmp(name, "conformsTo:")) 62 | return(0); 63 | if (!strcasecmp(name, "format:")) 64 | return(0); 65 | if (!strcasecmp(name, "software:")) 66 | return(0); 67 | if (!strcasecmp(name, "title:")) 68 | return(0); 69 | VSB_printf(wi->body, "%s %s\r\n", name, arg); 70 | return (0); 71 | } 72 | 73 | char * 74 | Warcinfo_New(const struct aardwarc *aa, struct wsilo *wsl, uint32_t silono) 75 | { 76 | struct warcinfo *wi; 77 | char *p; 78 | const char *r; 79 | struct vsb *vsb; 80 | void *ptr; 81 | ssize_t len, len2; 82 | 83 | 
CHECK_OBJ_NOTNULL(aa, AARDWARC_MAGIC); 84 | AN(wsl); 85 | ALLOC_OBJ(wi, WARCINFO_MAGIC); 86 | AN(wi); 87 | 88 | wi->hdr = Header_New(aa); 89 | AN(wi->hdr); 90 | 91 | Header_Set(wi->hdr, "WARC-Filename", aa->silo_basename, silono); 92 | r = Header_Get(wi->hdr, "WARC-Filename"); 93 | AN(r); 94 | 95 | /* Assemble Body first, we need it's len & digest for the header */ 96 | 97 | wi->body = VSB_new_auto(); 98 | AN(wi->body); 99 | VSB_printf(wi->body, "title: %s\r\n", r); 100 | AZ(Config_Iter(aa->cfg, "warcinfo.body", wi, c_iter)); 101 | VSB_printf(wi->body, "format: WARC file version 1.1\r\n"); 102 | VSB_printf(wi->body, "conformsTo: %s\r\n", 103 | "http://iipc.github.io/warc-specifications/" 104 | "specifications/warc-format/warc-1.1/"); 105 | VSB_printf(wi->body, "software: %s (%s)\r\n", 106 | "https://github.com/bsdphk/AardWARC", GITREV); 107 | AZ(VSB_finish(wi->body)); 108 | 109 | Header_Set_Date(wi->hdr); 110 | Header_Set(wi->hdr, "WARC-Type", "warcinfo"); 111 | Header_Set(wi->hdr, "Content-Type", "application/warc-fields"); 112 | Header_Set(wi->hdr, "Content-Length", "%zd", VSB_len(wi->body)); 113 | 114 | p = SHA256_Data(VSB_data(wi->body), VSB_len(wi->body), NULL); 115 | Header_Set(wi->hdr, "WARC-Block-Digest", "sha256:%s", p); 116 | 117 | Ident_Set(aa, wi->hdr, p, NULL); 118 | 119 | REPLACE(p, NULL); 120 | 121 | Gzip_Vsb(&wi->body, 0); 122 | 123 | vsb = Header_Serialize(wi->hdr, 0); 124 | AN(vsb); 125 | len2 = VSB_len(vsb); 126 | assert(len2 > 0); 127 | 128 | Wsilo_GetSpace(wsl, &ptr, &len); 129 | assert(len2 <= len); 130 | memcpy(ptr, VSB_data(vsb), len2); 131 | AZ(Wsilo_Store(wsl, len2)); 132 | 133 | p = strdup(Header_Get_Id(wi->hdr)); 134 | AN(p); 135 | 136 | VSB_delete(vsb); 137 | Header_Destroy(&wi->hdr); 138 | 139 | len2 = VSB_len(wi->body); 140 | assert(len2 > 0); 141 | 142 | Wsilo_GetSpace(wsl, &ptr, &len); 143 | assert(len2 <= len); 144 | memcpy(ptr, VSB_data(wi->body), len2); 145 | AZ(Wsilo_Store(wsl, len2)); 146 | 147 | len2 = sizeof Gzip_crnlcrnl; 
148 | Wsilo_GetSpace(wsl, &ptr, &len); 149 | assert(len2 <= len); 150 | memcpy(ptr, Gzip_crnlcrnl, len2); 151 | AZ(Wsilo_Store(wsl, len2)); 152 | 153 | VSB_delete(wi->body); 154 | 155 | FREE_OBJ(wi); 156 | return (p); 157 | } 158 | --------------------------------------------------------------------------------