├── .travis.yml ├── .gitignore ├── package.json ├── LICENSE ├── README.md ├── test ├── fixtures │ ├── CC-MAIN-20140820021334-00006-ip-10-180-136-8.ec2.internal.warc.wet │ └── CC-MAIN-20140820021334-00006-ip-10-180-136-8.ec2.internal.warc.wat ├── index.js └── edge.js └── index.js /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - "0.10" 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | .DS_Store 3 | node_modules/ 4 | npm-debug.log 5 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "warc", 3 | "version": "1.0.1", 4 | "description": "Parse WARC (Web Archive Files) as a node.js stream", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "node_modules/.bin/tape test/*.js" 8 | }, 9 | "repository": { 10 | "type": "git", 11 | "url": "https://github.com/eugeneware/warc" 12 | }, 13 | "keywords": [ 14 | "warc", 15 | "common-crawl", 16 | "stream", 17 | "streaming" 18 | ], 19 | "author": "Eugene Ware ", 20 | "license": "BSD-3-Clause", 21 | "bugs": { 22 | "url": "https://github.com/eugeneware/warc/issues" 23 | }, 24 | "dependencies": {}, 25 | "devDependencies": { 26 | "http-parser-js": "^0.1.0", 27 | "tape": "^3.0.1", 28 | "through2": "^0.6.3" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Eugene Ware 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | 3. Neither the name of Eugene Ware nor the names of its contributors 13 | may be used to endorse or promote products derived from this software 14 | without specific prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY EUGENE WARE ''AS IS'' AND ANY 17 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL EUGENE WARE BE LIABLE FOR ANY 20 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # warc 2 | 3 | Parse WARC (Web Archive Files) as a node.js stream 4 | 5 | [![build status](https://secure.travis-ci.org/eugeneware/warc.png)](http://travis-ci.org/eugeneware/warc) 6 | 7 | This stream parses the Web Archive file format as used by the 8 | [Common Crawl](http://commoncrawl.org/the-data/get-started/) project. 9 | 10 | NB: That this stream doesn't do any gzip decompression, it assumes a 11 | decompressed WARC file format. The WARC files that use used by common-crawl 12 | are actually multi-part Gzip files, and there is a [big bug](https://github.com/joyent/node/pull/6442) with the `zlib` library which is present as of the time of writing (node `0.10.32`) which 13 | will only process the first gzipped chunk. 14 | 15 | ## Installation 16 | 17 | This module is installed via npm: 18 | 19 | ``` bash 20 | $ npm install warc 21 | ``` 22 | 23 | ## Example Usage 24 | 25 | Assumes an uncompressed WARC stream. The `content` field will be returned as a 26 | node `Buffer`. 27 | 28 | ``` js 29 | var WARCStream = require('warc'), 30 | fs = require('fs'); 31 | 32 | var w = new WARCStream(); 33 | fs.createReadStream('./CC-MAIN-20140820021334-00006-ip-10-180-136-8.ec2.internal.warc.wat') 34 | .pipe(w) 35 | .on('data', function (data) { 36 | console.log(data); 37 | /* 38 | { protocol: 'WARC/1.0', 39 | headers: 40 | { 'WARC-Type': 'response', 41 | 'WARC-Date': '2014-08-21T04:21:14Z', 42 | 'WARC-Record-ID': '', 43 | 'Content-Length': '174', 44 | 'Content-Type': 'application/http; msgtype=response', 45 | 'WARC-Warcinfo-ID': '', 46 | 'WARC-Concurrent-To': '', 47 | 'WARC-IP-Address': '65.52.108.2', 48 | 'WARC-Target-URI': 'http://0.r.msn.com/?ld=7v7Pf0o6dfvcggjmXvvsEKhzVUCUxwxRmKzEhcbUqMsh2Ubu9FZw1vPvSOUQKjNaf9lLFIpVKW3sQMR6aOgbPhwm9WR843zZRpT1jbKN7YgaGETlBJG5fdKcfifIi9WSQu9hAx6A&u=www.sportsmanias.com%2Frumors', 49 | 'WARC-Payload-Digest': 'sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ', 50 | 'WARC-Block-Digest': 'sha1:UHJK3TXZIQRATBF4CIGW33NQ4QAGTE4M' }, 51 | content: } 52 | */ 53 | }); 54 | ``` 55 | -------------------------------------------------------------------------------- /test/fixtures/CC-MAIN-20140820021334-00006-ip-10-180-136-8.ec2.internal.warc.wet: -------------------------------------------------------------------------------- 1 | WARC/1.0 2 | WARC-Type: warcinfo 3 | WARC-Date: 2014-09-09T10:14:59Z 4 | WARC-Filename: CC-MAIN-20140820021334-00006-ip-10-180-136-8.ec2.internal.warc.wet.gz 5 | WARC-Record-ID: 6 | Content-Type: application/warc-fields 7 | Content-Length: 286 8 | 9 | Software-Info: ia-web-commons.1.0-SNAPSHOT-20140819100050 10 | Extracted-Date: Tue, 09 Sep 2014 10:14:59 GMT 11 | robots: classic 12 | isPartOf: CC-MAIN-2014-35 13 | operator: CommonCrawl Admin 14 | description: Wide crawl of the web with URLs provided by Blekko for August 2014 15 | publisher: CommonCrawl 16 | 17 | 18 | 19 | WARC/1.0 20 | WARC-Type: conversion 21 | WARC-Target-URI: http://0pointer.de/photos/?gallery=Australia&photo=496 22 | WARC-Date: 2014-08-21T05:52:58Z 23 | WARC-Record-ID: 24 | WARC-Refers-To: 25 | WARC-Block-Digest: sha1:IXDB7TEXWWXPAQYRUF4QT3LWM26O2VI2 26 | Content-Type: text/plain 27 | Content-Length: 629 28 | 29 | Photos 30 | [ style: dark classic gorilla ] 31 | Photo Gallery 32 | Photo Gallery Index 33 | > Australia 34 | > Photo 496 35 | Previous 36 | Next 37 | 2007/01/24 05:07:59 | Canon | Canon EOS 350D DIGITAL | 22mm | f/9 | 0.1/20s | ISO400 | 16 | display table 38 | Photo 496 39 | This image has been viewed 2550 times. 40 | MQ Post a Comment: 41 | [ Show FormHide Form ] 42 | Name: 43 | Remember Name: 44 | 6160 45 | Retype PIN Above: 46 | Comment: 47 | < Previous 48 | Next > 49 | All Panoramas | Special Photo Series | Lennart's Blog | Lennart's Homepage | Lennart's Photos | Impressum/Imprint 50 | Lennart Poettering 51 | This work is licensed under a Creative Commons Attribution-ShareAlike 3.0 License. 52 | 53 | 54 | WARC/1.0 55 | WARC-Type: conversion 56 | WARC-Target-URI: http://0pointer.de/photos/?gallery=Thailand%202011-01&photo=726&show_thumbs=yes&exif_style= 57 | WARC-Date: 2014-08-21T06:06:05Z 58 | WARC-Record-ID: 59 | WARC-Refers-To: 60 | WARC-Block-Digest: sha1:4IDACYQUFGTORIWTBWGQX2AKV5NVPNCY 61 | Content-Type: text/plain 62 | Content-Length: 626 63 | 64 | Photos 65 | [ style: dark classic gorilla ] 66 | Photo Gallery 67 | Photo Gallery Index 68 | > Bangkok 69 | > Photo 726 70 | : Previous 71 | Next 72 | 2011/02/01 03:19:14 | Canon | Canon EOS REBEL T2i | 22mm | f/8 | 0.1/10s | ISO100 | 16 | display table 73 | Photo 726 74 | This image has been viewed 1364 times. 75 | MQ Post a Comment: 76 | [ Show FormHide Form ] 77 | Name: 78 | Remember Name: 79 | 9008 80 | Retype PIN Above: 81 | Comment: 82 | < Previous 83 | Next > 84 | All Panoramas | Special Photo Series | Lennart's Blog | Lennart's Homepage | Lennart's Photos | Impressum/Imprint 85 | Lennart Poettering 86 | This work is licensed under a Creative Commons Attribution-ShareAlike 3.0 License. 87 | 88 | 89 | WARC/1.0 90 | WARC-Type: conversion 91 | WARC-Target-URI: http://0x20.be/Special:WhatLinksHere/Thread:Talk:Gentse_feesten/foo 92 | WARC-Date: 2014-08-21T04:31:23Z 93 | WARC-Record-ID: 94 | WARC-Refers-To: 95 | WARC-Block-Digest: sha1:4EFUOG7D2OHS24DLCNUOKROUBD47SD2Q 96 | Content-Type: text/plain 97 | Content-Length: 1283 98 | 99 | Pages that link to "Thread:Talk:Gentse feesten/foo" - Whitespace (Hackerspace Gent) 100 | Pages that link to "Thread:Talk:Gentse feesten/foo" 101 | ← Thread:Talk:Gentse feesten/foo 102 | Jump to: navigation, search 103 | What links here 104 | Page: Namespace: all 105 | (Main) 106 | Talk 107 | User 108 | User talk 109 | Whitespace (Hackerspace Gent) 110 | Whitespace (Hackerspace Gent) talk 111 | File 112 | File talk 113 | MediaWiki 114 | MediaWiki talk 115 | Template 116 | Template talk 117 | Help 118 | Help talk 119 | Category 120 | Category talk 121 | Thread 122 | Thread talk 123 | Summary 124 | Summary talk 125 | Property 126 | Property talk 127 | Type 128 | Type talk 129 | Form 130 | Form talk 131 | Concept 132 | Concept talk 133 | Widget 134 | Widget talk 135 | Layer 136 | Layer talk 137 | Filters 138 | Hide transclusions | Hide links | Hide redirects 139 | No pages link to Thread:Talk:Gentse feesten/foo. 140 | Retrieved from "http://0x20.be/Special:WhatLinksHere/Thread:Talk:Gentse_feesten/foo" Navigation menu 141 | Personal tools 142 | Create accountLog in Namespaces 143 | Thread 144 | Discussion 145 | Variants 146 | Views 147 | Read 148 | Edit 149 | View history 150 | Actions 151 | Search 152 | Navigation 153 | Main page 154 | Recent changes 155 | Infrastructure 156 | Whitespace 157 | Membership 158 | FAQ 159 | Contact 160 | Wanted 161 | Social 162 | Google+ page 163 | G+ community 164 | Twitter 165 | Facebook 166 | Tumblr 167 | Add or edit data 168 | Events 169 | Meetings 170 | Event images 171 | Projects 172 | Locations 173 | People 174 | Hackerspaces 175 | Documentation 176 | List of spaces 177 | Belgian spaces 178 | Tools 179 | Special pages 180 | Printable version 181 | Privacy policy 182 | About Whitespace (Hackerspace Gent) 183 | Disclaimers 184 | 185 | 186 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | var stream = require('stream'), 2 | util = require('util'); 3 | module.exports = WARCStream; 4 | 5 | var STATE = { 6 | PROTOCOL : 1, 7 | HEADERS : 2, 8 | CONTENT : 3, 9 | SEPARATOR : 4 10 | }; 11 | 12 | var headerRegex = /^([^\:]+)\: ([^$]+)$/; 13 | 14 | function WARCStream(opts) { 15 | if (!(this instanceof WARCStream)) { 16 | return new WARCStream(); 17 | } 18 | 19 | if (typeof opts === 'undefined') { 20 | opts = {}; 21 | } 22 | if (typeof opts.objectMode === 'undefined') { 23 | opts.objectMode = true; 24 | } 25 | 26 | stream.Transform.call(this, opts); 27 | 28 | this.state = STATE.PROTOCOL; 29 | this.data = new Buffer(0); 30 | this.content = new Buffer(0); 31 | this.separator = new Buffer('\r\n\r\n'); 32 | this.offset = 0; 33 | this.protocol = null; 34 | this.headers = {}; 35 | this.contentLength = 0; 36 | this.matcher = new Buffer('\r\n'); 37 | } 38 | 39 | util.inherits(WARCStream, stream.Transform); 40 | 41 | WARCStream.prototype._transform = function (chunk, enc, cb) { 42 | var result; 43 | 44 | // append chunk 45 | this.data = Buffer.concat([this.data, chunk]); 46 | 47 | do { 48 | switch (this.state) { 49 | case STATE.PROTOCOL: 50 | this.protocol = null; 51 | result = this.parseProtocol(); 52 | if (result) { 53 | this.state = STATE.HEADERS; 54 | this.headers = {}; 55 | this.emit('protocol', this.protocol); 56 | } 57 | break; 58 | 59 | case STATE.HEADERS: 60 | result = false; 61 | result = this.parseHeaders(); 62 | if (result) { 63 | this.contentLength = parseInt(this.headers['Content-Length']); 64 | this.content = new Buffer(0); 65 | this.emit('headers', this.headers); 66 | } 67 | break; 68 | 69 | case STATE.CONTENT: 70 | result = this.parseContent(); 71 | if (result) { 72 | this.state = STATE.SEPARATOR; 73 | this.emit('content', this.content); 74 | this.push({ 75 | protocol: this.protocol, 76 | headers: this.headers, 77 | content: this.content 78 | }); 79 | } 80 | break; 81 | 82 | case STATE.SEPARATOR: 83 | result = this.parseSeparator(); 84 | if (result) { 85 | this.state = STATE.PROTOCOL; 86 | } 87 | break; 88 | 89 | default: 90 | result = false; 91 | break; 92 | } 93 | } while (result && this.offset < this.data.length); 94 | 95 | // store only the part we haven't processed yet 96 | this.data = this.data.slice(this.offset); 97 | this.offset = 0; 98 | 99 | cb(); 100 | }; 101 | 102 | WARCStream.prototype._flush = function (cb) { 103 | cb(); 104 | }; 105 | 106 | WARCStream.prototype.parseProtocol = function () { 107 | var idx = firstMatch(this.matcher, this.data, this.offset); 108 | 109 | if (idx !== false && idx <= this.data.length) { 110 | var protocol = this.data.slice(this.offset, idx); 111 | this.offset = idx + this.matcher.length; 112 | this.protocol = protocol.toString(); 113 | return true; 114 | } else { 115 | return false; 116 | } 117 | }; 118 | 119 | WARCStream.prototype.parseHeaders = function () { 120 | var result; 121 | do { 122 | result = this.parseHeader(); 123 | } while (result); 124 | return !result && this.state === STATE.CONTENT; 125 | }; 126 | 127 | WARCStream.prototype.parseHeader = function () { 128 | var idx = firstMatch(this.matcher, this.data, this.offset); 129 | 130 | if (idx !== false && idx < this.data.length) { 131 | var header= this.data.slice(this.offset, idx); 132 | this.offset = idx + this.matcher.length; 133 | 134 | if (header.length === 0) { 135 | this.state = STATE.CONTENT; 136 | return false; 137 | } 138 | 139 | var m = headerRegex.exec(header.toString()); 140 | if (m) { 141 | this.headers[m[1]] = m[2]; 142 | } 143 | return true; 144 | } else { 145 | return false; 146 | } 147 | }; 148 | 149 | WARCStream.prototype.parseContent = function () { 150 | var appendLength = Math.min( 151 | this.data.length - this.offset, 152 | this.contentLength - this.content.length); 153 | this.content = Buffer.concat([ 154 | this.content, this.data.slice(this.offset, this.offset + appendLength)]); 155 | this.offset += appendLength; 156 | return this.contentLength === this.content.length; 157 | }; 158 | 159 | WARCStream.prototype.parseSeparator = function () { 160 | var idx = firstMatch(this.separator, this.data, this.offset); 161 | 162 | if (idx !== false && idx < this.data.length) { 163 | var separator = this.data.slice(this.offset, idx); 164 | this.offset = idx + this.separator.length; 165 | if (separator.length === 0) { 166 | return true; 167 | } 168 | } 169 | 170 | return false; 171 | }; 172 | 173 | function firstMatch(matcher, buf, offset) { 174 | var i = offset; 175 | if (offset >= buf.length) return false; 176 | for (var i = offset; i < buf.length; i++) { 177 | if (buf[i] === matcher[0]) { 178 | if (matcher.length > 1) { 179 | var fullMatch = true; 180 | for (var j = i, k = 0; j < i + matcher.length; j++, k++) { 181 | if (buf[j] !== matcher[k]) { 182 | fullMatch = false; 183 | break; 184 | } 185 | } 186 | if (fullMatch) return j - matcher.length; 187 | } else { 188 | break; 189 | } 190 | } 191 | } 192 | 193 | var idx = i + matcher.length - 1; 194 | return idx; 195 | } 196 | -------------------------------------------------------------------------------- /test/index.js: -------------------------------------------------------------------------------- 1 | var it = require('tape'), 2 | path = require('path'), 3 | fs = require('fs'), 4 | through2 = require('through2'), 5 | HTTPParser = require('http-parser-js').HTTPParser, 6 | WARCStream = require('..'); 7 | 8 | var watFile = 'CC-MAIN-20140820021334-00006-ip-10-180-136-8.ec2.internal.warc.wat'; 9 | var wetFile = 'CC-MAIN-20140820021334-00006-ip-10-180-136-8.ec2.internal.warc.wet'; 10 | var warcFile = 'CC-MAIN-20140820021334-00006-ip-10-180-136-8.ec2.internal.warc'; 11 | function fixture(file) { 12 | return fs.createReadStream(path.join(__dirname, 'fixtures', file)); 13 | } 14 | 15 | it('should be able to create a new stream', function(t) { 16 | var w = new WARCStream(); 17 | t.ok(w); 18 | t.end(); 19 | }); 20 | 21 | it('should be able to parse the protocol', function(t) { 22 | t.plan(9*2); 23 | var f = fixture(watFile); 24 | var w = new WARCStream(); 25 | var count = 0; 26 | f 27 | .pipe(w) 28 | .on('protocol', function (protocol) { 29 | t.equal(protocol, 'WARC/1.0'); 30 | t.equal(w.protocol, 'WARC/1.0'); 31 | }) 32 | }); 33 | 34 | it('should be able to parse the headers', function(t) { 35 | t.plan(9); 36 | var f = fixture(watFile); 37 | var w = new WARCStream(); 38 | var first = true; 39 | f 40 | .pipe(w) 41 | .on('headers', function (headers) { 42 | if (first) { 43 | var expected = { 44 | 'WARC-Type': 'warcinfo', 45 | 'WARC-Date': '2014-09-09T10:14:59Z', 46 | 'WARC-Filename': 'CC-MAIN-20140820021334-00006-ip-10-180-136-8.ec2.internal.warc.gz', 47 | 'WARC-Record-ID': '', 48 | 'Content-Type': 'application/warc-fields', 49 | 'Content-Length': '108' }; 50 | t.deepEquals(headers, expected); 51 | first = false; 52 | } else { 53 | t.ok(Object.keys(headers).length); 54 | } 55 | }) 56 | }); 57 | 58 | it('should be able to parse the content', function(t) { 59 | t.plan(9); 60 | var f = fixture(watFile); 61 | var w = new WARCStream(); 62 | var first = true; 63 | f 64 | .pipe(w) 65 | .on('content', function (content) { 66 | if (first) { 67 | var expected = [ 68 | 'Software-Info: ia-web-commons.1.0-SNAPSHOT-20140819100050', 69 | 'Extracted-Date: Tue, 09 Sep 2014 10:14:59 GMT', 70 | '', 71 | '' 72 | ].join('\r\n'); 73 | t.equals(expected, content.toString()); 74 | first = false; 75 | } else { 76 | t.ok(content); 77 | } 78 | }) 79 | }); 80 | 81 | it('should be able to send data events', function(t) { 82 | t.plan(10); 83 | var f = fixture(watFile); 84 | var w = new WARCStream(); 85 | var results = []; 86 | f 87 | .pipe(w) 88 | .on('data', function (data) { 89 | results.push(data); 90 | t.ok(data); 91 | }) 92 | .on('end', function () { 93 | t.equals(results.length, 9); 94 | }); 95 | }); 96 | 97 | it('should be able to worth with streams', function(t) { 98 | t.plan(10); 99 | var f = fixture(watFile); 100 | var w = new WARCStream(); 101 | var results = []; 102 | f 103 | .pipe(w) 104 | .pipe(through2.obj( 105 | function (data, enc, cb) { 106 | results.push(data); 107 | t.ok(data); 108 | cb(); 109 | }, 110 | function () { 111 | t.equals(results.length, 9); 112 | })); 113 | }); 114 | 115 | it('should be able to worth with WET files', function(t) { 116 | t.plan(3); 117 | var f = fixture(wetFile); 118 | var w = new WARCStream(); 119 | var results = []; 120 | f 121 | .pipe(w) 122 | .pipe(through2.obj( 123 | function (data, enc, cb) { 124 | if (data.headers['Content-Type'] === 'text/plain') { 125 | t.ok(data.content.length); 126 | } 127 | cb(); 128 | })); 129 | }); 130 | 131 | function simpleHttpParser() { 132 | var parser = new HTTPParser(HTTPParser.RESPONSE); 133 | return function (chunk, cb) { 134 | parser.reinitialize(HTTPParser.RESPONSE); 135 | var body = new Buffer(0); 136 | var headers = {}; 137 | parser.onHeadersComplete = function (info) { 138 | headers = {}; 139 | for (var i = 0; i < info.headers.length; i += 2) { 140 | headers[info.headers[i]] = info.headers[i + 1]; 141 | }; 142 | }; 143 | parser.onBody = function (b, start, len) { 144 | body = b.slice(start, start + len); 145 | }; 146 | parser.execute(chunk, 0, chunk.length); 147 | if (body.length === 0) { 148 | parser.onBody(body, 0, body.length); 149 | } 150 | cb(null, body, headers); 151 | }; 152 | } 153 | 154 | it('should be able to work with WARC files', function(t) { 155 | t.plan(6); 156 | var f = fixture(warcFile); 157 | var w = new WARCStream(); 158 | var parser = simpleHttpParser(); 159 | var count = 0; 160 | f 161 | .pipe(w) 162 | .pipe(through2.obj( 163 | function (data, enc, cb) { 164 | if (data.headers['Content-Type'] === 'application/http; msgtype=response') { 165 | parser(data.content, function (err, body, headers) { 166 | if (count === 1) { 167 | var expected = { 168 | date: 'Thu, 21 Aug 2014 05:52:57 GMT', 169 | server: 'Apache', 170 | 'x-powered-by': 'PHP/5.3.8-1+b1', 171 | 'content-length': '7204', 172 | connection: 'close', 173 | 'content-type': 'text/html; charset=utf-8' }; 174 | t.deepEqual(headers, expected); 175 | t.assert(~body.toString().indexOf('')); 176 | } 177 | }); 178 | t.ok(data.content.length); 179 | count++; 180 | } 181 | cb(); 182 | })); 183 | }); 184 | -------------------------------------------------------------------------------- /test/edge.js: -------------------------------------------------------------------------------- 1 | var it = require('tape'), 2 | path = require('path'), 3 | fs = require('fs'), 4 | through2 = require('through2'), 5 | WARCStream = require('..'); 6 | 7 | it('should be able to parse the protocol', function(t) { 8 | t.plan(1); 9 | var w = new WARCStream(); 10 | w.on('protocol', function (protocol) { 11 | t.equal('WARC/1.0', protocol); 12 | }) 13 | w.write(new Buffer('WARC/1.0\r\n')); 14 | }); 15 | 16 | it('should be able to parse a split', function(t) { 17 | t.plan(1); 18 | var w = new WARCStream(); 19 | w.on('protocol', function (protocol) { 20 | t.equal('WARC/1.0', protocol); 21 | }) 22 | w.write(new Buffer('WARC/')); 23 | w.write(new Buffer('1.0')); 24 | w.write(new Buffer('\r\n')); 25 | }); 26 | 27 | it('should be able to parse the headers', function(t) { 28 | t.plan(2); 29 | var w = new WARCStream(); 30 | w.on('protocol', function (protocol) { 31 | t.equal('WARC/1.0', protocol); 32 | }) 33 | w.on('headers', function (headers) { 34 | var expected = { 35 | 'Content-Type': 'application/warc-fields', 36 | 'Content-Length': '108' }; 37 | t.deepEquals(headers, expected); 38 | }); 39 | var buf = new Buffer([ 40 | 'WARC/1.0', 41 | 'Content-Type: application/warc-fields', 42 | 'Content-Length: 108', 43 | '', 44 | '' 45 | ].join('\r\n')); 46 | var chunks = [ 47 | 'WARC/1.0\r\n' + 48 | 'Content-Type: application/warc-fields\r\n' + 49 | 'Content-Length: 108\r\n' + 50 | '\r\n' 51 | ]; 52 | chunks.forEach(function (chunk) { 53 | var buf = new Buffer(chunk); 54 | w.write(buf); 55 | }); 56 | }); 57 | 58 | it('should be able to parse split headers', function(t) { 59 | t.plan(2); 60 | var w = new WARCStream(); 61 | w.on('protocol', function (protocol) { 62 | t.equal('WARC/1.0', protocol); 63 | }) 64 | w.on('headers', function (headers) { 65 | var expected = { 66 | 'Content-Type': 'application/warc-fields', 67 | 'Content-Length': '108' }; 68 | t.deepEquals(headers, expected); 69 | }); 70 | var chunks = [ 71 | 'WARC/1.0\r\n', 72 | 'Content-Type: ', 'application/warc-fields\r\n', 73 | 'Content-Len', 'gth: 108', '\r\n', 74 | '\r\n' 75 | ]; 76 | 77 | chunks.forEach(function (chunk) { 78 | var buf = new Buffer(chunk); 79 | w.write(buf); 80 | }); 81 | }); 82 | 83 | it('should be able to parse content', function(t) { 84 | t.plan(3); 85 | var w = new WARCStream(); 86 | w.on('protocol', function (protocol) { 87 | t.equal(protocol, 'WARC/1.0'); 88 | }) 89 | w.on('headers', function (headers) { 90 | var expected = { 91 | 'Content-Type': 'application/warc-fields', 92 | 'Content-Length': '108' }; 93 | t.deepEquals(headers, expected); 94 | }); 95 | w.on('content', function (content) { 96 | var expected = [ 97 | 'Software-Info: ia-web-commons.1.0-SNAPSHOT-20140819100050\r\n', 98 | 'Extracted-Date: Tue, 09 Sep 2014 10:14:59 GMT\r\n', 99 | '\r\n'].join(''); 100 | t.equals(content.toString(), expected); 101 | }); 102 | var chunks = [ 103 | 'WARC/1.0\r\n', 104 | 'Content-Type: ', 'application/warc-fields\r\n', 105 | 'Content-Len', 'gth: 108', '\r\n', 106 | '\r\n', 107 | 'Software-Info: ia-web-commons.1.0-SNAPSHOT-20140819100050\r\n', 108 | 'Extracted-Date: Tue, 09 Sep 2014 10:14:59 GMT\r\n', 109 | '\r\n' 110 | ]; 111 | 112 | chunks.forEach(function (chunk) { 113 | var buf = new Buffer(chunk); 114 | w.write(buf); 115 | }); 116 | }); 117 | 118 | it('should be able to parse split content', function(t) { 119 | t.plan(3); 120 | var w = new WARCStream(); 121 | w.on('protocol', function (protocol) { 122 | t.equal(protocol, 'WARC/1.0'); 123 | }) 124 | w.on('headers', function (headers) { 125 | var expected = { 126 | 'Content-Type': 'application/warc-fields', 127 | 'Content-Length': '108' }; 128 | t.deepEquals(headers, expected); 129 | }); 130 | w.on('content', function (content) { 131 | var expected = [ 132 | 'Software-Info: ia-web-commons.1.0-SNAPSHOT-20140819100050\r\n', 133 | 'Extracted-Date: Tue, 09 Sep 2014 10:14:59 GMT\r\n', 134 | '\r\n'].join(''); 135 | t.equals(content.toString(), expected); 136 | }); 137 | var chunks = [ 138 | 'WARC/1.0\r\n', 139 | 'Content-Type: ', 'application/warc-fields\r\n', 140 | 'Content-Len', 'gth: 108', '\r\n', 141 | '\r\n', 142 | 'Software-Info:', ' ia-web-commons.1.0-SNAPSHOT-20140819100050\r\n', 143 | 'Extracted-Date: Tue, ', '09 Sep 2014 10:14:59 GMT', '\r\n', 144 | '\r\n' 145 | ]; 146 | 147 | chunks.forEach(function (chunk) { 148 | var buf = new Buffer(chunk); 149 | w.write(buf); 150 | }); 151 | }); 152 | 153 | it('should be able to parse across a packet boundary', function(t) { 154 | t.plan(6); 155 | var w = new WARCStream(); 156 | w.on('protocol', function (protocol) { 157 | t.equal(protocol, 'WARC/1.0'); 158 | }) 159 | w.on('headers', function (headers) { 160 | var expected = { 161 | 'Content-Type': 'application/warc-fields', 162 | 'Content-Length': '108' }; 163 | t.deepEquals(headers, expected); 164 | }); 165 | w.on('content', function (content) { 166 | var expected = [ 167 | 'Software-Info: ia-web-commons.1.0-SNAPSHOT-20140819100050\r\n', 168 | 'Extracted-Date: Tue, 09 Sep 2014 10:14:59 GMT\r\n', 169 | '\r\n'].join(''); 170 | t.equals(content.toString(), expected); 171 | }); 172 | var chunks = [ 173 | 'WARC/1.0\r\n', 174 | 'Content-Type: ', 'application/warc-fields\r\n', 175 | 'Content-Len', 'gth: 108', '\r\n', 176 | '\r\n', 177 | 'Software-Info:', ' ia-web-commons.1.0-SNAPSHOT-20140819100050\r\n', 178 | 'Extracted-Date: Tue, ', '09 Sep 2014 10:14:59 GMT', '\r\n', 179 | '\r\n', 180 | 181 | '\r\n', 182 | '\r\n', 183 | 184 | 'WARC/1.0\r\n', 185 | 'Content-Type: ', 'application/warc-fields\r\n', 186 | 'Content-Len', 'gth: 108', '\r\n', 187 | '\r\n', 188 | 'Software-Info:', ' ia-web-commons.1.0-SNAPSHOT-20140819100050\r\n', 189 | 'Extracted-Date: Tue, ', '09 Sep 2014 10:14:59 GMT', '\r\n', 190 | '\r\n' 191 | ]; 192 | 193 | chunks.forEach(function (chunk) { 194 | var buf = new Buffer(chunk); 195 | w.write(buf); 196 | }); 197 | }); 198 | -------------------------------------------------------------------------------- /test/fixtures/CC-MAIN-20140820021334-00006-ip-10-180-136-8.ec2.internal.warc.wat: -------------------------------------------------------------------------------- 1 | WARC/1.0 2 | WARC-Type: warcinfo 3 | WARC-Date: 2014-09-09T10:14:59Z 4 | WARC-Filename: CC-MAIN-20140820021334-00006-ip-10-180-136-8.ec2.internal.warc.gz 5 | WARC-Record-ID: 6 | Content-Type: application/warc-fields 7 | Content-Length: 108 8 | 9 | Software-Info: ia-web-commons.1.0-SNAPSHOT-20140819100050 10 | Extracted-Date: Tue, 09 Sep 2014 10:14:59 GMT 11 | 12 | 13 | 14 | WARC/1.0 15 | WARC-Type: metadata 16 | WARC-Target-URI: CC-MAIN-20140820021334-00006-ip-10-180-136-8.ec2.internal.warc.gz 17 | WARC-Date: 2014-09-05T15:18:43Z 18 | WARC-Record-ID: 19 | WARC-Refers-To: 20 | Content-Type: application/json 21 | Content-Length: 1178 22 | 23 | {"Envelope":{"Format":"WARC","WARC-Header-Length":"273","Block-Digest":"sha1:P6VMFZQH6KPMHAYZ3VCWJYNYJLQETZS6","Actual-Content-Length":"371","WARC-Header-Metadata":{"WARC-Type":"warcinfo","WARC-Filename":"CC-MAIN-20140820021334-00006-ip-10-180-136-8.ec2.internal.warc.gz","WARC-Date":"2014-09-05T15:18:43Z","Content-Length":"371","WARC-Record-ID":"","Content-Type":"application/warc-fields"},"Payload-Metadata":{"Trailing-Slop-Length":"0","Actual-Content-Type":"application/warc-fields","Actual-Content-Length":"371","Headers-Corrupt":true,"WARC-Info-Metadata":{"robots":"classic","software":"Nutch 1.6 (CC)/CC WarcExport 1.0","description":"Wide crawl of the web with URLs provided by Blekko for August 2014","hostname":"ip-10-180-136-8.ec2.internal","format":"WARC File Format 1.0","isPartOf":"CC-MAIN-2014-35","operator":"CommonCrawl Admin","publisher":"CommonCrawl"}}},"Container":{"Compressed":true,"Gzip-Metadata":{"Footer-Length":"8","Deflate-Length":"454","Header-Length":"10","Inflated-CRC":"-759485782","Inflated-Length":"648"},"Offset":"0","Filename":"CC-MAIN-20140820021334-00006-ip-10-180-136-8.ec2.internal.warc.gz"}} 24 | 25 | WARC/1.0 26 | WARC-Type: metadata 27 | WARC-Target-URI: http://0.r.msn.com/?ld=7v7Pf0o6dfvcggjmXvvsEKhzVUCUxwxRmKzEhcbUqMsh2Ubu9FZw1vPvSOUQKjNaf9lLFIpVKW3sQMR6aOgbPhwm9WR843zZRpT1jbKN7YgaGETlBJG5fdKcfifIi9WSQu9hAx6A&u=www.sportsmanias.com%2Frumors 28 | WARC-Date: 2014-08-21T04:21:14Z 29 | WARC-Record-ID: 30 | WARC-Refers-To: 31 | Content-Type: application/json 32 | Content-Length: 1694 33 | 34 | {"Envelope":{"Format":"WARC","WARC-Header-Length":"507","Block-Digest":"sha1:DI5K5RNSGZ6YJAMJYLNTHAZ5JJLHRW7L","Actual-Content-Length":"422","WARC-Header-Metadata":{"WARC-Type":"request","WARC-Date":"2014-08-21T04:21:14Z","WARC-Warcinfo-ID":"","Content-Length":"422","WARC-Record-ID":"","WARC-Target-URI":"http://0.r.msn.com/?ld=7v7Pf0o6dfvcggjmXvvsEKhzVUCUxwxRmKzEhcbUqMsh2Ubu9FZw1vPvSOUQKjNaf9lLFIpVKW3sQMR6aOgbPhwm9WR843zZRpT1jbKN7YgaGETlBJG5fdKcfifIi9WSQu9hAx6A&u=www.sportsmanias.com%2Frumors","WARC-IP-Address":"65.52.108.2","Content-Type":"application/http; msgtype=request"},"Payload-Metadata":{"Trailing-Slop-Length":"4","HTTP-Request-Metadata":{"Headers":{"Accept-Language":"en-us,en-gb,en;q=0.7,*;q=0.3","Host":"0.r.msn.com","Accept-Encoding":"x-gzip, gzip, deflate","User-Agent":"CCBot/2.0 (http://commoncrawl.org/faq/)","Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"},"Headers-Length":"420","Entity-Length":"0","Entity-Trailing-Slop-Bytes":"0","Request-Message":{"Method":"GET","Version":"HTTP/1.0","Path":"/?ld=7v7Pf0o6dfvcggjmXvvsEKhzVUCUxwxRmKzEhcbUqMsh2Ubu9FZw1vPvSOUQKjNaf9lLFIpVKW3sQMR6aOgbPhwm9WR843zZRpT1jbKN7YgaGETlBJG5fdKcfifIi9WSQu9hAx6A&u=www.sportsmanias.com%2Frumors"},"Entity-Digest":"sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ"},"Actual-Content-Type":"application/http; msgtype=request"}},"Container":{"Compressed":true,"Gzip-Metadata":{"Footer-Length":"8","Deflate-Length":"557","Header-Length":"10","Inflated-CRC":"248420074","Inflated-Length":"933"},"Offset":"454","Filename":"CC-MAIN-20140820021334-00006-ip-10-180-136-8.ec2.internal.warc.gz"}} 35 | 36 | WARC/1.0 37 | WARC-Type: metadata 38 | WARC-Target-URI: http://0.r.msn.com/?ld=7v7Pf0o6dfvcggjmXvvsEKhzVUCUxwxRmKzEhcbUqMsh2Ubu9FZw1vPvSOUQKjNaf9lLFIpVKW3sQMR6aOgbPhwm9WR843zZRpT1jbKN7YgaGETlBJG5fdKcfifIi9WSQu9hAx6A&u=www.sportsmanias.com%2Frumors 39 | WARC-Date: 2014-08-21T04:21:14Z 40 | WARC-Record-ID: 41 | WARC-Refers-To: 42 | Content-Type: application/json 43 | Content-Length: 1646 44 | 45 | {"Envelope":{"Format":"WARC","WARC-Header-Length":"696","Block-Digest":"sha1:UHJK3TXZIQRATBF4CIGW33NQ4QAGTE4M","Actual-Content-Length":"174","WARC-Header-Metadata":{"WARC-Type":"response","WARC-Date":"2014-08-21T04:21:14Z","WARC-Warcinfo-ID":"","Content-Length":"174","WARC-Record-ID":"","WARC-Block-Digest":"sha1:UHJK3TXZIQRATBF4CIGW33NQ4QAGTE4M","WARC-Payload-Digest":"sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ","WARC-Target-URI":"http://0.r.msn.com/?ld=7v7Pf0o6dfvcggjmXvvsEKhzVUCUxwxRmKzEhcbUqMsh2Ubu9FZw1vPvSOUQKjNaf9lLFIpVKW3sQMR6aOgbPhwm9WR843zZRpT1jbKN7YgaGETlBJG5fdKcfifIi9WSQu9hAx6A&u=www.sportsmanias.com%2Frumors","WARC-IP-Address":"65.52.108.2","WARC-Concurrent-To":"","Content-Type":"application/http; msgtype=response"},"Payload-Metadata":{"Trailing-Slop-Length":"4","Actual-Content-Type":"application/http; msgtype=response","HTTP-Response-Metadata":{"Headers":{"p3p":"CP=BUS CUR CONo FIN IVDo ONL OUR PHY SAMo TELo","Date":"Thu, 21 Aug 2014 04:21:14 GMT","Content-Length":"0","Connection":"close","Server":"Microsoft-IIS/8.0"},"Headers-Length":"174","Entity-Length":"0","Entity-Trailing-Slop-Bytes":"0","Response-Message":{"Status":"200","Version":"HTTP/1.1","Reason":"OK"},"Entity-Digest":"sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ"}}},"Container":{"Compressed":true,"Gzip-Metadata":{"Footer-Length":"8","Deflate-Length":"638","Header-Length":"10","Inflated-CRC":"16556146","Inflated-Length":"874"},"Offset":"1011","Filename":"CC-MAIN-20140820021334-00006-ip-10-180-136-8.ec2.internal.warc.gz"}} 46 | 47 | WARC/1.0 48 | WARC-Type: metadata 49 | WARC-Target-URI: http://0.r.msn.com/?ld=7v7Pf0o6dfvcggjmXvvsEKhzVUCUxwxRmKzEhcbUqMsh2Ubu9FZw1vPvSOUQKjNaf9lLFIpVKW3sQMR6aOgbPhwm9WR843zZRpT1jbKN7YgaGETlBJG5fdKcfifIi9WSQu9hAx6A&u=www.sportsmanias.com%2Frumors 50 | WARC-Date: 2014-08-21T04:21:14Z 51 | WARC-Record-ID: 52 | WARC-Refers-To: 53 | Content-Type: application/json 54 | Content-Length: 1206 55 | 56 | {"Envelope":{"Format":"WARC","WARC-Header-Length":"536","Block-Digest":"sha1:QOCGDJGXOEQRPJC7ZSJNH66GLUOGIMOM","Actual-Content-Length":"19","WARC-Header-Metadata":{"WARC-Type":"metadata","WARC-Date":"2014-08-21T04:21:14Z","WARC-Warcinfo-ID":"","Content-Length":"19","WARC-Record-ID":"","WARC-Target-URI":"http://0.r.msn.com/?ld=7v7Pf0o6dfvcggjmXvvsEKhzVUCUxwxRmKzEhcbUqMsh2Ubu9FZw1vPvSOUQKjNaf9lLFIpVKW3sQMR6aOgbPhwm9WR843zZRpT1jbKN7YgaGETlBJG5fdKcfifIi9WSQu9hAx6A&u=www.sportsmanias.com%2Frumors","WARC-Concurrent-To":"","Content-Type":"application/warc-fields"},"Payload-Metadata":{"Trailing-Slop-Length":"4","WARC-Metadata-Metadata":{"Trailing-Slop-Length":"0","Metadata-Records":[{"Name":"fetchTimeMs","Value":"43"}],"Actual-Content-Length":"19"},"Actual-Content-Type":"application/metadata-fields"}},"Container":{"Compressed":true,"Gzip-Metadata":{"Footer-Length":"8","Deflate-Length":"434","Header-Length":"10","Inflated-CRC":"563332706","Inflated-Length":"559"},"Offset":"1649","Filename":"CC-MAIN-20140820021334-00006-ip-10-180-136-8.ec2.internal.warc.gz"}} 57 | 58 | WARC/1.0 59 | WARC-Type: metadata 60 | WARC-Target-URI: http://0pointer.de/photos/?gallery=Australia&photo=496 61 | WARC-Date: 2014-08-21T05:52:58Z 62 | WARC-Record-ID: 63 | WARC-Refers-To: 64 | Content-Type: application/json 65 | Content-Length: 1424 66 | 67 | {"Envelope":{"Format":"WARC","WARC-Header-Length":"372","Block-Digest":"sha1:6M4H6J5EE7LTEZNE5UJID3T77VLNXYIK","Actual-Content-Length":"285","WARC-Header-Metadata":{"WARC-Type":"request","WARC-Date":"2014-08-21T05:52:58Z","WARC-Warcinfo-ID":"","Content-Length":"285","WARC-Record-ID":"","WARC-Target-URI":"http://0pointer.de/photos/?gallery=Australia&photo=496","WARC-IP-Address":"85.214.72.216","Content-Type":"application/http; msgtype=request"},"Payload-Metadata":{"Trailing-Slop-Length":"4","HTTP-Request-Metadata":{"Headers":{"Accept-Language":"en-us,en-gb,en;q=0.7,*;q=0.3","Host":"0pointer.de","Accept-Encoding":"x-gzip, gzip, deflate","User-Agent":"CCBot/2.0 (http://commoncrawl.org/faq/)","Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"},"Headers-Length":"283","Entity-Length":"0","Entity-Trailing-Slop-Bytes":"0","Request-Message":{"Method":"GET","Version":"HTTP/1.0","Path":"/photos/?gallery=Australia&photo=496"},"Entity-Digest":"sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ"},"Actual-Content-Type":"application/http; msgtype=request"}},"Container":{"Compressed":true,"Gzip-Metadata":{"Footer-Length":"8","Deflate-Length":"437","Header-Length":"10","Inflated-CRC":"1822023262","Inflated-Length":"661"},"Offset":"2083","Filename":"CC-MAIN-20140820021334-00006-ip-10-180-136-8.ec2.internal.warc.gz"}} 68 | 69 | WARC/1.0 70 | WARC-Type: metadata 71 | WARC-Target-URI: http://0pointer.de/photos/?gallery=Australia&photo=496 72 | WARC-Date: 2014-08-21T05:52:58Z 73 | WARC-Record-ID: 74 | WARC-Refers-To: 75 | Content-Type: application/json 76 | Content-Length: 5356 77 | 78 | {"Envelope":{"Format":"WARC","WARC-Header-Length":"562","Block-Digest":"sha1:AYF2WX53LMGKI5AH2DGOBCMFQR7SICSS","Actual-Content-Length":"7387","WARC-Header-Metadata":{"WARC-Type":"response","WARC-Date":"2014-08-21T05:52:58Z","WARC-Warcinfo-ID":"","Content-Length":"7387","WARC-Record-ID":"","WARC-Block-Digest":"sha1:AYF2WX53LMGKI5AH2DGOBCMFQR7SICSS","WARC-Payload-Digest":"sha1:FQGN6GPYXVZ7SHCJRPDRTHW2GP6M6KAJ","WARC-Target-URI":"http://0pointer.de/photos/?gallery=Australia&photo=496","WARC-IP-Address":"85.214.72.216","WARC-Concurrent-To":"","Content-Type":"application/http; msgtype=response"},"Payload-Metadata":{"Trailing-Slop-Length":"4","Actual-Content-Type":"application/http; msgtype=response","HTTP-Response-Metadata":{"Headers":{"Date":"Thu, 21 Aug 2014 05:52:57 GMT","Content-Length":"7204","Content-Type":"text/html; charset=utf-8","Connection":"close","X-Powered-By":"PHP/5.3.8-1+b1","Server":"Apache"},"Headers-Length":"183","Entity-Length":"7204","Entity-Trailing-Slop-Bytes":"0","Response-Message":{"Status":"200","Version":"HTTP/1.1","Reason":"OK"},"HTML-Metadata":{"Links":[{"text":"dark","title":"dark","path":"A@/href","url":"#"},{"text":"classic","title":"classic","path":"A@/href","url":"#"},{"text":"gorilla","title":"gorilla","path":"A@/href","url":"#"},{"text":"Photo Gallery","path":"A@/href","url":"http://0pointer.de/photos/"},{"text":"Photo Gallery Index","path":"A@/href","url":"/photos/"},{"text":"Australia","path":"A@/href","url":"/photos/?gallery=Australia"},{"title":"Show Thumbnail Navigation","path":"A@/href","url":"/photos/?gallery=Australia&photo=496&exif_style=&show_thumbs=yes"},{"alt":"496","path":"IMG@/src","url":"./galleries/Australia/lq/img-496.jpg"},{"alt":"Previous","path":"IMG@/src","url":"./galleries/Australia/thumbs/img-495.jpg"},{"text":"Previous","path":"A@/href","url":"/photos/?gallery=Australia&photo=495&exif_style=&show_thumbs="},{"alt":"Next","path":"IMG@/src","url":"./galleries/Australia/thumbs/img-497.jpg"},{"text":"Next","path":"A@/href","url":"/photos/?gallery=Australia&photo=497&exif_style=&show_thumbs="},{"text":"display table","path":"A@/href","url":"/photos/?galerie=Australia&photo=496&exif_style=descriptive&show_thumbs="},{"text":"MQ","path":"A@/href","url":"./galleries/Australia/mq/img-496.jpg"},{"text":"Show FormHide Form","path":"A@/href","url":"javascript:toggle_comment()"},{"path":"FORM@/action","method":"post","url":"/photos/?gallery=Australia&photo=496"},{"text":"Name:","title":"Enter your name.","path":"A@/href","url":"#"},{"text":"Remember Name:","title":"Should the browser remember your name?","path":"A@/href","url":"#"},{"text":"Retype PIN Above:","title":"Enter the number shown above.","path":"A@/href","url":"#"},{"text":"Comment:","title":"Allowed HTML tags: a,b,i,ul,li,blockquote,br.","path":"A@/href","url":"#"},{"text":"< Previous","path":"A@/href","url":"/photos/?gallery=Australia&photo=495&exif_style=&show_thumbs="},{"text":"Next >","path":"A@/href","url":"/photos/?gallery=Australia&photo=497&exif_style=&show_thumbs="},{"text":"All Panoramas","path":"A@/href","url":"/static/panoramas.cgi"},{"text":"Special Photo Series","path":"A@/href","url":"/static/index.cgi"},{"text":"Lennart's Blog","path":"A@/href","url":"http://0pointer.de/blog"},{"text":"Lennart's Homepage","path":"A@/href","url":"http://0pointer.de/lennart/"},{"text":"Lennart's Photos","path":"A@/href","url":"http://0pointer.de/photos/"},{"text":"Impressum/Imprint","path":"A@/href","url":"http://0pointer.de/imprint"},{"alt":"Creative Commons License","path":"IMG@/src","url":"http://creativecommons.org/images/public/somerights20.png"},{"path":"A@/href","url":"http://creativecommons.org/licenses/by-sa/3.0/"},{"text":"Creative Commons Attribution-ShareAlike 3.0 License","path":"A@/href","url":"http://creativecommons.org/licenses/by-sa/3.0/"}],"Head":{"Link":[{"path":"LINK@/href","rel":"icon","type":"image/png","url":"stock_camera-16.png"},{"path":"LINK@/href","rel":"shortcut icon","type":"image/x-icon","url":"favicon.ico"},{"path":"LINK@/href","rel":"Top","url":"/photos/"},{"path":"LINK@/href","rel":"First","url":"/photos/?gallery=Australia&photo=1"},{"path":"LINK@/href","rel":"Previous","url":"/photos/?gallery=Australia&photo=495"},{"path":"LINK@/href","rel":"Next","url":"/photos/?gallery=Australia&photo=497"},{"path":"LINK@/href","rel":"Last","url":"/photos/?gallery=Australia&photo=1242"},{"path":"LINK@/href","rel":"stylesheet","type":"text/css","url":"inc/styles/dark/dark.css"},{"path":"LINK@/href","rel":"alternate stylesheet","type":"text/css","url":"inc/styles/classic/classic.css"},{"path":"LINK@/href","rel":"alternate stylesheet","type":"text/css","url":"inc/styles/gorilla/gorilla.css"}],"Scripts":[{"path":"SCRIPT@/src","type":"text/javascript","url":"inc/global.js"}],"Title":"Photos"}},"Entity-Digest":"sha1:FQGN6GPYXVZ7SHCJRPDRTHW2GP6M6KAJ"}}},"Container":{"Compressed":true,"Gzip-Metadata":{"Footer-Length":"8","Deflate-Length":"2828","Header-Length":"10","Inflated-CRC":"-274687686","Inflated-Length":"7953"},"Offset":"2520","Filename":"CC-MAIN-20140820021334-00006-ip-10-180-136-8.ec2.internal.warc.gz"}} 79 | 80 | WARC/1.0 81 | WARC-Type: metadata 82 | WARC-Target-URI: http://0pointer.de/photos/?gallery=Australia&photo=496 83 | WARC-Date: 2014-08-21T05:52:58Z 84 | WARC-Record-ID: 85 | WARC-Refers-To: 86 | Content-Type: application/json 87 | Content-Length: 1071 88 | 89 | {"Envelope":{"Format":"WARC","WARC-Header-Length":"399","Block-Digest":"sha1:UKOGIISFNHG3LXY37OW4VTRT63NIYPOD","Actual-Content-Length":"20","WARC-Header-Metadata":{"WARC-Type":"metadata","WARC-Date":"2014-08-21T05:52:58Z","WARC-Warcinfo-ID":"","Content-Length":"20","WARC-Record-ID":"","WARC-Target-URI":"http://0pointer.de/photos/?gallery=Australia&photo=496","WARC-Concurrent-To":"","Content-Type":"application/warc-fields"},"Payload-Metadata":{"Trailing-Slop-Length":"4","WARC-Metadata-Metadata":{"Trailing-Slop-Length":"0","Metadata-Records":[{"Name":"fetchTimeMs","Value":"980"}],"Actual-Content-Length":"20"},"Actual-Content-Type":"application/metadata-fields"}},"Container":{"Compressed":true,"Gzip-Metadata":{"Footer-Length":"8","Deflate-Length":"310","Header-Length":"10","Inflated-CRC":"1018636576","Inflated-Length":"423"},"Offset":"5348","Filename":"CC-MAIN-20140820021334-00006-ip-10-180-136-8.ec2.internal.warc.gz"}} 90 | 91 | WARC/1.0 92 | WARC-Type: metadata 93 | WARC-Target-URI: http://0pointer.de/photos/?gallery=Thailand%202011-01&photo=726&show_thumbs=yes&exif_style= 94 | WARC-Date: 2014-08-21T06:06:05Z 95 | WARC-Record-ID: 96 | WARC-Refers-To: 97 | Content-Type: application/json 98 | Content-Length: 1498 99 | 100 | {"Envelope":{"Format":"WARC","WARC-Header-Length":"409","Block-Digest":"sha1:XLWBHSGXSYUVQL2L5MVTSXRCUT5MEUVY","Actual-Content-Length":"322","WARC-Header-Metadata":{"WARC-Type":"request","WARC-Date":"2014-08-21T06:06:05Z","WARC-Warcinfo-ID":"","Content-Length":"322","WARC-Record-ID":"","WARC-Target-URI":"http://0pointer.de/photos/?gallery=Thailand%202011-01&photo=726&show_thumbs=yes&exif_style=","WARC-IP-Address":"85.214.72.216","Content-Type":"application/http; msgtype=request"},"Payload-Metadata":{"Trailing-Slop-Length":"4","HTTP-Request-Metadata":{"Headers":{"Accept-Language":"en-us,en-gb,en;q=0.7,*;q=0.3","Host":"0pointer.de","Accept-Encoding":"x-gzip, gzip, deflate","User-Agent":"CCBot/2.0 (http://commoncrawl.org/faq/)","Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"},"Headers-Length":"320","Entity-Length":"0","Entity-Trailing-Slop-Bytes":"0","Request-Message":{"Method":"GET","Version":"HTTP/1.0","Path":"/photos/?gallery=Thailand%202011-01&photo=726&show_thumbs=yes&exif_style="},"Entity-Digest":"sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ"},"Actual-Content-Type":"application/http; msgtype=request"}},"Container":{"Compressed":true,"Gzip-Metadata":{"Footer-Length":"8","Deflate-Length":"465","Header-Length":"10","Inflated-CRC":"1464693846","Inflated-Length":"735"},"Offset":"5658","Filename":"CC-MAIN-20140820021334-00006-ip-10-180-136-8.ec2.internal.warc.gz"}} 101 | 102 | --------------------------------------------------------------------------------