├── .github └── workflows │ └── maven.yml ├── .gitignore ├── LICENSE ├── README.md ├── SOURCE ├── comparison ├── pom.xml └── src │ ├── Bench.java │ └── EncodedWords.java ├── pom.xml ├── resources └── org │ └── netpreserve │ └── jwarc │ └── net │ ├── inject.js │ ├── recorder-sw.js │ ├── recorder.html │ └── sw.js ├── src └── org │ └── netpreserve │ └── jwarc │ ├── BrotliUtils.java │ ├── ChunkedBody.java │ ├── ChunkedBody.rl │ ├── ConcurrentRecordSet.java │ ├── DecodedBody.java │ ├── DigestingMessageBody.java │ ├── FetchOptions.java │ ├── FetchResult.java │ ├── GeminiParser.java │ ├── GeminiParser.rl │ ├── GeminiRequest.java │ ├── GeminiResponse.java │ ├── GunzipChannel.java │ ├── GzipChannel.java │ ├── HeaderValidator.java │ ├── HttpMessage.java │ ├── HttpParser.java │ ├── HttpParser.rl │ ├── HttpRequest.java │ ├── HttpResponse.java │ ├── IOUtils.java │ ├── InetAddresses.java │ ├── InflateChannel.java │ ├── LengthedBody.java │ ├── MediaType.java │ ├── MediaType.rl │ ├── Message.java │ ├── MessageBody.java │ ├── MessageHeaders.java │ ├── MessageParser.java │ ├── MessageVersion.java │ ├── ParsingException.java │ ├── URIs.java │ ├── WarcCaptureRecord.java │ ├── WarcCompression.java │ ├── WarcContinuation.java │ ├── WarcConversion.java │ ├── WarcDigest.java │ ├── WarcFilter.java │ ├── WarcFilterCompiler.java │ ├── WarcFilterException.java │ ├── WarcFilterLexer.java │ ├── WarcMetadata.java │ ├── WarcParser.java │ ├── WarcParser.rl │ ├── WarcPayload.java │ ├── WarcReader.java │ ├── WarcRecord.java │ ├── WarcRequest.java │ ├── WarcResource.java │ ├── WarcResponse.java │ ├── WarcRevisit.java │ ├── WarcTargetRecord.java │ ├── WarcTruncationReason.java │ ├── WarcWriter.java │ ├── Warcinfo.java │ ├── cdx │ ├── CdxFields.java │ ├── CdxFormat.java │ ├── CdxReader.java │ ├── CdxRecord.java │ ├── CdxRequestEncoder.java │ ├── CdxWriter.java │ ├── JsonException.java │ ├── JsonToken.java │ └── JsonTokenizer.java │ ├── net │ ├── Browser.java │ ├── Capture.java │ ├── CaptureIndex.java │ 
├── CertificateAuthority.java │ ├── HttpExchange.java │ ├── HttpHandler.java │ ├── HttpServer.java │ ├── WarcRecorder.java │ ├── WarcRenderer.java │ ├── WarcServer.java │ └── package-info.java │ ├── package-info.java │ └── tools │ ├── CdxTool.java │ ├── DedupeTool.java │ ├── ExtractTool.java │ ├── FetchTool.java │ ├── FilterTool.java │ ├── ListTool.java │ ├── RecordTool.java │ ├── RecorderTool.java │ ├── SavebackTool.java │ ├── ScreenshotTool.java │ ├── ServeTool.java │ ├── StatsTool.java │ ├── Utils.java │ ├── ValidateTool.java │ ├── WarcTool.java │ └── package-info.java ├── test-resources └── org │ └── netpreserve │ └── jwarc │ ├── cc.warc.gz │ └── gzip_extra_sl.warc.gz └── test └── org └── netpreserve └── jwarc ├── ChunkedBodyTest.java ├── GunzipChannelTest.java ├── GzipChannelTest.java ├── HeaderValidatorTest.java ├── HttpParserTest.java ├── HttpRequestTest.java ├── HttpResponseTest.java ├── InetAddressesTest.java ├── LengthedBodyTest.java ├── MessageHeadersTest.java ├── URIsTest.java ├── WarcDigestTest.java ├── WarcParserTest.java ├── WarcRecordTest.java ├── WarcTargetRecordTest.java ├── apitests ├── ArcTest.java ├── MediaTypeTest.java ├── MessageVersionTest.java ├── StandardExamples.java ├── WarcContinuationTest.java ├── WarcConversionTest.java ├── WarcFilterTest.java ├── WarcMetadataTest.java ├── WarcReaderTest.java ├── WarcRequestTest.java ├── WarcResourceTest.java ├── WarcResponseTest.java ├── WarcRevisitTest.java ├── WarcWriterTest.java └── WarcinfoTest.java ├── cdx ├── CdxFormatTest.java ├── CdxReaderTest.java ├── CdxRequestEncoderTest.java ├── CdxWriterTest.java └── JsonTokenizerTest.java └── net └── WarcServerTest.java /.github/workflows/maven.yml: -------------------------------------------------------------------------------- 1 | name: Java CI with Maven 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v2 
16 | - name: Set up JDK 11 17 | uses: actions/setup-java@v2 18 | with: 19 | java-version: '11' 20 | distribution: 'temurin' 21 | cache: maven 22 | - name: Build with Maven 23 | run: mvn -B package --file pom.xml 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | target 3 | *.iml -------------------------------------------------------------------------------- /SOURCE: -------------------------------------------------------------------------------- 1 | The native binary versions of jwarc are built with GraalVM's native-image tool and may contain compiled versions of 2 | code licensed under the GPL 2 with Classpath Exception and/or LGPL. The corresponding source code is located at: 3 | 4 | https://github.com/oracle/graal/archive/refs/tags/vm-ce-21.2.0.tar.gz 5 | 6 | In case these links change or become unavailable please check the following pages for updated information: 7 | 8 | https://github.com/iipc/jwarc/blob/master/SOURCE 9 | https://www.nla.gov.au/source-code 10 | 11 | Also see the upstream source offers in THIRD_PARTY_README. 12 | 13 | You may also request the source code, within the period required by the applicable license, by contacting the National 14 | Library of Australia. 15 | 16 | Source code request c/o Digital Division 17 | National Library of Australia 18 | Canberra ACT 2600 19 | AUSTRALIA 20 | 21 | Please include in your request your contact details, the name and version of the binary, the date you received it 22 | and if possible the URL you downloaded it from. 
-------------------------------------------------------------------------------- /comparison/pom.xml: -------------------------------------------------------------------------------- 1 | 5 | 6 | 8 | 9 | 4.0.0 10 | org.netpreserve 11 | jwarc-comparison 12 | 0.1.0 13 | 14 | 15 | ${basedir}/src 16 | 17 | 18 | 19 | 20 | org.jwat 21 | jwat-warc 22 | 1.1.1 23 | 24 | 25 | org.netpreserve.commons 26 | webarchive-commons 27 | 1.1.9 28 | 29 | 30 | org.netpreserve 31 | jwarc 32 | 0.8.4-SNAPSHOT 33 | compile 34 | 35 | 36 | 37 | 38 | UTF-8 39 | 1.8 40 | 1.8 41 | 42 | 43 | -------------------------------------------------------------------------------- /comparison/src/Bench.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | import org.archive.io.ArchiveReader; 7 | import org.archive.io.ArchiveRecord; 8 | import org.archive.io.warc.WARCReaderFactory; 9 | import org.jwat.warc.WarcReader; 10 | import org.jwat.warc.WarcReaderFactory; 11 | import org.jwat.warc.WarcRecord; 12 | 13 | import java.io.File; 14 | import java.io.FileInputStream; 15 | import java.io.IOException; 16 | import java.nio.channels.FileChannel; 17 | import java.nio.file.Files; 18 | import java.nio.file.Paths; 19 | import java.util.function.Function; 20 | import java.util.zip.GZIPInputStream; 21 | 22 | public class Bench { 23 | 24 | @FunctionalInterface 25 | public interface ThrowingFunction { 26 | R apply(T t) throws E; 27 | } 28 | 29 | private static void bench(String name, ThrowingFunction func, String filename) { 30 | long start = System.currentTimeMillis(); 31 | try { 32 | String res = func.apply(filename); 33 | System.out.println(name + " " + res + " in " + (System.currentTimeMillis() - start) + "ms"); 34 | } catch(IOException e) { 35 | System.out.println(name + " failed after " + (System.currentTimeMillis() - start) + "ms throwing 
" + e); 36 | } 37 | } 38 | 39 | private static String gzip(String filename, int bufferSize) throws IOException { 40 | byte[] buf = new byte[bufferSize]; 41 | try (GZIPInputStream gzis = new GZIPInputStream(new FileInputStream(new File(filename)), bufferSize)) { 42 | while (true) { 43 | int n = gzis.read(buf); 44 | if (n < 0) { 45 | break; 46 | } 47 | } 48 | } 49 | return ""; 50 | } 51 | 52 | private static String gzip8k(String filename) throws IOException { 53 | return gzip(filename, 8192); 54 | } 55 | 56 | private static String gzip64k(String filename) throws IOException { 57 | return gzip(filename, 65536); 58 | } 59 | 60 | private static String webarchiveCommons(String filename) throws IOException { 61 | long count = 0; 62 | try (ArchiveReader reader = WARCReaderFactory.get(new File(filename))) { 63 | for (ArchiveRecord record : reader) { 64 | count++; 65 | } 66 | } 67 | return Long.toString(count); 68 | } 69 | 70 | private static String webarchiveCommonsNoDigest(String filename) throws IOException { 71 | long count = 0; 72 | try (ArchiveReader reader = WARCReaderFactory.get(new File(filename))) { 73 | reader.setDigest(false); 74 | for (ArchiveRecord record : reader) { 75 | count++; 76 | } 77 | } 78 | return Long.toString(count); 79 | } 80 | 81 | private static String jwat(String filename) throws IOException { 82 | long count = 0; 83 | try (WarcReader reader = WarcReaderFactory.getReader(new FileInputStream(filename))) { 84 | for (WarcRecord record : reader) { 85 | count++; 86 | } 87 | } 88 | return Long.toString(count); 89 | } 90 | 91 | private static String jwatBuff(String filename) throws IOException { 92 | long count = 0; 93 | try (WarcReader reader = WarcReaderFactory.getReader(new FileInputStream(filename), 8192)) { 94 | for (WarcRecord record : reader) { 95 | count++; 96 | } 97 | } 98 | return Long.toString(count); 99 | } 100 | 101 | private static String jwarc(String filename) throws IOException { 102 | long count = 0; 103 | try 
(org.netpreserve.jwarc.WarcReader reader = new org.netpreserve.jwarc.WarcReader(FileChannel.open(Paths.get(filename)))) { 104 | for (org.netpreserve.jwarc.WarcRecord record : reader) { 105 | count++; 106 | } 107 | } 108 | return Long.toString(count); 109 | } 110 | 111 | public static void main(String[] args) { 112 | String filename = args[0]; 113 | System.out.println("Benchmarking " + filename); 114 | 115 | int iterations = 3; 116 | 117 | try { 118 | Thread.sleep(1000); // sleep a short time to be able to attach a profiler 119 | } catch(Exception e) { 120 | } 121 | 122 | for (int i = 1; i <= iterations; i++) { 123 | System.out.println("iteration " + i); 124 | 125 | if (filename.endsWith(".gz")) { 126 | bench("gzipinputstream (buffer 8kB)", Bench::gzip8k, filename); 127 | bench("gzipinputstream (buffer 64kB)", Bench::gzip64k, filename); 128 | } 129 | 130 | bench("webarchive-commons", Bench::webarchiveCommons, filename); 131 | bench("webarchive-commons (no digest check)", Bench::webarchiveCommonsNoDigest, filename); 132 | 133 | //bench("jwat", Bench::jwat, filename); 134 | bench("jwat buff", Bench::jwatBuff, filename); 135 | 136 | bench("jwarc", Bench::jwarc, filename); 137 | 138 | System.out.println(""); 139 | } 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /comparison/src/EncodedWords.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | import org.archive.io.ArchiveReader; 7 | import org.archive.io.ArchiveRecord; 8 | import org.archive.io.warc.WARCReaderFactory; 9 | import org.jwat.warc.WarcReader; 10 | import org.jwat.warc.WarcReaderFactory; 11 | import org.jwat.warc.WarcRecord; 12 | 13 | import java.io.ByteArrayInputStream; 14 | import java.io.IOException; 15 | import java.nio.charset.StandardCharsets; 16 | 17 | public class 
EncodedWords { 18 | static String test = "WARC/1.0\r\n" + 19 | "Encoded: =?iso-8859-1?q?this=20is=20some=20text?=\r\n" + 20 | "Folded: a \r\n" + 21 | " b c \r\n" + 22 | "\t d \r\n" + 23 | "\r\n"; 24 | 25 | public static void main(String args[]) throws IOException { 26 | System.out.println("wa-commons"); 27 | try (ArchiveReader reader = WARCReaderFactory.get("test.warc", new ByteArrayInputStream(test.getBytes(StandardCharsets.US_ASCII)), false)) { 28 | ArchiveRecord record = reader.get(); 29 | System.out.println(record.getHeader().getHeaderValue("Folded")); 30 | System.out.println(record.getHeader().getHeaderValue("Encoded")); 31 | } 32 | 33 | System.out.println("\njwat"); 34 | try (WarcReader reader = WarcReaderFactory.getReader(new ByteArrayInputStream(test.getBytes(StandardCharsets.US_ASCII)))) { 35 | WarcRecord record = reader.getNextRecord(); 36 | System.out.println(record.getHeader("Folded").value); 37 | System.out.println(record.getHeader("Encoded").value); 38 | } 39 | 40 | System.out.println("\njwarc"); 41 | try (org.netpreserve.jwarc.WarcReader reader = new org.netpreserve.jwarc.WarcReader(new ByteArrayInputStream(test.getBytes(StandardCharsets.US_ASCII)))) { 42 | org.netpreserve.jwarc.WarcRecord record = reader.next().get(); 43 | System.out.println(record.headers().sole("Folded").get()); 44 | System.out.println(record.headers().sole("Encoded").get()); 45 | } 46 | 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /resources/org/netpreserve/jwarc/net/inject.js: -------------------------------------------------------------------------------- 1 | let reload = !navigator.serviceWorker.controller; 2 | navigator.serviceWorker.register('/__jwarc__/sw.js', {scope: '/replay/'}).then(function (registration) { 3 | registration.update(); 4 | if (reload) { 5 | window.location.reload(); 6 | } 7 | }); 8 | 9 | window.addEventListener("load", function() { 10 | var anchors = document.getElementsByTagName("a"); 11 | var 
prefix = /(\/replay\/[0-9]+\/https?:\/\/[^\/]*)\/.*/.exec(location.pathname)[1]; 12 | for (var i = 0; i < anchors.length; i++) { 13 | var a = anchors[i]; 14 | if (a.href.startsWith(location.origin + '/') && !a.href.startsWith(location.origin + '/replay/')) { 15 | var url = new URL(a.href); 16 | url.pathname = prefix + url.pathname; 17 | a.href = url.toString(); 18 | } 19 | } 20 | }); -------------------------------------------------------------------------------- /resources/org/netpreserve/jwarc/net/recorder-sw.js: -------------------------------------------------------------------------------- 1 | self.addEventListener('install', function (event) { 2 | console.log('[recorder-sw.js] Installed'); 3 | event.waitUntil(self.skipWaiting()); 4 | }); 5 | 6 | self.addEventListener('activate', function (event) { 7 | console.log('[recorder-sw.js] Activated'); 8 | event.waitUntil(self.clients.claim()); 9 | }); 10 | 11 | self.addEventListener('fetch', function (event) { 12 | const request = event.request; 13 | console.log('[recorder-sw.js] Fetch intercepted for:', request.url); 14 | 15 | let target = request.url; 16 | const url = new URL(target); 17 | if (url.origin === self.location.origin) { 18 | if (url.pathname === "/" || url.pathname.startsWith("/__jwarc__/")) { 19 | return; 20 | } 21 | const referrerLiveUrl = request.referrer.replace(/.*\/jwarcrecorder\/record\//, ""); 22 | target = new URL(url.pathname + url.search, referrerLiveUrl).toString(); 23 | } 24 | 25 | console.log("[recorder-sw.js] Fetching " + target); 26 | const newUrl = "/__jwarc__/record/" + target; 27 | const newRequest = new Request(newUrl, { 28 | method: request.method, 29 | headers: request.headers, 30 | mode: request.mode, 31 | credentials: request.credentials, 32 | redirect: request.redirect, 33 | referrer: request.referrer, 34 | body: request.body, 35 | }); 36 | event.respondWith(fetch(newRequest)); 37 | } 38 | ); -------------------------------------------------------------------------------- 
/resources/org/netpreserve/jwarc/net/recorder.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | WARC Recorder 4 | 11 | 40 | 41 | 44 | 46 | 47 | -------------------------------------------------------------------------------- /resources/org/netpreserve/jwarc/net/sw.js: -------------------------------------------------------------------------------- 1 | importScripts('https://oduwsdl.github.io/Reconstructive/reconstructive.js'); 2 | 3 | const rc = new Reconstructive({ 4 | debug: false, 5 | showBanner: true, 6 | urimPattern: `${self.location.origin}/replay//`, 7 | bannerElementLocation: 'https://oduwsdl.github.io/Reconstructive/reconstructive-banner.js', 8 | }); 9 | rc.exclusions.specialEndpint = function (event, config) { 10 | return event.request.url.startsWith(self.location.origin + '/__jwarc__/'); 11 | }; 12 | self.addEventListener('fetch', function (event) { 13 | rc.reroute(event); 14 | }); 15 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/BrotliUtils.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2024 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc; 7 | 8 | import java.io.IOException; 9 | import java.nio.channels.Channels; 10 | import java.nio.channels.ReadableByteChannel; 11 | 12 | import org.brotli.dec.BrotliInputStream; 13 | 14 | /** 15 | * Utility class to read brotli-encoded data, based on org.brotli:dec. 
16 | */ 17 | public final class BrotliUtils { 18 | 19 | public static ReadableByteChannel brotliChannel(ReadableByteChannel brotli) throws IOException { 20 | return Channels.newChannel(new BrotliInputStream(Channels.newInputStream(brotli))); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/ConcurrentRecordSet.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc; 2 | 3 | import java.net.URI; 4 | import java.util.HashSet; 5 | import java.util.Set; 6 | 7 | /** 8 | * A set for testing whether WARC records are concurrent (i.e. part of the same capture event). 9 | */ 10 | public class ConcurrentRecordSet { 11 | private final Set set = new HashSet<>(); 12 | 13 | /** 14 | * Adds a record to the set. 15 | */ 16 | public void add(WarcRecord record) { 17 | set.add(record.id()); 18 | if (record instanceof WarcCaptureRecord) { 19 | set.addAll(((WarcCaptureRecord) record).concurrentTo()); 20 | } 21 | } 22 | 23 | /** 24 | * Tests if the given record is concurrent to any previously added record. 25 | */ 26 | public boolean contains(WarcRecord record) { 27 | if (set.contains(record.id())) return true; 28 | if (record instanceof WarcCaptureRecord) { 29 | for (URI id : ((WarcCaptureRecord) record).concurrentTo()) { 30 | if (set.contains(id)) return true; 31 | } 32 | } 33 | return false; 34 | } 35 | 36 | /** 37 | * Removes all records from the set. 
38 | */ 39 | public void clear() { 40 | set.clear(); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/DecodedBody.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2024 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc; 7 | 8 | import java.io.IOException; 9 | import java.nio.ByteBuffer; 10 | import java.nio.channels.ReadableByteChannel; 11 | 12 | 13 | /** 14 | * A message body which decodes content on-the-fly using the specified encoding. 15 | */ 16 | public class DecodedBody extends MessageBody { 17 | 18 | public static enum Encoding { 19 | DEFLATE, 20 | GZIP, 21 | BROTLI 22 | } 23 | 24 | private final ReadableByteChannel channel; 25 | private final Encoding encoding; 26 | long position = 0; 27 | 28 | private DecodedBody(ReadableByteChannel channel, Encoding encoding) throws IOException { 29 | this.encoding = encoding; 30 | switch (this.encoding) { 31 | case DEFLATE: 32 | this.channel = IOUtils.inflateChannel(channel); 33 | break; 34 | case GZIP: 35 | this.channel = IOUtils.gunzipChannel(channel); 36 | break; 37 | case BROTLI: 38 | try { 39 | this.channel = BrotliUtils.brotliChannel(channel); 40 | } catch (NoClassDefFoundError e) { 41 | throw new IOException("Brotli decoder not found, please install org.brotli:dec", e); 42 | } 43 | break; 44 | default: 45 | throw new IOException("Unsupported encoding"); 46 | } 47 | } 48 | 49 | public static DecodedBody create(ReadableByteChannel channel, Encoding encoding) throws IOException { 50 | return new DecodedBody(channel, encoding); 51 | } 52 | 53 | @Override 54 | public long position() throws IOException { 55 | return position; 56 | }; 57 | 58 | @Override 59 | public int read(ByteBuffer dst) throws IOException { 60 | int n = channel.read(dst); 61 | if (n > 0) { 62 | position += n; 63 | } 64 
| return n; 65 | } 66 | 67 | @Override 68 | public boolean isOpen() { 69 | return channel.isOpen(); 70 | } 71 | 72 | @Override 73 | public void close() throws IOException { 74 | channel.close(); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/DigestingMessageBody.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2021 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc; 7 | 8 | import java.io.IOException; 9 | import java.nio.ByteBuffer; 10 | import java.security.MessageDigest; 11 | 12 | /** 13 | * Wrapper around a MessageBody which calculates a MessageDigest while the body 14 | * is read. 15 | */ 16 | class DigestingMessageBody extends MessageBody { 17 | private final MessageBody body; 18 | private final MessageDigest digest; 19 | 20 | DigestingMessageBody(MessageBody digestedBody, MessageDigest digest) { 21 | this.body = digestedBody; 22 | this.digest = digest; 23 | } 24 | 25 | @Override 26 | public int read(ByteBuffer dst) throws IOException { 27 | int i = body.read(dst); 28 | if (i > 0) { 29 | ByteBuffer tmp = dst.duplicate(); 30 | tmp.position(dst.position() - i); 31 | tmp.limit(dst.position()); 32 | digest.update(tmp); 33 | } 34 | return i; 35 | } 36 | 37 | @Override 38 | public boolean isOpen() { 39 | return body.isOpen(); 40 | } 41 | 42 | @Override 43 | public void close() throws IOException { 44 | body.close(); 45 | } 46 | 47 | @Override 48 | public long position() throws IOException { 49 | return body.position(); 50 | } 51 | 52 | public MessageDigest getDigest() { 53 | return digest; 54 | } 55 | 56 | @Override 57 | public long size() throws IOException { 58 | return body.size(); 59 | } 60 | } -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/FetchOptions.java: 
-------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2023 National Library of Australia and the jwarc contributors 4 | */ 5 | package org.netpreserve.jwarc; 6 | 7 | import java.io.OutputStream; 8 | import java.net.URI; 9 | 10 | 11 | /** 12 | * Options for fetching a remote resource. 13 | * 14 | * @see WarcWriter#fetch(URI, FetchOptions) 15 | */ 16 | @SuppressWarnings("UnusedReturnValue") 17 | public class FetchOptions { 18 | long maxLength = 0; 19 | long maxTime = 0; 20 | int readTimeout = 60000; 21 | String userAgent = "jwarc"; 22 | OutputStream copyTo; 23 | 24 | /** 25 | * Stops the fetch after this many bytes are received (including any protocol headers). If this limit was reached 26 | * the header "WARC-Truncated: length" will be added to the response record. 27 | */ 28 | public FetchOptions maxLength(long bytes) { 29 | this.maxLength = bytes; 30 | return this; 31 | } 32 | 33 | /** 34 | * Stops the fetch after this many milliseconds have elapsed. If this limit was reached the header 35 | * "WARC-Truncated: time" will be added to the response record. 36 | */ 37 | public FetchOptions maxTime(long millis) { 38 | this.maxTime = millis; 39 | return this; 40 | } 41 | 42 | 43 | /** 44 | * Sets the read timeout in milliseconds on the socket. Defaults to 60000. Set to 0 for no timeout. 45 | * 46 | * @see java.net.Socket#setSoTimeout(int) 47 | */ 48 | public FetchOptions readTimeout(int millis) { 49 | this.readTimeout = millis; 50 | return this; 51 | } 52 | 53 | /** 54 | * Sets the User-Agent request header. Default: "jwarc" 55 | *

56 | * If a custom HTTP request is provided this option will be ignored. 57 | */ 58 | public FetchOptions userAgent(String userAgent) { 59 | this.userAgent = userAgent; 60 | return this; 61 | } 62 | 63 | /** 64 | * If specified the response will also be copied to this OutputStream as well as the WARC file. 65 | */ 66 | public FetchOptions copyTo(OutputStream copyTo) { 67 | this.copyTo = copyTo; 68 | return this; 69 | } 70 | } -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/FetchResult.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2023 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc; 7 | 8 | /** 9 | * The result of a fetch operation. This contains the request and response as WARC records (without payloads) so that 10 | * the request and response headers can be inspected. 11 | */ 12 | public class FetchResult { 13 | private final WarcRequest request; 14 | private final WarcResponse response; 15 | private final Throwable exception; 16 | 17 | FetchResult(WarcRequest request, WarcResponse response, Throwable exception) { 18 | this.request = request; 19 | this.response = response; 20 | this.exception = exception; 21 | } 22 | 23 | /** 24 | * The WARC record containing the request that was sent. The request body will not be readable. 25 | */ 26 | public WarcRequest request() { 27 | return request; 28 | } 29 | 30 | /** 31 | * The WARC record containing the response that was received. The response body will not be readable. 32 | */ 33 | public WarcResponse response() { 34 | return response; 35 | } 36 | 37 | /** 38 | * If the fetch was interrupted by an exception but truncated records were still written this will return the caught 39 | * exception. This can occur if the WarcWriter was closed during the fetch. 
40 | */ 41 | public Throwable exception() { 42 | return exception; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/GeminiParser.rl: -------------------------------------------------------------------------------- 1 | // recompile: ragel -J GeminiParser.rl -o GeminiParser.java 2 | // diagram: ragel -Vp GeminiParser.rl | dot -Tpng | feh - 3 | // spec: https://gemini.circumlunar.space/docs/specification.gmi 4 | %%{ 5 | 6 | machine gemini; 7 | 8 | getkey (data.get(p) & 0xff); 9 | 10 | action push { push(data.get(p)); } 11 | action add_status { status = status * 10 + data.get(p) - '0'; } 12 | action handle_meta { meta = new String(buf, 0, bufPos, UTF_8); bufPos = 0; } 13 | action handle_url { url = new String(buf, 0, bufPos, UTF_8); bufPos = 0; } 14 | action finish { finished = true; fbreak; } 15 | 16 | CRLF = "\r\n"; 17 | 18 | # spec doesn't mention any disallowed characters 19 | # but we assume \r and \n 20 | utf8_string = (any - '\r' - '\n')*; 21 | 22 | url = utf8_string $push %handle_url; 23 | gemini_request := url CRLF @finish; 24 | 25 | meta = utf8_string $push %handle_meta; 26 | status = digit {2} $add_status; 27 | gemini_response := status " " meta CRLF @finish; 28 | 29 | }%% 30 | 31 | package org.netpreserve.jwarc; 32 | 33 | import java.io.EOFException; 34 | import java.io.IOException; 35 | import java.nio.ByteBuffer; 36 | import java.nio.channels.ReadableByteChannel; 37 | import java.nio.channels.WritableByteChannel; 38 | import java.util.*; 39 | 40 | import static java.nio.charset.StandardCharsets.UTF_8; 41 | 42 | public class GeminiParser extends MessageParser { 43 | private int initialState; 44 | private int cs; 45 | private long position; 46 | private boolean finished; 47 | private byte[] buf = new byte[256]; 48 | private int bufPos = 0; 49 | private int status; 50 | private String meta; 51 | private String url; 52 | 53 | public GeminiParser() { 54 | reset(); 55 | } 56 | 57 | 
public void reset() { 58 | %% write init; 59 | bufPos = 0; 60 | if (buf.length > 8192) { 61 | buf = new byte[256]; // if our buffer grew really big release it 62 | } 63 | status = 0; 64 | meta = null; 65 | url = null; 66 | position = 0; 67 | finished = false; 68 | cs = initialState; 69 | } 70 | 71 | public int status() { 72 | return status; 73 | } 74 | 75 | public String meta() { 76 | return meta; 77 | } 78 | 79 | public String url() { 80 | return url; 81 | } 82 | 83 | public boolean isFinished() { 84 | return finished; 85 | } 86 | 87 | public boolean isError() { 88 | return cs == gemini_error; 89 | } 90 | 91 | /** 92 | * Configures the parser to read a gemini request while rejecting deviations from the standard. 93 | */ 94 | public void strictRequest() { 95 | cs = gemini_en_gemini_request; 96 | initialState = cs; 97 | } 98 | 99 | /** 100 | * Configures the parser to read a gemini response while rejecting deviations from the standard. 101 | */ 102 | public void strictResponse() { 103 | cs = gemini_en_gemini_response; 104 | initialState = cs; 105 | } 106 | 107 | /** 108 | * Runs the parser on a buffer of data. Passing null as the buffer indicates the end of input. 
/**
 * Repeatedly runs the parser over the buffer, refilling it from the channel, until the message is finished,
 * the parser reports an error, or the channel reaches end of input.
 *
 * @param channel source that is read whenever the buffered data has been parsed without finishing
 * @param buffer  buffer in read mode; its position is advanced past the bytes the parser consumed
 * @param copyTo  if not null, the bytes consumed by each parsing pass are also written to this channel
 * @throws ParsingException if the parser enters its error state
 * @throws IOException      if reading from the channel or writing to copyTo fails
 */
void parse(ReadableByteChannel channel, ByteBuffer buffer, WritableByteChannel copyTo) throws IOException {
    while (true) {
        // the duplicate keeps the pre-parse position so the consumed range can be copied afterwards
        ByteBuffer copy = buffer.duplicate();
        // offset between buffer indexes and the parser's running byte count, used for the error context
        long buffOffset = buffer.position() - position;
        parse(buffer);
        if (copyTo != null) {
            // restrict the duplicate to the bytes consumed by this pass
            copy.limit(buffer.position());
            copyTo.write(copy);
        }
        if (isFinished()) {
            break;
        }
        if (isError()) {
            throw new ParsingException("invalid gemini message at byte position " + position + ": "
                    + getErrorContext(buffer.duplicate(), (int) (buffOffset + position), 40));
        }
        // keep any unparsed bytes, append newly read data, then switch back to read mode
        buffer.compact();
        int n = channel.read(buffer);
        buffer.flip();
        if (n < 0) {
            // a null buffer tells the parser that the input has ended
            parse(null);
            // NOTE(review): returns without checking isFinished()/isError(), so a message truncated by
            // end of input appears to be reported to the caller only via unset fields - confirm intended
            break;
        }
    }
}
SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2023 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc; 7 | 8 | import java.io.IOException; 9 | import java.nio.ByteBuffer; 10 | import java.nio.channels.ReadableByteChannel; 11 | 12 | public class GeminiRequest { 13 | private final String url; 14 | 15 | public GeminiRequest(String url) { 16 | this.url = url; 17 | } 18 | 19 | public static GeminiRequest parse(ReadableByteChannel channel, ByteBuffer buffer) throws IOException { 20 | GeminiParser parser = new GeminiParser(); 21 | parser.strictRequest(); 22 | parser.parse(channel, buffer, null); 23 | return new GeminiRequest(parser.url()); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/GeminiResponse.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2023 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc; 7 | 8 | import java.io.IOException; 9 | import java.nio.ByteBuffer; 10 | import java.nio.channels.ReadableByteChannel; 11 | import java.nio.charset.StandardCharsets; 12 | import java.util.Collections; 13 | 14 | public class GeminiResponse extends Message { 15 | private final int status; 16 | private final String meta; 17 | 18 | public GeminiResponse(int status, String meta, MessageBody body) { 19 | super(MessageVersion.GEMINI, new MessageHeaders(Collections.emptyMap()), body); 20 | this.status = status; 21 | this.meta = meta; 22 | } 23 | 24 | public static GeminiResponse parse(ReadableByteChannel channel, ByteBuffer buffer) throws IOException { 25 | GeminiParser parser = new GeminiParser(); 26 | parser.strictResponse(); 27 | parser.parse(channel, buffer, null); 28 | return new GeminiResponse(parser.status(), parser.meta(), LengthedBody.createFromContentLength(channel, buffer, 
null)); 29 | } 30 | 31 | public int status() { 32 | return status; 33 | } 34 | 35 | /** 36 | * Returns the HTTP equivalent of the status code. (e.g. 20 -> 200, 51 -> 404) 37 | */ 38 | public int statusHttpEquivalent() { 39 | switch (status) { 40 | case 20: 41 | return 200; 42 | case 31: // redirect - temporary 43 | return 307; 44 | case 32: // redirect - permanent 45 | return 308; 46 | case 40: // temporary failure 47 | return 503; 48 | case 41: // server unavailable 49 | return 503; 50 | case 42: // CGI error 51 | return 500; 52 | case 43: // proxy error 53 | return 502; 54 | case 44: // slow down 55 | return 429; 56 | case 50: // permanent failure 57 | return 500; 58 | case 51: // not found 59 | return 404; 60 | case 52: // gone 61 | return 410; 62 | case 53: // proxy request refused 63 | return 502; 64 | case 59: // bad request 65 | return 400; 66 | case 60: // client certificate required 67 | return 401; 68 | case 61: // certificate not authorized 69 | return 403; 70 | case 62: // certificate not valid 71 | return 403; 72 | default: 73 | if (status > 10 && status < 20) { // input 74 | return 100; 75 | } else if (status >= 20 && status < 30) { // success 76 | return 200; 77 | } else if (status >= 30 && status < 40) { // redirect 78 | return 307; 79 | } else if (status >= 60 && status < 70) { // client cert required 80 | return 401; 81 | } else { 82 | return 500; 83 | } 84 | } 85 | } 86 | 87 | public String meta() { 88 | return meta; 89 | } 90 | 91 | @Override 92 | public byte[] serializeHeader() { 93 | return (String.format("%02d", status) + " " + meta + "\r\n").getBytes(StandardCharsets.UTF_8); 94 | } 95 | 96 | @Override 97 | public MediaType contentType() { 98 | if (status >= 20 && status < 30) { 99 | if (meta.isEmpty()) { 100 | return MediaType.parse("text/gemini; charset=utf-8"); 101 | } 102 | return MediaType.parseLeniently(meta); 103 | } else { 104 | return MediaType.OCTET_STREAM; 105 | } 106 | } 107 | } 108 | 
-------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/HttpMessage.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc; 7 | 8 | import java.nio.charset.Charset; 9 | import java.util.List; 10 | 11 | import static java.nio.charset.StandardCharsets.ISO_8859_1; 12 | 13 | import java.io.IOException; 14 | 15 | public abstract class HttpMessage extends Message { 16 | HttpMessage(MessageVersion version, MessageHeaders headers, MessageBody body) { 17 | super(version, headers, body); 18 | } 19 | 20 | @Override 21 | Charset headerCharset() { 22 | return ISO_8859_1; 23 | } 24 | 25 | /** 26 | * The HTTP payload with Content-Encoding decoded. 27 | * 28 | * @return a message body with content decoded following the HTTP 29 | * Content-Encoding header. 
30 | * @throws IOException 31 | */ 32 | public MessageBody bodyDecoded() throws IOException { 33 | MessageBody payload = body(); 34 | List contentEncodings = headers().all("Content-Encoding"); 35 | if (contentEncodings.isEmpty()) { 36 | return payload; 37 | } else if (contentEncodings.size() > 1) { 38 | throw new IOException("Multiple Content-Encodings not supported: " + contentEncodings); 39 | } else if (contentEncodings.get(0).equalsIgnoreCase("identity") 40 | || contentEncodings.get(0).equalsIgnoreCase("none")) { 41 | return payload; 42 | } else if (contentEncodings.get(0).equalsIgnoreCase("gzip") 43 | || contentEncodings.get(0).equalsIgnoreCase("x-gzip")) { 44 | return DecodedBody.create(payload, DecodedBody.Encoding.GZIP); 45 | } else if (contentEncodings.get(0).equalsIgnoreCase("br")) { 46 | return DecodedBody.create(payload, DecodedBody.Encoding.BROTLI); 47 | } else if (contentEncodings.get(0).equalsIgnoreCase("deflate")) { 48 | return DecodedBody.create(payload, DecodedBody.Encoding.DEFLATE); 49 | } else { 50 | throw new IOException("Content-Encoding not supported: " + contentEncodings.get(0)); 51 | } 52 | 53 | } 54 | 55 | public abstract static class AbstractBuilder> extends Message.AbstractBuilder { 56 | public AbstractBuilder() { 57 | super(MessageVersion.HTTP_1_1); 58 | } 59 | 60 | @Override 61 | public B version(MessageVersion version) { 62 | version.requireProtocol("HTTP"); 63 | return super.version(version); 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/HttpRequest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc; 7 | 8 | import java.io.ByteArrayOutputStream; 9 | import java.io.EOFException; 10 | import java.io.IOException; 11 | import java.net.URI; 12 | 
import java.nio.ByteBuffer; 13 | import java.nio.channels.Channels; 14 | import java.nio.channels.ReadableByteChannel; 15 | import java.nio.channels.WritableByteChannel; 16 | 17 | public class HttpRequest extends HttpMessage { 18 | private final String method; 19 | private final String target; 20 | 21 | HttpRequest(String method, String target, MessageVersion version, MessageHeaders headers, MessageBody body) { 22 | super(version, headers, body); 23 | this.method = method; 24 | this.target = target; 25 | } 26 | 27 | public String target() { 28 | return target; 29 | } 30 | 31 | public String method() { 32 | return method; 33 | } 34 | 35 | @Override 36 | void serializeHeaderTo(Appendable output) throws IOException { 37 | output.append(method); 38 | output.append(' '); 39 | output.append(target); 40 | output.append(' '); 41 | output.append(version().toString()); 42 | output.append("\r\n"); 43 | headers().appendTo(output); 44 | output.append("\r\n"); 45 | } 46 | 47 | /** 48 | * Parses a HTTP request while leniently allowing common deviations from the standard. 49 | */ 50 | public static HttpRequest parse(ReadableByteChannel channel) throws IOException { 51 | ByteBuffer buffer = ByteBuffer.allocate(8192); 52 | buffer.flip(); 53 | return parse(channel, buffer); 54 | } 55 | 56 | /** 57 | * Parses a HTTP request while leniently allowing common deviations from the standard. 58 | */ 59 | public static HttpRequest parse(ReadableByteChannel channel, ByteBuffer buffer) throws IOException { 60 | return parse(channel, buffer, null); 61 | } 62 | 63 | /** 64 | * Parses a HTTP request while strictly rejecting deviations from the standard. 
65 | */ 66 | public static HttpRequest parseStrictly(ReadableByteChannel channel, ByteBuffer buffer) throws IOException { 67 | return parse(channel, buffer, null, true); 68 | } 69 | 70 | static HttpRequest parse(ReadableByteChannel channel, ByteBuffer buffer, WritableByteChannel copyTo) throws IOException { 71 | return parse(channel, buffer, copyTo, false); 72 | } 73 | 74 | private static HttpRequest parse(ReadableByteChannel channel, ByteBuffer buffer, WritableByteChannel copyTo, boolean strict) throws IOException { 75 | HttpParser parser = new HttpParser(); 76 | if (strict) { 77 | parser.strictRequest(); 78 | } else { 79 | parser.lenientRequest(); 80 | } 81 | ByteArrayOutputStream headerBuffer = new ByteArrayOutputStream(); 82 | parser.parse(channel, buffer, Channels.newChannel(headerBuffer)); 83 | byte[] headerBytes = headerBuffer.toByteArray(); 84 | if (headerBytes.length == 0) throw new EOFException(); 85 | if (copyTo != null) { 86 | copyTo.write(ByteBuffer.wrap(headerBytes)); 87 | copyTo.write(buffer.duplicate()); 88 | } 89 | MessageHeaders headers = parser.headers(); 90 | Long contentLength; 91 | try { 92 | contentLength = headers.first("Content-Length").map(Long::parseLong).orElse(null); 93 | } catch (NumberFormatException e) { 94 | if (strict) throw new IOException("Invalid Content-Length header", e); 95 | contentLength = null; 96 | } 97 | LengthedBody body = LengthedBody.createFromContentLength(channel, buffer, contentLength); 98 | HttpRequest request = new HttpRequest(parser.method(), parser.target(), parser.version(), headers, body); 99 | request.serializedHeader = headerBytes; 100 | return request; 101 | } 102 | 103 | public static class Builder extends AbstractBuilder { 104 | private final String method; 105 | private final String target; 106 | 107 | public Builder(String method, String target) { 108 | super(); 109 | this.method = method; 110 | this.target = target; 111 | } 112 | 113 | /** 114 | * Create a new HTTP request builder from a URI. 115 | *

/**
 * Create a new HTTP request builder from a URI.
 * <p>
 * The request target will be set to the path and query of the URI. When the URI has no path
 * (e.g. {@code http://example.org}) the target is "/" so that the request line stays valid.
 * The Host header will be set to the host and port of the URI.
 */
public Builder(String method, URI uri) {
    this(method, requestTarget(uri));
    setHeader("Host", uri.getPort() == -1 ? uri.getHost() : uri.getHost() + ":" + uri.getPort());
}

/**
 * Builds the origin-form request target (path plus optional query) for a URI.
 */
private static String requestTarget(URI uri) {
    String path = uri.getRawPath();
    if (path == null || path.isEmpty()) {
        // an empty path must be sent as "/" in origin-form
        path = "/";
    }
    String query = uri.getRawQuery();
    return query == null ? path : path + "?" + query;
}
response while leniently allowing common deviations from the standard. 40 | */ 41 | public static HttpResponse parse(ReadableByteChannel channel) throws IOException { 42 | return parse(channel, null); 43 | } 44 | 45 | /** 46 | * Parses a HTTP response while strictly rejecting deviations from the standard. 47 | */ 48 | public static HttpResponse parseStrictly(ReadableByteChannel channel) throws IOException { 49 | return parse(channel, null, true, false); 50 | } 51 | 52 | static HttpResponse parse(ReadableByteChannel channel, WritableByteChannel copyTo) throws IOException { 53 | return parse(channel, copyTo, false, false); 54 | } 55 | 56 | public static HttpResponse parseWithoutBody(ReadableByteChannel channel, WritableByteChannel copyTo) throws IOException { 57 | return parse(channel, copyTo, false, true); 58 | } 59 | 60 | private static HttpResponse parse(ReadableByteChannel channel, WritableByteChannel copyTo, boolean strict, boolean withoutBody) throws IOException { 61 | ByteArrayOutputStream headerBuffer = new ByteArrayOutputStream(); 62 | ByteBuffer buffer = ByteBuffer.allocate(8192); 63 | buffer.flip(); 64 | HttpParser parser = new HttpParser(); 65 | if (strict) { 66 | parser.strictResponse(); 67 | } else { 68 | parser.lenientResponse(); 69 | } 70 | parser.parse(channel, buffer, Channels.newChannel(headerBuffer)); 71 | byte[] headerBytes = headerBuffer.toByteArray(); 72 | if (copyTo != null) { 73 | copyTo.write(ByteBuffer.wrap(headerBytes)); 74 | copyTo.write(buffer.duplicate()); 75 | } 76 | MessageHeaders headers = parser.headers(); 77 | MessageBody body; 78 | if (withoutBody) { 79 | body = MessageBody.empty(); 80 | } else if (headers.contains("Transfer-Encoding", "chunked")) { 81 | ChunkedBody chunkedBody = new ChunkedBody(channel, buffer); 82 | if (strict) { 83 | chunkedBody.strict(); 84 | } 85 | body = chunkedBody; 86 | } else { 87 | Long contentLength; 88 | try { 89 | contentLength = headers.first("Content-Length") 90 | .map(Long::parseLong) 91 | 
/**
 * The response status reason phrase.
 */
public String reason() {
    return reason;
}
Please don't depend on it your own code. 20 | */ 21 | public final class IOUtils { 22 | 23 | /** 24 | * Transfers as many bytes as possible from src to dst. 25 | * @return the number of bytes transferred. 26 | */ 27 | static int transfer(ByteBuffer src, ByteBuffer dst) { 28 | return transferExactly(src, dst, Math.min(src.remaining(), dst.remaining())); 29 | } 30 | 31 | /** 32 | * Transfers up to limits from src to dst. 33 | * @return the number of bytes transferred. 34 | */ 35 | static int transfer(ByteBuffer src, ByteBuffer dst, long limit) { 36 | return transferExactly(src, dst, (int)Math.min(Math.min(src.remaining(), dst.remaining()), limit)); 37 | } 38 | 39 | private static int transferExactly(ByteBuffer src, ByteBuffer dst, int n) { 40 | if (src.remaining() > n) { 41 | int savedLimit = src.limit(); 42 | try { 43 | src.limit(src.position() + n); 44 | dst.put(src); 45 | return n; 46 | } finally { 47 | src.limit(savedLimit); 48 | } 49 | } 50 | dst.put(src); 51 | return n; 52 | } 53 | 54 | static int transfer(ReadableByteChannel src, ByteBuffer dst, long limit) throws IOException { 55 | if (dst.remaining() > limit) { 56 | int savedLimit = dst.limit(); 57 | try { 58 | dst.limit(dst.position() + (int) limit); 59 | int n = src.read(dst); 60 | return n; 61 | } finally { 62 | dst.limit(savedLimit); 63 | } 64 | } 65 | return src.read(dst); 66 | } 67 | 68 | static ReadableByteChannel prefixChannel(ByteBuffer prefix, ReadableByteChannel channel) { 69 | return new ReadableByteChannel() { 70 | @Override 71 | public int read(ByteBuffer byteBuffer) throws IOException { 72 | int n = 0; 73 | if (prefix.hasRemaining()) { 74 | n += IOUtils.transfer(prefix, byteBuffer); 75 | } 76 | if (byteBuffer.hasRemaining()) { 77 | n += channel.read(byteBuffer); 78 | } 79 | return n; 80 | } 81 | 82 | @Override 83 | public boolean isOpen() { 84 | return channel.isOpen(); 85 | } 86 | 87 | @Override 88 | public void close() throws IOException { 89 | channel.close(); 90 | } 91 | }; 92 | } 93 | 94 
| public static void copy(InputStream inputStream, OutputStream outputStream) throws IOException { 95 | byte[] buffer = new byte[8192]; 96 | while (true) { 97 | int n = inputStream.read(buffer); 98 | if (n < 0) break; 99 | outputStream.write(buffer, 0, n); 100 | } 101 | } 102 | 103 | public static ReadableByteChannel gunzipChannel(ReadableByteChannel gzipped) throws IOException { 104 | ByteBuffer buffer = ByteBuffer.allocate(8192); 105 | buffer.flip(); 106 | return new GunzipChannel(gzipped, buffer); 107 | } 108 | 109 | public static ReadableByteChannel inflateChannel(ReadableByteChannel deflated) throws IOException { 110 | ByteBuffer buffer = ByteBuffer.allocate(8192); 111 | buffer.flip(); 112 | return new InflateChannel(deflated, buffer); 113 | } 114 | 115 | static Socket connect(String scheme, String host, int port) throws IOException { 116 | Objects.requireNonNull(host); 117 | if ("http".equalsIgnoreCase(scheme)) { 118 | return new Socket(host, port < 0 ? 80 : port); 119 | } else if ("https".equalsIgnoreCase(scheme)) { 120 | return SSLSocketFactory.getDefault().createSocket(host, port < 0 ? 
443 : port); 121 | } else { 122 | throw new IllegalArgumentException("Unsupported URI scheme: " + scheme); 123 | } 124 | } 125 | 126 | public static byte[] readNBytes(InputStream stream, int n) throws IOException { 127 | byte[] buffer = new byte[n]; 128 | for (int remaining = n; remaining > 0;) { 129 | int read = stream.read(buffer, buffer.length - remaining, remaining); 130 | if (read < 0) { 131 | return Arrays.copyOf(buffer, buffer.length - remaining); 132 | } 133 | remaining -= read; 134 | } 135 | return buffer; 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/InflateChannel.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2020 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc; 7 | 8 | import java.io.IOException; 9 | import java.nio.ByteBuffer; 10 | import java.nio.channels.ReadableByteChannel; 11 | import java.util.zip.DataFormatException; 12 | import java.util.zip.Inflater; 13 | import java.util.zip.ZipException; 14 | 15 | /** 16 | * A ReadableByteChannel inflating deflate-compressed content on read. Used to 17 | * uncompress HTTP payload with header Content-Encoding: deflate. 
/**
 * A ReadableByteChannel inflating deflate-compressed content on read. Used to
 * uncompress HTTP payload with header Content-Encoding: deflate.
 * <p>
 * The inflater is created in "nowrap" mode, i.e. it expects raw deflate data without zlib header.
 */
public class InflateChannel implements ReadableByteChannel {

    private final ReadableByteChannel channel;
    private final ByteBuffer buffer;
    private final Inflater inflater = new Inflater(true);

    /**
     * @param channel source of the compressed data
     * @param buffer  array-backed buffer in read mode, used to hold compressed input; any data remaining
     *                in it is consumed before the channel is read
     * @throws IllegalArgumentException if the buffer is not backed by an accessible array
     */
    public InflateChannel(ReadableByteChannel channel, ByteBuffer buffer) throws IllegalArgumentException {
        if (!buffer.hasArray()) {
            throw new IllegalArgumentException("ByteBuffer must be array-backed and writable");
        }
        this.channel = channel;
        this.buffer = buffer;
    }

    /**
     * Inflates data into {@code dest}.
     *
     * @return the number of bytes written to dest (possibly 0), or -1 once the compressed stream has
     *         finished or the underlying channel ended before more input could be supplied
     * @throws ZipException if the compressed data is malformed
     */
    @Override
    public int read(ByteBuffer dest) throws IOException {
        if (inflater.finished()) {
            return -1;
        }

        if (inflater.needsInput()) {
            if (!buffer.hasRemaining()) {
                buffer.compact();
                int n = channel.read(buffer);
                buffer.flip();
                if (n < 0 && !buffer.hasRemaining()) {
                    // truncated input: report end of stream instead of returning 0 forever
                    return -1;
                }
            }
            inflater.setInput(buffer.array(), buffer.arrayOffset() + buffer.position(), buffer.remaining());
        }

        try {
            int n;
            if (dest.hasArray()) {
                n = inflater.inflate(dest.array(), dest.arrayOffset() + dest.position(), dest.remaining());
                dest.position(dest.position() + n);
            } else {
                // direct buffers expose no array, so inflate via a temporary one
                byte[] temp = new byte[Math.min(dest.remaining(), 8192)];
                n = inflater.inflate(temp, 0, temp.length);
                dest.put(temp, 0, n);
            }

            // advance past the input the inflater has consumed so far
            int newBufferPosition = buffer.limit() - inflater.getRemaining();
            buffer.position(newBufferPosition);

            return n;
        } catch (DataFormatException e) {
            ZipException zipException = new ZipException(e.getMessage());
            zipException.initCause(e);
            throw zipException;
        }
    }

    @Override
    public boolean isOpen() {
        return channel.isOpen();
    }

    @Override
    public void close() throws IOException {
        channel.close();
    }
}
abstract class MessageBody extends MessageParser implements ReadableByteChannel { 11 | 12 | MessageBody() { 13 | } 14 | 15 | public static MessageBody empty() { 16 | return LengthedBody.create(Channels.newChannel(new ByteArrayInputStream(new byte[0])), 17 | ByteBuffer.allocate(0), 0); 18 | } 19 | 20 | /** 21 | * Returns the length of the body. This may be less than the Content-Length header if the record was truncated. 22 | * Returns -1 if the length cannot be determined (such as when chunked encoding is used). 23 | */ 24 | public long size() throws IOException { 25 | return -1; 26 | } 27 | 28 | public abstract long position() throws IOException; 29 | 30 | public InputStream stream() throws IOException { 31 | return Channels.newInputStream(this); 32 | } 33 | 34 | public void consume() throws IOException { 35 | ByteBuffer buffer = ByteBuffer.allocate(8192); 36 | while (read(buffer) >= 0) { 37 | buffer.clear(); 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/MessageHeaders.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc; 7 | 8 | import java.io.IOException; 9 | import java.nio.ByteBuffer; 10 | import java.nio.channels.ReadableByteChannel; 11 | import java.nio.charset.StandardCharsets; 12 | import java.util.*; 13 | import java.util.regex.Pattern; 14 | 15 | import static java.util.Collections.emptyList; 16 | 17 | public class MessageHeaders { 18 | private static Pattern COMMA_SEPARATOR = Pattern.compile("[ \t]*,[ \t]*"); 19 | private Map> map; 20 | 21 | public static MessageHeaders of(String... 
keysAndValues) { 22 | if (keysAndValues.length % 2 != 0) { 23 | throw new IllegalArgumentException("an even number keysAndValues must be provided"); 24 | } 25 | Map> map = new TreeMap<>(String.CASE_INSENSITIVE_ORDER); 26 | for (int i = 0; i < keysAndValues.length; i += 2) { 27 | map.computeIfAbsent(keysAndValues[i], k -> new ArrayList<>()).add(keysAndValues[i + 1]); 28 | } 29 | return new MessageHeaders(map); 30 | } 31 | 32 | MessageHeaders(Map> map) { 33 | map.replaceAll((name, values) -> Collections.unmodifiableList(values)); 34 | this.map = Collections.unmodifiableMap(map); 35 | } 36 | 37 | /** 38 | * Returns the value of a single-valued header field. Throws an exception if there are more than one. 39 | */ 40 | public Optional sole(String name) { 41 | List values = all(name); 42 | if (values.size() > 1) { 43 | throw new IllegalArgumentException("record has " + values.size() + " " + name + " headers"); 44 | } 45 | return values.stream().findFirst(); 46 | } 47 | 48 | /** 49 | * Returns the first value of a header field. 50 | */ 51 | public Optional first(String name) { 52 | return all(name).stream().findFirst(); 53 | } 54 | 55 | /** 56 | * Returns all the values of a header field. 57 | */ 58 | public List all(String name) { 59 | return map.getOrDefault(name, emptyList()); 60 | } 61 | 62 | /** 63 | * Returns a map of header fields to their values. 64 | */ 65 | public Map> map() { 66 | return map; 67 | } 68 | 69 | /** 70 | * Returns true when the given header value is present. 71 | * 72 | * Fields are interpreted as a comma-separated list and the value is compared case-insensitively. 
73 | */ 74 | public boolean contains(String name, String value) { 75 | for (String rawValue : all(name)) { 76 | for (String splitValue : COMMA_SEPARATOR.split(rawValue)) { 77 | if (splitValue.equalsIgnoreCase(value)) { 78 | return true; 79 | } 80 | } 81 | } 82 | return false; 83 | } 84 | 85 | @Override 86 | public String toString() { 87 | return map.toString(); 88 | } 89 | 90 | /** 91 | * Parses application/warc-fields. 92 | */ 93 | public static MessageHeaders parse(ReadableByteChannel channel) throws IOException { 94 | WarcParser parser = WarcParser.newWarcFieldsParser(); 95 | ByteBuffer buffer = ByteBuffer.allocate(8192); 96 | while (!parser.isFinished()) { 97 | int n = channel.read(buffer); 98 | if (n < 0) { 99 | parser.parse(ByteBuffer.wrap("\r\n\r\n".getBytes(StandardCharsets.US_ASCII))); 100 | break; 101 | } 102 | buffer.flip(); 103 | parser.parse(buffer); 104 | if (parser.isError()) throw new ParsingException("invalid WARC fields"); 105 | buffer.compact(); 106 | } 107 | return parser.headers(); 108 | } 109 | 110 | private static final boolean[] ILLEGAL = initIllegalLookup(); 111 | private static boolean[] initIllegalLookup() { 112 | boolean[] illegal = new boolean[256]; 113 | String separators = "()<>@,;:\\\"/[]?={} \t"; 114 | for (int i = 0; i < separators.length(); i++) { 115 | illegal[separators.charAt(i)] = true; 116 | } 117 | for (int i = 0; i < 32; i++) { // control characters 118 | illegal[i] = true; 119 | } 120 | return illegal; 121 | } 122 | 123 | static String format(Map> map) { 124 | StringBuilder out = new StringBuilder(); 125 | for (Map.Entry> entry : map.entrySet()) { 126 | String name = entry.getKey(); 127 | for (String value : entry.getValue()) { 128 | out.append(name).append(": ").append(value).append("\r\n"); 129 | } 130 | } 131 | return out.toString(); 132 | } 133 | 134 | public void appendTo(Appendable appendable) throws IOException { 135 | for (Map.Entry> entry : map.entrySet()) { 136 | String name = entry.getKey(); 137 | for (String 
value : entry.getValue()) { 138 | appendable.append(name).append(": ").append(value).append("\r\n"); 139 | } 140 | } 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/MessageParser.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2020 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc; 7 | 8 | import java.nio.ByteBuffer; 9 | import java.util.function.Consumer; 10 | 11 | public class MessageParser { 12 | private Consumer warningHandler; 13 | 14 | protected void emitWarning(String message) { 15 | if (warningHandler != null) { 16 | warningHandler.accept(message); 17 | } 18 | } 19 | 20 | void onWarning(Consumer warningHandler) { 21 | this.warningHandler = warningHandler; 22 | } 23 | 24 | protected static String getErrorContext(String input, int position, int length) { 25 | StringBuilder context = new StringBuilder(); 26 | 27 | int start = position - length; 28 | if (start < 0) { 29 | start = 0; 30 | } else { 31 | context.append("..."); 32 | } 33 | int end = Math.min(input.length(), (position + length)); 34 | 35 | context.append(input.substring(start, position)); 36 | context.append("<-- HERE -->"); 37 | context.append(input.substring(position, end)); 38 | 39 | if (end < input.length()) { 40 | context.append("..."); 41 | } 42 | 43 | return context.toString(); 44 | } 45 | 46 | protected static String getErrorContext(ByteBuffer buffer, int position, int length) { 47 | StringBuilder context = new StringBuilder(); 48 | 49 | int start = position - length; 50 | if (start < 0) { 51 | start = 0; 52 | } else { 53 | context.append("..."); 54 | } 55 | 56 | ByteBuffer copy = buffer.duplicate(); 57 | copy.position(start); 58 | 59 | int end = position + length; 60 | if (end < buffer.limit()) { 61 | copy.limit(end); 62 | } 63 | 64 | while (true) { 65 
| if (copy.position() == position) { 66 | context.append("<-- HERE -->"); 67 | } 68 | if (!copy.hasRemaining()) break; 69 | int c = copy.get() & 0xff; // mask to 0..255 so bytes >= 0x80 are not sign-extended into 8-digit hex escapes 70 | if (c < 0x7f && c >= 0x20) { 71 | context.append((char) c); 72 | } else if (c == 0x09) { 73 | context.append("\\t"); 74 | } else if (c == 0x0a) { 75 | context.append("\\n"); 76 | } else if (c == 0x0d) { 77 | context.append("\\r"); 78 | } else { 79 | context.append(String.format("\\x%02x", c)); 80 | } 81 | } 82 | 83 | if (copy.position() < buffer.limit()) { 84 | context.append("..."); 85 | } 86 | 87 | return context.toString(); 88 | } 89 | 90 | } 91 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/MessageVersion.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc; 7 | 8 | import java.util.Objects; 9 | 10 | public final class MessageVersion { 11 | public static final MessageVersion GEMINI = new MessageVersion("gemini"); 12 | public static final MessageVersion HTTP_1_0 = new MessageVersion("HTTP", 1, 0); 13 | public static final MessageVersion HTTP_1_1 = new MessageVersion("HTTP", 1, 1); 14 | public static final MessageVersion WARC_1_0 = new MessageVersion("WARC", 1, 0); 15 | public static final MessageVersion WARC_1_1 = new MessageVersion("WARC", 1, 1); 16 | public static final MessageVersion ARC_1_1 = new MessageVersion("ARC", 1, 1); 17 | 18 | private final String protocol; 19 | private final int major; 20 | private final int minor; 21 | 22 | public MessageVersion(String protocol) { 23 | this.protocol = protocol; 24 | major = 0; 25 | minor = 0; 26 | } 27 | 28 | public MessageVersion(String protocol, int major, int minor) { 29 | this.protocol = protocol; 30 | this.major = major; 31 | this.minor = minor; 32 | } 33 | 34 | public String getProtocol() 
{ 35 | return protocol; 36 | } 37 | 38 | void requireProtocol(String expectedProtocol) { 39 | if (!protocol.equals(expectedProtocol)) { 40 | throw new IllegalArgumentException("Expected a version of " + expectedProtocol + " but got " + this); 41 | } 42 | } 43 | 44 | public int getMajor() { 45 | return major; 46 | } 47 | 48 | public int getMinor() { 49 | return minor; 50 | } 51 | 52 | @Override 53 | public boolean equals(Object o) { 54 | if (this == o) return true; 55 | if (o == null || getClass() != o.getClass()) return false; 56 | MessageVersion that = (MessageVersion) o; 57 | return major == that.major && 58 | minor == that.minor && 59 | Objects.equals(protocol, that.protocol); 60 | } 61 | 62 | @Override 63 | public int hashCode() { 64 | return Objects.hash(protocol, major, minor); 65 | } 66 | 67 | @Override 68 | public String toString() { 69 | if (major == 0 && minor == 0) return protocol; 70 | return protocol + "/" + major + "." + minor; 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/ParsingException.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc; 7 | 8 | import java.io.IOException; 9 | 10 | public class ParsingException extends IOException { 11 | public ParsingException(String message) { 12 | super(message); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/WarcCaptureRecord.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc; 7 | 8 | import java.io.IOException; 9 | import 
java.net.Inet6Address; 10 | import java.net.InetAddress; 11 | import java.net.URI; 12 | import java.nio.ByteBuffer; 13 | import java.nio.channels.ReadableByteChannel; 14 | import java.util.List; 15 | import java.util.Optional; 16 | 17 | import static java.util.stream.Collectors.toList; 18 | 19 | /** 20 | * A type of WARC record created as part of a web capture event. 21 | */ 22 | public abstract class WarcCaptureRecord extends WarcTargetRecord { 23 | WarcCaptureRecord(MessageVersion version, MessageHeaders headers, MessageBody body) { 24 | super(version, headers, body); 25 | } 26 | 27 | /** 28 | * The IP address of the server involved in the capture event this record belongs to. 29 | */ 30 | public Optional ipAddress() { 31 | return headers().sole("WARC-IP-Address").map(InetAddresses::forString); 32 | } 33 | 34 | /** 35 | * The IDs of other records created during the same capture event as this one. 36 | */ 37 | public List concurrentTo() { 38 | return headers().all("WARC-Concurrent-To").stream().map(WarcRecord::parseRecordID).collect(toList()); 39 | } 40 | 41 | /** 42 | * Content-Type of the payload. 
43 | */ 44 | public MediaType payloadType() throws IOException { 45 | return contentType(); 46 | } 47 | 48 | public abstract static class AbstractBuilder> extends WarcTargetRecord.Builder { 49 | protected AbstractBuilder(String type) { 50 | super(type); 51 | } 52 | 53 | public B body(MediaType type, Message message) throws IOException { 54 | ByteBuffer header = ByteBuffer.wrap(message.serializeHeader()); 55 | ReadableByteChannel channel = IOUtils.prefixChannel(header, message.body()); 56 | return body(type, channel, message.body().size() + header.remaining()); 57 | } 58 | 59 | public B concurrentTo(URI recordId) { 60 | return addHeader("WARC-Concurrent-To", "<" + recordId.toString() + ">"); 61 | } 62 | 63 | public B ipAddress(InetAddress ipAddress) { 64 | return addHeader("WARC-IP-Address", InetAddresses.toAddrString(ipAddress)); 65 | } 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/WarcCompression.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc; 7 | 8 | import java.nio.file.Path; 9 | 10 | public enum WarcCompression { 11 | NONE, GZIP; 12 | 13 | static WarcCompression forPath(Path path) { 14 | if (path.getFileName().toString().endsWith(".gz")) { 15 | return GZIP; 16 | } else { 17 | return NONE; 18 | } 19 | } 20 | } -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/WarcContinuation.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc; 7 | 8 | import java.net.URI; 9 | import java.util.Optional; 10 | 11 | public 
class WarcContinuation extends WarcTargetRecord { 12 | WarcContinuation(MessageVersion version, MessageHeaders headers, MessageBody body) { 13 | super(version, headers, body); 14 | } 15 | 16 | /** 17 | * The id of the first record in the series of segments this record is part of. 18 | */ 19 | public URI segmentOriginId() { 20 | return headers().sole("WARC-Segment-Origin-ID").map(WarcRecord::parseRecordID).get(); 21 | } 22 | 23 | /** 24 | * The total length of the content blocks of all segments added together. 25 | */ 26 | public Optional segmentTotalLength() { 27 | return headers().sole("WARC-Segment-Total-Length").map(Long::valueOf); 28 | } 29 | 30 | public static class Builder extends WarcTargetRecord.Builder { 31 | public Builder() { 32 | super("continuation"); 33 | } 34 | 35 | public Builder segmentOriginId(URI recordId) { 36 | return setHeader("WARC-Segment-Origin-ID", WarcRecord.formatId(recordId)); // same spelling the accessor reads 37 | } 38 | 39 | public Builder segmentTotalLength(long segmentTotalLength) { 40 | return setHeader("WARC-Segment-Total-Length", Long.toString(segmentTotalLength)); 41 | } 42 | 43 | @Override 44 | public WarcContinuation build() { 45 | return build(WarcContinuation::new); 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/WarcConversion.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc; 7 | 8 | import java.io.IOException; 9 | import java.net.URI; 10 | import java.util.Optional; 11 | 12 | public class WarcConversion extends WarcTargetRecord { 13 | WarcConversion(MessageVersion version, MessageHeaders headers, MessageBody body) { 14 | super(version, headers, body); 15 | } 16 | 17 | public Optional payload() throws IOException { 18 | return Optional.of(new WarcPayload(body()) 
{ 19 | @Override 20 | public MediaType type() { 21 | return contentType(); 22 | } 23 | 24 | @Override 25 | Optional identifiedType() { 26 | return Optional.empty(); 27 | } 28 | 29 | @Override 30 | public Optional digest() { 31 | Optional payloadDigest = payloadDigest(); 32 | return payloadDigest.isPresent() ? payloadDigest : blockDigest(); 33 | } 34 | }); 35 | } 36 | 37 | /** 38 | * The record id of the source of the conversion. 39 | */ 40 | public Optional refersTo() { 41 | return headers().sole("WARC-Refers-To").map(WarcRecord::parseRecordID); 42 | } 43 | 44 | public static class Builder extends WarcTargetRecord.Builder { 45 | public Builder() { 46 | super("conversion"); 47 | } 48 | 49 | public Builder refersTo(URI recordId) { 50 | return addHeader("WARC-Refers-To", WarcRecord.formatId(recordId)); 51 | } 52 | 53 | @Override 54 | public WarcConversion build() { 55 | return build(WarcConversion::new); 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/WarcFilter.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc; 2 | 3 | import java.util.function.Predicate; 4 | 5 | /** 6 | * Filter expressions for matching WARC records. 7 | *

8 | * Simplified grammar for the expression language: 9 | *

10 |  * {@code
11 |  * expression = "(" expression ")"         ; grouping
12 |  *            | "!(" expression ")"        ; boolean NOT
13 |  *            | expression "&&" expression ; boolean AND
14 |  *            | expression "||" expression ; boolean OR
15 |  *            | field "==" string          ; string equality
16 |  *            | field "!=" string          ; string inequality
17 |  *            | field "=~" string          ; regex match
18 |  *            | field "!~" string          ; regex non-match
19 |  *            | field "==" number          ; integer equality
20 |  *            | field "!=" number          ; integer inequality
21 |  *            | field "<"  number          ; integer less-than
22 |  *            | field "<=" number          ; integer less-than-or-equal
23 |  *            | field ">"  number          ; integer greater-than
24 |  *            | field ">=" number          ; integer greater-than-or-equal
25 |  *
26 |  * field = ":status"          ; HTTP response code pseudo-field
27 |  *       | "http:" field-name ; HTTP header field
28 |  *       | field-name         ; WARC header field
29 |  *
30 |  * string = '"' [^"]* '"'
31 |  * }
32 |  * 
33 | * Whitespace outside a string or field is ignored. Fields that do not exist are treated as an empty string when subject 34 | * to string comparison. Fields that do not contain a valid number are treated as zero when subject to integer 35 | * comparison. 36 | */ 37 | public class WarcFilter implements Predicate { 38 | private final String expression; 39 | private final Predicate predicate; 40 | 41 | private WarcFilter(String expression, Predicate predicate) { 42 | this.expression = expression; 43 | this.predicate = predicate; 44 | } 45 | 46 | /** 47 | * Compiles a filter expression from a string. 48 | * 49 | * @throws WarcFilterException when the expression contains a syntax error 50 | */ 51 | public static WarcFilter compile(String expression) { 52 | return new WarcFilter(expression, new WarcFilterCompiler(expression).predicate()); 53 | } 54 | 55 | @Override 56 | public boolean test(WarcRecord warcRecord) { 57 | return predicate.test(warcRecord); 58 | } 59 | 60 | @Override 61 | public String toString() { 62 | return expression; 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/WarcFilterException.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc; 2 | 3 | import java.util.Arrays; 4 | 5 | /** 6 | * Thrown when a syntax error is encountered when compiling a filter expression. 7 | */ 8 | public class WarcFilterException extends RuntimeException { 9 | private final String input; 10 | private final int position; 11 | 12 | public WarcFilterException(String message, int position, String input) { 13 | super(message); 14 | this.position = position; 15 | this.input = input; 16 | } 17 | 18 | /** 19 | * Returns the character position of the error within the input. 20 | */ 21 | public int position() { 22 | return position; 23 | } 24 | 25 | /** 26 | * Returns the expression containing the error. 
27 | */ 28 | public String input() { 29 | return input; 30 | } 31 | 32 | /** 33 | * Returns a user-friendly error message. 34 | */ 35 | public String prettyPrint() { 36 | char[] indent = new char[position]; 37 | Arrays.fill(indent, ' '); 38 | return input + "\n" + new String(indent) + "^\nError: " + getMessage(); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/WarcFilterLexer.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc; 2 | 3 | import java.util.regex.Matcher; 4 | import java.util.regex.Pattern; 5 | 6 | class WarcFilterLexer { 7 | private static Pattern REGEX = Pattern.compile("([a-zA-Z0-9:_-]+)|(&&|[|][|]|!=|==|!~|=~|[<>]=?|!?[(]|[)])|\"([^\"]*)\"|(\\s+)"); 8 | private static final int TOKEN = 1, OPERATOR = 2, STRING = 3, WHITESPACE = 4; 9 | 10 | private final String input; 11 | private final Matcher matcher; 12 | 13 | WarcFilterLexer(String input) { 14 | this.input = input; 15 | this.matcher = REGEX.matcher(input); 16 | } 17 | 18 | Object stringOrNumber() { 19 | Object value = peek().group(STRING); 20 | if (value == null) { 21 | String token = matcher.group(TOKEN); 22 | if (token != null) { 23 | try { 24 | value = Long.parseLong(matcher.group(TOKEN)); 25 | } catch (NumberFormatException e) { 26 | // not a number 27 | } 28 | } 29 | } 30 | if (value == null) throw error("expected string or integer"); 31 | advance(); 32 | return value; 33 | } 34 | 35 | String string() { 36 | String str = peek().group(STRING); 37 | if (str == null) throw error("expected string"); 38 | advance(); 39 | return str; 40 | } 41 | 42 | String token() { 43 | String field = peek().group(TOKEN); 44 | if (field == null) throw error("expected field name"); 45 | advance(); 46 | return field; 47 | } 48 | 49 | String operator() { 50 | String operator = peekOperator(); 51 | if (operator == null) throw error("expected operator"); 52 | advance(); 53 
| return operator; 54 | } 55 | 56 | String peekOperator() { 57 | return peek().group(OPERATOR); 58 | } 59 | 60 | private Matcher peek() { 61 | while (true) { 62 | if (atEnd()) throw error("unexpected end of input"); 63 | if (!matcher.lookingAt()) throw error("syntax error"); 64 | if (matcher.group(WHITESPACE) == null) return matcher; 65 | advance(); 66 | } 67 | } 68 | 69 | void advance() { 70 | matcher.region(matcher.end(), matcher.regionEnd()); 71 | } 72 | 73 | WarcFilterException error(String message) { 74 | return new WarcFilterException(message, matcher.regionStart(), input); 75 | } 76 | 77 | boolean atEnd() { 78 | return matcher.regionStart() == matcher.regionEnd(); 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/WarcMetadata.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc; 7 | 8 | import java.io.IOException; 9 | import java.net.URI; 10 | import java.util.List; 11 | import java.util.Map; 12 | import java.util.Optional; 13 | 14 | import static java.nio.charset.StandardCharsets.UTF_8; 15 | 16 | public class WarcMetadata extends WarcCaptureRecord { 17 | private MessageHeaders fields; 18 | 19 | WarcMetadata(MessageVersion version, MessageHeaders headers, MessageBody body) { 20 | super(version, headers, body); 21 | } 22 | 23 | /** 24 | * Metadata records do not have a payload so this method always returns empty. 25 | */ 26 | @Override 27 | public Optional payload() throws IOException { 28 | return Optional.empty(); 29 | } 30 | 31 | /** 32 | * Parses the body as application/warc-fields. 33 | *

34 | * This is a convenience method for Headers.parse(metadata.body()). 35 | */ 36 | public MessageHeaders fields() throws IOException { 37 | if (fields == null) { 38 | fields = MessageHeaders.parse(body()); 39 | } 40 | return fields; 41 | } 42 | 43 | public static class Builder extends AbstractBuilder { 44 | public Builder() { 45 | super("metadata"); 46 | } 47 | 48 | @Override 49 | public WarcMetadata build() { 50 | return build(WarcMetadata::new); 51 | } 52 | 53 | public Builder fields(Map> map) { 54 | return body(MediaType.WARC_FIELDS, MessageHeaders.format(map).getBytes(UTF_8)); 55 | } 56 | 57 | public Builder targetURI(String uri) { 58 | addHeader("WARC-Target-URI", uri); 59 | return this; 60 | } 61 | 62 | public Builder targetURI(URI uri) { 63 | return targetURI(uri.toString()); 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/WarcPayload.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc; 7 | 8 | import java.util.Optional; 9 | 10 | public abstract class WarcPayload { 11 | private final MessageBody body; 12 | 13 | WarcPayload(MessageBody body) { 14 | this.body = body; 15 | } 16 | 17 | public MessageBody body() { 18 | return body; 19 | } 20 | 21 | public abstract MediaType type(); 22 | 23 | abstract Optional identifiedType(); 24 | 25 | public abstract Optional digest(); 26 | } 27 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/WarcRequest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc; 7 | 8 | 
import java.io.ByteArrayOutputStream; 9 | import java.io.IOException; 10 | import java.net.URI; 11 | import java.nio.ByteBuffer; 12 | import java.nio.channels.Channels; 13 | import java.util.Optional; 14 | 15 | public class WarcRequest extends WarcCaptureRecord { 16 | 17 | private HttpRequest http; 18 | 19 | WarcRequest(MessageVersion version, MessageHeaders headers, MessageBody body) { 20 | super(version, headers, body); 21 | } 22 | 23 | /** 24 | * Parses the content body of this record as HTTP request. 25 | *

26 | * This is a convenience method for HttpRequest.parse(request.body()). 27 | */ 28 | public HttpRequest http() throws IOException { 29 | if (http == null) { 30 | ByteBuffer buffer = ByteBuffer.allocate(8192); 31 | buffer.flip(); 32 | MessageBody body = body(); 33 | if (body.position() != 0) throw new IllegalStateException("http() cannot be called after reading from body"); 34 | if (body instanceof LengthedBody) { 35 | // if we can, save a copy of the raw header and push it back so we don't invalidate body 36 | ByteArrayOutputStream baos = new ByteArrayOutputStream(); 37 | LengthedBody lengthed = (LengthedBody) body; 38 | http = HttpRequest.parse(lengthed.discardPushbackOnRead(), buffer, Channels.newChannel(baos)); 39 | lengthed.pushback(baos.toByteArray()); 40 | } else { 41 | http = HttpRequest.parse(body, buffer); 42 | } 43 | } 44 | return http; 45 | } 46 | 47 | @Override 48 | public MediaType payloadType() throws IOException { 49 | return http().contentType(); 50 | } 51 | 52 | public static class Builder extends AbstractBuilder { 53 | public Builder(URI targetURI) { 54 | this(targetURI.toString()); 55 | } 56 | 57 | public Builder(String targetURI) { 58 | super("request"); 59 | setHeader("WARC-Target-URI", targetURI); 60 | } 61 | 62 | @Override 63 | public WarcRequest build() { 64 | return build(WarcRequest::new); 65 | } 66 | 67 | public Builder body(HttpRequest httpRequest) throws IOException { 68 | return body(MediaType.HTTP_REQUEST, httpRequest); 69 | } 70 | } 71 | 72 | public Optional payload() throws IOException { 73 | if (contentType().base().equals(MediaType.HTTP)) { 74 | return Optional.of(new WarcPayload(http().body()) { 75 | 76 | @Override 77 | public MediaType type() { 78 | return http.contentType(); 79 | } 80 | 81 | @Override 82 | Optional identifiedType() { 83 | return identifiedPayloadType(); 84 | } 85 | 86 | @Override 87 | public Optional digest() { 88 | return payloadDigest(); 89 | } 90 | }); 91 | } 92 | return Optional.empty(); 93 | } 94 | } 95 
| -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/WarcResource.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc; 7 | 8 | import java.net.URI; 9 | 10 | public class WarcResource extends WarcCaptureRecord { 11 | WarcResource(MessageVersion version, MessageHeaders headers, MessageBody body) { 12 | super(version, headers, body); 13 | } 14 | 15 | public static class Builder extends AbstractBuilder { 16 | public Builder(URI targetURI) { 17 | super("resource"); 18 | setHeader("WARC-Target-URI", targetURI.toString()); 19 | } 20 | 21 | public Builder() { 22 | super("resource"); 23 | } 24 | 25 | @Override 26 | public WarcResource build() { 27 | return build(WarcResource::new); 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/WarcResponse.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc; 7 | 8 | import java.io.ByteArrayOutputStream; 9 | import java.io.IOException; 10 | import java.net.URI; 11 | import java.nio.ByteBuffer; 12 | import java.nio.channels.Channels; 13 | import java.util.Optional; 14 | 15 | public class WarcResponse extends WarcCaptureRecord { 16 | 17 | private HttpResponse http; 18 | private GeminiResponse gemini; 19 | 20 | WarcResponse(MessageVersion version, MessageHeaders headers, MessageBody body) { 21 | super(version, headers, body); 22 | } 23 | 24 | /** 25 | * Parses the HTTP response captured by this record. 26 | *

27 | * This is a convenience method for HttpResponse.parse(response.body().channel()). 28 | */ 29 | public HttpResponse http() throws IOException { 30 | if (http == null) { 31 | MessageBody body = body(); 32 | if (body.position() != 0) throw new IllegalStateException("http() cannot be called after reading from body"); 33 | if (body instanceof LengthedBody) { 34 | // if we can, save a copy of the raw header and push it back so we don't invalidate body 35 | ByteArrayOutputStream baos = new ByteArrayOutputStream(); 36 | LengthedBody lengthed = (LengthedBody) body; 37 | http = HttpResponse.parse(lengthed.discardPushbackOnRead(), Channels.newChannel(baos)); 38 | lengthed.pushback(baos.toByteArray()); 39 | } else { 40 | http = HttpResponse.parse(body); 41 | } 42 | } 43 | return http; 44 | } 45 | 46 | public GeminiResponse gemini() throws IOException { 47 | if (gemini == null) { 48 | MessageBody body = body(); 49 | if (body.position() != 0) throw new IllegalStateException("gemini() cannot be called after reading from body"); 50 | ByteBuffer buffer = ByteBuffer.allocate(8192); 51 | buffer.flip(); 52 | gemini = GeminiResponse.parse(body, buffer); 53 | if (body instanceof LengthedBody) { 54 | ((LengthedBody)body).pushback(gemini.serializeHeader()); 55 | } 56 | } 57 | return gemini; 58 | } 59 | 60 | @Override 61 | public MediaType payloadType() throws IOException { 62 | return payload().map(WarcPayload::type).orElse(MediaType.OCTET_STREAM); 63 | } 64 | 65 | public Optional payload() throws IOException { 66 | if (contentType().base().equals(MediaType.HTTP)) { 67 | return Optional.of(new WarcPayload(http().body()) { 68 | 69 | @Override 70 | public MediaType type() { 71 | return http.contentType(); 72 | } 73 | 74 | @Override 75 | Optional identifiedType() { 76 | return identifiedPayloadType(); 77 | } 78 | 79 | @Override 80 | public Optional digest() { 81 | return payloadDigest(); 82 | } 83 | }); 84 | } else if (contentType().base().equals(MediaType.GEMINI)) { 85 | return 
Optional.of(new WarcPayload(gemini().body()) { 86 | 87 | @Override 88 | public MediaType type() { 89 | return gemini.contentType(); 90 | } 91 | 92 | @Override 93 | Optional identifiedType() { 94 | return identifiedPayloadType(); 95 | } 96 | 97 | @Override 98 | public Optional digest() { 99 | return payloadDigest(); 100 | } 101 | }); 102 | } 103 | return Optional.empty(); 104 | } 105 | 106 | public static class Builder extends AbstractBuilder { 107 | public Builder(URI targetURI) { 108 | this(targetURI.toString()); 109 | } 110 | 111 | public Builder(String targetURI) { 112 | super("response"); 113 | setHeader("WARC-Target-URI", targetURI); 114 | } 115 | 116 | public Builder body(HttpResponse httpResponse) throws IOException { 117 | return body(MediaType.HTTP_RESPONSE, httpResponse); 118 | } 119 | 120 | @Override 121 | public WarcResponse build() { 122 | return build(WarcResponse::new); 123 | } 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/WarcTargetRecord.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc; 7 | 8 | import java.io.IOException; 9 | import java.net.URI; 10 | import java.util.Optional; 11 | 12 | /** 13 | * A WARC record associated with some target URI. 14 | *

15 | * This class exists solely to differentiate between the {@link Warcinfo} record type and all the other standard 16 | * record types. 17 | */ 18 | public abstract class WarcTargetRecord extends WarcRecord { 19 | WarcTargetRecord(MessageVersion version, MessageHeaders headers, MessageBody body) { 20 | super(version, headers, body); 21 | } 22 | 23 | /** 24 | * The URI of the original target resource this record holds information about as an unparsed string. 25 | *

26 | * Strips enclosing angle brackets if present as a compatibility quirk with WARC 1.0. 27 | */ 28 | public String target() { 29 | String value = headers().sole("WARC-Target-URI").orElse(null); 30 | 31 | /* 32 | * Quirk: The grammar in the WARC 1.0 standard included angle brackets around the value of WARC-Target-URI. 33 | * This was likely an editing mistake as it was not present in the drafts of the standard, nor in the examples 34 | * or most implementations. The grammar was corrected in WARC 1.1. It is what ended up published as 1.0 though 35 | * and consequently some software in the wild (e.g. Wget) generates WARCs with angle brackets in this field. 36 | */ 37 | if (value != null && value.startsWith("<") && value.endsWith(">")) { 38 | return value.substring(1, value.length() - 1); 39 | } else { 40 | return value; 41 | } 42 | } 43 | 44 | /** 45 | * The URI of the original target resource this record holds information about. 46 | *

47 | * This method uses URIs.parseLeniently() to percent encode characters that are rejected by the URI class and so may 48 | * return a value that is not identical to the value of the WARC-Target-URI field. Using {@link #target()} should 49 | * be preferred unless you actually need an instance of the URI class. 50 | */ 51 | public URI targetURI() { 52 | return URIs.parseLeniently(target()); 53 | } 54 | 55 | /** 56 | * Digest values that were calculated by applying hash functions to payload. 57 | */ 58 | public Optional payloadDigest() { 59 | return headers().sole("WARC-Payload-Digest").map(WarcDigest::new); 60 | } 61 | 62 | /** 63 | * A content-type that was identified by an independent check (not just what the server said). 64 | */ 65 | public Optional identifiedPayloadType() { 66 | return headers().sole("WARC-Identified-Payload-Type").map(MediaType::parseLeniently); 67 | } 68 | 69 | /** 70 | * Returns the payload of this record if one is present. 71 | *

72 | * This method returns an empty optional when the payload is undefined for this record type or if this library does 73 | * not know how to parse the body in order to extract the payload. If the payload is well defined but 74 | * happens to be zero bytes in length this method still returns a WarcPayload object. 75 | */ 76 | public Optional payload() throws IOException { 77 | return Optional.of(new WarcPayload(body()) { 78 | @Override 79 | public MediaType type() { 80 | return contentType(); 81 | } 82 | 83 | @Override 84 | Optional identifiedType() { 85 | return Optional.empty(); 86 | } 87 | 88 | @Override 89 | public Optional digest() { 90 | Optional payloadDigest = payloadDigest(); 91 | return payloadDigest.isPresent() ? payloadDigest : blockDigest(); 92 | } 93 | }); 94 | } 95 | 96 | /** 97 | * The ID of a {@link Warcinfo} record associated with this record. 98 | */ 99 | public Optional warcinfoID() { 100 | return headers().sole("WARC-Warcinfo-ID").map(WarcRecord::parseRecordID); 101 | } 102 | 103 | @Override 104 | public String toString() { 105 | return getClass().getSimpleName() + "<" + date() + " " + target() + ">"; 106 | } 107 | 108 | public static abstract class Builder> extends AbstractBuilder { 109 | public Builder(String type) { 110 | super(type); 111 | } 112 | 113 | public B payloadDigest(WarcDigest payloadDigest) { 114 | return addHeader("WARC-Payload-Digest", payloadDigest.prefixedBase32()); 115 | } 116 | 117 | public B identifiedPayloadType(String identifiedPayloadType) { 118 | return setHeader("WARC-Identified-Payload-Type", identifiedPayloadType); 119 | } 120 | 121 | public B warcinfoId(URI recordId) { 122 | return addHeader("WARC-Warcinfo-ID", WarcRecord.formatId(recordId)); 123 | } 124 | 125 | public B payloadDigest(String algorithm, String value) { 126 | return payloadDigest(new WarcDigest(algorithm, value)); 127 | } 128 | } 129 | } 130 | -------------------------------------------------------------------------------- 
/src/org/netpreserve/jwarc/WarcTruncationReason.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc; 7 | 8 | public enum WarcTruncationReason { 9 | /** 10 | * no truncation occurred 11 | */ 12 | NOT_TRUNCATED, 13 | 14 | /** 15 | * exceeds configured max length 16 | */ 17 | LENGTH, 18 | 19 | /** 20 | * exceeds configured max time 21 | */ 22 | TIME, 23 | 24 | /** 25 | * network disconnect 26 | */ 27 | DISCONNECT, 28 | 29 | /** 30 | * other/unknown reason 31 | */ 32 | UNSPECIFIED 33 | } 34 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/Warcinfo.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc; 7 | 8 | import java.io.IOException; 9 | import java.util.List; 10 | import java.util.Map; 11 | import java.util.Optional; 12 | 13 | import static java.nio.charset.StandardCharsets.UTF_8; 14 | 15 | /** 16 | * The warcinfo record contains information about the web crawl that generated the records following it. 17 | */ 18 | public class Warcinfo extends WarcRecord { 19 | 20 | private MessageHeaders fields; 21 | 22 | Warcinfo(MessageVersion version, MessageHeaders headers, MessageBody body) { 23 | super(version, headers, body); 24 | } 25 | 26 | /** 27 | * The name of the file originally containing this warcinfo record. 28 | */ 29 | public Optional filename() { 30 | return headers().sole("WARC-Filename"); 31 | } 32 | 33 | /** 34 | * Parses the content body as application/warc-fields. 
35 | */ 36 | public MessageHeaders fields() throws IOException { 37 | if (fields == null) { 38 | fields = MessageHeaders.parse(body()); 39 | } 40 | return fields; 41 | } 42 | 43 | public static class Builder extends AbstractBuilder { 44 | public Builder() { 45 | super("warcinfo"); 46 | } 47 | 48 | public Builder filename(String filename) { 49 | return setHeader("WARC-Filename", filename); 50 | } 51 | 52 | @Override 53 | public Warcinfo build() { 54 | return build(Warcinfo::new); 55 | } 56 | 57 | public Builder fields(Map> map) { 58 | return body(MediaType.WARC_FIELDS, MessageHeaders.format(map).getBytes(UTF_8)); 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/cdx/CdxFields.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2021 National Library of Australia 4 | */ 5 | 6 | package org.netpreserve.jwarc.cdx; 7 | 8 | import org.netpreserve.jwarc.*; 9 | 10 | import java.time.format.DateTimeFormatter; 11 | 12 | import static java.time.ZoneOffset.UTC; 13 | 14 | public final class CdxFields { 15 | 16 | static final DateTimeFormatter DATE_FORMAT = DateTimeFormatter.ofPattern("yyyyMMddHHmmss").withZone(UTC); 17 | 18 | private CdxFields() { 19 | } 20 | 21 | public static final byte ORIGINAL_URL = 'a'; 22 | public static final byte DATE = 'b'; 23 | public static final byte CHECKSUM = 'k'; 24 | public static final byte FILENAME = 'g'; 25 | public static final byte MIME_TYPE = 'm'; 26 | public static final byte REDIRECT = 'r'; 27 | public static final byte RESPONSE_CODE = 's'; 28 | public static final byte NORMALIZED_SURT = 'N'; 29 | public static final byte COMPRESSED_RECORD_SIZE = 'S'; 30 | public static final byte COMPRESSED_ARC_FILE_OFFSET = 'V'; 31 | 32 | public static String format(byte field, WarcCaptureRecord record) { 33 | try { 34 | return CdxFormat.CDX11.formatField(field, 
record, null , -1, -1, null); 35 | } catch (Exception e) { 36 | return "-"; 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/cdx/CdxReader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2021 National Library of Australia 4 | */ 5 | 6 | package org.netpreserve.jwarc.cdx; 7 | 8 | import java.io.*; 9 | import java.nio.charset.StandardCharsets; 10 | import java.util.Iterator; 11 | import java.util.NoSuchElementException; 12 | import java.util.Optional; 13 | 14 | public class CdxReader implements Iterable, Closeable { 15 | private final BufferedReader reader; 16 | private CdxFormat format; 17 | 18 | public CdxReader(InputStream stream) { 19 | this(new BufferedReader(new InputStreamReader(stream, StandardCharsets.US_ASCII))); 20 | } 21 | 22 | CdxReader(BufferedReader reader) { 23 | this.reader = reader; 24 | } 25 | 26 | public Optional next() throws IOException { 27 | for (String line = reader.readLine(); line != null; line = reader.readLine()) { 28 | if (line.isEmpty() || line.startsWith("#")) { 29 | continue; // ignore comments 30 | } 31 | if (line.startsWith(" CDX ") || line.startsWith("CDX ")) { 32 | format = new CdxFormat(line); 33 | continue; 34 | } 35 | return Optional.of(new CdxRecord(line, format)); 36 | } 37 | return Optional.empty(); 38 | } 39 | 40 | @Override 41 | public Iterator iterator() { 42 | return new Iterator() { 43 | private CdxRecord next; 44 | 45 | @Override 46 | public boolean hasNext() { 47 | if (next == null) { 48 | try { 49 | next = CdxReader.this.next().orElse(null); 50 | } catch (IOException e) { 51 | throw new UncheckedIOException(e); 52 | } 53 | } 54 | return next != null; 55 | } 56 | 57 | @Override 58 | public CdxRecord next() { 59 | if (!hasNext()) { 60 | throw new NoSuchElementException(); 61 | } 62 | CdxRecord record = next; 63 | next = null; 64 
| return record; 65 | } 66 | }; 67 | } 68 | 69 | @Override 70 | public void close() throws IOException { 71 | reader.close(); 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/cdx/CdxRecord.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2021 National Library of Australia 4 | */ 5 | 6 | package org.netpreserve.jwarc.cdx; 7 | 8 | import org.netpreserve.jwarc.MediaType; 9 | import org.netpreserve.jwarc.URIs; 10 | 11 | import java.io.IOException; 12 | import java.net.URI; 13 | import java.time.Instant; 14 | 15 | public class CdxRecord { 16 | private final String[] values; 17 | private final CdxFormat format; 18 | 19 | CdxRecord(String line, CdxFormat format) throws IOException { 20 | this.values = line.split(" "); 21 | if (format != null) { 22 | this.format = format; 23 | } else if (values.length == 9) { 24 | this.format = CdxFormat.CDX9; 25 | } else if (values.length == 10) { 26 | this.format = CdxFormat.CDX10; 27 | } else if (values.length == 11) { 28 | this.format = CdxFormat.CDX11; 29 | } else { 30 | throw new IOException("Unable to determine the CDX format"); 31 | } 32 | } 33 | 34 | public String get(int field) { 35 | int i = format.indexOf(field); 36 | if (i == -1) return null; 37 | String value = values[i]; 38 | return value.equals("-") ? null : value; 39 | } 40 | 41 | public Instant date() { 42 | String value = get(CdxFields.DATE); 43 | return value == null ? null : CdxFields.DATE_FORMAT.parse(value, Instant::from); 44 | } 45 | 46 | public String filename() { 47 | return get(CdxFields.FILENAME); 48 | } 49 | 50 | public String target() { 51 | return get(CdxFields.ORIGINAL_URL); 52 | } 53 | 54 | public URI targetURI() { 55 | String value = target(); 56 | return value == null ? null : URIs.parseLeniently(value); 57 | } 58 | 59 | /** 60 | * Length of the WARC record in bytes. 
Including headers and measured after any compression is applied. 61 | */ 62 | public Long size() { 63 | String value = get(CdxFields.COMPRESSED_RECORD_SIZE); 64 | return value == null ? null : Long.parseLong(value); 65 | } 66 | 67 | /** 68 | * Position in bytes of the record in the WARC file. 69 | */ 70 | public Long position() { 71 | String value = get(CdxFields.COMPRESSED_ARC_FILE_OFFSET); 72 | return value == null ? null : Long.parseLong(value); 73 | } 74 | 75 | /** 76 | * HTTP response status code. 77 | */ 78 | public Integer status() { 79 | String value = get(CdxFields.RESPONSE_CODE); 80 | return value == null ? null : Integer.parseInt(value); 81 | } 82 | 83 | /** 84 | * A cryptographic digest of the response payload. Most commonly this is a SHA-1 digest in base 32 or an MD5 digest 85 | * in hexadecimal. 86 | */ 87 | public String digest() { 88 | return get(CdxFields.CHECKSUM); 89 | } 90 | 91 | /** 92 | * The value of the Location HTTP header for redirect responses. 93 | */ 94 | public String redirect() { 95 | return get(CdxFields.REDIRECT); 96 | } 97 | 98 | public MediaType contentType() { 99 | String value = get(CdxFields.MIME_TYPE); 100 | return value == null ? 
/**
 * Runs a headless browser executable as a subprocess, optionally through an HTTP proxy
 * and with a custom User-Agent.
 */
public class Browser {
    private final String executable;
    private final String userAgent;
    private final InetSocketAddress proxy;
    // Command-line flags passed on every invocation.
    private final List<String> options = Arrays.asList("--headless", "--disable-gpu", "--ignore-certificate-errors",
            "--hide-scrollbars");
    // Maximum time in milliseconds to wait for the browser process to exit.
    private final static long DEFAULT_TIMEOUT = 60000;

    /**
     * Creates a browser with no User-Agent override.
     *
     * @param executable name or path of the browser executable
     * @param proxy      proxy server address, or null for none
     */
    public static Browser chrome(String executable, InetSocketAddress proxy) {
        return new Browser(executable, proxy, null);
    }

    Browser(String executable, InetSocketAddress proxy, String userAgent) {
        this.executable = executable;
        this.proxy = proxy;
        this.userAgent = userAgent;
    }

    /**
     * Loads the given URI in the browser and waits for the process to exit.
     */
    public void browse(URI uri) throws IOException {
        run(uri.toString());
    }

    /**
     * Takes a screenshot of the given URI and writes it to {@code outfile}.
     */
    public void screenshot(URI uri, Path outfile) throws IOException {
        screenshot(uri.toString(), outfile);
    }

    /**
     * Takes a screenshot of the given URL and writes it to {@code outfile}.
     */
    public void screenshot(String url, Path outfile) throws IOException {
        run("--screenshot=" + outfile, url);
    }

    /**
     * Takes a screenshot into a temporary file which is deleted when the returned channel is closed.
     */
    public FileChannel screenshot(URI uri) throws IOException {
        return screenshot(uri.toString());
    }

    /**
     * Takes a screenshot into a temporary file which is deleted when the returned channel is closed.
     * If the browser fails, the temporary file is removed before the exception propagates.
     */
    public FileChannel screenshot(String uri) throws IOException {
        Path outfile = Files.createTempFile("jwarc-screenshot", ".png");
        try {
            run("--screenshot=" + outfile, uri);
            return FileChannel.open(outfile, DELETE_ON_CLOSE);
        } catch (Exception e) {
            Files.deleteIfExists(outfile);
            throw e;
        }
    }

    /**
     * Builds the command line, starts the browser and waits for it to finish.
     * <p>
     * If the calling thread is interrupted while waiting, the browser process is killed, the
     * interrupt flag is restored and the method returns without throwing.
     *
     * @param args extra arguments appended after the fixed options
     * @throws IOException if the process cannot be started, times out or exits with a non-zero status
     */
    private void run(String... args) throws IOException {
        List<String> command = new ArrayList<>();
        command.add(executable);
        command.addAll(options);
        if (proxy != null) {
            command.add("--proxy-server=" + proxy.getHostString() + ":" + proxy.getPort());
        }
        if (userAgent != null) {
            command.add("--user-agent=" + userAgent);
        }
        command.addAll(Arrays.asList(args));

        // Started outside the try block so the interrupt handler below can reach the process.
        Process process = new ProcessBuilder(command)
                .inheritIO()
                .redirectOutput(devNull())
                .start();
        try {
            if (DEFAULT_TIMEOUT > 0) {
                if (!process.waitFor(DEFAULT_TIMEOUT, TimeUnit.MILLISECONDS)) {
                    // Ask politely first, then force termination if it is still alive.
                    process.destroy();
                    process.waitFor(DEFAULT_TIMEOUT, TimeUnit.MILLISECONDS);
                    process.destroyForcibly();
                    throw new IOException("timed out after " + DEFAULT_TIMEOUT + "ms");
                }
            } else {
                process.waitFor();
            }
            if (process.exitValue() != 0) {
                throw new IOException("browser returned exit status: " + process.exitValue());
            }
        } catch (InterruptedException e) {
            // Don't leave an orphaned browser running once we stop waiting for it.
            process.destroyForcibly();
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Returns the platform's null device, used to discard the browser's standard output.
     */
    private static File devNull() {
        return new File(System.getProperty("os.name").startsWith("Windows") ? "NUL" : "/dev/null");
    }
}

10 | * Hods the location of a particular captured version of a resource. 11 | */ 12 | class Capture { 13 | private final String uri; 14 | private final Instant date; 15 | private final Path file; 16 | private final long position; 17 | 18 | Capture(String uri, Instant date) { 19 | this(uri, date, null, -1); 20 | } 21 | 22 | Capture(String uri, Instant date, Path file, long position) { 23 | this.uri = uri; 24 | this.date = date; 25 | this.file = file; 26 | this.position = position; 27 | } 28 | 29 | Instant date() { 30 | return date; 31 | } 32 | 33 | String uri() { 34 | return uri; 35 | } 36 | 37 | Path file() { 38 | return file; 39 | } 40 | 41 | long position() { 42 | return position; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/net/CaptureIndex.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc.net; 2 | 3 | import org.netpreserve.jwarc.*; 4 | 5 | import java.io.IOException; 6 | import java.net.URI; 7 | import java.nio.file.Path; 8 | import java.time.Instant; 9 | import java.util.List; 10 | import java.util.NavigableSet; 11 | import java.util.TreeSet; 12 | 13 | import static java.util.Comparator.comparing; 14 | 15 | public class CaptureIndex { 16 | private final NavigableSet entries = new TreeSet<>(comparing(Capture::uri).thenComparing(Capture::date)); 17 | private Capture entrypoint; 18 | 19 | public CaptureIndex(List warcs) throws IOException { 20 | for (Path warc : warcs) { 21 | try (WarcReader reader = new WarcReader(warc)) { 22 | for (WarcRecord record : reader) { 23 | if ((record instanceof WarcResponse || record instanceof WarcResource)) { 24 | WarcCaptureRecord capture = (WarcCaptureRecord) record; 25 | if (URIs.hasHttpOrHttpsScheme(capture.target())) { 26 | Capture entry = new Capture(capture.target(), capture.date(), warc, reader.position()); 27 | add(entry); 28 | if (entrypoint == null && 
MediaType.HTML.equals(capture.payloadType().base())) { 29 | entrypoint = entry; 30 | } 31 | } 32 | } 33 | } 34 | } 35 | } 36 | } 37 | 38 | void add(Capture capture) { 39 | entries.add(capture); 40 | } 41 | 42 | NavigableSet query(String uri) { 43 | return entries.subSet(new Capture(uri, Instant.MIN), true, new Capture(uri, Instant.MAX), true); 44 | } 45 | 46 | Capture entrypoint() { 47 | return entrypoint; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/net/HttpExchange.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc.net; 2 | 3 | import org.netpreserve.jwarc.HttpRequest; 4 | import org.netpreserve.jwarc.HttpResponse; 5 | import org.netpreserve.jwarc.IOUtils; 6 | import org.netpreserve.jwarc.MediaType; 7 | 8 | import javax.net.ssl.SSLProtocolException; 9 | import java.io.IOException; 10 | import java.io.OutputStream; 11 | import java.net.Socket; 12 | import java.net.SocketException; 13 | import java.util.regex.Matcher; 14 | 15 | import static java.nio.charset.StandardCharsets.UTF_8; 16 | 17 | class HttpExchange { 18 | private static final MediaType HTML_UTF8 = MediaType.parse("text/html;charset=utf-8"); 19 | 20 | private final Socket socket; 21 | private final HttpRequest request; 22 | private final Matcher matcher; 23 | 24 | HttpExchange(Socket socket, HttpRequest request, Matcher matcher) { 25 | this.socket = socket; 26 | this.request = request; 27 | this.matcher = matcher; 28 | } 29 | 30 | public String param(int i) { 31 | return matcher.group(i); 32 | } 33 | 34 | public HttpRequest request() { 35 | return request; 36 | } 37 | 38 | public void redirect(String location) throws IOException { 39 | send(new HttpResponse.Builder(307, "Redirect") 40 | .addHeader("Content-Length", "0") 41 | .addHeader("Location", location) 42 | .build()); 43 | } 44 | 45 | public void send(int status, String html) throws IOException { 46 
| send(status, HTML_UTF8, html); 47 | } 48 | 49 | public void send(int status, MediaType type, String body) throws IOException { 50 | send(new HttpResponse.Builder(status, " ").body(type, body.getBytes(UTF_8)).build()); 51 | } 52 | 53 | public void send(HttpResponse response) throws IOException { 54 | try { 55 | OutputStream outputStream = socket.getOutputStream(); 56 | outputStream.write(response.serializeHeader()); 57 | IOUtils.copy(response.body().stream(), outputStream); 58 | } catch (SSLProtocolException | SocketException e) { 59 | socket.close(); // client probably closed 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/net/HttpHandler.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc.net; 2 | 3 | import java.io.IOException; 4 | 5 | @FunctionalInterface 6 | public interface HttpHandler { 7 | void handle(HttpExchange exchange) throws Exception; 8 | } 9 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/net/WarcRecorder.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc.net; 2 | 3 | import org.netpreserve.jwarc.*; 4 | 5 | import java.io.*; 6 | import java.net.ServerSocket; 7 | import java.net.Socket; 8 | import java.net.URI; 9 | import java.nio.channels.Channels; 10 | import java.util.List; 11 | import java.util.Map; 12 | 13 | import static java.nio.charset.StandardCharsets.*; 14 | 15 | /** 16 | * HTTP proxy which records requests and responses as WARC records. 
17 | */ 18 | public class WarcRecorder extends HttpServer { 19 | private final WarcWriter warcWriter; 20 | 21 | public WarcRecorder(ServerSocket serverSocket, WarcWriter warcWriter) { 22 | super(serverSocket); 23 | this.warcWriter = warcWriter; 24 | try { 25 | on("GET", "/", WarcServer.resource("recorder.html")); 26 | on("GET", "/__jwarc__/recorder-sw.js", WarcServer.resource("recorder-sw.js")); 27 | } catch (IOException e) { 28 | throw new UncheckedIOException(e); 29 | } 30 | } 31 | 32 | @Override 33 | void handle(Socket socket, String target, HttpRequest httpRequest) throws Exception { 34 | boolean rewriteHeaders = false; 35 | if (target.startsWith("/__jwarc__/record/")) { 36 | target = target.substring("/__jwarc__/record/".length()); 37 | rewriteHeaders = true; 38 | } else if (target.startsWith("/")) { 39 | super.handle(socket, target, httpRequest); 40 | return; 41 | } 42 | URI uri = new URI(target); 43 | if (uri.getPath().isEmpty()) { 44 | uri = new URI(uri.getScheme(), uri.getUserInfo(), uri.getHost(), uri.getPort(), "/", uri.getQuery(), uri.getFragment()); 45 | } 46 | String path = uri.getRawPath(); 47 | if (uri.getRawQuery() != null) { 48 | path += "?" + uri.getRawQuery(); 49 | } 50 | HttpRequest.Builder rb = new HttpRequest.Builder(httpRequest.method(), path).version(MessageVersion.HTTP_1_0); 51 | for (Map.Entry> e : httpRequest.headers().map().entrySet()) { 52 | if (e.getKey().equalsIgnoreCase("TE")) continue; 53 | if (e.getKey().equalsIgnoreCase("Accept-Encoding")) continue; 54 | if (e.getKey().equalsIgnoreCase("Connection")) continue; 55 | for (String v : e.getValue()) { 56 | rb.addHeader(e.getKey(), v); 57 | } 58 | } 59 | rb.setHeader("Host", uri.getPort() != -1 ? 
uri.getHost() + ":" + uri.getPort() : uri.getHost()); 60 | OutputStream outputStream = socket.getOutputStream(); 61 | if (rewriteHeaders) outputStream = new HeaderRewriter(outputStream); 62 | warcWriter.fetch(uri, rb.build(), outputStream); 63 | socket.close(); 64 | } 65 | 66 | private static class HeaderRewriter extends FilterOutputStream { 67 | private ByteArrayOutputStream buffer = new ByteArrayOutputStream(); 68 | private static final byte[] SENTINEL = "\r\n\r\n".getBytes(US_ASCII); 69 | private int state = 0; 70 | 71 | public HeaderRewriter(OutputStream out) { 72 | super(out); 73 | } 74 | 75 | @Override 76 | public void write(int b) throws IOException { 77 | write(new byte[]{(byte) b}, 0, 1); 78 | } 79 | 80 | @Override 81 | public void write(byte[] b, int off, int len) throws IOException { 82 | if (state == SENTINEL.length) { 83 | out.write(b, off, len); 84 | return; 85 | } 86 | for (int i = off; i < off + len; i++) { 87 | if (b[i] == SENTINEL[state]) { 88 | state++; 89 | if (state == SENTINEL.length) { 90 | buffer.write(b, off, i - off + 1); 91 | HttpResponse response = HttpResponse.parseWithoutBody(Channels.newChannel(new ByteArrayInputStream(buffer.toByteArray())), null); 92 | HttpResponse.Builder builder = new HttpResponse.Builder(response.status(), response.reason()) 93 | .version(response.version()) 94 | .addHeaders(response.headers().map()) 95 | .setHeader("Content-Length", null) 96 | .setHeader("Connection", "close") 97 | .setHeader("X-Frame-Options", null) 98 | .setHeader("Content-Security-Policy-Report-Only", null); 99 | response.headers().first("Location").ifPresent(location -> 100 | builder.setHeader("Location", "/__jwarc__/record/" + location)); 101 | out.write(builder.build().serializeHeader()); 102 | out.write(b, i + 1, len - (i - off + 1)); 103 | buffer = null; 104 | return; 105 | } 106 | } else { 107 | state = 0; 108 | } 109 | } 110 | buffer.write(b, off, len); 111 | } 112 | } 113 | } 114 | 
-------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/net/WarcRenderer.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc.net; 2 | 3 | import org.netpreserve.jwarc.MediaType; 4 | import org.netpreserve.jwarc.WarcResource; 5 | import org.netpreserve.jwarc.WarcWriter; 6 | 7 | import java.io.Closeable; 8 | import java.io.IOException; 9 | import java.net.InetAddress; 10 | import java.net.InetSocketAddress; 11 | import java.net.ServerSocket; 12 | import java.net.URI; 13 | import java.nio.channels.FileChannel; 14 | import java.nio.file.Files; 15 | import java.nio.file.Path; 16 | import java.time.Instant; 17 | import java.time.format.DateTimeFormatter; 18 | 19 | import static java.time.ZoneOffset.UTC; 20 | 21 | public class WarcRenderer implements Closeable { 22 | private static final DateTimeFormatter ARC_TIME = DateTimeFormatter.ofPattern("yyyyMMddHHmmss").withZone(UTC); 23 | 24 | private final ServerSocket proxySocket; 25 | private final WarcServer server; 26 | private final String browserExecutable; 27 | 28 | public WarcRenderer(CaptureIndex index) throws IOException { 29 | this(index, System.getenv().getOrDefault("BROWSER", "google-chrome")); 30 | } 31 | 32 | public WarcRenderer(CaptureIndex index, String browserExecutable) throws IOException { 33 | this.proxySocket = new ServerSocket(0, -1, InetAddress.getLoopbackAddress()); 34 | this.server = new WarcServer(proxySocket, index); 35 | this.browserExecutable = browserExecutable; 36 | new Thread(server::listen).start(); 37 | } 38 | 39 | public void screenshot(URI uri, Instant date, WarcWriter warcWriter) throws IOException { 40 | screenshot(uri.toString(), date, warcWriter); 41 | } 42 | 43 | public void screenshot(String url, Instant date, WarcWriter warcWriter) throws IOException { 44 | Path screenshot = Files.createTempFile("jwarc-screenshot", ".png"); 45 | try { 46 | Browser browser = new 
Browser(browserExecutable, (InetSocketAddress) proxySocket.getLocalSocketAddress(), 47 | "WarcRenderer (arctime/" + ARC_TIME.format(date) + ")"); 48 | browser.screenshot(url, screenshot); 49 | try (FileChannel channel = FileChannel.open(screenshot)) { 50 | long size = channel.size(); 51 | if (size == 0) return; 52 | warcWriter.write(new WarcResource.Builder(URI.create("screenshot:" + url)) 53 | .date(date) 54 | .body(MediaType.parse("image/png"), channel, size) 55 | .build()); 56 | } 57 | } finally { 58 | Files.deleteIfExists(screenshot); 59 | } 60 | } 61 | 62 | @Override 63 | public void close() throws IOException { 64 | proxySocket.close(); 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/net/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes for constructing network services which operate on WARC files. 3 | */ 4 | package org.netpreserve.jwarc.net; 5 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes for parsing, serializing and manipulating WARC records. 
3 | */ 4 | package org.netpreserve.jwarc; -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/tools/CdxTool.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2021 National Library of Australia 4 | */ 5 | 6 | package org.netpreserve.jwarc.tools; 7 | 8 | 9 | import org.netpreserve.jwarc.WarcRecord; 10 | import org.netpreserve.jwarc.WarcRevisit; 11 | import org.netpreserve.jwarc.cdx.CdxFormat; 12 | import org.netpreserve.jwarc.cdx.CdxWriter; 13 | 14 | 15 | import java.io.IOException; 16 | import java.io.OutputStreamWriter; 17 | import java.nio.file.Path; 18 | import java.nio.file.Paths; 19 | import java.util.ArrayList; 20 | import java.util.List; 21 | import java.util.function.Predicate; 22 | 23 | public class CdxTool { 24 | public static void main(String[] args) throws IOException { 25 | List files = new ArrayList<>(); 26 | CdxFormat.Builder cdxFormatBuilder = new CdxFormat.Builder(); 27 | boolean printHeader = true; 28 | boolean fullFilePath = false; 29 | boolean postAppend = false; 30 | Predicate filter = null; 31 | for (int i = 0; i < args.length; i++) { 32 | if (args[i].startsWith("-")) { 33 | switch (args[i]) { 34 | case "-f": 35 | case "--format": 36 | String format = args[++i]; 37 | switch (format) { 38 | case "CDX9": 39 | cdxFormatBuilder.legend(CdxFormat.CDX9_LEGEND); 40 | break; 41 | case "CDX10": 42 | cdxFormatBuilder.legend(CdxFormat.CDX10_LEGEND); 43 | break; 44 | case "CDX11": 45 | cdxFormatBuilder.legend(CdxFormat.CDX11_LEGEND); 46 | break; 47 | default: 48 | cdxFormatBuilder.legend(format); 49 | break; 50 | } 51 | break; 52 | case "-h": 53 | case "--help": 54 | System.out.println("Usage: jwarc cdx [--format LEGEND] warc-files..."); 55 | System.out.println(); 56 | System.out.println(" -d, --digest-unchanged Include records with unchanged digest"); 57 | System.out.println(" -f, --format LEGEND 
CDX format may be CDX9, CDX11 or a custom legend"); 58 | System.out.println(" --no-header Don't print the CDX header line"); 59 | System.out.println(" -p, --post-append Append the request body to the urlkey field"); 60 | System.out.println(" --revisits-excluded Don't index revisit records"); 61 | System.out.println(" -w, --warc-full-path Use absolute paths for the filename field"); 62 | return; 63 | case "--no-header": 64 | printHeader = false; 65 | break; 66 | case "-p": 67 | case "--post-append": 68 | postAppend = true; 69 | break; 70 | case "-d": 71 | case "--digest-unchanged": 72 | cdxFormatBuilder.digestUnchanged(); 73 | break; 74 | case "-r": 75 | case "--revisits-included": 76 | filter = null; 77 | break; 78 | case "--revisits-excluded": 79 | filter = record -> !(record instanceof WarcRevisit); 80 | break; 81 | case "-w": 82 | case "--warc-full-path": 83 | fullFilePath = true; 84 | break; 85 | default: 86 | System.err.println("Unrecognized option: " + args[i]); 87 | System.err.println("Usage: jwarc cdx [--format LEGEND] warc-files..."); 88 | System.exit(1); 89 | return; 90 | } 91 | } else { 92 | files.add(Paths.get(args[i])); 93 | } 94 | } 95 | 96 | try (CdxWriter cdxWriter = new CdxWriter(new OutputStreamWriter(System.out))) { 97 | cdxWriter.onWarning(System.err::println); 98 | cdxWriter.setFormat(cdxFormatBuilder.build()); 99 | cdxWriter.setPostAppend(postAppend); 100 | cdxWriter.setRecordFilter(filter); 101 | 102 | if (printHeader) cdxWriter.writeHeaderLine(); 103 | cdxWriter.process(files, fullFilePath); 104 | } 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/tools/FetchTool.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc.tools; 2 | 3 | import org.netpreserve.jwarc.FetchOptions; 4 | import org.netpreserve.jwarc.WarcWriter; 5 | 6 | import java.io.IOException; 7 | import java.net.URI; 8 | import 
java.net.URISyntaxException; 9 | import java.nio.file.Path; 10 | import java.nio.file.Paths; 11 | import java.util.ArrayList; 12 | import java.util.List; 13 | 14 | public class FetchTool { 15 | public static void main(String[] args) throws IOException, URISyntaxException { 16 | FetchOptions options = new FetchOptions(); 17 | List urls = new ArrayList<>(); 18 | Path outputFile = null; 19 | for (int i = 0; i < args.length; i++) { 20 | switch (args[i]) { 21 | case "-h": 22 | case "--help": 23 | System.out.println("Usage: jwarc fetch [options] url..."); 24 | System.out.println("Fetches a URL while writing the request and response as WARC records"); 25 | System.out.println(); 26 | System.out.println("Options:"); 27 | System.out.println(" -A, --user-agent STRING Sets the User-Agent header"); 28 | System.out.println(" --read-timeout MILLIS Sets the socket read timeout"); 29 | System.out.println(" --max-length BYTES Truncate response after BYTES received"); 30 | System.out.println(" --max-time MILLIS Truncate response after MILLIS elapsed"); 31 | System.out.println(" -o, --output-file FILE Write WARC records to FILE instead of stdout"); 32 | System.out.println(); 33 | System.exit(0); 34 | break; 35 | case "-A": 36 | case "--user-agent": 37 | options.userAgent(args[++i]); 38 | break; 39 | case "--read-timeout": 40 | options.readTimeout(Integer.parseInt(args[++i])); 41 | break; 42 | case "--max-length": 43 | options.maxLength(Integer.parseInt(args[++i])); 44 | break; 45 | case "--max-time": 46 | options.maxTime(Integer.parseInt(args[++i])); 47 | break; 48 | case "-o": 49 | case "--output-file": 50 | outputFile = Paths.get(args[++i]); 51 | break; 52 | default: 53 | if (args[i].startsWith("-")) { 54 | System.err.println("Unknown option: " + args[i]); 55 | System.exit(1); 56 | } 57 | urls.add(new URI(args[i])); 58 | } 59 | } 60 | if (urls.isEmpty()) { 61 | System.err.println("No URLs specified. 
Try: jwarc fetch --help"); 62 | System.exit(1); 63 | } 64 | try (WarcWriter writer = outputFile == null ? new WarcWriter(System.out) : new WarcWriter(outputFile)) { 65 | Runtime.getRuntime().addShutdownHook(new Thread(() -> { 66 | try { 67 | // Ensure current progress is written before exiting. 68 | writer.close(); 69 | } catch (IOException e) { 70 | e.printStackTrace(); 71 | } 72 | }, "FetchToolShutdownHook")); 73 | for (URI url : urls) { 74 | writer.fetch(url, options); 75 | } 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/tools/FilterTool.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc.tools; 2 | 3 | import org.netpreserve.jwarc.*; 4 | 5 | import java.io.IOException; 6 | import java.nio.file.Paths; 7 | import java.util.Arrays; 8 | 9 | public class FilterTool { 10 | public static void main(String[] args) throws Exception { 11 | try { 12 | String[] files; 13 | if (args.length == 0) { 14 | System.err.println("Usage: jwarc filter [warc-file]..."); 15 | System.err.println(" e.g. jwarc filter 'warc-type == \"response\" && http:content-type =~ \"image/.*\" && :status == 200' example.warc"); 16 | System.exit(1); 17 | return; 18 | } else if (args.length > 1) { 19 | files = Arrays.copyOfRange(args, 1, args.length); 20 | } else { 21 | if (System.console() != null) { 22 | System.err.println("Warning: No input files specified, reading from STDIN"); 23 | } 24 | files = new String[]{"-"}; 25 | } 26 | WarcFilter filter = WarcFilter.compile(args[0]); 27 | try (WarcWriter writer = new WarcWriter(System.out)) { 28 | for (String file : files) { 29 | try (WarcReader reader = file.equals("-") ? 
new WarcReader(System.in) : new WarcReader(Paths.get(file))) { 30 | filterRecords(filter, writer, reader); 31 | } 32 | } 33 | } 34 | } catch (WarcFilterException e) { 35 | System.err.println(e.prettyPrint()); 36 | System.exit(2); 37 | } 38 | } 39 | 40 | private static void filterRecords(WarcFilter filter, WarcWriter writer, WarcReader reader) throws IOException { 41 | for (WarcRecord record : reader) { 42 | if (filter.test(record)) { 43 | writer.write(record); 44 | } 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/tools/ListTool.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc.tools; 2 | 3 | import org.netpreserve.jwarc.*; 4 | 5 | import java.io.IOException; 6 | import java.nio.file.Paths; 7 | 8 | public class ListTool { 9 | public static void main(String[] args) throws IOException { 10 | for (String arg : args) { 11 | try (WarcReader reader = new WarcReader(Paths.get(arg))) { 12 | for (WarcRecord record : reader) { 13 | String url = "-"; 14 | if (record instanceof WarcTargetRecord) { 15 | url = ((WarcTargetRecord) record).target(); 16 | } 17 | 18 | String methodOrStatus = "-"; 19 | if (record.contentType().base().equals(MediaType.HTTP)) { 20 | if (record instanceof WarcRequest) { 21 | methodOrStatus = ((WarcRequest) record).http().method(); 22 | } else if (record instanceof WarcResponse) { 23 | methodOrStatus = String.valueOf(((WarcResponse) record).http().status()); 24 | } 25 | } 26 | 27 | System.out.format("%10d %-10s %-4s %s\n", reader.position(), record.type(), methodOrStatus, url); 28 | } 29 | } 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/tools/RecordTool.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc.tools; 2 | 3 | import 
org.netpreserve.jwarc.WarcWriter; 4 | import org.netpreserve.jwarc.net.Browser; 5 | import org.netpreserve.jwarc.net.WarcRecorder; 6 | 7 | import java.net.InetAddress; 8 | import java.net.InetSocketAddress; 9 | import java.net.ServerSocket; 10 | import java.net.URI; 11 | 12 | public class RecordTool { 13 | public static void main(String[] args) throws Exception { 14 | try (ServerSocket socket = new ServerSocket(0, -1, InetAddress.getLoopbackAddress())) { 15 | WarcRecorder recorder = new WarcRecorder(socket, new WarcWriter(System.out)); 16 | new Thread(recorder::listen).start(); 17 | InetSocketAddress proxy = (InetSocketAddress) socket.getLocalSocketAddress(); 18 | System.err.println("WarcRecorder listening on " + proxy); 19 | String executable = System.getenv().getOrDefault("BROWSER", "google-chrome"); 20 | Browser browser = Browser.chrome(executable, proxy); 21 | for (String arg : args) { 22 | browser.browse(URI.create(arg)); 23 | } 24 | } 25 | System.exit(0); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/tools/ScreenshotTool.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc.tools; 2 | 3 | import org.netpreserve.jwarc.*; 4 | import org.netpreserve.jwarc.net.CaptureIndex; 5 | import org.netpreserve.jwarc.net.WarcRenderer; 6 | 7 | import java.io.IOException; 8 | import java.nio.file.Path; 9 | import java.nio.file.Paths; 10 | import java.util.List; 11 | import java.util.stream.Collectors; 12 | import java.util.stream.Stream; 13 | 14 | /** Command-line tool that takes a screenshot of each normal page capture found in the WARC files given as arguments. */ public class ScreenshotTool { 15 | public static void main(String[] args) throws Exception { 16 | List<Path> warcs = Stream.of(args).map(Paths::get).collect(Collectors.toList()); 17 | try (WarcWriter warcWriter = new WarcWriter(System.out); 18 | WarcRenderer renderer = new WarcRenderer(new CaptureIndex(warcs))) { 19 | for (String arg : args) { 20 | try (WarcReader reader = new 
WarcReader(Paths.get(arg))) { 21 | for (WarcRecord record : reader) { 22 | if (!isNormalPage(record)) continue; 23 | WarcCaptureRecord capture = (WarcCaptureRecord) record; 24 | renderer.screenshot(capture.target(), capture.date(), warcWriter); 25 | } 26 | } 27 | } 28 | } 29 | } 30 | 31 | private static boolean isNormalPage(WarcRecord record) throws IOException { 32 | if (!(record instanceof WarcResponse) && !(record instanceof WarcResource)) { 33 | return false; 34 | } 35 | WarcCaptureRecord capture = (WarcCaptureRecord) record; 36 | if (!(URIs.hasHttpOrHttpsScheme(capture.target()))) { 37 | return false; 38 | } 39 | try { 40 | if (!(capture.payload().isPresent() && capture.payload().get().type().base().equals(MediaType.HTML))) { 41 | return false; 42 | } 43 | } catch (IllegalArgumentException e) { 44 | return false; 45 | } 46 | return !(capture instanceof WarcResponse) || ((WarcResponse) capture).http().status() == 200; 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/tools/ServeTool.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc.tools; 2 | 3 | import org.netpreserve.jwarc.net.WarcServer; 4 | 5 | import java.net.ServerSocket; 6 | import java.nio.file.Path; 7 | import java.nio.file.Paths; 8 | import java.util.List; 9 | import java.util.stream.Collectors; 10 | import java.util.stream.Stream; 11 | 12 | /** Command-line tool that serves the WARC files given as arguments; the port is read from the PORT environment variable (default 8080). */ public class ServeTool { 13 | public static void main(String[] args) throws Exception { 14 | if (args.length == 0) { 15 | System.err.println("Usage: WarcTool serve <warc-files...>"); 16 | System.err.println("Obeys environment variable PORT."); 17 | System.exit(1); 18 | } 19 | List<Path> warcs = Stream.of(args).map(Paths::get).collect(Collectors.toList()); 20 | int port = Integer.parseInt(System.getenv().getOrDefault("PORT", "8080")); 21 | WarcServer server = new WarcServer(new ServerSocket(port), warcs); 22 | System.err.println("Listening 
on port " + port); 23 | server.listen(); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/tools/Utils.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc.tools; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.net.URL; 6 | import java.util.Properties; 7 | 8 | class Utils { 9 | static String getJwarcVersion() { 10 | Properties properties = new Properties(); 11 | URL resource = WarcTool.class.getResource("/META-INF/maven/org.netpreserve/jwarc/pom.properties"); 12 | if (resource != null) { 13 | try (InputStream stream = resource.openStream()) { 14 | properties.load(stream); 15 | } catch (IOException e) { 16 | // alas! 17 | } 18 | } 19 | return properties.getProperty("version"); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/tools/WarcTool.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc.tools; 2 | 3 | import java.util.Arrays; 4 | 5 | public class WarcTool { 6 | public static void main(String[] args) throws Exception { 7 | if (args.length == 0) { 8 | usage(); 9 | return; 10 | } 11 | String[] rest = Arrays.copyOfRange(args, 1, args.length); 12 | switch (args[0]) { 13 | case "cdx": 14 | CdxTool.main(rest); 15 | break; 16 | case "dedupe": 17 | DedupeTool.main(rest); 18 | break; 19 | case "extract": 20 | ExtractTool.main(rest); 21 | break; 22 | case "fetch": 23 | FetchTool.main(rest); 24 | break; 25 | case "filter": 26 | FilterTool.main(rest); 27 | break; 28 | case "-h": 29 | case "--help": 30 | case "help": 31 | usage(); 32 | break; 33 | case "ls": 34 | ListTool.main(rest); 35 | break; 36 | case "record": 37 | RecordTool.main(rest); 38 | break; 39 | case "recorder": 40 | RecorderTool.main(rest); 41 | break; 42 | case "saveback": 43 | 
SavebackTool.main(rest); 44 | break; 45 | case "screenshot": 46 | ScreenshotTool.main(rest); 47 | break; 48 | case "serve": 49 | ServeTool.main(rest); 50 | break; 51 | case "stats": 52 | StatsTool.main(rest); 53 | break; 54 | case "validate": 55 | ValidateTool.main(rest); 56 | break; 57 | case "--version": 58 | case "version": 59 | version(); 60 | break; 61 | default: 62 | System.err.println("jwarc: '" + args[0] + "' is not a jwarc command. See 'jwarc help'."); 63 | System.exit(1); 64 | } 65 | } 66 | 67 | /** Prints the usage line and the list of available commands to standard output. */ private static void usage() { 68 | System.out.println("usage: jwarc <command> [args]..."); 69 | System.out.println(); 70 | System.out.println("Commands:"); 71 | System.out.println(); 72 | System.out.println(" cdx List records in CDX format"); 73 | System.out.println(" dedupe Deduplicate records by looking up a CDX server"); 74 | System.out.println(" extract Extract record by offset"); 75 | System.out.println(" fetch Download a URL recording the request and response"); 76 | System.out.println(" filter Copy records that match a given filter expression"); 77 | System.out.println(" ls List records in WARC file(s)"); 78 | System.out.println(" record Fetch a page and subresources using headless Chrome"); 79 | System.out.println(" recorder Run a recording proxy"); 80 | System.out.println(" saveback Saves wayback-style replayed pages as WARC records"); 81 | System.out.println(" screenshot Take a screenshot of each page in the given WARCs"); 82 | System.out.println(" serve Serve WARC files with a basic replay server/proxy"); 83 | System.out.println(" stats Print statistics about WARC and CDX files"); 84 | System.out.println(" validate Validate WARC or ARC files"); 85 | System.out.println(" version Print version information"); 86 | } 87 | 88 | private static void version() { 89 | String version = Utils.getJwarcVersion(); 90 | System.out.println("jwarc " + (version == null ? 
"unknown version" : version)); 91 | System.out.println(System.getProperty("java.vm.name") + " " + System.getProperty("java.version")); 92 | System.out.println(System.getProperty("os.name") + " " + System.getProperty("os.version") + " " + System.getProperty("os.arch")); 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/org/netpreserve/jwarc/tools/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Command-line tools for manipulating WARC files. 3 | */ 4 | package org.netpreserve.jwarc.tools; -------------------------------------------------------------------------------- /test-resources/org/netpreserve/jwarc/cc.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/jwarc/7828aa0bae9b52ac2b31c3e783ee9ce3817feeda/test-resources/org/netpreserve/jwarc/cc.warc.gz -------------------------------------------------------------------------------- /test-resources/org/netpreserve/jwarc/gzip_extra_sl.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iipc/jwarc/7828aa0bae9b52ac2b31c3e783ee9ce3817feeda/test-resources/org/netpreserve/jwarc/gzip_extra_sl.warc.gz -------------------------------------------------------------------------------- /test/org/netpreserve/jwarc/ChunkedBodyTest.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc; 2 | 3 | import org.junit.Test; 4 | import org.netpreserve.jwarc.ChunkedBody; 5 | import org.netpreserve.jwarc.ParsingException; 6 | 7 | import java.io.ByteArrayInputStream; 8 | import java.io.EOFException; 9 | import java.io.IOException; 10 | import java.nio.ByteBuffer; 11 | import java.nio.channels.Channels; 12 | import java.nio.channels.ReadableByteChannel; 13 | import java.nio.charset.StandardCharsets; 14 | import 
java.util.Arrays; 15 | 16 | import static java.nio.charset.StandardCharsets.US_ASCII; 17 | import static org.junit.Assert.*; 18 | 19 | public class ChunkedBodyTest { 20 | @Test 21 | public void test() throws IOException { 22 | byte[] one = "3\r\nhel\r\n0007\r\nlo ".getBytes(US_ASCII); 23 | byte[] two = "worl\r\n1\r\nd\r\n00000\r\n\r\n".getBytes(US_ASCII); 24 | ReadableByteChannel chan = Channels.newChannel(new ByteArrayInputStream(two)); 25 | ByteBuffer b1 = ByteBuffer.wrap(one); 26 | ChunkedBody decoder = new ChunkedBody(chan, b1); 27 | ByteBuffer buf = ByteBuffer.allocate(32); 28 | while (true) { 29 | int n = decoder.read(buf); 30 | assertNotEquals(0, n); 31 | if (n == -1) { 32 | break; 33 | } 34 | } 35 | assertFalse(b1.hasRemaining()); 36 | assertEquals("hello world", new String(Arrays.copyOf(buf.array(), buf.position()), US_ASCII)); 37 | } 38 | 39 | @Test(expected = ParsingException.class) 40 | public void testErr() throws IOException { 41 | new ChunkedBody(Channels.newChannel(new ByteArrayInputStream(new byte[0])), ByteBuffer.allocate(16)) 42 | .strict() 43 | .read(ByteBuffer.allocate(32)); 44 | } 45 | 46 | @Test(expected = EOFException.class) 47 | public void testEOF() throws IOException { 48 | ByteBuffer buf = ByteBuffer.allocate(16); 49 | buf.flip(); 50 | new ChunkedBody(Channels.newChannel(new ByteArrayInputStream(new byte[0])), buf) 51 | .read(ByteBuffer.allocate(32)); 52 | } 53 | 54 | /** Test optimisation when internal buffer is bypassed on large chunks */ 55 | @Test 56 | public void testBypassInternalBuffer() throws IOException { 57 | String bodyString = "hello world, hello world!"; 58 | byte[] body = ("19\r\n" + bodyString + "\r\n00000\r\n\r\n").getBytes(US_ASCII); 59 | ByteBuffer buf = ByteBuffer.allocate(8192); 60 | ByteBuffer initBuf = ByteBuffer.allocate(12); 61 | initBuf.flip(); 62 | ReadableByteChannel chan = Channels.newChannel(new ByteArrayInputStream(body)); 63 | ChunkedBody decoder = new ChunkedBody(chan, initBuf); 64 | while (true) { 65 | 
int n = decoder.read(buf); 66 | assertNotEquals(0, n); 67 | if (n < 0) { 68 | break; 69 | } 70 | } 71 | assertFalse(initBuf.hasRemaining()); 72 | assertEquals(bodyString, new String(Arrays.copyOf(buf.array(), buf.position()), US_ASCII)); 73 | } 74 | 75 | /** Test trailing whitespace after chunk length (#33) */ 76 | @Test 77 | public void testChunkLengthTrailingWhiteSpace() throws IOException { 78 | String bodyString = "hello world, hello world!"; 79 | byte[] body = ("19 \r\n" + bodyString + "\r\n00000\r\n\r\n").getBytes(US_ASCII); 80 | ByteBuffer buf = ByteBuffer.allocate(8192); 81 | ByteBuffer initBuf = ByteBuffer.allocate(8192); 82 | initBuf.flip(); 83 | ReadableByteChannel chan = Channels.newChannel(new ByteArrayInputStream(body)); 84 | ChunkedBody decoder = new ChunkedBody(chan, initBuf); 85 | while (true) { 86 | int n = decoder.read(buf); 87 | assertNotEquals(0, n); 88 | if (n < 0) { 89 | break; 90 | } 91 | } 92 | assertFalse(initBuf.hasRemaining()); 93 | assertEquals(bodyString, new String(Arrays.copyOf(buf.array(), buf.position()), US_ASCII)); 94 | } 95 | 96 | @Test 97 | public void testLenientMode() throws IOException { 98 | String string = "33hello world!"; 99 | byte[] body = string.getBytes(US_ASCII); 100 | ReadableByteChannel chan = Channels.newChannel(new ByteArrayInputStream(body)); 101 | ByteBuffer buf = ByteBuffer.allocate(100); 102 | ByteBuffer initBuf = ByteBuffer.allocate(100); 103 | initBuf.flip(); 104 | ChunkedBody decoder = new ChunkedBody(chan, initBuf); 105 | int n = decoder.read(buf); 106 | buf.flip(); 107 | assertEquals(string, US_ASCII.decode(buf).toString()); 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /test/org/netpreserve/jwarc/GunzipChannelTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | 
package org.netpreserve.jwarc; 7 | 8 | import static org.junit.Assert.assertEquals; 9 | import static org.junit.Assert.assertNotNull; 10 | import static org.junit.Assert.assertTrue; 11 | 12 | import java.io.ByteArrayInputStream; 13 | import java.io.ByteArrayOutputStream; 14 | import java.io.IOException; 15 | import java.net.URISyntaxException; 16 | import java.net.URL; 17 | import java.nio.ByteBuffer; 18 | import java.nio.ByteOrder; 19 | import java.nio.channels.Channels; 20 | import java.nio.channels.FileChannel; 21 | import java.nio.channels.ReadableByteChannel; 22 | import java.nio.charset.StandardCharsets; 23 | import java.nio.file.Paths; 24 | import java.util.zip.GZIPOutputStream; 25 | 26 | import org.junit.Ignore; 27 | import org.junit.Test; 28 | 29 | public class GunzipChannelTest { 30 | 31 | private ByteArrayOutputStream getHelloWorldGzipByteStream() throws IOException { 32 | ByteArrayOutputStream baos = new ByteArrayOutputStream(); 33 | GZIPOutputStream gzos = new GZIPOutputStream(baos); 34 | gzos.write("Hello world".getBytes(StandardCharsets.US_ASCII)); 35 | gzos.finish(); 36 | return baos; 37 | } 38 | 39 | @Test 40 | public void test() throws IOException { 41 | ByteBuffer inBuffer = ByteBuffer.allocate(1024); 42 | inBuffer.flip(); 43 | 44 | ByteArrayOutputStream baos = getHelloWorldGzipByteStream(); 45 | 46 | ReadableByteChannel input = Channels.newChannel(new ByteArrayInputStream(baos.toByteArray())); 47 | 48 | GunzipChannel channel = new GunzipChannel(input, inBuffer); 49 | 50 | ByteBuffer buffer = ByteBuffer.allocate(20); 51 | channel.read(buffer); 52 | channel.close(); 53 | buffer.flip(); 54 | 55 | byte[] bytes = new byte[buffer.remaining()]; 56 | buffer.get(bytes); 57 | 58 | assertEquals("Hello world", new String(bytes, StandardCharsets.US_ASCII)); 59 | 60 | } 61 | 62 | @Test 63 | public void testExtraField() throws IOException, URISyntaxException { 64 | ByteBuffer inBuffer = ByteBuffer.allocate(1024); 65 | inBuffer.flip(); 66 | 67 | URL warcFile = 
getClass().getClassLoader().getResource("org/netpreserve/jwarc/gzip_extra_sl.warc.gz"); 68 | assertNotNull("WARC file gzip_extra_sl.warc.gz not found", warcFile); 69 | ReadableByteChannel input = FileChannel.open(Paths.get(warcFile.toURI())); 70 | 71 | GunzipChannel channel = new GunzipChannel(input, inBuffer); 72 | 73 | ByteBuffer buffer = ByteBuffer.allocate(20); 74 | channel.read(buffer); 75 | buffer.flip(); 76 | 77 | byte[] bytes = new byte[buffer.remaining()]; 78 | buffer.get(bytes); 79 | 80 | assertTrue("Failed reading WARC file: expected \"WARC/1.0\" as first line", 81 | new String(bytes, StandardCharsets.US_ASCII).startsWith("WARC/1.0")); 82 | 83 | // consume remaining compressed content to determine the length 84 | do { 85 | buffer.clear(); 86 | } while (channel.read(buffer) > -1); 87 | channel.close(); 88 | 89 | // check GunzipChannel position against the file size; the channel opened to measure the size is closed again 90 | long warcFileSize; try (FileChannel sizeChannel = FileChannel.open(Paths.get(warcFile.toURI()))) { warcFileSize = sizeChannel.size(); } 91 | assertEquals("Wrong input position", warcFileSize, channel.inputPosition()); 92 | } 93 | 94 | private void checkExternalBuffer(ByteBuffer buffer) throws IOException { 95 | ByteArrayOutputStream baos = getHelloWorldGzipByteStream(); 96 | 97 | ReadableByteChannel input = Channels.newChannel(new ByteArrayInputStream(baos.toByteArray())); 98 | 99 | GunzipChannel channel = new GunzipChannel(input, buffer); 100 | ByteBuffer output = ByteBuffer.allocate(20); 101 | int n = channel.read(output); 102 | channel.close(); 103 | assertEquals(11, n); 104 | assertEquals("Hello world", new String(output.array(), 0, 11, StandardCharsets.US_ASCII)); 105 | } 106 | 107 | @Test(expected = IllegalArgumentException.class) 108 | public void externalBufferNoArray() throws IOException { 109 | ByteBuffer buffer = ByteBuffer.allocate(1024).asReadOnlyBuffer(); 110 | buffer.flip(); 111 | checkExternalBuffer(buffer); 112 | } 113 | 114 | @Ignore("User must ensure buffer is in read state") 115 | @Test 116 | public void externalBufferNoReadState() throws IOException, URISyntaxException { 117 | 
ByteBuffer buffer = ByteBuffer.allocate(8192); 118 | // not calling buffer.flip() 119 | checkExternalBuffer(buffer); 120 | } 121 | 122 | @Test 123 | public void externalBufferByteOrderLE() throws IOException, URISyntaxException { 124 | ByteBuffer buffer = ByteBuffer.allocate(8192); 125 | buffer.order(ByteOrder.LITTLE_ENDIAN); 126 | buffer.flip(); 127 | checkExternalBuffer(buffer); 128 | } 129 | 130 | @Test 131 | public void externalBufferByteOrderBE() throws IOException, URISyntaxException { 132 | ByteBuffer buffer = ByteBuffer.allocate(8192); 133 | buffer.order(ByteOrder.BIG_ENDIAN); 134 | buffer.flip(); 135 | checkExternalBuffer(buffer); 136 | } 137 | } -------------------------------------------------------------------------------- /test/org/netpreserve/jwarc/GzipChannelTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2020 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc; 7 | 8 | import static org.junit.Assert.assertEquals; 9 | import static org.junit.Assert.assertTrue; 10 | 11 | import java.io.ByteArrayInputStream; 12 | import java.io.ByteArrayOutputStream; 13 | import java.io.IOException; 14 | import java.nio.ByteBuffer; 15 | import java.nio.ByteOrder; 16 | import java.nio.channels.Channels; 17 | import java.nio.charset.StandardCharsets; 18 | import java.util.Arrays; 19 | import java.util.zip.GZIPInputStream; 20 | 21 | import org.junit.Test; 22 | 23 | public class GzipChannelTest { 24 | 25 | protected String text = "Hello world"; 26 | protected byte[] textBytes = text.getBytes(StandardCharsets.US_ASCII); 27 | 28 | private void checkGzip(byte[] gzipped) { 29 | // did we get valid gzipped data? 
30 | short magic = ByteBuffer.wrap(gzipped).order(ByteOrder.LITTLE_ENDIAN).getShort(); 31 | assertEquals(magic, GzipChannel.GZIP_MAGIC); 32 | assertTrue(gzipped.length >= 20); 33 | } 34 | 35 | @Test 36 | public void test() throws IOException { 37 | ByteArrayOutputStream baos = new ByteArrayOutputStream(); 38 | GzipChannel channel = new GzipChannel(Channels.newChannel(baos)); 39 | int written = channel.write(ByteBuffer.wrap(textBytes)); 40 | assertEquals(written, textBytes.length); 41 | channel.close(); 42 | byte[] gzipped = baos.toByteArray(); 43 | 44 | checkGzip(gzipped); 45 | 46 | GZIPInputStream gzis = new GZIPInputStream(new ByteArrayInputStream(gzipped)); 47 | byte[] inBytes = new byte[8192]; 48 | int n = gzis.read(inBytes); 49 | 50 | assertEquals(n, textBytes.length); 51 | assertEquals(text, new String(inBytes, 0, n, StandardCharsets.US_ASCII)); 52 | } 53 | 54 | /** 55 | * Test that zero content (empty string, zero bytes input) is written as valid 56 | * gzip data, otherwise uncompressing will cause an error. 
57 | */ 58 | @Test 59 | public void testEmpty() throws IOException { 60 | ByteArrayOutputStream baos = new ByteArrayOutputStream(); 61 | GzipChannel channel = new GzipChannel(Channels.newChannel(baos)); 62 | channel.write(ByteBuffer.allocate(0)); 63 | channel.finish(); 64 | channel.close(); 65 | byte[] gzipped = baos.toByteArray(); 66 | 67 | checkGzip(gzipped); 68 | 69 | byte[] inBytes = new byte[8192]; 70 | int n = (new GZIPInputStream(new ByteArrayInputStream(gzipped))).read(inBytes); 71 | assertTrue(n <= 0); 72 | 73 | // test without calling write() and finish() 74 | baos = new ByteArrayOutputStream(); 75 | channel = new GzipChannel(Channels.newChannel(baos)); 76 | channel.close(); 77 | gzipped = baos.toByteArray(); 78 | 79 | checkGzip(gzipped); 80 | 81 | n = (new GZIPInputStream(new ByteArrayInputStream(gzipped))).read(inBytes); 82 | assertTrue(n <= 0); 83 | } 84 | 85 | @Test 86 | public void testMultiMember() throws IOException { 87 | ByteArrayOutputStream baos = new ByteArrayOutputStream(); 88 | GzipChannel channel = new GzipChannel(Channels.newChannel(baos)); 89 | int written = channel.write(ByteBuffer.wrap(textBytes)); 90 | assertEquals(written, textBytes.length); 91 | channel.finish(); // finish first member 92 | long posSecond = channel.outputPosition(); 93 | written = channel.write(ByteBuffer.wrap(textBytes)); 94 | assertEquals(written, textBytes.length); 95 | channel.close(); 96 | byte[] gzipped = baos.toByteArray(); 97 | 98 | checkGzip(gzipped); 99 | checkGzip(Arrays.copyOfRange(gzipped, (int) posSecond, gzipped.length)); 100 | 101 | GZIPInputStream gzis = new GZIPInputStream(new ByteArrayInputStream(gzipped)); 102 | byte[] inBytes = new byte[8192]; 103 | int n = gzis.read(inBytes); 104 | 105 | assertEquals(n, textBytes.length); 106 | assertEquals(text, new String(inBytes, 0, n, StandardCharsets.US_ASCII)); 107 | 108 | // read second member 109 | n = gzis.read(inBytes); 110 | assertEquals(n, textBytes.length); 111 | assertEquals(text, new 
String(inBytes, 0, n, StandardCharsets.US_ASCII)); 112 | } 113 | 114 | @Test(expected = IllegalArgumentException.class) 115 | public void testBufferNoArray() throws IOException { 116 | ByteArrayOutputStream baos = new ByteArrayOutputStream(); 117 | GzipChannel channel = new GzipChannel(Channels.newChannel(baos), ByteBuffer.allocate(1024).asReadOnlyBuffer()); 118 | channel.close(); 119 | } 120 | } -------------------------------------------------------------------------------- /test/org/netpreserve/jwarc/HeaderValidatorTest.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc; 2 | 3 | import org.junit.Test; 4 | 5 | import java.util.*; 6 | 7 | import static org.junit.Assert.*; 8 | 9 | public class HeaderValidatorTest { 10 | private HeaderValidator headerValidator = HeaderValidator.warc_1_1(); 11 | 12 | @Test 13 | public void testValid() { 14 | MessageHeaders headers = MessageHeaders.of( 15 | "WARC-Record-ID", "", 16 | "Content-Length", "123456", 17 | "WARC-Date", "2020-01-01T00:00:00Z", 18 | "WARC-Type", "response", 19 | "WARC-Target-URI", "http://example.com/", 20 | "Content-Type", "application/http; msgtype=response", 21 | "WARC-Concurrent-To", "", 22 | "WARC-Concurrent-To", "" 23 | ); 24 | assertEquals(Collections.emptyList(), headerValidator.validate(headers)); 25 | } 26 | 27 | @Test 28 | public void testMissingMandatoryFields() { 29 | MessageHeaders headers = MessageHeaders.of( 30 | "Content-Length", "123456", 31 | "WARC-Date", "2020-01-01T00:00:00Z", 32 | "WARC-Type", "response" 33 | ); 34 | List validationErrors = headerValidator.validate(headers); 35 | assertFalse(validationErrors.isEmpty()); 36 | assertTrue(validationErrors.contains("Missing mandatory field: WARC-Record-ID")); 37 | } 38 | 39 | @Test 40 | public void testInvalidPatternValidation() { 41 | MessageHeaders headers = MessageHeaders.of( 42 | "WARC-Record-ID", "", 43 | "Content-Length", "123456", 44 | "WARC-Date", 
"2020-01-01T00:00:00Z", 45 | "WARC-Type", "response", 46 | "Content-Type", "invalid_content_type" 47 | ); 48 | List validationErrors = headerValidator.validate(headers); 49 | assertFalse(validationErrors.isEmpty()); 50 | assertTrue(validationErrors.contains("Field has invalid value: invalid_content_type")); 51 | } 52 | 53 | @Test 54 | public void testNonRepeatableField() { 55 | MessageHeaders headers = MessageHeaders.of( 56 | "WARC-Record-ID", "", 57 | "Content-Length", "123456", 58 | "WARC-Date", "2020-01-01T00:00:00Z", 59 | "WARC-Type", "response", 60 | "WARC-Date", "2020-01-01T00:00:00Z", 61 | "WARC-Date", "2020-01-02T00:00:00Z" 62 | ); 63 | List validationErrors = headerValidator.validate(headers); 64 | assertFalse(validationErrors.isEmpty()); 65 | assertTrue(validationErrors.contains("Field must not be repeated: WARC-Date")); 66 | } 67 | 68 | @Test 69 | public void testForbiddenFieldsOnRecordType() { 70 | MessageHeaders headers = MessageHeaders.of( 71 | "WARC-Record-ID", "", 72 | "Content-Length", "123456", 73 | "WARC-Date", "2020-01-01T00:00:00Z", 74 | "WARC-Type", "response", 75 | "WARC-Filename", "test.warc.gz" 76 | ); 77 | List validationErrors = headerValidator.validate(headers); 78 | assertFalse(validationErrors.isEmpty()); 79 | assertTrue(validationErrors.contains("Field not allowed on response record: WARC-Filename")); 80 | } 81 | } -------------------------------------------------------------------------------- /test/org/netpreserve/jwarc/HttpRequestTest.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc; 2 | 3 | import org.junit.Test; 4 | 5 | import java.io.ByteArrayInputStream; 6 | import java.io.IOException; 7 | import java.nio.channels.Channels; 8 | import java.util.Optional; 9 | 10 | import static java.nio.charset.StandardCharsets.US_ASCII; 11 | import static org.junit.Assert.*; 12 | 13 | public class HttpRequestTest { 14 | @Test 15 | public void 
serializeHeaderShouldPreserveExactly() throws IOException { 16 | String header = "POST / HTTP/1.1\r\n" + 17 | "Connection: close\r\n" + 18 | "Host: example.org\n" + 19 | "Content-Length: 6\r\n\r\n"; 20 | String message = header + "[body]"; 21 | HttpRequest request = HttpRequest.parse(Channels.newChannel(new ByteArrayInputStream(message.getBytes(US_ASCII)))); 22 | assertEquals("POST", request.method()); 23 | assertEquals("/", request.target()); 24 | assertEquals(Optional.of("example.org"), request.headers().first("Host")); 25 | assertEquals(header, new String(request.serializeHeader(), US_ASCII)); 26 | } 27 | 28 | @Test(expected = IllegalArgumentException.class) 29 | public void invalidVersionShouldThrow() { 30 | new HttpRequest.Builder("GET", "/").version(MessageVersion.WARC_1_0); 31 | } 32 | 33 | @Test 34 | public void invalidContentLengthHeader() throws IOException { 35 | String header = "POST / HTTP/1.1\r\n" + 36 | "Connection: close\r\n" + 37 | "Host: example.org\n" + 38 | "Content-Length: 6 dinosaurs\r\n\r\n"; 39 | String message = header + "[body]"; 40 | HttpRequest request = HttpRequest.parse(LengthedBody.create(message.getBytes(US_ASCII))); 41 | assertEquals("POST", request.method()); 42 | assertEquals("[body]", new String(IOUtils.readNBytes(request.body().stream(), 10))); 43 | } 44 | } -------------------------------------------------------------------------------- /test/org/netpreserve/jwarc/HttpResponseTest.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc; 2 | 3 | import org.junit.Test; 4 | 5 | import java.io.ByteArrayInputStream; 6 | import java.io.IOException; 7 | import java.nio.channels.Channels; 8 | 9 | import static java.nio.charset.StandardCharsets.US_ASCII; 10 | import static org.junit.Assert.assertEquals; 11 | 12 | public class HttpResponseTest { 13 | @Test 14 | public void serializeHeaderShouldPreserveExactly() throws IOException { 15 | String header = "HTTP/1.0 404 Not 
Found\r\n" + 16 | "Server: example\n" + 17 | "Content-Length: 6\r\n\r\n"; 18 | String message = header + "[body]"; 19 | HttpResponse response = HttpResponse.parse(Channels.newChannel(new ByteArrayInputStream(message.getBytes(US_ASCII)))); 20 | assertEquals(404, response.status()); 21 | assertEquals("Not Found", response.reason()); 22 | assertEquals(header, new String(response.serializeHeader(), US_ASCII)); 23 | } 24 | 25 | @Test 26 | public void parsingBogusContentLengthFolding() throws IOException { 27 | String header = "HTTP/1.0 200 OK\r\n" + 28 | "Content-Length: 6\r\n" + 29 | " Content-Type: text/html\r\n\r\n"; 30 | String message = header + "[body]"; 31 | HttpResponse response = HttpResponse.parse(LengthedBody.create(message.getBytes(US_ASCII))); 32 | assertEquals(200, response.status()); 33 | assertEquals("[body]", new String(IOUtils.readNBytes(response.body().stream(), 10))); 34 | } 35 | } -------------------------------------------------------------------------------- /test/org/netpreserve/jwarc/InetAddressesTest.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc; 2 | 3 | import org.junit.Test; 4 | 5 | import java.net.InetAddress; 6 | 7 | import static org.junit.Assert.*; 8 | import static org.netpreserve.jwarc.InetAddresses.toAddrString; 9 | 10 | public class InetAddressesTest { 11 | @Test 12 | public void testCanonicalInet6() throws Exception { 13 | assertEquals("2001:db8::1", 14 | toAddrString(InetAddress.getByName("2001:db8:0:0:0:0:0:1"))); 15 | assertEquals("::", 16 | toAddrString(InetAddress.getByName("0:0:0:0:0:0:0:0"))); 17 | assertEquals("::1", 18 | toAddrString(InetAddress.getByName("0:0:0:0:0:0:0:1"))); 19 | assertEquals("2001:db8:1:1:1:1:1:1", 20 | toAddrString(InetAddress.getByName("2001:db8:1:1:1:1:1:1"))); 21 | assertEquals("2001:0:0:1::1", 22 | toAddrString(InetAddress.getByName("2001:0:0:1:0:0:0:1"))); 23 | assertEquals("2001:db8:f::1", 24 | 
toAddrString(InetAddress.getByName("2001:db8:000f:0:0:0:0:1"))); 25 | assertEquals("2001:db8::1:0:0:1", 26 | toAddrString(InetAddress.getByName("2001:0db8:0000:0000:0001:0000:0000:0001"))); 27 | assertEquals("ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff", 28 | toAddrString(InetAddress.getByName("ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"))); 29 | assertEquals("2001:200f::1", 30 | toAddrString(InetAddress.getByName("2001:200f:0:0:0:0:0:1"))); 31 | // https://datatracker.ietf.org/doc/html/rfc5952#section-4.2.2 32 | // "The symbol "::" MUST NOT be used to shorten just one 16-bit 0 field." 33 | assertEquals("2001:0:3:4:5:6:7:8", 34 | toAddrString(InetAddress.getByName("2001:0:3:4:5:6:7:8"))); 35 | // shorten first of same-length consecutive 0 fields, also in initial position 36 | assertEquals("::4:0:0:0:ffff", 37 | toAddrString(InetAddress.getByName("0:0:0:4:0:0:0:ffff"))); 38 | } 39 | 40 | } -------------------------------------------------------------------------------- /test/org/netpreserve/jwarc/LengthedBodyTest.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc; 2 | 3 | import org.junit.Test; 4 | 5 | import java.io.IOException; 6 | import java.nio.ByteBuffer; 7 | import java.nio.channels.FileChannel; 8 | import java.nio.channels.SeekableByteChannel; 9 | import java.nio.file.Files; 10 | import java.nio.file.Path; 11 | 12 | import static java.nio.charset.StandardCharsets.US_ASCII; 13 | import static java.nio.file.StandardOpenOption.*; 14 | import static org.junit.Assert.assertEquals; 15 | 16 | public class LengthedBodyTest { 17 | 18 | @Test 19 | public void test() throws IOException { 20 | Path temp = Files.createTempFile("jwarc-test", ".tmp"); 21 | try (FileChannel channel = FileChannel.open(temp, DELETE_ON_CLOSE, WRITE, READ)) { 22 | channel.write(ByteBuffer.wrap("xx0123456789yy".getBytes(US_ASCII))); 23 | channel.position(2); 24 | ByteBuffer buf = ByteBuffer.allocate(2); 25 | buf.flip(); 26 | 
SeekableByteChannel body = (SeekableByteChannel) LengthedBody.create(channel, buf, channel.size() - 4); 27 | { 28 | ByteBuffer b = ByteBuffer.allocate(32); 29 | while (true) { 30 | if (body.read(b) < 0) break; 31 | } 32 | b.flip(); 33 | assertEquals("0123456789", US_ASCII.decode(b).toString()); 34 | } 35 | 36 | { 37 | body.position(3); 38 | ByteBuffer b = ByteBuffer.allocate(4); 39 | body.read(b); 40 | b.flip(); 41 | assertEquals("3456", US_ASCII.decode(b).toString()); 42 | } 43 | 44 | } 45 | } 46 | 47 | } -------------------------------------------------------------------------------- /test/org/netpreserve/jwarc/MessageHeadersTest.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc; 2 | 3 | import org.junit.Test; 4 | 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | import java.util.Map; 8 | import java.util.TreeMap; 9 | 10 | import static org.junit.Assert.*; 11 | 12 | public class MessageHeadersTest { 13 | @Test 14 | public void testContains() { 15 | assertFalse(headers("Z", "1") 16 | .contains("Transfer-Encoding", "chunked")); 17 | assertTrue(headers("A", "0", "Transfer-Encoding", "chunked", "Z", "1") 18 | .contains("Transfer-Encoding", "chunked")); 19 | assertFalse(headers("Transfer-Encoding", "xchunkedx") 20 | .contains("Transfer-Encoding", "chunked")); 21 | assertFalse(headers("Transfer-Encoding", "gzip chunked") 22 | .contains("Transfer-Encoding", "chunked")); 23 | assertTrue(headers("Transfer-Encoding", "gzip, chunked, chunked, gzip") 24 | .contains("Transfer-Encoding", "chunked")); 25 | assertTrue(headers("Transfer-Encoding", "gzip, \tCHUNKED,,, GZIP") 26 | .contains("Transfer-Encoding", "Chunked")); 27 | } 28 | 29 | private static MessageHeaders headers(String... 
headers) { 30 | Map> map = new TreeMap<>(String.CASE_INSENSITIVE_ORDER); 31 | for (int i = 0; i < headers.length; i += 2) { 32 | map.computeIfAbsent(headers[i], (k) -> new ArrayList<>()).add(headers[i + 1]); 33 | } 34 | return new MessageHeaders(map); 35 | } 36 | } -------------------------------------------------------------------------------- /test/org/netpreserve/jwarc/URIsTest.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc; 2 | 3 | import org.junit.Test; 4 | 5 | import static org.junit.Assert.assertEquals; 6 | 7 | public class URIsTest { 8 | @Test 9 | public void toNormalizedSurt() { 10 | assertEquals("org,example:8080)/foo?&&a&b&c", URIs.toNormalizedSurt("http://wWw.EXAMPLE.org:8080/FOO?c&A&&&b")); 11 | } 12 | 13 | @Test 14 | public void testParseLeniently() { 15 | roundtripParseLeniently(""); 16 | roundtripParseLeniently("https://www.example.com#anchor"); 17 | roundtripParseLeniently("https://example.com?a=b&cd[]=4"); 18 | roundtripParseLeniently("/path/to/resource"); 19 | roundtripParseLeniently("http://[2001:db8::1]/resource"); 20 | roundtripParseLeniently("https://example.com/path%20with%20spaces"); 21 | roundtripParseLeniently("https://example.com#fragment%20with%20spaces"); 22 | roundtripParseLeniently("https://example.com?query%20with%20spaces"); 23 | roundtripParseLeniently("https://example.com/路径"); 24 | roundtripParseLeniently("https://example.com?query=测试"); 25 | roundtripParseLeniently("https://////example.com?query=测试"); 26 | roundtripParseLeniently("https://www.prijmeni.cz/Kr%C3%A1kora"); 27 | roundtripParseLeniently("https://dx.doi.org/10.1038%2F35008096"); 28 | 29 | assertEquals("https://example.com/path%20with%20spaces", URIs.parseLeniently("https://example.com/path with spaces").toString()); 30 | assertEquals("https://example.com?query%20with%20spaces", URIs.parseLeniently("https://example.com?query with spaces").toString()); 31 | 
assertEquals("https://example.com#fragment%20with%20spaces", URIs.parseLeniently("https://example.com#fragment with spaces").toString()); 32 | assertEquals("https://example.com/a%20b%25", URIs.parseLeniently("https://example.com/a b%25").toString()); 33 | assertEquals("https://example.com/a%20b路径", URIs.parseLeniently("https://example.com/a b路径").toString()); 34 | assertEquals("https://example.com?a%20b%25", URIs.parseLeniently("https://example.com?a b%25").toString()); 35 | assertEquals("https://example.com?a%20b路径", URIs.parseLeniently("https://example.com?a b路径").toString()); 36 | assertEquals("https://example.com#a%20b%25", URIs.parseLeniently("https://example.com#a b%25").toString()); 37 | assertEquals("https://example.com/a%20b%25路径%5b?a%20b%25路径[?#a%20b%25路径[?", URIs.parseLeniently("https://example.com/a b%25路径[?a b%25路径[?#a b%25路径[?").toString()); 38 | assertEquals("https://example.com/a%20b?c%20d#e%20f", URIs.parseLeniently("https://example.com/a b?c d#e f").toString()); 39 | } 40 | 41 | private void roundtripParseLeniently(String s) { 42 | assertEquals(s, URIs.parseLeniently(s).toString()); 43 | } 44 | } -------------------------------------------------------------------------------- /test/org/netpreserve/jwarc/WarcParserTest.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc; 2 | 3 | import org.junit.Test; 4 | 5 | import java.nio.ByteBuffer; 6 | import java.nio.charset.StandardCharsets; 7 | import java.util.Optional; 8 | 9 | import static org.junit.Assert.*; 10 | 11 | public class WarcParserTest { 12 | @Test 13 | public void testParsingArcWithBogusMime() { 14 | WarcParser parser = parse("http://example.com/ 1.2.3.4 20110104111607 @[=*�Content-Type] 494\n"); 15 | assertEquals(Optional.of("494"), parser.headers().sole("Content-Length")); 16 | parser = parse("http://example.com/ 1.2.3.4 20110104111607 charset=foo 494\n"); 17 | assertEquals(Optional.of("494"), 
parser.headers().sole("Content-Length")); 18 | parser = parse("http://example.com/ 1.2.3.4 20110104111607 image(jpeg) 494\n"); 19 | assertEquals(Optional.of("494"), parser.headers().sole("Content-Length")); 20 | parser = parse("http://example.com/ 1.2.3.4 20110104111607 ERROR: 494\n"); 21 | assertEquals(Optional.of("494"), parser.headers().sole("Content-Length")); 22 | } 23 | 24 | @Test 25 | public void testParsingArcWithCorruptDates() { 26 | WarcParser parser = parse("http://example.com/ 1.2.3.4 200012120739 text/html 42\n"); 27 | assertEquals(Optional.of("2000-12-12T07:39:00Z"), parser.headers().first("WARC-Date")); 28 | parser = parse("http://example.com/ 1.2.3.4 2000121207394211 text/html 1942\n"); 29 | assertEquals(Optional.of("2000-12-12T07:39:42Z"), parser.headers().first("WARC-Date")); 30 | parser = parse("http://example.com/ 1.2.3.4 99999999999999 text/html 1942\n"); 31 | assertEquals(Optional.empty(), parser.headers().first("WARC-Date")); 32 | } 33 | 34 | @Test 35 | public void testLenientParsing() { 36 | WarcParser parser = parse( "WARC/0.18\nHello\u0007:\u0008world\r\n\r\n", true); 37 | assertEquals(Optional.of("\u0008world"), parser.headers().sole("Hello\u0007")); 38 | } 39 | 40 | @Test(expected = AssertionError.class) 41 | public void testStrictParsing() { 42 | parse( "WARC/1.0\r\nHello\u0007:\u0008world\r\n\r\n"); 43 | } 44 | 45 | private static WarcParser parse(String input) { 46 | return parse(input, false); 47 | } 48 | 49 | private static WarcParser parse(String input, boolean lenient) { 50 | WarcParser parser = new WarcParser(); 51 | parser.setLenient(lenient); 52 | parser.parse(ByteBuffer.wrap(input.getBytes(StandardCharsets.ISO_8859_1))); 53 | assertFalse(parser.isError()); 54 | assertTrue(parser.isFinished()); 55 | return parser; 56 | } 57 | } -------------------------------------------------------------------------------- /test/org/netpreserve/jwarc/WarcRecordTest.java: 
-------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc; 2 | 3 | import org.junit.Test; 4 | 5 | import java.time.Instant; 6 | 7 | import static org.junit.Assert.assertEquals; 8 | 9 | public class WarcRecordTest { 10 | @Test 11 | public void datePrecision() { 12 | Instant date = Instant.parse("2021-08-30T07:49:07.466148Z"); 13 | WarcResource warc10Record = new WarcResource.Builder().version(MessageVersion.WARC_1_0).date(date).build(); 14 | assertEquals(0, warc10Record.date().getNano()); 15 | WarcResource warc11Record = new WarcResource.Builder().date(date).version(MessageVersion.WARC_1_1).build(); 16 | assertEquals(466148000, warc11Record.date().getNano()); 17 | } 18 | 19 | @Test(expected = IllegalArgumentException.class) 20 | public void invalidVersionShouldThrow() { 21 | new Warcinfo.Builder().version(MessageVersion.HTTP_1_0); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /test/org/netpreserve/jwarc/WarcTargetRecordTest.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc; 2 | 3 | import org.junit.Test; 4 | 5 | import java.net.URI; 6 | import java.util.*; 7 | 8 | import static org.junit.Assert.*; 9 | 10 | public class WarcTargetRecordTest { 11 | @Test 12 | public void testTargetURIAngleBracketsQuirk() { // per warc 1.0 grammar 13 | Map<String, List<String>> headers = new HashMap<>(); 14 | headers.put("WARC-Target-URI", Collections.singletonList("<http://example.org/>")); // bracketed form: the assertions below expect the brackets to be removed 15 | WarcTargetRecord record = new WarcTargetRecord(MessageVersion.WARC_1_0, new MessageHeaders(headers), MessageBody.empty()) { 16 | }; 17 | assertEquals("http://example.org/", record.target()); 18 | assertEquals(URI.create("http://example.org/"), record.targetURI()); 19 | } 20 | 21 | @Test 22 | public void testTargetURINormal() { // per warc 1.1 (and warc 1.0 examples) 23 | Map<String, List<String>> headers = new HashMap<>(); 24 | headers.put("WARC-Target-URI",
Collections.singletonList("http://example.org/")); 25 | WarcTargetRecord record = new WarcTargetRecord(MessageVersion.WARC_1_0, new MessageHeaders(headers), MessageBody.empty()) { 26 | }; 27 | assertEquals("http://example.org/", record.target()); 28 | assertEquals(URI.create("http://example.org/"), record.targetURI()); 29 | } 30 | } -------------------------------------------------------------------------------- /test/org/netpreserve/jwarc/apitests/MediaTypeTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc.apitests; 7 | 8 | import org.junit.Test; 9 | import org.netpreserve.jwarc.MediaType; 10 | 11 | import static org.junit.Assert.*; 12 | 13 | public class MediaTypeTest { 14 | 15 | @Test 16 | public void test() { 17 | MediaType type = MediaType.parse("text/html; charset=\"foo\\\" bar\";foo=bar ;b=c"); 18 | assertEquals("text/html;b=c;charset=\"foo\\\" bar\";foo=bar", type.toString()); 19 | assertEquals("foo\" bar", type.parameters().get("charset")); 20 | assertEquals("text", type.type()); 21 | assertEquals("html", type.subtype()); 22 | assertEquals("text/html", type.base().toString()); 23 | assertEquals(MediaType.parse("text/html"), MediaType.parse("teXT/htML")); 24 | assertEquals(MediaType.parse("text/html;charset=utf-8"), MediaType.parse("teXT/htML ;\tCHARsET=utf-8")); 25 | assertEquals(MediaType.parse("text/html;charset=utf-8").hashCode(), MediaType.parse("teXT/htML ;\tCHARsET=utf-8").hashCode()); 26 | assertNotEquals(MediaType.parse("text/html;chartset=utf-8"), MediaType.parse("text/html;chartset=UTF-8")); 27 | assertEquals(MediaType.parse("text/html"), MediaType.parse("teXT/htML ;\tCHARsET=utf-8").base()); 28 | assertTrue(type.base().parameters().isEmpty()); 29 | assertEquals("one", 
MediaType.parse("text/html;CHARSET=one;charset=two;charset=three").parameters().get("charset")); 30 | } 31 | 32 | @Test 33 | public void testParseLeniently() { 34 | { 35 | MediaType mediaType = MediaType.parseLeniently("text/html;ISO-8859-1;a\0=2;ok=ok"); 36 | assertFalse(mediaType.isValid()); 37 | assertEquals("text/html;ok=ok", mediaType.toString()); 38 | assertEquals(1, mediaType.parameters().size()); 39 | assertEquals("ok", mediaType.parameters().get("ok")); 40 | assertEquals("text/html;ISO-8859-1;a\0=2;ok=ok", mediaType.raw()); // actually assert: raw() is expected to equal the unparsed input 41 | } 42 | assertEquals("bog\0us", MediaType.parseLeniently("bog\0us").toString()); 43 | assertEquals("\0/\0", MediaType.parseLeniently("\0/\0").toString()); 44 | assertEquals("", MediaType.parseLeniently("").toString()); 45 | } 46 | 47 | @Test(expected = IllegalArgumentException.class) 48 | public void strictParsingShouldThrow() { 49 | MediaType.parse("text/html;ISO-8859-1;a\0=2;ok=ok"); 50 | } 51 | 52 | } -------------------------------------------------------------------------------- /test/org/netpreserve/jwarc/apitests/MessageVersionTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc.apitests; 7 | 8 | import org.junit.Test; 9 | import org.netpreserve.jwarc.MessageVersion; 10 | 11 | import java.util.HashMap; 12 | import java.util.Map; 13 | 14 | import static org.junit.Assert.assertEquals; 15 | 16 | public class MessageVersionTest { 17 | @Test 18 | public void test() { 19 | Map<MessageVersion, Integer> map = new HashMap<>(); 20 | map.put(MessageVersion.HTTP_1_0, 10); 21 | map.put(MessageVersion.HTTP_1_1, 11); 22 | assertEquals(10, (int) map.get(MessageVersion.HTTP_1_0)); 23 | assertEquals("HTTP", MessageVersion.HTTP_1_0.getProtocol()); 24 | assertEquals(1, MessageVersion.HTTP_1_0.getMajor()); 25 | assertEquals(0, MessageVersion.HTTP_1_0.getMinor()); 26
| assertEquals("HTTP/1.0", MessageVersion.HTTP_1_0.toString()); 27 | } 28 | 29 | } -------------------------------------------------------------------------------- /test/org/netpreserve/jwarc/apitests/WarcContinuationTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc.apitests; 7 | 8 | import org.junit.Test; 9 | import org.netpreserve.jwarc.WarcContinuation; 10 | import org.netpreserve.jwarc.WarcReader; 11 | import org.netpreserve.jwarc.WarcResponse; 12 | 13 | import java.io.ByteArrayInputStream; 14 | import java.io.IOException; 15 | import java.net.URI; 16 | import java.util.Optional; 17 | 18 | import static java.nio.charset.StandardCharsets.UTF_8; 19 | import static org.junit.Assert.assertEquals; 20 | 21 | public class WarcContinuationTest { 22 | 23 | final static String continuation1 = "WARC/1.0\r\n" + 24 | "WARC-Type: response\r\n" + 25 | "WARC-Target-URI: http://www.archive.org/images/logoc.jpg\r\n" + 26 | "WARC-Date: 2006-09-19T17:20:24Z\r\n" + 27 | "WARC-Block-Digest: sha1:2ASS7ZUZY6ND6CCHXETFVJDENAWF7KQ2\r\n" + 28 | "WARC-Payload-Digest: sha1:CCHXETFVJD2MUZY6ND6SS7ZENMWF7KQ2\r\n" + 29 | "WARC-IP-Address: 207.241.233.58\r\n" + 30 | "WARC-Record-ID: \r\n" + 31 | "WARC-Segment-Number: 1\r\n" + 32 | "Content-Type: application/http;msgtype=response\r\n" + 33 | "Content-Length: 1600\r\n" + 34 | "\r\n" + 35 | "HTTP/1.1 200 OK\r\n" + 36 | "Date: Tue, 19 Sep 2006 17:18:40 GMT\r\n" + 37 | "Server: Apache/2.0.54 (Ubuntu)\r\n" + 38 | "Last-Modified: Mon, 16 Jun 2003 22:28:51 GMT\r\n" + 39 | "ETag: \"3e45-67e-2ed02ec0\"\r\n" + 40 | "Accept-Ranges: bytes\r\n" + 41 | "Content-Length: 1662\r\n" + 42 | "Connection: close\r\n" + 43 | "Content-Type: image/jpeg\r\n" + 44 | "\r\n" + 45 | "[first 1360 bytes of image/jpeg binary data here]"; 46 | 47 | final static 
String continuation2 = "WARC/1.0\r\n" + 48 | "WARC-Type: continuation\r\n" + 49 | "WARC-Target-URI: http://www.archive.org/images/logoc.jpg\r\n" + 50 | "WARC-Date: 2006-09-19T17:20:24Z\r\n" + 51 | "WARC-Block-Digest: sha1:T7HXETFVA92MSS7ZENMFZY6ND6WF7KB7\r\n" + 52 | "WARC-Record-ID: \r\n" + 53 | "WARC-Segment-Origin-ID: \r\n" + 54 | "WARC-Segment-Number: 2\r\n" + 55 | "WARC-Segment-Total-Length: 1902\r\n" + 56 | "WARC-Identified-Payload-Type: image/jpeg\r\n" + 57 | "Content-Length: 302\r\n" + 58 | "\r\n" + 59 | "[last 302 bytes of image/jpeg binary data here]"; 60 | 61 | 62 | @Test 63 | public void test() throws IOException { 64 | WarcResponse response = (WarcResponse) new WarcReader(new ByteArrayInputStream(continuation1.getBytes(UTF_8))).next().get(); 65 | assertEquals(Optional.of(1L), response.segmentNumber()); 66 | 67 | WarcContinuation continuation = (WarcContinuation) new WarcReader(new ByteArrayInputStream(continuation2.getBytes(UTF_8))).next().get(); 68 | assertEquals(response.id(), continuation.segmentOriginId()); 69 | assertEquals(Optional.of(2L), continuation.segmentNumber()); 70 | assertEquals(Optional.of(1902L), continuation.segmentTotalLength()); 71 | } 72 | 73 | @Test 74 | public void builder() { 75 | URI id = URI.create("urn:uuid:70653950-a77f-b212-e434-7a7c6ec909ef"); 76 | WarcContinuation continuation = new WarcContinuation.Builder() 77 | .segmentOriginId(id) 78 | .segmentNumber(3) 79 | .segmentTotalLength(1024) 80 | .build(); 81 | assertEquals("continuation", continuation.type()); 82 | assertEquals(id, continuation.segmentOriginId()); 83 | assertEquals(Optional.of(3L), continuation.segmentNumber()); 84 | assertEquals(Optional.of(1024L), continuation.segmentTotalLength()); 85 | } 86 | 87 | } -------------------------------------------------------------------------------- /test/org/netpreserve/jwarc/apitests/WarcConversionTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * 
SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc.apitests; 7 | 8 | import org.junit.Assert; 9 | import org.junit.Test; 10 | import org.netpreserve.jwarc.MediaType; 11 | import org.netpreserve.jwarc.WarcConversion; 12 | import org.netpreserve.jwarc.WarcReader; 13 | 14 | import java.io.ByteArrayInputStream; 15 | import java.io.IOException; 16 | import java.net.URI; 17 | 18 | import static java.nio.charset.StandardCharsets.UTF_8; 19 | import static org.junit.Assert.assertEquals; 20 | 21 | public class WarcConversionTest { 22 | 23 | final static String warc = "WARC/1.0\r\n" + 24 | "WARC-Type: conversion\r\n" + 25 | "WARC-Target-URI: http://www.archive.org/images/logoc.jpg\r\n" + 26 | "WARC-Date: 2016-09-19T19:00:40Z\r\n" + 27 | "WARC-Record-ID: \r\n" + // NOTE(review): value looks stripped (expected an angle-bracketed urn:uuid) - restore from upstream 28 | "WARC-Refers-To: <urn:uuid:92283950-ef2f-4d72-b224-f54c6ec90bb0>\r\n" + 29 | "WARC-Block-Digest: sha1:XQMRY75YY42ZWC6JAT6KNXKD37F7MOEK\r\n" + 30 | "Content-Type: image/neoimg\r\n" + 31 | "Content-Length: 934\r\n" + 32 | "\r\n" + 33 | "[image/neoimg binary data here]"; 34 | 35 | @Test 36 | public void test() throws IOException { 37 | WarcConversion conversion = (WarcConversion) new WarcReader(new ByteArrayInputStream(warc.getBytes(UTF_8))).next().get(); 38 | assertEquals(URI.create("urn:uuid:92283950-ef2f-4d72-b224-f54c6ec90bb0"), conversion.refersTo().get()); 39 | assertEquals(934, conversion.body().size()); 40 | Assert.assertEquals(MediaType.parse("image/neoimg"), conversion.contentType()); 41 | assertEquals(URI.create("http://www.archive.org/images/logoc.jpg"), conversion.targetURI()); 42 | } 43 | 44 | @Test 45 | public void builder() throws IOException { 46 | URI reference = URI.create("urn:uuid:92283950-ef2f-4d72-b224-f54c6ec90bb0"); 47 | WarcConversion conversion = new WarcConversion.Builder() 48 | .refersTo(reference) 49 | .build(); 50 | assertEquals("conversion", conversion.type()); 51 | assertEquals(reference, conversion.refersTo().get()); 52
| } 53 | } -------------------------------------------------------------------------------- /test/org/netpreserve/jwarc/apitests/WarcFilterTest.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc.apitests; 2 | 3 | import org.junit.Test; 4 | import org.netpreserve.jwarc.HttpResponse; 5 | import org.netpreserve.jwarc.WarcRequest; 6 | import org.netpreserve.jwarc.WarcResponse; 7 | 8 | import java.io.IOException; 9 | import java.net.URI; 10 | import java.text.ParseException; 11 | 12 | import static org.junit.Assert.*; 13 | import static org.netpreserve.jwarc.WarcFilter.compile; 14 | 15 | public class WarcFilterTest { 16 | 17 | @Test 18 | public void test() throws ParseException, IOException { 19 | WarcResponse response = new WarcResponse.Builder(URI.create("http://example.org/")) 20 | .setHeader("five", "5") 21 | .body(new HttpResponse.Builder(200, "OK") 22 | .setHeader("Transfer-Encoding", "chunked") 23 | .build()) 24 | .build(); 25 | assertTrue(compile("WARC-Type == \"response\"").test(response)); 26 | assertTrue(compile("warc-typE== \t \"response\"").test(response)); 27 | assertFalse(compile("WARC-Type != \"response\"").test(response)); 28 | assertTrue(compile("WARC-Target-URI =~ \"http:.*\"").test(response)); 29 | assertFalse(compile("WARC-Target-URI =~ \"org\"").test(response)); 30 | assertFalse(compile("WARC-Target-URI !~ \"http:.*\"").test(response)); 31 | assertTrue(compile("content-length < 500").test(response)); 32 | assertTrue(compile("warc-type <= 500").test(response)); 33 | assertTrue(compile("five >= 5").test(response)); 34 | assertTrue(compile("five == 5").test(response)); 35 | assertTrue(compile(":status == 200").test(response)); 36 | assertTrue(compile("http:transfer-encoding == \"chunked\"").test(response)); 37 | assertTrue(compile("(((five >= 5)))").test(response)); 38 | assertFalse(compile("!(five >= 5)").test(response)); 39 | assertFalse(compile("five > 5").test(response)); 40 | 
assertTrue(compile("five > 10 || five > 11 || five <= 5").test(response)); 41 | assertFalse(compile("five < 10 && five > 10").test(response)); 42 | assertTrue(compile("(five < 10 || five > 10) && five == \"5\"").test(response)); 43 | assertFalse(compile("(five > 100) && five < 10").test(response)); 44 | assertFalse(compile("(five < 10) && five > 100").test(response)); 45 | assertFalse(compile("(five < 10 || five > 10) && five > 100").test(response)); 46 | } 47 | 48 | } -------------------------------------------------------------------------------- /test/org/netpreserve/jwarc/apitests/WarcMetadataTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc.apitests; 7 | 8 | import org.junit.Assert; 9 | import org.junit.Test; 10 | import org.netpreserve.jwarc.MediaType; 11 | import org.netpreserve.jwarc.WarcMetadata; 12 | import org.netpreserve.jwarc.WarcReader; 13 | 14 | import java.io.ByteArrayInputStream; 15 | import java.io.IOException; 16 | import java.util.Arrays; 17 | import java.util.HashMap; 18 | import java.util.List; 19 | import java.util.Map; 20 | 21 | import static java.nio.charset.StandardCharsets.UTF_8; 22 | import static org.junit.Assert.assertEquals; 23 | 24 | public class WarcMetadataTest { 25 | final static String warc = "WARC/1.1\r\n" + 26 | "WARC-Type: metadata\r\n" + 27 | "WARC-Target-URI: http://www.archive.org/images/logoc.jpg\r\n" + 28 | "WARC-Date: 2016-09-19T17:20:24Z\r\n" + 29 | "WARC-Record-ID: \r\n" + 30 | "WARC-Concurrent-To: \r\n" + 31 | "Content-Type: application/warc-fields\r\n" + 32 | "WARC-Block-Digest: sha1:VXT4AF5BBZVHDYKNC2CSM8TEAWDB6CH8\r\n" + 33 | "Content-Length: 59\r\n" + 34 | "\r\n" + 35 | "via: http://www.archive.org/\r\n" + 36 | "hopsFromSeed: E\r\n" + 37 | "fetchTimeMs: 565"; 38 | 39 | @Test 40 | public void test() 
throws IOException { 41 | WarcMetadata metadata = (WarcMetadata) new WarcReader(new ByteArrayInputStream(warc.getBytes(UTF_8))).next().get(); 42 | assertEquals("http://www.archive.org/", metadata.fields().sole("via").get()); 43 | } 44 | 45 | @Test 46 | public void builder() throws IOException { 47 | Map<String, List<String>> fields = new HashMap<>(); 48 | fields.put("hello", Arrays.asList("one", "two")); 49 | WarcMetadata metadata = new WarcMetadata.Builder() 50 | .fields(fields) 51 | .build(); 52 | Assert.assertEquals(MediaType.WARC_FIELDS, metadata.contentType()); 53 | assertEquals("one", metadata.fields().first("hello").get()); 54 | assertEquals(Arrays.asList("one", "two"), metadata.fields().all("hello")); 55 | } 56 | 57 | } -------------------------------------------------------------------------------- /test/org/netpreserve/jwarc/apitests/WarcRequestTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc.apitests; 7 | 8 | import org.junit.Assert; 9 | import org.junit.Test; 10 | import org.netpreserve.jwarc.*; 11 | 12 | import java.io.BufferedReader; 13 | import java.io.ByteArrayInputStream; 14 | import java.io.IOException; 15 | import java.net.URI; 16 | import java.nio.ByteBuffer; 17 | import java.nio.channels.Channels; 18 | import java.nio.charset.StandardCharsets; 19 | import java.util.Arrays; 20 | import java.util.Collections; 21 | import java.util.Optional; 22 | 23 | import static java.nio.charset.StandardCharsets.UTF_8; 24 | import static org.junit.Assert.assertEquals; 25 | 26 | public class WarcRequestTest { 27 | 28 | final static String warc = "WARC/1.1\r\n" + 29 | "WARC-Type: request\r\n" + 30 | "WARC-Target-URI: http://www.archive.org/images/logoc.jpg\r\n" + 31 | "WARC-Warcinfo-ID: \r\n" + 32 | "WARC-Date: 2016-09-19T17:20:24Z\r\n" + 33 | "Content-Length: 242\r\n"
+ 34 | "WARC-Record-ID: \r\n" + 35 | "Content-Type: application/http;msgtype=request\r\n" + 36 | "WARC-Concurrent-To: \r\n" + 37 | "\r\n" + 38 | "GET /images/logoc.jpg HTTP/1.0\r\n" + 39 | "User-Agent: Mozilla/5.0 (compatible; heritrix/1.10.0)\r\n" + 40 | "From: stack@example.org\r\n" + 41 | "Connection: close\r\n" + 42 | "Referer: http://www.archive.org/\r\n" + 43 | "Host: www.archive.org\r\n" + 44 | "Cookie: PHPSESSID=009d7bb11022f80605aa87e18224d824\r\n\r\n\r\n"; 45 | 46 | @Test 47 | public void test() throws IOException { 48 | WarcRequest request = sampleRequest(); 49 | assertEquals(Collections.singletonList(URI.create("urn:uuid:92283950-ef2f-4d72-b224-f54c6ec90bb0")), request.concurrentTo()); 50 | Assert.assertEquals(MediaType.HTTP_REQUEST, request.contentType()); 51 | Assert.assertEquals(MessageVersion.WARC_1_1, request.version()); 52 | assertEquals(MessageVersion.HTTP_1_0, request.http().version()); 53 | assertEquals(Optional.of("close"), request.http().headers().sole("connection")); 54 | } 55 | 56 | @Test 57 | public void builder() throws IOException { 58 | WarcRequest request = new WarcRequest.Builder(URI.create("http://example.org/")) 59 | .concurrentTo(URI.create("id:1")) 60 | .concurrentTo(URI.create("id:2")) 61 | .build(); 62 | assertEquals(Arrays.asList(URI.create("id:1"), URI.create("id:2")), request.concurrentTo()); 63 | } 64 | 65 | @Test 66 | public void callingHttpShouldNotCorruptBody() throws IOException { 67 | WarcRequest request = sampleRequest(); 68 | request.http(); 69 | assertEquals(0, request.body().position()); 70 | String line = new BufferedReader(Channels.newReader(request.body(), UTF_8.name())).readLine(); 71 | assertEquals("GET /images/logoc.jpg HTTP/1.0", line); 72 | } 73 | 74 | private WarcRequest sampleRequest() throws IOException { 75 | return (WarcRequest) new WarcReader(new ByteArrayInputStream(warc.getBytes(UTF_8))).next().get(); 76 | } 77 | 78 | @Test(expected = IllegalStateException.class) 79 | public void 
readingBodyShouldInvalidateHttp() throws IOException { 80 | WarcRequest response = sampleRequest(); 81 | response.body().read(ByteBuffer.allocate(1)); 82 | response.http(); 83 | } 84 | } -------------------------------------------------------------------------------- /test/org/netpreserve/jwarc/apitests/WarcResourceTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc.apitests; 7 | 8 | import org.junit.Test; 9 | import org.netpreserve.jwarc.WarcResource; 10 | 11 | import static org.junit.Assert.*; 12 | 13 | public class WarcResourceTest { 14 | @Test 15 | public void builder() { 16 | WarcResource resource = new WarcResource.Builder().build(); 17 | assertEquals("resource", resource.type()); 18 | } 19 | 20 | } -------------------------------------------------------------------------------- /test/org/netpreserve/jwarc/apitests/WarcRevisitTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc.apitests; 7 | 8 | import org.junit.Test; 9 | import org.netpreserve.jwarc.WarcReader; 10 | import org.netpreserve.jwarc.WarcRevisit; 11 | 12 | import java.io.ByteArrayInputStream; 13 | import java.io.IOException; 14 | import java.net.URI; 15 | import java.time.Instant; 16 | import java.util.Optional; 17 | 18 | import static java.nio.charset.StandardCharsets.UTF_8; 19 | import static org.junit.Assert.assertEquals; 20 | import static org.junit.Assert.assertFalse; 21 | 22 | public class WarcRevisitTest { 23 | final static String warc = "WARC/1.1\r\n" + 24 | "WARC-Type: revisit\r\n" + 25 | "WARC-Target-URI: http://www.archive.org/images/logoc.jpg\r\n" + 26 |
"WARC-Date: 2017-06-23T12:43:35Z\r\n" + 27 | "WARC-Profile: http://netpreserve.org/warc/1.1/revisit/server-not-modified\r\n" + 28 | "WARC-Record-ID: \r\n" + // NOTE(review): value looks stripped (expected an angle-bracketed urn:uuid) - restore from upstream 29 | "WARC-Refers-To: <urn:uuid:92283950-ef2f-4d72-b224-f54c6ec90bb0>\r\n" + 30 | "WARC-Refers-To-Target-URI: http://www.archive.org/images/logoc.jpg\r\n" + 31 | "WARC-Refers-To-Date: 2016-09-19T17:20:24Z\r\n" + 32 | "Content-Type: message/http\r\n" + 33 | "Content-Length: 202\r\n" + 34 | "\r\n" + 35 | "HTTP/1.0 304 Not Modified\r\n" + 36 | "Date: Tue, 06 Mar 2017 00:43:35 GMT\r\n" + 37 | "Server: Apache/2.0.54 (Ubuntu) PHP/5.0.5-2ubuntu1.4 Connection: Keep-Alive\r\n" + 38 | "Keep-Alive: timeout=15, max=100\r\n" + 39 | "ETag: \"3e45-67e-2ed02ec0\"\r\n" + 40 | "\r\n" + 41 | "this line should not be read"; 42 | 43 | @Test 44 | public void test() throws IOException { 45 | WarcRevisit revisit = (WarcRevisit) new WarcReader(new ByteArrayInputStream(warc.getBytes(UTF_8))).next().get(); 46 | assertEquals(WarcRevisit.SERVER_NOT_MODIFIED_1_1, revisit.profile()); 47 | assertEquals(Instant.parse("2016-09-19T17:20:24Z"), revisit.refersToDate().get()); 48 | assertEquals(URI.create("http://www.archive.org/images/logoc.jpg"), revisit.refersToTargetURI().get()); 49 | assertEquals(URI.create("urn:uuid:92283950-ef2f-4d72-b224-f54c6ec90bb0"), revisit.refersTo().get()); 50 | assertEquals(304, revisit.http().status()); 51 | assertEquals(Optional.of("timeout=15, max=100"), revisit.http().headers().sole("Keep-Alive")); 52 | assertFalse(revisit.payload().isPresent()); 53 | } 54 | 55 | @Test 56 | public void buildingWithoutRefersToRecordId() { 57 | WarcRevisit revisit = new WarcRevisit.Builder(URI.create("http://example.org/"), 58 | WarcRevisit.IDENTICAL_PAYLOAD_DIGEST_1_1) 59 | .refersTo((URI)null, URI.create("http://example.org/other"), Instant.parse("2016-09-19T17:20:24Z")) 60 | .build(); 61 | assertEquals(Optional.empty(), revisit.refersTo()); 62 | assertEquals(Optional.of(URI.create("http://example.org/other")), revisit.refersToTargetURI()); 63 |
assertEquals(Optional.of(Instant.parse("2016-09-19T17:20:24Z")), revisit.refersToDate()); 64 | } 65 | 66 | @Test 67 | public void builder() throws IOException { 68 | URI target = URI.create("http://example.org/"); 69 | Instant date = Instant.now(); 70 | URI reference = URI.create("urn:uuid:92283950-ef2f-4d72-b224-f54c6ec90bb0"); 71 | WarcRevisit revisit = new WarcRevisit.Builder(target, WarcRevisit.IDENTICAL_PAYLOAD_DIGEST_1_1) 72 | .refersTo(reference, target, date) 73 | .build(); 74 | assertEquals(WarcRevisit.IDENTICAL_PAYLOAD_DIGEST_1_1, revisit.profile()); 75 | assertEquals(target, revisit.targetURI()); 76 | assertEquals(date, revisit.refersToDate().get()); 77 | assertEquals(target, revisit.refersToTargetURI().get()); 78 | assertEquals(reference, revisit.refersTo().get()); 79 | } 80 | 81 | } -------------------------------------------------------------------------------- /test/org/netpreserve/jwarc/apitests/WarcinfoTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-License-Identifier: Apache-2.0 3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors 4 | */ 5 | 6 | package org.netpreserve.jwarc.apitests; 7 | 8 | import org.junit.Assert; 9 | import org.junit.Test; 10 | import org.netpreserve.jwarc.MediaType; 11 | import org.netpreserve.jwarc.MessageHeaders; 12 | import org.netpreserve.jwarc.WarcReader; 13 | import org.netpreserve.jwarc.Warcinfo; 14 | 15 | import java.io.ByteArrayInputStream; 16 | import java.io.IOException; 17 | import java.net.URI; 18 | import java.time.Instant; 19 | import java.util.Arrays; 20 | import java.util.HashMap; 21 | import java.util.List; 22 | import java.util.Map; 23 | 24 | import static java.nio.charset.StandardCharsets.UTF_8; 25 | import static org.junit.Assert.assertEquals; 26 | 27 | public class WarcinfoTest { 28 | final static String warc = "WARC/1.0\r\n" + 29 | "WARC-Type: warcinfo\r\n" + 30 | "WARC-Date: 2006-09-19T17:20:14Z\r\n" + 31 | 
"WARC-Record-ID: \r\n" + 32 | "WARC-Filename:hello.warc\r\n" + 33 | "Content-Type: application/warc-fields\r\n" + 34 | "Content-Length: 399\r\n" + 35 | "Folded: a \r\n" + 36 | " b\t \r\n" + 37 | "\t\tc \r\n" + 38 | "\r\n" + 39 | "software: Heritrix 1.12.0 http://crawler.archive.org\r\n" + 40 | "hostname: crawling017.archive.org\r\n" + 41 | "ip: 207.241.227.234\r\n" + 42 | "isPartOf: testcrawl-20050708\r\n" + 43 | "description: testcrawl with WARC output\r\n" + 44 | "operator: IA\\_Admin\r\n" + 45 | "http-header-user-agent:\r\n" + 46 | " Mozilla/5.0 (compatible; heritrix/1.4.0 +http://crawler.archive.org)\r\n" + 47 | "format: WARC file version 1.0\r\n" + 48 | "conformsTo:\r\n" + 49 | " http://www.archive.org/documents/WarcFileFormat-1.0.html\r\n\r\n"; 50 | 51 | @Test 52 | public void test() throws IOException { 53 | Warcinfo warcinfo = (Warcinfo) new WarcReader(new ByteArrayInputStream(warc.getBytes(UTF_8))).next().get(); 54 | assertEquals(URI.create("urn:uuid:d7ae5c10-e6b3-4d27-967d-34780c58ba39"), warcinfo.id()); 55 | assertEquals(Instant.parse("2006-09-19T17:20:14Z"), warcinfo.date()); 56 | assertEquals("hello.warc", warcinfo.filename().get()); 57 | assertEquals(399, warcinfo.body().size()); 58 | Assert.assertEquals(MediaType.WARC_FIELDS, warcinfo.contentType()); 59 | MessageHeaders fields = warcinfo.fields(); 60 | assertEquals("207.241.227.234", fields.sole("ip").get()); 61 | assertEquals("http://www.archive.org/documents/WarcFileFormat-1.0.html", fields.sole("conformsTo").get()); 62 | assertEquals("a b c", warcinfo.headers().sole("Folded").get()); 63 | } 64 | 65 | @Test 66 | public void builder() throws IOException { 67 | Map> fields = new HashMap<>(); 68 | fields.put("hello", Arrays.asList("one", "two")); 69 | Warcinfo warcinfo = new Warcinfo.Builder() 70 | .filename("hello.warc") 71 | .fields(fields) 72 | .build(); 73 | assertEquals("warcinfo", warcinfo.type()); 74 | assertEquals("hello.warc", warcinfo.filename().get()); 75 | assertEquals("one", 
warcinfo.fields().first("hello").get()); 76 | assertEquals(Arrays.asList("one", "two"), warcinfo.fields().all("hello")); 77 | assertEquals(MediaType.WARC_FIELDS, warcinfo.contentType()); 78 | } 79 | 80 | } -------------------------------------------------------------------------------- /test/org/netpreserve/jwarc/cdx/CdxReaderTest.java: -------------------------------------------------------------------------------- 1 | package org.netpreserve.jwarc.cdx; 2 | 3 | import org.junit.Test; 4 | import org.netpreserve.jwarc.MediaType; 5 | 6 | import java.io.BufferedReader; 7 | import java.io.IOException; 8 | import java.io.StringReader; 9 | import java.time.Instant; 10 | 11 | import static org.junit.Assert.*; 12 | 13 | public class CdxReaderTest { 14 | @Test 15 | public void test() throws IOException { 16 | String data = "- 20220302214434 http://example.org/ text/html 200 AQLNJ7DOPHK477BWWC726H7Y5XBPBNF7 - - 1062 760582405 example.warc.gz\n" + 17 | "- 20220302214433 https://example.org/page/ application/rss+xml 200 AQO24VNPMHIM6GUNVSCP7IUUETZ4U52J - - 971 760584354 example.warc.gz\n" + 18 | "- 20220302214434 https://example.org/style.css text/css 200 AG2PTU7G6DMXCBP6IBSR5VG5RUMYOHHN - - 749 760586303 example.warc.gz\n"; 19 | 20 | try (CdxReader reader = new CdxReader(new BufferedReader(new StringReader(data)))) { 21 | CdxRecord record = reader.next().get(); 22 | assertEquals(200, (int) record.status()); 23 | assertEquals("http://example.org/", record.target()); 24 | assertEquals("AQLNJ7DOPHK477BWWC726H7Y5XBPBNF7", record.digest()); 25 | assertEquals(760582405, (long) record.position()); 26 | assertEquals(1062, (long) record.size()); 27 | assertEquals("example.warc.gz", record.filename()); 28 | assertEquals(Instant.parse("2022-03-02T21:44:34Z"), record.date()); 29 | assertEquals(MediaType.HTML, record.contentType()); 30 | 31 | assertTrue(reader.next().isPresent()); 32 | assertTrue(reader.next().isPresent()); 33 | assertFalse(reader.next().isPresent()); 34 | } 35 | } 36 | } 
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/cdx/CdxWriterTest.java:
--------------------------------------------------------------------------------
package org.netpreserve.jwarc.cdx;

import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import org.netpreserve.jwarc.*;

import java.io.IOException;
import java.io.StringWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Instant;
import java.util.Collections;

import static java.nio.file.StandardOpenOption.CREATE;
import static java.nio.file.StandardOpenOption.WRITE;
import static org.junit.Assert.assertEquals;

/** Tests generating CDX lines from a WARC file with {@link CdxWriter}. */
public class CdxWriterTest {
    /** Provides the temporary file the test WARC is written to. */
    @Rule
    public TemporaryFolder temporaryFolder = new TemporaryFolder();


    /**
     * Writes a response record and a revisit record for the same URL to a temporary
     * WARC file, then checks the header line and the two CDX lines produced for it.
     */
    @Test
    public void test() throws IOException {
        Path testWarcFile = temporaryFolder.newFile().toPath().toAbsolutePath();
        try (WarcWriter warcWriter = new WarcWriter(Files.newByteChannel(testWarcFile, CREATE, WRITE))) {
            HttpResponse httpResponse = new HttpResponse.Builder(404, "Not Found")
                    .body(MediaType.HTML, new byte[0])
                    .build();
            warcWriter.write(new WarcResponse.Builder("http://example.org/")
                    .date(Instant.parse("2022-03-01T12:44:34Z"))
                    .body(httpResponse)
                    .payloadDigest("sha256", "b04af472c47a8b1b5059b3404caac0e1bfb5a3c07b329be66f65cfab5ee8d3f3")
                    .build());
            warcWriter.write(new WarcRevisit.Builder("http://example.org/")
                    .date(Instant.parse("2022-03-02T21:44:34Z"))
                    .body(httpResponse)
                    .payloadDigest("sha256", "b04af472c47a8b1b5059b3404caac0e1bfb5a3c07b329be66f65cfab5ee8d3f3")
                    .build());
        }

        StringWriter cdxBuffer = new StringWriter();
        CdxWriter cdxWriter = new CdxWriter(cdxBuffer);
        cdxWriter.setFormat(new CdxFormat.Builder().digestUnchanged().build());
        cdxWriter.writeHeaderLine();
        cdxWriter.process(Collections.singletonList(testWarcFile), true);
        assertEquals(" CDX N b a m s k r M S V g\n" +
                        "org,example)/ 20220301124434 http://example.org/ text/html 404 sha256:WBFPI4WEPKFRWUCZWNAEZKWA4G73LI6APMZJXZTPMXH2WXXI2PZQ==== - - 398 0 " + testWarcFile + "\n" +
                        "org,example)/ 20220302214434 http://example.org/ warc/revisit 404 sha256:WBFPI4WEPKFRWUCZWNAEZKWA4G73LI6APMZJXZTPMXH2WXXI2PZQ==== - - 397 398 " + testWarcFile + "\n",
                cdxBuffer.toString());
    }

}
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/cdx/JsonTokenizerTest.java:
--------------------------------------------------------------------------------
/*
 * SPDX-License-Identifier: Apache-2.0
 * Copyright (C) 2023 National Library of Australia and the jwarc contributors
 */
package org.netpreserve.jwarc.cdx;

import org.junit.Test;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import static java.util.Collections.singletonList;
import static org.junit.Assert.assertEquals;
import static org.netpreserve.jwarc.cdx.JsonToken.*;

/** Tests the token stream and decoded values produced by {@link JsonTokenizer}. */
public class JsonTokenizerTest {
    /**
     * Tokenizes {@code json} and collects every token, in order, until
     * {@code nextToken()} returns null.
     */
    static List<JsonToken> tokenize(String json) throws IOException, JsonException {
        List<JsonToken> tokens = new ArrayList<>();
        JsonTokenizer parser = new JsonTokenizer(new StringReader(json));
        while (true) {
            JsonToken token = parser.nextToken();
            if (token == null) break;
            tokens.add(token);
        }
        return tokens;
    }

    /**
     * Tokenizes {@code json} and collects a value for each token: the string value
     * for STRING, an Integer for NUMBER_INT, a Double for NUMBER_FLOAT, and the
     * token itself for every other token type.
     */
    static List<Object> tokenizeValues(String json) throws IOException, JsonException {
        List<Object> values = new ArrayList<>();
        JsonTokenizer parser = new JsonTokenizer(new StringReader(json));
        while (true) {
            JsonToken token = parser.nextToken();
            if (token == null) break;
            if (token == STRING) {
                values.add(parser.stringValue());
            } else if (token == NUMBER_INT) {
                values.add(Integer.parseInt(parser.stringValue()));
            } else if (token == NUMBER_FLOAT) {
                values.add(Double.parseDouble(parser.stringValue()));
            } else {
                values.add(token);
            }
        }
        return values;
    }

    /** Covers arrays, objects, nesting, whitespace, number forms and string escapes. */
    @Test
    public void test() throws IOException, JsonException {
        assertEquals(Arrays.asList(START_ARRAY, END_ARRAY), tokenize("[]"));
        assertEquals(Arrays.asList(START_ARRAY, NUMBER_INT, END_ARRAY), tokenize("[5]"));
        assertEquals(Arrays.asList(START_ARRAY, NUMBER_INT, NUMBER_INT, END_ARRAY), tokenize("[5, 6]"));
        assertEquals(Arrays.asList(START_ARRAY, NUMBER_INT, NUMBER_FLOAT, END_ARRAY), tokenize(" [ 5,\t\t6.0 ] "));
        assertEquals(Arrays.asList(START_ARRAY, NUMBER_INT, NUMBER_FLOAT, STRING, END_ARRAY), tokenize("[5,6.0,\"foo\"]"));
        assertEquals(Arrays.asList(START_ARRAY, NUMBER_INT, NUMBER_FLOAT, STRING, TRUE, FALSE, NULL, END_ARRAY), tokenize("[5,6.0,\"foo\",true,false,null]"));
        assertEquals(Arrays.asList(START_OBJECT, FIELD_NAME, NUMBER_INT, END_OBJECT), tokenize("{\"foo\":5}"));
        assertEquals(Arrays.asList(START_OBJECT, FIELD_NAME, NUMBER_INT, FIELD_NAME, NUMBER_FLOAT, END_OBJECT), tokenize("{\"foo\":5,\"bar\":6.0}"));
        assertEquals(Arrays.asList(START_OBJECT, FIELD_NAME, NUMBER_INT, FIELD_NAME, NUMBER_FLOAT, FIELD_NAME, STRING, END_OBJECT), tokenize("{\"foo\":5,\"bar\":6.0,\"baz\":\"q\"}"));
        assertEquals(Arrays.asList(START_OBJECT, FIELD_NAME, START_OBJECT, FIELD_NAME, NUMBER_INT, END_OBJECT, END_OBJECT), tokenize("{\"foo\":{\"bar\":5}}"));
        assertEquals(Arrays.asList(START_OBJECT, FIELD_NAME, START_ARRAY, NUMBER_INT, START_ARRAY, END_ARRAY, NUMBER_FLOAT, END_ARRAY, END_OBJECT), tokenize("{\"foo\":[5,[],6.0]}"));
        // Every fraction/exponent spelling must be reported as a float.
        assertEquals(singletonList(NUMBER_FLOAT), tokenize("0.0"));
        assertEquals(singletonList(NUMBER_FLOAT), tokenize("1e0"));
        assertEquals(singletonList(NUMBER_FLOAT), tokenize("1e+0"));
        assertEquals(singletonList(NUMBER_FLOAT), tokenize("1e-0"));
        assertEquals(singletonList(NUMBER_FLOAT), tokenize("1.0e0"));
        assertEquals(singletonList(NUMBER_FLOAT), tokenize("1.0e+0"));
        assertEquals(singletonList(NUMBER_FLOAT), tokenize("1.0e-0"));
        assertEquals(Arrays.asList(START_ARRAY, 0.0, -0.0, 1.0, 5, END_ARRAY), tokenizeValues("[0.0, -0.0, 1.0, 5]"));
        assertEquals(singletonList(" \t\r\n\0ሴ\"\\/"), tokenizeValues("\" \\t\\r\\n\\u0000\\u1234\\\"\\\\\\/\""));
    }

}
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/net/WarcServerTest.java:
--------------------------------------------------------------------------------
package org.netpreserve.jwarc.net;

import org.junit.Test;

import java.io.IOException;
import java.net.ServerSocket;
import java.util.Collections;

import static org.junit.Assert.*;

/** Smoke test for constructing a {@link WarcServer}. */
public class WarcServerTest {
    /** Constructs a server over an unbound socket and an empty list; passes if no exception is thrown. */
    @Test
    public void test() throws IOException {
        try (ServerSocket serverSocket = new ServerSocket()) {
            // so far just testing that we can instantiate it and load the .js files
            WarcServer server = new WarcServer(serverSocket, Collections.emptyList());
        }
    }
}
--------------------------------------------------------------------------------