set = new HashSet<>();
12 |
13 | /**
14 | * Adds a record to the set.
15 | */
16 | public void add(WarcRecord record) {
17 | set.add(record.id());
18 | if (record instanceof WarcCaptureRecord) {
19 | set.addAll(((WarcCaptureRecord) record).concurrentTo());
20 | }
21 | }
22 |
23 | /**
24 | * Tests if the given record is concurrent to any previously added record.
25 | */
26 | public boolean contains(WarcRecord record) {
27 | if (set.contains(record.id())) return true;
28 | if (record instanceof WarcCaptureRecord) {
29 | for (URI id : ((WarcCaptureRecord) record).concurrentTo()) {
30 | if (set.contains(id)) return true;
31 | }
32 | }
33 | return false;
34 | }
35 |
36 | /**
37 | * Removes all records from the set.
38 | */
39 | public void clear() {
40 | set.clear();
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/DecodedBody.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2024 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc;
7 |
8 | import java.io.IOException;
9 | import java.nio.ByteBuffer;
10 | import java.nio.channels.ReadableByteChannel;
11 |
12 |
13 | /**
14 | * A message body which decodes content on-the-fly using the specified encoding.
15 | */
16 | public class DecodedBody extends MessageBody {
17 |
18 | public static enum Encoding {
19 | DEFLATE,
20 | GZIP,
21 | BROTLI
22 | }
23 |
24 | private final ReadableByteChannel channel;
25 | private final Encoding encoding;
26 | long position = 0;
27 |
28 | private DecodedBody(ReadableByteChannel channel, Encoding encoding) throws IOException {
29 | this.encoding = encoding;
30 | switch (this.encoding) {
31 | case DEFLATE:
32 | this.channel = IOUtils.inflateChannel(channel);
33 | break;
34 | case GZIP:
35 | this.channel = IOUtils.gunzipChannel(channel);
36 | break;
37 | case BROTLI:
38 | try {
39 | this.channel = BrotliUtils.brotliChannel(channel);
40 | } catch (NoClassDefFoundError e) {
41 | throw new IOException("Brotli decoder not found, please install org.brotli:dec", e);
42 | }
43 | break;
44 | default:
45 | throw new IOException("Unsupported encoding");
46 | }
47 | }
48 |
49 | public static DecodedBody create(ReadableByteChannel channel, Encoding encoding) throws IOException {
50 | return new DecodedBody(channel, encoding);
51 | }
52 |
53 | @Override
54 | public long position() throws IOException {
55 | return position;
56 | };
57 |
58 | @Override
59 | public int read(ByteBuffer dst) throws IOException {
60 | int n = channel.read(dst);
61 | if (n > 0) {
62 | position += n;
63 | }
64 | return n;
65 | }
66 |
67 | @Override
68 | public boolean isOpen() {
69 | return channel.isOpen();
70 | }
71 |
72 | @Override
73 | public void close() throws IOException {
74 | channel.close();
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/DigestingMessageBody.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2021 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc;
7 |
8 | import java.io.IOException;
9 | import java.nio.ByteBuffer;
10 | import java.security.MessageDigest;
11 |
12 | /**
13 | * Wrapper around a MessageBody which calculates a MessageDigest while the body
14 | * is read.
15 | */
16 | class DigestingMessageBody extends MessageBody {
17 | private final MessageBody body;
18 | private final MessageDigest digest;
19 |
20 | DigestingMessageBody(MessageBody digestedBody, MessageDigest digest) {
21 | this.body = digestedBody;
22 | this.digest = digest;
23 | }
24 |
25 | @Override
26 | public int read(ByteBuffer dst) throws IOException {
27 | int i = body.read(dst);
28 | if (i > 0) {
29 | ByteBuffer tmp = dst.duplicate();
30 | tmp.position(dst.position() - i);
31 | tmp.limit(dst.position());
32 | digest.update(tmp);
33 | }
34 | return i;
35 | }
36 |
37 | @Override
38 | public boolean isOpen() {
39 | return body.isOpen();
40 | }
41 |
42 | @Override
43 | public void close() throws IOException {
44 | body.close();
45 | }
46 |
47 | @Override
48 | public long position() throws IOException {
49 | return body.position();
50 | }
51 |
52 | public MessageDigest getDigest() {
53 | return digest;
54 | }
55 |
56 | @Override
57 | public long size() throws IOException {
58 | return body.size();
59 | }
60 | }
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/FetchOptions.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2023 National Library of Australia and the jwarc contributors
4 | */
5 | package org.netpreserve.jwarc;
6 |
7 | import java.io.OutputStream;
8 | import java.net.URI;
9 |
10 |
/**
 * Fluent holder for the settings that control how a remote resource is fetched.
 * Every setter returns this instance so calls can be chained.
 *
 * @see WarcWriter#fetch(URI, FetchOptions)
 */
@SuppressWarnings("UnusedReturnValue")
public class FetchOptions {
    long maxLength;
    long maxTime;
    int readTimeout = 60000;
    String userAgent = "jwarc";
    OutputStream copyTo;

    /**
     * Limits the fetch to this many received bytes, protocol headers included. When the limit is hit the
     * response record gets the header "WARC-Truncated: length".
     */
    public FetchOptions maxLength(long bytes) {
        maxLength = bytes;
        return this;
    }

    /**
     * Limits the fetch to this many elapsed milliseconds. When the limit is hit the response record gets
     * the header "WARC-Truncated: time".
     */
    public FetchOptions maxTime(long millis) {
        maxTime = millis;
        return this;
    }

    /**
     * Sets the socket read timeout in milliseconds. Defaults to 60000. Use 0 for no timeout.
     *
     * @see java.net.Socket#setSoTimeout(int)
     */
    public FetchOptions readTimeout(int millis) {
        readTimeout = millis;
        return this;
    }

    /**
     * Sets the value of the User-Agent request header. Default: "jwarc"
     *
     * This option is ignored when a custom HTTP request is provided.
     */
    public FetchOptions userAgent(String userAgent) {
        this.userAgent = userAgent;
        return this;
    }

    /**
     * When set, the response is copied to this OutputStream in addition to the WARC file.
     */
    public FetchOptions copyTo(OutputStream copyTo) {
        this.copyTo = copyTo;
        return this;
    }
}
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/FetchResult.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2023 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc;
7 |
8 | /**
9 | * The result of a fetch operation. This contains the request and response as WARC records (without payloads) so that
10 | * the request and response headers can be inspected.
11 | */
12 | public class FetchResult {
13 | private final WarcRequest request;
14 | private final WarcResponse response;
15 | private final Throwable exception;
16 |
17 | FetchResult(WarcRequest request, WarcResponse response, Throwable exception) {
18 | this.request = request;
19 | this.response = response;
20 | this.exception = exception;
21 | }
22 |
23 | /**
24 | * The WARC record containing the request that was sent. The request body will not be readable.
25 | */
26 | public WarcRequest request() {
27 | return request;
28 | }
29 |
30 | /**
31 | * The WARC record containing the request that was sent. The response body will not be readable.
32 | */
33 | public WarcResponse response() {
34 | return response;
35 | }
36 |
37 | /**
38 | * If the fetch was interrupted by an exception but truncated records were still written this will return the caught
39 | * exception. This can occur if the WarcWriter was closed during the fetch.
40 | */
41 | public Throwable exception() {
42 | return exception;
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/GeminiParser.rl:
--------------------------------------------------------------------------------
1 | // recompile: ragel -J GeminiParser.rl -o GeminiParser.java
2 | // diagram: ragel -Vp GeminiParser.rl | dot -Tpng | feh -
3 | // spec: https://gemini.circumlunar.space/docs/specification.gmi
%%{

machine gemini;

# Feed the machine the byte at index p, masked so it is seen as an unsigned value.
getkey (data.get(p) & 0xff);

# Embedded actions: the bodies are Java and run inside GeminiParser.parse(ByteBuffer).
# push:        hand the current byte to push(), which appends it to the scratch buffer
# add_status:  fold the current decimal digit into the numeric status
# handle_meta: decode the buffered bytes as UTF-8 into meta and empty the buffer
# handle_url:  decode the buffered bytes as UTF-8 into url and empty the buffer
# finish:      flag the message as complete and break out of the exec loop
action push { push(data.get(p)); }
action add_status { status = status * 10 + data.get(p) - '0'; }
action handle_meta { meta = new String(buf, 0, bufPos, UTF_8); bufPos = 0; }
action handle_url { url = new String(buf, 0, bufPos, UTF_8); bufPos = 0; }
action finish { finished = true; fbreak; }

CRLF = "\r\n";

# The spec doesn't mention any disallowed characters,
# but we assume \r and \n are excluded.
utf8_string = (any - '\r' - '\n')*;

# Request: every byte of the URL is buffered ($push), the URL is decoded when the
# string is left (%handle_url), and parsing finishes on the closing CRLF (@finish).
url = utf8_string $push %handle_url;
gemini_request := url CRLF @finish;

# Response: two status digits, a single space, the meta string, then CRLF.
meta = utf8_string $push %handle_meta;
status = digit {2} $add_status;
gemini_response := status " " meta CRLF @finish;

}%%
30 |
31 | package org.netpreserve.jwarc;
32 |
33 | import java.io.EOFException;
34 | import java.io.IOException;
35 | import java.nio.ByteBuffer;
36 | import java.nio.channels.ReadableByteChannel;
37 | import java.nio.channels.WritableByteChannel;
38 | import java.util.*;
39 |
40 | import static java.nio.charset.StandardCharsets.UTF_8;
41 |
/**
 * Incremental parser for Gemini request and response header lines. The state machine itself is generated by
 * Ragel from the machine definition in this file; the "%% write ..." lines below are Ragel directives that are
 * replaced with generated code, so the names cs, p, pe, eof and data must not be renamed.
 */
public class GeminiParser extends MessageParser {
    // state to restore on reset(); set by strictRequest() / strictResponse()
    private int initialState;
    // current state of the Ragel machine
    private int cs;
    // total number of bytes consumed since the last reset
    private long position;
    private boolean finished;
    // scratch buffer collecting the bytes of the field currently being parsed
    private byte[] buf = new byte[256];
    private int bufPos = 0;
    private int status;
    private String meta;
    private String url;

    /**
     * Creates a parser. One of strictRequest() or strictResponse() has to be called before parsing to select
     * which machine to run; until then initialState still has its default value of 0.
     */
    public GeminiParser() {
        reset();
    }

    /**
     * Clears all parsed values and returns the machine to the configured initial state.
     */
    public void reset() {
        %% write init;
        bufPos = 0;
        // NOTE(review): push() refuses fields beyond 1024 bytes and only ever doubles from 256,
        // so within this class the buffer does not appear able to exceed 1024 - confirm.
        if (buf.length > 8192) {
            buf = new byte[256]; // if our buffer grew really big release it
        }
        status = 0;
        meta = null;
        url = null;
        position = 0;
        finished = false;
        // overrides the state assigned by the generated init code above
        cs = initialState;
    }

    /**
     * The numeric status accumulated from the response's status digits.
     */
    public int status() {
        return status;
    }

    /**
     * The meta string of the response, or null if it has not been parsed.
     */
    public String meta() {
        return meta;
    }

    /**
     * The URL of the request, or null if it has not been parsed.
     */
    public String url() {
        return url;
    }

    /**
     * True once the terminating CRLF of the message has been consumed.
     */
    public boolean isFinished() {
        return finished;
    }

    /**
     * True if the machine is in its error state.
     */
    public boolean isError() {
        return cs == gemini_error;
    }

    /**
     * Configures the parser to read a gemini request while rejecting deviations from the standard.
     */
    public void strictRequest() {
        cs = gemini_en_gemini_request;
        initialState = cs;
    }

    /**
     * Configures the parser to read a gemini response while rejecting deviations from the standard.
     */
    public void strictResponse() {
        cs = gemini_en_gemini_response;
        initialState = cs;
    }

    /**
     * Runs the parser on a buffer of data. Passing null as the buffer indicates the end of input.
     * On return the buffer's position has been advanced past the bytes that were consumed.
     */
    @SuppressWarnings({"UnusedAssignment", "ConstantConditions", "ConditionalBreakInInfiniteLoop"})
    public void parse(ByteBuffer data) throws ParsingException {
        // p, pe and eof are the variable names the generated exec code operates on
        int p;
        int pe;
        int eof;

        if (data == null) {
            // end of input: empty range with eof equal to pe
            p = 0;
            pe = 0;
            eof = 0;
        } else {
            p = data.position();
            pe = data.limit();
            eof = -1;
        }

        %% write exec;

        if (data != null) {
            // account for the bytes consumed by this call and publish the new read index
            position += p - data.position();
            data.position(p);
        }
    }

    /**
     * Parses a message from the channel, using the buffer for any data already read and for further reads.
     */
    public void parse(ReadableByteChannel channel, ByteBuffer buffer) throws IOException {
        parse(channel, buffer, null);
    }

    /**
     * Parses a message from the channel, refilling the buffer as needed. If copyTo is non-null the bytes
     * consumed by the parser are also written to it.
     *
     * @throws ParsingException if the machine enters its error state
     */
    void parse(ReadableByteChannel channel, ByteBuffer buffer, WritableByteChannel copyTo) throws IOException {
        while (true) {
            // snapshot of the buffer before parsing, used to copy out exactly the consumed bytes
            ByteBuffer copy = buffer.duplicate();
            // difference between buffer index and stream position, so the buffer index of the
            // failure point can be recovered after position has advanced
            long buffOffset = buffer.position() - position;
            parse(buffer);
            if (copyTo != null) {
                copy.limit(buffer.position());
                copyTo.write(copy);
            }
            if (isFinished()) {
                break;
            }
            if (isError()) {
                throw new ParsingException("invalid gemini message at byte position " + position + ": "
                        + getErrorContext(buffer.duplicate(), (int) (buffOffset + position), 40));
            }
            // keep unconsumed bytes, append freshly read ones, then switch back to reading mode
            buffer.compact();
            int n = channel.read(buffer);
            buffer.flip();
            if (n < 0) {
                // end of stream: signal EOF to the machine and stop, whether or not the message finished
                parse(null);
                break;
            }
        }
    }

    /**
     * Appends a byte to the scratch buffer, doubling the buffer when it is full.
     *
     * @throws ParsingException if the field already holds 1024 bytes
     */
    private void push(byte b) throws ParsingException {
        if (bufPos >= 1024) throw new ParsingException("gemini header field longer than 1024 bytes");
        if (bufPos >= buf.length) {
            buf = Arrays.copyOf(buf, buf.length * 2);
        }
        buf[bufPos++] = b;
    }

    %% write data;
}
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/GeminiRequest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2023 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc;
7 |
8 | import java.io.IOException;
9 | import java.nio.ByteBuffer;
10 | import java.nio.channels.ReadableByteChannel;
11 |
12 | public class GeminiRequest {
13 | private final String url;
14 |
15 | public GeminiRequest(String url) {
16 | this.url = url;
17 | }
18 |
19 | public static GeminiRequest parse(ReadableByteChannel channel, ByteBuffer buffer) throws IOException {
20 | GeminiParser parser = new GeminiParser();
21 | parser.strictRequest();
22 | parser.parse(channel, buffer, null);
23 | return new GeminiRequest(parser.url());
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/GeminiResponse.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2023 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc;
7 |
8 | import java.io.IOException;
9 | import java.nio.ByteBuffer;
10 | import java.nio.channels.ReadableByteChannel;
11 | import java.nio.charset.StandardCharsets;
12 | import java.util.Collections;
13 |
14 | public class GeminiResponse extends Message {
15 | private final int status;
16 | private final String meta;
17 |
18 | public GeminiResponse(int status, String meta, MessageBody body) {
19 | super(MessageVersion.GEMINI, new MessageHeaders(Collections.emptyMap()), body);
20 | this.status = status;
21 | this.meta = meta;
22 | }
23 |
24 | public static GeminiResponse parse(ReadableByteChannel channel, ByteBuffer buffer) throws IOException {
25 | GeminiParser parser = new GeminiParser();
26 | parser.strictResponse();
27 | parser.parse(channel, buffer, null);
28 | return new GeminiResponse(parser.status(), parser.meta(), LengthedBody.createFromContentLength(channel, buffer, null));
29 | }
30 |
31 | public int status() {
32 | return status;
33 | }
34 |
35 | /**
36 | * Returns the HTTP equivalent of the status code. (e.g. 20 -> 200, 51 -> 404)
37 | */
38 | public int statusHttpEquivalent() {
39 | switch (status) {
40 | case 20:
41 | return 200;
42 | case 31: // redirect - temporary
43 | return 307;
44 | case 32: // redirect - permanent
45 | return 308;
46 | case 40: // temporary failure
47 | return 503;
48 | case 41: // server unavailable
49 | return 503;
50 | case 42: // CGI error
51 | return 500;
52 | case 43: // proxy error
53 | return 502;
54 | case 44: // slow down
55 | return 429;
56 | case 50: // permanent failure
57 | return 500;
58 | case 51: // not found
59 | return 404;
60 | case 52: // gone
61 | return 410;
62 | case 53: // proxy request refused
63 | return 502;
64 | case 59: // bad request
65 | return 400;
66 | case 60: // client certificate required
67 | return 401;
68 | case 61: // certificate not authorized
69 | return 403;
70 | case 62: // certificate not valid
71 | return 403;
72 | default:
73 | if (status > 10 && status < 20) { // input
74 | return 100;
75 | } else if (status >= 20 && status < 30) { // success
76 | return 200;
77 | } else if (status >= 30 && status < 40) { // redirect
78 | return 307;
79 | } else if (status >= 60 && status < 70) { // client cert required
80 | return 401;
81 | } else {
82 | return 500;
83 | }
84 | }
85 | }
86 |
87 | public String meta() {
88 | return meta;
89 | }
90 |
91 | @Override
92 | public byte[] serializeHeader() {
93 | return (String.format("%02d", status) + " " + meta + "\r\n").getBytes(StandardCharsets.UTF_8);
94 | }
95 |
96 | @Override
97 | public MediaType contentType() {
98 | if (status >= 20 && status < 30) {
99 | if (meta.isEmpty()) {
100 | return MediaType.parse("text/gemini; charset=utf-8");
101 | }
102 | return MediaType.parseLeniently(meta);
103 | } else {
104 | return MediaType.OCTET_STREAM;
105 | }
106 | }
107 | }
108 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/HttpMessage.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc;
7 |
8 | import java.nio.charset.Charset;
9 | import java.util.List;
10 |
11 | import static java.nio.charset.StandardCharsets.ISO_8859_1;
12 |
13 | import java.io.IOException;
14 |
15 | public abstract class HttpMessage extends Message {
16 | HttpMessage(MessageVersion version, MessageHeaders headers, MessageBody body) {
17 | super(version, headers, body);
18 | }
19 |
20 | @Override
21 | Charset headerCharset() {
22 | return ISO_8859_1;
23 | }
24 |
25 | /**
26 | * The HTTP payload with Content-Encoding decoded.
27 | *
28 | * @return a message body with content decoded following the HTTP
29 | * Content-Encoding header.
30 | * @throws IOException
31 | */
32 | public MessageBody bodyDecoded() throws IOException {
33 | MessageBody payload = body();
34 | List contentEncodings = headers().all("Content-Encoding");
35 | if (contentEncodings.isEmpty()) {
36 | return payload;
37 | } else if (contentEncodings.size() > 1) {
38 | throw new IOException("Multiple Content-Encodings not supported: " + contentEncodings);
39 | } else if (contentEncodings.get(0).equalsIgnoreCase("identity")
40 | || contentEncodings.get(0).equalsIgnoreCase("none")) {
41 | return payload;
42 | } else if (contentEncodings.get(0).equalsIgnoreCase("gzip")
43 | || contentEncodings.get(0).equalsIgnoreCase("x-gzip")) {
44 | return DecodedBody.create(payload, DecodedBody.Encoding.GZIP);
45 | } else if (contentEncodings.get(0).equalsIgnoreCase("br")) {
46 | return DecodedBody.create(payload, DecodedBody.Encoding.BROTLI);
47 | } else if (contentEncodings.get(0).equalsIgnoreCase("deflate")) {
48 | return DecodedBody.create(payload, DecodedBody.Encoding.DEFLATE);
49 | } else {
50 | throw new IOException("Content-Encoding not supported: " + contentEncodings.get(0));
51 | }
52 |
53 | }
54 |
55 | public abstract static class AbstractBuilder> extends Message.AbstractBuilder {
56 | public AbstractBuilder() {
57 | super(MessageVersion.HTTP_1_1);
58 | }
59 |
60 | @Override
61 | public B version(MessageVersion version) {
62 | version.requireProtocol("HTTP");
63 | return super.version(version);
64 | }
65 | }
66 | }
67 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/HttpRequest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc;
7 |
8 | import java.io.ByteArrayOutputStream;
9 | import java.io.EOFException;
10 | import java.io.IOException;
11 | import java.net.URI;
12 | import java.nio.ByteBuffer;
13 | import java.nio.channels.Channels;
14 | import java.nio.channels.ReadableByteChannel;
15 | import java.nio.channels.WritableByteChannel;
16 |
17 | public class HttpRequest extends HttpMessage {
18 | private final String method;
19 | private final String target;
20 |
21 | HttpRequest(String method, String target, MessageVersion version, MessageHeaders headers, MessageBody body) {
22 | super(version, headers, body);
23 | this.method = method;
24 | this.target = target;
25 | }
26 |
27 | public String target() {
28 | return target;
29 | }
30 |
31 | public String method() {
32 | return method;
33 | }
34 |
35 | @Override
36 | void serializeHeaderTo(Appendable output) throws IOException {
37 | output.append(method);
38 | output.append(' ');
39 | output.append(target);
40 | output.append(' ');
41 | output.append(version().toString());
42 | output.append("\r\n");
43 | headers().appendTo(output);
44 | output.append("\r\n");
45 | }
46 |
47 | /**
48 | * Parses a HTTP request while leniently allowing common deviations from the standard.
49 | */
50 | public static HttpRequest parse(ReadableByteChannel channel) throws IOException {
51 | ByteBuffer buffer = ByteBuffer.allocate(8192);
52 | buffer.flip();
53 | return parse(channel, buffer);
54 | }
55 |
56 | /**
57 | * Parses a HTTP request while leniently allowing common deviations from the standard.
58 | */
59 | public static HttpRequest parse(ReadableByteChannel channel, ByteBuffer buffer) throws IOException {
60 | return parse(channel, buffer, null);
61 | }
62 |
63 | /**
64 | * Parses a HTTP request while strictly rejecting deviations from the standard.
65 | */
66 | public static HttpRequest parseStrictly(ReadableByteChannel channel, ByteBuffer buffer) throws IOException {
67 | return parse(channel, buffer, null, true);
68 | }
69 |
70 | static HttpRequest parse(ReadableByteChannel channel, ByteBuffer buffer, WritableByteChannel copyTo) throws IOException {
71 | return parse(channel, buffer, copyTo, false);
72 | }
73 |
74 | private static HttpRequest parse(ReadableByteChannel channel, ByteBuffer buffer, WritableByteChannel copyTo, boolean strict) throws IOException {
75 | HttpParser parser = new HttpParser();
76 | if (strict) {
77 | parser.strictRequest();
78 | } else {
79 | parser.lenientRequest();
80 | }
81 | ByteArrayOutputStream headerBuffer = new ByteArrayOutputStream();
82 | parser.parse(channel, buffer, Channels.newChannel(headerBuffer));
83 | byte[] headerBytes = headerBuffer.toByteArray();
84 | if (headerBytes.length == 0) throw new EOFException();
85 | if (copyTo != null) {
86 | copyTo.write(ByteBuffer.wrap(headerBytes));
87 | copyTo.write(buffer.duplicate());
88 | }
89 | MessageHeaders headers = parser.headers();
90 | Long contentLength;
91 | try {
92 | contentLength = headers.first("Content-Length").map(Long::parseLong).orElse(null);
93 | } catch (NumberFormatException e) {
94 | if (strict) throw new IOException("Invalid Content-Length header", e);
95 | contentLength = null;
96 | }
97 | LengthedBody body = LengthedBody.createFromContentLength(channel, buffer, contentLength);
98 | HttpRequest request = new HttpRequest(parser.method(), parser.target(), parser.version(), headers, body);
99 | request.serializedHeader = headerBytes;
100 | return request;
101 | }
102 |
103 | public static class Builder extends AbstractBuilder {
104 | private final String method;
105 | private final String target;
106 |
107 | public Builder(String method, String target) {
108 | super();
109 | this.method = method;
110 | this.target = target;
111 | }
112 |
113 | /**
114 | * Create a new HTTP request builder from a URI.
115 | *
116 | * The request target will be set to the path and query of the URI.
117 | * The Host header will be set to the host and port of the URI.
118 | */
119 | public Builder(String method, URI uri) {
120 | this(method, uri.getRawQuery() == null ? uri.getRawPath() : uri.getRawPath() + "?" + uri.getRawQuery());
121 | setHeader("Host", uri.getPort() == -1 ? uri.getHost() : uri.getHost() + ":" + uri.getPort());
122 | }
123 |
124 | public HttpRequest build() {
125 | return new HttpRequest(method, target, version, new MessageHeaders(headerMap), makeBody());
126 | }
127 | }
128 | }
129 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/HttpResponse.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc;
7 |
8 | import java.io.ByteArrayOutputStream;
9 | import java.io.IOException;
10 | import java.nio.ByteBuffer;
11 | import java.nio.channels.Channels;
12 | import java.nio.channels.ReadableByteChannel;
13 | import java.nio.channels.SeekableByteChannel;
14 | import java.nio.channels.WritableByteChannel;
15 |
16 | public class HttpResponse extends HttpMessage {
17 | private final int status;
18 | private final String reason;
19 |
20 | HttpResponse(int status, String reason, MessageVersion version, MessageHeaders headers, MessageBody body) {
21 | super(version, headers, body);
22 | this.status = status;
23 | this.reason = reason;
24 | }
25 |
26 | @Override
27 | void serializeHeaderTo(Appendable output) throws IOException {
28 | output.append(version().toString());
29 | output.append(' ');
30 | output.append(Integer.toString(status));
31 | output.append(' ');
32 | output.append(reason);
33 | output.append("\r\n");
34 | headers().appendTo(output);
35 | output.append("\r\n");
36 | }
37 |
38 | /**
39 | * Parses a HTTP response while leniently allowing common deviations from the standard.
40 | */
41 | public static HttpResponse parse(ReadableByteChannel channel) throws IOException {
42 | return parse(channel, null);
43 | }
44 |
45 | /**
46 | * Parses a HTTP response while strictly rejecting deviations from the standard.
47 | */
48 | public static HttpResponse parseStrictly(ReadableByteChannel channel) throws IOException {
49 | return parse(channel, null, true, false);
50 | }
51 |
52 | static HttpResponse parse(ReadableByteChannel channel, WritableByteChannel copyTo) throws IOException {
53 | return parse(channel, copyTo, false, false);
54 | }
55 |
56 | public static HttpResponse parseWithoutBody(ReadableByteChannel channel, WritableByteChannel copyTo) throws IOException {
57 | return parse(channel, copyTo, false, true);
58 | }
59 |
60 | private static HttpResponse parse(ReadableByteChannel channel, WritableByteChannel copyTo, boolean strict, boolean withoutBody) throws IOException {
61 | ByteArrayOutputStream headerBuffer = new ByteArrayOutputStream();
62 | ByteBuffer buffer = ByteBuffer.allocate(8192);
63 | buffer.flip();
64 | HttpParser parser = new HttpParser();
65 | if (strict) {
66 | parser.strictResponse();
67 | } else {
68 | parser.lenientResponse();
69 | }
70 | parser.parse(channel, buffer, Channels.newChannel(headerBuffer));
71 | byte[] headerBytes = headerBuffer.toByteArray();
72 | if (copyTo != null) {
73 | copyTo.write(ByteBuffer.wrap(headerBytes));
74 | copyTo.write(buffer.duplicate());
75 | }
76 | MessageHeaders headers = parser.headers();
77 | MessageBody body;
78 | if (withoutBody) {
79 | body = MessageBody.empty();
80 | } else if (headers.contains("Transfer-Encoding", "chunked")) {
81 | ChunkedBody chunkedBody = new ChunkedBody(channel, buffer);
82 | if (strict) {
83 | chunkedBody.strict();
84 | }
85 | body = chunkedBody;
86 | } else {
87 | Long contentLength;
88 | try {
89 | contentLength = headers.first("Content-Length")
90 | .map(Long::parseLong)
91 | .orElse(null);
92 | } catch (NumberFormatException e) {
93 | if (strict) throw new IOException("Invalid Content-Length header", e);
94 | contentLength = null;
95 | }
96 | body = LengthedBody.createFromContentLength(channel, buffer, contentLength);
97 | }
98 | HttpResponse response = new HttpResponse(parser.status(), parser.reason(), parser.version(), headers, body);
99 | response.serializedHeader = headerBytes;
100 | return response;
101 | }
102 |
103 |
104 | /**
105 | * The 3 digit response status code.
106 | */
107 | public int status() {
108 | return status;
109 | }
110 |
111 | /**
112 | * The resposne status reason phrase.
113 | */
114 | public String reason() {
115 | return reason;
116 | }
117 |
118 | public static class Builder extends HttpMessage.AbstractBuilder {
119 | private final int status;
120 | private final String reasonPhrase;
121 |
122 | public Builder(int status, String reasonPhrase) {
123 | super();
124 | this.status = status;
125 | this.reasonPhrase = reasonPhrase;
126 | }
127 |
128 | @Override
129 | public HttpResponse build() {
130 | return new HttpResponse(status, reasonPhrase, version, new MessageHeaders(headerMap), makeBody());
131 | }
132 | }
133 | }
134 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/IOUtils.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc;
7 |
8 | import javax.net.ssl.SSLSocketFactory;
9 | import java.io.IOException;
10 | import java.io.InputStream;
11 | import java.io.OutputStream;
12 | import java.net.Socket;
13 | import java.nio.ByteBuffer;
14 | import java.nio.channels.ReadableByteChannel;
15 | import java.util.Arrays;
16 | import java.util.Objects;
17 |
18 | /**
19 | * This class is public only due to technical constraints. Please don't depend on it your own code.
20 | */
21 | public final class IOUtils {
22 |
23 | /**
24 | * Transfers as many bytes as possible from src to dst.
25 | * @return the number of bytes transferred.
26 | */
27 | static int transfer(ByteBuffer src, ByteBuffer dst) {
28 | return transferExactly(src, dst, Math.min(src.remaining(), dst.remaining()));
29 | }
30 |
31 | /**
32 | * Transfers up to limits from src to dst.
33 | * @return the number of bytes transferred.
34 | */
35 | static int transfer(ByteBuffer src, ByteBuffer dst, long limit) {
36 | return transferExactly(src, dst, (int)Math.min(Math.min(src.remaining(), dst.remaining()), limit));
37 | }
38 |
39 | private static int transferExactly(ByteBuffer src, ByteBuffer dst, int n) {
40 | if (src.remaining() > n) {
41 | int savedLimit = src.limit();
42 | try {
43 | src.limit(src.position() + n);
44 | dst.put(src);
45 | return n;
46 | } finally {
47 | src.limit(savedLimit);
48 | }
49 | }
50 | dst.put(src);
51 | return n;
52 | }
53 |
54 | static int transfer(ReadableByteChannel src, ByteBuffer dst, long limit) throws IOException {
55 | if (dst.remaining() > limit) {
56 | int savedLimit = dst.limit();
57 | try {
58 | dst.limit(dst.position() + (int) limit);
59 | int n = src.read(dst);
60 | return n;
61 | } finally {
62 | dst.limit(savedLimit);
63 | }
64 | }
65 | return src.read(dst);
66 | }
67 |
68 | static ReadableByteChannel prefixChannel(ByteBuffer prefix, ReadableByteChannel channel) {
69 | return new ReadableByteChannel() {
70 | @Override
71 | public int read(ByteBuffer byteBuffer) throws IOException {
72 | int n = 0;
73 | if (prefix.hasRemaining()) {
74 | n += IOUtils.transfer(prefix, byteBuffer);
75 | }
76 | if (byteBuffer.hasRemaining()) {
77 | n += channel.read(byteBuffer);
78 | }
79 | return n;
80 | }
81 |
82 | @Override
83 | public boolean isOpen() {
84 | return channel.isOpen();
85 | }
86 |
87 | @Override
88 | public void close() throws IOException {
89 | channel.close();
90 | }
91 | };
92 | }
93 |
94 | public static void copy(InputStream inputStream, OutputStream outputStream) throws IOException {
95 | byte[] buffer = new byte[8192];
96 | while (true) {
97 | int n = inputStream.read(buffer);
98 | if (n < 0) break;
99 | outputStream.write(buffer, 0, n);
100 | }
101 | }
102 |
103 | public static ReadableByteChannel gunzipChannel(ReadableByteChannel gzipped) throws IOException {
104 | ByteBuffer buffer = ByteBuffer.allocate(8192);
105 | buffer.flip();
106 | return new GunzipChannel(gzipped, buffer);
107 | }
108 |
109 | public static ReadableByteChannel inflateChannel(ReadableByteChannel deflated) throws IOException {
110 | ByteBuffer buffer = ByteBuffer.allocate(8192);
111 | buffer.flip();
112 | return new InflateChannel(deflated, buffer);
113 | }
114 |
115 | static Socket connect(String scheme, String host, int port) throws IOException {
116 | Objects.requireNonNull(host);
117 | if ("http".equalsIgnoreCase(scheme)) {
118 | return new Socket(host, port < 0 ? 80 : port);
119 | } else if ("https".equalsIgnoreCase(scheme)) {
120 | return SSLSocketFactory.getDefault().createSocket(host, port < 0 ? 443 : port);
121 | } else {
122 | throw new IllegalArgumentException("Unsupported URI scheme: " + scheme);
123 | }
124 | }
125 |
126 | public static byte[] readNBytes(InputStream stream, int n) throws IOException {
127 | byte[] buffer = new byte[n];
128 | for (int remaining = n; remaining > 0;) {
129 | int read = stream.read(buffer, buffer.length - remaining, remaining);
130 | if (read < 0) {
131 | return Arrays.copyOf(buffer, buffer.length - remaining);
132 | }
133 | remaining -= read;
134 | }
135 | return buffer;
136 | }
137 | }
138 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/InflateChannel.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2020 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc;
7 |
8 | import java.io.IOException;
9 | import java.nio.ByteBuffer;
10 | import java.nio.channels.ReadableByteChannel;
11 | import java.util.zip.DataFormatException;
12 | import java.util.zip.Inflater;
13 | import java.util.zip.ZipException;
14 |
/**
 * A ReadableByteChannel inflating deflate-compressed content on read. Used to
 * uncompress HTTP payload with header <code>Content-Encoding: deflate</code>.
 */
public class InflateChannel implements ReadableByteChannel {

    private final ReadableByteChannel channel;
    private final ByteBuffer buffer;
    private final Inflater inflater = new Inflater(true);
    private boolean closed;

    /**
     * @param channel source of the compressed bytes
     * @param buffer  array-backed buffer, in read mode, used to stage compressed input
     * @throws IllegalArgumentException if the buffer has no accessible backing array
     */
    public InflateChannel(ReadableByteChannel channel, ByteBuffer buffer) throws IllegalArgumentException {
        if (!buffer.hasArray()) {
            throw new IllegalArgumentException("ByteBuffer must be array-backed and writable");
        }
        this.channel = channel;
        this.buffer = buffer;
    }

    /**
     * Inflates data into dest.
     *
     * @return the number of bytes written to dest, 0 if nothing could be produced right now,
     *         or -1 once the end of the deflate stream has been reached
     * @throws java.io.EOFException if the underlying channel ends before the deflate stream does
     * @throws ZipException if the compressed data is malformed
     */
    @Override
    public int read(ByteBuffer dest) throws IOException {
        if (closed) {
            throw new java.nio.channels.ClosedChannelException();
        }
        if (dest.isReadOnly()) {
            // fail before consuming any compressed input
            throw new java.nio.ReadOnlyBufferException();
        }
        while (true) {
            if (inflater.finished()) {
                return -1;
            }
            if (inflater.needsInput() && !refill()) {
                return 0; // the underlying channel had nothing to offer right now
            }
            int n = inflate(dest);
            if (n > 0 || !dest.hasRemaining()) {
                return n;
            }
            if (inflater.needsDictionary()) {
                // no dictionary can be supplied; fail rather than spin
                throw new ZipException("deflate stream requires a preset dictionary");
            }
            // nothing produced yet: either the stream just finished or more input is needed
        }
    }

    /**
     * Hands the staged compressed bytes to the inflater, reading more from the channel when the
     * staging buffer is empty.
     *
     * @return false if the channel supplied no bytes
     */
    private boolean refill() throws IOException {
        if (!buffer.hasRemaining()) {
            int n;
            buffer.compact();
            try {
                n = channel.read(buffer);
            } finally {
                buffer.flip(); // always return to read mode, even if the channel throws
            }
            if (n < 0) {
                // the original ignored this, so a truncated stream returned 0 forever
                throw new java.io.EOFException("unexpected end of deflate stream");
            }
            if (n == 0) {
                return false;
            }
        }
        inflater.setInput(buffer.array(), buffer.arrayOffset() + buffer.position(), buffer.remaining());
        return true;
    }

    /**
     * Runs the inflater once, writing into dest and advancing the staging buffer past consumed input.
     */
    private int inflate(ByteBuffer dest) throws ZipException {
        try {
            int n;
            if (dest.hasArray()) {
                n = inflater.inflate(dest.array(), dest.arrayOffset() + dest.position(), dest.remaining());
                dest.position(dest.position() + n);
            } else {
                // direct buffers expose no array, so go through a bounded temporary one
                byte[] tmp = new byte[Math.min(dest.remaining(), 8192)];
                n = inflater.inflate(tmp, 0, tmp.length);
                dest.put(tmp, 0, n);
            }
            // getRemaining() is the unconsumed tail of what was passed to setInput
            buffer.position(buffer.limit() - inflater.getRemaining());
            return n;
        } catch (DataFormatException e) {
            ZipException zipException = new ZipException(e.getMessage());
            zipException.initCause(e);
            throw zipException;
        }
    }

    @Override
    public boolean isOpen() {
        return !closed && channel.isOpen();
    }

    /**
     * Closes the underlying channel and releases the inflater's resources.
     */
    @Override
    public void close() throws IOException {
        closed = true;
        try {
            channel.close();
        } finally {
            inflater.end();
        }
    }
}
71 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/MessageBody.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc;
2 |
3 | import java.io.ByteArrayInputStream;
4 | import java.io.IOException;
5 | import java.io.InputStream;
6 | import java.nio.ByteBuffer;
7 | import java.nio.channels.Channels;
8 | import java.nio.channels.ReadableByteChannel;
9 |
10 | public abstract class MessageBody extends MessageParser implements ReadableByteChannel {
11 |
12 | MessageBody() {
13 | }
14 |
15 | public static MessageBody empty() {
16 | return LengthedBody.create(Channels.newChannel(new ByteArrayInputStream(new byte[0])),
17 | ByteBuffer.allocate(0), 0);
18 | }
19 |
20 | /**
21 | * Returns the length of the body. This may be less than the Content-Length header if the record was truncated.
22 | * Returns -1 if the length cannot be determined (such as when chunked encoding is used).
23 | */
24 | public long size() throws IOException {
25 | return -1;
26 | }
27 |
28 | public abstract long position() throws IOException;
29 |
30 | public InputStream stream() throws IOException {
31 | return Channels.newInputStream(this);
32 | }
33 |
34 | public void consume() throws IOException {
35 | ByteBuffer buffer = ByteBuffer.allocate(8192);
36 | while (read(buffer) >= 0) {
37 | buffer.clear();
38 | }
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/MessageHeaders.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc;
7 |
8 | import java.io.IOException;
9 | import java.nio.ByteBuffer;
10 | import java.nio.channels.ReadableByteChannel;
11 | import java.nio.charset.StandardCharsets;
12 | import java.util.*;
13 | import java.util.regex.Pattern;
14 |
15 | import static java.util.Collections.emptyList;
16 |
17 | public class MessageHeaders {
18 | private static Pattern COMMA_SEPARATOR = Pattern.compile("[ \t]*,[ \t]*");
19 | private Map> map;
20 |
21 | public static MessageHeaders of(String... keysAndValues) {
22 | if (keysAndValues.length % 2 != 0) {
23 | throw new IllegalArgumentException("an even number keysAndValues must be provided");
24 | }
25 | Map> map = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
26 | for (int i = 0; i < keysAndValues.length; i += 2) {
27 | map.computeIfAbsent(keysAndValues[i], k -> new ArrayList<>()).add(keysAndValues[i + 1]);
28 | }
29 | return new MessageHeaders(map);
30 | }
31 |
32 | MessageHeaders(Map> map) {
33 | map.replaceAll((name, values) -> Collections.unmodifiableList(values));
34 | this.map = Collections.unmodifiableMap(map);
35 | }
36 |
37 | /**
38 | * Returns the value of a single-valued header field. Throws an exception if there are more than one.
39 | */
40 | public Optional sole(String name) {
41 | List values = all(name);
42 | if (values.size() > 1) {
43 | throw new IllegalArgumentException("record has " + values.size() + " " + name + " headers");
44 | }
45 | return values.stream().findFirst();
46 | }
47 |
48 | /**
49 | * Returns the first value of a header field.
50 | */
51 | public Optional first(String name) {
52 | return all(name).stream().findFirst();
53 | }
54 |
55 | /**
56 | * Returns all the values of a header field.
57 | */
58 | public List all(String name) {
59 | return map.getOrDefault(name, emptyList());
60 | }
61 |
62 | /**
63 | * Returns a map of header fields to their values.
64 | */
65 | public Map> map() {
66 | return map;
67 | }
68 |
69 | /**
70 | * Returns true when the given header value is present.
71 | *
72 | * Fields are interpreted as a comma-separated list and the value is compared case-insensitively.
73 | */
74 | public boolean contains(String name, String value) {
75 | for (String rawValue : all(name)) {
76 | for (String splitValue : COMMA_SEPARATOR.split(rawValue)) {
77 | if (splitValue.equalsIgnoreCase(value)) {
78 | return true;
79 | }
80 | }
81 | }
82 | return false;
83 | }
84 |
85 | @Override
86 | public String toString() {
87 | return map.toString();
88 | }
89 |
90 | /**
91 | * Parses application/warc-fields.
92 | */
93 | public static MessageHeaders parse(ReadableByteChannel channel) throws IOException {
94 | WarcParser parser = WarcParser.newWarcFieldsParser();
95 | ByteBuffer buffer = ByteBuffer.allocate(8192);
96 | while (!parser.isFinished()) {
97 | int n = channel.read(buffer);
98 | if (n < 0) {
99 | parser.parse(ByteBuffer.wrap("\r\n\r\n".getBytes(StandardCharsets.US_ASCII)));
100 | break;
101 | }
102 | buffer.flip();
103 | parser.parse(buffer);
104 | if (parser.isError()) throw new ParsingException("invalid WARC fields");
105 | buffer.compact();
106 | }
107 | return parser.headers();
108 | }
109 |
110 | private static final boolean[] ILLEGAL = initIllegalLookup();
111 | private static boolean[] initIllegalLookup() {
112 | boolean[] illegal = new boolean[256];
113 | String separators = "()<>@,;:\\\"/[]?={} \t";
114 | for (int i = 0; i < separators.length(); i++) {
115 | illegal[separators.charAt(i)] = true;
116 | }
117 | for (int i = 0; i < 32; i++) { // control characters
118 | illegal[i] = true;
119 | }
120 | return illegal;
121 | }
122 |
123 | static String format(Map> map) {
124 | StringBuilder out = new StringBuilder();
125 | for (Map.Entry> entry : map.entrySet()) {
126 | String name = entry.getKey();
127 | for (String value : entry.getValue()) {
128 | out.append(name).append(": ").append(value).append("\r\n");
129 | }
130 | }
131 | return out.toString();
132 | }
133 |
134 | public void appendTo(Appendable appendable) throws IOException {
135 | for (Map.Entry> entry : map.entrySet()) {
136 | String name = entry.getKey();
137 | for (String value : entry.getValue()) {
138 | appendable.append(name).append(": ").append(value).append("\r\n");
139 | }
140 | }
141 | }
142 | }
143 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/MessageParser.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2020 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc;
7 |
8 | import java.nio.ByteBuffer;
9 | import java.util.function.Consumer;
10 |
/**
 * Shared support for message parsers: warning dispatch and error-context formatting.
 */
public class MessageParser {
    private Consumer<String> warningHandler;

    /**
     * Passes the message to the registered warning handler, if there is one.
     */
    protected void emitWarning(String message) {
        if (warningHandler != null) {
            warningHandler.accept(message);
        }
    }

    /**
     * Registers the handler that receives warnings; replaces any previous handler.
     */
    void onWarning(Consumer<String> warningHandler) {
        this.warningHandler = warningHandler;
    }

    /**
     * Returns up to length characters either side of position, with a marker inserted at position
     * and "..." wherever input was cut off.
     */
    protected static String getErrorContext(String input, int position, int length) {
        StringBuilder context = new StringBuilder();

        int start = Math.max(0, position - length);
        if (start > 0) {
            // only show an ellipsis when leading text was actually omitted
            context.append("...");
        }
        int end = Math.min(input.length(), (position + length));

        context.append(input, start, position);
        context.append("<-- HERE -->");
        context.append(input, position, end);

        if (end < input.length()) {
            context.append("...");
        }

        return context.toString();
    }

    /**
     * Returns up to length bytes either side of position, with a marker inserted at position and
     * "..." wherever data was cut off. Printable ASCII is shown as-is; tab, LF and CR are shown
     * as \t, \n and \r; any other byte as a two-digit \xNN escape. The buffer itself is not modified.
     */
    protected static String getErrorContext(ByteBuffer buffer, int position, int length) {
        StringBuilder context = new StringBuilder();

        int start = Math.max(0, position - length);
        if (start > 0) {
            // only show an ellipsis when leading bytes were actually omitted
            context.append("...");
        }

        ByteBuffer copy = buffer.duplicate();
        copy.position(start);

        int end = position + length;
        if (end < buffer.limit()) {
            copy.limit(end);
        }

        while (true) {
            if (copy.position() == position) {
                context.append("<-- HERE -->");
            }
            if (!copy.hasRemaining()) break;
            // mask to an unsigned value so bytes >= 0x80 format as \x80..\xff, not \xffffff80
            int c = copy.get() & 0xff;
            if (c < 0x7f && c >= 0x20) {
                context.append((char) c);
            } else if (c == 0x09) {
                context.append("\\t");
            } else if (c == 0x0a) {
                context.append("\\n");
            } else if (c == 0x0d) {
                context.append("\\r");
            } else {
                context.append(String.format("\\x%02x", c));
            }
        }

        if (copy.position() < buffer.limit()) {
            context.append("...");
        }

        return context.toString();
    }

}
91 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/MessageVersion.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc;
7 |
8 | import java.util.Objects;
9 |
/**
 * A protocol name together with a major and minor version number.
 */
public final class MessageVersion {
    public static final MessageVersion GEMINI = new MessageVersion("gemini");
    public static final MessageVersion HTTP_1_0 = new MessageVersion("HTTP", 1, 0);
    public static final MessageVersion HTTP_1_1 = new MessageVersion("HTTP", 1, 1);
    public static final MessageVersion WARC_1_0 = new MessageVersion("WARC", 1, 0);
    public static final MessageVersion WARC_1_1 = new MessageVersion("WARC", 1, 1);
    public static final MessageVersion ARC_1_1 = new MessageVersion("ARC", 1, 1);

    private final String protocol;
    private final int major;
    private final int minor;

    /**
     * Creates a version with major and minor both set to zero.
     */
    public MessageVersion(String protocol) {
        this(protocol, 0, 0);
    }

    public MessageVersion(String protocol, int major, int minor) {
        this.protocol = protocol;
        this.major = major;
        this.minor = minor;
    }

    public String getProtocol() {
        return protocol;
    }

    /**
     * Verifies that this version belongs to the expected protocol.
     *
     * @throws IllegalArgumentException if the protocol differs
     */
    void requireProtocol(String expectedProtocol) {
        if (protocol.equals(expectedProtocol)) {
            return;
        }
        throw new IllegalArgumentException("Expected a version of " + expectedProtocol + " but got " + this);
    }

    public int getMajor() {
        return major;
    }

    public int getMinor() {
        return minor;
    }

    @Override
    public boolean equals(Object o) {
        if (o == this) {
            return true;
        }
        // the class is final, so instanceof is equivalent to an exact class comparison
        if (!(o instanceof MessageVersion)) {
            return false;
        }
        MessageVersion other = (MessageVersion) o;
        return other.major == major
                && other.minor == minor
                && Objects.equals(protocol, other.protocol);
    }

    @Override
    public int hashCode() {
        return Objects.hash(protocol, major, minor);
    }

    /**
     * Returns "protocol/major.minor", or just the protocol when both numbers are zero.
     */
    @Override
    public String toString() {
        boolean unversioned = major == 0 && minor == 0;
        return unversioned ? protocol : protocol + "/" + major + "." + minor;
    }
}
73 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/ParsingException.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc;
7 |
8 | import java.io.IOException;
9 |
/**
 * An IOException signalling that input could not be parsed.
 */
public class ParsingException extends IOException {
    /**
     * @param message description of the parse failure
     */
    public ParsingException(String message) {
        super(message);
    }
}
15 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/WarcCaptureRecord.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc;
7 |
8 | import java.io.IOException;
9 | import java.net.Inet6Address;
10 | import java.net.InetAddress;
11 | import java.net.URI;
12 | import java.nio.ByteBuffer;
13 | import java.nio.channels.ReadableByteChannel;
14 | import java.util.List;
15 | import java.util.Optional;
16 |
17 | import static java.util.stream.Collectors.toList;
18 |
19 | /**
20 | * A type of WARC record created as part of a web capture event.
21 | */
22 | public abstract class WarcCaptureRecord extends WarcTargetRecord {
23 | WarcCaptureRecord(MessageVersion version, MessageHeaders headers, MessageBody body) {
24 | super(version, headers, body);
25 | }
26 |
27 | /**
28 | * The IP address of the server involved in the capture event this record belongs to.
29 | */
30 | public Optional ipAddress() {
31 | return headers().sole("WARC-IP-Address").map(InetAddresses::forString);
32 | }
33 |
34 | /**
35 | * The IDs of other records created during the same capture event as this one.
36 | */
37 | public List concurrentTo() {
38 | return headers().all("WARC-Concurrent-To").stream().map(WarcRecord::parseRecordID).collect(toList());
39 | }
40 |
41 | /**
42 | * Content-Type of the payload.
43 | */
44 | public MediaType payloadType() throws IOException {
45 | return contentType();
46 | }
47 |
48 | public abstract static class AbstractBuilder> extends WarcTargetRecord.Builder {
49 | protected AbstractBuilder(String type) {
50 | super(type);
51 | }
52 |
53 | public B body(MediaType type, Message message) throws IOException {
54 | ByteBuffer header = ByteBuffer.wrap(message.serializeHeader());
55 | ReadableByteChannel channel = IOUtils.prefixChannel(header, message.body());
56 | return body(type, channel, message.body().size() + header.remaining());
57 | }
58 |
59 | public B concurrentTo(URI recordId) {
60 | return addHeader("WARC-Concurrent-To", "<" + recordId.toString() + ">");
61 | }
62 |
63 | public B ipAddress(InetAddress ipAddress) {
64 | return addHeader("WARC-IP-Address", InetAddresses.toAddrString(ipAddress));
65 | }
66 | }
67 | }
68 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/WarcCompression.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc;
7 |
8 | import java.nio.file.Path;
9 |
/**
 * Compression schemes for WARC files.
 */
public enum WarcCompression {
    NONE, GZIP;

    /**
     * Chooses GZIP when the file name ends in ".gz" and NONE otherwise.
     */
    static WarcCompression forPath(Path path) {
        // getFileName() returns null for paths with no name elements, such as a filesystem root
        Path fileName = path.getFileName();
        if (fileName != null && fileName.toString().endsWith(".gz")) {
            return GZIP;
        }
        return NONE;
    }
}
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/WarcContinuation.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc;
7 |
8 | import java.net.URI;
9 | import java.util.Optional;
10 |
11 | public class WarcContinuation extends WarcTargetRecord {
12 | WarcContinuation(MessageVersion version, MessageHeaders headers, MessageBody body) {
13 | super(version, headers, body);
14 | }
15 |
16 | /**
17 | * The id of the first record in the series of segments this record is part of.
18 | */
19 | public URI segmentOriginId() {
20 | return headers().sole("WARC-Segment-Origin-ID").map(WarcRecord::parseRecordID).get();
21 | }
22 |
23 | /**
24 | * The total length of the content blocks of all segments added together.
25 | */
26 | public Optional segmentTotalLength() {
27 | return headers().sole("WARC-Segment-Total-Length").map(Long::valueOf);
28 | }
29 |
30 | public static class Builder extends WarcTargetRecord.Builder {
31 | public Builder() {
32 | super("continuation");
33 | }
34 |
35 | public Builder segmentOriginId(URI recordId) {
36 | return setHeader("WARC-Segment-Origin-Id", WarcRecord.formatId(recordId));
37 | }
38 |
39 | public Builder segmentTotalLength(long segmentTotalLength) {
40 | return setHeader("WARC-Segment-Total-Length", Long.toString(segmentTotalLength));
41 | }
42 |
43 | @Override
44 | public WarcContinuation build() {
45 | return build(WarcContinuation::new);
46 | }
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/WarcConversion.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc;
7 |
8 | import java.io.IOException;
9 | import java.net.URI;
10 | import java.util.Optional;
11 |
12 | public class WarcConversion extends WarcTargetRecord {
13 | WarcConversion(MessageVersion version, MessageHeaders headers, MessageBody body) {
14 | super(version, headers, body);
15 | }
16 |
17 | public Optional payload() throws IOException {
18 | return Optional.of(new WarcPayload(body()) {
19 | @Override
20 | public MediaType type() {
21 | return contentType();
22 | }
23 |
24 | @Override
25 | Optional identifiedType() {
26 | return Optional.empty();
27 | }
28 |
29 | @Override
30 | public Optional digest() {
31 | Optional payloadDigest = payloadDigest();
32 | return payloadDigest.isPresent() ? payloadDigest : blockDigest();
33 | }
34 | });
35 | }
36 |
37 | /**
38 | * The record id of the source of the conversion.
39 | */
40 | public Optional refersTo() {
41 | return headers().sole("WARC-Refers-To").map(WarcRecord::parseRecordID);
42 | }
43 |
44 | public static class Builder extends WarcTargetRecord.Builder {
45 | public Builder() {
46 | super("conversion");
47 | }
48 |
49 | public Builder refersTo(URI recordId) {
50 | return addHeader("WARC-Refers-To", WarcRecord.formatId(recordId));
51 | }
52 |
53 | @Override
54 | public WarcConversion build() {
55 | return build(WarcConversion::new);
56 | }
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/WarcFilter.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc;
2 |
3 | import java.util.function.Predicate;
4 |
5 | /**
6 | * Filter expressions for matching WARC records.
7 | *
8 | * Simplified grammar for the expression language:
9 | *
10 | * {@code
11 | * expression = "(" expression ")" ; grouping
12 | * | "!(" expression ")" ; boolean NOT
13 | * | expression "&&" expression ; boolean AND
14 | * | expression "||" expression ; boolean OR
15 | * | field "==" string ; string equality
16 | * | field "!=" string ; string inequality
17 | * | field "=~" string ; regex match
18 | * | field "!~" string ; regex non-match
19 | * | field "==" number ; integer equality
20 | * | field "!=" number ; integer inequality
21 | * | field "<" number ; integer less-than
22 | * | field "<=" number ; integer less-than-or-equal
23 | * | field ">" number ; integer greater-than
24 | * | field ">=" number ; integer greater-than-or-equal
25 | *
26 | * field = ":status" ; HTTP response code psuedo-field
27 | * | "http:" field-name ; HTTP header field
28 | * | field-name ; WARC header field
29 | *
30 | * string = '"' [^"]* '"'
31 | * }
32 | *
33 | * Whitespace outside a string or field is ignored. Fields that do not exist are treated as an empty string when subject
34 | * to string comparison. Fields that do not contain a valid number are treated as zero when subject to integer
35 | * comparison.
36 | */
37 | public class WarcFilter implements Predicate {
38 | private final String expression;
39 | private final Predicate predicate;
40 |
41 | private WarcFilter(String expression, Predicate predicate) {
42 | this.expression = expression;
43 | this.predicate = predicate;
44 | }
45 |
46 | /**
47 | * Compiles a filter expression from a string.
48 | *
49 | * @throws WarcFilterException when the expression contains a syntax error
50 | */
51 | public static WarcFilter compile(String expression) {
52 | return new WarcFilter(expression, new WarcFilterCompiler(expression).predicate());
53 | }
54 |
55 | @Override
56 | public boolean test(WarcRecord warcRecord) {
57 | return predicate.test(warcRecord);
58 | }
59 |
60 | @Override
61 | public String toString() {
62 | return expression;
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/WarcFilterException.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc;
2 |
3 | import java.util.Arrays;
4 |
/**
 * Thrown when a syntax error is encountered when compiling a filter expression.
 */
public class WarcFilterException extends RuntimeException {
    private final String input;
    private final int position;

    /**
     * @param message  description of the error
     * @param position character position of the error within the input
     * @param input    the expression containing the error
     */
    public WarcFilterException(String message, int position, String input) {
        super(message);
        this.input = input;
        this.position = position;
    }

    /**
     * Returns the character position of the error within the input.
     */
    public int position() {
        return position;
    }

    /**
     * Returns the expression containing the error.
     */
    public String input() {
        return input;
    }

    /**
     * Returns a user-friendly error message: the input, a caret on the next line under the
     * error position, and then the error message.
     */
    public String prettyPrint() {
        char[] padding = new char[position];
        Arrays.fill(padding, ' ');
        StringBuilder out = new StringBuilder();
        out.append(input).append('\n');
        out.append(padding).append("^\nError: ");
        out.append(getMessage());
        return out.toString();
    }
}
41 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/WarcFilterLexer.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc;
2 |
3 | import java.util.regex.Matcher;
4 | import java.util.regex.Pattern;
5 |
6 | class WarcFilterLexer {
7 | private static Pattern REGEX = Pattern.compile("([a-zA-Z0-9:_-]+)|(&&|[|][|]|!=|==|!~|=~|[<>]=?|!?[(]|[)])|\"([^\"]*)\"|(\\s+)");
8 | private static final int TOKEN = 1, OPERATOR = 2, STRING = 3, WHITESPACE = 4;
9 |
10 | private final String input;
11 | private final Matcher matcher;
12 |
13 | WarcFilterLexer(String input) {
14 | this.input = input;
15 | this.matcher = REGEX.matcher(input);
16 | }
17 |
18 | Object stringOrNumber() {
19 | Object value = peek().group(STRING);
20 | if (value == null) {
21 | String token = matcher.group(TOKEN);
22 | if (token != null) {
23 | try {
24 | value = Long.parseLong(matcher.group(TOKEN));
25 | } catch (NumberFormatException e) {
26 | // not a number
27 | }
28 | }
29 | }
30 | if (value == null) throw error("expected string or integer");
31 | advance();
32 | return value;
33 | }
34 |
35 | String string() {
36 | String str = peek().group(STRING);
37 | if (str == null) throw error("expected string");
38 | advance();
39 | return str;
40 | }
41 |
42 | String token() {
43 | String field = peek().group(TOKEN);
44 | if (field == null) throw error("expected field name");
45 | advance();
46 | return field;
47 | }
48 |
49 | String operator() {
50 | String operator = peekOperator();
51 | if (operator == null) throw error("expected operator");
52 | advance();
53 | return operator;
54 | }
55 |
56 | String peekOperator() {
57 | return peek().group(OPERATOR);
58 | }
59 |
60 | private Matcher peek() {
61 | while (true) {
62 | if (atEnd()) throw error("unexpected end of input");
63 | if (!matcher.lookingAt()) throw error("syntax error");
64 | if (matcher.group(WHITESPACE) == null) return matcher;
65 | advance();
66 | }
67 | }
68 |
69 | void advance() {
70 | matcher.region(matcher.end(), matcher.regionEnd());
71 | }
72 |
73 | WarcFilterException error(String message) {
74 | return new WarcFilterException(message, matcher.regionStart(), input);
75 | }
76 |
77 | boolean atEnd() {
78 | return matcher.regionStart() == matcher.regionEnd();
79 | }
80 | }
81 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/WarcMetadata.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc;
7 |
8 | import java.io.IOException;
9 | import java.net.URI;
10 | import java.util.List;
11 | import java.util.Map;
12 | import java.util.Optional;
13 |
14 | import static java.nio.charset.StandardCharsets.UTF_8;
15 |
16 | public class WarcMetadata extends WarcCaptureRecord {
17 | private MessageHeaders fields;
18 |
19 | WarcMetadata(MessageVersion version, MessageHeaders headers, MessageBody body) {
20 | super(version, headers, body);
21 | }
22 |
23 | /**
24 | * Metadata records do not have a payload so this method always returns empty.
25 | */
26 | @Override
27 | public Optional payload() throws IOException {
28 | return Optional.empty();
29 | }
30 |
31 | /**
32 | * Parses the body as application/warc-fields.
33 | *
34 | * This is a convenience method for Headers.parse(metadata.body())
.
35 | */
36 | public MessageHeaders fields() throws IOException {
37 | if (fields == null) {
38 | fields = MessageHeaders.parse(body());
39 | }
40 | return fields;
41 | }
42 |
43 | public static class Builder extends AbstractBuilder {
44 | public Builder() {
45 | super("metadata");
46 | }
47 |
48 | @Override
49 | public WarcMetadata build() {
50 | return build(WarcMetadata::new);
51 | }
52 |
53 | public Builder fields(Map> map) {
54 | return body(MediaType.WARC_FIELDS, MessageHeaders.format(map).getBytes(UTF_8));
55 | }
56 |
57 | public Builder targetURI(String uri) {
58 | addHeader("WARC-Target-URI", uri);
59 | return this;
60 | }
61 |
62 | public Builder targetURI(URI uri) {
63 | return targetURI(uri.toString());
64 | }
65 | }
66 | }
67 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/WarcPayload.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc;
7 |
8 | import java.util.Optional;
9 |
10 | public abstract class WarcPayload {
11 | private final MessageBody body;
12 |
13 | WarcPayload(MessageBody body) {
14 | this.body = body;
15 | }
16 |
17 | public MessageBody body() {
18 | return body;
19 | }
20 |
21 | public abstract MediaType type();
22 |
23 | abstract Optional identifiedType();
24 |
25 | public abstract Optional digest();
26 | }
27 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/WarcRequest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc;
7 |
8 | import java.io.ByteArrayOutputStream;
9 | import java.io.IOException;
10 | import java.net.URI;
11 | import java.nio.ByteBuffer;
12 | import java.nio.channels.Channels;
13 | import java.util.Optional;
14 |
15 | public class WarcRequest extends WarcCaptureRecord {
16 |
17 | private HttpRequest http;
18 |
19 | WarcRequest(MessageVersion version, MessageHeaders headers, MessageBody body) {
20 | super(version, headers, body);
21 | }
22 |
23 | /**
24 | * Parses the content body of this record as HTTP request.
25 | *
26 | * This is a convenience method for HttpRequest.parse(request.body())
.
27 | */
28 | public HttpRequest http() throws IOException {
29 | if (http == null) {
30 | ByteBuffer buffer = ByteBuffer.allocate(8192);
31 | buffer.flip();
32 | MessageBody body = body();
33 | if (body.position() != 0) throw new IllegalStateException("http() cannot be called after reading from body");
34 | if (body instanceof LengthedBody) {
35 | // if we can, save a copy of the raw header and push it back so we don't invalidate body
36 | ByteArrayOutputStream baos = new ByteArrayOutputStream();
37 | LengthedBody lengthed = (LengthedBody) body;
38 | http = HttpRequest.parse(lengthed.discardPushbackOnRead(), buffer, Channels.newChannel(baos));
39 | lengthed.pushback(baos.toByteArray());
40 | } else {
41 | http = HttpRequest.parse(body, buffer);
42 | }
43 | }
44 | return http;
45 | }
46 |
47 | @Override
48 | public MediaType payloadType() throws IOException {
49 | return http().contentType();
50 | }
51 |
52 | public static class Builder extends AbstractBuilder {
53 | public Builder(URI targetURI) {
54 | this(targetURI.toString());
55 | }
56 |
57 | public Builder(String targetURI) {
58 | super("request");
59 | setHeader("WARC-Target-URI", targetURI);
60 | }
61 |
62 | @Override
63 | public WarcRequest build() {
64 | return build(WarcRequest::new);
65 | }
66 |
67 | public Builder body(HttpRequest httpRequest) throws IOException {
68 | return body(MediaType.HTTP_REQUEST, httpRequest);
69 | }
70 | }
71 |
72 | public Optional payload() throws IOException {
73 | if (contentType().base().equals(MediaType.HTTP)) {
74 | return Optional.of(new WarcPayload(http().body()) {
75 |
76 | @Override
77 | public MediaType type() {
78 | return http.contentType();
79 | }
80 |
81 | @Override
82 | Optional identifiedType() {
83 | return identifiedPayloadType();
84 | }
85 |
86 | @Override
87 | public Optional digest() {
88 | return payloadDigest();
89 | }
90 | });
91 | }
92 | return Optional.empty();
93 | }
94 | }
95 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/WarcResource.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc;
7 |
8 | import java.net.URI;
9 |
10 | public class WarcResource extends WarcCaptureRecord {
11 | WarcResource(MessageVersion version, MessageHeaders headers, MessageBody body) {
12 | super(version, headers, body);
13 | }
14 |
15 | public static class Builder extends AbstractBuilder {
16 | public Builder(URI targetURI) {
17 | super("resource");
18 | setHeader("WARC-Target-URI", targetURI.toString());
19 | }
20 |
21 | public Builder() {
22 | super("resource");
23 | }
24 |
25 | @Override
26 | public WarcResource build() {
27 | return build(WarcResource::new);
28 | }
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/WarcResponse.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc;
7 |
8 | import java.io.ByteArrayOutputStream;
9 | import java.io.IOException;
10 | import java.net.URI;
11 | import java.nio.ByteBuffer;
12 | import java.nio.channels.Channels;
13 | import java.util.Optional;
14 |
15 | public class WarcResponse extends WarcCaptureRecord {
16 |
17 | private HttpResponse http;
18 | private GeminiResponse gemini;
19 |
20 | WarcResponse(MessageVersion version, MessageHeaders headers, MessageBody body) {
21 | super(version, headers, body);
22 | }
23 |
24 | /**
25 | * Parses the HTTP response captured by this record.
26 | *
27 | * This is a convenience method for HttpResponse.parse(response.body().channel())
.
28 | */
29 | public HttpResponse http() throws IOException {
30 | if (http == null) {
31 | MessageBody body = body();
32 | if (body.position() != 0) throw new IllegalStateException("http() cannot be called after reading from body");
33 | if (body instanceof LengthedBody) {
34 | // if we can, save a copy of the raw header and push it back so we don't invalidate body
35 | ByteArrayOutputStream baos = new ByteArrayOutputStream();
36 | LengthedBody lengthed = (LengthedBody) body;
37 | http = HttpResponse.parse(lengthed.discardPushbackOnRead(), Channels.newChannel(baos));
38 | lengthed.pushback(baos.toByteArray());
39 | } else {
40 | http = HttpResponse.parse(body);
41 | }
42 | }
43 | return http;
44 | }
45 |
46 | public GeminiResponse gemini() throws IOException {
47 | if (gemini == null) {
48 | MessageBody body = body();
49 | if (body.position() != 0) throw new IllegalStateException("gemini() cannot be called after reading from body");
50 | ByteBuffer buffer = ByteBuffer.allocate(8192);
51 | buffer.flip();
52 | gemini = GeminiResponse.parse(body, buffer);
53 | if (body instanceof LengthedBody) {
54 | ((LengthedBody)body).pushback(gemini.serializeHeader());
55 | }
56 | }
57 | return gemini;
58 | }
59 |
60 | @Override
61 | public MediaType payloadType() throws IOException {
62 | return payload().map(WarcPayload::type).orElse(MediaType.OCTET_STREAM);
63 | }
64 |
65 | public Optional payload() throws IOException {
66 | if (contentType().base().equals(MediaType.HTTP)) {
67 | return Optional.of(new WarcPayload(http().body()) {
68 |
69 | @Override
70 | public MediaType type() {
71 | return http.contentType();
72 | }
73 |
74 | @Override
75 | Optional identifiedType() {
76 | return identifiedPayloadType();
77 | }
78 |
79 | @Override
80 | public Optional digest() {
81 | return payloadDigest();
82 | }
83 | });
84 | } else if (contentType().base().equals(MediaType.GEMINI)) {
85 | return Optional.of(new WarcPayload(gemini().body()) {
86 |
87 | @Override
88 | public MediaType type() {
89 | return gemini.contentType();
90 | }
91 |
92 | @Override
93 | Optional identifiedType() {
94 | return identifiedPayloadType();
95 | }
96 |
97 | @Override
98 | public Optional digest() {
99 | return payloadDigest();
100 | }
101 | });
102 | }
103 | return Optional.empty();
104 | }
105 |
106 | public static class Builder extends AbstractBuilder {
107 | public Builder(URI targetURI) {
108 | this(targetURI.toString());
109 | }
110 |
111 | public Builder(String targetURI) {
112 | super("response");
113 | setHeader("WARC-Target-URI", targetURI);
114 | }
115 |
116 | public Builder body(HttpResponse httpResponse) throws IOException {
117 | return body(MediaType.HTTP_RESPONSE, httpResponse);
118 | }
119 |
120 | @Override
121 | public WarcResponse build() {
122 | return build(WarcResponse::new);
123 | }
124 | }
125 | }
126 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/WarcTargetRecord.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc;
7 |
8 | import java.io.IOException;
9 | import java.net.URI;
10 | import java.util.Optional;
11 |
12 | /**
13 | * A WARC record associated with some target URI.
14 | *
15 | * This class exists solely to differentiate between the {@link Warcinfo} record type and all the other standard
16 | * record types.
17 | */
18 | public abstract class WarcTargetRecord extends WarcRecord {
19 | WarcTargetRecord(MessageVersion version, MessageHeaders headers, MessageBody body) {
20 | super(version, headers, body);
21 | }
22 |
23 | /**
24 | * The URI of the original target resource this record holds information about as an unparsed string.
25 | *
26 | * Strips enclosing angle brackets if present as a compatibility quirk with WARC 1.0.
27 | */
28 | public String target() {
29 | String value = headers().sole("WARC-Target-URI").orElse(null);
30 |
31 | /*
32 | * Quirk: The grammar in the WARC 1.0 standard included angle brackets around the value of WARC-Target-URI.
33 | * This was likely an editing mistake as it was not present in the drafts of the standard, nor in the examples
34 | * or most implementations. The grammar was corrected in WARC 1.1. It is what ended up published as 1.0 though
35 | * and consequently some software in the wild (e.g. Wget) generates WARCs with angle brackets in this field.
36 | */
37 | if (value != null && value.startsWith("<") && value.endsWith(">")) {
38 | return value.substring(1, value.length() - 1);
39 | } else {
40 | return value;
41 | }
42 | }
43 |
44 | /**
45 | * The URI of the original target resource this record holds information about.
46 | *
47 | * This method uses URIs.parseLeniently() to percent encode characters that are rejected by the URI class and so may
48 | * return a value that is not identical to the value of the WARC-Target-URI field. Using {@link #target()} should
49 | * be preferred unless you actually need an instance of the URI class.
50 | */
51 | public URI targetURI() {
52 | return URIs.parseLeniently(target());
53 | }
54 |
55 | /**
56 | * Digest values that were calculated by applying hash functions to payload.
57 | */
58 | public Optional payloadDigest() {
59 | return headers().sole("WARC-Payload-Digest").map(WarcDigest::new);
60 | }
61 |
62 | /**
63 | * A content-type that was identified by an independent check (not just what the server said).
64 | */
65 | public Optional identifiedPayloadType() {
66 | return headers().sole("WARC-Identified-Payload-Type").map(MediaType::parseLeniently);
67 | }
68 |
69 | /**
70 | * Returns the payload of this record if one is present.
71 | *
72 | * This method returns an empty optional when the payload is undefined for this record type or if this library does
73 | * not know how to parse the body in order to extract the payload. If the payload is well defined but
74 | * happens to be zero bytes in length this method still returns a WarcPayload object.
75 | */
76 | public Optional payload() throws IOException {
77 | return Optional.of(new WarcPayload(body()) {
78 | @Override
79 | public MediaType type() {
80 | return contentType();
81 | }
82 |
83 | @Override
84 | Optional identifiedType() {
85 | return Optional.empty();
86 | }
87 |
88 | @Override
89 | public Optional digest() {
90 | Optional payloadDigest = payloadDigest();
91 | return payloadDigest.isPresent() ? payloadDigest : blockDigest();
92 | }
93 | });
94 | }
95 |
96 | /**
97 | * The ID of a {@link Warcinfo} record associated with this record.
98 | */
99 | public Optional warcinfoID() {
100 | return headers().sole("WARC-Warcinfo-ID").map(WarcRecord::parseRecordID);
101 | }
102 |
103 | @Override
104 | public String toString() {
105 | return getClass().getSimpleName() + "<" + date() + " " + target() + ">";
106 | }
107 |
108 | public static abstract class Builder> extends AbstractBuilder {
109 | public Builder(String type) {
110 | super(type);
111 | }
112 |
113 | public B payloadDigest(WarcDigest payloadDigest) {
114 | return addHeader("WARC-Payload-Digest", payloadDigest.prefixedBase32());
115 | }
116 |
117 | public B identifiedPayloadType(String identifiedPayloadType) {
118 | return setHeader("WARC-Identified-Payload-Type", identifiedPayloadType);
119 | }
120 |
121 | public B warcinfoId(URI recordId) {
122 | return addHeader("WARC-Warcinfo-ID", WarcRecord.formatId(recordId));
123 | }
124 |
125 | public B payloadDigest(String algorithm, String value) {
126 | return payloadDigest(new WarcDigest(algorithm, value));
127 | }
128 | }
129 | }
130 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/WarcTruncationReason.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc;
7 |
/**
 * Reasons why the content of a record may have been truncated.
 */
public enum WarcTruncationReason {
    /** No truncation occurred. */
    NOT_TRUNCATED,

    /** The configured maximum length was exceeded. */
    LENGTH,

    /** The configured maximum time was exceeded. */
    TIME,

    /** The network connection was dropped. */
    DISCONNECT,

    /** Some other or unknown reason. */
    UNSPECIFIED
}
34 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/Warcinfo.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc;
7 |
8 | import java.io.IOException;
9 | import java.util.List;
10 | import java.util.Map;
11 | import java.util.Optional;
12 |
13 | import static java.nio.charset.StandardCharsets.UTF_8;
14 |
15 | /**
16 | * The warcinfo record contains information about the web crawl that generated the records following it.
17 | */
18 | public class Warcinfo extends WarcRecord {
19 |
20 | private MessageHeaders fields;
21 |
22 | Warcinfo(MessageVersion version, MessageHeaders headers, MessageBody body) {
23 | super(version, headers, body);
24 | }
25 |
26 | /**
27 | * The name of the file originally containing this warcinfo record.
28 | */
29 | public Optional filename() {
30 | return headers().sole("WARC-Filename");
31 | }
32 |
33 | /**
34 | * Parses the content body as application/warc-fields.
35 | */
36 | public MessageHeaders fields() throws IOException {
37 | if (fields == null) {
38 | fields = MessageHeaders.parse(body());
39 | }
40 | return fields;
41 | }
42 |
43 | public static class Builder extends AbstractBuilder {
44 | public Builder() {
45 | super("warcinfo");
46 | }
47 |
48 | public Builder filename(String filename) {
49 | return setHeader("WARC-Filename", filename);
50 | }
51 |
52 | @Override
53 | public Warcinfo build() {
54 | return build(Warcinfo::new);
55 | }
56 |
57 | public Builder fields(Map> map) {
58 | return body(MediaType.WARC_FIELDS, MessageHeaders.format(map).getBytes(UTF_8));
59 | }
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/cdx/CdxFields.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2021 National Library of Australia
4 | */
5 |
6 | package org.netpreserve.jwarc.cdx;
7 |
8 | import org.netpreserve.jwarc.*;
9 |
10 | import java.time.format.DateTimeFormatter;
11 |
12 | import static java.time.ZoneOffset.UTC;
13 |
14 | public final class CdxFields {
15 |
16 | static final DateTimeFormatter DATE_FORMAT = DateTimeFormatter.ofPattern("yyyyMMddHHmmss").withZone(UTC);
17 |
18 | private CdxFields() {
19 | }
20 |
21 | public static final byte ORIGINAL_URL = 'a';
22 | public static final byte DATE = 'b';
23 | public static final byte CHECKSUM = 'k';
24 | public static final byte FILENAME = 'g';
25 | public static final byte MIME_TYPE = 'm';
26 | public static final byte REDIRECT = 'r';
27 | public static final byte RESPONSE_CODE = 's';
28 | public static final byte NORMALIZED_SURT = 'N';
29 | public static final byte COMPRESSED_RECORD_SIZE = 'S';
30 | public static final byte COMPRESSED_ARC_FILE_OFFSET = 'V';
31 |
32 | public static String format(byte field, WarcCaptureRecord record) {
33 | try {
34 | return CdxFormat.CDX11.formatField(field, record, null , -1, -1, null);
35 | } catch (Exception e) {
36 | return "-";
37 | }
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/cdx/CdxReader.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2021 National Library of Australia
4 | */
5 |
6 | package org.netpreserve.jwarc.cdx;
7 |
8 | import java.io.*;
9 | import java.nio.charset.StandardCharsets;
10 | import java.util.Iterator;
11 | import java.util.NoSuchElementException;
12 | import java.util.Optional;
13 |
14 | public class CdxReader implements Iterable, Closeable {
15 | private final BufferedReader reader;
16 | private CdxFormat format;
17 |
18 | public CdxReader(InputStream stream) {
19 | this(new BufferedReader(new InputStreamReader(stream, StandardCharsets.US_ASCII)));
20 | }
21 |
22 | CdxReader(BufferedReader reader) {
23 | this.reader = reader;
24 | }
25 |
26 | public Optional next() throws IOException {
27 | for (String line = reader.readLine(); line != null; line = reader.readLine()) {
28 | if (line.isEmpty() || line.startsWith("#")) {
29 | continue; // ignore comments
30 | }
31 | if (line.startsWith(" CDX ") || line.startsWith("CDX ")) {
32 | format = new CdxFormat(line);
33 | continue;
34 | }
35 | return Optional.of(new CdxRecord(line, format));
36 | }
37 | return Optional.empty();
38 | }
39 |
40 | @Override
41 | public Iterator iterator() {
42 | return new Iterator() {
43 | private CdxRecord next;
44 |
45 | @Override
46 | public boolean hasNext() {
47 | if (next == null) {
48 | try {
49 | next = CdxReader.this.next().orElse(null);
50 | } catch (IOException e) {
51 | throw new UncheckedIOException(e);
52 | }
53 | }
54 | return next != null;
55 | }
56 |
57 | @Override
58 | public CdxRecord next() {
59 | if (!hasNext()) {
60 | throw new NoSuchElementException();
61 | }
62 | CdxRecord record = next;
63 | next = null;
64 | return record;
65 | }
66 | };
67 | }
68 |
69 | @Override
70 | public void close() throws IOException {
71 | reader.close();
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/cdx/CdxRecord.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2021 National Library of Australia
4 | */
5 |
6 | package org.netpreserve.jwarc.cdx;
7 |
8 | import org.netpreserve.jwarc.MediaType;
9 | import org.netpreserve.jwarc.URIs;
10 |
11 | import java.io.IOException;
12 | import java.net.URI;
13 | import java.time.Instant;
14 |
15 | public class CdxRecord {
16 | private final String[] values;
17 | private final CdxFormat format;
18 |
19 | CdxRecord(String line, CdxFormat format) throws IOException {
20 | this.values = line.split(" ");
21 | if (format != null) {
22 | this.format = format;
23 | } else if (values.length == 9) {
24 | this.format = CdxFormat.CDX9;
25 | } else if (values.length == 10) {
26 | this.format = CdxFormat.CDX10;
27 | } else if (values.length == 11) {
28 | this.format = CdxFormat.CDX11;
29 | } else {
30 | throw new IOException("Unable to determine the CDX format");
31 | }
32 | }
33 |
34 | public String get(int field) {
35 | int i = format.indexOf(field);
36 | if (i == -1) return null;
37 | String value = values[i];
38 | return value.equals("-") ? null : value;
39 | }
40 |
41 | public Instant date() {
42 | String value = get(CdxFields.DATE);
43 | return value == null ? null : CdxFields.DATE_FORMAT.parse(value, Instant::from);
44 | }
45 |
46 | public String filename() {
47 | return get(CdxFields.FILENAME);
48 | }
49 |
50 | public String target() {
51 | return get(CdxFields.ORIGINAL_URL);
52 | }
53 |
54 | public URI targetURI() {
55 | String value = target();
56 | return value == null ? null : URIs.parseLeniently(value);
57 | }
58 |
59 | /**
60 | * Length of the WARC record in bytes. Including headers and measured after any compression is applied.
61 | */
62 | public Long size() {
63 | String value = get(CdxFields.COMPRESSED_RECORD_SIZE);
64 | return value == null ? null : Long.parseLong(value);
65 | }
66 |
67 | /**
68 | * Position in bytes of the record in the WARC file.
69 | */
70 | public Long position() {
71 | String value = get(CdxFields.COMPRESSED_ARC_FILE_OFFSET);
72 | return value == null ? null : Long.parseLong(value);
73 | }
74 |
75 | /**
76 | * HTTP response status code.
77 | */
78 | public Integer status() {
79 | String value = get(CdxFields.RESPONSE_CODE);
80 | return value == null ? null : Integer.parseInt(value);
81 | }
82 |
83 | /**
84 | * A cryptographic digest of the response payload. Most commonly this is a SHA-1 digest in base 32 or an MD5 digest
85 | * in hexadecimal.
86 | */
87 | public String digest() {
88 | return get(CdxFields.CHECKSUM);
89 | }
90 |
91 | /**
92 | * The value of the Location HTTP header for redirect responses.
93 | */
94 | public String redirect() {
95 | return get(CdxFields.REDIRECT);
96 | }
97 |
98 | public MediaType contentType() {
99 | String value = get(CdxFields.MIME_TYPE);
100 | return value == null ? null : MediaType.parseLeniently(value);
101 | }
102 | }
103 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/cdx/JsonException.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2023 National Library of Australia and the jwarc contributors
4 | */
5 | package org.netpreserve.jwarc.cdx;
6 |
/**
 * Checked exception carrying a message about a JSON problem.
 */
class JsonException extends Exception {
    /**
     * @param message description of the problem
     */
    public JsonException(String message) {
        super(message);
    }
}
12 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/cdx/JsonToken.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2023 National Library of Australia and the jwarc contributors
4 | */
5 | package org.netpreserve.jwarc.cdx;
6 |
/**
 * Kinds of JSON token.
 */
enum JsonToken {
    FIELD_NAME,
    START_OBJECT,
    END_OBJECT,
    START_ARRAY,
    END_ARRAY,
    STRING,
    NUMBER_INT,
    NUMBER_FLOAT,
    TRUE,
    FALSE,
    NULL
}
11 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/net/Browser.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc.net;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.net.InetSocketAddress;
6 | import java.net.URI;
7 | import java.nio.channels.FileChannel;
8 | import java.nio.file.Files;
9 | import java.nio.file.Path;
10 | import java.util.ArrayList;
11 | import java.util.Arrays;
12 | import java.util.List;
13 | import java.util.concurrent.TimeUnit;
14 |
15 | import static java.nio.file.StandardOpenOption.DELETE_ON_CLOSE;
16 |
/**
 * Runs an external headless browser process to load pages and take screenshots.
 */
public class Browser {
    private final String executable;
    private final String userAgent;
    private final InetSocketAddress proxy;
    private final List<String> options = Arrays.asList("--headless", "--disable-gpu", "--ignore-certificate-errors",
            "--hide-scrollbars");
    // maximum time to wait for the browser process, in milliseconds
    private final static long DEFAULT_TIMEOUT = 60000;

    /**
     * Creates a browser that runs the given executable, optionally through the given proxy.
     *
     * @param proxy proxy address to pass as --proxy-server, or null for none
     */
    public static Browser chrome(String executable, InetSocketAddress proxy) {
        return new Browser(executable, proxy, null);
    }

    Browser(String executable, InetSocketAddress proxy, String userAgent) {
        this.executable = executable;
        this.proxy = proxy;
        this.userAgent = userAgent;
    }

    /**
     * Loads the given URI in the browser and waits for the process to exit.
     */
    public void browse(URI uri) throws IOException {
        run(uri.toString());
    }

    /**
     * Takes a screenshot of the given URI and writes it to outfile.
     */
    public void screenshot(URI uri, Path outfile) throws IOException {
        screenshot(uri.toString(), outfile);
    }

    public void screenshot(String url, Path outfile) throws IOException {
        run("--screenshot=" + outfile, url);
    }

    /**
     * Takes a screenshot of the given URI into a temporary file.
     *
     * @return a channel on the temporary file, which is deleted when the channel is closed
     */
    public FileChannel screenshot(URI uri) throws IOException {
        return screenshot(uri.toString());
    }

    public FileChannel screenshot(String uri) throws IOException {
        Path outfile = Files.createTempFile("jwarc-screenshot", ".png");
        try {
            run("--screenshot=" + outfile, uri);
            return FileChannel.open(outfile, DELETE_ON_CLOSE);
        } catch (Exception e) {
            // don't leave the temporary file behind when the browser run or the open fails
            Files.deleteIfExists(outfile);
            throw e;
        }
    }

    /**
     * Runs the browser with the standard options followed by the given arguments and waits for it to exit.
     *
     * @throws IOException if the process cannot be started, times out, exits with a non-zero status, or
     *                     the waiting thread is interrupted (reported as java.io.InterruptedIOException)
     */
    private void run(String... args) throws IOException {
        List<String> command = new ArrayList<>();
        command.add(executable);
        command.addAll(options);
        if (proxy != null) {
            command.add("--proxy-server=" + proxy.getHostString() + ":" + proxy.getPort());
        }
        if (userAgent != null) {
            command.add("--user-agent=" + userAgent);
        }
        command.addAll(Arrays.asList(args));

        Process process = new ProcessBuilder(command)
                .inheritIO()
                .redirectOutput(devNull())
                .start();
        try {
            if (DEFAULT_TIMEOUT > 0) {
                if (!process.waitFor(DEFAULT_TIMEOUT, TimeUnit.MILLISECONDS)) {
                    // ask politely first, then kill if it still hasn't exited
                    process.destroy();
                    process.waitFor(DEFAULT_TIMEOUT, TimeUnit.MILLISECONDS);
                    process.destroyForcibly();
                    throw new IOException("timed out after " + DEFAULT_TIMEOUT + "ms");
                }
            } else {
                process.waitFor();
            }
            if (process.exitValue() != 0) {
                throw new IOException("browser returned exit status: " + process.exitValue());
            }
        } catch (InterruptedException e) {
            // Don't leave the browser running and don't report success: kill the process, restore the
            // interrupt flag and surface the interruption to the caller as an IOException.
            process.destroyForcibly();
            Thread.currentThread().interrupt();
            java.io.InterruptedIOException interrupted =
                    new java.io.InterruptedIOException("interrupted while waiting for browser");
            interrupted.initCause(e);
            throw interrupted;
        }
    }

    // platform-specific null device used to discard the browser's standard output
    private static File devNull() {
        return new File(System.getProperty("os.name").startsWith("Windows") ? "NUL" : "/dev/null");
    }
}
101 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/net/Capture.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc.net;
2 |
3 | import java.net.URI;
4 | import java.nio.file.Path;
5 | import java.time.Instant;
6 |
/**
 * An entry in the {@link CaptureIndex}.
 * <p>
 * Holds the location of a particular captured version of a resource.
 */
class Capture {
    private final String uri;
    private final Instant date;
    private final Path file;
    private final long position;

    /**
     * Creates an entry with only a URI and date: the file is null and the position is -1.
     */
    Capture(String uri, Instant date) {
        this(uri, date, null, -1);
    }

    Capture(String uri, Instant date, Path file, long position) {
        this.uri = uri;
        this.date = date;
        this.file = file;
        this.position = position;
    }

    String uri() {
        return this.uri;
    }

    Instant date() {
        return this.date;
    }

    Path file() {
        return this.file;
    }

    long position() {
        return this.position;
    }
}
45 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/net/CaptureIndex.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc.net;
2 |
3 | import org.netpreserve.jwarc.*;
4 |
5 | import java.io.IOException;
6 | import java.net.URI;
7 | import java.nio.file.Path;
8 | import java.time.Instant;
9 | import java.util.List;
10 | import java.util.NavigableSet;
11 | import java.util.TreeSet;
12 |
13 | import static java.util.Comparator.comparing;
14 |
15 | public class CaptureIndex {
16 | private final NavigableSet entries = new TreeSet<>(comparing(Capture::uri).thenComparing(Capture::date));
17 | private Capture entrypoint;
18 |
19 | public CaptureIndex(List warcs) throws IOException {
20 | for (Path warc : warcs) {
21 | try (WarcReader reader = new WarcReader(warc)) {
22 | for (WarcRecord record : reader) {
23 | if ((record instanceof WarcResponse || record instanceof WarcResource)) {
24 | WarcCaptureRecord capture = (WarcCaptureRecord) record;
25 | if (URIs.hasHttpOrHttpsScheme(capture.target())) {
26 | Capture entry = new Capture(capture.target(), capture.date(), warc, reader.position());
27 | add(entry);
28 | if (entrypoint == null && MediaType.HTML.equals(capture.payloadType().base())) {
29 | entrypoint = entry;
30 | }
31 | }
32 | }
33 | }
34 | }
35 | }
36 | }
37 |
38 | void add(Capture capture) {
39 | entries.add(capture);
40 | }
41 |
42 | NavigableSet query(String uri) {
43 | return entries.subSet(new Capture(uri, Instant.MIN), true, new Capture(uri, Instant.MAX), true);
44 | }
45 |
46 | Capture entrypoint() {
47 | return entrypoint;
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/net/HttpExchange.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc.net;
2 |
3 | import org.netpreserve.jwarc.HttpRequest;
4 | import org.netpreserve.jwarc.HttpResponse;
5 | import org.netpreserve.jwarc.IOUtils;
6 | import org.netpreserve.jwarc.MediaType;
7 |
8 | import javax.net.ssl.SSLProtocolException;
9 | import java.io.IOException;
10 | import java.io.OutputStream;
11 | import java.net.Socket;
12 | import java.net.SocketException;
13 | import java.util.regex.Matcher;
14 |
15 | import static java.nio.charset.StandardCharsets.UTF_8;
16 |
17 | class HttpExchange {
18 | private static final MediaType HTML_UTF8 = MediaType.parse("text/html;charset=utf-8");
19 |
20 | private final Socket socket;
21 | private final HttpRequest request;
22 | private final Matcher matcher;
23 |
24 | HttpExchange(Socket socket, HttpRequest request, Matcher matcher) {
25 | this.socket = socket;
26 | this.request = request;
27 | this.matcher = matcher;
28 | }
29 |
30 | public String param(int i) {
31 | return matcher.group(i);
32 | }
33 |
34 | public HttpRequest request() {
35 | return request;
36 | }
37 |
38 | public void redirect(String location) throws IOException {
39 | send(new HttpResponse.Builder(307, "Redirect")
40 | .addHeader("Content-Length", "0")
41 | .addHeader("Location", location)
42 | .build());
43 | }
44 |
45 | public void send(int status, String html) throws IOException {
46 | send(status, HTML_UTF8, html);
47 | }
48 |
49 | public void send(int status, MediaType type, String body) throws IOException {
50 | send(new HttpResponse.Builder(status, " ").body(type, body.getBytes(UTF_8)).build());
51 | }
52 |
53 | public void send(HttpResponse response) throws IOException {
54 | try {
55 | OutputStream outputStream = socket.getOutputStream();
56 | outputStream.write(response.serializeHeader());
57 | IOUtils.copy(response.body().stream(), outputStream);
58 | } catch (SSLProtocolException | SocketException e) {
59 | socket.close(); // client probably closed
60 | }
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/net/HttpHandler.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc.net;
2 |
3 | import java.io.IOException;
4 |
5 | @FunctionalInterface
6 | public interface HttpHandler {
7 | void handle(HttpExchange exchange) throws Exception;
8 | }
9 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/net/WarcRecorder.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc.net;
2 |
3 | import org.netpreserve.jwarc.*;
4 |
5 | import java.io.*;
6 | import java.net.ServerSocket;
7 | import java.net.Socket;
8 | import java.net.URI;
9 | import java.nio.channels.Channels;
10 | import java.util.List;
11 | import java.util.Map;
12 |
13 | import static java.nio.charset.StandardCharsets.*;
14 |
15 | /**
16 | * HTTP proxy which records requests and responses as WARC records.
17 | */
18 | /**
19 |  * HTTP proxy which records requests and responses as WARC records.
20 |  * <p>
21 |  * A request target starting with {@code /__jwarc__/record/} has that prefix removed, is fetched as a URL
22 |  * and has its response header rewritten on the way back to the client. Any other target starting with
23 |  * {@code /} is passed to the superclass; all remaining targets are fetched as given.
24 |  */
25 | public class WarcRecorder extends HttpServer {
26 |     private static final String RECORD_PREFIX = "/__jwarc__/record/";
27 |
28 |     private final WarcWriter warcWriter;
29 |
30 |     /**
31 |      * Creates a recorder on the given server socket and registers the bundled recorder page and
32 |      * service worker script.
33 |      *
34 |      * @param serverSocket socket handed to the superclass
35 |      * @param warcWriter   writer whose {@code fetch} method performs each recorded request
36 |      * @throws UncheckedIOException if registering the bundled resources throws an IOException
37 |      */
38 |     public WarcRecorder(ServerSocket serverSocket, WarcWriter warcWriter) {
39 |         super(serverSocket);
40 |         this.warcWriter = warcWriter;
41 |         try {
42 |             on("GET", "/", WarcServer.resource("recorder.html"));
43 |             on("GET", "/__jwarc__/recorder-sw.js", WarcServer.resource("recorder-sw.js"));
44 |         } catch (IOException e) {
45 |             throw new UncheckedIOException(e);
46 |         }
47 |     }
48 |
49 |     /**
50 |      * Fetches the requested URL via the WARC writer, sending the response to the client socket.
51 |      *
52 |      * @param socket      client connection; closed after the fetch returns normally
53 |      * @param target      request target sent by the client
54 |      * @param httpRequest client request whose method and headers are forwarded
55 |      * @throws Exception if the target is not a valid URI or the fetch fails
56 |      */
57 |     @Override
58 |     void handle(Socket socket, String target, HttpRequest httpRequest) throws Exception {
59 |         boolean rewriteHeaders = false;
60 |         if (target.startsWith(RECORD_PREFIX)) {
61 |             target = target.substring(RECORD_PREFIX.length());
62 |             rewriteHeaders = true;
63 |         } else if (target.startsWith("/")) {
64 |             super.handle(socket, target, httpRequest);
65 |             return;
66 |         }
67 |         URI uri = new URI(target);
68 |         if (uri.getPath().isEmpty()) {
69 |             // Use "/" so the request line never carries an empty path.
70 |             uri = new URI(uri.getScheme(), uri.getUserInfo(), uri.getHost(), uri.getPort(), "/", uri.getQuery(), uri.getFragment());
71 |         }
72 |         String path = uri.getRawPath();
73 |         if (uri.getRawQuery() != null) {
74 |             path += "?" + uri.getRawQuery();
75 |         }
76 |         HttpRequest.Builder rb = new HttpRequest.Builder(httpRequest.method(), path).version(MessageVersion.HTTP_1_0);
77 |         for (Map.Entry<String, List<String>> e : httpRequest.headers().map().entrySet()) {
78 |             // These client headers are not forwarded upstream.
79 |             if (e.getKey().equalsIgnoreCase("TE")) continue;
80 |             if (e.getKey().equalsIgnoreCase("Accept-Encoding")) continue;
81 |             if (e.getKey().equalsIgnoreCase("Connection")) continue;
82 |             for (String v : e.getValue()) {
83 |                 rb.addHeader(e.getKey(), v);
84 |             }
85 |         }
86 |         rb.setHeader("Host", uri.getPort() != -1 ? uri.getHost() + ":" + uri.getPort() : uri.getHost());
87 |         OutputStream outputStream = socket.getOutputStream();
88 |         if (rewriteHeaders) outputStream = new HeaderRewriter(outputStream);
89 |         warcWriter.fetch(uri, rb.build(), outputStream);
90 |         socket.close();
91 |     }
92 |
93 |     /**
94 |      * Output stream which buffers everything up to the blank line that ends the HTTP header, writes a
95 |      * rewritten header to the underlying stream and then passes all later bytes through unchanged.
96 |      */
97 |     private static class HeaderRewriter extends FilterOutputStream {
98 |         private static final byte[] SENTINEL = "\r\n\r\n".getBytes(US_ASCII);
99 |         private ByteArrayOutputStream buffer = new ByteArrayOutputStream();
100 |         /** Number of leading SENTINEL bytes matched so far; equals SENTINEL.length once the header is done. */
101 |         private int state = 0;
102 |
103 |         public HeaderRewriter(OutputStream out) {
104 |             super(out);
105 |         }
106 |
107 |         @Override
108 |         public void write(int b) throws IOException {
109 |             write(new byte[]{(byte) b}, 0, 1);
110 |         }
111 |
112 |         @Override
113 |         public void write(byte[] b, int off, int len) throws IOException {
114 |             if (state == SENTINEL.length) {
115 |                 // Header already rewritten: pass the body straight through.
116 |                 out.write(b, off, len);
117 |                 return;
118 |             }
119 |             for (int i = off; i < off + len; i++) {
120 |                 if (b[i] == SENTINEL[state]) {
121 |                     state++;
122 |                     if (state == SENTINEL.length) {
123 |                         buffer.write(b, off, i - off + 1);
124 |                         HttpResponse response = HttpResponse.parseWithoutBody(Channels.newChannel(new ByteArrayInputStream(buffer.toByteArray())), null);
125 |                         HttpResponse.Builder builder = new HttpResponse.Builder(response.status(), response.reason())
126 |                                 .version(response.version())
127 |                                 .addHeaders(response.headers().map())
128 |                                 .setHeader("Content-Length", null)
129 |                                 .setHeader("Connection", "close")
130 |                                 .setHeader("X-Frame-Options", null)
131 |                                 .setHeader("Content-Security-Policy-Report-Only", null);
132 |                         response.headers().first("Location").ifPresent(location ->
133 |                                 builder.setHeader("Location", RECORD_PREFIX + location));
134 |                         out.write(builder.build().serializeHeader());
135 |                         // Bytes following the header in this call belong to the body.
136 |                         out.write(b, i + 1, len - (i - off + 1));
137 |                         buffer = null;
138 |                         return;
139 |                     }
140 |                 } else {
141 |                     // Restart matching. The mismatched byte may itself be the first sentinel byte
142 |                     // (e.g. "\r\r\n\r\n"), which resetting to 0 unconditionally would overlook.
143 |                     state = b[i] == SENTINEL[0] ? 1 : 0;
144 |                 }
145 |             }
146 |             buffer.write(b, off, len);
147 |         }
148 |     }
149 | }
150 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/net/WarcRenderer.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc.net;
2 |
3 | import org.netpreserve.jwarc.MediaType;
4 | import org.netpreserve.jwarc.WarcResource;
5 | import org.netpreserve.jwarc.WarcWriter;
6 |
7 | import java.io.Closeable;
8 | import java.io.IOException;
9 | import java.net.InetAddress;
10 | import java.net.InetSocketAddress;
11 | import java.net.ServerSocket;
12 | import java.net.URI;
13 | import java.nio.channels.FileChannel;
14 | import java.nio.file.Files;
15 | import java.nio.file.Path;
16 | import java.time.Instant;
17 | import java.time.format.DateTimeFormatter;
18 |
19 | import static java.time.ZoneOffset.UTC;
20 |
21 | /**
22 |  * Takes browser screenshots of pages and writes them as WARC resource records.
23 |  * <p>
24 |  * Each instance runs a {@link WarcServer} over the supplied index on a loopback socket and hands that
25 |  * socket's address to every {@link Browser} it creates.
26 |  */
27 | public class WarcRenderer implements Closeable {
28 |     private static final DateTimeFormatter ARC_TIME = DateTimeFormatter.ofPattern("yyyyMMddHHmmss").withZone(UTC);
29 |
30 |     private final ServerSocket proxySocket;
31 |     private final WarcServer server;
32 |     private final String browserExecutable;
33 |
34 |     /**
35 |      * Creates a renderer using the executable named by the {@code BROWSER} environment variable,
36 |      * or {@code google-chrome} when it is unset.
37 |      */
38 |     public WarcRenderer(CaptureIndex index) throws IOException {
39 |         this(index, System.getenv().getOrDefault("BROWSER", "google-chrome"));
40 |     }
41 |
42 |     /**
43 |      * Creates a renderer and starts its server on a new thread.
44 |      *
45 |      * @param index             index handed to the server
46 |      * @param browserExecutable browser command used for each screenshot
47 |      * @throws IOException if the loopback server socket cannot be opened
48 |      */
49 |     public WarcRenderer(CaptureIndex index, String browserExecutable) throws IOException {
50 |         ServerSocket loopback = new ServerSocket(0, -1, InetAddress.getLoopbackAddress());
51 |         this.proxySocket = loopback;
52 |         this.server = new WarcServer(loopback, index);
53 |         this.browserExecutable = browserExecutable;
54 |         Thread listener = new Thread(server::listen);
55 |         listener.start();
56 |     }
57 |
58 |     /** Equivalent to {@link #screenshot(String, Instant, WarcWriter)} with {@code uri.toString()}. */
59 |     public void screenshot(URI uri, Instant date, WarcWriter warcWriter) throws IOException {
60 |         String url = uri.toString();
61 |         screenshot(url, date, warcWriter);
62 |     }
63 |
64 |     /**
65 |      * Screenshots {@code url} into a temporary PNG file and, unless the file is empty, writes it as a
66 |      * resource record with target {@code screenshot:<url>}. The temporary file is always deleted.
67 |      *
68 |      * @param url        page to render
69 |      * @param date       record date; also embedded in the label string given to the browser
70 |      * @param warcWriter destination for the resource record
71 |      */
72 |     public void screenshot(String url, Instant date, WarcWriter warcWriter) throws IOException {
73 |         Path pngFile = Files.createTempFile("jwarc-screenshot", ".png");
74 |         try {
75 |             InetSocketAddress proxyAddress = (InetSocketAddress) proxySocket.getLocalSocketAddress();
76 |             String browserLabel = "WarcRenderer (arctime/" + ARC_TIME.format(date) + ")";
77 |             new Browser(browserExecutable, proxyAddress, browserLabel).screenshot(url, pngFile);
78 |             try (FileChannel png = FileChannel.open(pngFile)) {
79 |                 long length = png.size();
80 |                 if (length != 0) {
81 |                     warcWriter.write(new WarcResource.Builder(URI.create("screenshot:" + url))
82 |                             .date(date)
83 |                             .body(MediaType.parse("image/png"), png, length)
84 |                             .build());
85 |                 }
86 |             }
87 |         } finally {
88 |             Files.deleteIfExists(pngFile);
89 |         }
90 |     }
91 |
92 |     /** Closes the server socket. */
93 |     @Override
94 |     public void close() throws IOException {
95 |         proxySocket.close();
96 |     }
97 | }
98 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/net/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Classes for constructing network services which operate on WARC files.
3 | */
4 | package org.netpreserve.jwarc.net;
5 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Classes for parsing, serializing and manipulating WARC records.
3 | */
4 | package org.netpreserve.jwarc;
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/tools/CdxTool.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2021 National Library of Australia
4 | */
5 |
6 | package org.netpreserve.jwarc.tools;
7 |
8 |
9 | import org.netpreserve.jwarc.WarcRecord;
10 | import org.netpreserve.jwarc.WarcRevisit;
11 | import org.netpreserve.jwarc.cdx.CdxFormat;
12 | import org.netpreserve.jwarc.cdx.CdxWriter;
13 |
14 |
15 | import java.io.IOException;
16 | import java.io.OutputStreamWriter;
17 | import java.nio.file.Path;
18 | import java.nio.file.Paths;
19 | import java.util.ArrayList;
20 | import java.util.List;
21 | import java.util.function.Predicate;
22 |
23 | public class CdxTool {
24 | public static void main(String[] args) throws IOException {
25 | List files = new ArrayList<>();
26 | CdxFormat.Builder cdxFormatBuilder = new CdxFormat.Builder();
27 | boolean printHeader = true;
28 | boolean fullFilePath = false;
29 | boolean postAppend = false;
30 | Predicate filter = null;
31 | for (int i = 0; i < args.length; i++) {
32 | if (args[i].startsWith("-")) {
33 | switch (args[i]) {
34 | case "-f":
35 | case "--format":
36 | String format = args[++i];
37 | switch (format) {
38 | case "CDX9":
39 | cdxFormatBuilder.legend(CdxFormat.CDX9_LEGEND);
40 | break;
41 | case "CDX10":
42 | cdxFormatBuilder.legend(CdxFormat.CDX10_LEGEND);
43 | break;
44 | case "CDX11":
45 | cdxFormatBuilder.legend(CdxFormat.CDX11_LEGEND);
46 | break;
47 | default:
48 | cdxFormatBuilder.legend(format);
49 | break;
50 | }
51 | break;
52 | case "-h":
53 | case "--help":
54 | System.out.println("Usage: jwarc cdx [--format LEGEND] warc-files...");
55 | System.out.println();
56 | System.out.println(" -d, --digest-unchanged Include records with unchanged digest");
57 | System.out.println(" -f, --format LEGEND CDX format may be CDX9, CDX11 or a custom legend");
58 | System.out.println(" --no-header Don't print the CDX header line");
59 | System.out.println(" -p, --post-append Append the request body to the urlkey field");
60 | System.out.println(" --revisits-excluded Don't index revisit records");
61 | System.out.println(" -w, --warc-full-path Use absolute paths for the filename field");
62 | return;
63 | case "--no-header":
64 | printHeader = false;
65 | break;
66 | case "-p":
67 | case "--post-append":
68 | postAppend = true;
69 | break;
70 | case "-d":
71 | case "--digest-unchanged":
72 | cdxFormatBuilder.digestUnchanged();
73 | break;
74 | case "-r":
75 | case "--revisits-included":
76 | filter = null;
77 | break;
78 | case "--revisits-excluded":
79 | filter = record -> !(record instanceof WarcRevisit);
80 | break;
81 | case "-w":
82 | case "--warc-full-path":
83 | fullFilePath = true;
84 | break;
85 | default:
86 | System.err.println("Unrecognized option: " + args[i]);
87 | System.err.println("Usage: jwarc cdx [--format LEGEND] warc-files...");
88 | System.exit(1);
89 | return;
90 | }
91 | } else {
92 | files.add(Paths.get(args[i]));
93 | }
94 | }
95 |
96 | try (CdxWriter cdxWriter = new CdxWriter(new OutputStreamWriter(System.out))) {
97 | cdxWriter.onWarning(System.err::println);
98 | cdxWriter.setFormat(cdxFormatBuilder.build());
99 | cdxWriter.setPostAppend(postAppend);
100 | cdxWriter.setRecordFilter(filter);
101 |
102 | if (printHeader) cdxWriter.writeHeaderLine();
103 | cdxWriter.process(files, fullFilePath);
104 | }
105 | }
106 | }
107 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/tools/FetchTool.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc.tools;
2 |
3 | import org.netpreserve.jwarc.FetchOptions;
4 | import org.netpreserve.jwarc.WarcWriter;
5 |
6 | import java.io.IOException;
7 | import java.net.URI;
8 | import java.net.URISyntaxException;
9 | import java.nio.file.Path;
10 | import java.nio.file.Paths;
11 | import java.util.ArrayList;
12 | import java.util.List;
13 |
14 | /**
15 |  * Command-line tool which fetches URLs while writing each request and response as WARC records.
16 |  */
17 | public class FetchTool {
18 |     /**
19 |      * Parses the options, then fetches each URL in the order given.
20 |      *
21 |      * @param args options and URLs
22 |      * @throws IOException        if opening the output or fetching fails
23 |      * @throws URISyntaxException if a URL argument is not a valid URI
24 |      */
25 |     public static void main(String[] args) throws IOException, URISyntaxException {
26 |         FetchOptions options = new FetchOptions();
27 |         List<URI> urls = new ArrayList<>();
28 |         Path outputFile = null;
29 |         for (int i = 0; i < args.length; i++) {
30 |             switch (args[i]) {
31 |                 case "-h":
32 |                 case "--help":
33 |                     System.out.println("Usage: jwarc fetch [options] url...");
34 |                     System.out.println("Fetches a URL while writing the request and response as WARC records");
35 |                     System.out.println();
36 |                     System.out.println("Options:");
37 |                     System.out.println(" -A, --user-agent STRING Sets the User-Agent header");
38 |                     System.out.println(" --read-timeout MILLIS Sets the socket read timeout");
39 |                     System.out.println(" --max-length BYTES Truncate response after BYTES received");
40 |                     System.out.println(" --max-time MILLIS Truncate response after MILLIS elapsed");
41 |                     System.out.println(" -o, --output-file FILE Write WARC records to FILE instead of stdout");
42 |                     System.out.println();
43 |                     System.exit(0);
44 |                     break;
45 |                 case "-A":
46 |                 case "--user-agent":
47 |                     options.userAgent(optionValue(args, ++i));
48 |                     break;
49 |                 case "--read-timeout":
50 |                     options.readTimeout(Integer.parseInt(optionValue(args, ++i)));
51 |                     break;
52 |                 case "--max-length":
53 |                     options.maxLength(Integer.parseInt(optionValue(args, ++i)));
54 |                     break;
55 |                 case "--max-time":
56 |                     options.maxTime(Integer.parseInt(optionValue(args, ++i)));
57 |                     break;
58 |                 case "-o":
59 |                 case "--output-file":
60 |                     outputFile = Paths.get(optionValue(args, ++i));
61 |                     break;
62 |                 default:
63 |                     if (args[i].startsWith("-")) {
64 |                         System.err.println("Unknown option: " + args[i]);
65 |                         System.exit(1);
66 |                     }
67 |                     urls.add(new URI(args[i]));
68 |             }
69 |         }
70 |         if (urls.isEmpty()) {
71 |             System.err.println("No URLs specified. Try: jwarc fetch --help");
72 |             System.exit(1);
73 |         }
74 |         try (WarcWriter writer = outputFile == null ? new WarcWriter(System.out) : new WarcWriter(outputFile)) {
75 |             Runtime.getRuntime().addShutdownHook(new Thread(() -> {
76 |                 try {
77 |                     // Ensure current progress is written before exiting.
78 |                     writer.close();
79 |                 } catch (IOException e) {
80 |                     e.printStackTrace();
81 |                 }
82 |             }, "FetchToolShutdownHook"));
83 |             for (URI url : urls) {
84 |                 writer.fetch(url, options);
85 |             }
86 |         }
87 |     }
88 |
89 |     /**
90 |      * Returns the value following an option, exiting with status 1 when the command line ends first
91 |      * (instead of failing with an ArrayIndexOutOfBoundsException).
92 |      *
93 |      * @param args  full argument array
94 |      * @param index position where the value is expected; the option itself is at {@code index - 1}
95 |      */
96 |     private static String optionValue(String[] args, int index) {
97 |         if (index >= args.length) {
98 |             System.err.println("Missing value for option: " + args[index - 1]);
99 |             System.exit(1);
100 |         }
101 |         return args[index];
102 |     }
103 | }
104 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/tools/FilterTool.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc.tools;
2 |
3 | import org.netpreserve.jwarc.*;
4 |
5 | import java.io.IOException;
6 | import java.nio.file.Paths;
7 | import java.util.Arrays;
8 |
9 | /**
10 |  * Command-line tool which copies the records matching a filter expression to standard output.
11 |  */
12 | public class FilterTool {
13 |     /**
14 |      * Compiles {@code args[0]} as a filter expression and applies it to every remaining argument, each
15 |      * read as a WARC file ({@code -} meaning standard input). With no file arguments, standard input is
16 |      * read. Exits with status 1 when called without arguments and 2 on a WarcFilterException.
17 |      */
18 |     public static void main(String[] args) throws Exception {
19 |         try {
20 |             String[] files;
21 |             if (args.length == 0) {
22 |                 // The first argument is always the expression, so the usage line must mention it.
23 |                 System.err.println("Usage: jwarc filter <expression> [warc-file]...");
24 |                 System.err.println(" e.g. jwarc filter 'warc-type == \"response\" && http:content-type =~ \"image/.*\" && :status == 200' example.warc");
25 |                 System.exit(1);
26 |                 return;
27 |             } else if (args.length > 1) {
28 |                 files = Arrays.copyOfRange(args, 1, args.length);
29 |             } else {
30 |                 if (System.console() != null) {
31 |                     System.err.println("Warning: No input files specified, reading from STDIN");
32 |                 }
33 |                 files = new String[]{"-"};
34 |             }
35 |             WarcFilter filter = WarcFilter.compile(args[0]);
36 |             try (WarcWriter writer = new WarcWriter(System.out)) {
37 |                 for (String file : files) {
38 |                     try (WarcReader reader = file.equals("-") ? new WarcReader(System.in) : new WarcReader(Paths.get(file))) {
39 |                         filterRecords(filter, writer, reader);
40 |                     }
41 |                 }
42 |             }
43 |         } catch (WarcFilterException e) {
44 |             System.err.println(e.prettyPrint());
45 |             System.exit(2);
46 |         }
47 |     }
48 |
49 |     /** Writes every record from the reader which the filter accepts. */
50 |     private static void filterRecords(WarcFilter filter, WarcWriter writer, WarcReader reader) throws IOException {
51 |         for (WarcRecord record : reader) {
52 |             if (filter.test(record)) {
53 |                 writer.write(record);
54 |             }
55 |         }
56 |     }
57 | }
58 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/tools/ListTool.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc.tools;
2 |
3 | import org.netpreserve.jwarc.*;
4 |
5 | import java.io.IOException;
6 | import java.nio.file.Paths;
7 |
8 | /**
9 |  * Command-line tool which lists the records of WARC files, one line per record.
10 |  */
11 | public class ListTool {
12 |     /**
13 |      * Prints the reader position, record type, HTTP method or status and target of each record,
14 |      * using {@code -} for values that do not apply.
15 |      *
16 |      * @param args paths of the WARC files to list
17 |      */
18 |     public static void main(String[] args) throws IOException {
19 |         for (String arg : args) {
20 |             try (WarcReader reader = new WarcReader(Paths.get(arg))) {
21 |                 for (WarcRecord record : reader) {
22 |                     String url = targetOf(record);
23 |                     String methodOrStatus = methodOrStatusOf(record);
24 |                     System.out.format("%10d %-10s %-4s %s\n", reader.position(), record.type(), methodOrStatus, url);
25 |                 }
26 |             }
27 |         }
28 |     }
29 |
30 |     /** Returns the record's target, or {@code -} if it is not a target record. */
31 |     private static String targetOf(WarcRecord record) {
32 |         return record instanceof WarcTargetRecord ? ((WarcTargetRecord) record).target() : "-";
33 |     }
34 |
35 |     /** Returns the HTTP method of a request or the status of a response, or {@code -} otherwise. */
36 |     private static String methodOrStatusOf(WarcRecord record) throws IOException {
37 |         if (!record.contentType().base().equals(MediaType.HTTP)) {
38 |             return "-";
39 |         }
40 |         if (record instanceof WarcRequest) {
41 |             return ((WarcRequest) record).http().method();
42 |         }
43 |         if (record instanceof WarcResponse) {
44 |             return String.valueOf(((WarcResponse) record).http().status());
45 |         }
46 |         return "-";
47 |     }
48 | }
49 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/tools/RecordTool.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc.tools;
2 |
3 | import org.netpreserve.jwarc.WarcWriter;
4 | import org.netpreserve.jwarc.net.Browser;
5 | import org.netpreserve.jwarc.net.WarcRecorder;
6 |
7 | import java.net.InetAddress;
8 | import java.net.InetSocketAddress;
9 | import java.net.ServerSocket;
10 | import java.net.URI;
11 |
12 | /**
13 |  * Command-line tool which starts a {@link WarcRecorder} on a loopback port, writing to standard
14 |  * output, and opens each given URL in a browser created with the recorder's address.
15 |  */
16 | public class RecordTool {
17 |     /**
18 |      * @param args URLs to browse, in order
19 |      */
20 |     public static void main(String[] args) throws Exception {
21 |         InetAddress loopback = InetAddress.getLoopbackAddress();
22 |         try (ServerSocket socket = new ServerSocket(0, -1, loopback)) {
23 |             WarcWriter stdoutWriter = new WarcWriter(System.out);
24 |             WarcRecorder recorder = new WarcRecorder(socket, stdoutWriter);
25 |             Thread listener = new Thread(recorder::listen);
26 |             listener.start();
27 |             InetSocketAddress proxy = (InetSocketAddress) socket.getLocalSocketAddress();
28 |             System.err.println("WarcRecorder listening on " + proxy);
29 |             Browser browser = Browser.chrome(System.getenv().getOrDefault("BROWSER", "google-chrome"), proxy);
30 |             for (String arg : args) {
31 |                 URI page = URI.create(arg);
32 |                 browser.browse(page);
33 |             }
34 |         }
35 |         // The listener thread is not a daemon; presumably this is why the JVM is exited explicitly.
36 |         System.exit(0);
37 |     }
38 | }
39 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/tools/ScreenshotTool.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc.tools;
2 |
3 | import org.netpreserve.jwarc.*;
4 | import org.netpreserve.jwarc.net.CaptureIndex;
5 | import org.netpreserve.jwarc.net.WarcRenderer;
6 |
7 | import java.io.IOException;
8 | import java.nio.file.Path;
9 | import java.nio.file.Paths;
10 | import java.util.List;
11 | import java.util.stream.Collectors;
12 | import java.util.stream.Stream;
13 |
14 | /**
15 |  * Command-line tool which screenshots every normal HTML page capture found in the given WARC files
16 |  * and writes the screenshots as WARC records to standard output.
17 |  */
18 | public class ScreenshotTool {
19 |     /**
20 |      * @param args paths of the WARC files, used both to build the capture index and as the source of pages
21 |      */
22 |     public static void main(String[] args) throws Exception {
23 |         List<Path> warcs = Stream.of(args).map(Paths::get).collect(Collectors.toList());
24 |         try (WarcWriter warcWriter = new WarcWriter(System.out);
25 |              WarcRenderer renderer = new WarcRenderer(new CaptureIndex(warcs))) {
26 |             for (String arg : args) {
27 |                 try (WarcReader reader = new WarcReader(Paths.get(arg))) {
28 |                     for (WarcRecord record : reader) {
29 |                         if (!isNormalPage(record)) continue;
30 |                         WarcCaptureRecord capture = (WarcCaptureRecord) record;
31 |                         renderer.screenshot(capture.target(), capture.date(), warcWriter);
32 |                     }
33 |                 }
34 |             }
35 |         }
36 |     }
37 |
38 |     /**
39 |      * Tests whether a record is a response or resource capture of an http(s) URL with an HTML payload
40 |      * and, for responses, HTTP status 200. An IllegalArgumentException raised while inspecting the
41 |      * payload type is treated as "not a page".
42 |      */
43 |     private static boolean isNormalPage(WarcRecord record) throws IOException {
44 |         if (!(record instanceof WarcResponse) && !(record instanceof WarcResource)) {
45 |             return false;
46 |         }
47 |         WarcCaptureRecord capture = (WarcCaptureRecord) record;
48 |         if (!(URIs.hasHttpOrHttpsScheme(capture.target()))) {
49 |             return false;
50 |         }
51 |         try {
52 |             if (!(capture.payload().isPresent() && capture.payload().get().type().base().equals(MediaType.HTML))) {
53 |                 return false;
54 |             }
55 |         } catch (IllegalArgumentException e) {
56 |             return false;
57 |         }
58 |         return !(capture instanceof WarcResponse) || ((WarcResponse) capture).http().status() == 200;
59 |     }
60 | }
61 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/tools/ServeTool.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc.tools;
2 |
3 | import org.netpreserve.jwarc.net.WarcServer;
4 |
5 | import java.net.ServerSocket;
6 | import java.nio.file.Path;
7 | import java.nio.file.Paths;
8 | import java.util.List;
9 | import java.util.stream.Collectors;
10 | import java.util.stream.Stream;
11 |
12 | /**
13 |  * Command-line tool which serves the given WARC files with a {@link WarcServer}.
14 |  */
15 | public class ServeTool {
16 |     /**
17 |      * Starts a server on the port named by the {@code PORT} environment variable (default 8080).
18 |      *
19 |      * @param args paths of the WARC files to serve; at least one is required
20 |      */
21 |     public static void main(String[] args) throws Exception {
22 |         if (args.length == 0) {
23 |             System.err.println("Usage: WarcTool serve <warc-files>...");
24 |             System.err.println("Obeys environment variable PORT.");
25 |             System.exit(1);
26 |         }
27 |         List<Path> warcs = Stream.of(args).map(Paths::get).collect(Collectors.toList());
28 |         int port = Integer.parseInt(System.getenv().getOrDefault("PORT", "8080"));
29 |         WarcServer server = new WarcServer(new ServerSocket(port), warcs);
30 |         System.err.println("Listening on port " + port);
31 |         server.listen();
32 |     }
33 | }
34 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/tools/Utils.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc.tools;
2 |
3 | import java.io.IOException;
4 | import java.io.InputStream;
5 | import java.net.URL;
6 | import java.util.Properties;
7 |
8 | /**
9 |  * Helper methods for the command-line tools.
10 |  */
11 | class Utils {
12 |     /**
13 |      * Reads the {@code version} property from the Maven {@code pom.properties} resource.
14 |      *
15 |      * @return the version, or null if the resource is missing or does not define it
16 |      */
17 |     static String getJwarcVersion() {
18 |         URL pomProperties = WarcTool.class.getResource("/META-INF/maven/org.netpreserve/jwarc/pom.properties");
19 |         if (pomProperties == null) {
20 |             return null;
21 |         }
22 |         Properties loaded = new Properties();
23 |         try (InputStream in = pomProperties.openStream()) {
24 |             loaded.load(in);
25 |         } catch (IOException e) {
26 |             // Best effort: fall through with whatever was loaded.
27 |         }
28 |         return loaded.getProperty("version");
29 |     }
30 | }
31 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/tools/WarcTool.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc.tools;
2 |
3 | import java.util.Arrays;
4 |
5 | /**
6 |  * Entry point of the {@code jwarc} command, dispatching to the tool named by the first argument.
7 |  */
8 | public class WarcTool {
9 |     /**
10 |      * Runs the sub-command {@code args[0]} with the remaining arguments, or prints usage when there are
11 |      * none. An unknown command prints an error and exits with status 1.
12 |      */
13 |     public static void main(String[] args) throws Exception {
14 |         if (args.length == 0) {
15 |             usage();
16 |             return;
17 |         }
18 |         String[] rest = Arrays.copyOfRange(args, 1, args.length);
19 |         switch (args[0]) {
20 |             case "cdx":
21 |                 CdxTool.main(rest);
22 |                 break;
23 |             case "dedupe":
24 |                 DedupeTool.main(rest);
25 |                 break;
26 |             case "extract":
27 |                 ExtractTool.main(rest);
28 |                 break;
29 |             case "fetch":
30 |                 FetchTool.main(rest);
31 |                 break;
32 |             case "filter":
33 |                 FilterTool.main(rest);
34 |                 break;
35 |             case "-h":
36 |             case "--help":
37 |             case "help":
38 |                 usage();
39 |                 break;
40 |             case "ls":
41 |                 ListTool.main(rest);
42 |                 break;
43 |             case "record":
44 |                 RecordTool.main(rest);
45 |                 break;
46 |             case "recorder":
47 |                 RecorderTool.main(rest);
48 |                 break;
49 |             case "saveback":
50 |                 SavebackTool.main(rest);
51 |                 break;
52 |             case "screenshot":
53 |                 ScreenshotTool.main(rest);
54 |                 break;
55 |             case "serve":
56 |                 ServeTool.main(rest);
57 |                 break;
58 |             case "stats":
59 |                 StatsTool.main(rest);
60 |                 break;
61 |             case "validate":
62 |                 ValidateTool.main(rest);
63 |                 break;
64 |             case "--version":
65 |             case "version":
66 |                 version();
67 |                 break;
68 |             default:
69 |                 System.err.println("jwarc: '" + args[0] + "' is not a jwarc command. See 'jwarc help'.");
70 |                 System.exit(1);
71 |         }
72 |     }
73 |
74 |     /** Prints the list of sub-commands to standard output. */
75 |     private static void usage() {
76 |         System.out.println("usage: jwarc <command> [args]...");
77 |         System.out.println();
78 |         System.out.println("Commands:");
79 |         System.out.println();
80 |         System.out.println(" cdx List records in CDX format");
81 |         System.out.println(" dedupe Deduplicate records by looking up a CDX server");
82 |         System.out.println(" extract Extract record by offset");
83 |         System.out.println(" fetch Download a URL recording the request and response");
84 |         System.out.println(" filter Copy records that match a given filter expression");
85 |         System.out.println(" ls List records in WARC file(s)");
86 |         System.out.println(" record Fetch a page and subresources using headless Chrome");
87 |         System.out.println(" recorder Run a recording proxy");
88 |         System.out.println(" saveback Saves wayback-style replayed pages as WARC records");
89 |         System.out.println(" screenshot Take a screenshot of each page in the given WARCs");
90 |         System.out.println(" serve Serve WARC files with a basic replay server/proxy");
91 |         System.out.println(" stats Print statistics about WARC and CDX files");
92 |         System.out.println(" validate Validate WARC or ARC files");
93 |         System.out.println(" version Print version information");
94 |     }
95 |
96 |     /** Prints the jwarc version followed by the JVM and operating system details. */
97 |     private static void version() {
98 |         String version = Utils.getJwarcVersion();
99 |         System.out.println("jwarc " + (version == null ? "unknown version" : version));
100 |         System.out.println(System.getProperty("java.vm.name") + " " + System.getProperty("java.version"));
101 |         System.out.println(System.getProperty("os.name") + " " + System.getProperty("os.version") + " " + System.getProperty("os.arch"));
102 |     }
103 | }
104 |
--------------------------------------------------------------------------------
/src/org/netpreserve/jwarc/tools/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Command-line tools for manipulating WARC files.
3 | */
4 | package org.netpreserve.jwarc.tools;
--------------------------------------------------------------------------------
/test-resources/org/netpreserve/jwarc/cc.warc.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iipc/jwarc/7828aa0bae9b52ac2b31c3e783ee9ce3817feeda/test-resources/org/netpreserve/jwarc/cc.warc.gz
--------------------------------------------------------------------------------
/test-resources/org/netpreserve/jwarc/gzip_extra_sl.warc.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iipc/jwarc/7828aa0bae9b52ac2b31c3e783ee9ce3817feeda/test-resources/org/netpreserve/jwarc/gzip_extra_sl.warc.gz
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/ChunkedBodyTest.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc;
2 |
3 | import org.junit.Test;
4 | import org.netpreserve.jwarc.ChunkedBody;
5 | import org.netpreserve.jwarc.ParsingException;
6 |
7 | import java.io.ByteArrayInputStream;
8 | import java.io.EOFException;
9 | import java.io.IOException;
10 | import java.nio.ByteBuffer;
11 | import java.nio.channels.Channels;
12 | import java.nio.channels.ReadableByteChannel;
13 | import java.nio.charset.StandardCharsets;
14 | import java.util.Arrays;
15 |
16 | import static java.nio.charset.StandardCharsets.US_ASCII;
17 | import static org.junit.Assert.*;
18 |
19 | public class ChunkedBodyTest {
20 | @Test
21 | public void test() throws IOException {
22 | byte[] one = "3\r\nhel\r\n0007\r\nlo ".getBytes(US_ASCII);
23 | byte[] two = "worl\r\n1\r\nd\r\n00000\r\n\r\n".getBytes(US_ASCII);
24 | ReadableByteChannel chan = Channels.newChannel(new ByteArrayInputStream(two));
25 | ByteBuffer b1 = ByteBuffer.wrap(one);
26 | ChunkedBody decoder = new ChunkedBody(chan, b1);
27 | ByteBuffer buf = ByteBuffer.allocate(32);
28 | while (true) {
29 | int n = decoder.read(buf);
30 | assertNotEquals(0, n);
31 | if (n == -1) {
32 | break;
33 | }
34 | }
35 | assertFalse(b1.hasRemaining());
36 | assertEquals("hello world", new String(Arrays.copyOf(buf.array(), buf.position()), US_ASCII));
37 | }
38 |
39 | @Test(expected = ParsingException.class)
40 | public void testErr() throws IOException {
41 | new ChunkedBody(Channels.newChannel(new ByteArrayInputStream(new byte[0])), ByteBuffer.allocate(16))
42 | .strict()
43 | .read(ByteBuffer.allocate(32));
44 | }
45 |
46 | @Test(expected = EOFException.class)
47 | public void testEOF() throws IOException {
48 | ByteBuffer buf = ByteBuffer.allocate(16);
49 | buf.flip();
50 | new ChunkedBody(Channels.newChannel(new ByteArrayInputStream(new byte[0])), buf)
51 | .read(ByteBuffer.allocate(32));
52 | }
53 |
54 | /** Test optimisation when internal buffer is bypassed on large chunks */
55 | @Test
56 | public void testBypassInternalBuffer() throws IOException {
57 | String bodyString = "hello world, hello world!";
58 | byte[] body = ("19\r\n" + bodyString + "\r\n00000\r\n\r\n").getBytes(US_ASCII);
59 | ByteBuffer buf = ByteBuffer.allocate(8192);
60 | ByteBuffer initBuf = ByteBuffer.allocate(12);
61 | initBuf.flip();
62 | ReadableByteChannel chan = Channels.newChannel(new ByteArrayInputStream(body));
63 | ChunkedBody decoder = new ChunkedBody(chan, initBuf);
64 | while (true) {
65 | int n = decoder.read(buf);
66 | assertNotEquals(0, n);
67 | if (n < 0) {
68 | break;
69 | }
70 | }
71 | assertFalse(initBuf.hasRemaining());
72 | assertEquals(bodyString, new String(Arrays.copyOf(buf.array(), buf.position()), US_ASCII));
73 | }
74 |
75 | /** Test trailing whitespace after chunk length (#33) */
76 | @Test
77 | public void testChunkLengthTrailingWhiteSpace() throws IOException {
78 | String bodyString = "hello world, hello world!";
79 | byte[] body = ("19 \r\n" + bodyString + "\r\n00000\r\n\r\n").getBytes(US_ASCII);
80 | ByteBuffer buf = ByteBuffer.allocate(8192);
81 | ByteBuffer initBuf = ByteBuffer.allocate(8192);
82 | initBuf.flip();
83 | ReadableByteChannel chan = Channels.newChannel(new ByteArrayInputStream(body));
84 | ChunkedBody decoder = new ChunkedBody(chan, initBuf);
85 | while (true) {
86 | int n = decoder.read(buf);
87 | assertNotEquals(0, n);
88 | if (n < 0) {
89 | break;
90 | }
91 | }
92 | assertFalse(initBuf.hasRemaining());
93 | assertEquals(bodyString, new String(Arrays.copyOf(buf.array(), buf.position()), US_ASCII));
94 | }
95 |
96 | @Test
97 | public void testLenientMode() throws IOException {
98 | String string = "33hello world!";
99 | byte[] body = string.getBytes(US_ASCII);
100 | ReadableByteChannel chan = Channels.newChannel(new ByteArrayInputStream(body));
101 | ByteBuffer buf = ByteBuffer.allocate(100);
102 | ByteBuffer initBuf = ByteBuffer.allocate(100);
103 | initBuf.flip();
104 | ChunkedBody decoder = new ChunkedBody(chan, initBuf);
105 | int n = decoder.read(buf);
106 | buf.flip();
107 | assertEquals(string, US_ASCII.decode(buf).toString());
108 | }
109 | }
110 |
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/GunzipChannelTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc;
7 |
8 | import static org.junit.Assert.assertEquals;
9 | import static org.junit.Assert.assertNotNull;
10 | import static org.junit.Assert.assertTrue;
11 |
12 | import java.io.ByteArrayInputStream;
13 | import java.io.ByteArrayOutputStream;
14 | import java.io.IOException;
15 | import java.net.URISyntaxException;
16 | import java.net.URL;
17 | import java.nio.ByteBuffer;
18 | import java.nio.ByteOrder;
19 | import java.nio.channels.Channels;
20 | import java.nio.channels.FileChannel;
21 | import java.nio.channels.ReadableByteChannel;
22 | import java.nio.charset.StandardCharsets;
23 | import java.nio.file.Paths;
24 | import java.util.zip.GZIPOutputStream;
25 |
26 | import org.junit.Ignore;
27 | import org.junit.Test;
28 |
29 | /**
30 |  * Tests for {@link GunzipChannel}.
31 |  */
32 | public class GunzipChannelTest {
33 |
34 |     /** Returns a stream holding the gzip-compressed ASCII text "Hello world". */
35 |     private ByteArrayOutputStream getHelloWorldGzipByteStream() throws IOException {
36 |         ByteArrayOutputStream baos = new ByteArrayOutputStream();
37 |         // Closing the GZIPOutputStream finishes the gzip member and releases its deflater.
38 |         try (GZIPOutputStream gzos = new GZIPOutputStream(baos)) {
39 |             gzos.write("Hello world".getBytes(StandardCharsets.US_ASCII));
40 |         }
41 |         return baos;
42 |     }
43 |
44 |     /** Decompresses a small gzip stream through an empty, flipped input buffer. */
45 |     @Test
46 |     public void test() throws IOException {
47 |         ByteBuffer inBuffer = ByteBuffer.allocate(1024);
48 |         inBuffer.flip();
49 |
50 |         ByteArrayOutputStream baos = getHelloWorldGzipByteStream();
51 |
52 |         ReadableByteChannel input = Channels.newChannel(new ByteArrayInputStream(baos.toByteArray()));
53 |
54 |         GunzipChannel channel = new GunzipChannel(input, inBuffer);
55 |
56 |         ByteBuffer buffer = ByteBuffer.allocate(20);
57 |         channel.read(buffer);
58 |         channel.close();
59 |         buffer.flip();
60 |
61 |         byte[] bytes = new byte[buffer.remaining()];
62 |         buffer.get(bytes);
63 |
64 |         assertEquals("Hello world", new String(bytes, StandardCharsets.US_ASCII));
65 |     }
66 |
67 |     /**
68 |      * Reads the gzip_extra_sl.warc.gz test resource to the end and checks that the channel's input
69 |      * position equals the file size.
70 |      */
71 |     @Test
72 |     public void testExtraField() throws IOException, URISyntaxException {
73 |         ByteBuffer inBuffer = ByteBuffer.allocate(1024);
74 |         inBuffer.flip();
75 |
76 |         URL warcFile = getClass().getClassLoader().getResource("org/netpreserve/jwarc/gzip_extra_sl.warc.gz");
77 |         assertNotNull("WARC file gzip_extra_sl.warc.gz not found", warcFile);
78 |
79 |         GunzipChannel channel;
80 |         // try-with-resources releases the file handle even when an assertion fails.
81 |         try (ReadableByteChannel input = FileChannel.open(Paths.get(warcFile.toURI()))) {
82 |             channel = new GunzipChannel(input, inBuffer);
83 |
84 |             ByteBuffer buffer = ByteBuffer.allocate(20);
85 |             channel.read(buffer);
86 |             buffer.flip();
87 |
88 |             byte[] bytes = new byte[buffer.remaining()];
89 |             buffer.get(bytes);
90 |
91 |             // Decode explicitly rather than with the platform default charset.
92 |             assertTrue("Failed reading WARC file: expected \"WARC/1.0\" as first line",
93 |                     new String(bytes, StandardCharsets.US_ASCII).startsWith("WARC/1.0"));
94 |
95 |             // consume remaining compressed content to determine the length
96 |             do {
97 |                 buffer.clear();
98 |             } while (channel.read(buffer) > -1);
99 |             channel.close();
100 |         }
101 |
102 |         // check GunzipChannel position
103 |         long warcFileSize;
104 |         try (FileChannel sizeChannel = FileChannel.open(Paths.get(warcFile.toURI()))) {
105 |             warcFileSize = sizeChannel.size();
106 |         }
107 |         assertEquals("Wrong input position", warcFileSize, channel.inputPosition());
108 |     }
109 |
110 |     /** Decompresses "Hello world" using the supplied buffer as the channel's input buffer. */
111 |     private void checkExternalBuffer(ByteBuffer buffer) throws IOException {
112 |         ByteArrayOutputStream baos = getHelloWorldGzipByteStream();
113 |
114 |         ReadableByteChannel input = Channels.newChannel(new ByteArrayInputStream(baos.toByteArray()));
115 |
116 |         GunzipChannel channel = new GunzipChannel(input, buffer);
117 |         ByteBuffer output = ByteBuffer.allocate(20);
118 |         int n = channel.read(output);
119 |         channel.close();
120 |         assertEquals(11, n);
121 |         assertEquals("Hello world", new String(output.array(), 0, 11, StandardCharsets.US_ASCII));
122 |     }
123 |
124 |     /** A read-only buffer exposes no backing array and must be rejected. */
125 |     @Test(expected = IllegalArgumentException.class)
126 |     public void externalBufferNoArray() throws IOException {
127 |         ByteBuffer buffer = ByteBuffer.allocate(1024).asReadOnlyBuffer();
128 |         buffer.flip();
129 |         checkExternalBuffer(buffer);
130 |     }
131 |
132 |     @Ignore("User must ensure buffer is in read state")
133 |     @Test
134 |     public void externalBufferNoReadState() throws IOException, URISyntaxException {
135 |         ByteBuffer buffer = ByteBuffer.allocate(8192);
136 |         // not calling buffer.flip()
137 |         checkExternalBuffer(buffer);
138 |     }
139 |
140 |     @Test
141 |     public void externalBufferByteOrderLE() throws IOException, URISyntaxException {
142 |         ByteBuffer buffer = ByteBuffer.allocate(8192);
143 |         buffer.order(ByteOrder.LITTLE_ENDIAN);
144 |         buffer.flip();
145 |         checkExternalBuffer(buffer);
146 |     }
147 |
148 |     @Test
149 |     public void externalBufferByteOrderBE() throws IOException, URISyntaxException {
150 |         ByteBuffer buffer = ByteBuffer.allocate(8192);
151 |         buffer.order(ByteOrder.BIG_ENDIAN);
152 |         buffer.flip();
153 |         checkExternalBuffer(buffer);
154 |     }
155 | }
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/GzipChannelTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2020 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc;
7 |
8 | import static org.junit.Assert.assertEquals;
9 | import static org.junit.Assert.assertTrue;
10 |
11 | import java.io.ByteArrayInputStream;
12 | import java.io.ByteArrayOutputStream;
13 | import java.io.IOException;
14 | import java.nio.ByteBuffer;
15 | import java.nio.ByteOrder;
16 | import java.nio.channels.Channels;
17 | import java.nio.charset.StandardCharsets;
18 | import java.util.Arrays;
19 | import java.util.zip.GZIPInputStream;
20 |
21 | import org.junit.Test;
22 |
23 | /**
24 |  * Tests for {@link GzipChannel}: output must be readable by {@link GZIPInputStream}, also for
25 |  * empty input and when several gzip members are written back to back.
26 |  */
27 | public class GzipChannelTest {
28 |
29 |     protected String text = "Hello world";
30 |     protected byte[] textBytes = text.getBytes(StandardCharsets.US_ASCII);
31 |
32 |     /** Asserts that the data starts with the gzip magic number and is at least 20 bytes long. */
33 |     private void checkGzip(byte[] gzipped) {
34 |         // did we get valid gzipped data?
35 |         short magic = ByteBuffer.wrap(gzipped).order(ByteOrder.LITTLE_ENDIAN).getShort();
36 |         // JUnit convention: expected value first, actual value second
37 |         assertEquals(GzipChannel.GZIP_MAGIC, magic);
38 |         assertTrue(gzipped.length >= 20);
39 |     }
40 |
41 |     /** Writes the text through a GzipChannel and reads it back with the JDK gzip decoder. */
42 |     @Test
43 |     public void test() throws IOException {
44 |         ByteArrayOutputStream baos = new ByteArrayOutputStream();
45 |         GzipChannel channel = new GzipChannel(Channels.newChannel(baos));
46 |         int written = channel.write(ByteBuffer.wrap(textBytes));
47 |         assertEquals(textBytes.length, written);
48 |         channel.close();
49 |         byte[] gzipped = baos.toByteArray();
50 |
51 |         checkGzip(gzipped);
52 |
53 |         try (GZIPInputStream gzis = new GZIPInputStream(new ByteArrayInputStream(gzipped))) {
54 |             byte[] inBytes = new byte[8192];
55 |             int n = gzis.read(inBytes);
56 |
57 |             assertEquals(textBytes.length, n);
58 |             assertEquals(text, new String(inBytes, 0, n, StandardCharsets.US_ASCII));
59 |         }
60 |     }
61 |
62 |     /**
63 |      * Test that zero content (empty string, zero bytes input) is written as valid
64 |      * gzip data, otherwise uncompressing will cause an error.
65 |      */
66 |     @Test
67 |     public void testEmpty() throws IOException {
68 |         ByteArrayOutputStream baos = new ByteArrayOutputStream();
69 |         GzipChannel channel = new GzipChannel(Channels.newChannel(baos));
70 |         channel.write(ByteBuffer.allocate(0));
71 |         channel.finish();
72 |         channel.close();
73 |         byte[] gzipped = baos.toByteArray();
74 |
75 |         checkGzip(gzipped);
76 |
77 |         byte[] inBytes = new byte[8192];
78 |         try (GZIPInputStream gzis = new GZIPInputStream(new ByteArrayInputStream(gzipped))) {
79 |             assertTrue(gzis.read(inBytes) <= 0);
80 |         }
81 |
82 |         // test without calling write() and finish()
83 |         baos = new ByteArrayOutputStream();
84 |         channel = new GzipChannel(Channels.newChannel(baos));
85 |         channel.close();
86 |         gzipped = baos.toByteArray();
87 |
88 |         checkGzip(gzipped);
89 |
90 |         try (GZIPInputStream gzis = new GZIPInputStream(new ByteArrayInputStream(gzipped))) {
91 |             assertTrue(gzis.read(inBytes) <= 0);
92 |         }
93 |     }
94 |
95 |     /**
96 |      * Writes two gzip members into one output and checks that each starts with a gzip header
97 |      * and that both can be read back in sequence.
98 |      */
99 |     @Test
100 |     public void testMultiMember() throws IOException {
101 |         ByteArrayOutputStream baos = new ByteArrayOutputStream();
102 |         GzipChannel channel = new GzipChannel(Channels.newChannel(baos));
103 |         int written = channel.write(ByteBuffer.wrap(textBytes));
104 |         assertEquals(textBytes.length, written);
105 |         channel.finish(); // finish first member
106 |         long posSecond = channel.outputPosition();
107 |         written = channel.write(ByteBuffer.wrap(textBytes));
108 |         assertEquals(textBytes.length, written);
109 |         channel.close();
110 |         byte[] gzipped = baos.toByteArray();
111 |
112 |         checkGzip(gzipped);
113 |         checkGzip(Arrays.copyOfRange(gzipped, (int) posSecond, gzipped.length));
114 |
115 |         try (GZIPInputStream gzis = new GZIPInputStream(new ByteArrayInputStream(gzipped))) {
116 |             byte[] inBytes = new byte[8192];
117 |             int n = gzis.read(inBytes);
118 |
119 |             assertEquals(textBytes.length, n);
120 |             assertEquals(text, new String(inBytes, 0, n, StandardCharsets.US_ASCII));
121 |
122 |             // read second member
123 |             n = gzis.read(inBytes);
124 |             assertEquals(textBytes.length, n);
125 |             assertEquals(text, new String(inBytes, 0, n, StandardCharsets.US_ASCII));
126 |         }
127 |     }
128 |
129 |     /** A read-only buffer exposes no accessible backing array and is expected to be rejected. */
130 |     @Test(expected = IllegalArgumentException.class)
131 |     public void testBufferNoArray() throws IOException {
132 |         ByteArrayOutputStream baos = new ByteArrayOutputStream();
133 |         GzipChannel channel = new GzipChannel(Channels.newChannel(baos), ByteBuffer.allocate(1024).asReadOnlyBuffer());
134 |         channel.close();
135 |     }
136 | }
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/HeaderValidatorTest.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc;
2 |
3 | import org.junit.Test;
4 |
5 | import java.util.*;
6 |
7 | import static org.junit.Assert.*;
8 |
9 | /**
10 |  * Tests for {@link HeaderValidator} using the WARC 1.1 rule set.
11 |  *
12 |  * NOTE(review): the empty "WARC-Record-ID" and "WARC-Concurrent-To" values below look like
13 |  * angle-bracketed URIs that were lost when this listing was extracted - confirm against the
14 |  * upstream source before relying on these tests.
15 |  */
16 | public class HeaderValidatorTest {
17 |     private final HeaderValidator headerValidator = HeaderValidator.warc_1_1();
18 |
19 |     /** A complete response header set, including a repeated WARC-Concurrent-To, has no errors. */
20 |     @Test
21 |     public void testValid() {
22 |         MessageHeaders headers = MessageHeaders.of(
23 |                 "WARC-Record-ID", "",
24 |                 "Content-Length", "123456",
25 |                 "WARC-Date", "2020-01-01T00:00:00Z",
26 |                 "WARC-Type", "response",
27 |                 "WARC-Target-URI", "http://example.com/",
28 |                 "Content-Type", "application/http; msgtype=response",
29 |                 "WARC-Concurrent-To", "",
30 |                 "WARC-Concurrent-To", ""
31 |         );
32 |         assertEquals(Collections.emptyList(), headerValidator.validate(headers));
33 |     }
34 |
35 |     /** Omitting WARC-Record-ID is reported as a missing mandatory field. */
36 |     @Test
37 |     public void testMissingMandatoryFields() {
38 |         MessageHeaders headers = MessageHeaders.of(
39 |                 "Content-Length", "123456",
40 |                 "WARC-Date", "2020-01-01T00:00:00Z",
41 |                 "WARC-Type", "response"
42 |         );
43 |         List<String> validationErrors = headerValidator.validate(headers);
44 |         assertFalse(validationErrors.isEmpty());
45 |         assertTrue(validationErrors.contains("Missing mandatory field: WARC-Record-ID"));
46 |     }
47 |
48 |     /** A Content-Type that is not a valid media type is reported together with its value. */
49 |     @Test
50 |     public void testInvalidPatternValidation() {
51 |         MessageHeaders headers = MessageHeaders.of(
52 |                 "WARC-Record-ID", "",
53 |                 "Content-Length", "123456",
54 |                 "WARC-Date", "2020-01-01T00:00:00Z",
55 |                 "WARC-Type", "response",
56 |                 "Content-Type", "invalid_content_type"
57 |         );
58 |         List<String> validationErrors = headerValidator.validate(headers);
59 |         assertFalse(validationErrors.isEmpty());
60 |         assertTrue(validationErrors.contains("Field has invalid value: invalid_content_type"));
61 |     }
62 |
63 |     /** Repeating WARC-Date is reported as a forbidden repetition. */
64 |     @Test
65 |     public void testNonRepeatableField() {
66 |         MessageHeaders headers = MessageHeaders.of(
67 |                 "WARC-Record-ID", "",
68 |                 "Content-Length", "123456",
69 |                 "WARC-Date", "2020-01-01T00:00:00Z",
70 |                 "WARC-Type", "response",
71 |                 "WARC-Date", "2020-01-01T00:00:00Z",
72 |                 "WARC-Date", "2020-01-02T00:00:00Z"
73 |         );
74 |         List<String> validationErrors = headerValidator.validate(headers);
75 |         assertFalse(validationErrors.isEmpty());
76 |         assertTrue(validationErrors.contains("Field must not be repeated: WARC-Date"));
77 |     }
78 |
79 |     /** WARC-Filename on a response record is reported as not allowed for that record type. */
80 |     @Test
81 |     public void testForbiddenFieldsOnRecordType() {
82 |         MessageHeaders headers = MessageHeaders.of(
83 |                 "WARC-Record-ID", "",
84 |                 "Content-Length", "123456",
85 |                 "WARC-Date", "2020-01-01T00:00:00Z",
86 |                 "WARC-Type", "response",
87 |                 "WARC-Filename", "test.warc.gz"
88 |         );
89 |         List<String> validationErrors = headerValidator.validate(headers);
90 |         assertFalse(validationErrors.isEmpty());
91 |         assertTrue(validationErrors.contains("Field not allowed on response record: WARC-Filename"));
92 |     }
93 | }
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/HttpRequestTest.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc;
2 |
3 | import org.junit.Test;
4 |
5 | import java.io.ByteArrayInputStream;
6 | import java.io.IOException;
7 | import java.nio.channels.Channels;
8 | import java.util.Optional;
9 |
10 | import static java.nio.charset.StandardCharsets.US_ASCII;
11 | import static org.junit.Assert.*;
12 |
13 | /** Tests for parsing and re-serializing {@link HttpRequest} messages. */
14 | public class HttpRequestTest {
15 |
16 |     /**
17 |      * The serialized header must equal the parsed input byte for byte, including the bare LF
18 |      * that terminates the Host line.
19 |      */
20 |     @Test
21 |     public void serializeHeaderShouldPreserveExactly() throws IOException {
22 |         String header = "POST / HTTP/1.1\r\n" +
23 |                 "Connection: close\r\n" +
24 |                 "Host: example.org\n" +
25 |                 "Content-Length: 6\r\n\r\n";
26 |         String message = header + "[body]";
27 |         HttpRequest request = HttpRequest.parse(Channels.newChannel(new ByteArrayInputStream(message.getBytes(US_ASCII))));
28 |         assertEquals("POST", request.method());
29 |         assertEquals("/", request.target());
30 |         assertEquals(Optional.of("example.org"), request.headers().first("Host"));
31 |         assertEquals(header, new String(request.serializeHeader(), US_ASCII));
32 |     }
33 |
34 |     /** A WARC version is not an acceptable version for an HTTP request builder. */
35 |     @Test(expected = IllegalArgumentException.class)
36 |     public void invalidVersionShouldThrow() {
37 |         new HttpRequest.Builder("GET", "/").version(MessageVersion.WARC_1_0);
38 |     }
39 |
40 |     /** A non-numeric Content-Length value must not prevent the body from being read. */
41 |     @Test
42 |     public void invalidContentLengthHeader() throws IOException {
43 |         String header = "POST / HTTP/1.1\r\n" +
44 |                 "Connection: close\r\n" +
45 |                 "Host: example.org\n" +
46 |                 "Content-Length: 6 dinosaurs\r\n\r\n";
47 |         String message = header + "[body]";
48 |         HttpRequest request = HttpRequest.parse(LengthedBody.create(message.getBytes(US_ASCII)));
49 |         assertEquals("POST", request.method());
50 |         // decode with the charset used to encode the message, not the platform default
51 |         assertEquals("[body]", new String(IOUtils.readNBytes(request.body().stream(), 10), US_ASCII));
52 |     }
53 | }
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/HttpResponseTest.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc;
2 |
3 | import org.junit.Test;
4 |
5 | import java.io.ByteArrayInputStream;
6 | import java.io.IOException;
7 | import java.nio.channels.Channels;
8 |
9 | import static java.nio.charset.StandardCharsets.US_ASCII;
10 | import static org.junit.Assert.assertEquals;
11 |
12 | /** Tests for parsing and re-serializing {@link HttpResponse} messages. */
13 | public class HttpResponseTest {
14 |
15 |     /**
16 |      * The serialized header must equal the parsed input byte for byte, including the bare LF
17 |      * that terminates the Server line.
18 |      */
19 |     @Test
20 |     public void serializeHeaderShouldPreserveExactly() throws IOException {
21 |         String header = "HTTP/1.0 404 Not Found\r\n" +
22 |                 "Server: example\n" +
23 |                 "Content-Length: 6\r\n\r\n";
24 |         String message = header + "[body]";
25 |         HttpResponse response = HttpResponse.parse(Channels.newChannel(new ByteArrayInputStream(message.getBytes(US_ASCII))));
26 |         assertEquals(404, response.status());
27 |         assertEquals("Not Found", response.reason());
28 |         assertEquals(header, new String(response.serializeHeader(), US_ASCII));
29 |     }
30 |
31 |     /**
32 |      * A Content-Length header followed by a line starting with a space must still allow the
33 |      * body to be read.
34 |      */
35 |     @Test
36 |     public void parsingBogusContentLengthFolding() throws IOException {
37 |         String header = "HTTP/1.0 200 OK\r\n" +
38 |                 "Content-Length: 6\r\n" +
39 |                 " Content-Type: text/html\r\n\r\n";
40 |         String message = header + "[body]";
41 |         HttpResponse response = HttpResponse.parse(LengthedBody.create(message.getBytes(US_ASCII)));
42 |         assertEquals(200, response.status());
43 |         // decode with the charset used to encode the message, not the platform default
44 |         assertEquals("[body]", new String(IOUtils.readNBytes(response.body().stream(), 10), US_ASCII));
45 |     }
46 | }
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/InetAddressesTest.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc;
2 |
3 | import org.junit.Test;
4 |
5 | import java.net.InetAddress;
6 |
7 | import static org.junit.Assert.*;
8 | import static org.netpreserve.jwarc.InetAddresses.toAddrString;
9 |
10 | /** Tests the textual form produced by {@code InetAddresses.toAddrString} for IPv6 addresses. */
11 | public class InetAddressesTest {
12 |
13 |     /** Each fully written-out address must be rendered in the expected shortened form. */
14 |     @Test
15 |     public void testCanonicalInet6() throws Exception {
16 |         assertCanonical("2001:db8::1", "2001:db8:0:0:0:0:0:1");
17 |         assertCanonical("::", "0:0:0:0:0:0:0:0");
18 |         assertCanonical("::1", "0:0:0:0:0:0:0:1");
19 |         assertCanonical("2001:db8:1:1:1:1:1:1", "2001:db8:1:1:1:1:1:1");
20 |         assertCanonical("2001:0:0:1::1", "2001:0:0:1:0:0:0:1");
21 |         assertCanonical("2001:db8:f::1", "2001:db8:000f:0:0:0:0:1");
22 |         assertCanonical("2001:db8::1:0:0:1", "2001:0db8:0000:0000:0001:0000:0000:0001");
23 |         assertCanonical("ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff", "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff");
24 |         assertCanonical("2001:200f::1", "2001:200f:0:0:0:0:0:1");
25 |         // https://datatracker.ietf.org/doc/html/rfc5952#section-4.2.2
26 |         // "The symbol "::" MUST NOT be used to shorten just one 16-bit 0 field."
27 |         assertCanonical("2001:0:3:4:5:6:7:8", "2001:0:3:4:5:6:7:8");
28 |         // shorten first of same-length consecutive 0 fields, also in initial position
29 |         assertCanonical("::4:0:0:0:ffff", "0:0:0:4:0:0:0:ffff");
30 |     }
31 |
32 |     /** Parses {@code address} and asserts that its string form equals {@code expected}. */
33 |     private static void assertCanonical(String expected, String address) throws Exception {
34 |         assertEquals(expected, toAddrString(InetAddress.getByName(address)));
35 |     }
36 |
37 | }
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/LengthedBodyTest.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc;
2 |
3 | import org.junit.Test;
4 |
5 | import java.io.IOException;
6 | import java.nio.ByteBuffer;
7 | import java.nio.channels.FileChannel;
8 | import java.nio.channels.SeekableByteChannel;
9 | import java.nio.file.Files;
10 | import java.nio.file.Path;
11 |
12 | import static java.nio.charset.StandardCharsets.US_ASCII;
13 | import static java.nio.file.StandardOpenOption.*;
14 | import static org.junit.Assert.assertEquals;
15 |
16 | /** Tests reading and seeking within a {@link LengthedBody} backed by a file channel. */
17 | public class LengthedBodyTest {
18 |
19 |     /**
20 |      * Wraps the middle ten bytes of a 14-byte temporary file in a body, reads the body to its
21 |      * end, then repositions to offset 3 and reads four bytes.
22 |      */
23 |     @Test
24 |     public void test() throws IOException {
25 |         Path tempFile = Files.createTempFile("jwarc-test", ".tmp");
26 |         try (FileChannel file = FileChannel.open(tempFile, DELETE_ON_CLOSE, WRITE, READ)) {
27 |             file.write(ByteBuffer.wrap("xx0123456789yy".getBytes(US_ASCII)));
28 |             file.position(2);
29 |
30 |             // flip() directly after allocate() leaves the buffer empty and in read state
31 |             ByteBuffer emptyBuffer = ByteBuffer.allocate(2);
32 |             emptyBuffer.flip();
33 |             SeekableByteChannel body = (SeekableByteChannel) LengthedBody.create(file, emptyBuffer, file.size() - 4);
34 |
35 |             // sequential read until the body signals its end
36 |             ByteBuffer whole = ByteBuffer.allocate(32);
37 |             while (body.read(whole) >= 0) {
38 |                 // keep reading
39 |             }
40 |             whole.flip();
41 |             assertEquals("0123456789", US_ASCII.decode(whole).toString());
42 |
43 |             // positioned read, relative to the start of the body
44 |             body.position(3);
45 |             ByteBuffer part = ByteBuffer.allocate(4);
46 |             body.read(part);
47 |             part.flip();
48 |             assertEquals("3456", US_ASCII.decode(part).toString());
49 |         }
50 |     }
51 |
52 | }
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/MessageHeadersTest.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc;
2 |
3 | import org.junit.Test;
4 |
5 | import java.util.ArrayList;
6 | import java.util.List;
7 | import java.util.Map;
8 | import java.util.TreeMap;
9 |
10 | import static org.junit.Assert.*;
11 |
12 | /** Tests for {@link MessageHeaders#contains(String, String)}. */
13 | public class MessageHeadersTest {
14 |
15 |     /**
16 |      * The value must match a whole comma-separated token of the header, ignoring case and
17 |      * surrounding whitespace; substrings and space-separated words do not match.
18 |      */
19 |     @Test
20 |     public void testContains() {
21 |         assertFalse(headers("Z", "1")
22 |                 .contains("Transfer-Encoding", "chunked"));
23 |         assertTrue(headers("A", "0", "Transfer-Encoding", "chunked", "Z", "1")
24 |                 .contains("Transfer-Encoding", "chunked"));
25 |         assertFalse(headers("Transfer-Encoding", "xchunkedx")
26 |                 .contains("Transfer-Encoding", "chunked"));
27 |         assertFalse(headers("Transfer-Encoding", "gzip chunked")
28 |                 .contains("Transfer-Encoding", "chunked"));
29 |         assertTrue(headers("Transfer-Encoding", "gzip, chunked, chunked, gzip")
30 |                 .contains("Transfer-Encoding", "chunked"));
31 |         assertTrue(headers("Transfer-Encoding", "gzip, \tCHUNKED,,, GZIP")
32 |                 .contains("Transfer-Encoding", "Chunked"));
33 |     }
34 |
35 |     /**
36 |      * Builds headers from alternating name/value arguments. Names are compared
37 |      * case-insensitively and repeated names accumulate their values in order.
38 |      */
39 |     private static MessageHeaders headers(String... headers) {
40 |         // type arguments restored; the listing had lost them, leaving an unparseable "Map>"
41 |         Map<String, List<String>> map = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
42 |         for (int i = 0; i < headers.length; i += 2) {
43 |             map.computeIfAbsent(headers[i], (k) -> new ArrayList<>()).add(headers[i + 1]);
44 |         }
45 |         return new MessageHeaders(map);
46 |     }
47 | }
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/URIsTest.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc;
2 |
3 | import org.junit.Test;
4 |
5 | import static org.junit.Assert.assertEquals;
6 |
7 | public class URIsTest {
8 | @Test
9 | public void toNormalizedSurt() {
10 | assertEquals("org,example:8080)/foo?&&a&b&c", URIs.toNormalizedSurt("http://wWw.EXAMPLE.org:8080/FOO?c&A&&&b"));
11 | }
12 |
13 | @Test
14 | public void testParseLeniently() {
15 | roundtripParseLeniently("");
16 | roundtripParseLeniently("https://www.example.com#anchor");
17 | roundtripParseLeniently("https://example.com?a=b&cd[]=4");
18 | roundtripParseLeniently("/path/to/resource");
19 | roundtripParseLeniently("http://[2001:db8::1]/resource");
20 | roundtripParseLeniently("https://example.com/path%20with%20spaces");
21 | roundtripParseLeniently("https://example.com#fragment%20with%20spaces");
22 | roundtripParseLeniently("https://example.com?query%20with%20spaces");
23 | roundtripParseLeniently("https://example.com/路径");
24 | roundtripParseLeniently("https://example.com?query=测试");
25 | roundtripParseLeniently("https://////example.com?query=测试");
26 | roundtripParseLeniently("https://www.prijmeni.cz/Kr%C3%A1kora");
27 | roundtripParseLeniently("https://dx.doi.org/10.1038%2F35008096");
28 |
29 | assertEquals("https://example.com/path%20with%20spaces", URIs.parseLeniently("https://example.com/path with spaces").toString());
30 | assertEquals("https://example.com?query%20with%20spaces", URIs.parseLeniently("https://example.com?query with spaces").toString());
31 | assertEquals("https://example.com#fragment%20with%20spaces", URIs.parseLeniently("https://example.com#fragment with spaces").toString());
32 | assertEquals("https://example.com/a%20b%25", URIs.parseLeniently("https://example.com/a b%25").toString());
33 | assertEquals("https://example.com/a%20b路径", URIs.parseLeniently("https://example.com/a b路径").toString());
34 | assertEquals("https://example.com?a%20b%25", URIs.parseLeniently("https://example.com?a b%25").toString());
35 | assertEquals("https://example.com?a%20b路径", URIs.parseLeniently("https://example.com?a b路径").toString());
36 | assertEquals("https://example.com#a%20b%25", URIs.parseLeniently("https://example.com#a b%25").toString());
37 | assertEquals("https://example.com/a%20b%25路径%5b?a%20b%25路径[?#a%20b%25路径[?", URIs.parseLeniently("https://example.com/a b%25路径[?a b%25路径[?#a b%25路径[?").toString());
38 | assertEquals("https://example.com/a%20b?c%20d#e%20f", URIs.parseLeniently("https://example.com/a b?c d#e f").toString());
39 | }
40 |
41 | private void roundtripParseLeniently(String s) {
42 | assertEquals(s, URIs.parseLeniently(s).toString());
43 | }
44 | }
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/WarcParserTest.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc;
2 |
3 | import org.junit.Test;
4 |
5 | import java.nio.ByteBuffer;
6 | import java.nio.charset.StandardCharsets;
7 | import java.util.Optional;
8 |
9 | import static org.junit.Assert.*;
10 |
11 | public class WarcParserTest {
12 | @Test
13 | public void testParsingArcWithBogusMime() {
14 | WarcParser parser = parse("http://example.com/ 1.2.3.4 20110104111607 @[=*�Content-Type] 494\n");
15 | assertEquals(Optional.of("494"), parser.headers().sole("Content-Length"));
16 | parser = parse("http://example.com/ 1.2.3.4 20110104111607 charset=foo 494\n");
17 | assertEquals(Optional.of("494"), parser.headers().sole("Content-Length"));
18 | parser = parse("http://example.com/ 1.2.3.4 20110104111607 image(jpeg) 494\n");
19 | assertEquals(Optional.of("494"), parser.headers().sole("Content-Length"));
20 | parser = parse("http://example.com/ 1.2.3.4 20110104111607 ERROR: 494\n");
21 | assertEquals(Optional.of("494"), parser.headers().sole("Content-Length"));
22 | }
23 |
24 | @Test
25 | public void testParsingArcWithCorruptDates() {
26 | WarcParser parser = parse("http://example.com/ 1.2.3.4 200012120739 text/html 42\n");
27 | assertEquals(Optional.of("2000-12-12T07:39:00Z"), parser.headers().first("WARC-Date"));
28 | parser = parse("http://example.com/ 1.2.3.4 2000121207394211 text/html 1942\n");
29 | assertEquals(Optional.of("2000-12-12T07:39:42Z"), parser.headers().first("WARC-Date"));
30 | parser = parse("http://example.com/ 1.2.3.4 99999999999999 text/html 1942\n");
31 | assertEquals(Optional.empty(), parser.headers().first("WARC-Date"));
32 | }
33 |
34 | @Test
35 | public void testLenientParsing() {
36 | WarcParser parser = parse( "WARC/0.18\nHello\u0007:\u0008world\r\n\r\n", true);
37 | assertEquals(Optional.of("\u0008world"), parser.headers().sole("Hello\u0007"));
38 | }
39 |
40 | @Test(expected = AssertionError.class)
41 | public void testStrictParsing() {
42 | parse( "WARC/1.0\r\nHello\u0007:\u0008world\r\n\r\n");
43 | }
44 |
45 | private static WarcParser parse(String input) {
46 | return parse(input, false);
47 | }
48 |
49 | private static WarcParser parse(String input, boolean lenient) {
50 | WarcParser parser = new WarcParser();
51 | parser.setLenient(lenient);
52 | parser.parse(ByteBuffer.wrap(input.getBytes(StandardCharsets.ISO_8859_1)));
53 | assertFalse(parser.isError());
54 | assertTrue(parser.isFinished());
55 | return parser;
56 | }
57 | }
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/WarcRecordTest.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc;
2 |
3 | import org.junit.Test;
4 |
5 | import java.time.Instant;
6 |
7 | import static org.junit.Assert.assertEquals;
8 |
9 | /** Tests for behaviour shared by WARC record builders. */
10 | public class WarcRecordTest {
11 |
12 |     /**
13 |      * A WARC/1.0 record drops the sub-second part of its date, while a WARC/1.1 record keeps it.
14 |      * The two builders deliberately call version() and date() in opposite order.
15 |      */
16 |     @Test
17 |     public void datePrecision() {
18 |         Instant microsecondDate = Instant.parse("2021-08-30T07:49:07.466148Z");
19 |
20 |         WarcResource.Builder warc10Builder = new WarcResource.Builder();
21 |         WarcResource warc10Record = warc10Builder
22 |                 .version(MessageVersion.WARC_1_0)
23 |                 .date(microsecondDate)
24 |                 .build();
25 |         assertEquals(0, warc10Record.date().getNano());
26 |
27 |         WarcResource.Builder warc11Builder = new WarcResource.Builder();
28 |         WarcResource warc11Record = warc11Builder
29 |                 .date(microsecondDate)
30 |                 .version(MessageVersion.WARC_1_1)
31 |                 .build();
32 |         assertEquals(466148000, warc11Record.date().getNano());
33 |     }
34 |
35 |     /** An HTTP version is not an acceptable version for a WARC record builder. */
36 |     @Test(expected = IllegalArgumentException.class)
37 |     public void invalidVersionShouldThrow() {
38 |         Warcinfo.Builder warcinfoBuilder = new Warcinfo.Builder();
39 |         warcinfoBuilder.version(MessageVersion.HTTP_1_0);
40 |     }
41 | }
42 |
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/WarcTargetRecordTest.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc;
2 |
3 | import org.junit.Test;
4 |
5 | import java.net.URI;
6 | import java.util.*;
7 |
8 | import static org.junit.Assert.*;
9 |
10 | /** Tests how {@link WarcTargetRecord} exposes the WARC-Target-URI header. */
11 | public class WarcTargetRecordTest {
12 |
13 |     /**
14 |      * A target URI wrapped in angle brackets must be returned without the brackets.
15 |      *
16 |      * NOTE(review): the header literal had been reduced to a single space in this listing; the
17 |      * bracketed value below is reconstructed from the assertions. Any whitespace the original
18 |      * literal had around the brackets could not be recovered - confirm against upstream.
19 |      */
20 |     @Test
21 |     public void testTargetURIAngleBracketsQuirk() { // per warc 1.0 grammar
22 |         Map<String, List<String>> headers = new HashMap<>();
23 |         headers.put("WARC-Target-URI", Collections.singletonList("<http://example.org/>"));
24 |         WarcTargetRecord record = new WarcTargetRecord(MessageVersion.WARC_1_0, new MessageHeaders(headers), MessageBody.empty()) {
25 |         };
26 |         assertEquals("http://example.org/", record.target());
27 |         assertEquals(URI.create("http://example.org/"), record.targetURI());
28 |     }
29 |
30 |     /** A plain target URI is returned unchanged, both as a string and as a URI. */
31 |     @Test
32 |     public void testTargetURINormal() { // per warc 1.1 (and warc 1.0 examples)
33 |         Map<String, List<String>> headers = new HashMap<>();
34 |         headers.put("WARC-Target-URI", Collections.singletonList("http://example.org/"));
35 |         WarcTargetRecord record = new WarcTargetRecord(MessageVersion.WARC_1_0, new MessageHeaders(headers), MessageBody.empty()) {
36 |         };
37 |         assertEquals("http://example.org/", record.target());
38 |         assertEquals(URI.create("http://example.org/"), record.targetURI());
39 |     }
40 | }
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/apitests/MediaTypeTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc.apitests;
7 |
8 | import org.junit.Test;
9 | import org.netpreserve.jwarc.MediaType;
10 |
11 | import static org.junit.Assert.*;
12 |
13 | /** API-level tests for {@link MediaType}: strict parsing, lenient parsing and equality. */
14 | public class MediaTypeTest {
15 |
16 |     /**
17 |      * Covers parameter parsing and ordering in toString(), case-insensitive comparison of type
18 |      * and parameter names, case-sensitive parameter values, base(), and duplicate parameters
19 |      * (the first occurrence wins).
20 |      */
21 |     @Test
22 |     public void test() {
23 |         MediaType type = MediaType.parse("text/html; charset=\"foo\\\" bar\";foo=bar ;b=c");
24 |         assertEquals("text/html;b=c;charset=\"foo\\\" bar\";foo=bar", type.toString());
25 |         assertEquals("foo\" bar", type.parameters().get("charset"));
26 |         assertEquals("text", type.type());
27 |         assertEquals("html", type.subtype());
28 |         assertEquals("text/html", type.base().toString());
29 |         assertEquals(MediaType.parse("text/html"), MediaType.parse("teXT/htML"));
30 |         assertEquals(MediaType.parse("text/html;charset=utf-8"), MediaType.parse("teXT/htML ;\tCHARsET=utf-8"));
31 |         assertEquals(MediaType.parse("text/html;charset=utf-8").hashCode(), MediaType.parse("teXT/htML ;\tCHARsET=utf-8").hashCode());
32 |         assertNotEquals(MediaType.parse("text/html;chartset=utf-8"), MediaType.parse("text/html;chartset=UTF-8"));
33 |         assertEquals(MediaType.parse("text/html"), MediaType.parse("teXT/htML ;\tCHARsET=utf-8").base());
34 |         assertTrue(type.base().parameters().isEmpty());
35 |         assertEquals("one", MediaType.parse("text/html;CHARSET=one;charset=two;charset=three").parameters().get("charset"));
36 |     }
37 |
38 |     /**
39 |      * Lenient parsing keeps the well-formed parameters, marks the result invalid and still
40 |      * exposes the original input through raw().
41 |      */
42 |     @Test
43 |     public void testParseLeniently() {
44 |         {
45 |             MediaType mediaType = MediaType.parseLeniently("text/html;ISO-8859-1;a\0=2;ok=ok");
46 |             assertFalse(mediaType.isValid());
47 |             assertEquals("text/html;ok=ok", mediaType.toString());
48 |             assertEquals(1, mediaType.parameters().size());
49 |             assertEquals("ok", mediaType.parameters().get("ok"));
50 |             // previously a bare equals() call whose result was discarded, so nothing was checked
51 |             assertEquals("text/html;ISO-8859-1;a\0=2;ok=ok", mediaType.raw());
52 |         }
53 |         assertEquals("bog\0us", MediaType.parseLeniently("bog\0us").toString());
54 |         assertEquals("\0/\0", MediaType.parseLeniently("\0/\0").toString());
55 |         assertEquals("", MediaType.parseLeniently("").toString());
56 |     }
57 |
58 |     /** The input accepted by parseLeniently() must be rejected by the strict parser. */
59 |     @Test(expected = IllegalArgumentException.class)
60 |     public void strictParsingShouldThrow() {
61 |         MediaType.parse("text/html;ISO-8859-1;a\0=2;ok=ok");
62 |     }
63 |
64 | }
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/apitests/MessageVersionTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc.apitests;
7 |
8 | import org.junit.Test;
9 | import org.netpreserve.jwarc.MessageVersion;
10 |
11 | import java.util.HashMap;
12 | import java.util.Map;
13 |
14 | import static org.junit.Assert.assertEquals;
15 |
16 | /** API-level tests for {@link MessageVersion}. */
17 | public class MessageVersionTest {
18 |
19 |     /**
20 |      * Versions must be usable as hash map keys and expose their protocol, major and minor
21 |      * components as well as the combined string form.
22 |      */
23 |     @Test
24 |     public void test() {
25 |         // type arguments restored; the listing had lost them, leaving a raw Map
26 |         Map<MessageVersion, Integer> map = new HashMap<>();
27 |         map.put(MessageVersion.HTTP_1_0, 10);
28 |         map.put(MessageVersion.HTTP_1_1, 11);
29 |         assertEquals(10, (int) map.get(MessageVersion.HTTP_1_0));
30 |         assertEquals("HTTP", MessageVersion.HTTP_1_0.getProtocol());
31 |         assertEquals(1, MessageVersion.HTTP_1_0.getMajor());
32 |         assertEquals(0, MessageVersion.HTTP_1_0.getMinor());
33 |         assertEquals("HTTP/1.0", MessageVersion.HTTP_1_0.toString());
34 |     }
35 |
36 | }
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/apitests/WarcContinuationTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc.apitests;
7 |
8 | import org.junit.Test;
9 | import org.netpreserve.jwarc.WarcContinuation;
10 | import org.netpreserve.jwarc.WarcReader;
11 | import org.netpreserve.jwarc.WarcResponse;
12 |
13 | import java.io.ByteArrayInputStream;
14 | import java.io.IOException;
15 | import java.net.URI;
16 | import java.util.Optional;
17 |
18 | import static java.nio.charset.StandardCharsets.UTF_8;
19 | import static org.junit.Assert.assertEquals;
20 |
21 | /**
22 |  * API-level tests for {@link WarcContinuation}: reading a segmented response and its
23 |  * continuation record, and building a continuation record.
24 |  *
25 |  * NOTE(review): the WARC-Record-ID and WARC-Segment-Origin-ID values had been stripped from
26 |  * this listing. They are restored here from the continuation example in the WARC
27 |  * specification, which these fixtures otherwise match - confirm against upstream.
28 |  */
29 | public class WarcContinuationTest {
30 |
31 |     /** First segment: a response record carrying WARC-Segment-Number 1. */
32 |     final static String continuation1 = "WARC/1.0\r\n" +
33 |             "WARC-Type: response\r\n" +
34 |             "WARC-Target-URI: http://www.archive.org/images/logoc.jpg\r\n" +
35 |             "WARC-Date: 2006-09-19T17:20:24Z\r\n" +
36 |             "WARC-Block-Digest: sha1:2ASS7ZUZY6ND6CCHXETFVJDENAWF7KQ2\r\n" +
37 |             "WARC-Payload-Digest: sha1:CCHXETFVJD2MUZY6ND6SS7ZENMWF7KQ2\r\n" +
38 |             "WARC-IP-Address: 207.241.233.58\r\n" +
39 |             "WARC-Record-ID: <urn:uuid:39509228-ae2f-11b2-763a-aa4c6ec90bb0>\r\n" +
40 |             "WARC-Segment-Number: 1\r\n" +
41 |             "Content-Type: application/http;msgtype=response\r\n" +
42 |             "Content-Length: 1600\r\n" +
43 |             "\r\n" +
44 |             "HTTP/1.1 200 OK\r\n" +
45 |             "Date: Tue, 19 Sep 2006 17:18:40 GMT\r\n" +
46 |             "Server: Apache/2.0.54 (Ubuntu)\r\n" +
47 |             "Last-Modified: Mon, 16 Jun 2003 22:28:51 GMT\r\n" +
48 |             "ETag: \"3e45-67e-2ed02ec0\"\r\n" +
49 |             "Accept-Ranges: bytes\r\n" +
50 |             "Content-Length: 1662\r\n" +
51 |             "Connection: close\r\n" +
52 |             "Content-Type: image/jpeg\r\n" +
53 |             "\r\n" +
54 |             "[first 1360 bytes of image/jpeg binary data here]";
55 |
56 |     /** Second segment: a continuation record whose origin ID is the first segment's record ID. */
57 |     final static String continuation2 = "WARC/1.0\r\n" +
58 |             "WARC-Type: continuation\r\n" +
59 |             "WARC-Target-URI: http://www.archive.org/images/logoc.jpg\r\n" +
60 |             "WARC-Date: 2006-09-19T17:20:24Z\r\n" +
61 |             "WARC-Block-Digest: sha1:T7HXETFVA92MSS7ZENMFZY6ND6WF7KB7\r\n" +
62 |             "WARC-Record-ID: <urn:uuid:70653950-a77f-b212-e434-7a7c6ec909ef>\r\n" +
63 |             "WARC-Segment-Origin-ID: <urn:uuid:39509228-ae2f-11b2-763a-aa4c6ec90bb0>\r\n" +
64 |             "WARC-Segment-Number: 2\r\n" +
65 |             "WARC-Segment-Total-Length: 1902\r\n" +
66 |             "WARC-Identified-Payload-Type: image/jpeg\r\n" +
67 |             "Content-Length: 302\r\n" +
68 |             "\r\n" +
69 |             "[last 302 bytes of image/jpeg binary data here]";
70 |
71 |
72 |     /**
73 |      * Parses both segments and checks the segment numbers, the total length and that the
74 |      * continuation's origin ID equals the response's record ID.
75 |      */
76 |     @Test
77 |     public void test() throws IOException {
78 |         WarcResponse response = (WarcResponse) new WarcReader(new ByteArrayInputStream(continuation1.getBytes(UTF_8))).next().get();
79 |         assertEquals(Optional.of(1L), response.segmentNumber());
80 |
81 |         WarcContinuation continuation = (WarcContinuation) new WarcReader(new ByteArrayInputStream(continuation2.getBytes(UTF_8))).next().get();
82 |         assertEquals(response.id(), continuation.segmentOriginId());
83 |         assertEquals(Optional.of(2L), continuation.segmentNumber());
84 |         assertEquals(Optional.of(1902L), continuation.segmentTotalLength());
85 |     }
86 |
87 |     /** Values given to the builder must be readable from the built record. */
88 |     @Test
89 |     public void builder() {
90 |         URI id = URI.create("urn:uuid:70653950-a77f-b212-e434-7a7c6ec909ef");
91 |         WarcContinuation continuation = new WarcContinuation.Builder()
92 |                 .segmentOriginId(id)
93 |                 .segmentNumber(3)
94 |                 .segmentTotalLength(1024)
95 |                 .build();
96 |         assertEquals("continuation", continuation.type());
97 |         assertEquals(id, continuation.segmentOriginId());
98 |         assertEquals(Optional.of(3L), continuation.segmentNumber());
99 |         assertEquals(Optional.of(1024L), continuation.segmentTotalLength());
100 |     }
101 |
102 | }
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/apitests/WarcConversionTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc.apitests;
7 |
8 | import org.junit.Assert;
9 | import org.junit.Test;
10 | import org.netpreserve.jwarc.MediaType;
11 | import org.netpreserve.jwarc.WarcConversion;
12 | import org.netpreserve.jwarc.WarcReader;
13 |
14 | import java.io.ByteArrayInputStream;
15 | import java.io.IOException;
16 | import java.net.URI;
17 |
18 | import static java.nio.charset.StandardCharsets.UTF_8;
19 | import static org.junit.Assert.assertEquals;
20 |
21 | /**
22 |  * Tests for {@code conversion} records: reading one from WARC text and creating one via the builder.
23 |  */
24 | public class WarcConversionTest {
25 |
26 |     // WARC-Refers-To carries the URI that test() expects back from refersTo().
27 |     // NOTE(review): WARC-Record-ID is empty in this copy of the literal and no assertion in this
28 |     // class pins its value; confirm it against the upstream file.
29 |     final static String warc = "WARC/1.0\r\n" +
30 |             "WARC-Type: conversion\r\n" +
31 |             "WARC-Target-URI: http://www.archive.org/images/logoc.jpg\r\n" +
32 |             "WARC-Date: 2016-09-19T19:00:40Z\r\n" +
33 |             "WARC-Record-ID: \r\n" +
34 |             "WARC-Refers-To: <urn:uuid:92283950-ef2f-4d72-b224-f54c6ec90bb0>\r\n" +
35 |             "WARC-Block-Digest: sha1:XQMRY75YY42ZWC6JAT6KNXKD37F7MOEK\r\n" +
36 |             "Content-Type: image/neoimg\r\n" +
37 |             "Content-Length: 934\r\n" +
38 |             "\r\n" +
39 |             "[image/neoimg binary data here]";
40 |
41 |     /** Parses the sample record and checks the referenced id, body size, content type and target URI. */
42 |     @Test
43 |     public void test() throws IOException {
44 |         WarcConversion conversion = (WarcConversion) new WarcReader(new ByteArrayInputStream(warc.getBytes(UTF_8))).next().get();
45 |         assertEquals(URI.create("urn:uuid:92283950-ef2f-4d72-b224-f54c6ec90bb0"), conversion.refersTo().get());
46 |         assertEquals(934, conversion.body().size());
47 |         assertEquals(MediaType.parse("image/neoimg"), conversion.contentType());
48 |         assertEquals(URI.create("http://www.archive.org/images/logoc.jpg"), conversion.targetURI());
49 |     }
50 |
51 |     /** Builds a conversion record and checks its type and the reference set on the builder. */
52 |     @Test
53 |     public void builder() throws IOException {
54 |         URI reference = URI.create("urn:uuid:92283950-ef2f-4d72-b224-f54c6ec90bb0");
55 |         WarcConversion conversion = new WarcConversion.Builder()
56 |                 .refersTo(reference)
57 |                 .build();
58 |         assertEquals("conversion", conversion.type());
59 |         assertEquals(reference, conversion.refersTo().get());
60 |     }
61 | }
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/apitests/WarcFilterTest.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc.apitests;
2 |
3 | import org.junit.Test;
4 | import org.netpreserve.jwarc.HttpResponse;
5 | import org.netpreserve.jwarc.WarcRequest;
6 | import org.netpreserve.jwarc.WarcResponse;
7 |
8 | import java.io.IOException;
9 | import java.net.URI;
10 | import java.text.ParseException;
11 |
12 | import static org.junit.Assert.*;
13 | import static org.netpreserve.jwarc.WarcFilter.compile;
14 |
15 | /**
16 |  * Evaluates a series of filter expressions against one response record and checks which of
17 |  * them match.
18 |  */
19 | public class WarcFilterTest {
20 |
21 |     @Test
22 |     public void test() throws ParseException, IOException {
23 |         HttpResponse chunkedOk = new HttpResponse.Builder(200, "OK")
24 |                 .setHeader("Transfer-Encoding", "chunked")
25 |                 .build();
26 |         WarcResponse record = new WarcResponse.Builder(URI.create("http://example.org/"))
27 |                 .setHeader("five", "5")
28 |                 .body(chunkedOk)
29 |                 .build();
30 |
31 |         accepts(record, "WARC-Type == \"response\"");
32 |         accepts(record, "warc-typE== \t \"response\"");
33 |         rejects(record, "WARC-Type != \"response\"");
34 |         accepts(record, "WARC-Target-URI =~ \"http:.*\"");
35 |         rejects(record, "WARC-Target-URI =~ \"org\"");
36 |         rejects(record, "WARC-Target-URI !~ \"http:.*\"");
37 |         accepts(record, "content-length < 500");
38 |         accepts(record, "warc-type <= 500");
39 |         accepts(record, "five >= 5");
40 |         accepts(record, "five == 5");
41 |         accepts(record, ":status == 200");
42 |         accepts(record, "http:transfer-encoding == \"chunked\"");
43 |         accepts(record, "(((five >= 5)))");
44 |         rejects(record, "!(five >= 5)");
45 |         rejects(record, "five > 5");
46 |         accepts(record, "five > 10 || five > 11 || five <= 5");
47 |         rejects(record, "five < 10 && five > 10");
48 |         accepts(record, "(five < 10 || five > 10) && five == \"5\"");
49 |         rejects(record, "(five > 100) && five < 10");
50 |         rejects(record, "(five < 10) && five > 100");
51 |         rejects(record, "(five < 10 || five > 10) && five > 100");
52 |     }
53 |
54 |     /** Asserts that the compiled expression matches the record. */
55 |     private static void accepts(WarcResponse record, String expression) throws ParseException {
56 |         assertTrue(compile(expression).test(record));
57 |     }
58 |
59 |     /** Asserts that the compiled expression does not match the record. */
60 |     private static void rejects(WarcResponse record, String expression) throws ParseException {
61 |         assertFalse(compile(expression).test(record));
62 |     }
63 |
64 | }
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/apitests/WarcMetadataTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc.apitests;
7 |
8 | import org.junit.Assert;
9 | import org.junit.Test;
10 | import org.netpreserve.jwarc.MediaType;
11 | import org.netpreserve.jwarc.WarcMetadata;
12 | import org.netpreserve.jwarc.WarcReader;
13 |
14 | import java.io.ByteArrayInputStream;
15 | import java.io.IOException;
16 | import java.util.Arrays;
17 | import java.util.HashMap;
18 | import java.util.List;
19 | import java.util.Map;
20 |
21 | import static java.nio.charset.StandardCharsets.UTF_8;
22 | import static org.junit.Assert.assertEquals;
23 |
24 | /**
25 |  * Tests for {@code metadata} records: reading the fields of a parsed record and building a
26 |  * record from a map of fields.
27 |  */
28 | public class WarcMetadataTest {
29 |     // NOTE(review): WARC-Record-ID and WARC-Concurrent-To are empty in this copy of the literal
30 |     // and no assertion in this class pins their values; confirm against the upstream file.
31 |     final static String warc = "WARC/1.1\r\n" +
32 |             "WARC-Type: metadata\r\n" +
33 |             "WARC-Target-URI: http://www.archive.org/images/logoc.jpg\r\n" +
34 |             "WARC-Date: 2016-09-19T17:20:24Z\r\n" +
35 |             "WARC-Record-ID: \r\n" +
36 |             "WARC-Concurrent-To: \r\n" +
37 |             "Content-Type: application/warc-fields\r\n" +
38 |             "WARC-Block-Digest: sha1:VXT4AF5BBZVHDYKNC2CSM8TEAWDB6CH8\r\n" +
39 |             "Content-Length: 59\r\n" +
40 |             "\r\n" +
41 |             "via: http://www.archive.org/\r\n" +
42 |             "hopsFromSeed: E\r\n" +
43 |             "fetchTimeMs: 565";
44 |
45 |     /** Parses the sample record and checks the {@code via} field of its body. */
46 |     @Test
47 |     public void test() throws IOException {
48 |         WarcMetadata metadata = (WarcMetadata) new WarcReader(new ByteArrayInputStream(warc.getBytes(UTF_8))).next().get();
49 |         assertEquals("http://www.archive.org/", metadata.fields().sole("via").get());
50 |     }
51 |
52 |     /** Builds a record from a multi-valued field map and checks content type and field lookups. */
53 |     @Test
54 |     public void builder() throws IOException {
55 |         // Field name -> all values for that name, in order.
56 |         Map<String, List<String>> fields = new HashMap<>();
57 |         fields.put("hello", Arrays.asList("one", "two"));
58 |         WarcMetadata metadata = new WarcMetadata.Builder()
59 |                 .fields(fields)
60 |                 .build();
61 |         Assert.assertEquals(MediaType.WARC_FIELDS, metadata.contentType());
62 |         assertEquals("one", metadata.fields().first("hello").get());
63 |         assertEquals(Arrays.asList("one", "two"), metadata.fields().all("hello"));
64 |     }
65 |
66 | }
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/apitests/WarcRequestTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc.apitests;
7 |
8 | import org.junit.Assert;
9 | import org.junit.Test;
10 | import org.netpreserve.jwarc.*;
11 |
12 | import java.io.BufferedReader;
13 | import java.io.ByteArrayInputStream;
14 | import java.io.IOException;
15 | import java.net.URI;
16 | import java.nio.ByteBuffer;
17 | import java.nio.channels.Channels;
18 | import java.nio.charset.StandardCharsets;
19 | import java.util.Arrays;
20 | import java.util.Collections;
21 | import java.util.Optional;
22 |
23 | import static java.nio.charset.StandardCharsets.UTF_8;
24 | import static org.junit.Assert.assertEquals;
25 |
26 | /**
27 |  * Tests for {@code request} records: header parsing, the builder, and the interaction between
28 |  * {@code http()} and {@code body()}.
29 |  */
30 | public class WarcRequestTest {
31 |
32 |     // WARC-Concurrent-To carries the URI that test() expects back from concurrentTo().
33 |     // NOTE(review): WARC-Warcinfo-ID and WARC-Record-ID are empty in this copy of the literal and
34 |     // no assertion in this class pins their values; confirm against the upstream file.
35 |     final static String warc = "WARC/1.1\r\n" +
36 |             "WARC-Type: request\r\n" +
37 |             "WARC-Target-URI: http://www.archive.org/images/logoc.jpg\r\n" +
38 |             "WARC-Warcinfo-ID: \r\n" +
39 |             "WARC-Date: 2016-09-19T17:20:24Z\r\n" +
40 |             "Content-Length: 242\r\n" +
41 |             "WARC-Record-ID: \r\n" +
42 |             "Content-Type: application/http;msgtype=request\r\n" +
43 |             "WARC-Concurrent-To: <urn:uuid:92283950-ef2f-4d72-b224-f54c6ec90bb0>\r\n" +
44 |             "\r\n" +
45 |             "GET /images/logoc.jpg HTTP/1.0\r\n" +
46 |             "User-Agent: Mozilla/5.0 (compatible; heritrix/1.10.0)\r\n" +
47 |             "From: stack@example.org\r\n" +
48 |             "Connection: close\r\n" +
49 |             "Referer: http://www.archive.org/\r\n" +
50 |             "Host: www.archive.org\r\n" +
51 |             "Cookie: PHPSESSID=009d7bb11022f80605aa87e18224d824\r\n\r\n\r\n";
52 |
53 |     /** Parses the sample record and checks WARC-level and HTTP-level headers. */
54 |     @Test
55 |     public void test() throws IOException {
56 |         WarcRequest request = sampleRequest();
57 |         assertEquals(Collections.singletonList(URI.create("urn:uuid:92283950-ef2f-4d72-b224-f54c6ec90bb0")), request.concurrentTo());
58 |         Assert.assertEquals(MediaType.HTTP_REQUEST, request.contentType());
59 |         Assert.assertEquals(MessageVersion.WARC_1_1, request.version());
60 |         assertEquals(MessageVersion.HTTP_1_0, request.http().version());
61 |         assertEquals(Optional.of("close"), request.http().headers().sole("connection"));
62 |     }
63 |
64 |     /** Checks that repeated concurrentTo() calls on the builder accumulate in call order. */
65 |     @Test
66 |     public void builder() throws IOException {
67 |         WarcRequest request = new WarcRequest.Builder(URI.create("http://example.org/"))
68 |                 .concurrentTo(URI.create("id:1"))
69 |                 .concurrentTo(URI.create("id:2"))
70 |                 .build();
71 |         assertEquals(Arrays.asList(URI.create("id:1"), URI.create("id:2")), request.concurrentTo());
72 |     }
73 |
74 |     /** After http() has been called, the body must still be readable from position 0. */
75 |     @Test
76 |     public void callingHttpShouldNotCorruptBody() throws IOException {
77 |         WarcRequest request = sampleRequest();
78 |         request.http();
79 |         assertEquals(0, request.body().position());
80 |         String line = new BufferedReader(Channels.newReader(request.body(), UTF_8.name())).readLine();
81 |         assertEquals("GET /images/logoc.jpg HTTP/1.0", line);
82 |     }
83 |
84 |     /** Reads the first record of the sample WARC text as a request record. */
85 |     private WarcRequest sampleRequest() throws IOException {
86 |         return (WarcRequest) new WarcReader(new ByteArrayInputStream(warc.getBytes(UTF_8))).next().get();
87 |     }
88 |
89 |     /** Consuming body bytes first is expected to make a later http() call fail. */
90 |     @Test(expected = IllegalStateException.class)
91 |     public void readingBodyShouldInvalidateHttp() throws IOException {
92 |         WarcRequest request = sampleRequest();
93 |         request.body().read(ByteBuffer.allocate(1));
94 |         request.http();
95 |     }
96 | }
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/apitests/WarcResourceTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc.apitests;
7 |
8 | import org.junit.Test;
9 | import org.netpreserve.jwarc.WarcResource;
10 |
11 | import static org.junit.Assert.*;
12 |
13 | /** Smoke test for building a {@code resource} record. */
14 | public class WarcResourceTest {
15 |
16 |     /** A record built without any further configuration reports the type "resource". */
17 |     @Test
18 |     public void builder() {
19 |         WarcResource.Builder emptyBuilder = new WarcResource.Builder();
20 |         WarcResource built = emptyBuilder.build();
21 |         assertEquals("resource", built.type());
22 |     }
23 |
24 | }
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/apitests/WarcRevisitTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc.apitests;
7 |
8 | import org.junit.Test;
9 | import org.netpreserve.jwarc.WarcReader;
10 | import org.netpreserve.jwarc.WarcRevisit;
11 |
12 | import java.io.ByteArrayInputStream;
13 | import java.io.IOException;
14 | import java.net.URI;
15 | import java.time.Instant;
16 | import java.util.Optional;
17 |
18 | import static java.nio.charset.StandardCharsets.UTF_8;
19 | import static org.junit.Assert.assertEquals;
20 | import static org.junit.Assert.assertFalse;
21 |
22 | /**
23 |  * Tests for {@code revisit} records: parsing a server-not-modified revisit and building
24 |  * revisits with and without a referenced record id.
25 |  */
26 | public class WarcRevisitTest {
27 |     // WARC-Refers-To carries the URI that test() expects back from refersTo().
28 |     // NOTE(review): WARC-Record-ID is empty in this copy of the literal and no assertion in this
29 |     // class pins its value; confirm against the upstream file.
30 |     final static String warc = "WARC/1.1\r\n" +
31 |             "WARC-Type: revisit\r\n" +
32 |             "WARC-Target-URI: http://www.archive.org/images/logoc.jpg\r\n" +
33 |             "WARC-Date: 2017-06-23T12:43:35Z\r\n" +
34 |             "WARC-Profile: http://netpreserve.org/warc/1.1/revisit/server-not-modified\r\n" +
35 |             "WARC-Record-ID: \r\n" +
36 |             "WARC-Refers-To: <urn:uuid:92283950-ef2f-4d72-b224-f54c6ec90bb0>\r\n" +
37 |             "WARC-Refers-To-Target-URI: http://www.archive.org/images/logoc.jpg\r\n" +
38 |             "WARC-Refers-To-Date: 2016-09-19T17:20:24Z\r\n" +
39 |             "Content-Type: message/http\r\n" +
40 |             "Content-Length: 202\r\n" +
41 |             "\r\n" +
42 |             "HTTP/1.0 304 Not Modified\r\n" +
43 |             "Date: Tue, 06 Mar 2017 00:43:35 GMT\r\n" +
44 |             "Server: Apache/2.0.54 (Ubuntu) PHP/5.0.5-2ubuntu1.4 Connection: Keep-Alive\r\n" +
45 |             "Keep-Alive: timeout=15, max=100\r\n" +
46 |             "ETag: \"3e45-67e-2ed02ec0\"\r\n" +
47 |             "\r\n" +
48 |             "this line should not be read";
49 |
50 |     /** Parses the sample record and checks profile, reference headers, HTTP status and payload absence. */
51 |     @Test
52 |     public void test() throws IOException {
53 |         WarcRevisit revisit = (WarcRevisit) new WarcReader(new ByteArrayInputStream(warc.getBytes(UTF_8))).next().get();
54 |         assertEquals(WarcRevisit.SERVER_NOT_MODIFIED_1_1, revisit.profile());
55 |         assertEquals(Instant.parse("2016-09-19T17:20:24Z"), revisit.refersToDate().get());
56 |         assertEquals(URI.create("http://www.archive.org/images/logoc.jpg"), revisit.refersToTargetURI().get());
57 |         assertEquals(URI.create("urn:uuid:92283950-ef2f-4d72-b224-f54c6ec90bb0"), revisit.refersTo().get());
58 |         assertEquals(304, revisit.http().status());
59 |         assertEquals(Optional.of("timeout=15, max=100"), revisit.http().headers().sole("Keep-Alive"));
60 |         assertFalse(revisit.payload().isPresent());
61 |     }
62 |
63 |     /** A null record id passed to refersTo() leaves refersTo() empty while target URI and date are kept. */
64 |     @Test
65 |     public void buildingWithoutRefersToRecordId() {
66 |         WarcRevisit revisit = new WarcRevisit.Builder(URI.create("http://example.org/"),
67 |                 WarcRevisit.IDENTICAL_PAYLOAD_DIGEST_1_1)
68 |                 .refersTo((URI)null, URI.create("http://example.org/other"), Instant.parse("2016-09-19T17:20:24Z"))
69 |                 .build();
70 |         assertEquals(Optional.empty(), revisit.refersTo());
71 |         assertEquals(Optional.of(URI.create("http://example.org/other")), revisit.refersToTargetURI());
72 |         assertEquals(Optional.of(Instant.parse("2016-09-19T17:20:24Z")), revisit.refersToDate());
73 |     }
74 |
75 |     /** Builds a revisit with a full reference and checks every value set on the builder. */
76 |     @Test
77 |     public void builder() throws IOException {
78 |         URI target = URI.create("http://example.org/");
79 |         Instant date = Instant.now();
80 |         URI reference = URI.create("urn:uuid:92283950-ef2f-4d72-b224-f54c6ec90bb0");
81 |         WarcRevisit revisit = new WarcRevisit.Builder(target, WarcRevisit.IDENTICAL_PAYLOAD_DIGEST_1_1)
82 |                 .refersTo(reference, target, date)
83 |                 .build();
84 |         assertEquals(WarcRevisit.IDENTICAL_PAYLOAD_DIGEST_1_1, revisit.profile());
85 |         assertEquals(target, revisit.targetURI());
86 |         assertEquals(date, revisit.refersToDate().get());
87 |         assertEquals(target, revisit.refersToTargetURI().get());
88 |         assertEquals(reference, revisit.refersTo().get());
89 |     }
90 |
91 | }
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/apitests/WarcinfoTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2018 National Library of Australia and the jwarc contributors
4 | */
5 |
6 | package org.netpreserve.jwarc.apitests;
7 |
8 | import org.junit.Assert;
9 | import org.junit.Test;
10 | import org.netpreserve.jwarc.MediaType;
11 | import org.netpreserve.jwarc.MessageHeaders;
12 | import org.netpreserve.jwarc.WarcReader;
13 | import org.netpreserve.jwarc.Warcinfo;
14 |
15 | import java.io.ByteArrayInputStream;
16 | import java.io.IOException;
17 | import java.net.URI;
18 | import java.time.Instant;
19 | import java.util.Arrays;
20 | import java.util.HashMap;
21 | import java.util.List;
22 | import java.util.Map;
23 |
24 | import static java.nio.charset.StandardCharsets.UTF_8;
25 | import static org.junit.Assert.assertEquals;
26 |
27 | /**
28 |  * Tests for {@code warcinfo} records: parsing headers (including a folded header) and body
29 |  * fields, and building a record from a map of fields.
30 |  */
31 | public class WarcinfoTest {
32 |     // WARC-Record-ID carries the URI that test() expects back from id().
33 |     final static String warc = "WARC/1.0\r\n" +
34 |             "WARC-Type: warcinfo\r\n" +
35 |             "WARC-Date: 2006-09-19T17:20:14Z\r\n" +
36 |             "WARC-Record-ID: <urn:uuid:d7ae5c10-e6b3-4d27-967d-34780c58ba39>\r\n" +
37 |             "WARC-Filename:hello.warc\r\n" +
38 |             "Content-Type: application/warc-fields\r\n" +
39 |             "Content-Length: 399\r\n" +
40 |             "Folded: a \r\n" +
41 |             " b\t \r\n" +
42 |             "\t\tc \r\n" +
43 |             "\r\n" +
44 |             "software: Heritrix 1.12.0 http://crawler.archive.org\r\n" +
45 |             "hostname: crawling017.archive.org\r\n" +
46 |             "ip: 207.241.227.234\r\n" +
47 |             "isPartOf: testcrawl-20050708\r\n" +
48 |             "description: testcrawl with WARC output\r\n" +
49 |             "operator: IA\\_Admin\r\n" +
50 |             "http-header-user-agent:\r\n" +
51 |             " Mozilla/5.0 (compatible; heritrix/1.4.0 +http://crawler.archive.org)\r\n" +
52 |             "format: WARC file version 1.0\r\n" +
53 |             "conformsTo:\r\n" +
54 |             " http://www.archive.org/documents/WarcFileFormat-1.0.html\r\n\r\n";
55 |
56 |     /** Parses the sample record and checks its id, date, filename, body size, fields and folded header. */
57 |     @Test
58 |     public void test() throws IOException {
59 |         Warcinfo warcinfo = (Warcinfo) new WarcReader(new ByteArrayInputStream(warc.getBytes(UTF_8))).next().get();
60 |         assertEquals(URI.create("urn:uuid:d7ae5c10-e6b3-4d27-967d-34780c58ba39"), warcinfo.id());
61 |         assertEquals(Instant.parse("2006-09-19T17:20:14Z"), warcinfo.date());
62 |         assertEquals("hello.warc", warcinfo.filename().get());
63 |         assertEquals(399, warcinfo.body().size());
64 |         Assert.assertEquals(MediaType.WARC_FIELDS, warcinfo.contentType());
65 |         MessageHeaders fields = warcinfo.fields();
66 |         assertEquals("207.241.227.234", fields.sole("ip").get());
67 |         assertEquals("http://www.archive.org/documents/WarcFileFormat-1.0.html", fields.sole("conformsTo").get());
68 |         assertEquals("a b c", warcinfo.headers().sole("Folded").get());
69 |     }
70 |
71 |     /** Builds a record with a filename and a multi-valued field map and checks what it reports back. */
72 |     @Test
73 |     public void builder() throws IOException {
74 |         // Field name -> all values for that name, in order.
75 |         Map<String, List<String>> fields = new HashMap<>();
76 |         fields.put("hello", Arrays.asList("one", "two"));
77 |         Warcinfo warcinfo = new Warcinfo.Builder()
78 |                 .filename("hello.warc")
79 |                 .fields(fields)
80 |                 .build();
81 |         assertEquals("warcinfo", warcinfo.type());
82 |         assertEquals("hello.warc", warcinfo.filename().get());
83 |         assertEquals("one", warcinfo.fields().first("hello").get());
84 |         assertEquals(Arrays.asList("one", "two"), warcinfo.fields().all("hello"));
85 |         assertEquals(MediaType.WARC_FIELDS, warcinfo.contentType());
86 |     }
87 |
88 | }
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/cdx/CdxReaderTest.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc.cdx;
2 |
3 | import org.junit.Test;
4 | import org.netpreserve.jwarc.MediaType;
5 |
6 | import java.io.BufferedReader;
7 | import java.io.IOException;
8 | import java.io.StringReader;
9 | import java.time.Instant;
10 |
11 | import static org.junit.Assert.*;
12 |
13 | /** Reads a three-line CDX sample and checks the parsed values of the first line. */
14 | public class CdxReaderTest {
15 |     @Test
16 |     public void test() throws IOException {
17 |         String firstLine = "- 20220302214434 http://example.org/ text/html 200 AQLNJ7DOPHK477BWWC726H7Y5XBPBNF7 - - 1062 760582405 example.warc.gz\n";
18 |         String secondLine = "- 20220302214433 https://example.org/page/ application/rss+xml 200 AQO24VNPMHIM6GUNVSCP7IUUETZ4U52J - - 971 760584354 example.warc.gz\n";
19 |         String thirdLine = "- 20220302214434 https://example.org/style.css text/css 200 AG2PTU7G6DMXCBP6IBSR5VG5RUMYOHHN - - 749 760586303 example.warc.gz\n";
20 |         String cdxText = firstLine + secondLine + thirdLine;
21 |
22 |         try (CdxReader cdx = new CdxReader(new BufferedReader(new StringReader(cdxText)))) {
23 |             CdxRecord first = cdx.next().get();
24 |             assertEquals(200, (int) first.status());
25 |             assertEquals("http://example.org/", first.target());
26 |             assertEquals("AQLNJ7DOPHK477BWWC726H7Y5XBPBNF7", first.digest());
27 |             assertEquals(760582405, (long) first.position());
28 |             assertEquals(1062, (long) first.size());
29 |             assertEquals("example.warc.gz", first.filename());
30 |             assertEquals(Instant.parse("2022-03-02T21:44:34Z"), first.date());
31 |             assertEquals(MediaType.HTML, first.contentType());
32 |
33 |             // Two more records remain, after which the reader is exhausted.
34 |             for (int remaining = 2; remaining > 0; remaining--) {
35 |                 assertTrue(cdx.next().isPresent());
36 |             }
37 |             assertFalse(cdx.next().isPresent());
38 |         }
39 |     }
40 | }
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/cdx/CdxWriterTest.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc.cdx;
2 |
3 | import org.junit.Rule;
4 | import org.junit.Test;
5 | import org.junit.rules.TemporaryFolder;
6 | import org.netpreserve.jwarc.*;
7 |
8 | import java.io.IOException;
9 | import java.io.StringWriter;
10 | import java.nio.file.Files;
11 | import java.nio.file.Path;
12 | import java.time.Instant;
13 | import java.util.Collections;
14 |
15 | import static java.nio.file.StandardOpenOption.CREATE;
16 | import static java.nio.file.StandardOpenOption.WRITE;
17 | import static org.junit.Assert.assertEquals;
18 |
19 | /**
20 |  * Writes a response and a revisit record to a temporary WARC file, indexes that file with
21 |  * {@link CdxWriter} and compares the generated CDX text with the expected lines.
22 |  */
23 | public class CdxWriterTest {
24 |     @Rule
25 |     public TemporaryFolder temporaryFolder = new TemporaryFolder();
26 |
27 |     @Test
28 |     public void test() throws IOException {
29 |         Path warcPath = temporaryFolder.newFile().toPath().toAbsolutePath();
30 |         String digestAlgorithm = "sha256";
31 |         String digestValue = "b04af472c47a8b1b5059b3404caac0e1bfb5a3c07b329be66f65cfab5ee8d3f3";
32 |
33 |         try (WarcWriter out = new WarcWriter(Files.newByteChannel(warcPath, CREATE, WRITE))) {
34 |             HttpResponse notFound = new HttpResponse.Builder(404, "Not Found")
35 |                     .body(MediaType.HTML, new byte[0])
36 |                     .build();
37 |
38 |             WarcResponse capture = new WarcResponse.Builder("http://example.org/")
39 |                     .date(Instant.parse("2022-03-01T12:44:34Z"))
40 |                     .body(notFound)
41 |                     .payloadDigest(digestAlgorithm, digestValue)
42 |                     .build();
43 |             out.write(capture);
44 |
45 |             WarcRevisit revisit = new WarcRevisit.Builder("http://example.org/")
46 |                     .date(Instant.parse("2022-03-02T21:44:34Z"))
47 |                     .body(notFound)
48 |                     .payloadDigest(digestAlgorithm, digestValue)
49 |                     .build();
50 |             out.write(revisit);
51 |         }
52 |
53 |         StringWriter cdxText = new StringWriter();
54 |         CdxWriter indexer = new CdxWriter(cdxText);
55 |         indexer.setFormat(new CdxFormat.Builder().digestUnchanged().build());
56 |         indexer.writeHeaderLine();
57 |         indexer.process(Collections.singletonList(warcPath), true);
58 |
59 |         String expected = " CDX N b a m s k r M S V g\n" +
60 |                 "org,example)/ 20220301124434 http://example.org/ text/html 404 sha256:WBFPI4WEPKFRWUCZWNAEZKWA4G73LI6APMZJXZTPMXH2WXXI2PZQ==== - - 398 0 " + warcPath + "\n" +
61 |                 "org,example)/ 20220302214434 http://example.org/ warc/revisit 404 sha256:WBFPI4WEPKFRWUCZWNAEZKWA4G73LI6APMZJXZTPMXH2WXXI2PZQ==== - - 397 398 " + warcPath + "\n";
62 |         assertEquals(expected, cdxText.toString());
63 |     }
64 |
65 | }
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/cdx/JsonTokenizerTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * SPDX-License-Identifier: Apache-2.0
3 | * Copyright (C) 2023 National Library of Australia and the jwarc contributors
4 | */
5 | package org.netpreserve.jwarc.cdx;
6 |
7 | import org.junit.Test;
8 |
9 | import java.io.IOException;
10 | import java.io.StringReader;
11 | import java.util.ArrayList;
12 | import java.util.Arrays;
13 | import java.util.List;
14 |
15 | import static java.util.Collections.singletonList;
16 | import static org.junit.Assert.assertEquals;
17 | import static org.netpreserve.jwarc.cdx.JsonToken.*;
18 |
19 | /**
20 |  * Tests {@link JsonTokenizer} by comparing the token sequence (and, for scalar tokens, the
21 |  * decoded values) it produces for a range of JSON inputs.
22 |  */
23 | public class JsonTokenizerTest {
24 |     /**
25 |      * Collects every token the tokenizer yields for the given JSON text, stopping when
26 |      * {@code nextToken()} returns null.
27 |      */
28 |     static List<JsonToken> tokenize(String json) throws IOException, JsonException {
29 |         List<JsonToken> tokens = new ArrayList<>();
30 |         JsonTokenizer parser = new JsonTokenizer(new StringReader(json));
31 |         for (JsonToken token = parser.nextToken(); token != null; token = parser.nextToken()) {
32 |             tokens.add(token);
33 |         }
34 |         return tokens;
35 |     }
36 |
37 |     /**
38 |      * Like {@link #tokenize(String)}, but replaces STRING tokens with their string value,
39 |      * NUMBER_INT tokens with an Integer and NUMBER_FLOAT tokens with a Double; every other
40 |      * token is kept as the token itself.
41 |      */
42 |     static List<Object> tokenizeValues(String json) throws IOException, JsonException {
43 |         List<Object> values = new ArrayList<>();
44 |         JsonTokenizer parser = new JsonTokenizer(new StringReader(json));
45 |         for (JsonToken token = parser.nextToken(); token != null; token = parser.nextToken()) {
46 |             if (token == STRING) {
47 |                 values.add(parser.stringValue());
48 |             } else if (token == NUMBER_INT) {
49 |                 values.add(Integer.parseInt(parser.stringValue()));
50 |             } else if (token == NUMBER_FLOAT) {
51 |                 values.add(Double.parseDouble(parser.stringValue()));
52 |             } else {
53 |                 values.add(token);
54 |             }
55 |         }
56 |         return values;
57 |     }
58 |
59 |     @Test
60 |     public void test() throws IOException, JsonException {
61 |         assertEquals(Arrays.asList(START_ARRAY, END_ARRAY), tokenize("[]"));
62 |         assertEquals(Arrays.asList(START_ARRAY, NUMBER_INT, END_ARRAY), tokenize("[5]"));
63 |         assertEquals(Arrays.asList(START_ARRAY, NUMBER_INT, NUMBER_INT, END_ARRAY), tokenize("[5, 6]"));
64 |         assertEquals(Arrays.asList(START_ARRAY, NUMBER_INT, NUMBER_FLOAT, END_ARRAY), tokenize(" [ 5,\t\t6.0 ] "));
65 |         assertEquals(Arrays.asList(START_ARRAY, NUMBER_INT, NUMBER_FLOAT, STRING, END_ARRAY), tokenize("[5,6.0,\"foo\"]"));
66 |         assertEquals(Arrays.asList(START_ARRAY, NUMBER_INT, NUMBER_FLOAT, STRING, TRUE, FALSE, NULL, END_ARRAY), tokenize("[5,6.0,\"foo\",true,false,null]"));
67 |         assertEquals(Arrays.asList(START_OBJECT, FIELD_NAME, NUMBER_INT, END_OBJECT), tokenize("{\"foo\":5}"));
68 |         assertEquals(Arrays.asList(START_OBJECT, FIELD_NAME, NUMBER_INT, FIELD_NAME, NUMBER_FLOAT, END_OBJECT), tokenize("{\"foo\":5,\"bar\":6.0}"));
69 |         assertEquals(Arrays.asList(START_OBJECT, FIELD_NAME, NUMBER_INT, FIELD_NAME, NUMBER_FLOAT, FIELD_NAME, STRING, END_OBJECT), tokenize("{\"foo\":5,\"bar\":6.0,\"baz\":\"q\"}"));
70 |         assertEquals(Arrays.asList(START_OBJECT, FIELD_NAME, START_OBJECT, FIELD_NAME, NUMBER_INT, END_OBJECT, END_OBJECT), tokenize("{\"foo\":{\"bar\":5}}"));
71 |         assertEquals(Arrays.asList(START_OBJECT, FIELD_NAME, START_ARRAY, NUMBER_INT, START_ARRAY, END_ARRAY, NUMBER_FLOAT, END_ARRAY, END_OBJECT), tokenize("{\"foo\":[5,[],6.0]}"));
72 |         assertEquals(singletonList(NUMBER_FLOAT), tokenize("0.0"));
73 |         assertEquals(singletonList(NUMBER_FLOAT), tokenize("1e0"));
74 |         assertEquals(singletonList(NUMBER_FLOAT), tokenize("1e+0"));
75 |         assertEquals(singletonList(NUMBER_FLOAT), tokenize("1e-0"));
76 |         assertEquals(singletonList(NUMBER_FLOAT), tokenize("1.0e0"));
77 |         assertEquals(singletonList(NUMBER_FLOAT), tokenize("1.0e+0"));
78 |         assertEquals(singletonList(NUMBER_FLOAT), tokenize("1.0e-0"));
79 |         assertEquals(Arrays.asList(START_ARRAY, 0.0, -0.0, 1.0, 5, END_ARRAY), tokenizeValues("[0.0, -0.0, 1.0, 5]"));
80 |         assertEquals(singletonList(" \t\r\n\0ሴ\"\\/"), tokenizeValues("\" \\t\\r\\n\\u0000\\u1234\\\"\\\\\\/\""));
81 |     }
82 |
83 | }
--------------------------------------------------------------------------------
/test/org/netpreserve/jwarc/net/WarcServerTest.java:
--------------------------------------------------------------------------------
1 | package org.netpreserve.jwarc.net;
2 |
3 | import org.junit.Test;
4 |
5 | import java.io.IOException;
6 | import java.net.ServerSocket;
7 | import java.util.Collections;
8 |
9 | import static org.junit.Assert.*;
10 |
11 | /**
12 |  * Smoke test for {@link WarcServer}: so far it only checks that the server can be instantiated
13 |  * and load the .js files.
14 |  */
15 | public class WarcServerTest {
16 |     @Test
17 |     public void test() throws IOException {
18 |         try (ServerSocket unboundSocket = new ServerSocket()) {
19 |             // The instance is never used; constructing it without an exception is the whole test.
20 |             WarcServer constructed = new WarcServer(unboundSocket, Collections.emptyList());
21 |         }
22 |     }
23 | }
--------------------------------------------------------------------------------