125 | * This method must be called one time, but not again even after a restart of
126 | * WASP.
127 | *
128 | * @throws IOException On initializing the index
129 | */
130 | public void initialize()
131 | throws IOException {
132 | final CreateIndexRequest createIndexRequest = CreateIndexRequest.of(
133 | indexBuilder -> indexBuilder
134 | .index(INDEX_NAME)
135 | .mappings(mappings -> mappings
136 | .properties(ResponseRecord.TYPE_PROPERTIES)));
137 | LOG.info("Created index: " + createIndexRequest);
138 | this.getClient().indices().create(createIndexRequest);
139 | }
140 |
141 | /////////////////////////////////////////////////////////////////////////////
142 | // GETTERS
143 | /////////////////////////////////////////////////////////////////////////////
144 |
145 | /**
146 | * Gets the low level REST client used to communicate with the index.
147 | * @return The client
148 | */
149 | protected RestClient getLowLevelClient() {
150 | return this.lowLevelClient;
151 | }
152 |
153 |
154 | /**
155 | * Gets the high level client used to communicate with the index.
156 | * @return The client
157 | */
158 | protected ElasticsearchClient getClient() {
159 | return this.client;
160 | }
161 |
162 | /////////////////////////////////////////////////////////////////////////////
163 | // FUNCTIONALITY
164 | /////////////////////////////////////////////////////////////////////////////
165 |
166 | @Override
167 | public void close() throws IOException {
168 | this.getLowLevelClient().close();
169 | }
170 |
171 | /////////////////////////////////////////////////////////////////////////////
172 | // INDEXING
173 |
174 | /**
175 | * Indexes a response record.
176 | * @param id The ID of the response
177 | * @param uri The target URI of the response
178 | * @param content The extracted content from the response
179 | * @param title The title of the response
180 | * @return Whether the response has been indexed (always)
181 | * @throws IOException On writing to the index
182 | */
183 | public boolean indexResponse(
184 | final String id, final String uri,
185 | final String content, final String title)
186 | throws IOException {
187 | final IndexRequest indexRequest = IndexRequest.of(
188 | builder -> builder
189 | .index(INDEX_NAME)
190 | .id(Objects.requireNonNull(id))
191 | .document(ResponseRecord.forPage(uri, title, content)));
192 | this.getClient().index(indexRequest);
193 | LOG.fine("Index response " + id);
194 | return true;
195 | }
196 |
197 | /**
198 | * Indexes a revisit record.
199 | * @param id The ID of the revisit
200 | * @param uri The target URI of the revisit
201 | * @param originalTime The time of the first visit
202 | * @param instant The time of the revisit
203 | * @return Whether the revisit has been indexed (not if no such response
204 | * exists)
205 | * @throws IOException On reading or writing to the index
206 | */
207 | public boolean indexRevisit(
208 | final String id, final String uri,
209 | final Instant originalTime, final Instant instant)
210 | throws IOException {
211 | final SearchRequest search = new SearchRequest.Builder()
212 | .query(query -> query
213 | .bool(main -> main
214 | .must(time -> time.nested(nested -> nested
215 | .path(ResponseRecord.FIELD_REQUESTS)
216 | .scoreMode(ChildScoreMode.Max)
217 | .query(inner -> inner.match(range ->
218 | range.field(ResponseRecord.FIELD_REQUESTS + "."
219 | + RequestRecord.FIELD_DATE)
220 | .query(originalTime.toString())
221 | ))
222 | ))
223 | )
224 | ).build();
225 | final HitsMetadata hits =
226 | this.getClient().search(search, ResponseRecord.class).hits();
227 | if (hits.hits().size() > 0) {
228 | final String responseId = hits.hits().get(0).id();
229 | LOG.fine("Index revisit " + uri + " -> " + responseId);
230 | this.indexRequest(responseId, uri, instant);
231 | return true;
232 | } else {
233 | LOG.warning("Index revisit " + uri + " FAILED");
234 | return false;
235 | }
236 | }
237 |
238 | /**
239 | * Indexes a request record.
240 | * @param concurrentId The ID of the concurrent response
241 | * @param uri The URI of the request
242 | * @param instant The time of the request
243 | * @return Whether the request has been indexed (not if no such response
244 | * exists)
245 | * @throws IOException On reading or writing to the index
246 | */
247 | public boolean indexRequest(
248 | final String concurrentId, final String uri, final Instant instant)
249 | throws IOException {
250 | final GetResponse response =
251 | this.resolveResponse(concurrentId);
252 | if (response == null) {
253 | LOG.fine("No response found for ID = " + concurrentId + " for request");
254 | return false;
255 | }
256 |
257 | final String field = ResponseRecord.FIELD_REQUESTS;
258 | final Map params = Map.of(field, JsonData.of(
259 | new RequestRecord(uri, instant), MAPPER));
260 | final String scriptSource =
261 | "ctx._source." + field + ".add(params." + field + ");";
262 |
263 | final UpdateRequest updateRequest =
264 | UpdateRequest.of(builder -> builder
265 | .index(INDEX_NAME)
266 | .id(response.id())
267 | .script(script -> script.inline(inline -> inline
268 | .lang("painless")
269 | .source(scriptSource)
270 | .params(params))));
271 | this.getClient().update(updateRequest, ResponseRecord.class);
272 | LOG.fine("Index request -> " + concurrentId + " at " + instant);
273 | return true;
274 | }
275 |
276 | /////////////////////////////////////////////////////////////////////////////
277 | // SEARCH
278 |
279 | /**
280 | * Searches the index.
281 | * @param query The query to match responses and requests by
282 | * @return The results
283 | * @throws IOException On searching the index
284 | * @see #DEFAULT_MAX_RESULTS
285 | */
286 | public List search(final Query query)
287 | throws IOException {
288 | return this.search(query, DEFAULT_MAX_RESULTS);
289 | }
290 |
291 | /**
292 | * Searches the index.
293 | * @param query The query to match responses and requests by
294 | * @param maxResults The maximum number of results to get
295 | * @return The results
296 | * @throws IOException On searching the index
297 | */
298 | public List search(final Query query, final int maxResults)
299 | throws IOException {
300 | return this.search(query, maxResults, 0);
301 | }
302 |
303 | /**
304 | * Searches the index.
305 | * @param query The query to match responses and requests by
306 | * @param maxResults The maximum number of results to get
307 | * @param offset The offset of the first result to get
308 | * @return The results
309 | * @throws IOException On searching the index
310 | */
311 | public List search(
312 | final Query query, final int maxResults, final int offset)
313 | throws IOException {
314 | final SearchResponse search = this.getClient().search(
315 | query.build(maxResults).from(offset).build(), ResponseRecord.class);
316 | final HitsMetadata hits = search.hits();
317 |
318 | final List results = new ArrayList<>();
319 | for (final Hit hit : hits.hits()) {
320 | final Result result = Result.fromHit(hit, query.getFrom(), query.getTo());
321 | if (!result.hasEmptySnippet()) { results.add(result); }
322 | }
323 | return results;
324 | }
325 |
326 | /////////////////////////////////////////////////////////////////////////////
327 | // HELPERS
328 | /////////////////////////////////////////////////////////////////////////////
329 |
330 | /**
331 | * Gets the response with the specified ID.
332 | * @param id The response ID
333 | * @return The response
334 | * @throws IOException On searching the index
335 | */
336 | protected GetResponse resolveResponse(final String id)
337 | throws IOException {
338 | final GetResponse getResponse = this.getClient().get(
339 | get -> get.index(INDEX_NAME).id(id),
340 | ResponseRecord.class);
341 | if (getResponse.found()) {
342 | return getResponse;
343 | } else {
344 | return null;
345 | }
346 | }
347 |
348 |
349 | /////////////////////////////////////////////////////////////////////////////
350 | // JSON BINDINGS
351 | /////////////////////////////////////////////////////////////////////////////
352 |
353 | /**
354 | * Serializer for {@link Instant} using ISO-8601.
355 | *
356 | * @author johannes.kiesel@uni-weimar.de
357 | * @see InstantDeserializer
358 | * @see DateTimeFormatter#ISO_INSTANT
359 | *
360 | */
361 | public static class InstantSerializer extends StdSerializer {
362 |
363 | private static final long serialVersionUID = 2795427768750728869L;
364 |
365 | /**
366 | * Creates a new serializer.
367 | */
368 | public InstantSerializer() {
369 | super(Instant.class);
370 | }
371 |
372 | @Override
373 | public void serialize(
374 | final Instant value,
375 | final JsonGenerator generator,
376 | final SerializerProvider provider)
377 | throws IOException {
378 | generator.writeString(value.toString());
379 | }
380 |
381 | }
382 |
383 | /**
384 | * Deserializer for {@link Instant} using ISO-8601.
385 | *
386 | * @author johannes.kiesel@uni-weimar.de
387 | * @see InstantSerializer
388 | * @see DateTimeFormatter#ISO_INSTANT
389 | *
390 | */
391 | public static class InstantDeserializer extends StdDeserializer {
392 |
393 | private static final long serialVersionUID = -3591379516415686398L;
394 |
395 | /**
396 | * Creates a new deserializer.
397 | */
398 | public InstantDeserializer() {
399 | super(Instant.class);
400 | }
401 |
402 | @Override
403 | public Instant deserialize(
404 | final JsonParser parser,
405 | final DeserializationContext context)
406 | throws IOException, JsonProcessingException {
407 | final String text = parser.getValueAsString();
408 | return Instant.parse(text);
409 | }
410 |
411 | }
412 |
413 | /////////////////////////////////////////////////////////////////////////////
414 | // MAIN
415 | /////////////////////////////////////////////////////////////////////////////
416 |
417 | public static void main(final String[] args) throws IOException {
418 | final int port =
419 | args.length == 0 ? DEFAULT_PORT : Integer.parseInt(args[0]);
420 | try (final Index index = new Index(port)) {
421 | index.initialize();
422 | }
423 | }
424 |
425 | }
426 |
--------------------------------------------------------------------------------
/src/de/webis/wasp/index/Query.java:
--------------------------------------------------------------------------------
1 | package de.webis.wasp.index;
2 |
3 | import java.time.Instant;
4 | import java.util.Objects;
5 |
6 | import co.elastic.clients.elasticsearch._types.query_dsl.ChildScoreMode;
7 | import co.elastic.clients.elasticsearch._types.query_dsl.Operator;
8 | import co.elastic.clients.elasticsearch.core.SearchRequest;
9 | import co.elastic.clients.elasticsearch.core.search.Highlight;
10 | import co.elastic.clients.elasticsearch.core.search.HighlightField;
11 |
12 | /**
13 | * A query to the index with optional time constraints
14 | *
15 | * @author johannes.kiesel@uni-weimar.de
16 | *
17 | */
18 | public class Query {
19 |
20 | /////////////////////////////////////////////////////////////////////////////
21 | // CONSTANTS
22 | /////////////////////////////////////////////////////////////////////////////
23 |
24 | /**
25 | * Boosting factor for the title over the content.
26 | */
27 | protected static final float TITLE_BOOST = 2.0f;
28 |
29 | /**
30 | * Field name of the request's date within the response.
31 | */
32 | protected static final String FIELD_DATE_COMPLETE =
33 | ResponseRecord.FIELD_REQUESTS + "." + RequestRecord.FIELD_DATE;
34 |
35 | /**
36 | * Snippet generator.
37 | */
38 | protected static final Highlight HIGHLIGHT =
39 | Highlight.of(highlight -> highlight
40 | .fields(ResponseRecord.FIELD_CONTENT, HighlightField.of(field -> field
41 | .type("unified"))));
42 |
43 | /////////////////////////////////////////////////////////////////////////////
44 | // MEMBERS
45 | /////////////////////////////////////////////////////////////////////////////
46 |
47 | private final String terms;
48 |
49 | private Instant from;
50 |
51 | private Instant to;
52 |
53 | /////////////////////////////////////////////////////////////////////////////
54 | // CONSTRUCTORS
55 | /////////////////////////////////////////////////////////////////////////////
56 |
57 | /**
58 | * Creates a new query.
59 | * @param terms The query terms to match the response content and title with
60 | * @param from The earliest time for a request to match this query, or
61 | * null for no constraint in this direction
62 | * @param to The latest time for a request to match this query, or
63 | * null for no constraint in this direction
64 | */
65 | public Query(
66 | final String terms, final Instant from, final Instant to) {
67 | this.terms = Objects.requireNonNull(terms);
68 | this.from = from;
69 | this.to = to;
70 | }
71 |
72 | /////////////////////////////////////////////////////////////////////////////
73 | // GETTERS
74 | /////////////////////////////////////////////////////////////////////////////
75 |
76 | /**
77 | * Gets the query terms to match the response content and title with.
78 | * @return The terms
79 | */
80 | public String getTerms() {
81 | return this.terms;
82 | }
83 |
84 | /**
85 | * Gets the earliest time for a request to match this query, if any.
86 | * @return The time or null for no constraint in this direction
87 | */
88 | public Instant getFrom() {
89 | return this.from;
90 | }
91 |
92 | /**
93 | * Gets the latest time for a request to match this query, if any.
94 | * @return The time or null for no constraint in this direction
95 | */
96 | public Instant getTo() {
97 | return this.to;
98 | }
99 |
100 | /////////////////////////////////////////////////////////////////////////////
101 | // FUNCTIONALITY
102 | /////////////////////////////////////////////////////////////////////////////
103 |
104 | @Override
105 | public boolean equals(final Object obj) {
106 | if (obj == null) { return false; }
107 | if (obj instanceof Query) {
108 | final Query other = (Query) obj;
109 |
110 | if (!this.getTerms().equals(other.getTerms())) { return false; }
111 |
112 | final Instant thisFrom = this.getFrom();
113 | final Instant otherFrom = other.getFrom();
114 | if ((thisFrom == null && otherFrom != null)
115 | || (thisFrom != null && !thisFrom.equals(otherFrom))) {
116 | return false;
117 | }
118 |
119 | final Instant thisTo = this.getTo();
120 | final Instant otherTo = other.getTo();
121 | if ((thisTo == null && otherTo != null)
122 | || (thisTo != null && !thisTo.equals(otherTo))) {
123 | return false;
124 | }
125 | return true;
126 | }
127 | return false;
128 | }
129 |
130 | /**
131 | * Creates a search request from this query.
132 | * @return A search request builder that is configured accordingly
133 | */
134 | public SearchRequest.Builder build() {
135 | final Instant from = this.getFrom();
136 | final Instant to = this.getTo();
137 | final String terms = this.getTerms();
138 |
139 | return new SearchRequest.Builder()
140 | .query(query -> query
141 | .bool(main -> main
142 | .must(time -> time.nested(nested -> nested
143 | .path(ResponseRecord.FIELD_REQUESTS)
144 | .scoreMode(ChildScoreMode.Max)
145 | .query(inner -> inner.range(range -> {
146 | range.field(FIELD_DATE_COMPLETE);
147 | if (from != null) { range.from(from.toString()); }
148 | if (to != null) { range.to(to.toString()); }
149 | return range;
150 | }))))
151 | .should(term -> term.bool(bool -> bool
152 | .should(should -> should
153 | .match(match -> match
154 | .field(ResponseRecord.FIELD_CONTENT)
155 | .query(terms)
156 | .operator(Operator.And)))
157 | .should(should -> should
158 | .match(match -> match
159 | .field(ResponseRecord.FIELD_TITLE)
160 | .query(terms)
161 | .operator(Operator.And)
162 | .boost(TITLE_BOOST))))
163 | )))
164 | .highlight(HIGHLIGHT);
165 | }
166 |
167 | /**
168 | * Creates a search request from this query.
169 | * @param pageSize The result page size
170 | * @return A search request builder that is configured accordingly
171 | */
172 | public SearchRequest.Builder build(final int pageSize) {
173 | return this.build().size(pageSize);
174 | }
175 |
176 | /**
177 | * Creates a search request from this query.
178 | * @param pageSize The result page size
179 | * @param page The result page
180 | * @return A search request builder that is configured accordingly
181 | */
182 | public SearchRequest.Builder build(final int pageSize, final int page) {
183 | return this.build(pageSize).from((page - 1) * pageSize);
184 | }
185 |
186 | }
187 |
--------------------------------------------------------------------------------
/src/de/webis/wasp/index/RequestRecord.java:
--------------------------------------------------------------------------------
1 | package de.webis.wasp.index;
2 |
3 | import java.time.Instant;
4 | import java.util.Map;
5 | import java.util.Objects;
6 |
7 | import com.fasterxml.jackson.annotation.JsonAutoDetect;
8 | import com.fasterxml.jackson.annotation.JsonAutoDetect.Visibility;
9 | import com.fasterxml.jackson.annotation.JsonCreator;
10 | import com.fasterxml.jackson.annotation.JsonGetter;
11 | import com.fasterxml.jackson.annotation.JsonProperty;
12 |
13 | import co.elastic.clients.elasticsearch._types.mapping.DateProperty;
14 | import co.elastic.clients.elasticsearch._types.mapping.KeywordProperty;
15 | import co.elastic.clients.elasticsearch._types.mapping.Property;
16 |
17 | /**
18 | * A record of a request for indexing / retrieval.
19 | *
20 | * @author johannes.kiesel@uni-weimar.de
21 | *
22 | */
23 | @JsonAutoDetect(
24 | getterVisibility = Visibility.NONE,
25 | setterVisibility = Visibility.NONE)
26 | public class RequestRecord {
27 |
28 | /////////////////////////////////////////////////////////////////////////////
29 | // CONSTANTS
30 | /////////////////////////////////////////////////////////////////////////////
31 |
32 | /**
33 | * Name of the record's URI field.
34 | */
35 | public static final String FIELD_URI = "uri";
36 |
37 | /**
38 | * Name of the record's date field.
39 | */
40 | public static final String FIELD_DATE = "date";
41 |
42 | /**
43 | * Properties for an Elasticsearch mapping of this class.
44 | */
45 | public static Map TYPE_PROPERTIES = Map.of(
46 | FIELD_URI, KeywordProperty.of(property -> property)._toProperty(),
47 | FIELD_DATE, DateProperty.of(property -> property)._toProperty());
48 |
49 | /////////////////////////////////////////////////////////////////////////////
50 | // MEMBERS
51 | /////////////////////////////////////////////////////////////////////////////
52 |
53 | private final String uri;
54 |
55 | private final Instant date;
56 |
57 | /////////////////////////////////////////////////////////////////////////////
58 | // CONSTRUCTION
59 | /////////////////////////////////////////////////////////////////////////////
60 |
61 | /**
62 | * Creates a new record for some request.
63 | * @param uri The URI of the request
64 | * @param date The date of the request
65 | */
66 | @JsonCreator
67 | public RequestRecord(
68 | @JsonProperty(FIELD_URI) final String uri,
69 | @JsonProperty(FIELD_DATE) final Instant date) {
70 | this.uri = Objects.requireNonNull(uri);
71 | this.date = Objects.requireNonNull(date);
72 | }
73 |
74 | /////////////////////////////////////////////////////////////////////////////
75 | // GETTERS
76 | /////////////////////////////////////////////////////////////////////////////
77 |
78 | /**
79 | * Gets the URI of the request.
80 | * @return The URI
81 | */
82 | @JsonGetter(FIELD_URI)
83 | public String getUri() {
84 | return this.uri;
85 | }
86 |
87 | /**
88 | * Gets the date of the request.
89 | * @return The date
90 | */
91 | @JsonGetter(FIELD_DATE)
92 | public Instant getDate() {
93 | return this.date;
94 | }
95 |
96 | }
97 |
--------------------------------------------------------------------------------
/src/de/webis/wasp/index/ResponseRecord.java:
--------------------------------------------------------------------------------
1 | package de.webis.wasp.index;
2 |
3 | import java.util.List;
4 | import java.util.Map;
5 | import java.util.Objects;
6 |
7 | import com.fasterxml.jackson.annotation.JsonAutoDetect;
8 | import com.fasterxml.jackson.annotation.JsonAutoDetect.Visibility;
9 | import com.fasterxml.jackson.annotation.JsonCreator;
10 | import com.fasterxml.jackson.annotation.JsonGetter;
11 | import com.fasterxml.jackson.annotation.JsonProperty;
12 |
13 | import co.elastic.clients.elasticsearch._types.mapping.KeywordProperty;
14 | import co.elastic.clients.elasticsearch._types.mapping.NestedProperty;
15 | import co.elastic.clients.elasticsearch._types.mapping.Property;
16 | import co.elastic.clients.elasticsearch._types.mapping.TextProperty;
17 |
18 | /**
19 | * A record of a response or revisit for indexing / retrieval.
20 | *
21 | * @author johannes.kiesel@uni-weimar.de
22 | *
23 | */
24 | @JsonAutoDetect(
25 | getterVisibility = Visibility.NONE,
26 | setterVisibility = Visibility.NONE)
27 | public class ResponseRecord {
28 |
29 | /////////////////////////////////////////////////////////////////////////////
30 | // CONSTANTS
31 | /////////////////////////////////////////////////////////////////////////////
32 |
33 | /**
34 | * Name of the record's target URI field.
35 | */
36 | public static final String FIELD_URI = "uri";
37 |
38 | /**
39 | * Name of the record's title field.
40 | */
41 | public static final String FIELD_TITLE = "title";
42 |
43 | /**
44 | * Name of the record's content field.
45 | */
46 | public static final String FIELD_CONTENT = "content";
47 |
48 | /**
49 | * Name of the record's requests field.
50 | */
51 | public static final String FIELD_REQUESTS = "requests";
52 |
53 | /**
54 | * Properties for an Elasticsearch mapping of this class.
55 | */
56 | public static Map TYPE_PROPERTIES = Map.of(
57 | FIELD_URI, KeywordProperty.of(property -> property)._toProperty(),
58 | FIELD_TITLE, TextProperty.of(property -> property)._toProperty(),
59 | FIELD_CONTENT, TextProperty.of(property -> property)._toProperty(),
60 | FIELD_REQUESTS, NestedProperty.of(property -> property
61 | .properties(RequestRecord.TYPE_PROPERTIES)
62 | )._toProperty());
63 |
64 | /////////////////////////////////////////////////////////////////////////////
65 | // MEMBERS
66 | /////////////////////////////////////////////////////////////////////////////
67 |
68 | private final String uri;
69 |
70 | private final String title;
71 |
72 | private final String content;
73 |
74 | private final List requests;
75 |
76 | /////////////////////////////////////////////////////////////////////////////
77 | // CONSTRUCTION
78 | /////////////////////////////////////////////////////////////////////////////
79 |
80 | /**
81 | * Creates a new record for some request.
82 | * @param uri The target URI of the response page or revisit
83 | * @param title The title of the response page (or null if a
84 | * revisit)
85 | * @param content The extracted content of the response page (or
86 | * null if a revisit)
87 | * @param requests The requests that led to this response (empty if a revisit)
88 | */
89 | @JsonCreator
90 | public ResponseRecord(
91 | @JsonProperty(FIELD_URI) final String uri,
92 | @JsonProperty(FIELD_TITLE) final String title,
93 | @JsonProperty(FIELD_CONTENT) final String content,
94 | @JsonProperty(FIELD_REQUESTS) final List requests) {
95 | this.uri = Objects.requireNonNull(uri);
96 | this.title = title;
97 | this.content = content;
98 | if (requests == null) {
99 | this.requests = List.of();
100 | } else {
101 | this.requests = List.copyOf(requests);
102 | }
103 | }
104 |
105 | /**
106 | * Creates a new record for a response page without assigned requests.
107 | * @param uri The target URI of the response page
108 | * @param title The title of the page
109 | * @param content The extracted content of the page
110 | * @return The request
111 | */
112 | public static ResponseRecord forPage(
113 | final String uri, final String title, final String content) {
114 | return new ResponseRecord(
115 | Objects.requireNonNull(uri),
116 | Objects.requireNonNull(title), Objects.requireNonNull(content), null);
117 | }
118 |
119 | /////////////////////////////////////////////////////////////////////////////
120 | // GETTERS
121 | /////////////////////////////////////////////////////////////////////////////
122 |
123 | /**
124 | * Gets the URI of the response.
125 | * @return The URI
126 | */
127 | @JsonGetter(FIELD_URI)
128 | public String getUri() {
129 | return this.uri;
130 | }
131 |
132 | /**
133 | * Gets the title of the response.
134 | * @return The title or null if a revisit
135 | */
136 | @JsonGetter(FIELD_TITLE)
137 | public String getTitle() {
138 | return this.title;
139 | }
140 |
141 | /**
142 | * Gets the content of the response.
143 | * @return The content or null if a revisit
144 | */
145 | @JsonGetter(FIELD_CONTENT)
146 | public String getContent() {
147 | return this.content;
148 | }
149 |
150 | /**
151 | * Gets the requests that led to this response.
152 | * @return The list of requests (empty if a revisit)
153 | */
154 | @JsonGetter(FIELD_REQUESTS)
155 | public List getRequests() {
156 | return this.requests;
157 | }
158 |
159 | }
160 |
--------------------------------------------------------------------------------
/src/de/webis/wasp/index/Result.java:
--------------------------------------------------------------------------------
1 | package de.webis.wasp.index;
2 |
3 | import java.time.Instant;
4 | import java.util.List;
5 | import java.util.ListIterator;
6 | import java.util.Map;
7 | import java.util.Objects;
8 |
9 | import com.fasterxml.jackson.annotation.JsonAutoDetect;
10 | import com.fasterxml.jackson.annotation.JsonCreator;
11 | import com.fasterxml.jackson.annotation.JsonGetter;
12 | import com.fasterxml.jackson.annotation.JsonProperty;
13 |
14 | import co.elastic.clients.elasticsearch.core.search.Hit;
15 |
16 | import com.fasterxml.jackson.annotation.JsonAutoDetect.Visibility;
17 |
18 | /**
19 | * A result for a query.
20 | *
21 | * @author johannes.kiesel@uni-weimar.de
22 | *
23 | */
24 | @JsonAutoDetect(
25 | getterVisibility = Visibility.NONE,
26 | setterVisibility = Visibility.NONE)
27 | public class Result {
28 |
29 | /////////////////////////////////////////////////////////////////////////////
30 | // CONSTANTS
31 | /////////////////////////////////////////////////////////////////////////////
32 |
33 | /**
34 | * Name of the result's retrieval score field.
35 | */
36 | public static final String FIELD_SCORE = "score";
37 |
38 | /**
39 | * Name of the result's snippet field.
40 | */
41 | public static final String FIELD_SNIPPET = "snippet";
42 |
43 | /**
44 | * Name of the result's response field.
45 | */
46 | public static final String FIELD_RESPONSE = "response";
47 |
48 | /**
49 | * Name of the results's matched request field.
50 | */
51 | public static final String FIELD_MATCHED_REQUEST = "matchedRequest";
52 |
53 | /////////////////////////////////////////////////////////////////////////////
54 | // MEMBERS
55 | /////////////////////////////////////////////////////////////////////////////
56 |
57 | private final double score;
58 |
59 | private final String snippet;
60 |
61 | private final ResponseRecord response;
62 |
63 | private final RequestRecord matchedRequest;
64 |
65 | /////////////////////////////////////////////////////////////////////////////
66 | // CONSTRUCTION
67 | /////////////////////////////////////////////////////////////////////////////
68 |
69 | /**
70 | * Creates a new result.
71 | * @param score The retrieval score
72 | * @param snippet The snippet to display
73 | * @param response The underlying response
74 | * @param matchedRequest The response's request that matched the query's time
75 | * constraints
76 | */
77 | @JsonCreator
78 | public Result(
79 | @JsonProperty(FIELD_SCORE) final double score,
80 | @JsonProperty(FIELD_SNIPPET) final String snippet,
81 | @JsonProperty(FIELD_RESPONSE) final ResponseRecord response,
82 | @JsonProperty(FIELD_MATCHED_REQUEST) final RequestRecord matchedRequest) {
83 | this.score = score;
84 | this.snippet = Objects.requireNonNull(snippet);
85 | this.response = Objects.requireNonNull(response);
86 | this.matchedRequest = Objects.requireNonNull(matchedRequest);
87 | }
88 |
89 | /**
90 | * Creates a new result from a search hit.
91 | * @param hit The hit
92 | * @param from The earliest time for a request, or null for no
93 | * constraint in this direction
94 | * @param to The latest time for a request, or null for no
95 | * constraint in this direction
96 | * @return The result
97 | * @throws IllegalArgumentException If no request matches the constraints
98 | */
99 | public static Result fromHit(
100 | final Hit hit, final Instant from, final Instant to) {
101 | final double score = hit.score();
102 |
103 | final ResponseRecord response = hit.source();
104 | final RequestRecord request = Result.matchRequest(response, from, to);
105 | final String snippet = Result.getSnippet(hit);
106 |
107 | return new Result(score, snippet, response, request);
108 | }
109 |
110 | /**
111 | * Get the response's request the matches the time constraints.
112 | * @param response The response
113 | * @param from The earliest time for a request, or null for no
114 | * constraint in this direction
115 | * @param to The latest time for a request, or null for no
116 | * constraint in this direction
117 | * @return The latest request matching the constraints
118 | * @throws IllegalArgumentException If no request matches the constraints
119 | */
120 | protected static RequestRecord matchRequest(
121 | final ResponseRecord response, final Instant from, final Instant to) {
122 | final List requests = response.getRequests();
123 | final ListIterator iterator =
124 | requests.listIterator(requests.size());
125 | while (iterator.hasPrevious()) {
126 | final RequestRecord request = iterator.previous();
127 | final Instant date = request.getDate();
128 | if (from != null && date.isBefore(from)) { continue; }
129 | if (to != null && date.isAfter(to)) { continue; }
130 | return request;
131 | }
132 | throw new IllegalArgumentException(
133 | "it contained no request in time interval");
134 | }
135 |
136 | /**
137 | * Gets the snippet of a search hit.
138 | * @param hit The hit
139 | * @return The snippet (may be empty)
140 | */
141 | protected static String getSnippet(final Hit hit) {
142 | final Map> snippetsPerField = hit.highlight();
143 | final List snippetParts =
144 | snippetsPerField.get(ResponseRecord.FIELD_CONTENT);
145 | if (snippetParts == null) { return ""; }
146 | return String.join(" ... ", snippetParts);
147 | }
148 |
149 | /////////////////////////////////////////////////////////////////////////////
150 | // GETTER
151 | /////////////////////////////////////////////////////////////////////////////
152 |
153 | /**
154 | * Gets the retrieval score of the result.
155 | * @return The score
156 | */
157 | @JsonGetter(FIELD_SCORE)
158 | public double getScore() {
159 | return this.score;
160 | }
161 |
162 | /**
163 | * Gets the snippet of the result.
164 | * @return The snippet
165 | */
166 | @JsonGetter(FIELD_SNIPPET)
167 | public String getSnippet() {
168 | return this.snippet;
169 | }
170 |
171 | /**
172 | * Gets the response of the result.
173 | * @return The response
174 | */
175 | @JsonGetter(FIELD_RESPONSE)
176 | public ResponseRecord getResponse() {
177 | return this.response;
178 | }
179 |
180 | /**
181 | * Gets the request of the response that was matched by the query.
182 | * @return The request
183 | */
184 | @JsonGetter(FIELD_MATCHED_REQUEST)
185 | public RequestRecord getMatchedRequest() {
186 | return this.matchedRequest;
187 | }
188 |
189 | /////////////////////////////////////////////////////////////////////////////
190 | // FUNCTIONALITY
191 | /////////////////////////////////////////////////////////////////////////////
192 |
193 | /**
194 | * Checks whether the result snippet is empty.
195 | * @return Whether it is
196 | */
197 | public boolean hasEmptySnippet() {
198 | return this.getSnippet().isEmpty();
199 | }
200 |
201 | @Override
202 | public String toString() {
203 | return String.format(
204 | "RESULT %.2f '%s' FROM '%s' AT %s: '%s'",
205 | this.getScore(), this.getResponse().getTitle(),
206 | this.getMatchedRequest().getUri(),
207 | this.getMatchedRequest().getDate(),
208 | this.getSnippet());
209 | }
210 |
211 | }
212 |
--------------------------------------------------------------------------------
/src/de/webis/wasp/index/WarcIndexer.java:
--------------------------------------------------------------------------------
1 | package de.webis.wasp.index;
2 |
3 | import java.io.IOException;
4 | import java.util.Objects;
5 | import java.util.logging.Logger;
6 | import java.time.Instant;
7 |
8 | import de.webis.wasp.warcs.GenericHtmlWarcRecordConsumer;
9 |
10 | /**
11 | * Consumer to index WARC records.
12 | *
13 | * @author johannes.kiesel@uni-weimar.de
14 | *
15 | */
16 | public class WarcIndexer
17 | extends GenericHtmlWarcRecordConsumer {
18 |
19 | /////////////////////////////////////////////////////////////////////////////
20 | // LOGGING
21 | /////////////////////////////////////////////////////////////////////////////
22 |
23 | private static final Logger LOG =
24 | Logger.getLogger(WarcIndexer.class.getName());
25 |
26 | /////////////////////////////////////////////////////////////////////////////
27 | // MEMBERS
28 | /////////////////////////////////////////////////////////////////////////////
29 |
30 | private final Index index;
31 |
32 | /////////////////////////////////////////////////////////////////////////////
33 | // CONSTRUCTORS
34 | /////////////////////////////////////////////////////////////////////////////
35 |
36 | /**
37 | * Creates a new consumer that indexes to the specified index.
38 | * @param index The index
39 | */
40 | public WarcIndexer(final Index index) {
41 | this.index = Objects.requireNonNull(index);
42 | }
43 |
44 | /////////////////////////////////////////////////////////////////////////////
45 | // GETTERS
46 | /////////////////////////////////////////////////////////////////////////////
47 |
48 | public Index getIndex() {
49 | return this.index;
50 | }
51 |
52 | /////////////////////////////////////////////////////////////////////////////
53 | // FUNCTIONALITY
54 | /////////////////////////////////////////////////////////////////////////////
55 |
56 | @Override
57 | protected void acceptHtmlResponse(
58 | final String id, final String uri,
59 | final Document document, final Instant time)
60 | throws IOException {
61 | String title = document.getTitle();
62 | if (title == null) { title = ""; }
63 | String content = document.getContent();
64 | if (content == null) { content = ""; }
65 | LOG.fine("accept html response " + id
66 | + " title = '" + title + "' content exists = " + !content.isEmpty());
67 | if (!title.isEmpty() || !content.isEmpty()) {
68 | this.getIndex().indexResponse(id, uri, content, title);
69 | }
70 | }
71 |
72 | @Override
73 | protected void acceptRevisit(
74 | final String id, final String uri, final Instant originalTime,
75 | final Instant time)
76 | throws IOException {
77 | this.getIndex().indexRevisit(id, uri, originalTime, time);
78 | }
79 |
80 | @Override
81 | protected void acceptRequest(
82 | final String concurrentRecordId,
83 | final String targetUri,
84 | final Instant time)
85 | throws IOException {
86 | this.getIndex().indexRequest(concurrentRecordId, targetUri, time);
87 | }
88 |
89 | }
90 |
--------------------------------------------------------------------------------
/src/de/webis/wasp/ui/SearchServlet.java:
--------------------------------------------------------------------------------
1 | package de.webis.wasp.ui;
2 |
3 | import java.io.IOException;
4 | import java.io.InputStreamReader;
5 | import java.time.Instant;
6 | import java.util.List;
7 | import java.util.NoSuchElementException;
8 | import java.util.TimeZone;
9 |
10 | import com.github.mustachejava.DefaultMustacheFactory;
11 | import com.github.mustachejava.Mustache;
12 | import com.github.mustachejava.MustacheFactory;
13 |
14 | import de.webis.wasp.index.Index;
15 | import de.webis.wasp.index.Query;
16 | import de.webis.wasp.index.Result;
17 | import jakarta.servlet.ServletConfig;
18 | import jakarta.servlet.ServletException;
19 | import jakarta.servlet.http.HttpServlet;
20 | import jakarta.servlet.http.HttpServletRequest;
21 | import jakarta.servlet.http.HttpServletResponse;
22 | import jakarta.servlet.http.HttpSession;
23 |
24 | /**
25 | * Servlet for the search service.
26 | *
27 | * @author johannes.kiesel@uni-weimar.de
28 | *
29 | */
30 | public class SearchServlet
31 | extends HttpServlet {
32 |
33 | /////////////////////////////////////////////////////////////////////////////
34 | // CONSTANTS
35 | /////////////////////////////////////////////////////////////////////////////
36 |
37 | private static final long serialVersionUID = -5259242888271066638L;
38 |
39 | /////////////////////////////////////////////////////////////////////////////
40 | // CONFIGURATION
41 |
42 | public static final String INIT_PARAMETER_INDEX_PORT = "index.port";
43 |
44 | public static final int DEFAULT_INDEX_PORT = Index.DEFAULT_PORT;
45 |
46 | public static final String INIT_PARAMETER_PAGE_SIZE = "page.size";
47 |
48 | public static final int DEFAULT_PAGE_SIZE = 10;
49 |
50 | public static final String INIT_PARAMETER_REPLAY_SERVER = "replay.server";
51 |
52 | public static final String DEFAULT_REPLAY_SERVER = "http://localhost:8001";
53 |
54 | public static final String INIT_PARAMETER_REPLAY_COLLECTION = "replay.collection";
55 |
56 | public static final String DEFAULT_REPLAY_COLLECTION = "wasp";
57 |
58 | /////////////////////////////////////////////////////////////////////////////
59 | // REQUEST
60 |
61 | public static final String SERVLET_PATH = "search";
62 |
63 | public static final String REQUEST_PARAMETER_TERMS = "terms";
64 |
65 | public static final String REQUEST_PARAMETER_FROM = "from";
66 |
67 | public static final String REQUEST_PARAMETER_TO = "to";
68 |
69 | public static final String REQUEST_PARAMETER_TIMEZONE = "timezone";
70 |
71 | public static final String REQUEST_PARAMETER_PAGE_NUMBER = "page";
72 |
73 | /////////////////////////////////////////////////////////////////////////////
74 | // SESSION
75 |
76 | protected static final String SESSION_QUERY = "query";
77 |
78 | protected static final String SESSION_RESULTS = "results";
79 |
80 | /////////////////////////////////////////////////////////////////////////////
81 | // MEMBERS
82 | /////////////////////////////////////////////////////////////////////////////
83 |
84 | private final Mustache pageRenderer;
85 |
86 | private Index index;
87 |
88 | private int pageSize;
89 |
90 | private String replayServer;
91 |
92 | private String replayCollection;
93 |
94 | /////////////////////////////////////////////////////////////////////////////
95 | // CONSTRUCTION
96 | /////////////////////////////////////////////////////////////////////////////
97 |
98 | /**
99 | * Creates a new servlet.
100 | */
101 | public SearchServlet() {
102 | final MustacheFactory factory = new DefaultMustacheFactory();
103 | this.pageRenderer = factory.compile(new InputStreamReader(
104 | SearchServlet.class.getResourceAsStream("search.mustache")),
105 | "search.mustache");
106 | this.index = null;
107 | this.pageSize = 0;
108 | this.replayServer = null;
109 | this.replayCollection = null;
110 | }
111 |
112 | @Override
113 | public void init(final ServletConfig config) throws ServletException {
114 | this.index = new Index(
115 | SearchServlet.getParameterValue(config,
116 | INIT_PARAMETER_INDEX_PORT, DEFAULT_INDEX_PORT));
117 | this.pageSize = SearchServlet.getParameterValue(config,
118 | INIT_PARAMETER_PAGE_SIZE, DEFAULT_PAGE_SIZE);
119 | this.replayServer = SearchServlet.getParameterValue(config,
120 | INIT_PARAMETER_REPLAY_SERVER, DEFAULT_REPLAY_SERVER);
121 | this.replayCollection = SearchServlet.getParameterValue(config,
122 | INIT_PARAMETER_REPLAY_COLLECTION, DEFAULT_REPLAY_COLLECTION);
123 | }
124 |
125 | /////////////////////////////////////////////////////////////////////////////
126 | // GETTERS
127 | /////////////////////////////////////////////////////////////////////////////
128 |
129 | /**
130 | * Gets the page renderer.
131 | * @return The renderer
132 | */
133 | public Mustache getPageRenderer() {
134 | return this.pageRenderer;
135 | }
136 |
137 | /**
138 | * Gets the index client.
139 | * @return The client
140 | */
141 | protected Index getIndex() {
142 | return this.index;
143 | }
144 |
145 | /**
146 | * Gets the page size to render.
147 | * @return The page size
148 | */
149 | public int getPageSize() {
150 | return this.pageSize;
151 | }
152 |
153 | /**
154 | * Gets the address (including protocol and host) of the replay server.
155 | * @return The URI
156 | */
157 | public String getReplayServer() {
158 | return this.replayServer;
159 | }
160 |
161 | /**
162 | * Gets the name of the collection to replay from.
163 | * @return The name
164 | */
165 | public String getReplayCollection() {
166 | return this.replayCollection;
167 | }
168 |
169 | /////////////////////////////////////////////////////////////////////////////
170 | // FUNCTIONALITY
171 | /////////////////////////////////////////////////////////////////////////////
172 |
173 | @Override
174 | protected void doGet(
175 | final HttpServletRequest request, final HttpServletResponse response)
176 | throws ServletException, IOException {
177 | final UiPage page = this.getPage(request);
178 |
179 | response.setContentType("text/html");
180 | this.getPageRenderer().execute(response.getWriter(), page);
181 | };
182 |
183 | /////////////////////////////////////////////////////////////////////////////
184 | // HELPERS
185 | /////////////////////////////////////////////////////////////////////////////
186 |
187 | /**
188 | * Gets an implementation of the search page model for rendering.
189 | * @param request The request to the servlet
190 | * @return The page model
191 | * @throws IOException On searching the index
192 | */
193 | protected UiPage getPage(final HttpServletRequest request)
194 | throws IOException {
195 | final int pageSize = this.getPageSize();
196 |
197 | final Query query = SearchServlet.getQuery(request);
198 | final TimeZone timezone = SearchServlet.getClientTimeZone(request);
199 | if (query == null) {
200 | return new UiPage(
201 | this.getReplayServer(), this.getReplayCollection(),
202 | request.getLocale(), timezone);
203 | } else {
204 | final List results = this.getResults(request, query);
205 | final int numResults = results.size();
206 | final int numPages = (numResults - 1) / pageSize + 1;
207 | final int pageNumber = SearchServlet.getPageNumber(request);
208 | final int fromResult = Math.min((pageNumber - 1) * pageSize, numResults);
209 | final int toResult = Math.min(pageNumber * pageSize, numResults);
210 | final List paginatedResults =
211 | results.subList(fromResult, toResult);
212 |
213 | return new UiPage(
214 | this.getReplayServer(), this.getReplayCollection(),
215 | query, paginatedResults, pageNumber, numPages,
216 | request.getLocale(), timezone);
217 | }
218 | }
219 |
220 | /**
221 | * Gets the results for the specified query.
222 | * @param request The request to the servlet
223 | * @param query The query
224 | * @return The results for the query
225 | * @throws IOException On searching the index
226 | */
227 | protected List getResults(
228 | final HttpServletRequest request, final Query query)
229 | throws IOException {
230 | final HttpSession session = request.getSession();
231 | synchronized (session) {
232 | @SuppressWarnings("unchecked")
233 | List results =
234 | (List) session.getAttribute(SESSION_RESULTS);
235 | if (results == null) {
236 | results = this.getIndex().search(query);
237 | session.setAttribute(SESSION_RESULTS, results);
238 | }
239 | return results;
240 | }
241 | }
242 |
243 | /**
244 | * Gets the query for a request.
245 | * @param request The request to the servlet
246 | * @return The query or null for none
247 | */
248 | protected static Query getQuery(final HttpServletRequest request) {
249 | final String terms = request.getParameter(REQUEST_PARAMETER_TERMS);
250 | if (terms == null) { return null; }
251 |
252 | final TimeZone timezone = SearchServlet.getClientTimeZone(request);
253 | final Instant from = SearchServlet.parseInstant(
254 | request.getParameter(REQUEST_PARAMETER_FROM), timezone);
255 | final Instant to = SearchServlet.parseInstant(
256 | request.getParameter(REQUEST_PARAMETER_TO), timezone);
257 | final Query query = new Query(terms, from, to);
258 |
259 | final HttpSession session = request.getSession();
260 | synchronized (session) {
261 | final Query oldQuery = (Query) session.getAttribute(SESSION_QUERY);
262 | if (query == null || !query.equals(oldQuery)) {
263 | session.setAttribute(SESSION_QUERY, query);
264 | session.removeAttribute(SESSION_RESULTS);
265 | }
266 | return query;
267 | }
268 | }
269 |
270 | /**
271 | * Gets the page number for a request.
272 | * @param request The request to the servlet
273 | * @return The page number (1 by default)
274 | */
275 | protected static int getPageNumber(final HttpServletRequest request) {
276 | final String pageNumberString =
277 | request.getParameter(REQUEST_PARAMETER_PAGE_NUMBER);
278 | if (pageNumberString == null) {
279 | return 1;
280 | } else {
281 | return Integer.parseInt(pageNumberString);
282 | }
283 | }
284 |
285 | /**
286 | * Gets the time zone of the browser.
287 | * @param request The request to the servlet
288 | * @return The guessed time zone
289 | */
290 | protected static TimeZone getClientTimeZone(
291 | final HttpServletRequest request) {
292 | final String value = request.getParameter(REQUEST_PARAMETER_TIMEZONE);
293 | if (value == null) {
294 | return TimeZone.getDefault();
295 | } else {
296 | return TimeZone.getTimeZone(value);
297 | }
298 | }
299 |
300 | /**
301 | * Parses an instant from a request parameter.
302 | * @param value The parameter value (may be null)
303 | * @param timeZone The time zone of the browser
304 | * @return The instant or null for none
305 | */
306 | protected static Instant parseInstant(
307 | final String value, final TimeZone timeZone) {
308 | if (value == null || value.isEmpty()) {
309 | return null;
310 | } else {
311 | return Instant.from(UiPage.UiInstant.DATE_TIME_PICKER_FORMATTER
312 | .withZone(timeZone.toZoneId()).parse(value));
313 | }
314 | }
315 |
316 | /**
317 | * Gets the value for a parameter.
318 | * @param config The servlet configuration
319 | * @param parameter The parameter name
320 | * @return The value
321 | * @throws NoSuchElementException If no value is provided
322 | */
323 | protected static String getParameterValue(final ServletConfig config,
324 | final String parameter) {
325 | return SearchServlet.getParameterValue(config, parameter, null);
326 | }
327 |
328 | /**
329 | * Gets the value for a parameter.
330 | * @param config The servlet configuration
331 | * @param parameter The parameter name
332 | * @param defaultValue The default value or null for none
333 | * @return The value (may be the default)
334 | * @throws NoSuchElementException If no value and no default value is provided
335 | */
336 | protected static String getParameterValue(final ServletConfig config,
337 | final String parameter, final String defaultValue) {
338 | final String value = config.getInitParameter(parameter);
339 | if (value == null) {
340 | if (defaultValue == null) {
341 | throw new NoSuchElementException(parameter);
342 | } else {
343 | return defaultValue;
344 | }
345 | } else {
346 | return value;
347 | }
348 | }
349 |
350 | /**
351 | * Gets the value for a parameter as integer.
352 | * @param config The servlet configuration
353 | * @param parameter The parameter name
354 | * @param defaultValue The default value
355 | * @return The value (may be the default)
356 | */
357 | protected static int getParameterValue(final ServletConfig config,
358 | final String parameter, final int defaultValue) {
359 | final String value = config.getInitParameter(parameter);
360 | if (value == null) {
361 | return defaultValue;
362 | } else {
363 | return Integer.parseInt(value);
364 | }
365 | }
366 |
367 | }
368 |
--------------------------------------------------------------------------------
/src/de/webis/wasp/ui/UiPage.java:
--------------------------------------------------------------------------------
1 | package de.webis.wasp.ui;
2 |
3 | import java.io.InputStreamReader;
4 | import java.io.StringWriter;
5 | import java.io.UnsupportedEncodingException;
6 | import java.net.URLEncoder;
7 | import java.time.Instant;
8 | import java.time.ZoneOffset;
9 | import java.time.format.DateTimeFormatter;
10 | import java.util.ArrayList;
11 | import java.util.Collections;
12 | import java.util.List;
13 | import java.util.Locale;
14 | import java.util.Objects;
15 | import java.util.TimeZone;
16 |
17 | import com.github.mustachejava.DefaultMustacheFactory;
18 | import com.github.mustachejava.Mustache;
19 | import com.github.mustachejava.MustacheFactory;
20 |
21 | import de.webis.wasp.index.Query;
22 | import de.webis.wasp.index.RequestRecord;
23 | import de.webis.wasp.index.ResponseRecord;
24 | import de.webis.wasp.index.Result;
25 |
26 | /**
27 | * Model for a WASP user interface web page.
28 | *
29 | * @author johannes.kiesel@uni-weimar.de
30 | *
31 | */
32 | public class UiPage {
33 |
34 | /////////////////////////////////////////////////////////////////////////////
35 | // CONSTANTS
36 | /////////////////////////////////////////////////////////////////////////////
37 |
38 | protected static final int MAX_URI_DISPLAY_LENGTH = 60;
39 |
40 | /////////////////////////////////////////////////////////////////////////////
41 | // MEMBERS
42 | /////////////////////////////////////////////////////////////////////////////
43 |
44 | public final String replayServer;
45 |
46 | public final String replayCollection;
47 |
48 | public final String locale;
49 |
50 | public final UiQuery query;
51 |
52 | public final List results;
53 |
54 | public final List pagination;
55 |
56 | /////////////////////////////////////////////////////////////////////////////
57 | // CONSTRUCTION
58 | /////////////////////////////////////////////////////////////////////////////
59 |
60 | /**
61 | * Create a page without query or results.
62 | * @param replayServer The URL of the replay server (including port and
63 | * optional path up to the collection name)
64 | * @param replayCollection The name of the collection to replay from
65 | * @param locale Locale of the user client
66 | * @param timeZone Time zone of the user client
67 | */
68 | public UiPage(
69 | final String replayServer, final String replayCollection,
70 | final Locale locale, final TimeZone timeZone) {
71 | this.replayServer = Objects.requireNonNull(replayServer);
72 | this.replayCollection = Objects.requireNonNull(replayCollection);
73 | this.locale = locale.toString();
74 | this.query = null;
75 | this.results = List.of();
76 | this.pagination = List.of();
77 | }
78 |
79 | /**
80 | * Create a page with query and results.
81 | * @param replayServer The URL of the replay server (including port and
82 | * optional path up to the collection name)
83 | * @param replayCollection The name of the collection to replay from
84 | * @param query The query for which the results were retrieved
85 | * @param paginatedResults The results for the specific page
86 | * @param pageNumber The number of the result page for the query
87 | * @param numPages The number of available result pages for the query
88 | * @param locale The locale of the user client
89 | * @param timeZone The time zone of the user client
90 | */
91 | public UiPage(
92 | final String replayServer, final String replayCollection,
93 | final Query query, final List paginatedResults,
94 | final int pageNumber, final int numPages,
95 | final Locale locale, final TimeZone timeZone) {
96 | this.replayServer = Objects.requireNonNull(replayServer);
97 | this.replayCollection = Objects.requireNonNull(replayCollection);
98 | this.locale = locale.toString();
99 | this.query = new UiQuery(query, pageNumber, timeZone);
100 |
101 | final List results = new ArrayList<>();
102 | for (final Result result : paginatedResults) {
103 | results.add(
104 | new UiResult(replayServer, replayCollection, result, timeZone));
105 | }
106 | this.results = Collections.unmodifiableList(results);
107 |
108 | final List pagination = new ArrayList<>();
109 | final StringBuilder hrefBaseBuilder = new StringBuilder();
110 | try {
111 | hrefBaseBuilder.append('?')
112 | .append(SearchServlet.REQUEST_PARAMETER_TERMS).append('=')
113 | .append(URLEncoder.encode(query.getTerms(), "UTF-8"));
114 | if (query.getFrom() != null) {
115 | hrefBaseBuilder.append('&')
116 | .append(SearchServlet.REQUEST_PARAMETER_FROM).append('=')
117 | .append(URLEncoder.encode(this.query.from.timePickerValue, "UTF-8"));
118 | }
119 | if (query.getTo() != null) {
120 | hrefBaseBuilder.append('&')
121 | .append(SearchServlet.REQUEST_PARAMETER_TO).append('=')
122 | .append(URLEncoder.encode(this.query.to.timePickerValue, "UTF-8"));
123 | }
124 | hrefBaseBuilder.append("&page=");
125 | } catch (final UnsupportedEncodingException e) {
126 | throw new RuntimeException(e);
127 | }
128 | final String hrefBase = hrefBaseBuilder.toString();
129 | // to first
130 | pagination.add(new UiPaginationLink(
131 | 1, "«", hrefBase + "1",
132 | false, pageNumber == 1));
133 | // pages
134 | for (int p = 1; p <= numPages; ++p) {
135 | pagination.add(new UiPaginationLink(
136 | p, String.valueOf(p), hrefBase + p,
137 | p == pageNumber, false));
138 | }
139 | // to last
140 | pagination.add(new UiPaginationLink(
141 | numPages, "»", hrefBase + numPages,
142 | false, pageNumber == numPages));
143 | this.pagination = Collections.unmodifiableList(pagination);
144 | }
145 |
146 | /////////////////////////////////////////////////////////////////////////////
147 | // HELPER CLASSES
148 |
149 | /**
150 | * Model for a WASP query in a user interface web page.
151 | *
152 | * @author johannes.kiesel@uni-weimar.de
153 | *
154 | */
155 | public static final class UiQuery {
156 |
157 | ///////////////////////////////////////////////////////////////////////////
158 | // MEMBERS
159 | ///////////////////////////////////////////////////////////////////////////
160 |
161 | public final String terms;
162 |
163 | public final String termsUrl;
164 |
165 | public final UiInstant from;
166 |
167 | public final UiInstant to;
168 |
169 | public final int pageNumber;
170 |
171 | ///////////////////////////////////////////////////////////////////////////
172 | // CONSTRUCTION
173 | ///////////////////////////////////////////////////////////////////////////
174 |
175 | /**
176 | * Creates a new query for a WASP page.
177 | * @param query The original WASP query
178 | * @param timeZone The time zone of the user client
179 | */
180 | protected UiQuery(
181 | final Query query, final int pageNumber, final TimeZone timeZone) {
182 | this.terms = query.getTerms();
183 | try {
184 | this.termsUrl = URLEncoder.encode(this.terms, "UTF-8");
185 | } catch (final UnsupportedEncodingException exception) {
186 | throw new RuntimeException(exception);
187 | }
188 | this.from = new UiInstant(query.getFrom(), timeZone, true, false);
189 | this.to = new UiInstant(query.getTo(), timeZone, false, true);
190 | this.pageNumber = pageNumber;
191 | }
192 |
193 | }
194 |
195 | /**
196 | * Model for a WASP result in a user interface web page.
197 | *
198 | * @author johannes.kiesel@uni-weimar.de
199 | *
200 | */
201 | public static final class UiResult {
202 |
203 | ///////////////////////////////////////////////////////////////////////////
204 | // MEMBERS
205 | ///////////////////////////////////////////////////////////////////////////
206 |
207 | public final String title;
208 |
209 | public final UiInstant date;
210 |
211 | public final String liveUri;
212 |
213 | public final String liveUriShortened;
214 |
215 | public final String replayUri;
216 |
217 | public final String snippet;
218 |
219 | ///////////////////////////////////////////////////////////////////////////
220 | // CONSTRUCTION
221 | ///////////////////////////////////////////////////////////////////////////
222 |
223 | /**
224 | * Creates a new result for a WASP page.
225 | * @param replayServer The URL of the replay server (including port and
226 | * optional path up to the collection name)
227 | * @param replayCollection The name of the collection to replay from
228 | * @param result One result retrieved for the query
229 | * @param timeZone The time zone of the user client
230 | */
231 | protected UiResult(
232 | final String replayServer, final String replayCollection,
233 | final Result result, final TimeZone timeZone) {
234 | this.title = result.getResponse().getTitle();
235 |
236 | this.date = new UiInstant(
237 | result.getMatchedRequest().getDate(), timeZone, false, false);
238 |
239 | this.liveUri = result.getMatchedRequest().getUri();
240 | if (this.liveUri.length() <= MAX_URI_DISPLAY_LENGTH) {
241 | this.liveUriShortened = this.liveUri;
242 | } else {
243 | final int splitIndex = (MAX_URI_DISPLAY_LENGTH - 3) / 2;
244 | this.liveUriShortened = this.liveUri.substring(0, splitIndex) + "..."
245 | + this.liveUri.substring(this.liveUri.length() - splitIndex);
246 | }
247 |
248 | this.replayUri = String.format("%s/%s/%s/%s",
249 | Objects.requireNonNull(replayServer),
250 | Objects.requireNonNull(replayCollection),
251 | this.date.replayPathValue,
252 | this.liveUri);
253 |
254 | this.snippet = result.getSnippet();
255 | /*
256 | * StringEscapeUtils.escapeHtml4(
257 | *
258 | final Pattern highlightStartPattern = Pattern.compile("<em>");
259 | final String startUnescaped =
260 | highlightStartPattern.matcher(htmlEscaped).replaceAll(
261 | "");
262 | final Pattern highlightEndPattern = Pattern.compile("</em>");
263 | return highlightEndPattern.matcher(startUnescaped).replaceAll("");
264 | */
265 | }
266 |
267 | }
268 |
269 | /**
270 | * Model for an instant in a user interface web page.
271 | *
272 | * @author johannes.kiesel@uni-weimar.de
273 | *
274 | */
275 | public static final class UiInstant {
276 |
277 | ///////////////////////////////////////////////////////////////////////////
278 | // CONSTANTS
279 | ///////////////////////////////////////////////////////////////////////////
280 |
281 | protected static final DateTimeFormatter DATE_TIME_PICKER_FORMATTER =
282 | DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm");
283 |
284 | protected static final DateTimeFormatter REPLAY_FORMATTER =
285 | DateTimeFormatter.ofPattern("yyyyMMddHHmmss");
286 |
287 | ///////////////////////////////////////////////////////////////////////////
288 | // MEMBERS
289 | ///////////////////////////////////////////////////////////////////////////
290 |
291 | public final String text;
292 |
293 | public final String iso;
294 |
295 | public final String timePickerValue;
296 |
297 | public final String replayPathValue;
298 |
299 | ///////////////////////////////////////////////////////////////////////////
300 | // CONSTRUCTION
301 | ///////////////////////////////////////////////////////////////////////////
302 |
303 | /**
304 | * Creates a new instant for a WASP page.
305 | * @param instant The instant or null for none
306 | * @param timeZone The time zone of the user client
307 | * @param isFrom Whether this instant denotes the start of a time interval
308 | * @param isTo Whether this instant denotes the end of a time interval
309 | */
310 | protected UiInstant(
311 | final Instant instant, final TimeZone timeZone,
312 | final boolean isFrom, final boolean isTo) {
313 | if (instant == null) {
314 | this.iso = null;
315 | this.timePickerValue = null;
316 | this.replayPathValue = null;
317 | if (isFrom) {
318 | this.text = "beginning";
319 | } else if (isTo) {
320 | this.text = "now";
321 | } else {
322 | this.text = null;
323 | }
324 | } else {
325 | this.iso = instant.toString();
326 | this.timePickerValue = DATE_TIME_PICKER_FORMATTER.format(instant
327 | .atZone(timeZone.toZoneId()));
328 | this.replayPathValue = REPLAY_FORMATTER.format(instant
329 | .atOffset(ZoneOffset.UTC));
330 | this.text = this.timePickerValue;
331 | }
332 | }
333 |
334 | }
335 |
336 | /**
337 | * Model for a link to a different result page in a user interface web page.
338 | *
339 | * @author johannes.kiesel@uni-weimar.de
340 | *
341 | */
342 | public static final class UiPaginationLink {
343 |
344 | ///////////////////////////////////////////////////////////////////////////
345 | // MEMBERS
346 | ///////////////////////////////////////////////////////////////////////////
347 |
348 | public final int number;
349 |
350 | public final String text;
351 |
352 | public final String link;
353 |
354 | public final boolean isActive;
355 |
356 | public final boolean isDisabled;
357 |
358 | ///////////////////////////////////////////////////////////////////////////
359 | // CONSTRUCTION
360 | ///////////////////////////////////////////////////////////////////////////
361 |
362 | /**
363 | * Creates a new pagination link for a WASP page.
364 | * @param number The target page number
365 | * @param text The text to show
366 | * @param link The link to the page
367 | * @param isActive Whether this link leads to the current page
368 | * @param isDisabled Whether this link is disabled
369 | */
370 | public UiPaginationLink(
371 | final int number, final String text, final String link,
372 | final boolean isActive, final boolean isDisabled) {
373 | this.number = number;
374 | this.text = Objects.requireNonNull(text);
375 | this.link = Objects.requireNonNull(link);
376 | this.isActive = isActive;
377 | this.isDisabled = isDisabled;
378 | }
379 |
380 | }
381 |
382 | public static void main(String[] args) {
383 | final MustacheFactory factory = new DefaultMustacheFactory();
384 | final Mustache pageRenderer = factory.compile(new InputStreamReader(
385 | SearchServlet.class.getResourceAsStream("search.mustache")),
386 | "search.mustache");
387 | final Query query = new Query("foo bar", null, Instant.now());
388 | final List results = List.of(
389 | new Result(0.5, "my snippet",
390 | new ResponseRecord("foo", "bar", null, null),
391 | new RequestRecord("https://webis.de", Instant.now())),
392 | new Result(0.25, "my second snippet",
393 | new ResponseRecord("foo2", "bar2", null, null),
394 | new RequestRecord("https://webis.de", Instant.now())));
395 | final int pageNumber = 1;
396 | final int numPages = 3;
397 | final UiPage page = new UiPage(
398 | "https://wasp.de", "mywasp",
399 | query, results, pageNumber, numPages,
400 | Locale.ENGLISH, TimeZone.getDefault());
401 | final StringWriter writer = new StringWriter();
402 | pageRenderer.execute(writer, page);
403 | System.out.println(writer.toString());
404 | }
405 |
406 | }
407 |
--------------------------------------------------------------------------------
/src/de/webis/wasp/warcs/ArchiveWatcher.java:
--------------------------------------------------------------------------------
1 | package de.webis.wasp.warcs;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.nio.file.FileSystems;
6 | import java.nio.file.Path;
7 | import java.nio.file.StandardWatchEventKinds;
8 | import java.nio.file.WatchEvent;
9 | import java.nio.file.WatchKey;
10 | import java.nio.file.WatchService;
11 | import java.util.Arrays;
12 | import java.util.Comparator;
13 | import java.util.function.Consumer;
14 | import java.util.logging.Level;
15 | import java.util.logging.Logger;
16 |
17 | import edu.cmu.lemurproject.WarcRecord;
18 |
19 | /**
20 | * Creates a {@link Thread} that watches a web archives repository for new
21 | * content and passes the new records to a consumer.
22 | *
23 | * If archives exist already in the directory, they are read in order of their
24 | * last modified dates (if set so in the constructor). In this case, it will
25 | * monitor the latest archive for changes, but not the others!
26 | *
27 | * Currently, this treats every file (or directory) within the target directory
28 | * as an archive and tries to read from it.
29 | *
30 | *
31 | * @author johannes.kiesel@uni-weimar.de
32 | *
33 | */
34 | public class ArchiveWatcher
35 | extends Thread
36 | implements AutoCloseable {
37 |
38 | /////////////////////////////////////////////////////////////////////////////
39 | // LOGGING
40 | /////////////////////////////////////////////////////////////////////////////
41 |
42 | private static final Logger LOG =
43 | Logger.getLogger(ArchiveWatcher.class.getName());
44 |
45 | /////////////////////////////////////////////////////////////////////////////
46 | // MEMBERS
47 | /////////////////////////////////////////////////////////////////////////////
48 |
49 | private final Path directory;
50 |
51 | private final WatchService watchService;
52 |
53 | private final Consumer consumer;
54 |
55 | private WarcRecordReader reader;
56 |
57 | /////////////////////////////////////////////////////////////////////////////
58 | // CONSTRUCTION
59 | /////////////////////////////////////////////////////////////////////////////
60 |
61 | /**
62 | * Create a new watcher for given directory.
63 | * @param directory The directory that contains the archive files
64 | * @param readExistingRecords Whether records that already exist in the
65 | * archives in the directory should be read
66 | * @param consumer The consumer to which the records will be passed
67 | * @throws IOException On reading records
68 | */
69 | public ArchiveWatcher(
70 | final Path directory, final boolean readExistingRecords,
71 | final Consumer consumer)
72 | throws IOException {
73 | if (consumer == null) { throw new NullPointerException(); }
74 | this.directory = directory;
75 | this.consumer = consumer;
76 | this.reader = null;
77 |
78 | this.initForDirectory(readExistingRecords);
79 |
80 | this.watchService = FileSystems.getDefault().newWatchService();
81 | this.getDirectory().register(this.getWatchService(),
82 | StandardWatchEventKinds.ENTRY_CREATE);
83 | }
84 |
85 | private void initForDirectory(final boolean readExistingRecords)
86 | throws IOException {
87 | final File[] children = this.getDirectory().toFile().listFiles();
88 | Arrays.sort(children, new Comparator() {
89 | @Override
90 | public int compare(final File o1, final File o2) {
91 | return Long.compare(o1.lastModified(), o2.lastModified());
92 | }
93 | });
94 |
95 | if (readExistingRecords) {
96 | // Read what should be closed files
97 | if (children.length >= 2) {
98 | for (final File child
99 | : Arrays.copyOfRange(children, 0, children.length - 1)) {
100 | try (final WarcRecordReader reader = new WarcRecordReader(
101 | this.getDirectory().resolve(child.getName()),
102 | this.getConsumer())) {
103 | reader.run();
104 | }
105 | }
106 | }
107 | }
108 |
109 | // Read what may be the open file
110 | if (children.length >= 1) {
111 | this.openFile(this.getDirectory().resolve(
112 | children[children.length - 1].getName()), readExistingRecords);
113 | }
114 | }
115 |
116 | /////////////////////////////////////////////////////////////////////////////
117 | // GETTERS
118 | /////////////////////////////////////////////////////////////////////////////
119 |
120 | /**
121 | * Gets the directory being watched.
122 | * @return The directory
123 | */
124 | public Path getDirectory() {
125 | return this.directory;
126 | }
127 |
128 | /**
129 | * Gets the service watching for changes in the directory.
130 | * @return The service
131 | */
132 | protected WatchService getWatchService() {
133 | return this.watchService;
134 | }
135 |
136 | /**
137 | * Gets the consumer to which WARC records are passed to.
138 | * @return The consumer
139 | */
140 | public Consumer getConsumer() {
141 | return this.consumer;
142 | }
143 |
144 | /**
145 | * Gets the current WARC record reader.
146 | * @return The reader
147 | */
148 | protected WarcRecordReader getReader() {
149 | return this.reader;
150 | }
151 |
152 | /////////////////////////////////////////////////////////////////////////////
153 | // SETTER
154 | /////////////////////////////////////////////////////////////////////////////
155 |
156 | /**
157 | * Sets the WARC record reader.
158 | * @param reader The reader
159 | */
160 | public void setReader(final WarcRecordReader reader) {
161 | this.reader = reader;
162 | }
163 |
164 | /////////////////////////////////////////////////////////////////////////////
165 | // FUNCTIONALITY
166 | /////////////////////////////////////////////////////////////////////////////
167 |
168 | @Override
169 | public void run() {
170 | final Path directory = this.getDirectory();
171 | try {
172 | while (true) {
173 | final WatchKey key = this.getWatchService().take();
174 | for (final WatchEvent> event : key.pollEvents()) {
175 | final WatchEvent.Kind> kind = event.kind();
176 | if (kind == StandardWatchEventKinds.ENTRY_CREATE) {
177 | final Path inputFile = directory.resolve((Path) event.context());
178 | LOG.fine("New file created in " + directory + ": " + inputFile);
179 | this.openFile(inputFile, true);
180 | } else if (kind == StandardWatchEventKinds.OVERFLOW) {
181 | LOG.warning("Overflow detected when watching " + directory);
182 | } else {
183 | LOG.warning("Unknown watch event kind '" + kind + "' when watching "
184 | + directory);
185 | }
186 | }
187 |
188 | if (!key.reset()) {
189 | LOG.severe(
190 | "Directory " + directory + " can no longer be watched");
191 | break;
192 | }
193 | }
194 | } catch (final InterruptedException exception) {
195 | LOG.log(Level.SEVERE,
196 | "Interrupted watching " + directory, exception);
197 | } catch (final IOException exception) {
198 | LOG.log(Level.SEVERE, "Error watching " + directory, exception);
199 | }
200 | }
201 |
202 | @Override
203 | public void close() throws IOException {
204 | this.closeFile();
205 | }
206 |
207 | /**
208 | * Closes the currently opened file, if any.
209 | * @throws IOException On closing the file
210 | * @see {@link #openFile(Path, boolean)}
211 | */
212 | protected void closeFile() throws IOException {
213 | synchronized (this) {
214 | final WarcRecordReader reader = this.getReader();
215 | if (reader != null) {
216 | this.setReader(null);
217 | reader.close();
218 | }
219 | }
220 | }
221 |
222 | /**
223 | * Starts reading from a new file, keeping watch if records are appended.
224 | * @param inputFile The file to read
225 | * @param consumeExistingRecords Whether to also pass existing records to the
226 | * consumer
227 | * @throws IOException On opening the file
228 | */
229 | protected void openFile(
230 | final Path inputFile, final boolean consumeExistingRecords)
231 | throws IOException {
232 | synchronized (this) {
233 | this.closeFile();
234 | final WarcRecordReader reader = new ContinuousWarcRecordReader(
235 | inputFile, consumeExistingRecords, this.getConsumer(), 1000);
236 | this.setReader(reader);
237 | reader.start();
238 | }
239 | }
240 |
241 | }
242 |
--------------------------------------------------------------------------------
/src/de/webis/wasp/warcs/ContinuousWarcRecordReader.java:
--------------------------------------------------------------------------------
1 | package de.webis.wasp.warcs;
2 |
3 | import java.io.File;
4 | import java.io.FileInputStream;
5 | import java.io.FileNotFoundException;
6 | import java.io.IOException;
7 | import java.nio.file.Path;
8 | import java.util.function.Consumer;
9 | import java.util.logging.Level;
10 | import java.util.logging.Logger;
11 |
12 | import edu.cmu.lemurproject.WarcRecord;
13 |
14 | /**
15 | * A {@link WarcRecordReader} that will wait for new content even when it
16 | * reached the end of the archive.
17 | *
18 | * This class should be used for archives that are still filled. When you use
19 | * {@link #close()}, this reader will still continue to read until it
20 | * encounters the end of the file the next time.
21 | *
22 | *
23 | * @author johannes.kiesel@uni-weimar.de
24 | *
25 | */
26 | public class ContinuousWarcRecordReader extends WarcRecordReader {
27 |
28 | /////////////////////////////////////////////////////////////////////////////
29 | // LOGGING
30 | /////////////////////////////////////////////////////////////////////////////
31 |
32 | private static final Logger LOG =
33 | Logger.getLogger(ContinuousWarcRecordReader.class.getName());
34 |
35 | /////////////////////////////////////////////////////////////////////////////
36 | // MEMBERS
37 | /////////////////////////////////////////////////////////////////////////////
38 |
39 | protected final long pollIntervalMillis;
40 |
41 | protected boolean consume;
42 |
43 | protected boolean closed;
44 |
45 | /////////////////////////////////////////////////////////////////////////////
46 | // CONSTRUCTORS
47 | /////////////////////////////////////////////////////////////////////////////
48 |
49 | /**
50 | * Creates a new reader for an archive that is still being filled.
51 | * @param inputFile The archive file
52 | * @param consumeExistingRecords Whether records that are already in the file
53 | * should also be consumed
54 | * @param consumer Consumer for the WARC records that are read
55 | * @param pollIntervalMillis On encountering the end of archive, poll the file
56 | * in this interval to check when it has more content
57 | * @throws IOException When the file can not be opened
58 | */
59 | public ContinuousWarcRecordReader(
60 | final Path inputFile, final boolean consumeExistingRecords,
61 | final Consumer consumer,
62 | final long pollIntervalMillis)
63 | throws IOException {
64 | super(inputFile, consumer);
65 | this.pollIntervalMillis = pollIntervalMillis;
66 | this.consume = consumeExistingRecords;
67 | this.closed = false;
68 | }
69 |
70 | @Override
71 | protected FileInputStream openFileInputStream()
72 | throws IOException {
73 | final File file = this.getInputFile().toFile();
74 | LOG.fine("Open file: " + file);
75 | return new ContinuousFileInputStream(file);
76 | }
77 |
78 | /////////////////////////////////////////////////////////////////////////////
79 | // FUNCTIONALITY
80 | /////////////////////////////////////////////////////////////////////////////
81 |
82 | @Override
83 | public void close() throws IOException {
84 | LOG.fine("Closing " + this.getInputFile());
85 | this.closed = true;
86 | }
87 |
88 | @Override
89 | protected void consume(final WarcRecord record) {
90 | if (this.consume) {
91 | super.getConsumer().accept(record);
92 | }
93 | }
94 |
95 | protected void closeStream() throws IOException {
96 | super.close();
97 | }
98 |
99 | /////////////////////////////////////////////////////////////////////////////
100 | // HELPER CLASSES
101 | /////////////////////////////////////////////////////////////////////////////
102 |
103 | /**
104 | * Modification of {@link FileInputStream} that does waits at the end of the
105 | * file for more content to appear.
106 | *
107 | * @author johannes.kiesel@uni-weimar.de
108 | */
109 | protected class ContinuousFileInputStream
110 | extends FileInputStream {
111 |
112 | public ContinuousFileInputStream(final File file)
113 | throws FileNotFoundException {
114 | super(file);
115 | }
116 |
117 | @Override
118 | public int available() throws IOException {
119 | int available = super.available();
120 | try {
121 | while (available == 0 && !ContinuousWarcRecordReader.this.closed) {
122 | ContinuousWarcRecordReader.this.consume = true;
123 | Thread.sleep(ContinuousWarcRecordReader.this.pollIntervalMillis);
124 | available = super.available();
125 | }
126 | } catch (final InterruptedException exception) {
127 | LOG.log(Level.WARNING, "Interrupted " + this, exception);
128 | }
129 |
130 | if (ContinuousWarcRecordReader.this.closed) {
131 | ContinuousWarcRecordReader.this.closeStream();
132 | }
133 | return available;
134 | }
135 |
136 | @Override
137 | public int read() throws IOException {
138 | int read = super.read();
139 | try {
140 | while (read == -1 && !ContinuousWarcRecordReader.this.closed) {
141 | ContinuousWarcRecordReader.this.consume = true;
142 | Thread.sleep(ContinuousWarcRecordReader.this.pollIntervalMillis);
143 | read = super.read();
144 | }
145 | } catch (final InterruptedException exception) {
146 | LOG.log(Level.WARNING, "Interrupted " + this, exception);
147 | }
148 |
149 | if (ContinuousWarcRecordReader.this.closed) {
150 | ContinuousWarcRecordReader.this.closeStream();
151 | }
152 | return read;
153 | }
154 |
155 | @Override
156 | public int read(byte b[]) throws IOException {
157 | int read = super.read(b);
158 | try {
159 | while (read == -1 && !ContinuousWarcRecordReader.this.closed) {
160 | ContinuousWarcRecordReader.this.consume = true;
161 | Thread.sleep(ContinuousWarcRecordReader.this.pollIntervalMillis);
162 | read = super.read(b);
163 | }
164 | } catch (final InterruptedException exception) {
165 | LOG.log(Level.WARNING, "Interrupted " + this, exception);
166 | }
167 |
168 | if (ContinuousWarcRecordReader.this.closed) {
169 | ContinuousWarcRecordReader.this.closeStream();
170 | }
171 | return read;
172 | }
173 |
174 | @Override
175 | public int read(byte b[], int off, int len) throws IOException {
176 | int read = super.read(b, off, len);
177 | try {
178 | while (read == -1 && !ContinuousWarcRecordReader.this.closed) {
179 | ContinuousWarcRecordReader.this.consume = true;
180 | Thread.sleep(ContinuousWarcRecordReader.this.pollIntervalMillis);
181 | read = super.read(b, off, len);
182 | }
183 | } catch (final InterruptedException exception) {
184 | LOG.log(Level.WARNING, "Interrupted " + this, exception);
185 | }
186 |
187 | if (ContinuousWarcRecordReader.this.closed) {
188 | ContinuousWarcRecordReader.this.closeStream();
189 | }
190 | return read;
191 | }
192 |
193 | }
194 |
195 | }
196 |
--------------------------------------------------------------------------------
/src/de/webis/wasp/warcs/GenericHtmlWarcRecordConsumer.java:
--------------------------------------------------------------------------------
1 | package de.webis.wasp.warcs;
2 |
3 | import java.io.IOException;
4 | import java.time.Instant;
5 | import java.util.Objects;
6 | import java.util.function.Function;
7 |
8 | /**
9 | * Generic class for consuming HTML WARC records.
10 | *
11 | * @author johannes.kiesel@uni-weimar.de
12 | *
13 | */
14 | public abstract class GenericHtmlWarcRecordConsumer
15 | extends GenericWarcRecordConsumer {
16 |
17 | /////////////////////////////////////////////////////////////////////////////
18 | // CONSTANTS
19 | /////////////////////////////////////////////////////////////////////////////
20 |
21 | /**
22 | * Default function for extracting HTML from response records.
23 | */
24 | public static final Function DEFAULT_DOCUMENT_EXTRACTOR =
25 | JerichoDocumentExtractor.INSTANCE;
26 |
27 | /////////////////////////////////////////////////////////////////////////////
28 | // MEMBERS
29 | /////////////////////////////////////////////////////////////////////////////
30 |
31 | private Function documentExtractor;
32 |
33 | /////////////////////////////////////////////////////////////////////////////
34 | // CONSTRUCTION
35 | /////////////////////////////////////////////////////////////////////////////
36 |
37 | /**
38 | * Creates a new consumer using the default extractor for HTML responses.
39 | */
40 | public GenericHtmlWarcRecordConsumer() {
41 | this(DEFAULT_DOCUMENT_EXTRACTOR);
42 | }
43 |
44 | /**
45 | * Creates a new consumer using the specified extractor for HTML responses.
46 | * @param documentExtractor The extractor
47 | */
48 | public GenericHtmlWarcRecordConsumer(
49 | final Function documentExtractor) {
50 | this.setDocumentExtractor(documentExtractor);
51 | }
52 |
53 | /////////////////////////////////////////////////////////////////////////////
54 | // GETTERS
55 | /////////////////////////////////////////////////////////////////////////////
56 |
57 | /**
58 | * Gets the document extractor for HTML responses.
59 | * @return The extractor
60 | */
61 | public Function getDocumentExtractor() {
62 | return this.documentExtractor;
63 | }
64 |
65 | /////////////////////////////////////////////////////////////////////////////
66 | // SETTERS
67 | /////////////////////////////////////////////////////////////////////////////
68 |
69 | /**
70 | * Sets the document extractor for HTML responses.
71 | * @param documentExtractor The extractor
72 | */
73 | protected void setDocumentExtractor(
74 | final Function documentExtractor) {
75 | this.documentExtractor = Objects.requireNonNull(documentExtractor);
76 | }
77 |
78 | /////////////////////////////////////////////////////////////////////////////
79 | // FUNCTIONALITY
80 | /////////////////////////////////////////////////////////////////////////////
81 |
82 | @Override
83 | protected void acceptHtmlResponse(
84 | final String id, final String uri, final String html, final Instant time)
85 | throws IOException {
86 | final Document document = this.getDocumentExtractor().apply(html);
87 | this.acceptHtmlResponse(id, uri, document, time);
88 | }
89 |
90 | protected abstract void acceptHtmlResponse(
91 | final String id, final String uri, final Document document,
92 | final Instant time)
93 | throws IOException;
94 |
95 | /////////////////////////////////////////////////////////////////////////////
96 | // DOCUMENT
97 | /////////////////////////////////////////////////////////////////////////////
98 |
99 | /**
100 | * A processed document.
101 | *
102 | * @author johannes.kiesel@uni-weimar.de
103 | *
104 | */
105 | public static final class Document {
106 |
107 | private final String title;
108 |
109 | private final String content;
110 |
111 | /**
112 | * Creates a new document.
113 | * @param title The document's title (or null)
114 | * @param content The document's content (or null)
115 | */
116 | public Document(final String title, final String content) {
117 | this.title = title;
118 | this.content = content;
119 | }
120 |
121 | /**
122 | * Gets the title of the document.
123 | * @return The title (may be null or empty)
124 | */
125 | public String getTitle() {
126 | return this.title;
127 | }
128 |
129 | /**
130 | * Gets the text content of the document.
131 | * @return The content (may be null or empty)
132 | */
133 | public String getContent() {
134 | return this.content;
135 | }
136 |
137 | }
138 |
139 | }
140 |
--------------------------------------------------------------------------------
/src/de/webis/wasp/warcs/GenericWarcRecordConsumer.java:
--------------------------------------------------------------------------------
1 | package de.webis.wasp.warcs;
2 |
3 | import java.io.IOException;
4 | import java.util.function.Consumer;
5 | import java.util.logging.Level;
6 | import java.util.logging.Logger;
7 | import java.time.Instant;
8 |
9 | import org.apache.http.HttpResponse;
10 |
11 | import edu.cmu.lemurproject.WarcRecord;
12 |
13 | /**
14 | * Generic class for consuming WARC records with methods for different records.
15 | *
16 | * @author johannes.kiesel@uni-weimar.de
17 | *
18 | */
19 | public abstract class GenericWarcRecordConsumer
20 | implements Consumer {
21 |
22 | /////////////////////////////////////////////////////////////////////////////
23 | // LOGGING
24 | /////////////////////////////////////////////////////////////////////////////
25 |
26 | private static final Logger LOG =
27 | Logger.getLogger(GenericWarcRecordConsumer.class.getName());
28 |
29 | /////////////////////////////////////////////////////////////////////////////
30 | // FUNCTIONALITY
31 | /////////////////////////////////////////////////////////////////////////////
32 |
33 | @Override
34 | public void accept(final WarcRecord record) {
35 | final String type = Warcs.getType(record);
36 |
37 | final Instant time =
38 | Instant.ofEpochSecond(Warcs.getDate(record).getEpochSecond());
39 | try {
40 | switch (type) {
41 | case Warcs.HEADER_TYPE_RESPONSE:
42 | this.acceptResponse(record, time);
43 | break;
44 | case Warcs.HEADER_TYPE_REQUEST:
45 | this.acceptRequest(record, time);
46 | break;
47 | case Warcs.HEADER_TYPE_REVISIT:
48 | this.acceptRevisit(record, time);
49 | break;
50 | default:
51 | break;
52 | }
53 | } catch (final Throwable exception) {
54 | LOG.log(Level.WARNING, "Failed to index record " + Warcs.getId(record)
55 | + " of type " + Warcs.getType(record), exception);
56 | }
57 | }
58 |
59 | /////////////////////////////////////////////////////////////////////////////
60 | // Response
61 |
62 | protected void acceptResponse(final WarcRecord record, final Instant time)
63 | throws IOException {
64 | final String id = Warcs.getId(record);
65 | final String uri = Warcs.getTargetUri(record);
66 | final String html = this.getHtml(record);
67 | if (html != null) {
68 | LOG.fine("accept html response " + id + " -> " + uri);
69 | this.acceptHtmlResponse(id, uri, html, time);
70 | } else {
71 | LOG.fine("accept non-html response " + id + " -> " + uri);
72 | this.acceptNonHtmlResponse(id, uri, time);
73 | }
74 | }
75 |
76 | protected void acceptNonHtmlResponse(
77 | final String id, final String uri, final Instant time)
78 | throws IOException {
79 | // do nothing by default
80 | }
81 |
82 | protected void acceptHtmlResponse(
83 | final String id, final String uri, final String html, final Instant time)
84 | throws IOException {
85 | // do nothing by default
86 | }
87 |
88 | /////////////////////////////////////////////////////////////////////////////
89 | // Revisit
90 |
91 | protected void acceptRevisit(final WarcRecord record, final Instant time)
92 | throws IOException {
93 | this.acceptRevisit(
94 | Warcs.getId(record),
95 | Warcs.getReferedToTargetUri(record),
96 | Warcs.getReferedToDate(record),
97 | time);
98 | }
99 |
100 | protected void acceptRevisit(
101 | final String id, final String uri, final Instant originalTime,
102 | final Instant time)
103 | throws IOException {
104 | // do nothing by default
105 | }
106 |
107 | /////////////////////////////////////////////////////////////////////////////
108 | // Request
109 |
110 | protected void acceptRequest(final WarcRecord record, final Instant time)
111 | throws IOException {
112 | this.acceptRequest(
113 | Warcs.getConcurrentRecordId(record),
114 | Warcs.getTargetUri(record),
115 | time);
116 | }
117 |
118 | protected void acceptRequest(
119 | final String concurrentRecordId,
120 | final String targetUri,
121 | final Instant time)
122 | throws IOException {
123 | // do nothing by default
124 | }
125 |
126 | /////////////////////////////////////////////////////////////////////////////
127 | // HELPERS
128 | /////////////////////////////////////////////////////////////////////////////
129 |
130 | /**
131 | * Gets the HTML from a response WARC record.
132 | * @param record The record
133 | * @return The HTML if it exists, or null
134 | */
135 | protected String getHtml(final WarcRecord record) {
136 | try {
137 | final HttpResponse response = Warcs.toResponse(record);
138 | if (Warcs.isHtml(response)) {
139 | return Warcs.getHtml(record);
140 | }
141 | } catch (final Throwable exception) {
142 | LOG.log(Level.FINER,
143 | "Could not parse record " + Warcs.getId(record),
144 | exception);
145 | }
146 | return null;
147 | }
148 |
149 | }
150 |
--------------------------------------------------------------------------------
/src/de/webis/wasp/warcs/JerichoDocumentExtractor.java:
--------------------------------------------------------------------------------
1 | package de.webis.wasp.warcs;
2 |
3 | import java.util.Objects;
4 | import java.util.function.Function;
5 |
6 | import de.webis.wasp.warcs.GenericHtmlWarcRecordConsumer.Document;
7 | import net.htmlparser.jericho.CharacterReference;
8 | import net.htmlparser.jericho.Element;
9 | import net.htmlparser.jericho.HTMLElementName;
10 | import net.htmlparser.jericho.Renderer;
11 | import net.htmlparser.jericho.Source;
12 |
13 | /**
14 | * A document extractor using Jericho HTML parser.
15 | *
16 | * @author johannes.kiesel@uni-weimar.de
17 | *
18 | */
19 | public class JerichoDocumentExtractor
20 | implements Function {
21 |
22 | /**
23 | * The single instance of the extractor.
24 | */
25 | public static final JerichoDocumentExtractor INSTANCE =
26 | new JerichoDocumentExtractor();
27 |
28 | protected JerichoDocumentExtractor() { }
29 |
30 | @Override
31 | public Document apply(final String html) {
32 | final Source source = new Source(Objects.requireNonNull(html));
33 |
34 | final Renderer renderer = new Renderer(source);
35 | renderer.setMaxLineLength(0);
36 | renderer.setIncludeHyperlinkURLs(false);
37 | renderer.setIncludeAlternateText(true);
38 | final String content = renderer.toString();
39 |
40 | final Element titleElement =
41 | source.getFirstElement(HTMLElementName.TITLE);
42 | final String title = titleElement == null
43 | ? null
44 | : CharacterReference.decodeCollapseWhiteSpace(
45 | titleElement.getContent());
46 |
47 | return new Document(title, content);
48 | }
49 |
50 | }
51 |
--------------------------------------------------------------------------------
/src/de/webis/wasp/warcs/WarcRecordReader.java:
--------------------------------------------------------------------------------
1 | package de.webis.wasp.warcs;
2 |
3 | import java.io.DataInputStream;
4 | import java.io.File;
5 | import java.io.FileInputStream;
6 | import java.io.IOException;
7 | import java.io.InputStream;
8 | import java.nio.file.Path;
9 | import java.util.function.Consumer;
10 | import java.util.logging.Level;
11 | import java.util.logging.Logger;
12 | import java.util.zip.GZIPInputStream;
13 |
14 | import edu.cmu.lemurproject.WarcRecord;
15 |
16 | /**
17 | * Reader for WARC files that passes all read records to a consumer.
18 | *
19 | * Use the {@link #run()} or {@link #start()} methods to begin reading.
20 | *
21 | * If the archive is still being filled, use {@link ContinuousWarcRecordReader}
22 | * instead.
23 | *
24 | *
25 | * @author johannes.kiesel@uni-weimar.de
26 | *
27 | */
28 | public class WarcRecordReader
29 | extends Thread
30 | implements AutoCloseable {
31 |
32 | /////////////////////////////////////////////////////////////////////////////
33 | // LOGGING
34 | /////////////////////////////////////////////////////////////////////////////
35 |
36 | private static final Logger LOG =
37 | Logger.getLogger(WarcRecordReader.class.getName());
38 |
39 | /////////////////////////////////////////////////////////////////////////////
40 | // MEMBERS
41 | /////////////////////////////////////////////////////////////////////////////
42 |
43 | private final Consumer consumer;
44 |
45 | private final Path inputFile;
46 |
47 | private final DataInputStream input;
48 |
49 | /////////////////////////////////////////////////////////////////////////////
50 | // CONSTRUCTION
51 | /////////////////////////////////////////////////////////////////////////////
52 |
53 | /**
54 | * Creates a new reader for an archive.
55 | * @param inputFile The archive file
56 | * @param consumer Consumer for the WARC records that are read
57 | * @throws IOException When the file can not be opened
58 | */
59 | public WarcRecordReader(
60 | final Path inputFile, final Consumer consumer)
61 | throws IOException {
62 | if (consumer == null) { throw new NullPointerException(); }
63 | this.consumer = consumer;
64 | this.inputFile = inputFile;
65 | this.input = this.openDataInputStream();
66 | }
67 |
68 | /**
69 | * Opens a data input stream to the reader's file, applying GZip decompression
70 | * if the file ends on .gz.
71 | * @return The input stream
72 | * @throws IOException On opening the file
73 | */
74 | protected DataInputStream openDataInputStream()
75 | throws IOException {
76 | final InputStream inputStream = this.openFileInputStream();
77 | if (this.getInputFile().toString().toLowerCase().endsWith(".gz")) {
78 | return new DataInputStream(new GZIPInputStream(inputStream));
79 | } else {
80 | return new DataInputStream(inputStream);
81 | }
82 | }
83 |
84 | /**
85 | * Opens an input stream to the reader's file.
86 | * @return The input stream
87 | * @throws IOException On opening the file
88 | */
89 | protected FileInputStream openFileInputStream()
90 | throws IOException {
91 | final File file = this.getInputFile().toFile();
92 | LOG.fine("Open file: " + file);
93 | return new FileInputStream(file);
94 | }
95 |
96 | /////////////////////////////////////////////////////////////////////////////
97 | // CONSTRUCTION
98 | /////////////////////////////////////////////////////////////////////////////
99 |
100 | /**
101 | * Gets the file this reader reads from.
102 | * @return The file
103 | */
104 | public Path getInputFile() {
105 | return this.inputFile;
106 | }
107 |
108 | /**
109 | * Gets the consumer to which WARC records are passed to.
110 | * @return The consumer
111 | */
112 | public Consumer getConsumer() {
113 | return this.consumer;
114 | }
115 |
116 | /**
117 | * Gets the input stream.
118 | * @return The stream
119 | */
120 | protected DataInputStream getInput() {
121 | return this.input;
122 | }
123 |
124 | /////////////////////////////////////////////////////////////////////////////
125 | // FUNCTIONALITY
126 | /////////////////////////////////////////////////////////////////////////////
127 |
128 | @Override
129 | public void run() {
130 | final DataInputStream input = this.getInput();
131 | try {
132 | WarcRecord record = WarcRecord.readNextWarcRecord(input);
133 | while (record != null) {
134 | this.consume(record);
135 | record = WarcRecord.readNextWarcRecord(input);
136 | }
137 | LOG.fine("Finished " + this);
138 | this.close();
139 | } catch (final IOException exception) {
140 | LOG.log(Level.SEVERE,
141 | "Error while reading from " + this.getInputFile(), exception);
142 | }
143 | }
144 |
145 | @Override
146 | public void close() throws IOException {
147 | LOG.fine("Close file " + this.getInputFile());
148 | this.getInput().close();
149 | }
150 |
151 | @Override
152 | public String toString() {
153 | return this.getInputFile() + " -> " + this.getConsumer();
154 | }
155 |
156 | /////////////////////////////////////////////////////////////////////////////
157 | // HELPERS
158 | /////////////////////////////////////////////////////////////////////////////
159 |
160 | /**
161 | * Passes the record to the consumer.
162 | * @param record The record
163 | * @see #getConsumer()
164 | */
165 | protected void consume(final WarcRecord record) {
166 | this.getConsumer().accept(record);
167 | }
168 |
169 | }
170 |
--------------------------------------------------------------------------------
/src/de/webis/wasp/warcs/Warcs.java:
--------------------------------------------------------------------------------
1 | package de.webis.wasp.warcs;
2 |
3 | import java.io.ByteArrayInputStream;
4 | import java.io.IOException;
5 | import java.io.InputStream;
6 | import java.time.Instant;
7 | import java.time.format.DateTimeFormatter;
8 | import java.util.Locale;
9 | import java.util.regex.Pattern;
10 | import java.util.zip.GZIPInputStream;
11 |
12 | import org.apache.http.Header;
13 | import org.apache.http.HeaderElement;
14 | import org.apache.http.HttpEntity;
15 | import org.apache.http.HttpException;
16 | import org.apache.http.HttpResponse;
17 | import org.apache.http.ParseException;
18 | import org.apache.http.client.entity.DecompressingEntity;
19 | import org.apache.http.client.entity.DeflateInputStream;
20 | import org.apache.http.client.entity.InputStreamFactory;
21 | import org.apache.http.config.Lookup;
22 | import org.apache.http.config.MessageConstraints;
23 | import org.apache.http.config.RegistryBuilder;
24 | import org.apache.http.entity.BasicHttpEntity;
25 | import org.apache.http.entity.ContentLengthStrategy;
26 | import org.apache.http.impl.DefaultHttpResponseFactory;
27 | import org.apache.http.impl.entity.LaxContentLengthStrategy;
28 | import org.apache.http.impl.io.ChunkedInputStream;
29 | import org.apache.http.impl.io.ContentLengthInputStream;
30 | import org.apache.http.impl.io.DefaultHttpResponseParser;
31 | import org.apache.http.impl.io.EmptyInputStream;
32 | import org.apache.http.impl.io.HttpTransportMetricsImpl;
33 | import org.apache.http.impl.io.IdentityInputStream;
34 | import org.apache.http.impl.io.SessionInputBufferImpl;
35 | import org.apache.http.io.SessionInputBuffer;
36 | import org.apache.http.protocol.HTTP;
37 | import org.apache.http.util.EntityUtils;
38 |
39 | import edu.cmu.lemurproject.WarcRecord;
40 |
41 | /**
42 | * Utility class for working with WARC files.
43 | *
44 | * @author johannes.kiesel@uni-weimar.de
45 | *
46 | */
47 | public class Warcs {
48 |
49 | public static final String HEADER_ID = "WARC-Record-ID";
50 |
51 | public static final String HEADER_TYPE = "WARC-Type";
52 |
53 | public static final String HEADER_TYPE_INFO = "warcinfo";
54 |
55 | public static final String HEADER_TYPE_REQUEST = "request";
56 |
57 | public static final String HEADER_TYPE_RESPONSE = "response";
58 |
59 | public static final String HEADER_TYPE_REVISIT = "revisit";
60 |
61 | public static final String HEADER_REFERS_TO = "WARC-Refers-To";
62 |
63 | public static final String HEADER_REFERS_TO_TARGET_URI ="WARC-Refers-To-Target-URI";
64 |
65 | public static final String HEADER_REFERS_TO_DATE = "WARC-Refers-To-Date";
66 |
67 | public static final String HEADER_TARGET_URI = "WARC-Target-URI";
68 |
69 | public static final String HEADER_CONCURRENT = "WARC-Concurrent-To";
70 |
71 | public static final String HEADER_DATE = "WARC-Date";
72 |
73 | public static final DateTimeFormatter HEADER_DATE_FORMAT =
74 | DateTimeFormatter.ISO_INSTANT;
75 |
76 |
77 | public static final Pattern HTTP_HEADER_CONTENT_TYPE_HTML = Pattern.compile(
78 | "text/html.*");
79 |
80 | public static final String HTTP_HEADER_CONTENT_TYPE = "Content-Type";
81 |
82 | /////////////////////////////////////////////////////////////////////////////
83 | // STATIC HELPERS
84 | /////////////////////////////////////////////////////////////////////////////
85 |
86 | private final static InputStreamFactory GZIP = new InputStreamFactory() {
87 | @Override
88 | public InputStream create(final InputStream instream) throws IOException {
89 | return new GZIPInputStream(instream);
90 | }
91 | };
92 |
93 | private final static InputStreamFactory DEFLATE = new InputStreamFactory() {
94 | @Override
95 | public InputStream create(final InputStream instream) throws IOException {
96 | return new DeflateInputStream(instream);
97 | }
98 | };
99 |
100 | /////////////////////////////////////////////////////////////////////////////
101 | // CONSTRUCTORS
102 | /////////////////////////////////////////////////////////////////////////////
103 |
104 | // Utility class
105 | private Warcs() { }
106 |
107 | /////////////////////////////////////////////////////////////////////////////
108 | // FUNCTIONALITY
109 | /////////////////////////////////////////////////////////////////////////////
110 |
111 | /////////////////////////////////////////////////////////////////////////////
112 | // Access header fields
113 |
114 | public static String getHeader(
115 | final WarcRecord record, final String header) {
116 | return record.getHeaderMetadataItem(header);
117 | }
118 |
119 | public static String getId(final WarcRecord record) {
120 | return Warcs.getHeader(record, HEADER_ID);
121 | }
122 |
123 | public static String getType(final WarcRecord record) {
124 | return Warcs.getHeader(record, HEADER_TYPE);
125 | }
126 |
127 | public static Instant getDate(final WarcRecord record) {
128 | final String date = Warcs.getHeader(record, HEADER_DATE);
129 | return Instant.from(HEADER_DATE_FORMAT.parse(date));
130 | }
131 |
132 | public static String getTargetUri(final WarcRecord record) {
133 | return Warcs.getHeader(record, HEADER_TARGET_URI);
134 | }
135 |
136 | public static String getConcurrentRecordId(final WarcRecord record) {
137 | return Warcs.getHeader(record, HEADER_CONCURRENT);
138 | }
139 |
140 | public static String getReferedToRecordId(final WarcRecord record) {
141 | return Warcs.getHeader(record, HEADER_REFERS_TO);
142 | }
143 |
144 | public static String getReferedToTargetUri(final WarcRecord record) {
145 | return Warcs.getHeader(record, HEADER_REFERS_TO_TARGET_URI);
146 | }
147 |
148 | public static Instant getReferedToDate(final WarcRecord record) {
149 | final String date = Warcs.getHeader(record, HEADER_REFERS_TO_DATE);
150 | return Instant.from(HEADER_DATE_FORMAT.parse(date));
151 | }
152 |
153 | /////////////////////////////////////////////////////////////////////////////
154 | // HTML
155 |
156 | /**
157 | * Checks if this is a HTML response record.
158 | */
159 | public static boolean isHtml(final WarcRecord record)
160 | throws HttpException, IOException {
161 | if (record == null) { return false; }
162 | final HttpResponse response = Warcs.toResponse(record);
163 | return Warcs.isHtml(response);
164 | }
165 |
166 | /**
167 | * Checks if this is a HTML response.
168 | */
169 | public static boolean isHtml(final HttpResponse response) {
170 | if (response == null) { return false; }
171 |
172 | final String contentType =
173 | response.getLastHeader(HTTP_HEADER_CONTENT_TYPE).getValue();
174 | if (contentType == null) { return false; } // no content type
175 |
176 | if (!HTTP_HEADER_CONTENT_TYPE_HTML.matcher(contentType).matches()) {
177 | return false; // not HTML content type
178 | }
179 |
180 | return true;
181 | }
182 |
183 | /**
184 | * Gets the HTML part of a record or null if there is none or an
185 | * invalid one.
186 | */
187 | public static String getHtml(final WarcRecord record)
188 | throws ParseException, IOException, HttpException {
189 | final HttpResponse response = Warcs.toResponse(record);
190 | if (!Warcs.isHtml(response)) { return null; } // no HTML record
191 |
192 | final HttpEntity entity = response.getEntity();
193 | final String defaultCharset = null;
194 | return EntityUtils.toString(entity, defaultCharset);
195 | }
196 |
197 | /**
198 | * Gets an {@link HttpResponse} object from a WARC record of such a response.
199 | * @return The response or null when the record is not a response
200 | * record
201 | */
202 | public static HttpResponse toResponse(final WarcRecord record)
203 | throws IOException, HttpException {
204 | // based on http://stackoverflow.com/a/26586178
205 | if (!record.getHeaderRecordType().equals("response")) { return null; }
206 |
207 | final SessionInputBufferImpl sessionInputBuffer =
208 | new SessionInputBufferImpl(new HttpTransportMetricsImpl(), 2048);
209 | final InputStream inputStream =
210 | new ByteArrayInputStream(record.getByteContent());
211 | sessionInputBuffer.bind(inputStream);
212 | final MessageConstraints constraints = MessageConstraints.DEFAULT;
213 | final DefaultHttpResponseParser parser =
214 | new DefaultHttpResponseParser(
215 | sessionInputBuffer, null, new DefaultHttpResponseFactory(),
216 | constraints);
217 | final HttpResponse response = parser.parse();
218 | final HttpEntity entity = Warcs.getEntity(response, sessionInputBuffer);
219 | response.setEntity(entity);
220 | Warcs.encodeEntity(response);
221 | return response;
222 | }
223 |
224 |
225 | private static void encodeEntity(final HttpResponse response)
226 | throws HttpException, IOException {
227 | // Adapted from org.apache.http.client.protocol.ResponseContentEncoding
228 | final HttpEntity entity = response.getEntity();
229 |
230 | // entity can be null in case of 304 Not Modified, 204 No Content or similar
231 | // check for zero length entity.
232 | if (entity != null && entity.getContentLength() != 0) {
233 | final Header ceheader = entity.getContentEncoding();
234 | if (ceheader != null) {
235 | final HeaderElement[] codecs = ceheader.getElements();
236 | final Lookup decoderRegistry =
237 | RegistryBuilder.create()
238 | .register("gzip", GZIP)
239 | .register("x-gzip", GZIP)
240 | .register("deflate", DEFLATE)
241 | .build();
242 | for (final HeaderElement codec : codecs) {
243 | final String codecname = codec.getName().toLowerCase(Locale.ROOT);
244 | final InputStreamFactory decoderFactory =
245 | decoderRegistry.lookup(codecname);
246 | if (decoderFactory != null) {
247 | response.setEntity(new DecompressingEntity(
248 | response.getEntity(), decoderFactory));
249 | response.removeHeaders("Content-Length");
250 | response.removeHeaders("Content-Encoding");
251 | response.removeHeaders("Content-MD5");
252 | } else {
253 | if (!"identity".equals(codecname)) {
254 | throw new HttpException(
255 | "Unsupported Content-Encoding: " + codec.getName());
256 | }
257 | }
258 | }
259 | }
260 | }
261 | }
262 |
263 | private static InputStream createInputStream(
264 | final long len, final SessionInputBuffer input) {
265 | // Adapted from the org.apache.http.impl.BHttpConnectionBase
266 | if (len == ContentLengthStrategy.CHUNKED) {
267 | return new ChunkedInputStream(input);
268 | } else if (len == ContentLengthStrategy.IDENTITY) {
269 | return new IdentityInputStream(input);
270 | } else if (len == 0L) {
271 | return EmptyInputStream.INSTANCE;
272 | } else {
273 | return new ContentLengthInputStream(input, len);
274 | }
275 | }
276 |
277 | private static HttpEntity getEntity(
278 | final HttpResponse response, final SessionInputBuffer input)
279 | throws HttpException {
280 | // Adapted from the org.apache.http.impl.BHttpConnectionBase
281 | final BasicHttpEntity entity = new BasicHttpEntity();
282 |
283 | final long len =
284 | new LaxContentLengthStrategy().determineLength(response);
285 | final InputStream instream = Warcs.createInputStream(len, input);
286 | if (len == ContentLengthStrategy.CHUNKED) {
287 | entity.setChunked(true);
288 | entity.setContentLength(-1);
289 | entity.setContent(instream);
290 | } else if (len == ContentLengthStrategy.IDENTITY) {
291 | entity.setChunked(false);
292 | entity.setContentLength(-1);
293 | entity.setContent(instream);
294 | } else {
295 | entity.setChunked(false);
296 | entity.setContentLength(len);
297 | entity.setContent(instream);
298 | }
299 |
300 | final Header contentTypeHeader =
301 | response.getFirstHeader(HTTP.CONTENT_TYPE);
302 | if (contentTypeHeader != null) {
303 | entity.setContentType(contentTypeHeader);
304 | }
305 | final Header contentEncodingHeader =
306 | response.getFirstHeader(HTTP.CONTENT_ENCODING);
307 | if (contentEncodingHeader != null) {
308 | entity.setContentEncoding(contentEncodingHeader);
309 | }
310 | return entity;
311 | }
312 |
313 | }
314 |
--------------------------------------------------------------------------------
/src/edu/cmu/lemurproject/WarcRecord.java:
--------------------------------------------------------------------------------
1 | /*
2 | Lemur License Agreement
3 |
4 | Copyright (c) 2000-2011 The Lemur Project. All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions
8 | are met:
9 |
10 | 1. Redistributions of source code must retain the above copyright
11 | notice, this list of conditions and the following disclaimer.
12 |
13 | 2. Redistributions in binary form must reproduce the above copyright
14 | notice, this list of conditions and the following disclaimer in
15 | the documentation and/or other materials provided with the
16 | distribution.
17 |
18 | 3. The names "Lemur", "Indri", "University of Massachusetts" and
19 | "Carnegie Mellon" must not be used to endorse or promote products
20 | derived from this software without prior written permission. To
21 | obtain permission, contact license@lemurproject.org
22 |
23 | 4. Products derived from this software may not be called "Lemur" or "Indri"
24 | nor may "Lemur" or "Indri" appear in their names without prior written
25 | permission of The Lemur Project. To obtain permission,
26 | contact license@lemurproject.org.
27 |
28 | THIS SOFTWARE IS PROVIDED BY THE LEMUR PROJECT AND OTHER
29 | CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
30 | BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
31 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
32 | COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
33 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
34 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
35 | OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
36 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
37 | TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
38 | USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
39 | DAMAGE.
40 |
41 | */
42 | /*
43 | * To change this template, choose Tools | Templates
44 | * and open the template in the editor.
45 | */
46 | package edu.cmu.lemurproject;
47 |
48 | import java.io.DataInput;
49 | import java.io.DataInputStream;
50 | import java.io.DataOutput;
51 | import java.io.EOFException;
52 | import java.io.IOException;
53 | import java.io.UnsupportedEncodingException;
54 | import java.util.HashMap;
55 | import java.util.Iterator;
56 | import java.util.Map.Entry;
57 | import java.util.Set;
58 | // import org.apache.commons.logging.Log;
59 | // import org.apache.commons.logging.LogFactory;
60 |
/**
 * A single WARC record: a header (record type, date, record ID, content type,
 * content length and further metadata fields) plus the raw content bytes.
 * Records are parsed from a stream with
 * {@link #readNextWarcRecord(DataInputStream)}.
 *
 * <p>Parsing updates the static fields {@link #WARC_VERSION_LINE} and the
 * internally tracked line ending, which are then used when headers are
 * rendered as strings.</p>
 *
 * @author mhoy
 */
public class WarcRecord {

  /** Prefix by which the first line of a record is recognized. */
  public static String WARC_VERSION = "WARC/";

  /**
   * Version line used when rendering headers; replaced by the version line of
   * the record parsed last.
   */
  public static String WARC_VERSION_LINE = "WARC/0.18\n";

  private static final String NEWLINE = "\n";
  private static final String CR_NEWLINE = "\r\n";

  // Separator between header lines inside the internal header buffer. It is
  // fixed so that joining and splitting always agree, independent of the
  // line ending detected while reading.
  private static final String HEADER_SEPARATOR = "\n";

  private static final byte MASK_THREE_BYTE_CHAR = (byte) 0xE0;
  private static final byte MASK_TWO_BYTE_CHAR = (byte) 0xC0;
  private static final byte MASK_TOPMOST_BIT = (byte) 0x80;
  private static final byte MASK_BOTTOM_FIVE_BITS = (byte) 0x1F;
  private static final byte MASK_BOTTOM_SIX_BITS = (byte) 0x3F;
  private static final byte MASK_BOTTOM_FOUR_BITS = (byte) 0x0F;

  // Line ending of the line read last; used when rendering headers.
  private static String LINE_ENDING = "\n";

  // Whether the byte has its topmost bit set (as multi-byte trail bytes do).
  private static boolean hasTopmostBit(byte value) {
    return (value & MASK_TOPMOST_BIT) == MASK_TOPMOST_BIT;
  }

  /**
   * Reads one line, decoding two- and three-byte sequences into single
   * characters. The terminating line feed is not part of the result, a
   * preceding carriage return is.
   *
   * @param in The stream to read from
   * @return The line, or null if the stream ended before a line feed
   * @throws IOException On reading from the stream
   */
  private static String readLineFromInputStream(DataInputStream in) throws IOException {
    StringBuilder line = new StringBuilder();
    boolean foundCr = false;
    try {
      while (true) {
        char thisChar;
        byte firstByte = in.readByte();
        // The trail bytes are read unconditionally: InputStream.available()
        // is only an estimate and must not decide how bytes are decoded. If
        // the stream ends here, the EOFException ends the read like any other
        // end of stream.
        if ((firstByte & MASK_THREE_BYTE_CHAR) == MASK_THREE_BYTE_CHAR) {
          foundCr = false;
          byte secondByte = in.readByte();
          byte thirdByte = in.readByte();
          if (!hasTopmostBit(secondByte) || !hasTopmostBit(thirdByte)) {
            // not a valid sequence: treat these as individual characters
            line.append((char) firstByte);
            line.append((char) secondByte);
            line.append((char) thirdByte);
            continue;
          }
          thisChar = (char) ((thirdByte & MASK_BOTTOM_SIX_BITS)
              + 64 * (secondByte & MASK_BOTTOM_SIX_BITS)
              + 4096 * (firstByte & MASK_BOTTOM_FOUR_BITS));
        } else if ((firstByte & MASK_TWO_BYTE_CHAR) == MASK_TWO_BYTE_CHAR) {
          foundCr = false;
          byte secondByte = in.readByte();
          if (!hasTopmostBit(secondByte)) {
            // not a valid sequence: treat these as individual characters
            line.append((char) firstByte);
            line.append((char) secondByte);
            continue;
          }
          thisChar = (char) ((secondByte & MASK_BOTTOM_SIX_BITS)
              + 64 * (firstByte & MASK_BOTTOM_FIVE_BITS));
        } else {
          // interpret it as a single byte
          thisChar = (char) firstByte;
        }

        if (thisChar == '\n') {
          // CRLF only if the carriage return directly precedes the line feed
          LINE_ENDING = foundCr ? CR_NEWLINE : NEWLINE;
          return line.toString();
        }
        foundCr = (thisChar == '\r');
        line.append(thisChar);
      }
    } catch (EOFException eofEx) {
      return null;
    }
  }

  /**
   * Reads the next record: skips to the next version line, collects the
   * (trimmed, non-empty) header lines into the buffer and reads as many
   * content bytes as the Content-Length header specifies.
   *
   * @param in The stream to read from
   * @param headerBuffer Receives the header lines, each followed by a line
   * feed
   * @return The content bytes, or null if there is no further complete record
   * @throws IOException On reading from the stream
   */
  private static byte[] readNextRecord(DataInputStream in, StringBuilder headerBuffer) throws IOException {
    if (in == null || headerBuffer == null) { return null; }

    // cannot be using a buffered reader here!!!!
    // first - find our WARC header
    boolean foundMark = false;
    String line = null;
    while (!foundMark && (line = readLineFromInputStream(in)) != null) {
      if (line.startsWith(WARC_VERSION)) {
        WARC_VERSION_LINE = line;
        foundMark = true;
      }
    }

    // no WARC mark?
    if (!foundMark) { return null; }

    // read until we see contentLength then an empty line
    // (to handle malformed ClueWeb09 headers that have blank lines)
    int contentLength = -1;
    while (true) {
      String rawLine = readLineFromInputStream(in);
      if (rawLine == null) { return null; } // stream ended within the header
      line = rawLine.trim();
      if (line.isEmpty()) {
        if (contentLength >= 0) { break; }
        continue;
      }

      headerBuffer.append(line);
      headerBuffer.append(HEADER_SEPARATOR);

      // find the content length designated by Content-Length:
      String[] parts = line.split(":", 2);
      if (parts.length == 2 && parts[0].equals("Content-Length")) {
        try {
          contentLength = Integer.parseInt(parts[1].trim());
        } catch (NumberFormatException nfEx) {
          contentLength = -1;
        }
      }
    }

    // now read the bytes of the content
    byte[] content = new byte[contentLength];
    int totalRead = 0;

    // Skip CR LF pairs in front of the content. Only an exact CR LF pair is
    // skipped; any other bytes already belong to the content and are kept.
    while (totalRead == 0 && contentLength > 0) {
      byte first = in.readByte();
      if (first != '\r' || contentLength < 2) {
        content[0] = first;
        totalRead = 1;
      } else {
        byte second = in.readByte();
        if (second != '\n') {
          content[0] = first;
          content[1] = second;
          totalRead = 2;
        }
      }
    }

    while (totalRead < contentLength) {
      try {
        int numRead = in.read(content, totalRead, contentLength - totalRead);
        if (numRead < 0) { return null; }
        totalRead += numRead;
      } catch (EOFException eofEx) {
        // resize to what we have
        if (totalRead > 0) {
          byte[] truncated = new byte[totalRead];
          System.arraycopy(content, 0, truncated, 0, totalRead);
          return truncated;
        }
        return null;
      }
    }

    return content;
  }

  /**
   * Reads the next WARC record from the stream.
   *
   * @param in The stream to read from
   * @return The record, or null if the stream contains no further complete
   * record
   * @throws IOException On reading from the stream
   */
  public static WarcRecord readNextWarcRecord(DataInputStream in) throws IOException {
    StringBuilder recordHeader = new StringBuilder();
    byte[] recordContent = readNextRecord(in, recordHeader);
    if (recordContent == null) { return null; }

    WarcRecord retRecord = new WarcRecord();
    for (String headerLine : recordHeader.toString().split(HEADER_SEPARATOR)) {
      String[] pieces = headerLine.split(":", 2);
      if (pieces.length != 2) {
        retRecord.addHeaderMetadata(pieces[0], "");
        continue;
      }
      String thisKey = pieces[0].trim();
      String thisValue = pieces[1].trim();

      // check for known keys
      switch (thisKey) {
        case "WARC-Type":
          retRecord.setWarcRecordType(thisValue);
          break;
        case "WARC-Date":
          retRecord.setWarcDate(thisValue);
          break;
        case "WARC-Record-ID":
          retRecord.setWarcUUID(thisValue);
          break;
        case "Content-Type":
          retRecord.setWarcContentType(thisValue);
          break;
        default:
          retRecord.addHeaderMetadata(thisKey, thisValue);
          break;
      }
    }

    // set the content (this also sets the content length of the header)
    retRecord.setContent(recordContent);

    return retRecord;
  }

  /**
   * Header of a WARC record: the known fields plus a map of all other fields.
   */
  public class WarcHeader {
    public String contentType = "";
    public String UUID = "";
    public String dateString = "";
    public String recordType = "";
    public HashMap<String, String> metadata = new HashMap<String, String>();
    public int contentLength = 0;

    /** Creates an empty header. */
    public WarcHeader() {
    }

    /**
     * Creates a copy of a header.
     * @param o The header to copy
     */
    public WarcHeader(WarcHeader o) {
      this.contentType = o.contentType;
      this.UUID = o.UUID;
      this.dateString = o.dateString;
      this.recordType = o.recordType;
      this.metadata.putAll(o.metadata);
      this.contentLength = o.contentLength;
    }

    /**
     * Writes the header fields in the order read by
     * {@link #readFields(DataInput)}.
     * @param out The output to write to
     * @throws IOException On writing
     */
    public void write(DataOutput out) throws IOException {
      out.writeUTF(contentType);
      out.writeUTF(UUID);
      out.writeUTF(dateString);
      out.writeUTF(recordType);
      out.writeInt(metadata.size());
      for (Entry<String, String> thisEntry : metadata.entrySet()) {
        out.writeUTF(thisEntry.getKey());
        out.writeUTF(thisEntry.getValue());
      }
      out.writeInt(contentLength);
    }

    /**
     * Replaces the header fields by those read from the input, in the order
     * written by {@link #write(DataOutput)}.
     * @param in The input to read from
     * @throws IOException On reading
     */
    public void readFields(DataInput in) throws IOException {
      contentType = in.readUTF();
      UUID = in.readUTF();
      dateString = in.readUTF();
      recordType = in.readUTF();
      metadata.clear();
      int numMetaItems = in.readInt();
      for (int i = 0; i < numMetaItems; i++) {
        String thisKey = in.readUTF();
        String thisValue = in.readUTF();
        metadata.put(thisKey, thisValue);
      }
      contentLength = in.readInt();
    }

    @Override
    public String toString() {
      StringBuilder retBuffer = new StringBuilder();

      retBuffer.append(WARC_VERSION_LINE);
      retBuffer.append(LINE_ENDING);

      retBuffer.append("WARC-Type: " + recordType + LINE_ENDING);
      retBuffer.append("WARC-Date: " + dateString + LINE_ENDING);

      for (Entry<String, String> thisEntry : metadata.entrySet()) {
        retBuffer.append(thisEntry.getKey());
        retBuffer.append(": ");
        retBuffer.append(thisEntry.getValue());
        retBuffer.append(LINE_ENDING);
      }
      // Keep this as the last WARC-...
      retBuffer.append("WARC-Record-ID: " + UUID + LINE_ENDING);

      retBuffer.append("Content-Type: " + contentType + LINE_ENDING);
      retBuffer.append("Content-Length: " + contentLength + LINE_ENDING);

      return retBuffer.toString();
    }
  }

  private WarcHeader warcHeader = new WarcHeader();
  private byte[] warcContent = null;
  private String warcFilePath = "";

  /** Creates an empty record without content. */
  public WarcRecord() {

  }

  /**
   * Creates a record with a copy of the header of the other record; the
   * content array is shared with the other record.
   * @param o The record to copy from
   */
  public WarcRecord(WarcRecord o) {
    this.warcHeader = new WarcHeader(o.warcHeader);
    this.warcContent = o.warcContent;
  }

  /**
   * Gets the length of the header string plus the number of content bytes.
   * @return The length
   */
  public int getTotalRecordLength() {
    int headerLength = warcHeader.toString().length();
    return (headerLength + warcContent.length);
  }

  /**
   * Replaces header (by a copy) and content (by the same array) of this
   * record with those of the other record.
   * @param o The record to copy from
   */
  public void set(WarcRecord o) {
    this.warcHeader = new WarcHeader(o.warcHeader);
    this.warcContent = o.warcContent;
  }

  /** @return The file path stored for this record */
  public String getWarcFilePath() {
    return warcFilePath;
  }

  /** @param path The file path to store for this record */
  public void setWarcFilePath(String path) {
    warcFilePath = path;
  }

  /** @param recordType The value of the WARC-Type header field */
  public void setWarcRecordType(String recordType) {
    warcHeader.recordType = recordType;
  }

  /** @param contentType The value of the Content-Type header field */
  public void setWarcContentType(String contentType) {
    warcHeader.contentType = contentType;
  }

  /** @param dateString The value of the WARC-Date header field */
  public void setWarcDate(String dateString) {
    warcHeader.dateString = dateString;
  }

  /** @param UUID The value of the WARC-Record-ID header field */
  public void setWarcUUID(String UUID) {
    warcHeader.UUID = UUID;
  }

  /**
   * Adds a header field to the metadata map. The known fields (WARC-Type,
   * WARC-Date, WARC-Record-ID, Content-Type, Content-Length) are ignored here
   * as they have dedicated setters.
   * @param key The field name
   * @param value The field value
   */
  public void addHeaderMetadata(String key, String value) {
    // don't allow addition of known keys
    if (key.equals("WARC-Type")) { return; }
    if (key.equals("WARC-Date")) { return; }
    if (key.equals("WARC-Record-ID")) { return; }
    if (key.equals("Content-Type")) { return; }
    if (key.equals("Content-Length")) { return; }

    warcHeader.metadata.put(key, value);
  }

  /** Removes all fields from the metadata map. */
  public void clearHeaderMetadata() {
    warcHeader.metadata.clear();
  }

  /** @return The entries of the metadata map */
  public Set<Entry<String, String>> getHeaderMetadata() {
    return warcHeader.metadata.entrySet();
  }

  /**
   * Gets the value of a header field: known fields are served from the
   * dedicated header attributes, all others from the metadata map.
   * @param key The field name
   * @return The value, or null for a field that is neither known nor in the
   * metadata map
   */
  public String getHeaderMetadataItem(String key) {
    if (key.equals("WARC-Type")) { return warcHeader.recordType; }
    if (key.equals("WARC-Date")) { return warcHeader.dateString; }
    if (key.equals("WARC-Record-ID")) { return warcHeader.UUID; }
    if (key.equals("Content-Type")) { return warcHeader.contentType; }
    if (key.equals("Content-Length")) { return Integer.toString(warcHeader.contentLength); }

    return warcHeader.metadata.get(key);
  }

  /**
   * Sets the content and updates the content length of the header.
   * @param content The content bytes
   */
  public void setContent(byte[] content) {
    warcContent = content;
    warcHeader.contentLength = content.length;
  }

  /**
   * Sets the content to the bytes of the string in the platform's default
   * charset.
   * @param content The content
   */
  public void setContent(String content) {
    setContent(content.getBytes());
  }

  /** @param len The content length to store in the header */
  public void setContentLength(int len) {
    warcHeader.contentLength = len;
  }

  /** @return The content bytes (not copied) */
  public byte[] getContent() {
    return warcContent;
  }

  /** @return The content bytes (not copied) */
  public byte[] getByteContent() {
    return warcContent;
  }

  /** @return The content decoded as UTF-8 */
  public String getContentUTF8() {
    return new String(warcContent, java.nio.charset.StandardCharsets.UTF_8);
  }

  /** @return The value of the WARC-Type header field */
  public String getHeaderRecordType() {
    return warcHeader.recordType;
  }

  @Override
  public String toString() {
    StringBuilder retBuffer = new StringBuilder();
    retBuffer.append(warcHeader.toString());
    retBuffer.append(LINE_ENDING);
    retBuffer.append(new String(warcContent));
    return retBuffer.toString();
  }

  /** @return The header rendered as string */
  public String getHeaderString() {
    return warcHeader.toString();
  }

  /**
   * Writes the header followed by the content bytes.
   * @param out The output to write to
   * @throws IOException On writing
   */
  public void write(DataOutput out) throws IOException {
    warcHeader.write(out);
    out.write(warcContent);
  }

  /**
   * Reads the header and then as many content bytes as its content length
   * specifies.
   * @param in The input to read from
   * @throws IOException On reading
   */
  public void readFields(DataInput in) throws IOException {
    warcHeader.readFields(in);
    int contentLengthBytes = warcHeader.contentLength;
    warcContent = new byte[contentLengthBytes];
    in.readFully(warcContent);
  }

}
526 |
527 |
--------------------------------------------------------------------------------