├── .dockerignore ├── .github └── workflows │ └── ghcr.yml ├── .gitignore ├── .project ├── .settings └── org.eclipse.m2e.core.prefs ├── Dockerfile ├── LICENSE ├── README.md ├── app ├── control.sh ├── elasticsearch │ └── run.sh ├── pywb │ ├── config.yaml │ ├── run.sh │ └── templates │ │ └── banner.html ├── redis │ └── run.sh ├── search-service │ └── run.sh └── warc-indexer │ └── run.sh ├── doc └── development.md ├── pom.xml ├── resources ├── de │ └── webis │ │ └── wasp │ │ ├── Index.java │ │ └── ui │ │ ├── Index.java │ │ └── search.mustache └── static │ ├── css │ ├── bootstrap-datetimepicker.min.css │ ├── bootstrap.min.css │ └── search.css │ ├── fonts │ ├── glyphicons-halflings-regular.eot │ ├── glyphicons-halflings-regular.svg │ ├── glyphicons-halflings-regular.ttf │ ├── glyphicons-halflings-regular.woff │ └── glyphicons-halflings-regular.woff2 │ ├── index.html │ └── js │ ├── bootstrap-datetimepicker.min.js │ ├── bootstrap.min.js │ ├── jquery.min.js │ ├── moment.min.js │ └── search.js └── src ├── de └── webis │ └── wasp │ ├── SearchService.java │ ├── WarcIndexingService.java │ ├── index │ ├── Index.java │ ├── Query.java │ ├── RequestRecord.java │ ├── ResponseRecord.java │ ├── Result.java │ └── WarcIndexer.java │ ├── ui │ ├── SearchServlet.java │ └── UiPage.java │ └── warcs │ ├── ArchiveWatcher.java │ ├── ContinuousWarcRecordReader.java │ ├── GenericHtmlWarcRecordConsumer.java │ ├── GenericWarcRecordConsumer.java │ ├── JerichoDocumentExtractor.java │ ├── WarcRecordReader.java │ └── Warcs.java └── edu └── cmu └── lemurproject └── WarcRecord.java /.dockerignore: -------------------------------------------------------------------------------- 1 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 2 | hs_err_pid* 3 | /target/ 4 | /.classpath 5 | /.settings 6 | /.project 7 | /doc 8 | -------------------------------------------------------------------------------- /.github/workflows/ghcr.yml: 
-------------------------------------------------------------------------------- 1 | name: Publish to GitHub Packages 2 | 3 | on: 4 | push: 5 | branches: 6 | - 'master' 7 | tags: 8 | - 'v*' 9 | 10 | env: 11 | REGISTRY: ghcr.io 12 | IMAGE_NAME: ${{ github.repository }} 13 | 14 | jobs: 15 | build-and-publish: 16 | runs-on: ubuntu-latest 17 | permissions: 18 | contents: read 19 | packages: write 20 | 21 | steps: 22 | - name: Checkout repository 23 | uses: actions/checkout@v2 24 | 25 | - name: Log in to the Container registry 26 | uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9 27 | with: 28 | registry: ${{ env.REGISTRY }} 29 | username: ${{ github.actor }} 30 | password: ${{ secrets.GITHUB_TOKEN }} 31 | 32 | - name: Extract metadata (tags, labels) for Docker 33 | id: meta 34 | uses: docker/metadata-action@v3 35 | with: 36 | images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} 37 | tags: | 38 | type=semver,pattern={{version}} 39 | type=semver,pattern={{major}}.{{minor}} 40 | type=semver,pattern={{major}} 41 | 42 | - name: Build and push Docker image 43 | uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc 44 | with: 45 | context: . 
46 | push: true 47 | tags: ${{ steps.meta.outputs.tags }} 48 | labels: ${{ steps.meta.outputs.labels }} 49 | build-args: version=${{ steps.meta.outputs.version }} # build args must be KEY=VALUE 50 | 51 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | bin 4 | 5 | # Log file 6 | *.log 7 | 8 | # BlueJ files 9 | *.ctxt 10 | 11 | # Mobile Tools for Java (J2ME) 12 | .mtj.tmp/ 13 | 14 | # Package Files # 15 | *.jar 16 | *.war 17 | *.ear 18 | *.zip 19 | *.tar.gz 20 | *.rar 21 | 22 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 23 | hs_err_pid* 24 | /target/ 25 | /.classpath 26 | /.settings 27 | 28 | *.pem 29 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | personal-web-archive 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.m2e.core.maven2Builder 15 | 16 | 17 | 18 | 19 | 20 | org.eclipse.m2e.core.maven2Nature 21 | org.eclipse.jdt.core.javanature 22 | 23 | 24 | -------------------------------------------------------------------------------- /.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM openjdk:17.0.2-slim AS compiler 2 | RUN mkdir /app 3 | RUN apt update && apt install -y --no-install-recommends \ 4 | maven 5 | WORKDIR /app/ 6 | COPY pom.xml /app/ 7 | RUN mvn clean compile assembly:single 8 | COPY resources /app/resources/ 9 | COPY src /app/src/ 10 | RUN 
mvn clean compile assembly:single 11 | 12 | 13 | FROM openjdk:17.0.2-slim 14 | EXPOSE 8001 15 | EXPOSE 8002 16 | 17 | RUN apt update && apt install -y --no-install-recommends \ 18 | curl \ 19 | python3-pip \ 20 | redis-server 21 | RUN pip3 install virtualenv 22 | 23 | 24 | RUN useradd -ms /bin/bash user 25 | USER user 26 | 27 | 28 | RUN mkdir -p /home/user/app/redis /home/user/app/elasticsearch /home/user/app/search-service /home/user/app/warc-indexer /home/user/app/pywb 29 | WORKDIR /home/user/app/pywb 30 | RUN virtualenv env \ 31 | && /bin/bash -c "source env/bin/activate && pip3 install pywb==2.6.7 && wb-manager init wasp" 32 | WORKDIR /home/user/app/elasticsearch 33 | RUN curl -L -O https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-8.2.3-linux-x86_64.tar.gz \ 34 | && tar xzf elasticsearch-*.tar.gz \ 35 | && rm elasticsearch-*.tar.gz \ 36 | && sed -i 's/^#path\.data.*/path.data: \/home\/user\/app\/elasticsearch\/index/' elasticsearch-*/config/elasticsearch.yml \ 37 | && sed -i 's/^#path\.logs.*/path.logs: \/home\/user\/app\/elasticsearch\/logs/' elasticsearch-*/config/elasticsearch.yml \ 38 | && echo "xpack.security.enabled: false" | tee -a elasticsearch-*/config/elasticsearch.yml 39 | 40 | 41 | WORKDIR /home/user/app 42 | COPY app/control.sh /home/user/app/ 43 | COPY app/redis/ /home/user/app/redis 44 | COPY app/elasticsearch/ /home/user/app/elasticsearch 45 | COPY app/search-service/ /home/user/app/search-service 46 | COPY app/warc-indexer/ /home/user/app/warc-indexer 47 | COPY app/pywb/ /home/user/app/pywb 48 | COPY --from=compiler /app/target/*-jar-with-dependencies.jar /home/user/app/ 49 | CMD ["./control.sh","start"] 50 | 51 | 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Webis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of 
this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # WASP 2 | Personal web archive and search as a container. 3 | 4 | 5 | ## Quickstart 6 | - Install [Docker](https://docker.io) (Windows 10: use Pro or Education) 7 | - `docker run -p 127.0.0.1:8001:8001 -p 127.0.0.1:8002:8002 --name wasp -d ghcr.io/webis-de/wasp:0.4.2` 8 | - Configure your browser to use `localhost:8001` as proxy HTTP+HTTPS (exception: `localhost`) 9 | - Clear browser cache or otherwise ensure your browser actually requests the web pages 10 | - Visit [http://example.org](http://example.org) 11 | - Visit [http://localhost:8002/search](http://localhost:8002/search) and search for "example" 12 | 13 | 14 | ## HTTPS 15 | For HTTPS you have to trust the certificate of your personal WASP instance (generated on the first run). 
16 | - `docker cp wasp:/home/user/app/pywb/proxy-certs/pywb-ca.pem .` 17 | - Configure your browser to trust this certificate **as an authority** to identify web pages 18 | 19 | 20 | ## Other commands 21 | - `docker stop wasp` 22 | - `docker start wasp` 23 | 24 | 25 | ## Troubleshooting 26 | - The search page never retrieves results/does not show new results! 27 | - WASP currently uses the default Elasticsearch settings, which turn the index read-only once 95% of your disk space is used. Since the index is stored on your root partition (unless you reconfigured that), you might need to free up space there. 28 | 29 | -------------------------------------------------------------------------------- /app/control.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | services="redis elasticsearch pywb warc-indexer search-service" 4 | 5 | for service in $services;do 6 | pushd $service 7 | ./run.sh $1 8 | popd 9 | done 10 | 11 | tail -f */*.log 12 | 13 | -------------------------------------------------------------------------------- /app/elasticsearch/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | case $1 in 4 | start) 5 | ./elasticsearch-*/bin/elasticsearch 1> elastic.log 2>&1 & 6 | echo $! 
> pid.txt 7 | ;; 8 | stop) 9 | pid=$(cat pid.txt) 10 | kill $pid 11 | ;; 12 | *) 13 | echo Unknown command: $1 14 | esac 15 | 16 | -------------------------------------------------------------------------------- /app/pywb/config.yaml: -------------------------------------------------------------------------------- 1 | framed_replay: false 2 | collections: 3 | live: $live 4 | proxy: 5 | coll: wasp 6 | ca_name: WASP CA 7 | recording: true 8 | recorder: 9 | source_coll: live 10 | dedup_policy: revisit 11 | -------------------------------------------------------------------------------- /app/pywb/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pywb_port=${PYWB_PORT:=8001} 4 | 5 | case $1 in 6 | start) 7 | source env/bin/activate 8 | wayback --port $pywb_port --record 1> pywb.log 2>&1 & 9 | echo $! > pid.txt 10 | ;; 11 | stop) 12 | pid=$(cat pid.txt) 13 | kill $pid 14 | ;; 15 | *) 16 | echo Unknown command: $1 17 | esac 18 | 19 | 20 | -------------------------------------------------------------------------------- /app/pywb/templates/banner.html: -------------------------------------------------------------------------------- 1 | {% if env.pywb_proxy_magic %} 2 | {% else %} 3 |
Archived Version
4 | {% endif %} 5 | -------------------------------------------------------------------------------- /app/redis/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | case $1 in 4 | start) 5 | redis-server 1> redis.log 2>&1 & 6 | echo $! > pid.txt 7 | ;; 8 | stop) 9 | pid=$(cat pid.txt) 10 | kill $pid 11 | ;; 12 | *) 13 | echo Unknown command: $1 14 | esac 15 | 16 | -------------------------------------------------------------------------------- /app/search-service/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | elasticsearch_port=${ELASTICSEARCH_PORT:=9200} # TODO: not used yet 4 | search_port=${SEARCH_PORT:=8002} 5 | 6 | case $1 in 7 | start) 8 | java -cp ../*.jar de.webis.wasp.SearchService $search_port 1> wasp-search.log 2>&1 & 9 | echo $! > pid.txt 10 | ;; 11 | stop) 12 | pid=$(cat pid.txt) # read the PID saved by "start"; $(pid.txt) would try to execute the file 13 | kill $pid 14 | ;; 15 | *) 16 | echo Unknown command: $1 17 | esac 18 | 19 | 20 | -------------------------------------------------------------------------------- /app/warc-indexer/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | elasticsearch_port=${ELASTICSEARCH_PORT:=9200} 4 | 5 | case $1 in 6 | start) 7 | if [ -e init.log ];then 8 | echo "Restarting" 9 | else 10 | echo "Waiting for elasticsearch to start" 11 | until grep -q "AllocationService.*current.health=.GREEN" /home/user/app/elasticsearch/elastic.log;do 12 | sleep 1 13 | done 14 | echo "Should work now!" 15 | java -cp ../*.jar de.webis.wasp.index.Index $elasticsearch_port 1> init.log 2>&1 16 | fi 17 | 18 | java -cp ../*.jar de.webis.wasp.WarcIndexingService /home/user/app/pywb/collections/wasp/archive $elasticsearch_port 1> indexer.log 2>&1 & 19 | echo $! 
> pid.txt 20 | ;; 21 | stop) 22 | pid=$(cat pid.txt) 23 | kill $pid 24 | ;; 25 | *) 26 | echo Unknown command: $1 27 | esac 28 | 29 | 30 | -------------------------------------------------------------------------------- /doc/development.md: -------------------------------------------------------------------------------- 1 | WASP Development 2 | ================ 3 | 4 | ``` 5 | sudo docker build -t ghcr.io/webis-de/wasp:dev . 6 | sudo docker run -p 127.0.0.1:8001:8001 -p 127.0.0.1:8002:8002 --name wasp ghcr.io/webis-de/wasp:dev 7 | sudo docker cp wasp:/home/user/app/pywb/proxy-certs/pywb-ca.pem . 8 | ``` 9 | 10 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | webis-de 4 | wasp 5 | 0.3.0 6 | 7 | src 8 | 9 | 10 | resources 11 | 12 | 13 | 14 | 15 | maven-compiler-plugin 16 | 3.8.1 17 | 18 | 17 19 | 20 | 21 | 22 | maven-assembly-plugin 23 | 3.3.0 24 | 25 | 26 | 27 | de.webis.wasp.SearchService 28 | 29 | 30 | 31 | jar-with-dependencies 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | org.apache.httpcomponents 40 | httpclient 41 | 4.5.13 42 | 43 | 44 | co.elastic.clients 45 | elasticsearch-java 46 | 8.2.3 47 | 48 | 49 | com.fasterxml.jackson.core 50 | jackson-databind 51 | 2.14.0-rc1 52 | 53 | 54 | net.htmlparser.jericho 55 | jericho-html 56 | 3.4 57 | 58 | 59 | org.eclipse.jetty 60 | jetty-servlet 61 | 11.0.9 62 | 63 | 64 | org.apache.commons 65 | commons-text 66 | 1.10.0 67 | 68 | 69 | com.github.spullara.mustache.java 70 | compiler 71 | 0.9.10 72 | 73 | 74 | org.slf4j 75 | slf4j-simple 76 | 2.0.0-alpha6 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /resources/de/webis/wasp/Index.java: -------------------------------------------------------------------------------- 1 | package de.webis.wasp.index; 2 | 3 | import java.io.IOException; 4 | import java.time.Instant; 5 | import 
java.time.format.DateTimeFormatter; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | import java.util.Map; 9 | import java.util.Objects; 10 | import java.util.logging.Logger; 11 | 12 | import org.apache.http.HttpHost; 13 | import org.elasticsearch.client.RestClient; 14 | 15 | import com.fasterxml.jackson.core.JsonGenerator; 16 | import com.fasterxml.jackson.core.JsonParser; 17 | import com.fasterxml.jackson.core.JsonProcessingException; 18 | import com.fasterxml.jackson.databind.DeserializationContext; 19 | import com.fasterxml.jackson.databind.ObjectMapper; 20 | import com.fasterxml.jackson.databind.SerializerProvider; 21 | import com.fasterxml.jackson.databind.deser.std.StdDeserializer; 22 | import com.fasterxml.jackson.databind.module.SimpleModule; 23 | import com.fasterxml.jackson.databind.node.ObjectNode; 24 | import com.fasterxml.jackson.databind.ser.std.StdSerializer; 25 | 26 | import co.elastic.clients.elasticsearch.ElasticsearchClient; 27 | import co.elastic.clients.elasticsearch.core.GetResponse; 28 | import co.elastic.clients.elasticsearch.core.IndexRequest; 29 | import co.elastic.clients.elasticsearch.core.SearchResponse; 30 | import co.elastic.clients.elasticsearch.core.UpdateRequest; 31 | import co.elastic.clients.elasticsearch.core.search.Hit; 32 | import co.elastic.clients.elasticsearch.core.search.HitsMetadata; 33 | import co.elastic.clients.elasticsearch.indices.CreateIndexRequest; 34 | import co.elastic.clients.json.JsonData; 35 | import co.elastic.clients.json.jackson.JacksonJsonpMapper; 36 | import co.elastic.clients.transport.ElasticsearchTransport; 37 | import co.elastic.clients.transport.rest_client.RestClientTransport; 38 | 39 | /** 40 | * The WASP index client. 
41 | * 42 | * @author johannes.kiesel@uni-weimar.de 43 | * 44 | */ 45 | public class Index 46 | implements AutoCloseable { 47 | 48 | ///////////////////////////////////////////////////////////////////////////// 49 | // LOGGING 50 | ///////////////////////////////////////////////////////////////////////////// 51 | 52 | private static final Logger LOG = 53 | Logger.getLogger(Index.class.getName()); 54 | 55 | ///////////////////////////////////////////////////////////////////////////// 56 | // CONSTANTS 57 | ///////////////////////////////////////////////////////////////////////////// 58 | 59 | /** 60 | * Default port of the index. 61 | */ 62 | public static final int DEFAULT_PORT = 9200; 63 | 64 | /** 65 | * Name of the index to use. 66 | */ 67 | public static final String INDEX_NAME = "archive"; 68 | 69 | /** 70 | * Default number of results to retrieve at most from the index at once. 71 | */ 72 | public static final int DEFAULT_MAX_RESULTS = 100; 73 | 74 | /** 75 | * Object mapper for JSON (de-)serialization. 76 | */ 77 | protected static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() 78 | .registerModule(new SimpleModule() 79 | .addSerializer(new InstantSerializer()) 80 | .addDeserializer(Instant.class, new InstantDeserializer())); 81 | 82 | /** 83 | * Elasticsearch object mapper for JSON (de-)serialization. 
84 | */ 85 | protected static final JacksonJsonpMapper MAPPER = 86 | new JacksonJsonpMapper(OBJECT_MAPPER); 87 | 88 | ///////////////////////////////////////////////////////////////////////////// 89 | // MEMBERS 90 | ///////////////////////////////////////////////////////////////////////////// 91 | 92 | private final ElasticsearchClient client; 93 | 94 | private final RestClient lowLevelClient; 95 | 96 | ///////////////////////////////////////////////////////////////////////////// 97 | // CONSTRUCTION 98 | ///////////////////////////////////////////////////////////////////////////// 99 | 100 | /** 101 | * Creates a new index client talking to the index at the default port. 102 | * @see #DEFAULT_PORT 103 | */ 104 | public Index() { 105 | this(DEFAULT_PORT); 106 | } 107 | 108 | /** 109 | * Creates a new index client talking to the index at the specified port. 110 | * @param port The port 111 | */ 112 | public Index(final int port) { 113 | this.lowLevelClient = 114 | RestClient.builder(new HttpHost("localhost", port)).build(); 115 | final ElasticsearchTransport transport = 116 | new RestClientTransport(this.lowLevelClient, MAPPER); 117 | this.client = new ElasticsearchClient(transport); 118 | } 119 | 120 | /** 121 | * Initializes the index. 122 | *

123 | * This method must be called one time, but not again even after a restart of 124 | * WASP. 125 | *

126 | * @throws IOException On initializing the index 127 | */ 128 | public void initialize() 129 | throws IOException { 130 | final CreateIndexRequest createIndexRequest = CreateIndexRequest.of( 131 | indexBuilder -> indexBuilder 132 | .index(INDEX_NAME) 133 | .mappings(mappings -> mappings 134 | .properties(ResponseRecord.TYPE_PROPERTIES))); 135 | this.getClient().indices().create(createIndexRequest); 136 | LOG.info("Created index: " + createIndexRequest); // logged only after the create call returned without throwing 137 | } 138 | 139 | ///////////////////////////////////////////////////////////////////////////// 140 | // GETTERS 141 | ///////////////////////////////////////////////////////////////////////////// 142 | 143 | /** 144 | * Gets the low level REST client used to communicate with the index. 145 | * @return The client 146 | */ 147 | protected RestClient getLowLevelClient() { 148 | return this.lowLevelClient; 149 | } 150 | 151 | 152 | /** 153 | * Gets the high level client used to communicate with the index. 154 | * @return The client 155 | */ 156 | protected ElasticsearchClient getClient() { 157 | return this.client; 158 | } 159 | 160 | ///////////////////////////////////////////////////////////////////////////// 161 | // FUNCTIONALITY 162 | ///////////////////////////////////////////////////////////////////////////// 163 | 164 | @Override 165 | public void close() throws IOException { 166 | this.getLowLevelClient().close(); 167 | } 168 | 169 | ///////////////////////////////////////////////////////////////////////////// 170 | // INDEXING 171 | 172 | /** 173 | * Indexes a response record. 
174 | * @param id The ID of the response 175 | * @param content The extracted content from the response 176 | * @param title The title of the response 177 | * @return Whether the response has been indexed (always) 178 | * @throws IOException On writing to the index 179 | */ 180 | public boolean indexResponse( 181 | final String id, final String content, final String title) 182 | throws IOException { 183 | final IndexRequest indexRequest = IndexRequest.of( 184 | builder -> builder 185 | .index(INDEX_NAME) 186 | .id(Objects.requireNonNull(id)) 187 | .document(ResponseRecord.forPage(title, content))); 188 | this.getClient().index(indexRequest); 189 | LOG.fine("Index response " + id); 190 | return true; 191 | } 192 | 193 | /** 194 | * Indexes a revisit record. 195 | * @param id The ID of the revisit 196 | * @param responseId The ID of the revisited response 197 | * @return Whether the revisit has been indexed (not if no such response 198 | * exists) 199 | * @throws IOException On reading or writing to the index 200 | */ 201 | public boolean indexRevisit(final String id, final String responseId) 202 | throws IOException { 203 | if (this.resolveResponse(responseId) == null) { 204 | LOG.fine("No response found for ID = " + responseId + " for revisit"); 205 | return false; 206 | } 207 | 208 | final IndexRequest indexRequest = IndexRequest.of( 209 | builder -> builder 210 | .index(INDEX_NAME) 211 | .id(Objects.requireNonNull(id)) 212 | .document(ResponseRecord.forRevisit(responseId))); 213 | this.getClient().index(indexRequest); 214 | LOG.fine("Index revisit " + id + " -> " + responseId); 215 | return true; 216 | } 217 | 218 | /** 219 | * Indexes a request record. 
220 | * @param concurrentId The ID of the concurrent response 221 | * @param uri The URI of the request 222 | * @param instant The time of the request 223 | * @return Whether the request has been indexed (not if no such response 224 | * exists) 225 | * @throws IOException On reading or writing to the index 226 | */ 227 | public boolean indexRequest( 228 | final String concurrentId, final String uri, final Instant instant) 229 | throws IOException { 230 | final GetResponse response = 231 | this.resolveResponse(concurrentId); 232 | if (response == null) { 233 | LOG.fine("No response found for ID = " + concurrentId + " for request"); 234 | return false; 235 | } 236 | 237 | final String field = ResponseRecord.FIELD_REQUESTS; 238 | final Map params = Map.of(field, JsonData.of( 239 | new RequestRecord(uri, instant), MAPPER)); 240 | final String scriptSource = 241 | "ctx._source." + field + ".add(params." + field + ");"; 242 | 243 | final UpdateRequest updateRequest = 244 | UpdateRequest.of(builder -> builder 245 | .index(INDEX_NAME) 246 | .id(response.id()) 247 | .script(script -> script.inline(inline -> inline 248 | .lang("painless") 249 | .source(scriptSource) 250 | .params(params)))); 251 | this.getClient().update(updateRequest, ResponseRecord.class); 252 | LOG.fine("Index request -> " + concurrentId); 253 | return true; 254 | } 255 | 256 | ///////////////////////////////////////////////////////////////////////////// 257 | // SEARCH 258 | 259 | /** 260 | * Searches the index. 261 | * @param query The query to match responses and requests by 262 | * @return The results 263 | * @throws IOException On searching the index 264 | * @see #DEFAULT_MAX_RESULTS 265 | */ 266 | public List search(final Query query) 267 | throws IOException { 268 | return this.search(query, DEFAULT_MAX_RESULTS); 269 | } 270 | 271 | /** 272 | * Searches the index. 
273 | * @param query The query to match responses and requests by 274 | * @param maxResults The maximum number of results to get 275 | * @return The results 276 | * @throws IOException On searching the index 277 | */ 278 | public List search(final Query query, final int maxResults) 279 | throws IOException { 280 | return this.search(query, maxResults, 0); 281 | } 282 | 283 | /** 284 | * Searches the index. 285 | * @param query The query to match responses and requests by 286 | * @param maxResults The maximum number of results to get 287 | * @param offset The offset of the first result to get 288 | * @return The results 289 | * @throws IOException On searching the index 290 | */ 291 | public List search( 292 | final Query query, final int maxResults, final int offset) 293 | throws IOException { 294 | final SearchResponse search = this.getClient().search( 295 | query.build(maxResults).from(offset).build(), ResponseRecord.class); 296 | final HitsMetadata hits = search.hits(); 297 | 298 | final List results = new ArrayList<>(); 299 | for (final Hit hit : hits.hits()) { 300 | final Result result = Result.fromHit(hit, query.getFrom(), query.getTo()); 301 | if (!result.hasEmptySnippet()) { results.add(result); } 302 | } 303 | return results; 304 | } 305 | 306 | ///////////////////////////////////////////////////////////////////////////// 307 | // HELPERS 308 | ///////////////////////////////////////////////////////////////////////////// 309 | 310 | /** 311 | * Gets the response with the specified ID while resolving revisits. 
312 | * @param id The response or revisit ID 313 | * @return The response 314 | * @throws IOException On searching the index 315 | */ 316 | protected GetResponse resolveResponse(final String id) 317 | throws IOException { 318 | final GetResponse getResponse = this.getClient().get( 319 | get -> get.index(INDEX_NAME).id(id), 320 | ResponseRecord.class); 321 | if (getResponse.found()) { 322 | final ResponseRecord response = getResponse.source(); 323 | final String revisited = response.getRevisited(); 324 | if (revisited != null) { 325 | return this.resolveResponse(revisited); 326 | } else { 327 | return getResponse; 328 | } 329 | } else { 330 | return null; 331 | } 332 | } 333 | 334 | 335 | ///////////////////////////////////////////////////////////////////////////// 336 | // JSON BINDINGS 337 | ///////////////////////////////////////////////////////////////////////////// 338 | 339 | /** 340 | * Serializer for {@link Instant} using ISO-8601. 341 | * 342 | * @author johannes.kiesel@uni-weimar.de 343 | * @see InstantDeserializer 344 | * @see DateTimeFormatter#ISO_INSTANT 345 | * 346 | */ 347 | public static class InstantSerializer extends StdSerializer { 348 | 349 | private static final long serialVersionUID = 2795427768750728869L; 350 | 351 | /** 352 | * Creates a new serializer. 353 | */ 354 | public InstantSerializer() { 355 | super(Instant.class); 356 | } 357 | 358 | @Override 359 | public void serialize( 360 | final Instant value, 361 | final JsonGenerator generator, 362 | final SerializerProvider provider) 363 | throws IOException { 364 | generator.writeString(value.toString()); 365 | } 366 | 367 | } 368 | 369 | /** 370 | * Deserializer for {@link Instant} using ISO-8601. 
371 | * 372 | * @author johannes.kiesel@uni-weimar.de 373 | * @see InstantSerializer 374 | * @see DateTimeFormatter#ISO_INSTANT 375 | * 376 | */ 377 | public static class InstantDeserializer extends StdDeserializer { 378 | 379 | private static final long serialVersionUID = -3591379516415686398L; 380 | 381 | /** 382 | * Creates a new deserializer. 383 | */ 384 | public InstantDeserializer() { 385 | super(Instant.class); 386 | } 387 | 388 | @Override 389 | public Instant deserialize( 390 | final JsonParser parser, 391 | final DeserializationContext context) 392 | throws IOException, JsonProcessingException { 393 | final String text = parser.getValueAsString(); 394 | return Instant.parse(text); 395 | } 396 | 397 | } 398 | 399 | ///////////////////////////////////////////////////////////////////////////// 400 | // MAIN 401 | ///////////////////////////////////////////////////////////////////////////// 402 | 403 | public static void main(final String[] args) throws IOException { 404 | final int port = 405 | args.length == 0 ? 
DEFAULT_PORT : Integer.parseInt(args[0]); 406 | try (final Index index = new Index(port)) { 407 | index.initialize(); 408 | } 409 | } 410 | 411 | } 412 | -------------------------------------------------------------------------------- /resources/de/webis/wasp/ui/Index.java: -------------------------------------------------------------------------------- 1 | package de.webis.wasp.index; 2 | 3 | import java.io.IOException; 4 | import java.time.Instant; 5 | import java.time.format.DateTimeFormatter; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | import java.util.Map; 9 | import java.util.Objects; 10 | import java.util.logging.Logger; 11 | 12 | import org.apache.http.HttpHost; 13 | import org.elasticsearch.client.RestClient; 14 | 15 | import com.fasterxml.jackson.core.JsonGenerator; 16 | import com.fasterxml.jackson.core.JsonParser; 17 | import com.fasterxml.jackson.core.JsonProcessingException; 18 | import com.fasterxml.jackson.databind.DeserializationContext; 19 | import com.fasterxml.jackson.databind.ObjectMapper; 20 | import com.fasterxml.jackson.databind.SerializerProvider; 21 | import com.fasterxml.jackson.databind.deser.std.StdDeserializer; 22 | import com.fasterxml.jackson.databind.module.SimpleModule; 23 | import com.fasterxml.jackson.databind.node.ObjectNode; 24 | import com.fasterxml.jackson.databind.ser.std.StdSerializer; 25 | 26 | import co.elastic.clients.elasticsearch.ElasticsearchClient; 27 | import co.elastic.clients.elasticsearch.core.GetResponse; 28 | import co.elastic.clients.elasticsearch.core.IndexRequest; 29 | import co.elastic.clients.elasticsearch.core.SearchResponse; 30 | import co.elastic.clients.elasticsearch.core.UpdateRequest; 31 | import co.elastic.clients.elasticsearch.core.search.Hit; 32 | import co.elastic.clients.elasticsearch.core.search.HitsMetadata; 33 | import co.elastic.clients.elasticsearch.indices.CreateIndexRequest; 34 | import co.elastic.clients.json.JsonData; 35 | import 
co.elastic.clients.json.jackson.JacksonJsonpMapper; 36 | import co.elastic.clients.transport.ElasticsearchTransport; 37 | import co.elastic.clients.transport.rest_client.RestClientTransport; 38 | 39 | /** 40 | * The WASP index client. 41 | * 42 | * @author johannes.kiesel@uni-weimar.de 43 | * 44 | */ 45 | public class Index 46 | implements AutoCloseable { 47 | 48 | ///////////////////////////////////////////////////////////////////////////// 49 | // LOGGING 50 | ///////////////////////////////////////////////////////////////////////////// 51 | 52 | private static final Logger LOG = 53 | Logger.getLogger(Index.class.getName()); 54 | 55 | ///////////////////////////////////////////////////////////////////////////// 56 | // CONSTANTS 57 | ///////////////////////////////////////////////////////////////////////////// 58 | 59 | /** 60 | * Default port of the index. 61 | */ 62 | public static final int DEFAULT_PORT = 9200; 63 | 64 | /** 65 | * Name of the index to use. 66 | */ 67 | public static final String INDEX_NAME = "archive"; 68 | 69 | /** 70 | * Default number of results to retrieve at most from the index at once. 71 | */ 72 | public static final int DEFAULT_MAX_RESULTS = 100; 73 | 74 | /** 75 | * Object mapper for JSON (de-)serialization. 76 | */ 77 | protected static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() 78 | .registerModule(new SimpleModule() 79 | .addSerializer(new InstantSerializer()) 80 | .addDeserializer(Instant.class, new InstantDeserializer())); 81 | 82 | /** 83 | * Elasticsearch object mapper for JSON (de-)serialization. 
84 | */ 85 | protected static final JacksonJsonpMapper MAPPER = 86 | new JacksonJsonpMapper(OBJECT_MAPPER); 87 | 88 | ///////////////////////////////////////////////////////////////////////////// 89 | // MEMBERS 90 | ///////////////////////////////////////////////////////////////////////////// 91 | 92 | private final ElasticsearchClient client; 93 | 94 | private final RestClient lowLevelClient; 95 | 96 | ///////////////////////////////////////////////////////////////////////////// 97 | // CONSTRUCTION 98 | ///////////////////////////////////////////////////////////////////////////// 99 | 100 | /** 101 | * Creates a new index client talking to the index at the default port. 102 | * @see #DEFAULT_PORT 103 | */ 104 | public Index() { 105 | this(DEFAULT_PORT); 106 | } 107 | 108 | /** 109 | * Creates a new index client talking to the index at the specified port. 110 | * @param port The port 111 | */ 112 | public Index(final int port) { 113 | this.lowLevelClient = 114 | RestClient.builder(new HttpHost("localhost", port)).build(); 115 | final ElasticsearchTransport transport = 116 | new RestClientTransport(this.lowLevelClient, MAPPER); 117 | this.client = new ElasticsearchClient(transport); 118 | } 119 | 120 | /** 121 | * Initializes the index. 122 | *

123 | * This method must be called exactly once; it must not be called again, even 124 | * after a restart of WASP. 125 | *

126 | * @throws IOException On initializing the index 127 | */ 128 | public void initialize() 129 | throws IOException { 130 | final CreateIndexRequest createIndexRequest = CreateIndexRequest.of( 131 | indexBuilder -> indexBuilder 132 | .index(INDEX_NAME) 133 | .mappings(mappings -> mappings 134 | .properties(ResponseRecord.TYPE_PROPERTIES))); 135 | LOG.info("Created index: " + createIndexRequest); 136 | this.getClient().indices().create(createIndexRequest); 137 | } 138 | 139 | ///////////////////////////////////////////////////////////////////////////// 140 | // GETTERS 141 | ///////////////////////////////////////////////////////////////////////////// 142 | 143 | /** 144 | * Gets the low level REST client used to communicate with the index. 145 | * @return The client 146 | */ 147 | protected RestClient getLowLevelClient() { 148 | return this.lowLevelClient; 149 | } 150 | 151 | 152 | /** 153 | * Gets the high level client used to communicate with the index. 154 | * @return The client 155 | */ 156 | protected ElasticsearchClient getClient() { 157 | return this.client; 158 | } 159 | 160 | ///////////////////////////////////////////////////////////////////////////// 161 | // FUNCTIONALITY 162 | ///////////////////////////////////////////////////////////////////////////// 163 | 164 | @Override 165 | public void close() throws IOException { 166 | this.getLowLevelClient().close(); 167 | } 168 | 169 | ///////////////////////////////////////////////////////////////////////////// 170 | // INDEXING 171 | 172 | /** 173 | * Indexes a response record. 
174 | * @param id The ID of the response 175 | * @param content The extracted content from the response 176 | * @param title The title of the response 177 | * @return Whether the response has been indexed (always) 178 | * @throws IOException On writing to the index 179 | */ 180 | public boolean indexResponse( 181 | final String id, final String content, final String title) 182 | throws IOException { 183 | final IndexRequest indexRequest = IndexRequest.of( 184 | builder -> builder 185 | .index(INDEX_NAME) 186 | .id(Objects.requireNonNull(id)) 187 | .document(ResponseRecord.forPage(title, content))); 188 | this.getClient().index(indexRequest); 189 | LOG.fine("Index response " + id); 190 | return true; 191 | } 192 | 193 | /** 194 | * Indexes a revisit record. 195 | * @param id The ID of the revisit 196 | * @param responseId The ID of the revisited response 197 | * @return Whether the revisit has been indexed (not if no such response 198 | * exists) 199 | * @throws IOException On reading or writing to the index 200 | */ 201 | public boolean indexRevisit(final String id, final String responseId) 202 | throws IOException { 203 | if (this.resolveResponse(responseId) == null) { 204 | LOG.fine("No response found for ID = " + responseId + " for revisit"); 205 | return false; 206 | } 207 | 208 | final IndexRequest indexRequest = IndexRequest.of( 209 | builder -> builder 210 | .index(INDEX_NAME) 211 | .id(Objects.requireNonNull(id)) 212 | .document(ResponseRecord.forRevisit(responseId))); 213 | this.getClient().index(indexRequest); 214 | LOG.fine("Index revisit " + id + " -> " + responseId); 215 | return true; 216 | } 217 | 218 | /** 219 | * Indexes a request record. 
220 | * @param concurrentId The ID of the concurrent response 221 | * @param uri The URI of the request 222 | * @param instant The time of the request 223 | * @return Whether the request has been indexed (not if no such response 224 | * exists) 225 | * @throws IOException On reading or writing to the index 226 | */ 227 | public boolean indexRequest( 228 | final String concurrentId, final String uri, final Instant instant) 229 | throws IOException { 230 | final GetResponse response = 231 | this.resolveResponse(concurrentId); 232 | if (response == null) { 233 | LOG.fine("No response found for ID = " + concurrentId + " for request"); 234 | return false; 235 | } 236 | 237 | final String field = ResponseRecord.FIELD_REQUESTS; 238 | final Map params = Map.of(field, JsonData.of( 239 | new RequestRecord(uri, instant), MAPPER)); 240 | final String scriptSource = 241 | "ctx._source." + field + ".add(params." + field + ");"; 242 | 243 | final UpdateRequest updateRequest = 244 | UpdateRequest.of(builder -> builder 245 | .index(INDEX_NAME) 246 | .id(response.id()) 247 | .script(script -> script.inline(inline -> inline 248 | .lang("painless") 249 | .source(scriptSource) 250 | .params(params)))); 251 | this.getClient().update(updateRequest, ResponseRecord.class); 252 | LOG.fine("Index request -> " + concurrentId); 253 | return true; 254 | } 255 | 256 | ///////////////////////////////////////////////////////////////////////////// 257 | // SEARCH 258 | 259 | /** 260 | * Searches the index. 261 | * @param query The query to match responses and requests by 262 | * @return The results 263 | * @throws IOException On searching the index 264 | * @see #DEFAULT_MAX_RESULTS 265 | */ 266 | public List search(final Query query) 267 | throws IOException { 268 | return this.search(query, DEFAULT_MAX_RESULTS); 269 | } 270 | 271 | /** 272 | * Searches the index. 
273 | * @param query The query to match responses and requests by 274 | * @param maxResults The maximum number of results to get 275 | * @return The results 276 | * @throws IOException On searching the index 277 | */ 278 | public List search(final Query query, final int maxResults) 279 | throws IOException { 280 | return this.search(query, maxResults, 0); 281 | } 282 | 283 | /** 284 | * Searches the index. 285 | * @param query The query to match responses and requests by 286 | * @param maxResults The maximum number of results to get 287 | * @param offset The offset of the first result to get 288 | * @return The results 289 | * @throws IOException On searching the index 290 | */ 291 | public List search( 292 | final Query query, final int maxResults, final int offset) 293 | throws IOException { 294 | final SearchResponse search = this.getClient().search( 295 | query.build(maxResults).from(offset).build(), ResponseRecord.class); 296 | final HitsMetadata hits = search.hits(); 297 | 298 | final List results = new ArrayList<>(); 299 | for (final Hit hit : hits.hits()) { 300 | final Result result = Result.fromHit(hit, query.getFrom(), query.getTo()); 301 | if (!result.hasEmptySnippet()) { results.add(result); } 302 | } 303 | return results; 304 | } 305 | 306 | ///////////////////////////////////////////////////////////////////////////// 307 | // HELPERS 308 | ///////////////////////////////////////////////////////////////////////////// 309 | 310 | /** 311 | * Gets the response with the specified ID while resolving revisits. 
312 | * @param id The response or revisit ID 313 | * @return The response 314 | * @throws IOException On searching the index 315 | */ 316 | protected GetResponse resolveResponse(final String id) 317 | throws IOException { 318 | final GetResponse getResponse = this.getClient().get( 319 | get -> get.index(INDEX_NAME).id(id), 320 | ResponseRecord.class); 321 | if (getResponse.found()) { 322 | final ResponseRecord response = getResponse.source(); 323 | final String revisited = response.getRevisited(); 324 | if (revisited != null) { 325 | return this.resolveResponse(revisited); 326 | } else { 327 | return getResponse; 328 | } 329 | } else { 330 | return null; 331 | } 332 | } 333 | 334 | 335 | ///////////////////////////////////////////////////////////////////////////// 336 | // JSON BINDINGS 337 | ///////////////////////////////////////////////////////////////////////////// 338 | 339 | /** 340 | * Serializer for {@link Instant} using ISO-8601. 341 | * 342 | * @author johannes.kiesel@uni-weimar.de 343 | * @see InstantDeserializer 344 | * @see DateTimeFormatter#ISO_INSTANT 345 | * 346 | */ 347 | public static class InstantSerializer extends StdSerializer { 348 | 349 | private static final long serialVersionUID = 2795427768750728869L; 350 | 351 | /** 352 | * Creates a new serializer. 353 | */ 354 | public InstantSerializer() { 355 | super(Instant.class); 356 | } 357 | 358 | @Override 359 | public void serialize( 360 | final Instant value, 361 | final JsonGenerator generator, 362 | final SerializerProvider provider) 363 | throws IOException { 364 | generator.writeString(value.toString()); 365 | } 366 | 367 | } 368 | 369 | /** 370 | * Deserializer for {@link Instant} using ISO-8601. 
371 | * 372 | * @author johannes.kiesel@uni-weimar.de 373 | * @see InstantSerializer 374 | * @see DateTimeFormatter#ISO_INSTANT 375 | * 376 | */ 377 | public static class InstantDeserializer extends StdDeserializer { 378 | 379 | private static final long serialVersionUID = -3591379516415686398L; 380 | 381 | /** 382 | * Creates a new deserializer. 383 | */ 384 | public InstantDeserializer() { 385 | super(Instant.class); 386 | } 387 | 388 | @Override 389 | public Instant deserialize( 390 | final JsonParser parser, 391 | final DeserializationContext context) 392 | throws IOException, JsonProcessingException { 393 | final String text = parser.getValueAsString(); 394 | return Instant.parse(text); 395 | } 396 | 397 | } 398 | 399 | ///////////////////////////////////////////////////////////////////////////// 400 | // MAIN 401 | ///////////////////////////////////////////////////////////////////////////// 402 | 403 | public static void main(final String[] args) throws IOException { 404 | final int port = 405 | args.length == 0 ? DEFAULT_PORT : Integer.parseInt(args[0]); 406 | try (final Index index = new Index(port)) { 407 | index.initialize(); 408 | } 409 | } 410 | 411 | } 412 | -------------------------------------------------------------------------------- /resources/de/webis/wasp/ui/search.mustache: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | WASP{{#query}}: {{terms}}{{/query}} 6 | 7 | 8 | 9 | 10 | 11 | 18 |
19 |
20 |
21 |
22 | 23 | From: 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 |
34 |
35 | 36 | Until: 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 |
47 |
48 |
49 | 50 | 51 | 52 | 53 |
54 | 55 |
56 | {{#query}} 57 |
58 | Page {{pageNumber}} 59 | for "{{terms}}" 60 | from {{#from.iso}}{{/from.iso}} 61 | until {{#to.iso}}{{/to.iso}} 62 |
63 | {{/query}} 64 |
    65 | {{#results}} 66 |
  1. 67 |
  2. 74 | {{/results}} 75 |
76 | 83 |
84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /resources/static/css/bootstrap-datetimepicker.min.css: -------------------------------------------------------------------------------- 1 | /*! 2 | * Datetimepicker for Bootstrap 3 3 | * version : 4.17.47 4 | * https://github.com/Eonasdan/bootstrap-datetimepicker/ 5 | */.bootstrap-datetimepicker-widget{list-style:none}.bootstrap-datetimepicker-widget.dropdown-menu{display:block;margin:2px 0;padding:4px;width:19em}@media (min-width:768px){.bootstrap-datetimepicker-widget.dropdown-menu.timepicker-sbs{width:38em}}@media (min-width:992px){.bootstrap-datetimepicker-widget.dropdown-menu.timepicker-sbs{width:38em}}@media (min-width:1200px){.bootstrap-datetimepicker-widget.dropdown-menu.timepicker-sbs{width:38em}}.bootstrap-datetimepicker-widget.dropdown-menu:before,.bootstrap-datetimepicker-widget.dropdown-menu:after{content:'';display:inline-block;position:absolute}.bootstrap-datetimepicker-widget.dropdown-menu.bottom:before{border-left:7px solid transparent;border-right:7px solid transparent;border-bottom:7px solid #ccc;border-bottom-color:rgba(0,0,0,0.2);top:-7px;left:7px}.bootstrap-datetimepicker-widget.dropdown-menu.bottom:after{border-left:6px solid transparent;border-right:6px solid transparent;border-bottom:6px solid white;top:-6px;left:8px}.bootstrap-datetimepicker-widget.dropdown-menu.top:before{border-left:7px solid transparent;border-right:7px solid transparent;border-top:7px solid #ccc;border-top-color:rgba(0,0,0,0.2);bottom:-7px;left:6px}.bootstrap-datetimepicker-widget.dropdown-menu.top:after{border-left:6px solid transparent;border-right:6px solid transparent;border-top:6px solid white;bottom:-6px;left:7px}.bootstrap-datetimepicker-widget.dropdown-menu.pull-right:before{left:auto;right:6px}.bootstrap-datetimepicker-widget.dropdown-menu.pull-right:after{left:auto;right:7px}.bootstrap-datetimepicker-widget 
.list-unstyled{margin:0}.bootstrap-datetimepicker-widget a[data-action]{padding:6px 0}.bootstrap-datetimepicker-widget a[data-action]:active{box-shadow:none}.bootstrap-datetimepicker-widget .timepicker-hour,.bootstrap-datetimepicker-widget .timepicker-minute,.bootstrap-datetimepicker-widget .timepicker-second{width:54px;font-weight:bold;font-size:1.2em;margin:0}.bootstrap-datetimepicker-widget button[data-action]{padding:6px}.bootstrap-datetimepicker-widget .btn[data-action="incrementHours"]::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Increment Hours"}.bootstrap-datetimepicker-widget .btn[data-action="incrementMinutes"]::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Increment Minutes"}.bootstrap-datetimepicker-widget .btn[data-action="decrementHours"]::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Decrement Hours"}.bootstrap-datetimepicker-widget .btn[data-action="decrementMinutes"]::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Decrement Minutes"}.bootstrap-datetimepicker-widget .btn[data-action="showHours"]::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Show Hours"}.bootstrap-datetimepicker-widget .btn[data-action="showMinutes"]::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Show Minutes"}.bootstrap-datetimepicker-widget .btn[data-action="togglePeriod"]::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Toggle AM/PM"}.bootstrap-datetimepicker-widget 
.btn[data-action="clear"]::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Clear the picker"}.bootstrap-datetimepicker-widget .btn[data-action="today"]::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Set the date to today"}.bootstrap-datetimepicker-widget .picker-switch{text-align:center}.bootstrap-datetimepicker-widget .picker-switch::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Toggle Date and Time Screens"}.bootstrap-datetimepicker-widget .picker-switch td{padding:0;margin:0;height:auto;width:auto;line-height:inherit}.bootstrap-datetimepicker-widget .picker-switch td span{line-height:2.5;height:2.5em;width:100%}.bootstrap-datetimepicker-widget table{width:100%;margin:0}.bootstrap-datetimepicker-widget table td,.bootstrap-datetimepicker-widget table th{text-align:center;border-radius:4px}.bootstrap-datetimepicker-widget table th{height:20px;line-height:20px;width:20px}.bootstrap-datetimepicker-widget table th.picker-switch{width:145px}.bootstrap-datetimepicker-widget table th.disabled,.bootstrap-datetimepicker-widget table th.disabled:hover{background:none;color:#777;cursor:not-allowed}.bootstrap-datetimepicker-widget table th.prev::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Previous Month"}.bootstrap-datetimepicker-widget table th.next::after{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0;content:"Next Month"}.bootstrap-datetimepicker-widget table thead tr:first-child th{cursor:pointer}.bootstrap-datetimepicker-widget table thead tr:first-child th:hover{background:#eee}.bootstrap-datetimepicker-widget table td{height:54px;line-height:54px;width:54px}.bootstrap-datetimepicker-widget table 
td.cw{font-size:.8em;height:20px;line-height:20px;color:#777}.bootstrap-datetimepicker-widget table td.day{height:20px;line-height:20px;width:20px}.bootstrap-datetimepicker-widget table td.day:hover,.bootstrap-datetimepicker-widget table td.hour:hover,.bootstrap-datetimepicker-widget table td.minute:hover,.bootstrap-datetimepicker-widget table td.second:hover{background:#eee;cursor:pointer}.bootstrap-datetimepicker-widget table td.old,.bootstrap-datetimepicker-widget table td.new{color:#777}.bootstrap-datetimepicker-widget table td.today{position:relative}.bootstrap-datetimepicker-widget table td.today:before{content:'';display:inline-block;border:solid transparent;border-width:0 0 7px 7px;border-bottom-color:#337ab7;border-top-color:rgba(0,0,0,0.2);position:absolute;bottom:4px;right:4px}.bootstrap-datetimepicker-widget table td.active,.bootstrap-datetimepicker-widget table td.active:hover{background-color:#337ab7;color:#fff;text-shadow:0 -1px 0 rgba(0,0,0,0.25)}.bootstrap-datetimepicker-widget table td.active.today:before{border-bottom-color:#fff}.bootstrap-datetimepicker-widget table td.disabled,.bootstrap-datetimepicker-widget table td.disabled:hover{background:none;color:#777;cursor:not-allowed}.bootstrap-datetimepicker-widget table td span{display:inline-block;width:54px;height:54px;line-height:54px;margin:2px 1.5px;cursor:pointer;border-radius:4px}.bootstrap-datetimepicker-widget table td span:hover{background:#eee}.bootstrap-datetimepicker-widget table td span.active{background-color:#337ab7;color:#fff;text-shadow:0 -1px 0 rgba(0,0,0,0.25)}.bootstrap-datetimepicker-widget table td span.old{color:#777}.bootstrap-datetimepicker-widget table td span.disabled,.bootstrap-datetimepicker-widget table td span.disabled:hover{background:none;color:#777;cursor:not-allowed}.bootstrap-datetimepicker-widget.usetwentyfour td.hour{height:27px;line-height:27px}.bootstrap-datetimepicker-widget.wider{width:21em}.bootstrap-datetimepicker-widget .datepicker-decades 
.decade{line-height:1.8em !important}.input-group.date .input-group-addon{cursor:pointer}.sr-only{position:absolute;width:1px;height:1px;margin:-1px;padding:0;overflow:hidden;clip:rect(0, 0, 0, 0);border:0} -------------------------------------------------------------------------------- /resources/static/css/search.css: -------------------------------------------------------------------------------- 1 | /* query box */ 2 | .datetime-form .date .btn { 3 | width: 10rem; 4 | 5 | } 6 | .datetime-form .date span.btn { 7 | width: 7rem; 8 | text-align: left; 9 | background-color: #eee; 10 | border-color: #ccc; 11 | color: black; 12 | } 13 | .datetime-form .date span.btn:hover { 14 | z-index: default; 15 | cursor: default; 16 | } 17 | .input-group.date { 18 | margin-bottom: 0.25rem; 19 | } 20 | 21 | /* current query */ 22 | .current-query { 23 | margin-bottom: 1rem; 24 | } 25 | 26 | /* result */ 27 | .query { 28 | font-weight: bold; 29 | } 30 | 31 | .results { 32 | list-style-type: none; 33 | } 34 | li.result { 35 | margin-bottom: 2rem; 36 | } 37 | li.result .links { 38 | display: block; 39 | margin-bottom: -0.5rem; 40 | } 41 | li.result .title { 42 | font-size: 18px; 43 | } 44 | li.result .archive, li.result .live { 45 | font-size: 12px; 46 | } 47 | li.result .archive::before, li.result .live::before { 48 | content: "["; 49 | } 50 | li.result .archive::after, li.result .live::after { 51 | content: "]"; 52 | } 53 | li.result .meta { 54 | font-size: 12px; 55 | display: flex; 56 | justify-content: space-between; 57 | } 58 | li.result .meta .uri, li.result .meta time { 59 | display: flex-item; 60 | } 61 | li.result .meta time::before { 62 | content: "@"; 63 | } 64 | li.result .snippet { 65 | display: block; 66 | margin-top: 0.25rem; 67 | } 68 | 69 | /* footer */ 70 | nav.footer { 71 | text-align: center; 72 | } 73 | -------------------------------------------------------------------------------- /resources/static/fonts/glyphicons-halflings-regular.eot: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/webis-de/wasp/3dd9a6be6ae3ed911c11f5ead1b10b14379172a0/resources/static/fonts/glyphicons-halflings-regular.eot -------------------------------------------------------------------------------- /resources/static/fonts/glyphicons-halflings-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webis-de/wasp/3dd9a6be6ae3ed911c11f5ead1b10b14379172a0/resources/static/fonts/glyphicons-halflings-regular.ttf -------------------------------------------------------------------------------- /resources/static/fonts/glyphicons-halflings-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webis-de/wasp/3dd9a6be6ae3ed911c11f5ead1b10b14379172a0/resources/static/fonts/glyphicons-halflings-regular.woff -------------------------------------------------------------------------------- /resources/static/fonts/glyphicons-halflings-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webis-de/wasp/3dd9a6be6ae3ed911c11f5ead1b10b14379172a0/resources/static/fonts/glyphicons-halflings-regular.woff2 -------------------------------------------------------------------------------- /resources/static/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Web Archive Search Personalized 6 | 7 | 8 | Go to search page. 
9 | -------------------------------------------------------------------------------- /resources/static/js/search.js: -------------------------------------------------------------------------------- 1 | document.querySelector("[name='timezone']").value = 2 | Intl.DateTimeFormat().resolvedOptions().timeZone; 3 | 4 | const dateTimePickerFormat = 'YYYY-MM-DD HH:mm'; 5 | 6 | $(".date").datetimepicker({format: dateTimePickerFormat}); 7 | 8 | function enableTimeOffsetButtons(targetName) { 9 | const input = document.querySelector("[name='" + targetName + "']"); 10 | const buttons = document.querySelectorAll( 11 | "[data-button-target-name='" + targetName + "'] button[data-time-offset]"); 12 | for (let b = 0; b < buttons.length; ++b) { 13 | buttons[b].addEventListener("click", function(event) { 14 | const dataTimeOffset = event.target.getAttribute("data-time-offset"); 15 | if (dataTimeOffset == "") { 16 | input.value = ""; 17 | } else { 18 | const dataTimeOffsetAmount = 19 | event.target.getAttribute("data-time-offset-amount"); 20 | input.value = moment().subtract(dataTimeOffsetAmount, dataTimeOffset) 21 | .format(dateTimePickerFormat); 22 | } 23 | }); 24 | } 25 | } 26 | 27 | enableTimeOffsetButtons("from"); 28 | enableTimeOffsetButtons("to"); 29 | -------------------------------------------------------------------------------- /src/de/webis/wasp/SearchService.java: -------------------------------------------------------------------------------- 1 | package de.webis.wasp; 2 | 3 | import java.io.IOException; 4 | 5 | import org.eclipse.jetty.server.Server; 6 | import org.eclipse.jetty.server.session.SessionHandler; 7 | import org.eclipse.jetty.servlet.DefaultServlet; 8 | import org.eclipse.jetty.servlet.ServletContextHandler; 9 | import org.eclipse.jetty.servlet.ServletHolder; 10 | import org.eclipse.jetty.util.resource.ResourceCollection; 11 | 12 | import de.webis.wasp.ui.SearchServlet; 13 | 14 | public class SearchService extends Thread { 15 | 16 | public static final int 
DEFAULT_PORT = 8003; 17 | 18 | protected final ServletHolder servletHolder; 19 | 20 | protected final int port; 21 | 22 | public SearchService(final int port) { 23 | this.servletHolder = new ServletHolder(SearchServlet.class); 24 | this.port = port; 25 | } 26 | 27 | @Override 28 | public void run() { 29 | final ServletContextHandler servletHandler = new ServletContextHandler(); 30 | servletHandler.setContextPath("/"); 31 | servletHandler.setSessionHandler(new SessionHandler()); 32 | 33 | // Search Servlet 34 | servletHandler.addServlet( 35 | this.servletHolder, "/" + SearchServlet.SERVLET_PATH); 36 | 37 | // Serve files from resources/static/ 38 | try { 39 | servletHandler.setBaseResource(new ResourceCollection( 40 | this.getClass().getClassLoader().getResource("static") 41 | .toExternalForm())); 42 | } catch (final IOException e) { 43 | throw new RuntimeException(e); // should never happen 44 | } 45 | final ServletHolder resourcesServlet = 46 | new ServletHolder("static-embedded", DefaultServlet.class); 47 | resourcesServlet.setInitParameter("dirAllowed", "true"); 48 | servletHandler.addServlet(resourcesServlet, "/"); 49 | 50 | // Start server 51 | final Server server = new Server(this.port); 52 | server.setHandler(servletHandler); 53 | try { 54 | server.start(); 55 | server.join(); 56 | } catch (final Exception exception) { 57 | throw new RuntimeException(exception); 58 | } 59 | } 60 | 61 | protected void setInitParameter(final String parameter, final String value) { 62 | this.servletHolder.setInitParameter(parameter, value); 63 | } 64 | 65 | public static void main(final String[] args) { 66 | final int port = args.length > 0 ? 
Integer.parseInt(args[0]) : DEFAULT_PORT; 67 | final SearchService service = new SearchService(port); 68 | service.run(); 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /src/de/webis/wasp/WarcIndexingService.java: -------------------------------------------------------------------------------- 1 | package de.webis.wasp; 2 | 3 | import java.io.IOException; 4 | import java.nio.file.Path; 5 | import java.nio.file.Paths; 6 | import java.util.logging.ConsoleHandler; 7 | import java.util.logging.Level; 8 | import java.util.logging.Logger; 9 | 10 | import de.webis.wasp.index.Index; 11 | import de.webis.wasp.index.WarcIndexer; 12 | import de.webis.wasp.warcs.ArchiveWatcher; 13 | 14 | /** 15 | * Service to index WARC records. 16 | * 17 | * @author johannes.kiesel@uni-weimar.de 18 | * 19 | */ 20 | public class WarcIndexingService 21 | extends ArchiveWatcher { 22 | 23 | private final Index index; 24 | 25 | /** 26 | * Creates a new WARC indexing service. 27 | * @param directory The directory that contains the archive files 28 | * @param port The port of the index to add new WARC records to 29 | * @throws IOException On reading records 30 | */ 31 | public WarcIndexingService(final Path directory, final int port) 32 | throws IOException { 33 | this(directory, new Index(port)); 34 | } 35 | 36 | /** 37 | * Creates a new WARC indexing service. 
38 | * @param directory The directory that contains the archive files 39 | * @param index The index to add new WARC records to 40 | * @throws IOException On reading records 41 | */ 42 | public WarcIndexingService(final Path directory, final Index index) 43 | throws IOException { 44 | super(directory, false, new WarcIndexer(index)); 45 | this.index = index; 46 | } 47 | 48 | @Override 49 | public void close() throws IOException { 50 | super.close(); 51 | this.index.close(); 52 | } 53 | 54 | 55 | /** 56 | * Starts the service 57 | * @param args directory [index-port] 58 | * @throws IOException On reading or indexing 59 | */ 60 | public static void main(final String[] args) throws IOException { 61 | final ConsoleHandler handler = new ConsoleHandler(); 62 | handler.setLevel(Level.FINE); 63 | final Logger logger = Logger.getLogger("de.webis.wasp"); 64 | logger.addHandler(handler); 65 | logger.setLevel(Level.FINE); 66 | 67 | final Path directory = Paths.get(args[0]); 68 | final int port = 69 | args.length != 2 ? 
Index.DEFAULT_PORT : Integer.parseInt(args[1]); 70 | try (final WarcIndexingService service = 71 | new WarcIndexingService(directory, port)) { 72 | service.run(); 73 | } 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /src/de/webis/wasp/index/Index.java: -------------------------------------------------------------------------------- 1 | package de.webis.wasp.index; 2 | 3 | import java.io.IOException; 4 | import java.time.Instant; 5 | import java.time.format.DateTimeFormatter; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | import java.util.Map; 9 | import java.util.Objects; 10 | import java.util.logging.Logger; 11 | 12 | import org.apache.http.HttpHost; 13 | import org.elasticsearch.client.RestClient; 14 | 15 | import com.fasterxml.jackson.core.JsonGenerator; 16 | import com.fasterxml.jackson.core.JsonParser; 17 | import com.fasterxml.jackson.core.JsonProcessingException; 18 | import com.fasterxml.jackson.databind.DeserializationContext; 19 | import com.fasterxml.jackson.databind.ObjectMapper; 20 | import com.fasterxml.jackson.databind.SerializerProvider; 21 | import com.fasterxml.jackson.databind.deser.std.StdDeserializer; 22 | import com.fasterxml.jackson.databind.module.SimpleModule; 23 | import com.fasterxml.jackson.databind.node.ObjectNode; 24 | import com.fasterxml.jackson.databind.ser.std.StdSerializer; 25 | 26 | import co.elastic.clients.elasticsearch.ElasticsearchClient; 27 | import co.elastic.clients.elasticsearch._types.query_dsl.ChildScoreMode; 28 | import co.elastic.clients.elasticsearch.core.GetResponse; 29 | import co.elastic.clients.elasticsearch.core.IndexRequest; 30 | import co.elastic.clients.elasticsearch.core.SearchRequest; 31 | import co.elastic.clients.elasticsearch.core.SearchResponse; 32 | import co.elastic.clients.elasticsearch.core.UpdateRequest; 33 | import co.elastic.clients.elasticsearch.core.search.Hit; 34 | import 
co.elastic.clients.elasticsearch.core.search.HitsMetadata; 35 | import co.elastic.clients.elasticsearch.indices.CreateIndexRequest; 36 | import co.elastic.clients.json.JsonData; 37 | import co.elastic.clients.json.jackson.JacksonJsonpMapper; 38 | import co.elastic.clients.transport.ElasticsearchTransport; 39 | import co.elastic.clients.transport.rest_client.RestClientTransport; 40 | 41 | /** 42 | * The WASP index client. 43 | * 44 | * @author johannes.kiesel@uni-weimar.de 45 | * 46 | */ 47 | public class Index 48 | implements AutoCloseable { 49 | 50 | ///////////////////////////////////////////////////////////////////////////// 51 | // LOGGING 52 | ///////////////////////////////////////////////////////////////////////////// 53 | 54 | private static final Logger LOG = 55 | Logger.getLogger(Index.class.getName()); 56 | 57 | ///////////////////////////////////////////////////////////////////////////// 58 | // CONSTANTS 59 | ///////////////////////////////////////////////////////////////////////////// 60 | 61 | /** 62 | * Default port of the index. 63 | */ 64 | public static final int DEFAULT_PORT = 9200; 65 | 66 | /** 67 | * Name of the index to use. 68 | */ 69 | public static final String INDEX_NAME = "archive"; 70 | 71 | /** 72 | * Default number of results to retrieve at most from the index at once. 73 | */ 74 | public static final int DEFAULT_MAX_RESULTS = 100; 75 | 76 | /** 77 | * Object mapper for JSON (de-)serialization. 78 | */ 79 | protected static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() 80 | .registerModule(new SimpleModule() 81 | .addSerializer(new InstantSerializer()) 82 | .addDeserializer(Instant.class, new InstantDeserializer())); 83 | 84 | /** 85 | * Elasticsearch object mapper for JSON (de-)serialization. 
86 | */ 87 | protected static final JacksonJsonpMapper MAPPER = 88 | new JacksonJsonpMapper(OBJECT_MAPPER); 89 | 90 | ///////////////////////////////////////////////////////////////////////////// 91 | // MEMBERS 92 | ///////////////////////////////////////////////////////////////////////////// 93 | 94 | private final ElasticsearchClient client; 95 | 96 | private final RestClient lowLevelClient; 97 | 98 | ///////////////////////////////////////////////////////////////////////////// 99 | // CONSTRUCTION 100 | ///////////////////////////////////////////////////////////////////////////// 101 | 102 | /** 103 | * Creates a new index client talking to the index at the default port. 104 | * @see #DEFAULT_PORT 105 | */ 106 | public Index() { 107 | this(DEFAULT_PORT); 108 | } 109 | 110 | /** 111 | * Creates a new index client talking to the index at the specified port. 112 | * @param port The port 113 | */ 114 | public Index(final int port) { 115 | this.lowLevelClient = 116 | RestClient.builder(new HttpHost("localhost", port)).build(); 117 | final ElasticsearchTransport transport = 118 | new RestClientTransport(this.lowLevelClient, MAPPER); 119 | this.client = new ElasticsearchClient(transport); 120 | } 121 | 122 | /** 123 | * Initializes the index. 124 | *

125 | * This method must be called exactly once to create the index; it must not be 126 | * called again, not even after a restart of WASP. 127 | *

128 | * @throws IOException On initializing the index 129 | */ 130 | public void initialize() 131 | throws IOException { 132 | final CreateIndexRequest createIndexRequest = CreateIndexRequest.of( 133 | indexBuilder -> indexBuilder 134 | .index(INDEX_NAME) 135 | .mappings(mappings -> mappings 136 | .properties(ResponseRecord.TYPE_PROPERTIES))); 137 | LOG.info("Created index: " + createIndexRequest); 138 | this.getClient().indices().create(createIndexRequest); 139 | } 140 | 141 | ///////////////////////////////////////////////////////////////////////////// 142 | // GETTERS 143 | ///////////////////////////////////////////////////////////////////////////// 144 | 145 | /** 146 | * Gets the low level REST client used to communicate with the index. 147 | * @return The client 148 | */ 149 | protected RestClient getLowLevelClient() { 150 | return this.lowLevelClient; 151 | } 152 | 153 | 154 | /** 155 | * Gets the high level client used to communicate with the index. 156 | * @return The client 157 | */ 158 | protected ElasticsearchClient getClient() { 159 | return this.client; 160 | } 161 | 162 | ///////////////////////////////////////////////////////////////////////////// 163 | // FUNCTIONALITY 164 | ///////////////////////////////////////////////////////////////////////////// 165 | 166 | @Override 167 | public void close() throws IOException { 168 | this.getLowLevelClient().close(); 169 | } 170 | 171 | ///////////////////////////////////////////////////////////////////////////// 172 | // INDEXING 173 | 174 | /** 175 | * Indexes a response record. 
176 | * @param id The ID of the response 177 | * @param uri The target URI of the response 178 | * @param content The extracted content from the response 179 | * @param title The title of the response 180 | * @return Whether the response has been indexed (always) 181 | * @throws IOException On writing to the index 182 | */ 183 | public boolean indexResponse( 184 | final String id, final String uri, 185 | final String content, final String title) 186 | throws IOException { 187 | final IndexRequest indexRequest = IndexRequest.of( 188 | builder -> builder 189 | .index(INDEX_NAME) 190 | .id(Objects.requireNonNull(id)) 191 | .document(ResponseRecord.forPage(uri, title, content))); 192 | this.getClient().index(indexRequest); 193 | LOG.fine("Index response " + id); 194 | return true; 195 | } 196 | 197 | /** 198 | * Indexes a revisit record. 199 | * @param id The ID of the revisit 200 | * @param uri The target URI of the revisit 201 | * @param originalTime The time of the first visit 202 | * @param instant The time of the revisit 203 | * @return Whether the revisit has been indexed (not if no such response 204 | * exists) 205 | * @throws IOException On reading or writing to the index 206 | */ 207 | public boolean indexRevisit( 208 | final String id, final String uri, 209 | final Instant originalTime, final Instant instant) 210 | throws IOException { 211 | final SearchRequest search = new SearchRequest.Builder() 212 | .query(query -> query 213 | .bool(main -> main 214 | .must(time -> time.nested(nested -> nested 215 | .path(ResponseRecord.FIELD_REQUESTS) 216 | .scoreMode(ChildScoreMode.Max) 217 | .query(inner -> inner.match(range -> 218 | range.field(ResponseRecord.FIELD_REQUESTS + "." 
219 | + RequestRecord.FIELD_DATE) 220 | .query(originalTime.toString()) 221 | )) 222 | )) 223 | ) 224 | ).build(); 225 | final HitsMetadata hits = 226 | this.getClient().search(search, ResponseRecord.class).hits(); 227 | if (hits.hits().size() > 0) { 228 | final String responseId = hits.hits().get(0).id(); 229 | LOG.fine("Index revisit " + uri + " -> " + responseId); 230 | this.indexRequest(responseId, uri, instant); 231 | return true; 232 | } else { 233 | LOG.warning("Index revisit " + uri + " FAILED"); 234 | return false; 235 | } 236 | } 237 | 238 | /** 239 | * Indexes a request record. 240 | * @param concurrentId The ID of the concurrent response 241 | * @param uri The URI of the request 242 | * @param instant The time of the request 243 | * @return Whether the request has been indexed (not if no such response 244 | * exists) 245 | * @throws IOException On reading or writing to the index 246 | */ 247 | public boolean indexRequest( 248 | final String concurrentId, final String uri, final Instant instant) 249 | throws IOException { 250 | final GetResponse response = 251 | this.resolveResponse(concurrentId); 252 | if (response == null) { 253 | LOG.fine("No response found for ID = " + concurrentId + " for request"); 254 | return false; 255 | } 256 | 257 | final String field = ResponseRecord.FIELD_REQUESTS; 258 | final Map params = Map.of(field, JsonData.of( 259 | new RequestRecord(uri, instant), MAPPER)); 260 | final String scriptSource = 261 | "ctx._source." + field + ".add(params." 
+ field + ");"; 262 | 263 | final UpdateRequest updateRequest = 264 | UpdateRequest.of(builder -> builder 265 | .index(INDEX_NAME) 266 | .id(response.id()) 267 | .script(script -> script.inline(inline -> inline 268 | .lang("painless") 269 | .source(scriptSource) 270 | .params(params)))); 271 | this.getClient().update(updateRequest, ResponseRecord.class); 272 | LOG.fine("Index request -> " + concurrentId + " at " + instant); 273 | return true; 274 | } 275 | 276 | ///////////////////////////////////////////////////////////////////////////// 277 | // SEARCH 278 | 279 | /** 280 | * Searches the index. 281 | * @param query The query to match responses and requests by 282 | * @return The results 283 | * @throws IOException On searching the index 284 | * @see #DEFAULT_MAX_RESULTS 285 | */ 286 | public List search(final Query query) 287 | throws IOException { 288 | return this.search(query, DEFAULT_MAX_RESULTS); 289 | } 290 | 291 | /** 292 | * Searches the index. 293 | * @param query The query to match responses and requests by 294 | * @param maxResults The maximum number of results to get 295 | * @return The results 296 | * @throws IOException On searching the index 297 | */ 298 | public List search(final Query query, final int maxResults) 299 | throws IOException { 300 | return this.search(query, maxResults, 0); 301 | } 302 | 303 | /** 304 | * Searches the index. 
305 | * @param query The query to match responses and requests by 306 | * @param maxResults The maximum number of results to get 307 | * @param offset The offset of the first result to get 308 | * @return The results 309 | * @throws IOException On searching the index 310 | */ 311 | public List search( 312 | final Query query, final int maxResults, final int offset) 313 | throws IOException { 314 | final SearchResponse search = this.getClient().search( 315 | query.build(maxResults).from(offset).build(), ResponseRecord.class); 316 | final HitsMetadata hits = search.hits(); 317 | 318 | final List results = new ArrayList<>(); 319 | for (final Hit hit : hits.hits()) { 320 | final Result result = Result.fromHit(hit, query.getFrom(), query.getTo()); 321 | if (!result.hasEmptySnippet()) { results.add(result); } 322 | } 323 | return results; 324 | } 325 | 326 | ///////////////////////////////////////////////////////////////////////////// 327 | // HELPERS 328 | ///////////////////////////////////////////////////////////////////////////// 329 | 330 | /** 331 | * Gets the response with the specified ID. 332 | * @param id The response ID 333 | * @return The response 334 | * @throws IOException On searching the index 335 | */ 336 | protected GetResponse resolveResponse(final String id) 337 | throws IOException { 338 | final GetResponse getResponse = this.getClient().get( 339 | get -> get.index(INDEX_NAME).id(id), 340 | ResponseRecord.class); 341 | if (getResponse.found()) { 342 | return getResponse; 343 | } else { 344 | return null; 345 | } 346 | } 347 | 348 | 349 | ///////////////////////////////////////////////////////////////////////////// 350 | // JSON BINDINGS 351 | ///////////////////////////////////////////////////////////////////////////// 352 | 353 | /** 354 | * Serializer for {@link Instant} using ISO-8601. 
355 | * 356 | * @author johannes.kiesel@uni-weimar.de 357 | * @see InstantDeserializer 358 | * @see DateTimeFormatter#ISO_INSTANT 359 | * 360 | */ 361 | public static class InstantSerializer extends StdSerializer { 362 | 363 | private static final long serialVersionUID = 2795427768750728869L; 364 | 365 | /** 366 | * Creates a new serializer. 367 | */ 368 | public InstantSerializer() { 369 | super(Instant.class); 370 | } 371 | 372 | @Override 373 | public void serialize( 374 | final Instant value, 375 | final JsonGenerator generator, 376 | final SerializerProvider provider) 377 | throws IOException { 378 | generator.writeString(value.toString()); 379 | } 380 | 381 | } 382 | 383 | /** 384 | * Deserializer for {@link Instant} using ISO-8601. 385 | * 386 | * @author johannes.kiesel@uni-weimar.de 387 | * @see InstantSerializer 388 | * @see DateTimeFormatter#ISO_INSTANT 389 | * 390 | */ 391 | public static class InstantDeserializer extends StdDeserializer { 392 | 393 | private static final long serialVersionUID = -3591379516415686398L; 394 | 395 | /** 396 | * Creates a new deserializer. 397 | */ 398 | public InstantDeserializer() { 399 | super(Instant.class); 400 | } 401 | 402 | @Override 403 | public Instant deserialize( 404 | final JsonParser parser, 405 | final DeserializationContext context) 406 | throws IOException, JsonProcessingException { 407 | final String text = parser.getValueAsString(); 408 | return Instant.parse(text); 409 | } 410 | 411 | } 412 | 413 | ///////////////////////////////////////////////////////////////////////////// 414 | // MAIN 415 | ///////////////////////////////////////////////////////////////////////////// 416 | 417 | public static void main(final String[] args) throws IOException { 418 | final int port = 419 | args.length == 0 ? 
DEFAULT_PORT : Integer.parseInt(args[0]); 420 | try (final Index index = new Index(port)) { 421 | index.initialize(); 422 | } 423 | } 424 | 425 | } 426 | -------------------------------------------------------------------------------- /src/de/webis/wasp/index/Query.java: -------------------------------------------------------------------------------- 1 | package de.webis.wasp.index; 2 | 3 | import java.time.Instant; 4 | import java.util.Objects; 5 | 6 | import co.elastic.clients.elasticsearch._types.query_dsl.ChildScoreMode; 7 | import co.elastic.clients.elasticsearch._types.query_dsl.Operator; 8 | import co.elastic.clients.elasticsearch.core.SearchRequest; 9 | import co.elastic.clients.elasticsearch.core.search.Highlight; 10 | import co.elastic.clients.elasticsearch.core.search.HighlightField; 11 | 12 | /** 13 | * A query to the index with optional time constraints 14 | * 15 | * @author johannes.kiesel@uni-weimar.de 16 | * 17 | */ 18 | public class Query { 19 | 20 | ///////////////////////////////////////////////////////////////////////////// 21 | // CONSTANTS 22 | ///////////////////////////////////////////////////////////////////////////// 23 | 24 | /** 25 | * Boosting factor for the title over the content. 26 | */ 27 | protected static final float TITLE_BOOST = 2.0f; 28 | 29 | /** 30 | * Field name of the request's date within the response. 31 | */ 32 | protected static final String FIELD_DATE_COMPLETE = 33 | ResponseRecord.FIELD_REQUESTS + "." + RequestRecord.FIELD_DATE; 34 | 35 | /** 36 | * Snippet generator. 
37 | */ 38 | protected static final Highlight HIGHLIGHT = 39 | Highlight.of(highlight -> highlight 40 | .fields(ResponseRecord.FIELD_CONTENT, HighlightField.of(field -> field 41 | .type("unified")))); 42 | 43 | ///////////////////////////////////////////////////////////////////////////// 44 | // MEMBERS 45 | ///////////////////////////////////////////////////////////////////////////// 46 | 47 | private final String terms; 48 | 49 | private Instant from; 50 | 51 | private Instant to; 52 | 53 | ///////////////////////////////////////////////////////////////////////////// 54 | // CONSTRUCTORS 55 | ///////////////////////////////////////////////////////////////////////////// 56 | 57 | /** 58 | * Creates a new query. 59 | * @param terms The query terms to match the response content and title with 60 | * @param from The earliest time for a request to match this query, or 61 | * null for no constraint in this direction 62 | * @param to The latest time for a request to match this query, or 63 | * null for no constraint in this direction 64 | */ 65 | public Query( 66 | final String terms, final Instant from, final Instant to) { 67 | this.terms = Objects.requireNonNull(terms); 68 | this.from = from; 69 | this.to = to; 70 | } 71 | 72 | ///////////////////////////////////////////////////////////////////////////// 73 | // GETTERS 74 | ///////////////////////////////////////////////////////////////////////////// 75 | 76 | /** 77 | * Gets the query terms to match the response content and title with. 78 | * @return The terms 79 | */ 80 | public String getTerms() { 81 | return this.terms; 82 | } 83 | 84 | /** 85 | * Gets the earliest time for a request to match this query, if any. 86 | * @return The time or null for no constraint in this direction 87 | */ 88 | public Instant getFrom() { 89 | return this.from; 90 | } 91 | 92 | /** 93 | * Gets the latest time for a request to match this query, if any. 
94 | * @return The time or null for no constraint in this direction 95 | */ 96 | public Instant getTo() { 97 | return this.to; 98 | } 99 | 100 | ///////////////////////////////////////////////////////////////////////////// 101 | // FUNCTIONALITY 102 | ///////////////////////////////////////////////////////////////////////////// 103 | 104 | @Override 105 | public boolean equals(final Object obj) { 106 | if (obj == null) { return false; } 107 | if (obj instanceof Query) { 108 | final Query other = (Query) obj; 109 | 110 | if (!this.getTerms().equals(other.getTerms())) { return false; } 111 | 112 | final Instant thisFrom = this.getFrom(); 113 | final Instant otherFrom = other.getFrom(); 114 | if ((thisFrom == null && otherFrom != null) 115 | || (thisFrom != null && !thisFrom.equals(otherFrom))) { 116 | return false; 117 | } 118 | 119 | final Instant thisTo = this.getTo(); 120 | final Instant otherTo = other.getTo(); 121 | if ((thisTo == null && otherTo != null) 122 | || (thisTo != null && !thisTo.equals(otherTo))) { 123 | return false; 124 | } 125 | return true; 126 | } 127 | return false; 128 | } 129 | 130 | /** 131 | * Creates a search request from this query. 
132 | * @return A search request builder that is configured accordingly 133 | */ 134 | public SearchRequest.Builder build() { 135 | final Instant from = this.getFrom(); 136 | final Instant to = this.getTo(); 137 | final String terms = this.getTerms(); 138 | 139 | return new SearchRequest.Builder() 140 | .query(query -> query 141 | .bool(main -> main 142 | .must(time -> time.nested(nested -> nested 143 | .path(ResponseRecord.FIELD_REQUESTS) 144 | .scoreMode(ChildScoreMode.Max) 145 | .query(inner -> inner.range(range -> { 146 | range.field(FIELD_DATE_COMPLETE); 147 | if (from != null) { range.from(from.toString()); } 148 | if (to != null) { range.to(to.toString()); } 149 | return range; 150 | })))) 151 | .should(term -> term.bool(bool -> bool 152 | .should(should -> should 153 | .match(match -> match 154 | .field(ResponseRecord.FIELD_CONTENT) 155 | .query(terms) 156 | .operator(Operator.And))) 157 | .should(should -> should 158 | .match(match -> match 159 | .field(ResponseRecord.FIELD_TITLE) 160 | .query(terms) 161 | .operator(Operator.And) 162 | .boost(TITLE_BOOST)))) 163 | ))) 164 | .highlight(HIGHLIGHT); 165 | } 166 | 167 | /** 168 | * Creates a search request from this query. 169 | * @param pageSize The result page size 170 | * @return A search request builder that is configured accordingly 171 | */ 172 | public SearchRequest.Builder build(final int pageSize) { 173 | return this.build().size(pageSize); 174 | } 175 | 176 | /** 177 | * Creates a search request from this query. 
178 | * @param pageSize The result page size 179 | * @param page The result page 180 | * @return A search request builder that is configured accordingly 181 | */ 182 | public SearchRequest.Builder build(final int pageSize, final int page) { 183 | return this.build(pageSize).from((page - 1) * pageSize); 184 | } 185 | 186 | } 187 | -------------------------------------------------------------------------------- /src/de/webis/wasp/index/RequestRecord.java: -------------------------------------------------------------------------------- 1 | package de.webis.wasp.index; 2 | 3 | import java.time.Instant; 4 | import java.util.Map; 5 | import java.util.Objects; 6 | 7 | import com.fasterxml.jackson.annotation.JsonAutoDetect; 8 | import com.fasterxml.jackson.annotation.JsonAutoDetect.Visibility; 9 | import com.fasterxml.jackson.annotation.JsonCreator; 10 | import com.fasterxml.jackson.annotation.JsonGetter; 11 | import com.fasterxml.jackson.annotation.JsonProperty; 12 | 13 | import co.elastic.clients.elasticsearch._types.mapping.DateProperty; 14 | import co.elastic.clients.elasticsearch._types.mapping.KeywordProperty; 15 | import co.elastic.clients.elasticsearch._types.mapping.Property; 16 | 17 | /** 18 | * A record of a request for indexing / retrieval. 19 | * 20 | * @author johannes.kiesel@uni-weimar.de 21 | * 22 | */ 23 | @JsonAutoDetect( 24 | getterVisibility = Visibility.NONE, 25 | setterVisibility = Visibility.NONE) 26 | public class RequestRecord { 27 | 28 | ///////////////////////////////////////////////////////////////////////////// 29 | // CONSTANTS 30 | ///////////////////////////////////////////////////////////////////////////// 31 | 32 | /** 33 | * Name of the record's URI field. 34 | */ 35 | public static final String FIELD_URI = "uri"; 36 | 37 | /** 38 | * Name of the record's date field. 39 | */ 40 | public static final String FIELD_DATE = "date"; 41 | 42 | /** 43 | * Properties for an Elasticsearch mapping of this class. 
44 | */ 45 | public static Map TYPE_PROPERTIES = Map.of( 46 | FIELD_URI, KeywordProperty.of(property -> property)._toProperty(), 47 | FIELD_DATE, DateProperty.of(property -> property)._toProperty()); 48 | 49 | ///////////////////////////////////////////////////////////////////////////// 50 | // MEMBERS 51 | ///////////////////////////////////////////////////////////////////////////// 52 | 53 | private final String uri; 54 | 55 | private final Instant date; 56 | 57 | ///////////////////////////////////////////////////////////////////////////// 58 | // CONSTRUCTION 59 | ///////////////////////////////////////////////////////////////////////////// 60 | 61 | /** 62 | * Creates a new record for some request. 63 | * @param uri The URI of the request 64 | * @param date The date of the request 65 | */ 66 | @JsonCreator 67 | public RequestRecord( 68 | @JsonProperty(FIELD_URI) final String uri, 69 | @JsonProperty(FIELD_DATE) final Instant date) { 70 | this.uri = Objects.requireNonNull(uri); 71 | this.date = Objects.requireNonNull(date); 72 | } 73 | 74 | ///////////////////////////////////////////////////////////////////////////// 75 | // GETTERS 76 | ///////////////////////////////////////////////////////////////////////////// 77 | 78 | /** 79 | * Gets the URI of the request. 80 | * @return The URI 81 | */ 82 | @JsonGetter(FIELD_URI) 83 | public String getUri() { 84 | return this.uri; 85 | } 86 | 87 | /** 88 | * Gets the date of the request. 
89 | * @return The date 90 | */ 91 | @JsonGetter(FIELD_DATE) 92 | public Instant getDate() { 93 | return this.date; 94 | } 95 | 96 | } 97 | -------------------------------------------------------------------------------- /src/de/webis/wasp/index/ResponseRecord.java: -------------------------------------------------------------------------------- 1 | package de.webis.wasp.index; 2 | 3 | import java.util.List; 4 | import java.util.Map; 5 | import java.util.Objects; 6 | 7 | import com.fasterxml.jackson.annotation.JsonAutoDetect; 8 | import com.fasterxml.jackson.annotation.JsonAutoDetect.Visibility; 9 | import com.fasterxml.jackson.annotation.JsonCreator; 10 | import com.fasterxml.jackson.annotation.JsonGetter; 11 | import com.fasterxml.jackson.annotation.JsonProperty; 12 | 13 | import co.elastic.clients.elasticsearch._types.mapping.KeywordProperty; 14 | import co.elastic.clients.elasticsearch._types.mapping.NestedProperty; 15 | import co.elastic.clients.elasticsearch._types.mapping.Property; 16 | import co.elastic.clients.elasticsearch._types.mapping.TextProperty; 17 | 18 | /** 19 | * A record of a response or revisit for indexing / retrieval. 20 | * 21 | * @author johannes.kiesel@uni-weimar.de 22 | * 23 | */ 24 | @JsonAutoDetect( 25 | getterVisibility = Visibility.NONE, 26 | setterVisibility = Visibility.NONE) 27 | public class ResponseRecord { 28 | 29 | ///////////////////////////////////////////////////////////////////////////// 30 | // CONSTANTS 31 | ///////////////////////////////////////////////////////////////////////////// 32 | 33 | /** 34 | * Name of the record's target URI field. 35 | */ 36 | public static final String FIELD_URI = "uri"; 37 | 38 | /** 39 | * Name of the record's title field. 40 | */ 41 | public static final String FIELD_TITLE = "title"; 42 | 43 | /** 44 | * Name of the record's content field. 45 | */ 46 | public static final String FIELD_CONTENT = "content"; 47 | 48 | /** 49 | * Name of the record's requests field. 
50 | */ 51 | public static final String FIELD_REQUESTS = "requests"; 52 | 53 | /** 54 | * Properties for an Elasticsearch mapping of this class. 55 | */ 56 | public static Map TYPE_PROPERTIES = Map.of( 57 | FIELD_URI, KeywordProperty.of(property -> property)._toProperty(), 58 | FIELD_TITLE, TextProperty.of(property -> property)._toProperty(), 59 | FIELD_CONTENT, TextProperty.of(property -> property)._toProperty(), 60 | FIELD_REQUESTS, NestedProperty.of(property -> property 61 | .properties(RequestRecord.TYPE_PROPERTIES) 62 | )._toProperty()); 63 | 64 | ///////////////////////////////////////////////////////////////////////////// 65 | // MEMBERS 66 | ///////////////////////////////////////////////////////////////////////////// 67 | 68 | private final String uri; 69 | 70 | private final String title; 71 | 72 | private final String content; 73 | 74 | private final List requests; 75 | 76 | ///////////////////////////////////////////////////////////////////////////// 77 | // CONSTRUCTION 78 | ///////////////////////////////////////////////////////////////////////////// 79 | 80 | /** 81 | * Creates a new record for some request. 
82 | * @param uri The target URI of the response page or revisit 83 | * @param title The title of the response page (or null if a 84 | * revisit) 85 | * @param content The extracted content of the response page (or 86 | * null if a revisit) 87 | * @param requests The requests that led to this response (empty if a revisit) 88 | */ 89 | @JsonCreator 90 | public ResponseRecord( 91 | @JsonProperty(FIELD_URI) final String uri, 92 | @JsonProperty(FIELD_TITLE) final String title, 93 | @JsonProperty(FIELD_CONTENT) final String content, 94 | @JsonProperty(FIELD_REQUESTS) final List requests) { 95 | this.uri = Objects.requireNonNull(uri); 96 | this.title = title; 97 | this.content = content; 98 | if (requests == null) { 99 | this.requests = List.of(); 100 | } else { 101 | this.requests = List.copyOf(requests); 102 | } 103 | } 104 | 105 | /** 106 | * Creates a new record for a response page without assigned requests. 107 | * @param uri The target URI of the response page 108 | * @param title The title of the page 109 | * @param content The extracted content of the page 110 | * @return The request 111 | */ 112 | public static ResponseRecord forPage( 113 | final String uri, final String title, final String content) { 114 | return new ResponseRecord( 115 | Objects.requireNonNull(uri), 116 | Objects.requireNonNull(title), Objects.requireNonNull(content), null); 117 | } 118 | 119 | ///////////////////////////////////////////////////////////////////////////// 120 | // GETTERS 121 | ///////////////////////////////////////////////////////////////////////////// 122 | 123 | /** 124 | * Gets the URI of the response. 125 | * @return The URI 126 | */ 127 | @JsonGetter(FIELD_URI) 128 | public String getUri() { 129 | return this.uri; 130 | } 131 | 132 | /** 133 | * Gets the title of the response. 
134 | * @return The title or null if a revisit 135 | */ 136 | @JsonGetter(FIELD_TITLE) 137 | public String getTitle() { 138 | return this.title; 139 | } 140 | 141 | /** 142 | * Gets the content of the response. 143 | * @return The content or null if a revisit 144 | */ 145 | @JsonGetter(FIELD_CONTENT) 146 | public String getContent() { 147 | return this.content; 148 | } 149 | 150 | /** 151 | * Gets the requests that led to this response. 152 | * @return The list of requests (empty if a revisit) 153 | */ 154 | @JsonGetter(FIELD_REQUESTS) 155 | public List getRequests() { 156 | return this.requests; 157 | } 158 | 159 | } 160 | -------------------------------------------------------------------------------- /src/de/webis/wasp/index/Result.java: -------------------------------------------------------------------------------- 1 | package de.webis.wasp.index; 2 | 3 | import java.time.Instant; 4 | import java.util.List; 5 | import java.util.ListIterator; 6 | import java.util.Map; 7 | import java.util.Objects; 8 | 9 | import com.fasterxml.jackson.annotation.JsonAutoDetect; 10 | import com.fasterxml.jackson.annotation.JsonCreator; 11 | import com.fasterxml.jackson.annotation.JsonGetter; 12 | import com.fasterxml.jackson.annotation.JsonProperty; 13 | 14 | import co.elastic.clients.elasticsearch.core.search.Hit; 15 | 16 | import com.fasterxml.jackson.annotation.JsonAutoDetect.Visibility; 17 | 18 | /** 19 | * A result for a query. 20 | * 21 | * @author johannes.kiesel@uni-weimar.de 22 | * 23 | */ 24 | @JsonAutoDetect( 25 | getterVisibility = Visibility.NONE, 26 | setterVisibility = Visibility.NONE) 27 | public class Result { 28 | 29 | ///////////////////////////////////////////////////////////////////////////// 30 | // CONSTANTS 31 | ///////////////////////////////////////////////////////////////////////////// 32 | 33 | /** 34 | * Name of the result's retrieval score field. 
35 | */ 36 | public static final String FIELD_SCORE = "score"; 37 | 38 | /** 39 | * Name of the result's snippet field. 40 | */ 41 | public static final String FIELD_SNIPPET = "snippet"; 42 | 43 | /** 44 | * Name of the result's response field. 45 | */ 46 | public static final String FIELD_RESPONSE = "response"; 47 | 48 | /** 49 | * Name of the results's matched request field. 50 | */ 51 | public static final String FIELD_MATCHED_REQUEST = "matchedRequest"; 52 | 53 | ///////////////////////////////////////////////////////////////////////////// 54 | // MEMBERS 55 | ///////////////////////////////////////////////////////////////////////////// 56 | 57 | private final double score; 58 | 59 | private final String snippet; 60 | 61 | private final ResponseRecord response; 62 | 63 | private final RequestRecord matchedRequest; 64 | 65 | ///////////////////////////////////////////////////////////////////////////// 66 | // CONSTRUCTION 67 | ///////////////////////////////////////////////////////////////////////////// 68 | 69 | /** 70 | * Creates a new result. 71 | * @param score The retrieval score 72 | * @param snippet The snippet to display 73 | * @param response The underlying response 74 | * @param matchedRequest The response's request that matched the query's time 75 | * constraints 76 | */ 77 | @JsonCreator 78 | public Result( 79 | @JsonProperty(FIELD_SCORE) final double score, 80 | @JsonProperty(FIELD_SNIPPET) final String snippet, 81 | @JsonProperty(FIELD_RESPONSE) final ResponseRecord response, 82 | @JsonProperty(FIELD_MATCHED_REQUEST) final RequestRecord matchedRequest) { 83 | this.score = score; 84 | this.snippet = Objects.requireNonNull(snippet); 85 | this.response = Objects.requireNonNull(response); 86 | this.matchedRequest = Objects.requireNonNull(matchedRequest); 87 | } 88 | 89 | /** 90 | * Creates a new result from a search hit. 
91 | * @param hit The hit 92 | * @param from The earliest time for a request, or null for no 93 | * constraint in this direction 94 | * @param to The latest time for a request, or null for no 95 | * constraint in this direction 96 | * @return The result 97 | * @throws IllegalArgumentException If no request matches the constraints 98 | */ 99 | public static Result fromHit( 100 | final Hit hit, final Instant from, final Instant to) { 101 | final double score = hit.score(); 102 | 103 | final ResponseRecord response = hit.source(); 104 | final RequestRecord request = Result.matchRequest(response, from, to); 105 | final String snippet = Result.getSnippet(hit); 106 | 107 | return new Result(score, snippet, response, request); 108 | } 109 | 110 | /** 111 | * Get the response's request the matches the time constraints. 112 | * @param response The response 113 | * @param from The earliest time for a request, or null for no 114 | * constraint in this direction 115 | * @param to The latest time for a request, or null for no 116 | * constraint in this direction 117 | * @return The latest request matching the constraints 118 | * @throws IllegalArgumentException If no request matches the constraints 119 | */ 120 | protected static RequestRecord matchRequest( 121 | final ResponseRecord response, final Instant from, final Instant to) { 122 | final List requests = response.getRequests(); 123 | final ListIterator iterator = 124 | requests.listIterator(requests.size()); 125 | while (iterator.hasPrevious()) { 126 | final RequestRecord request = iterator.previous(); 127 | final Instant date = request.getDate(); 128 | if (from != null && date.isBefore(from)) { continue; } 129 | if (to != null && date.isAfter(to)) { continue; } 130 | return request; 131 | } 132 | throw new IllegalArgumentException( 133 | "it contained no request in time interval"); 134 | } 135 | 136 | /** 137 | * Gets the snippet of a search hit. 
138 | * @param hit The hit 139 | * @return The snippet (may be empty) 140 | */ 141 | protected static String getSnippet(final Hit hit) { 142 | final Map> snippetsPerField = hit.highlight(); 143 | final List snippetParts = 144 | snippetsPerField.get(ResponseRecord.FIELD_CONTENT); 145 | if (snippetParts == null) { return ""; } 146 | return String.join(" ... ", snippetParts); 147 | } 148 | 149 | ///////////////////////////////////////////////////////////////////////////// 150 | // GETTER 151 | ///////////////////////////////////////////////////////////////////////////// 152 | 153 | /** 154 | * Gets the retrieval score of the result. 155 | * @return The score 156 | */ 157 | @JsonGetter(FIELD_SCORE) 158 | public double getScore() { 159 | return this.score; 160 | } 161 | 162 | /** 163 | * Gets the snippet of the result. 164 | * @return The snippet 165 | */ 166 | @JsonGetter(FIELD_SNIPPET) 167 | public String getSnippet() { 168 | return this.snippet; 169 | } 170 | 171 | /** 172 | * Gets the response of the result. 173 | * @return The response 174 | */ 175 | @JsonGetter(FIELD_RESPONSE) 176 | public ResponseRecord getResponse() { 177 | return this.response; 178 | } 179 | 180 | /** 181 | * Gets the request of the response that was matched by the query. 182 | * @return The request 183 | */ 184 | @JsonGetter(FIELD_MATCHED_REQUEST) 185 | public RequestRecord getMatchedRequest() { 186 | return this.matchedRequest; 187 | } 188 | 189 | ///////////////////////////////////////////////////////////////////////////// 190 | // FUNCTIONALITY 191 | ///////////////////////////////////////////////////////////////////////////// 192 | 193 | /** 194 | * Checks whether the result snippet is empty. 
195 | * @return Whether it is 196 | */ 197 | public boolean hasEmptySnippet() { 198 | return this.getSnippet().isEmpty(); 199 | } 200 | 201 | @Override 202 | public String toString() { 203 | return String.format( 204 | "RESULT %.2f '%s' FROM '%s' AT %s: '%s'", 205 | this.getScore(), this.getResponse().getTitle(), 206 | this.getMatchedRequest().getUri(), 207 | this.getMatchedRequest().getDate(), 208 | this.getSnippet()); 209 | } 210 | 211 | } 212 | -------------------------------------------------------------------------------- /src/de/webis/wasp/index/WarcIndexer.java: -------------------------------------------------------------------------------- 1 | package de.webis.wasp.index; 2 | 3 | import java.io.IOException; 4 | import java.util.Objects; 5 | import java.util.logging.Logger; 6 | import java.time.Instant; 7 | 8 | import de.webis.wasp.warcs.GenericHtmlWarcRecordConsumer; 9 | 10 | /** 11 | * Consumer to index WARC records. 12 | * 13 | * @author johannes.kiesel@uni-weimar.de 14 | * 15 | */ 16 | public class WarcIndexer 17 | extends GenericHtmlWarcRecordConsumer { 18 | 19 | ///////////////////////////////////////////////////////////////////////////// 20 | // LOGGING 21 | ///////////////////////////////////////////////////////////////////////////// 22 | 23 | private static final Logger LOG = 24 | Logger.getLogger(WarcIndexer.class.getName()); 25 | 26 | ///////////////////////////////////////////////////////////////////////////// 27 | // MEMBERS 28 | ///////////////////////////////////////////////////////////////////////////// 29 | 30 | private final Index index; 31 | 32 | ///////////////////////////////////////////////////////////////////////////// 33 | // CONSTRUCTORS 34 | ///////////////////////////////////////////////////////////////////////////// 35 | 36 | /** 37 | * Creates a new consumer that indexes to the specified index. 
38 | * @param index The index 39 | */ 40 | public WarcIndexer(final Index index) { 41 | this.index = Objects.requireNonNull(index); 42 | } 43 | 44 | ///////////////////////////////////////////////////////////////////////////// 45 | // GETTERS 46 | ///////////////////////////////////////////////////////////////////////////// 47 | 48 | public Index getIndex() { 49 | return this.index; 50 | } 51 | 52 | ///////////////////////////////////////////////////////////////////////////// 53 | // FUNCTIONALITY 54 | ///////////////////////////////////////////////////////////////////////////// 55 | 56 | @Override 57 | protected void acceptHtmlResponse( 58 | final String id, final String uri, 59 | final Document document, final Instant time) 60 | throws IOException { 61 | String title = document.getTitle(); 62 | if (title == null) { title = ""; } 63 | String content = document.getContent(); 64 | if (content == null) { content = ""; } 65 | LOG.fine("accept html response " + id 66 | + " title = '" + title + "' content exists = " + !content.isEmpty()); 67 | if (!title.isEmpty() || !content.isEmpty()) { 68 | this.getIndex().indexResponse(id, uri, content, title); 69 | } 70 | } 71 | 72 | @Override 73 | protected void acceptRevisit( 74 | final String id, final String uri, final Instant originalTime, 75 | final Instant time) 76 | throws IOException { 77 | this.getIndex().indexRevisit(id, uri, originalTime, time); 78 | } 79 | 80 | @Override 81 | protected void acceptRequest( 82 | final String concurrentRecordId, 83 | final String targetUri, 84 | final Instant time) 85 | throws IOException { 86 | this.getIndex().indexRequest(concurrentRecordId, targetUri, time); 87 | } 88 | 89 | } 90 | -------------------------------------------------------------------------------- /src/de/webis/wasp/ui/SearchServlet.java: -------------------------------------------------------------------------------- 1 | package de.webis.wasp.ui; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStreamReader; 5 
| import java.time.Instant; 6 | import java.util.List; 7 | import java.util.NoSuchElementException; 8 | import java.util.TimeZone; 9 | 10 | import com.github.mustachejava.DefaultMustacheFactory; 11 | import com.github.mustachejava.Mustache; 12 | import com.github.mustachejava.MustacheFactory; 13 | 14 | import de.webis.wasp.index.Index; 15 | import de.webis.wasp.index.Query; 16 | import de.webis.wasp.index.Result; 17 | import jakarta.servlet.ServletConfig; 18 | import jakarta.servlet.ServletException; 19 | import jakarta.servlet.http.HttpServlet; 20 | import jakarta.servlet.http.HttpServletRequest; 21 | import jakarta.servlet.http.HttpServletResponse; 22 | import jakarta.servlet.http.HttpSession; 23 | 24 | /** 25 | * Servlet for the search service. 26 | * 27 | * @author johannes.kiesel@uni-weimar.de 28 | * 29 | */ 30 | public class SearchServlet 31 | extends HttpServlet { 32 | 33 | ///////////////////////////////////////////////////////////////////////////// 34 | // CONSTANTS 35 | ///////////////////////////////////////////////////////////////////////////// 36 | 37 | private static final long serialVersionUID = -5259242888271066638L; 38 | 39 | ///////////////////////////////////////////////////////////////////////////// 40 | // CONFIGURATION 41 | 42 | public static final String INIT_PARAMETER_INDEX_PORT = "index.port"; 43 | 44 | public static final int DEFAULT_INDEX_PORT = Index.DEFAULT_PORT; 45 | 46 | public static final String INIT_PARAMETER_PAGE_SIZE = "page.size"; 47 | 48 | public static final int DEFAULT_PAGE_SIZE = 10; 49 | 50 | public static final String INIT_PARAMETER_REPLAY_SERVER = "replay.server"; 51 | 52 | public static final String DEFAULT_REPLAY_SERVER = "http://localhost:8001"; 53 | 54 | public static final String INIT_PARAMETER_REPLAY_COLLECTION = "replay.collection"; 55 | 56 | public static final String DEFAULT_REPLAY_COLLECTION = "wasp"; 57 | 58 | ///////////////////////////////////////////////////////////////////////////// 59 | // REQUEST 60 | 61 
| public static final String SERVLET_PATH = "search"; 62 | 63 | public static final String REQUEST_PARAMETER_TERMS = "terms"; 64 | 65 | public static final String REQUEST_PARAMETER_FROM = "from"; 66 | 67 | public static final String REQUEST_PARAMETER_TO = "to"; 68 | 69 | public static final String REQUEST_PARAMETER_TIMEZONE = "timezone"; 70 | 71 | public static final String REQUEST_PARAMETER_PAGE_NUMBER = "page"; 72 | 73 | ///////////////////////////////////////////////////////////////////////////// 74 | // SESSION 75 | 76 | protected static final String SESSION_QUERY = "query"; 77 | 78 | protected static final String SESSION_RESULTS = "results"; 79 | 80 | ///////////////////////////////////////////////////////////////////////////// 81 | // MEMBERS 82 | ///////////////////////////////////////////////////////////////////////////// 83 | 84 | private final Mustache pageRenderer; 85 | 86 | private Index index; 87 | 88 | private int pageSize; 89 | 90 | private String replayServer; 91 | 92 | private String replayCollection; 93 | 94 | ///////////////////////////////////////////////////////////////////////////// 95 | // CONSTRUCTION 96 | ///////////////////////////////////////////////////////////////////////////// 97 | 98 | /** 99 | * Creates a new servlet. 
100 | */ 101 | public SearchServlet() { 102 | final MustacheFactory factory = new DefaultMustacheFactory(); 103 | this.pageRenderer = factory.compile(new InputStreamReader( 104 | SearchServlet.class.getResourceAsStream("search.mustache")), 105 | "search.mustache"); 106 | this.index = null; 107 | this.pageSize = 0; 108 | this.replayServer = null; 109 | this.replayCollection = null; 110 | } 111 | 112 | @Override 113 | public void init(final ServletConfig config) throws ServletException { 114 | this.index = new Index( 115 | SearchServlet.getParameterValue(config, 116 | INIT_PARAMETER_INDEX_PORT, DEFAULT_INDEX_PORT)); 117 | this.pageSize = SearchServlet.getParameterValue(config, 118 | INIT_PARAMETER_PAGE_SIZE, DEFAULT_PAGE_SIZE); 119 | this.replayServer = SearchServlet.getParameterValue(config, 120 | INIT_PARAMETER_REPLAY_SERVER, DEFAULT_REPLAY_SERVER); 121 | this.replayCollection = SearchServlet.getParameterValue(config, 122 | INIT_PARAMETER_REPLAY_COLLECTION, DEFAULT_REPLAY_COLLECTION); 123 | } 124 | 125 | ///////////////////////////////////////////////////////////////////////////// 126 | // GETTERS 127 | ///////////////////////////////////////////////////////////////////////////// 128 | 129 | /** 130 | * Gets the page renderer. 131 | * @return The renderer 132 | */ 133 | public Mustache getPageRenderer() { 134 | return this.pageRenderer; 135 | } 136 | 137 | /** 138 | * Gets the index client. 139 | * @return The client 140 | */ 141 | protected Index getIndex() { 142 | return this.index; 143 | } 144 | 145 | /** 146 | * Gets the page size to render. 147 | * @return The page size 148 | */ 149 | public int getPageSize() { 150 | return this.pageSize; 151 | } 152 | 153 | /** 154 | * Gets the address (including protocol and host) of the replay server. 155 | * @return The URI 156 | */ 157 | public String getReplayServer() { 158 | return this.replayServer; 159 | } 160 | 161 | /** 162 | * Gets the name of the collection to replay from. 
163 | * @return The name 164 | */ 165 | public String getReplayCollection() { 166 | return this.replayCollection; 167 | } 168 | 169 | ///////////////////////////////////////////////////////////////////////////// 170 | // FUNCTIONALITY 171 | ///////////////////////////////////////////////////////////////////////////// 172 | 173 | @Override 174 | protected void doGet( 175 | final HttpServletRequest request, final HttpServletResponse response) 176 | throws ServletException, IOException { 177 | final UiPage page = this.getPage(request); 178 | 179 | response.setContentType("text/html"); 180 | this.getPageRenderer().execute(response.getWriter(), page); 181 | }; 182 | 183 | ///////////////////////////////////////////////////////////////////////////// 184 | // HELPERS 185 | ///////////////////////////////////////////////////////////////////////////// 186 | 187 | /** 188 | * Gets an implementation of the search page model for rendering. 189 | * @param request The request to the servlet 190 | * @return The page model 191 | * @throws IOException On searching the index 192 | */ 193 | protected UiPage getPage(final HttpServletRequest request) 194 | throws IOException { 195 | final int pageSize = this.getPageSize(); 196 | 197 | final Query query = SearchServlet.getQuery(request); 198 | final TimeZone timezone = SearchServlet.getClientTimeZone(request); 199 | if (query == null) { 200 | return new UiPage( 201 | this.getReplayServer(), this.getReplayCollection(), 202 | request.getLocale(), timezone); 203 | } else { 204 | final List results = this.getResults(request, query); 205 | final int numResults = results.size(); 206 | final int numPages = (numResults - 1) / pageSize + 1; 207 | final int pageNumber = SearchServlet.getPageNumber(request); 208 | final int fromResult = Math.min((pageNumber - 1) * pageSize, numResults); 209 | final int toResult = Math.min(pageNumber * pageSize, numResults); 210 | final List paginatedResults = 211 | results.subList(fromResult, toResult); 212 | 213 
| return new UiPage( 214 | this.getReplayServer(), this.getReplayCollection(), 215 | query, paginatedResults, pageNumber, numPages, 216 | request.getLocale(), timezone); 217 | } 218 | } 219 | 220 | /** 221 | * Gets the results for the specified query. 222 | * @param request The request to the servlet 223 | * @param query The query 224 | * @return The results for the query 225 | * @throws IOException On searching the index 226 | */ 227 | protected List getResults( 228 | final HttpServletRequest request, final Query query) 229 | throws IOException { 230 | final HttpSession session = request.getSession(); 231 | synchronized (session) { 232 | @SuppressWarnings("unchecked") 233 | List results = 234 | (List) session.getAttribute(SESSION_RESULTS); 235 | if (results == null) { 236 | results = this.getIndex().search(query); 237 | session.setAttribute(SESSION_RESULTS, results); 238 | } 239 | return results; 240 | } 241 | } 242 | 243 | /** 244 | * Gets the query for a request. 245 | * @param request The request to the servlet 246 | * @return The query or null for none 247 | */ 248 | protected static Query getQuery(final HttpServletRequest request) { 249 | final String terms = request.getParameter(REQUEST_PARAMETER_TERMS); 250 | if (terms == null) { return null; } 251 | 252 | final TimeZone timezone = SearchServlet.getClientTimeZone(request); 253 | final Instant from = SearchServlet.parseInstant( 254 | request.getParameter(REQUEST_PARAMETER_FROM), timezone); 255 | final Instant to = SearchServlet.parseInstant( 256 | request.getParameter(REQUEST_PARAMETER_TO), timezone); 257 | final Query query = new Query(terms, from, to); 258 | 259 | final HttpSession session = request.getSession(); 260 | synchronized (session) { 261 | final Query oldQuery = (Query) session.getAttribute(SESSION_QUERY); 262 | if (query == null || !query.equals(oldQuery)) { 263 | session.setAttribute(SESSION_QUERY, query); 264 | session.removeAttribute(SESSION_RESULTS); 265 | } 266 | return query; 267 | } 268 | 
} 269 | 270 | /** 271 | * Gets the page number for a request. 272 | * @param request The request to the servlet 273 | * @return The page number (1 by default) 274 | */ 275 | protected static int getPageNumber(final HttpServletRequest request) { 276 | final String pageNumberString = 277 | request.getParameter(REQUEST_PARAMETER_PAGE_NUMBER); 278 | if (pageNumberString == null) { 279 | return 1; 280 | } else { 281 | return Integer.parseInt(pageNumberString); 282 | } 283 | } 284 | 285 | /** 286 | * Gets the time zone of the browser. 287 | * @param request The request to the servlet 288 | * @return The guessed time zone 289 | */ 290 | protected static TimeZone getClientTimeZone( 291 | final HttpServletRequest request) { 292 | final String value = request.getParameter(REQUEST_PARAMETER_TIMEZONE); 293 | if (value == null) { 294 | return TimeZone.getDefault(); 295 | } else { 296 | return TimeZone.getTimeZone(value); 297 | } 298 | } 299 | 300 | /** 301 | * Parses an instant from a request parameter. 302 | * @param value The parameter value (may be null) 303 | * @param timeZone The time zone of the browser 304 | * @return The instant or null for none 305 | */ 306 | protected static Instant parseInstant( 307 | final String value, final TimeZone timeZone) { 308 | if (value == null || value.isEmpty()) { 309 | return null; 310 | } else { 311 | return Instant.from(UiPage.UiInstant.DATE_TIME_PICKER_FORMATTER 312 | .withZone(timeZone.toZoneId()).parse(value)); 313 | } 314 | } 315 | 316 | /** 317 | * Gets the value for a parameter. 318 | * @param config The servlet configuration 319 | * @param parameter The parameter name 320 | * @return The value 321 | * @throws NoSuchElementException If no value is provided 322 | */ 323 | protected static String getParameterValue(final ServletConfig config, 324 | final String parameter) { 325 | return SearchServlet.getParameterValue(config, parameter, null); 326 | } 327 | 328 | /** 329 | * Gets the value for a parameter. 
330 | * @param config The servlet configuration 331 | * @param parameter The parameter name 332 | * @param defaultValue The default value or null for none 333 | * @return The value (may be the default) 334 | * @throws NoSuchElementException If no value and no default value is provided 335 | */ 336 | protected static String getParameterValue(final ServletConfig config, 337 | final String parameter, final String defaultValue) { 338 | final String value = config.getInitParameter(parameter); 339 | if (value == null) { 340 | if (defaultValue == null) { 341 | throw new NoSuchElementException(parameter); 342 | } else { 343 | return defaultValue; 344 | } 345 | } else { 346 | return value; 347 | } 348 | } 349 | 350 | /** 351 | * Gets the value for a parameter as integer. 352 | * @param config The servlet configuration 353 | * @param parameter The parameter name 354 | * @param defaultValue The default value 355 | * @return The value (may be the default) 356 | */ 357 | protected static int getParameterValue(final ServletConfig config, 358 | final String parameter, final int defaultValue) { 359 | final String value = config.getInitParameter(parameter); 360 | if (value == null) { 361 | return defaultValue; 362 | } else { 363 | return Integer.parseInt(value); 364 | } 365 | } 366 | 367 | } 368 | -------------------------------------------------------------------------------- /src/de/webis/wasp/ui/UiPage.java: -------------------------------------------------------------------------------- 1 | package de.webis.wasp.ui; 2 | 3 | import java.io.InputStreamReader; 4 | import java.io.StringWriter; 5 | import java.io.UnsupportedEncodingException; 6 | import java.net.URLEncoder; 7 | import java.time.Instant; 8 | import java.time.ZoneOffset; 9 | import java.time.format.DateTimeFormatter; 10 | import java.util.ArrayList; 11 | import java.util.Collections; 12 | import java.util.List; 13 | import java.util.Locale; 14 | import java.util.Objects; 15 | import java.util.TimeZone; 16 | 17 | 
import com.github.mustachejava.DefaultMustacheFactory; 18 | import com.github.mustachejava.Mustache; 19 | import com.github.mustachejava.MustacheFactory; 20 | 21 | import de.webis.wasp.index.Query; 22 | import de.webis.wasp.index.RequestRecord; 23 | import de.webis.wasp.index.ResponseRecord; 24 | import de.webis.wasp.index.Result; 25 | 26 | /** 27 | * Model for a WASP user interface web page. 28 | * 29 | * @author johannes.kiesel@uni-weimar.de 30 | * 31 | */ 32 | public class UiPage { 33 | 34 | ///////////////////////////////////////////////////////////////////////////// 35 | // CONSTANTS 36 | ///////////////////////////////////////////////////////////////////////////// 37 | 38 | protected static final int MAX_URI_DISPLAY_LENGTH = 60; 39 | 40 | ///////////////////////////////////////////////////////////////////////////// 41 | // MEMBERS 42 | ///////////////////////////////////////////////////////////////////////////// 43 | 44 | public final String replayServer; 45 | 46 | public final String replayCollection; 47 | 48 | public final String locale; 49 | 50 | public final UiQuery query; 51 | 52 | public final List results; 53 | 54 | public final List pagination; 55 | 56 | ///////////////////////////////////////////////////////////////////////////// 57 | // CONSTRUCTION 58 | ///////////////////////////////////////////////////////////////////////////// 59 | 60 | /** 61 | * Create a page without query or results. 
62 | * @param replayServer The URL of the replay server (including port and 63 | * optional path up to the collection name) 64 | * @param replayCollection The name of the collection to replay from 65 | * @param locale Locale of the user client 66 | * @param timeZone Time zone of the user client 67 | */ 68 | public UiPage( 69 | final String replayServer, final String replayCollection, 70 | final Locale locale, final TimeZone timeZone) { 71 | this.replayServer = Objects.requireNonNull(replayServer); 72 | this.replayCollection = Objects.requireNonNull(replayCollection); 73 | this.locale = locale.toString(); 74 | this.query = null; 75 | this.results = List.of(); 76 | this.pagination = List.of(); 77 | } 78 | 79 | /** 80 | * Create a page with query and results. 81 | * @param replayServer The URL of the replay server (including port and 82 | * optional path up to the collection name) 83 | * @param replayCollection The name of the collection to replay from 84 | * @param query The query for which the results were retrieved 85 | * @param paginatedResults The results for the specific page 86 | * @param pageNumber The number of the result page for the query 87 | * @param numPages The number of available result pages for the query 88 | * @param locale The locale of the user client 89 | * @param timeZone The time zone of the user client 90 | */ 91 | public UiPage( 92 | final String replayServer, final String replayCollection, 93 | final Query query, final List paginatedResults, 94 | final int pageNumber, final int numPages, 95 | final Locale locale, final TimeZone timeZone) { 96 | this.replayServer = Objects.requireNonNull(replayServer); 97 | this.replayCollection = Objects.requireNonNull(replayCollection); 98 | this.locale = locale.toString(); 99 | this.query = new UiQuery(query, pageNumber, timeZone); 100 | 101 | final List results = new ArrayList<>(); 102 | for (final Result result : paginatedResults) { 103 | results.add( 104 | new UiResult(replayServer, replayCollection, 
result, timeZone)); 105 | } 106 | this.results = Collections.unmodifiableList(results); 107 | 108 | final List pagination = new ArrayList<>(); 109 | final StringBuilder hrefBaseBuilder = new StringBuilder(); 110 | try { 111 | hrefBaseBuilder.append('?') 112 | .append(SearchServlet.REQUEST_PARAMETER_TERMS).append('=') 113 | .append(URLEncoder.encode(query.getTerms(), "UTF-8")); 114 | if (query.getFrom() != null) { 115 | hrefBaseBuilder.append('&') 116 | .append(SearchServlet.REQUEST_PARAMETER_FROM).append('=') 117 | .append(URLEncoder.encode(this.query.from.timePickerValue, "UTF-8")); 118 | } 119 | if (query.getTo() != null) { 120 | hrefBaseBuilder.append('&') 121 | .append(SearchServlet.REQUEST_PARAMETER_TO).append('=') 122 | .append(URLEncoder.encode(this.query.to.timePickerValue, "UTF-8")); 123 | } 124 | hrefBaseBuilder.append("&page="); 125 | } catch (final UnsupportedEncodingException e) { 126 | throw new RuntimeException(e); 127 | } 128 | final String hrefBase = hrefBaseBuilder.toString(); 129 | // to first 130 | pagination.add(new UiPaginationLink( 131 | 1, "«", hrefBase + "1", 132 | false, pageNumber == 1)); 133 | // pages 134 | for (int p = 1; p <= numPages; ++p) { 135 | pagination.add(new UiPaginationLink( 136 | p, String.valueOf(p), hrefBase + p, 137 | p == pageNumber, false)); 138 | } 139 | // to last 140 | pagination.add(new UiPaginationLink( 141 | numPages, "»", hrefBase + numPages, 142 | false, pageNumber == numPages)); 143 | this.pagination = Collections.unmodifiableList(pagination); 144 | } 145 | 146 | ///////////////////////////////////////////////////////////////////////////// 147 | // HELPER CLASSES 148 | 149 | /** 150 | * Model for a WASP query in a user interface web page. 
151 | * 152 | * @author johannes.kiesel@uni-weimar.de 153 | * 154 | */ 155 | public static final class UiQuery { 156 | 157 | /////////////////////////////////////////////////////////////////////////// 158 | // MEMBERS 159 | /////////////////////////////////////////////////////////////////////////// 160 | 161 | public final String terms; 162 | 163 | public final String termsUrl; 164 | 165 | public final UiInstant from; 166 | 167 | public final UiInstant to; 168 | 169 | public final int pageNumber; 170 | 171 | /////////////////////////////////////////////////////////////////////////// 172 | // CONSTRUCTION 173 | /////////////////////////////////////////////////////////////////////////// 174 | 175 | /** 176 | * Creates a new query for a WASP page. 177 | * @param query The original WASP query 178 | * @param timeZone The time zone of the user client 179 | */ 180 | protected UiQuery( 181 | final Query query, final int pageNumber, final TimeZone timeZone) { 182 | this.terms = query.getTerms(); 183 | try { 184 | this.termsUrl = URLEncoder.encode(this.terms, "UTF-8"); 185 | } catch (final UnsupportedEncodingException exception) { 186 | throw new RuntimeException(exception); 187 | } 188 | this.from = new UiInstant(query.getFrom(), timeZone, true, false); 189 | this.to = new UiInstant(query.getTo(), timeZone, false, true); 190 | this.pageNumber = pageNumber; 191 | } 192 | 193 | } 194 | 195 | /** 196 | * Model for a WASP result in a user interface web page. 
197 | * 198 | * @author johannes.kiesel@uni-weimar.de 199 | * 200 | */ 201 | public static final class UiResult { 202 | 203 | /////////////////////////////////////////////////////////////////////////// 204 | // MEMBERS 205 | /////////////////////////////////////////////////////////////////////////// 206 | 207 | public final String title; 208 | 209 | public final UiInstant date; 210 | 211 | public final String liveUri; 212 | 213 | public final String liveUriShortened; 214 | 215 | public final String replayUri; 216 | 217 | public final String snippet; 218 | 219 | /////////////////////////////////////////////////////////////////////////// 220 | // CONSTRUCTION 221 | /////////////////////////////////////////////////////////////////////////// 222 | 223 | /** 224 | * Creates a new result for a WASP page. 225 | * @param replayServer The URL of the replay server (including port and 226 | * optional path up to the collection name) 227 | * @param replayCollection The name of the collection to replay from 228 | * @param result One result retrieved for the query 229 | * @param timeZone The time zone of the user client 230 | */ 231 | protected UiResult( 232 | final String replayServer, final String replayCollection, 233 | final Result result, final TimeZone timeZone) { 234 | this.title = result.getResponse().getTitle(); 235 | 236 | this.date = new UiInstant( 237 | result.getMatchedRequest().getDate(), timeZone, false, false); 238 | 239 | this.liveUri = result.getMatchedRequest().getUri(); 240 | if (this.liveUri.length() <= MAX_URI_DISPLAY_LENGTH) { 241 | this.liveUriShortened = this.liveUri; 242 | } else { 243 | final int splitIndex = (MAX_URI_DISPLAY_LENGTH - 3) / 2; 244 | this.liveUriShortened = this.liveUri.substring(0, splitIndex) + "..." 
245 | + this.liveUri.substring(this.liveUri.length() - splitIndex); 246 | } 247 | 248 | this.replayUri = String.format("%s/%s/%s/%s", 249 | Objects.requireNonNull(replayServer), 250 | Objects.requireNonNull(replayCollection), 251 | this.date.replayPathValue, 252 | this.liveUri); 253 | 254 | this.snippet = result.getSnippet(); 255 | /* 256 | * StringEscapeUtils.escapeHtml4( 257 | * 258 | final Pattern highlightStartPattern = Pattern.compile("<em>"); 259 | final String startUnescaped = 260 | highlightStartPattern.matcher(htmlEscaped).replaceAll( 261 | ""); 262 | final Pattern highlightEndPattern = Pattern.compile("</em>"); 263 | return highlightEndPattern.matcher(startUnescaped).replaceAll(""); 264 | */ 265 | } 266 | 267 | } 268 | 269 | /** 270 | * Model for an instant in a user interface web page. 271 | * 272 | * @author johannes.kiesel@uni-weimar.de 273 | * 274 | */ 275 | public static final class UiInstant { 276 | 277 | /////////////////////////////////////////////////////////////////////////// 278 | // CONSTANTS 279 | /////////////////////////////////////////////////////////////////////////// 280 | 281 | protected static final DateTimeFormatter DATE_TIME_PICKER_FORMATTER = 282 | DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm"); 283 | 284 | protected static final DateTimeFormatter REPLAY_FORMATTER = 285 | DateTimeFormatter.ofPattern("yyyyMMddHHmmss"); 286 | 287 | /////////////////////////////////////////////////////////////////////////// 288 | // MEMBERS 289 | /////////////////////////////////////////////////////////////////////////// 290 | 291 | public final String text; 292 | 293 | public final String iso; 294 | 295 | public final String timePickerValue; 296 | 297 | public final String replayPathValue; 298 | 299 | /////////////////////////////////////////////////////////////////////////// 300 | // CONSTRUCTION 301 | /////////////////////////////////////////////////////////////////////////// 302 | 303 | /** 304 | * Creates a new instant for a WASP page. 
305 | * @param instant The instant or null for none 306 | * @param timeZone The time zone of the user client 307 | * @param isFrom Whether this instant denotes the start of a time interval 308 | * @param isTo Whether this instant denotes the end of a time interval 309 | */ 310 | protected UiInstant( 311 | final Instant instant, final TimeZone timeZone, 312 | final boolean isFrom, final boolean isTo) { 313 | if (instant == null) { 314 | this.iso = null; 315 | this.timePickerValue = null; 316 | this.replayPathValue = null; 317 | if (isFrom) { 318 | this.text = "beginning"; 319 | } else if (isTo) { 320 | this.text = "now"; 321 | } else { 322 | this.text = null; 323 | } 324 | } else { 325 | this.iso = instant.toString(); 326 | this.timePickerValue = DATE_TIME_PICKER_FORMATTER.format(instant 327 | .atZone(timeZone.toZoneId())); 328 | this.replayPathValue = REPLAY_FORMATTER.format(instant 329 | .atOffset(ZoneOffset.UTC)); 330 | this.text = this.timePickerValue; 331 | } 332 | } 333 | 334 | } 335 | 336 | /** 337 | * Model for a link to a different result page in a user interface web page. 338 | * 339 | * @author johannes.kiesel@uni-weimar.de 340 | * 341 | */ 342 | public static final class UiPaginationLink { 343 | 344 | /////////////////////////////////////////////////////////////////////////// 345 | // MEMBERS 346 | /////////////////////////////////////////////////////////////////////////// 347 | 348 | public final int number; 349 | 350 | public final String text; 351 | 352 | public final String link; 353 | 354 | public final boolean isActive; 355 | 356 | public final boolean isDisabled; 357 | 358 | /////////////////////////////////////////////////////////////////////////// 359 | // CONSTRUCTION 360 | /////////////////////////////////////////////////////////////////////////// 361 | 362 | /** 363 | * Creates a new pagination link for a WASP page. 
364 | * @param number The target page number 365 | * @param text The text to show 366 | * @param link The link to the page 367 | * @param isActive Whether this link leads to the current page 368 | * @param isDisabled Whether this link is disabled 369 | */ 370 | public UiPaginationLink( 371 | final int number, final String text, final String link, 372 | final boolean isActive, final boolean isDisabled) { 373 | this.number = number; 374 | this.text = Objects.requireNonNull(text); 375 | this.link = Objects.requireNonNull(link); 376 | this.isActive = isActive; 377 | this.isDisabled = isDisabled; 378 | } 379 | 380 | } 381 | 382 | public static void main(String[] args) { 383 | final MustacheFactory factory = new DefaultMustacheFactory(); 384 | final Mustache pageRenderer = factory.compile(new InputStreamReader( 385 | SearchServlet.class.getResourceAsStream("search.mustache")), 386 | "search.mustache"); 387 | final Query query = new Query("foo bar", null, Instant.now()); 388 | final List results = List.of( 389 | new Result(0.5, "my snippet", 390 | new ResponseRecord("foo", "bar", null, null), 391 | new RequestRecord("https://webis.de", Instant.now())), 392 | new Result(0.25, "my second snippet", 393 | new ResponseRecord("foo2", "bar2", null, null), 394 | new RequestRecord("https://webis.de", Instant.now()))); 395 | final int pageNumber = 1; 396 | final int numPages = 3; 397 | final UiPage page = new UiPage( 398 | "https://wasp.de", "mywasp", 399 | query, results, pageNumber, numPages, 400 | Locale.ENGLISH, TimeZone.getDefault()); 401 | final StringWriter writer = new StringWriter(); 402 | pageRenderer.execute(writer, page); 403 | System.out.println(writer.toString()); 404 | } 405 | 406 | } 407 | -------------------------------------------------------------------------------- /src/de/webis/wasp/warcs/ArchiveWatcher.java: -------------------------------------------------------------------------------- 1 | package de.webis.wasp.warcs; 2 | 3 | import java.io.File; 4 | import 
java.io.IOException; 5 | import java.nio.file.FileSystems; 6 | import java.nio.file.Path; 7 | import java.nio.file.StandardWatchEventKinds; 8 | import java.nio.file.WatchEvent; 9 | import java.nio.file.WatchKey; 10 | import java.nio.file.WatchService; 11 | import java.util.Arrays; 12 | import java.util.Comparator; 13 | import java.util.function.Consumer; 14 | import java.util.logging.Level; 15 | import java.util.logging.Logger; 16 | 17 | import edu.cmu.lemurproject.WarcRecord; 18 | 19 | /** 20 | * Creates a {@link Thread} that watches a web archives repository for new 21 | * content and passes the new records to a consumer. 22 | *

23 | * If archives exist already in the directory, they are read in order of their 24 | * last modified dates (if set so in the constructor). In this case, it will 25 | * monitor the latest archive for changes, but not the others! 26 | *

27 | * Currently, this treats every file (or directory) within the target directory 28 | * as an archive and tries to read from it. 29 | *

30 | * 31 | * @author johannes.kiesel@uni-weimar.de 32 | * 33 | */ 34 | public class ArchiveWatcher 35 | extends Thread 36 | implements AutoCloseable { 37 | 38 | ///////////////////////////////////////////////////////////////////////////// 39 | // LOGGING 40 | ///////////////////////////////////////////////////////////////////////////// 41 | 42 | private static final Logger LOG = 43 | Logger.getLogger(ArchiveWatcher.class.getName()); 44 | 45 | ///////////////////////////////////////////////////////////////////////////// 46 | // MEMBERS 47 | ///////////////////////////////////////////////////////////////////////////// 48 | 49 | private final Path directory; 50 | 51 | private final WatchService watchService; 52 | 53 | private final Consumer consumer; 54 | 55 | private WarcRecordReader reader; 56 | 57 | ///////////////////////////////////////////////////////////////////////////// 58 | // CONSTRUCTION 59 | ///////////////////////////////////////////////////////////////////////////// 60 | 61 | /** 62 | * Create a new watcher for given directory. 
63 | * @param directory The directory that contains the archive files 64 | * @param readExistingRecords Whether records that already exist in the 65 | * archives in the directory should be read 66 | * @param consumer The consumer to which the records will be passed 67 | * @throws IOException On reading records 68 | */ 69 | public ArchiveWatcher( 70 | final Path directory, final boolean readExistingRecords, 71 | final Consumer consumer) 72 | throws IOException { 73 | if (consumer == null) { throw new NullPointerException(); } 74 | this.directory = directory; 75 | this.consumer = consumer; 76 | this.reader = null; 77 | 78 | this.initForDirectory(readExistingRecords); 79 | 80 | this.watchService = FileSystems.getDefault().newWatchService(); 81 | this.getDirectory().register(this.getWatchService(), 82 | StandardWatchEventKinds.ENTRY_CREATE); 83 | } 84 | 85 | private void initForDirectory(final boolean readExistingRecords) 86 | throws IOException { 87 | final File[] children = this.getDirectory().toFile().listFiles(); 88 | Arrays.sort(children, new Comparator() { 89 | @Override 90 | public int compare(final File o1, final File o2) { 91 | return Long.compare(o1.lastModified(), o2.lastModified()); 92 | } 93 | }); 94 | 95 | if (readExistingRecords) { 96 | // Read what should be closed files 97 | if (children.length >= 2) { 98 | for (final File child 99 | : Arrays.copyOfRange(children, 0, children.length - 1)) { 100 | try (final WarcRecordReader reader = new WarcRecordReader( 101 | this.getDirectory().resolve(child.getName()), 102 | this.getConsumer())) { 103 | reader.run(); 104 | } 105 | } 106 | } 107 | } 108 | 109 | // Read what may be the open file 110 | if (children.length >= 1) { 111 | this.openFile(this.getDirectory().resolve( 112 | children[children.length - 1].getName()), readExistingRecords); 113 | } 114 | } 115 | 116 | ///////////////////////////////////////////////////////////////////////////// 117 | // GETTERS 118 | 
///////////////////////////////////////////////////////////////////////////// 119 | 120 | /** 121 | * Gets the directory being watched. 122 | * @return The directory 123 | */ 124 | public Path getDirectory() { 125 | return this.directory; 126 | } 127 | 128 | /** 129 | * Gets the service watching for changes in the directory. 130 | * @return The service 131 | */ 132 | protected WatchService getWatchService() { 133 | return this.watchService; 134 | } 135 | 136 | /** 137 | * Gets the consumer to which WARC records are passed to. 138 | * @return The consumer 139 | */ 140 | public Consumer getConsumer() { 141 | return this.consumer; 142 | } 143 | 144 | /** 145 | * Gets the current WARC record reader. 146 | * @return The reader 147 | */ 148 | protected WarcRecordReader getReader() { 149 | return this.reader; 150 | } 151 | 152 | ///////////////////////////////////////////////////////////////////////////// 153 | // SETTER 154 | ///////////////////////////////////////////////////////////////////////////// 155 | 156 | /** 157 | * Sets the WARC record reader. 
158 | * @param reader The reader 159 | */ 160 | public void setReader(final WarcRecordReader reader) { 161 | this.reader = reader; 162 | } 163 | 164 | ///////////////////////////////////////////////////////////////////////////// 165 | // FUNCTIONALITY 166 | ///////////////////////////////////////////////////////////////////////////// 167 | 168 | @Override 169 | public void run() { 170 | final Path directory = this.getDirectory(); 171 | try { 172 | while (true) { 173 | final WatchKey key = this.getWatchService().take(); 174 | for (final WatchEvent event : key.pollEvents()) { 175 | final WatchEvent.Kind kind = event.kind(); 176 | if (kind == StandardWatchEventKinds.ENTRY_CREATE) { 177 | final Path inputFile = directory.resolve((Path) event.context()); 178 | LOG.fine("New file created in " + directory + ": " + inputFile); 179 | this.openFile(inputFile, true); 180 | } else if (kind == StandardWatchEventKinds.OVERFLOW) { 181 | LOG.warning("Overflow detected when watching " + directory); 182 | } else { 183 | LOG.warning("Unknown watch event kind '" + kind + "' when watching " 184 | + directory); 185 | } 186 | } 187 | 188 | if (!key.reset()) { 189 | LOG.severe( 190 | "Directory " + directory + " can no longer be watched"); 191 | break; 192 | } 193 | } 194 | } catch (final InterruptedException exception) { 195 | LOG.log(Level.SEVERE, 196 | "Interrupted watching " + directory, exception); 197 | } catch (final IOException exception) { 198 | LOG.log(Level.SEVERE, "Error watching " + directory, exception); 199 | } 200 | } 201 | 202 | @Override 203 | public void close() throws IOException { 204 | this.closeFile(); 205 | } 206 | 207 | /** 208 | * Closes the currently opened file, if any. 
209 | * @throws IOException On closing the file 210 | * @see {@link #openFile(Path, boolean)} 211 | */ 212 | protected void closeFile() throws IOException { 213 | synchronized (this) { 214 | final WarcRecordReader reader = this.getReader(); 215 | if (reader != null) { 216 | this.setReader(null); 217 | reader.close(); 218 | } 219 | } 220 | } 221 | 222 | /** 223 | * Starts reading from a new file, keeping watch if records are appended. 224 | * @param inputFile The file to read 225 | * @param consumeExistingRecords Whether to also pass existing records to the 226 | * consumer 227 | * @throws IOException On opening the file 228 | */ 229 | protected void openFile( 230 | final Path inputFile, final boolean consumeExistingRecords) 231 | throws IOException { 232 | synchronized (this) { 233 | this.closeFile(); 234 | final WarcRecordReader reader = new ContinuousWarcRecordReader( 235 | inputFile, consumeExistingRecords, this.getConsumer(), 1000); 236 | this.setReader(reader); 237 | reader.start(); 238 | } 239 | } 240 | 241 | } 242 | -------------------------------------------------------------------------------- /src/de/webis/wasp/warcs/ContinuousWarcRecordReader.java: -------------------------------------------------------------------------------- 1 | package de.webis.wasp.warcs; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.FileNotFoundException; 6 | import java.io.IOException; 7 | import java.nio.file.Path; 8 | import java.util.function.Consumer; 9 | import java.util.logging.Level; 10 | import java.util.logging.Logger; 11 | 12 | import edu.cmu.lemurproject.WarcRecord; 13 | 14 | /** 15 | * A {@link WarcRecordReader} that will wait for new content even when it 16 | * reached the end of the archive. 17 | *

18 | * This class should be used for archives that are still filled. When you use 19 | * {@link #close()}, this reader will still continue to read until it 20 | * encounters the end of the file the next time. 21 | *

22 | * 23 | * @author johannes.kiesel@uni-weimar.de 24 | * 25 | */ 26 | public class ContinuousWarcRecordReader extends WarcRecordReader { 27 | 28 | ///////////////////////////////////////////////////////////////////////////// 29 | // LOGGING 30 | ///////////////////////////////////////////////////////////////////////////// 31 | 32 | private static final Logger LOG = 33 | Logger.getLogger(ContinuousWarcRecordReader.class.getName()); 34 | 35 | ///////////////////////////////////////////////////////////////////////////// 36 | // MEMBERS 37 | ///////////////////////////////////////////////////////////////////////////// 38 | 39 | protected final long pollIntervalMillis; 40 | 41 | protected boolean consume; 42 | 43 | protected boolean closed; 44 | 45 | ///////////////////////////////////////////////////////////////////////////// 46 | // CONSTRUCTORS 47 | ///////////////////////////////////////////////////////////////////////////// 48 | 49 | /** 50 | * Creates a new reader for an archive that is still being filled. 
51 | * @param inputFile The archive file 52 | * @param consumeExistingRecords Whether records that are already in the file 53 | * should also be consumed 54 | * @param consumer Consumer for the WARC records that are read 55 | * @param pollIntervalMillis On encountering the end of archive, poll the file 56 | * in this interval to check when it has more content 57 | * @throws IOException When the file can not be opened 58 | */ 59 | public ContinuousWarcRecordReader( 60 | final Path inputFile, final boolean consumeExistingRecords, 61 | final Consumer consumer, 62 | final long pollIntervalMillis) 63 | throws IOException { 64 | super(inputFile, consumer); 65 | this.pollIntervalMillis = pollIntervalMillis; 66 | this.consume = consumeExistingRecords; 67 | this.closed = false; 68 | } 69 | 70 | @Override 71 | protected FileInputStream openFileInputStream() 72 | throws IOException { 73 | final File file = this.getInputFile().toFile(); 74 | LOG.fine("Open file: " + file); 75 | return new ContinuousFileInputStream(file); 76 | } 77 | 78 | ///////////////////////////////////////////////////////////////////////////// 79 | // FUNCTIONALITY 80 | ///////////////////////////////////////////////////////////////////////////// 81 | 82 | @Override 83 | public void close() throws IOException { 84 | LOG.fine("Closing " + this.getInputFile()); 85 | this.closed = true; 86 | } 87 | 88 | @Override 89 | protected void consume(final WarcRecord record) { 90 | if (this.consume) { 91 | super.getConsumer().accept(record); 92 | } 93 | } 94 | 95 | protected void closeStream() throws IOException { 96 | super.close(); 97 | } 98 | 99 | ///////////////////////////////////////////////////////////////////////////// 100 | // HELPER CLASSES 101 | ///////////////////////////////////////////////////////////////////////////// 102 | 103 | /** 104 | * Modification of {@link FileInputStream} that does waits at the end of the 105 | * file for more content to appear. 
106 | * 107 | * @author johannes.kiesel@uni-weimar.de 108 | */ 109 | protected class ContinuousFileInputStream 110 | extends FileInputStream { 111 | 112 | public ContinuousFileInputStream(final File file) 113 | throws FileNotFoundException { 114 | super(file); 115 | } 116 | 117 | @Override 118 | public int available() throws IOException { 119 | int available = super.available(); 120 | try { 121 | while (available == 0 && !ContinuousWarcRecordReader.this.closed) { 122 | ContinuousWarcRecordReader.this.consume = true; 123 | Thread.sleep(ContinuousWarcRecordReader.this.pollIntervalMillis); 124 | available = super.available(); 125 | } 126 | } catch (final InterruptedException exception) { 127 | LOG.log(Level.WARNING, "Interrupted " + this, exception); 128 | } 129 | 130 | if (ContinuousWarcRecordReader.this.closed) { 131 | ContinuousWarcRecordReader.this.closeStream(); 132 | } 133 | return available; 134 | } 135 | 136 | @Override 137 | public int read() throws IOException { 138 | int read = super.read(); 139 | try { 140 | while (read == -1 && !ContinuousWarcRecordReader.this.closed) { 141 | ContinuousWarcRecordReader.this.consume = true; 142 | Thread.sleep(ContinuousWarcRecordReader.this.pollIntervalMillis); 143 | read = super.read(); 144 | } 145 | } catch (final InterruptedException exception) { 146 | LOG.log(Level.WARNING, "Interrupted " + this, exception); 147 | } 148 | 149 | if (ContinuousWarcRecordReader.this.closed) { 150 | ContinuousWarcRecordReader.this.closeStream(); 151 | } 152 | return read; 153 | } 154 | 155 | @Override 156 | public int read(byte b[]) throws IOException { 157 | int read = super.read(b); 158 | try { 159 | while (read == -1 && !ContinuousWarcRecordReader.this.closed) { 160 | ContinuousWarcRecordReader.this.consume = true; 161 | Thread.sleep(ContinuousWarcRecordReader.this.pollIntervalMillis); 162 | read = super.read(b); 163 | } 164 | } catch (final InterruptedException exception) { 165 | LOG.log(Level.WARNING, "Interrupted " + this, 
exception); 166 | } 167 | 168 | if (ContinuousWarcRecordReader.this.closed) { 169 | ContinuousWarcRecordReader.this.closeStream(); 170 | } 171 | return read; 172 | } 173 | 174 | @Override 175 | public int read(byte b[], int off, int len) throws IOException { 176 | int read = super.read(b, off, len); 177 | try { 178 | while (read == -1 && !ContinuousWarcRecordReader.this.closed) { 179 | ContinuousWarcRecordReader.this.consume = true; 180 | Thread.sleep(ContinuousWarcRecordReader.this.pollIntervalMillis); 181 | read = super.read(b, off, len); 182 | } 183 | } catch (final InterruptedException exception) { 184 | LOG.log(Level.WARNING, "Interrupted " + this, exception); 185 | } 186 | 187 | if (ContinuousWarcRecordReader.this.closed) { 188 | ContinuousWarcRecordReader.this.closeStream(); 189 | } 190 | return read; 191 | } 192 | 193 | } 194 | 195 | } 196 | -------------------------------------------------------------------------------- /src/de/webis/wasp/warcs/GenericHtmlWarcRecordConsumer.java: -------------------------------------------------------------------------------- 1 | package de.webis.wasp.warcs; 2 | 3 | import java.io.IOException; 4 | import java.time.Instant; 5 | import java.util.Objects; 6 | import java.util.function.Function; 7 | 8 | /** 9 | * Generic class for consuming HTML WARC records. 10 | * 11 | * @author johannes.kiesel@uni-weimar.de 12 | * 13 | */ 14 | public abstract class GenericHtmlWarcRecordConsumer 15 | extends GenericWarcRecordConsumer { 16 | 17 | ///////////////////////////////////////////////////////////////////////////// 18 | // CONSTANTS 19 | ///////////////////////////////////////////////////////////////////////////// 20 | 21 | /** 22 | * Default function for extracting HTML from response records. 
/**
 * The extractor that is used for HTML responses when none is specified.
 */
public static final Function DEFAULT_DOCUMENT_EXTRACTOR =
    JerichoDocumentExtractor.INSTANCE;

/////////////////////////////////////////////////////////////////////////////
// MEMBERS
/////////////////////////////////////////////////////////////////////////////

private Function documentExtractor;

/////////////////////////////////////////////////////////////////////////////
// CONSTRUCTION
/////////////////////////////////////////////////////////////////////////////

/**
 * Creates a new consumer that extracts documents from HTML responses with
 * the {@link #DEFAULT_DOCUMENT_EXTRACTOR}.
 */
public GenericHtmlWarcRecordConsumer() {
  this(DEFAULT_DOCUMENT_EXTRACTOR);
}

/**
 * Creates a new consumer that extracts documents from HTML responses with
 * the given extractor.
 * @param documentExtractor The extractor
 */
public GenericHtmlWarcRecordConsumer(
    final Function documentExtractor) {
  this.setDocumentExtractor(documentExtractor);
}

/////////////////////////////////////////////////////////////////////////////
// GETTERS
/////////////////////////////////////////////////////////////////////////////

/**
 * Returns the extractor that is applied to HTML responses.
 * @return The extractor
 */
public Function getDocumentExtractor() {
  return documentExtractor;
}

/////////////////////////////////////////////////////////////////////////////
// SETTERS
/////////////////////////////////////////////////////////////////////////////
71 | * @param documentExtractor The extractor 72 | */ 73 | protected void setDocumentExtractor( 74 | final Function documentExtractor) { 75 | this.documentExtractor = Objects.requireNonNull(documentExtractor); 76 | } 77 | 78 | ///////////////////////////////////////////////////////////////////////////// 79 | // FUNCTIONALITY 80 | ///////////////////////////////////////////////////////////////////////////// 81 | 82 | @Override 83 | protected void acceptHtmlResponse( 84 | final String id, final String uri, final String html, final Instant time) 85 | throws IOException { 86 | final Document document = this.getDocumentExtractor().apply(html); 87 | this.acceptHtmlResponse(id, uri, document, time); 88 | } 89 | 90 | protected abstract void acceptHtmlResponse( 91 | final String id, final String uri, final Document document, 92 | final Instant time) 93 | throws IOException; 94 | 95 | ///////////////////////////////////////////////////////////////////////////// 96 | // DOCUMENT 97 | ///////////////////////////////////////////////////////////////////////////// 98 | 99 | /** 100 | * A processed document. 101 | * 102 | * @author johannes.kiesel@uni-weimar.de 103 | * 104 | */ 105 | public static final class Document { 106 | 107 | private final String title; 108 | 109 | private final String content; 110 | 111 | /** 112 | * Creates a new document. 113 | * @param title The document's title (or null) 114 | * @param content The document's content (or null) 115 | */ 116 | public Document(final String title, final String content) { 117 | this.title = title; 118 | this.content = content; 119 | } 120 | 121 | /** 122 | * Gets the title of the document. 123 | * @return The title (may be null or empty) 124 | */ 125 | public String getTitle() { 126 | return this.title; 127 | } 128 | 129 | /** 130 | * Gets the text content of the document. 
131 | * @return The content (may be null or empty) 132 | */ 133 | public String getContent() { 134 | return this.content; 135 | } 136 | 137 | } 138 | 139 | } 140 | -------------------------------------------------------------------------------- /src/de/webis/wasp/warcs/GenericWarcRecordConsumer.java: -------------------------------------------------------------------------------- 1 | package de.webis.wasp.warcs; 2 | 3 | import java.io.IOException; 4 | import java.util.function.Consumer; 5 | import java.util.logging.Level; 6 | import java.util.logging.Logger; 7 | import java.time.Instant; 8 | 9 | import org.apache.http.HttpResponse; 10 | 11 | import edu.cmu.lemurproject.WarcRecord; 12 | 13 | /** 14 | * Generic class for consuming WARC records with methods for different records. 15 | * 16 | * @author johannes.kiesel@uni-weimar.de 17 | * 18 | */ 19 | public abstract class GenericWarcRecordConsumer 20 | implements Consumer { 21 | 22 | ///////////////////////////////////////////////////////////////////////////// 23 | // LOGGING 24 | ///////////////////////////////////////////////////////////////////////////// 25 | 26 | private static final Logger LOG = 27 | Logger.getLogger(GenericWarcRecordConsumer.class.getName()); 28 | 29 | ///////////////////////////////////////////////////////////////////////////// 30 | // FUNCTIONALITY 31 | ///////////////////////////////////////////////////////////////////////////// 32 | 33 | @Override 34 | public void accept(final WarcRecord record) { 35 | final String type = Warcs.getType(record); 36 | 37 | final Instant time = 38 | Instant.ofEpochSecond(Warcs.getDate(record).getEpochSecond()); 39 | try { 40 | switch (type) { 41 | case Warcs.HEADER_TYPE_RESPONSE: 42 | this.acceptResponse(record, time); 43 | break; 44 | case Warcs.HEADER_TYPE_REQUEST: 45 | this.acceptRequest(record, time); 46 | break; 47 | case Warcs.HEADER_TYPE_REVISIT: 48 | this.acceptRevisit(record, time); 49 | break; 50 | default: 51 | break; 52 | } 53 | } catch (final 
Throwable exception) { 54 | LOG.log(Level.WARNING, "Failed to index record " + Warcs.getId(record) 55 | + " of type " + Warcs.getType(record), exception); 56 | } 57 | } 58 | 59 | ///////////////////////////////////////////////////////////////////////////// 60 | // Response 61 | 62 | protected void acceptResponse(final WarcRecord record, final Instant time) 63 | throws IOException { 64 | final String id = Warcs.getId(record); 65 | final String uri = Warcs.getTargetUri(record); 66 | final String html = this.getHtml(record); 67 | if (html != null) { 68 | LOG.fine("accept html response " + id + " -> " + uri); 69 | this.acceptHtmlResponse(id, uri, html, time); 70 | } else { 71 | LOG.fine("accept non-html response " + id + " -> " + uri); 72 | this.acceptNonHtmlResponse(id, uri, time); 73 | } 74 | } 75 | 76 | protected void acceptNonHtmlResponse( 77 | final String id, final String uri, final Instant time) 78 | throws IOException { 79 | // do nothing by default 80 | } 81 | 82 | protected void acceptHtmlResponse( 83 | final String id, final String uri, final String html, final Instant time) 84 | throws IOException { 85 | // do nothing by default 86 | } 87 | 88 | ///////////////////////////////////////////////////////////////////////////// 89 | // Revisit 90 | 91 | protected void acceptRevisit(final WarcRecord record, final Instant time) 92 | throws IOException { 93 | this.acceptRevisit( 94 | Warcs.getId(record), 95 | Warcs.getReferedToTargetUri(record), 96 | Warcs.getReferedToDate(record), 97 | time); 98 | } 99 | 100 | protected void acceptRevisit( 101 | final String id, final String uri, final Instant originalTime, 102 | final Instant time) 103 | throws IOException { 104 | // do nothing by default 105 | } 106 | 107 | ///////////////////////////////////////////////////////////////////////////// 108 | // Request 109 | 110 | protected void acceptRequest(final WarcRecord record, final Instant time) 111 | throws IOException { 112 | this.acceptRequest( 113 | 
Warcs.getConcurrentRecordId(record), 114 | Warcs.getTargetUri(record), 115 | time); 116 | } 117 | 118 | protected void acceptRequest( 119 | final String concurrentRecordId, 120 | final String targetUri, 121 | final Instant time) 122 | throws IOException { 123 | // do nothing by default 124 | } 125 | 126 | ///////////////////////////////////////////////////////////////////////////// 127 | // HELPERS 128 | ///////////////////////////////////////////////////////////////////////////// 129 | 130 | /** 131 | * Gets the HTML from a response WARC record. 132 | * @param record The record 133 | * @return The HTML if it exists, or null 134 | */ 135 | protected String getHtml(final WarcRecord record) { 136 | try { 137 | final HttpResponse response = Warcs.toResponse(record); 138 | if (Warcs.isHtml(response)) { 139 | return Warcs.getHtml(record); 140 | } 141 | } catch (final Throwable exception) { 142 | LOG.log(Level.FINER, 143 | "Could not parse record " + Warcs.getId(record), 144 | exception); 145 | } 146 | return null; 147 | } 148 | 149 | } 150 | -------------------------------------------------------------------------------- /src/de/webis/wasp/warcs/JerichoDocumentExtractor.java: -------------------------------------------------------------------------------- 1 | package de.webis.wasp.warcs; 2 | 3 | import java.util.Objects; 4 | import java.util.function.Function; 5 | 6 | import de.webis.wasp.warcs.GenericHtmlWarcRecordConsumer.Document; 7 | import net.htmlparser.jericho.CharacterReference; 8 | import net.htmlparser.jericho.Element; 9 | import net.htmlparser.jericho.HTMLElementName; 10 | import net.htmlparser.jericho.Renderer; 11 | import net.htmlparser.jericho.Source; 12 | 13 | /** 14 | * A document extractor using Jericho HTML parser. 15 | * 16 | * @author johannes.kiesel@uni-weimar.de 17 | * 18 | */ 19 | public class JerichoDocumentExtractor 20 | implements Function { 21 | 22 | /** 23 | * The single instance of the extractor. 
24 | */ 25 | public static final JerichoDocumentExtractor INSTANCE = 26 | new JerichoDocumentExtractor(); 27 | 28 | protected JerichoDocumentExtractor() { } 29 | 30 | @Override 31 | public Document apply(final String html) { 32 | final Source source = new Source(Objects.requireNonNull(html)); 33 | 34 | final Renderer renderer = new Renderer(source); 35 | renderer.setMaxLineLength(0); 36 | renderer.setIncludeHyperlinkURLs(false); 37 | renderer.setIncludeAlternateText(true); 38 | final String content = renderer.toString(); 39 | 40 | final Element titleElement = 41 | source.getFirstElement(HTMLElementName.TITLE); 42 | final String title = titleElement == null 43 | ? null 44 | : CharacterReference.decodeCollapseWhiteSpace( 45 | titleElement.getContent()); 46 | 47 | return new Document(title, content); 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /src/de/webis/wasp/warcs/WarcRecordReader.java: -------------------------------------------------------------------------------- 1 | package de.webis.wasp.warcs; 2 | 3 | import java.io.DataInputStream; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStream; 8 | import java.nio.file.Path; 9 | import java.util.function.Consumer; 10 | import java.util.logging.Level; 11 | import java.util.logging.Logger; 12 | import java.util.zip.GZIPInputStream; 13 | 14 | import edu.cmu.lemurproject.WarcRecord; 15 | 16 | /** 17 | * Reader for WARC files that passes all read records to a consumer. 18 | *

19 | * Use the {@link #run()} or {@link #start()} methods to begin reading. 20 | *

21 | * If the archive is still being filled, use {@link ContinuousWarcRecordReader} 22 | * instead. 23 | *

24 | * 25 | * @author johannes.kiesel@uni-weimar.de 26 | * 27 | */ 28 | public class WarcRecordReader 29 | extends Thread 30 | implements AutoCloseable { 31 | 32 | ///////////////////////////////////////////////////////////////////////////// 33 | // LOGGING 34 | ///////////////////////////////////////////////////////////////////////////// 35 | 36 | private static final Logger LOG = 37 | Logger.getLogger(WarcRecordReader.class.getName()); 38 | 39 | ///////////////////////////////////////////////////////////////////////////// 40 | // MEMBERS 41 | ///////////////////////////////////////////////////////////////////////////// 42 | 43 | private final Consumer consumer; 44 | 45 | private final Path inputFile; 46 | 47 | private final DataInputStream input; 48 | 49 | ///////////////////////////////////////////////////////////////////////////// 50 | // CONSTRUCTION 51 | ///////////////////////////////////////////////////////////////////////////// 52 | 53 | /** 54 | * Creates a new reader for an archive. 55 | * @param inputFile The archive file 56 | * @param consumer Consumer for the WARC records that are read 57 | * @throws IOException When the file can not be opened 58 | */ 59 | public WarcRecordReader( 60 | final Path inputFile, final Consumer consumer) 61 | throws IOException { 62 | if (consumer == null) { throw new NullPointerException(); } 63 | this.consumer = consumer; 64 | this.inputFile = inputFile; 65 | this.input = this.openDataInputStream(); 66 | } 67 | 68 | /** 69 | * Opens a data input stream to the reader's file, applying GZip decompression 70 | * if the file ends on .gz. 
/**
 * Opens a data input stream to the reader's file, applying GZip decompression
 * if the file name ends on <code>.gz</code> (ignoring case).
 * <p>
 * If the file stream can not be wrapped, it is closed again before the
 * exception is passed on.
 * </p>
 * @return The input stream
 * @throws IOException On opening the file or on setting up the decompression
 */
protected DataInputStream openDataInputStream()
throws IOException {
  final InputStream fileStream = this.openFileInputStream();
  try {
    // Locale.ROOT: the suffix check must not depend on the default locale
    final boolean gzipped = this.getInputFile().toString()
        .toLowerCase(java.util.Locale.ROOT).endsWith(".gz");
    return new DataInputStream(
        gzipped ? new GZIPInputStream(fileStream) : fileStream);
  } catch (final IOException | RuntimeException exception) {
    // the GZIPInputStream constructor already reads from the stream and may
    // fail; do not leave the file handle open in that case
    try {
      fileStream.close();
    } catch (final IOException closeException) {
      exception.addSuppressed(closeException);
    }
    throw exception;
  }
}

/**
 * Opens an input stream to the reader's file.
 * @return The input stream
 * @throws IOException On opening the file
 */
protected FileInputStream openFileInputStream()
throws IOException {
  final File file = this.getInputFile().toFile();
  LOG.fine("Open file: " + file);
  return new FileInputStream(file);
}

/////////////////////////////////////////////////////////////////////////////
// GETTERS
/////////////////////////////////////////////////////////////////////////////

/**
 * Gets the file this reader reads from.
 * @return The file
 */
public Path getInputFile() {
  return this.inputFile;
}

/**
 * Gets the consumer to which WARC records are passed to.
 * @return The consumer
 */
public Consumer getConsumer() {
  return this.consumer;
}
118 | * @return The stream 119 | */ 120 | protected DataInputStream getInput() { 121 | return this.input; 122 | } 123 | 124 | ///////////////////////////////////////////////////////////////////////////// 125 | // FUNCTIONALITY 126 | ///////////////////////////////////////////////////////////////////////////// 127 | 128 | @Override 129 | public void run() { 130 | final DataInputStream input = this.getInput(); 131 | try { 132 | WarcRecord record = WarcRecord.readNextWarcRecord(input); 133 | while (record != null) { 134 | this.consume(record); 135 | record = WarcRecord.readNextWarcRecord(input); 136 | } 137 | LOG.fine("Finished " + this); 138 | this.close(); 139 | } catch (final IOException exception) { 140 | LOG.log(Level.SEVERE, 141 | "Error while reading from " + this.getInputFile(), exception); 142 | } 143 | } 144 | 145 | @Override 146 | public void close() throws IOException { 147 | LOG.fine("Close file " + this.getInputFile()); 148 | this.getInput().close(); 149 | } 150 | 151 | @Override 152 | public String toString() { 153 | return this.getInputFile() + " -> " + this.getConsumer(); 154 | } 155 | 156 | ///////////////////////////////////////////////////////////////////////////// 157 | // HELPERS 158 | ///////////////////////////////////////////////////////////////////////////// 159 | 160 | /** 161 | * Passes the record to the consumer. 
162 | * @param record The record 163 | * @see #getConsumer() 164 | */ 165 | protected void consume(final WarcRecord record) { 166 | this.getConsumer().accept(record); 167 | } 168 | 169 | } 170 | -------------------------------------------------------------------------------- /src/de/webis/wasp/warcs/Warcs.java: -------------------------------------------------------------------------------- 1 | package de.webis.wasp.warcs; 2 | 3 | import java.io.ByteArrayInputStream; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import java.time.Instant; 7 | import java.time.format.DateTimeFormatter; 8 | import java.util.Locale; 9 | import java.util.regex.Pattern; 10 | import java.util.zip.GZIPInputStream; 11 | 12 | import org.apache.http.Header; 13 | import org.apache.http.HeaderElement; 14 | import org.apache.http.HttpEntity; 15 | import org.apache.http.HttpException; 16 | import org.apache.http.HttpResponse; 17 | import org.apache.http.ParseException; 18 | import org.apache.http.client.entity.DecompressingEntity; 19 | import org.apache.http.client.entity.DeflateInputStream; 20 | import org.apache.http.client.entity.InputStreamFactory; 21 | import org.apache.http.config.Lookup; 22 | import org.apache.http.config.MessageConstraints; 23 | import org.apache.http.config.RegistryBuilder; 24 | import org.apache.http.entity.BasicHttpEntity; 25 | import org.apache.http.entity.ContentLengthStrategy; 26 | import org.apache.http.impl.DefaultHttpResponseFactory; 27 | import org.apache.http.impl.entity.LaxContentLengthStrategy; 28 | import org.apache.http.impl.io.ChunkedInputStream; 29 | import org.apache.http.impl.io.ContentLengthInputStream; 30 | import org.apache.http.impl.io.DefaultHttpResponseParser; 31 | import org.apache.http.impl.io.EmptyInputStream; 32 | import org.apache.http.impl.io.HttpTransportMetricsImpl; 33 | import org.apache.http.impl.io.IdentityInputStream; 34 | import org.apache.http.impl.io.SessionInputBufferImpl; 35 | import 
org.apache.http.io.SessionInputBuffer; 36 | import org.apache.http.protocol.HTTP; 37 | import org.apache.http.util.EntityUtils; 38 | 39 | import edu.cmu.lemurproject.WarcRecord; 40 | 41 | /** 42 | * Utility class for working with WARC files. 43 | * 44 | * @author johannes.kiesel@uni-weimar.de 45 | * 46 | */ 47 | public class Warcs { 48 | 49 | public static final String HEADER_ID = "WARC-Record-ID"; 50 | 51 | public static final String HEADER_TYPE = "WARC-Type"; 52 | 53 | public static final String HEADER_TYPE_INFO = "warcinfo"; 54 | 55 | public static final String HEADER_TYPE_REQUEST = "request"; 56 | 57 | public static final String HEADER_TYPE_RESPONSE = "response"; 58 | 59 | public static final String HEADER_TYPE_REVISIT = "revisit"; 60 | 61 | public static final String HEADER_REFERS_TO = "WARC-Refers-To"; 62 | 63 | public static final String HEADER_REFERS_TO_TARGET_URI ="WARC-Refers-To-Target-URI"; 64 | 65 | public static final String HEADER_REFERS_TO_DATE = "WARC-Refers-To-Date"; 66 | 67 | public static final String HEADER_TARGET_URI = "WARC-Target-URI"; 68 | 69 | public static final String HEADER_CONCURRENT = "WARC-Concurrent-To"; 70 | 71 | public static final String HEADER_DATE = "WARC-Date"; 72 | 73 | public static final DateTimeFormatter HEADER_DATE_FORMAT = 74 | DateTimeFormatter.ISO_INSTANT; 75 | 76 | 77 | public static final Pattern HTTP_HEADER_CONTENT_TYPE_HTML = Pattern.compile( 78 | "text/html.*"); 79 | 80 | public static final String HTTP_HEADER_CONTENT_TYPE = "Content-Type"; 81 | 82 | ///////////////////////////////////////////////////////////////////////////// 83 | // STATIC HELPERS 84 | ///////////////////////////////////////////////////////////////////////////// 85 | 86 | private final static InputStreamFactory GZIP = new InputStreamFactory() { 87 | @Override 88 | public InputStream create(final InputStream instream) throws IOException { 89 | return new GZIPInputStream(instream); 90 | } 91 | }; 92 | 93 | private final static InputStreamFactory 
DEFLATE = new InputStreamFactory() { 94 | @Override 95 | public InputStream create(final InputStream instream) throws IOException { 96 | return new DeflateInputStream(instream); 97 | } 98 | }; 99 | 100 | ///////////////////////////////////////////////////////////////////////////// 101 | // CONSTRUCTORS 102 | ///////////////////////////////////////////////////////////////////////////// 103 | 104 | // Utility class 105 | private Warcs() { } 106 | 107 | ///////////////////////////////////////////////////////////////////////////// 108 | // FUNCTIONALITY 109 | ///////////////////////////////////////////////////////////////////////////// 110 | 111 | ///////////////////////////////////////////////////////////////////////////// 112 | // Access header fields 113 | 114 | public static String getHeader( 115 | final WarcRecord record, final String header) { 116 | return record.getHeaderMetadataItem(header); 117 | } 118 | 119 | public static String getId(final WarcRecord record) { 120 | return Warcs.getHeader(record, HEADER_ID); 121 | } 122 | 123 | public static String getType(final WarcRecord record) { 124 | return Warcs.getHeader(record, HEADER_TYPE); 125 | } 126 | 127 | public static Instant getDate(final WarcRecord record) { 128 | final String date = Warcs.getHeader(record, HEADER_DATE); 129 | return Instant.from(HEADER_DATE_FORMAT.parse(date)); 130 | } 131 | 132 | public static String getTargetUri(final WarcRecord record) { 133 | return Warcs.getHeader(record, HEADER_TARGET_URI); 134 | } 135 | 136 | public static String getConcurrentRecordId(final WarcRecord record) { 137 | return Warcs.getHeader(record, HEADER_CONCURRENT); 138 | } 139 | 140 | public static String getReferedToRecordId(final WarcRecord record) { 141 | return Warcs.getHeader(record, HEADER_REFERS_TO); 142 | } 143 | 144 | public static String getReferedToTargetUri(final WarcRecord record) { 145 | return Warcs.getHeader(record, HEADER_REFERS_TO_TARGET_URI); 146 | } 147 | 148 | public static Instant 
getReferedToDate(final WarcRecord record) { 149 | final String date = Warcs.getHeader(record, HEADER_REFERS_TO_DATE); 150 | return Instant.from(HEADER_DATE_FORMAT.parse(date)); 151 | } 152 | 153 | ///////////////////////////////////////////////////////////////////////////// 154 | // HTML 155 | 156 | /** 157 | * Checks if this is a HTML response record. 158 | */ 159 | public static boolean isHtml(final WarcRecord record) 160 | throws HttpException, IOException { 161 | if (record == null) { return false; } 162 | final HttpResponse response = Warcs.toResponse(record); 163 | return Warcs.isHtml(response); 164 | } 165 | 166 | /** 167 | * Checks if this is a HTML response. 168 | */ 169 | public static boolean isHtml(final HttpResponse response) { 170 | if (response == null) { return false; } 171 | 172 | final String contentType = 173 | response.getLastHeader(HTTP_HEADER_CONTENT_TYPE).getValue(); 174 | if (contentType == null) { return false; } // no content type 175 | 176 | if (!HTTP_HEADER_CONTENT_TYPE_HTML.matcher(contentType).matches()) { 177 | return false; // not HTML content type 178 | } 179 | 180 | return true; 181 | } 182 | 183 | /** 184 | * Gets the HTML part of a record or null if there is none or an 185 | * invalid one. 186 | */ 187 | public static String getHtml(final WarcRecord record) 188 | throws ParseException, IOException, HttpException { 189 | final HttpResponse response = Warcs.toResponse(record); 190 | if (!Warcs.isHtml(response)) { return null; } // no HTML record 191 | 192 | final HttpEntity entity = response.getEntity(); 193 | final String defaultCharset = null; 194 | return EntityUtils.toString(entity, defaultCharset); 195 | } 196 | 197 | /** 198 | * Gets an {@link HttpResponse} object from a WARC record of such a response. 
199 | * @return The response or null when the record is not a response 200 | * record 201 | */ 202 | public static HttpResponse toResponse(final WarcRecord record) 203 | throws IOException, HttpException { 204 | // based on http://stackoverflow.com/a/26586178 205 | if (!record.getHeaderRecordType().equals("response")) { return null; } 206 | 207 | final SessionInputBufferImpl sessionInputBuffer = 208 | new SessionInputBufferImpl(new HttpTransportMetricsImpl(), 2048); 209 | final InputStream inputStream = 210 | new ByteArrayInputStream(record.getByteContent()); 211 | sessionInputBuffer.bind(inputStream); 212 | final MessageConstraints constraints = MessageConstraints.DEFAULT; 213 | final DefaultHttpResponseParser parser = 214 | new DefaultHttpResponseParser( 215 | sessionInputBuffer, null, new DefaultHttpResponseFactory(), 216 | constraints); 217 | final HttpResponse response = parser.parse(); 218 | final HttpEntity entity = Warcs.getEntity(response, sessionInputBuffer); 219 | response.setEntity(entity); 220 | Warcs.encodeEntity(response); 221 | return response; 222 | } 223 | 224 | 225 | private static void encodeEntity(final HttpResponse response) 226 | throws HttpException, IOException { 227 | // Adapted from org.apache.http.client.protocol.ResponseContentEncoding 228 | final HttpEntity entity = response.getEntity(); 229 | 230 | // entity can be null in case of 304 Not Modified, 204 No Content or similar 231 | // check for zero length entity. 
232 | if (entity != null && entity.getContentLength() != 0) { 233 | final Header ceheader = entity.getContentEncoding(); 234 | if (ceheader != null) { 235 | final HeaderElement[] codecs = ceheader.getElements(); 236 | final Lookup decoderRegistry = 237 | RegistryBuilder.create() 238 | .register("gzip", GZIP) 239 | .register("x-gzip", GZIP) 240 | .register("deflate", DEFLATE) 241 | .build(); 242 | for (final HeaderElement codec : codecs) { 243 | final String codecname = codec.getName().toLowerCase(Locale.ROOT); 244 | final InputStreamFactory decoderFactory = 245 | decoderRegistry.lookup(codecname); 246 | if (decoderFactory != null) { 247 | response.setEntity(new DecompressingEntity( 248 | response.getEntity(), decoderFactory)); 249 | response.removeHeaders("Content-Length"); 250 | response.removeHeaders("Content-Encoding"); 251 | response.removeHeaders("Content-MD5"); 252 | } else { 253 | if (!"identity".equals(codecname)) { 254 | throw new HttpException( 255 | "Unsupported Content-Encoding: " + codec.getName()); 256 | } 257 | } 258 | } 259 | } 260 | } 261 | } 262 | 263 | private static InputStream createInputStream( 264 | final long len, final SessionInputBuffer input) { 265 | // Adapted from the org.apache.http.impl.BHttpConnectionBase 266 | if (len == ContentLengthStrategy.CHUNKED) { 267 | return new ChunkedInputStream(input); 268 | } else if (len == ContentLengthStrategy.IDENTITY) { 269 | return new IdentityInputStream(input); 270 | } else if (len == 0L) { 271 | return EmptyInputStream.INSTANCE; 272 | } else { 273 | return new ContentLengthInputStream(input, len); 274 | } 275 | } 276 | 277 | private static HttpEntity getEntity( 278 | final HttpResponse response, final SessionInputBuffer input) 279 | throws HttpException { 280 | // Adapted from the org.apache.http.impl.BHttpConnectionBase 281 | final BasicHttpEntity entity = new BasicHttpEntity(); 282 | 283 | final long len = 284 | new LaxContentLengthStrategy().determineLength(response); 285 | final InputStream 
instream = Warcs.createInputStream(len, input); 286 | if (len == ContentLengthStrategy.CHUNKED) { 287 | entity.setChunked(true); 288 | entity.setContentLength(-1); 289 | entity.setContent(instream); 290 | } else if (len == ContentLengthStrategy.IDENTITY) { 291 | entity.setChunked(false); 292 | entity.setContentLength(-1); 293 | entity.setContent(instream); 294 | } else { 295 | entity.setChunked(false); 296 | entity.setContentLength(len); 297 | entity.setContent(instream); 298 | } 299 | 300 | final Header contentTypeHeader = 301 | response.getFirstHeader(HTTP.CONTENT_TYPE); 302 | if (contentTypeHeader != null) { 303 | entity.setContentType(contentTypeHeader); 304 | } 305 | final Header contentEncodingHeader = 306 | response.getFirstHeader(HTTP.CONTENT_ENCODING); 307 | if (contentEncodingHeader != null) { 308 | entity.setContentEncoding(contentEncodingHeader); 309 | } 310 | return entity; 311 | } 312 | 313 | } 314 | -------------------------------------------------------------------------------- /src/edu/cmu/lemurproject/WarcRecord.java: -------------------------------------------------------------------------------- 1 | /* 2 | Lemur License Agreement 3 | 4 | Copyright (c) 2000-2011 The Lemur Project. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions 8 | are met: 9 | 10 | 1. Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | 13 | 2. Redistributions in binary form must reproduce the above copyright 14 | notice, this list of conditions and the following disclaimer in 15 | the documentation and/or other materials provided with the 16 | distribution. 17 | 18 | 3. The names "Lemur", "Indri", "University of Massachusetts" and 19 | "Carnegie Mellon" must not be used to endorse or promote products 20 | derived from this software without prior written permission. 
To 21 | obtain permission, contact license@lemurproject.org 22 | 23 | 4. Products derived from this software may not be called "Lemur" or "Indri" 24 | nor may "Lemur" or "Indri" appear in their names without prior written 25 | permission of The Lemur Project. To obtain permission, 26 | contact license@lemurproject.org. 27 | 28 | THIS SOFTWARE IS PROVIDED BY THE LEMUR PROJECT AND OTHER 29 | CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 30 | BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 31 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 32 | COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 33 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 34 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS 35 | OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 36 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 37 | TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE 38 | USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 39 | DAMAGE. 40 | 41 | */ 42 | /* 43 | * To change this template, choose Tools | Templates 44 | * and open the template in the editor. 
/**
 * One record of a WARC file: a {@link WarcHeader} plus the raw content bytes.
 *
 * <p>Records are read from a stream with
 * {@link #readNextWarcRecord(DataInputStream)}. The fields WARC-Type,
 * WARC-Date, WARC-Record-ID, Content-Type and Content-Length are kept in
 * dedicated header fields; every other header line is kept in the header's
 * metadata map.
 *
 * <p>The version line and the line ending of the most recently read line are
 * stored in static fields, so they are shared by all readers in the JVM.
 *
 * @author mhoy
 */
public class WarcRecord {

  /** Prefix that identifies the first line of a record. */
  public static String WARC_VERSION = "WARC/";
  /** Version line of the most recently read record; used when serializing headers. */
  public static String WARC_VERSION_LINE = "WARC/0.18\n";

  private static final String NEWLINE = "\n";
  private static final String CR_NEWLINE = "\r\n";

  /**
   * Separator used internally between the header lines handed from
   * readNextRecord to readNextWarcRecord. It is a constant (and not
   * LINE_ENDING) because LINE_ENDING may change while a header is being read.
   */
  private static final String HEADER_LINE_SEPARATOR = "\n";

  private static final byte MASK_THREE_BYTE_CHAR = (byte) (0xE0);
  private static final byte MASK_TWO_BYTE_CHAR = (byte) (0xC0);
  private static final byte MASK_TOPMOST_BIT = (byte) (0x80);
  private static final byte MASK_BOTTOM_SIX_BITS = (byte) (0x3F);
  private static final byte MASK_BOTTOM_FIVE_BITS = (byte) (0x1F);
  private static final byte MASK_BOTTOM_FOUR_BITS = (byte) (0x0F);

  /** Line ending of the most recently read line; used when serializing. */
  private static String LINE_ENDING = "\n";

  /**
   * Reads one line, decoding two- and three-byte UTF-8 sequences by hand so
   * that no bytes beyond the line feed are consumed.
   *
   * @param in the stream to read from
   * @return the line without its line feed (a carriage return before the line
   *     feed is kept), or {@code null} if the stream ended before a line feed
   * @throws IOException if reading fails for a reason other than end of stream
   */
  private static String readLineFromInputStream(DataInputStream in) throws IOException {
    StringBuilder line = new StringBuilder();
    boolean previousWasCr = false;
    try {
      while (true) {
        char thisChar;
        byte first = in.readByte();
        if ((first & MASK_THREE_BYTE_CHAR) == MASK_THREE_BYTE_CHAR) {
          previousWasCr = false;
          // need to read the next 2 bytes
          if (in.available() < 2) {
            // treat these all as individual characters
            line.append((char) first);
            int numAvailable = in.available();
            for (int i = 0; i < numAvailable; i++) {
              line.append((char) (in.readByte()));
            }
            continue;
          }
          byte second = in.readByte();
          byte third = in.readByte();
          // continuation bytes must have the topmost bit set
          if (((second & MASK_TOPMOST_BIT) != MASK_TOPMOST_BIT)
              || ((third & MASK_TOPMOST_BIT) != MASK_TOPMOST_BIT)) {
            // treat these as individual characters
            line.append((char) first);
            line.append((char) second);
            line.append((char) third);
            continue;
          }
          thisChar = (char) ((third & MASK_BOTTOM_SIX_BITS)
              + 64 * (second & MASK_BOTTOM_SIX_BITS)
              + 4096 * (first & MASK_BOTTOM_FOUR_BITS));
        } else if ((first & MASK_TWO_BYTE_CHAR) == MASK_TWO_BYTE_CHAR) {
          previousWasCr = false;
          // need to read next byte
          if (in.available() < 1) {
            // treat this as an individual character
            line.append((char) first);
            continue;
          }
          byte second = in.readByte();
          if ((second & MASK_TOPMOST_BIT) != MASK_TOPMOST_BIT) {
            line.append((char) first);
            line.append((char) second);
            continue;
          }
          thisChar = (char) ((second & MASK_BOTTOM_SIX_BITS)
              + 64 * (first & MASK_BOTTOM_FIVE_BITS));
        } else {
          // interpret it as a single byte
          thisChar = (char) first;
        }

        if (thisChar == '\n') {
          // CRLF only if the carriage return directly precedes the line feed
          LINE_ENDING = previousWasCr ? CR_NEWLINE : NEWLINE;
          return line.toString();
        }
        previousWasCr = (thisChar == '\r');
        line.append(thisChar);
      }
    } catch (EOFException eofEx) {
      return null;
    }
  }

  /**
   * Reads the next record from the stream.
   *
   * <p>Skips lines until one starts with {@link #WARC_VERSION}, then collects
   * the non-empty header lines up to the first empty line that follows a
   * valid Content-Length line, then reads that many content bytes. Leading
   * CR LF pairs in front of the content are skipped when the content length
   * is at least two.
   *
   * @param in the stream to read from; must not be wrapped in a buffering
   *     reader since the content is read as raw bytes
   * @param headerBuffer receives the trimmed header lines, each followed by
   *     {@link #HEADER_LINE_SEPARATOR}
   * @return the content bytes, or {@code null} if an argument is
   *     {@code null}, no version line was found, or the stream ended inside
   *     the header or the content
   * @throws IOException if reading fails
   */
  private static byte[] readNextRecord(DataInputStream in, StringBuilder headerBuffer)
      throws IOException {
    if (in == null) { return null; }
    if (headerBuffer == null) { return null; }

    // first - find our WARC header
    String line = null;
    boolean foundMark = false;
    while ((!foundMark) && ((line = readLineFromInputStream(in)) != null)) {
      if (line.startsWith(WARC_VERSION)) {
        WARC_VERSION_LINE = line;
        foundMark = true;
      }
    }

    // no WARC mark?
    if (!foundMark) { return null; }

    // read until we see contentLength then an empty line
    // (to handle malformed ClueWeb09 headers that have blank lines)
    int contentLength = -1;
    while (true) {
      String rawLine = readLineFromInputStream(in);
      if (rawLine == null) {
        // stream ended inside the header: there is no complete record
        return null;
      }
      line = rawLine.trim();
      if (line.length() == 0) {
        if (contentLength >= 0) { break; }
        continue;
      }

      headerBuffer.append(line);
      headerBuffer.append(HEADER_LINE_SEPARATOR);

      // find the content length designated by Content-Length:
      String[] parts = line.split(":", 2);
      if (parts.length == 2 && parts[0].equals("Content-Length")) {
        try {
          contentLength = Integer.parseInt(parts[1].trim());
        } catch (NumberFormatException nfEx) {
          contentLength = -1;
        }
      }
    }

    // now read the bytes of the content
    byte[] retContent = new byte[contentLength];
    int totalRead = 0;

    // Skip leading CR LF pairs. Only a real CR LF pair is skipped; any other
    // pair is the start of the content. Two bytes are consumed at a time, so
    // this is only done when the content has room for both of them.
    while ((totalRead == 0) && (contentLength >= 2)) {
      byte first = in.readByte();
      byte second = in.readByte();
      if ((first == '\r') && (second == '\n')) {
        continue;
      }
      retContent[0] = first;
      retContent[1] = second;
      totalRead = 2;
    }

    while (totalRead < contentLength) {
      try {
        int numRead = in.read(retContent, totalRead, contentLength - totalRead);
        if (numRead < 0) {
          return null;
        }
        totalRead += numRead;
      } catch (EOFException eofEx) {
        // resize to what we have
        if (totalRead > 0) {
          byte[] newReturn = new byte[totalRead];
          System.arraycopy(retContent, 0, newReturn, 0, totalRead);
          return newReturn;
        }
        return null;
      }
    }

    return retContent;
  }

  /**
   * Reads the next WARC record from the stream.
   *
   * @param in the stream to read from
   * @return the record, or {@code null} if no further complete record could
   *     be read
   * @throws IOException if reading fails
   */
  public static WarcRecord readNextWarcRecord(DataInputStream in) throws IOException {
    StringBuilder recordHeader = new StringBuilder();
    byte[] recordContent = readNextRecord(in, recordHeader);
    if (recordContent == null) {
      return null;
    }

    // extract out our header information
    String[] headerLines = recordHeader.toString().split(HEADER_LINE_SEPARATOR);

    WarcRecord retRecord = new WarcRecord();
    for (String headerLine : headerLines) {
      String[] pieces = headerLine.split(":", 2);
      if (pieces.length != 2) {
        // line without a colon: keep it as a key with an empty value
        retRecord.addHeaderMetadata(pieces[0], "");
        continue;
      }
      String thisKey = pieces[0].trim();
      String thisValue = pieces[1].trim();

      // check for known keys
      if (thisKey.equals("WARC-Type")) {
        retRecord.setWarcRecordType(thisValue);
      } else if (thisKey.equals("WARC-Date")) {
        retRecord.setWarcDate(thisValue);
      } else if (thisKey.equals("WARC-Record-ID")) {
        retRecord.setWarcUUID(thisValue);
      } else if (thisKey.equals("Content-Type")) {
        retRecord.setWarcContentType(thisValue);
      } else {
        // Content-Length is dropped here; setContent below sets it
        retRecord.addHeaderMetadata(thisKey, thisValue);
      }
    }

    // set the content
    retRecord.setContent(recordContent);

    return retRecord;
  }

  /**
   * Header of a WARC record: the dedicated fields plus a map of all other
   * header lines.
   */
  public class WarcHeader {
    public String contentType = "";
    public String UUID = "";
    public String dateString = "";
    public String recordType = "";
    public HashMap<String, String> metadata = new HashMap<String, String>();
    public int contentLength = 0;

    public WarcHeader() {
    }

    /** Creates a copy of the given header with its own metadata map. */
    public WarcHeader(WarcHeader o) {
      this.contentType = o.contentType;
      this.UUID = o.UUID;
      this.dateString = o.dateString;
      this.recordType = o.recordType;
      this.metadata.putAll(o.metadata);
      this.contentLength = o.contentLength;
    }

    /**
     * Writes this header in the binary form read by
     * {@link #readFields(DataInput)}.
     */
    public void write(DataOutput out) throws IOException {
      out.writeUTF(contentType);
      out.writeUTF(UUID);
      out.writeUTF(dateString);
      out.writeUTF(recordType);
      out.writeInt(metadata.size());
      for (Entry<String, String> thisEntry : metadata.entrySet()) {
        out.writeUTF(thisEntry.getKey());
        out.writeUTF(thisEntry.getValue());
      }
      out.writeInt(contentLength);
    }

    /**
     * Replaces the state of this header with what
     * {@link #write(DataOutput)} wrote.
     */
    public void readFields(DataInput in) throws IOException {
      contentType = in.readUTF();
      UUID = in.readUTF();
      dateString = in.readUTF();
      recordType = in.readUTF();
      metadata.clear();
      int numMetaItems = in.readInt();
      for (int i = 0; i < numMetaItems; i++) {
        String thisKey = in.readUTF();
        String thisValue = in.readUTF();
        metadata.put(thisKey, thisValue);
      }
      contentLength = in.readInt();
    }

    /**
     * Renders the header as text, using the version line and line ending of
     * the most recently read record.
     */
    @Override
    public String toString() {
      StringBuilder retBuffer = new StringBuilder();

      retBuffer.append(WARC_VERSION_LINE);
      retBuffer.append(LINE_ENDING);

      retBuffer.append("WARC-Type: " + recordType + LINE_ENDING);
      retBuffer.append("WARC-Date: " + dateString + LINE_ENDING);

      for (Entry<String, String> thisEntry : metadata.entrySet()) {
        retBuffer.append(thisEntry.getKey());
        retBuffer.append(": ");
        retBuffer.append(thisEntry.getValue());
        retBuffer.append(LINE_ENDING);
      }
      // Keep this as the last WARC-...
      retBuffer.append("WARC-Record-ID: " + UUID + LINE_ENDING);

      retBuffer.append("Content-Type: " + contentType + LINE_ENDING);
      retBuffer.append("Content-Length: " + contentLength + LINE_ENDING);

      return retBuffer.toString();
    }
  }

  private WarcHeader warcHeader = new WarcHeader();
  private byte[] warcContent = null;
  private String warcFilePath = "";

  public WarcRecord() {

  }

  /**
   * Creates a record with a copy of the other record's header; the content
   * array is shared with the other record.
   */
  public WarcRecord(WarcRecord o) {
    this.warcHeader = new WarcHeader(o.warcHeader);
    this.warcContent = o.warcContent;
  }

  /**
   * Gets the number of characters of the textual header plus the number of
   * content bytes.
   */
  public int getTotalRecordLength() {
    int headerLength = warcHeader.toString().length();
    return (headerLength + warcContent.length);
  }

  /**
   * Replaces header and content of this record by those of the other one;
   * the content array is shared with the other record.
   */
  public void set(WarcRecord o) {
    this.warcHeader = new WarcHeader(o.warcHeader);
    this.warcContent = o.warcContent;
  }

  public String getWarcFilePath() {
    return warcFilePath;
  }

  public void setWarcFilePath(String path) {
    warcFilePath = path;
  }

  public void setWarcRecordType(String recordType) {
    warcHeader.recordType = recordType;
  }

  public void setWarcContentType(String contentType) {
    warcHeader.contentType = contentType;
  }

  public void setWarcDate(String dateString) {
    warcHeader.dateString = dateString;
  }

  public void setWarcUUID(String UUID) {
    warcHeader.UUID = UUID;
  }

  /**
   * Adds a header line to the metadata map. Keys that have a dedicated header
   * field (WARC-Type, WARC-Date, WARC-Record-ID, Content-Type,
   * Content-Length) are ignored.
   */
  public void addHeaderMetadata(String key, String value) {
    // don't allow addition of known keys
    if (key.equals("WARC-Type")) { return; }
    if (key.equals("WARC-Date")) { return; }
    if (key.equals("WARC-Record-ID")) { return; }
    if (key.equals("Content-Type")) { return; }
    if (key.equals("Content-Length")) { return; }

    warcHeader.metadata.put(key, value);
  }

  public void clearHeaderMetadata() {
    warcHeader.metadata.clear();
  }

  /** Gets the entries of the metadata map (without the dedicated fields). */
  public Set<Entry<String, String>> getHeaderMetadata() {
    return warcHeader.metadata.entrySet();
  }

  /**
   * Gets the value of a header field.
   *
   * @param key the header field name
   * @return the dedicated field for the five known keys, otherwise the entry
   *     of the metadata map, which is {@code null} if there is none
   */
  public String getHeaderMetadataItem(String key) {
    if (key.equals("WARC-Type")) { return warcHeader.recordType; }
    if (key.equals("WARC-Date")) { return warcHeader.dateString; }
    if (key.equals("WARC-Record-ID")) { return warcHeader.UUID; }
    if (key.equals("Content-Type")) { return warcHeader.contentType; }
    if (key.equals("Content-Length")) { return Integer.toString(warcHeader.contentLength); }

    return warcHeader.metadata.get(key);
  }

  /** Sets the content and updates the header's content length to match. */
  public void setContent(byte[] content) {
    warcContent = content;
    warcHeader.contentLength = content.length;
  }

  /** Sets the content to the bytes of the string in the platform charset. */
  public void setContent(String content) {
    setContent(content.getBytes());
  }

  /** Sets the header's content length without touching the content. */
  public void setContentLength(int len) {
    warcHeader.contentLength = len;
  }

  public byte[] getContent() {
    return warcContent;
  }

  public byte[] getByteContent() {
    return warcContent;
  }

  /** Gets the content decoded as UTF-8. */
  public String getContentUTF8() {
    String retString = null;
    try {
      retString = new String(warcContent, "UTF-8");
    } catch (UnsupportedEncodingException ex) {
      retString = new String(warcContent);
    }
    return retString;
  }

  public String getHeaderRecordType() {
    return warcHeader.recordType;
  }

  /**
   * Renders header, one line ending, and the content decoded in the platform
   * charset.
   */
  @Override
  public String toString() {
    StringBuilder retBuffer = new StringBuilder();
    retBuffer.append(warcHeader.toString());
    retBuffer.append(LINE_ENDING);
    retBuffer.append(new String(warcContent));
    return retBuffer.toString();
  }

  public String getHeaderString() {
    return warcHeader.toString();
  }

  /** Writes header and content in the binary form read by {@link #readFields(DataInput)}. */
  public void write(DataOutput out) throws IOException {
    warcHeader.write(out);
    out.write(warcContent);
  }

  /** Replaces the state of this record with what {@link #write(DataOutput)} wrote. */
  public void readFields(DataInput in) throws IOException {
    warcHeader.readFields(in);
    int contentLengthBytes = warcHeader.contentLength;
    warcContent = new byte[contentLengthBytes];
    in.readFully(warcContent);
  }

}