├── .gitignore ├── LICENSE ├── README.md ├── crawls ├── example.scala ├── out │ └── Example-20160128144235 │ │ ├── Example-20160128144235.cdx.gz │ │ └── Example-20160128144235.warc.gz └── run.sh ├── project ├── assembly.sbt ├── build.properties └── plugins.sbt └── src └── main └── scala └── de └── l3s └── web2warc ├── Constants.scala ├── Web2Warc.scala ├── crawling ├── Crawler.scala ├── QueuedUrl.scala └── components │ ├── CrawlInfo.scala │ ├── CrawlRunInfo.scala │ ├── CrawlSpecification.scala │ ├── CrawlStrategy.scala │ ├── CrawlStrategyDef.scala │ └── io │ ├── CrawlWriter.scala │ └── WarcCdxWriter.scala ├── utils ├── GZip.scala ├── Html.scala ├── HttpClientReader.scala ├── HttpHeader.scala ├── HttpResponse.scala ├── OutFile.scala ├── SURT.scala └── SimpleHttpReader.scala └── warc ├── Capture.scala ├── WarcHeaders.scala └── WarcRecord.scala /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/.gitignore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/README.md -------------------------------------------------------------------------------- /crawls/example.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/crawls/example.scala -------------------------------------------------------------------------------- /crawls/out/Example-20160128144235/Example-20160128144235.cdx.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/crawls/out/Example-20160128144235/Example-20160128144235.cdx.gz -------------------------------------------------------------------------------- /crawls/out/Example-20160128144235/Example-20160128144235.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/crawls/out/Example-20160128144235/Example-20160128144235.warc.gz -------------------------------------------------------------------------------- /crawls/run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/crawls/run.sh -------------------------------------------------------------------------------- /project/assembly.sbt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/project/assembly.sbt -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 0.13.16 -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/project/plugins.sbt -------------------------------------------------------------------------------- /src/main/scala/de/l3s/web2warc/Constants.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/src/main/scala/de/l3s/web2warc/Constants.scala -------------------------------------------------------------------------------- /src/main/scala/de/l3s/web2warc/Web2Warc.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/src/main/scala/de/l3s/web2warc/Web2Warc.scala -------------------------------------------------------------------------------- /src/main/scala/de/l3s/web2warc/crawling/Crawler.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/src/main/scala/de/l3s/web2warc/crawling/Crawler.scala -------------------------------------------------------------------------------- /src/main/scala/de/l3s/web2warc/crawling/QueuedUrl.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/src/main/scala/de/l3s/web2warc/crawling/QueuedUrl.scala -------------------------------------------------------------------------------- /src/main/scala/de/l3s/web2warc/crawling/components/CrawlInfo.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/src/main/scala/de/l3s/web2warc/crawling/components/CrawlInfo.scala -------------------------------------------------------------------------------- /src/main/scala/de/l3s/web2warc/crawling/components/CrawlRunInfo.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/src/main/scala/de/l3s/web2warc/crawling/components/CrawlRunInfo.scala -------------------------------------------------------------------------------- /src/main/scala/de/l3s/web2warc/crawling/components/CrawlSpecification.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/src/main/scala/de/l3s/web2warc/crawling/components/CrawlSpecification.scala -------------------------------------------------------------------------------- /src/main/scala/de/l3s/web2warc/crawling/components/CrawlStrategy.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/src/main/scala/de/l3s/web2warc/crawling/components/CrawlStrategy.scala -------------------------------------------------------------------------------- /src/main/scala/de/l3s/web2warc/crawling/components/CrawlStrategyDef.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/src/main/scala/de/l3s/web2warc/crawling/components/CrawlStrategyDef.scala -------------------------------------------------------------------------------- /src/main/scala/de/l3s/web2warc/crawling/components/io/CrawlWriter.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/src/main/scala/de/l3s/web2warc/crawling/components/io/CrawlWriter.scala -------------------------------------------------------------------------------- /src/main/scala/de/l3s/web2warc/crawling/components/io/WarcCdxWriter.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/src/main/scala/de/l3s/web2warc/crawling/components/io/WarcCdxWriter.scala -------------------------------------------------------------------------------- /src/main/scala/de/l3s/web2warc/utils/GZip.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/src/main/scala/de/l3s/web2warc/utils/GZip.scala -------------------------------------------------------------------------------- /src/main/scala/de/l3s/web2warc/utils/Html.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/src/main/scala/de/l3s/web2warc/utils/Html.scala -------------------------------------------------------------------------------- /src/main/scala/de/l3s/web2warc/utils/HttpClientReader.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/src/main/scala/de/l3s/web2warc/utils/HttpClientReader.scala -------------------------------------------------------------------------------- /src/main/scala/de/l3s/web2warc/utils/HttpHeader.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/src/main/scala/de/l3s/web2warc/utils/HttpHeader.scala -------------------------------------------------------------------------------- /src/main/scala/de/l3s/web2warc/utils/HttpResponse.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/src/main/scala/de/l3s/web2warc/utils/HttpResponse.scala -------------------------------------------------------------------------------- /src/main/scala/de/l3s/web2warc/utils/OutFile.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/src/main/scala/de/l3s/web2warc/utils/OutFile.scala -------------------------------------------------------------------------------- /src/main/scala/de/l3s/web2warc/utils/SURT.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/src/main/scala/de/l3s/web2warc/utils/SURT.scala -------------------------------------------------------------------------------- /src/main/scala/de/l3s/web2warc/utils/SimpleHttpReader.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/src/main/scala/de/l3s/web2warc/utils/SimpleHttpReader.scala -------------------------------------------------------------------------------- /src/main/scala/de/l3s/web2warc/warc/Capture.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/src/main/scala/de/l3s/web2warc/warc/Capture.scala -------------------------------------------------------------------------------- /src/main/scala/de/l3s/web2warc/warc/WarcHeaders.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/src/main/scala/de/l3s/web2warc/warc/WarcHeaders.scala -------------------------------------------------------------------------------- /src/main/scala/de/l3s/web2warc/warc/WarcRecord.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/Web2Warc/HEAD/src/main/scala/de/l3s/web2warc/warc/WarcRecord.scala --------------------------------------------------------------------------------