├── .github └── workflows │ └── maven.yml ├── .gitignore ├── CHANGES.md ├── LICENSE ├── README.md ├── pom.xml └── src ├── main ├── java │ ├── it │ │ └── unimi │ │ │ └── dsi │ │ │ └── fastutil │ │ │ └── io │ │ │ └── RepositionableStream.java │ └── org │ │ └── archive │ │ ├── RecoverableRecordFormatException.java │ │ ├── extract │ │ ├── CDXExtractorOutput.java │ │ ├── DumpingExtractorOutput.java │ │ ├── ExtractingResourceFactoryMapper.java │ │ ├── ExtractingResourceProducer.java │ │ ├── ExtractorOutput.java │ │ ├── FilteredExtractorOuput.java │ │ ├── JSONViewExtractorOutput.java │ │ ├── ProducerUtils.java │ │ ├── RealCDXExtractorOutput.java │ │ ├── ResourceExtractor.java │ │ ├── ResourceFactoryMapper.java │ │ ├── WARCMetadataRecordExtractorOutput.java │ │ ├── WATExtractorOutput.java │ │ └── WETExtractorOutput.java │ │ ├── format │ │ ├── ArchiveFileConstants.java │ │ ├── arc │ │ │ ├── ARCConstants.java │ │ │ ├── ARCFormatException.java │ │ │ ├── ARCMetaData.java │ │ │ ├── ARCMetaDataParser.java │ │ │ ├── FiledescRecord.java │ │ │ └── FiledescRecordParser.java │ │ ├── cdx │ │ │ ├── CDX09Line.java │ │ │ ├── CDX11Line.java │ │ │ ├── CDXFieldConstants.java │ │ │ ├── CDXFile.java │ │ │ ├── CDXInputSource.java │ │ │ ├── CDXLine.java │ │ │ ├── CDXLineFactory.java │ │ │ ├── FieldSplitFormat.java │ │ │ ├── FieldSplitLine.java │ │ │ ├── MultiCDXInputSource.java │ │ │ └── StandardCDXLineFactory.java │ │ ├── dns │ │ │ ├── DNSParseException.java │ │ │ ├── DNSRecord.java │ │ │ ├── DNSResponse.java │ │ │ └── DNSResponseParser.java │ │ ├── gzip │ │ │ ├── GZIPConstants.java │ │ │ ├── GZIPDecoder.java │ │ │ ├── GZIPFExtraRecord.java │ │ │ ├── GZIPFExtraRecords.java │ │ │ ├── GZIPFooter.java │ │ │ ├── GZIPFormatException.java │ │ │ ├── GZIPHeader.java │ │ │ ├── GZIPMemberSeries.java │ │ │ ├── GZIPMemberWriter.java │ │ │ ├── GZIPMemberWriterCommittedOutputStream.java │ │ │ ├── GZIPSeriesMember.java │ │ │ ├── GZIPStaticHeader.java │ │ │ └── zipnum │ │ │ │ ├── LineBufferingIterator.java │ 
│ │ │ ├── MultiBlockIterator.java │ │ │ │ ├── SummaryBlockIterator.java │ │ │ │ ├── SummaryLine.java │ │ │ │ ├── TimestampBestPickDedupIterator.java │ │ │ │ ├── TimestampCustomDedupIterator.java │ │ │ │ ├── TimestampDedupIterator.java │ │ │ │ ├── ZipNumBlockLoader.java │ │ │ │ ├── ZipNumCluster.java │ │ │ │ ├── ZipNumIndex.java │ │ │ │ ├── ZipNumParams.java │ │ │ │ └── ZipNumWriter.java │ │ ├── http │ │ │ ├── DumpingHTTPParseObserver.java │ │ │ ├── HttpConstants.java │ │ │ ├── HttpHeader.java │ │ │ ├── HttpHeaderObserver.java │ │ │ ├── HttpHeaderParser.java │ │ │ ├── HttpHeaders.java │ │ │ ├── HttpMessage.java │ │ │ ├── HttpMessageParser.java │ │ │ ├── HttpParseException.java │ │ │ ├── HttpParseObserver.java │ │ │ ├── HttpRequest.java │ │ │ ├── HttpRequestMessage.java │ │ │ ├── HttpRequestMessageObserver.java │ │ │ ├── HttpRequestMessageParser.java │ │ │ ├── HttpRequestParser.java │ │ │ ├── HttpResponse.java │ │ │ ├── HttpResponseMessage.java │ │ │ ├── HttpResponseMessageObserver.java │ │ │ ├── HttpResponseMessageParser.java │ │ │ └── HttpResponseParser.java │ │ ├── json │ │ │ ├── CompoundORJSONPathSpec.java │ │ │ ├── CrossProductOfLists.java │ │ │ ├── JSONPathSpec.java │ │ │ ├── JSONPathSpecFactory.java │ │ │ ├── JSONUtils.java │ │ │ ├── JSONView.java │ │ │ └── SimpleJSONPathSpec.java │ │ ├── text │ │ │ ├── charset │ │ │ │ ├── CharsetDetector.java │ │ │ │ ├── RotatingCharsetDetector.java │ │ │ │ └── StandardCharsetDetector.java │ │ │ └── html │ │ │ │ ├── CDATALexer.java │ │ │ │ ├── LexParser.java │ │ │ │ ├── NodeUtils.java │ │ │ │ └── ParseObserver.java │ │ └── warc │ │ │ ├── WARCConstants.java │ │ │ └── WARCRecordWriter.java │ │ ├── hadoop │ │ ├── ArchiveJSONViewLoader.java │ │ ├── ArchiveMetadataLoader.java │ │ ├── FilenameInputFormat.java │ │ ├── PerMapOutputFormat.java │ │ ├── ResourceContext.java │ │ ├── ResourceInputFormat.java │ │ ├── ResourceRecordReader.java │ │ └── func │ │ │ ├── JSONViewEvalFunc.java │ │ │ ├── TupleFunc.java │ │ │ └── 
URLResolverFunc.java │ │ ├── httpclient │ │ ├── ConfigurableX509TrustManager.java │ │ ├── HttpRecorderGetMethod.java │ │ ├── HttpRecorderMethod.java │ │ ├── HttpRecorderPostMethod.java │ │ ├── SingleHttpConnectionManager.java │ │ ├── ThreadLocalHttpConnectionManager.java │ │ └── package.html │ │ ├── io │ │ ├── ArchiveFileConstants.java │ │ ├── ArchiveReader.java │ │ ├── ArchiveReaderFactory.java │ │ ├── ArchiveRecord.java │ │ ├── ArchiveRecordHeader.java │ │ ├── ArraySeekInputStream.java │ │ ├── BufferedSeekInputStream.java │ │ ├── CharSubSequence.java │ │ ├── CompositeFileInputStream.java │ │ ├── CompositeFileReader.java │ │ ├── Endian.java │ │ ├── GZIPMembersInputStream.java │ │ ├── GenerationFileHandler.java │ │ ├── GenericReplayCharSequence.java │ │ ├── GzipHeader.java │ │ ├── HeaderedArchiveRecord.java │ │ ├── LoudObjectOutputStream.java │ │ ├── MiserOutputStream.java │ │ ├── NoGzipMagicException.java │ │ ├── ObjectPlusFilesInputStream.java │ │ ├── ObjectPlusFilesOutputStream.java │ │ ├── OriginSeekInputStream.java │ │ ├── Preformatter.java │ │ ├── RandomAccessInputStream.java │ │ ├── RandomAccessOutputStream.java │ │ ├── ReadSource.java │ │ ├── RecorderIOException.java │ │ ├── RecorderLengthExceededException.java │ │ ├── RecorderTimeoutException.java │ │ ├── RecorderTooMuchHeaderException.java │ │ ├── RecordingInputStream.java │ │ ├── RecordingOutputStream.java │ │ ├── RecoverableIOException.java │ │ ├── ReplayCharSequence.java │ │ ├── ReplayInputStream.java │ │ ├── RepositionableInputStream.java │ │ ├── SafeSeekInputStream.java │ │ ├── SeekInputStream.java │ │ ├── SeekReader.java │ │ ├── SeekReaderCharSequence.java │ │ ├── SinkHandlerLogThread.java │ │ ├── UTF8Bytes.java │ │ ├── WriterPool.java │ │ ├── WriterPoolMember.java │ │ ├── WriterPoolSettings.java │ │ ├── arc │ │ │ ├── ARC2WCDX.java │ │ │ ├── ARCConstants.java │ │ │ ├── ARCLocation.java │ │ │ ├── ARCReader.java │ │ │ ├── ARCReaderFactory.java │ │ │ ├── ARCRecord.java │ │ │ ├── ARCRecordMetaData.java 
│ │ │ ├── ARCUtils.java │ │ │ ├── ARCWriter.java │ │ │ ├── ARCWriterPool.java │ │ │ └── WriterPoolSettingsData.java │ │ ├── package.html │ │ └── warc │ │ │ ├── WARCConstants.java │ │ │ ├── WARCReader.java │ │ │ ├── WARCReaderFactory.java │ │ │ ├── WARCRecord.java │ │ │ ├── WARCRecordInfo.java │ │ │ ├── WARCWriter.java │ │ │ ├── WARCWriterPool.java │ │ │ ├── WARCWriterPoolSettings.java │ │ │ ├── WARCWriterPoolSettingsData.java │ │ │ └── package.html │ │ ├── net │ │ ├── DownloadURLConnection.java │ │ ├── FTPException.java │ │ ├── PublicSuffixes.java │ │ ├── md5 │ │ │ ├── Handler.java │ │ │ └── Md5URLConnection.java │ │ └── rsync │ │ │ ├── Handler.java │ │ │ └── RsyncURLConnection.java │ │ ├── resource │ │ ├── AbstractEmptyResource.java │ │ ├── AbstractResource.java │ │ ├── MetaData.java │ │ ├── MetaDataConstants.java-normal │ │ ├── Resource.java │ │ ├── ResourceConstants.java │ │ ├── ResourceContainer.java │ │ ├── ResourceFactory.java │ │ ├── ResourceParseException.java │ │ ├── ResourceProducer.java │ │ ├── TransformingResourceProducer.java │ │ ├── arc │ │ │ ├── ARCResource.java │ │ │ ├── ARCResourceFactory.java │ │ │ └── record │ │ │ │ ├── FiledescResource.java │ │ │ │ └── FiledescResourceFactory.java │ │ ├── generic │ │ │ ├── GenericResourceProducer.java │ │ │ └── GenericStreamResource.java │ │ ├── gzip │ │ │ ├── GZIPMetaData.java │ │ │ ├── GZIPResource.java │ │ │ └── GZIPResourceContainer.java │ │ ├── html │ │ │ ├── ExtractingParseObserver.java │ │ │ ├── HTMLMetaData.java │ │ │ ├── HTMLResource.java │ │ │ └── HTMLResourceFactory.java │ │ ├── http │ │ │ ├── HTTPHeadersResource.java │ │ │ ├── HTTPHeadersResourceFactory.java │ │ │ ├── HTTPRequestResource.java │ │ │ ├── HTTPRequestResourceFactory.java │ │ │ ├── HTTPResponseResource.java │ │ │ └── HTTPResponseResourceFactory.java │ │ ├── producer │ │ │ ├── ARCFile.java │ │ │ ├── EnvelopedResourceFile.java │ │ │ └── WARCFile.java │ │ └── warc │ │ │ ├── WARCResource.java │ │ │ ├── WARCResourceFactory.java │ │ │ └── 
record │ │ │ ├── DNSResource.java │ │ │ ├── DNSResourceFactory.java │ │ │ ├── WARCJSONMetaDataResource.java │ │ │ ├── WARCJSONMetaDataResourceFactory.java │ │ │ ├── WARCMetaDataResource.java │ │ │ └── WARCMetaDataResourceFactory.java │ │ ├── streamcontext │ │ ├── AbstractBufferingStream.java │ │ ├── ByteArrayWrappedStream.java │ │ ├── HDFSStream.java │ │ ├── HTTP11Stream.java │ │ ├── RandomAccessFileStream.java │ │ ├── SimpleStream.java │ │ ├── Stream.java │ │ └── StreamWrappedInputStream.java │ │ ├── uid │ │ ├── RecordIDGenerator.java │ │ ├── UUIDGenerator.java │ │ └── package.html │ │ ├── url │ │ ├── AggressiveIACanonicalizerRules.java │ │ ├── AggressiveIAURLCanonicalizer.java │ │ ├── BasicURLCanonicalizer.java │ │ ├── CanonicalizeRules.java │ │ ├── CanonicalizerConstants.java │ │ ├── DefaultIACanonicalizerRules.java │ │ ├── DefaultIAURLCanonicalizer.java │ │ ├── ExtractRule.java │ │ ├── GoogleURLCanonicalizer.java │ │ ├── HandyURL.java │ │ ├── IAURLCanonicalizer.java │ │ ├── LaxURI.java │ │ ├── LaxURLCodec.java │ │ ├── NonMassagingIAURLCanonicalizer.java │ │ ├── OrdinaryIACanonicalizerRules.java │ │ ├── OrdinaryIAURLCanonicalizer.java │ │ ├── RewriteRule.java │ │ ├── SURT.java │ │ ├── SURTTokenizer.java │ │ ├── URLCanonicalizer.java │ │ ├── URLKeyMaker.java │ │ ├── URLParser.java │ │ ├── URLRegexTransformer.java │ │ ├── UrlSurtRangeComputer.java │ │ ├── UsableURI.java │ │ ├── UsableURIFactory.java │ │ └── WaybackURLKeyMaker.java │ │ └── util │ │ ├── ArchiveUtils.java │ │ ├── Base32.java │ │ ├── ByteOp.java │ │ ├── CrossProduct.java │ │ ├── DateUtils.java │ │ ├── DevUtils.java │ │ ├── FileNameSpec.java │ │ ├── FileUtils.java │ │ ├── GeneralURIStreamFactory.java │ │ ├── Grep.java │ │ ├── HMACSigner.java │ │ ├── IAUtils.java │ │ ├── InetAddressUtil.java │ │ ├── InterruptibleCharSequence.java │ │ ├── IterableLineIterator.java │ │ ├── LaxHttpParser.java │ │ ├── MimetypeUtils.java │ │ ├── NestedMap.java │ │ ├── PrefixSet.java │ │ ├── ProcessUtils.java │ │ ├── 
ProgressStatisticsReporter.java │ │ ├── PropertyUtils.java │ │ ├── Recorder.java │ │ ├── Reporter.java │ │ ├── SURT.java │ │ ├── StreamCopy.java │ │ ├── StringFieldExtractor.java │ │ ├── StringParse.java │ │ ├── SurtPrefixSet.java │ │ ├── TextUtils.java │ │ ├── TmpDirTestCase.java │ │ ├── anvl │ │ ├── ANVLRecord.java │ │ ├── Element.java │ │ ├── Label.java │ │ ├── SubElement.java │ │ ├── Value.java │ │ └── package.html │ │ ├── binsearch │ │ ├── AbstractSeekableLineReader.java │ │ ├── ByteBufferInputStream.java │ │ ├── FieldExtractingSLR.java │ │ ├── FileSearchTool.java │ │ ├── SeekCDXBenchmarker.java │ │ ├── SeekableLineReader.java │ │ ├── SeekableLineReaderFactory.java │ │ ├── SeekableLineReaderIterator.java │ │ ├── SortedTextFile.java │ │ ├── WrappedSeekableLineReader.java │ │ └── impl │ │ │ ├── HDFSSeekableLineReader.java │ │ │ ├── HDFSSeekableLineReaderFactory.java │ │ │ ├── HTTPSeekableLineReader.java │ │ │ ├── HTTPSeekableLineReaderFactory.java │ │ │ ├── MappedSeekableLineReader.java │ │ │ ├── MappedSeekableLineReaderFactory.java │ │ │ ├── NIOSeekableLineReader.java │ │ │ ├── NIOSeekableLineReaderFactory.java │ │ │ ├── RandomAccessFileSeekableLineReader.java │ │ │ ├── RandomAccessFileSeekableLineReaderFactory.java │ │ │ └── http │ │ │ ├── ApacheHttp31SLR.java │ │ │ ├── ApacheHttp31SLRFactory.java │ │ │ ├── ApacheHttp43SLR.java │ │ │ ├── ApacheHttp43SLRFactory.java │ │ │ ├── HTTPURLConnSLR.java │ │ │ └── HTTPURLConnSLRFactory.java │ │ ├── io │ │ ├── BytesReadObserver.java │ │ ├── CRCInputStream.java │ │ ├── CRCOutputStream.java │ │ ├── CommitedOutputStream.java │ │ ├── EOFNotifyingInputStream.java │ │ ├── EOFObserver.java │ │ ├── MultiMemberOpenJDKGZIPInputStream.java │ │ ├── NotifyingInputStream.java │ │ ├── PushBackOneByteInputStream.java │ │ └── RuntimeIOException.java │ │ ├── iterator │ │ ├── AbstractPeekableIterator.java │ │ ├── BoundedStringIterator.java │ │ ├── CachingStringFilter.java │ │ ├── CloseableCompositeIterator.java │ │ ├── 
CloseableIterator.java │ │ ├── CloseableIteratorUtil.java │ │ ├── CloseableIteratorWrapper.java │ │ ├── FilterStringIterator.java │ │ ├── LineReadingIterator.java │ │ ├── LookaheadIterator.java │ │ ├── PeekableIterator.java │ │ ├── PrefixMatchStringIterator.java │ │ ├── RegexLineIterator.java │ │ ├── SortedCompositeIterator.java │ │ ├── StartBoundedStringIterator.java │ │ ├── StringFilter.java │ │ ├── StringTransformer.java │ │ ├── TransformingIteratorWrapper.java │ │ └── TransformingPrefixStringFilter.java │ │ └── zip │ │ ├── GZIPMembersInputStream.java │ │ ├── GzipHeader.java │ │ ├── NoGzipMagicException.java │ │ ├── OpenJDK7GZIPInputStream.java │ │ └── OpenJDK7InflaterInputStream.java └── resources │ ├── effective_tld_names.dat │ └── org │ └── archive │ ├── commons.properties │ ├── ia-web-commons-version.txt │ └── util │ └── tlds-alpha-by-domain.txt └── test ├── java └── org │ └── archive │ ├── extract │ └── RealCDXExtractorOutputTest.java │ ├── format │ ├── dns │ │ └── DNSResponseParserTest.java │ ├── gzip │ │ ├── GZIPMemberSeriesTest.java │ │ ├── GZIPMemberWriterTest.java │ │ └── zipnum │ │ │ └── ZipNumWriterTest.java │ ├── http │ │ ├── HttpRequestMessageParserTest.java │ │ └── HttpResponseParserTest.java │ ├── json │ │ ├── CompoundORJSONPathSpecTest.java │ │ ├── JSONPathSpecFactoryTest.java │ │ ├── JSONViewTest.java │ │ └── SimpleJSONPathSpecTest.java │ └── text │ │ └── html │ │ └── CDATALexerTest.java │ ├── io │ ├── ArchiveReaderFactoryTest.java │ ├── BufferedSeekInputStreamTest.java │ ├── HeaderedArchiveRecordTest.java │ ├── RecordingInputStreamTest.java │ ├── RecordingOutputStreamTest.java │ ├── ReplayCharSequenceTest.java │ ├── RepositionableInputStreamTest.java │ ├── arc │ │ ├── ARCReaderFactoryTest.java │ │ ├── ARCWriterPoolTest.java │ │ └── ARCWriterTest.java │ └── warc │ │ ├── WARCReaderFactoryTest.java │ │ └── WARCWriterTest.java │ ├── net │ └── PublicSuffixesTest.java │ ├── resource │ ├── MetaDataTest.java │ ├── arc │ │ └── ARCResourceTest.java │ 
├── html │ │ ├── ExtractingParseObserverTest.java │ │ └── HTMLMetaDataTest.java │ └── warc │ │ └── WARCResourceTest.java │ ├── uid │ └── UUIDGeneratorTest.java │ ├── url │ ├── AggressiveIAURLCanonicalizerTest.java │ ├── BasicURLCanonicalizerTest.java │ ├── HandyURLTest.java │ ├── IAURLCanonicalizerTest.java │ ├── OrdinaryIAURLCanonicalizerTest.java │ ├── URLParserTest.java │ ├── URLRegexTransformerTest.java │ ├── UsableURIFactoryTest.java │ ├── UsableURITest.java │ └── WaybackURLKeyMakerTest.java │ └── util │ ├── ArchiveUtilsTest.java │ ├── ByteOpTest.java │ ├── CrossProductTest.java │ ├── FileUtilsTest.java │ ├── InterruptibleCharSequenceTest.java │ ├── MimetypeUtilsTest.java │ ├── PropertyUtilsTest.java │ ├── StringFieldExtractorTest.java │ ├── TestUtils.java │ ├── anvl │ └── ANVLRecordTest.java │ ├── binsearch │ └── SortedTextFileTest.java │ ├── iterator │ ├── CachingStringFilterTest.java │ ├── FilterStringIteratorTest.java │ └── SortedCompositeIteratorTest.java │ └── zip │ └── GZIPMembersInputStreamTest.java └── resources └── org └── archive ├── format ├── arc │ └── IAH-20080430204825-00000-blackbook-truncated.arc ├── gzip │ ├── IAH-urls-wget.warc.gz │ ├── abcd.gz │ ├── double-single-inflate-error.gz │ ├── empty.gz │ ├── hi-2.gz │ └── hi.gz └── warc │ ├── IAH-urls-wget.warc │ └── mutliple-headers.warc └── resource └── html ├── html-lang-attribute.warc ├── link-extraction-test.warc ├── meta-itemprop.warc ├── text-extraction-test.warc └── title-extraction-embedded-SVG.warc /.github/workflows/maven.yml: -------------------------------------------------------------------------------- 1 | name: Java CI with Maven 2 | 3 | on: 4 | push: 5 | branches: [ "master" ] 6 | pull_request: 7 | branches: [ "master" ] 8 | 9 | jobs: 10 | build: 11 | strategy: 12 | matrix: 13 | jdk: [8, 11, 17, 21, 22] 14 | 15 | runs-on: ubuntu-latest 16 | timeout-minutes: 30 17 | 18 | steps: 19 | - uses: actions/checkout@v4 20 | - name: Set up JDK ${{ matrix.jdk }} 21 | uses: 
actions/setup-java@v4 22 | with: 23 | java-version: ${{ matrix.jdk }} 24 | distribution: 'temurin' 25 | cache: maven 26 | - name: Cache local Maven repository 27 | uses: actions/cache@v4 28 | with: 29 | path: ~/.m2/repository 30 | key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} 31 | restore-keys: | 32 | ${{ runner.os }}-maven- 33 | - name: Build with Maven 34 | run: mvn -B package --file pom.xml 35 | 36 | # Optional: Uploads the full dependency graph to GitHub to improve the quality of Dependabot alerts this repository can receive 37 | - name: Update dependency graph 38 | if: ${{ github.event_name == 'push' }} 39 | uses: advanced-security/maven-dependency-submission-action@v4.1.1 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pydevproject 2 | .project 3 | .metadata 4 | bin/** 5 | tmp/** 6 | tmp/**/* 7 | *.tmp 8 | *.bak 9 | *.swp 10 | *~.nib 11 | local.properties 12 | .classpath 13 | .settings/ 14 | .loadpath 15 | 16 | # Target 17 | target/ 18 | 19 | # External tool builders 20 | .externalToolBuilders/ 21 | 22 | # Locally stored "Eclipse launch configurations" 23 | *.launch 24 | 25 | # CDT-specific 26 | .cproject 27 | 28 | # PDT-specific 29 | .buildpath 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | IIPC Web Archive Commons 2 | ======================== 3 | [![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.netpreserve.commons/webarchive-commons/badge.svg)](https://maven-badges.herokuapp.com/maven-central/org.netpreserve.commons/webarchive-commons) [![Javadoc](https://javadoc.io/badge2/org.netpreserve.commons/webarchive-commons/javadoc.svg)](https://www.javadoc.io/doc/org.netpreserve.commons/webarchive-commons) 4 | 5 | This repository contains common utility code for 
[OpenWayback][1] and other projects. 6 | 7 | [1]: https://github.com/iipc/openwayback 8 | -------------------------------------------------------------------------------- /src/main/java/it/unimi/dsi/fastutil/io/RepositionableStream.java: -------------------------------------------------------------------------------- 1 | // copied from fastutil, keeping the original package name to avoid breaking 2 | // compatibility with existing user code that implements this interface 3 | package it.unimi.dsi.fastutil.io; 4 | 5 | /* 6 | * Copyright (C) 2005-2015 Sebastiano Vigna 7 | * 8 | * Licensed under the Apache License, Version 2.0 (the "License"); 9 | * you may not use this file except in compliance with the License. 10 | * You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | */ 20 | 21 | 22 | /** A basic interface specifying positioning methods for a byte stream. 23 | * 24 | * @author Sebastiano Vigna 25 | * @since 4.4 26 | */ 27 | 28 | public interface RepositionableStream { 29 | 30 | /** Sets the current stream position. 31 | * 32 | * @param newPosition the new stream position. 33 | */ 34 | void position( long newPosition ) throws java.io.IOException; 35 | 36 | /** Returns the current stream position. 37 | * 38 | * @return the current stream position. 
/**
 * An {@link IOException} subtype used to signal a record-format problem.
 *
 * <p>Every constructor simply forwards its arguments to the matching
 * {@link IOException} constructor; the class adds no state of its own.
 */
public class RecoverableRecordFormatException extends IOException {

	private static final long serialVersionUID = 2775048979983919630L;

	/**
	 * Creates an exception with neither a detail message nor a cause.
	 */
	public RecoverableRecordFormatException() {
		super();
	}

	/**
	 * Creates an exception carrying only a detail message.
	 *
	 * @param message the detail message
	 */
	public RecoverableRecordFormatException(String message) {
		super(message);
	}

	/**
	 * Creates an exception that wraps another exception as its cause.
	 *
	 * @param e the underlying exception
	 */
	public RecoverableRecordFormatException(Exception e) {
		super(e);
	}

	/**
	 * Creates an exception with a detail message and an I/O cause.
	 *
	 * @param message the detail message
	 * @param e the underlying I/O failure
	 */
	public RecoverableRecordFormatException(String message, IOException e) {
		super(message, e);
	}
}
CountingOutputStream co = new CountingOutputStream(nullo); 27 | StreamCopy.copy(resource.getInputStream(), co); 28 | long bytes = co.getCount(); 29 | if(bytes > 0) { 30 | LOG.info(bytes + " unconsumed bytes in Resource InputStream."); 31 | } 32 | try { 33 | out.println(resource.getMetaData().getTopMetaData().toString(1)); 34 | } catch (JSONException e) { 35 | LOG.warning(e.getMessage()); 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/org/archive/extract/ExtractingResourceProducer.java: -------------------------------------------------------------------------------- 1 | package org.archive.extract; 2 | 3 | import java.io.IOException; 4 | import java.util.logging.Level; 5 | import java.util.logging.Logger; 6 | 7 | import org.archive.resource.Resource; 8 | import org.archive.resource.ResourceFactory; 9 | import org.archive.resource.ResourceParseException; 10 | import org.archive.resource.ResourceProducer; 11 | 12 | public class ExtractingResourceProducer implements ResourceProducer { 13 | private static final Logger LOG = 14 | Logger.getLogger(ExtractingResourceProducer.class.getName()); 15 | private ResourceProducer producer; 16 | private ResourceFactoryMapper mapper; 17 | 18 | public ExtractingResourceProducer(ResourceProducer producer, 19 | ResourceFactoryMapper mapper) { 20 | 21 | this.producer = producer; 22 | this.mapper = mapper; 23 | } 24 | 25 | public Resource getNext() throws ResourceParseException, IOException { 26 | Resource current = producer.getNext(); 27 | if(current == null) { 28 | return null; 29 | } 30 | while(true) { 31 | ResourceFactory f = mapper.mapResourceToFactory(current); 32 | if(f == null) { 33 | return current; 34 | } 35 | if(LOG.isLoggable(Level.FINE)) { 36 | LOG.fine(String.format("Extracting (%s) with (%s)\n", 37 | current.getClass().toString(), 38 | f.getClass().toString())); 39 | } 40 | current = f.getResource(current.getInputStream(), 41 | 
current.getMetaData(), current.getContainer()); 42 | } 43 | } 44 | 45 | public void close() throws IOException { 46 | producer.close(); 47 | } 48 | 49 | public String getContext() { 50 | return producer.getContext(); 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/org/archive/extract/ExtractorOutput.java: -------------------------------------------------------------------------------- 1 | package org.archive.extract; 2 | 3 | import java.io.IOException; 4 | 5 | import org.archive.resource.Resource; 6 | 7 | public interface ExtractorOutput { 8 | public void output(Resource resource) throws IOException; 9 | } 10 | -------------------------------------------------------------------------------- /src/main/java/org/archive/extract/FilteredExtractorOuput.java: -------------------------------------------------------------------------------- 1 | package org.archive.extract; 2 | 3 | import java.io.IOException; 4 | import java.io.PrintStream; 5 | import java.util.List; 6 | 7 | import org.archive.format.json.JSONUtils; 8 | import org.archive.resource.Resource; 9 | import org.archive.util.StreamCopy; 10 | 11 | public class FilteredExtractorOuput implements ExtractorOutput { 12 | private String filterPath; 13 | private PrintStream out; 14 | public FilteredExtractorOuput(PrintStream out, String filterPath) { 15 | this.filterPath = filterPath; 16 | this.out = out; 17 | } 18 | public void output(Resource resource) throws IOException { 19 | StreamCopy.readToEOF(resource.getInputStream()); 20 | List results = JSONUtils.extractFancy(resource.getMetaData().getTopMetaData(), filterPath); 21 | if(results != null) { 22 | for(String result: results) { 23 | out.println("Result: " + result); 24 | } 25 | } 26 | } 27 | public void output2(Resource resource) throws IOException { 28 | String result = JSONUtils.extractSingle(resource.getMetaData().getTopMetaData(), filterPath); 29 | if(result != null) { 30 | 
out.println("Result:" + result); 31 | } 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/org/archive/extract/JSONViewExtractorOutput.java: -------------------------------------------------------------------------------- 1 | package org.archive.extract; 2 | 3 | import java.io.IOException; 4 | import java.io.OutputStream; 5 | import java.io.PrintStream; 6 | import java.util.List; 7 | 8 | import org.apache.commons.lang.StringUtils; 9 | import org.archive.format.json.JSONView; 10 | import org.archive.resource.Resource; 11 | import org.archive.util.StreamCopy; 12 | 13 | public class JSONViewExtractorOutput implements ExtractorOutput { 14 | private PrintStream out; 15 | private JSONView view; 16 | public JSONViewExtractorOutput(OutputStream out, String filterPath) { 17 | view = new JSONView(filterPath.split(",")); 18 | this.out = new PrintStream(out); 19 | } 20 | public void output(Resource resource) throws IOException { 21 | StreamCopy.readToEOF(resource.getInputStream()); 22 | List> data = 23 | view.apply(resource.getMetaData().getTopMetaData()); 24 | if(data != null) { 25 | for(List d : data) { 26 | out.println(StringUtils.join(d,"\t")); 27 | } 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/org/archive/extract/ResourceFactoryMapper.java: -------------------------------------------------------------------------------- 1 | package org.archive.extract; 2 | 3 | import org.archive.resource.Resource; 4 | import org.archive.resource.ResourceConstants; 5 | import org.archive.resource.ResourceFactory; 6 | 7 | public interface ResourceFactoryMapper extends ResourceConstants { 8 | public ResourceFactory mapResourceToFactory(Resource resource); 9 | } 10 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/arc/ARCFormatException.java: 
-------------------------------------------------------------------------------- 1 | package org.archive.format.arc; 2 | 3 | import org.archive.RecoverableRecordFormatException; 4 | 5 | public class ARCFormatException extends RecoverableRecordFormatException { 6 | 7 | public ARCFormatException(String string) { 8 | super(string); 9 | } 10 | public ARCFormatException(Exception e) { 11 | super(e); 12 | } 13 | 14 | /** 15 | * 16 | */ 17 | private static final long serialVersionUID = 1L; 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/cdx/CDX09Line.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.cdx; 2 | 3 | public class CDX09Line extends CDXLine { 4 | 5 | CDX09Line(String string, FieldSplitFormat selectNames) { 6 | super(string, selectNames); 7 | } 8 | 9 | @Override 10 | public String getUrlKey() { 11 | return getField(0); 12 | } 13 | 14 | @Override 15 | public String getTimestamp() { 16 | return getField(1); 17 | } 18 | 19 | @Override 20 | public String getOriginalUrl() { 21 | return getField(2); 22 | } 23 | 24 | @Override 25 | public String getMimeType() { 26 | return getField(3); 27 | } 28 | 29 | @Override 30 | public void setMimeType(String mime) 31 | { 32 | setField(3, mime); 33 | } 34 | 35 | @Override 36 | public String getStatusCode() { 37 | return getField(4); 38 | } 39 | 40 | @Override 41 | public String getDigest() { 42 | return getField(5); 43 | } 44 | 45 | @Override 46 | public String getRedirect() { 47 | return getField(6); 48 | } 49 | 50 | @Override 51 | public String getOffset() { 52 | return getField(7); 53 | } 54 | 55 | @Override 56 | public String getFilename() { 57 | return getField(8); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/cdx/CDX11Line.java: 
-------------------------------------------------------------------------------- 1 | package org.archive.format.cdx; 2 | 3 | public class CDX11Line extends CDXLine { 4 | 5 | CDX11Line(String string, FieldSplitFormat selectNames) { 6 | super(string, selectNames); 7 | } 8 | 9 | @Override 10 | public String getUrlKey() { 11 | return getField(0); 12 | } 13 | 14 | @Override 15 | public String getTimestamp() { 16 | return getField(1); 17 | } 18 | 19 | @Override 20 | public String getOriginalUrl() { 21 | return getField(2); 22 | } 23 | 24 | @Override 25 | public String getMimeType() { 26 | return getField(3); 27 | } 28 | 29 | @Override 30 | public void setMimeType(String mime) 31 | { 32 | setField(3, mime); 33 | } 34 | 35 | @Override 36 | public String getStatusCode() { 37 | return getField(4); 38 | } 39 | 40 | @Override 41 | public String getDigest() { 42 | return getField(5); 43 | } 44 | 45 | @Override 46 | public String getRedirect() { 47 | return getField(6); 48 | } 49 | 50 | @Override 51 | public String getRobotFlags() { 52 | return getField(7); 53 | } 54 | 55 | @Override 56 | public String getLength() { 57 | return getField(8); 58 | } 59 | 60 | @Override 61 | public String getOffset() { 62 | return getField(9); 63 | } 64 | 65 | @Override 66 | public String getFilename() { 67 | return getField(10); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/cdx/CDXFieldConstants.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.cdx; 2 | 3 | public interface CDXFieldConstants { 4 | public final static String urlkey = "urlkey"; 5 | public final static String timestamp = "timestamp"; 6 | public final static String original = "original"; 7 | public final static String mimetype = "mimetype"; 8 | public final static String statuscode = "statuscode"; 9 | public final static String digest = "digest"; 10 | public final static String 
redirect = "redirect"; 11 | public final static String robotflags = "robotflags"; 12 | public final static String length = "length"; 13 | public final static String offset = "offset"; 14 | public final static String filename = "filename"; 15 | 16 | // A list of *ALL* standard cdx field names 17 | public final static FieldSplitFormat CDX_ALL_NAMES = new FieldSplitFormat(urlkey, timestamp, original, mimetype, statuscode, digest, redirect, robotflags, 18 | length, offset, filename); 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/cdx/CDXInputSource.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.cdx; 2 | 3 | import java.io.IOException; 4 | 5 | import org.archive.format.gzip.zipnum.ZipNumParams; 6 | import org.archive.util.iterator.CloseableIterator; 7 | 8 | public interface CDXInputSource { 9 | 10 | public CloseableIterator getCDXIterator(String key, String prefix, boolean exact, ZipNumParams params) throws IOException; 11 | public CloseableIterator getCDXIterator(String key, String start, String startEndUrl, ZipNumParams params) throws IOException; 12 | 13 | public long getTotalLines(); 14 | } 15 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/cdx/CDXLine.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.cdx; 2 | 3 | 4 | public class CDXLine extends FieldSplitLine implements CDXFieldConstants { 5 | 6 | public CDXLine(String line, FieldSplitFormat names) { 7 | super(line, ' ', names); 8 | } 9 | 10 | public CDXLine(CDXLine line, FieldSplitFormat selectNames) 11 | { 12 | super(line.selectValues(selectNames), selectNames); 13 | } 14 | 15 | public String getUrlKey() { 16 | return super.getField(CDXLine.urlkey); 17 | } 18 | 19 | public String getTimestamp() { 20 | return 
super.getField(CDXLine.timestamp); 21 | } 22 | 23 | public String getOriginalUrl() { 24 | return super.getField(CDXLine.original); 25 | } 26 | 27 | public String getMimeType() { 28 | return super.getField(CDXLine.mimetype); 29 | } 30 | 31 | public void setMimeType(String newMime) { 32 | setField(CDXLine.mimetype, newMime); 33 | } 34 | 35 | public String getStatusCode() { 36 | return super.getField(CDXLine.statuscode); 37 | } 38 | 39 | public void setStatusCode(String newStatus) { 40 | setField(CDXLine.statuscode, newStatus); 41 | } 42 | 43 | public String getDigest() { 44 | return super.getField(CDXLine.digest); 45 | } 46 | 47 | public String getLength() { 48 | return super.getField(CDXLine.length); 49 | } 50 | 51 | public String getOffset() { 52 | return super.getField(CDXLine.offset); 53 | } 54 | 55 | public String getFilename() { 56 | return super.getField(CDXLine.filename); 57 | } 58 | 59 | public String getRedirect() { 60 | return super.getField(CDXLine.redirect); 61 | } 62 | 63 | public String getRobotFlags() { 64 | return super.getField(CDXLine.robotflags); 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/cdx/CDXLineFactory.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.cdx; 2 | 3 | public interface CDXLineFactory { 4 | public FieldSplitFormat getParseFormat(); 5 | public CDXLine createStandardCDXLine(String input, FieldSplitFormat exFormat); 6 | } 7 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/dns/DNSParseException.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.dns; 2 | 3 | import org.archive.RecoverableRecordFormatException; 4 | 5 | public class DNSParseException extends RecoverableRecordFormatException { 6 | 7 | public DNSParseException(String string) { 8 | 
super(string); 9 | } 10 | 11 | /** 12 | * 13 | */ 14 | private static final long serialVersionUID = 7946541881940132743L; 15 | 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/dns/DNSRecord.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.dns; 2 | 3 | public class DNSRecord { 4 | private String name; 5 | private int ttl; 6 | private String netClass; 7 | private String type; 8 | private String value; 9 | public DNSRecord(String name, int ttl, String netClass, String type, String value) { 10 | this.name = name; 11 | this.ttl = ttl; 12 | this.netClass = netClass; 13 | this.type = type; 14 | this.value = value; 15 | } 16 | public String getName() { 17 | return name; 18 | } 19 | public int getTtl() { 20 | return ttl; 21 | } 22 | public String getNetClass() { 23 | return netClass; 24 | } 25 | public String getType() { 26 | return type; 27 | } 28 | public String getValue() { 29 | return value; 30 | } 31 | public static DNSRecord parse(String line) throws DNSParseException { 32 | String a[] = line.split("\\s+"); 33 | try { 34 | if(a.length == 5) { 35 | return new DNSRecord(a[0],Integer.parseInt(a[1]),a[2],a[3],a[4]); 36 | } else { 37 | throw new DNSParseException("Wrong number of fields:" + line); 38 | } 39 | } catch (NumberFormatException e) { 40 | throw new DNSParseException("BAD TTL field:" + line); 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/dns/DNSResponse.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.dns; 2 | 3 | import java.util.ArrayList; 4 | 5 | public class DNSResponse extends ArrayList { 6 | /** 7 | * 8 | */ 9 | private static final long serialVersionUID = -10624236867791758L; 10 | private String date; 11 | public void setDate(String date) { 12 | 
this.date = date; 13 | } 14 | 15 | public String getDate() { 16 | return date; 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/dns/DNSResponseParser.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.dns; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import java.io.InputStreamReader; 7 | import java.io.UnsupportedEncodingException; 8 | 9 | public class DNSResponseParser { 10 | 11 | private boolean isBlank(String line) { 12 | return line.matches("\\S"); 13 | } 14 | private boolean isDate(String dateLine) { 15 | return !isBlank(dateLine); 16 | } 17 | public void parse(InputStream is, DNSResponse response) throws IOException, DNSParseException { 18 | /* 19 | 20110328212258 20 | www.google.com. 86399 IN CNAME www.l.google.com. 21 | www.l.google.com. 299 IN A 74.125.71.105 22 | www.l.google.com. 299 IN A 74.125.71.103 23 | www.l.google.com. 299 IN A 74.125.71.99 24 | www.l.google.com. 299 IN A 74.125.71.147 25 | www.l.google.com. 299 IN A 74.125.71.104 26 | www.l.google.com. 299 IN A 74.125.71.106 27 | */ 28 | try { 29 | // TODO: should we wrap in a CountingInputStream and indicate 30 | // observed octet-length? 31 | BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8")); 32 | String date = br.readLine().trim(); 33 | if(isDate(date)) { 34 | response.setDate(date); 35 | } 36 | while(true) { 37 | String line = br.readLine(); 38 | if(line == null) { 39 | break; 40 | } 41 | if(!isBlank(line)) { 42 | response.add(DNSRecord.parse(line)); 43 | } 44 | } 45 | } catch (UnsupportedEncodingException e) { 46 | // really really should not happen.. 
47 | e.printStackTrace(); 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/gzip/GZIPFooter.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.gzip; 2 | 3 | import java.io.IOException; 4 | import java.io.OutputStream; 5 | 6 | import org.archive.util.ByteOp; 7 | 8 | public class GZIPFooter implements GZIPConstants { 9 | byte buffer[] = null; 10 | 11 | public GZIPFooter(byte buffer[]) throws GZIPFormatException { 12 | if(buffer.length != GZIP_FOOTER_BYTES) { 13 | throw new GZIPFormatException("Wrong length footer"); 14 | } 15 | this.buffer = buffer; 16 | } 17 | public GZIPFooter(long crc, long length) { 18 | buffer = new byte[GZIP_FOOTER_BYTES]; 19 | ByteOp.writeInt(buffer, 0, crc); 20 | ByteOp.writeInt(buffer, BYTES_IN_INT, length); 21 | } 22 | public long getCRC() { 23 | return ByteOp.bytesToInt(buffer, 0); 24 | } 25 | public long getLength() { 26 | return ByteOp.bytesToInt(buffer, BYTES_IN_INT); 27 | } 28 | public void verify(long crc, long length) throws GZIPFormatException { 29 | // long gotCRC = getCRC() & 0xffffffff; 30 | // long gotCRC2 = getCRC(); 31 | // int gotCRCi = (int) (getCRC() & 0xffffffff); 32 | // 33 | // long wantCRC = crc & 0xffffffff; 34 | int wantCRCi = (int) (crc & 0xffffffff); 35 | if(wantCRCi != getCRC()) { 36 | throw new GZIPFormatException("GZip crc error"); 37 | } 38 | if(length != getLength()) { 39 | throw new GZIPFormatException("GZip length error"); 40 | } 41 | } 42 | public void writeBytes(OutputStream os) throws IOException { 43 | os.write(buffer); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/gzip/GZIPFormatException.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.gzip; 2 | 3 | import java.io.IOException; 4 | 5 | import 
org.archive.RecoverableRecordFormatException; 6 | 7 | 8 | public class GZIPFormatException extends RecoverableRecordFormatException { 9 | /** */ 10 | private static final long serialVersionUID = -3526676437467483190L; 11 | 12 | public GZIPFormatException() { 13 | super(); 14 | } 15 | public GZIPFormatException(String message) { 16 | super(message); 17 | } 18 | public GZIPFormatException(Exception e) { 19 | super(e); 20 | } 21 | public GZIPFormatException(String message, IOException e) { 22 | super(message,e); 23 | } 24 | public static class GZIPExtraFieldShortException extends GZIPFormatException { 25 | int bytesRead; 26 | public GZIPExtraFieldShortException(int bytesRead) { 27 | super("Extra Field short."); 28 | this.bytesRead = bytesRead; 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/gzip/GZIPMemberWriter.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.gzip; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.io.OutputStream; 6 | import java.util.zip.Deflater; 7 | import java.util.zip.DeflaterOutputStream; 8 | 9 | import org.archive.util.StreamCopy; 10 | import org.archive.util.io.CRCInputStream; 11 | 12 | import com.google.common.io.CountingOutputStream; 13 | 14 | public class GZIPMemberWriter implements GZIPConstants { 15 | private static final int MAX_RAM_BUFFER = 1024 * 1024; 16 | private byte slRecordName[] = SL_RECORD; 17 | public int maxBuffer = MAX_RAM_BUFFER; 18 | private CountingOutputStream out; 19 | 20 | public GZIPMemberWriter(OutputStream out) { 21 | this.out = new CountingOutputStream(out); 22 | } 23 | 24 | public void write(InputStream is) throws IOException { 25 | CRCInputStream crc = new CRCInputStream(is); 26 | GZIPHeader gzHeader = new GZIPHeader(); 27 | // TODO: add fields... 
28 | gzHeader.writeBytes(out); 29 | Deflater deflater = new Deflater(Deflater.DEFAULT_COMPRESSION, true); 30 | DeflaterOutputStream deflateOut = new DeflaterOutputStream(out,deflater); 31 | StreamCopy.copy(crc, deflateOut); 32 | deflateOut.finish(); 33 | GZIPFooter gzFooter = new GZIPFooter(crc.getCRCValue(), crc.getByteCount()); 34 | gzFooter.writeBytes(out); 35 | out.flush(); 36 | } 37 | 38 | public long getBytesWritten() { 39 | return out.getCount(); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/gzip/GZIPMemberWriterCommittedOutputStream.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.gzip; 2 | 3 | import java.io.IOException; 4 | import java.io.ByteArrayInputStream; 5 | import java.io.ByteArrayOutputStream; 6 | 7 | import org.archive.util.io.CommitedOutputStream; 8 | 9 | public class GZIPMemberWriterCommittedOutputStream extends CommitedOutputStream { 10 | private static int DEFAULT_BUFFER_RAM = 1024 * 1024; 11 | private GZIPMemberWriter gzW; 12 | public GZIPMemberWriterCommittedOutputStream(GZIPMemberWriter gzW) { 13 | this(gzW,DEFAULT_BUFFER_RAM); 14 | } 15 | public GZIPMemberWriterCommittedOutputStream(GZIPMemberWriter gzW, int bufferRAM) { 16 | super(new ByteArrayOutputStream()); 17 | this.gzW = gzW; 18 | } 19 | 20 | @Override 21 | public void commit() throws IOException { 22 | ByteArrayOutputStream bos = (ByteArrayOutputStream) out; 23 | gzW.write(new ByteArrayInputStream(bos.toByteArray())); 24 | } 25 | public long getBytesWritten() { 26 | return gzW.getBytesWritten(); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/gzip/zipnum/SummaryLine.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.gzip.zipnum; 2 | 3 | import 
org.archive.format.cdx.FieldSplitLine; 4 | 5 | public class SummaryLine extends FieldSplitLine 6 | { 7 | String partId; 8 | long offset; 9 | int length; 10 | 11 | public SummaryLine(String line) 12 | { 13 | super(line, '\t', null); 14 | partId = super.getField(1); 15 | if (super.getNumFields() < 3) { 16 | return; 17 | } 18 | offset = Long.parseLong(super.getField(2)); 19 | length = Integer.parseInt(super.getField(3)); 20 | //timestamp = makeTimestamp(parts[0]); 21 | } 22 | 23 | // String makeTimestamp(String key) 24 | // { 25 | // if (params.getTimestampDedupLength() <= 0) { 26 | // return null; 27 | // } 28 | // 29 | // int space = key.indexOf(' '); 30 | // if (space >= 0) { 31 | // return key.substring(0, space + 1 + params.getTimestampDedupLength()); 32 | // } else { 33 | // return null; 34 | // } 35 | // } 36 | 37 | public boolean isContinuous(SummaryLine next) 38 | { 39 | if (next == null || next.fullLine == null) { 40 | return false; 41 | } 42 | 43 | // Must be same part 44 | if (!partId.equals(next.partId)) { 45 | return false; 46 | } 47 | 48 | if ((offset + length) != next.offset) { 49 | return false; 50 | } 51 | 52 | return true; 53 | } 54 | 55 | // boolean sameTimestamp(SplitLine next) 56 | // { 57 | // if (next == null || next.timestamp == null) { 58 | // return false; 59 | // } 60 | // 61 | // if (timestamp == null) { 62 | // return false; 63 | // } 64 | // 65 | // return timestamp.equals(next.timestamp); 66 | // } 67 | } -------------------------------------------------------------------------------- /src/main/java/org/archive/format/gzip/zipnum/TimestampCustomDedupIterator.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.gzip.zipnum; 2 | 3 | import org.archive.util.iterator.CloseableIterator; 4 | 5 | public class TimestampCustomDedupIterator extends TimestampDedupIterator { 6 | 7 | // The additional field used as status field from the timestamp 8 | private int additionalFieldNum = 3; 
/**
 * Mutable bag of options used when reading ZipNum data: block aggregation
 * and block limits, timestamp dedup length, and the reverse / sequential flags.
 */
public class ZipNumParams
{
    protected int maxAggregateBlocks = 1;
    protected int timestampDedupLength = 0;
    protected int maxBlocks = 0;
    private boolean reverse = false;
    private boolean sequential = false;

    /** Creates parameters with the default values declared on the fields. */
    public ZipNumParams()
    {

    }

    /**
     * Copy constructor: duplicates every setting of {@code defaults},
     * including the {@code sequential} flag, which the four-argument
     * constructor cannot carry and which was previously lost on copy.
     */
    public ZipNumParams(ZipNumParams defaults)
    {
        this(defaults.maxAggregateBlocks, defaults.maxBlocks, defaults.timestampDedupLength, defaults.reverse);
        this.sequential = defaults.sequential;
    }

    /** Creates parameters from explicit values; {@code sequential} stays false. */
    public ZipNumParams(int maxAggregateBlocks, int maxBlocks, int timestampDedupLength, boolean reverse)
    {
        this.maxAggregateBlocks = maxAggregateBlocks;
        this.maxBlocks = maxBlocks;
        this.timestampDedupLength = timestampDedupLength;
        this.reverse = reverse;
    }

    public int getMaxAggregateBlocks() {
        return maxAggregateBlocks;
    }

    public void setMaxAggregateBlocks(int maxAggregateBlocks) {
        this.maxAggregateBlocks = maxAggregateBlocks;
    }

    public int getTimestampDedupLength() {
        return timestampDedupLength;
    }

    public void setTimestampDedupLength(int timestampDedupLength) {
        this.timestampDedupLength = timestampDedupLength;
    }

    public int getMaxBlocks() {
        return maxBlocks;
    }

    public void setMaxBlocks(int maxBlocks) {
        this.maxBlocks = maxBlocks;
    }

    public boolean isReverse() {
        return this.reverse;
    }

    public void setReverse(boolean reverse) {
        this.reverse = reverse;
    }

    public boolean isSequential() {
        return sequential;
    }

    public void setSequential(boolean sequential) {
        this.sequential = sequential;
    }
}
manifestBuffer.write(bytes); 29 | } 30 | write(bytes); 31 | count++; 32 | if(count == limit) { 33 | finishCurrent(); 34 | } 35 | } 36 | 37 | public void close() throws IOException { 38 | finishCurrent(); 39 | } 40 | 41 | private void finishCurrent() throws IOException { 42 | if(count == 0) { 43 | return; 44 | } 45 | long start = getBytesWritten(); 46 | commit(); 47 | long end = getBytesWritten(); 48 | long len = end - start; 49 | StringBuilder sb = new StringBuilder(); 50 | sb.append(start); 51 | sb.append(delimiter); 52 | sb.append(len); 53 | sb.append(delimiter); 54 | manifestOut.write(sb.toString().getBytes(UTF8)); 55 | manifestBuffer.writeTo(manifestOut); 56 | manifestOut.flush(); 57 | count = 0; 58 | manifestBuffer.reset(); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/http/DumpingHTTPParseObserver.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.http; 2 | 3 | import java.io.PrintStream; 4 | import java.nio.charset.Charset; 5 | 6 | public class DumpingHTTPParseObserver implements HttpHeaderObserver { 7 | private static final Charset UTF8 = Charset.forName("UTF-8"); 8 | private PrintStream ps = null; 9 | public DumpingHTTPParseObserver() { 10 | ps = System.out; 11 | } 12 | public DumpingHTTPParseObserver(PrintStream ps) { 13 | this.ps = ps; 14 | } 15 | 16 | public void headerParsed(byte[] name, int ns, int nl, byte[] value, int vs, 17 | int vl) { 18 | ps.format("headerParsed:(%d:%d)(%s)(%d:%d)(%s)\n", 19 | ns,nl,new String(name,0,nl,UTF8), 20 | vs,vl,new String(value,0,vl,UTF8)); 21 | } 22 | 23 | public void headersComplete(int bytesRead) { 24 | ps.format("headersComplete(%d)\n",bytesRead); 25 | } 26 | public void headersCorrupt() { 27 | ps.println("headersCorrupted\n"); 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- 
/src/main/java/org/archive/format/http/HttpHeader.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.http; 2 | 3 | import java.io.IOException; 4 | import java.io.OutputStream; 5 | 6 | public class HttpHeader implements HttpConstants { 7 | private String name = null; 8 | private String value = null; 9 | 10 | public HttpHeader() {} 11 | 12 | public HttpHeader(String name, String value) { 13 | this.name = name; 14 | this.value = value; 15 | } 16 | 17 | public String getName() { return name; } 18 | public void setName(String name) { this.name = name; } 19 | public String getValue() { return value; } 20 | public void setValue(String value) { this.value = value; } 21 | 22 | public void write(OutputStream out) throws IOException { 23 | out.write(name.getBytes(UTF8)); out.write(COLON); out.write(SP); 24 | 25 | out.write(value.getBytes(UTF8)); out.write(CR); out.write(LF); 26 | } 27 | 28 | public String toString() { 29 | StringBuilder sb = new StringBuilder(name.length() + value.length()+20); 30 | sb.append(String.format("HttpHeader(%s)(%s)",name,value)); 31 | return sb.toString(); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/http/HttpHeaderObserver.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.http; 2 | 3 | public interface HttpHeaderObserver extends HttpConstants { 4 | 5 | public void headerParsed(byte name[], int ns, int nl, 6 | byte value[], int vs, int vl); 7 | 8 | public void headersComplete(int totalBytes); 9 | public void headersCorrupt(); 10 | } 11 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/http/HttpMessage.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.http; 2 | 3 | public class HttpMessage 
implements HttpConstants { 4 | protected int version = VERSION_0; 5 | protected int bytes = -1; 6 | protected boolean isCorrupt; 7 | 8 | public int getVersion() { 9 | return version; 10 | } 11 | public String getVersionString() { 12 | if(version == VERSION_1) { 13 | return VERSION_1_STATUS; 14 | } else if(version == VERSION_9) { 15 | return VERSION_9_STATUS; 16 | } 17 | return VERSION_0_STATUS; 18 | } 19 | public int getLength() { 20 | return bytes; 21 | } 22 | 23 | public void messageCorrupt() { 24 | isCorrupt = true; 25 | } 26 | public boolean isCorrupt() { 27 | return isCorrupt; 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/http/HttpMessageParser.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.http; 2 | 3 | 4 | public class HttpMessageParser implements HttpConstants { 5 | 6 | protected int parseVersionStrict(byte buf[], int start, int len) 7 | throws HttpParseException { 8 | 9 | String v = new String(buf,start,len,UTF8); 10 | if(v.compareTo(VERSION_0_STATUS) == 0) { 11 | return VERSION_0; 12 | } else if(v.compareTo(VERSION_1_STATUS) == 0) { 13 | return VERSION_1; 14 | } else if(v.compareTo(VERSION_9_STATUS) == 0) { 15 | return VERSION_9; 16 | } else { 17 | throw new HttpParseException("Unknown version"); 18 | } 19 | } 20 | 21 | protected int parseVersionLax(byte buf[], int start, int len) 22 | throws HttpParseException { 23 | 24 | String v = new String(buf,start,len,UTF8); 25 | if(v.toLowerCase().compareTo(VERSION_0_STATUS.toLowerCase()) == 0) { 26 | return VERSION_0; 27 | } else if(v.toLowerCase().compareTo(VERSION_1_STATUS.toLowerCase()) == 0) { 28 | return VERSION_1; 29 | } else if(v.toLowerCase().compareTo(VERSION_9_STATUS.toLowerCase()) == 0) { 30 | return VERSION_9; 31 | } 32 | return VERSION_0; 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- 
/src/main/java/org/archive/format/http/HttpParseException.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.http; 2 | 3 | import org.archive.RecoverableRecordFormatException; 4 | 5 | public class HttpParseException extends RecoverableRecordFormatException { 6 | 7 | /** */ 8 | private static final long serialVersionUID = -2194883519998764425L; 9 | 10 | public HttpParseException(String string) { 11 | super(string); 12 | } 13 | 14 | } 15 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/http/HttpParseObserver.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.http; 2 | 3 | public interface HttpParseObserver 4 | 5 | extends HttpResponseMessageObserver, HttpHeaderObserver { 6 | 7 | } 8 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/http/HttpRequest.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.http; 2 | 3 | import java.io.FilterInputStream; 4 | import java.io.InputStream; 5 | 6 | public class HttpRequest extends FilterInputStream { 7 | private HttpRequestMessage message = null; 8 | private HttpHeaders headers = null; 9 | private int headerBytes = 0; 10 | 11 | protected HttpRequest(InputStream in, 12 | HttpRequestMessage message, HttpHeaders headers) { 13 | super(in); 14 | this.message = message; 15 | this.headers = headers; 16 | } 17 | 18 | public HttpRequestMessage getMessage() { 19 | return message; 20 | } 21 | 22 | public HttpHeaders getHeaders() { 23 | return headers; 24 | } 25 | 26 | public int getHeaderBytes() { 27 | return headerBytes; 28 | } 29 | 30 | public void setHeaderBytes(int headerBytes) { 31 | this.headerBytes = headerBytes; 32 | } 33 | } 34 | 
-------------------------------------------------------------------------------- /src/main/java/org/archive/format/http/HttpRequestMessage.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.http; 2 | 3 | public class HttpRequestMessage extends HttpMessage implements HttpRequestMessageObserver { 4 | private int method = 0; 5 | private String path = null; 6 | 7 | public void messageParsed(int method, String path, int version, int bytes) { 8 | this.method = method; 9 | this.path = path; 10 | this.version = version; 11 | this.bytes = bytes; 12 | } 13 | 14 | public String getPath() { 15 | return path; 16 | } 17 | public int getMethod() { 18 | return method; 19 | } 20 | 21 | public String getMethodString() { 22 | switch(method) { 23 | case METHOD_GET : return METHOD_GET_STRING; 24 | case METHOD_HEAD : return METHOD_HEAD_STRING; 25 | case METHOD_POST : return METHOD_POST_STRING; 26 | case METHOD_PUT : return METHOD_PUT_STRING; 27 | case METHOD_TRACE : return METHOD_TRACE_STRING; 28 | case METHOD_DELETE : return METHOD_DELETE_STRING; 29 | case METHOD_CONNECT : return METHOD_CONNECT_STRING; 30 | } 31 | return METHOD_UNK_STRING; 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/http/HttpRequestMessageObserver.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.http; 2 | 3 | public interface HttpRequestMessageObserver extends HttpConstants { 4 | public void messageParsed(int method, String path, int version, int bytes); 5 | public void messageCorrupt(); 6 | } 7 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/http/HttpRequestParser.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.http; 2 | 3 | import java.io.IOException; 4 
| import java.io.InputStream; 5 | 6 | public class HttpRequestParser { 7 | private HttpRequestMessageParser messageParser = 8 | new HttpRequestMessageParser(); 9 | 10 | private HttpHeaderParser headerParser = new HttpHeaderParser(); 11 | 12 | public HttpRequestParser() {} 13 | public HttpRequest parse(InputStream is) 14 | throws HttpParseException, IOException { 15 | 16 | HttpRequestMessage message = new HttpRequestMessage(); 17 | HttpHeaders headers = new HttpHeaders(); 18 | int headerBytes = messageParser.parse(is, message); 19 | headerBytes += headerParser.doParse(is, headers); 20 | 21 | HttpRequest request = new HttpRequest(is, message, headers); 22 | request.setHeaderBytes(headerBytes); 23 | 24 | return request; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/http/HttpResponse.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.http; 2 | 3 | import java.io.FilterInputStream; 4 | import java.io.InputStream; 5 | 6 | public class HttpResponse extends FilterInputStream { 7 | private HttpResponseMessage message = null; 8 | private HttpHeaders headers = null; 9 | private InputStream inner; 10 | private int headerBytes = 0; 11 | 12 | protected HttpResponse(InputStream in, 13 | HttpResponseMessage message, HttpHeaders headers) { 14 | super(in); 15 | inner = in; 16 | this.message = message; 17 | this.headers = headers; 18 | } 19 | public InputStream getInner() { 20 | return inner; 21 | } 22 | public HttpResponseMessage getMessage() { 23 | return message; 24 | } 25 | 26 | public HttpHeaders getHeaders() { 27 | return headers; 28 | } 29 | 30 | public int getHeaderBytes() { 31 | return headerBytes; 32 | } 33 | 34 | public void setHeaderBytes(int headerBytes) { 35 | this.headerBytes = headerBytes; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- 
/src/main/java/org/archive/format/http/HttpResponseMessage.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.http; 2 | 3 | public class HttpResponseMessage extends HttpMessage implements HttpResponseMessageObserver { 4 | private int status = 0; 5 | private String reason = null; 6 | 7 | public HttpResponseMessage(){} 8 | 9 | public HttpResponseMessage(int version, int status, String reason) { 10 | this.version = version; 11 | this.status = status; 12 | this.reason = reason; 13 | } 14 | 15 | public int getStatus() { 16 | return status; 17 | } 18 | 19 | public String getReason() { 20 | return reason; 21 | } 22 | public String toString() { 23 | return String.format("%s %d %s%s", getVersionString(), status, reason, CRLF); 24 | } 25 | public String toDebugString() { 26 | return String.format("Message(%d):(%s) (%d) (%s)\n", 27 | reason.length(),getVersionString(),status,reason,CRLF); 28 | } 29 | 30 | public void messageParsed(int version, int status, String reason, int bytes) { 31 | this.version = version; 32 | this.status = status; 33 | this.reason = reason; 34 | this.bytes = bytes; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/http/HttpResponseMessageObserver.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.http; 2 | 3 | public interface HttpResponseMessageObserver extends HttpConstants { 4 | public void messageParsed(int version, int code, String reason, int bytes); 5 | public void messageCorrupt(); 6 | } 7 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/http/HttpResponseParser.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.http; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | 6 | public 
class HttpResponseParser { 7 | private HttpResponseMessageParser messageParser = 8 | new HttpResponseMessageParser(); 9 | 10 | private HttpHeaderParser headerParser = new HttpHeaderParser(); 11 | 12 | public HttpResponseParser() {} 13 | public HttpResponse parse(InputStream is) 14 | throws HttpParseException, IOException { 15 | 16 | HttpResponseMessage message = new HttpResponseMessage(); 17 | HttpHeaders headers = new HttpHeaders(); 18 | int headerBytes = messageParser.parse(is, message); 19 | headerBytes += headerParser.doParse(is, headers); 20 | 21 | HttpResponse response = new HttpResponse(is, message, headers); 22 | response.setHeaderBytes(headerBytes); 23 | // TODO: check for chunked transfer encoding 24 | return response; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/json/CompoundORJSONPathSpec.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.json; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import com.github.openjson.JSONObject; 7 | 8 | public class CompoundORJSONPathSpec implements JSONPathSpec { 9 | ArrayList parts; 10 | public CompoundORJSONPathSpec(List parts) { 11 | this.parts = new ArrayList(); 12 | for(JSONPathSpec part : parts) { 13 | this.parts.add(part); 14 | } 15 | } 16 | 17 | public List> extract(JSONObject json) { 18 | List> matches; 19 | for(JSONPathSpec spec : parts) { 20 | matches = spec.extract(json); 21 | // check if empty: 22 | if(matches.size() == 1) { 23 | if(matches.get(0).size() == 1) { 24 | if(matches.get(0).get(0).length() > 0) { 25 | return matches; 26 | } 27 | } 28 | } 29 | // if(matches.size() > 0) { 30 | // if(matches.get(0).size() > 0) { 31 | // return matches; 32 | // } 33 | // } 34 | } 35 | return null; 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- 
/src/main/java/org/archive/format/json/JSONPathSpec.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.json; 2 | 3 | import java.util.List; 4 | 5 | import com.github.openjson.JSONObject; 6 | 7 | public interface JSONPathSpec { 8 | public static final String EMPTY = ""; 9 | public List> extract(JSONObject json); 10 | } 11 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/json/JSONPathSpecFactory.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.json; 2 | 3 | import java.util.ArrayList; 4 | 5 | public class JSONPathSpecFactory { 6 | public static JSONPathSpec get(String spec) { 7 | if(spec.contains("|")) { 8 | // compound OR: 9 | String parts[] = spec.split("\\|"); 10 | ArrayList subs = new ArrayList(parts.length); 11 | for(String part : parts) { 12 | subs.add(new SimpleJSONPathSpec(part)); 13 | } 14 | return new CompoundORJSONPathSpec(subs); 15 | } else { 16 | // assume "simple": 17 | return new SimpleJSONPathSpec(spec); 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/json/JSONView.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.json; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | import java.util.logging.Level; 6 | import java.util.logging.Logger; 7 | 8 | import org.apache.commons.lang.StringUtils; 9 | import com.github.openjson.JSONObject; 10 | 11 | /** 12 | * 13 | * Class which provides a column-oriented view of a JSON structure. 14 | * 15 | * An instance is constructed with an array of field specifiers, each of which 16 | * declares the source path to one column of output. 
17 | * 18 | * @author brad 19 | * 20 | */ 21 | public class JSONView { 22 | private static final Logger LOG = 23 | Logger.getLogger(JSONView.class.getName()); 24 | 25 | ArrayList pathSpecs; 26 | CrossProductOfLists crosser; 27 | 28 | public JSONView(String... pathSpecs) { 29 | this.pathSpecs = new ArrayList(pathSpecs.length); 30 | if(LOG.isLoggable(Level.INFO)) { 31 | LOG.info(String.format("Creating JSONView with(%s)", 32 | StringUtils.join(pathSpecs,","))); 33 | } 34 | for(String pathSpec : pathSpecs) { 35 | this.pathSpecs.add(JSONPathSpecFactory.get(pathSpec)); 36 | } 37 | crosser = new CrossProductOfLists(); 38 | } 39 | public List> apply(JSONObject json) { 40 | ArrayList>> results = 41 | new ArrayList>>(pathSpecs.size()); 42 | 43 | for(JSONPathSpec pathSpec : pathSpecs) { 44 | List> result = pathSpec.extract(json); 45 | if(result == null) { 46 | // ArrayList tmp = new ArrayList(); 47 | result = new ArrayList>(); 48 | } 49 | results.add(result); 50 | } 51 | return crosser.crossProduct(results); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/text/charset/StandardCharsetDetector.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of the Wayback archival access software 3 | * (http://archive-access.sourceforge.net/projects/wayback/). 4 | * 5 | * Licensed to the Internet Archive (IA) by one or more individual 6 | * contributors. 7 | * 8 | * The IA licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. 
You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | */ 20 | package org.archive.format.text.charset; 21 | 22 | import java.io.IOException; 23 | 24 | import org.archive.format.http.HttpHeaders; 25 | 26 | public class StandardCharsetDetector extends CharsetDetector { 27 | public String getCharset(byte buffer[],int len, HttpHeaders headers) 28 | throws IOException { 29 | String charSet = getCharsetFromHeaders(headers); 30 | if(charSet == null) { 31 | charSet = getCharsetFromMeta(buffer,len); 32 | if(charSet == null) { 33 | charSet = getCharsetFromBytes(buffer,len); 34 | if(charSet == null) { 35 | charSet = DEFAULT_CHARSET; 36 | } 37 | } 38 | } 39 | return charSet; 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/text/html/LexParser.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.text.html; 2 | 3 | import java.io.IOException; 4 | import java.io.Writer; 5 | 6 | import org.htmlparser.Node; 7 | import org.htmlparser.nodes.RemarkNode; 8 | import org.htmlparser.nodes.TagNode; 9 | import org.htmlparser.nodes.TextNode; 10 | import org.htmlparser.util.ParserException; 11 | 12 | public class LexParser extends NodeUtils { 13 | ParseObserver obs; 14 | public LexParser(ParseObserver obs) { 15 | this.obs = obs; 16 | } 17 | public void doParse(CDATALexer lex) throws ParserException, IOException { 18 | doParse(lex,null); 19 | } 20 | 21 | public void doParse(CDATALexer lex, Writer w) throws ParserException, IOException { 22 | 
obs.handleDocumentStart(); 23 | Node n; 24 | TextNode tx; 25 | TagNode tn; 26 | while(true) { 27 | n = lex.nextNode(); 28 | if(n == null) { 29 | break; 30 | } 31 | if(isRemarkNode(n)) { 32 | obs.handleRemarkNode((RemarkNode)n); 33 | } else if(isTextNode(n)) { 34 | tx = (TextNode) n; 35 | if(lex.inCSS()) { 36 | obs.handleStyleNode(tx); 37 | } else if(lex.inJS()) { 38 | obs.handleScriptNode(tx); 39 | } else { 40 | obs.handleTextNode(tx); 41 | } 42 | } else { 43 | tn = (TagNode) n; 44 | if(tn.isEmptyXmlTag()) { 45 | obs.handleTagEmpty(tn); 46 | } else if(tn.isEndTag()) { 47 | obs.handleTagClose(tn); 48 | } else { 49 | obs.handleTagOpen(tn); 50 | } 51 | } 52 | if(w != null) { 53 | w.write(n.toHtml(true)); 54 | } 55 | } 56 | obs.handleDocumentComplete(); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/java/org/archive/format/text/html/ParseObserver.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.text.html; 2 | 3 | import org.htmlparser.nodes.RemarkNode; 4 | import org.htmlparser.nodes.TagNode; 5 | import org.htmlparser.nodes.TextNode; 6 | 7 | public interface ParseObserver { 8 | public void handleDocumentStart(); 9 | public void handleDocumentComplete(); 10 | 11 | public void handleTagEmpty(TagNode tag); 12 | public void handleTagOpen(TagNode tag); 13 | public void handleTagClose(TagNode tag); 14 | 15 | public void handleTextNode(TextNode text); 16 | public void handleScriptNode(TextNode text); 17 | public void handleStyleNode(TextNode text); 18 | 19 | public void handleRemarkNode(RemarkNode remark); 20 | } 21 | -------------------------------------------------------------------------------- /src/main/java/org/archive/hadoop/ResourceContext.java: -------------------------------------------------------------------------------- 1 | package org.archive.hadoop; 2 | 3 | public class ResourceContext { 4 | public String name; 5 | public long 
offset; 6 | public ResourceContext(String name, long offset) { 7 | this.name = name; 8 | this.offset = offset; 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /src/main/java/org/archive/hadoop/ResourceInputFormat.java: -------------------------------------------------------------------------------- 1 | package org.archive.hadoop; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.mapreduce.InputSplit; 7 | import org.apache.hadoop.mapreduce.JobContext; 8 | import org.apache.hadoop.mapreduce.RecordReader; 9 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.archive.resource.MetaData; 12 | 13 | public class ResourceInputFormat extends FileInputFormat{ 14 | 15 | @Override 16 | public RecordReader createRecordReader(InputSplit inputSplit, 17 | TaskAttemptContext context) throws IOException, InterruptedException { 18 | 19 | return new ResourceRecordReader(); 20 | } 21 | 22 | /* (non-Javadoc) 23 | * @see org.apache.hadoop.mapreduce.lib.input.FileInputFormat#isSplitable(org.apache.hadoop.mapreduce.JobContext, org.apache.hadoop.fs.Path) 24 | */ 25 | @Override 26 | protected boolean isSplitable(JobContext context, Path filename) { 27 | // TODO: ensure this works... it should be may be losing records between.. 
28 | return false; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/org/archive/hadoop/func/JSONViewEvalFunc.java: -------------------------------------------------------------------------------- 1 | package org.archive.hadoop.func; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.logging.Logger; 6 | 7 | import org.apache.pig.EvalFunc; 8 | import org.apache.pig.data.Tuple; 9 | import org.apache.pig.data.TupleFactory; 10 | import org.archive.format.json.JSONUtils; 11 | import com.github.openjson.JSONException; 12 | import com.github.openjson.JSONObject; 13 | 14 | public class JSONViewEvalFunc extends EvalFunc { 15 | private static final Logger LOG = 16 | Logger.getLogger(JSONViewEvalFunc.class.getName()); 17 | 18 | protected TupleFactory mTupleFactory = TupleFactory.getInstance(); 19 | private ArrayList mProtoTuple = null; 20 | 21 | public JSONViewEvalFunc() { 22 | mProtoTuple = new ArrayList(); 23 | } 24 | 25 | @Override 26 | public Tuple exec(Tuple tup) throws IOException { 27 | // [0] is the JSON. 
Remaining elements are Strings describing paths 28 | // into the JSON to "flatten" into a single tuple: 29 | if(tup == null || tup.size() == 0) { 30 | return null; 31 | } 32 | mProtoTuple.clear(); // start clean: a call that exited part-way would otherwise leak its values into this tuple 33 | try { 34 | JSONObject json = new JSONObject(tup.get(0).toString()); 35 | for(int i = 1; i < tup.size(); i++) { 36 | String path = tup.get(i).toString(); 37 | String result = JSONUtils.extractSingle(json, path); 38 | mProtoTuple.add(result); 39 | } 40 | } catch (JSONException e) { 41 | LOG.warning("Failed to parse JSON:"+e.getMessage()); 42 | return null; 43 | } 44 | Tuple t = mTupleFactory.newTuple(mProtoTuple); 45 | mProtoTuple.clear(); 46 | return t; 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/org/archive/hadoop/func/TupleFunc.java: -------------------------------------------------------------------------------- 1 | package org.archive.hadoop.func; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | 6 | import org.apache.pig.EvalFunc; 7 | import org.apache.pig.data.Tuple; 8 | import org.apache.pig.data.TupleFactory; 9 | 10 | public class TupleFunc extends EvalFunc { 11 | 12 | protected TupleFactory mTupleFactory = TupleFactory.getInstance(); 13 | private ArrayList mProtoTuple = null; 14 | 15 | public TupleFunc() { 16 | mProtoTuple = new ArrayList(); 17 | } 18 | 19 | private Tuple makeTuple(String va[]) { 20 | if(va == null) { 21 | return null; 22 | } 23 | for(String v : va) { 24 | mProtoTuple.add(v); 25 | } 26 | Tuple t = mTupleFactory.newTuple(mProtoTuple); 27 | mProtoTuple.clear(); 28 | return t; 29 | } 30 | 31 | @Override 32 | public Tuple exec(Tuple tup) throws IOException { 33 | if(tup == null || tup.size() != 2) { 34 | return null; 35 | } 36 | String in = tup.get(0).toString(); 37 | String split = tup.get(1).toString(); 38 | return makeTuple(in.split(split)); 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- 
/src/main/java/org/archive/httpclient/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | org.archive.httpclient package 5 | 6 | Provides specializations on 7 | apache jakarta 8 | commons httpclient. 9 | 10 |

HttpRecorderGetMethod

11 |

Class that marks the passed HttpRecorder w/ the boundary between 12 | HTTP header and content. Also forces a close on the response on 13 | call to releaseConnection.

14 | 15 |

ConfigurableTrustManagerProtocolSocketFactory

16 |

A protocol socket factory that allows setting of trust level on 17 | construction.

18 | 19 |

References

20 |

JavaTM Secure Socket Extension (JSSE): Reference Guide

22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /src/main/java/org/archive/io/ArchiveFileConstants.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of the Heritrix web crawler (crawler.archive.org). 3 | * 4 | * Licensed to the Internet Archive (IA) by one or more individual 5 | * contributors. 6 | * 7 | * The IA licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package org.archive.io; 21 | 22 | @Deprecated 23 | public interface ArchiveFileConstants extends org.archive.format.ArchiveFileConstants { 24 | } 25 | -------------------------------------------------------------------------------- /src/main/java/org/archive/io/CompositeFileReader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of the Heritrix web crawler (crawler.archive.org). 3 | * 4 | * Licensed to the Internet Archive (IA) by one or more individual 5 | * contributors. 6 | * 7 | * The IA licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. 
You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | package org.archive.io; 20 | 21 | import java.io.File; 22 | import java.io.IOException; 23 | import java.io.InputStreamReader; 24 | import java.util.List; 25 | 26 | 27 | /** 28 | * @author gojomo 29 | */ 30 | public class CompositeFileReader extends InputStreamReader { 31 | 32 | /** 33 | * @param filenames 34 | * @throws IOException 35 | */ 36 | public CompositeFileReader(List filenames) throws IOException { 37 | super(new CompositeFileInputStream(filenames)); 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/org/archive/io/GZIPMembersInputStream.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of the Heritrix web crawler (crawler.archive.org). 3 | * 4 | * Licensed to the Internet Archive (IA) by one or more individual 5 | * contributors. 6 | * 7 | * The IA licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | package org.archive.io; 20 | 21 | import java.io.IOException; 22 | import java.io.InputStream; 23 | 24 | /** 25 | * @deprecated use {@link org.archive.util.zip.GZIPMembersInputStream} 26 | */ 27 | @Deprecated 28 | public class GZIPMembersInputStream extends org.archive.util.zip.GZIPMembersInputStream { 29 | 30 | public GZIPMembersInputStream(InputStream in) throws IOException { 31 | super(in); 32 | } 33 | 34 | public GZIPMembersInputStream(InputStream in, int size) throws IOException { 35 | super(in, size); 36 | } 37 | 38 | } -------------------------------------------------------------------------------- /src/main/java/org/archive/io/GzipHeader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of the Heritrix web crawler (crawler.archive.org). 3 | * 4 | * Licensed to the Internet Archive (IA) by one or more individual 5 | * contributors. 6 | * 7 | * The IA licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | package org.archive.io; 20 | 21 | /** 22 | * @deprecated use {@link org.archive.util.zip.GzipHeader} 23 | */ 24 | @Deprecated 25 | public class GzipHeader extends org.archive.util.zip.GzipHeader { 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/org/archive/io/NoGzipMagicException.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of the Heritrix web crawler (crawler.archive.org). 3 | * 4 | * Licensed to the Internet Archive (IA) by one or more individual 5 | * contributors. 6 | * 7 | * The IA licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | package org.archive.io; 20 | 21 | /** 22 | * @deprecated use {@link org.archive.util.zip.NoGzipMagicException} 23 | */ 24 | @Deprecated 25 | public class NoGzipMagicException extends org.archive.util.zip.NoGzipMagicException { 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/org/archive/io/Preformatter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of the Heritrix web crawler (crawler.archive.org). 3 | * 4 | * Licensed to the Internet Archive (IA) by one or more individual 5 | * contributors. 
6 | * 7 | * The IA licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | package org.archive.io; 20 | 21 | import java.util.logging.LogRecord; 22 | 23 | /** 24 | * Interface indicating a logging Formatter can preformat a record (outside 25 | * the standard-implementation synchronized block) and cache it, returning it 26 | * for the next request for formatting from the same thread. 27 | * @author gojomo 28 | */ 29 | public interface Preformatter { 30 | public void preformat(LogRecord record); 31 | public void clear(); 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/org/archive/io/ReadSource.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of the Heritrix web crawler (crawler.archive.org). 3 | * 4 | * Licensed to the Internet Archive (IA) by one or more individual 5 | * contributors. 6 | * 7 | * The IA licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package org.archive.io; 21 | 22 | import java.io.Reader; 23 | 24 | /** 25 | * Interface for objects that can provide a Reader view of their 26 | * contents. 27 | * 28 | */ 29 | public interface ReadSource { 30 | /** 31 | * Obtain a Reader. Not named 'getReader' so that it is not 32 | * considered a simple costless read-only property by 33 | * bean-convention introspection tools. 34 | * @return a Reader on this object 35 | */ 36 | Reader obtainReader(); 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/org/archive/io/RecorderIOException.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of the Heritrix web crawler (crawler.archive.org). 3 | * 4 | * Licensed to the Internet Archive (IA) by one or more individual 5 | * contributors. 6 | * 7 | * The IA licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | package org.archive.io; 20 | 21 | import java.io.IOException; 22 | 23 | /** 24 | * 25 | * @author Gordon Mohr 26 | */ 27 | public class RecorderIOException extends IOException { 28 | 29 | private static final long serialVersionUID = 5907470275350314277L; 30 | 31 | public RecorderIOException() { 32 | super(); 33 | } 34 | 35 | public RecorderIOException(String msg) { 36 | super(msg); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/org/archive/io/RecorderLengthExceededException.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of the Heritrix web crawler (crawler.archive.org). 3 | * 4 | * Licensed to the Internet Archive (IA) by one or more individual 5 | * contributors. 6 | * 7 | * The IA licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | package org.archive.io; 20 | 21 | 22 | /** 23 | * Indicates a length exception thrown by the Recorder. 
24 | * 25 | * @author Gordon Mohr 26 | */ 27 | public class RecorderLengthExceededException 28 | extends RecorderIOException { 29 | 30 | private static final long serialVersionUID = 6655419033414648444L; 31 | 32 | public RecorderLengthExceededException() { 33 | super(); 34 | } 35 | 36 | public RecorderLengthExceededException(String msg) { 37 | super(msg); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/org/archive/io/RecorderTimeoutException.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of the Heritrix web crawler (crawler.archive.org). 3 | * 4 | * Licensed to the Internet Archive (IA) by one or more individual 5 | * contributors. 6 | * 7 | * The IA licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | package org.archive.io; 20 | 21 | /** 22 | * Indicates a timeout thrown by the RecordingInputStream. 
23 | * 24 | * @author Gordon Mohr 25 | */ 26 | public class RecorderTimeoutException extends RecorderIOException { 27 | 28 | private static final long serialVersionUID = 7433214063765078269L; 29 | 30 | public RecorderTimeoutException() { 31 | super(); 32 | } 33 | 34 | public RecorderTimeoutException(String msg) { 35 | super(msg); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/org/archive/io/RecorderTooMuchHeaderException.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of the Heritrix web crawler (crawler.archive.org). 3 | * 4 | * Licensed to the Internet Archive (IA) by one or more individual 5 | * contributors. 6 | * 7 | * The IA licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | package org.archive.io; 20 | 21 | 22 | /** 23 | * Indicates a too much header material exception thrown by the Recorder 24 | * (specificially the RecordingOutputStream) 25 | * 26 | * @author Gordon Mohr 27 | */ 28 | public class RecorderTooMuchHeaderException 29 | extends RecorderIOException { 30 | 31 | private static final long serialVersionUID = 3528516034898129150L; 32 | 33 | public RecorderTooMuchHeaderException() { 34 | super(); 35 | } 36 | 37 | public RecorderTooMuchHeaderException(String msg) { 38 | super(msg); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/org/archive/io/SeekReaderCharSequence.java: -------------------------------------------------------------------------------- 1 | package org.archive.io; 2 | 3 | import java.io.IOException; 4 | 5 | public class SeekReaderCharSequence implements CharSequence { 6 | 7 | 8 | final private SeekReader reader; 9 | final private int size; 10 | 11 | 12 | public SeekReaderCharSequence(SeekReader reader, int size) { 13 | this.reader = reader; 14 | this.size = size; 15 | } 16 | 17 | 18 | public int length() { 19 | return size; 20 | } 21 | 22 | 23 | public char charAt(int index) { 24 | if ((index < 0) || (index >= length())) { 25 | throw new IndexOutOfBoundsException(Integer.toString(index)); 26 | } 27 | try { 28 | reader.position(index); 29 | int r = reader.read(); 30 | if (r < 0) { 31 | throw new IllegalStateException("EOF"); 32 | } 33 | return (char)r; // return the char just read at index; a second read() would yield index+1 (or -1 at the end) 34 | } catch (IOException e) { 35 | throw new RuntimeException(e); 36 | } 37 | } 38 | 39 | 40 | public CharSequence subSequence(int start, int end) { 41 | return new CharSubSequence(this, start, end); 42 | } 43 | 44 | public String toString() { 45 | StringBuilder sb = new StringBuilder(); 46 | try { 47 | reader.position(0); 48 | for (int ch = reader.read(); ch >= 0; ch = reader.read()) { 49 | sb.append((char)ch); 50 | } 51 | return sb.toString(); 52 | } catch (IOException e) { 
53 | throw new IllegalStateException(e); 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/org/archive/io/SinkHandlerLogThread.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of the Heritrix web crawler (crawler.archive.org). 3 | * 4 | * Licensed to the Internet Archive (IA) by one or more individual 5 | * contributors. 6 | * 7 | * The IA licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package org.archive.io; 21 | 22 | 23 | /** 24 | * Implemented by threads that provide extra information. 25 | * 26 | * TODO: rename class, rename getCurrentProcessorName() 27 | */ 28 | public interface SinkHandlerLogThread { 29 | 30 | String getName(); 31 | String getCurrentProcessorName(); 32 | int getSerialNumber(); 33 | 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/org/archive/io/UTF8Bytes.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of the Heritrix web crawler (crawler.archive.org). 3 | * 4 | * Licensed to the Internet Archive (IA) by one or more individual 5 | * contributors. 
/**
 * Marker interface for instances that can be serialized as UTF-8 bytes.
 * TODO: Do we need a UTF8Stream Marker Interface?
 *
 * @author stack
 * @version $Date$ $Version$
 */
public interface UTF8Bytes {

    /** Charset name used for the serialized form (implicitly public static final). */
    String UTF8 = "UTF-8";

    /**
     * @return this instance rendered as UTF-8 bytes
     * @throws UnsupportedEncodingException if an implementation cannot
     *         encode to UTF-8
     */
    byte[] getUTF8Bytes() throws UnsupportedEncodingException;
}
/**
 * Settings object for a {@link WriterPool}.
 * Used creating {@link WriterPoolMember}s.
 *
 * <p>All members are implicitly public; the collection-typed accessors are
 * declared with the raw {@code List} type.
 *
 * @author stack
 * @version $Date$, $Revision$
 */
public interface WriterPoolSettings {

    /** @return the configured maximum file size, in bytes */
    long getMaxFileSizeBytes();

    /** @return the configured prefix */
    String getPrefix();

    /** @return the configured template */
    String getTemplate();

    /**
     * @return the output directories
     *         (presumably a list of {@code File}; verify against implementers)
     */
    List calcOutputDirs();

    /** @return the compress setting */
    boolean getCompress();

    /** @return the metadata list (element type not visible here) */
    List getMetadata();

    /** @return the frequent-flushes setting */
    boolean getFrequentFlushes();

    /** @return the configured write buffer size */
    int getWriteBufferSize();
}
/**
 * Data structure holding the location of an ARC record.
 * Used by wayback machine.
 *
 * @author stack
 */
public interface ARCLocation {

    /**
     * @return the ARC filename. Can be full path to ARC, URL to an ARC or
     *         just the portion of an ARC name that is unique to a collection.
     */
    String getName();

    /**
     * @return the offset into the ARC
     */
    long getOffset();
}
6 | * 7 | * The IA licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | package org.archive.io.warc; 20 | 21 | import org.archive.io.WriterPoolSettings; 22 | import org.archive.uid.RecordIDGenerator; 23 | 24 | /** 25 | * Settings object for a {@link WARCWriterPool}. 26 | * Used creating {@link WARCWriter}s. 27 | * 28 | * @version $Date: 2010-08-19 17:21:43 -0700 (Thu, 19 Aug 2010) $, $Revision: 6927 $ 29 | */ 30 | public interface WARCWriterPoolSettings extends WriterPoolSettings { 31 | public RecordIDGenerator getRecordIDGenerator(); 32 | } -------------------------------------------------------------------------------- /src/main/java/org/archive/io/warc/WARCWriterPoolSettingsData.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of the Heritrix web crawler (crawler.archive.org). 3 | * 4 | * Licensed to the Internet Archive (IA) by one or more individual 5 | * contributors. 6 | * 7 | * The IA licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. 
You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | package org.archive.io.warc; 20 | 21 | import java.io.File; 22 | import java.util.List; 23 | 24 | import org.archive.io.arc.WriterPoolSettingsData; 25 | import org.archive.uid.RecordIDGenerator; 26 | 27 | public class WARCWriterPoolSettingsData extends WriterPoolSettingsData implements WARCWriterPoolSettings { 28 | RecordIDGenerator generator; 29 | 30 | public WARCWriterPoolSettingsData(String prefix, String template, 31 | long maxFileSizeBytes, boolean compress, List outputDirs, 32 | List metadata, RecordIDGenerator generator) { 33 | super(prefix,template,maxFileSizeBytes,compress,outputDirs,metadata); 34 | this.generator = generator; 35 | } 36 | @Override 37 | public RecordIDGenerator getRecordIDGenerator() { 38 | return generator; 39 | } 40 | } -------------------------------------------------------------------------------- /src/main/java/org/archive/io/warc/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | org.archive.io.warc package 5 | 6 | 7 | Experimental WARC Writer and Readers. Code and specification subject to change 8 | with no guarantees of backward compatibility: i.e. newer readers 9 | may not be able to parse WARCs written with older writers. This package 10 | contains prototyping code for revision 0.12 of the WARC specification. 11 | See latest revision 12 | for current state (Version 0.10 code and its documentation has been moved into the 13 | v10 subpackage). 14 | 15 | 16 |

Implementation Notes

17 |

Tools

18 |

Initial implementations of Arc2Warc and Warc2Arc 19 | tools can be found in Heritrix, at 20 | org.archive.io.Arc2Warc and org.archive.io.Warc2Arc 21 | respectively. Pass --help to learn how to use each tool. 22 |

23 | 24 |

TODO

25 |
    26 |
  • Is MIME-Version header needed? MIME Parsers seem fine without (python email 27 | lib and java mail).
  • 28 |
  • Should we write out a Content-Transfer-Encoding 29 | header? (Currently we do not.) The spec needs a section that is explicit about our 30 | interpretation of MIME and our deviations from it (e.g. content-transfer-encoding should 31 | be assumed binary in the case of WARCs; multipart is not disallowed but is not 32 | encouraged, etc.).
  • 33 |
  • Minor: Use WARC-Version: 0.12 (like MIME-Version: 1.0) rather than 34 | WARC/0.12 as the lead-in to an ARCRecord?
  • 35 |
/**
 * Indicates that a FTP operation failed due to a protocol violation.
 * For instance, if authentication fails.
 *
 * @author pjack
 */
public class FTPException extends IOException {

    private static final long serialVersionUID = 1L;

    /** The reply code from the FTP server. */
    private int code;

    /**
     * Constructs a new FTPException whose message embeds the reply code.
     *
     * @param code the error code from the FTP server
     */
    public FTPException(int code) {
        super(describe(code));
        this.code = code;
    }

    /**
     * Returns the error code from the FTP server.
     *
     * @return the error code from the FTP server
     */
    public int getReplyCode() {
        return this.code;
    }

    /** Builds the exception message for the given reply code. */
    private static String describe(int replyCode) {
        return "FTP error code: " + replyCode;
    }
}
6 | * 7 | * The IA licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | package org.archive.net.rsync; 20 | 21 | import java.io.File; 22 | import java.net.URL; 23 | 24 | import org.archive.net.DownloadURLConnection; 25 | 26 | /** 27 | * Rsync URL connection. 28 | * @author stack 29 | * @version $Date$, $Revision$ 30 | */ 31 | public class RsyncURLConnection extends DownloadURLConnection { 32 | private final String RSYNC_TIMEOUT = 33 | System.getProperty(RsyncURLConnection.class.getName() + ".timeout", 34 | "300"); 35 | 36 | protected RsyncURLConnection(URL u) { 37 | super(u); 38 | } 39 | 40 | protected String getScript() { 41 | return System.getProperty(this.getClass().getName() + ".path", 42 | "rsync"); 43 | } 44 | 45 | @Override 46 | protected String[] getCommand(final URL thisUrl, 47 | final File downloadFile) { 48 | return new String[] {getScript(), "--timeout=" + RSYNC_TIMEOUT, 49 | this.url.getPath(), downloadFile.getAbsolutePath()}; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/java/org/archive/resource/AbstractEmptyResource.java: -------------------------------------------------------------------------------- 1 | package org.archive.resource; 2 | 3 | import java.io.ByteArrayInputStream; 4 | import java.io.InputStream; 5 | 6 | 7 | public class AbstractEmptyResource extends AbstractResource { 8 | 9 | public AbstractEmptyResource(MetaData 
metaData, ResourceContainer container) { 10 | super(metaData, container); 11 | } 12 | 13 | public InputStream getInputStream() { 14 | byte bytes[] = new byte[0]; 15 | return new ByteArrayInputStream(bytes); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/org/archive/resource/AbstractResource.java: -------------------------------------------------------------------------------- 1 | package org.archive.resource; 2 | 3 | import java.io.IOException; 4 | import java.io.PrintStream; 5 | 6 | import org.archive.util.StreamCopy; 7 | 8 | import com.google.common.io.ByteStreams; 9 | 10 | public abstract class AbstractResource implements Resource { 11 | protected ResourceContainer container; 12 | protected MetaData metaData; 13 | public AbstractResource(MetaData metaData, 14 | ResourceContainer container) { 15 | this.container = container; 16 | this.metaData = metaData; 17 | } 18 | 19 | public ResourceContainer getContainer() { 20 | return container; 21 | } 22 | public MetaData getMetaData() { 23 | return metaData; 24 | } 25 | 26 | public static void dump(PrintStream out, Resource resource) throws IOException { 27 | 28 | MetaData m = resource.getMetaData(); 29 | 30 | out.println("Headers Before"); 31 | out.print(m.toString()); 32 | 33 | out.println("Resource Follows:\n==================="); 34 | StreamCopy.copy(resource.getInputStream(),out); 35 | 36 | out.println("[\n]Headers After"); 37 | out.print(m.toString()); 38 | 39 | } 40 | public static void dumpShort(PrintStream out, Resource resource) throws IOException { 41 | 42 | MetaData m = resource.getMetaData(); 43 | 44 | // out.println("Headers Before"); 45 | // out.print(m.toString()); 46 | 47 | long bytes = StreamCopy.copy(resource.getInputStream(), ByteStreams.nullOutputStream()); 48 | out.println("Resource Was:"+bytes+" Long"); 49 | 50 | out.println("[\n]Headers After"); 51 | out.print(m.toString()); 52 | 53 | } 54 | 55 | } 56 | 
-------------------------------------------------------------------------------- /src/main/java/org/archive/resource/Resource.java: -------------------------------------------------------------------------------- 1 | package org.archive.resource; 2 | 3 | import java.io.InputStream; 4 | 5 | 6 | public interface Resource { 7 | /** 8 | * @return the ResourceContainer holding this Resource 9 | */ 10 | public ResourceContainer getContainer(); 11 | 12 | /** 13 | * @return an InputStream for reading data from this Resource. Use only 14 | * once, and assume it is unbuffered 15 | */ 16 | public InputStream getInputStream(); 17 | 18 | /** 19 | * @return the MetaData associated with this Resource 20 | */ 21 | public MetaData getMetaData(); 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/org/archive/resource/ResourceContainer.java: -------------------------------------------------------------------------------- 1 | package org.archive.resource; 2 | 3 | /** 4 | * A container for one or more Resource objects. Primarily holds context for the 5 | * current record 6 | * 7 | * @author Brad 8 | * 9 | */ 10 | public interface ResourceContainer { 11 | /** 12 | * @return the name of this container. Could be a path, url, basename... 
13 | */ 14 | public String getName(); 15 | public boolean isCompressed(); 16 | } -------------------------------------------------------------------------------- /src/main/java/org/archive/resource/ResourceFactory.java: -------------------------------------------------------------------------------- 1 | package org.archive.resource; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | 6 | 7 | /** 8 | * @author Brad 9 | * 10 | */ 11 | public interface ResourceFactory { 12 | 13 | /** 14 | * Attempts to create a Resource from the InputStream 15 | */ 16 | public Resource getResource(InputStream is, MetaData parentMetaData, 17 | ResourceContainer container) 18 | throws ResourceParseException, IOException; 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/org/archive/resource/ResourceParseException.java: -------------------------------------------------------------------------------- 1 | package org.archive.resource; 2 | 3 | 4 | public class ResourceParseException extends Exception { 5 | 6 | /** */ 7 | private static final long serialVersionUID = 5364502969148304884L; 8 | public ResourceParseException(Exception e) { 9 | super(e); 10 | } 11 | public ResourceParseException(Exception e, MetaData metaData) { 12 | super(e); 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /src/main/java/org/archive/resource/ResourceProducer.java: -------------------------------------------------------------------------------- 1 | package org.archive.resource; 2 | 3 | import java.io.IOException; 4 | 5 | public interface ResourceProducer { 6 | public Resource getNext() throws ResourceParseException, IOException; 7 | public void close() throws IOException; 8 | public String getContext(); 9 | } 10 | -------------------------------------------------------------------------------- /src/main/java/org/archive/resource/TransformingResourceProducer.java: 
-------------------------------------------------------------------------------- 1 | package org.archive.resource; 2 | 3 | import java.io.IOException; 4 | 5 | public class TransformingResourceProducer implements ResourceProducer { 6 | private ResourceProducer producer; 7 | private ResourceFactory factory; 8 | public TransformingResourceProducer(ResourceProducer producer, ResourceFactory factory) { 9 | this.producer = producer; 10 | this.factory = factory; 11 | } 12 | public Resource getNext() throws ResourceParseException, IOException { 13 | Resource inner = producer.getNext(); 14 | if(inner == null) { 15 | return null; 16 | } 17 | return factory.getResource(inner.getInputStream(), inner.getMetaData(), 18 | inner.getContainer()); 19 | } 20 | public void close() throws IOException { 21 | producer.close(); 22 | } 23 | public String getContext() { 24 | return producer.getContext(); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/org/archive/resource/arc/ARCResourceFactory.java: -------------------------------------------------------------------------------- 1 | package org.archive.resource.arc; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | 6 | import org.archive.format.arc.ARCFormatException; 7 | import org.archive.format.arc.ARCMetaData; 8 | import org.archive.format.arc.ARCMetaDataParser; 9 | import org.archive.resource.MetaData; 10 | import org.archive.resource.ResourceConstants; 11 | import org.archive.resource.Resource; 12 | import org.archive.resource.ResourceContainer; 13 | import org.archive.resource.ResourceFactory; 14 | import org.archive.resource.ResourceParseException; 15 | 16 | public class ARCResourceFactory implements ResourceFactory, ResourceConstants { 17 | public ARCMetaDataParser parser; 18 | public boolean strict = false; 19 | public ARCResourceFactory() { 20 | parser = new ARCMetaDataParser(); 21 | } 22 | public Resource getResource(InputStream is, 
MetaData parentMetaData, 23 | ResourceContainer container) throws ResourceParseException, 24 | IOException { 25 | 26 | try { 27 | ARCMetaData m = parser.parse(is,strict,!container.isCompressed()); 28 | if(m == null) { 29 | return null; 30 | } 31 | ARCResource r = new ARCResource(parentMetaData.createChild(ENVELOPE), 32 | container, m,is); 33 | return r; 34 | 35 | } catch(ARCFormatException e) { 36 | throw new ResourceParseException(e); 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/org/archive/resource/arc/record/FiledescResource.java: -------------------------------------------------------------------------------- 1 | package org.archive.resource.arc.record; 2 | 3 | //import java.util.logging.Logger; 4 | 5 | import org.archive.format.arc.FiledescRecord; 6 | import org.archive.resource.AbstractEmptyResource; 7 | import org.archive.resource.MetaData; 8 | import org.archive.resource.ResourceConstants; 9 | import org.archive.resource.ResourceContainer; 10 | 11 | public class FiledescResource extends AbstractEmptyResource implements ResourceConstants { 12 | // private static final Logger LOG = 13 | // Logger.getLogger(FiledescResource.class.getName()); 14 | 15 | public FiledescResource(MetaData metaData, ResourceContainer container, 16 | FiledescRecord record) { 17 | super(metaData, container); 18 | metaData.putLong(FILEDESC_MAJOR, record.getMajorVersion()); 19 | metaData.putLong(FILEDESC_MINOR, record.getMinorVersion()); 20 | metaData.putString(FILEDESC_ORGANIZATION, record.getOrganization()); 21 | metaData.putString(FILEDESC_FORMAT, record.getFormat()); 22 | if(record.hasMetaData()) { 23 | int count = record.getMetaDataCount(); 24 | for(int i = 0; i < count; i++) { 25 | String name = record.getMetaDataName(i); 26 | String value = record.getMetaDataValue(i); 27 | if((name != null) && (value != null)) { 28 | metaData.appendObj(FILEDESC_DATA, 29 | METADATA_KV_NAME,name,METADATA_KV_VALUE,value); 
30 | } 31 | } 32 | } 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /src/main/java/org/archive/resource/arc/record/FiledescResourceFactory.java: -------------------------------------------------------------------------------- 1 | package org.archive.resource.arc.record; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | 6 | import org.archive.format.arc.FiledescRecord; 7 | import org.archive.format.arc.FiledescRecordParser; 8 | import org.archive.resource.MetaData; 9 | import org.archive.resource.ResourceConstants; 10 | import org.archive.resource.Resource; 11 | import org.archive.resource.ResourceContainer; 12 | import org.archive.resource.ResourceFactory; 13 | import org.archive.resource.ResourceParseException; 14 | 15 | public class FiledescResourceFactory implements ResourceFactory, ResourceConstants { 16 | FiledescRecordParser parser = new FiledescRecordParser(); 17 | public Resource getResource(InputStream is, MetaData parentMetaData, 18 | ResourceContainer container) throws ResourceParseException, 19 | IOException { 20 | FiledescRecord rec = parser.parse(is); 21 | 22 | parentMetaData.putString(PAYLOAD_CONTENT_TYPE, PAYLOAD_TYPE_FILEDESC); 23 | return new FiledescResource( 24 | parentMetaData.createChild(FILEDESC_METADATA), container, rec); 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/org/archive/resource/generic/GenericResourceProducer.java: -------------------------------------------------------------------------------- 1 | package org.archive.resource.generic; 2 | 3 | import java.io.IOException; 4 | 5 | import org.archive.resource.MetaData; 6 | import org.archive.resource.Resource; 7 | import org.archive.resource.ResourceContainer; 8 | import org.archive.resource.ResourceParseException; 9 | import org.archive.resource.ResourceProducer; 10 | import org.archive.streamcontext.Stream; 11 | 12 | public class 
GenericResourceProducer implements ResourceContainer, ResourceProducer { 13 | private static long UNLIMITED = -1; 14 | private Stream stream; 15 | private String name; 16 | private long endOffset; 17 | public GenericResourceProducer(Stream stream, String name) { 18 | this(stream,name,UNLIMITED); 19 | } 20 | public GenericResourceProducer(Stream stream, String name, long endOffset) { 21 | this.stream = stream; 22 | this.name = name; 23 | this.endOffset = endOffset; 24 | } 25 | public Resource getNext() throws ResourceParseException, IOException { 26 | if(stream.atEof()) { 27 | return null; 28 | } 29 | if(endOffset != UNLIMITED) { 30 | if(stream.getOffset() > endOffset) { 31 | return null; 32 | } 33 | } 34 | return new GenericStreamResource(new MetaData(), this, stream); 35 | } 36 | 37 | public String getName() { 38 | return name; 39 | } 40 | 41 | public boolean isCompressed() { 42 | return false; 43 | } 44 | public void close() throws IOException { 45 | stream.close(); 46 | } 47 | public String getContext() { 48 | return String.format("Context(%s)(%d)", name, stream.getOffset()); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/java/org/archive/resource/generic/GenericStreamResource.java: -------------------------------------------------------------------------------- 1 | package org.archive.resource.generic; 2 | 3 | import java.io.InputStream; 4 | 5 | import org.archive.resource.AbstractResource; 6 | import org.archive.resource.MetaData; 7 | import org.archive.resource.ResourceConstants; 8 | import org.archive.resource.ResourceContainer; 9 | import org.archive.streamcontext.StreamWrappedInputStream; 10 | import org.archive.streamcontext.Stream; 11 | 12 | public class GenericStreamResource extends AbstractResource implements ResourceConstants { 13 | private Stream stream; 14 | public GenericStreamResource(MetaData metaData, ResourceContainer container, Stream stream) { 15 | super(metaData, container); 16 | 
this.stream = stream; 17 | 18 | MetaData containerMD = new MetaData(metaData, CONTAINER); 19 | 20 | containerMD.putString(CONTAINER_FILENAME, container.getName()); 21 | containerMD.putBoolean(CONTAINER_COMPRESSED, container.isCompressed()); 22 | containerMD.putLong(CONTAINER_OFFSET, stream.getOffset()); 23 | } 24 | 25 | public InputStream getInputStream() { 26 | return new StreamWrappedInputStream(stream); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main/java/org/archive/resource/gzip/GZIPResource.java: -------------------------------------------------------------------------------- 1 | package org.archive.resource.gzip; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | 6 | import org.archive.format.gzip.GZIPConstants; 7 | import org.archive.format.gzip.GZIPSeriesMember; 8 | import org.archive.resource.AbstractResource; 9 | import org.archive.resource.MetaData; 10 | import org.archive.resource.ResourceConstants; 11 | import org.archive.resource.ResourceContainer; 12 | import org.archive.util.io.EOFNotifyingInputStream; 13 | import org.archive.util.io.EOFObserver; 14 | 15 | public class GZIPResource extends AbstractResource 16 | implements GZIPConstants, EOFObserver, ResourceConstants { 17 | 18 | private GZIPSeriesMember member; 19 | private EOFNotifyingInputStream eofStream; 20 | private GZIPMetaData gzMetaData; 21 | 22 | public GZIPResource(MetaData metaData, ResourceContainer container, 23 | GZIPSeriesMember member) { 24 | super(metaData, container); 25 | this.member = member; 26 | this.eofStream = 27 | new EOFNotifyingInputStream(member, this); 28 | 29 | MetaData containerMD = new MetaData(metaData, CONTAINER); 30 | 31 | containerMD.putString(CONTAINER_FILENAME, member.getRecordFileContext()); 32 | containerMD.putBoolean(CONTAINER_COMPRESSED, true); 33 | containerMD.putLong(CONTAINER_OFFSET, member.getRecordStartOffset()); 34 | 35 | gzMetaData = new GZIPMetaData(containerMD); 36 | 
} 37 | 38 | public void close() throws IOException { 39 | member.close(); 40 | } 41 | 42 | public InputStream getInputStream() { 43 | return eofStream; 44 | } 45 | 46 | public void notifyEOF() throws IOException { 47 | gzMetaData.setData(member); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/org/archive/resource/html/HTMLResource.java: -------------------------------------------------------------------------------- 1 | package org.archive.resource.html; 2 | 3 | import org.archive.resource.AbstractEmptyResource; 4 | import org.archive.resource.MetaData; 5 | import org.archive.resource.ResourceContainer; 6 | 7 | 8 | public class HTMLResource extends AbstractEmptyResource { 9 | 10 | public HTMLResource(MetaData metaData, ResourceContainer container) { 11 | super(metaData, container); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /src/main/java/org/archive/resource/http/HTTPHeadersResource.java: -------------------------------------------------------------------------------- 1 | package org.archive.resource.http; 2 | 3 | import org.archive.format.arc.ARCConstants; 4 | import org.archive.format.http.HttpHeader; 5 | import org.archive.format.http.HttpHeaders; 6 | import org.archive.resource.AbstractEmptyResource; 7 | import org.archive.resource.MetaData; 8 | import org.archive.resource.ResourceContainer; 9 | 10 | 11 | public class HTTPHeadersResource extends AbstractEmptyResource 12 | implements ARCConstants { 13 | 14 | public HTTPHeadersResource(MetaData metaData, ResourceContainer container, 15 | HttpHeaders headers) { 16 | super(metaData, container); 17 | for(HttpHeader h : headers) { 18 | metaData.putString(h.getName(),h.getValue()); 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/java/org/archive/resource/http/HTTPRequestResourceFactory.java: 
-------------------------------------------------------------------------------- 1 | package org.archive.resource.http; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | 6 | import org.archive.format.http.HttpParseException; 7 | import org.archive.format.http.HttpRequest; 8 | import org.archive.format.http.HttpRequestParser; 9 | import org.archive.resource.MetaData; 10 | import org.archive.resource.ResourceConstants; 11 | import org.archive.resource.Resource; 12 | import org.archive.resource.ResourceContainer; 13 | import org.archive.resource.ResourceFactory; 14 | import org.archive.resource.ResourceParseException; 15 | 16 | public class HTTPRequestResourceFactory implements ResourceFactory, ResourceConstants { 17 | private HttpRequestParser parser; 18 | public HTTPRequestResourceFactory() { 19 | parser = new HttpRequestParser(); 20 | } 21 | 22 | public Resource getResource(InputStream is, MetaData metaData, 23 | ResourceContainer container) 24 | throws ResourceParseException, IOException { 25 | try { 26 | 27 | HttpRequest response = parser.parse(is); 28 | metaData.putString(PAYLOAD_CONTENT_TYPE, 29 | PAYLOAD_TYPE_HTTP_REQUEST); 30 | return new HTTPRequestResource(metaData.createChild(HTTP_REQUEST_METADATA), 31 | container, response, true); 32 | 33 | } catch(HttpParseException e) { 34 | throw new ResourceParseException(e); 35 | } 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/org/archive/resource/http/HTTPResponseResourceFactory.java: -------------------------------------------------------------------------------- 1 | package org.archive.resource.http; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | 6 | import org.archive.format.http.HttpParseException; 7 | import org.archive.format.http.HttpResponse; 8 | import org.archive.format.http.HttpResponseParser; 9 | import org.archive.resource.MetaData; 10 | import org.archive.resource.ResourceConstants; 
11 | import org.archive.resource.Resource; 12 | import org.archive.resource.ResourceContainer; 13 | import org.archive.resource.ResourceFactory; 14 | import org.archive.resource.ResourceParseException; 15 | 16 | public class HTTPResponseResourceFactory implements ResourceFactory, ResourceConstants { 17 | private HttpResponseParser parser; 18 | public HTTPResponseResourceFactory() { 19 | parser = new HttpResponseParser(); 20 | } 21 | 22 | public Resource getResource(InputStream is, MetaData metaData, 23 | ResourceContainer container) 24 | throws ResourceParseException, IOException { 25 | try { 26 | 27 | HttpResponse response = parser.parse(is); 28 | metaData.putString(PAYLOAD_CONTENT_TYPE, 29 | PAYLOAD_TYPE_HTTP_RESPONSE); 30 | return new HTTPResponseResource(metaData.createChild(HTTP_RESPONSE_METADATA), 31 | container, response, true); 32 | 33 | } catch(HttpParseException e) { 34 | throw new ResourceParseException(e); 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/org/archive/resource/producer/ARCFile.java: -------------------------------------------------------------------------------- 1 | package org.archive.resource.producer; 2 | 3 | import org.archive.resource.arc.ARCResourceFactory; 4 | 5 | public class ARCFile extends EnvelopedResourceFile { 6 | public ARCFile() { 7 | super(new ARCResourceFactory()); 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /src/main/java/org/archive/resource/producer/WARCFile.java: -------------------------------------------------------------------------------- 1 | package org.archive.resource.producer; 2 | 3 | import org.archive.resource.warc.WARCResourceFactory; 4 | 5 | public class WARCFile extends EnvelopedResourceFile { 6 | public WARCFile() { 7 | super(new WARCResourceFactory()); 8 | } 9 | } 10 | -------------------------------------------------------------------------------- 
/src/main/java/org/archive/resource/warc/WARCResourceFactory.java: -------------------------------------------------------------------------------- 1 | package org.archive.resource.warc; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | 6 | import org.archive.format.http.HttpParseException; 7 | import org.archive.format.http.HttpResponse; 8 | import org.archive.format.http.HttpResponseParser; 9 | import org.archive.resource.MetaData; 10 | import org.archive.resource.ResourceConstants; 11 | import org.archive.resource.Resource; 12 | import org.archive.resource.ResourceContainer; 13 | import org.archive.resource.ResourceFactory; 14 | import org.archive.resource.ResourceParseException; 15 | 16 | public class WARCResourceFactory implements ResourceFactory, ResourceConstants { 17 | private HttpResponseParser parser; 18 | public WARCResourceFactory() { 19 | parser = new HttpResponseParser(); 20 | } 21 | 22 | public Resource getResource(InputStream is, MetaData parentMetaData, 23 | ResourceContainer container) throws ResourceParseException, 24 | IOException { 25 | try { 26 | 27 | HttpResponse response = parser.parse(is); 28 | WARCResource r = new WARCResource(parentMetaData.createChild(ENVELOPE), 29 | container, response); 30 | return r; 31 | 32 | } catch(HttpParseException e) { 33 | throw new ResourceParseException(e); 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/org/archive/resource/warc/record/DNSResource.java: -------------------------------------------------------------------------------- 1 | package org.archive.resource.warc.record; 2 | 3 | 4 | import java.util.logging.Logger; 5 | 6 | import org.archive.format.dns.DNSRecord; 7 | import org.archive.format.dns.DNSResponse; 8 | import org.archive.resource.AbstractEmptyResource; 9 | import org.archive.resource.MetaData; 10 | import org.archive.resource.ResourceConstants; 11 | import org.archive.resource.ResourceContainer; 
12 | import com.github.openjson.JSONException; 13 | import com.github.openjson.JSONObject; 14 | 15 | public class DNSResource extends AbstractEmptyResource implements ResourceConstants { 16 | private static final Logger LOG = 17 | Logger.getLogger(DNSResource.class.getName()); 18 | 19 | public DNSResource(MetaData metaData, ResourceContainer container, 20 | DNSResponse response) { 21 | super(metaData, container); 22 | metaData.putString(DNS_DATE, response.getDate()); 23 | try { 24 | for(DNSRecord rec : response) { 25 | JSONObject rjo = new JSONObject(); 26 | rjo.put(DNS_NAME, rec.getName()); 27 | rjo.put(DNS_TTL, rec.getTtl()); 28 | rjo.put(DNS_NETCLASS, rec.getNetClass()); 29 | rjo.put(DNS_TYPE, rec.getType()); 30 | rjo.put(DNS_VALUE, rec.getValue()); 31 | metaData.appendChild(DNS_ENTRIES, rjo); 32 | } 33 | } catch(JSONException e) { 34 | LOG.severe(e.getMessage()); 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/org/archive/resource/warc/record/DNSResourceFactory.java: -------------------------------------------------------------------------------- 1 | package org.archive.resource.warc.record; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | 6 | import org.archive.RecoverableRecordFormatException; 7 | import org.archive.format.dns.DNSResponse; 8 | import org.archive.format.dns.DNSResponseParser; 9 | import org.archive.resource.MetaData; 10 | import org.archive.resource.ResourceConstants; 11 | import org.archive.resource.Resource; 12 | import org.archive.resource.ResourceContainer; 13 | import org.archive.resource.ResourceFactory; 14 | import org.archive.resource.ResourceParseException; 15 | 16 | public class DNSResourceFactory implements ResourceFactory, ResourceConstants { 17 | 18 | DNSResponseParser parser = new DNSResponseParser(); 19 | 20 | public Resource getResource(InputStream is, MetaData parentMetaData, 21 | ResourceContainer container) throws 
/**
 * Body-less resource that only carries the {@link MetaData} it is
 * constructed with; the class adds no fields or behavior of its own beyond
 * what {@link AbstractEmptyResource} provides.
 */
public class WARCJSONMetaDataResource extends AbstractEmptyResource implements ResourceConstants {

    /**
     * @param metaData  metadata passed straight to the superclass
     * @param container container passed straight to the superclass
     */
    public WARCJSONMetaDataResource(MetaData metaData,
            ResourceContainer container) {
        super(metaData, container);
    }

}
16 | 17 | public class WARCJSONMetaDataResourceFactory implements ResourceFactory, ResourceConstants { 18 | private static final Charset UTF8 = Charset.forName("UTF-8"); 19 | 20 | public WARCJSONMetaDataResourceFactory() { 21 | } 22 | 23 | public Resource getResource(InputStream is, MetaData parentMetaData, 24 | ResourceContainer container) throws ResourceParseException, 25 | IOException { 26 | 27 | 28 | MetaData md; 29 | try { 30 | md = new MetaData(new JSONTokener(new InputStreamReader(is, UTF8))); 31 | } catch (JSONException e) { 32 | throw new ResourceParseException(e); 33 | } 34 | return new WARCJSONMetaDataResource(md, container); 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/org/archive/resource/warc/record/WARCMetaDataResource.java: -------------------------------------------------------------------------------- 1 | package org.archive.resource.warc.record; 2 | 3 | //import java.util.logging.Logger; 4 | 5 | import org.archive.format.http.HttpHeader; 6 | import org.archive.format.http.HttpHeaders; 7 | import org.archive.resource.AbstractEmptyResource; 8 | import org.archive.resource.MetaData; 9 | import org.archive.resource.ResourceConstants; 10 | import org.archive.resource.ResourceContainer; 11 | 12 | public class WARCMetaDataResource extends AbstractEmptyResource implements ResourceConstants { 13 | // private static final Logger LOG = 14 | // Logger.getLogger(WARCMetaDataResource.class.getName()); 15 | 16 | public WARCMetaDataResource(MetaData metaData, ResourceContainer container, 17 | HttpHeaders headers) { 18 | super(metaData, container); 19 | for(HttpHeader h : headers) { 20 | metaData.appendObj(WARC_META_FIELDS_LIST, 21 | METADATA_KV_NAME, h.getName(), 22 | METADATA_KV_VALUE,h.getValue()); 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/main/java/org/archive/streamcontext/ByteArrayWrappedStream.java: 
-------------------------------------------------------------------------------- 1 | package org.archive.streamcontext; 2 | 3 | import java.io.IOException; 4 | 5 | public class ByteArrayWrappedStream extends AbstractBufferingStream { 6 | private byte[] buffer = null; 7 | int offset = 0; 8 | public ByteArrayWrappedStream(byte b[]) { 9 | buffer = b; 10 | offset = 0; 11 | } 12 | @Override 13 | public int doRead(byte[] b, int off, int len) throws IOException { 14 | if(offset == buffer.length) { 15 | return -1; 16 | } 17 | int amtToCopy = Math.min(buffer.length - offset, len); 18 | System.arraycopy(buffer, offset, b, off, amtToCopy); 19 | offset += amtToCopy; 20 | return amtToCopy; 21 | } 22 | 23 | @Override 24 | public void doSeek(long offset) throws IOException { 25 | if(offset > this.offset) { 26 | throw new IOException("seek past end.."); 27 | } 28 | this.offset = (int) offset; 29 | } 30 | 31 | @Override 32 | public void doClose() throws IOException { 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /src/main/java/org/archive/streamcontext/HDFSStream.java: -------------------------------------------------------------------------------- 1 | package org.archive.streamcontext; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.fs.FSDataInputStream; 6 | 7 | public class HDFSStream extends AbstractBufferingStream { 8 | FSDataInputStream hdfs; 9 | public HDFSStream(FSDataInputStream hdfs) { 10 | this.hdfs = hdfs; 11 | } 12 | public HDFSStream(FSDataInputStream hdfs, long offset) throws IOException { 13 | this.hdfs = hdfs; 14 | doSeek(offset); 15 | } 16 | 17 | @Override 18 | public int doRead(byte[] b, int off, int len) throws IOException { 19 | return hdfs.read(b, off, len); 20 | } 21 | 22 | @Override 23 | public void doSeek(long offset) throws IOException { 24 | // System.err.format("HDFSdoSeek(%d)\n", offset); 25 | hdfs.seek(offset); 26 | } 27 | 28 | @Override 29 | public void doClose() throws 
IOException { 30 | hdfs.close(); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/org/archive/streamcontext/HTTP11Stream.java: -------------------------------------------------------------------------------- 1 | package org.archive.streamcontext; 2 | 3 | import java.io.FileNotFoundException; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import java.net.URL; 7 | import java.net.URLConnection; 8 | 9 | public class HTTP11Stream extends AbstractBufferingStream { 10 | private URL url; 11 | private URLConnection conn = null; 12 | private InputStream is = null; 13 | 14 | public HTTP11Stream(URL url) 15 | throws IndexOutOfBoundsException, FileNotFoundException, IOException { 16 | this(url,0L,DEFAULT_READ_SIZE); 17 | } 18 | public HTTP11Stream(URL url, long offset) 19 | throws IndexOutOfBoundsException, FileNotFoundException, IOException { 20 | this(url,offset,DEFAULT_READ_SIZE); 21 | } 22 | public HTTP11Stream(URL url, long offset, int readSize) throws IOException { 23 | super(offset,readSize); 24 | this.url = url; 25 | doSeek(offset); 26 | } 27 | 28 | @Override 29 | public void doClose() throws IOException { 30 | if(is != null) { 31 | is.close(); 32 | is = null; 33 | } 34 | } 35 | 36 | @Override 37 | public int doRead(byte[] b, int off, int len) throws IOException { 38 | return is.read(b, off, len); 39 | } 40 | 41 | @Override 42 | public void doSeek(long offset) throws IOException { 43 | doClose(); 44 | conn = url.openConnection(); 45 | conn.setRequestProperty("Range", String.format("bytes=%d-", offset)); 46 | conn.connect(); 47 | is = conn.getInputStream(); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/org/archive/streamcontext/RandomAccessFileStream.java: -------------------------------------------------------------------------------- 1 | package org.archive.streamcontext; 2 | 3 | import java.io.File; 4 | import 
java.io.FileNotFoundException; 5 | import java.io.IOException; 6 | import java.io.RandomAccessFile; 7 | 8 | public class RandomAccessFileStream extends AbstractBufferingStream { 9 | 10 | private RandomAccessFile raf = null; 11 | private File file = null; 12 | public RandomAccessFileStream(File file) 13 | throws IndexOutOfBoundsException, FileNotFoundException, IOException { 14 | this(file,0L,DEFAULT_READ_SIZE); 15 | } 16 | public RandomAccessFileStream(File file, long offset) 17 | throws IndexOutOfBoundsException, FileNotFoundException, IOException { 18 | this(file,offset,DEFAULT_READ_SIZE); 19 | } 20 | public RandomAccessFileStream(File file, long offset, int readSize) 21 | throws IndexOutOfBoundsException, FileNotFoundException, IOException { 22 | super(offset,readSize); 23 | raf = new RandomAccessFile(file, "r"); 24 | if(offset > 0) { 25 | raf.seek(offset); 26 | } 27 | this.file = file; 28 | } 29 | 30 | public File getFile() { 31 | return file; 32 | } 33 | 34 | public void doClose() throws IOException { 35 | raf.close(); 36 | } 37 | 38 | public int doRead(byte[] b, int off, int len) throws IOException { 39 | return raf.read(b, off, len); 40 | } 41 | 42 | public void doSeek(long offset) throws IOException { 43 | raf.seek(offset); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/org/archive/streamcontext/SimpleStream.java: -------------------------------------------------------------------------------- 1 | package org.archive.streamcontext; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | 6 | public class SimpleStream extends AbstractBufferingStream { 7 | private InputStream is; 8 | 9 | public SimpleStream(InputStream is) { 10 | this(is,0L,DEFAULT_READ_SIZE); 11 | } 12 | 13 | public SimpleStream(InputStream is, long offset) { 14 | this(is,offset,DEFAULT_READ_SIZE); 15 | } 16 | 17 | public SimpleStream(InputStream is, long offset, int readSize) { 18 | super(offset,readSize); 19 
/**
 * Alternate simplified interface for accessing data from an underlying source
 * of bytes.
 *
 * @author brad
 *
 */
public interface Stream extends Closeable {
    /** @return the stream's current offset */
    public long getOffset();

    /**
     * Repositions the stream.
     *
     * @param offset requested offset
     * @return a long; presumably the resulting offset - TODO confirm against
     *         an implementation
     * @throws IOException if the stream cannot be repositioned
     */
    public long setOffset(long offset) throws IOException;

    /**
     * Reads into {@code bytes} starting at index {@code off}, for at most
     * {@code len} bytes.
     *
     * @return an int; presumably the byte count or -1 at end of data, as
     *         with {@link java.io.InputStream#read(byte[], int, int)} -
     *         TODO confirm
     * @throws IOException on read failure
     */
    public int read(byte[] bytes, int off, int len) throws IOException;

    /** @return whether the stream is at end of data */
    public boolean atEof();
}

TODO

14 |
    15 |
  • MIME boundaries have an upper bound of 70 characters in total, including the 16 | 'blank line' (CRLFCRLF) and the two leading hyphens. Add to the 17 | {@link org.archive.uid.RecordIDGenerator} 18 | interface an upper bound on generated ID length.
  • 19 |
  • Add example of an actionable uid generator: 20 | e.g. http://archive.org/UID-SCHEME/ID 21 | where scheme might be UUID and an ID might be 22 | f9472055-fbb6-4810-90e8-68fd39e145a6;type=metadata or, 23 | using ARK: 24 | http://archive.org/ark:/13030/f9472055-fbb6-4810-90e8-68fd39e145a6;type=metadata. 25 |
  • 26 |
27 | 28 | 29 | -------------------------------------------------------------------------------- /src/main/java/org/archive/url/AggressiveIACanonicalizerRules.java: -------------------------------------------------------------------------------- 1 | package org.archive.url; 2 | 3 | public class AggressiveIACanonicalizerRules extends CanonicalizeRules { 4 | 5 | public AggressiveIACanonicalizerRules() 6 | { 7 | this(true); 8 | } 9 | 10 | public AggressiveIACanonicalizerRules(boolean stripSlash) { 11 | 12 | setRule(SCHEME_SETTINGS, SCHEME_LOWERCASE); 13 | setRule(HOST_SETTINGS, 14 | HOST_LOWERCASE|HOST_MASSAGE); 15 | 16 | setRule(PORT_SETTINGS, 17 | PORT_STRIP_DEFAULT); 18 | 19 | int pathSettings = PATH_LOWERCASE|PATH_STRIP_SESSION_ID; 20 | 21 | if (stripSlash) { 22 | pathSettings |= PATH_STRIP_TRAILING_SLASH_UNLESS_EMPTY; 23 | } 24 | 25 | setRule(PATH_SETTINGS, pathSettings); 26 | 27 | setRule(QUERY_SETTINGS, 28 | QUERY_LOWERCASE|QUERY_STRIP_SESSION_ID|QUERY_STRIP_EMPTY| 29 | QUERY_ALPHA_REORDER); 30 | 31 | setRule(HASH_SETTINGS,HASH_STRIP); 32 | 33 | setRule(AUTH_SETTINGS, AUTH_STRIP_PASS|AUTH_STRIP_USER); 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/org/archive/url/AggressiveIAURLCanonicalizer.java: -------------------------------------------------------------------------------- 1 | package org.archive.url; 2 | 3 | public class AggressiveIAURLCanonicalizer implements URLCanonicalizer { 4 | private static final BasicURLCanonicalizer basic = 5 | new BasicURLCanonicalizer(); 6 | 7 | private static final IAURLCanonicalizer ia = 8 | new IAURLCanonicalizer(new AggressiveIACanonicalizerRules()); 9 | 10 | public void canonicalize(HandyURL url) { 11 | // just google's stuff, followed by the IA default stuff: 12 | basic.canonicalize(url); 13 | ia.canonicalize(url); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- 
/src/main/java/org/archive/url/CanonicalizeRules.java: -------------------------------------------------------------------------------- 1 | package org.archive.url; 2 | 3 | // XXX it's "Canonicalizer" everywhere else but here 4 | public class CanonicalizeRules implements CanonicalizerConstants { 5 | private int[] settings = new int[NUM_SETTINGS]; 6 | 7 | public void setRule(int rule, int value) { 8 | settings[rule] = value; 9 | } 10 | public int getRule(int rule) { 11 | return settings[rule]; 12 | } 13 | public boolean isSet(int rule, int value) { 14 | return (settings[rule] & value) == value; 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/org/archive/url/CanonicalizerConstants.java: -------------------------------------------------------------------------------- 1 | package org.archive.url; 2 | 3 | public interface CanonicalizerConstants { 4 | public static final int HOST_SETTINGS = 0; 5 | 6 | public static final int HOST_ORIGINAL = 0; 7 | public static final int HOST_LOWERCASE = 1; 8 | public static final int HOST_MASSAGE = 2; 9 | 10 | 11 | public static final int PORT_SETTINGS = 1; 12 | 13 | public static final int PORT_ORIGINAL = 0; 14 | public static final int PORT_STRIP_DEFAULT = 1; 15 | 16 | 17 | public static final int PATH_SETTINGS = 2; 18 | 19 | public static final int PATH_ORIGINAL = 0; 20 | public static final int PATH_LOWERCASE = 1; 21 | public static final int PATH_STRIP_SESSION_ID = 2; 22 | public static final int PATH_STRIP_EMPTY = 4; 23 | public static final int PATH_STRIP_TRAILING_SLASH_UNLESS_EMPTY = 8; 24 | 25 | 26 | public static final int QUERY_SETTINGS = 3; 27 | 28 | public static final int QUERY_ORIGINAL = 0; 29 | public static final int QUERY_LOWERCASE = 1; 30 | public static final int QUERY_STRIP_SESSION_ID = 2; 31 | public static final int QUERY_STRIP_EMPTY = 4; 32 | public static final int QUERY_ALPHA_REORDER = 8; 33 | // TODO: Need a setting to remove empty query ARGs.. 
/**
 * A URL extraction rule: an optional literal prefix plus a regular
 * expression that is searched for (not anchored) in the URL.
 */
public class ExtractRule
{
    protected String startsWith;
    protected String regex;

    /** Compiled form of {@link #regex}; null until {@link #setRegex} is called. */
    protected Pattern regexPattern;

    public String getStartsWith() {
        return startsWith;
    }

    public void setStartsWith(String startsWith) {
        this.startsWith = startsWith;
    }

    public String getRegex() {
        return regex;
    }

    /**
     * Compiles and stores {@code regex}. Compilation happens first, so an
     * invalid expression leaves the previous state untouched.
     */
    public void setRegex(String regex) {
        Pattern compiled = Pattern.compile(regex);
        this.regexPattern = compiled;
        this.regex = regex;
    }

    /**
     * Applies the rule to {@code url}.
     *
     * @return a matcher positioned on the first occurrence of the pattern,
     *         or null when a non-empty prefix is configured and the URL does
     *         not start with it, when no regex has been set, or when the
     *         pattern does not occur in the URL
     */
    public Matcher extract(String url)
    {
        boolean prefixConfigured = (startsWith != null) && !startsWith.isEmpty();
        if (prefixConfigured && !url.startsWith(startsWith)) {
            return null;
        }

        if (regexPattern == null) {
            return null;
        }

        Matcher candidate = regexPattern.matcher(url);
        return candidate.find() ? candidate : null;
    }
}
/**
 * Idea of these rules is to accomplish roughly the equivalent of
 * {@link UsableURIFactory} fixup plus {@link BasicURLCanonicalizer} fixup.
 *
 * <p>Only the scheme, host, port, query and hash groups are configured here;
 * the path and auth groups are never set by this class.
 */
public class OrdinaryIACanonicalizerRules extends CanonicalizeRules {
    public OrdinaryIACanonicalizerRules() {
        // lower-case the scheme and host
        setRule(SCHEME_SETTINGS, SCHEME_LOWERCASE);
        setRule(HOST_SETTINGS, HOST_LOWERCASE);
        // strip the default port
        setRule(PORT_SETTINGS, PORT_STRIP_DEFAULT);
        // strip an empty query and strip the fragment
        setRule(QUERY_SETTINGS, QUERY_STRIP_EMPTY);
        setRule(HASH_SETTINGS, HASH_STRIP);
    }

}
/**
 * A URL-key rewrite rule: an optional literal prefix, a regular expression
 * that must match the whole key, and a replacement string.
 */
public class RewriteRule
{
    protected String startsWith;
    protected String regex;
    protected String replace;

    /** Compiled form of {@link #regex}; null until {@link #setRegex} is called. */
    protected Pattern regexPattern;

    public String getStartsWith() {
        return startsWith;
    }

    public void setStartsWith(String startsWith) {
        this.startsWith = startsWith;
    }

    public String getRegex() {
        return regex;
    }

    /**
     * Compiles and stores {@code regex}. Compilation happens first, so an
     * invalid expression leaves the previous state untouched.
     */
    public void setRegex(String regex) {
        Pattern compiled = Pattern.compile(regex);
        this.regexPattern = compiled;
        this.regex = regex;
    }

    public String getReplace() {
        return replace;
    }

    public void setReplace(String replace) {
        this.replace = replace;
    }

    /**
     * Rewrites the key held in {@code sb} in place.
     *
     * @return true when the key started with the configured prefix (if any),
     *         matched the pattern in full, and was replaced; false otherwise,
     *         including when no regex or no replacement has been set
     */
    public boolean rewrite(StringBuilder sb)
    {
        final String key = sb.toString();

        if ((startsWith != null) && !key.startsWith(startsWith)) {
            return false;
        }

        boolean configured = (regexPattern != null) && (replace != null);
        if (!configured) {
            return false;
        }

        Matcher whole = regexPattern.matcher(key);
        if (!whole.matches()) {
            return false;
        }

        String rewritten = whole.replaceAll(replace);
        sb.replace(0, sb.length(), rewritten);
        return true;
    }
}
-------------------------------------------------------------------------------- 1 | package org.archive.url; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.InputStreamReader; 5 | import java.nio.charset.Charset; 6 | import java.util.Iterator; 7 | import java.util.logging.Logger; 8 | 9 | import org.apache.commons.httpclient.URIException; 10 | import org.archive.util.iterator.AbstractPeekableIterator; 11 | 12 | public class SURT { 13 | private static final Logger LOG = 14 | Logger.getLogger(SURT.class.getCanonicalName()); 15 | /** Returns input unchanged when it already starts with "("; otherwise returns SURTTokenizer.prefixKey(input), with "," appended when that key contains no "/". A URIException is logged as a warning and the input is returned as is. */ public static String toSURT(String input) { 16 | if(input.startsWith("(")) { 17 | return input; 18 | } 19 | try { 20 | // String tmp = input; 21 | // if(tmp == null) { 22 | // throw new URIException(); 23 | // } 24 | String tmp = SURTTokenizer.prefixKey(input); 25 | if(tmp.contains("/")) { 26 | return tmp; 27 | } 28 | return tmp + ","; 29 | } catch (URIException e) { 30 | LOG.warning("URI Exception for(" + input + "):" + e.getLocalizedMessage()); 31 | // e.printStackTrace(); 32 | return input; 33 | } 34 | } 35 | /** Reads lines from standard input as UTF-8 and prints toSURT of each line to standard output. */ public static void main(String[] args) { 36 | String line; 37 | InputStreamReader isr = new InputStreamReader(System.in,Charset.forName("UTF-8")); 38 | BufferedReader br = new BufferedReader(isr); 39 | Iterator i = AbstractPeekableIterator.wrapReader(br); 40 | while(i.hasNext()) { 41 | line = i.next(); 42 | System.out.println(toSURT(line)); 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/org/archive/url/URLCanonicalizer.java: -------------------------------------------------------------------------------- 1 | package org.archive.url; 2 | 3 | public interface URLCanonicalizer { 4 | public void canonicalize(HandyURL url); 5 | } 6 | -------------------------------------------------------------------------------- /src/main/java/org/archive/url/URLKeyMaker.java: -------------------------------------------------------------------------------- 1 | package 
org.archive.url; 2 | 3 | import java.net.URISyntaxException; 4 | 5 | public interface URLKeyMaker { 6 | public String makeKey(String url) throws URISyntaxException; 7 | } 8 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/CrossProduct.java: -------------------------------------------------------------------------------- 1 | package org.archive.util; 2 | 3 | import java.util.ArrayDeque; 4 | import java.util.ArrayList; 5 | import java.util.Deque; 6 | import java.util.List; 7 | import java.util.Stack; 8 | 9 | public class CrossProduct { 10 | /** Returns every combination that takes one element from each list of listOfLists, in list order; each combination is also printed through dump(). */ public List> crossProduct(List> listOfLists) { 11 | 12 | ArrayList> results = new ArrayList>(); 13 | 14 | Stack current = new Stack(); 15 | Deque> remainder = new ArrayDeque>(listOfLists); 16 | recurse(remainder,current,results); 17 | return results; 18 | } 19 | /** Depth-first expansion: takes the first remaining list, tries each of its elements on the current stack, and restores the list afterwards so sibling branches see it again. */ private void recurse(Deque> remainder, 20 | Stack current, ArrayList> accumulation) { 21 | if(remainder.isEmpty()) { 22 | // all done: 23 | dump(new ArrayList(current)); 24 | accumulation.add(new ArrayList(current)); 25 | 26 | } else { 27 | List cur = remainder.removeFirst(); 28 | for(T o : cur) { 29 | current.push(o); 30 | recurse(remainder,current,accumulation); 31 | current.pop(); 32 | } 33 | remainder.addFirst(cur); 34 | } 35 | } 36 | /** Prints one combination to standard output as "CrossOutput:" followed by its elements joined with commas. */ private void dump(ArrayList a) { 37 | StringBuilder sb = new StringBuilder(); 38 | boolean first = true; /* was false, which wrote a comma in front of the first element */ 39 | for(T o : a) { 40 | if(first) { 41 | first = false; 42 | } else { 43 | sb.append(","); 44 | } 45 | sb.append(o.toString()); 46 | } 47 | System.out.println("CrossOutput:" + sb.toString()); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/FileNameSpec.java: -------------------------------------------------------------------------------- 1 | package org.archive.util; 2 | 3 | import java.util.concurrent.atomic.AtomicInteger; 4 | 5 | public class FileNameSpec { 6 | // private final static 
String DEFAULT_PREFIX_PATTERN = "UNK-%h-%p-%t-%s"; 7 | private AtomicInteger aInt; 8 | private String prefix; 9 | private String suffix; 10 | public FileNameSpec(String prefix, String suffix) { 11 | this.prefix = prefix; 12 | this.suffix = suffix; 13 | aInt = new AtomicInteger(-1); 14 | } 15 | /** Returns prefix, then the incremented counter zero-padded to six digits, then suffix; the counter starts at -1, so the first name carries 000000. */ public String getNextName() { 16 | StringBuilder sb = new StringBuilder(); 17 | sb.append(prefix); 18 | sb.append(String.format("%06d",aInt.incrementAndGet())); 19 | sb.append(suffix); 20 | return sb.toString(); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/IterableLineIterator.java: -------------------------------------------------------------------------------- 1 | package org.archive.util; 2 | 3 | import java.io.Reader; 4 | import java.util.Iterator; 5 | 6 | import org.apache.commons.io.LineIterator; 7 | 8 | /** 9 | * A LineIterator that also implements Iterable, so that it can be used with 10 | * the java enhanced for-each loop syntax. 11 | * 12 | * @author nlevitt 13 | */ 14 | public class IterableLineIterator extends LineIterator 15 | implements Iterable { 16 | 17 | public IterableLineIterator(final Reader reader) 18 | throws IllegalArgumentException { 19 | super(reader); 20 | } 21 | 22 | /** Returns this same object on every call, so a second for-each loop continues where the first one stopped rather than starting over. */ @SuppressWarnings("unchecked") 23 | public Iterator iterator() { 24 | return this; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/ProgressStatisticsReporter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of the Heritrix web crawler (crawler.archive.org). 3 | * 4 | * Licensed to the Internet Archive (IA) by one or more individual 5 | * contributors. 6 | * 7 | * The IA licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. 
You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | package org.archive.util; 20 | 21 | import java.io.IOException; 22 | import java.io.PrintWriter; 23 | 24 | public interface ProgressStatisticsReporter { 25 | /** 26 | * @param writer Where to write statistics. 27 | * @throws IOException 28 | */ 29 | public void progressStatisticsLine(PrintWriter writer) throws IOException; 30 | 31 | /** 32 | * @param writer Where to write statistics legend. 33 | * @throws IOException 34 | */ 35 | public void progressStatisticsLegend(PrintWriter writer) throws IOException; 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/StringParse.java: -------------------------------------------------------------------------------- 1 | package org.archive.util; 2 | 3 | import java.util.List; 4 | import java.util.regex.Pattern; 5 | 6 | public class StringParse { 7 | /* Dotted-quad IPv4 pattern. The word-boundary anchors and the dot separator are escaped; unescaped they were read as the literal letter b and the any-character wildcard, so no address could ever match. */ private final static Pattern IP_PATTERN = 8 | Pattern.compile("\\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.)" 9 | + "{3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\b"); 10 | /** Placeholder check: currently true for any non-empty string (the pattern test is still commented out). */ public static boolean isIP(final String ip) { 11 | // TODO: 12 | return ip.length() > 0; 13 | // return IP_PATTERN.matcher(ip).matches(); 14 | } 15 | /** Returns true when the whole string matches IP_PATTERN. */ public static boolean isIPBad(final String ip) { 16 | return IP_PATTERN.matcher(ip).matches(); 17 | } 18 | /** Joins the parts with a comma. */ public static String join(List p) { 19 | return join(p,","); 20 | } 21 | /** Joins the parts with delim between consecutive elements; an empty list yields the empty string. */ public static String join(List p, String delim) { 22 | StringBuilder sb = new StringBuilder(); 23 | boolean first = true; 24 | for(String part : p) { 25 | if(first) { 26 
| first = false; 27 | } else { 28 | sb.append(delim); 29 | } 30 | sb.append(part); 31 | } 32 | return sb.toString(); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/anvl/Label.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of the Heritrix web crawler (crawler.archive.org). 3 | * 4 | * Licensed to the Internet Archive (IA) by one or more individual 5 | * contributors. 6 | * 7 | * The IA licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | package org.archive.util.anvl; 21 | 22 | /** A SubElement that additionally rejects the colon character. */ class Label extends SubElement { 23 | public static final char COLON = ':'; 24 | 25 | @SuppressWarnings("unused") 26 | private Label() { 27 | this(null); 28 | } 29 | 30 | public Label(final String s) { 31 | super(s); 32 | } 33 | 34 | /** Applies the superclass character check first, then throws IllegalArgumentException when the character is COLON. */ @Override 35 | protected void checkCharacter(char c, String srcStr, int index) { 36 | super.checkCharacter(c, srcStr, index); 37 | if (c == COLON) { 38 | throw new IllegalArgumentException("Label cannot contain " + COLON); 39 | } 40 | } 41 | } -------------------------------------------------------------------------------- /src/main/java/org/archive/util/anvl/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | org.archive.util.anvl package 5 | 6 | 7 | Parsers and Writers for the (expired) Internet-Draft A Name-Value 9 | Language (ANVL). Use {@link org.archive.util.anvl.ANVLRecord} 10 | to create new instances of ANVL Records and for parsing. 11 | 12 |

Implementation Details

13 |

The ANVL Internet-Draft of 14 February, 2005 is not specific as to the 14 | definition of 'blank line' and 'newline'. This parser implementation 15 | assumes CRNL (a carriage return followed by a newline). 16 |

17 |

The draft says "An element consists of a label, a colon, and an optional value". 18 | Should that instead read: "An element consists of a label and an optional value, or a 19 | comment."?

20 | 21 |

The specification is unclear regarding CR or NL in a label or 22 | comment (this implementation disallows CR or NL in labels but lets 23 | them pass in comments).

24 | 25 |

A grammar would help. Here is RFC822: 26 |

27 |      field       =  field-name ":" [ field-body ] CRLF
28 |      
29 |      field-name  =  1*<any CHAR, excluding CTLs, SPACE, and ":">
30 |      
31 |      field-body  =  field-body-contents
32 |                     [CRLF LWSP-char field-body]
33 |      
34 |      field-body-contents =
35 |                    <the ASCII characters making up the field-body, as
36 |                     defined in the following sections, and consisting
37 |                     of combinations of atom, quoted-string, and
38 |                     specials tokens, or else consisting of texts>
39 | 
40 | 41 | 42 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/binsearch/FieldExtractingSLR.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.binsearch; 2 | 3 | import java.io.IOException; 4 | 5 | /** 6 | * Special SLR wrapper (SeekableLineReader) that extracts a certain field 7 | * from the reader and only returns that field 8 | * @author ilya 9 | * 10 | */ 11 | 12 | public class FieldExtractingSLR extends WrappedSeekableLineReader { 13 | protected String sep; 14 | protected int fieldIndex; 15 | 16 | public FieldExtractingSLR(SeekableLineReader slr, int fieldIndex, String sep) { 17 | super(slr); 18 | this.fieldIndex = fieldIndex; 19 | this.sep = sep; 20 | } 21 | 22 | /** Reads the next line from the wrapped reader, splits it on sep (String.split treats sep as a regular expression) and returns the field at fieldIndex; a null line is returned as null. */ @Override 23 | public String readLine() throws IOException { 24 | String line = super.readLine(); 25 | /* Pass a null line through instead of splitting it; presumably the wrapped reader returns null at end of input - confirm. */ if (line == null) { return null; } String[] fields = line.split(sep); 26 | return fields[fieldIndex]; 27 | } 28 | 29 | /** Consumes one line from the wrapped reader without splitting it. */ @Override 30 | public void skipLine() throws IOException { 31 | super.readLine(); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/binsearch/SeekableLineReader.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.binsearch; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | 6 | public interface SeekableLineReader { 7 | public void seek(long offset) throws IOException; 8 | public void seekWithMaxRead(long offset, boolean gzip, int maxLength) throws IOException; 9 | public InputStream getInputStream(); 10 | public String readLine() throws IOException; 11 | public void skipLine() throws IOException; 12 | public void close() throws IOException; 13 | public long getSize() throws IOException; 14 | public void setBufferFully(boolean bufferFully); 15 | public boolean isClosed(); 16 | } 17 | 
-------------------------------------------------------------------------------- /src/main/java/org/archive/util/binsearch/SeekableLineReaderFactory.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.binsearch; 2 | 3 | import java.io.IOException; 4 | 5 | public interface SeekableLineReaderFactory { 6 | public final static int BINSEARCH_BLOCK_SIZE = 8192; 7 | public SeekableLineReader get() throws IOException; 8 | public void close() throws IOException; 9 | public long getModTime(); 10 | public void reload() throws IOException; 11 | } 12 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/binsearch/SeekableLineReaderIterator.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.binsearch; 2 | 3 | import java.io.IOException; 4 | 5 | import org.archive.util.io.RuntimeIOException; 6 | import org.archive.util.iterator.AbstractPeekableIterator; 7 | 8 | public class SeekableLineReaderIterator extends AbstractPeekableIterator { 9 | protected SeekableLineReader slr; 10 | protected boolean propagateException; 11 | 12 | public SeekableLineReaderIterator(SeekableLineReader slr) { 13 | this(slr, true); 14 | } 15 | 16 | public SeekableLineReaderIterator(SeekableLineReader slr, boolean propagateException) { 17 | this.slr = slr; 18 | this.propagateException = propagateException; 19 | } 20 | 21 | /** Returns the next line from slr, or null when slr is null. An IOException is rethrown as RuntimeIOException when propagateException is set; otherwise it is dropped and null is returned. */ @Override 22 | public String getNextInner() { 23 | String next = null; 24 | if (slr != null) { 25 | try { 26 | next = slr.readLine(); 27 | } catch (IOException e) { 28 | if (propagateException) { 29 | throw new RuntimeIOException(e); /* keeps e as the cause; the message is still e.toString() */ 30 | } 31 | } 32 | } 33 | return next; 34 | } 35 | @Override 36 | public void close() throws IOException { 37 | if (slr != null) { 38 | slr.close(); 39 | } 40 | } 41 | } -------------------------------------------------------------------------------- 
/src/main/java/org/archive/util/binsearch/WrappedSeekableLineReader.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.binsearch; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | 6 | /** 7 | * WrappedSeekableLineReader that wraps an existing SeekableLineReader for custom extension 8 | * @author ilya 9 | * 10 | */ 11 | public class WrappedSeekableLineReader implements SeekableLineReader { 12 | 13 | /** The delegate: every SeekableLineReader method of this class forwards to it unchanged. */ protected SeekableLineReader slr; 14 | 15 | public WrappedSeekableLineReader(SeekableLineReader slr) 16 | { 17 | this.slr = slr; 18 | } 19 | 20 | @Override 21 | public void seek(long offset) throws IOException { 22 | this.slr.seek(offset); 23 | } 24 | 25 | @Override 26 | public void seekWithMaxRead(long offset, boolean gzip, int maxLength) 27 | throws IOException { 28 | slr.seekWithMaxRead(offset, gzip, maxLength); 29 | } 30 | 31 | @Override 32 | public InputStream getInputStream() { 33 | return slr.getInputStream(); 34 | } 35 | 36 | @Override 37 | public String readLine() throws IOException { 38 | return slr.readLine(); 39 | } 40 | 41 | @Override 42 | public void close() throws IOException { 43 | slr.close(); 44 | } 45 | 46 | @Override 47 | public long getSize() throws IOException { 48 | return slr.getSize(); 49 | } 50 | 51 | @Override 52 | public void setBufferFully(boolean bufferFully) { 53 | slr.setBufferFully(bufferFully); 54 | } 55 | 56 | @Override 57 | public boolean isClosed() { 58 | return slr.isClosed(); 59 | } 60 | 61 | @Override 62 | public void skipLine() throws IOException { 63 | slr.skipLine(); 64 | } 65 | 66 | } 67 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/binsearch/impl/HDFSSeekableLineReader.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.binsearch.impl; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | 6 | 
import org.apache.hadoop.fs.FSDataInputStream; 7 | import org.archive.util.binsearch.AbstractSeekableLineReader; 8 | 9 | import com.google.common.io.ByteStreams; 10 | 11 | public class HDFSSeekableLineReader extends AbstractSeekableLineReader { 12 | private FSDataInputStream fsdis; 13 | private long length; 14 | 15 | public HDFSSeekableLineReader(FSDataInputStream fsdis, long length, 16 | int blockSize) { 17 | super(blockSize); 18 | this.fsdis = fsdis; 19 | this.length = length; 20 | } 21 | 22 | /** Seeks the HDFS stream to offset and returns it, wrapped in a maxLength-byte limit when maxLength is zero or positive; a negative maxLength returns the raw stream. */ public InputStream doSeekLoad(long offset, int maxLength) throws IOException { 23 | fsdis.seek(offset); 24 | 25 | if (maxLength >= 0) { 26 | return ByteStreams.limit(fsdis, maxLength); 27 | } else { 28 | return fsdis; 29 | } 30 | } 31 | 32 | /** Returns the stream position. NOTE(review): fsdis is set to null by doClose(), so a call after that would throw NullPointerException. */ public long getOffset() throws IOException { 33 | return fsdis.getPos(); 34 | } 35 | 36 | public void doClose() throws IOException { 37 | //Superclass closes the input stream 38 | fsdis = null; 39 | } 40 | 41 | /** Returns the length handed to the constructor; nothing is queried from the stream. */ public long getSize() throws IOException { 42 | return length; 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/binsearch/impl/HDFSSeekableLineReaderFactory.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.binsearch.impl; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.fs.FSDataInputStream; 6 | import org.apache.hadoop.fs.FileStatus; 7 | import org.apache.hadoop.fs.FileSystem; 8 | import org.apache.hadoop.fs.Path; 9 | import org.archive.util.binsearch.SeekableLineReader; 10 | import org.archive.util.binsearch.SeekableLineReaderFactory; 11 | 12 | public class HDFSSeekableLineReaderFactory implements SeekableLineReaderFactory { 13 | private FileSystem fs; 14 | private Path path; 15 | public HDFSSeekableLineReaderFactory(FileSystem fs, Path path) { 16 | this.fs = fs; 17 | this.path = path; 18 | } 19 | public SeekableLineReader get() throws IOException { 20 | 
FileStatus status = fs.getFileStatus(path); 21 | if(status.isDir()) { 22 | throw new IOException("Path:" + path.toUri().toASCIIString() + " is a directory!"); 23 | } 24 | long length = status.getLen(); 25 | FSDataInputStream fsdis = fs.open(path); 26 | /* The block size is fixed at 4096 here rather than taken from BINSEARCH_BLOCK_SIZE. */ return new HDFSSeekableLineReader(fsdis, length, 4096); 27 | } 28 | 29 | /** Closes the FileSystem given to the constructor. NOTE(review): confirm that this FileSystem is not shared with other users. */ public void close() throws IOException 30 | { 31 | if (this.fs != null) { 32 | fs.close(); 33 | } 34 | } 35 | 36 | /** Returns the modification time of path, or 0 when the status lookup throws IOException. */ public long getModTime() 37 | { 38 | try { 39 | return fs.getFileStatus(path).getModificationTime(); 40 | } catch (IOException e) { 41 | return 0; 42 | } 43 | } 44 | @Override 45 | public void reload() throws IOException { 46 | // TODO Auto-generated method stub 47 | 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/binsearch/impl/MappedSeekableLineReader.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.binsearch.impl; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | 6 | import org.archive.util.binsearch.AbstractSeekableLineReader; 7 | import org.archive.util.binsearch.ByteBufferInputStream; 8 | 9 | import com.google.common.io.ByteStreams; 10 | 11 | public class MappedSeekableLineReader extends AbstractSeekableLineReader { 12 | 13 | private ByteBufferInputStream bbis; 14 | 15 | public MappedSeekableLineReader(ByteBufferInputStream bbis, int blockSize) throws IOException { 16 | super(blockSize); 17 | this.bbis = bbis; 18 | } 19 | 20 | /** Returns the buffer position, or 0 once the reader is closed. */ public long getOffset() throws IOException 21 | { 22 | if (closed) { 23 | return 0; 24 | } 25 | 26 | return bbis.position(); 27 | } 28 | 29 | /** Moves the buffer stream to offset and returns it, limited to maxLength bytes only when maxLength is strictly positive. */ @Override 30 | protected InputStream doSeekLoad(long offset, int maxLength) 31 | throws IOException { 32 | 33 | bbis.position(offset); 34 | 35 | if (maxLength > 0) { 36 | return ByteStreams.limit(bbis, maxLength); 37 | } else { 38 | return bbis; 39 | } 40 | } 41 | 42 | @Override 43 | public long 
getSize() throws IOException { 44 | return bbis.length(); 45 | } 46 | 47 | @Override 48 | protected void doClose() throws IOException { 49 | bbis = null; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/binsearch/impl/RandomAccessFileSeekableLineReader.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.binsearch.impl; 2 | 3 | import java.io.FileInputStream; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import java.io.RandomAccessFile; 7 | 8 | import org.archive.util.binsearch.AbstractSeekableLineReader; 9 | 10 | import com.google.common.io.ByteStreams; 11 | 12 | public class RandomAccessFileSeekableLineReader extends AbstractSeekableLineReader { 13 | 14 | private RandomAccessFile raf; 15 | 16 | public RandomAccessFileSeekableLineReader(RandomAccessFile raf, int blockSize) { 17 | super(blockSize); 18 | this.raf = raf; 19 | } 20 | 21 | /** Seeks raf to offset and returns a new FileInputStream over the same file descriptor, limited to maxLength bytes only when maxLength is strictly positive. */ public InputStream doSeekLoad(long offset, int maxLength) throws IOException { 22 | raf.seek(offset); 23 | 24 | FileInputStream fis = new FileInputStream(raf.getFD()); 25 | 26 | if (maxLength > 0) { 27 | return ByteStreams.limit(fis, maxLength); 28 | } else { 29 | return fis; 30 | } 31 | } 32 | 33 | /** Returns the file pointer of raf, or 0 once the reader is closed. */ public long getOffset() throws IOException 34 | { 35 | if (closed) { 36 | return 0; 37 | } 38 | 39 | return raf.getFilePointer(); 40 | } 41 | 42 | /** Closes raf if it is still set and clears the reference, so a second call does nothing. */ public void doClose() throws IOException { 43 | if (raf != null) { 44 | raf.close(); 45 | } 46 | raf = null; 47 | } 48 | 49 | /** Returns the file length. NOTE(review): raf is null after doClose(), so a call after that would throw NullPointerException. */ public long getSize() throws IOException { 50 | return raf.length(); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/binsearch/impl/RandomAccessFileSeekableLineReaderFactory.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.binsearch.impl; 2 | 3 | import 
java.io.File; 4 | import java.io.IOException; 5 | import java.io.RandomAccessFile; 6 | 7 | import org.archive.util.binsearch.SeekableLineReader; 8 | import org.archive.util.binsearch.SeekableLineReaderFactory; 9 | 10 | public class RandomAccessFileSeekableLineReaderFactory implements SeekableLineReaderFactory { 11 | private File file; 12 | private int blockSize = BINSEARCH_BLOCK_SIZE; 13 | 14 | public RandomAccessFileSeekableLineReaderFactory(File file) { 15 | this.file = file; 16 | } 17 | public RandomAccessFileSeekableLineReaderFactory(File file, int blockSize) { 18 | this.file = file; 19 | this.blockSize = blockSize; 20 | } 21 | /** Opens the file read-only in a new RandomAccessFile on every call and wraps it in a reader using blockSize. */ public SeekableLineReader get() throws IOException { 22 | return new RandomAccessFileSeekableLineReader(new RandomAccessFile(file, "r"), 23 | blockSize); 24 | } 25 | /** Drops the file reference; no handle is held by the factory itself. */ public void close() throws IOException { 26 | this.file = null; 27 | } 28 | 29 | /** Returns the file's last-modified time, or 0 once close() has cleared the file reference. */ public long getModTime() 30 | { 31 | return (file == null) ? 0 : file.lastModified(); /* file is null after close(); previously this threw NullPointerException */ 32 | } 33 | 34 | @Override 35 | public void reload() throws IOException { 36 | //RAF created each time, nothing to reload 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/io/BytesReadObserver.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.io; 2 | 3 | public interface BytesReadObserver { 4 | void notifyBytesRead(int amt); 5 | } 6 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/io/CRCInputStream.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.io; 2 | import java.io.IOException; 3 | import java.io.InputStream; 4 | import java.util.zip.CRC32; 5 | 6 | public class CRCInputStream extends InputStream { 7 | private InputStream is = null; 8 | private CRC32 crc = null; 9 | private long count = 0; 10 | public CRCInputStream(InputStream is) { 11 | this(is,new 
CRC32()); 12 | } 13 | public CRCInputStream(InputStream is, CRC32 crc) { 14 | this.is = is; 15 | this.crc = crc; 16 | count = 0; 17 | } 18 | @Override 19 | public int read() throws IOException { 20 | int b = is.read(); 21 | if(b != -1) { 22 | crc.update(b); 23 | count++; 24 | } 25 | return b; 26 | } 27 | public int read(byte[] b) throws IOException { 28 | return read(b,0,b.length); 29 | } 30 | public int read(byte[] b, int off, int len) throws IOException { 31 | int amt = is.read(b, off, len); 32 | if(amt > -1) { 33 | count += amt; 34 | crc.update(b, off, amt); 35 | } 36 | return amt; 37 | } 38 | public long getCRCValue() { 39 | return crc.getValue(); 40 | } 41 | public long getByteCount() { 42 | return count; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/io/CRCOutputStream.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.io; 2 | 3 | import java.io.IOException; 4 | import java.io.OutputStream; 5 | import java.util.zip.CRC32; 6 | 7 | /** OutputStream that forwards every write to the wrapped stream while updating a CRC32 and a byte counter, optionally flushing after each write. */ public class CRCOutputStream extends OutputStream { 8 | OutputStream os = null; 9 | private CRC32 crc = null; 10 | boolean autoFlush = false; 11 | long bytesWritten = 0; 12 | public CRCOutputStream(OutputStream os) { 13 | this(os,false); 14 | } 15 | public CRCOutputStream(OutputStream os, boolean autoFlush) { 16 | this.os = os; 17 | this.crc = new CRC32(); 18 | this.autoFlush = autoFlush; 19 | bytesWritten = 0; 20 | } 21 | 22 | @Override 23 | public void write(int b) throws IOException { 24 | crc.update(b); 25 | os.write(b); 26 | if(autoFlush) 27 | os.flush(); 28 | bytesWritten++; 29 | } 30 | @Override 31 | public void write(byte[] b) throws IOException { 32 | write(b,0,b.length); 33 | } 34 | /** Writes len bytes of b starting at off to the wrapped stream and adds the same range to the CRC and the byte count. */ @Override 35 | public void write(byte[] b, int off, int len) throws IOException { 36 | crc.update(b, off, len); 37 | os.write(b,off,len); /* was os.write(b,0,len): the offset was ignored, so the bytes written differed from the bytes checksummed whenever off was not 0 */ 38 | if(autoFlush) { 39 | os.flush(); 40 | } 41 | 
bytesWritten += len; 42 | } 43 | public long getCRCValue() { 44 | return crc.getValue(); 45 | } 46 | public long getBytesWritten() { 47 | return bytesWritten; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/io/CommitedOutputStream.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.io; 2 | 3 | import java.io.FilterOutputStream; 4 | import java.io.IOException; 5 | import java.io.OutputStream; 6 | 7 | /** FilterOutputStream with an additional abstract commit() operation that subclasses must supply. */ public abstract class CommitedOutputStream extends FilterOutputStream { 8 | public CommitedOutputStream(OutputStream arg0) { 9 | super(arg0); 10 | } 11 | public abstract void commit() throws IOException; 12 | } 13 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/io/EOFNotifyingInputStream.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.io; 2 | 3 | import java.io.FilterInputStream; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | 7 | /** FilterInputStream that tells an EOFObserver the first time a read returns -1. */ public class EOFNotifyingInputStream extends FilterInputStream { 8 | EOFObserver observer; 9 | boolean notified = false; 10 | public EOFNotifyingInputStream(InputStream in, EOFObserver observer) { 11 | super(in); 12 | this.observer = observer; 13 | } 14 | /** Calls observer.notifyEOF() at most once; later calls do nothing, and a null observer is tolerated. */ private void doNotify() throws IOException { 15 | if(!notified) { 16 | notified = true; 17 | if(observer != null) { 18 | observer.notifyEOF(); 19 | } 20 | } 21 | } 22 | 23 | @Override 24 | public int read() throws IOException { 25 | int amtRead = super.read(); 26 | if(amtRead == -1) { 27 | doNotify(); 28 | } 29 | return amtRead; 30 | } 31 | 32 | @Override 33 | public int read(byte[] b) throws IOException { 34 | return read(b,0,b.length); 35 | } 36 | 37 | @Override 38 | public int read(byte[] b, int off, int len) throws IOException { 39 | int amtRead = super.read(b, off, len); 40 | if(amtRead == 
-1) { 41 | doNotify(); 42 | } 43 | return amtRead; 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/io/EOFObserver.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.io; 2 | 3 | import java.io.IOException; 4 | 5 | public interface EOFObserver { 6 | public void notifyEOF() throws IOException; 7 | } 8 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/io/NotifyingInputStream.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.io; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | 6 | /** InputStream wrapper that reports the number of bytes delivered by each read to a BytesReadObserver. */ public class NotifyingInputStream extends InputStream { 7 | InputStream wrapped; 8 | BytesReadObserver observer; 9 | public NotifyingInputStream(InputStream wrapped, 10 | BytesReadObserver observer) { 11 | this.wrapped = wrapped; 12 | this.observer = observer; 13 | } 14 | /** Reports amt to the observer when it is positive and returns amt unchanged. */ private int notifyRead(int amt) { 15 | if(amt > 0) { 16 | observer.notifyBytesRead(amt); 17 | } 18 | return amt; 19 | } 20 | /** Reads one byte and, unless end of stream was reached, reports exactly one byte read. */ @Override 21 | public int read() throws IOException { 22 | int b = wrapped.read(); /* read() returns the byte value (0-255) or -1, not a count, so it must not be passed to notifyRead as an amount */ if(b != -1) { notifyRead(1); } return b; 23 | } 24 | @Override 25 | public int read(byte[] b) throws IOException { 26 | return notifyRead(wrapped.read(b)); 27 | } 28 | @Override 29 | public int read(byte[] b, int o, int l) throws IOException { 30 | return notifyRead(wrapped.read(b,o,l)); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/io/PushBackOneByteInputStream.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.io; 2 | 3 | import java.io.IOException; 4 | 5 | public interface PushBackOneByteInputStream { 6 | public void pushback() throws IOException; 7 | public int read() throws IOException; 8 | } 9 | 
-------------------------------------------------------------------------------- /src/main/java/org/archive/util/io/RuntimeIOException.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.io; 2 | 3 | /** Unchecked exception that carries an integer status alongside the usual message and cause. */ public class RuntimeIOException extends RuntimeException { 4 | private static final long serialVersionUID = 4762025404760379497L; 5 | 6 | /** Status carried by the exception; stays 503 unless a constructor taking a status overrides it. NOTE(review): presumably an HTTP status code - confirm. */ private int status = 503; 7 | 8 | public RuntimeIOException() 9 | { 10 | 11 | } 12 | 13 | public RuntimeIOException(String message) 14 | { 15 | super(message); 16 | } 17 | 18 | public RuntimeIOException(int status) 19 | { 20 | this.status = status; 21 | } 22 | 23 | public RuntimeIOException(Throwable cause) 24 | { 25 | super(cause); 26 | } 27 | 28 | public RuntimeIOException(int status, Throwable cause) 29 | { 30 | super(cause); 31 | this.status = status; 32 | } 33 | 34 | public int getStatus() 35 | { 36 | return status; 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/iterator/BoundedStringIterator.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.iterator; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | 6 | public class BoundedStringIterator extends AbstractPeekableIterator 7 | implements CloseableIterator { 8 | 9 | private Iterator inner; 10 | private String boundary; 11 | private boolean inclusive; 12 | private int flip; 13 | 14 | public BoundedStringIterator(Iterator inner, String boundary) { 15 | this(inner, boundary, false); 16 | } 17 | 18 | public BoundedStringIterator(Iterator inner, String boundary, boolean inclusive) { 19 | this(inner, boundary, inclusive, false); 20 | } 21 | 22 | public BoundedStringIterator(Iterator inner, String boundary, boolean inclusive, boolean reverse) { 23 | this.inner = inner; 24 | this.boundary = boundary; 25 | this.inclusive = inclusive; 26 | this.flip = (reverse 
? -1 : 1); 27 | } 28 | 29 | @Override 30 | public String getNextInner() { 31 | String tmp = null; 32 | if(inner.hasNext()) { 33 | tmp = inner.next(); 34 | if(tmp.compareTo(boundary) * flip >= 0 && (!inclusive || !tmp.startsWith(boundary))) { 35 | tmp = null; 36 | try { 37 | close(); 38 | } catch (IOException e) { 39 | throw new RuntimeException(e); 40 | } 41 | } 42 | } 43 | return tmp; 44 | } 45 | 46 | public void close() throws IOException { 47 | CloseableIteratorUtil.attemptClose(inner); 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/iterator/CachingStringFilter.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.iterator; 2 | 3 | import java.util.LinkedHashMap; 4 | import java.util.Map; 5 | 6 | public class CachingStringFilter implements StringFilter { 7 | private LRUCache cache; 8 | private StringFilter inner; 9 | public CachingStringFilter(StringFilter inner, int max) { 10 | this.inner = inner; 11 | cache = new LRUCache(max); 12 | } 13 | 14 | public boolean isFiltered(String text) { 15 | Boolean v = cache.remove(text); 16 | if(v == null) { 17 | v = inner.isFiltered(text); 18 | } 19 | cache.put(text, v); 20 | return v; 21 | } 22 | 23 | public class LRUCache extends LinkedHashMap { 24 | /** */ 25 | private static final long serialVersionUID = 1L; 26 | private int max = 100; 27 | 28 | public LRUCache(int max) { 29 | this.max = max; 30 | } 31 | 32 | protected boolean removeEldestEntry(Map.Entry eldest) { 33 | return (size() > max); 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/iterator/CloseableCompositeIterator.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.iterator; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | import 
java.util.LinkedList; 6 | 7 | public class CloseableCompositeIterator implements CloseableIterator { 8 | 9 | protected LinkedList> iters; 10 | protected Iterator> iterPtr; 11 | protected CloseableIterator currIter; 12 | 13 | public CloseableCompositeIterator() 14 | { 15 | iters = new LinkedList>(); 16 | } 17 | 18 | public void addFirst(CloseableIterator e) 19 | { 20 | iters.addFirst(e); 21 | } 22 | 23 | public void addLast(CloseableIterator e) 24 | { 25 | iters.addLast(e); 26 | } 27 | 28 | @Override 29 | public boolean hasNext() { 30 | 31 | if (iterPtr == null) { 32 | iterPtr = iters.iterator(); 33 | currIter = iterPtr.next(); 34 | } 35 | 36 | if (currIter == null) { 37 | return false; 38 | } 39 | 40 | while (currIter != null) { 41 | if (currIter.hasNext()) { 42 | return true; 43 | } 44 | 45 | currIter = (iterPtr.hasNext() ? iterPtr.next() : null); 46 | } 47 | 48 | return false; 49 | } 50 | 51 | @Override 52 | public E next() { 53 | return currIter.next(); 54 | } 55 | 56 | @Override 57 | public void remove() { 58 | currIter.remove(); 59 | } 60 | 61 | @Override 62 | public void close() throws IOException { 63 | for (CloseableIterator e : iters) { 64 | if (e != null) { 65 | try { 66 | e.close(); 67 | } catch (IOException io) { 68 | 69 | } 70 | } 71 | } 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/iterator/CloseableIterator.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.iterator; 2 | 3 | import java.io.Closeable; 4 | import java.util.Iterator; 5 | 6 | /** 7 | * Iterator with a close method that frees up any resources associated with 8 | * the Iterator. 
9 | * 10 | * @author brad 11 | * @version $Date$, $Revision$ 12 | * @param 13 | */ 14 | public interface CloseableIterator extends Iterator, Closeable { 15 | } 16 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/iterator/CloseableIteratorUtil.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.iterator; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | 6 | public class CloseableIteratorUtil { 7 | public static void attemptClose(Iterator i) throws IOException { 8 | if(i instanceof CloseableIterator) { 9 | ((CloseableIterator) i).close(); 10 | } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/iterator/CloseableIteratorWrapper.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.iterator; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | 6 | /** 7 | * Wrap a regular Iterator<S> to create a CloseableIterator<S> where the close() is a no-op 8 | * @author ilya 9 | * 10 | * @param 11 | */ 12 | 13 | public class CloseableIteratorWrapper implements CloseableIterator 14 | { 15 | protected Iterator iter; 16 | 17 | public CloseableIteratorWrapper(Iterator iter) 18 | { 19 | this.iter = iter; 20 | } 21 | 22 | @Override 23 | public boolean hasNext() { 24 | return this.iter.hasNext(); 25 | } 26 | 27 | @Override 28 | public S next() { 29 | return this.iter.next(); 30 | } 31 | 32 | @Override 33 | public void remove() { 34 | this.iter.remove(); 35 | 36 | } 37 | 38 | @Override 39 | public void close() throws IOException { 40 | //No Op 41 | } 42 | } -------------------------------------------------------------------------------- /src/main/java/org/archive/util/iterator/PeekableIterator.java: -------------------------------------------------------------------------------- 1 
| package org.archive.util.iterator; 2 | 3 | public interface PeekableIterator extends CloseableIterator { 4 | public E peek(); 5 | } 6 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/iterator/PrefixMatchStringIterator.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.iterator; 2 | 3 | import java.io.IOException; 4 | 5 | 6 | public class PrefixMatchStringIterator extends AbstractPeekableIterator 7 | { 8 | private boolean first = true; 9 | private String key; 10 | private CloseableIterator inner; 11 | 12 | public PrefixMatchStringIterator(CloseableIterator inner, String key, boolean alwaysIncludeFirst) 13 | { 14 | this.inner = inner; 15 | this.key = key; 16 | this.first = alwaysIncludeFirst; 17 | } 18 | 19 | @Override 20 | public String getNextInner() { 21 | 22 | if (!inner.hasNext()) { 23 | return null; 24 | } 25 | 26 | String blockLine = inner.next(); 27 | 28 | // only compare the correct length: 29 | String prefCmp = key; 30 | 31 | if (first) { 32 | // always add first: 33 | first = false; 34 | } else if (!blockLine.startsWith(prefCmp)) { 35 | return null; 36 | } 37 | 38 | return blockLine; 39 | } 40 | 41 | @Override 42 | public void close() throws IOException { 43 | inner.close(); 44 | } 45 | } -------------------------------------------------------------------------------- /src/main/java/org/archive/util/iterator/StartBoundedStringIterator.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.iterator; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | 6 | public class StartBoundedStringIterator extends AbstractPeekableIterator { 7 | 8 | private Iterator inner; 9 | private String boundary; 10 | private boolean done = false; 11 | private boolean started = false; 12 | private int flip = 1; 13 | 14 | public StartBoundedStringIterator(Iterator inner, 
String boundary) { 15 | this(inner, boundary, false); 16 | } 17 | 18 | public StartBoundedStringIterator(Iterator inner, String boundary, boolean reverse) { 19 | this.inner = inner; 20 | this.boundary = boundary; 21 | this.done = false; 22 | this.started = false; 23 | this.flip = (reverse ? -1 : 1); 24 | } 25 | 26 | @Override 27 | public String getNextInner() { 28 | if(done) { 29 | return null; 30 | } 31 | if(started) { 32 | if(inner.hasNext()) { 33 | String tmp = inner.next(); 34 | if(tmp == null) { 35 | done = true; 36 | return null; 37 | } 38 | return tmp; 39 | } 40 | } 41 | while(inner.hasNext()) { 42 | String tmp = inner.next(); 43 | 44 | int cmp = boundary.compareTo(tmp) * flip; 45 | 46 | if ((cmp <= 0)) { 47 | started = true; 48 | return tmp; 49 | } 50 | } 51 | try { 52 | close(); 53 | } catch(IOException e) { 54 | throw new RuntimeException(e); 55 | } 56 | done = true; 57 | return null; 58 | } 59 | 60 | public void close() throws IOException { 61 | CloseableIteratorUtil.attemptClose(inner); 62 | } 63 | 64 | 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/iterator/StringFilter.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.iterator; 2 | 3 | public interface StringFilter { 4 | public boolean isFiltered(String text); 5 | } 6 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/iterator/StringTransformer.java: -------------------------------------------------------------------------------- 1 | package org.archive.util.iterator; 2 | 3 | public interface StringTransformer { 4 | public String transform(String input); 5 | } 6 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/iterator/TransformingPrefixStringFilter.java: 
-------------------------------------------------------------------------------- 1 | package org.archive.util.iterator; 2 | 3 | import java.util.Collection; 4 | import java.util.TreeSet; 5 | 6 | public class TransformingPrefixStringFilter implements StringFilter { 7 | TreeSet filters; 8 | StringTransformer transformer; 9 | 10 | public TransformingPrefixStringFilter(Collection blocks) { 11 | this(blocks,null); 12 | } 13 | public TransformingPrefixStringFilter(Collection blocks, 14 | StringTransformer transformer) { 15 | filters = makeTreeSet(blocks,transformer); 16 | this.transformer = transformer; 17 | } 18 | 19 | public static TreeSet makeTreeSet(Collection blocks, 20 | StringTransformer trans) { 21 | TreeSet tmp = new TreeSet(); 22 | for(String filter : blocks) { 23 | if(trans != null) { 24 | filter = trans.transform(filter); 25 | } 26 | String possiblePrefix = tmp.floor(filter); 27 | if (possiblePrefix != null && filter.startsWith(possiblePrefix)) { 28 | // don't add - a prefix is already in the set: 29 | } else { 30 | // is this a prefix of the existing item? 31 | String possibleLonger = tmp.ceiling(filter); 32 | if(possibleLonger == null) { 33 | } else if(possibleLonger.startsWith(filter)) { 34 | tmp.remove(possibleLonger); 35 | } 36 | tmp.add(filter); 37 | } 38 | } 39 | return tmp; 40 | } 41 | 42 | public boolean isFiltered(String text) { 43 | if(transformer != null) { 44 | text = transformer.transform(text); 45 | } 46 | String possiblePrefix = filters.floor(text); 47 | return (possiblePrefix != null && text.startsWith(possiblePrefix)); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/org/archive/util/zip/NoGzipMagicException.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of the Heritrix web crawler (crawler.archive.org). 3 | * 4 | * Licensed to the Internet Archive (IA) by one or more individual 5 | * contributors. 
6 | * 7 | * The IA licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | package org.archive.util.zip; 20 | 21 | import java.io.IOException; 22 | 23 | public class NoGzipMagicException extends IOException { 24 | 25 | private static final long serialVersionUID = 3084169624430655013L; 26 | 27 | public NoGzipMagicException() { 28 | super(); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/resources/org/archive/commons.properties: -------------------------------------------------------------------------------- 1 | operator= 2 | publisher= 3 | wat.warcinfo.description= 4 | warc.format=WARC File Format 1.0 5 | warc.format.conforms.to=http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf 6 | -------------------------------------------------------------------------------- /src/main/resources/org/archive/ia-web-commons-version.txt: -------------------------------------------------------------------------------- 1 | ia-web-commons.${pom.version}-${build.time} 2 | -------------------------------------------------------------------------------- /src/test/java/org/archive/extract/RealCDXExtractorOutputTest.java: -------------------------------------------------------------------------------- 1 | package org.archive.extract; 2 | 3 | import java.net.MalformedURLException; 4 | import java.net.URI; 5 | import java.net.URISyntaxException; 6 | import 
java.net.URL; 7 | import java.net.URLEncoder; 8 | 9 | import junit.framework.TestCase; 10 | 11 | 12 | public class RealCDXExtractorOutputTest extends TestCase { 13 | 14 | public void testEscapeResolvedUrl() throws Exception { 15 | String context ="http://www.uni-giessen.de/cms/studium/dateien/informationberatung/merkblattpdf"; 16 | String spec = "http://fss.plone.uni-giessen.de/fß/studium/dateien/informationberatung/merkblattpdf/file/Mérkblatt zur Gestaltung von Nachteilsausgleichen.pdf?föo=bar#änchor"; 17 | String escaped = RealCDXExtractorOutput.resolve(context, spec); 18 | assertTrue(escaped.indexOf(" ") < 0); 19 | URI parsed = new URI(escaped); 20 | assertEquals("änchor", parsed.getFragment()); 21 | } 22 | 23 | public void testNoDoubleEscaping() throws Exception { 24 | String spec = "https://www.google.com/search?q=java+escape+url+spaces&ie=utf-8&oe=utf-8"; 25 | String resolved = RealCDXExtractorOutput.resolve(spec, spec); 26 | assertTrue(spec.equals(resolved)); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/test/java/org/archive/format/gzip/GZIPMemberWriterTest.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.gzip; 2 | 3 | import java.io.ByteArrayInputStream; 4 | import java.io.File; 5 | import java.io.FileOutputStream; 6 | import java.io.IOException; 7 | 8 | import org.archive.util.IAUtils; 9 | 10 | import junit.framework.TestCase; 11 | 12 | public class GZIPMemberWriterTest extends TestCase { 13 | 14 | public void testWrite() throws IOException { 15 | File outFile = File.createTempFile("tmp", ".gz"); 16 | GZIPMemberWriter gzw = new GZIPMemberWriter(new FileOutputStream(outFile)); 17 | gzw.write(new ByteArrayInputStream("Here is record 1".getBytes(IAUtils.UTF8))); 18 | gzw.write(new ByteArrayInputStream("Here is record 2".getBytes(IAUtils.UTF8))); 19 | } 20 | 21 | } 22 | 
-------------------------------------------------------------------------------- /src/test/java/org/archive/format/json/CompoundORJSONPathSpecTest.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.json; 2 | 3 | import java.util.ArrayList; 4 | 5 | import org.archive.util.TestUtils; 6 | import com.github.openjson.JSONException; 7 | import com.github.openjson.JSONObject; 8 | 9 | import junit.framework.TestCase; 10 | 11 | public class CompoundORJSONPathSpecTest extends TestCase { 12 | String json1S = "{\"a\":\"A\"}"; 13 | String json2S = "{\"b\":\"B\"}"; 14 | public void testExtract() throws JSONException { 15 | JSONObject json1 = new JSONObject(json1S); 16 | JSONObject json2 = new JSONObject(json2S); 17 | ArrayList parts = new ArrayList(); 18 | parts.add(new SimpleJSONPathSpec("a")); 19 | parts.add(new SimpleJSONPathSpec("b")); 20 | 21 | JSONPathSpec comp = new CompoundORJSONPathSpec(parts); 22 | TestUtils.dumpMatch("json1", comp.extract(json1)); 23 | TestUtils.assertLoLMatches(new String[][]{{"A"}}, comp.extract(json1)); 24 | TestUtils.assertLoLMatches(new String[][]{{"B"}}, comp.extract(json2)); 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/test/java/org/archive/format/json/JSONViewTest.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.json; 2 | 3 | import org.archive.util.TestUtils; 4 | import com.github.openjson.JSONException; 5 | import com.github.openjson.JSONObject; 6 | 7 | import junit.framework.TestCase; 8 | 9 | public class JSONViewTest extends TestCase { 10 | 11 | public int getInt(byte b[]) { 12 | return b[0] & 0xff; 13 | } 14 | 15 | public void testBytes() throws JSONException { 16 | JSONObject o = new JSONObject(); 17 | o.append("name1", "val\\rue1"); 18 | String json = o.toString(); 19 | System.out.format("once: (%s)\n",json); 20 | JSONObject o2 = new 
JSONObject(json); 21 | System.out.format("twice: (%s)\n",o2.toString()); 22 | 23 | 24 | byte b[] = new byte[2]; 25 | for(int i = 0; i < 256; i++) { 26 | b[0] = (byte) i; 27 | int gi = getInt(b); 28 | System.out.format("I(%d) gi(%d)\n",i,gi); 29 | } 30 | } 31 | public void testApply() throws JSONException { 32 | String json1S = "{\"url\":\"a\",\"link\":[{\"zz\":\"1\",\"qq\":\"qa\"},{\"zz2\":\"2\",\"qq\":\"qb\"},{\"zz\":\"3\",\"qq\":\"qc\"},{\"zz\":\"4\"}]}"; 33 | JSONObject json1 = new JSONObject(json1S); 34 | 35 | JSONView view = new JSONView("url","@link.zz"); 36 | TestUtils.assertLoLMatches(new String[][]{{"a","1"},{"a",""},{"a","3"},{"a","4"}}, 37 | view.apply(json1)); 38 | 39 | view = new JSONView("url","@link.{zz,qq}"); 40 | TestUtils.assertLoLMatches(new String[][]{{"a","1","qa"},{"a","","qb"},{"a","3","qc"},{"a","4",""}}, 41 | view.apply(json1)); 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /src/test/java/org/archive/format/json/SimpleJSONPathSpecTest.java: -------------------------------------------------------------------------------- 1 | package org.archive.format.json; 2 | 3 | import org.archive.util.TestUtils; 4 | import com.github.openjson.JSONException; 5 | import com.github.openjson.JSONObject; 6 | 7 | import junit.framework.TestCase; 8 | 9 | public class SimpleJSONPathSpecTest extends TestCase { 10 | String json1 = "{\"a\": { \"b\": \"Foo\" }}"; 11 | String json2 = "{\"a\": { \"b\": [{\"a\":\"1\"},{\"a\":\"2\"}] }}"; 12 | 13 | String json3 = "{\"a\": { \"b\": {\"A\":\"11\",\"B\":\"22\"} }}"; 14 | String json4 = "{\"a\": { \"b\": [{\"A\":\"11\",\"B\":\"22\"},{\"A\":\"33\",\"B\":\"44\"}] }}"; 15 | 16 | public void testExtract() throws JSONException { 17 | JSONObject json = new JSONObject(json1); 18 | JSONPathSpec spec = new SimpleJSONPathSpec("a.b"); 19 | TestUtils.dumpMatch("json1", spec.extract(json)); 20 | TestUtils.assertLoLMatches(new String[][]{{"Foo"}}, spec.extract(json)); 21 | 22 | 
json = new JSONObject(json2); 23 | spec = new SimpleJSONPathSpec("a.@b.a"); 24 | TestUtils.dumpMatch("json2", spec.extract(json)); 25 | TestUtils.assertLoLMatches(new String[][]{{"1"},{"2"}}, spec.extract(json)); 26 | 27 | json = new JSONObject(json3); 28 | spec = new SimpleJSONPathSpec("a.b.{A,B}"); 29 | TestUtils.dumpMatch("json3", spec.extract(json)); 30 | TestUtils.assertLoLMatches(new String[][]{{"11","22"}}, spec.extract(json)); 31 | 32 | json = new JSONObject(json4); 33 | spec = new SimpleJSONPathSpec("a.@b.{A,B}"); 34 | TestUtils.dumpMatch("json4", spec.extract(json)); 35 | TestUtils.assertLoLMatches(new String[][]{{"11","22"},{"33","44"}}, spec.extract(json)); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/test/java/org/archive/io/warc/WARCReaderFactoryTest.java: -------------------------------------------------------------------------------- 1 | package org.archive.io.warc; 2 | 3 | import java.io.FileInputStream; 4 | import java.io.IOException; 5 | 6 | import org.archive.format.warc.WARCConstants; 7 | import org.archive.format.warc.WARCConstants.WARCRecordType; 8 | import org.archive.io.ArchiveReader; 9 | import org.archive.io.ArchiveRecord; 10 | 11 | import junit.framework.TestCase; 12 | 13 | public class WARCReaderFactoryTest extends TestCase { 14 | 15 | // Test files: 16 | String[] files = new String[] { 17 | "src/test/resources/org/archive/format/gzip/IAH-urls-wget.warc.gz", 18 | "src/test/resources/org/archive/format/warc/IAH-urls-wget.warc" 19 | }; 20 | 21 | public void testGetStringInputstreamBoolean() throws IOException { 22 | // Check the test files can be opened: 23 | for( String file : files ) { 24 | FileInputStream is = new FileInputStream(file); 25 | ArchiveReader ar = WARCReaderFactory.get(file, is, true); 26 | ArchiveRecord r = ar.get(); 27 | String type = (String) r.getHeader().getHeaderValue(WARCConstants.HEADER_KEY_TYPE); 28 | // Check the first record comes out as a 'warcinfo' 
record. 29 | assertEquals(WARCRecordType.warcinfo.name(), type); 30 | } 31 | } 32 | 33 | 34 | } 35 | -------------------------------------------------------------------------------- /src/test/java/org/archive/resource/warc/WARCResourceTest.java: -------------------------------------------------------------------------------- 1 | package org.archive.resource.warc; 2 | 3 | import static org.archive.resource.ResourceConstants.PAYLOAD_LENGTH; 4 | import static org.archive.resource.ResourceConstants.PAYLOAD_SLOP_BYTES; 5 | 6 | import java.io.IOException; 7 | 8 | import org.archive.extract.ExtractingResourceFactoryMapper; 9 | import org.archive.extract.ExtractingResourceProducer; 10 | import org.archive.extract.ProducerUtils; 11 | import org.archive.extract.ResourceFactoryMapper; 12 | import org.archive.resource.Resource; 13 | import org.archive.resource.ResourceParseException; 14 | import org.archive.resource.ResourceProducer; 15 | import org.archive.util.StreamCopy; 16 | 17 | import com.github.openjson.JSONObject; 18 | 19 | import junit.framework.TestCase; 20 | 21 | public class WARCResourceTest extends TestCase { 22 | 23 | public void testWARCResource() throws ResourceParseException, IOException { 24 | String testFileName = "../../format/warc/IAH-urls-wget.warc"; 25 | ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath()); 26 | ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper(); 27 | ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper); 28 | 29 | Resource resource = extractor.getNext(); 30 | 31 | while (resource != null) { 32 | JSONObject payloadMD = resource.getMetaData().getTopMetaData().getJSONObject("Envelope") 33 | .getJSONObject("Payload-Metadata"); 34 | 35 | if (payloadMD.has(PAYLOAD_LENGTH)) { 36 | assertTrue(payloadMD.getLong(PAYLOAD_LENGTH) != -1); 37 | } 38 | if (payloadMD.has(PAYLOAD_SLOP_BYTES)) { 39 | assertEquals(4, payloadMD.getLong(PAYLOAD_SLOP_BYTES)); 
40 | } 41 | 42 | StreamCopy.readToEOF(resource.getInputStream()); 43 | resource = extractor.getNext(); 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/test/java/org/archive/uid/UUIDGeneratorTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is part of the Heritrix web crawler (crawler.archive.org). 3 | * 4 | * Licensed to the Internet Archive (IA) by one or more individual 5 | * contributors. 6 | * 7 | * The IA licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
*/
package org.archive.uid;

import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;

import junit.framework.TestCase;

/**
 * Exercises {@link RecordIDGenerator#qualifyRecordID} via a {@link UUIDGenerator}.
 *
 * @author stack
 * @version $Revision$ $Date$
 */
public class UUIDGeneratorTest extends TestCase {
    /**
     * Qualifies a freshly generated record ID twice, growing the qualifier map
     * between the calls, and checks the result against the initial ID each time.
     */
    public void testQualifyRecordID() throws URISyntaxException {
        RecordIDGenerator g = new UUIDGenerator();
        URI uri = g.getRecordID();
        Map qualifiers = new HashMap();
        qualifiers.put("a", "b");
        URI nuURI = g.qualifyRecordID(uri, qualifiers);
        // NOTE(review): assertNotSame compares references, not URI values, so this
        // only proves a different object came back -- confirm whether a value
        // comparison (equals) was intended.
        assertNotSame(uri, nuURI);
        qualifiers.put("c", "d");
        nuURI = g.qualifyRecordID(nuURI, qualifiers);
        assertNotSame(uri, nuURI);
    }
}

--------------------------------------------------------------------------------
/src/test/java/org/archive/url/AggressiveIAURLCanonicalizerTest.java:
--------------------------------------------------------------------------------
package org.archive.url;

import java.net.URISyntaxException;

import junit.framework.TestCase;

/**
 * End-to-end checks of {@link AggressiveIAURLCanonicalizer}: each input URL is
 * parsed, canonicalized and compared with the expected string form.
 */
public class AggressiveIAURLCanonicalizerTest extends TestCase {
    static AggressiveIAURLCanonicalizer ia = new AggressiveIAURLCanonicalizer();

    public void testCanonicalize() throws URISyntaxException {
        // FULL end-to-end tests:
        check("http://www.alexa.com/","http://alexa.com/");
        check("http://archive.org/index.html","http://archive.org/index.html");
        check("http://archive.org/index.html?","http://archive.org/index.html");
        check("http://archive.org/index.html?a=b","http://archive.org/index.html?a=b");
        check("http://archive.org/index.html?b=b&a=b","http://archive.org/index.html?a=b&b=b");
        check("http://archive.org/index.html?b=a&b=b&a=b","http://archive.org/index.html?a=b&b=a&b=b");
        check("http://www34.archive.org/index.html?b=a&b=b&a=b","http://archive.org/index.html?a=b&b=a&b=b");
    }

    /**
     * Canonicalizes {@code orig} and asserts the result equals {@code want};
     * then canonicalizes the result again and asserts that the second pass
     * leaves it unchanged.
     *
     * @param orig URL to canonicalize
     * @param want expected canonical form
     */
    private static void check(String orig, String want) throws URISyntaxException {
        HandyURL u = URLParser.parse(orig);
        ia.canonicalize(u);
        String got = u.getURLString();
        assertEquals(want,got);

        // Re-run on the output: the canonical form must be a fixed point.
        HandyURL u2 = URLParser.parse(got);
        ia.canonicalize(u2);
        String got2 = u2.getURLString();
        assertEquals("Second pass changed!",got,got2);
    }
}

--------------------------------------------------------------------------------
/src/test/java/org/archive/url/HandyURLTest.java:
--------------------------------------------------------------------------------
package org.archive.url;

import junit.framework.TestCase;

/**
 * Tests the public suffix / public prefix accessors of {@link HandyURL}.
 */
public class HandyURLTest extends TestCase {

    /**
     * Sets several hosts in turn and checks both {@code getPublicSuffix()} and
     * {@code getPublicPrefix()} for each of them.
     */
    public void testGetPublicSuffix() {
        HandyURL h = new HandyURL();
        h.setHost("www.fool.com");
        assertEquals("fool.com",h.getPublicSuffix());
        assertEquals("www",h.getPublicPrefix());

        h.setHost("www.amazon.co.uk");
        assertEquals("amazon.co.uk",h.getPublicSuffix());
        assertEquals("www",h.getPublicPrefix());

        h.setHost("www.images.amazon.co.uk");
        assertEquals("amazon.co.uk",h.getPublicSuffix());
        assertEquals("www.images",h.getPublicPrefix());

        h.setHost("funky-images.fancy.co.jp");
        assertEquals("fancy.co.jp",h.getPublicSuffix());
        assertEquals("funky-images",h.getPublicPrefix());

    }

    /**
     * Intentionally empty placeholder; the prefix assertions currently live in
     * {@link #testGetPublicSuffix()}.
     */
    public void testGetPublicPrefix() {
        //
        // fail("Not yet implemented");
    }

}

--------------------------------------------------------------------------------
/src/test/java/org/archive/url/OrdinaryIAURLCanonicalizerTest.java:
--------------------------------------------------------------------------------
package org.archive.url;

import java.net.URISyntaxException;

import junit.framework.TestCase;

/**
 * Input/expected-output checks for {@link OrdinaryIAURLCanonicalizer}.
 */
public class OrdinaryIAURLCanonicalizerTest extends TestCase {
    private OrdinaryIAURLCanonicalizer canon = new OrdinaryIAURLCanonicalizer();

    /** Stray dots in the host, scheme-default ports and empty query strings. */
    public void testMisc() throws URISyntaxException {
        checkCanonicalization("http://...host..com..", "http://host.com/");
        checkCanonicalization("http://example.org:80/", "http://example.org/");
        checkCanonicalization("https://example.org:443/", "https://example.org/");
        checkCanonicalization("http://example.org:443/", "http://example.org:443/");
        checkCanonicalization("http://example.org/?", "http://example.org/");
        checkCanonicalization("http://example.org/foo?", "http://example.org/foo");
        checkCanonicalization("http://example.org/foo/?", "http://example.org/foo/");
    }

    /** Mixed-case schemes are expected to come out lower-cased. */
    public void testSchemeCapitals() throws URISyntaxException {
        checkCanonicalization("Http://example.com", "http://example.com/");
        checkCanonicalization("HTTP://example.com", "http://example.com/");
        checkCanonicalization("ftP://example.com", "ftp://example.com/");
    }

    /**
     * Parses {@code in}, canonicalizes it and asserts that its string form
     * equals {@code want}.
     */
    private void checkCanonicalization(String in, String want) throws URISyntaxException {
        HandyURL h = URLParser.parse(in);
        canon.canonicalize(h);
        String got = h.getURLString();
        assertEquals(want, got);
    }

}

--------------------------------------------------------------------------------
/src/test/java/org/archive/util/ByteOpTest.java:
--------------------------------------------------------------------------------
package org.archive.util;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.IOException;

import org.archive.util.ByteOp;

import com.google.common.io.LittleEndianDataOutputStream;

import junit.framework.TestCase;

/**
 * Tests for the byte-level helpers in {@link ByteOp}.
 */
public class ByteOpTest extends TestCase {

    /**
     * Checks {@code ByteOp.readShort} against a fixed sample and then
     * round-trips every 16-bit value written by a
     * {@link LittleEndianDataOutputStream}.
     */
    public void testReadShort() throws IOException {
        byte a[] = new byte[]{0,1,2,3};
        ByteArrayInputStream bais = new ByteArrayInputStream(a);
        int bos = ByteOp.readShort(bais);
        // Bytes {0x00, 0x01} read low-byte-first give 0x0100; this is the same
        // encoding the loop below verifies for i == 256.
        assertEquals(256, bos);
        DataInputStream dis = new DataInputStream(new ByteArrayInputStream(a));
        int disv = dis.readUnsignedShort();
        // DataInputStream is big-endian, so the same two bytes read as 0x0001.
        assertEquals(1, disv);
        for(int i = 0; i < 256 * 256; i++) {
            ByteArrayOutputStream baos = new ByteArrayOutputStream(2);
            LittleEndianDataOutputStream dos = new LittleEndianDataOutputStream(baos);
            dos.writeShort(i);
            ByteArrayInputStream bais2 = new ByteArrayInputStream(baos.toByteArray());
            int gotI = ByteOp.readShort(bais2);
            assertEquals(i, gotI);
        }
    }

    /** {@code ByteOp.append(a, b)} must return the bytes of a followed by those of b. */
    public void testAppend() {
        byte a[] = new byte[]{1};
        byte b[] = new byte[]{2};
        byte n[] = ByteOp.append(a,b);
        assertEquals(2,n.length);
        assertEquals(1,n[0]);
        assertEquals(2,n[1]);

        byte a2[] = new byte[]{1,2,3,4};
        byte b2[] = new byte[]{5,6,7,8};
        byte n2[] = ByteOp.append(a2,b2);
        assertEquals(8,n2.length);
        assertEquals(1,n2[0]);
        assertEquals(2,n2[1]);
        assertEquals(5,n2[4]);

    }

    /** Empty placeholder: no readInt coverage yet. */
    public void testReadInt() {
    }

}

--------------------------------------------------------------------------------
/src/test/java/org/archive/util/CrossProductTest.java:
--------------------------------------------------------------------------------
package org.archive.util;

import java.util.ArrayList;
import java.util.List;

import org.archive.util.CrossProduct;

import junit.framework.TestCase;

/**
 * Smoke tests that run {@link CrossProduct} and print the result; there are no
 * assertions on the computed product.
 */
public class CrossProductTest extends TestCase {
    /** Prints the elements of {@code a} as one comma-separated "Dump:" line. */
    private void dumpC(List<Object> a) {
        StringBuilder sb = new StringBuilder();
        // Must start out true: the flag suppresses the separator before the
        // first element only.
        boolean first = true;
        for(Object o : a) {
            if(first) {
                first = false;
            } else {
                sb.append(",");
            }
            sb.append(o.toString());
        }
        System.out.println("Dump:" + sb.toString());
    }

    /** Dumps every inner list of {@code coc} on its own line. */
    private void dumpLOL(List<List<Object>> coc) {
        for(List<Object> co : coc) {
            dumpC(co);
        }
    }

    /** Prints the version string reported by {@code IAUtils.loadCommonsVersion()}. */
    public void testVersion() {
        String version = IAUtils.loadCommonsVersion();
        System.out.format("Loaded version(%s)\n", version);
    }

    /** Builds four input lists, computes their cross product and dumps it. */
    public void testCrossProduct() {
        ArrayList<List<Object>> input = new ArrayList<List<Object>>();
        // NOTE(review): CrossProduct's type parameters are not visible from this
        // file, so the raw type is kept -- add type arguments if it is generic.
        CrossProduct xp = new CrossProduct();
        input.add(AtoL("1","2"));
        input.add(AtoL("Charming"));
        input.add(AtoL("Berry","Elvis"));
        input.add(AtoL("a","b","c","d"));
        List<List<Object>> cross = xp.crossProduct(input);
        dumpLOL(cross);
    }

    /** Copies the varargs array into a new mutable list. */
    private List<Object> AtoL(Object... a) {
        ArrayList<Object> al = new ArrayList<Object>(a.length);
        for(Object s : a) {
            al.add(s);
        }
        return al;
    }
}

--------------------------------------------------------------------------------
/src/test/java/org/archive/util/PropertyUtilsTest.java:
--------------------------------------------------------------------------------
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual
 *  contributors.
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.util;


import java.io.IOException;
import java.util.Properties;

import junit.framework.TestCase;


/**
 * PropertyUtils tests.
 *
 * @author gojomo
 * @version $Date: 2009-11-19 14:39:53 -0800 (Thu, 19 Nov 2009) $, $Revision: 6674 $
 */
public class PropertyUtilsTest extends TestCase {

    /**
     * Two <code>${key}</code> references in one string must both be replaced
     * by the matching values from the supplied {@link Properties}.
     */
    public void testSimpleInterpolate() throws IOException {
        Properties props = new Properties();
        props.put("foo", "OOF");
        props.put("bar","RAB");
        String original = "FOO|${foo} BAR|${bar}";
        String expected = "FOO|OOF BAR|RAB";
        assertEquals("interpolation problem",expected,PropertyUtils.interpolateWithProperties(original,props));
    }
}

--------------------------------------------------------------------------------
/src/test/java/org/archive/util/StringFieldExtractorTest.java:
--------------------------------------------------------------------------------
package org.archive.util;

import org.archive.util.StringFieldExtractor.StringTuple;

import junit.framework.TestCase;

/**
 * Tests field extraction and splitting with {@link StringFieldExtractor} using
 * a single space as the delimiter.
 */
public class StringFieldExtractorTest extends TestCase {

    /**
     * Extracts fields 0 through 4 from space-delimited inputs, including inputs
     * with a leading or trailing delimiter and a field index past the end.
     */
    public void testExtract() {
        StringFieldExtractor ex1 = new StringFieldExtractor(' ', 0);
        StringFieldExtractor ex2 = new StringFieldExtractor(' ', 1);
        StringFieldExtractor ex3 = new StringFieldExtractor(' ', 2);
        StringFieldExtractor ex4 = new StringFieldExtractor(' ', 3);
        StringFieldExtractor ex5 = new StringFieldExtractor(' ', 4);
        assertEquals("1",ex1.extract("1 2 3 4"));
        assertEquals("2",ex2.extract("1 2 3 4"));
        assertEquals("3",ex3.extract("1 2 3 4"));
        assertEquals("4",ex4.extract("1 2 3 4"));
        assertEquals(null,ex5.extract("1 2 3 4"));
        assertEquals("",ex5.extract("1 2 3 4 "));
        assertEquals("",ex1.extract(" 1 2 3 4 "));
        assertEquals("1",ex2.extract(" 1 2 3 4 "));
        assertEquals("2",ex3.extract(" 1 2 3 4 "));
        assertEquals("abc",ex1.extract("abc 1 2 3 4 "));
        assertEquals("1",ex2.extract("abc 1 2 3 4 "));
    }

    /** Asserts that tuple {@code t} holds {@code f} as first and {@code s} as second part. */
    private void checkSplit(String f, String s,StringTuple t) {
        assertEquals(f,t.first);
        assertEquals(s,t.second);
    }

    /**
     * Splits inputs after the second space-delimited field and checks both
     * halves; inputs with fewer than three fields yield a null second half.
     */
    public void testSplit() {
        StringFieldExtractor sfx = new StringFieldExtractor(' ',2);
        checkSplit("a b","x y",sfx.split("a b x y"));
        // Two spaces after "ab": the empty second field is what makes the
        // expected first half "ab " rather than "ab x".
        checkSplit("ab ","x y",sfx.split("ab  x y"));
        checkSplit("ab x","y z",sfx.split("ab x y z"));
        // NOTE(review): identical to the previous check as it stands; it may
        // originally have differed only in whitespace -- confirm against the
        // upstream source.
        checkSplit("ab x","y z",sfx.split("ab x y z"));
        checkSplit("ab",null,sfx.split("ab"));
        checkSplit("ab x",null,sfx.split("ab x"));
    }
}

--------------------------------------------------------------------------------
/src/test/java/org/archive/util/TestUtils.java:
--------------------------------------------------------------------------------
package org.archive.util;

import java.io.IOException;
import java.io.InputStream;
import java.util.List;

import junit.framework.TestCase;


import com.google.common.io.ByteStreams;

/**
 * Static assertion and dump helpers shared by other tests.
 */
public class TestUtils extends TestCase {
    /** Trivial test case (presumably so the class contains at least one test). */
    public void testNothing() {
        assertEquals(2,1+1);
    }

    /**
     * Prints the number of matches followed by one line per match.
     *
     * @param context label included in the summary line
     * @param res     matches to print; each inner list is joined with
     *                {@code StringParse.join}
     */
    public static void dumpMatch(String context, List<List<String>> res) {

        System.out.format("Context(%s) Found (%d) matches\n", context, res.size());
        for(List<String> r : res) {
            System.out.format("Match(%s)\n", StringParse.join(r));
        }

    }

    /**
     * Asserts that {@code got} has the same shape and the same elements, in
     * order, as the expected two-dimensional array {@code want}.
     */
    public static void assertLoLMatches(String want[][], List<List<String>> got) {
        assertEquals(want.length,got.size());
        for(int i = 0; i < want.length; i++) {
            String [] wantSub = want[i];
            List<String> gotSub = got.get(i);
            assertEquals(wantSub.length,gotSub.size());
            for(int j = 0; j < wantSub.length; j++) {
                assertEquals(wantSub[j],gotSub.get(j));
            }
        }
    }

    /**
     * Reads {@code is} to the end and asserts that its content equals {@code b}.
     *
     * @param is stream to drain (not closed here)
     * @param b  expected bytes
     */
    public static void assertStreamEquals(InputStream is,byte b[]) throws IOException {
        byte got[] = ByteStreams.toByteArray(is);
        // JUnit expects (expected, actual); b carries the expected bytes.
        assertEquals(b.length,got.length);
        assertTrue(ByteOp.cmp(got,b));
    }
}

--------------------------------------------------------------------------------
/src/test/java/org/archive/util/iterator/CachingStringFilterTest.java:
--------------------------------------------------------------------------------
package org.archive.util.iterator;

import junit.framework.TestCase;

/**
 * Smoke test for {@link CachingStringFilter}: it only drives the filter and
 * makes no assertions about the results.
 */
public class CachingStringFilterTest extends TestCase {
    /**
     * Wraps an always-true filter in a {@code CachingStringFilter} constructed
     * with the argument 3, then queries four distinct strings, some repeatedly.
     */
    public void testCache() {
        StringFilter tf = new StringFilter() {
            public boolean isFiltered(String text) {
                return true;
            }
        };
        CachingStringFilter csf = new CachingStringFilter(tf, 3);
        csf.isFiltered("one");
        csf.isFiltered("one");
        csf.isFiltered("two");
        csf.isFiltered("one");
        csf.isFiltered("three");
        csf.isFiltered("two");
        csf.isFiltered("four");
    }
}

--------------------------------------------------------------------------------
/src/test/java/org/archive/util/iterator/FilterStringIteratorTest.java:
--------------------------------------------------------------------------------
package org.archive.util.iterator;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.TreeSet;

import junit.framework.TestCase;

/**
 * Tests {@link FilterStringIterator} together with
 * {@link TransformingPrefixStringFilter}.
 */
public class FilterStringIteratorTest extends TestCase {

    // NOTE(review): the "t2est" prefix keeps JUnit 3 from discovering this
    // method, so it never runs -- confirm whether it was disabled on purpose.
    public void t2estHasNext() {
        String blocks[] = {"a","ab","ba","cc"};

        List<String> bl = Arrays.asList(blocks);
        TransformingPrefixStringFilter f = new TransformingPrefixStringFilter(bl);
        assertBlocked(true,"a",f);
        assertBlocked(true,"ab",f);
        assertBlocked(true,"ac",f);
        assertBlocked(true,"acca",f);
        assertBlocked(false,"b",f);
        assertBlocked(true,"ba",f);
        assertBlocked(true,"bac",f);
        assertBlocked(false,"bc",f);
        assertBlocked(false,"ca",f);
        assertBlocked(true,"cc",f);
        assertBlocked(true,"cca",f);
    }

    /**
     * {@code makeTreeSet} is expected to keep "a" and drop "ab", regardless of
     * the order in which the two appear in the input.
     */
    public void testTreeSet() {
        String blocks[] = {"a","ab","ba","cc"};
        TreeSet s = TransformingPrefixStringFilter.makeTreeSet(Arrays.asList(blocks),null);
        assertTrue(s.contains("a"));
        assertFalse(s.contains("ab"));

        String blocks2[] = {"ab","a","ba","cc"};
        TreeSet s2 = TransformingPrefixStringFilter.makeTreeSet(Arrays.asList(blocks2),null);
        assertTrue(s2.contains("a"));
        assertFalse(s2.contains("ab"));



    }


    /**
     * Runs the single string {@code s} through a {@link FilterStringIterator}
     * using filter {@code f} and asserts that it is dropped
     * ({@code blocked == true}) or passed through unchanged.
     */
    private void assertBlocked(boolean blocked, String s, StringFilter f) {
        ArrayList<String> l = new ArrayList<String>();
        l.add(s);
        FilterStringIterator i = new FilterStringIterator(l.iterator(), f);
        if(blocked) {
            assertFalse(i.hasNext());
        } else {
            assertTrue(i.hasNext());
            assertEquals(s,i.next());
        }
    }
}

--------------------------------------------------------------------------------
/src/test/java/org/archive/util/iterator/SortedCompositeIteratorTest.java:
--------------------------------------------------------------------------------
package org.archive.util.iterator;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Comparator;

import junit.framework.TestCase;

/**
 * Tests that {@link SortedCompositeIterator} merges two sorted line sources
 * into one sorted sequence.
 */
public class SortedCompositeIteratorTest extends TestCase {

    /**
     * Writes "1","3" and "2","4" to two temp files, feeds both files to a
     * {@code SortedCompositeIterator} and expects "1","2","3","4" in that
     * order. Readers are closed and temp files deleted even when an assertion
     * fails.
     */
    public void testHasNext() throws FileNotFoundException, IOException {

        File a = File.createTempFile("filea", null);
        File b = File.createTempFile("fileb", null);
        try {
            writeLines(a, "1", "3");
            writeLines(b, "2", "4");

            BufferedReader abr = new BufferedReader(new FileReader(a));
            try {
                BufferedReader bbr = new BufferedReader(new FileReader(b));
                try {
                    // NOTE(review): SortedCompositeIterator's type parameters are
                    // not visible from this file, so the raw type is kept.
                    SortedCompositeIterator sci = new SortedCompositeIterator(new Comparator<String>() {

                        @Override
                        public int compare(String o1, String o2) {
                            return o1.compareTo(o2);
                        }

                    });
                    sci.addIterator(AbstractPeekableIterator.wrapReader(abr));
                    sci.addIterator(AbstractPeekableIterator.wrapReader(bbr));
                    assertTrue(sci.hasNext());
                    assertEquals("1",sci.next());
                    assertTrue(sci.hasNext());
                    assertEquals("2",sci.next());
                    assertTrue(sci.hasNext());
                    assertEquals("3",sci.next());
                    assertTrue(sci.hasNext());
                    assertEquals("4",sci.next());
                } finally {
                    bbr.close();
                }
            } finally {
                abr.close();
            }
        } finally {
            // Delete only after the readers are closed; an open handle can
            // block deletion on some platforms.
            a.delete();
            b.delete();
        }
    }

    /** Writes each of {@code lines} to {@code f}, one per line, and closes the writer. */
    private static void writeLines(File f, String... lines) throws FileNotFoundException {
        PrintWriter pw = new PrintWriter(f);
        try {
            for (String line : lines) {
                pw.println(line);
            }
        } finally {
            pw.close();
        }
    }

}

--------------------------------------------------------------------------------
/src/test/resources/org/archive/format/arc/IAH-20080430204825-00000-blackbook-truncated.arc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/commoncrawl/ia-web-commons/1229f7a5f35f4fcf1dda9708e90e4199b54c3531/src/test/resources/org/archive/format/arc/IAH-20080430204825-00000-blackbook-truncated.arc
--------------------------------------------------------------------------------
/src/test/resources/org/archive/format/gzip/IAH-urls-wget.warc.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/commoncrawl/ia-web-commons/1229f7a5f35f4fcf1dda9708e90e4199b54c3531/src/test/resources/org/archive/format/gzip/IAH-urls-wget.warc.gz
--------------------------------------------------------------------------------
/src/test/resources/org/archive/format/gzip/abcd.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/commoncrawl/ia-web-commons/1229f7a5f35f4fcf1dda9708e90e4199b54c3531/src/test/resources/org/archive/format/gzip/abcd.gz
--------------------------------------------------------------------------------
/src/test/resources/org/archive/format/gzip/double-single-inflate-error.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/commoncrawl/ia-web-commons/1229f7a5f35f4fcf1dda9708e90e4199b54c3531/src/test/resources/org/archive/format/gzip/double-single-inflate-error.gz -------------------------------------------------------------------------------- /src/test/resources/org/archive/format/gzip/empty.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/ia-web-commons/1229f7a5f35f4fcf1dda9708e90e4199b54c3531/src/test/resources/org/archive/format/gzip/empty.gz -------------------------------------------------------------------------------- /src/test/resources/org/archive/format/gzip/hi-2.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/ia-web-commons/1229f7a5f35f4fcf1dda9708e90e4199b54c3531/src/test/resources/org/archive/format/gzip/hi-2.gz -------------------------------------------------------------------------------- /src/test/resources/org/archive/format/gzip/hi.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/ia-web-commons/1229f7a5f35f4fcf1dda9708e90e4199b54c3531/src/test/resources/org/archive/format/gzip/hi.gz -------------------------------------------------------------------------------- /src/test/resources/org/archive/format/warc/IAH-urls-wget.warc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/ia-web-commons/1229f7a5f35f4fcf1dda9708e90e4199b54c3531/src/test/resources/org/archive/format/warc/IAH-urls-wget.warc -------------------------------------------------------------------------------- /src/test/resources/org/archive/resource/html/meta-itemprop.warc: -------------------------------------------------------------------------------- 1 | WARC/1.0 2 | WARC-Type: response 3 | WARC-Date: 2024-12-05T10:47:02Z 4 | Content-Length: 710 5 | 
Content-Type: application/http; msgtype=response 6 | WARC-Target-URI: https://www.example.org/ 7 | WARC-Identified-Payload-Type: text/html 8 | 9 | HTTP/1.1 200 10 | content-type: text/html; charset=UTF-8 11 | 12 | 13 | 14 | 15 | 16 | 17 | Test 18 | 19 | 20 | 21 |
22 | Blend-O-Matic 23 | $19.95 24 |
25 | 26 | 27 | 28 | Based on 25 user ratings 29 |
30 |
31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /src/test/resources/org/archive/resource/html/text-extraction-test.warc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/commoncrawl/ia-web-commons/1229f7a5f35f4fcf1dda9708e90e4199b54c3531/src/test/resources/org/archive/resource/html/text-extraction-test.warc -------------------------------------------------------------------------------- /src/test/resources/org/archive/resource/html/title-extraction-embedded-SVG.warc: -------------------------------------------------------------------------------- 1 | WARC/1.0 2 | WARC-Type: response 3 | WARC-Record-ID: 4 | WARC-Target-URI: https://www.example.org/testEmbeddedSVG.html 5 | WARC-Date: 2024-10-14T10:05:41Z 6 | WARC-IP-Address: 127.0.0.1 7 | WARC-Block-Digest: sha1:XNN4JA3QDUN4DDEGTIPH5ZRORHYL657F 8 | WARC-Payload-Digest: sha1:4FUACFTG3WCL26OITZNMEPRKFP6WAAHN 9 | Content-Type: application/http;msgtype=response 10 | Content-Length: 856 11 | 12 | HTTP/1.1 200 OK 13 | Date: Mon, 14 Oct 2024 10:05:41 GMT 14 | Server: Apache/2.4.58 (Ubuntu) 15 | Upgrade: h2,h2c 16 | Connection: Upgrade, Keep-Alive 17 | Last-Modified: Mon, 14 Oct 2024 10:04:25 GMT 18 | ETag: "20a-6246cf6287f50" 19 | Accept-Ranges: bytes 20 | Content-Length: 522 21 | Vary: Accept-Encoding 22 | Keep-Alive: timeout=5, max=100 23 | Content-Type: text/html 24 | 25 | 26 | 27 | 28 | Testing title extraction with embedded SVG 29 | 30 | 31 | 32 |
33 |
Testing title extraction with embedded SVG
34 |

This is body text...

35 | 36 | Embedded SVG 37 | 38 | 39 | 40 |
41 | 42 | 43 | 44 | 45 | 46 | --------------------------------------------------------------------------------