├── .github
    └── workflows
    │   └── maven.yml
├── .gitignore
├── CHANGES.md
├── LICENSE
├── README.md
├── pom.xml
└── src
    ├── main
        ├── java
        │   ├── it
        │   │   └── unimi
        │   │   │   └── dsi
        │   │   │       └── fastutil
        │   │   │           └── io
        │   │   │               └── RepositionableStream.java
        │   └── org
        │   │   └── archive
        │   │       ├── RecoverableRecordFormatException.java
        │   │       ├── extract
        │   │           ├── CDXExtractorOutput.java
        │   │           ├── DumpingExtractorOutput.java
        │   │           ├── ExtractingResourceFactoryMapper.java
        │   │           ├── ExtractingResourceProducer.java
        │   │           ├── ExtractorOutput.java
        │   │           ├── FilteredExtractorOuput.java
        │   │           ├── JSONViewExtractorOutput.java
        │   │           ├── ProducerUtils.java
        │   │           ├── RealCDXExtractorOutput.java
        │   │           ├── ResourceExtractor.java
        │   │           ├── ResourceFactoryMapper.java
        │   │           ├── WARCMetadataRecordExtractorOutput.java
        │   │           ├── WATExtractorOutput.java
        │   │           └── WETExtractorOutput.java
        │   │       ├── format
        │   │           ├── ArchiveFileConstants.java
        │   │           ├── arc
        │   │           │   ├── ARCConstants.java
        │   │           │   ├── ARCFormatException.java
        │   │           │   ├── ARCMetaData.java
        │   │           │   ├── ARCMetaDataParser.java
        │   │           │   ├── FiledescRecord.java
        │   │           │   └── FiledescRecordParser.java
        │   │           ├── cdx
        │   │           │   ├── CDX09Line.java
        │   │           │   ├── CDX11Line.java
        │   │           │   ├── CDXFieldConstants.java
        │   │           │   ├── CDXFile.java
        │   │           │   ├── CDXInputSource.java
        │   │           │   ├── CDXLine.java
        │   │           │   ├── CDXLineFactory.java
        │   │           │   ├── FieldSplitFormat.java
        │   │           │   ├── FieldSplitLine.java
        │   │           │   ├── MultiCDXInputSource.java
        │   │           │   └── StandardCDXLineFactory.java
        │   │           ├── dns
        │   │           │   ├── DNSParseException.java
        │   │           │   ├── DNSRecord.java
        │   │           │   ├── DNSResponse.java
        │   │           │   └── DNSResponseParser.java
        │   │           ├── gzip
        │   │           │   ├── GZIPConstants.java
        │   │           │   ├── GZIPDecoder.java
        │   │           │   ├── GZIPFExtraRecord.java
        │   │           │   ├── GZIPFExtraRecords.java
        │   │           │   ├── GZIPFooter.java
        │   │           │   ├── GZIPFormatException.java
        │   │           │   ├── GZIPHeader.java
        │   │           │   ├── GZIPMemberSeries.java
        │   │           │   ├── GZIPMemberWriter.java
        │   │           │   ├── GZIPMemberWriterCommittedOutputStream.java
        │   │           │   ├── GZIPSeriesMember.java
        │   │           │   ├── GZIPStaticHeader.java
        │   │           │   └── zipnum
        │   │           │   │   ├── LineBufferingIterator.java
        │   │           │   │   ├── MultiBlockIterator.java
        │   │           │   │   ├── SummaryBlockIterator.java
        │   │           │   │   ├── SummaryLine.java
        │   │           │   │   ├── TimestampBestPickDedupIterator.java
        │   │           │   │   ├── TimestampCustomDedupIterator.java
        │   │           │   │   ├── TimestampDedupIterator.java
        │   │           │   │   ├── ZipNumBlockLoader.java
        │   │           │   │   ├── ZipNumCluster.java
        │   │           │   │   ├── ZipNumIndex.java
        │   │           │   │   ├── ZipNumParams.java
        │   │           │   │   └── ZipNumWriter.java
        │   │           ├── http
        │   │           │   ├── DumpingHTTPParseObserver.java
        │   │           │   ├── HttpConstants.java
        │   │           │   ├── HttpHeader.java
        │   │           │   ├── HttpHeaderObserver.java
        │   │           │   ├── HttpHeaderParser.java
        │   │           │   ├── HttpHeaders.java
        │   │           │   ├── HttpMessage.java
        │   │           │   ├── HttpMessageParser.java
        │   │           │   ├── HttpParseException.java
        │   │           │   ├── HttpParseObserver.java
        │   │           │   ├── HttpRequest.java
        │   │           │   ├── HttpRequestMessage.java
        │   │           │   ├── HttpRequestMessageObserver.java
        │   │           │   ├── HttpRequestMessageParser.java
        │   │           │   ├── HttpRequestParser.java
        │   │           │   ├── HttpResponse.java
        │   │           │   ├── HttpResponseMessage.java
        │   │           │   ├── HttpResponseMessageObserver.java
        │   │           │   ├── HttpResponseMessageParser.java
        │   │           │   └── HttpResponseParser.java
        │   │           ├── json
        │   │           │   ├── CompoundORJSONPathSpec.java
        │   │           │   ├── CrossProductOfLists.java
        │   │           │   ├── JSONPathSpec.java
        │   │           │   ├── JSONPathSpecFactory.java
        │   │           │   ├── JSONUtils.java
        │   │           │   ├── JSONView.java
        │   │           │   └── SimpleJSONPathSpec.java
        │   │           ├── text
        │   │           │   ├── charset
        │   │           │   │   ├── CharsetDetector.java
        │   │           │   │   ├── RotatingCharsetDetector.java
        │   │           │   │   └── StandardCharsetDetector.java
        │   │           │   └── html
        │   │           │   │   ├── CDATALexer.java
        │   │           │   │   ├── LexParser.java
        │   │           │   │   ├── NodeUtils.java
        │   │           │   │   └── ParseObserver.java
        │   │           └── warc
        │   │           │   ├── WARCConstants.java
        │   │           │   └── WARCRecordWriter.java
        │   │       ├── hadoop
        │   │           ├── ArchiveJSONViewLoader.java
        │   │           ├── ArchiveMetadataLoader.java
        │   │           ├── FilenameInputFormat.java
        │   │           ├── PerMapOutputFormat.java
        │   │           ├── ResourceContext.java
        │   │           ├── ResourceInputFormat.java
        │   │           ├── ResourceRecordReader.java
        │   │           └── func
        │   │           │   ├── JSONViewEvalFunc.java
        │   │           │   ├── TupleFunc.java
        │   │           │   └── URLResolverFunc.java
        │   │       ├── httpclient
        │   │           ├── ConfigurableX509TrustManager.java
        │   │           ├── HttpRecorderGetMethod.java
        │   │           ├── HttpRecorderMethod.java
        │   │           ├── HttpRecorderPostMethod.java
        │   │           ├── SingleHttpConnectionManager.java
        │   │           ├── ThreadLocalHttpConnectionManager.java
        │   │           └── package.html
        │   │       ├── io
        │   │           ├── ArchiveFileConstants.java
        │   │           ├── ArchiveReader.java
        │   │           ├── ArchiveReaderFactory.java
        │   │           ├── ArchiveRecord.java
        │   │           ├── ArchiveRecordHeader.java
        │   │           ├── ArraySeekInputStream.java
        │   │           ├── BufferedSeekInputStream.java
        │   │           ├── CharSubSequence.java
        │   │           ├── CompositeFileInputStream.java
        │   │           ├── CompositeFileReader.java
        │   │           ├── Endian.java
        │   │           ├── GZIPMembersInputStream.java
        │   │           ├── GenerationFileHandler.java
        │   │           ├── GenericReplayCharSequence.java
        │   │           ├── GzipHeader.java
        │   │           ├── HeaderedArchiveRecord.java
        │   │           ├── LoudObjectOutputStream.java
        │   │           ├── MiserOutputStream.java
        │   │           ├── NoGzipMagicException.java
        │   │           ├── ObjectPlusFilesInputStream.java
        │   │           ├── ObjectPlusFilesOutputStream.java
        │   │           ├── OriginSeekInputStream.java
        │   │           ├── Preformatter.java
        │   │           ├── RandomAccessInputStream.java
        │   │           ├── RandomAccessOutputStream.java
        │   │           ├── ReadSource.java
        │   │           ├── RecorderIOException.java
        │   │           ├── RecorderLengthExceededException.java
        │   │           ├── RecorderTimeoutException.java
        │   │           ├── RecorderTooMuchHeaderException.java
        │   │           ├── RecordingInputStream.java
        │   │           ├── RecordingOutputStream.java
        │   │           ├── RecoverableIOException.java
        │   │           ├── ReplayCharSequence.java
        │   │           ├── ReplayInputStream.java
        │   │           ├── RepositionableInputStream.java
        │   │           ├── SafeSeekInputStream.java
        │   │           ├── SeekInputStream.java
        │   │           ├── SeekReader.java
        │   │           ├── SeekReaderCharSequence.java
        │   │           ├── SinkHandlerLogThread.java
        │   │           ├── UTF8Bytes.java
        │   │           ├── WriterPool.java
        │   │           ├── WriterPoolMember.java
        │   │           ├── WriterPoolSettings.java
        │   │           ├── arc
        │   │           │   ├── ARC2WCDX.java
        │   │           │   ├── ARCConstants.java
        │   │           │   ├── ARCLocation.java
        │   │           │   ├── ARCReader.java
        │   │           │   ├── ARCReaderFactory.java
        │   │           │   ├── ARCRecord.java
        │   │           │   ├── ARCRecordMetaData.java
        │   │           │   ├── ARCUtils.java
        │   │           │   ├── ARCWriter.java
        │   │           │   ├── ARCWriterPool.java
        │   │           │   └── WriterPoolSettingsData.java
        │   │           ├── package.html
        │   │           └── warc
        │   │           │   ├── WARCConstants.java
        │   │           │   ├── WARCReader.java
        │   │           │   ├── WARCReaderFactory.java
        │   │           │   ├── WARCRecord.java
        │   │           │   ├── WARCRecordInfo.java
        │   │           │   ├── WARCWriter.java
        │   │           │   ├── WARCWriterPool.java
        │   │           │   ├── WARCWriterPoolSettings.java
        │   │           │   ├── WARCWriterPoolSettingsData.java
        │   │           │   └── package.html
        │   │       ├── net
        │   │           ├── DownloadURLConnection.java
        │   │           ├── FTPException.java
        │   │           ├── PublicSuffixes.java
        │   │           ├── md5
        │   │           │   ├── Handler.java
        │   │           │   └── Md5URLConnection.java
        │   │           └── rsync
        │   │           │   ├── Handler.java
        │   │           │   └── RsyncURLConnection.java
        │   │       ├── resource
        │   │           ├── AbstractEmptyResource.java
        │   │           ├── AbstractResource.java
        │   │           ├── MetaData.java
        │   │           ├── MetaDataConstants.java-normal
        │   │           ├── Resource.java
        │   │           ├── ResourceConstants.java
        │   │           ├── ResourceContainer.java
        │   │           ├── ResourceFactory.java
        │   │           ├── ResourceParseException.java
        │   │           ├── ResourceProducer.java
        │   │           ├── TransformingResourceProducer.java
        │   │           ├── arc
        │   │           │   ├── ARCResource.java
        │   │           │   ├── ARCResourceFactory.java
        │   │           │   └── record
        │   │           │   │   ├── FiledescResource.java
        │   │           │   │   └── FiledescResourceFactory.java
        │   │           ├── generic
        │   │           │   ├── GenericResourceProducer.java
        │   │           │   └── GenericStreamResource.java
        │   │           ├── gzip
        │   │           │   ├── GZIPMetaData.java
        │   │           │   ├── GZIPResource.java
        │   │           │   └── GZIPResourceContainer.java
        │   │           ├── html
        │   │           │   ├── ExtractingParseObserver.java
        │   │           │   ├── HTMLMetaData.java
        │   │           │   ├── HTMLResource.java
        │   │           │   └── HTMLResourceFactory.java
        │   │           ├── http
        │   │           │   ├── HTTPHeadersResource.java
        │   │           │   ├── HTTPHeadersResourceFactory.java
        │   │           │   ├── HTTPRequestResource.java
        │   │           │   ├── HTTPRequestResourceFactory.java
        │   │           │   ├── HTTPResponseResource.java
        │   │           │   └── HTTPResponseResourceFactory.java
        │   │           ├── producer
        │   │           │   ├── ARCFile.java
        │   │           │   ├── EnvelopedResourceFile.java
        │   │           │   └── WARCFile.java
        │   │           └── warc
        │   │           │   ├── WARCResource.java
        │   │           │   ├── WARCResourceFactory.java
        │   │           │   └── record
        │   │           │       ├── DNSResource.java
        │   │           │       ├── DNSResourceFactory.java
        │   │           │       ├── WARCJSONMetaDataResource.java
        │   │           │       ├── WARCJSONMetaDataResourceFactory.java
        │   │           │       ├── WARCMetaDataResource.java
        │   │           │       └── WARCMetaDataResourceFactory.java
        │   │       ├── streamcontext
        │   │           ├── AbstractBufferingStream.java
        │   │           ├── ByteArrayWrappedStream.java
        │   │           ├── HDFSStream.java
        │   │           ├── HTTP11Stream.java
        │   │           ├── RandomAccessFileStream.java
        │   │           ├── SimpleStream.java
        │   │           ├── Stream.java
        │   │           └── StreamWrappedInputStream.java
        │   │       ├── uid
        │   │           ├── RecordIDGenerator.java
        │   │           ├── UUIDGenerator.java
        │   │           └── package.html
        │   │       ├── url
        │   │           ├── AggressiveIACanonicalizerRules.java
        │   │           ├── AggressiveIAURLCanonicalizer.java
        │   │           ├── BasicURLCanonicalizer.java
        │   │           ├── CanonicalizeRules.java
        │   │           ├── CanonicalizerConstants.java
        │   │           ├── DefaultIACanonicalizerRules.java
        │   │           ├── DefaultIAURLCanonicalizer.java
        │   │           ├── ExtractRule.java
        │   │           ├── GoogleURLCanonicalizer.java
        │   │           ├── HandyURL.java
        │   │           ├── IAURLCanonicalizer.java
        │   │           ├── LaxURI.java
        │   │           ├── LaxURLCodec.java
        │   │           ├── NonMassagingIAURLCanonicalizer.java
        │   │           ├── OrdinaryIACanonicalizerRules.java
        │   │           ├── OrdinaryIAURLCanonicalizer.java
        │   │           ├── RewriteRule.java
        │   │           ├── SURT.java
        │   │           ├── SURTTokenizer.java
        │   │           ├── URLCanonicalizer.java
        │   │           ├── URLKeyMaker.java
        │   │           ├── URLParser.java
        │   │           ├── URLRegexTransformer.java
        │   │           ├── UrlSurtRangeComputer.java
        │   │           ├── UsableURI.java
        │   │           ├── UsableURIFactory.java
        │   │           └── WaybackURLKeyMaker.java
        │   │       └── util
        │   │           ├── ArchiveUtils.java
        │   │           ├── Base32.java
        │   │           ├── ByteOp.java
        │   │           ├── CrossProduct.java
        │   │           ├── DateUtils.java
        │   │           ├── DevUtils.java
        │   │           ├── FileNameSpec.java
        │   │           ├── FileUtils.java
        │   │           ├── GeneralURIStreamFactory.java
        │   │           ├── Grep.java
        │   │           ├── HMACSigner.java
        │   │           ├── IAUtils.java
        │   │           ├── InetAddressUtil.java
        │   │           ├── InterruptibleCharSequence.java
        │   │           ├── IterableLineIterator.java
        │   │           ├── LaxHttpParser.java
        │   │           ├── MimetypeUtils.java
        │   │           ├── NestedMap.java
        │   │           ├── PrefixSet.java
        │   │           ├── ProcessUtils.java
        │   │           ├── ProgressStatisticsReporter.java
        │   │           ├── PropertyUtils.java
        │   │           ├── Recorder.java
        │   │           ├── Reporter.java
        │   │           ├── SURT.java
        │   │           ├── StreamCopy.java
        │   │           ├── StringFieldExtractor.java
        │   │           ├── StringParse.java
        │   │           ├── SurtPrefixSet.java
        │   │           ├── TextUtils.java
        │   │           ├── TmpDirTestCase.java
        │   │           ├── anvl
        │   │               ├── ANVLRecord.java
        │   │               ├── Element.java
        │   │               ├── Label.java
        │   │               ├── SubElement.java
        │   │               ├── Value.java
        │   │               └── package.html
        │   │           ├── binsearch
        │   │               ├── AbstractSeekableLineReader.java
        │   │               ├── ByteBufferInputStream.java
        │   │               ├── FieldExtractingSLR.java
        │   │               ├── FileSearchTool.java
        │   │               ├── SeekCDXBenchmarker.java
        │   │               ├── SeekableLineReader.java
        │   │               ├── SeekableLineReaderFactory.java
        │   │               ├── SeekableLineReaderIterator.java
        │   │               ├── SortedTextFile.java
        │   │               ├── WrappedSeekableLineReader.java
        │   │               └── impl
        │   │               │   ├── HDFSSeekableLineReader.java
        │   │               │   ├── HDFSSeekableLineReaderFactory.java
        │   │               │   ├── HTTPSeekableLineReader.java
        │   │               │   ├── HTTPSeekableLineReaderFactory.java
        │   │               │   ├── MappedSeekableLineReader.java
        │   │               │   ├── MappedSeekableLineReaderFactory.java
        │   │               │   ├── NIOSeekableLineReader.java
        │   │               │   ├── NIOSeekableLineReaderFactory.java
        │   │               │   ├── RandomAccessFileSeekableLineReader.java
        │   │               │   ├── RandomAccessFileSeekableLineReaderFactory.java
        │   │               │   └── http
        │   │               │       ├── ApacheHttp31SLR.java
        │   │               │       ├── ApacheHttp31SLRFactory.java
        │   │               │       ├── ApacheHttp43SLR.java
        │   │               │       ├── ApacheHttp43SLRFactory.java
        │   │               │       ├── HTTPURLConnSLR.java
        │   │               │       └── HTTPURLConnSLRFactory.java
        │   │           ├── io
        │   │               ├── BytesReadObserver.java
        │   │               ├── CRCInputStream.java
        │   │               ├── CRCOutputStream.java
        │   │               ├── CommitedOutputStream.java
        │   │               ├── EOFNotifyingInputStream.java
        │   │               ├── EOFObserver.java
        │   │               ├── MultiMemberOpenJDKGZIPInputStream.java
        │   │               ├── NotifyingInputStream.java
        │   │               ├── PushBackOneByteInputStream.java
        │   │               └── RuntimeIOException.java
        │   │           ├── iterator
        │   │               ├── AbstractPeekableIterator.java
        │   │               ├── BoundedStringIterator.java
        │   │               ├── CachingStringFilter.java
        │   │               ├── CloseableCompositeIterator.java
        │   │               ├── CloseableIterator.java
        │   │               ├── CloseableIteratorUtil.java
        │   │               ├── CloseableIteratorWrapper.java
        │   │               ├── FilterStringIterator.java
        │   │               ├── LineReadingIterator.java
        │   │               ├── LookaheadIterator.java
        │   │               ├── PeekableIterator.java
        │   │               ├── PrefixMatchStringIterator.java
        │   │               ├── RegexLineIterator.java
        │   │               ├── SortedCompositeIterator.java
        │   │               ├── StartBoundedStringIterator.java
        │   │               ├── StringFilter.java
        │   │               ├── StringTransformer.java
        │   │               ├── TransformingIteratorWrapper.java
        │   │               └── TransformingPrefixStringFilter.java
        │   │           └── zip
        │   │               ├── GZIPMembersInputStream.java
        │   │               ├── GzipHeader.java
        │   │               ├── NoGzipMagicException.java
        │   │               ├── OpenJDK7GZIPInputStream.java
        │   │               └── OpenJDK7InflaterInputStream.java
        └── resources
        │   ├── effective_tld_names.dat
        │   └── org
        │       └── archive
        │           ├── commons.properties
        │           ├── ia-web-commons-version.txt
        │           └── util
        │               └── tlds-alpha-by-domain.txt
    └── test
        ├── java
            └── org
            │   └── archive
            │       ├── extract
            │           └── RealCDXExtractorOutputTest.java
            │       ├── format
            │           ├── dns
            │           │   └── DNSResponseParserTest.java
            │           ├── gzip
            │           │   ├── GZIPMemberSeriesTest.java
            │           │   ├── GZIPMemberWriterTest.java
            │           │   └── zipnum
            │           │   │   └── ZipNumWriterTest.java
            │           ├── http
            │           │   ├── HttpRequestMessageParserTest.java
            │           │   └── HttpResponseParserTest.java
            │           ├── json
            │           │   ├── CompoundORJSONPathSpecTest.java
            │           │   ├── JSONPathSpecFactoryTest.java
            │           │   ├── JSONViewTest.java
            │           │   └── SimpleJSONPathSpecTest.java
            │           └── text
            │           │   └── html
            │           │       └── CDATALexerTest.java
            │       ├── io
            │           ├── ArchiveReaderFactoryTest.java
            │           ├── BufferedSeekInputStreamTest.java
            │           ├── HeaderedArchiveRecordTest.java
            │           ├── RecordingInputStreamTest.java
            │           ├── RecordingOutputStreamTest.java
            │           ├── ReplayCharSequenceTest.java
            │           ├── RepositionableInputStreamTest.java
            │           ├── arc
            │           │   ├── ARCReaderFactoryTest.java
            │           │   ├── ARCWriterPoolTest.java
            │           │   └── ARCWriterTest.java
            │           └── warc
            │           │   ├── WARCReaderFactoryTest.java
            │           │   └── WARCWriterTest.java
            │       ├── net
            │           └── PublicSuffixesTest.java
            │       ├── resource
            │           ├── MetaDataTest.java
            │           ├── arc
            │           │   └── ARCResourceTest.java
            │           ├── html
            │           │   ├── ExtractingParseObserverTest.java
            │           │   └── HTMLMetaDataTest.java
            │           └── warc
            │           │   └── WARCResourceTest.java
            │       ├── uid
            │           └── UUIDGeneratorTest.java
            │       ├── url
            │           ├── AggressiveIAURLCanonicalizerTest.java
            │           ├── BasicURLCanonicalizerTest.java
            │           ├── HandyURLTest.java
            │           ├── IAURLCanonicalizerTest.java
            │           ├── OrdinaryIAURLCanonicalizerTest.java
            │           ├── URLParserTest.java
            │           ├── URLRegexTransformerTest.java
            │           ├── UsableURIFactoryTest.java
            │           ├── UsableURITest.java
            │           └── WaybackURLKeyMakerTest.java
            │       └── util
            │           ├── ArchiveUtilsTest.java
            │           ├── ByteOpTest.java
            │           ├── CrossProductTest.java
            │           ├── FileUtilsTest.java
            │           ├── InterruptibleCharSequenceTest.java
            │           ├── MimetypeUtilsTest.java
            │           ├── PropertyUtilsTest.java
            │           ├── StringFieldExtractorTest.java
            │           ├── TestUtils.java
            │           ├── anvl
            │               └── ANVLRecordTest.java
            │           ├── binsearch
            │               └── SortedTextFileTest.java
            │           ├── iterator
            │               ├── CachingStringFilterTest.java
            │               ├── FilterStringIteratorTest.java
            │               └── SortedCompositeIteratorTest.java
            │           └── zip
            │               └── GZIPMembersInputStreamTest.java
        └── resources
            └── org
                └── archive
                    ├── format
                        ├── arc
                        │   └── IAH-20080430204825-00000-blackbook-truncated.arc
                        ├── gzip
                        │   ├── IAH-urls-wget.warc.gz
                        │   ├── abcd.gz
                        │   ├── double-single-inflate-error.gz
                        │   ├── empty.gz
                        │   ├── hi-2.gz
                        │   └── hi.gz
                        └── warc
                        │   ├── IAH-urls-wget.warc
                        │   └── mutliple-headers.warc
                    └── resource
                        └── html
                            ├── html-lang-attribute.warc
                            ├── link-extraction-test.warc
                            ├── meta-itemprop.warc
                            ├── text-extraction-test.warc
                            └── title-extraction-embedded-SVG.warc


/.github/workflows/maven.yml:
--------------------------------------------------------------------------------
 1 | name: Java CI with Maven
 2 | # Builds the project with Maven for pushes to, and pull requests against, the master branch.
 3 | on:
 4 |   push:
 5 |     branches: [ "master" ]
 6 |   pull_request:
 7 |     branches: [ "master" ]
 8 | 
 9 | jobs:
10 |   build:
11 |     strategy:
12 |       matrix:
13 |         jdk: [8, 11, 17, 21, 22]  # the build job is run once per JDK version listed here
14 |         
15 |     runs-on: ubuntu-latest
16 |     timeout-minutes: 30
17 |     
18 |     steps:
19 |     - uses: actions/checkout@v4
20 |     - name: Set up JDK ${{ matrix.jdk }}
21 |       uses: actions/setup-java@v4
22 |       with:
23 |         java-version: ${{ matrix.jdk }}
24 |         distribution: 'temurin'
25 |         cache: maven
26 |     - name: Cache local Maven repository  # NOTE(review): may duplicate the `cache: maven` option of the setup-java step above - confirm
27 |       uses: actions/cache@v4
28 |       with:
29 |         path: ~/.m2/repository
30 |         key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}  # cache key changes whenever any pom.xml changes
31 |         restore-keys: |
32 |           ${{ runner.os }}-maven-
33 |     - name: Build with Maven
34 |       run: mvn -B package --file pom.xml
35 | 
36 |     # Optional: Uploads the full dependency graph to GitHub to improve the quality of Dependabot alerts this repository can receive
37 |     - name: Update dependency graph
38 |       if: ${{ github.event_name == 'push' }}  # step is skipped for pull_request runs
39 |       uses: advanced-security/maven-dependency-submission-action@v4.1.1


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.pydevproject
 2 | .project
 3 | .metadata
 4 | bin/**
 5 | tmp/**
 6 | tmp/**/*
 7 | *.tmp
 8 | *.bak
 9 | *.swp
10 | *~.nib
11 | local.properties
12 | .classpath
13 | .settings/
14 | .loadpath
15 | 
16 | # Target
17 | target/
18 | 
19 | # External tool builders
20 | .externalToolBuilders/
21 | 
22 | # Locally stored "Eclipse launch configurations"
23 | *.launch
24 | 
25 | # CDT-specific
26 | .cproject
27 | 
28 | # PDT-specific
29 | .buildpath
30 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | IIPC Web Archive Commons
2 | ========================
3 | [![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.netpreserve.commons/webarchive-commons/badge.svg)](https://maven-badges.herokuapp.com/maven-central/org.netpreserve.commons/webarchive-commons) [![Javadoc](https://javadoc.io/badge2/org.netpreserve.commons/webarchive-commons/javadoc.svg)](https://www.javadoc.io/doc/org.netpreserve.commons/webarchive-commons)
4 | 
5 | This repository contains common utility code for [OpenWayback][1] and other projects.
6 | 
7 | [1]: https://github.com/iipc/openwayback
8 | 


--------------------------------------------------------------------------------
/src/main/java/it/unimi/dsi/fastutil/io/RepositionableStream.java:
--------------------------------------------------------------------------------
 1 | // copied from fastutil, keeping the original package name to avoid breaking
 2 | // compatibility with existing user code that implements this interface
 3 | package it.unimi.dsi.fastutil.io;
 4 | 
 5 | /*		 
 6 |  * Copyright (C) 2005-2015 Sebastiano Vigna
 7 |  *
 8 |  * Licensed under the Apache License, Version 2.0 (the "License");
 9 |  * you may not use this file except in compliance with the License.
10 |  * You may obtain a copy of the License at
11 |  *
12 |  *     http://www.apache.org/licenses/LICENSE-2.0
13 |  *
14 |  * Unless required by applicable law or agreed to in writing, software
15 |  * distributed under the License is distributed on an "AS IS" BASIS,
16 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 |  * See the License for the specific language governing permissions and
18 |  * limitations under the License. 
19 |  */
20 | 
21 | 
22 | /** A basic interface specifying positioning methods for a byte stream.
23 |  * <p>Declares two overloads named {@code position}: one sets the position, the other reads it back.
24 |  * @author Sebastiano Vigna
25 |  * @since 4.4
26 |  */
27 | 
28 | public interface RepositionableStream {
29 | 
30 | 	/** Sets the current stream position.
31 | 	 *
32 | 	 * @param newPosition the new stream position.
33 | 	 * @throws java.io.IOException declared by the signature; the failure conditions are left to the implementation. */
34 | 	void position( long newPosition ) throws java.io.IOException;
35 | 
36 | 	/** Returns the current stream position.
37 | 	 *
38 | 	 * @return the current stream position.
39 | 	 * @throws java.io.IOException declared by the signature; the failure conditions are left to the implementation. */
40 | 	long position() throws java.io.IOException;
41 | 
42 | }
43 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/RecoverableRecordFormatException.java:
--------------------------------------------------------------------------------
 1 | package org.archive;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | public class RecoverableRecordFormatException extends IOException {
 6 | 	// Thin IOException subclass: each constructor below forwards its arguments unchanged to the matching IOException constructor.
 7 | 	/**
 8 | 	 * Serialization version identifier for this exception class.
 9 | 	 */
10 | 	private static final long serialVersionUID = 2775048979983919630L;
11 | 	public RecoverableRecordFormatException() { // no detail message, no cause
12 | 		super();
13 | 	}
14 | 	public RecoverableRecordFormatException(String message) { // detail message only
15 | 		super(message);
16 | 	}
17 | 	public RecoverableRecordFormatException(Exception e) { // e becomes the cause
18 | 		super(e);
19 | 	}
20 | 	public RecoverableRecordFormatException(String message, IOException e) { // message plus cause; the cause is typed IOException here but Exception in the one-argument form
21 | 		super(message,e);
22 | 	}
23 | 
24 | }
25 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/extract/DumpingExtractorOutput.java:
--------------------------------------------------------------------------------
 1 | package org.archive.extract;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.OutputStream;
 5 | import java.io.PrintStream;
 6 | import java.util.logging.Logger;
 7 | 
 8 | import org.archive.resource.Resource;
 9 | import org.archive.util.StreamCopy;
10 | import com.github.openjson.JSONException;
11 | 
12 | import com.google.common.io.ByteStreams;
13 | import com.google.common.io.CountingOutputStream;
14 | 
15 | public class DumpingExtractorOutput implements ExtractorOutput {
16 | 	private static final Logger LOG = 
17 | 		Logger.getLogger(DumpingExtractorOutput.class.getName());
18 | 	
19 | 	private PrintStream out;
20 | 	public DumpingExtractorOutput(OutputStream out) {
21 | 		this.out = new PrintStream(out);
22 | 	}
23 | 
24 | 	public void output(Resource resource) throws IOException {
25 | 		OutputStream nullo = ByteStreams.nullOutputStream();
26 | 		CountingOutputStream co = new CountingOutputStream(nullo);
27 | 		StreamCopy.copy(resource.getInputStream(), co);
28 | 		long bytes = co.getCount();
29 | 		if(bytes > 0) {
30 | 			LOG.info(bytes + " unconsumed bytes in Resource InputStream.");
31 | 		}
32 | 		try {
33 | 			out.println(resource.getMetaData().getTopMetaData().toString(1));
34 | 		} catch (JSONException e) {
35 | 			LOG.warning(e.getMessage());
36 | 		}		
37 | 	}
38 | }
39 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/extract/ExtractingResourceProducer.java:
--------------------------------------------------------------------------------
 1 | package org.archive.extract;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.logging.Level;
 5 | import java.util.logging.Logger;
 6 | 
 7 | import org.archive.resource.Resource;
 8 | import org.archive.resource.ResourceFactory;
 9 | import org.archive.resource.ResourceParseException;
10 | import org.archive.resource.ResourceProducer;
11 | 
12 | public class ExtractingResourceProducer implements ResourceProducer {
13 | 	private static final Logger LOG =
14 | 		Logger.getLogger(ExtractingResourceProducer.class.getName());
15 | 	private ResourceProducer producer;
16 | 	private ResourceFactoryMapper mapper;
17 | 
18 | 	public ExtractingResourceProducer(ResourceProducer producer, 
19 | 			ResourceFactoryMapper mapper) {
20 | 
21 | 		this.producer = producer;
22 | 		this.mapper = mapper;
23 | 	}
24 | 	
25 | 	public Resource getNext() throws ResourceParseException, IOException {
26 | 		Resource current = producer.getNext();
27 | 		if(current == null) {
28 | 			return null;
29 | 		}
30 | 		while(true) {
31 | 			ResourceFactory f = mapper.mapResourceToFactory(current);
32 | 			if(f == null) {
33 | 				return current;
34 | 			}
35 | 			if(LOG.isLoggable(Level.FINE)) {
36 | 				LOG.fine(String.format("Extracting (%s) with (%s)\n",
37 | 						current.getClass().toString(),
38 | 						f.getClass().toString()));
39 | 			}
40 | 			current = f.getResource(current.getInputStream(),
41 | 					current.getMetaData(), current.getContainer());
42 | 		}
43 | 	}
44 | 
45 | 	public void close() throws IOException {
46 | 		producer.close();
47 | 	}
48 | 
49 | 	public String getContext() {
50 | 		return producer.getContext();
51 | 	}
52 | 
53 | }
54 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/extract/ExtractorOutput.java:
--------------------------------------------------------------------------------
 1 | package org.archive.extract;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | import org.archive.resource.Resource;
 6 | 
/**
 * A sink for extracted resources; the implementation decides what to emit
 * for each one.
 */
public interface ExtractorOutput {
	/**
	 * Emits output for one resource.
	 *
	 * @param resource the resource to write out
	 * @throws IOException if producing the output fails
	 */
	public void output(Resource resource) throws IOException;
}
10 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/extract/FilteredExtractorOuput.java:
--------------------------------------------------------------------------------
 1 | package org.archive.extract;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.PrintStream;
 5 | import java.util.List;
 6 | 
 7 | import org.archive.format.json.JSONUtils;
 8 | import org.archive.resource.Resource;
 9 | import org.archive.util.StreamCopy;
10 | 
11 | public class FilteredExtractorOuput implements ExtractorOutput {
12 | 	private String filterPath;
13 | 	private PrintStream out;
14 | 	public FilteredExtractorOuput(PrintStream out, String filterPath) {
15 | 		this.filterPath = filterPath;
16 | 		this.out = out;
17 | 	}
18 | 	public void output(Resource resource) throws IOException {
19 | 		StreamCopy.readToEOF(resource.getInputStream());
20 | 		List<String> results = JSONUtils.extractFancy(resource.getMetaData().getTopMetaData(), filterPath);
21 | 		if(results != null) {
22 | 			for(String result: results) {
23 | 				out.println("Result: " + result);
24 | 			}
25 | 		}
26 | 	}
27 | 	public void output2(Resource resource) throws IOException {
28 | 		String result = JSONUtils.extractSingle(resource.getMetaData().getTopMetaData(), filterPath);
29 | 		if(result != null) {
30 | 			out.println("Result:" + result);
31 | 		}
32 | 	}
33 | 
34 | }
35 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/extract/JSONViewExtractorOutput.java:
--------------------------------------------------------------------------------
 1 | package org.archive.extract;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.OutputStream;
 5 | import java.io.PrintStream;
 6 | import java.util.List;
 7 | 
 8 | import org.apache.commons.lang.StringUtils;
 9 | import org.archive.format.json.JSONView;
10 | import org.archive.resource.Resource;
11 | import org.archive.util.StreamCopy;
12 | 
13 | public class JSONViewExtractorOutput implements ExtractorOutput {
14 | 	private PrintStream out;
15 | 	private JSONView view;
16 | 	public JSONViewExtractorOutput(OutputStream out, String filterPath) {
17 | 		view = new JSONView(filterPath.split(","));
18 | 		this.out = new PrintStream(out);
19 | 	}
20 | 	public void output(Resource resource) throws IOException {
21 | 		StreamCopy.readToEOF(resource.getInputStream());
22 | 		List<List<String>> data = 
23 | 			view.apply(resource.getMetaData().getTopMetaData());
24 | 		if(data != null) {
25 | 			for(List<String> d : data) {
26 | 				out.println(StringUtils.join(d,"\t"));
27 | 			}
28 | 		}
29 | 	}
30 | }
31 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/extract/ResourceFactoryMapper.java:
--------------------------------------------------------------------------------
 1 | package org.archive.extract;
 2 | 
 3 | import org.archive.resource.Resource;
 4 | import org.archive.resource.ResourceConstants;
 5 | import org.archive.resource.ResourceFactory;
 6 | 
/**
 * Chooses the {@link ResourceFactory} that should process a given
 * {@link Resource}. Extends {@link ResourceConstants}, so its constants are
 * in scope for implementations.
 */
public interface ResourceFactoryMapper extends ResourceConstants {
	/**
	 * Selects a factory for the given resource.
	 *
	 * @param resource the resource to inspect
	 * @return the factory to apply next; ExtractingResourceProducer in this
	 *         package treats a null return as "no further extraction"
	 */
	public ResourceFactory mapResourceToFactory(Resource resource);
}
10 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/arc/ARCFormatException.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.arc;
 2 | 
 3 | import org.archive.RecoverableRecordFormatException;
 4 | 
/**
 * A {@link RecoverableRecordFormatException} specific to the ARC format.
 */
public class ARCFormatException extends RecoverableRecordFormatException {

	/**
	 * Creates an exception with a detail message.
	 *
	 * @param string description of the format problem
	 */
	public ARCFormatException(String string) {
		super(string);
	}
	/**
	 * Creates an exception wrapping another exception as its cause.
	 *
	 * @param e the underlying exception
	 */
	public ARCFormatException(Exception e) {
		super(e);
	}

	/**
	 * Serialization version identifier.
	 */
	private static final long serialVersionUID = 1L;

}
20 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/cdx/CDX09Line.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.cdx;
 2 | 
/**
 * A {@link CDXLine} that reads its fields by fixed position rather than by
 * name. The positions used here are: 0 url key, 1 timestamp, 2 original url,
 * 3 mime type, 4 status code, 5 digest, 6 redirect, 7 offset, 8 filename.
 *
 * <p>{@code getLength()} and {@code getRobotFlags()} are not overridden, so
 * they keep the name-based lookup inherited from {@link CDXLine}.
 */
public class CDX09Line extends CDXLine {
	
	/**
	 * Package-private constructor; both arguments are passed straight to the
	 * {@link CDXLine} constructor.
	 *
	 * @param string the raw CDX line
	 * @param selectNames field format handed to the superclass
	 */
	CDX09Line(String string, FieldSplitFormat selectNames) {
	    super(string, selectNames);
    }

	/** @return field 0 */
	@Override
    public String getUrlKey() {
	    return getField(0);
    }

	/** @return field 1 */
	@Override
    public String getTimestamp() {
	    return getField(1);
    }

	/** @return field 2 */
	@Override
    public String getOriginalUrl() {
	    return getField(2);
    }

	/** @return field 3 */
	@Override
    public String getMimeType() {
	    return getField(3);
    }
	
	/** Overwrites field 3 with the given mime type. */
	@Override
	public void setMimeType(String mime)
	{
		setField(3, mime);
	}

	/** @return field 4 */
	@Override
    public String getStatusCode() {
	    return getField(4);
    }

	/** @return field 5 */
	@Override
    public String getDigest() {
	    return getField(5);
	}
	
	/** @return field 6 */
	@Override
    public String getRedirect() {
	    return getField(6);
    }

	/** @return field 7 */
	@Override
    public String getOffset() {
		return getField(7);
    }

	/** @return field 8 */
	@Override
    public String getFilename() {
		return getField(8);
    }
}
60 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/cdx/CDX11Line.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.cdx;
 2 | 
/**
 * A {@link CDXLine} that reads its fields by fixed position rather than by
 * name. The positions used here are: 0 url key, 1 timestamp, 2 original url,
 * 3 mime type, 4 status code, 5 digest, 6 redirect, 7 robot flags, 8 length,
 * 9 offset, 10 filename.
 */
public class CDX11Line extends CDXLine {

	/**
	 * Package-private constructor; both arguments are passed straight to the
	 * {@link CDXLine} constructor.
	 *
	 * @param string the raw CDX line
	 * @param selectNames field format handed to the superclass
	 */
	CDX11Line(String string, FieldSplitFormat selectNames) {
	    super(string, selectNames);
    }

	/** @return field 0 */
	@Override
    public String getUrlKey() {
	    return getField(0);
    }

	/** @return field 1 */
	@Override
    public String getTimestamp() {
	    return getField(1);
    }

	/** @return field 2 */
	@Override
    public String getOriginalUrl() {
	    return getField(2);
    }

	/** @return field 3 */
	@Override
    public String getMimeType() {
	    return getField(3);
    }
	
	/** Overwrites field 3 with the given mime type. */
	@Override
	public void setMimeType(String mime)
	{
		setField(3, mime);
	}

	/** @return field 4 */
	@Override
    public String getStatusCode() {
	    return getField(4);
    }

	/** @return field 5 */
	@Override
    public String getDigest() {
	    return getField(5);
	}
	
	/** @return field 6 */
	@Override
    public String getRedirect() {
	    return getField(6);
    }
	
	/** @return field 7 */
	@Override
    public String getRobotFlags() {
		return getField(7);
    }	

	/** @return field 8 */
	@Override
    public String getLength() {
		return getField(8);
    }

	/** @return field 9 */
	@Override
    public String getOffset() {
		return getField(9);
    }

	/** @return field 10 */
	@Override
    public String getFilename() {
		return getField(10);
    }
}
70 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/cdx/CDXFieldConstants.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.cdx;
 2 | 
/**
 * Names of the standard CDX fields, plus a {@link FieldSplitFormat} listing
 * all of them. Each constant's value is identical to its own identifier.
 */
public interface CDXFieldConstants {
	public final static String urlkey = "urlkey";
	public final static String timestamp = "timestamp";
	public final static String original = "original";
	public final static String mimetype = "mimetype";
	public final static String statuscode = "statuscode";
	public final static String digest = "digest";
	public final static String redirect = "redirect";
	public final static String robotflags = "robotflags";
	public final static String length = "length";
	public final static String offset = "offset";
	public final static String filename = "filename";

	// A list of *ALL* standard cdx field names, in the order
	// urlkey, timestamp, original, mimetype, statuscode, digest, redirect,
	// robotflags, length, offset, filename.
	public final static FieldSplitFormat CDX_ALL_NAMES = new FieldSplitFormat(urlkey, timestamp, original, mimetype, statuscode, digest, redirect, robotflags,
			length, offset, filename);
}
20 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/cdx/CDXInputSource.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.cdx;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | import org.archive.format.gzip.zipnum.ZipNumParams;
 6 | import org.archive.util.iterator.CloseableIterator;
 7 | 
/**
 * A source of CDX lines that can be queried by key and that reports how many
 * lines it holds.
 */
public interface CDXInputSource {

	/**
	 * Opens an iterator over CDX lines for a key.
	 * NOTE(review): the exact roles of {@code prefix} and {@code exact} are
	 * not visible here - presumably prefix versus exact-match lookup; confirm
	 * against an implementation.
	 *
	 * @param params ZipNum lookup parameters
	 * @return an iterator the caller is expected to close
	 * @throws IOException if the underlying source cannot be read
	 */
	public CloseableIterator<String> getCDXIterator(String key, String prefix, boolean exact, ZipNumParams params) throws IOException;
	/**
	 * Opens an iterator over CDX lines for a key, taking explicit start and
	 * end arguments.
	 * NOTE(review): the semantics of {@code start} and {@code startEndUrl}
	 * are not visible here; confirm against an implementation.
	 *
	 * @param params ZipNum lookup parameters
	 * @return an iterator the caller is expected to close
	 * @throws IOException if the underlying source cannot be read
	 */
	public CloseableIterator<String> getCDXIterator(String key, String start, String startEndUrl, ZipNumParams params) throws IOException;
	
	/** @return the total number of lines reported by this source */
	public long getTotalLines();
}
15 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/cdx/CDXLine.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.cdx;
 2 | 
 3 | 
/**
 * A {@link FieldSplitLine} with named accessors for the standard CDX fields
 * declared in {@link CDXFieldConstants}. Every accessor here looks its field
 * up by name; subclasses may override them with positional lookups.
 */
public class CDXLine extends FieldSplitLine implements CDXFieldConstants {

	/**
	 * Builds a line from raw text, passing a single space as the delimiter
	 * to the superclass.
	 *
	 * @param line the raw CDX line
	 * @param names the field format handed to the superclass
	 */
	public CDXLine(String line, FieldSplitFormat names) {
		super(line, ' ', names);
	}
	
	/**
	 * Builds a line from the values that {@code line.selectValues(selectNames)}
	 * returns for an existing line.
	 *
	 * @param line the line to take values from
	 * @param selectNames the fields to select; also the new line's format
	 */
	public CDXLine(CDXLine line, FieldSplitFormat selectNames)
	{
		super(line.selectValues(selectNames), selectNames);
	}

	/** @return the value of the "urlkey" field */
	public String getUrlKey() {
		return super.getField(CDXLine.urlkey);
	}

	/** @return the value of the "timestamp" field */
	public String getTimestamp() {
		return super.getField(CDXLine.timestamp);
	}

	/** @return the value of the "original" field */
	public String getOriginalUrl() {
		return super.getField(CDXLine.original);
	}

	/** @return the value of the "mimetype" field */
	public String getMimeType() {
		return super.getField(CDXLine.mimetype);
	}
	
	/** Replaces the value of the "mimetype" field. */
	public void setMimeType(String newMime) {
		setField(CDXLine.mimetype, newMime);
	}

	/** @return the value of the "statuscode" field */
	public String getStatusCode() {
		return super.getField(CDXLine.statuscode);
	}
	
	/** Replaces the value of the "statuscode" field. */
	public void setStatusCode(String newStatus) {
		setField(CDXLine.statuscode, newStatus);
	}

	/** @return the value of the "digest" field */
	public String getDigest() {
		return super.getField(CDXLine.digest);
	}

	/** @return the value of the "length" field */
	public String getLength() {
		return super.getField(CDXLine.length);
	}

	/** @return the value of the "offset" field */
	public String getOffset() {
		return super.getField(CDXLine.offset);
	}

	/** @return the value of the "filename" field */
	public String getFilename() {
		return super.getField(CDXLine.filename);
	}
	
	/** @return the value of the "redirect" field */
	public String getRedirect() {
		return super.getField(CDXLine.redirect);
	}
	
	/** @return the value of the "robotflags" field */
	public String getRobotFlags() {
		return super.getField(CDXLine.robotflags);
	}
}
67 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/cdx/CDXLineFactory.java:
--------------------------------------------------------------------------------
1 | package org.archive.format.cdx;
2 | 
/**
 * Factory for {@link CDXLine} instances that also exposes the
 * {@link FieldSplitFormat} it parses with.
 */
public interface CDXLineFactory {
	/** @return the field format this factory uses when parsing */
	public FieldSplitFormat getParseFormat();
	/**
	 * Creates a CDX line from raw text.
	 * NOTE(review): the role of {@code exFormat} is not visible here; confirm
	 * against an implementation.
	 *
	 * @param input the raw CDX line
	 * @param exFormat a field format supplied by the caller
	 * @return the created line
	 */
	public CDXLine createStandardCDXLine(String input, FieldSplitFormat exFormat);
}
7 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/dns/DNSParseException.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.dns;
 2 | 
 3 | import org.archive.RecoverableRecordFormatException;
 4 | 
/**
 * A {@link RecoverableRecordFormatException} raised when a DNS record line
 * cannot be parsed.
 */
public class DNSParseException extends RecoverableRecordFormatException {

	/**
	 * Creates an exception with a detail message.
	 *
	 * @param string description of the parse problem
	 */
	public DNSParseException(String string) {
		super(string);
	}

	/**
	 * Serialization version identifier.
	 */
	private static final long serialVersionUID = 7946541881940132743L;

}
17 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/dns/DNSRecord.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.dns;
 2 | 
 3 | public class DNSRecord {
 4 | 	private String name;
 5 | 	private int ttl;
 6 | 	private String netClass;
 7 | 	private String type;
 8 | 	private String value;
 9 | 	public DNSRecord(String name, int ttl, String netClass, String type, String value) {
10 | 		this.name = name;
11 | 		this.ttl = ttl;
12 | 		this.netClass = netClass;
13 | 		this.type = type;
14 | 		this.value = value;
15 | 	}
16 | 	public String getName() {
17 | 		return name;
18 | 	}
19 | 	public int getTtl() {
20 | 		return ttl;
21 | 	}
22 | 	public String getNetClass() {
23 | 		return netClass;
24 | 	}
25 | 	public String getType() {
26 | 		return type;
27 | 	}
28 | 	public String getValue() {
29 | 		return value;
30 | 	}
31 | 	public static DNSRecord parse(String line) throws DNSParseException {
32 | 		String a[] = line.split("\\s+");
33 | 		try {
34 | 			if(a.length == 5) {
35 | 				return new DNSRecord(a[0],Integer.parseInt(a[1]),a[2],a[3],a[4]);
36 | 			} else {
37 | 				throw new DNSParseException("Wrong number of fields:" + line);
38 | 			}
39 | 		} catch (NumberFormatException e) {
40 | 			throw new DNSParseException("BAD TTL field:" + line);
41 | 		}
42 | 	}
43 | }
44 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/dns/DNSResponse.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.dns;
 2 | 
 3 | import java.util.ArrayList;
 4 | 
/**
 * A list of {@link DNSRecord}s together with an optional date string.
 */
public class DNSResponse extends ArrayList<DNSRecord> {
	/**
	 * Serialization version identifier.
	 */
	private static final long serialVersionUID = -10624236867791758L;
	// Date string associated with the response; null until setDate is called.
	private String date;
	/** Stores the date string as given, without validation. */
	public void setDate(String date) {
		this.date = date;
	}

	/** @return the stored date string, or null if none was set */
	public String getDate() {
		return date;
	}
	
}
20 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/dns/DNSResponseParser.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.dns;
 2 | 
 3 | import java.io.BufferedReader;
 4 | import java.io.IOException;
 5 | import java.io.InputStream;
 6 | import java.io.InputStreamReader;
 7 | import java.io.UnsupportedEncodingException;
 8 | 
 9 | public class DNSResponseParser {
10 | 
11 | 	private boolean isBlank(String line) {
12 | 		return line.matches("\\S");
13 | 	}
14 | 	private boolean isDate(String dateLine) {
15 | 		return !isBlank(dateLine);
16 | 	}
17 | 	public void parse(InputStream is, DNSResponse response) throws IOException, DNSParseException {
18 | 		/*
19 | 		20110328212258
20 | 		www.google.com.		86399	IN	CNAME	www.l.google.com.
21 | 		www.l.google.com.	299	IN	A	74.125.71.105
22 | 		www.l.google.com.	299	IN	A	74.125.71.103
23 | 		www.l.google.com.	299	IN	A	74.125.71.99
24 | 		www.l.google.com.	299	IN	A	74.125.71.147
25 | 		www.l.google.com.	299	IN	A	74.125.71.104
26 | 		www.l.google.com.	299	IN	A	74.125.71.106
27 | 		*/
28 | 		try {
29 | 			// TODO: should we wrap in a CountingInputStream and indicate 
30 | 			//        observed octet-length?
31 | 			BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
32 | 			String date = br.readLine().trim();
33 | 			if(isDate(date)) {
34 | 				response.setDate(date);
35 | 			}
36 | 			while(true) {
37 | 				String line = br.readLine();
38 | 				if(line == null) {
39 | 					break;
40 | 				}
41 | 				if(!isBlank(line)) {
42 | 					response.add(DNSRecord.parse(line));
43 | 				}
44 | 			}
45 | 		} catch (UnsupportedEncodingException e) {
46 | 			// really really should not happen..
47 | 			e.printStackTrace();
48 | 		}
49 | 	}
50 | }
51 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/gzip/GZIPFooter.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.gzip;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.OutputStream;
 5 | 
 6 | import org.archive.util.ByteOp;
 7 | 
 8 | public class GZIPFooter implements GZIPConstants {
 9 | 	byte buffer[] = null;
10 | 
11 | 	public GZIPFooter(byte buffer[]) throws GZIPFormatException {
12 | 		if(buffer.length != GZIP_FOOTER_BYTES) {
13 | 			throw new GZIPFormatException("Wrong length footer");
14 | 		}
15 | 		this.buffer = buffer;
16 | 	}
17 | 	public GZIPFooter(long crc, long length) {
18 | 		buffer = new byte[GZIP_FOOTER_BYTES];
19 | 		ByteOp.writeInt(buffer, 0, crc);
20 | 		ByteOp.writeInt(buffer, BYTES_IN_INT, length);
21 | 	}
22 | 	public long getCRC() {
23 | 		return ByteOp.bytesToInt(buffer, 0);
24 | 	}
25 | 	public long getLength() {
26 | 		return ByteOp.bytesToInt(buffer, BYTES_IN_INT);
27 | 	}
28 | 	public void verify(long crc, long length) throws GZIPFormatException {
29 | //		long gotCRC = getCRC() & 0xffffffff;
30 | //		long gotCRC2 = getCRC();
31 | //		int gotCRCi = (int) (getCRC() & 0xffffffff);
32 | //		
33 | //		long wantCRC = crc & 0xffffffff;
34 | 		int wantCRCi = (int) (crc & 0xffffffff);
35 | 		if(wantCRCi != getCRC()) {
36 | 			throw new GZIPFormatException("GZip crc error");
37 | 		}
38 | 		if(length != getLength()) {
39 | 			throw new GZIPFormatException("GZip length error");
40 | 		}
41 | 	}
42 | 	public void writeBytes(OutputStream os) throws IOException {
43 | 		os.write(buffer);
44 | 	}
45 | }
46 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/gzip/GZIPFormatException.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.gzip;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | import org.archive.RecoverableRecordFormatException;
 6 | 
 7 | 
/**
 * A {@link RecoverableRecordFormatException} specific to the GZIP format.
 */
public class GZIPFormatException extends RecoverableRecordFormatException {
	/** Serialization version identifier. */
	private static final long serialVersionUID = -3526676437467483190L;

	/** Creates an exception with no detail message and no cause. */
	public GZIPFormatException() {
		super();
	}
	/**
	 * Creates an exception with a detail message.
	 *
	 * @param message description of the format problem
	 */
	public GZIPFormatException(String message) {
		super(message);
	}
	/**
	 * Creates an exception wrapping another exception as its cause.
	 *
	 * @param e the underlying exception
	 */
	public GZIPFormatException(Exception e) {
		super(e);
	}
	/**
	 * Creates an exception with a detail message and an I/O cause.
	 *
	 * @param message description of the format problem
	 * @param e the underlying I/O exception
	 */
	public GZIPFormatException(String message, IOException e) {
		super(message,e);
	}
	/**
	 * Raised with the fixed message "Extra Field short."; it records a byte
	 * count supplied by the thrower.
	 */
	public static class GZIPExtraFieldShortException extends GZIPFormatException {
		// Package-private; there is no accessor for it in this class.
		int bytesRead;
		/**
		 * @param bytesRead the byte count to record
		 */
		public GZIPExtraFieldShortException(int bytesRead) {
			super("Extra Field short.");
			this.bytesRead = bytesRead;
		}
	}
}
32 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/gzip/GZIPMemberWriter.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.gzip;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.InputStream;
 5 | import java.io.OutputStream;
 6 | import java.util.zip.Deflater;
 7 | import java.util.zip.DeflaterOutputStream;
 8 | 
 9 | import org.archive.util.StreamCopy;
10 | import org.archive.util.io.CRCInputStream;
11 | 
12 | import com.google.common.io.CountingOutputStream;
13 | 
14 | public class GZIPMemberWriter implements GZIPConstants {
15 | 	private static final int MAX_RAM_BUFFER = 1024 * 1024;
16 | 	private byte slRecordName[] = SL_RECORD;
17 | 	public int maxBuffer = MAX_RAM_BUFFER;
18 | 	private CountingOutputStream out;
19 | 	
20 | 	public GZIPMemberWriter(OutputStream out) {
21 | 		this.out = new CountingOutputStream(out);
22 | 	}
23 | 
24 | 	public void write(InputStream is) throws IOException {
25 | 		CRCInputStream crc = new CRCInputStream(is);
26 | 		GZIPHeader gzHeader = new GZIPHeader();
27 | 		// TODO: add fields...
28 | 		gzHeader.writeBytes(out);
29 | 		Deflater deflater = new Deflater(Deflater.DEFAULT_COMPRESSION, true);
30 | 		DeflaterOutputStream deflateOut = new DeflaterOutputStream(out,deflater);
31 | 		StreamCopy.copy(crc, deflateOut);
32 | 		deflateOut.finish();
33 | 		GZIPFooter gzFooter = new GZIPFooter(crc.getCRCValue(), crc.getByteCount());
34 | 		gzFooter.writeBytes(out);
35 | 		out.flush();
36 | 	}
37 | 
38 | 	public long getBytesWritten() {
39 | 		return out.getCount();
40 | 	}
41 | }
42 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/gzip/GZIPMemberWriterCommittedOutputStream.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.gzip;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.ByteArrayInputStream;
 5 | import java.io.ByteArrayOutputStream;
 6 | 
 7 | import org.archive.util.io.CommitedOutputStream;
 8 | 
 9 | public class GZIPMemberWriterCommittedOutputStream extends CommitedOutputStream {
10 | 	private static int DEFAULT_BUFFER_RAM = 1024 * 1024;
11 | 	private GZIPMemberWriter gzW;
12 | 	public GZIPMemberWriterCommittedOutputStream(GZIPMemberWriter gzW) {
13 | 		this(gzW,DEFAULT_BUFFER_RAM);
14 | 	}
15 | 	public GZIPMemberWriterCommittedOutputStream(GZIPMemberWriter gzW, int bufferRAM) {
16 |                 super(new ByteArrayOutputStream());
17 | 		this.gzW = gzW;
18 | 	}
19 | 
20 | 	@Override
21 | 	public void commit() throws IOException {
22 |                 ByteArrayOutputStream bos = (ByteArrayOutputStream) out;
23 | 		gzW.write(new ByteArrayInputStream(bos.toByteArray()));
24 | 	}
25 | 	public long getBytesWritten() {
26 | 		return gzW.getBytesWritten();
27 | 	}
28 | }
29 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/gzip/zipnum/SummaryLine.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.gzip.zipnum;
 2 | 
 3 | import org.archive.format.cdx.FieldSplitLine;
 4 | 
 5 | public class SummaryLine extends FieldSplitLine
 6 | {
 7 | 	String partId;		
 8 | 	long offset;
 9 | 	int length;
10 | 	
11 | 	public SummaryLine(String line)
12 | 	{
13 | 		super(line, '\t', null);
14 | 		partId = super.getField(1);
15 | 		if (super.getNumFields() < 3) {
16 | 			return;
17 | 		}
18 | 		offset = Long.parseLong(super.getField(2));
19 | 		length = Integer.parseInt(super.getField(3));
20 | 		//timestamp = makeTimestamp(parts[0]);
21 | 	}
22 | 	
23 | //		String makeTimestamp(String key)
24 | //		{
25 | //			if (params.getTimestampDedupLength() <= 0) {
26 | //				return null;
27 | //			}
28 | //			
29 | //			int space = key.indexOf(' ');
30 | //			if (space >= 0) {
31 | //				return key.substring(0, space + 1 + params.getTimestampDedupLength());
32 | //			} else {
33 | //				return null;
34 | //			}
35 | //		}
36 | 	
37 | 	public boolean isContinuous(SummaryLine next)
38 | 	{
39 | 		if (next == null || next.fullLine == null) {
40 | 			return false;
41 | 		}
42 | 		
43 | 		// Must be same part
44 | 		if (!partId.equals(next.partId)) {
45 | 			return false;
46 | 		}
47 | 		
48 | 		if ((offset + length) != next.offset) {
49 | 			return false;
50 | 		}
51 | 		
52 | 		return true;
53 | 	}
54 | 	
55 | //		boolean sameTimestamp(SplitLine next)
56 | //		{
57 | //			if (next == null || next.timestamp == null) {
58 | //				return false;
59 | //			}
60 | //			
61 | //			if (timestamp == null) {
62 | //				return false;
63 | //			}
64 | //			
65 | //			return timestamp.equals(next.timestamp);
66 | //		}
67 | }


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/gzip/zipnum/TimestampCustomDedupIterator.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.gzip.zipnum;
 2 | 
 3 | import org.archive.util.iterator.CloseableIterator;
 4 | 
 5 | public class TimestampCustomDedupIterator extends TimestampDedupIterator {
 6 | 	
 7 | 	// The additional field used as status field from the timestamp
 8 | 	private int additionalFieldNum = 3;
 9 | 	private int sep = ' ';
10 | 
11 | 	public TimestampCustomDedupIterator(CloseableIterator<String> inner,
12 | 			int timestampDedupLength) {
13 | 		super(inner, timestampDedupLength);
14 | 	}
15 | 	
16 | 	@Override
17 | 	protected boolean isSame(String currStamp, String nextStamp,
18 | 			String currLine, String nextLine) {
19 | 		
20 | 		if (!super.isSame(currStamp, nextStamp, currLine, nextLine)) {
21 | 			return false;
22 | 		}
23 | 		
24 | 		//Same only if status code also matches
25 | 		String currStatus = getNthField(currLine, currStamp.length(), additionalFieldNum, sep);
26 | 		if (currStatus == null) {
27 | 			return false;
28 | 		}
29 | 		
30 | 		String nextStatus = getNthField(nextLine, nextStamp.length(), additionalFieldNum, sep);
31 | 		if (nextStatus == null) {
32 | 			return false;
33 | 		}
34 | 		
35 | 		return currStatus.equals(nextStatus);
36 | 	}
37 | }
38 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/gzip/zipnum/ZipNumParams.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.gzip.zipnum;
 2 | 
 3 | public class ZipNumParams
 4 | {
 5 | 	protected int maxAggregateBlocks = 1;
 6 | 	protected int timestampDedupLength = 0;
 7 | 	protected int maxBlocks = 0;
 8 | 	private boolean reverse = false;
 9 | 	private boolean sequential = false;
10 | 	
11 | 	public ZipNumParams()
12 | 	{
13 | 	    
14 | 	}
15 | 	
16 | 	public ZipNumParams(ZipNumParams defaults)
17 | 	{
18 | 		this(defaults.maxAggregateBlocks, defaults.maxBlocks, defaults.timestampDedupLength, defaults.reverse);
19 | 	}
20 | 	
21 | 	public ZipNumParams(int maxAggregateBlocks, int maxBlocks, int timestampDedupLength, boolean reverse)
22 | 	{
23 | 	    this.maxAggregateBlocks = maxAggregateBlocks;
24 | 	    this.maxBlocks = maxBlocks;
25 | 	    this.timestampDedupLength = timestampDedupLength;
26 | 	    this.reverse = reverse;
27 | 	}
28 | 	
29 | 	public int getMaxAggregateBlocks() {
30 | 		return maxAggregateBlocks;
31 | 	}
32 | 
33 | 	public void setMaxAggregateBlocks(int maxAggregateBlocks) {
34 | 		this.maxAggregateBlocks = maxAggregateBlocks;
35 | 	}
36 | 
37 | 	public int getTimestampDedupLength() {
38 | 		return timestampDedupLength;
39 | 	}
40 | 
41 | 	public void setTimestampDedupLength(int timestampDedupLength) {
42 | 		this.timestampDedupLength = timestampDedupLength;
43 | 	}
44 | 
45 | 	public int getMaxBlocks() {
46 | 		return maxBlocks;
47 | 	}
48 | 
49 | 	public void setMaxBlocks(int maxBlocks) {
50 | 		this.maxBlocks = maxBlocks;
51 | 	}
52 | 
53 | 	public boolean isReverse() {
54 | 		return this.reverse;
55 |     }
56 | 	
57 | 	public void setReverse(boolean reverse) {
58 | 		this.reverse  = reverse;
59 | 	}
60 | 
61 | 	public boolean isSequential() {
62 | 		return sequential;
63 | 	}
64 | 
65 | 	public void setSequential(boolean sequential) {
66 | 		this.sequential = sequential;
67 | 	}
68 | }


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/gzip/zipnum/ZipNumWriter.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.gzip.zipnum;
 2 | 
 3 | import java.io.ByteArrayOutputStream;
 4 | import java.io.IOException;
 5 | import java.io.OutputStream;
 6 | import java.nio.charset.Charset;
 7 | 
 8 | import org.archive.format.gzip.GZIPMemberWriter;
 9 | import org.archive.format.gzip.GZIPMemberWriterCommittedOutputStream;
10 | 
11 | public class ZipNumWriter extends GZIPMemberWriterCommittedOutputStream {
12 | 	int limit;
13 | 	int count;
14 | 	OutputStream manifestOut;
15 | 	ByteArrayOutputStream manifestBuffer;
16 | 	char delimiter = '\t';
17 | 	private static final Charset UTF8 = Charset.forName("utf-8");
18 | 	public ZipNumWriter(OutputStream main, OutputStream manifest, int limit) {
19 | 		super(new GZIPMemberWriter(main));
20 | 		manifestOut = manifest;
21 | 		this.limit = limit;
22 | 		count = 0;
23 | 		manifestBuffer = new ByteArrayOutputStream();
24 | 	}
25 | 
26 | 	public void addRecord(byte[] bytes) throws IOException {
27 | 		if(count == 0) {
28 | 			manifestBuffer.write(bytes);
29 | 		}
30 | 		write(bytes);
31 | 		count++;
32 | 		if(count == limit) {
33 | 			finishCurrent();
34 | 		}
35 | 	}
36 | 	
37 | 	public void close() throws IOException {
38 | 		finishCurrent();
39 | 	}
40 | 
41 | 	private void finishCurrent() throws IOException {
42 | 		if(count == 0) {
43 | 			return;
44 | 		}
45 | 		long start = getBytesWritten();
46 | 		commit();
47 | 		long end = getBytesWritten();
48 | 		long len = end - start;
49 | 		StringBuilder sb = new StringBuilder();
50 | 		sb.append(start);
51 | 		sb.append(delimiter);
52 | 		sb.append(len);
53 | 		sb.append(delimiter);
54 | 		manifestOut.write(sb.toString().getBytes(UTF8));
55 | 		manifestBuffer.writeTo(manifestOut);
56 | 		manifestOut.flush();
57 | 		count = 0;
58 | 		manifestBuffer.reset();
59 | 	}
60 | }
61 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/http/DumpingHTTPParseObserver.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.http;
 2 | 
 3 | import java.io.PrintStream;
 4 | import java.nio.charset.Charset;
 5 | 
 6 | public class DumpingHTTPParseObserver implements HttpHeaderObserver {
 7 | 	private static final Charset UTF8 = Charset.forName("UTF-8");
 8 | 	private PrintStream ps = null;
 9 | 	public DumpingHTTPParseObserver() {
10 | 		ps = System.out;
11 | 	}
12 | 	public DumpingHTTPParseObserver(PrintStream ps) {
13 | 		this.ps = ps;
14 | 	}
15 | 
16 | 	public void headerParsed(byte[] name, int ns, int nl, byte[] value, int vs,
17 | 			int vl) {
18 | 		ps.format("headerParsed:(%d:%d)(%s)(%d:%d)(%s)\n", 
19 | 				ns,nl,new String(name,0,nl,UTF8),
20 | 				vs,vl,new String(value,0,vl,UTF8));
21 | 	}
22 | 
23 | 	public void headersComplete(int bytesRead) {
24 | 		ps.format("headersComplete(%d)\n",bytesRead);
25 | 	}
26 | 	public void headersCorrupt() {
27 | 		ps.println("headersCorrupted\n");
28 | 	}
29 | 
30 | }
31 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/http/HttpHeader.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.http;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.OutputStream;
 5 | 
 6 | public class HttpHeader implements HttpConstants {
 7 | 	private String name = null;
 8 | 	private String value = null;
 9 | 
10 | 	public HttpHeader() {}
11 | 
12 | 	public HttpHeader(String name, String value) {
13 | 		this.name = name;
14 | 		this.value = value;
15 | 	}
16 | 
17 | 	public String getName()              { return name;        }
18 | 	public void   setName(String name)   { this.name = name;   }
19 | 	public String getValue()             { return value;       }
20 | 	public void   setValue(String value) { this.value = value; }
21 | 
22 | 	public void write(OutputStream out) throws IOException {
23 | 		out.write(name.getBytes(UTF8));  out.write(COLON); out.write(SP);
24 | 
25 | 		out.write(value.getBytes(UTF8)); out.write(CR);    out.write(LF);
26 | 	}
27 | 
28 | 	public String toString() {
29 | 		StringBuilder sb = new StringBuilder(name.length() + value.length()+20);
30 | 		sb.append(String.format("HttpHeader(%s)(%s)",name,value));
31 | 		return sb.toString();
32 | 	}
33 | }
34 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/http/HttpHeaderObserver.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.http;
 2 | 
 3 | public interface HttpHeaderObserver extends HttpConstants {
 4 | 
 5 | 	public void headerParsed(byte name[], int ns, int nl, 
 6 | 			byte value[], int vs, int vl);
 7 | 
 8 | 	public void headersComplete(int totalBytes);
 9 | 	public void headersCorrupt();
10 | }
11 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/http/HttpMessage.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.http;
 2 | 
 3 | public class HttpMessage implements HttpConstants {
 4 | 	protected int version = VERSION_0;
 5 | 	protected int bytes = -1;
 6 | 	protected boolean isCorrupt;
 7 | 
 8 | 	public int getVersion() {
 9 | 		return version;
10 | 	}
11 | 	public String getVersionString() {
12 | 		if(version == VERSION_1) {
13 | 			return VERSION_1_STATUS;
14 | 		} else if(version == VERSION_9) {
15 | 			return VERSION_9_STATUS;
16 | 		}
17 | 		return VERSION_0_STATUS;
18 | 	}
19 | 	public int getLength() {
20 | 		return bytes;
21 | 	}
22 | 
23 | 	public void messageCorrupt() {
24 | 		isCorrupt = true;
25 | 	}
26 | 	public boolean isCorrupt() {
27 | 		return isCorrupt;
28 | 	}
29 | }
30 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/http/HttpMessageParser.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.http;
 2 | 
 3 | 
 4 | public class HttpMessageParser implements HttpConstants {
 5 | 	
 6 | 	protected int parseVersionStrict(byte buf[], int start, int len)
 7 | 	throws HttpParseException {
 8 | 	
 9 | 		String v = new String(buf,start,len,UTF8);
10 | 		if(v.compareTo(VERSION_0_STATUS) == 0) {
11 | 			return VERSION_0;
12 | 		} else if(v.compareTo(VERSION_1_STATUS) == 0) {
13 | 			return VERSION_1;
14 | 		} else if(v.compareTo(VERSION_9_STATUS) == 0) {
15 | 			return VERSION_9;
16 | 		} else {
17 | 			throw new HttpParseException("Unknown version");
18 | 		}
19 | 	}
20 | 
21 | 	protected int parseVersionLax(byte buf[], int start, int len)
22 | 	throws HttpParseException {
23 | 	
24 | 		String v = new String(buf,start,len,UTF8);
25 | 		if(v.toLowerCase().compareTo(VERSION_0_STATUS.toLowerCase()) == 0) {
26 | 			return VERSION_0;
27 | 		} else if(v.toLowerCase().compareTo(VERSION_1_STATUS.toLowerCase()) == 0) {
28 | 			return VERSION_1;
29 | 		} else if(v.toLowerCase().compareTo(VERSION_9_STATUS.toLowerCase()) == 0) {
30 | 			return VERSION_9;
31 | 		}
32 | 		return VERSION_0;
33 | 	}
34 | 
35 | }
36 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/http/HttpParseException.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.http;
 2 | 
 3 | import org.archive.RecoverableRecordFormatException;
 4 | 
 5 | public class HttpParseException extends RecoverableRecordFormatException {
 6 | 
 7 | 	/** */
 8 | 	private static final long serialVersionUID = -2194883519998764425L;
 9 | 
10 | 	public HttpParseException(String string) {
11 | 		super(string);
12 | 	}
13 | 
14 | }
15 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/http/HttpParseObserver.java:
--------------------------------------------------------------------------------
1 | package org.archive.format.http;
2 | 
3 | public interface HttpParseObserver 
4 | 
5 | extends HttpResponseMessageObserver, HttpHeaderObserver {
6 | 
7 | }
8 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/http/HttpRequest.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.http;
 2 | 
 3 | import java.io.FilterInputStream;
 4 | import java.io.InputStream;
 5 | 
 6 | public class HttpRequest extends FilterInputStream {
 7 | 	private HttpRequestMessage message = null;
 8 | 	private HttpHeaders headers = null;
 9 | 	private int headerBytes = 0;
10 | 
11 | 	protected HttpRequest(InputStream in,
12 | 			HttpRequestMessage message, HttpHeaders headers) {
13 | 		super(in);
14 | 		this.message = message;
15 | 		this.headers = headers;
16 | 	}
17 | 
18 | 	public HttpRequestMessage getMessage() {
19 | 		return message;
20 | 	}
21 | 
22 | 	public HttpHeaders getHeaders() {
23 | 		return headers;
24 | 	}
25 | 
26 | 	public int getHeaderBytes() {
27 | 		return headerBytes;
28 | 	}
29 | 
30 | 	public void setHeaderBytes(int headerBytes) {
31 | 		this.headerBytes = headerBytes;
32 | 	}
33 | }
34 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/http/HttpRequestMessage.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.http;
 2 | 
 3 | public class HttpRequestMessage extends HttpMessage implements HttpRequestMessageObserver {
 4 | 	private int method = 0;
 5 | 	private String path = null;
 6 | 
 7 | 	public void messageParsed(int method, String path, int version, int bytes) {
 8 | 		this.method = method;
 9 | 		this.path = path;
10 | 		this.version = version;
11 | 		this.bytes = bytes;
12 | 	}
13 | 
14 | 	public String getPath() {
15 | 		return path;
16 | 	}
17 | 	public int getMethod() {
18 | 		return method;
19 | 	}
20 | 
21 | 	public String getMethodString() {
22 | 		switch(method) {
23 | 		case METHOD_GET     : return METHOD_GET_STRING;
24 | 		case METHOD_HEAD    : return METHOD_HEAD_STRING;
25 | 		case METHOD_POST    : return METHOD_POST_STRING;
26 | 		case METHOD_PUT     : return METHOD_PUT_STRING;
27 | 		case METHOD_TRACE   : return METHOD_TRACE_STRING;
28 | 		case METHOD_DELETE  : return METHOD_DELETE_STRING;
29 | 		case METHOD_CONNECT : return METHOD_CONNECT_STRING;
30 | 		}
31 | 		return METHOD_UNK_STRING;
32 | 	}
33 | 	
34 | }
35 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/http/HttpRequestMessageObserver.java:
--------------------------------------------------------------------------------
1 | package org.archive.format.http;
2 | 
3 | public interface HttpRequestMessageObserver extends HttpConstants {
4 | 	public void messageParsed(int method, String path, int version, int bytes);
5 | 	public void messageCorrupt();
6 | }
7 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/http/HttpRequestParser.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.http;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.InputStream;
 5 | 
 6 | public class HttpRequestParser {
 7 | 	private HttpRequestMessageParser messageParser = 
 8 | 		new HttpRequestMessageParser();
 9 | 
10 | 	private HttpHeaderParser headerParser = new HttpHeaderParser();
11 | 
12 | 	public HttpRequestParser() {}
13 | 	public HttpRequest parse(InputStream is) 
14 | 	throws HttpParseException, IOException {
15 | 
16 | 		HttpRequestMessage message = new HttpRequestMessage();
17 | 		HttpHeaders headers = new HttpHeaders();
18 | 		int headerBytes = messageParser.parse(is, message);
19 | 		headerBytes += headerParser.doParse(is, headers);
20 | 		
21 | 		HttpRequest request = new HttpRequest(is, message, headers);
22 | 		request.setHeaderBytes(headerBytes);
23 | 
24 | 		return request;
25 | 	}
26 | }
27 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/http/HttpResponse.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.http;
 2 | 
 3 | import java.io.FilterInputStream;
 4 | import java.io.InputStream;
 5 | 
 6 | public class HttpResponse extends FilterInputStream {
 7 | 	private HttpResponseMessage message = null;
 8 | 	private HttpHeaders headers = null;
 9 | 	private InputStream inner;
10 | 	private int headerBytes = 0;
11 | 
12 | 	protected HttpResponse(InputStream in,
13 | 			HttpResponseMessage message, HttpHeaders headers) {
14 | 		super(in);
15 | 		inner = in;
16 | 		this.message = message;
17 | 		this.headers = headers;
18 | 	}
19 | 	public InputStream getInner() {
20 | 		return inner;
21 | 	}
22 | 	public HttpResponseMessage getMessage() {
23 | 		return message;
24 | 	}
25 | 
26 | 	public HttpHeaders getHeaders() {
27 | 		return headers;
28 | 	}
29 | 
30 | 	public int getHeaderBytes() {
31 | 		return headerBytes;
32 | 	}
33 | 
34 | 	public void setHeaderBytes(int headerBytes) {
35 | 		this.headerBytes = headerBytes;
36 | 	}
37 | }
38 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/http/HttpResponseMessage.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.http;
 2 | 
 3 | public class HttpResponseMessage extends HttpMessage implements HttpResponseMessageObserver {
 4 | 	private int status = 0;
 5 | 	private String reason = null;
 6 | 	
 7 | 	public HttpResponseMessage(){}
 8 | 
 9 | 	public HttpResponseMessage(int version, int status, String reason) {
10 | 		this.version = version;
11 | 		this.status = status;
12 | 		this.reason = reason;
13 | 	}
14 | 
15 | 	public int getStatus() {
16 | 		return status;
17 | 	}
18 | 	
19 | 	public String getReason() {
20 | 		return reason;
21 | 	}
22 | 	public String toString() {
23 | 		return String.format("%s %d %s%s", getVersionString(), status, reason, CRLF);
24 | 	}
25 | 	public String toDebugString() {
26 | 		return String.format("Message(%d):(%s) (%d) (%s)\n",
27 | 				reason.length(),getVersionString(),status,reason,CRLF);
28 | 	}
29 | 
30 | 	public void messageParsed(int version, int status, String reason, int bytes) {
31 | 		this.version = version;
32 | 		this.status = status;
33 | 		this.reason = reason;
34 | 		this.bytes = bytes;
35 | 	}
36 | }
37 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/http/HttpResponseMessageObserver.java:
--------------------------------------------------------------------------------
1 | package org.archive.format.http;
2 | 
3 | public interface HttpResponseMessageObserver extends HttpConstants {
4 | 	public void messageParsed(int version, int code, String reason, int bytes);
5 | 	public void messageCorrupt();
6 | }
7 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/http/HttpResponseParser.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.http;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.InputStream;
 5 | 
 6 | public class HttpResponseParser {
 7 | 	private HttpResponseMessageParser messageParser = 
 8 | 		new HttpResponseMessageParser();
 9 | 
10 | 	private HttpHeaderParser headerParser = new HttpHeaderParser();
11 | 
12 | 	public HttpResponseParser() {}
13 | 	public HttpResponse parse(InputStream is) 
14 | 	throws HttpParseException, IOException {
15 | 
16 | 		HttpResponseMessage message = new HttpResponseMessage();
17 | 		HttpHeaders headers = new HttpHeaders();
18 | 		int headerBytes = messageParser.parse(is, message);
19 | 		headerBytes += headerParser.doParse(is, headers);
20 | 		
21 | 		HttpResponse response = new HttpResponse(is, message, headers);
22 | 		response.setHeaderBytes(headerBytes);
23 | 		// TODO: check for chunked transfer encoding
24 | 		return response;
25 | 	}
26 | }
27 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/json/CompoundORJSONPathSpec.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.json;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.List;
 5 | 
 6 | import com.github.openjson.JSONObject;
 7 | 
 8 | public class CompoundORJSONPathSpec implements JSONPathSpec {
 9 | 	ArrayList<JSONPathSpec> parts;
10 | 	public CompoundORJSONPathSpec(List<JSONPathSpec> parts) {
11 | 		this.parts = new ArrayList<JSONPathSpec>();
12 | 		for(JSONPathSpec part : parts) {
13 | 			this.parts.add(part);
14 | 		}
15 | 	}
16 | 
17 | 	public List<List<String>> extract(JSONObject json) {
18 | 		List<List<String>> matches;
19 | 		for(JSONPathSpec spec : parts) {
20 | 			matches = spec.extract(json);
21 | 			// check if empty:
22 | 			if(matches.size() == 1) {
23 | 				if(matches.get(0).size() == 1) {
24 | 					if(matches.get(0).get(0).length() > 0) {
25 | 						return matches;
26 | 					}
27 | 				}
28 | 			}
29 | //			if(matches.size() > 0) {
30 | //				if(matches.get(0).size() > 0) {
31 | //					return matches;
32 | //				}
33 | //			}
34 | 		}
35 | 		return null;
36 | 	}
37 | 
38 | }
39 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/json/JSONPathSpec.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.json;
 2 | 
 3 | import java.util.List;
 4 | 
 5 | import com.github.openjson.JSONObject;
 6 | 
 7 | public interface JSONPathSpec {
 8 | 	public static final String EMPTY = "";
 9 | 	public List<List<String>> extract(JSONObject json);
10 | }
11 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/json/JSONPathSpecFactory.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.json;
 2 | 
 3 | import java.util.ArrayList;
 4 | 
 5 | public class JSONPathSpecFactory {
 6 | 	public static JSONPathSpec get(String spec) {
 7 | 		if(spec.contains("|")) {
 8 | 			// compound OR:
 9 | 			String parts[] = spec.split("\\|");
10 | 			ArrayList<JSONPathSpec> subs = new ArrayList<JSONPathSpec>(parts.length);
11 | 			for(String part : parts) {
12 | 				subs.add(new SimpleJSONPathSpec(part));
13 | 			}
14 | 			return new CompoundORJSONPathSpec(subs);
15 | 		} else {
16 | 			// assume "simple":
17 | 			return new SimpleJSONPathSpec(spec);
18 | 		}
19 | 	}
20 | }
21 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/json/JSONView.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.json;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.List;
 5 | import java.util.logging.Level;
 6 | import java.util.logging.Logger;
 7 | 
 8 | import org.apache.commons.lang.StringUtils;
 9 | import com.github.openjson.JSONObject;
10 | 
11 | /**
12 |  * 
13 |  * Class which provides a column-oriented view of a JSON structure.
14 |  * 
15 |  * An instance is constructed with an array of field specifiers, each of which
16 |  * declares the source path to one column of output.
17 |  * 
18 |  * @author brad
19 |  *
20 |  */
21 | public class JSONView {
22 | 	private static final Logger LOG =
23 | 		Logger.getLogger(JSONView.class.getName());
24 | 	
25 | 	ArrayList<JSONPathSpec> pathSpecs;
26 | 	CrossProductOfLists<String> crosser;
27 | 
28 | 	public JSONView(String... pathSpecs) {
29 | 		this.pathSpecs = new ArrayList<JSONPathSpec>(pathSpecs.length);
30 | 		if(LOG.isLoggable(Level.INFO)) {
31 | 			LOG.info(String.format("Creating JSONView with(%s)",
32 | 					StringUtils.join(pathSpecs,",")));
33 | 		}
34 | 		for(String pathSpec : pathSpecs) {
35 | 			this.pathSpecs.add(JSONPathSpecFactory.get(pathSpec));
36 | 		}
37 | 		crosser = new CrossProductOfLists<String>();
38 | 	}
39 | 	public List<List<String>> apply(JSONObject json) {
40 | 		ArrayList<List<List<String>>> results =
41 | 			new ArrayList<List<List<String>>>(pathSpecs.size());
42 | 		
43 | 		for(JSONPathSpec pathSpec : pathSpecs) {
44 | 			List<List<String>> result = pathSpec.extract(json);
45 | 			if(result == null) {
46 | //				ArrayList<String> tmp = new ArrayList<String>();
47 | 				result = new ArrayList<List<String>>();
48 | 			}
49 | 			results.add(result);
50 | 		}
51 | 		return crosser.crossProduct(results);
52 | 	}
53 | }
54 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/text/charset/StandardCharsetDetector.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is part of the Wayback archival access software
 3 |  *   (http://archive-access.sourceforge.net/projects/wayback/).
 4 |  *
 5 |  *  Licensed to the Internet Archive (IA) by one or more individual 
 6 |  *  contributors. 
 7 |  *
 8 |  *  The IA licenses this file to You under the Apache License, Version 2.0
 9 |  *  (the "License"); you may not use this file except in compliance with
10 |  *  the License.  You may obtain a copy of the License at
11 |  *
12 |  *      http://www.apache.org/licenses/LICENSE-2.0
13 |  *
14 |  *  Unless required by applicable law or agreed to in writing, software
15 |  *  distributed under the License is distributed on an "AS IS" BASIS,
16 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 |  *  See the License for the specific language governing permissions and
18 |  *  limitations under the License.
19 |  */
20 | package org.archive.format.text.charset;
21 | 
22 | import java.io.IOException;
23 | 
24 | import org.archive.format.http.HttpHeaders;
25 | 
26 | public class StandardCharsetDetector extends CharsetDetector {
27 | 	public String getCharset(byte buffer[],int len, HttpHeaders headers)
28 | 	throws IOException {
29 | 		String charSet = getCharsetFromHeaders(headers);
30 | 		if(charSet == null) {
31 | 			charSet = getCharsetFromMeta(buffer,len);
32 | 			if(charSet == null) {
33 | 				charSet = getCharsetFromBytes(buffer,len);
34 | 				if(charSet == null) {
35 | 					charSet = DEFAULT_CHARSET;
36 | 				}
37 | 			}
38 | 		}
39 | 		return charSet;		
40 | 	}
41 | }
42 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/text/html/LexParser.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.text.html;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.Writer;
 5 | 
 6 | import org.htmlparser.Node;
 7 | import org.htmlparser.nodes.RemarkNode;
 8 | import org.htmlparser.nodes.TagNode;
 9 | import org.htmlparser.nodes.TextNode;
10 | import org.htmlparser.util.ParserException;
11 | 
12 | public class LexParser extends NodeUtils {
13 | 	ParseObserver obs;
14 | 	public LexParser(ParseObserver obs) {
15 | 		this.obs = obs;
16 | 	}
17 | 	public void doParse(CDATALexer lex) throws ParserException, IOException {
18 | 		doParse(lex,null);
19 | 	}
20 | 	
21 | 	public void doParse(CDATALexer lex, Writer w) throws ParserException, IOException {
22 | 		obs.handleDocumentStart();
23 | 		Node n;
24 | 		TextNode tx;
25 | 		TagNode tn;
26 | 		while(true) {
27 | 			n = lex.nextNode();
28 | 			if(n == null) {
29 | 				break;
30 | 			}
31 | 			if(isRemarkNode(n)) {
32 | 				obs.handleRemarkNode((RemarkNode)n);
33 | 			} else if(isTextNode(n)) {
34 | 				tx = (TextNode) n;
35 | 				if(lex.inCSS()) {
36 | 					obs.handleStyleNode(tx);
37 | 				} else if(lex.inJS()) {
38 | 					obs.handleScriptNode(tx);
39 | 				} else {
40 | 					obs.handleTextNode(tx);
41 | 				}
42 | 			} else {
43 | 				tn = (TagNode) n;
44 | 				if(tn.isEmptyXmlTag()) {
45 | 					obs.handleTagEmpty(tn);
46 | 				} else if(tn.isEndTag()) {
47 | 					obs.handleTagClose(tn);
48 | 				} else {
49 | 					obs.handleTagOpen(tn);
50 | 				}
51 | 			}
52 | 			if(w != null) {
53 | 				w.write(n.toHtml(true));
54 | 			}
55 | 		}
56 | 		obs.handleDocumentComplete();
57 | 	}
58 | }
59 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/format/text/html/ParseObserver.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.text.html;
 2 | 
 3 | import org.htmlparser.nodes.RemarkNode;
 4 | import org.htmlparser.nodes.TagNode;
 5 | import org.htmlparser.nodes.TextNode;
 6 | 
 7 | public interface ParseObserver {
 8 | 	public void handleDocumentStart();
 9 | 	public void handleDocumentComplete();
10 | 	
11 | 	public void handleTagEmpty(TagNode tag);
12 | 	public void handleTagOpen(TagNode tag);
13 | 	public void handleTagClose(TagNode tag);
14 | 
15 | 	public void handleTextNode(TextNode text);
16 | 	public void handleScriptNode(TextNode text);
17 | 	public void handleStyleNode(TextNode text);
18 | 
19 | 	public void handleRemarkNode(RemarkNode remark);
20 | }
21 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/hadoop/ResourceContext.java:
--------------------------------------------------------------------------------
 1 | package org.archive.hadoop;
 2 | 
 3 | public class ResourceContext {
 4 | 	public String name;
 5 | 	public long offset;
 6 | 	public ResourceContext(String name, long offset) {
 7 | 		this.name = name;
 8 | 		this.offset = offset;
 9 | 	}
10 | }
11 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/hadoop/ResourceInputFormat.java:
--------------------------------------------------------------------------------
 1 | package org.archive.hadoop;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | import org.apache.hadoop.fs.Path;
 6 | import org.apache.hadoop.mapreduce.InputSplit;
 7 | import org.apache.hadoop.mapreduce.JobContext;
 8 | import org.apache.hadoop.mapreduce.RecordReader;
 9 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
11 | import org.archive.resource.MetaData;
12 | 
13 | public class ResourceInputFormat extends FileInputFormat<ResourceContext, MetaData>{
14 | 
15 | 	@Override
16 | 	public RecordReader<ResourceContext, MetaData> createRecordReader(InputSplit inputSplit,
17 | 			TaskAttemptContext context) throws IOException, InterruptedException {
18 | 		
19 | 		return new ResourceRecordReader();
20 | 	}
21 | 
22 | 	/* (non-Javadoc)
23 | 	 * @see org.apache.hadoop.mapreduce.lib.input.FileInputFormat#isSplitable(org.apache.hadoop.mapreduce.JobContext, org.apache.hadoop.fs.Path)
24 | 	 */
25 | 	@Override
26 | 	protected boolean isSplitable(JobContext context, Path filename) {
27 | 		// TODO: ensure this works... it should be may be losing records between..
28 | 		return false;
29 | 	}
30 | }
31 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/hadoop/func/JSONViewEvalFunc.java:
--------------------------------------------------------------------------------
 1 | package org.archive.hadoop.func;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.ArrayList;
 5 | import java.util.logging.Logger;
 6 | 
 7 | import org.apache.pig.EvalFunc;
 8 | import org.apache.pig.data.Tuple;
 9 | import org.apache.pig.data.TupleFactory;
10 | import org.archive.format.json.JSONUtils;
11 | import com.github.openjson.JSONException;
12 | import com.github.openjson.JSONObject;
13 | 
14 | public class JSONViewEvalFunc extends EvalFunc<Tuple> {
15 | 	private static final Logger LOG =
16 | 		Logger.getLogger(JSONViewEvalFunc.class.getName());
17 | 
18 | 	protected TupleFactory mTupleFactory = TupleFactory.getInstance();
19 | 	private ArrayList<Object> mProtoTuple = null;
20 | 
21 | 	public JSONViewEvalFunc() {
22 | 		mProtoTuple = new ArrayList<Object>();
23 | 	}
24 | 	
25 | 	@Override
26 | 	public Tuple exec(Tuple tup) throws IOException {
27 | 		// [0] is the JSON. Remaining elements are Strings describing paths
28 | 		// into the JSON to "flatten" into a single tuple:
29 | 		if(tup == null || tup.size() == 0) {
30 | 			return null;
31 | 		}
32 | 		try {
33 | 			JSONObject json = new JSONObject(tup.get(0).toString());
34 | 			for(int i = 1; i < tup.size(); i++) {
35 | 				String path = tup.get(i).toString();
36 | 				String result = JSONUtils.extractSingle(json, path);
37 | 				mProtoTuple.add(result);
38 | 			}
39 | 		} catch (JSONException e) {
40 | 			LOG.warning("Failed to parse JSON:"+e.getMessage());
41 | 			return null;
42 | 		}
43 | 		Tuple t = mTupleFactory.newTuple(mProtoTuple);
44 | 		mProtoTuple.clear();
45 | 		return t;
46 | 	}
47 | }
48 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/hadoop/func/TupleFunc.java:
--------------------------------------------------------------------------------
 1 | package org.archive.hadoop.func;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.ArrayList;
 5 | 
 6 | import org.apache.pig.EvalFunc;
 7 | import org.apache.pig.data.Tuple;
 8 | import org.apache.pig.data.TupleFactory;
 9 | 
10 | public class TupleFunc extends EvalFunc<Tuple> {
11 | 	
12 | 	protected TupleFactory mTupleFactory = TupleFactory.getInstance();
13 | 	private ArrayList<Object> mProtoTuple = null;
14 | 	
15 | 	public TupleFunc() {
16 | 		mProtoTuple = new ArrayList<Object>();
17 | 	}
18 | 
19 | 	private Tuple makeTuple(String va[]) {
20 | 		if(va == null) {
21 | 			return null;
22 | 		}
23 | 		for(String v : va) {
24 | 			mProtoTuple.add(v);
25 | 		}
26 | 		Tuple t = mTupleFactory.newTuple(mProtoTuple);
27 | 		mProtoTuple.clear();
28 | 		return t;
29 | 	}
30 | 
31 | 	@Override
32 | 	public Tuple exec(Tuple tup) throws IOException {
33 | 		if(tup == null || tup.size() != 2) {
34 | 			return null;
35 | 		}
36 | 		String in = tup.get(0).toString();
37 | 		String split = tup.get(1).toString();
38 | 		return makeTuple(in.split(split));
39 | 	}
40 | 
41 | }
42 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/httpclient/package.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
 2 | <html>
 3 | <head>
 4 | <title>org.archive.httpclient package</title>
 5 | </head>
 6 | <body>Provides specializations on
 7 | 	apache <a href="http://jakarta.apache.org/commons/httpclient/">jakarta
 8 | 	commons httpclient</a>.
 9 | 	
10 | 	<h2>HttpRecorderGetMethod</h2>
11 | 	<p>Class that marks the passed HttpRecorder with the boundary between
12 | 	HTTP header and content.  Also forces a close on the response when
13 | 	releaseConnection is called.</p>
14 | 	
15 | 	<h2>ConfigurableTrustManagerProtocolSocketFactory</h2>
16 | 	<p>A protocol socket factory that allows setting of trust level on
17 | 	construction.</p>
18 | 
19 |     <h2>References</h2>
20 |     <p><a
21 |     href="http://java.sun.com/j2se/1.4.2/docs/guide/security/jsse/JSSERefGuide.html">JavaTM Secure Socket Extension (JSSE): Reference Guide</a></p>
22 | 
23 | </body>
24 | </html>
25 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/io/ArchiveFileConstants.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is part of the Heritrix web crawler (crawler.archive.org).
 3 |  *
 4 |  *  Licensed to the Internet Archive (IA) by one or more individual 
 5 |  *  contributors. 
 6 |  *
 7 |  *  The IA licenses this file to You under the Apache License, Version 2.0
 8 |  *  (the "License"); you may not use this file except in compliance with
 9 |  *  the License.  You may obtain a copy of the License at
10 |  *
11 |  *      http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  *  Unless required by applicable law or agreed to in writing, software
14 |  *  distributed under the License is distributed on an "AS IS" BASIS,
15 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  *  See the License for the specific language governing permissions and
17 |  *  limitations under the License.
18 |  */
19 | 
20 | package org.archive.io;
21 | 
22 | @Deprecated // empty sub-interface: declares nothing beyond org.archive.format.ArchiveFileConstants
23 | public interface ArchiveFileConstants extends org.archive.format.ArchiveFileConstants {
24 | }
25 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/io/CompositeFileReader.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is part of the Heritrix web crawler (crawler.archive.org).
 3 |  *
 4 |  *  Licensed to the Internet Archive (IA) by one or more individual 
 5 |  *  contributors. 
 6 |  *
 7 |  *  The IA licenses this file to You under the Apache License, Version 2.0
 8 |  *  (the "License"); you may not use this file except in compliance with
 9 |  *  the License.  You may obtain a copy of the License at
10 |  *
11 |  *      http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  *  Unless required by applicable law or agreed to in writing, software
14 |  *  distributed under the License is distributed on an "AS IS" BASIS,
15 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  *  See the License for the specific language governing permissions and
17 |  *  limitations under the License.
18 |  */
19 | package org.archive.io;
20 | 
21 | import java.io.File;
22 | import java.io.IOException;
23 | import java.io.InputStreamReader;
24 | import java.util.List;
25 | 
26 | 
27 | /** Reader layered on a CompositeFileInputStream; no charset is passed, so the platform default applies.
28 |  * @author gojomo
29 |  */
30 | public class CompositeFileReader extends InputStreamReader {
31 | 
32 |     /**
33 |      * @param filenames files passed to the CompositeFileInputStream constructor
34 |      * @throws IOException presumably raised by the CompositeFileInputStream constructor (verify there)
35 |      */
36 |     public CompositeFileReader(List<File> filenames) throws IOException {
37 |         super(new CompositeFileInputStream(filenames));
38 |     }
39 | 
40 | }
41 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/io/GZIPMembersInputStream.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is part of the Heritrix web crawler (crawler.archive.org).
 3 |  *
 4 |  *  Licensed to the Internet Archive (IA) by one or more individual 
 5 |  *  contributors. 
 6 |  *
 7 |  *  The IA licenses this file to You under the Apache License, Version 2.0
 8 |  *  (the "License"); you may not use this file except in compliance with
 9 |  *  the License.  You may obtain a copy of the License at
10 |  *
11 |  *      http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  *  Unless required by applicable law or agreed to in writing, software
14 |  *  distributed under the License is distributed on an "AS IS" BASIS,
15 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  *  See the License for the specific language governing permissions and
17 |  *  limitations under the License.
18 |  */
19 | package org.archive.io;
20 | 
21 | import java.io.IOException;
22 | import java.io.InputStream;
23 | 
24 | /**
25 |  * @deprecated use {@link org.archive.util.zip.GZIPMembersInputStream}
26 |  */
27 | @Deprecated
28 | public class GZIPMembersInputStream extends org.archive.util.zip.GZIPMembersInputStream {
29 |     // Both constructors forward their arguments unchanged to the superclass.
30 |     public GZIPMembersInputStream(InputStream in) throws IOException {
31 |         super(in);
32 |     }
33 |     
34 |     public GZIPMembersInputStream(InputStream in, int size) throws IOException {
35 |         super(in, size);
36 |     }
37 |     
38 | }


--------------------------------------------------------------------------------
/src/main/java/org/archive/io/GzipHeader.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is part of the Heritrix web crawler (crawler.archive.org).
 3 |  *
 4 |  *  Licensed to the Internet Archive (IA) by one or more individual 
 5 |  *  contributors. 
 6 |  *
 7 |  *  The IA licenses this file to You under the Apache License, Version 2.0
 8 |  *  (the "License"); you may not use this file except in compliance with
 9 |  *  the License.  You may obtain a copy of the License at
10 |  *
11 |  *      http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  *  Unless required by applicable law or agreed to in writing, software
14 |  *  distributed under the License is distributed on an "AS IS" BASIS,
15 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  *  See the License for the specific language governing permissions and
17 |  *  limitations under the License.
18 |  */
19 | package org.archive.io;
20 | 
21 | /**
22 |  * @deprecated use {@link org.archive.util.zip.GzipHeader}
23 |  */
24 | @Deprecated
25 | public class GzipHeader extends org.archive.util.zip.GzipHeader {
26 | } // empty body: only the implicit no-argument constructor is available
27 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/io/NoGzipMagicException.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is part of the Heritrix web crawler (crawler.archive.org).
 3 |  *
 4 |  *  Licensed to the Internet Archive (IA) by one or more individual 
 5 |  *  contributors. 
 6 |  *
 7 |  *  The IA licenses this file to You under the Apache License, Version 2.0
 8 |  *  (the "License"); you may not use this file except in compliance with
 9 |  *  the License.  You may obtain a copy of the License at
10 |  *
11 |  *      http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  *  Unless required by applicable law or agreed to in writing, software
14 |  *  distributed under the License is distributed on an "AS IS" BASIS,
15 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  *  See the License for the specific language governing permissions and
17 |  *  limitations under the License.
18 |  */
19 | package org.archive.io;
20 | 
21 | /**
22 |  * @deprecated use {@link org.archive.util.zip.NoGzipMagicException}
23 |  */
24 | @Deprecated
25 | public class NoGzipMagicException extends org.archive.util.zip.NoGzipMagicException {
26 | } // empty body: only the implicit no-argument constructor, so no detail message can be passed
27 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/io/Preformatter.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is part of the Heritrix web crawler (crawler.archive.org).
 3 |  *
 4 |  *  Licensed to the Internet Archive (IA) by one or more individual 
 5 |  *  contributors. 
 6 |  *
 7 |  *  The IA licenses this file to You under the Apache License, Version 2.0
 8 |  *  (the "License"); you may not use this file except in compliance with
 9 |  *  the License.  You may obtain a copy of the License at
10 |  *
11 |  *      http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  *  Unless required by applicable law or agreed to in writing, software
14 |  *  distributed under the License is distributed on an "AS IS" BASIS,
15 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  *  See the License for the specific language governing permissions and
17 |  *  limitations under the License.
18 |  */
19 | package org.archive.io;
20 | 
21 | import java.util.logging.LogRecord;
22 | 
23 | /**
24 |  * Contract for a logging Formatter able to format a record ahead of time
25 |  * (outside the standard-implementation synchronized block), cache the result,
26 |  * and hand it back on the next formatting request from the same thread.
27 |  * @author gojomo
28 |  */
29 | public interface Preformatter {
30 |     void preformat(LogRecord record);
31 |     void clear();
32 | }
33 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/io/ReadSource.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is part of the Heritrix web crawler (crawler.archive.org).
 3 |  *
 4 |  *  Licensed to the Internet Archive (IA) by one or more individual 
 5 |  *  contributors. 
 6 |  *
 7 |  *  The IA licenses this file to You under the Apache License, Version 2.0
 8 |  *  (the "License"); you may not use this file except in compliance with
 9 |  *  the License.  You may obtain a copy of the License at
10 |  *
11 |  *      http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  *  Unless required by applicable law or agreed to in writing, software
14 |  *  distributed under the License is distributed on an "AS IS" BASIS,
15 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  *  See the License for the specific language governing permissions and
17 |  *  limitations under the License.
18 |  */
19 | 
20 | package org.archive.io;
21 | 
22 | import java.io.Reader;
23 | 
24 | /**
25 |  * Interface for objects that can provide a Reader view of their
26 |  * contents.
27 |  * Its single method, {@link #obtainReader()}, takes no arguments.
28 |  */
29 | public interface ReadSource {
30 |     /**
31 |      * Obtain a Reader. Not named 'getReader' so that it is not
32 |      * considered a simple costless read-only property by 
33 |      * bean-convention introspection tools.
34 |      * @return a Reader on this object
35 |      */
36 |     Reader obtainReader();
37 | }
38 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/io/RecorderIOException.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is part of the Heritrix web crawler (crawler.archive.org).
 3 |  *
 4 |  *  Licensed to the Internet Archive (IA) by one or more individual 
 5 |  *  contributors. 
 6 |  *
 7 |  *  The IA licenses this file to You under the Apache License, Version 2.0
 8 |  *  (the "License"); you may not use this file except in compliance with
 9 |  *  the License.  You may obtain a copy of the License at
10 |  *
11 |  *      http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  *  Unless required by applicable law or agreed to in writing, software
14 |  *  distributed under the License is distributed on an "AS IS" BASIS,
15 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  *  See the License for the specific language governing permissions and
17 |  *  limitations under the License.
18 |  */
19 | package org.archive.io;
20 | 
21 | import java.io.IOException;
22 | 
23 | /**
24 |  * IOException subtype; the other Recorder*Exception classes in this package extend it.
25 |  * @author Gordon Mohr
26 |  */
27 | public class RecorderIOException extends IOException {
28 | 
29 |     private static final long serialVersionUID = 5907470275350314277L;
30 | 
31 |     public RecorderIOException() {
32 |     	super();
33 |     }
34 | 
35 |     public RecorderIOException(String msg) {
36 |     	super(msg);
37 |     }
38 | }
39 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/io/RecorderLengthExceededException.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is part of the Heritrix web crawler (crawler.archive.org).
 3 |  *
 4 |  *  Licensed to the Internet Archive (IA) by one or more individual 
 5 |  *  contributors. 
 6 |  *
 7 |  *  The IA licenses this file to You under the Apache License, Version 2.0
 8 |  *  (the "License"); you may not use this file except in compliance with
 9 |  *  the License.  You may obtain a copy of the License at
10 |  *
11 |  *      http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  *  Unless required by applicable law or agreed to in writing, software
14 |  *  distributed under the License is distributed on an "AS IS" BASIS,
15 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  *  See the License for the specific language governing permissions and
17 |  *  limitations under the License.
18 |  */
19 | package org.archive.io;
20 | 
21 | 
22 | /**
23 |  * Indicates a length exception thrown by the Recorder.
24 |  * Adds no state; both constructors defer to {@link RecorderIOException}.
25 |  * @author Gordon Mohr
26 |  */
27 | public class RecorderLengthExceededException
28 | extends RecorderIOException {
29 | 
30 |     private static final long serialVersionUID = 6655419033414648444L;
31 | 
32 |     public RecorderLengthExceededException() {
33 |         super();
34 |     }
35 |     
36 |     public RecorderLengthExceededException(String msg) {
37 |         super(msg);
38 |     }
39 | }
40 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/io/RecorderTimeoutException.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is part of the Heritrix web crawler (crawler.archive.org).
 3 |  *
 4 |  *  Licensed to the Internet Archive (IA) by one or more individual 
 5 |  *  contributors. 
 6 |  *
 7 |  *  The IA licenses this file to You under the Apache License, Version 2.0
 8 |  *  (the "License"); you may not use this file except in compliance with
 9 |  *  the License.  You may obtain a copy of the License at
10 |  *
11 |  *      http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  *  Unless required by applicable law or agreed to in writing, software
14 |  *  distributed under the License is distributed on an "AS IS" BASIS,
15 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  *  See the License for the specific language governing permissions and
17 |  *  limitations under the License.
18 |  */
19 | package org.archive.io;
20 | 
21 | /**
22 |  * Indicates a timeout thrown by the RecordingInputStream.
23 |  * Adds no state; both constructors defer to {@link RecorderIOException}.
24 |  * @author Gordon Mohr
25 |  */
26 | public class RecorderTimeoutException extends RecorderIOException {
27 | 
28 |     private static final long serialVersionUID = 7433214063765078269L;
29 | 
30 |     public RecorderTimeoutException() {
31 |         super();
32 |     }
33 | 
34 |     public RecorderTimeoutException(String msg) {
35 |         super(msg);
36 |     }
37 | }
38 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/io/RecorderTooMuchHeaderException.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is part of the Heritrix web crawler (crawler.archive.org).
 3 |  *
 4 |  *  Licensed to the Internet Archive (IA) by one or more individual 
 5 |  *  contributors. 
 6 |  *
 7 |  *  The IA licenses this file to You under the Apache License, Version 2.0
 8 |  *  (the "License"); you may not use this file except in compliance with
 9 |  *  the License.  You may obtain a copy of the License at
10 |  *
11 |  *      http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  *  Unless required by applicable law or agreed to in writing, software
14 |  *  distributed under the License is distributed on an "AS IS" BASIS,
15 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  *  See the License for the specific language governing permissions and
17 |  *  limitations under the License.
18 |  */
19 | package org.archive.io;
20 | 
21 | 
22 | /**
23 |  * Indicates that too much header material was encountered; thrown by the
24 |  * Recorder (specifically the RecordingOutputStream).
25 |  * 
26 |  * @author Gordon Mohr
27 |  */
28 | public class RecorderTooMuchHeaderException
29 | extends RecorderIOException {
30 | 
31 |     private static final long serialVersionUID = 3528516034898129150L;
32 | 
33 |     public RecorderTooMuchHeaderException() {
34 |         super();
35 |     }
36 |     
37 |     public RecorderTooMuchHeaderException(String msg) {
38 |         super(msg);
39 |     }
40 | }
41 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/io/SeekReaderCharSequence.java:
--------------------------------------------------------------------------------
 1 | package org.archive.io;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | public class SeekReaderCharSequence implements CharSequence {
 6 | 
 7 |     /** Character source; it is repositioned on every charAt/toString call. */
 8 |     final private SeekReader reader;
 9 |     final private int size; // value reported by length()
10 | 
11 |     /** Wraps {@code reader}; {@code size} becomes the reported length. */
12 |     public SeekReaderCharSequence(SeekReader reader, int size) {
13 |         this.reader = reader;
14 |         this.size = size;
15 |     }
16 | 
17 |     /** @return the size supplied at construction. */
18 |     public int length() {
19 |         return size;
20 |     }
21 |     /** Returns the char at {@code index}: seeks the reader there and reads exactly once.
22 |      *  @throws IndexOutOfBoundsException if index is outside [0, length()) */
23 |     public char charAt(int index) {
24 |         if ((index < 0) || (index >= length())) {
25 |             throw new IndexOutOfBoundsException(Integer.toString(index));
26 |         }
27 |         try {
28 |             reader.position(index);
29 |             int r = reader.read();
30 |             if (r < 0) {
31 |                 throw new IllegalStateException("EOF");
32 |             }
33 |             return (char) r; // the char just read; a second read() would return index + 1
34 |         } catch (IOException e) {
35 |             throw new RuntimeException(e);
36 |         }
37 |     }
38 | 
39 |     /** Delegates to {@code new CharSubSequence(this, start, end)}. */
40 |     public CharSequence subSequence(int start, int end) {
41 |         return new CharSubSequence(this, start, end);
42 |     }
43 |     /** Reads everything from position 0 until EOF; {@code size} is not consulted. */
44 |     public String toString() {
45 |         StringBuilder sb = new StringBuilder();
46 |         try {
47 |             reader.position(0);
48 |             for (int ch = reader.read(); ch >= 0; ch = reader.read()) {
49 |                 sb.append((char)ch);
50 |             }
51 |             return sb.toString();
52 |         } catch (IOException e) {
53 |             throw new IllegalStateException(e);
54 |         }
55 |     }
56 | }
57 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/io/SinkHandlerLogThread.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is part of the Heritrix web crawler (crawler.archive.org).
 3 |  *
 4 |  *  Licensed to the Internet Archive (IA) by one or more individual 
 5 |  *  contributors. 
 6 |  *
 7 |  *  The IA licenses this file to You under the Apache License, Version 2.0
 8 |  *  (the "License"); you may not use this file except in compliance with
 9 |  *  the License.  You may obtain a copy of the License at
10 |  *
11 |  *      http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  *  Unless required by applicable law or agreed to in writing, software
14 |  *  distributed under the License is distributed on an "AS IS" BASIS,
15 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  *  See the License for the specific language governing permissions and
17 |  *  limitations under the License.
18 |  */
19 | 
20 | package org.archive.io;
21 | 
22 | 
23 | /**
24 |  * Implemented by threads that provide extra information.
25 |  * Exposes a name, the name of the current processor, and a serial number.
26 |  * TODO: rename class, rename getCurrentProcessorName()
27 |  */
28 | public interface SinkHandlerLogThread {
29 | 
30 |     String getName();
31 |     String getCurrentProcessorName();
32 |     int getSerialNumber();
33 | 
34 | }
35 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/io/UTF8Bytes.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is part of the Heritrix web crawler (crawler.archive.org).
 3 |  *
 4 |  *  Licensed to the Internet Archive (IA) by one or more individual 
 5 |  *  contributors. 
 6 |  *
 7 |  *  The IA licenses this file to You under the Apache License, Version 2.0
 8 |  *  (the "License"); you may not use this file except in compliance with
 9 |  *  the License.  You may obtain a copy of the License at
10 |  *
11 |  *      http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  *  Unless required by applicable law or agreed to in writing, software
14 |  *  distributed under the License is distributed on an "AS IS" BASIS,
15 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  *  See the License for the specific language governing permissions and
17 |  *  limitations under the License.
18 |  */
19 | package org.archive.io;
20 | 
21 | import java.io.UnsupportedEncodingException;
22 | 
23 | /**
24 |  * Implemented by types that can render themselves as UTF-8 encoded bytes.
25 |  * TODO: Do we need a UTF8Stream Marker Interface?
26 |  * @author stack
27 |  * @version $Date$ $Version$
28 |  */
29 | public interface UTF8Bytes {
30 |     String UTF8 = "UTF-8";
31 | 
32 |     /**
33 |      * @return this instance encoded as UTF-8 bytes.
34 |      * @throws UnsupportedEncodingException 
35 |      */
36 |     byte[] getUTF8Bytes() throws UnsupportedEncodingException;
37 | }
38 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/io/WriterPoolSettings.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is part of the Heritrix web crawler (crawler.archive.org).
 3 |  *
 4 |  *  Licensed to the Internet Archive (IA) by one or more individual 
 5 |  *  contributors. 
 6 |  *
 7 |  *  The IA licenses this file to You under the Apache License, Version 2.0
 8 |  *  (the "License"); you may not use this file except in compliance with
 9 |  *  the License.  You may obtain a copy of the License at
10 |  *
11 |  *      http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  *  Unless required by applicable law or agreed to in writing, software
14 |  *  distributed under the License is distributed on an "AS IS" BASIS,
15 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  *  See the License for the specific language governing permissions and
17 |  *  limitations under the License.
18 |  */
19 | package org.archive.io;
20 | 
21 | import java.io.File;
22 | import java.util.List;
23 | 
24 | /**
25 |  * Configuration consulted by a {@link WriterPool} when it
26 |  * creates {@link WriterPoolMember}s.
27 |  * @author stack
28 |  * @version $Date$, $Revision$
29 |  */
30 | public interface WriterPoolSettings {
31 |     long getMaxFileSizeBytes();
32 |     String getPrefix();
33 |     String getTemplate();
34 |     List<File> calcOutputDirs();
35 |     boolean getCompress();
36 |     List<String> getMetadata();
37 |     boolean getFrequentFlushes();
38 |     int getWriteBufferSize();
39 | }


--------------------------------------------------------------------------------
/src/main/java/org/archive/io/arc/ARCConstants.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is part of the Heritrix web crawler (crawler.archive.org).
 3 |  *
 4 |  *  Licensed to the Internet Archive (IA) by one or more individual 
 5 |  *  contributors. 
 6 |  *
 7 |  *  The IA licenses this file to You under the Apache License, Version 2.0
 8 |  *  (the "License"); you may not use this file except in compliance with
 9 |  *  the License.  You may obtain a copy of the License at
10 |  *
11 |  *      http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  *  Unless required by applicable law or agreed to in writing, software
14 |  *  distributed under the License is distributed on an "AS IS" BASIS,
15 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  *  See the License for the specific language governing permissions and
17 |  *  limitations under the License.
18 |  */
19 | package org.archive.io.arc;
20 | 
21 | 
22 | /**
23 |  * Constants used by ARC files and in ARC file processing.
24 |  * @author stack
25 |  * @deprecated use {@link org.archive.format.arc.ARCConstants}
26 |  */
27 | @Deprecated
28 | public interface ARCConstants extends org.archive.format.arc.ARCConstants {
29 | }
30 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/io/arc/ARCLocation.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is part of the Heritrix web crawler (crawler.archive.org).
 3 |  *
 4 |  *  Licensed to the Internet Archive (IA) by one or more individual 
 5 |  *  contributors. 
 6 |  *
 7 |  *  The IA licenses this file to You under the Apache License, Version 2.0
 8 |  *  (the "License"); you may not use this file except in compliance with
 9 |  *  the License.  You may obtain a copy of the License at
10 |  *
11 |  *      http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  *  Unless required by applicable law or agreed to in writing, software
14 |  *  distributed under the License is distributed on an "AS IS" BASIS,
15 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  *  See the License for the specific language governing permissions and
17 |  *  limitations under the License.
18 |  */
19 | package org.archive.io.arc;
20 | 
21 | /**
22 |  * Holder for the location of an ARC record.
23 |  * Used by wayback machine.
24 |  * @author stack
25 |  */
26 | public interface ARCLocation {
27 |     /**
28 |      * @return the ARC filename: a full path to the ARC, a URL to an ARC, or
29 |      * just the part of an ARC name that is unique within a collection.
30 |      */
31 |     String getName();
32 | 
33 |     /**
34 |      * @return the offset into the ARC.
35 |      */
36 |     long getOffset();
37 | }
38 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/io/package.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
 2 | <html>
 3 | <head>
 4 | <title>org.archive.io.arc package</title>
 5 | </head>
 6 | <body>
 7 | ARC file reading and writing.
 8 | </body>
 9 | </html>
10 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/io/warc/WARCConstants.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is part of the Heritrix web crawler (crawler.archive.org).
 3 |  *
 4 |  *  Licensed to the Internet Archive (IA) by one or more individual 
 5 |  *  contributors. 
 6 |  *
 7 |  *  The IA licenses this file to You under the Apache License, Version 2.0
 8 |  *  (the "License"); you may not use this file except in compliance with
 9 |  *  the License.  You may obtain a copy of the License at
10 |  *
11 |  *      http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  *  Unless required by applicable law or agreed to in writing, software
14 |  *  distributed under the License is distributed on an "AS IS" BASIS,
15 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  *  See the License for the specific language governing permissions and
17 |  *  limitations under the License.
18 |  */
19 | 
20 | package org.archive.io.warc;
21 | 
22 | @Deprecated // empty sub-interface: declares nothing beyond org.archive.format.warc.WARCConstants
23 | public interface WARCConstants extends org.archive.format.warc.WARCConstants {
24 | }
25 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/io/warc/WARCWriterPoolSettings.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is part of the Heritrix web crawler (crawler.archive.org).
 3 |  *
 4 |  *  Licensed to the Internet Archive (IA) by one or more individual 
 5 |  *  contributors. 
 6 |  *
 7 |  *  The IA licenses this file to You under the Apache License, Version 2.0
 8 |  *  (the "License"); you may not use this file except in compliance with
 9 |  *  the License.  You may obtain a copy of the License at
10 |  *
11 |  *      http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  *  Unless required by applicable law or agreed to in writing, software
14 |  *  distributed under the License is distributed on an "AS IS" BASIS,
15 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  *  See the License for the specific language governing permissions and
17 |  *  limitations under the License.
18 |  */
19 | package org.archive.io.warc;
20 | 
21 | import org.archive.io.WriterPoolSettings;
22 | import org.archive.uid.RecordIDGenerator;
23 | 
24 | /**
25 |  * Settings for a {@link WARCWriterPool}, consulted when creating
26 |  * {@link WARCWriter}s; adds a record-ID generator accessor to {@link WriterPoolSettings}.
27 |  * 
28 |  * @version $Date: 2010-08-19 17:21:43 -0700 (Thu, 19 Aug 2010) $, $Revision: 6927 $
29 |  */
30 | public interface WARCWriterPoolSettings extends WriterPoolSettings {
31 |     RecordIDGenerator getRecordIDGenerator();
32 | }


--------------------------------------------------------------------------------
/src/main/java/org/archive/io/warc/WARCWriterPoolSettingsData.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is part of the Heritrix web crawler (crawler.archive.org).
 3 |  *
 4 |  *  Licensed to the Internet Archive (IA) by one or more individual 
 5 |  *  contributors. 
 6 |  *
 7 |  *  The IA licenses this file to You under the Apache License, Version 2.0
 8 |  *  (the "License"); you may not use this file except in compliance with
 9 |  *  the License.  You may obtain a copy of the License at
10 |  *
11 |  *      http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  *  Unless required by applicable law or agreed to in writing, software
14 |  *  distributed under the License is distributed on an "AS IS" BASIS,
15 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  *  See the License for the specific language governing permissions and
17 |  *  limitations under the License.
18 |  */
19 | package org.archive.io.warc;
20 | 
21 | import java.io.File;
22 | import java.util.List;
23 | 
24 | import org.archive.io.arc.WriterPoolSettingsData;
25 | import org.archive.uid.RecordIDGenerator;
26 | 
27 | public class WARCWriterPoolSettingsData extends WriterPoolSettingsData implements WARCWriterPoolSettings {
28 |     RecordIDGenerator generator; // package-private, non-final; returned as-is by getRecordIDGenerator()
29 |     // Passes the six generic settings to the superclass and stores the generator in this class.
30 |     public WARCWriterPoolSettingsData(String prefix, String template,
31 |             long maxFileSizeBytes, boolean compress, List<File> outputDirs,
32 |             List<String> metadata, RecordIDGenerator generator) {
33 |         super(prefix,template,maxFileSizeBytes,compress,outputDirs,metadata);
34 |         this.generator = generator;
35 |     }
36 |     @Override
37 |     public RecordIDGenerator getRecordIDGenerator() {
38 |         return generator; 
39 |     }
40 | }


--------------------------------------------------------------------------------
/src/main/java/org/archive/io/warc/package.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
 2 | <html>
 3 | <head>
 4 | <title>org.archive.io.warc package</title>
 5 | </head>
 6 | <body>
 7 | Experimental WARC Writer and Readers.  Code and specification subject to change
 8 | with no guarantees of backward compatibility: i.e. newer readers
 9 | may not be able to parse WARCs written with older writers. This package
10 | contains prototyping code for revision 0.12 of the WARC specification.
11 | See <a href="https://archive-access.svn.sourceforge.net/svnroot/archive-access/branches/gjm_warc_0_12/warc/warc_file_format.html">latest revision</a>
12 | for current state (Version 0.10 code and its documentation has been moved into the
13 | <a href="v10/package.html">v10</a> subpackage).
14 | 
15 | 
16 | <h2>Implementation Notes</h2>
17 | <h3>Tools</h3>
18 | <p>Initial implementations of <code>Arc2Warc</code> and <code>Warc2Arc</code>
19 | tools can be found in Heritrix, at
20 | org.archive.io.Arc2Warc and org.archive.io.Warc2Arc
21 | respectively.  Pass <code>--help</code> to learn how to use each tool.
22 | </p>
23 | 
24 | <h2>TODO</h2>
25 | <ul>
26 | <li>Is MIME-Version header needed?  MIME Parsers seem fine without (python email
27 | lib and java mail).</li>
28 | <li>Should we write out a Content-Transfer-Encoding
29 | header? (Currently we do not.) The spec needs a section that is explicit about our
30 | interpretation of MIME and deviations (e.g. content-transfer-encoding should
31 | be assumed binary in case of WARCs, multipart is not disallowed but is not
32 | encouraged, etc.)</li>
33 | <li>Minor: Should we use WARC-Version: 0.12 (like MIME-Version: 1.0) rather than
34 | WARC/0.12 as the lead-in to an ARCRecord?</li>
35 | </ul>
36 | 
37 | </body>
38 | </html>
39 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/net/FTPException.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is part of the Heritrix web crawler (crawler.archive.org).
 3 |  *
 4 |  *  Licensed to the Internet Archive (IA) by one or more individual 
 5 |  *  contributors. 
 6 |  *
 7 |  *  The IA licenses this file to You under the Apache License, Version 2.0
 8 |  *  (the "License"); you may not use this file except in compliance with
 9 |  *  the License.  You may obtain a copy of the License at
10 |  *
11 |  *      http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  *  Unless required by applicable law or agreed to in writing, software
14 |  *  distributed under the License is distributed on an "AS IS" BASIS,
15 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  *  See the License for the specific language governing permissions and
17 |  *  limitations under the License.
18 |  */
19 | package org.archive.net;
20 | 
21 | import java.io.IOException;
22 | 
/**
 * Signals that an FTP operation failed because of a protocol violation,
 * for example a failed authentication.
 *
 * @author pjack
 */
public class FTPException extends IOException {
    private static final long serialVersionUID = 1L;

    /** Reply code received from the FTP server. */
    private int code;

    /**
     * Creates an exception whose message is {@code "FTP error code: "}
     * followed by the given code.
     *
     * @param code  the error code from the FTP server
     */
    public FTPException(int code) {
        super("FTP error code: ".concat(Integer.toString(code)));
        this.code = code;
    }

    /**
     * Gives access to the code passed to the constructor.
     *
     * @return  the error code from the FTP server
     */
    public int getReplyCode() {
        return this.code;
    }
}
57 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/net/md5/Md5URLConnection.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is part of the Heritrix web crawler (crawler.archive.org).
 3 |  *
 4 |  *  Licensed to the Internet Archive (IA) by one or more individual 
 5 |  *  contributors. 
 6 |  *
 7 |  *  The IA licenses this file to You under the Apache License, Version 2.0
 8 |  *  (the "License"); you may not use this file except in compliance with
 9 |  *  the License.  You may obtain a copy of the License at
10 |  *
11 |  *      http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  *  Unless required by applicable law or agreed to in writing, software
14 |  *  distributed under the License is distributed on an "AS IS" BASIS,
15 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  *  See the License for the specific language governing permissions and
17 |  *  limitations under the License.
18 |  */
19 | package org.archive.net.md5;
20 | 
21 | import java.net.URL;
22 | 
23 | import org.archive.net.DownloadURLConnection;
24 | 
25 | /**
26 |  * Md5 URL connection.
27 |  * @author stack
28 |  * @version $Date$, $Revision$
29 |  */
30 | public class Md5URLConnection extends DownloadURLConnection {
31 |     protected Md5URLConnection(URL u) {
32 |         super(u);
33 |     }
34 | }


--------------------------------------------------------------------------------
/src/main/java/org/archive/net/rsync/RsyncURLConnection.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is part of the Heritrix web crawler (crawler.archive.org).
 3 |  *
 4 |  *  Licensed to the Internet Archive (IA) by one or more individual 
 5 |  *  contributors. 
 6 |  *
 7 |  *  The IA licenses this file to You under the Apache License, Version 2.0
 8 |  *  (the "License"); you may not use this file except in compliance with
 9 |  *  the License.  You may obtain a copy of the License at
10 |  *
11 |  *      http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  *  Unless required by applicable law or agreed to in writing, software
14 |  *  distributed under the License is distributed on an "AS IS" BASIS,
15 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  *  See the License for the specific language governing permissions and
17 |  *  limitations under the License.
18 |  */
19 | package org.archive.net.rsync;
20 | 
21 | import java.io.File;
22 | import java.net.URL;
23 | 
24 | import org.archive.net.DownloadURLConnection;
25 | 
26 | /**
27 |  * Rsync URL connection.
28 |  * @author stack
29 |  * @version $Date$, $Revision$
30 |  */
31 | public class RsyncURLConnection extends DownloadURLConnection {
32 |     private final String RSYNC_TIMEOUT =
33 |     	System.getProperty(RsyncURLConnection.class.getName() + ".timeout",
34 |     		"300");
35 | 
36 |     protected RsyncURLConnection(URL u) {
37 |         super(u);
38 |     }
39 |     
40 |     protected String getScript() {
41 |     	return System.getProperty(this.getClass().getName() + ".path",
42 |     		"rsync");
43 |     }
44 |     
45 |     @Override
46 |     protected String[] getCommand(final URL thisUrl,
47 |     		final File downloadFile) {
48 |     	return new String[] {getScript(), "--timeout=" + RSYNC_TIMEOUT,
49 |     		this.url.getPath(), downloadFile.getAbsolutePath()};  
50 |     }
51 | }
52 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/resource/AbstractEmptyResource.java:
--------------------------------------------------------------------------------
 1 | package org.archive.resource;
 2 | 
 3 | import java.io.ByteArrayInputStream;
 4 | import java.io.InputStream;
 5 | 
 6 | 
 7 | public class AbstractEmptyResource extends AbstractResource {
 8 | 
 9 | 	public AbstractEmptyResource(MetaData metaData, ResourceContainer container) {
10 | 		super(metaData, container);
11 | 	}
12 | 
13 | 	public InputStream getInputStream() {
14 | 		byte bytes[] = new byte[0];
15 | 		return new ByteArrayInputStream(bytes);
16 | 	}
17 | }
18 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/resource/AbstractResource.java:
--------------------------------------------------------------------------------
 1 | package org.archive.resource;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.PrintStream;
 5 | 
 6 | import org.archive.util.StreamCopy;
 7 | 
 8 | import com.google.common.io.ByteStreams;
 9 | 
10 | public abstract class AbstractResource implements Resource {
11 | 	protected ResourceContainer container;
12 | 	protected MetaData metaData;
13 | 	public AbstractResource(MetaData metaData, 
14 | 			ResourceContainer container) {
15 | 		this.container = container;
16 | 		this.metaData = metaData;
17 | 	}
18 | 
19 | 	public ResourceContainer getContainer() {
20 | 		return container;
21 | 	}
22 | 	public MetaData getMetaData() {
23 | 		return metaData;
24 | 	}
25 | 	
26 | 	public static void dump(PrintStream out, Resource resource) throws IOException {
27 | 
28 | 		MetaData m = resource.getMetaData();
29 | 
30 | 		out.println("Headers Before");
31 | 		out.print(m.toString());
32 | 		
33 | 		out.println("Resource Follows:\n===================");
34 | 		StreamCopy.copy(resource.getInputStream(),out);
35 | 
36 | 		out.println("[\n]Headers After");
37 | 		out.print(m.toString());
38 | 
39 | 	}
40 | 	public static void dumpShort(PrintStream out, Resource resource) throws IOException {
41 | 
42 | 		MetaData m = resource.getMetaData();
43 | 
44 | //		out.println("Headers Before");
45 | //		out.print(m.toString());
46 | 		
47 | 		long bytes = StreamCopy.copy(resource.getInputStream(), ByteStreams.nullOutputStream());
48 | 		out.println("Resource Was:"+bytes+" Long");
49 | 
50 | 		out.println("[\n]Headers After");
51 | 		out.print(m.toString());
52 | 
53 | 	}
54 | 
55 | }
56 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/resource/Resource.java:
--------------------------------------------------------------------------------
 1 | package org.archive.resource;
 2 | 
 3 | import java.io.InputStream;
 4 | 
 5 | 
 6 | public interface Resource {
 7 | 	/**
 8 | 	 * @return the ResourceContainer holding this Resource
 9 | 	 */
10 | 	public ResourceContainer getContainer();
11 | 
12 | 	/**
13 | 	 * @return an InputStream for reading data from this Resource. Use only
14 | 	 * once, and assume it is unbuffered
15 | 	 */
16 | 	public InputStream getInputStream();
17 | 
18 | 	/**
19 | 	 * @return the MetaData associated with this Resource
20 | 	 */
21 | 	public MetaData getMetaData();
22 | }
23 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/resource/ResourceContainer.java:
--------------------------------------------------------------------------------
 1 | package org.archive.resource;
 2 | 
 3 | /**
 4 |  * A container for one or more Resource objects. Primarily holds context for the
 5 |  * current record
 6 |  * 
 7 |  * @author Brad
 8 |  *
 9 |  */
10 | public interface ResourceContainer {
11 | 	/**
12 | 	 * @return the name of this container. Could be a path, url, basename...
13 | 	 */
14 | 	public String getName();
15 | 	public boolean isCompressed();
16 | }


--------------------------------------------------------------------------------
/src/main/java/org/archive/resource/ResourceFactory.java:
--------------------------------------------------------------------------------
 1 | package org.archive.resource;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.InputStream;
 5 | 
 6 | 
 7 | /**
 8 |  * @author Brad
 9 |  *
10 |  */
11 | public interface ResourceFactory {
12 | 	
13 | 	/**
14 | 	 * Attempts to create a Resource from the InputStream 
15 | 	 */
16 | 	public Resource getResource(InputStream is, MetaData parentMetaData, 
17 | 			ResourceContainer container) 
18 | 	throws ResourceParseException, IOException;
19 | }
20 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/resource/ResourceParseException.java:
--------------------------------------------------------------------------------
 1 | package org.archive.resource;
 2 | 
 3 | 
 4 | public class ResourceParseException extends Exception {
 5 | 
 6 | 	/** */
 7 | 	private static final long serialVersionUID = 5364502969148304884L;
 8 | 	public ResourceParseException(Exception e) {
 9 | 		super(e);
10 | 	}
11 | 	public ResourceParseException(Exception e, MetaData metaData) {
12 | 		super(e);
13 | 	}
14 | 	
15 | }
16 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/resource/ResourceProducer.java:
--------------------------------------------------------------------------------
 1 | package org.archive.resource;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | public interface ResourceProducer {
 6 | 	public Resource getNext() throws ResourceParseException, IOException;
 7 | 	public void close() throws IOException;
 8 | 	public String getContext();
 9 | }
10 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/resource/TransformingResourceProducer.java:
--------------------------------------------------------------------------------
 1 | package org.archive.resource;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | public class TransformingResourceProducer implements ResourceProducer {
 6 | 	private ResourceProducer producer;
 7 | 	private ResourceFactory factory;
 8 | 	public TransformingResourceProducer(ResourceProducer producer, ResourceFactory factory) {
 9 | 		this.producer = producer;
10 | 		this.factory = factory;
11 | 	}
12 | 	public Resource getNext() throws ResourceParseException, IOException {
13 | 		Resource inner = producer.getNext();
14 | 		if(inner == null) {
15 | 			return null;
16 | 		}
17 | 		return factory.getResource(inner.getInputStream(), inner.getMetaData(),
18 | 				inner.getContainer());
19 | 	}
20 | 	public void close() throws IOException {
21 | 		producer.close();
22 | 	}
23 | 	public String getContext() {
24 | 		return producer.getContext();
25 | 	}
26 | }
27 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/resource/arc/ARCResourceFactory.java:
--------------------------------------------------------------------------------
 1 | package org.archive.resource.arc;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.InputStream;
 5 | 
 6 | import org.archive.format.arc.ARCFormatException;
 7 | import org.archive.format.arc.ARCMetaData;
 8 | import org.archive.format.arc.ARCMetaDataParser;
 9 | import org.archive.resource.MetaData;
10 | import org.archive.resource.ResourceConstants;
11 | import org.archive.resource.Resource;
12 | import org.archive.resource.ResourceContainer;
13 | import org.archive.resource.ResourceFactory;
14 | import org.archive.resource.ResourceParseException;
15 | 
16 | public class ARCResourceFactory implements ResourceFactory, ResourceConstants {
17 | 	public ARCMetaDataParser parser;
18 | 	public boolean strict = false;
19 | 	public ARCResourceFactory() {
20 | 		parser = new ARCMetaDataParser();
21 | 	}
22 | 	public Resource getResource(InputStream is, MetaData parentMetaData,
23 | 			ResourceContainer container) throws ResourceParseException,
24 | 			IOException {
25 | 
26 | 		try {
27 | 			ARCMetaData m = parser.parse(is,strict,!container.isCompressed());
28 | 			if(m == null) {
29 | 				return null;
30 | 			}
31 | 			ARCResource r = new ARCResource(parentMetaData.createChild(ENVELOPE),
32 | 					container, m,is);
33 | 			return r;
34 | 
35 | 		} catch(ARCFormatException e) {
36 | 			throw new ResourceParseException(e);
37 | 		}
38 | 	}
39 | }
40 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/resource/arc/record/FiledescResource.java:
--------------------------------------------------------------------------------
 1 | package org.archive.resource.arc.record;
 2 | 
 3 | //import java.util.logging.Logger;
 4 | 
 5 | import org.archive.format.arc.FiledescRecord;
 6 | import org.archive.resource.AbstractEmptyResource;
 7 | import org.archive.resource.MetaData;
 8 | import org.archive.resource.ResourceConstants;
 9 | import org.archive.resource.ResourceContainer;
10 | 
11 | public class FiledescResource extends AbstractEmptyResource implements ResourceConstants {
12 | //	private static final Logger LOG = 
13 | //		Logger.getLogger(FiledescResource.class.getName()); 
14 | 
15 | 	public FiledescResource(MetaData metaData, ResourceContainer container,
16 | 			FiledescRecord record) {
17 | 		super(metaData, container);
18 | 		metaData.putLong(FILEDESC_MAJOR, record.getMajorVersion());
19 | 		metaData.putLong(FILEDESC_MINOR, record.getMinorVersion());
20 | 		metaData.putString(FILEDESC_ORGANIZATION, record.getOrganization());
21 | 		metaData.putString(FILEDESC_FORMAT, record.getFormat());
22 | 		if(record.hasMetaData()) {
23 | 			int count = record.getMetaDataCount();
24 | 			for(int i = 0; i < count; i++) {
25 | 				String name = record.getMetaDataName(i);
26 | 				String value = record.getMetaDataValue(i);
27 | 				if((name != null) && (value != null)) {
28 | 					metaData.appendObj(FILEDESC_DATA,
29 | 							METADATA_KV_NAME,name,METADATA_KV_VALUE,value);
30 | 				}
31 | 			}
32 | 		}
33 | 	}
34 | 
35 | }
36 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/resource/arc/record/FiledescResourceFactory.java:
--------------------------------------------------------------------------------
 1 | package org.archive.resource.arc.record;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.InputStream;
 5 | 
 6 | import org.archive.format.arc.FiledescRecord;
 7 | import org.archive.format.arc.FiledescRecordParser;
 8 | import org.archive.resource.MetaData;
 9 | import org.archive.resource.ResourceConstants;
10 | import org.archive.resource.Resource;
11 | import org.archive.resource.ResourceContainer;
12 | import org.archive.resource.ResourceFactory;
13 | import org.archive.resource.ResourceParseException;
14 | 
15 | public class FiledescResourceFactory implements ResourceFactory, ResourceConstants {
16 | 	FiledescRecordParser parser = new FiledescRecordParser();
17 | 	public Resource getResource(InputStream is, MetaData parentMetaData,
18 | 			ResourceContainer container) throws ResourceParseException,
19 | 			IOException {
20 | 		FiledescRecord rec = parser.parse(is);
21 | 
22 | 		parentMetaData.putString(PAYLOAD_CONTENT_TYPE, PAYLOAD_TYPE_FILEDESC);
23 | 		return new FiledescResource(
24 | 				parentMetaData.createChild(FILEDESC_METADATA), container, rec);
25 | 	}
26 | 
27 | }
28 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/resource/generic/GenericResourceProducer.java:
--------------------------------------------------------------------------------
 1 | package org.archive.resource.generic;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | import org.archive.resource.MetaData;
 6 | import org.archive.resource.Resource;
 7 | import org.archive.resource.ResourceContainer;
 8 | import org.archive.resource.ResourceParseException;
 9 | import org.archive.resource.ResourceProducer;
10 | import org.archive.streamcontext.Stream;
11 | 
12 | public class GenericResourceProducer implements ResourceContainer, ResourceProducer {
13 | 	private static long UNLIMITED = -1;
14 | 	private Stream stream;
15 | 	private String name;
16 | 	private long endOffset;
17 | 	public GenericResourceProducer(Stream stream, String name) {
18 | 		this(stream,name,UNLIMITED);
19 | 	}
20 | 	public GenericResourceProducer(Stream stream, String name, long endOffset) {
21 | 		this.stream = stream;
22 | 		this.name = name;
23 | 		this.endOffset = endOffset;
24 | 	}
25 | 	public Resource getNext() throws ResourceParseException, IOException {
26 | 		if(stream.atEof()) {
27 | 			return null;
28 | 		}
29 | 		if(endOffset != UNLIMITED) {
30 | 			if(stream.getOffset() > endOffset) {
31 | 				return null;
32 | 			}
33 | 		}
34 | 		return new GenericStreamResource(new MetaData(), this, stream);
35 | 	}
36 | 	
37 | 	public String getName() {
38 | 		return name;
39 | 	}
40 | 
41 | 	public boolean isCompressed() {
42 | 		return false;
43 | 	}
44 | 	public void close() throws IOException {
45 | 		stream.close();
46 | 	}
47 | 	public String getContext() {
48 | 		return String.format("Context(%s)(%d)", name, stream.getOffset());
49 | 	}
50 | }
51 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/resource/generic/GenericStreamResource.java:
--------------------------------------------------------------------------------
 1 | package org.archive.resource.generic;
 2 | 
 3 | import java.io.InputStream;
 4 | 
 5 | import org.archive.resource.AbstractResource;
 6 | import org.archive.resource.MetaData;
 7 | import org.archive.resource.ResourceConstants;
 8 | import org.archive.resource.ResourceContainer;
 9 | import org.archive.streamcontext.StreamWrappedInputStream;
10 | import org.archive.streamcontext.Stream;
11 | 
12 | public class GenericStreamResource extends AbstractResource implements ResourceConstants {
13 | 	private Stream stream;
14 | 	public GenericStreamResource(MetaData metaData, ResourceContainer container, Stream stream) {
15 | 		super(metaData, container);
16 | 		this.stream = stream;
17 | 
18 | 		MetaData containerMD = new MetaData(metaData, CONTAINER);
19 | 
20 | 		containerMD.putString(CONTAINER_FILENAME, container.getName());
21 | 		containerMD.putBoolean(CONTAINER_COMPRESSED, container.isCompressed());
22 | 		containerMD.putLong(CONTAINER_OFFSET, stream.getOffset());
23 | 	}
24 | 
25 | 	public InputStream getInputStream() {
26 | 		return new StreamWrappedInputStream(stream);
27 | 	}
28 | }
29 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/resource/gzip/GZIPResource.java:
--------------------------------------------------------------------------------
 1 | package org.archive.resource.gzip;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.InputStream;
 5 | 
 6 | import org.archive.format.gzip.GZIPConstants;
 7 | import org.archive.format.gzip.GZIPSeriesMember;
 8 | import org.archive.resource.AbstractResource;
 9 | import org.archive.resource.MetaData;
10 | import org.archive.resource.ResourceConstants;
11 | import org.archive.resource.ResourceContainer;
12 | import org.archive.util.io.EOFNotifyingInputStream;
13 | import org.archive.util.io.EOFObserver;
14 | 
15 | public class GZIPResource extends AbstractResource 
16 | 	implements GZIPConstants, EOFObserver, ResourceConstants {
17 | 
18 | 	private GZIPSeriesMember member;
19 | 	private EOFNotifyingInputStream eofStream;
20 | 	private GZIPMetaData gzMetaData;
21 | 
22 | 	public GZIPResource(MetaData metaData, ResourceContainer container, 
23 | 			GZIPSeriesMember member) {
24 | 		super(metaData, container);
25 | 		this.member = member;
26 | 		this.eofStream = 
27 | 			new EOFNotifyingInputStream(member, this);
28 | 
29 | 		MetaData containerMD = new MetaData(metaData, CONTAINER);
30 | 
31 | 		containerMD.putString(CONTAINER_FILENAME, member.getRecordFileContext());
32 | 		containerMD.putBoolean(CONTAINER_COMPRESSED, true);
33 | 		containerMD.putLong(CONTAINER_OFFSET, member.getRecordStartOffset());
34 | 
35 | 		gzMetaData = new GZIPMetaData(containerMD);
36 | 	}
37 | 
38 | 	public void close() throws IOException {
39 | 		member.close();
40 | 	}
41 | 
42 | 	public InputStream getInputStream() {
43 | 		return eofStream;
44 | 	}
45 | 
46 | 	public void notifyEOF() throws IOException {
47 | 		gzMetaData.setData(member);
48 | 	}
49 | }
50 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/resource/html/HTMLResource.java:
--------------------------------------------------------------------------------
 1 | package org.archive.resource.html;
 2 | 
 3 | import org.archive.resource.AbstractEmptyResource;
 4 | import org.archive.resource.MetaData;
 5 | import org.archive.resource.ResourceContainer;
 6 | 
 7 | 
 8 | public class HTMLResource extends AbstractEmptyResource {
 9 | 	
10 | 	public HTMLResource(MetaData metaData, ResourceContainer container) {
11 | 		super(metaData, container);
12 | 	}
13 | }
14 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/resource/http/HTTPHeadersResource.java:
--------------------------------------------------------------------------------
 1 | package org.archive.resource.http;
 2 | 
 3 | import org.archive.format.arc.ARCConstants;
 4 | import org.archive.format.http.HttpHeader;
 5 | import org.archive.format.http.HttpHeaders;
 6 | import org.archive.resource.AbstractEmptyResource;
 7 | import org.archive.resource.MetaData;
 8 | import org.archive.resource.ResourceContainer;
 9 | 
10 | 
11 | public class HTTPHeadersResource extends AbstractEmptyResource
12 | implements ARCConstants {
13 | 
14 | 	public HTTPHeadersResource(MetaData metaData, ResourceContainer container,
15 | 			HttpHeaders headers) {
16 | 		super(metaData, container);
17 | 		for(HttpHeader h : headers) {
18 | 			metaData.putString(h.getName(),h.getValue());
19 | 		}
20 | 	}
21 | }
22 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/resource/http/HTTPRequestResourceFactory.java:
--------------------------------------------------------------------------------
 1 | package org.archive.resource.http;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.InputStream;
 5 | 
 6 | import org.archive.format.http.HttpParseException;
 7 | import org.archive.format.http.HttpRequest;
 8 | import org.archive.format.http.HttpRequestParser;
 9 | import org.archive.resource.MetaData;
10 | import org.archive.resource.ResourceConstants;
11 | import org.archive.resource.Resource;
12 | import org.archive.resource.ResourceContainer;
13 | import org.archive.resource.ResourceFactory;
14 | import org.archive.resource.ResourceParseException;
15 | 
16 | public class HTTPRequestResourceFactory implements ResourceFactory, ResourceConstants {
17 | 	private HttpRequestParser parser;
18 | 	public HTTPRequestResourceFactory() {
19 | 		parser = new HttpRequestParser();
20 | 	}
21 | 
22 | 	public Resource getResource(InputStream is, MetaData metaData,
23 | 			ResourceContainer container) 
24 | 	throws ResourceParseException, IOException {
25 | 		try {
26 | 
27 | 			HttpRequest response = parser.parse(is);
28 | 			metaData.putString(PAYLOAD_CONTENT_TYPE, 
29 | 					PAYLOAD_TYPE_HTTP_REQUEST);
30 | 			return new HTTPRequestResource(metaData.createChild(HTTP_REQUEST_METADATA),
31 | 					container, response, true);
32 | 
33 | 		} catch(HttpParseException e) {
34 | 			throw new ResourceParseException(e);
35 | 		}
36 | 	}
37 | 
38 | }
39 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/resource/http/HTTPResponseResourceFactory.java:
--------------------------------------------------------------------------------
 1 | package org.archive.resource.http;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.InputStream;
 5 | 
 6 | import org.archive.format.http.HttpParseException;
 7 | import org.archive.format.http.HttpResponse;
 8 | import org.archive.format.http.HttpResponseParser;
 9 | import org.archive.resource.MetaData;
10 | import org.archive.resource.ResourceConstants;
11 | import org.archive.resource.Resource;
12 | import org.archive.resource.ResourceContainer;
13 | import org.archive.resource.ResourceFactory;
14 | import org.archive.resource.ResourceParseException;
15 | 
16 | public class HTTPResponseResourceFactory implements ResourceFactory, ResourceConstants {
17 | 	private HttpResponseParser parser;
18 | 	public HTTPResponseResourceFactory() {
19 | 		parser = new HttpResponseParser();
20 | 	}
21 | 
22 | 	public Resource getResource(InputStream is, MetaData metaData,
23 | 			ResourceContainer container) 
24 | 	throws ResourceParseException, IOException {
25 | 		try {
26 | 
27 | 			HttpResponse response = parser.parse(is);
28 | 			metaData.putString(PAYLOAD_CONTENT_TYPE, 
29 | 					PAYLOAD_TYPE_HTTP_RESPONSE);
30 | 			return new HTTPResponseResource(metaData.createChild(HTTP_RESPONSE_METADATA),
31 | 					container, response, true);
32 | 
33 | 		} catch(HttpParseException e) {
34 | 			throw new ResourceParseException(e);
35 | 		}
36 | 	}
37 | }
38 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/resource/producer/ARCFile.java:
--------------------------------------------------------------------------------
 1 | package org.archive.resource.producer;
 2 | 
 3 | import org.archive.resource.arc.ARCResourceFactory;
 4 | 
 5 | public class ARCFile extends EnvelopedResourceFile {
 6 | 	public ARCFile() {
 7 | 		super(new ARCResourceFactory());
 8 | 	}
 9 | }
10 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/resource/producer/WARCFile.java:
--------------------------------------------------------------------------------
 1 | package org.archive.resource.producer;
 2 | 
 3 | import org.archive.resource.warc.WARCResourceFactory;
 4 | 
 5 | public class WARCFile extends EnvelopedResourceFile {
 6 | 	public WARCFile() {
 7 | 		super(new WARCResourceFactory());
 8 | 	}
 9 | }
10 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/resource/warc/WARCResourceFactory.java:
--------------------------------------------------------------------------------
 1 | package org.archive.resource.warc;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.InputStream;
 5 | 
 6 | import org.archive.format.http.HttpParseException;
 7 | import org.archive.format.http.HttpResponse;
 8 | import org.archive.format.http.HttpResponseParser;
 9 | import org.archive.resource.MetaData;
10 | import org.archive.resource.ResourceConstants;
11 | import org.archive.resource.Resource;
12 | import org.archive.resource.ResourceContainer;
13 | import org.archive.resource.ResourceFactory;
14 | import org.archive.resource.ResourceParseException;
15 | 
16 | public class WARCResourceFactory implements ResourceFactory, ResourceConstants {
17 | 	private HttpResponseParser parser;
18 | 	public WARCResourceFactory() {
19 | 		parser = new HttpResponseParser();
20 | 	}
21 | 
22 | 	public Resource getResource(InputStream is, MetaData parentMetaData,
23 | 			ResourceContainer container) throws ResourceParseException,
24 | 			IOException {
25 | 		try {
26 | 
27 | 			HttpResponse response = parser.parse(is);
28 | 			WARCResource r = new WARCResource(parentMetaData.createChild(ENVELOPE),
29 | 					container, response);
30 | 			return r;
31 | 
32 | 		} catch(HttpParseException e) {
33 | 			throw new ResourceParseException(e);
34 | 		}
35 | 	}
36 | }
37 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/resource/warc/record/DNSResource.java:
--------------------------------------------------------------------------------
 1 | package org.archive.resource.warc.record;
 2 | 
 3 | 
 4 | import java.util.logging.Logger;
 5 | 
 6 | import org.archive.format.dns.DNSRecord;
 7 | import org.archive.format.dns.DNSResponse;
 8 | import org.archive.resource.AbstractEmptyResource;
 9 | import org.archive.resource.MetaData;
10 | import org.archive.resource.ResourceConstants;
11 | import org.archive.resource.ResourceContainer;
12 | import com.github.openjson.JSONException;
13 | import com.github.openjson.JSONObject;
14 | 
15 | /**
16 |  * Resource built from a parsed DNS response. The constructor copies the
17 |  * response date and one JSON object per DNS record into the supplied metadata.
18 |  */
19 | public class DNSResource extends AbstractEmptyResource implements ResourceConstants {
20 | 	private static final Logger LOG = Logger.getLogger(DNSResource.class.getName());
21 | 
22 | 	/**
23 | 	 * @param metaData metadata node that receives DNS_DATE and the DNS_ENTRIES children
24 | 	 * @param container container handed to the superclass
25 | 	 * @param response parsed DNS response whose records are copied
26 | 	 */
27 | 	public DNSResource(MetaData metaData, ResourceContainer container,
28 | 			DNSResponse response) {
29 | 		super(metaData, container);
30 | 		metaData.putString(DNS_DATE, response.getDate());
31 | 		try {
32 | 			for(DNSRecord dnsRecord : response) {
33 | 				metaData.appendChild(DNS_ENTRIES, recordToJson(dnsRecord));
34 | 			}
35 | 		} catch(JSONException jsonFailure) {
36 | 			// Logged, not rethrown: entries appended before the failure stay in place.
37 | 			LOG.severe(jsonFailure.getMessage());
38 | 		}
39 | 	}
40 | 
41 | 	/** Copies name, TTL, net class, type and value of one record into a new JSON object. */
42 | 	private static JSONObject recordToJson(DNSRecord dnsRecord) throws JSONException {
43 | 		JSONObject entry = new JSONObject();
44 | 		entry.put(DNS_NAME, dnsRecord.getName());
45 | 		entry.put(DNS_TTL, dnsRecord.getTtl());
46 | 		entry.put(DNS_NETCLASS, dnsRecord.getNetClass());
47 | 		entry.put(DNS_TYPE, dnsRecord.getType());
48 | 		entry.put(DNS_VALUE, dnsRecord.getValue());
49 | 		return entry;
50 | 	}
51 | }
52 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/resource/warc/record/DNSResourceFactory.java:
--------------------------------------------------------------------------------
 1 | package org.archive.resource.warc.record;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.InputStream;
 5 | 
 6 | import org.archive.RecoverableRecordFormatException;
 7 | import org.archive.format.dns.DNSResponse;
 8 | import org.archive.format.dns.DNSResponseParser;
 9 | import org.archive.resource.MetaData;
10 | import org.archive.resource.ResourceConstants;
11 | import org.archive.resource.Resource;
12 | import org.archive.resource.ResourceContainer;
13 | import org.archive.resource.ResourceFactory;
14 | import org.archive.resource.ResourceParseException;
15 | 
16 | /**
17 |  * Factory that parses a DNS response from a stream and wraps it in a
18 |  * {@link DNSResource}.
19 |  */
20 | public class DNSResourceFactory implements ResourceFactory, ResourceConstants {
21 | 
22 | 	DNSResponseParser parser = new DNSResponseParser();
23 | 
24 | 	/**
25 | 	 * Parses is as a DNS response. On success it marks parentMetaData with the
26 | 	 * DNS payload content type and returns a resource over a new DNS_METADATA
27 | 	 * child of parentMetaData.
28 | 	 *
29 | 	 * @throws ResourceParseException wrapping a RecoverableRecordFormatException
30 | 	 *         raised by the parser
31 | 	 */
32 | 	public Resource getResource(InputStream is, MetaData parentMetaData,
33 | 			ResourceContainer container) throws ResourceParseException,
34 | 			IOException {
35 | 		final DNSResponse dns = new DNSResponse();
36 | 		try {
37 | 			parser.parse(is, dns);
38 | 		} catch(RecoverableRecordFormatException badRecord) {
39 | 			throw new ResourceParseException(badRecord);
40 | 		}
41 | 		// The content type is only recorded once parsing has succeeded.
42 | 		parentMetaData.putString(PAYLOAD_CONTENT_TYPE, PAYLOAD_TYPE_DNS);
43 | 		MetaData dnsMetaData = parentMetaData.createChild(DNS_METADATA);
44 | 		return new DNSResource(dnsMetaData, container, dns);
45 | 	}
46 | 
47 | }
48 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/resource/warc/record/WARCJSONMetaDataResource.java:
--------------------------------------------------------------------------------
 1 | package org.archive.resource.warc.record;
 2 | 
 3 | import org.archive.resource.AbstractEmptyResource;
 4 | import org.archive.resource.MetaData;
 5 | import org.archive.resource.ResourceConstants;
 6 | import org.archive.resource.ResourceContainer;
 7 | 
 8 | /**
 9 |  * Resource whose metadata is supplied ready-made by the caller; it declares
10 |  * nothing beyond its constructor.
11 |  */
12 | public class WARCJSONMetaDataResource extends AbstractEmptyResource implements ResourceConstants {
13 | 
14 | 	/** Passes both arguments straight to the superclass constructor. */
15 | 	public WARCJSONMetaDataResource(MetaData metaData, ResourceContainer container) {
16 | 		super(metaData, container);
17 | 	}
18 | 
19 | }
20 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/resource/warc/record/WARCJSONMetaDataResourceFactory.java:
--------------------------------------------------------------------------------
 1 | package org.archive.resource.warc.record;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.InputStream;
 5 | import java.io.InputStreamReader;
 6 | import java.nio.charset.Charset;
 7 | 
 8 | import org.archive.resource.MetaData;
 9 | import org.archive.resource.Resource;
10 | import org.archive.resource.ResourceConstants;
11 | import org.archive.resource.ResourceContainer;
12 | import org.archive.resource.ResourceFactory;
13 | import org.archive.resource.ResourceParseException;
14 | import com.github.openjson.JSONException;
15 | import com.github.openjson.JSONTokener;
16 | 
17 | /**
18 |  * Factory for records whose body is a JSON document: the stream is decoded as
19 |  * UTF-8 and parsed directly into a {@link MetaData}.
20 |  */
21 | public class WARCJSONMetaDataResourceFactory implements ResourceFactory, ResourceConstants {
22 | 	private static final Charset UTF8 = Charset.forName("UTF-8");
23 | 
24 | 	public WARCJSONMetaDataResourceFactory() {
25 | 	}
26 | 
27 | 	/**
28 | 	 * Builds the resource from the JSON read off is. parentMetaData is not
29 | 	 * consulted by this method.
30 | 	 *
31 | 	 * @throws ResourceParseException wrapping a JSONException raised while parsing
32 | 	 */
33 | 	public Resource getResource(InputStream is, MetaData parentMetaData,
34 | 			ResourceContainer container) throws ResourceParseException,
35 | 			IOException {
36 | 		final MetaData parsed;
37 | 		try {
38 | 			InputStreamReader reader = new InputStreamReader(is, UTF8);
39 | 			parsed = new MetaData(new JSONTokener(reader));
40 | 		} catch (JSONException malformed) {
41 | 			throw new ResourceParseException(malformed);
42 | 		}
43 | 		return new WARCJSONMetaDataResource(parsed, container);
44 | 	}
45 | 
46 | }
47 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/resource/warc/record/WARCMetaDataResource.java:
--------------------------------------------------------------------------------
 1 | package org.archive.resource.warc.record;
 2 | 
 3 | //import java.util.logging.Logger;
 4 | 
 5 | import org.archive.format.http.HttpHeader;
 6 | import org.archive.format.http.HttpHeaders;
 7 | import org.archive.resource.AbstractEmptyResource;
 8 | import org.archive.resource.MetaData;
 9 | import org.archive.resource.ResourceConstants;
10 | import org.archive.resource.ResourceContainer;
11 | 
12 | /**
13 |  * Resource that records a set of header fields in its metadata: each header
14 |  * is appended to WARC_META_FIELDS_LIST as a name/value pair.
15 |  */
16 | public class WARCMetaDataResource extends AbstractEmptyResource implements ResourceConstants {
17 | 
18 | 	/**
19 | 	 * @param metaData metadata node that receives one entry per header
20 | 	 * @param container container handed to the superclass
21 | 	 * @param headers header fields to copy, in iteration order
22 | 	 */
23 | 	public WARCMetaDataResource(MetaData metaData, ResourceContainer container,
24 | 			HttpHeaders headers) {
25 | 		super(metaData, container);
26 | 		for(HttpHeader field : headers) {
27 | 			metaData.appendObj(
28 | 					WARC_META_FIELDS_LIST,
29 | 					METADATA_KV_NAME, field.getName(),
30 | 					METADATA_KV_VALUE, field.getValue());
31 | 		}
32 | 	}
33 | }
34 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/streamcontext/ByteArrayWrappedStream.java:
--------------------------------------------------------------------------------
 1 | package org.archive.streamcontext;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | /**
 6 |  * Stream backed by an in-memory byte array; reads and seeks move a cursor
 7 |  * within that array.
 8 |  */
 9 | public class ByteArrayWrappedStream extends AbstractBufferingStream {
10 | 	private byte[] buffer = null;
11 | 	int offset = 0;
12 | 
13 | 	/** @param b backing array; it is referenced, not copied */
14 | 	public ByteArrayWrappedStream(byte b[]) {
15 | 		buffer = b;
16 | 		offset = 0;
17 | 	}
18 | 
19 | 	/**
20 | 	 * Copies up to len bytes from the cursor into b starting at off, then
21 | 	 * advances the cursor by the amount copied.
22 | 	 *
23 | 	 * @return the number of bytes copied, or -1 when the cursor is at the end
24 | 	 */
25 | 	@Override
26 | 	public int doRead(byte[] b, int off, int len) throws IOException {
27 | 		if(offset == buffer.length) {
28 | 			return -1;
29 | 		}
30 | 		int amtToCopy = Math.min(buffer.length - offset, len);
31 | 		System.arraycopy(buffer, offset, b, off, amtToCopy);
32 | 		offset += amtToCopy;
33 | 		return amtToCopy;
34 | 	}
35 | 
36 | 	/**
37 | 	 * Moves the cursor to an absolute position in the backing array.
38 | 	 *
39 | 	 * @param offset target position, from 0 up to and including the array length
40 | 	 * @throws IOException if offset is negative or beyond the end of the array
41 | 	 */
42 | 	@Override
43 | 	public void doSeek(long offset) throws IOException {
44 | 		// Bound by the array length, not the current cursor: comparing with the
45 | 		// cursor rejected every forward seek, including ones inside the array.
46 | 		if(offset > buffer.length) {
47 | 			throw new IOException("seek past end..");
48 | 		}
49 | 		if(offset < 0) {
50 | 			// A negative cursor would only fail later, inside arraycopy.
51 | 			throw new IOException("seek before start..");
52 | 		}
53 | 		this.offset = (int) offset;
54 | 	}
55 | 
56 | 	/** Nothing to release for an in-memory array. */
57 | 	@Override
58 | 	public void doClose() throws IOException {
59 | 	}
60 | 
61 | }
62 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/streamcontext/HDFSStream.java:
--------------------------------------------------------------------------------
 1 | package org.archive.streamcontext;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | import org.apache.hadoop.fs.FSDataInputStream;
 6 | 
 7 | public class HDFSStream extends AbstractBufferingStream {
 8 | 	FSDataInputStream hdfs;
 9 | 	public HDFSStream(FSDataInputStream hdfs) {
10 | 		this.hdfs = hdfs;
11 | 	}
12 | 	public HDFSStream(FSDataInputStream hdfs, long offset) throws IOException {
13 | 		this.hdfs = hdfs;
14 | 		doSeek(offset);
15 | 	}
16 | 
17 | 	@Override
18 | 	public int doRead(byte[] b, int off, int len) throws IOException {
19 | 		return hdfs.read(b, off, len);
20 | 	}
21 | 
22 | 	@Override
23 | 	public void doSeek(long offset) throws IOException {
24 | //		System.err.format("HDFSdoSeek(%d)\n", offset);
25 | 		hdfs.seek(offset);
26 | 	}
27 | 
28 | 	@Override
29 | 	public void doClose() throws IOException {
30 | 		hdfs.close();
31 | 	}
32 | }
33 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/streamcontext/HTTP11Stream.java:
--------------------------------------------------------------------------------
 1 | package org.archive.streamcontext;
 2 | 
 3 | import java.io.FileNotFoundException;
 4 | import java.io.IOException;
 5 | import java.io.InputStream;
 6 | import java.net.URL;
 7 | import java.net.URLConnection;
 8 | 
 9 | /**
10 |  * Stream over the body of a URL. Seeking is done by reopening the connection
11 |  * with a Range request header that starts at the wanted byte.
12 |  */
13 | public class HTTP11Stream extends AbstractBufferingStream {
14 | 	private URL url;
15 | 	private URLConnection conn = null;
16 | 	private InputStream is = null;
17 | 
18 | 	/** Opens url from byte 0 with the default read size. */
19 | 	public HTTP11Stream(URL url)
20 | 		throws IndexOutOfBoundsException, FileNotFoundException, IOException {
21 | 		this(url, 0L, DEFAULT_READ_SIZE);
22 | 	}
23 | 
24 | 	/** Opens url from byte offset with the default read size. */
25 | 	public HTTP11Stream(URL url, long offset)
26 | 		throws IndexOutOfBoundsException, FileNotFoundException, IOException {
27 | 		this(url, offset, DEFAULT_READ_SIZE);
28 | 	}
29 | 
30 | 	/** Opens url from byte offset; offset and readSize also go to the superclass. */
31 | 	public HTTP11Stream(URL url, long offset, int readSize) throws IOException {
32 | 		super(offset, readSize);
33 | 		this.url = url;
34 | 		doSeek(offset);
35 | 	}
36 | 
37 | 	/** Closes the current response stream, if one is open, and forgets it. */
38 | 	@Override
39 | 	public void doClose() throws IOException {
40 | 		if(is == null) {
41 | 			return;
42 | 		}
43 | 		is.close();
44 | 		is = null;
45 | 	}
46 | 
47 | 	/** @return whatever the current response stream's read returns */
48 | 	@Override
49 | 	public int doRead(byte[] b, int off, int len) throws IOException {
50 | 		return is.read(b, off, len);
51 | 	}
52 | 
53 | 	/**
54 | 	 * Drops the current response stream and opens a new connection asking for
55 | 	 * the bytes from offset onward.
56 | 	 * NOTE(review): the response is not inspected, so a server that ignores
57 | 	 * Range would presumably be read from its first byte - confirm.
58 | 	 */
59 | 	@Override
60 | 	public void doSeek(long offset) throws IOException {
61 | 		doClose();
62 | 		String range = String.format("bytes=%d-", offset);
63 | 		conn = url.openConnection();
64 | 		conn.setRequestProperty("Range", range);
65 | 		conn.connect();
66 | 		is = conn.getInputStream();
67 | 	}
68 | }
69 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/streamcontext/RandomAccessFileStream.java:
--------------------------------------------------------------------------------
 1 | package org.archive.streamcontext;
 2 | 
 3 | import java.io.File;
 4 | import java.io.FileNotFoundException;
 5 | import java.io.IOException;
 6 | import java.io.RandomAccessFile;
 7 | 
 8 | public class RandomAccessFileStream extends AbstractBufferingStream {
 9 | 	
10 | 	private RandomAccessFile raf = null;
11 | 	private File file = null;
12 | 	public RandomAccessFileStream(File file)
13 | 		throws IndexOutOfBoundsException, FileNotFoundException, IOException {
14 | 		this(file,0L,DEFAULT_READ_SIZE);
15 | 	}	
16 | 	public RandomAccessFileStream(File file, long offset)
17 | 		throws IndexOutOfBoundsException, FileNotFoundException, IOException {
18 | 		this(file,offset,DEFAULT_READ_SIZE);
19 | 	}	
20 | 	public RandomAccessFileStream(File file, long offset, int readSize) 
21 | 		throws IndexOutOfBoundsException, FileNotFoundException, IOException {
22 | 		super(offset,readSize);
23 | 		raf = new RandomAccessFile(file, "r");
24 | 		if(offset > 0) {
25 | 			raf.seek(offset);
26 | 		}
27 | 		this.file = file;
28 | 	}
29 | 
30 | 	public File getFile() {
31 | 		return file;
32 | 	}
33 | 
34 | 	public void doClose() throws IOException {
35 | 		raf.close();
36 | 	}
37 | 
38 | 	public int doRead(byte[] b, int off, int len) throws IOException {
39 | 		return raf.read(b, off, len);
40 | 	}
41 | 
42 | 	public void doSeek(long offset) throws IOException {
43 | 		raf.seek(offset);
44 | 	}
45 | }
46 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/streamcontext/SimpleStream.java:
--------------------------------------------------------------------------------
 1 | package org.archive.streamcontext;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.InputStream;
 5 | 
 6 | public class SimpleStream extends AbstractBufferingStream {
 7 | 	private InputStream is;
 8 | 
 9 | 	public SimpleStream(InputStream is) {
10 | 		this(is,0L,DEFAULT_READ_SIZE);
11 | 	}
12 | 
13 | 	public SimpleStream(InputStream is, long offset) {
14 | 		this(is,offset,DEFAULT_READ_SIZE);
15 | 	}
16 | 
17 | 	public SimpleStream(InputStream is, long offset, int readSize) {
18 | 		super(offset,readSize);
19 | 		this.is = is;
20 | 	}
21 | 
22 | 	@Override
23 | 	public void doClose() throws IOException {
24 | 		is.close();
25 | 	}
26 | 
27 | 	@Override
28 | 	public int doRead(byte[] b, int off, int len) throws IOException {
29 | 		return is.read(b,off,len);
30 | 	}
31 | 
32 | 	@Override
33 | 	public void doSeek(long offset) throws IOException {
34 | 		throw new IOException("Unable to seek!");
35 | 	}
36 | }
37 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/streamcontext/Stream.java:
--------------------------------------------------------------------------------
 1 | package org.archive.streamcontext;
 2 | 
 3 | import java.io.Closeable;
 4 | import java.io.IOException;
 5 | 
 6 | /**
 7 |  * Alternate simplified interface for accessing data from an underlying source
 8 |  * of bytes.
 9 |  *
10 |  * @author brad
11 |  */
12 | public interface Stream extends Closeable {
13 | 	/** @return the stream's current offset */
14 | 	long getOffset();
15 | 
16 | 	/**
17 | 	 * Moves the stream to offset.
18 | 	 * NOTE(review): the meaning of the returned long is not stated by this
19 | 	 * interface - presumably the resulting offset; confirm against implementations.
20 | 	 */
21 | 	long setOffset(long offset) throws IOException;
22 | 
23 | 	/**
24 | 	 * Reads into bytes starting at off, for at most len bytes.
25 | 	 * NOTE(review): the return value presumably follows the
26 | 	 * InputStream.read convention - confirm against implementations.
27 | 	 */
28 | 	int read(byte[] bytes, int off, int len) throws IOException;
29 | 
30 | 	/** @return true when the stream reports end of input */
31 | 	boolean atEof();
32 | }
33 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/uid/package.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
 2 | <html>
 3 | <head>
 4 | <title>org.archive.uid package</title>
 5 | </head>
 6 | <body>
 7 | A unique ID generator.
 8 | Default is {@link org.archive.uid.UUIDGenerator}.  
 9 | To use another ID Generator, set the System Property
10 | <code>org.archive.uid.GeneratorFactory.generator</code> to point
11 | at an alternate implementation of {@link org.archive.uid.RecordIDGenerator}.
12 | 
13 | <h2>TODO</h2>
14 | <ul>
15 |     <li>MIME boundaries have upper-bound of 70 characters total including
16 |     'blank line' (<code>CRLFCRLF</code>) and two leading hyphens. Add to
17 |     {@link org.archive.uid.RecordIDGenerator}
18 |     interface an upper-bound on generated ID length.</li>
19 | <li>Add example of an <i>actionable</i> uid generator:
20 | e.g. <code>http://archive.org/UID-SCHEME/ID</code>
21 | where scheme might be <code>UUID</code> and an ID might be
22 | <code>f9472055-fbb6-4810-90e8-68fd39e145a6;type=metadata</code> or,
23 | using <a href="http://ark.cdlib.org/arkcdl.pdf">ARK</a>: 
24 | <code>http://archive.org/ark:/13030/f9472055-fbb6-4810-90e8-68fd39e145a6;type=metadata</code>.
25 | </li>
26 | </ul>
27 | </body>
28 | </html>
29 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/url/AggressiveIACanonicalizerRules.java:
--------------------------------------------------------------------------------
 1 | package org.archive.url;
 2 | 
 3 | /**
 4 |  * Rule set that assigns flags to every settings slot (scheme, host, port,
 5 |  * path, query, hash, auth); stripping a trailing path slash is optional.
 6 |  */
 7 | public class AggressiveIACanonicalizerRules extends CanonicalizeRules {
 8 | 
 9 | 	/** Same as {@code AggressiveIACanonicalizerRules(true)}. */
10 | 	public AggressiveIACanonicalizerRules() {
11 | 		this(true);
12 | 	}
13 | 
14 | 	/**
15 | 	 * @param stripSlash when true, PATH_STRIP_TRAILING_SLASH_UNLESS_EMPTY is
16 | 	 *        added to the path flags
17 | 	 */
18 | 	public AggressiveIACanonicalizerRules(boolean stripSlash) {
19 | 		final int basePath = PATH_LOWERCASE | PATH_STRIP_SESSION_ID;
20 | 		final int path = stripSlash
21 | 				? (basePath | PATH_STRIP_TRAILING_SLASH_UNLESS_EMPTY)
22 | 				: basePath;
23 | 		final int query = QUERY_LOWERCASE | QUERY_STRIP_SESSION_ID
24 | 				| QUERY_STRIP_EMPTY | QUERY_ALPHA_REORDER;
25 | 
26 | 		setRule(SCHEME_SETTINGS, SCHEME_LOWERCASE);
27 | 		setRule(HOST_SETTINGS, HOST_LOWERCASE | HOST_MASSAGE);
28 | 		setRule(PORT_SETTINGS, PORT_STRIP_DEFAULT);
29 | 		setRule(PATH_SETTINGS, path);
30 | 		setRule(QUERY_SETTINGS, query);
31 | 		setRule(HASH_SETTINGS, HASH_STRIP);
32 | 		setRule(AUTH_SETTINGS, AUTH_STRIP_PASS | AUTH_STRIP_USER);
33 | 	}
34 | 
35 | }
36 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/url/AggressiveIAURLCanonicalizer.java:
--------------------------------------------------------------------------------
 1 | package org.archive.url;
 2 | 
 3 | /**
 4 |  * Canonicalizer that applies {@link BasicURLCanonicalizer} and then an
 5 |  * {@link IAURLCanonicalizer} configured with {@link AggressiveIACanonicalizerRules}.
 6 |  */
 7 | public class AggressiveIAURLCanonicalizer implements URLCanonicalizer {
 8 | 	private static final BasicURLCanonicalizer basic = new BasicURLCanonicalizer();
 9 | 
10 | 	private static final IAURLCanonicalizer ia =
11 | 			new IAURLCanonicalizer(new AggressiveIACanonicalizerRules());
12 | 
13 | 	/** Runs the basic pass first, then the IA pass, both on the same url object. */
14 | 	public void canonicalize(HandyURL url) {
15 | 		basic.canonicalize(url);
16 | 		ia.canonicalize(url);
17 | 	}
18 | }
19 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/url/CanonicalizeRules.java:
--------------------------------------------------------------------------------
 1 | package org.archive.url;
 2 | 
 3 | /**
 4 |  * Holds one int of flag bits per settings slot (HOST_SETTINGS, PATH_SETTINGS, ...).
 5 |  * XXX it's "Canonicalizer" everywhere else but here
 6 |  */
 7 | public class CanonicalizeRules implements CanonicalizerConstants {
 8 | 	private int[] settings = new int[NUM_SETTINGS];
 9 | 
10 | 	/** Replaces the whole value stored for slot rule. */
11 | 	public void setRule(int rule, int value) {
12 | 		settings[rule] = value;
13 | 	}
14 | 
15 | 	/** @return the value currently stored for slot rule */
16 | 	public int getRule(int rule) {
17 | 		final int stored = settings[rule];
18 | 		return stored;
19 | 	}
20 | 
21 | 	/** @return true when every bit of value is set in slot rule (always true for value 0) */
22 | 	public boolean isSet(int rule, int value) {
23 | 		final int overlap = settings[rule] & value;
24 | 		return overlap == value;
25 | 	}
26 | }
27 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/url/CanonicalizerConstants.java:
--------------------------------------------------------------------------------
 1 | package org.archive.url;
 2 | 
 3 | /**
 4 |  * Settings slots and flag values for URL canonicalization. Each *_SETTINGS
 5 |  * constant indexes one slot; the constants listed under it are the values for
 6 |  * that slot, combined with bitwise OR where they are powers of two.
 7 |  */
 8 | public interface CanonicalizerConstants {
 9 | 	// Interface fields are implicitly public static final.
10 | 
11 | 	/** Slot index for the host flags. */
12 | 	int HOST_SETTINGS = 0;
13 | 	int HOST_ORIGINAL = 0;
14 | 	int HOST_LOWERCASE = 1;
15 | 	int HOST_MASSAGE = 2;
16 | 
17 | 	/** Slot index for the port flags. */
18 | 	int PORT_SETTINGS = 1;
19 | 	int PORT_ORIGINAL = 0;
20 | 	int PORT_STRIP_DEFAULT = 1;
21 | 
22 | 	/** Slot index for the path flags. */
23 | 	int PATH_SETTINGS = 2;
24 | 	int PATH_ORIGINAL = 0;
25 | 	int PATH_LOWERCASE = 1;
26 | 	int PATH_STRIP_SESSION_ID = 2;
27 | 	int PATH_STRIP_EMPTY = 4;
28 | 	int PATH_STRIP_TRAILING_SLASH_UNLESS_EMPTY = 8;
29 | 
30 | 	/** Slot index for the query flags. */
31 | 	int QUERY_SETTINGS = 3;
32 | 	int QUERY_ORIGINAL = 0;
33 | 	int QUERY_LOWERCASE = 1;
34 | 	int QUERY_STRIP_SESSION_ID = 2;
35 | 	int QUERY_STRIP_EMPTY = 4;
36 | 	int QUERY_ALPHA_REORDER = 8;
37 | 	// TODO: Need a setting to remove empty query ARGs..
38 | 
39 | 	/** Slot index for the hash flags. */
40 | 	int HASH_SETTINGS = 4;
41 | 	int HASH_ORIGINAL = 0;
42 | 	int HASH_STRIP = 1;
43 | 
44 | 	/** Slot index for the auth flags. */
45 | 	int AUTH_SETTINGS = 5;
46 | 	int AUTH_ORIGINAL = 0;
47 | 	int AUTH_STRIP_USER = 1;
48 | 	int AUTH_STRIP_PASS = 2;
49 | 
50 | 	/** Slot index for the scheme flags. */
51 | 	int SCHEME_SETTINGS = 6;
52 | 	int SCHEME_ORIGINAL = 0;
53 | 	int SCHEME_LOWERCASE = 1;
54 | 
55 | 	/** Number of slots; one more than the highest *_SETTINGS index. */
56 | 	int NUM_SETTINGS = 7;
57 | 
58 | }
59 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/url/DefaultIACanonicalizerRules.java:
--------------------------------------------------------------------------------
1 | package org.archive.url;
2 | 
3 | /**
4 |  * Empty subclass of {@link AggressiveIACanonicalizerRules}.
5 |  *
6 |  * @deprecated use AggressiveIACanonicalizerRules
7 |  */
8 | public class DefaultIACanonicalizerRules extends AggressiveIACanonicalizerRules { }
9 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/url/DefaultIAURLCanonicalizer.java:
--------------------------------------------------------------------------------
1 | package org.archive.url;
2 | 
3 | /**
4 |  * Empty subclass of {@link AggressiveIAURLCanonicalizer}.
5 |  *
6 |  * @deprecated use AggressiveIAURLCanonicalizer
7 |  */
8 | public class DefaultIAURLCanonicalizer extends AggressiveIAURLCanonicalizer { }
9 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/url/ExtractRule.java:
--------------------------------------------------------------------------------
 1 | package org.archive.url;
 2 | 
 3 | import java.util.regex.Matcher;
 4 | import java.util.regex.Pattern;
 5 | 
 6 | public class ExtractRule
 7 | {
 8 | 	protected String startsWith;
 9 | 	protected String regex;
10 | 	
11 | 	protected Pattern regexPattern;
12 | 	
13 | 	public String getStartsWith() {
14 | 		return startsWith;
15 | 	}
16 | 	public void setStartsWith(String startsWith) {
17 | 		this.startsWith = startsWith;
18 | 	}
19 | 	public String getRegex() {
20 | 		return regex;
21 | 	}
22 | 	public void setRegex(String regex) {
23 | 		regexPattern = Pattern.compile(regex);
24 | 		this.regex = regex;
25 | 	}
26 | 	
27 | 	public Matcher extract(String url)
28 | 	{		
29 | 		if ((startsWith != null) && !startsWith.isEmpty() && !url.startsWith(startsWith)) {
30 | 			return null;
31 | 		}
32 | 		
33 | 		if (regexPattern == null) {
34 | 			return null;
35 | 		}
36 | 		
37 | 		Matcher match = regexPattern.matcher(url);
38 | 		
39 | 		if (!match.find()) {
40 | 			return null;
41 | 		}
42 | 		
43 | 		return match;
44 | 	}
45 | }


--------------------------------------------------------------------------------
/src/main/java/org/archive/url/GoogleURLCanonicalizer.java:
--------------------------------------------------------------------------------
1 | package org.archive.url;
2 | 
3 | /**
4 |  * Empty subclass of {@link BasicURLCanonicalizer}.
5 |  *
6 |  * @deprecated use {@link BasicURLCanonicalizer}
7 |  */
8 | public class GoogleURLCanonicalizer extends BasicURLCanonicalizer { }
9 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/url/NonMassagingIAURLCanonicalizer.java:
--------------------------------------------------------------------------------
 1 | package org.archive.url;
 2 | 
 3 | public class NonMassagingIAURLCanonicalizer implements URLCanonicalizer {
 4 | 	private static final GoogleURLCanonicalizer google = 
 5 | 		new GoogleURLCanonicalizer();
 6 | 	private static CanonicalizeRules nonMassagingRules = 
 7 | 		new DefaultIACanonicalizerRules();
 8 | 	static {
 9 | 		nonMassagingRules.setRule(CanonicalizeRules.HOST_SETTINGS,
10 | 				CanonicalizeRules.HOST_LOWERCASE);
11 | 	}
12 | 	private static final IAURLCanonicalizer ia = 
13 | 		new IAURLCanonicalizer(nonMassagingRules);
14 | 
15 | 	public void canonicalize(HandyURL url) {
16 | 		// just google's stuff, followed by the IA default stuff:
17 | 		google.canonicalize(url);
18 | 		ia.canonicalize(url);
19 | 	}
20 | }
21 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/url/OrdinaryIACanonicalizerRules.java:
--------------------------------------------------------------------------------
 1 | package org.archive.url;
 2 | 
 3 | /**
 4 |  * Idea of these rules is to accomplish roughly the equivalent of
 5 |  * {@link UsableURIFactory} fixup plus {@link BasicURLCanonicalizer} fixup.
 6 |  */
 7 | public class OrdinaryIACanonicalizerRules extends CanonicalizeRules {
 8 | 	public OrdinaryIACanonicalizerRules() {
 9 | 		setRule(SCHEME_SETTINGS, SCHEME_LOWERCASE);
10 | 		setRule(HOST_SETTINGS, HOST_LOWERCASE);
11 | 		setRule(PORT_SETTINGS, PORT_STRIP_DEFAULT);
12 | 		setRule(QUERY_SETTINGS, QUERY_STRIP_EMPTY);
13 | 		setRule(HASH_SETTINGS, HASH_STRIP);
14 | 	}
15 | 
16 | }
17 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/url/OrdinaryIAURLCanonicalizer.java:
--------------------------------------------------------------------------------
 1 | package org.archive.url;
 2 | 
 3 | /**
 4 |  * Idea of this canonicalizer is to accomplish roughly the equivalent of
 5 |  * {@link UsableURIFactory} fixup plus {@link BasicURLCanonicalizer} fixup.
 6 |  */
 7 | public class OrdinaryIAURLCanonicalizer implements URLCanonicalizer {
 8 | 	private static final BasicURLCanonicalizer basic =
 9 | 			new BasicURLCanonicalizer();
10 | 	private static final IAURLCanonicalizer ia =
11 | 			new IAURLCanonicalizer(new OrdinaryIACanonicalizerRules());
12 | 
13 | 	/** Applies the basic pass and then the IA pass to the same url object. */
14 | 	public void canonicalize(HandyURL url) {
15 | 		basic.canonicalize(url);
16 | 		ia.canonicalize(url);
17 | 	}
18 | }
19 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/url/RewriteRule.java:
--------------------------------------------------------------------------------
 1 | package org.archive.url;
 2 | 
 3 | import java.util.regex.Matcher;
 4 | import java.util.regex.Pattern;
 5 | 
 6 | public class RewriteRule
 7 | {
 8 | 	protected String startsWith;
 9 | 	protected String regex;
10 | 	protected String replace;
11 | 	
12 | 	protected Pattern regexPattern;
13 | 	
14 | 	public String getStartsWith() {
15 | 		return startsWith;
16 | 	}
17 | 	public void setStartsWith(String startsWith) {
18 | 		this.startsWith = startsWith;
19 | 	}
20 | 	public String getRegex() {
21 | 		return regex;
22 | 	}
23 | 	public void setRegex(String regex) {
24 | 		regexPattern = Pattern.compile(regex);
25 | 		this.regex = regex;
26 | 	}
27 | 	public String getReplace() {
28 | 		return replace;
29 | 	}
30 | 	public void setReplace(String replace) {
31 | 		this.replace = replace;
32 | 	}
33 | 	
34 | 	public boolean rewrite(StringBuilder sb)
35 | 	{
36 | 		String urlkey = sb.toString();
37 | 		
38 | 		if ((startsWith != null) && !urlkey.startsWith(startsWith)) {
39 | 			return false;
40 | 		}
41 | 		
42 | 		if (regexPattern == null || replace == null) {
43 | 			return false;
44 | 		}
45 | 		
46 | 		Matcher match = regexPattern.matcher(urlkey);
47 | 		
48 | 		if (match.matches()) {
49 | 			sb.replace(0, sb.length(), match.replaceAll(replace));
50 | 			return true;
51 | 		}
52 | 		
53 | 		return false;
54 | 	}
55 | }


--------------------------------------------------------------------------------
/src/main/java/org/archive/url/SURT.java:
--------------------------------------------------------------------------------
 1 | package org.archive.url;
 2 | 
 3 | import java.io.BufferedReader;
 4 | import java.io.InputStreamReader;
 5 | import java.nio.charset.Charset;
 6 | import java.util.Iterator;
 7 | import java.util.logging.Logger;
 8 | 
 9 | import org.apache.commons.httpclient.URIException;
10 | import org.archive.util.iterator.AbstractPeekableIterator;
11 | 
12 | /**
13 |  * Helper for turning URLs into SURT-form keys; can also be run as a filter
14 |  * that converts each line of standard input.
15 |  */
16 | public class SURT {
17 | 	private static final Logger LOG =
18 | 		Logger.getLogger(SURT.class.getCanonicalName());
19 | 
20 | 	/**
21 | 	 * Converts input via SURTTokenizer.prefixKey. Input that already starts
22 | 	 * with "(" is returned untouched; a result without any "/" gets a
23 | 	 * trailing ","; on URIException a warning is logged and input is returned.
24 | 	 */
25 | 	public static String toSURT(String input) {
26 | 		if(input.startsWith("(")) {
27 | 			return input;
28 | 		}
29 | 		final String key;
30 | 		try {
31 | 			key = SURTTokenizer.prefixKey(input);
32 | 		} catch (URIException e) {
33 | 			LOG.warning("URI Exception for(" + input + "):" + e.getLocalizedMessage());
34 | 			return input;
35 | 		}
36 | 		return key.contains("/") ? key : key + ",";
37 | 	}
38 | 
39 | 	/** Reads UTF-8 lines from standard input and prints the SURT form of each. */
40 | 	public static void main(String[] args) {
41 | 		BufferedReader stdin = new BufferedReader(
42 | 				new InputStreamReader(System.in, Charset.forName("UTF-8")));
43 | 		Iterator<String> lines = AbstractPeekableIterator.wrapReader(stdin);
44 | 		while(lines.hasNext()) {
45 | 			System.out.println(toSURT(lines.next()));
46 | 		}
47 | 	}
48 | }
49 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/url/URLCanonicalizer.java:
--------------------------------------------------------------------------------
1 | package org.archive.url;
2 | 
3 | /** Contract for components that canonicalize a {@link HandyURL}. */
4 | public interface URLCanonicalizer {
5 | 	/** Returns nothing, so any result presumably lands on url itself. */
6 | 	void canonicalize(HandyURL url);
7 | }
8 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/url/URLKeyMaker.java:
--------------------------------------------------------------------------------
1 | package org.archive.url;
2 | 
3 | import java.net.URISyntaxException;
4 | 
5 | /** Contract for turning a URL string into a key string. */
6 | public interface URLKeyMaker {
7 | 	String makeKey(String url) throws URISyntaxException;
8 | }
9 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/CrossProduct.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util;
 2 | 
 3 | import java.util.ArrayDeque;
 4 | import java.util.ArrayList;
 5 | import java.util.Deque;
 6 | import java.util.List;
 7 | import java.util.Stack;
 8 | 
 9 | /**
10 |  * Computes the cross product (every combination taking one element from each
11 |  * list, in order) of a list of lists.
12 |  */
13 | public class CrossProduct <T> {
14 | 	/**
15 | 	 * @param listOfLists the lists to combine; not modified
16 | 	 * @return one list per combination, ordered with the last input list
17 | 	 *         varying fastest; a single empty list when listOfLists is empty,
18 | 	 *         and no lists at all when any input list is empty
19 | 	 */
20 | 	public List<List<T>> crossProduct(List<List<T>> listOfLists) {
21 | 		ArrayList<List<T>> results = new ArrayList<List<T>>();
22 | 		Stack<T> current = new Stack<T>();
23 | 		Deque<List<T>> remainder = new ArrayDeque<List<T>>(listOfLists);
24 | 		recurse(remainder,current,results);
25 | 		return results;
26 | 	}
27 | 
28 | 	/**
29 | 	 * Depth-first expansion: takes the first remaining list, tries each of its
30 | 	 * elements on top of current, and restores both structures on the way out.
31 | 	 */
32 | 	private void recurse(Deque<List<T>> remainder,
33 | 			Stack<T> current, ArrayList<List<T>> accumulation) {
34 | 		if(remainder.isEmpty()) {
35 | 			// One element chosen from every list: snapshot the combination.
36 | 			ArrayList<T> combination = new ArrayList<T>(current);
37 | 			dump(combination);
38 | 			accumulation.add(combination);
39 | 			return;
40 | 		}
41 | 		List<T> cur = remainder.removeFirst();
42 | 		for(T o : cur) {
43 | 			current.push(o);
44 | 			recurse(remainder,current,accumulation);
45 | 			current.pop();
46 | 		}
47 | 		remainder.addFirst(cur);
48 | 	}
49 | 
50 | 	/** Prints one combination to standard output as "CrossOutput:a,b,c". */
51 | 	private void dump(ArrayList<T> a) {
52 | 		StringBuilder sb = new StringBuilder();
53 | 		// Must start true: starting false made the first-element branch
54 | 		// unreachable and put a stray comma before the first element.
55 | 		boolean first = true;
56 | 		for(T o : a) {
57 | 			if(first) {
58 | 				first = false;
59 | 			} else {
60 | 				sb.append(",");
61 | 			}
62 | 			// append(Object) renders null as "null" instead of throwing.
63 | 			sb.append(o);
64 | 		}
65 | 		System.out.println("CrossOutput:" + sb.toString());
66 | 	}
67 | }
68 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/FileNameSpec.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util;
 2 | 
 3 | import java.util.concurrent.atomic.AtomicInteger;
 4 | 
 5 | public class FileNameSpec {
 6 | //	private final static String DEFAULT_PREFIX_PATTERN = "UNK-%h-%p-%t-%s";
 7 | 	private AtomicInteger aInt;
 8 | 	private String prefix;
 9 | 	private String suffix;
10 | 	public FileNameSpec(String prefix, String suffix) {
11 | 		this.prefix = prefix;
12 | 		this.suffix = suffix;
13 | 		aInt = new AtomicInteger(-1);
14 | 	}
15 | 	public String getNextName() {
16 | 		StringBuilder sb = new StringBuilder();
17 | 		sb.append(prefix);
18 | 		sb.append(String.format("%06d",aInt.incrementAndGet()));
19 | 		sb.append(suffix);
20 | 		return sb.toString();
21 | 	}
22 | }
23 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/IterableLineIterator.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util;
 2 | 
 3 | import java.io.Reader;
 4 | import java.util.Iterator;
 5 | 
 6 | import org.apache.commons.io.LineIterator;
 7 | 
 8 | /**
 9 |  * A LineIterator that also implements Iterable, so that it can be used with
10 |  * the java enhanced for-each loop syntax.
11 |  *
12 |  * @author nlevitt
13 |  */
14 | public class IterableLineIterator extends LineIterator
15 |     implements Iterable<String> {
16 | 
17 |     /** @param reader source of lines, handed to the LineIterator constructor */
18 |     public IterableLineIterator(final Reader reader)
19 |             throws IllegalArgumentException {
20 |         super(reader);
21 |     }
22 | 
23 |     /** @return this same object, so every call shares one underlying cursor */
24 |     @SuppressWarnings("unchecked")
25 |     public Iterator<String> iterator() {
26 |         final Iterator<String> self = this;
27 |         return self;
28 |     }
29 | }
30 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/ProgressStatisticsReporter.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is part of the Heritrix web crawler (crawler.archive.org).
 3 |  *
 4 |  *  Licensed to the Internet Archive (IA) by one or more individual 
 5 |  *  contributors. 
 6 |  *
 7 |  *  The IA licenses this file to You under the Apache License, Version 2.0
 8 |  *  (the "License"); you may not use this file except in compliance with
 9 |  *  the License.  You may obtain a copy of the License at
10 |  *
11 |  *      http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  *  Unless required by applicable law or agreed to in writing, software
14 |  *  distributed under the License is distributed on an "AS IS" BASIS,
15 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  *  See the License for the specific language governing permissions and
17 |  *  limitations under the License.
18 |  */
19 | package org.archive.util;
20 | 
21 | import java.io.IOException;
22 | import java.io.PrintWriter;
23 | 
24 | public interface ProgressStatisticsReporter {
25 |     /**
26 |      * @param writer Where to write statistics.
27 |      * @throws IOException 
28 |      */
29 |     public void progressStatisticsLine(PrintWriter writer) throws IOException;
30 |     
31 |     /**
32 |      * @param writer Where to write statistics legend.
33 |      * @throws IOException 
34 |      */
35 |     public void progressStatisticsLegend(PrintWriter writer) throws IOException;
36 | }
37 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/StringParse.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util;
 2 | 
 3 | import java.util.List;
 4 | import java.util.regex.Pattern;
 5 | 
 6 | public class StringParse {
 7 | 	private final static Pattern IP_PATTERN =
 8 |         Pattern.compile("b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?).)"
 9 |                               + "{3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)b");
10 | 	public static boolean isIP(final String ip) {
11 | 		// TODO:
12 | 		return ip.length() > 0;
13 | //		return IP_PATTERN.matcher(ip).matches();
14 | 	}
15 | 	public static boolean isIPBad(final String ip) {
16 | 		return IP_PATTERN.matcher(ip).matches();
17 | 	}
18 | 	public static String join(List<String> p) {
19 | 		return join(p,",");
20 | 	}
21 | 	public static String join(List<String> p, String delim) {
22 | 		StringBuilder sb = new StringBuilder();
23 | 		boolean first = true;
24 | 		for(String part : p) {
25 | 			if(first) {
26 | 				first = false;
27 | 			} else {
28 | 				sb.append(delim);
29 | 			}
30 | 			sb.append(part);
31 | 		}
32 | 		return sb.toString();
33 | 	}
34 | }
35 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/anvl/Label.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is part of the Heritrix web crawler (crawler.archive.org).
 3 |  *
 4 |  *  Licensed to the Internet Archive (IA) by one or more individual 
 5 |  *  contributors. 
 6 |  *
 7 |  *  The IA licenses this file to You under the Apache License, Version 2.0
 8 |  *  (the "License"); you may not use this file except in compliance with
 9 |  *  the License.  You may obtain a copy of the License at
10 |  *
11 |  *      http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  *  Unless required by applicable law or agreed to in writing, software
14 |  *  distributed under the License is distributed on an "AS IS" BASIS,
15 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  *  See the License for the specific language governing permissions and
17 |  *  limitations under the License.
18 |  */
19 | 
20 | package org.archive.util.anvl;
21 | 
22 | class Label extends SubElement {
23 | 	public static final char COLON = ':';
24 | 	
25 |     @SuppressWarnings("unused")
26 |     private Label() {
27 |         this(null);
28 |     }
29 |     
30 |     public Label(final String s) {
31 |         super(s);
32 |     }
33 |     
34 |     @Override
35 |     protected void checkCharacter(char c, String srcStr, int index) {
36 |     	super.checkCharacter(c, srcStr, index);
37 |     	if (c == COLON) {
38 |     		throw new IllegalArgumentException("Label cannot contain " + COLON);
39 |     	}
40 |     }
41 | }


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/anvl/package.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
 2 | <html>
 3 | <head>
 4 | <title>org.archive.util.anvl package</title>
 5 | </head>
 6 | <body>
 7 | Parsers and Writers for the (expired) Internet-Draft <a 
 8 | href="http://www.cdlib.org/inside/diglib/ark/anvlspec.pdf">A Name-Value
 9 | Language (ANVL)</a>.  Use {@link org.archive.util.anvl.ANVLRecord} 
10 | to create new instances of ANVL Records and for parsing.
11 | 
12 | <h2>Implementation Details</h2>
13 | <p>The ANVL Internet-Draft of 14 February, 2005 is unspecific as to the
14 | definition of 'blank line' and 'newline'.  This parser implementation
15 | assumes <code>CRNL</code>.
16 | </p>
17 | <p>The draft says "An element consists of a label, a colon, and an optional value".
18 | Should that be: "An element consists of a label and an optional value, or a
19 | comment"?</p>
20 | 
21 | <p>The specification is unclear regarding CR or NL in <i>label</i> or
22 | <i>comment</i> (this implementation disallows CR or NL in labels but lets
23 | them pass in comments).</p>
24 | 
25 | <p>A grammar would help.  Here is RFC822:</p>
26 | <pre>
27 |      field       =  field-name ":" [ field-body ] CRLF
28 |      
29 |      field-name  =  1*&lt;any CHAR, excluding CTLs, SPACE, and ":"&gt;
30 |      
31 |      field-body  =  field-body-contents
32 |                     [CRLF LWSP-char field-body]
33 |      
34 |      field-body-contents =
35 |                    &lt;the ASCII characters making up the field-body, as
36 |                     defined in the following sections, and consisting
37 |                     of combinations of atom, quoted-string, and
38 |                     specials tokens, or else consisting of texts&gt;
39 | </pre>
40 | </body>
41 | </html>
42 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/binsearch/FieldExtractingSLR.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util.binsearch;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | /**
 6 |  * Special SLR wrapper (SeekableLineReader) that extracts a certain field
 7 |  * from the reader and only returns that field
 8 |  * @author ilya
 9 |  *
10 |  */
11 | 
12 | public class FieldExtractingSLR extends WrappedSeekableLineReader {
13 | 	protected String sep;
14 | 	protected int fieldIndex;
15 | 	
16 | 	public FieldExtractingSLR(SeekableLineReader slr, int fieldIndex, String sep) {
17 | 		super(slr);
18 | 		this.fieldIndex = fieldIndex;
19 | 		this.sep = sep;
20 | 	}
21 | 
22 | 	@Override
23 | 	public String readLine() throws IOException {
24 | 		String line = super.readLine();
25 | 		String[] fields = line.split(sep);
26 | 		return fields[fieldIndex];
27 | 	}
28 | 	
29 | 	@Override
30 | 	public void skipLine() throws IOException {
31 | 		super.readLine();
32 | 	}
33 | }
34 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/binsearch/SeekableLineReader.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util.binsearch;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.InputStream;
 5 | 
 6 | public interface SeekableLineReader {
 7 | 	public void seek(long offset) throws IOException;
 8 | 	public void seekWithMaxRead(long offset, boolean gzip, int maxLength) throws IOException;
 9 | 	public InputStream getInputStream();
10 | 	public String readLine() throws IOException;
11 | 	public void skipLine() throws IOException;
12 | 	public void close() throws IOException;
13 | 	public long getSize() throws IOException;
14 | 	public void setBufferFully(boolean bufferFully);
15 | 	public boolean isClosed();
16 | }
17 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/binsearch/SeekableLineReaderFactory.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util.binsearch;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | public interface SeekableLineReaderFactory {
 6 | 	public final static int BINSEARCH_BLOCK_SIZE = 8192;
 7 | 	public SeekableLineReader get() throws IOException;
 8 |     public void close() throws IOException;
 9 | 	public long getModTime();
10 | 	public void reload() throws IOException;
11 | }
12 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/binsearch/SeekableLineReaderIterator.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util.binsearch;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | import org.archive.util.io.RuntimeIOException;
 6 | import org.archive.util.iterator.AbstractPeekableIterator;
 7 | 
 8 | public class SeekableLineReaderIterator extends AbstractPeekableIterator<String> {
 9 | 	protected SeekableLineReader slr;
10 | 	protected boolean propagateException;
11 | 	
12 | 	public SeekableLineReaderIterator(SeekableLineReader slr) {
13 | 		this(slr, true);
14 | 	}
15 | 	
16 | 	public SeekableLineReaderIterator(SeekableLineReader slr, boolean propagateException) {
17 | 		this.slr = slr;
18 | 		this.propagateException = propagateException;
19 | 	}
20 | 	
21 | 	@Override
22 | 	public String getNextInner() {
23 | 		String next = null;
24 | 		if (slr != null) {
25 | 			try {
26 | 				next = slr.readLine();
27 | 			} catch (IOException e) {
28 | 				if (propagateException) {
29 | 					throw new RuntimeIOException(e.toString());
30 | 				}
31 | 			}
32 | 		}
33 | 		return next;
34 | 	}
35 | 	@Override
36 | 	public void close() throws IOException {
37 | 		if (slr != null) {
38 | 			slr.close();
39 | 		}
40 | 	}
41 | }


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/binsearch/WrappedSeekableLineReader.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util.binsearch;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.InputStream;
 5 | 
 6 | /**
 7 |  * WrappedSeekableLineReader that wraps an existing SeekableLineReader for custom extension 
 8 |  * @author ilya
 9 |  *
10 |  */
11 | public class WrappedSeekableLineReader implements SeekableLineReader {
12 | 
13 | 	protected SeekableLineReader slr;
14 | 	
15 | 	public WrappedSeekableLineReader(SeekableLineReader slr)
16 | 	{
17 | 		this.slr = slr;
18 | 	}
19 | 	
20 | 	@Override
21 | 	public void seek(long offset) throws IOException {
22 | 		this.slr.seek(offset);		
23 | 	}
24 | 
25 | 	@Override
26 | 	public void seekWithMaxRead(long offset, boolean gzip, int maxLength)
27 | 			throws IOException {
28 | 		slr.seekWithMaxRead(offset, gzip, maxLength);
29 | 	}
30 | 
31 | 	@Override
32 | 	public InputStream getInputStream() {
33 | 		return slr.getInputStream();
34 | 	}
35 | 
36 | 	@Override
37 | 	public String readLine() throws IOException {
38 | 		return slr.readLine();
39 | 	}
40 | 
41 | 	@Override
42 | 	public void close() throws IOException {
43 | 		slr.close();
44 | 	}
45 | 
46 | 	@Override
47 | 	public long getSize() throws IOException {
48 | 		return slr.getSize();
49 | 	}
50 | 
51 | 	@Override
52 | 	public void setBufferFully(boolean bufferFully) {
53 | 		slr.setBufferFully(bufferFully);
54 | 	}
55 | 
56 | 	@Override
57 | 	public boolean isClosed() {
58 | 		return slr.isClosed();
59 | 	}
60 | 
61 | 	@Override
62 | 	public void skipLine() throws IOException {
63 | 		slr.skipLine();
64 | 	}
65 | 
66 | }
67 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/binsearch/impl/HDFSSeekableLineReader.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util.binsearch.impl;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.InputStream;
 5 | 
 6 | import org.apache.hadoop.fs.FSDataInputStream;
 7 | import org.archive.util.binsearch.AbstractSeekableLineReader;
 8 | 
 9 | import com.google.common.io.ByteStreams;
10 | 
11 | public class HDFSSeekableLineReader extends AbstractSeekableLineReader {
12 | 	private FSDataInputStream fsdis;
13 | 	private long length;
14 | 	
15 | 	public HDFSSeekableLineReader(FSDataInputStream fsdis, long length,
16 | 			int blockSize) {
17 | 		super(blockSize);
18 | 		this.fsdis = fsdis;
19 | 		this.length = length;
20 | 	}
21 | 	
22 | 	public InputStream doSeekLoad(long offset, int maxLength) throws IOException {
23 | 		fsdis.seek(offset);
24 | 		
25 | 		if (maxLength >= 0) {
26 | 			return ByteStreams.limit(fsdis, maxLength);
27 | 		} else {
28 | 			return fsdis;
29 | 		}
30 |     }
31 | 	
32 | 	public long getOffset() throws IOException {
33 | 		return fsdis.getPos();
34 | 	}
35 | 
36 | 	public void doClose() throws IOException {
37 | 		//Superclass closes the input stream
38 | 		fsdis = null;
39 | 	}
40 | 
41 | 	public long getSize() throws IOException {
42 | 		return length;
43 | 	}
44 | 	
45 | }
46 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/binsearch/impl/HDFSSeekableLineReaderFactory.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util.binsearch.impl;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | import org.apache.hadoop.fs.FSDataInputStream;
 6 | import org.apache.hadoop.fs.FileStatus;
 7 | import org.apache.hadoop.fs.FileSystem;
 8 | import org.apache.hadoop.fs.Path;
 9 | import org.archive.util.binsearch.SeekableLineReader;
10 | import org.archive.util.binsearch.SeekableLineReaderFactory;
11 | 
12 | public class HDFSSeekableLineReaderFactory implements SeekableLineReaderFactory {
13 | 	private FileSystem fs;
14 | 	private Path path;
15 | 	public HDFSSeekableLineReaderFactory(FileSystem fs, Path path) {
16 | 		this.fs = fs;
17 | 		this.path = path;
18 | 	}
19 | 	public SeekableLineReader get() throws IOException {
20 | 		FileStatus status = fs.getFileStatus(path);
21 | 		if(status.isDir()) {
22 | 			throw new IOException("Path:" + path.toUri().toASCIIString() + " is a directory!");
23 | 		}
24 | 		long length = status.getLen();
25 | 		FSDataInputStream fsdis = fs.open(path);
26 | 		return new HDFSSeekableLineReader(fsdis, length, 4096);
27 | 	}
28 | 	
29 | 	public void close() throws IOException
30 | 	{
31 | 		if (this.fs != null) {
32 | 			fs.close();
33 | 		}
34 | 	}
35 | 	
36 | 	public long getModTime()
37 | 	{
38 | 		try {
39 | 			return fs.getFileStatus(path).getModificationTime();
40 | 		} catch (IOException e) {
41 | 			return 0;
42 | 		}
43 | 	}
44 | 	@Override
45 |     public void reload() throws IOException {
46 | 	    // TODO Auto-generated method stub
47 | 	    
48 |     }
49 | }
50 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/binsearch/impl/MappedSeekableLineReader.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util.binsearch.impl;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.InputStream;
 5 | 
 6 | import org.archive.util.binsearch.AbstractSeekableLineReader;
 7 | import org.archive.util.binsearch.ByteBufferInputStream;
 8 | 
 9 | import com.google.common.io.ByteStreams;
10 | 
11 | public class MappedSeekableLineReader extends AbstractSeekableLineReader {
12 | 
13 |     private ByteBufferInputStream bbis;
14 | 
15 |     public MappedSeekableLineReader(ByteBufferInputStream bbis, int blockSize) throws IOException {
16 |         super(blockSize);
17 |         this.bbis = bbis;
18 |     }
19 |     
20 |     public long getOffset() throws IOException
21 |     {
22 |         if (closed) {
23 |             return 0;
24 |         }
25 |         
26 |         return bbis.position();
27 |     }
28 |     
29 |     @Override
30 |     protected InputStream doSeekLoad(long offset, int maxLength)
31 |             throws IOException {
32 |         
33 |         bbis.position(offset);
34 |         
35 |         if (maxLength > 0) {
36 |             return ByteStreams.limit(bbis, maxLength); 
37 |         } else {
38 |             return bbis;
39 |         }
40 |     }
41 | 
42 |     @Override
43 |     public long getSize() throws IOException {
44 |         return bbis.length();
45 |     }
46 | 
47 |     @Override
48 |     protected void doClose() throws IOException {
49 |         bbis = null;
50 |     }
51 | }
52 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/binsearch/impl/RandomAccessFileSeekableLineReader.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util.binsearch.impl;
 2 | 
 3 | import java.io.FileInputStream;
 4 | import java.io.IOException;
 5 | import java.io.InputStream;
 6 | import java.io.RandomAccessFile;
 7 | 
 8 | import org.archive.util.binsearch.AbstractSeekableLineReader;
 9 | 
10 | import com.google.common.io.ByteStreams;
11 | 
12 | public class RandomAccessFileSeekableLineReader extends AbstractSeekableLineReader {
13 | 	
14 | 	private RandomAccessFile raf;
15 | 
16 | 	public RandomAccessFileSeekableLineReader(RandomAccessFile raf, int blockSize) {
17 | 		super(blockSize);
18 | 		this.raf = raf;
19 | 	}
20 | 
21 | 	public InputStream doSeekLoad(long offset, int maxLength) throws IOException {
22 | 		raf.seek(offset);
23 | 		
24 |     	FileInputStream fis = new FileInputStream(raf.getFD());
25 |     	
26 |     	if (maxLength > 0) {
27 |     		return ByteStreams.limit(fis, maxLength);
28 |     	} else {
29 |     		return fis;
30 |     	}
31 |     }
32 | 		
33 | 	public long getOffset() throws IOException
34 | 	{
35 | 		if (closed) {
36 | 			return 0;
37 | 		}
38 | 		
39 | 		return raf.getFilePointer();
40 | 	}
41 | 	
42 | 	public void doClose() throws IOException {
43 | 		if (raf != null) {
44 | 			raf.close();
45 | 		}
46 | 		raf = null;
47 | 	}
48 | 	
49 | 	public long getSize() throws IOException {
50 | 		return raf.length();
51 | 	}
52 | }
53 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/binsearch/impl/RandomAccessFileSeekableLineReaderFactory.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util.binsearch.impl;
 2 | 
 3 | import java.io.File;
 4 | import java.io.IOException;
 5 | import java.io.RandomAccessFile;
 6 | 
 7 | import org.archive.util.binsearch.SeekableLineReader;
 8 | import org.archive.util.binsearch.SeekableLineReaderFactory;
 9 | 
10 | public class RandomAccessFileSeekableLineReaderFactory implements SeekableLineReaderFactory {
11 | 	private File file;
12 | 	private int blockSize = BINSEARCH_BLOCK_SIZE;
13 | 	
14 | 	public RandomAccessFileSeekableLineReaderFactory(File file) {
15 | 		this.file = file;
16 | 	}
17 | 	public RandomAccessFileSeekableLineReaderFactory(File file, int blockSize) {
18 | 		this.file = file;
19 | 		this.blockSize = blockSize;
20 | 	}
21 | 	public SeekableLineReader get() throws IOException {
22 | 		return new RandomAccessFileSeekableLineReader(new RandomAccessFile(file, "r"),
23 | 				blockSize);
24 | 	}
25 | 	public void close() throws IOException {
26 | 		this.file = null;
27 | 	}
28 | 	
29 | 	public long getModTime()
30 | 	{
31 | 		return file.lastModified();
32 | 	}
33 | 	
34 | 	@Override
35 |     public void reload() throws IOException {
36 | 		//RAF created each time, nothing to reload
37 |     }
38 | }
39 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/io/BytesReadObserver.java:
--------------------------------------------------------------------------------
1 | package org.archive.util.io;
2 | 
/**
 * Callback that is told how many bytes were read.
 */
public interface BytesReadObserver {
	/**
	 * @param amt number of bytes read
	 */
	void notifyBytesRead(int amt);
}
6 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/io/CRCInputStream.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util.io;
 2 | import java.io.IOException;
 3 | import java.io.InputStream;
 4 | import java.util.zip.CRC32;
 5 | 
 6 | public class CRCInputStream extends InputStream {
 7 | 	private InputStream is = null;
 8 | 	private CRC32 crc = null;
 9 | 	private long count = 0;
10 | 	public CRCInputStream(InputStream is) {
11 | 		this(is,new CRC32());
12 | 	}
13 | 	public CRCInputStream(InputStream is, CRC32 crc) {
14 | 		this.is = is;
15 | 		this.crc = crc;
16 | 		count = 0;
17 | 	}
18 | 	@Override
19 | 	public int read() throws IOException {
20 | 		int b = is.read();
21 | 		if(b != -1) {
22 | 			crc.update(b);
23 | 			count++;
24 | 		}
25 | 		return b;
26 | 	}
27 | 	public int read(byte[] b) throws IOException {
28 | 		return read(b,0,b.length);
29 | 	}
30 | 	public int read(byte[] b, int off, int len) throws IOException {
31 | 		int amt = is.read(b, off, len);
32 | 		if(amt > -1) {
33 | 			count += amt;
34 | 			crc.update(b, off, amt);
35 | 		}
36 | 		return amt;
37 | 	}
38 | 	public long getCRCValue() {
39 | 		return crc.getValue();
40 | 	}
41 | 	public long getByteCount() {
42 | 		return count;
43 | 	}
44 | }
45 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/io/CRCOutputStream.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util.io;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.OutputStream;
 5 | import java.util.zip.CRC32;
 6 | 
 7 | public class CRCOutputStream extends OutputStream {
 8 | 	OutputStream os = null;
 9 | 	private CRC32 crc = null;
10 | 	boolean autoFlush = false;
11 | 	long bytesWritten = 0;
12 | 	public CRCOutputStream(OutputStream os) {
13 | 		this(os,false);
14 | 	}
15 | 	public CRCOutputStream(OutputStream os, boolean autoFlush) {
16 | 		this.os = os;
17 | 		this.crc = new CRC32();
18 | 		this.autoFlush = autoFlush;
19 | 		bytesWritten = 0;
20 | 	}
21 | 
22 | 	@Override
23 | 	public void write(int b) throws IOException {
24 | 		crc.update(b);
25 | 		os.write(b);
26 | 		if(autoFlush) 
27 | 			os.flush();
28 | 		bytesWritten++;
29 | 	}
30 | 	@Override
31 | 	public void write(byte[] b) throws IOException {
32 | 		write(b,0,b.length);
33 | 	}
34 | 	@Override
35 | 	public void write(byte[] b, int off, int len) throws IOException {
36 | 		crc.update(b, off, len);
37 | 		os.write(b,0,len);
38 | 		if(autoFlush) {
39 | 			os.flush();
40 | 		}
41 | 		bytesWritten += len;
42 | 	}
43 | 	public long getCRCValue() {
44 | 		return crc.getValue();
45 | 	}
46 | 	public long getBytesWritten() {
47 | 		return bytesWritten;
48 | 	}
49 | }
50 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/io/CommitedOutputStream.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util.io;
 2 | 
 3 | import java.io.FilterOutputStream;
 4 | import java.io.IOException;
 5 | import java.io.OutputStream;
 6 | 
 7 | public abstract class CommitedOutputStream extends FilterOutputStream {
 8 | 	public CommitedOutputStream(OutputStream arg0) {
 9 | 		super(arg0);
10 | 	}
11 | 	public abstract void commit() throws IOException;
12 | }
13 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/io/EOFNotifyingInputStream.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util.io;
 2 | 
 3 | import java.io.FilterInputStream;
 4 | import java.io.IOException;
 5 | import java.io.InputStream;
 6 | 
 7 | public class EOFNotifyingInputStream extends FilterInputStream {
 8 | 	EOFObserver observer;
 9 | 	boolean notified = false;
10 | 	public EOFNotifyingInputStream(InputStream in, EOFObserver observer) {
11 | 		super(in);
12 | 		this.observer = observer;
13 | 	}
14 | 	private void doNotify() throws IOException {
15 | 		if(!notified) {
16 | 			notified = true;
17 | 			if(observer != null) {
18 | 				observer.notifyEOF();
19 | 			}
20 | 		}
21 | 	}
22 | 	
23 | 	@Override
24 | 	public int read() throws IOException {
25 | 		int amtRead = super.read();
26 | 		if(amtRead == -1) {
27 | 			doNotify();
28 | 		}
29 | 		return amtRead;
30 | 	}
31 | 
32 | 	@Override
33 | 	public int read(byte[] b) throws IOException {
34 | 		return read(b,0,b.length);
35 | 	}
36 | 
37 | 	@Override
38 | 	public int read(byte[] b, int off, int len) throws IOException {
39 | 		int amtRead = super.read(b, off, len);
40 | 		if(amtRead == -1) {
41 | 			doNotify();
42 | 		}
43 | 		return amtRead;
44 | 	}
45 | }
46 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/io/EOFObserver.java:
--------------------------------------------------------------------------------
1 | package org.archive.util.io;
2 | 
3 | import java.io.IOException;
4 | 
5 | public interface EOFObserver {
6 | 	public void notifyEOF() throws IOException;
7 | }
8 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/io/NotifyingInputStream.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util.io;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.InputStream;
 5 | 
 6 | public class NotifyingInputStream extends InputStream {
 7 | 	InputStream wrapped;
 8 | 	BytesReadObserver observer;
 9 | 	public NotifyingInputStream(InputStream wrapped, 
10 | 			BytesReadObserver observer) {
11 | 		this.wrapped = wrapped;
12 | 		this.observer = observer;
13 | 	}
14 | 	private int notifyRead(int amt) {
15 | 		if(amt > 0) {
16 | 			observer.notifyBytesRead(amt);
17 | 		}
18 | 		return amt;
19 | 	}
20 | 	@Override
21 | 	public int read() throws IOException {
22 | 		return notifyRead(wrapped.read());
23 | 	}
24 | 	@Override
25 | 	public int read(byte[] b) throws IOException {
26 | 		return notifyRead(wrapped.read(b));
27 | 	}
28 | 	@Override
29 | 	public int read(byte[] b, int o, int l) throws IOException {
30 | 		return notifyRead(wrapped.read(b,o,l));
31 | 	}
32 | }
33 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/io/PushBackOneByteInputStream.java:
--------------------------------------------------------------------------------
1 | package org.archive.util.io;
2 | 
3 | import java.io.IOException;
4 | 
5 | public interface PushBackOneByteInputStream {
6 | 	public void pushback() throws IOException;
7 | 	public int read() throws IOException;
8 | }
9 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/io/RuntimeIOException.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util.io;
 2 | 
 3 | public class RuntimeIOException extends RuntimeException {
 4 |     private static final long serialVersionUID = 4762025404760379497L;
 5 |     
 6 |     private int status = 503;
 7 |     
 8 |     public RuntimeIOException()
 9 |     {
10 |     	
11 |     }
12 |     
13 |     public RuntimeIOException(String message)
14 |     {
15 |     	super(message);
16 |     }
17 |     
18 |     public RuntimeIOException(int status)
19 |     {
20 |     	this.status = status;
21 |     }
22 |     
23 |     public RuntimeIOException(Throwable cause)
24 |     {
25 |     	super(cause);
26 |     }
27 |    
28 |     public RuntimeIOException(int status, Throwable cause)
29 |     {
30 |     	super(cause);
31 |     	this.status = status;
32 |     } 
33 |     
34 |     public int getStatus()
35 |     {
36 |     	return status;
37 |     }
38 | }
39 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/iterator/BoundedStringIterator.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util.iterator;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.Iterator;
 5 | 
 6 | public class BoundedStringIterator extends AbstractPeekableIterator<String> 
 7 | 	implements CloseableIterator<String> {
 8 | 
 9 | 	private Iterator<String> inner;
10 | 	private String boundary;
11 | 	private boolean inclusive;
12 | 	private int flip;
13 | 
14 | 	public BoundedStringIterator(Iterator<String> inner, String boundary) {
15 | 		this(inner, boundary, false);
16 | 	}
17 | 	
18 | 	public BoundedStringIterator(Iterator<String> inner, String boundary, boolean inclusive) {
19 | 		this(inner, boundary, inclusive, false);
20 | 	}
21 | 	
22 | 	public BoundedStringIterator(Iterator<String> inner, String boundary, boolean inclusive, boolean reverse) {
23 | 		this.inner = inner;
24 | 		this.boundary = boundary;
25 | 		this.inclusive = inclusive;
26 | 		this.flip = (reverse ? -1 : 1);
27 | 	}
28 | 
29 | 	@Override
30 | 	public String getNextInner() {
31 | 		String tmp = null;
32 | 		if(inner.hasNext()) {
33 | 			tmp = inner.next();
34 | 			if(tmp.compareTo(boundary) * flip >= 0 && (!inclusive || !tmp.startsWith(boundary))) {
35 | 				tmp = null;
36 | 				try {
37 | 					close();
38 | 				} catch (IOException e) {
39 | 					throw new RuntimeException(e);
40 | 				}
41 | 			}
42 | 		}
43 | 		return tmp;
44 | 	}
45 | 
46 | 	public void close() throws IOException {
47 | 		CloseableIteratorUtil.attemptClose(inner);
48 | 	}
49 | 
50 | }
51 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/iterator/CachingStringFilter.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util.iterator;
 2 | 
 3 | import java.util.LinkedHashMap;
 4 | import java.util.Map;
 5 | 
 6 | public class CachingStringFilter implements StringFilter {
 7 | 	private LRUCache cache;
 8 | 	private StringFilter inner;
 9 | 	public CachingStringFilter(StringFilter inner, int max) {
10 | 		this.inner = inner;
11 | 		cache = new LRUCache(max);
12 | 	}
13 | 
14 | 	public boolean isFiltered(String text) {
15 | 		Boolean v = cache.remove(text);
16 | 		if(v == null) {
17 | 			v = inner.isFiltered(text);			
18 | 		}
19 | 		cache.put(text, v);
20 | 		return v;
21 | 	}
22 | 
23 | 	public class LRUCache extends LinkedHashMap<String, Boolean> {
24 | 	     /**  */
25 | 		private static final long serialVersionUID = 1L;
26 | 		private int max = 100;
27 | 
28 | 	     public LRUCache(int max) {
29 | 	    	 this.max = max;
30 | 	     }
31 | 
32 | 	     protected boolean removeEldestEntry(Map.Entry<String,Boolean> eldest) {
33 | 	    	 return (size() > max);
34 | 	     }
35 | 	}
36 | }
37 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/iterator/CloseableCompositeIterator.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util.iterator;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.Iterator;
 5 | import java.util.LinkedList;
 6 | 
 7 | public class CloseableCompositeIterator<E> implements CloseableIterator<E> {
 8 | 
 9 | 	protected LinkedList<CloseableIterator<E>> iters;
10 | 	protected Iterator<CloseableIterator<E>> iterPtr;
11 | 	protected CloseableIterator<E> currIter;
12 | 	
13 |     public CloseableCompositeIterator()
14 | 	{
15 | 		iters = new LinkedList<CloseableIterator<E>>();
16 | 	}
17 | 	
18 | 	public void addFirst(CloseableIterator<E> e)
19 | 	{
20 | 		iters.addFirst(e);
21 | 	}
22 | 	
23 | 	public void addLast(CloseableIterator<E> e)
24 | 	{
25 | 		iters.addLast(e);
26 | 	}
27 | 	
28 | 	@Override
29 |     public boolean hasNext() {
30 | 		
31 | 		if (iterPtr == null) {
32 | 			iterPtr = iters.iterator();
33 | 			currIter = iterPtr.next();
34 | 		}
35 | 		
36 | 		if (currIter == null) {
37 | 			return false;
38 | 		}
39 | 		
40 | 		while (currIter != null) {
41 | 			if (currIter.hasNext()) {
42 | 				return true;
43 | 			}
44 | 			
45 | 			currIter = (iterPtr.hasNext() ? iterPtr.next() : null);
46 | 		}
47 | 		
48 | 		return false;
49 |     }
50 | 
51 | 	@Override
52 |     public E next() {
53 | 		return currIter.next();
54 |     }
55 | 
56 | 	@Override
57 |     public void remove() {
58 | 		currIter.remove();
59 |     }
60 | 
61 | 	@Override
62 |     public void close() throws IOException {
63 | 		for (CloseableIterator<E> e : iters) {
64 | 			if (e != null) {
65 | 				try {
66 | 					e.close();
67 | 				} catch (IOException io) {
68 | 					
69 | 				}
70 | 			}
71 | 		}
72 |     }
73 | }
74 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/iterator/CloseableIterator.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util.iterator;
 2 | 
 3 | import java.io.Closeable;
 4 | import java.util.Iterator;
 5 | 
 6 | /**
 7 |  * Iterator with a close method that frees up any resources associated with 
 8 |  * the Iterator.
 9 |  *
10 |  * @author brad
11 |  * @version $Date$, $Revision$
12 |  * @param <E> 
13 |  */
14 | public interface CloseableIterator<E> extends Iterator<E>, Closeable {
15 | }
16 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/iterator/CloseableIteratorUtil.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util.iterator;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.Iterator;
 5 | 
 6 | public class CloseableIteratorUtil {
 7 | 	public static <E> void attemptClose(Iterator<E> i) throws IOException {
 8 | 		if(i instanceof CloseableIterator) {
 9 | 			((CloseableIterator<E>) i).close();
10 | 		}
11 | 	}
12 | }
13 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/iterator/CloseableIteratorWrapper.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util.iterator;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.Iterator;
 5 | 
 6 | /**
 7 |  * Wrap a regular Iterator&lt;S&gt; to create a CloseableIterator&lt;S&gt; where the close() is a no-op
 8 |  * @author ilya
 9 |  *
10 |  * @param <S>
11 |  */
12 | 
13 | public class CloseableIteratorWrapper<S> implements CloseableIterator<S>
14 | {
15 | 	protected Iterator<S> iter;
16 | 	
17 | 	public CloseableIteratorWrapper(Iterator<S> iter)
18 | 	{
19 | 		this.iter = iter;
20 | 	}
21 | 	
22 | 	@Override
23 |     public boolean hasNext() {
24 | 		return this.iter.hasNext();
25 |     }
26 | 
27 | 	@Override
28 |     public S next() {
29 | 		return this.iter.next();
30 |     }
31 | 
32 | 	@Override
33 |     public void remove() {
34 | 		this.iter.remove();
35 |         
36 |     }
37 | 
38 | 	@Override
39 |     public void close() throws IOException {
40 |         //No Op
41 |     }		
42 | }


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/iterator/PeekableIterator.java:
--------------------------------------------------------------------------------
1 | package org.archive.util.iterator;
2 | 
3 | public interface PeekableIterator<E> extends CloseableIterator<E> {
4 | 	public E peek();
5 | }
6 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/iterator/PrefixMatchStringIterator.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util.iterator;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | 
 6 | public class PrefixMatchStringIterator extends AbstractPeekableIterator<String>
 7 | {
 8 | 	private boolean first = true;
 9 | 	private String key;
10 | 	private CloseableIterator<String> inner;
11 | 	
12 | 	public PrefixMatchStringIterator(CloseableIterator<String> inner, String key, boolean alwaysIncludeFirst)
13 | 	{
14 | 		this.inner = inner;
15 | 		this.key = key;
16 | 		this.first = alwaysIncludeFirst;
17 | 	}
18 | 
19 | 	@Override
20 | 	public String getNextInner() {
21 | 		
22 | 		if (!inner.hasNext()) {
23 | 			return null;
24 | 		}
25 | 		
26 | 		String blockLine = inner.next();
27 | 		
28 | 		// only compare the correct length:
29 | 		String prefCmp = key;
30 | 		
31 | 		if (first) {
32 | 			// always add first:
33 | 			first = false;
34 | 		} else if (!blockLine.startsWith(prefCmp)) {
35 | 			return null;
36 | 		}
37 | 		
38 | 		return blockLine;
39 | 	}
40 | 
41 | 	@Override
42 | 	public void close() throws IOException {
43 | 		inner.close();
44 | 	}
45 | }


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/iterator/StartBoundedStringIterator.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util.iterator;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.Iterator;
 5 | 
 6 | public class StartBoundedStringIterator extends AbstractPeekableIterator<String> {
 7 | 
 8 | 	private Iterator<String> inner;
 9 | 	private String boundary;
10 | 	private boolean done = false;
11 | 	private boolean started = false;
12 | 	private int flip = 1;
13 | 	
14 | 	public StartBoundedStringIterator(Iterator<String> inner, String boundary) {
15 | 		this(inner, boundary, false);
16 | 	}
17 | 
18 | 	public StartBoundedStringIterator(Iterator<String> inner, String boundary, boolean reverse) {
19 | 		this.inner = inner;
20 | 		this.boundary = boundary;
21 | 		this.done = false;
22 | 		this.started = false;
23 | 		this.flip = (reverse ? -1 : 1);
24 | 	}
25 | 
26 | 	@Override
27 | 	public String getNextInner() {
28 | 		if(done) {
29 | 			return null;
30 | 		}
31 | 		if(started) {
32 | 			if(inner.hasNext()) {
33 | 				String tmp = inner.next();
34 | 				if(tmp == null) {
35 | 					done = true;
36 | 					return null;
37 | 				}
38 | 				return tmp;
39 | 			}
40 | 		}
41 | 		while(inner.hasNext()) {
42 | 			String tmp = inner.next();
43 | 			
44 | 			int cmp = boundary.compareTo(tmp) * flip;
45 | 			
46 | 			if ((cmp <= 0)) {
47 | 				started = true;
48 | 				return tmp;
49 | 			}
50 | 		}
51 | 		try {
52 | 			close();
53 | 		} catch(IOException e) {
54 | 			throw new RuntimeException(e);
55 | 		}
56 | 		done = true;
57 | 		return null;
58 | 	}
59 | 
60 | 	public void close() throws IOException {
61 | 		CloseableIteratorUtil.attemptClose(inner);
62 | 	}
63 | 
64 | 
65 | }
66 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/iterator/StringFilter.java:
--------------------------------------------------------------------------------
1 | package org.archive.util.iterator;
2 | 
/**
 * A yes/no test applied to a single string.
 */
public interface StringFilter {
	/**
	 * @param text the string to test
	 * @return true if the implementation considers {@code text} filtered
	 */
	boolean isFiltered(String text);
}
6 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/iterator/StringTransformer.java:
--------------------------------------------------------------------------------
1 | package org.archive.util.iterator;
2 | 
/**
 * Maps one string to another.
 */
public interface StringTransformer {
	/**
	 * @param input the string to transform
	 * @return the transformed string
	 */
	String transform(String input);
}
6 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/iterator/TransformingPrefixStringFilter.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util.iterator;
 2 | 
 3 | import java.util.Collection;
 4 | import java.util.TreeSet;
 5 | 
 6 | public class TransformingPrefixStringFilter implements StringFilter {
 7 | 	TreeSet<String> filters;
 8 | 	StringTransformer transformer;
 9 | 
10 | 	public TransformingPrefixStringFilter(Collection<String> blocks) {
11 | 		this(blocks,null);
12 | 	}
13 | 	public TransformingPrefixStringFilter(Collection<String> blocks,
14 | 			StringTransformer transformer) {
15 | 		filters = makeTreeSet(blocks,transformer);
16 | 		this.transformer = transformer;
17 | 	}
18 | 
19 | 	public static TreeSet<String> makeTreeSet(Collection<String> blocks, 
20 | 			StringTransformer trans) {
21 | 		TreeSet<String> tmp = new TreeSet<String>();
22 | 		for(String filter : blocks) {
23 | 			if(trans != null) {
24 | 				filter = trans.transform(filter);
25 | 			}
26 | 			String possiblePrefix = tmp.floor(filter);
27 | 	        if (possiblePrefix != null && filter.startsWith(possiblePrefix)) {
28 | 	        	// don't add - a prefix is already in the set:
29 | 	        } else {
30 | 	        	// is this a prefix of the existing item?
31 | 	        	String possibleLonger = tmp.ceiling(filter);
32 | 	        	if(possibleLonger == null) {
33 | 	        	} else if(possibleLonger.startsWith(filter)) {
34 | 	        		tmp.remove(possibleLonger);
35 | 	        	}
36 | 	        	tmp.add(filter);
37 | 	        }
38 | 		}
39 | 		return tmp;
40 | 	}
41 | 	
42 | 	public boolean isFiltered(String text) {
43 | 		if(transformer != null) {
44 | 			text = transformer.transform(text);
45 | 		}
46 |         String possiblePrefix = filters.floor(text);
47 |         return (possiblePrefix != null && text.startsWith(possiblePrefix));
48 | 	}
49 | }
50 | 


--------------------------------------------------------------------------------
/src/main/java/org/archive/util/zip/NoGzipMagicException.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is part of the Heritrix web crawler (crawler.archive.org).
 3 |  *
 4 |  *  Licensed to the Internet Archive (IA) by one or more individual 
 5 |  *  contributors. 
 6 |  *
 7 |  *  The IA licenses this file to You under the Apache License, Version 2.0
 8 |  *  (the "License"); you may not use this file except in compliance with
 9 |  *  the License.  You may obtain a copy of the License at
10 |  *
11 |  *      http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  *  Unless required by applicable law or agreed to in writing, software
14 |  *  distributed under the License is distributed on an "AS IS" BASIS,
15 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  *  See the License for the specific language governing permissions and
17 |  *  limitations under the License.
18 |  */
19 | package org.archive.util.zip;
20 | 
21 | import java.io.IOException;
22 | 
/**
 * {@link IOException} subtype that carries neither a detail message nor a
 * cause.
 * NOTE(review): judging by the name it signals input that lacks the gzip
 * magic bytes; confirm against the code that throws it.
 */
public class NoGzipMagicException extends IOException {

    private static final long serialVersionUID = 3084169624430655013L;

    /** Creates the exception without a detail message. */
    public NoGzipMagicException() {
        // the no-argument IOException constructor is invoked implicitly
    }
}
31 | 


--------------------------------------------------------------------------------
/src/main/resources/org/archive/commons.properties:
--------------------------------------------------------------------------------
1 | operator=
2 | publisher=
3 | wat.warcinfo.description=
4 | warc.format=WARC File Format 1.0
5 | warc.format.conforms.to=http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf
6 | 


--------------------------------------------------------------------------------
/src/main/resources/org/archive/ia-web-commons-version.txt:
--------------------------------------------------------------------------------
1 | ia-web-commons.${pom.version}-${build.time}
2 | 


--------------------------------------------------------------------------------
/src/test/java/org/archive/extract/RealCDXExtractorOutputTest.java:
--------------------------------------------------------------------------------
 1 | package org.archive.extract;
 2 | 
 3 | import java.net.MalformedURLException;
 4 | import java.net.URI;
 5 | import java.net.URISyntaxException;
 6 | import java.net.URL;
 7 | import java.net.URLEncoder;
 8 | 
 9 | import junit.framework.TestCase;
10 | 
11 | 
12 | public class RealCDXExtractorOutputTest extends TestCase {
13 | 
14 |     public void testEscapeResolvedUrl() throws Exception {
15 | 	String context ="http://www.uni-giessen.de/cms/studium/dateien/informationberatung/merkblattpdf";
16 | 	String spec = "http://fss.plone.uni-giessen.de/fß/studium/dateien/informationberatung/merkblattpdf/file/Mérkblatt zur Gestaltung von Nachteilsausgleichen.pdf?föo=bar#änchor";
17 | 	String escaped = RealCDXExtractorOutput.resolve(context, spec);
18 | 	assertTrue(escaped.indexOf(" ") < 0);
19 | 	URI parsed = new URI(escaped);
20 | 	assertEquals("änchor", parsed.getFragment());
21 |     }
22 | 
23 |     public void testNoDoubleEscaping() throws Exception {
24 | 	String spec = "https://www.google.com/search?q=java+escape+url+spaces&ie=utf-8&oe=utf-8";
25 | 	String resolved = RealCDXExtractorOutput.resolve(spec, spec);
26 | 	assertTrue(spec.equals(resolved));
27 |     }
28 | }
29 | 


--------------------------------------------------------------------------------
/src/test/java/org/archive/format/gzip/GZIPMemberWriterTest.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.gzip;
 2 | 
 3 | import java.io.ByteArrayInputStream;
 4 | import java.io.File;
 5 | import java.io.FileOutputStream;
 6 | import java.io.IOException;
 7 | 
 8 | import org.archive.util.IAUtils;
 9 | 
10 | import junit.framework.TestCase;
11 | 
12 | public class GZIPMemberWriterTest extends TestCase {
13 | 
14 | 	public void testWrite() throws IOException {
15 |                 File outFile = File.createTempFile("tmp", ".gz");
16 | 		GZIPMemberWriter gzw = new GZIPMemberWriter(new FileOutputStream(outFile));
17 | 		gzw.write(new ByteArrayInputStream("Here is record 1".getBytes(IAUtils.UTF8)));
18 | 		gzw.write(new ByteArrayInputStream("Here is record 2".getBytes(IAUtils.UTF8)));
19 | 	}
20 | 
21 | }
22 | 


--------------------------------------------------------------------------------
/src/test/java/org/archive/format/json/CompoundORJSONPathSpecTest.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.json;
 2 | 
 3 | import java.util.ArrayList;
 4 | 
 5 | import org.archive.util.TestUtils;
 6 | import com.github.openjson.JSONException;
 7 | import com.github.openjson.JSONObject;
 8 | 
 9 | import junit.framework.TestCase;
10 | 
11 | public class CompoundORJSONPathSpecTest extends TestCase {
12 | 	String json1S = "{\"a\":\"A\"}";
13 | 	String json2S = "{\"b\":\"B\"}";
14 | 	public void testExtract() throws JSONException {
15 | 		JSONObject json1 = new JSONObject(json1S);
16 | 		JSONObject json2 = new JSONObject(json2S);
17 | 		ArrayList<JSONPathSpec> parts = new ArrayList<JSONPathSpec>();
18 | 		parts.add(new SimpleJSONPathSpec("a"));
19 | 		parts.add(new SimpleJSONPathSpec("b"));
20 | 		
21 | 		JSONPathSpec comp = new CompoundORJSONPathSpec(parts);
22 | 		TestUtils.dumpMatch("json1", comp.extract(json1));
23 | 		TestUtils.assertLoLMatches(new String[][]{{"A"}}, comp.extract(json1));
24 | 		TestUtils.assertLoLMatches(new String[][]{{"B"}}, comp.extract(json2));
25 | 	}
26 | 
27 | }
28 | 


--------------------------------------------------------------------------------
/src/test/java/org/archive/format/json/JSONViewTest.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.json;
 2 | 
 3 | import org.archive.util.TestUtils;
 4 | import com.github.openjson.JSONException;
 5 | import com.github.openjson.JSONObject;
 6 | 
 7 | import junit.framework.TestCase;
 8 | 
 9 | public class JSONViewTest extends TestCase {
10 | 	
11 | 	public int getInt(byte b[]) {
12 | 		return b[0] & 0xff;
13 | 	}
14 | 	
15 | 	public void testBytes() throws JSONException {
16 | 		JSONObject o = new JSONObject();
17 | 		o.append("name1", "val\\rue1");
18 | 		String json = o.toString();
19 | 		System.out.format("once: (%s)\n",json);
20 | 		JSONObject o2 = new JSONObject(json);
21 | 		System.out.format("twice: (%s)\n",o2.toString());
22 | 		
23 | 		
24 | 		byte b[] = new byte[2];
25 | 		for(int i = 0; i < 256; i++) {
26 | 			b[0] = (byte) i;
27 | 			int gi = getInt(b);
28 | 			System.out.format("I(%d) gi(%d)\n",i,gi);
29 | 		}
30 | 	}
31 | 	public void testApply() throws JSONException {
32 | 		String json1S = "{\"url\":\"a\",\"link\":[{\"zz\":\"1\",\"qq\":\"qa\"},{\"zz2\":\"2\",\"qq\":\"qb\"},{\"zz\":\"3\",\"qq\":\"qc\"},{\"zz\":\"4\"}]}";
33 | 		JSONObject json1 = new JSONObject(json1S);
34 | 
35 | 		JSONView view = new JSONView("url","@link.zz");
36 | 		TestUtils.assertLoLMatches(new String[][]{{"a","1"},{"a",""},{"a","3"},{"a","4"}},
37 | 				view.apply(json1));
38 | 
39 | 		view = new JSONView("url","@link.{zz,qq}");
40 | 		TestUtils.assertLoLMatches(new String[][]{{"a","1","qa"},{"a","","qb"},{"a","3","qc"},{"a","4",""}},
41 | 				view.apply(json1));
42 | 	}
43 | 
44 | }
45 | 


--------------------------------------------------------------------------------
/src/test/java/org/archive/format/json/SimpleJSONPathSpecTest.java:
--------------------------------------------------------------------------------
 1 | package org.archive.format.json;
 2 | 
 3 | import org.archive.util.TestUtils;
 4 | import com.github.openjson.JSONException;
 5 | import com.github.openjson.JSONObject;
 6 | 
 7 | import junit.framework.TestCase;
 8 | 
 9 | public class SimpleJSONPathSpecTest extends TestCase {
10 | 	String json1 = "{\"a\": {  \"b\": \"Foo\" }}";
11 | 	String json2 = "{\"a\": {  \"b\": [{\"a\":\"1\"},{\"a\":\"2\"}] }}";
12 | 
13 | 	String json3 = "{\"a\": {  \"b\": {\"A\":\"11\",\"B\":\"22\"} }}";
14 | 	String json4 = "{\"a\": {  \"b\": [{\"A\":\"11\",\"B\":\"22\"},{\"A\":\"33\",\"B\":\"44\"}] }}";
15 | 
16 | 	public void testExtract() throws JSONException {
17 | 		JSONObject json = new JSONObject(json1);
18 | 		JSONPathSpec spec = new SimpleJSONPathSpec("a.b");
19 | 		TestUtils.dumpMatch("json1", spec.extract(json));
20 | 		TestUtils.assertLoLMatches(new String[][]{{"Foo"}}, spec.extract(json));
21 | 		
22 | 		json = new JSONObject(json2);
23 | 		spec = new SimpleJSONPathSpec("a.@b.a");
24 | 		TestUtils.dumpMatch("json2", spec.extract(json));
25 | 		TestUtils.assertLoLMatches(new String[][]{{"1"},{"2"}}, spec.extract(json));
26 | 
27 | 		json = new JSONObject(json3);
28 | 		spec = new SimpleJSONPathSpec("a.b.{A,B}");
29 | 		TestUtils.dumpMatch("json3", spec.extract(json));
30 | 		TestUtils.assertLoLMatches(new String[][]{{"11","22"}}, spec.extract(json));
31 | 
32 | 		json = new JSONObject(json4);
33 | 		spec = new SimpleJSONPathSpec("a.@b.{A,B}");
34 | 		TestUtils.dumpMatch("json4", spec.extract(json));
35 | 		TestUtils.assertLoLMatches(new String[][]{{"11","22"},{"33","44"}}, spec.extract(json));
36 | 	}
37 | }
38 | 


--------------------------------------------------------------------------------
/src/test/java/org/archive/io/warc/WARCReaderFactoryTest.java:
--------------------------------------------------------------------------------
 1 | package org.archive.io.warc;
 2 | 
 3 | import java.io.FileInputStream;
 4 | import java.io.IOException;
 5 | 
 6 | import org.archive.format.warc.WARCConstants;
 7 | import org.archive.format.warc.WARCConstants.WARCRecordType;
 8 | import org.archive.io.ArchiveReader;
 9 | import org.archive.io.ArchiveRecord;
10 | 
11 | import junit.framework.TestCase;
12 | 
13 | public class WARCReaderFactoryTest extends TestCase {
14 | 	
15 | 	// Test files:
16 | 	String[] files = new String[] {
17 | 			"src/test/resources/org/archive/format/gzip/IAH-urls-wget.warc.gz",
18 | 			"src/test/resources/org/archive/format/warc/IAH-urls-wget.warc"
19 | 	};
20 | 
21 | 	public void testGetStringInputstreamBoolean() throws IOException {
22 | 		// Check the test files can be opened:
23 | 		for( String file : files ) {
24 | 			FileInputStream is = new FileInputStream(file);
25 | 			ArchiveReader ar = WARCReaderFactory.get(file, is, true);
26 | 			ArchiveRecord r = ar.get();
27 | 			String type = (String) r.getHeader().getHeaderValue(WARCConstants.HEADER_KEY_TYPE);
28 | 			// Check the first record comes out as a 'warcinfo' record.
29 | 			assertEquals(WARCRecordType.warcinfo.name(), type);
30 | 		}
31 | 	}
32 | 	
33 | 
34 | }
35 | 


--------------------------------------------------------------------------------
/src/test/java/org/archive/resource/warc/WARCResourceTest.java:
--------------------------------------------------------------------------------
 1 | package org.archive.resource.warc;
 2 | 
 3 | import static org.archive.resource.ResourceConstants.PAYLOAD_LENGTH;
 4 | import static org.archive.resource.ResourceConstants.PAYLOAD_SLOP_BYTES;
 5 | 
 6 | import java.io.IOException;
 7 | 
 8 | import org.archive.extract.ExtractingResourceFactoryMapper;
 9 | import org.archive.extract.ExtractingResourceProducer;
10 | import org.archive.extract.ProducerUtils;
11 | import org.archive.extract.ResourceFactoryMapper;
12 | import org.archive.resource.Resource;
13 | import org.archive.resource.ResourceParseException;
14 | import org.archive.resource.ResourceProducer;
15 | import org.archive.util.StreamCopy;
16 | 
17 | import com.github.openjson.JSONObject;
18 | 
19 | import junit.framework.TestCase;
20 | 
21 | public class WARCResourceTest extends TestCase {
22 | 
23 | 	public void testWARCResource() throws ResourceParseException, IOException {
24 | 		String testFileName = "../../format/warc/IAH-urls-wget.warc";
25 | 		ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
26 | 		ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
27 | 		ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper);
28 | 
29 | 		Resource resource = extractor.getNext();
30 | 
31 | 		while (resource != null) {
32 | 			JSONObject payloadMD = resource.getMetaData().getTopMetaData().getJSONObject("Envelope")
33 | 					.getJSONObject("Payload-Metadata");
34 | 
35 | 			if (payloadMD.has(PAYLOAD_LENGTH)) {
36 | 				assertTrue(payloadMD.getLong(PAYLOAD_LENGTH) != -1);
37 | 			}
38 | 			if (payloadMD.has(PAYLOAD_SLOP_BYTES)) {
39 | 				assertEquals(4, payloadMD.getLong(PAYLOAD_SLOP_BYTES));
40 | 			}
41 | 
42 | 			StreamCopy.readToEOF(resource.getInputStream());
43 | 			resource = extractor.getNext();
44 | 		}
45 | 	}
46 | }
47 | 


--------------------------------------------------------------------------------
/src/test/java/org/archive/uid/UUIDGeneratorTest.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is part of the Heritrix web crawler (crawler.archive.org).
 3 |  *
 4 |  *  Licensed to the Internet Archive (IA) by one or more individual 
 5 |  *  contributors. 
 6 |  *
 7 |  *  The IA licenses this file to You under the Apache License, Version 2.0
 8 |  *  (the "License"); you may not use this file except in compliance with
 9 |  *  the License.  You may obtain a copy of the License at
10 |  *
11 |  *      http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  *  Unless required by applicable law or agreed to in writing, software
14 |  *  distributed under the License is distributed on an "AS IS" BASIS,
15 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  *  See the License for the specific language governing permissions and
17 |  *  limitations under the License.
18 |  */
19 | package org.archive.uid;
20 | 
21 | import java.net.URI;
22 | import java.net.URISyntaxException;
23 | import java.util.HashMap;
24 | import java.util.Map;
25 | 
26 | import junit.framework.TestCase;
27 | 
28 | /**
29 |  * @author stack
30 |  * @version $Revision$ $Date$
31 |  */
32 | public class UUIDGeneratorTest extends TestCase {
33 | 	public void testQualifyRecordID() throws URISyntaxException {
34 | 		RecordIDGenerator g = new UUIDGenerator();
35 | 		URI uri = g.getRecordID();
36 | 		Map<String, String> qualifiers = new HashMap<String, String>();
37 | 		qualifiers.put("a", "b");
38 | 		URI nuURI = g.qualifyRecordID(uri, qualifiers);
39 | 		assertNotSame(uri, nuURI);
40 | 		qualifiers.put("c", "d");
41 | 		nuURI = g.qualifyRecordID(nuURI, qualifiers);
42 | 		assertNotSame(uri, nuURI);
43 | 	}
44 | }
45 | 


--------------------------------------------------------------------------------
/src/test/java/org/archive/url/AggressiveIAURLCanonicalizerTest.java:
--------------------------------------------------------------------------------
 1 | package org.archive.url;
 2 | 
 3 | import java.net.URISyntaxException;
 4 | 
 5 | import junit.framework.TestCase;
 6 | 
 7 | public class AggressiveIAURLCanonicalizerTest extends TestCase {
 8 | 	static AggressiveIAURLCanonicalizer ia = new AggressiveIAURLCanonicalizer();
 9 | 	public void testCanonicalize() throws URISyntaxException {
10 | 		// FULL end-to-end tests:
11 | 		check("http://www.alexa.com/","http://alexa.com/");
12 | 		check("http://archive.org/index.html","http://archive.org/index.html");
13 | 		check("http://archive.org/index.html?","http://archive.org/index.html");
14 | 		check("http://archive.org/index.html?a=b","http://archive.org/index.html?a=b");
15 | 		check("http://archive.org/index.html?b=b&a=b","http://archive.org/index.html?a=b&b=b");
16 | 		check("http://archive.org/index.html?b=a&b=b&a=b","http://archive.org/index.html?a=b&b=a&b=b");
17 | 		check("http://www34.archive.org/index.html?b=a&b=b&a=b","http://archive.org/index.html?a=b&b=a&b=b");
18 | 	}
19 | 
20 | 	private static void check(String orig, String want) throws URISyntaxException {
21 | 		HandyURL u = URLParser.parse(orig);
22 | 		ia.canonicalize(u);
23 | 		String got = u.getURLString();
24 | 		assertEquals(want,got);
25 | 		
26 | 		HandyURL u2 = URLParser.parse(got);
27 | 		ia.canonicalize(u2);
28 | 		String got2 = u2.getURLString();
29 | 		assertEquals("Second passs changed!",got,got2);
30 | 	}
31 | }
32 | 


--------------------------------------------------------------------------------
/src/test/java/org/archive/url/HandyURLTest.java:
--------------------------------------------------------------------------------
 1 | package org.archive.url;
 2 | 
 3 | import junit.framework.TestCase;
 4 | 
 5 | public class HandyURLTest extends TestCase {
 6 | 
 7 | 	public void testGetPublicSuffix() {
 8 | 		HandyURL h = new HandyURL();
 9 | 		h.setHost("www.fool.com");
10 | 		assertEquals("fool.com",h.getPublicSuffix());
11 | 		assertEquals("www",h.getPublicPrefix());
12 | 
13 | 		h.setHost("www.amazon.co.uk");
14 | 		assertEquals("amazon.co.uk",h.getPublicSuffix());
15 | 		assertEquals("www",h.getPublicPrefix());
16 | 
17 | 		h.setHost("www.images.amazon.co.uk");
18 | 		assertEquals("amazon.co.uk",h.getPublicSuffix());
19 | 		assertEquals("www.images",h.getPublicPrefix());
20 | 
21 | 		h.setHost("funky-images.fancy.co.jp");
22 | 		assertEquals("fancy.co.jp",h.getPublicSuffix());
23 | 		assertEquals("funky-images",h.getPublicPrefix());
24 | 	
25 | 	}
26 | 
27 | 	public void testGetPublicPrefix() {
28 | //		
29 | //		fail("Not yet implemented");
30 | 	}
31 | 
32 | }
33 | 


--------------------------------------------------------------------------------
/src/test/java/org/archive/url/OrdinaryIAURLCanonicalizerTest.java:
--------------------------------------------------------------------------------
 1 | package org.archive.url;
 2 | 
 3 | import java.net.URISyntaxException;
 4 | 
 5 | import junit.framework.TestCase;
 6 | 
 7 | public class OrdinaryIAURLCanonicalizerTest extends TestCase {
 8 | 	private OrdinaryIAURLCanonicalizer canon = new OrdinaryIAURLCanonicalizer();
 9 | 	
10 | 	public void testMisc() throws URISyntaxException {
11 | 		checkCanonicalization("http://...host..com..", "http://host.com/");
12 | 		checkCanonicalization("http://example.org:80/", "http://example.org/");
13 | 		checkCanonicalization("https://example.org:443/", "https://example.org/");
14 | 		checkCanonicalization("http://example.org:443/", "http://example.org:443/");
15 | 		checkCanonicalization("http://example.org/?", "http://example.org/");
16 | 		checkCanonicalization("http://example.org/foo?", "http://example.org/foo");
17 | 		checkCanonicalization("http://example.org/foo/?", "http://example.org/foo/");
18 | 	}
19 | 
20 | 	public void testSchemeCapitals() throws URISyntaxException {
21 | 		checkCanonicalization("Http://example.com", "http://example.com/");
22 | 		checkCanonicalization("HTTP://example.com", "http://example.com/");
23 | 		checkCanonicalization("ftP://example.com", "ftp://example.com/");
24 | 	}
25 | 	
26 | 	private void checkCanonicalization(String in, String want) throws URISyntaxException {
27 | 		HandyURL h = URLParser.parse(in);
28 | 		canon.canonicalize(h);
29 | 		String got = h.getURLString();
30 | 		assertEquals(want, got);
31 | 	}
32 | 
33 | }
34 | 


--------------------------------------------------------------------------------
/src/test/java/org/archive/util/ByteOpTest.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util;
 2 | 
 3 | import java.io.ByteArrayInputStream;
 4 | import java.io.ByteArrayOutputStream;
 5 | import java.io.DataInputStream;
 6 | import java.io.IOException;
 7 | 
 8 | import org.archive.util.ByteOp;
 9 | 
10 | import com.google.common.io.LittleEndianDataOutputStream;
11 | 
12 | import junit.framework.TestCase;
13 | 
14 | public class ByteOpTest extends TestCase {
15 | 
16 | 	public void testReadShort() throws IOException {
17 | 		byte a[] = new byte[]{0,1,2,3};
18 | 		ByteArrayInputStream bais = new ByteArrayInputStream(a);
19 | 		int bos = ByteOp.readShort(bais);
20 | 		System.out.format("BO.Read short(%d)\n", bos);
21 | 		DataInputStream dis = new DataInputStream(new ByteArrayInputStream(a));
22 | 		int disv = dis.readUnsignedShort();
23 | 		System.out.format("DI.Read short(%d)\n", disv);
24 | 		for(int i = 0; i < 256 * 256; i++) {
25 | 			ByteArrayOutputStream baos = new ByteArrayOutputStream(2);
26 | 			LittleEndianDataOutputStream dos = new LittleEndianDataOutputStream(baos);
27 | 			dos.writeShort(i);
28 | 			ByteArrayInputStream bais2 = new ByteArrayInputStream(baos.toByteArray());
29 | 			int gotI = ByteOp.readShort(bais2);
30 | 			assertEquals(i, gotI);
31 | 		}
32 | 	}
33 | 
34 | 	public void testAppend() {
35 | 		byte a[] = new byte[]{1};
36 | 		byte b[] = new byte[]{2};
37 | 		byte n[] = ByteOp.append(a,b);
38 | 		assertEquals(2,n.length);
39 | 		assertEquals(1,n[0]);
40 | 		assertEquals(2,n[1]);
41 | 
42 | 		byte a2[] = new byte[]{1,2,3,4};
43 | 		byte b2[] = new byte[]{5,6,7,8};
44 | 		byte n2[] = ByteOp.append(a2,b2);
45 | 		assertEquals(8,n2.length);
46 | 		assertEquals(1,n2[0]);
47 | 		assertEquals(2,n2[1]);
48 | 		assertEquals(5,n2[4]);
49 | 		
50 | 	}
51 | 
52 | 	public void testReadInt() {
53 | 	}
54 | 
55 | }
56 | 


--------------------------------------------------------------------------------
/src/test/java/org/archive/util/CrossProductTest.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.List;
 5 | 
 6 | import org.archive.util.CrossProduct;
 7 | 
 8 | import junit.framework.TestCase;
 9 | 
10 | /**
11 |  * Smoke tests for {@link CrossProduct}; results are printed rather than asserted.
12 |  */
13 | public class CrossProductTest extends TestCase {
14 | 	/**
15 | 	 * Prints the elements of one list as a single comma-separated line prefixed with "Dump:".
16 | 	 *
17 | 	 * @param a the values to print, each rendered with <code>toString()</code>
18 | 	 */
19 | 	private void dumpC(List<Object> a) {
20 | 		StringBuilder sb = new StringBuilder();
21 | 		// Starts true so that no separator is written ahead of the first element.
22 | 		boolean first = true;
23 | 		for(Object o : a) {
24 | 			if(first) {
25 | 				first = false;
26 | 			} else {
27 | 				sb.append(",");
28 | 			}
29 | 			sb.append(o.toString());
30 | 		}
31 | 		System.out.println("Dump:" + sb.toString());
32 | 	}
33 | 	/**
34 | 	 * Prints every inner list on its own line.
35 | 	 *
36 | 	 * @param coc the lists to print
37 | 	 */
38 | 	private void dumpLOL(List<List<Object>> coc) {
39 | 		for(List<Object> co : coc) {
40 | 			dumpC(co);
41 | 		}
42 | 	}
43 | 	/**
44 | 	 * Prints the value returned by {@link IAUtils#loadCommonsVersion()}; nothing is asserted.
45 | 	 */
46 | 	public void testVersion() {
47 | 		String version = IAUtils.loadCommonsVersion();
48 | 		System.out.format("Loaded version(%s)\n", version);
49 | 	}
50 | 	/**
51 | 	 * Feeds four lists (of sizes 2, 1, 2 and 4) to {@link CrossProduct#crossProduct} and prints
52 | 	 * the result; the test only fails if the call throws.
53 | 	 */
54 | 	public void testCrossProduct() {
55 | 		ArrayList<List<Object>> input = new ArrayList<List<Object>>();
56 | 		CrossProduct<Object> xp = new CrossProduct<Object>();
57 | 		input.add(AtoL("1","2"));
58 | 		input.add(AtoL("Charming"));
59 | 		input.add(AtoL("Berry","Elvis"));
60 | 		input.add(AtoL("a","b","c","d"));
61 | 		List<List<Object>> cross = xp.crossProduct(input);
62 | 		dumpLOL(cross);
63 | 	}
64 | 	/**
65 | 	 * Copies the arguments into a new mutable list, preserving their order.
66 | 	 *
67 | 	 * @param a the elements of the list
68 | 	 * @return a new list holding the given elements
69 | 	 */
70 | 	private List<Object> AtoL(Object... a) {
71 | 		ArrayList<Object> al = new ArrayList<Object>(a.length);
72 | 		for(Object s : a) {
73 | 			al.add(s);
74 | 		}
75 | 		return al;
76 | 	}
77 | }
78 | 


--------------------------------------------------------------------------------
/src/test/java/org/archive/util/PropertyUtilsTest.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  This file is part of the Heritrix web crawler (crawler.archive.org).
 3 |  *
 4 |  *  Licensed to the Internet Archive (IA) by one or more individual 
 5 |  *  contributors. 
 6 |  *
 7 |  *  The IA licenses this file to You under the Apache License, Version 2.0
 8 |  *  (the "License"); you may not use this file except in compliance with
 9 |  *  the License.  You may obtain a copy of the License at
10 |  *
11 |  *      http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  *  Unless required by applicable law or agreed to in writing, software
14 |  *  distributed under the License is distributed on an "AS IS" BASIS,
15 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  *  See the License for the specific language governing permissions and
17 |  *  limitations under the License.
18 |  */
19 | 
20 | package org.archive.util;
21 | 
22 | 
23 | import java.io.IOException;
24 | import java.util.Properties;
25 | 
26 | import junit.framework.TestCase;
27 | 
28 | 
29 | /**
30 |  * PropertyUtils tests. 
31 |  * 
32 |  * @author gojomo
33 |  * @version $Date: 2009-11-19 14:39:53 -0800 (Thu, 19 Nov 2009) $, $Revision: 6674 $
34 |  */
35 | public class PropertyUtilsTest extends TestCase {
36 | 
37 |     /**
38 |      * Checks that each <code>${name}</code> placeholder is replaced by the value
39 |      * stored under that name in the supplied {@link Properties}.
40 |      *
41 |      * @throws IOException propagated so that it fails the test should the call raise it
42 |      */
43 |     public void testSimpleInterpolate() throws IOException {
44 |         Properties props = new Properties();
45 |         props.put("foo", "OOF");
46 |         props.put("bar","RAB");
47 |         String original = "FOO|${foo}  BAR|${bar}";
48 |         String expected = "FOO|OOF  BAR|RAB";
49 |         assertEquals("interpolation problem",expected,PropertyUtils.interpolateWithProperties(original,props));
50 |     }
51 | }
52 | 


--------------------------------------------------------------------------------
/src/test/java/org/archive/util/StringFieldExtractorTest.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util;
 2 | 
 3 | import org.archive.util.StringFieldExtractor.StringTuple;
 4 | 
 5 | import junit.framework.TestCase;
 6 | 
 7 | /**
 8 |  * Tests for {@link StringFieldExtractor}: single-field extraction and two-way splitting.
 9 |  */
10 | public class StringFieldExtractorTest extends TestCase {
11 | 
12 | 	/**
13 | 	 * Extracts space-delimited fields at indexes 0 to 4 from inputs with and without
14 | 	 * leading or trailing delimiters.
15 | 	 */
16 | 	public void testExtract() {
17 | 		final String plain = "1 2 3 4";
18 | 		final String trailing = "1 2 3 4 ";
19 | 		final String padded = " 1 2 3 4 ";
20 | 		final String worded = "abc 1 2 3 4 ";
21 | 
22 | 		StringFieldExtractor field0 = new StringFieldExtractor(' ', 0);
23 | 		StringFieldExtractor field1 = new StringFieldExtractor(' ', 1);
24 | 		StringFieldExtractor field2 = new StringFieldExtractor(' ', 2);
25 | 		StringFieldExtractor field3 = new StringFieldExtractor(' ', 3);
26 | 		StringFieldExtractor field4 = new StringFieldExtractor(' ', 4);
27 | 
28 | 		// A field with no delimiter in front of it is absent; one right after a delimiter is empty.
29 | 		assertEquals("1",field0.extract(plain));
30 | 		assertEquals("2",field1.extract(plain));
31 | 		assertEquals("3",field2.extract(plain));
32 | 		assertEquals("4",field3.extract(plain));
33 | 		assertEquals(null,field4.extract(plain));
34 | 		assertEquals("",field4.extract(trailing));
35 | 		assertEquals("",field0.extract(padded));
36 | 		assertEquals("1",field1.extract(padded));
37 | 		assertEquals("2",field2.extract(padded));
38 | 		assertEquals("abc",field0.extract(worded));
39 | 		assertEquals("1",field1.extract(worded));
40 | 	}
41 | 
42 | 	/**
43 | 	 * Asserts both halves of a split result.
44 | 	 *
45 | 	 * @param wantFirst expected value of the tuple's <code>first</code> field
46 | 	 * @param wantSecond expected value of the tuple's <code>second</code> field
47 | 	 * @param actual the tuple under test
48 | 	 */
49 | 	private void checkSplit(String wantFirst, String wantSecond, StringTuple actual) {
50 | 		assertEquals(wantFirst,actual.first);
51 | 		assertEquals(wantSecond,actual.second);
52 | 	}
53 | 
54 | 	/**
55 | 	 * Splits space-delimited input with an extractor built for field index 2, including
56 | 	 * inputs that hold fewer than three fields.
57 | 	 */
58 | 	public void testSplit() {
59 | 		StringFieldExtractor splitter = new StringFieldExtractor(' ',2);
60 | 		checkSplit("a b","x y",splitter.split("a b x y"));
61 | 		checkSplit("ab ","x y",splitter.split("ab  x y"));
62 | 		checkSplit("ab x","y z",splitter.split("ab x y z"));
63 | 		checkSplit("ab x","y z",splitter.split("ab x y z"));
64 | 		checkSplit("ab",null,splitter.split("ab"));
65 | 		checkSplit("ab x",null,splitter.split("ab x"));
66 | 	}
67 | }
68 | 


--------------------------------------------------------------------------------
/src/test/java/org/archive/util/TestUtils.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.InputStream;
 5 | import java.util.List;
 6 | 
 7 | import junit.framework.TestCase;
 8 | 
 9 | 
10 | import com.google.common.io.ByteStreams;
11 | 
12 | /**
13 |  * Shared printing and assertion helpers for tests, packaged as a {@link TestCase}
14 |  * with one trivial test method.
15 |  */
16 | public class TestUtils extends TestCase {
17 | 	/**
18 | 	 * Trivial test that always passes.
19 | 	 */
20 | 	public void testNothing() {
21 | 		assertEquals(2,1+1);
22 | 	}
23 | 	/**
24 | 	 * Prints the number of matches followed by one line per match.
25 | 	 *
26 | 	 * @param context label printed in the header line
27 | 	 * @param res the matches; each inner list is rendered with <code>StringParse.join</code>
28 | 	 */
29 | 	public static void dumpMatch(String context, List<List<String>> res) {
30 | 
31 | 		System.out.format("Context(%s) Found (%d) matches\n", context, res.size());
32 | 		for(List<String> r : res) {
33 | 			System.out.format("Match(%s)\n", StringParse.join(r));
34 | 		}
35 | 
36 | 	}
37 | 	/**
38 | 	 * Asserts that a list of lists has the same shape and the same elements as a
39 | 	 * two-dimensional array.
40 | 	 *
41 | 	 * @param want the expected rows
42 | 	 * @param got the actual rows
43 | 	 */
44 | 	public static void assertLoLMatches(String want[][], List<List<String>> got) {
45 | 		assertEquals(want.length,got.size());
46 | 		for(int i = 0; i < want.length; i++) {
47 | 			String [] wantSub = want[i];
48 | 			List<String> gotSub = got.get(i);
49 | 			assertEquals(wantSub.length,gotSub.size());
50 | 			for(int j = 0; j < wantSub.length; j++) {
51 | 				assertEquals(wantSub[j],gotSub.get(j));
52 | 			}
53 | 		}
54 | 	}
55 | 	/**
56 | 	 * Reads the stream to its end and asserts that its content matches the given bytes.
57 | 	 *
58 | 	 * @param is the stream to drain
59 | 	 * @param b the expected content
60 | 	 * @throws IOException if reading the stream fails
61 | 	 */
62 | 	public static void assertStreamEquals(InputStream is,byte b[]) throws IOException {
63 | 		byte got[] = ByteStreams.toByteArray(is);
64 | 		// Expected value goes first so that a failure message labels the two lengths correctly.
65 | 		assertEquals(b.length,got.length);
66 | 		// NOTE(review): presumably ByteOp.cmp returns true for equal content - confirm.
67 | 		assertTrue(ByteOp.cmp(got,b));
68 | 	}
69 | }
70 | 


--------------------------------------------------------------------------------
/src/test/java/org/archive/util/iterator/CachingStringFilterTest.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util.iterator;
 2 | 
 3 | import junit.framework.TestCase;
 4 | 
 5 | /**
 6 |  * Exercises {@link CachingStringFilter} with a repeating sequence of keys.
 7 |  */
 8 | public class CachingStringFilterTest extends TestCase {
 9 | 	/**
10 | 	 * Runs seven lookups over four distinct keys against a filter constructed with the
11 | 	 * argument 3 (presumably the cache capacity). Nothing is asserted; the test fails only
12 | 	 * if a call throws.
13 | 	 */
14 | 	public void testCache() {
15 | 		StringFilter alwaysFiltered = new StringFilter() {
16 | 			public boolean isFiltered(String text) {
17 | 				return true;
18 | 			}
19 | 		};
20 | 		CachingStringFilter cache = new CachingStringFilter(alwaysFiltered, 3);
21 | 		// Same lookups, in the same order, as individual calls would make.
22 | 		String lookups[] = {"one","one","two","one","three","two","four"};
23 | 		for(String key : lookups) {
24 | 			cache.isFiltered(key);
25 | 		}
26 | 	}
27 | }
28 | 


--------------------------------------------------------------------------------
/src/test/java/org/archive/util/iterator/FilterStringIteratorTest.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util.iterator;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.Arrays;
 5 | import java.util.List;
 6 | import java.util.TreeSet;
 7 | 
 8 | import junit.framework.TestCase;
 9 | 
10 | public class FilterStringIteratorTest extends TestCase {
11 | 
12 | 	public void t2estHasNext() {
13 | 		String blocks[] = {"a","ab","ba","cc"};
14 | 		
15 | 		List<String> bl = Arrays.asList(blocks);
16 | 		TransformingPrefixStringFilter f = new TransformingPrefixStringFilter(bl);
17 | 		assertBlocked(true,"a",f);
18 | 		assertBlocked(true,"ab",f);
19 | 		assertBlocked(true,"ac",f);
20 | 		assertBlocked(true,"acca",f);
21 | 		assertBlocked(false,"b",f);
22 | 		assertBlocked(true,"ba",f);
23 | 		assertBlocked(true,"bac",f);
24 | 		assertBlocked(false,"bc",f);
25 | 		assertBlocked(false,"ca",f);
26 | 		assertBlocked(true,"cc",f);
27 | 		assertBlocked(true,"cca",f);
28 | 	}
29 | 	
30 | 	public void testTreeSet() {
31 | 		String blocks[] = {"a","ab","ba","cc"};
32 | 		TreeSet<String> s = TransformingPrefixStringFilter.makeTreeSet(Arrays.asList(blocks),null);
33 | 		assertTrue(s.contains("a"));
34 | 		assertFalse(s.contains("ab"));
35 | 
36 | 		String blocks2[] = {"ab","a","ba","cc"};
37 | 		TreeSet<String> s2 = TransformingPrefixStringFilter.makeTreeSet(Arrays.asList(blocks2),null);
38 | 		assertTrue(s2.contains("a"));
39 | 		assertFalse(s2.contains("ab"));
40 | 
41 | 		
42 | 		
43 | 	}
44 | 	
45 | 	
46 | 	private void assertBlocked(boolean blocked, String s, StringFilter f) {
47 | 		ArrayList<String> l = new ArrayList<String>();
48 | 		l.add(s);
49 | 		FilterStringIterator i = new FilterStringIterator(l.iterator(), f);
50 | 		if(blocked) {
51 | 			assertFalse(i.hasNext());
52 | 		} else {
53 | 			assertTrue(i.hasNext());
54 | 			assertEquals(s,i.next());
55 | 		}
56 | 	}
57 | }
58 | 


--------------------------------------------------------------------------------
/src/test/java/org/archive/util/iterator/SortedCompositeIteratorTest.java:
--------------------------------------------------------------------------------
 1 | package org.archive.util.iterator;
 2 | 
 3 | import java.io.BufferedReader;
 4 | import java.io.File;
 5 | import java.io.FileNotFoundException;
 6 | import java.io.FileReader;
 7 | import java.io.IOException;
 8 | import java.io.PrintWriter;
 9 | import java.util.Comparator;
10 | 
11 | import junit.framework.TestCase;
12 | 
13 | /**
14 |  * Tests merging of sorted inputs by {@link SortedCompositeIterator}.
15 |  */
16 | public class SortedCompositeIteratorTest extends TestCase {
17 | 
18 | 	/**
19 | 	 * Merges two temporary files holding the lines "1","3" and "2","4" and expects the
20 | 	 * lines back in the order 1, 2, 3, 4. Readers are closed and the files removed even
21 | 	 * when an assertion fails.
22 | 	 *
23 | 	 * @throws FileNotFoundException if a temporary file cannot be opened
24 | 	 * @throws IOException if a temporary file cannot be created
25 | 	 */
26 | 	public void testHasNext() throws FileNotFoundException, IOException {
27 | 
28 | 		File a = File.createTempFile("filea", null);
29 | 		File b = null;
30 | 		BufferedReader abr = null;
31 | 		BufferedReader bbr = null;
32 | 		try {
33 | 			b = File.createTempFile("fileb", null);
34 | 			writeLines(a, "1", "3");
35 | 			writeLines(b, "2", "4");
36 | 			abr = new BufferedReader(new FileReader(a));
37 | 			bbr = new BufferedReader(new FileReader(b));
38 | 			SortedCompositeIterator<String> sci = new SortedCompositeIterator<String>(new Comparator<String>() {
39 | 
40 | 				@Override
41 | 				public int compare(String o1, String o2) {
42 | 					return o1.compareTo(o2);
43 | 				}
44 | 
45 | 			});
46 | 			sci.addIterator(AbstractPeekableIterator.wrapReader(abr));
47 | 			sci.addIterator(AbstractPeekableIterator.wrapReader(bbr));
48 | 			assertTrue(sci.hasNext());
49 | 			assertEquals("1",sci.next());
50 | 			assertTrue(sci.hasNext());
51 | 			assertEquals("2",sci.next());
52 | 			assertTrue(sci.hasNext());
53 | 			assertEquals("3",sci.next());
54 | 			assertTrue(sci.hasNext());
55 | 			assertEquals("4",sci.next());
56 | 		} finally {
57 | 			// Close before deleting: an open handle can prevent deletion on some platforms.
58 | 			closeQuietly(abr);
59 | 			closeQuietly(bbr);
60 | 			a.delete();
61 | 			if(b != null) {
62 | 				b.delete();
63 | 			}
64 | 		}
65 | 	}
66 | 
67 | 	/**
68 | 	 * Writes the given lines to a file, replacing its content, and closes the writer.
69 | 	 *
70 | 	 * @param target the file to write
71 | 	 * @param lines the lines to write, in order
72 | 	 * @throws FileNotFoundException if the file cannot be opened for writing
73 | 	 */
74 | 	private static void writeLines(File target, String... lines) throws FileNotFoundException {
75 | 		PrintWriter pw = new PrintWriter(target);
76 | 		try {
77 | 			for(String line : lines) {
78 | 				pw.println(line);
79 | 			}
80 | 		} finally {
81 | 			pw.close();
82 | 		}
83 | 	}
84 | 
85 | 	/**
86 | 	 * Closes a reader if one was opened, ignoring any failure so that cleanup never
87 | 	 * masks the outcome of the test itself.
88 | 	 *
89 | 	 * @param reader the reader to close; may be null
90 | 	 */
91 | 	private static void closeQuietly(BufferedReader reader) {
92 | 		if(reader == null) {
93 | 			return;
94 | 		}
95 | 		try {
96 | 			reader.close();
97 | 		} catch(IOException e) {
98 | 			// Best-effort cleanup only.
99 | 		}
100 | 	}
101 | 
102 | }
103 | 


--------------------------------------------------------------------------------
/src/test/resources/org/archive/format/arc/IAH-20080430204825-00000-blackbook-truncated.arc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/commoncrawl/ia-web-commons/1229f7a5f35f4fcf1dda9708e90e4199b54c3531/src/test/resources/org/archive/format/arc/IAH-20080430204825-00000-blackbook-truncated.arc


--------------------------------------------------------------------------------
/src/test/resources/org/archive/format/gzip/IAH-urls-wget.warc.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/commoncrawl/ia-web-commons/1229f7a5f35f4fcf1dda9708e90e4199b54c3531/src/test/resources/org/archive/format/gzip/IAH-urls-wget.warc.gz


--------------------------------------------------------------------------------
/src/test/resources/org/archive/format/gzip/abcd.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/commoncrawl/ia-web-commons/1229f7a5f35f4fcf1dda9708e90e4199b54c3531/src/test/resources/org/archive/format/gzip/abcd.gz


--------------------------------------------------------------------------------
/src/test/resources/org/archive/format/gzip/double-single-inflate-error.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/commoncrawl/ia-web-commons/1229f7a5f35f4fcf1dda9708e90e4199b54c3531/src/test/resources/org/archive/format/gzip/double-single-inflate-error.gz


--------------------------------------------------------------------------------
/src/test/resources/org/archive/format/gzip/empty.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/commoncrawl/ia-web-commons/1229f7a5f35f4fcf1dda9708e90e4199b54c3531/src/test/resources/org/archive/format/gzip/empty.gz


--------------------------------------------------------------------------------
/src/test/resources/org/archive/format/gzip/hi-2.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/commoncrawl/ia-web-commons/1229f7a5f35f4fcf1dda9708e90e4199b54c3531/src/test/resources/org/archive/format/gzip/hi-2.gz


--------------------------------------------------------------------------------
/src/test/resources/org/archive/format/gzip/hi.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/commoncrawl/ia-web-commons/1229f7a5f35f4fcf1dda9708e90e4199b54c3531/src/test/resources/org/archive/format/gzip/hi.gz


--------------------------------------------------------------------------------
/src/test/resources/org/archive/format/warc/IAH-urls-wget.warc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/commoncrawl/ia-web-commons/1229f7a5f35f4fcf1dda9708e90e4199b54c3531/src/test/resources/org/archive/format/warc/IAH-urls-wget.warc


--------------------------------------------------------------------------------
/src/test/resources/org/archive/resource/html/meta-itemprop.warc:
--------------------------------------------------------------------------------
 1 | WARC/1.0
 2 | WARC-Type: response
 3 | WARC-Date: 2024-12-05T10:47:02Z
 4 | Content-Length: 710
 5 | Content-Type: application/http; msgtype=response
 6 | WARC-Target-URI: https://www.example.org/
 7 | WARC-Identified-Payload-Type: text/html
 8 | 
 9 | HTTP/1.1 200 
10 | content-type: text/html; charset=UTF-8
11 | 
12 | <!DOCTYPE html>
13 | <html lang="en">
14 | <head>
15 | 	<meta charset="UTF-8">
16 | 	<meta name=robots content="index,follow">
17 | 	<title>Test</title>
18 | </head>
19 | <body>
20 | <!-- from https://schema.org/docs/gs.html#advanced_missing -->
21 | <div itemscope itemtype="https://schema.org/Offer">
22 |   <span itemprop="name">Blend-O-Matic</span>
23 |   <span itemprop="price">$19.95</span>
24 |   <div itemprop="reviews" itemscope itemtype="https://schema.org/AggregateRating">
25 |     <img src="four-stars.jpg" />
26 |     <meta itemprop="ratingValue" content="4" />
27 |     <meta itemprop="bestRating" content="5" />
28 |     Based on <span itemprop="ratingCount">25</span> user ratings
29 |   </div>
30 | </div>
31 | </body>
32 | </html>
33 | 
34 | 
35 | 
36 | 


--------------------------------------------------------------------------------
/src/test/resources/org/archive/resource/html/text-extraction-test.warc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/commoncrawl/ia-web-commons/1229f7a5f35f4fcf1dda9708e90e4199b54c3531/src/test/resources/org/archive/resource/html/text-extraction-test.warc


--------------------------------------------------------------------------------
/src/test/resources/org/archive/resource/html/title-extraction-embedded-SVG.warc:
--------------------------------------------------------------------------------
 1 | WARC/1.0
 2 | WARC-Type: response
 3 | WARC-Record-ID: <urn:uuid:9043ba74-5d11-4dad-97c1-d7454f8b7358>
 4 | WARC-Target-URI: https://www.example.org/testEmbeddedSVG.html
 5 | WARC-Date: 2024-10-14T10:05:41Z
 6 | WARC-IP-Address: 127.0.0.1
 7 | WARC-Block-Digest: sha1:XNN4JA3QDUN4DDEGTIPH5ZRORHYL657F
 8 | WARC-Payload-Digest: sha1:4FUACFTG3WCL26OITZNMEPRKFP6WAAHN
 9 | Content-Type: application/http;msgtype=response
10 | Content-Length: 856
11 | 
12 | HTTP/1.1 200 OK
13 | Date: Mon, 14 Oct 2024 10:05:41 GMT
14 | Server: Apache/2.4.58 (Ubuntu)
15 | Upgrade: h2,h2c
16 | Connection: Upgrade, Keep-Alive
17 | Last-Modified: Mon, 14 Oct 2024 10:04:25 GMT
18 | ETag: "20a-6246cf6287f50"
19 | Accept-Ranges: bytes
20 | Content-Length: 522
21 | Vary: Accept-Encoding
22 | Keep-Alive: timeout=5, max=100
23 | Content-Type: text/html
24 | 
25 | <!DOCTYPE html>
26 | <html>
27 | <head>
28 | <title>Testing title extraction with embedded SVG</title>
29 | <meta charset="utf-8">
30 | </head>
31 | <body>
32 |   <div>
33 |     <header>Testing title extraction with embedded SVG</header>
34 |     <p>This is body text...</p>
35 |     <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 400 400" fill="currentColor" width="1em">
36 |       <title>Embedded SVG</title>
37 |       <rect x="0" y="0" width="100%" height="100%" fill="lightblue"/>
38 |       <circle cx="100" cy="100" r="50" fill="red"/>
39 |     </svg>
40 |   </div>
41 | </body>
42 | </html>
43 | 
44 | 
45 | 
46 | 


--------------------------------------------------------------------------------