├── .codecov.yml ├── .github ├── ISSUE_TEMPLATE │ ├── Bug_report.md │ └── Feature_request.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── mvn-build.yml │ ├── python-formatter.yml │ └── scala-formatter.yml ├── .gitignore ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── config └── LICENSE_HEADER.txt ├── pom.xml └── src ├── main ├── assembly │ └── python.xml ├── python │ └── aut │ │ ├── __init__.py │ │ ├── app.py │ │ ├── common.py │ │ └── udfs.py ├── resources │ └── log4j.properties └── scala │ └── io │ └── archivesunleashed │ ├── ArchiveRecord.scala │ ├── SparklingArchiveRecord.scala │ ├── app │ ├── AudioInformationExtractor.scala │ ├── CommandLineApp.scala │ ├── CssInformationExtractor.scala │ ├── DomainFrequencyExtractor.scala │ ├── DomainGraphExtractor.scala │ ├── ExtractPopularImages.scala │ ├── ExtractPopularImagesDF.scala │ ├── HtmlInformationExtractor.scala │ ├── ImageGraphExtractor.scala │ ├── ImageInformationExtractor.scala │ ├── JsInformationExtractor.scala │ ├── JsonInformationExtractor.scala │ ├── PDFInformationExtractor.scala │ ├── PlainTextExtractor.scala │ ├── PlainTextInformationExtractor.scala │ ├── PresentationProgramInformationExtractor.scala │ ├── SpreadsheetInformationExtractor.scala │ ├── VideoInformationExtractor.scala │ ├── WebGraphExtractor.scala │ ├── WebPagesExtractor.scala │ ├── WordProcessorInformationExtractor.scala │ ├── WriteGEXF.scala │ ├── WriteGraphML.scala │ └── XmlInformationExtractor.scala │ ├── df │ ├── DataFrameLoader.scala │ └── package.scala │ ├── matchbox │ ├── ComputeImageSize.scala │ ├── ComputeMD5.scala │ ├── ComputeSHA1.scala │ ├── CovertLastModifiedDate.scala │ ├── DetectLanguage.scala │ ├── DetectMimeTypeTika.scala │ ├── ExtractBoilerpipeText.scala │ ├── ExtractDate.scala │ ├── ExtractDomain.scala │ ├── ExtractImageDetails.scala │ ├── ExtractImageLinks.scala │ ├── ExtractLinks.scala │ ├── ExtractTextFromPDFs.scala │ ├── GetExtensionMIME.scala │ ├── RemoveHTML.scala │ ├── RemoveHTTPHeader.scala │ └── package.scala │ ├── package.scala │ └── udfs │ └── package.scala └── test ├── resources ├── arc │ ├── badexample.arc.gz │ └── example.arc.gz └── warc │ ├── example.docs.warc.gz │ ├── example.media.warc.gz │ ├── example.pdf.warc.gz │ ├── example.txt.warc.gz │ ├── example.warc.gz │ ├── issue-493.warc │ └── issue-514.warc └── scala └── io └── archivesunleashed ├── ArcTest.scala ├── ArchiveRecordTest.scala ├── CountableRDDTest.scala ├── RecordDFTest.scala ├── RecordLoaderTest.scala ├── RecordRDDTest.scala ├── WarcTest.scala ├── app ├── AudioInformationExtractorTest.scala ├── CommandLineAppTest.scala ├── CssInformationExtractorTest.scala ├── DomainFrequencyExtractorTest.scala ├── DomainGraphExtractorTest.scala ├── ExtractPopularImagesDFTest.scala ├── ExtractPopularImagesTest.scala ├── HtmlInformationExtractorTest.scala ├── ImageGraphExtractorTest.scala ├── ImageInformationExtractorTest.scala ├── JsInformationExtractorTest.scala ├── JsonInfromationExtractor.scala ├── PDFInformationExtractorTest.scala ├── PlainTextExtractorTest.scala ├── PlainTextInformationExtractor.scala ├── PresentationProgramInformationExtractorTest.scala ├── SpreadsheetInformationExtractorTest.scala ├── VideoInformationExtractorTest.scala ├── WebGraphExtractorTest.scala ├── WebPagesExtractorTest.scala ├── WordProcessorInformationExtractorTest.scala ├── WriteGEXFTest.scala ├── WriteGraphMLTest.scala └── XmlInfromationExtractor.scala ├── df ├── DataFrameLoaderTest.scala ├── ExtractAudioDetailsTest.scala ├── ExtractDateDFTest.scala ├── ExtractHyperlinksTest.scala ├── ExtractImageDetailsTest.scala ├── ExtractImageLinksTest.scala ├── ExtractPDFDetailsTest.scala ├── ExtractPresentationProgramDetailsTest.scala ├── ExtractSpreadsheetDetailsTest.scala ├── ExtractVideoDetailsTest.scala ├── ExtractWordProcessorDetailsTest.scala ├── SaveMediaBytesTest.scala ├── SimpleDfTest.scala └── UdfsTests.scala ├── issues ├── Issue493Test.scala └── WgetWarcTest.scala └── matchbox ├── ComputeImageSizeTest.scala ├── ExtractBoilerPipeTextTest.scala ├── ExtractDateTest.scala ├── ExtractDomainTest.scala ├── ExtractImageLinksTest.scala ├── ExtractLinksTest.scala ├── ExtractTextFromPDFsTest.scala ├── GetExtensionMIMETest.scala ├── RemoveHTMLTest.scala ├── RemoveHTTPHeaderTest.scala └── StringUtilsTest.scala /.codecov.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/.codecov.yml -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/Bug_report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/.github/ISSUE_TEMPLATE/Bug_report.md -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/Feature_request.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/.github/ISSUE_TEMPLATE/Feature_request.md -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/.github/PULL_REQUEST_TEMPLATE.md -------------------------------------------------------------------------------- /.github/workflows/mvn-build.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/.github/workflows/mvn-build.yml -------------------------------------------------------------------------------- /.github/workflows/python-formatter.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/.github/workflows/python-formatter.yml -------------------------------------------------------------------------------- /.github/workflows/scala-formatter.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/.github/workflows/scala-formatter.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/.gitignore -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/CHANGELOG.md -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/CODE_OF_CONDUCT.md -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/CONTRIBUTING.md -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/README.md -------------------------------------------------------------------------------- /config/LICENSE_HEADER.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/config/LICENSE_HEADER.txt -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/pom.xml -------------------------------------------------------------------------------- /src/main/assembly/python.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/assembly/python.xml -------------------------------------------------------------------------------- /src/main/python/aut/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/python/aut/__init__.py -------------------------------------------------------------------------------- /src/main/python/aut/app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/python/aut/app.py -------------------------------------------------------------------------------- /src/main/python/aut/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/python/aut/common.py -------------------------------------------------------------------------------- /src/main/python/aut/udfs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/python/aut/udfs.py -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/resources/log4j.properties -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/ArchiveRecord.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/ArchiveRecord.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/app/CommandLineApp.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/app/CssInformationExtractor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/app/CssInformationExtractor.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/app/DomainFrequencyExtractor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/app/DomainFrequencyExtractor.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/app/DomainGraphExtractor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/app/DomainGraphExtractor.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/app/ExtractPopularImages.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/app/ExtractPopularImages.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/app/ExtractPopularImagesDF.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/app/ExtractPopularImagesDF.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/app/HtmlInformationExtractor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/app/HtmlInformationExtractor.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/app/ImageGraphExtractor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/app/ImageGraphExtractor.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/app/JsInformationExtractor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/app/JsInformationExtractor.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/app/JsonInformationExtractor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/app/JsonInformationExtractor.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/app/PlainTextExtractor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/app/PlainTextExtractor.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/app/PlainTextInformationExtractor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/app/PlainTextInformationExtractor.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/app/SpreadsheetInformationExtractor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/app/SpreadsheetInformationExtractor.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/app/VideoInformationExtractor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/app/VideoInformationExtractor.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/app/WebGraphExtractor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/app/WebGraphExtractor.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/app/WebPagesExtractor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/app/WebPagesExtractor.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/app/WordProcessorInformationExtractor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/app/WordProcessorInformationExtractor.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/app/WriteGEXF.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/app/WriteGEXF.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/app/WriteGraphML.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/app/WriteGraphML.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/app/XmlInformationExtractor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/app/XmlInformationExtractor.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/df/DataFrameLoader.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/df/DataFrameLoader.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/df/package.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/df/package.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/matchbox/ComputeImageSize.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/matchbox/ComputeImageSize.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/matchbox/ComputeMD5.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/matchbox/ComputeMD5.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/matchbox/ComputeSHA1.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/matchbox/ComputeSHA1.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/matchbox/CovertLastModifiedDate.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/matchbox/CovertLastModifiedDate.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/matchbox/DetectLanguage.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/matchbox/DetectLanguage.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/matchbox/DetectMimeTypeTika.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/matchbox/DetectMimeTypeTika.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/matchbox/ExtractBoilerpipeText.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/matchbox/ExtractBoilerpipeText.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/matchbox/ExtractDate.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/matchbox/ExtractDate.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/matchbox/ExtractImageDetails.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/matchbox/ExtractImageDetails.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/matchbox/ExtractImageLinks.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/matchbox/ExtractImageLinks.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/matchbox/ExtractLinks.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/matchbox/ExtractLinks.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/matchbox/ExtractTextFromPDFs.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/matchbox/ExtractTextFromPDFs.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/matchbox/GetExtensionMIME.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/matchbox/GetExtensionMIME.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/matchbox/RemoveHTML.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/matchbox/RemoveHTML.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/matchbox/RemoveHTTPHeader.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/matchbox/RemoveHTTPHeader.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/matchbox/package.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/matchbox/package.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/package.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/package.scala -------------------------------------------------------------------------------- /src/main/scala/io/archivesunleashed/udfs/package.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/main/scala/io/archivesunleashed/udfs/package.scala -------------------------------------------------------------------------------- /src/test/resources/arc/badexample.arc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/resources/arc/badexample.arc.gz -------------------------------------------------------------------------------- /src/test/resources/arc/example.arc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/resources/arc/example.arc.gz -------------------------------------------------------------------------------- /src/test/resources/warc/example.docs.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/resources/warc/example.docs.warc.gz -------------------------------------------------------------------------------- /src/test/resources/warc/example.media.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/resources/warc/example.media.warc.gz -------------------------------------------------------------------------------- /src/test/resources/warc/example.pdf.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/resources/warc/example.pdf.warc.gz -------------------------------------------------------------------------------- /src/test/resources/warc/example.txt.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/resources/warc/example.txt.warc.gz -------------------------------------------------------------------------------- /src/test/resources/warc/example.warc.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/resources/warc/example.warc.gz -------------------------------------------------------------------------------- /src/test/resources/warc/issue-493.warc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/resources/warc/issue-493.warc -------------------------------------------------------------------------------- /src/test/resources/warc/issue-514.warc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/resources/warc/issue-514.warc -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/ArcTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/ArcTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/CountableRDDTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/CountableRDDTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/RecordDFTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/RecordDFTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/RecordLoaderTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/RecordLoaderTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/RecordRDDTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/RecordRDDTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/WarcTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/WarcTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/app/AudioInformationExtractorTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/app/AudioInformationExtractorTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/app/CssInformationExtractorTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/app/CssInformationExtractorTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/app/DomainFrequencyExtractorTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/app/DomainFrequencyExtractorTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/app/DomainGraphExtractorTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/app/DomainGraphExtractorTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/app/ExtractPopularImagesDFTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/app/ExtractPopularImagesDFTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/app/ExtractPopularImagesTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/app/ExtractPopularImagesTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/app/HtmlInformationExtractorTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/app/HtmlInformationExtractorTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/app/ImageGraphExtractorTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/app/ImageGraphExtractorTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/app/ImageInformationExtractorTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/app/ImageInformationExtractorTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/app/JsInformationExtractorTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/app/JsInformationExtractorTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/app/JsonInfromationExtractor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/app/JsonInfromationExtractor.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/app/PDFInformationExtractorTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/app/PDFInformationExtractorTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/app/PlainTextExtractorTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/app/PlainTextExtractorTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/app/PlainTextInformationExtractor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/app/PlainTextInformationExtractor.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/app/PresentationProgramInformationExtractorTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/app/PresentationProgramInformationExtractorTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/app/SpreadsheetInformationExtractorTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/app/SpreadsheetInformationExtractorTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/app/VideoInformationExtractorTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/app/VideoInformationExtractorTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/app/WebGraphExtractorTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/app/WebGraphExtractorTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/app/WebPagesExtractorTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/app/WebPagesExtractorTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/app/WordProcessorInformationExtractorTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/app/WordProcessorInformationExtractorTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/app/WriteGEXFTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/app/WriteGEXFTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/app/WriteGraphMLTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/app/WriteGraphMLTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/app/XmlInfromationExtractor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/app/XmlInfromationExtractor.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/df/ExtractAudioDetailsTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/df/ExtractAudioDetailsTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/df/ExtractDateDFTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/df/ExtractDateDFTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/df/ExtractHyperlinksTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/df/ExtractHyperlinksTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/df/ExtractImageDetailsTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/df/ExtractImageDetailsTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/df/ExtractImageLinksTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/df/ExtractImageLinksTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/df/ExtractPDFDetailsTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/df/ExtractPDFDetailsTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/df/ExtractPresentationProgramDetailsTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/df/ExtractPresentationProgramDetailsTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/df/ExtractSpreadsheetDetailsTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/df/ExtractSpreadsheetDetailsTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/df/ExtractVideoDetailsTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/df/ExtractVideoDetailsTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/df/ExtractWordProcessorDetailsTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/df/ExtractWordProcessorDetailsTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/df/SaveMediaBytesTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/df/SaveMediaBytesTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/df/UdfsTests.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/df/UdfsTests.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/issues/Issue493Test.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/issues/Issue493Test.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/issues/WgetWarcTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/issues/WgetWarcTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/matchbox/ComputeImageSizeTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/matchbox/ComputeImageSizeTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/matchbox/ExtractBoilerPipeTextTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/matchbox/ExtractBoilerPipeTextTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/matchbox/ExtractDateTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/matchbox/ExtractDateTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/matchbox/ExtractImageLinksTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/matchbox/ExtractImageLinksTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/matchbox/ExtractLinksTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/matchbox/ExtractLinksTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/matchbox/ExtractTextFromPDFsTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/matchbox/ExtractTextFromPDFsTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/matchbox/GetExtensionMIMETest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/matchbox/GetExtensionMIMETest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/matchbox/RemoveHTMLTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/matchbox/RemoveHTMLTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/matchbox/RemoveHTTPHeaderTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/matchbox/RemoveHTTPHeaderTest.scala -------------------------------------------------------------------------------- /src/test/scala/io/archivesunleashed/matchbox/StringUtilsTest.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/archivesunleashed/aut/HEAD/src/test/scala/io/archivesunleashed/matchbox/StringUtilsTest.scala --------------------------------------------------------------------------------