├── .gitignore ├── CONTRIBUTORS.md ├── LICENSE ├── README.md ├── docs ├── Approach_Publications.md ├── Building.md ├── Config.md ├── Contribute.md ├── DataSpecs.md ├── Dev_DataSpecs.md ├── Dev_EnrichFuncs.md ├── EnrichFuncs.md ├── General_Usage.md ├── Installing_Jupyter.md ├── Operations.md ├── README.md ├── Recipes.md ├── Related_Projects.md ├── Using_Jupyter.md ├── Using_Library.md ├── approach.png └── screenshots │ ├── Jupyter.png │ ├── Jupyter_example.png │ ├── Jupyter_imports.png │ ├── Jupyter_notebook.png │ └── Jupyter_shutdown.png ├── docs_button.png ├── logo.png ├── notebooks ├── Analyzing_Term-Distributions.ipynb ├── Demo.ipynb ├── Downloading_WARC_from_Wayback.ipynb ├── Extracting_Embeds.ipynb ├── Generating_CDX.ipynb ├── IEEE_BigData_2017.ipynb ├── Link_Extraction.ipynb └── Selected_Title-and-Text.ipynb ├── project ├── assembly.sbt ├── build.properties └── plugins.sbt └── src └── main └── scala └── org └── archive └── webservices └── archivespark ├── ArchiveSpark.scala ├── DistributedConfig.scala ├── dataspecs ├── DataSpec.scala ├── HadoopDataSpec.scala ├── OutputSpec.scala ├── TextDataSpec.scala ├── TextFileDataSpec.scala └── access │ ├── ByteArrayAccessor.scala │ ├── CloseableDataAccessor.scala │ ├── DataAccessor.scala │ ├── HdfsFileAccessor.scala │ ├── HdfsLocationInfo.scala │ ├── HdfsStreamAccessor.scala │ ├── HdfsTextAccessor.scala │ ├── HdfsTextFileAccessor.scala │ └── HttpTextAccessor.scala ├── functions ├── AbsoluteUrl.scala ├── Data.scala ├── Entities.scala ├── Html.scala ├── HtmlAttribute.scala ├── HtmlText.scala ├── LowerCase.scala ├── SURT.scala ├── StringContent.scala └── Values.scala ├── implicits ├── EnrichableRDD.scala ├── GenericHelpersRDD.scala ├── JsonConvertibleRDD.scala ├── SimplifiedGetterEnrichRoot.scala └── StringRDD.scala ├── model ├── BasicEnrichFunc.scala ├── BoundEnrichFunc.scala ├── BoundMultiEnrichFunc.scala ├── DataEnrichRoot.scala ├── Derivatives.scala ├── EnrichFunc.scala ├── EnrichRoot.scala ├── EnrichRootCompanion.scala ├── Enrichable.scala ├── IdentityField.scala ├── MultiEnrichFunc.scala ├── MultiValueEnrichable.scala ├── SingleValueEnrichable.scala ├── dataloads │ ├── ByteLoad.scala │ ├── DataLoad.scala │ └── TextLoad.scala └── pointers │ ├── DataLoadPointer.scala │ ├── DependentFieldPointer.scala │ ├── FieldPointer.scala │ ├── MultiFieldPointer.scala │ ├── MultiToSingleFieldPointer.scala │ ├── NamedFieldPointer.scala │ ├── PathFieldPointer.scala │ ├── RelativeFieldPointer.scala │ └── SingleToMultiFieldPointer.scala ├── package.scala ├── specific ├── raw │ ├── FileStreamRecord.scala │ ├── HdfsFileSpec.scala │ └── package.scala └── warc │ ├── CdxBasedRecord.scala │ ├── CdxSpec.scala │ ├── WarcFileMeta.scala │ ├── WarcLikeRecord.scala │ ├── WarcRecord.scala │ ├── WarcSpec.scala │ ├── WaybackRecord.scala │ ├── functions │ ├── HttpPayload.scala │ └── WarcPayload.scala │ ├── implicits │ ├── CdxRDD.scala │ └── WarcRDD.scala │ ├── package.scala │ └── specs │ ├── CdxHdfsSpec.scala │ ├── WarcCdxHdfsSpec.scala │ ├── WarcHdfsCdxPathRddSpec.scala │ ├── WarcHdfsCdxRddSpec.scala │ ├── WarcHdfsCdxSpecBase.scala │ ├── WarcHdfsSpec.scala │ ├── WaybackCdxHdfsSpec.scala │ └── WaybackSpec.scala └── util ├── Bytes.scala ├── Copyable.scala ├── FilePathMap.scala ├── HttpHeader.scala ├── Json.scala ├── JsonConvertible.scala ├── JupyterHelpers.scala ├── SelectorUtil.scala ├── SerializedException.scala └── ZeppelinHelpers.scala /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/.gitignore -------------------------------------------------------------------------------- /CONTRIBUTORS.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/CONTRIBUTORS.md -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/README.md -------------------------------------------------------------------------------- /docs/Approach_Publications.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/docs/Approach_Publications.md -------------------------------------------------------------------------------- /docs/Building.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/docs/Building.md -------------------------------------------------------------------------------- /docs/Config.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/docs/Config.md -------------------------------------------------------------------------------- /docs/Contribute.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/docs/Contribute.md -------------------------------------------------------------------------------- /docs/DataSpecs.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/docs/DataSpecs.md -------------------------------------------------------------------------------- /docs/Dev_DataSpecs.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/docs/Dev_DataSpecs.md -------------------------------------------------------------------------------- /docs/Dev_EnrichFuncs.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/docs/Dev_EnrichFuncs.md -------------------------------------------------------------------------------- /docs/EnrichFuncs.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/docs/EnrichFuncs.md -------------------------------------------------------------------------------- /docs/General_Usage.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/docs/General_Usage.md -------------------------------------------------------------------------------- /docs/Installing_Jupyter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/docs/Installing_Jupyter.md -------------------------------------------------------------------------------- /docs/Operations.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/docs/Operations.md -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/docs/README.md -------------------------------------------------------------------------------- /docs/Recipes.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/docs/Recipes.md -------------------------------------------------------------------------------- /docs/Related_Projects.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/docs/Related_Projects.md -------------------------------------------------------------------------------- /docs/Using_Jupyter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/docs/Using_Jupyter.md -------------------------------------------------------------------------------- /docs/Using_Library.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/docs/Using_Library.md -------------------------------------------------------------------------------- /docs/approach.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/docs/approach.png -------------------------------------------------------------------------------- /docs/screenshots/Jupyter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/docs/screenshots/Jupyter.png -------------------------------------------------------------------------------- /docs/screenshots/Jupyter_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/docs/screenshots/Jupyter_example.png -------------------------------------------------------------------------------- /docs/screenshots/Jupyter_imports.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/docs/screenshots/Jupyter_imports.png -------------------------------------------------------------------------------- /docs/screenshots/Jupyter_notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/docs/screenshots/Jupyter_notebook.png -------------------------------------------------------------------------------- /docs/screenshots/Jupyter_shutdown.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/docs/screenshots/Jupyter_shutdown.png -------------------------------------------------------------------------------- /docs_button.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/docs_button.png -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/logo.png -------------------------------------------------------------------------------- /notebooks/Analyzing_Term-Distributions.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/notebooks/Analyzing_Term-Distributions.ipynb -------------------------------------------------------------------------------- /notebooks/Demo.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/notebooks/Demo.ipynb -------------------------------------------------------------------------------- /notebooks/Downloading_WARC_from_Wayback.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/notebooks/Downloading_WARC_from_Wayback.ipynb -------------------------------------------------------------------------------- /notebooks/Extracting_Embeds.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/notebooks/Extracting_Embeds.ipynb -------------------------------------------------------------------------------- /notebooks/Generating_CDX.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/notebooks/Generating_CDX.ipynb -------------------------------------------------------------------------------- /notebooks/IEEE_BigData_2017.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/notebooks/IEEE_BigData_2017.ipynb -------------------------------------------------------------------------------- /notebooks/Link_Extraction.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/notebooks/Link_Extraction.ipynb -------------------------------------------------------------------------------- /notebooks/Selected_Title-and-Text.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/notebooks/Selected_Title-and-Text.ipynb -------------------------------------------------------------------------------- /project/assembly.sbt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/project/assembly.sbt -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 0.13.17 -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/project/plugins.sbt -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/ArchiveSpark.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/ArchiveSpark.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/DistributedConfig.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/DistributedConfig.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/dataspecs/DataSpec.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/dataspecs/DataSpec.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/dataspecs/HadoopDataSpec.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/dataspecs/HadoopDataSpec.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/dataspecs/OutputSpec.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/dataspecs/OutputSpec.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/dataspecs/TextDataSpec.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/dataspecs/TextDataSpec.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/dataspecs/TextFileDataSpec.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/dataspecs/TextFileDataSpec.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/dataspecs/access/ByteArrayAccessor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/dataspecs/access/ByteArrayAccessor.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/dataspecs/access/CloseableDataAccessor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/dataspecs/access/CloseableDataAccessor.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/dataspecs/access/DataAccessor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/dataspecs/access/DataAccessor.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/dataspecs/access/HdfsFileAccessor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/dataspecs/access/HdfsFileAccessor.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/dataspecs/access/HdfsLocationInfo.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/dataspecs/access/HdfsLocationInfo.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/dataspecs/access/HdfsStreamAccessor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/dataspecs/access/HdfsStreamAccessor.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/dataspecs/access/HdfsTextAccessor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/dataspecs/access/HdfsTextAccessor.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/dataspecs/access/HdfsTextFileAccessor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/dataspecs/access/HdfsTextFileAccessor.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/dataspecs/access/HttpTextAccessor.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/dataspecs/access/HttpTextAccessor.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/functions/AbsoluteUrl.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/functions/AbsoluteUrl.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/functions/Data.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/functions/Data.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/functions/Entities.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/functions/Entities.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/functions/Html.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/functions/Html.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/functions/HtmlAttribute.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/functions/HtmlAttribute.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/functions/HtmlText.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/functions/HtmlText.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/functions/LowerCase.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/functions/LowerCase.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/functions/SURT.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/functions/SURT.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/functions/StringContent.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/functions/StringContent.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/functions/Values.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/functions/Values.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/implicits/EnrichableRDD.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/implicits/EnrichableRDD.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/implicits/GenericHelpersRDD.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/implicits/GenericHelpersRDD.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/implicits/JsonConvertibleRDD.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/implicits/JsonConvertibleRDD.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/implicits/SimplifiedGetterEnrichRoot.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/implicits/SimplifiedGetterEnrichRoot.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/implicits/StringRDD.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/implicits/StringRDD.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/model/BasicEnrichFunc.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/model/BasicEnrichFunc.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/model/BoundEnrichFunc.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/model/BoundEnrichFunc.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/model/BoundMultiEnrichFunc.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/model/BoundMultiEnrichFunc.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/model/DataEnrichRoot.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/model/DataEnrichRoot.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/model/Derivatives.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/model/Derivatives.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/model/EnrichFunc.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/model/EnrichFunc.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/model/EnrichRoot.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/model/EnrichRoot.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/model/EnrichRootCompanion.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/model/EnrichRootCompanion.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/model/Enrichable.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/model/Enrichable.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/model/IdentityField.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/model/IdentityField.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/model/MultiEnrichFunc.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/model/MultiEnrichFunc.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/model/MultiValueEnrichable.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/model/MultiValueEnrichable.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/model/SingleValueEnrichable.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/model/SingleValueEnrichable.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/model/dataloads/ByteLoad.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/model/dataloads/ByteLoad.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/model/dataloads/DataLoad.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/model/dataloads/DataLoad.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/model/dataloads/TextLoad.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/model/dataloads/TextLoad.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/model/pointers/DataLoadPointer.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/model/pointers/DataLoadPointer.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/model/pointers/DependentFieldPointer.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/model/pointers/DependentFieldPointer.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/model/pointers/FieldPointer.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/model/pointers/FieldPointer.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/model/pointers/MultiFieldPointer.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/model/pointers/MultiFieldPointer.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/model/pointers/MultiToSingleFieldPointer.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/model/pointers/MultiToSingleFieldPointer.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/model/pointers/NamedFieldPointer.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/model/pointers/NamedFieldPointer.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/model/pointers/PathFieldPointer.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/model/pointers/PathFieldPointer.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/model/pointers/RelativeFieldPointer.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/model/pointers/RelativeFieldPointer.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/model/pointers/SingleToMultiFieldPointer.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/model/pointers/SingleToMultiFieldPointer.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/package.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/package.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/specific/raw/FileStreamRecord.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/specific/raw/FileStreamRecord.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/specific/raw/HdfsFileSpec.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/specific/raw/HdfsFileSpec.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/specific/raw/package.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/specific/raw/package.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/specific/warc/CdxBasedRecord.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/specific/warc/CdxBasedRecord.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/specific/warc/CdxSpec.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/specific/warc/CdxSpec.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/specific/warc/WarcFileMeta.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/specific/warc/WarcFileMeta.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/specific/warc/WarcLikeRecord.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/specific/warc/WarcLikeRecord.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/specific/warc/WarcRecord.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/specific/warc/WarcRecord.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/specific/warc/WarcSpec.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/specific/warc/WarcSpec.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/specific/warc/WaybackRecord.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/specific/warc/WaybackRecord.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/specific/warc/functions/HttpPayload.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/specific/warc/functions/HttpPayload.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/specific/warc/functions/WarcPayload.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/specific/warc/functions/WarcPayload.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/specific/warc/implicits/CdxRDD.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/specific/warc/implicits/CdxRDD.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/specific/warc/implicits/WarcRDD.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/specific/warc/implicits/WarcRDD.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/specific/warc/package.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/specific/warc/package.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/specific/warc/specs/CdxHdfsSpec.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/specific/warc/specs/CdxHdfsSpec.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/specific/warc/specs/WarcCdxHdfsSpec.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/specific/warc/specs/WarcCdxHdfsSpec.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/specific/warc/specs/WarcHdfsCdxPathRddSpec.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/specific/warc/specs/WarcHdfsCdxPathRddSpec.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/specific/warc/specs/WarcHdfsCdxRddSpec.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/specific/warc/specs/WarcHdfsCdxRddSpec.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/specific/warc/specs/WarcHdfsCdxSpecBase.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/specific/warc/specs/WarcHdfsCdxSpecBase.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/specific/warc/specs/WarcHdfsSpec.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/specific/warc/specs/WarcHdfsSpec.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/specific/warc/specs/WaybackCdxHdfsSpec.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/specific/warc/specs/WaybackCdxHdfsSpec.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/specific/warc/specs/WaybackSpec.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/specific/warc/specs/WaybackSpec.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/util/Bytes.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/util/Bytes.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/util/Copyable.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/util/Copyable.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/util/FilePathMap.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/util/FilePathMap.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/util/HttpHeader.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/util/HttpHeader.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/util/Json.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/util/Json.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/util/JsonConvertible.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/util/JsonConvertible.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/util/JupyterHelpers.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/util/JupyterHelpers.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/util/SelectorUtil.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/util/SelectorUtil.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/util/SerializedException.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/util/SerializedException.scala -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/archivespark/util/ZeppelinHelpers.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/helgeho/ArchiveSpark/HEAD/src/main/scala/org/archive/webservices/archivespark/util/ZeppelinHelpers.scala --------------------------------------------------------------------------------