├── .dockerignore ├── .gitattributes ├── .gitignore ├── .gitmodules ├── .jvmopts ├── .scalafmt.conf ├── API.adoc ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── build.sbt ├── config ├── docker.json ├── local.json └── prod.json ├── data ├── ait-collections.json ├── ait-users.json ├── arch-users.json └── special-collections.json ├── entrypoint.sh ├── fairscheduler.xml ├── job_scripts ├── trocr-run.py └── whisper-run.py ├── lib └── javax.servlet-api-3.1.0.jar ├── logging └── .keep ├── migration └── LegacyDatasetMigrator.scala ├── project ├── assembly.sbt ├── build.properties └── plugins.sbt ├── src ├── main │ ├── bash │ │ └── sendmail │ └── scala │ │ └── org │ │ └── archive │ │ └── webservices │ │ └── ars │ │ ├── AdminController.scala │ │ ├── ApiController.scala │ │ ├── Arch.scala │ │ ├── ArchSwaggerSupport.scala │ │ ├── BaseController.scala │ │ ├── FilesController.scala │ │ ├── JobUuidApiController.scala │ │ ├── Keystone.scala │ │ ├── ScalatraBootstrap.scala │ │ ├── SwaggerController.scala │ │ ├── WasapiController.scala │ │ ├── addons │ │ ├── AddonLoader.scala │ │ └── ArchAddon.scala │ │ ├── ait │ │ ├── Ait.scala │ │ └── AitUser.scala │ │ ├── aut │ │ ├── AutLoader.scala │ │ ├── AutUtil.scala │ │ └── TikaUtil.scala │ │ ├── io │ │ ├── CollectionCache.scala │ │ ├── FileAccessContext.scala │ │ ├── FileAccessKeyRing.scala │ │ ├── FilePointer.scala │ │ ├── IOHelper.scala │ │ ├── MemoryCompressor.scala │ │ ├── RandomFileAccess.scala │ │ ├── Vault.scala │ │ └── WebArchiveLoader.scala │ │ ├── model │ │ ├── ArchCollection.scala │ │ ├── ArchCollectionInfo.scala │ │ ├── ArchCollectionStats.scala │ │ ├── ArchConf.scala │ │ ├── ArchJobCategories.scala │ │ ├── ArchJobCategory.scala │ │ ├── ArchJobInstanceInfo.scala │ │ ├── DerivativeOutput.scala │ │ ├── DerivativeOutputCache.scala │ │ ├── LocalArchConf.scala │ │ ├── PublishedDatasets.scala │ │ ├── api │ │ │ ├── AvailableJob.scala │ │ │ ├── AvailableJobsCategory.scala │ │ │ ├── Collection.scala │ │ │ ├── Dataset.scala │ │ │ ├── DatasetFile.scala │ │ │ ├── InputSpec.scala │ │ │ ├── JobState.scala │ │ │ ├── WasapiResponse.scala │ │ │ └── package.scala │ │ ├── app │ │ │ └── RequestContext.scala │ │ ├── collections │ │ │ ├── AitCollectionSpecifics.scala │ │ │ ├── CollectionSpecifics.scala │ │ │ ├── CustomCollectionSpecifics.scala │ │ │ ├── FileCollectionSpecifics.scala │ │ │ ├── GenericRandomAccess.scala │ │ │ ├── SpecialCollectionSpecifics.scala │ │ │ ├── UnionCollectionSpecifics.scala │ │ │ └── inputspecs │ │ │ │ ├── ArchCollectionSpecLoader.scala │ │ │ │ ├── CdxQuerySpecLoader.scala │ │ │ │ ├── DatasetSpecLoader.scala │ │ │ │ ├── FileRecord.scala │ │ │ │ ├── FileRecordFactory.scala │ │ │ │ ├── FileSpecLoader.scala │ │ │ │ ├── HdfsFileRecordFactory.scala │ │ │ │ ├── HttpFileRecordFactory.scala │ │ │ │ ├── InMemoryCdxFileRecord.scala │ │ │ │ ├── InputSpec.scala │ │ │ │ ├── InputSpecLoader.scala │ │ │ │ ├── LongestPrefixProbing.scala │ │ │ │ ├── MetaFilesSpecLoader.scala │ │ │ │ ├── MetaRemoteSpecLoader.scala │ │ │ │ ├── MultiSpecLoader.scala │ │ │ │ ├── OneTimeAccess.scala │ │ │ │ ├── S3FileRecordFactory.scala │ │ │ │ ├── S3HttpFileRecordFactory.scala │ │ │ │ ├── VaultFileRecordFactory.scala │ │ │ │ └── meta │ │ │ │ ├── FileMetaData.scala │ │ │ │ ├── FileMetaField.scala │ │ │ │ ├── FileMetaFieldSummary.scala │ │ │ │ ├── FileMetaFieldType.scala │ │ │ │ ├── FileMetaFieldTypeSummary.scala │ │ │ │ └── FileMetaSummary.scala │ │ └── users │ │ │ ├── ArchUser.scala │ │ │ ├── DefaultArchUser.scala │ │ │ └── KeystoneUser.scala │ │ ├── processing │ │ ├── 
ChainedJob.scala │ │ ├── DerivationJob.scala │ │ ├── DerivationJobConf.scala │ │ ├── DerivationJobInstance.scala │ │ ├── DerivationJobParameters.scala │ │ ├── GenericJob.scala │ │ ├── GenericJobManager.scala │ │ ├── JobManager.scala │ │ ├── JobManagerBase.scala │ │ ├── JobQueue.scala │ │ ├── JobStateManager.scala │ │ ├── PartialDerivationJob.scala │ │ ├── ProcessingState.scala │ │ ├── SampleVizData.scala │ │ ├── SparkJob.scala │ │ ├── SparkJobListener.scala │ │ ├── SparkJobManager.scala │ │ ├── SparkRunner.scala │ │ └── jobs │ │ │ ├── ArsLgaGeneration.scala │ │ │ ├── ArsWaneGeneration.scala │ │ │ ├── ArsWatGeneration.scala │ │ │ ├── AudioInformationExtraction.scala │ │ │ ├── DomainFrequencyExtraction.scala │ │ │ ├── DomainGraphExtraction.scala │ │ │ ├── ImageGraphExtraction.scala │ │ │ ├── ImageInformationExtraction.scala │ │ │ ├── PdfInformationExtraction.scala │ │ │ ├── PresentationProgramInformationExtraction.scala │ │ │ ├── SpreadsheetInformationExtraction.scala │ │ │ ├── TextFilesInformationExtraction.scala │ │ │ ├── VideoInformationExtraction.scala │ │ │ ├── WebGraphExtraction.scala │ │ │ ├── WebPagesExtraction.scala │ │ │ ├── WordProcessorInformationExtraction.scala │ │ │ ├── archivespark │ │ │ ├── AiJob.scala │ │ │ ├── ArchiveSparkFlexJob.scala │ │ │ ├── ArchiveSparkNoop.scala │ │ │ ├── base │ │ │ │ ├── ArchEnrichRoot.scala │ │ │ │ ├── ArchFileRecord.scala │ │ │ │ ├── ArchFileSpec.scala │ │ │ │ ├── ArchWarcRecord.scala │ │ │ │ ├── ArchWarcSpec.scala │ │ │ │ ├── ArchiveSparkBaseJob.scala │ │ │ │ ├── ArchiveSparkEnrichJob.scala │ │ │ │ ├── FileLoad.scala │ │ │ │ ├── LocalFileCache.scala │ │ │ │ └── PlainTextLoad.scala │ │ │ ├── functions │ │ │ │ ├── ArchFileBytes.scala │ │ │ │ ├── ArchFileCache.scala │ │ │ │ ├── ArchFileProcEnrichFuncBase.scala │ │ │ │ ├── ArchWarcPayload.scala │ │ │ │ ├── CondaBasedFunction.scala │ │ │ │ ├── CoreNlpEntities.scala │ │ │ │ ├── TrOCR.scala │ │ │ │ ├── Whisper.scala │ │ │ │ ├── WhisperText.scala │ │ │ │ └── adapters │ │ │ │ │ ├── ArchArchiveSparkFunctionAdapter.scala │ │ │ │ │ ├── CondaBasedArchiveSparkFunctionAdapter.scala │ │ │ │ │ └── EntitiesAdapter.scala │ │ │ └── preset │ │ │ │ ├── EntityExtraction.scala │ │ │ │ ├── TrOcrEntityExtraction.scala │ │ │ │ ├── TrOcrProcessing.scala │ │ │ │ ├── WhisperEntityExtraction.scala │ │ │ │ ├── WhisperText.scala │ │ │ │ └── WhisperTranscription.scala │ │ │ ├── shared │ │ │ ├── ArsJob.scala │ │ │ ├── AutJob.scala │ │ │ ├── BinaryInformationAutJob.scala │ │ │ └── NetworkAutJob.scala │ │ │ └── system │ │ │ ├── DatasetPublication.scala │ │ │ ├── MetadataSummary.scala │ │ │ └── UserDefinedQuery.scala │ │ └── util │ │ ├── CacheUtil.scala │ │ ├── Common.scala │ │ ├── DatafileUtil.scala │ │ ├── DatasetUtil.scala │ │ ├── FormatUtil.scala │ │ ├── HttpUtil.scala │ │ ├── LazyCache.scala │ │ ├── PublicSuffixUtil.scala │ │ └── UUID.scala └── test │ └── scala │ └── org │ └── archive │ └── webservices │ └── ars │ ├── ApiController.scala │ ├── Fixtures.scala │ ├── JobUuidApiController.scala │ ├── UnitSpec.scala │ └── model │ └── LocalArchConfSpec.scala ├── templates ├── notebooks │ ├── audio-information.ipynb │ ├── css-file-information.ipynb │ ├── domain-frequency.ipynb │ ├── domain-graph.ipynb │ ├── html-file-information.ipynb │ ├── image-graph.ipynb │ ├── image-information.ipynb │ ├── js-file-information.ipynb │ ├── json-file-information.ipynb │ ├── pdf-information.ipynb │ ├── plain-text-file-information.ipynb │ ├── powerpoint-information.ipynb │ ├── spreadsheet-information.ipynb │ ├── video-information.ipynb │ ├── 
web-graph.ipynb │ ├── web-pages.ipynb │ ├── word-document-information.ipynb │ └── xml-file-information.ipynb ├── sendmail_failed.txt ├── sendmail_finished.txt └── sendmail_udq-finished.txt └── webapp └── .gitkeep /.dockerignore: -------------------------------------------------------------------------------- 1 | Dockerfile 2 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | webapp/js/dist/** linguist-generated 2 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/arch/7cd8a6f4d2d557275b9d19381c8ad234ca134a60/.gitmodules -------------------------------------------------------------------------------- /.jvmopts: -------------------------------------------------------------------------------- 1 | -Xms512M 2 | -Xmx4096M 3 | -Xss2M 4 | -XX:MaxMetaspaceSize=1024M -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | # Apache Spark scalafmt configuration. 2 | 3 | align = none 4 | align.openParenDefnSite = false 5 | align.openParenCallSite = false 6 | align.tokens = [] 7 | optIn = { 8 | configStyleArguments = false 9 | } 10 | danglingParentheses.preset = false 11 | docstrings.style = Asterisk 12 | maxColumn = 98 13 | runner.dialect = scala212 14 | version = 3.7.7 15 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | # Read the current user's ID so that we can assign the same ID to "arch" Docker user. 3 | UID = $$(id -u) 4 | 5 | PWD=$(shell pwd) 6 | IMAGE=ait-arch 7 | BASE_IMAGE=`grep -P 'BASE_IMAGE=.+$$' Dockerfile | cut -d'=' -f2` 8 | RUN_BASE_CMD=docker run --rm -it -v $(PWD)/shared:/opt/arch/shared -p 12341:12341 9 | GPU_TEST_CMD=docker run --gpus all $(BASE_IMAGE) 10 | CPU_MSG_CMD=printf '*\n* GPU not available: Whisper and TrOCR jobs will use the CPU\n*\n' 11 | 12 | config/config.json: 13 | cp config/docker.json config/config.json 14 | 15 | .PHONY: build-docker-image 16 | build-docker-image: config/config.json 17 | docker build --build-arg UID=$(UID) . -t $(IMAGE) 18 | 19 | shared: 20 | mkdir -p shared/in/collections; \ 21 | mkdir shared/log; \ 22 | mkdir -p shared/out/custom-collections; \ 23 | mkdir shared/out/datasets; 24 | 25 | # Define a function that first tests running the BASE_IMAGE with the --gpus option, 26 | # and if that doesn't exit with an error, runs ARCH with GPU support, and otherwise 27 | # displays a message indicating that AI jobs will use the CPU and ARCH without GPU support. 28 | # usage: $(call run_docker_image_fn,,) 29 | # The "$(or $(1),--it)" adds quotes with are necessary when a first argument value is 30 | # specified and defaults to the redundant "-it" when one is not specified to prevent docker 31 | # run from complaining about "invalid reference format". 
32 | define run_docker_image_fn 33 | $(eval GPU_RUN_CMD=$(RUN_BASE_CMD) "$(or $(1),-it)" --gpus all $(IMAGE) $(2)) 34 | $(eval CPU_RUN_CMD=$(RUN_BASE_CMD) "$(or $(1),-it)" $(IMAGE) $(2)) 35 | @$(GPU_TEST_CMD) 2>/dev/null && ($(GPU_RUN_CMD) || true) || ($(CPU_MSG_CMD) && $(CPU_RUN_CMD)) 36 | endef 37 | 38 | .PHONY: run-docker-image 39 | run-docker-image: shared 40 | $(call run_docker_image_fn) 41 | 42 | lib/.symlinks-copied: 43 | docker cp $$(docker create --name arch-tmp $(IMAGE)):/opt/arch/lib . \ 44 | && docker rm arch-tmp \ 45 | && touch lib/.symlinks-copied 46 | 47 | .PHONY: run-docker-image-dev 48 | run-docker-image-dev: shared lib/.symlinks-copied 49 | $(call run_docker_image_fn,"-v $(PWD):/opt/arch") 50 | 51 | .PHONY: docker-shell 52 | docker-shell: shared lib/.symlinks-copied 53 | $(call run_docker_image_fn,"-v $(PWD):/opt/arch","bash") 54 | -------------------------------------------------------------------------------- /config/docker.json: -------------------------------------------------------------------------------- 1 | { 2 | "basePath": "/", 3 | "customCollectionPath": "/opt/arch/shared/out/custom-collections", 4 | "externalPort": 12341, 5 | "jobLoggingPath": "/opt/arch/shared/log", 6 | "keystoneBaseUrl": "http://keystone:8000", 7 | "keystonePrivateApiKey": "supersecret", 8 | "uuidJobOutPath": "/opt/arch/shared/out/datasets" 9 | } 10 | -------------------------------------------------------------------------------- /config/local.json: -------------------------------------------------------------------------------- 1 | { 2 | "aitCollectionPath": "data/in", 3 | "aitCollectionWarcDir": "arcs", 4 | "collectionCachePath": "/tmp/arch-cache", 5 | "jobOutPath": "/data/user-out", 6 | "globalJobOutPath": "/data/out", 7 | "customCollectionPath": "data/collections", 8 | "jobLoggingPath": "logging", 9 | "localTempPath": "data/tmp", 10 | "sparkMaster": "local[*]", 11 | "baseUrl": "http://127.0.0.1:12341", 12 | "loginUrl": "http://127.0.0.1:12341/ait/login?next=", 13 | "port": 12341, 14 | "githubBearer": "example_bearer_token" 15 | } 16 | -------------------------------------------------------------------------------- /config/prod.json: -------------------------------------------------------------------------------- 1 | { 2 | "aitCollectionHdfsHost": "ia802400.us.archive.org", 3 | "aitCollectionPath": "/search/ait", 4 | "aitCollectionWarcDir": "arcs", 5 | "collectionCachePath": "/tmp/arch-cache", 6 | "globalJobOutPath": "/user/arch/arch", 7 | "jobOutPath": "/user/arch/arch-users", 8 | "customCollectionPath": "/user/arch/arch-custom-collections", 9 | "jobLoggingPath": "logging", 10 | "localTempPath": "/tmp", 11 | "sparkMaster": "yarn", 12 | "baseUrl": "https://webdata.archive-it.org", 13 | "loginUrl": "https://webdata.archive-it.org/ait/login?next=", 14 | "hadoopQueue": "default", 15 | "port": 12353, 16 | "production": true 17 | } -------------------------------------------------------------------------------- /data/ait-collections.json: -------------------------------------------------------------------------------- 1 | { 2 | "ait:1451" : [ 3 | 14462, 4 | 14472, 5 | 14489 6 | ], 7 | "ait:1796" : [ 8 | 10923 9 | ], 10 | "test" : [ 11 | 1 12 | ], 13 | "ks:test" : [ 14 | 1 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /data/ait-users.json: -------------------------------------------------------------------------------- 1 | { 2 | "ids" : [ 3 | 1451, 4 | 1796 5 | ] 6 | } 7 | 
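The JSON files above configure ARCH for the Docker, local, and production environments; the Makefile copies config/docker.json to config/config.json before building the image. A minimal sketch (not part of the repository) of reading one of these files with circe, assuming the application loads config/config.json through ArchConf/LocalArchConf (not shown in this excerpt):

```scala
import io.circe.parser
import scala.io.Source

// Hypothetical helper: peek at a value in one of the config/*.json files shown above.
object ConfigPeek extends App {
  val source = Source.fromFile("config/config.json", "utf-8")
  val json = try parser.parse(source.mkString) finally source.close()
  // "externalPort" is 12341 in config/docker.json; the other environments omit it.
  val port = json.toOption.flatMap(_.hcursor.get[Int]("externalPort").toOption)
  println(s"externalPort = ${port.getOrElse(-1)}")
}
```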
-------------------------------------------------------------------------------- /data/arch-users.json: -------------------------------------------------------------------------------- 1 | { 2 | "ks:system" : { 3 | "name" : "Keystone System", 4 | "admin" : true, 5 | "apiKey" : "$pbkdf2-sha512$120000$edRi7uf7Dg18ebkFm5lphcfOAiVVCvRB$vyl48k.uOahDCmTKOqViXpw8FG7fKzkVParjfOZ/60U" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /data/special-collections.json: -------------------------------------------------------------------------------- 1 | { 2 | "collections" : { 3 | "test-collection" : { 4 | "name" : "ARCH Test Collection", 5 | "path" : "/user/helge/arch-test-collection" 6 | } 7 | }, 8 | "users" : { 9 | "test" : [ 10 | "test-collection" 11 | ], 12 | "ks:test" : [ 13 | "test-collection" 14 | ] 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | shopt -s nullglob 5 | 6 | # For any new directory in /opt/arch/in/collections that is not yet 7 | # represented as being authorized for the built-in test user, authorize it. 8 | for dir in /opt/arch/shared/in/collections/*; do 9 | collection_key=`basename $dir`; 10 | collection_name=`echo $collection_key | sed -r 's/(^|-)(\w)/ \U\2/g' | sed 's/^ //'`; 11 | cat <<< $(jq ".collections |= if has(\"$collection_key\") then . else .\"$collection_key\" = {name: \"$collection_name\", path: \"$dir\"} end | .users[\"ks:system\"] |= (.+ [\"$collection_key\"] | unique)" /opt/arch/data/special-collections.json) > /opt/arch/data/special-collections.json 12 | done 13 | 14 | exec "$@" 15 | -------------------------------------------------------------------------------- /fairscheduler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | FAIR10 4 | FAIR20 5 | FAIR40 6 | FAIR80 7 | FAIR160 8 | FAIR320 9 | FAIR640 10 | FAIR1280 11 | -------------------------------------------------------------------------------- /job_scripts/whisper-run.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import whisper 3 | import json 4 | import torch 5 | 6 | is_cuda = torch.cuda.is_available() 7 | device = "cuda" if is_cuda else "cpu" 8 | 9 | model_filename = sys.argv[1] 10 | out_pipe_path = sys.argv[-1] 11 | 12 | model = whisper.load_model(model_filename, device=device) 13 | 14 | def process(audio_file): 15 | result = model.transcribe(audio_file, fp16=is_cuda) 16 | return result 17 | 18 | with open(out_pipe_path, 'w') as pipe: 19 | while True: 20 | print("##", file=pipe, flush=True) 21 | input_file = sys.stdin.readline().strip() 22 | try: 23 | result = process(input_file) 24 | print(json.dumps(result["segments"]), file=pipe, flush=True) 25 | except: pass 26 | -------------------------------------------------------------------------------- /lib/javax.servlet-api-3.1.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/arch/7cd8a6f4d2d557275b9d19381c8ad234ca134a60/lib/javax.servlet-api-3.1.0.jar -------------------------------------------------------------------------------- /logging/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/arch/7cd8a6f4d2d557275b9d19381c8ad234ca134a60/logging/.keep 
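job_scripts/whisper-run.py above implements a small line-oriented protocol: it writes a `##` ready marker to the named pipe given as its last argument, reads one audio file path per line from stdin, and writes one JSON line of transcription segments per successful input (a failed input produces no JSON line, only the next `##`). Below is a minimal sketch of the driver side of that protocol; it is not part of the repository — the real driver appears to live in CondaBasedFunction.scala (listed in the tree above but not included in this excerpt) — and `stdin`/`pipePath` are assumed to come from however the script was launched:

```scala
import java.io.{BufferedReader, FileReader, PrintWriter}

// Hypothetical driver for the whisper-run.py protocol: `stdin` is the script's standard
// input and `pipePath` is the named pipe passed to the script as its last argument.
def transcribeAll(files: Seq[String], stdin: PrintWriter, pipePath: String): Seq[Option[String]] = {
  val pipe = new BufferedReader(new FileReader(pipePath))
  try {
    var readyConsumed = false // true if the next "##" marker was already read
    files.map { file =>
      if (!readyConsumed) { // wait for the script's ready marker
        var line = pipe.readLine()
        while (line != null && line.trim != "##") line = pipe.readLine()
      }
      readyConsumed = false
      stdin.println(file) // hand the next audio file path to whisper-run.py
      stdin.flush()
      val result = pipe.readLine()
      if (result == null || result.trim == "##") {
        // failed input: the script emitted no JSON, only the next ready marker
        readyConsumed = result != null
        None
      } else Some(result) // one JSON array of transcription segments
    }
  } finally pipe.close()
}
```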
-------------------------------------------------------------------------------- /project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.9") -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 1.3.8 -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("org.scalatra.sbt" % "sbt-scalatra" % "1.0.3") 2 | addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.5.0") 3 | -------------------------------------------------------------------------------- /src/main/bash/sendmail: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | cat $2 >> /data/sendmail.log 4 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/AdminController.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars 2 | 3 | import org.apache.commons.io.input.BoundedInputStream 4 | import org.archive.webservices.ars.processing.JobStateManager 5 | import org.archive.webservices.ars.processing.JobStateManager.Charset 6 | import org.archive.webservices.sparkling._ 7 | import org.archive.webservices.sparkling.io.IOUtil 8 | import org.scalatra._ 9 | 10 | import java.io.{File, FileInputStream} 11 | import scala.io.Source 12 | import scala.util.Try 13 | 14 | class AdminController extends BaseController { 15 | val MaxLogLength: Int = 1.mb.toInt 16 | get("/logs/:log_type") { 17 | ensureAuth { user => 18 | if (user.isAdmin) { 19 | params("log_type") match { 20 | case "jobs" => 21 | val tail = params.get("tail").flatMap(str => Try(str.toInt).toOption).getOrElse(-1) 22 | val logFile = new File(s"${JobStateManager.LoggingDir}/${JobStateManager.JobLogFile}") 23 | val log = if (logFile.exists) { 24 | val skip = if (tail < 0) 0L else (logFile.length - tail.min(MaxLogLength)).max(0L) 25 | val in = new FileInputStream(logFile) 26 | try { 27 | IOUtil.skip(in, skip) 28 | val source = Source.fromInputStream( 29 | new BoundedInputStream(in, MaxLogLength), 30 | JobStateManager.Charset) 31 | try { 32 | source.mkString 33 | } finally { 34 | source.close() 35 | } 36 | } finally { 37 | in.close() 38 | } 39 | } else "" 40 | Ok(log, Map("Content-Type" -> "text/plain")) 41 | case "running" => 42 | val runningJobsFile = 43 | new File(s"${JobStateManager.LoggingDir}/${JobStateManager.RunningJobsFile}") 44 | val log = if (runningJobsFile.exists) { 45 | val source = Source.fromFile(runningJobsFile, Charset) 46 | try { 47 | source.mkString 48 | } finally { 49 | source.close() 50 | } 51 | } else "" 52 | Ok(log, Map("Content-Type" -> "application/json")) 53 | case "failed" => 54 | val failedJobsFile = 55 | new File(s"${JobStateManager.LoggingDir}/${JobStateManager.FailedJobsFile}") 56 | val log = if (failedJobsFile.exists) { 57 | val source = Source.fromFile(failedJobsFile, Charset) 58 | try { 59 | source.mkString 60 | } finally { 61 | source.close() 62 | } 63 | } else "" 64 | Ok(log, Map("Content-Type" -> "text/plain")) 65 | case _ => 66 | NotFound() 67 | } 68 | } else Forbidden() 69 | } 70 | } 71 | } 72 | 
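AdminController above serves raw job logs to admin users at `/admin/logs/jobs` (with an optional `tail` byte count), `/admin/logs/running`, and `/admin/logs/failed`; the controller is mounted under `/admin/*` in ScalatraBootstrap below, and authentication uses the `X-API-USER`/`X-API-KEY` headers checked by `BaseController.ensureAuth`. A minimal client sketch, not part of the repository; the base URL and credentials are placeholders:

```scala
import java.net.{HttpURLConnection, URL}
import scala.io.Source

// Hypothetical client for the admin log endpoint above.
def fetchJobLog(baseUrl: String, user: String, apiKey: String, tailBytes: Int): String = {
  val conn = new URL(s"$baseUrl/admin/logs/jobs?tail=$tailBytes")
    .openConnection().asInstanceOf[HttpURLConnection]
  conn.setRequestProperty("X-API-USER", user) // headers checked by BaseController.ensureAuth
  conn.setRequestProperty("X-API-KEY", apiKey) // the user must have isAdmin == true
  try {
    val source = Source.fromInputStream(conn.getInputStream, "utf-8")
    try source.mkString finally source.close()
  } finally conn.disconnect()
}

// e.g. fetchJobLog("http://127.0.0.1:12341", "ks:system", "<api key>", 10000)
```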
-------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/Arch.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars 2 | 3 | import _root_.io.sentry.protocol.Message 4 | import _root_.io.sentry.{Sentry, SentryEvent, SentryLevel} 5 | import org.archive.webservices.ars.addons.AddonLoader 6 | import org.archive.webservices.ars.model.ArchConf 7 | import org.archive.webservices.ars.processing.JobStateManager 8 | import org.archive.webservices.sparkling._ 9 | import org.archive.webservices.sparkling.io.IOUtil 10 | import org.archive.webservices.sparkling.util.RddUtil 11 | import org.eclipse.jetty.server.Server 12 | import org.eclipse.jetty.webapp.WebAppContext 13 | import org.scalatra.servlet.ScalatraListener 14 | 15 | import java.io.File 16 | import scala.collection.JavaConverters._ // For SentryEvent.setExtras 17 | 18 | object Arch { 19 | def start(contextPath: String, port: Int): Unit = { 20 | val server = new Server(port) 21 | 22 | val context = new WebAppContext() 23 | context.setContextPath(contextPath) 24 | context.setResourceBase("webapp") 25 | context.setInitParameter("org.eclipse.jetty.servlet.Default.dirAllowed", "false") 26 | context.setInitParameter("org.eclipse.jetty.servlet.Default.useFileMappedBuffer", "false") 27 | context.setInitParameter( 28 | ScalatraListener.LifeCycleKey, 29 | classOf[ScalatraBootstrap].getCanonicalName) 30 | context.setInitParameter( 31 | org.scalatra.EnvironmentKey, 32 | ArchConf.deploymentEnvironment match { 33 | case "DEV" => "development" 34 | case "QA" => "qa" 35 | case "PROD" => "production" 36 | }) 37 | context.setEventListeners(Array(new ScalatraListener)) 38 | 39 | server.setHandler(context) 40 | server.start() 41 | server.join() 42 | } 43 | 44 | def initSentry(): Unit = { 45 | Sentry.init(options => { 46 | options.setDsn(ArchConf.sentryDsn); 47 | options.setEnvironment(ArchConf.deploymentEnvironment); 48 | // Set traces_sample_rate to 0.10 to capture 10% of transactions for performance monitoring. 49 | options.setTracesSampleRate(0.10); 50 | }) 51 | } 52 | 53 | def reportEvent( 54 | title: String, 55 | message: String, 56 | extraContext: Map[String, Object] = Map.empty, 57 | level: SentryLevel = SentryLevel.INFO): Unit = { 58 | // Send an event to Sentry. 59 | val event = new SentryEvent() 60 | val _message = new Message() 61 | // Use the title as the message and add the message text as the extra "details" property 62 | // to make display in the client more reasonable, otherwise Sentry will display a prefix 63 | // of the message text as the event title. 
64 | _message.setMessage(title) 65 | event.setMessage(_message) 66 | event.setLevel(level) 67 | event.setExtras((Map("details" -> message) ++ extraContext).asJava) 68 | Sentry.captureEvent(event) 69 | } 70 | 71 | val reportInfo = reportEvent(_, _, _, SentryLevel.INFO) 72 | val reportWarning = reportEvent(_, _, _, SentryLevel.WARNING) 73 | val reportError = reportEvent(_, _, _, SentryLevel.ERROR) 74 | 75 | def reportException(e: Exception): Unit = Sentry.captureException(e) 76 | 77 | def main(args: Array[String]): Unit = { 78 | IOUtil.memoryBuffer = 1.mb.toInt 79 | RddUtil.saveRecordTimeoutMillis = -1 80 | AddonLoader.initializePackage("org.archive.webservices.arch.addons") 81 | JobStateManager.init() 82 | initSentry() 83 | start(ArchConf.basePath, ArchConf.internalPort) 84 | } 85 | 86 | def debugging: Boolean = new File("_debugging").exists 87 | } 88 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/ArchSwaggerSupport.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars 2 | 3 | import org.scalatra.swagger._ 4 | import org.scalatra.util.NotNothing 5 | 6 | trait ArchSwaggerSupport extends SwaggerSupport { 7 | // apiOperation wrapper to add X-API-* header params to all endpoints. 8 | def apiOp[T: Manifest: NotNothing](name: String): SwaggerSupportSyntax.OperationBuilder = 9 | (apiOperation[T](name) 10 | parameter headerParam[String]("X-API-USER").description( 11 | "The user for which this request is being made") 12 | parameter headerParam[String]("X-API-KEY").description( 13 | "An API key that's authorized to act on behalf of X-API-USER")) 14 | } 15 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/BaseController.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars 2 | 3 | import org.archive.webservices.ars.model.app.RequestContext 4 | import org.archive.webservices.ars.model.users.ArchUser 5 | import org.scalatra._ 6 | import org.scalatra.scalate.ScalateSupport 7 | 8 | class BaseController extends ScalatraServlet with ScalateSupport { 9 | // Report and rethrow any Exceptions. 
10 | error { 11 | case e: Exception => { 12 | Arch.reportException(e) 13 | throw e 14 | } 15 | } 16 | 17 | def ensureAuth(action: RequestContext => ActionResult): ActionResult = { 18 | for { 19 | apiUser <- request.headers.get("X-API-USER") 20 | apiKey <- request.headers.get("X-API-KEY") 21 | } yield { 22 | ArchUser.get(apiUser, Some(apiKey)) match { 23 | case Some(user) => action(RequestContext(user)) 24 | case None => Forbidden() 25 | } 26 | } 27 | }.getOrElse(Forbidden()) 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/ScalatraBootstrap.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars 2 | 3 | import org.scalatra.LifeCycle 4 | import org.scalatra.swagger.ApiKey 5 | 6 | import javax.servlet.ServletContext 7 | 8 | class ScalatraBootstrap extends LifeCycle { 9 | 10 | implicit val swagger = new ArchApiSwagger 11 | swagger.addAuthorization(ApiKey("X-API-KEY")) 12 | 13 | override def init(context: ServletContext): Unit = { 14 | context.mount(new AdminController, "/admin/*") 15 | context.mount(new ApiController, "/api/*") 16 | context.mount(new JobUuidApiController, "/api/job/*") 17 | context.mount(new WasapiController, "/wasapi/*") 18 | context.mount(new FilesController, "/files/*") 19 | context.mount(new SwaggerController, "/api-docs") 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/SwaggerController.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars 2 | 3 | import org.scalatra.ScalatraServlet 4 | import org.scalatra.swagger.{ApiInfo, NativeSwaggerBase, Swagger} 5 | 6 | class SwaggerController(implicit val swagger: Swagger) 7 | extends ScalatraServlet 8 | with NativeSwaggerBase 9 | 10 | object ArchApiInfo 11 | extends ApiInfo( 12 | "The ARCH API", 13 | "Docs for the ARCH API", 14 | "https://arch.archive-it.org", 15 | "arch@archive.org", 16 | "AGPL-3.0", 17 | "https://www.gnu.org/licenses/agpl-3.0.en.html") 18 | 19 | class ArchApiSwagger extends Swagger(Swagger.SpecVersion, "2.0.0", ArchApiInfo) 20 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/addons/AddonLoader.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.addons 2 | 3 | import java.util.jar.JarFile 4 | import scala.collection.JavaConverters._ 5 | 6 | object AddonLoader { 7 | private var init: Map[String, ArchAddon] = Map.empty 8 | 9 | def init(addon: ArchAddon): Unit = { 10 | init += addon.getClass.getName -> addon 11 | } 12 | 13 | def initializePackage(packageName: String): Unit = { 14 | println(s"Initializing add-ons of package $packageName...") 15 | val packagePath = packageName.replace('.', '/') + "/" 16 | val systemClassLoader = ClassLoader.getSystemClassLoader 17 | val systemResources = systemClassLoader.getResources(packagePath).asScala 18 | val contextClassLoader = Thread.currentThread.getContextClassLoader 19 | val threadResources = contextClassLoader.getResources(packagePath).asScala 20 | val jarEntries = (systemResources ++ threadResources) 21 | .filter(_.getProtocol == "jar") 22 | .flatMap { jar => 23 | jar.getPath.stripPrefix("file:").split('!').headOption 24 | } 25 | .flatMap(path => new JarFile(path).entries().asScala) 26 | 
.map(_.toString) 27 | val objects = jarEntries 28 | .filter(_.startsWith(packagePath)) 29 | .filter(_.endsWith("$.class")) 30 | .map(_.stripSuffix(".class").replace('/', '.')) 31 | for (objectClass <- objects) { 32 | try { 33 | Class.forName(objectClass, true, systemClassLoader) 34 | } catch { 35 | case _: ClassNotFoundException => 36 | Class.forName(objectClass, true, contextClassLoader) 37 | } 38 | } 39 | for (addon <- init.values) { 40 | println(s"Loading add-on ${addon.getClass.getName.stripSuffix("$")}...") 41 | addon.initAddon() 42 | } 43 | println(s"Initialized add-ons of package $packageName.") 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/addons/ArchAddon.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.addons 2 | 3 | trait ArchAddon { 4 | def initAddon(): Unit 5 | } 6 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/ait/AitUser.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.ait 2 | 3 | case class AitUser(id: Int, userName: String, fullName: String, email: Option[String] = None) { 4 | def isSystemUser: Boolean = id == 0 5 | def isLoggedIn: Boolean = id >= 0 6 | } 7 | 8 | object AitUser { 9 | lazy val Empty = AitUser(-1, "", "") 10 | } 11 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/aut/AutUtil.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.aut 2 | 3 | import io.archivesunleashed.matchbox.ExtractDomain 4 | import org.archive.webservices.ars.util.PublicSuffixUtil 5 | import org.archive.webservices.sparkling.http.HttpMessage 6 | import org.archive.webservices.sparkling.warc.WarcRecord 7 | 8 | import java.io.InputStream 9 | import javax.imageio.ImageIO 10 | 11 | object AutUtil { 12 | val months = Seq( 13 | "jan", 14 | "feb", 15 | "mar", 16 | "apr", 17 | "may", 18 | "jun", 19 | "jul", 20 | "aug", 21 | "sep", 22 | "oct", 23 | "nov", 24 | "dec").zipWithIndex.map { case (s, d) => (s, ("0" + (d + 1)).takeRight(2)) } 25 | 26 | def url(r: WarcRecord): String = r.url.getOrElse("") 27 | 28 | def timestamp(r: WarcRecord): String = 29 | r.timestamp.filter(_.length >= 14).map(_.take(14)).getOrElse("") 30 | 31 | def mime(http: HttpMessage): String = http.mime.getOrElse("unknown") 32 | 33 | def checkPageMime(url: String, mime: String): Boolean = { 34 | val u = url.toLowerCase 35 | (mime == "text/html" || mime == "application/xhtml+xml" || u.endsWith("htm") || u.endsWith( 36 | "html")) && !u.endsWith("robots.txt") 37 | } 38 | 39 | def validPage(r: WarcRecord, http: HttpMessage): Boolean = { 40 | timestamp(r).nonEmpty && checkPageMime(url(r), http.mime.getOrElse("")) && http.status == 200 41 | } 42 | 43 | def extractDomainRemovePrefixWWW(url: String, publicSuffixes: Set[String]): String = { 44 | Option(if (url.trim.isEmpty) "" else ExtractDomain(url).replaceAll("^\\s*www\\.", "")) 45 | .map(_.trim) 46 | .map(PublicSuffixUtil.resolve(_, publicSuffixes)) 47 | .getOrElse("") 48 | } 49 | 50 | def extractDomainRemovePrefixWWW(url: String): String = { 51 | Option(if (url.trim.isEmpty) "" else ExtractDomain(url).replaceAll("^\\s*www\\.", "")) 52 | .map(_.trim) 53 | .getOrElse("") 54 | } 55 | 56 | def 
rfc1123toTime14(lastModifiedDate: String): String = { 57 | if (lastModifiedDate.isEmpty) { 58 | "" 59 | } else { 60 | val lc = lastModifiedDate.toLowerCase 61 | val date = months.find(m => lc.contains(m._1)).map(_._2).flatMap { m => 62 | val d = lc 63 | .replace(":", "") 64 | .split(' ') 65 | .drop(1) 66 | .map(d => (d.length, d)) 67 | .toMap 68 | for (y <- d.get(4); n <- d.get(2); t <- d.get(6)) 69 | yield y + m + n + t 70 | } 71 | date match { 72 | case Some(date) => 73 | date 74 | case None => 75 | "" 76 | } 77 | } 78 | } 79 | 80 | // see io.archivesunleashed.matchbox.ComputeImageSize 81 | def computeImageSize(in: InputStream): (Int, Int) = { 82 | val nullImage = (0, 0) 83 | try { 84 | val stream = ImageIO.createImageInputStream(in) 85 | try { 86 | val readers = ImageIO.getImageReaders(stream) 87 | if (readers.hasNext) { 88 | val reader = readers.next 89 | reader.setInput(stream) 90 | (reader.getWidth(0), reader.getHeight(0)) 91 | } else nullImage 92 | } finally { 93 | stream.close() 94 | } 95 | } catch { 96 | case e: Throwable => nullImage 97 | } 98 | } 99 | 100 | def extractLinks( 101 | func: (String, String) => Seq[(String, String, String)], 102 | url: String, 103 | body: String): Seq[(String, String, String)] = { 104 | func(url, body).flatMap { case (s, t, a) => 105 | for { 106 | source <- Option(s).map(_.trim).filter(_.nonEmpty) 107 | target <- Option(t).map(_.trim).filter(_.nonEmpty) 108 | } yield (source, target, Option(a).map(_.trim).getOrElse("")) 109 | } 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/aut/TikaUtil.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.aut 2 | 3 | import org.apache.tika.Tika 4 | import org.apache.tika.detect.DefaultDetector 5 | import org.apache.tika.parser.AutoDetectParser 6 | import org.archive.webservices.sparkling.io.IOUtil 7 | 8 | import java.io.InputStream 9 | import scala.util.Try 10 | 11 | object TikaUtil { 12 | val detector = new DefaultDetector() 13 | val parser = new AutoDetectParser(detector) 14 | val tika = new Tika(detector, parser) 15 | 16 | def mime(in: InputStream): String = { 17 | (if (in.markSupported() && IOUtil.eof(in)) None else Try(tika.detect(in)).toOption) 18 | .getOrElse("N/A") 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/io/CollectionCache.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.io 2 | 3 | import org.apache.hadoop.fs.Path 4 | import org.archive.webservices.ars.model.ArchConf 5 | import org.archive.webservices.sparkling._ 6 | import org.archive.webservices.sparkling.io._ 7 | 8 | import java.time.Instant 9 | import scala.util.Try 10 | 11 | object CollectionCache { 12 | val CacheClearThresholdBytes: Long = 1.tb 13 | 14 | private var inUse = Set.empty[String] 15 | private var lastUse = Map.empty[String, Long] 16 | 17 | def cache[R](sourceId: String)(action: String => R): R = { 18 | val dir = cacheDir(sourceId) 19 | synchronized { 20 | inUse += dir 21 | clearCache() 22 | } 23 | val path = cacheDirPath(dir) 24 | HdfsIO.fs.mkdirs(new Path(path)) 25 | val r = action(path) 26 | synchronized { 27 | inUse -= dir 28 | lastUse = lastUse.updated(dir, Instant.now.toEpochMilli) 29 | } 30 | r 31 | } 32 | 33 | def cacheDir(sourceId: String): String = IOHelper.escapePath(sourceId) 34 
| 35 | def cacheDirPath(cacheDir: String): String = ArchConf.collectionCachePath + "/" + cacheDir 36 | 37 | def cachePath(sourceId: String): String = cacheDirPath(cacheDir(sourceId)) 38 | 39 | def cachePath(sourceId: String, filename: String): String = cachePath(sourceId) + "/" + filename 40 | 41 | def clearCache(): Unit = synchronized { 42 | val fs = HdfsIO.fs 43 | var length = Try(fs.getContentSummary(new Path(ArchConf.collectionCachePath)).getLength) 44 | .getOrElse(0L) 45 | if (length > CacheClearThresholdBytes) { 46 | for (dir <- fs.listStatus(new Path(ArchConf.collectionCachePath)) 47 | if dir.isDirectory) { 48 | val path = dir.getPath 49 | val c = path.getName 50 | if (!inUse.contains(c) && !lastUse.contains(c)) { 51 | val pathLength = fs.getContentSummary(path).getLength 52 | if (fs.delete(path, true)) length -= pathLength 53 | } 54 | } 55 | val toDelete = 56 | lastUse.toSeq 57 | .filter { case (c, _) => !inUse.contains(c) } 58 | .sortBy(_._2) 59 | .map(_._1) 60 | .toIterator 61 | while (length > CacheClearThresholdBytes && toDelete.hasNext) { 62 | val path = new Path(ArchConf.collectionCachePath, toDelete.next) 63 | if (fs.exists(path)) { 64 | val pathLength = fs.getContentSummary(path).getLength 65 | if (fs.delete(path, true)) length -= pathLength 66 | } 67 | } 68 | } 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/io/FileAccessContext.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.io 2 | 3 | import org.archive.webservices.ars.Arch 4 | import org.archive.webservices.ars.model.{ArchConf, LocalArchConf} 5 | import org.archive.webservices.sparkling.io.HdfsIO 6 | 7 | class FileAccessContext( 8 | val conf: ArchConf with Serializable, 9 | val useAitHdfsIO: Boolean = false, 10 | val keyRing: FileAccessKeyRing, 11 | val keyValueCache: Map[String, java.io.Serializable]) 12 | extends Serializable { 13 | @transient lazy val hdfsIO: HdfsIO = if (useAitHdfsIO) aitHdfsIO else HdfsIO 14 | @transient lazy val aitHdfsIOopt: Option[HdfsIO] = 15 | conf.aitCollectionHdfsHostPort 16 | .map { case (host, port) => HdfsIO(host, port) } 17 | 18 | def aitHdfsIO: HdfsIO = aitHdfsIOopt.getOrElse(hdfsIO) 19 | 20 | @transient private var initialized: Boolean = false 21 | def init(): Unit = if (!initialized) { 22 | initialized = true 23 | ArchConf.set(conf) 24 | Arch.initSentry() 25 | } 26 | } 27 | 28 | object FileAccessContext { 29 | var KeyValueCache = Map.empty[String, java.io.Serializable] 30 | 31 | def fromLocalArchConf: FileAccessContext = 32 | new FileAccessContext( 33 | conf = LocalArchConf.instance, 34 | keyRing = FileAccessKeyRing.system, 35 | keyValueCache = KeyValueCache) 36 | 37 | def fromLocalArchConf(alwaysAitHdfsIO: Boolean) = 38 | new FileAccessContext( 39 | conf = LocalArchConf.instance, 40 | useAitHdfsIO = alwaysAitHdfsIO, 41 | keyRing = FileAccessKeyRing.system, 42 | keyValueCache = KeyValueCache) 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/io/FileAccessKeyRing.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.io 2 | 3 | import org.archive.webservices.sparkling.io.IOUtil 4 | 5 | import java.io.File 6 | import scala.collection.JavaConverters._ 7 | 8 | class FileAccessKeyRing private (secrets: Map[String, String]) extends Serializable { 9 | import 
FileAccessKeyRing._ 10 | 11 | def forUrl(url: String): Option[(String, Array[String])] = { 12 | val (protocol, path) = { 13 | val splitAt = url.lastIndexOf(":") 14 | if (splitAt < 0) ("", url) else (url.take(splitAt), url.drop(splitAt + 1)) 15 | } 16 | val secretSplit = path 17 | .split('/') 18 | .find(_.nonEmpty) 19 | .flatMap { host => 20 | secrets.get( 21 | secretKey( 22 | protocol, 23 | (if (host.contains("@")) host.split('@').last else host).split(':').head)) 24 | } 25 | .toArray 26 | .flatMap(_.split(SecretSeparator)) 27 | secretSplit.headOption.filter(SupportedAccessMethods.contains).map((_, secretSplit.drop(1))) 28 | } 29 | } 30 | 31 | object FileAccessKeyRing { 32 | val SecretSeparator = "::" 33 | val AccessMethodS3 = "s3" 34 | val AccessMethodBasic = "basic" 35 | val AccessMethodVault = "vault" 36 | val SupportedAccessMethods = Set(AccessMethodS3, AccessMethodBasic, AccessMethodVault) 37 | val SecretEnvPrefix = "ARCH_SECRET_" 38 | val SecretsFile = ".secrets" 39 | 40 | def secretKey(protocol: String, host: String): String = { 41 | s"${protocol.toUpperCase}_${host.replace('.', '-').toUpperCase}" 42 | } 43 | 44 | def loadEnv: Map[String, String] = { 45 | System.getenv().asScala.toMap.filterKeys(_.startsWith(SecretEnvPrefix)).map { case (k, v) => 46 | k.stripPrefix(SecretEnvPrefix) -> v 47 | } 48 | } 49 | 50 | def loadSecrets: Map[String, String] = { 51 | if (new File(SecretsFile).exists) { 52 | IOUtil 53 | .lines(SecretsFile) 54 | .flatMap { line => 55 | val equalIdx = line.indexOf("=") 56 | if (equalIdx == -1) None 57 | else 58 | Some { 59 | line.take(equalIdx).trim -> line.drop(equalIdx + 1).trim 60 | } 61 | } 62 | .filter { case (k, v) => 63 | k.nonEmpty && v.nonEmpty 64 | } 65 | .toMap 66 | } else Map.empty 67 | } 68 | 69 | lazy val system: FileAccessKeyRing = new FileAccessKeyRing(loadEnv ++ loadSecrets) 70 | 71 | def forUrl(url: String): Option[(String, Array[String])] = system.forUrl(url) 72 | } 73 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/io/FilePointer.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.io 2 | 3 | import org.archive.webservices.ars.io.FilePointer.{DefaultSource, SourceSeparator} 4 | import org.archive.webservices.sparkling.util.StringUtil 5 | 6 | case class FilePointer(url: String, filename: String) { 7 | private lazy val sourcePathSplit = { 8 | val splitAt = StringUtil.prefixBySeparator(url, "/").lastIndexOf(SourceSeparator) 9 | if (splitAt < 0) (DefaultSource, url) else (url.take(splitAt), url.drop(splitAt + 1)) 10 | } 11 | 12 | def source: String = sourcePathSplit._1 13 | def path: String = sourcePathSplit._2 14 | 15 | def relative(parent: FilePointer): FilePointer = { 16 | if (source.isEmpty) { 17 | val splitAt = parent.url.lastIndexOf('/') 18 | if (splitAt < 0) this 19 | else { 20 | FilePointer(IOHelper.concatPaths(parent.url.take(splitAt), url), filename) 21 | } 22 | } else this 23 | } 24 | } 25 | 26 | object FilePointer { 27 | val SourceSeparator = ":" 28 | val DefaultSource = "hdfs" 29 | 30 | def fromUrl(url: String): FilePointer = { 31 | val lastSlashIdx = url.lastIndexOf('/') 32 | if (lastSlashIdx < 0) { 33 | val sourceSeparatorIdx = url.lastIndexOf(SourceSeparator) 34 | if (sourceSeparatorIdx < 0) FilePointer(url, url) 35 | else FilePointer(url, url.drop(sourceSeparatorIdx + 1)) 36 | } else FilePointer(url, url.drop(lastSlashIdx + 1)) 37 | } 38 | } 39 | 
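FilePointer above encodes dataset file locations as an optional `<source>:` prefix in front of the path, falling back to `hdfs` when no prefix is present. An illustration, not part of the repository; it assumes `StringUtil.prefixBySeparator` returns the text before the first `/`, and uses `s3` (one of the access methods named in FileAccessKeyRing) purely as an example source label:

```scala
import org.archive.webservices.ars.io.FilePointer

object FilePointerExample extends App {
  val local = FilePointer.fromUrl("/user/arch/in/example.warc.gz")
  println(local.source)   // "hdfs" -- DefaultSource, since no source prefix is given
  println(local.filename) // "example.warc.gz"

  val remote = FilePointer.fromUrl("s3:some-bucket/path/example.warc.gz")
  println(remote.source)  // "s3"
  println(remote.path)    // "some-bucket/path/example.warc.gz"
}
```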
-------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/io/MemoryCompressor.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.io 2 | 3 | import org.apache.commons.compress.compressors.bzip2.{BZip2CompressorInputStream, BZip2CompressorOutputStream} 4 | import org.archive.webservices.sparkling.io.ByteArray 5 | 6 | import java.io.{ByteArrayOutputStream, EOFException, InputStream} 7 | 8 | object MemoryCompressor { 9 | val BufferSize = 1024 10 | 11 | def compress(in: InputStream): ByteArray = { 12 | val array = new ByteArray 13 | val buffer = new Array[Byte](BufferSize) 14 | val out = new ByteArrayOutputStream() 15 | val compressor = new BZip2CompressorOutputStream(out) 16 | try { 17 | var read = in.read(buffer) 18 | while (read != -1) { 19 | if (read > 0) compressor.write(buffer, 0, read) 20 | if (out.size > BufferSize) { 21 | array.append(out.toByteArray) 22 | out.reset() 23 | } 24 | read = in.read(buffer) 25 | } 26 | } catch { 27 | case _: EOFException => // ignore EOF / break loop 28 | } 29 | compressor.close() 30 | array.append(out.toByteArray) 31 | array 32 | } 33 | 34 | def decompress(array: ByteArray): InputStream = 35 | new BZip2CompressorInputStream(array.toInputStream) 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/ArchCollectionInfo.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model 2 | 3 | import _root_.io.circe.parser._ 4 | import _root_.io.circe.syntax._ 5 | import io.circe.Json 6 | import org.archive.webservices.ars.processing.{DerivationJobConf, JobManager} 7 | import org.archive.webservices.sparkling.io.HdfsIO 8 | import org.scalatra.guavaCache.GuavaCache 9 | 10 | import java.time.Instant 11 | import scala.collection.immutable.ListMap 12 | 13 | case class ArchCollectionInfo private ( 14 | collectionId: String, 15 | file: String, 16 | lastJob: Option[(String, Boolean, Long)] = None) { 17 | def lastJobId: Option[String] = lastJob.map(_._1) 18 | def lastJobSample: Option[Boolean] = lastJob.map(_._2) 19 | def lastJobTime: Option[Instant] = lastJob.map(_._3).map(Instant.ofEpochSecond) 20 | def lastJobName: Option[String] = lastJobId.flatMap(JobManager.jobs.get).map { job => 21 | job.name + (if (lastJobSample.getOrElse(false)) ArchCollectionInfo.SampleNameSuffix else "") 22 | } 23 | 24 | def setLastJob(id: String, sample: Boolean, time: Instant): ArchCollectionInfo = { 25 | copy(collectionId, file, Some(id, sample, time.getEpochSecond)) 26 | } 27 | 28 | def save(): Unit = { 29 | GuavaCache.put(ArchCollectionInfo.CachePrefix + collectionId, this, None) 30 | HdfsIO.writeLines( 31 | file, 32 | Seq((ListMap.empty[String, Json] ++ { 33 | lastJob.toSeq.flatMap { case (id, sample, time) => 34 | Seq( 35 | "lastJobId" -> id.asJson, 36 | "lastJobSample" -> sample.asJson, 37 | "lastJobEpoch" -> time.asJson) 38 | } 39 | }).asJson.spaces4), 40 | overwrite = true) 41 | } 42 | } 43 | 44 | object ArchCollectionInfo { 45 | val Charset = "utf-8" 46 | val CachePrefix = "collection-info#" 47 | val SampleNameSuffix = " (Sample)" 48 | 49 | def get(collectionId: String): Option[ArchCollectionInfo] = { 50 | GuavaCache.get(CachePrefix + collectionId).orElse { 51 | ArchCollection.get(collectionId).map { c => 52 | val file = DerivationJobConf.collectionOutPath(c) + "/info.json" 53 | val 
globalFile = DerivationJobConf.collectionOutPath(c, global = true) + "/info.json" 54 | val lastJob = Seq(file, globalFile) 55 | .filter(HdfsIO.exists) 56 | .flatMap { inFile => 57 | parse(HdfsIO.lines(inFile).mkString).right.toOption.map(_.hcursor).flatMap { cursor => 58 | cursor.get[Long]("lastJobEpoch").toOption.flatMap { epoch => 59 | cursor 60 | .get[String]("lastJobId") 61 | .toOption 62 | .map { id => 63 | (id, cursor.get[Boolean]("lastJobSample").getOrElse(false), epoch) 64 | } 65 | .orElse { 66 | cursor.get[String]("lastJobName").toOption.flatMap { name => 67 | JobManager.nameLookup.get(name.stripSuffix(SampleNameSuffix)).map { job => 68 | (job.id, name.endsWith(SampleNameSuffix), epoch) 69 | } 70 | } 71 | } 72 | } 73 | } 74 | } 75 | .sortBy(-_._3) 76 | .headOption 77 | val info = ArchCollectionInfo(collectionId, file, lastJob) 78 | GuavaCache.put(CachePrefix + collectionId, info, None) 79 | } 80 | } 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/ArchCollectionStats.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model 2 | 3 | case class ArchCollectionStats(size: Long, seeds: Long = -1, lastCrawlDate: String = "") 4 | 5 | object ArchCollectionStats { 6 | val Empty: ArchCollectionStats = ArchCollectionStats(-1) 7 | } 8 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/ArchJobCategories.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model 2 | 3 | object ArchJobCategories { 4 | val None = ArchJobCategory("", "") 5 | 6 | val BinaryInformation = ArchJobCategory( 7 | "File Formats", 8 | "Find, describe, and use the files contained within a web archive, based on their format.") 9 | 10 | val Collection = ArchJobCategory( 11 | "Collection", 12 | "Discover domain-related patterns and high level information about the documents in a web archive.") 13 | 14 | val Network = ArchJobCategory("Network", "Explore connections in a web archive visually.") 15 | 16 | val Text = ArchJobCategory("Text", "Extract and analyze a web archive as text.") 17 | 18 | val System = 19 | ArchJobCategory("System", "Internal system jobs that are not meant to be exposed to users.") 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/ArchJobCategory.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model 2 | 3 | case class ArchJobCategory(name: String, description: String) { 4 | override def toString: String = name 5 | } 6 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/ArchJobInstanceInfo.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model 2 | 3 | import _root_.io.circe.syntax._ 4 | import io.circe.Json 5 | import io.circe.parser.parse 6 | import org.archive.webservices.ars.processing.DerivationJobConf 7 | import org.archive.webservices.sparkling.io.HdfsIO 8 | import org.scalatra.guavaCache.GuavaCache 9 | 10 | import java.time.Instant 11 | import scala.collection.immutable.ListMap 12 | 13 | class ArchJobInstanceInfo private () { 14 | var uuid: 
Option[String] = None 15 | var conf: Option[DerivationJobConf] = None 16 | var started: Option[Instant] = None 17 | var finished: Option[Instant] = None 18 | 19 | def toJson: Json = { 20 | (ListMap(uuid.map("uuid" -> _.asJson).toSeq: _*) ++ { 21 | conf.map("conf" -> _.toJson) 22 | } ++ { 23 | started.map("started" -> _.getEpochSecond.asJson) 24 | } ++ { 25 | finished.map("finished" -> _.getEpochSecond.asJson) 26 | }).asJson 27 | } 28 | 29 | def save(jobOutPath: String): Unit = { 30 | val file = ArchJobInstanceInfo.infoFile(jobOutPath) 31 | GuavaCache.put(ArchJobInstanceInfo.CachePrefix + file, this, None) 32 | HdfsIO.writeLines(file, Seq(toJson.spaces4), overwrite = true) 33 | } 34 | } 35 | 36 | object ArchJobInstanceInfo { 37 | val Charset = "utf-8" 38 | val CachePrefix = "job-instance-info#" 39 | val InfoFile = "info.json" 40 | 41 | def infoFile(jobOutPath: String): String = jobOutPath + s"/$InfoFile" 42 | 43 | def apply(jobOutPath: String): ArchJobInstanceInfo = { 44 | val file = infoFile(jobOutPath) 45 | GuavaCache.get(CachePrefix + file).getOrElse { 46 | val info = if (HdfsIO.exists(file)) { 47 | parse(HdfsIO.lines(file).mkString).right.toOption.map(_.hcursor) match { 48 | case Some(cursor) => 49 | val info = new ArchJobInstanceInfo() 50 | info.uuid = cursor.get[String]("uuid").toOption 51 | info.conf = cursor.downField("conf").focus.flatMap(DerivationJobConf.fromJson) 52 | info.started = cursor.get[Long]("started").toOption.map(Instant.ofEpochSecond) 53 | info.finished = cursor.get[Long]("finished").toOption.map(Instant.ofEpochSecond) 54 | info 55 | case None => new ArchJobInstanceInfo() 56 | } 57 | } else new ArchJobInstanceInfo() 58 | GuavaCache.put(CachePrefix + file, info, None) 59 | } 60 | } 61 | 62 | def inMemory: ArchJobInstanceInfo = new ArchJobInstanceInfo() 63 | } 64 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/DerivativeOutput.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model 2 | 3 | import _root_.io.circe._ 4 | import _root_.io.circe.syntax._ 5 | import org.apache.hadoop.fs.Path 6 | import org.archive.webservices.ars.io.IOHelper 7 | import org.archive.webservices.ars.processing.DerivationJobInstance 8 | import org.archive.webservices.ars.util.FormatUtil 9 | import org.archive.webservices.sparkling.io.HdfsIO 10 | import org.archive.webservices.sparkling.util.{DigestUtil, StringUtil} 11 | 12 | import java.io.{BufferedInputStream, FileInputStream, InputStream} 13 | import java.time.Instant 14 | import scala.util.Try 15 | 16 | trait DerivativeOutput { 17 | def filename: String 18 | def dir: String 19 | def fileType: String 20 | def mimeType: String 21 | def downloadName: String 22 | def size: Long 23 | def time: Long 24 | def lineCount: Long 25 | def checksums: Map[String, String] 26 | def prefixDownload(prefix: String): DerivativeOutput 27 | 28 | lazy val path: String = dir + "/" + filename 29 | 30 | lazy val sizeStr: String = IOHelper.sizeStr(path) 31 | 32 | lazy val timeStr: String = FormatUtil.instantTimeString(Instant.ofEpochMilli(time)) 33 | 34 | lazy val lineCountStr: Option[String] = 35 | if (lineCount < 0) None else Some(StringUtil.formatNumber(lineCount, 0)) 36 | 37 | lazy val accessToken: String = DigestUtil.sha1Base32(filename + size + time) 38 | 39 | def prefixDownload(instance: DerivationJobInstance): DerivativeOutput = { 40 | val timestamp = 
instance.info.finished.map(IOHelper.pathTimestamp).map(_ + "_") 41 | prefixDownload(instance.conf.inputSpec.id + "_" + timestamp.getOrElse("")) 42 | } 43 | } 44 | 45 | case class DerivativeOutputFile( 46 | filename: String, 47 | dir: String, 48 | fileType: String, 49 | mimeType: String, 50 | downloadName: String) 51 | extends DerivativeOutput { 52 | import DerivativeOutput._ 53 | 54 | lazy val (size, time) = Try { 55 | val status = HdfsIO.fs.getFileStatus(new Path(path)) 56 | (status.getLen, status.getModificationTime) 57 | }.getOrElse((0L, 0L)) 58 | 59 | lazy val lineCount: Long = { 60 | val p = path + LineCountFileSuffix 61 | if (HdfsIO.exists(p)) Try(HdfsIO.lines(p).head.toLong).getOrElse(-1) 62 | else -1 63 | } 64 | 65 | lazy val checksums: Map[String, String] = { 66 | val p = path + ChecksumsFileSuffix 67 | if (HdfsIO.exists(p)) 68 | parser 69 | .decode[Map[String, String]](HdfsIO.lines(p).mkString) 70 | .right 71 | .toOption 72 | .getOrElse(Map.empty) 73 | else Map.empty 74 | } 75 | 76 | def prefixDownload(prefix: String): DerivativeOutput = 77 | copy(downloadName = IOHelper.escapePath(prefix) + filename) 78 | } 79 | 80 | object DerivativeOutput { 81 | val LineCountFileSuffix = "_linecount" 82 | val ChecksumsFileSuffix = ".checksums" 83 | 84 | def apply( 85 | filename: String, 86 | dir: String, 87 | fileType: String, 88 | mimeType: String): DerivativeOutputFile = { 89 | DerivativeOutputFile(filename, dir, fileType, mimeType, filename) 90 | } 91 | 92 | def hashFile(in: InputStream): Map[String, String] = Map("md5" -> DigestUtil.md5Hex(in)) 93 | 94 | def hashFile(in: InputStream, hdfsPath: String): Unit = 95 | HdfsIO.writeLines( 96 | hdfsPath + ChecksumsFileSuffix, 97 | Seq(hashFile(in).asJson.spaces4), 98 | overwrite = true) 99 | 100 | def hashFileLocal(localPath: String, hdfsPath: String): Unit = { 101 | val in = new BufferedInputStream(new FileInputStream(localPath)) 102 | try { 103 | hashFile(in, hdfsPath) 104 | } finally { 105 | in.close() 106 | } 107 | } 108 | 109 | def hashFileHdfs(hdfsPath: String): Unit = 110 | HdfsIO.access(hdfsPath, decompress = false)(hashFile(_, hdfsPath)) 111 | } 112 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/DerivativeOutputCache.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model 2 | 3 | import _root_.io.circe.parser 4 | import _root_.io.circe.syntax._ 5 | import org.archive.webservices.ars.io.IOHelper 6 | import org.archive.webservices.sparkling.io.HdfsIO 7 | 8 | import scala.collection.immutable.ListMap 9 | 10 | case class DerivativeOutputCache(count: Int, size: Long, files: Iterator[DerivativeOutput]) 11 | 12 | object DerivativeOutputCache { 13 | case class CachedDerivativeOutput( 14 | filename: String, 15 | dir: String, 16 | fileType: String, 17 | mimeType: String, 18 | downloadName: String, 19 | size: Long, 20 | time: Long, 21 | lineCount: Long, 22 | checksums: Map[String, String]) 23 | extends DerivativeOutput { 24 | override def prefixDownload(prefix: String): DerivativeOutput = 25 | copy(downloadName = IOHelper.escapePath(prefix) + filename) 26 | } 27 | 28 | def parseLine(line: String): Option[DerivativeOutput] = { 29 | parser.parse(line).toOption.map(_.hcursor).flatMap { cursor => 30 | for { 31 | filename <- cursor.get[String]("filename").toOption 32 | dir <- cursor.get[String]("dir").toOption 33 | fileType <- cursor.get[String]("fileType").toOption 34 | mimeType <- 
cursor.get[String]("mimeType").toOption 35 | size <- cursor.get[Long]("size").toOption 36 | time <- cursor.get[Long]("time").toOption 37 | lineCount <- cursor.get[Long]("lineCount").toOption 38 | checksums = { 39 | val checksums = cursor.downField("checksums") 40 | checksums.keys.toIterator.flatten.flatMap { key => 41 | checksums.get[String](key).toOption.map(key -> _) 42 | }.toMap 43 | } 44 | } yield CachedDerivativeOutput( 45 | filename, 46 | dir, 47 | fileType, 48 | mimeType, 49 | filename, 50 | size, 51 | time, 52 | lineCount, 53 | checksums) 54 | } 55 | } 56 | 57 | def parse(cacheFile: String): Option[DerivativeOutputCache] = { 58 | val lines = HdfsIO.iterLines(cacheFile) 59 | if (lines.hasNext) { 60 | val metadata = lines.next() 61 | parser.parse(metadata).toOption.map(_.hcursor).flatMap { cursor => 62 | for { 63 | count <- cursor.get[Int]("count").toOption 64 | size <- cursor.get[Long]("size").toOption 65 | } yield DerivativeOutputCache( 66 | count, 67 | size, { 68 | lines.flatMap(parseLine) 69 | }) 70 | } 71 | } else None 72 | } 73 | 74 | def write(files: Iterator[DerivativeOutput], cacheFile: String): Unit = { 75 | val tmpFile = cacheFile + ".tmp" 76 | var count = 0 77 | var size = 0L 78 | HdfsIO.writeLines( 79 | path = tmpFile, { 80 | files.map { file => 81 | count += 1 82 | size += file.size 83 | ListMap( 84 | "filename" -> file.filename.asJson, 85 | "dir" -> file.dir.asJson, 86 | "fileType" -> file.fileType.asJson, 87 | "mimeType" -> file.mimeType.asJson, 88 | "size" -> file.size.asJson, 89 | "time" -> file.time.asJson, 90 | "lineCount" -> file.lineCount.asJson, 91 | "checksums" -> file.checksums.asJson).asJson.noSpaces 92 | } 93 | }, 94 | overwrite = true) 95 | HdfsIO.writeLines( 96 | cacheFile, { 97 | Iterator(ListMap("count" -> count.asJson, "size" -> size.asJson).asJson.noSpaces) ++ { 98 | HdfsIO.iterLines(tmpFile) 99 | } 100 | }, 101 | overwrite = true) 102 | HdfsIO.delete(tmpFile) 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/api/AvailableJob.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.api 2 | 3 | import org.archive.webservices.ars.model.PublishedDatasets 4 | import org.archive.webservices.ars.processing.DerivationJob 5 | import org.scalatra.swagger.annotations.ApiModelProperty 6 | 7 | case class AvailableJob( 8 | @ApiModelProperty(description = "Unique job type identifier") uuid: String, 9 | name: String, 10 | description: String, 11 | @ApiModelProperty(description = "Whether the job output is publishable to archive.org") 12 | publishable: Boolean, 13 | @ApiModelProperty(description = "Whether the job is internal/non-user-facing use only") 14 | internal: Boolean, 15 | @ApiModelProperty(description = "A link to the job source code") codeUrl: String, 16 | @ApiModelProperty(description = "A link to information about the job") infoUrl: String) 17 | extends ApiResponseObject[AvailableJob] 18 | 19 | object AvailableJob { 20 | def apply(job: DerivationJob, isInternal: Boolean): AvailableJob = 21 | AvailableJob( 22 | uuid = job.uuid.toString, 23 | name = job.name, 24 | description = job.description, 25 | publishable = (!PublishedDatasets.ProhibitedJobs.contains(job)), 26 | internal = isInternal, 27 | codeUrl = job.codeUrl, 28 | infoUrl = job.infoUrl) 29 | } 30 | -------------------------------------------------------------------------------- 
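// Illustrative sketch (not a file in this repository): how AvailableJob values might be
// grouped into the AvailableJobsCategory responses defined below. `allJobs` stands in for
// whatever registry supplies the DerivationJob instances (e.g. JobManager); its name and
// shape here are assumptions, not the actual API.
//
//   def availableJobs(allJobs: Seq[DerivationJob]): Seq[AvailableJobsCategory] =
//     allJobs
//       .groupBy(_.category)
//       .toSeq
//       .map { case (category, jobs) => AvailableJobsCategory(category, jobs) }
--------------------------------------------------------------------------------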
/src/main/scala/org/archive/webservices/ars/model/api/AvailableJobsCategory.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.api 2 | 3 | import org.archive.webservices.ars.model.{ArchJobCategories, ArchJobCategory} 4 | import org.archive.webservices.ars.processing.DerivationJob 5 | 6 | case class AvailableJobsCategory( 7 | categoryName: String, 8 | categoryDescription: String, 9 | jobs: Seq[AvailableJob]) 10 | extends ApiResponseObject[AvailableJobsCategory] 11 | 12 | object AvailableJobsCategory { 13 | def apply(category: ArchJobCategory, jobs: Seq[DerivationJob]): AvailableJobsCategory = { 14 | val isInternal = (category == ArchJobCategories.System) 15 | AvailableJobsCategory( 16 | categoryName = category.name, 17 | categoryDescription = category.description, 18 | jobs = jobs.map(j => AvailableJob.apply(j, isInternal))) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/api/Collection.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.api 2 | 3 | import io.circe.Json 4 | import io.circe.syntax._ 5 | import org.archive.webservices.ars.model.app.RequestContext 6 | import org.archive.webservices.ars.model.collections.CustomCollectionSpecifics 7 | import org.archive.webservices.ars.model.{ArchCollection, ArchCollectionInfo} 8 | import org.archive.webservices.ars.processing.jobs.system.UserDefinedQuery 9 | import org.archive.webservices.ars.util.FormatUtil 10 | 11 | case class Collection( 12 | id: String, 13 | name: String, 14 | public: Boolean, 15 | size: String, 16 | sortSize: Long, 17 | seeds: Long, 18 | lastCrawlDate: Option[String], 19 | lastJobId: Option[String], 20 | lastJobSample: Option[java.lang.Boolean], 21 | lastJobName: Option[String], 22 | lastJobTime: Option[String], 23 | params: Option[Json]) 24 | extends ApiResponseObject[Collection] 25 | 26 | object Collection { 27 | private def params(collection: ArchCollection): Option[Json] = 28 | if (collection.specifics.isInstanceOf[CustomCollectionSpecifics]) 29 | Some( 30 | UserDefinedQuery 31 | .parseInfo(CustomCollectionSpecifics.path(collection.id).get) 32 | .get 33 | .top 34 | .get 35 | .asObject 36 | .get 37 | .filterKeys(k => k != "name" && k != "size") 38 | .asJson) 39 | else None 40 | 41 | def apply(collection: ArchCollection)(implicit context: RequestContext): Collection = { 42 | val info = ArchCollectionInfo.get(collection.id) 43 | Collection( 44 | id = collection.id, 45 | name = collection.name, 46 | public = collection.public, 47 | size = FormatUtil.formatBytes(collection.stats.size), 48 | sortSize = collection.stats.size, 49 | seeds = collection.stats.seeds, 50 | lastCrawlDate = Option(collection.stats.lastCrawlDate).filter(_.nonEmpty), 51 | lastJobId = info.flatMap(_.lastJobId), 52 | lastJobSample = info.flatMap(_.lastJobSample).map(Boolean.box), 53 | lastJobName = info.flatMap(_.lastJobName), 54 | lastJobTime = info.flatMap(_.lastJobTime).map(FormatUtil.instantTimeString), 55 | params = params(collection)) 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/api/Dataset.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.api 2 | 3 | import 
org.archive.webservices.ars.model.ArchCollection 4 | import org.archive.webservices.ars.model.app.RequestContext 5 | import org.archive.webservices.ars.processing.DerivationJobInstance 6 | import org.archive.webservices.ars.util.{DatasetUtil, FormatUtil} 7 | 8 | case class Dataset( 9 | id: String, 10 | collectionId: String, 11 | collectionName: String, 12 | isSample: Boolean, 13 | jobId: String, 14 | category: String, 15 | name: String, 16 | sample: Int, 17 | state: String, 18 | startTime: Option[String], 19 | finishedTime: Option[String]) 20 | extends ApiResponseObject[Dataset] 21 | 22 | object Dataset { 23 | def apply(collection: ArchCollection, jobInstance: DerivationJobInstance)(implicit 24 | context: RequestContext): Dataset = { 25 | Dataset( 26 | id = DatasetUtil.formatId(collection.id, jobInstance), 27 | collectionId = collection.id, 28 | collectionName = collection.name, 29 | isSample = jobInstance.conf.isSample, 30 | jobId = jobInstance.job.uuid.toString, 31 | category = jobInstance.job.category.name, 32 | name = jobInstance.job.name, 33 | sample = jobInstance.conf.sample, 34 | state = jobInstance.stateStr, 35 | startTime = jobInstance.info.started.map(FormatUtil.instantTimeString), 36 | finishedTime = jobInstance.info.finished.map(FormatUtil.instantTimeString)) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/api/DatasetFile.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.api 2 | 3 | import org.archive.webservices.ars.model.DerivativeOutput 4 | import org.archive.webservices.ars.util.FormatUtil 5 | 6 | import java.time.Instant 7 | 8 | case class DatasetFile( 9 | filename: String, 10 | sizeBytes: Long, 11 | mimeType: String, 12 | lineCount: Long, 13 | fileType: String, 14 | creationTime: String, 15 | md5Checksum: Option[String], 16 | accessToken: String) 17 | extends ApiResponseObject[DatasetFile] 18 | 19 | object DatasetFile { 20 | def apply(derivOut: DerivativeOutput): DatasetFile = 21 | DatasetFile( 22 | filename = derivOut.filename, 23 | sizeBytes = derivOut.size, 24 | lineCount = derivOut.lineCount, 25 | mimeType = derivOut.mimeType, 26 | fileType = derivOut.fileType, 27 | creationTime = FormatUtil.instantTimeString(Instant.ofEpochMilli(derivOut.time)), 28 | md5Checksum = derivOut.checksums.get("md5"), 29 | accessToken = derivOut.accessToken) 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/api/InputSpec.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.api 2 | 3 | case class InputSpec( 4 | `type`: String, 5 | collectionId: Option[String], 6 | specs: Option[Seq[InputSpec]], 7 | inputType: Option[String], 8 | uuid: Option[String]) 9 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/api/JobState.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.api 2 | 3 | import org.archive.webservices.ars.processing.{DerivationJobInstance, ProcessingState} 4 | import org.archive.webservices.ars.util.FormatUtil 5 | 6 | case class JobState( 7 | id: String, 8 | uuid: String, 9 | name: String, 10 | sample: Int, 11 | state: String, 12 | started: Boolean, 13 | finished: 
Boolean, 14 | failed: Boolean, 15 | activeStage: String, 16 | activeState: String, 17 | queue: Option[String], 18 | queuePos: Option[ 19 | Integer 20 | ], // Option type lookup fails for Int in package.scala, so use Integer 21 | startTime: Option[String], 22 | finishedTime: Option[String]) 23 | extends ApiResponseObject[JobState] 24 | 25 | object JobState { 26 | def apply(instance: DerivationJobInstance): JobState = { 27 | val active = instance.active 28 | val info = instance.info 29 | JobState( 30 | id = instance.job.id, 31 | uuid = instance.uuid, 32 | name = instance.job.name, 33 | sample = instance.conf.sample, 34 | state = instance.stateStr, 35 | started = (instance.state != ProcessingState.NotStarted), 36 | finished = (instance.state == ProcessingState.Finished), 37 | failed = (instance.state == ProcessingState.Failed), 38 | activeStage = active.job.stage, 39 | activeState = active.stateStr, 40 | queue = active.queue.map(_.name), 41 | queuePos = active.queue.map(q => active.queueIndex), 42 | startTime = info.started.map(FormatUtil.instantTimeString), 43 | finishedTime = info.finished.map(FormatUtil.instantTimeString)) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/api/WasapiResponse.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.api 2 | 3 | case class WasapiResponseFile( 4 | filename: String, 5 | filetype: String, 6 | checksums: Map[String, String], 7 | locations: Seq[String], 8 | size: Long, 9 | collection: Option[String]) 10 | extends ApiResponseObject[WasapiResponseFile] 11 | 12 | case class WasapiResponse( 13 | count: Int, 14 | next: Option[String], 15 | previous: Option[String], 16 | files: Seq[WasapiResponseFile]) 17 | extends ApiResponseObject[WasapiResponse] 18 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/app/RequestContext.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.app 2 | 3 | import org.archive.webservices.ars.model.users.ArchUser 4 | 5 | import javax.servlet.http.HttpServletRequest 6 | 7 | class RequestContext private ( 8 | val request: Option[HttpServletRequest], 9 | val loggedIn: ArchUser, 10 | val user: ArchUser) { 11 | def isRequest: Boolean = request.nonEmpty 12 | def isInternal: Boolean = !isRequest 13 | def isUser: Boolean = loggedIn.isUser 14 | def loggedInOpt: Option[ArchUser] = loggedIn.option 15 | def userOpt: Option[ArchUser] = user.option 16 | def isAdmin: Boolean = loggedIn.isAdmin 17 | def forRequest[R](action: HttpServletRequest => Option[R]): Option[R] = request.flatMap(action) 18 | } 19 | 20 | object RequestContext { 21 | val None: RequestContext = new RequestContext(scala.None, ArchUser.None, ArchUser.None) 22 | 23 | def apply( 24 | request: Option[HttpServletRequest], 25 | loggedIn: ArchUser, 26 | user: ArchUser): RequestContext = { 27 | new RequestContext(request, loggedIn, user) 28 | } 29 | def apply(loggedIn: ArchUser, user: ArchUser)(implicit 30 | request: HttpServletRequest): RequestContext = { 31 | RequestContext(Some(request), loggedIn, user) 32 | } 33 | def apply(user: ArchUser)(implicit request: HttpServletRequest): RequestContext = { 34 | RequestContext(user, user) 35 | } 36 | def apply(user: Option[ArchUser])(implicit request: HttpServletRequest): RequestContext = { 37 | 
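// Resolve the optional user, falling back to the anonymous ArchUser.None sentinel.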
RequestContext(user.getOrElse(ArchUser.None)) 38 | } 39 | 40 | def apply(request: HttpServletRequest): RequestContext = { 41 | new RequestContext(Some(request), ArchUser.None, ArchUser.None) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/CollectionSpecifics.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.archive.webservices.ars.io.{FileAccessContext, FilePointer, WebArchiveLoader} 5 | import org.archive.webservices.ars.model.app.RequestContext 6 | import org.archive.webservices.ars.model.{ArchCollection, ArchCollectionStats} 7 | import org.archive.webservices.ars.processing.DerivationJobConf 8 | import org.archive.webservices.sparkling.cdx.CdxRecord 9 | 10 | import java.io.InputStream 11 | 12 | abstract class CollectionSpecifics { 13 | def id: String 14 | def inputPath: String 15 | def sourceId: String = id 16 | 17 | def collection(implicit context: RequestContext = RequestContext.None): Option[ArchCollection] 18 | def stats(implicit context: RequestContext = RequestContext.None): ArchCollectionStats 19 | def inputSize(conf: DerivationJobConf): Long = conf.inputSpec.collection.stats.size 20 | def loadWarcFiles[R](inputPath: String)(action: RDD[(FilePointer, InputStream)] => R): R 21 | 22 | def loadCdx[R](inputPath: String)(action: RDD[CdxRecord] => R): R = loadWarcFiles(inputPath) { 23 | rdd => 24 | action(WebArchiveLoader.loadCdxFromWarcGzStreams(rdd)) 25 | } 26 | 27 | def randomAccess( 28 | context: FileAccessContext, 29 | inputPath: String, 30 | pointer: FilePointer, 31 | offset: Long, 32 | positions: Iterator[(Long, Long)]): InputStream 33 | } 34 | 35 | object CollectionSpecifics { 36 | def get(id: String): Option[CollectionSpecifics] = { 37 | ArchCollection.prefix(id).map { 38 | case AitCollectionSpecifics.Prefix => new AitCollectionSpecifics(id) 39 | case SpecialCollectionSpecifics.Prefix => new SpecialCollectionSpecifics(id) 40 | case CustomCollectionSpecifics.Prefix => new CustomCollectionSpecifics(id) 41 | case UnionCollectionSpecifics.Prefix => new UnionCollectionSpecifics(id) 42 | case FileCollectionSpecifics.Prefix => new FileCollectionSpecifics(id) 43 | } 44 | } 45 | 46 | def pointer(sourceId: String, filename: String): FilePointer = 47 | FilePointer(sourceId + FilePointer.SourceSeparator + filename, filename) 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/GenericRandomAccess.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections 2 | 3 | import org.archive.webservices.ars.io.{FileAccessContext, FilePointer, RandomFileAccess} 4 | 5 | import java.io.InputStream 6 | 7 | trait GenericRandomAccess { 8 | def randomAccess( 9 | context: FileAccessContext, 10 | inputPath: String, 11 | pointer: FilePointer, 12 | offset: Long, 13 | positions: Iterator[(Long, Long)]): InputStream = 14 | RandomFileAccess.access(context, pointer, offset, positions) 15 | } 16 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/UnionCollectionSpecifics.scala: -------------------------------------------------------------------------------- 1 | package 
org.archive.webservices.ars.model.collections 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.archive.webservices.ars.io.FilePointer 5 | import org.archive.webservices.ars.model.app.RequestContext 6 | import org.archive.webservices.ars.model.{ArchCollection, ArchCollectionStats} 7 | import org.archive.webservices.ars.processing.{DerivationJobConf, DerivationJobParameters} 8 | import org.archive.webservices.sparkling.cdx.CdxRecord 9 | import org.archive.webservices.sparkling.util.RddUtil 10 | 11 | import java.io.InputStream 12 | import scala.reflect.ClassTag 13 | 14 | class UnionCollectionSpecifics(val id: String) 15 | extends CollectionSpecifics 16 | with GenericRandomAccess { 17 | val (userId, collectionId) = 18 | ArchCollection.splitIdUserCollection(id.stripPrefix(UnionCollectionSpecifics.Prefix)) 19 | 20 | def inputPath: String = "" 21 | 22 | def collection(implicit 23 | context: RequestContext = RequestContext.None): Option[ArchCollection] = { 24 | Some( 25 | ArchCollection( 26 | id, 27 | collectionId, 28 | public = false, 29 | userId.map((_, UnionCollectionSpecifics.Prefix + collectionId)), 30 | sourceId)) 31 | } 32 | 33 | override def stats(implicit 34 | context: RequestContext = RequestContext.None): ArchCollectionStats = 35 | ArchCollectionStats.Empty 36 | 37 | override def inputSize(conf: DerivationJobConf): Long = { 38 | UnionCollectionSpecifics 39 | .collections(conf.params) 40 | .map(_.specifics.inputSize(conf)) 41 | .filter(_ > -1) 42 | .sum 43 | } 44 | 45 | private def loadUnion[A: ClassTag, R]( 46 | inputPath: String, 47 | load: CollectionSpecifics => (RDD[A] => R) => R)(action: RDD[A] => R): R = { 48 | def union(rdd: RDD[A], remaining: Seq[CollectionSpecifics], numPartitions: Int): R = { 49 | if (remaining.nonEmpty) { 50 | load(remaining.head) { nextRdd => 51 | union(rdd.union(nextRdd), remaining.tail, nextRdd.getNumPartitions.max(numPartitions)) 52 | } 53 | } else action(rdd.coalesce(numPartitions)) 54 | } 55 | val sourceIds = inputPath.split(',').map(_.trim).filter(_.nonEmpty).distinct 56 | union(RddUtil.emptyRDD[A], sourceIds.flatMap(CollectionSpecifics.get), 0) 57 | } 58 | 59 | def loadWarcFiles[R](inputPath: String)(action: RDD[(FilePointer, InputStream)] => R): R = { 60 | loadUnion[(FilePointer, InputStream), R](inputPath, s => s.loadWarcFiles(s.inputPath))(action) 61 | } 62 | 63 | override def loadCdx[R](inputPath: String)(action: RDD[CdxRecord] => R): R = { 64 | loadUnion[CdxRecord, R](inputPath, s => s.loadCdx(s.inputPath))(action) 65 | } 66 | } 67 | 68 | object UnionCollectionSpecifics { 69 | val Prefix = "UNION-" 70 | 71 | def collections(params: DerivationJobParameters)(implicit 72 | context: RequestContext = RequestContext.None): Seq[ArchCollection] = { 73 | params 74 | .get[Array[String]]("input") 75 | .toSeq 76 | .flatten 77 | .distinct 78 | .sorted 79 | .flatMap(ArchCollection.get(_)) 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/ArchCollectionSpecLoader.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.archive.webservices.ars.io.{FilePointer, WebArchiveLoader} 5 | import org.archive.webservices.ars.model.collections.inputspecs.meta.{FileMetaData, FileMetaField} 6 | 7 | import java.io.InputStream 8 | 9 | object ArchCollectionSpecLoader extends InputSpecLoader { 10 | 
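// Streams an ARCH collection's (W)ARC files as FileRecords. Each WarcFileRecord below wraps
// a single input stream and mixes in OneTimeAccess, so its content can be read only once;
// a second call to access throws.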
val specType = "collection" 11 | 12 | class WarcFileRecord(file: FilePointer, val in: InputStream) 13 | extends FileRecord 14 | with OneTimeAccess { 15 | override def filename: String = file.filename 16 | override def mime: String = WebArchiveLoader.WarcMime 17 | override def path: String = 18 | file.url.stripSuffix(file.filename).stripSuffix(FilePointer.SourceSeparator) 19 | override def pointer: FilePointer = file 20 | override lazy val meta: FileMetaData = FileMetaData( 21 | FileMetaField("filename", filename), 22 | FileMetaField("mime", mime), 23 | FileMetaField("path", path)) 24 | } 25 | 26 | override def loadFilesSpark[R](spec: InputSpec)(action: RDD[FileRecord] => R): R = { 27 | spec.collection.specifics.loadWarcFiles(spec.inputPath) { rdd => 28 | action(rdd.map { case (pointer, in) => 29 | new WarcFileRecord(pointer, in) 30 | }) 31 | } 32 | } 33 | 34 | override def size(spec: InputSpec): Long = spec.collection.stats.size 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/CdxQuerySpecLoader.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.archive.webservices.ars.io.WebArchiveLoader 5 | import org.archive.webservices.ars.processing.jobs.system.UserDefinedQuery 6 | 7 | object CdxQuerySpecLoader extends InputSpecLoader { 8 | override def specType: String = "cdx-query" 9 | 10 | def input(spec: InputSpec): InputSpec = { 11 | spec.cursor 12 | .downField("input") 13 | .focus 14 | .map(json => InputSpec(json.hcursor)) 15 | }.getOrElse { 16 | throw new UnsupportedOperationException("No sub spec specified.") 17 | } 18 | 19 | override def size(spec: InputSpec): Long = Some(super.size(spec)).filter(_ != -1).getOrElse { 20 | input(spec).size 21 | } 22 | 23 | override def inputType(spec: InputSpec): Option[String] = Some(InputSpec.InputType.CDX) 24 | 25 | override def loadFilesSpark[R](spec: InputSpec)(action: RDD[FileRecord] => R): R = action({ 26 | for { 27 | query <- spec.params("query") 28 | } yield { 29 | for { 30 | error <- UserDefinedQuery.validateQuery(query) 31 | } throw new RuntimeException(error) 32 | 33 | WebArchiveLoader.loadCdx(input(spec)) { rdd => 34 | val queryBc = rdd.sparkContext.broadcast(query) 35 | rdd.mapPartitionsWithIndex { (idx, partition) => 36 | val cdx = UserDefinedQuery.filterQuery(partition, queryBc.value) 37 | Iterator(InMemoryCdxFileRecord(idx, cdx).asInstanceOf[FileRecord]) 38 | } 39 | } 40 | } 41 | }.getOrElse { 42 | throw new UnsupportedOperationException("missing query") 43 | }) 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/DatasetSpecLoader.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs 2 | import org.apache.spark.rdd.RDD 3 | import org.archive.webservices.ars.io.FileAccessContext 4 | import org.archive.webservices.ars.model.collections.inputspecs.meta.FileMetaData 5 | import org.archive.webservices.sparkling.Sparkling 6 | import org.archive.webservices.sparkling.util.RddUtil 7 | 8 | object DatasetSpecLoader extends InputSpecLoader { 9 | val specType = "dataset" 10 | 11 | override def loadFilesSpark[R](spec: InputSpec)(action: RDD[FileRecord] => R): R = { 12 | 
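// If the dataset can be expressed as a plain file spec, delegate to FileSpecLoader;
// otherwise parallelize the dataset's output files and read them from HDFS via
// HdfsFileRecordFactory with empty per-file metadata.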
spec.toFileSpec 13 | .map { fileSpec => 14 | FileSpecLoader.loadSpark(fileSpec)(action) 15 | } 16 | .getOrElse { 17 | val recordFactoryBc = Sparkling.sc.broadcast(HdfsFileRecordFactory()) 18 | val accessContext = FileAccessContext.fromLocalArchConf 19 | action(RddUtil.parallelize(spec.dataset.outFiles.toSeq).mapPartitions { partition => 20 | accessContext.init() 21 | val recordFactory = recordFactoryBc.value 22 | recordFactory.accessContext = accessContext 23 | val meta = FileMetaData.empty 24 | partition.map { file => 25 | recordFactory.get(file.path, file.mimeType, meta) 26 | } 27 | }) 28 | } 29 | } 30 | 31 | override def size(spec: InputSpec): Long = spec.dataset.outputSize 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/FileRecord.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs 2 | 3 | import org.archive.webservices.ars.io.FilePointer 4 | import org.archive.webservices.ars.model.collections.inputspecs.meta.FileMetaData 5 | 6 | import java.io.InputStream 7 | 8 | trait FileRecord { 9 | def filename: String = filePath.split('/').last 10 | def path: String = { 11 | val slashIdx = filePath.lastIndexOf('/') 12 | if (slashIdx < 0) "" else filePath.take(slashIdx) 13 | } 14 | def filePath: String = FileRecordFactory.filePath(path, filename) 15 | def mime: String 16 | def meta: FileMetaData 17 | def access: InputStream 18 | def pointer: FilePointer = FilePointer(filePath, filename) 19 | 20 | def withAccess(in: InputStream): FileRecord = { 21 | val origin = this 22 | new FileRecord { 23 | override def filename: String = origin.filename 24 | override def mime: String = origin.mime 25 | override def path: String = origin.path 26 | override def meta: FileMetaData = origin.meta 27 | override def access: InputStream = in 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/FileRecordFactory.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs 2 | 3 | import org.archive.webservices.ars.io.{FileAccessContext, IOHelper} 4 | import org.archive.webservices.ars.model.collections.inputspecs.meta.FileMetaData 5 | 6 | import java.io.InputStream 7 | 8 | trait FileRecordFactory extends Serializable { 9 | def companion: FileFactoryCompanion 10 | def dataSourceType: String = companion.dataSourceType 11 | @transient var accessContext: FileAccessContext = 12 | FileAccessContext.fromLocalArchConf 13 | def get(file: String, mime: String, meta: FileMetaData): FileRecord 14 | def accessFile( 15 | file: String, 16 | resolve: Boolean = true, 17 | accessContext: FileAccessContext = accessContext): InputStream 18 | } 19 | 20 | trait FileFactoryCompanion { 21 | def dataSourceType: String 22 | def apply(spec: InputSpec): FileRecordFactory 23 | } 24 | 25 | object FileRecordFactory { 26 | var factories: Seq[FileFactoryCompanion] = Seq( 27 | S3FileRecordFactory, 28 | S3HttpFileRecordFactory, 29 | HttpFileRecordFactory, 30 | HdfsFileRecordFactory, 31 | VaultFileRecordFactory) 32 | 33 | def apply(spec: InputSpec, default: FileFactoryCompanion): FileRecordFactory = { 34 | apply(spec, Some(default)) 35 | } 36 | 37 | def apply(spec: InputSpec, default: Option[FileFactoryCompanion] = 
None): FileRecordFactory = { 38 | spec 39 | .str(InputSpec.DataSourceKey) 40 | .flatMap { dataSource => 41 | factories.find { factory => 42 | factory.dataSourceType == dataSource 43 | } 44 | } 45 | .orElse(default) 46 | .getOrElse { 47 | throw new UnsupportedOperationException() 48 | } 49 | .apply(spec) 50 | } 51 | 52 | def filePath(path: String, filename: String): String = IOHelper.concatPaths(path, filename) 53 | } 54 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/HdfsFileRecordFactory.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs 2 | 3 | import org.archive.webservices.ars.io.FileAccessContext 4 | import org.archive.webservices.ars.model.collections.inputspecs.meta.FileMetaData 5 | import org.archive.webservices.sparkling.io.HdfsIO 6 | 7 | import java.io.{FileNotFoundException, InputStream} 8 | 9 | class HdfsFileRecordFactory private (excludeSuffix: Option[String] = None) 10 | extends FileRecordFactory { 11 | def companion = HdfsFileRecordFactory 12 | 13 | class HdfsFileRecord private[HdfsFileRecordFactory] ( 14 | file: String, 15 | val mime: String, 16 | val meta: FileMetaData) 17 | extends FileRecord { 18 | override lazy val filePath: String = locateFile(file) 19 | override def access: InputStream = accessFile(filePath, resolve = false) 20 | } 21 | 22 | override def get(file: String, mime: String, meta: FileMetaData): FileRecord = { 23 | new HdfsFileRecord(file, mime, meta) 24 | } 25 | 26 | override def accessFile( 27 | file: String, 28 | resolve: Boolean, 29 | accessContext: FileAccessContext = accessContext): InputStream = { 30 | accessContext.hdfsIO.open(if (resolve) locateFile(file) else file) 31 | } 32 | 33 | def locateFile(filePath: String): String = { 34 | if (filePath.contains("*")) { 35 | val files = HdfsIO.files(filePath, recursive = false) 36 | val filtered = 37 | if (excludeSuffix.isEmpty) files else files.filter(!_.endsWith(excludeSuffix.get)) 38 | if (filtered.isEmpty) throw new FileNotFoundException() 39 | filtered.next 40 | } else filePath 41 | } 42 | } 43 | 44 | object HdfsFileRecordFactory extends FileFactoryCompanion { 45 | val dataSourceType: String = "hdfs" 46 | 47 | def apply(spec: InputSpec): HdfsFileRecordFactory = new HdfsFileRecordFactory( 48 | spec.str("metaSuffix")) 49 | 50 | def apply(): HdfsFileRecordFactory = new HdfsFileRecordFactory() 51 | } 52 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/HttpFileRecordFactory.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs 2 | 3 | import org.archive.webservices.ars.io.FileAccessContext 4 | import org.archive.webservices.ars.model.collections.inputspecs.meta.FileMetaData 5 | 6 | import java.io.InputStream 7 | import java.net.URL 8 | 9 | class HttpFileRecordFactory(location: String) extends FileRecordFactory { 10 | def companion: FileFactoryCompanion = HttpFileRecordFactory 11 | 12 | class HttpFileRecord private[HttpFileRecordFactory] ( 13 | file: String, 14 | val mime: String, 15 | val meta: FileMetaData) 16 | extends FileRecord { 17 | override lazy val filePath: String = locateFile(file) 18 | override def access: InputStream = accessFile(filePath, resolve = false) 19 | } 20 | 21 | 
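// Illustrative usage (comment only, not part of the class); the URL and filename below are
// placeholders:
//   val factory = new HttpFileRecordFactory("https://example.org/data")
//   val record = factory.get("crawl.warc.gz", WebArchiveLoader.WarcMime, FileMetaData.empty)
//   val in = record.access // resolves the file name against `location` and opens a stream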
override def get(file: String, mime: String, meta: FileMetaData): FileRecord = { 22 | new HttpFileRecord(file, mime, meta) 23 | } 24 | 25 | def accessFile( 26 | file: String, 27 | resolve: Boolean = true, 28 | accessContext: FileAccessContext): InputStream = { 29 | val url = if (resolve) locateFile(file) else file 30 | println(s"Reading $url...") 31 | new URL(url).openStream 32 | } 33 | 34 | def locateFile(filename: String): String = FileRecordFactory.filePath(location, filename) 35 | } 36 | 37 | object HttpFileRecordFactory extends FileFactoryCompanion { 38 | val dataSourceType: String = "http" 39 | 40 | def apply(spec: InputSpec): HttpFileRecordFactory = { 41 | spec 42 | .str(InputSpec.DataLocationKey) 43 | .map(new HttpFileRecordFactory(_)) 44 | .getOrElse { 45 | throw new RuntimeException("No location URL specified.") 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/InMemoryCdxFileRecord.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs 2 | 3 | import org.archive.webservices.ars.io.WebArchiveLoader 4 | import org.archive.webservices.ars.model.collections.inputspecs.meta.{FileMetaData, FileMetaField} 5 | import org.archive.webservices.sparkling.cdx.CdxRecord 6 | import org.archive.webservices.sparkling.io.IteratorInputStream 7 | 8 | import java.io.InputStream 9 | 10 | class InMemoryCdxFileRecord(override val filePath: String, records: Iterator[CdxRecord]) 11 | extends FileRecord { 12 | override def mime: String = WebArchiveLoader.CdxMime 13 | 14 | override def meta: FileMetaData = 15 | FileMetaData(FileMetaField("path", filePath), FileMetaField("mime", mime)) 16 | 17 | override def access: InputStream = 18 | new IteratorInputStream[CdxRecord](records, r => (r.toCdxString + "\n").getBytes) 19 | } 20 | 21 | object InMemoryCdxFileRecord { 22 | def apply(partition: Int, records: Iterator[CdxRecord]): InMemoryCdxFileRecord = { 23 | new InMemoryCdxFileRecord(s"partition-$partition.cdx.gz", records) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/InputSpecLoader.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.archive.webservices.ars.io.WebArchiveLoader 5 | 6 | trait InputSpecLoader { 7 | def specType: String 8 | def inputType(spec: InputSpec): Option[String] = None 9 | def size(spec: InputSpec): Long = spec.get[Long]("size").getOrElse(-1) 10 | def loadFilesSpark[R](spec: InputSpec)(action: RDD[FileRecord] => R): R 11 | def loadSpark[R](spec: InputSpec)(action: RDD[FileRecord] => R): R = { 12 | loadFilesSpark(spec) { rdd => 13 | val filtered = spec.inputType match { 14 | case InputSpec.InputType.WARC => 15 | rdd.filter(_.mime == WebArchiveLoader.WarcMime) 16 | case InputSpec.InputType.CDX => 17 | rdd.filter(_.mime == WebArchiveLoader.CdxMime) 18 | case _ => rdd 19 | } 20 | action(filtered) 21 | } 22 | } 23 | } 24 | 25 | object InputSpecLoader { 26 | var loaders: Seq[InputSpecLoader] = Seq( 27 | DatasetSpecLoader, 28 | ArchCollectionSpecLoader, 29 | FileSpecLoader, 30 | MetaRemoteSpecLoader, 31 | MetaFilesSpecLoader, 32 | MultiSpecLoader, 33 | CdxQuerySpecLoader) 34 | 35 | def 
get(spec: InputSpec): Option[InputSpecLoader] = { 36 | loaders.find(_.specType == spec.specType) 37 | } 38 | 39 | def loadSpark[R](spec: InputSpec)(action: RDD[FileRecord] => R): R = { 40 | spec.loader.loadSpark(spec)(action) 41 | } 42 | 43 | def loadFilesSpark[R](spec: InputSpec)(action: RDD[FileRecord] => R): R = { 44 | spec.loader.loadFilesSpark(spec)(action) 45 | } 46 | 47 | def size(spec: InputSpec): Long = spec.loader.size(spec) 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/LongestPrefixProbing.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs 2 | 3 | import java.io.FileNotFoundException 4 | 5 | trait LongestPrefixProbing { 6 | protected def locateLongestPrefixPath(filename: String): String = { 7 | var remaining = filename 8 | var prefix = "" 9 | var next = nextPrefixes(prefix) 10 | while (next.nonEmpty) { 11 | val keys = 12 | next.map(p => (p, p.stripPrefix(prefix).stripSuffix("/"))).filter(_._2.nonEmpty) 13 | val longest = keys 14 | .filter { case (_, k) => 15 | remaining.startsWith(k) 16 | } 17 | .toSeq 18 | .sortBy(-_._2.length) 19 | .headOption 20 | .orElse { 21 | keys 22 | .filter { case (_, k) => 23 | filename.startsWith(k) 24 | } 25 | .toSeq 26 | .sortBy(-_._2.length) 27 | .headOption 28 | } 29 | if (longest.isEmpty) throw new FileNotFoundException(filename + s" ($prefix)") 30 | val (p, k) = longest.get 31 | if (k == filename) return prefix.stripSuffix("/") 32 | if (remaining.startsWith(k)) remaining = remaining.stripPrefix(k) 33 | prefix = p 34 | next = nextPrefixes(prefix) 35 | } 36 | throw new FileNotFoundException(filename + s" ($prefix)") 37 | } 38 | 39 | protected def nextPrefixes(prefix: String): Set[String] 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/MetaRemoteSpecLoader.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs 2 | import org.apache.spark.rdd.RDD 3 | import org.apache.spark.sql.SparkSession 4 | import org.archive.webservices.ars.io.FileAccessContext 5 | import org.archive.webservices.ars.model.collections.inputspecs.meta.FileMetaData 6 | import org.archive.webservices.sparkling.Sparkling 7 | import org.archive.webservices.sparkling.io.{HdfsIO, IOUtil} 8 | 9 | object MetaRemoteSpecLoader extends InputSpecLoader { 10 | val specType = "meta-remote" 11 | 12 | override def loadFilesSpark[R](spec: InputSpec)(action: RDD[FileRecord] => R): R = action({ 13 | val recordFactory = FileRecordFactory(spec) 14 | val recordFactoryBc = Sparkling.sc.broadcast(recordFactory) 15 | for { 16 | filenameKey <- spec.str("metaFilenameKey") 17 | mimeKey <- spec.str("metaMimeKey") 18 | } yield { 19 | val accessContext = FileAccessContext.fromLocalArchConf 20 | Sparkling.initPartitions(loadMeta(spec)).mapPartitions { partition => 21 | accessContext.init() 22 | val recordFactory = recordFactoryBc.value 23 | recordFactory.accessContext = accessContext 24 | partition.flatMap { meta => 25 | for { 26 | filename <- meta.str(filenameKey) 27 | mime <- meta.str(mimeKey) 28 | } yield recordFactory.get(filename, mime, meta) 29 | } 30 | } 31 | } 32 | }.getOrElse { 33 | throw new RuntimeException("No meta filename and/or mime key specified.") 34 | }) 
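// loadMeta below dispatches on the spec's meta (or data) source type: the HDFS source type
// reads the metadata parquet directly, the Vault source type first copies it to a temporary
// HDFS path, and anything else raises UnsupportedOperationException. Each parquet row becomes
// a FileMetaData whose metaFilenameKey/metaMimeKey columns identify the file and its MIME
// type (see loadFilesSpark above).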
35 | 36 | def loadMeta(spec: InputSpec): RDD[FileMetaData] = { 37 | spec 38 | .str(InputSpec.MetaSourceKey) 39 | .orElse(spec.str(InputSpec.DataSourceKey)) 40 | .flatMap { 41 | case HdfsFileRecordFactory.dataSourceType => Some(loadMetaHdfs(spec)) 42 | case VaultFileRecordFactory.dataSourceType => Some(loadMetaVault(spec)) 43 | case _ => None 44 | } 45 | .getOrElse { 46 | throw new UnsupportedOperationException() 47 | } 48 | } 49 | 50 | def loadMetaHdfs(spec: InputSpec): RDD[FileMetaData] = { 51 | spec 52 | .str(InputSpec.MetaLocationKey) 53 | .map { 54 | case location if location.endsWith(".parquet") => 55 | loadParquet(location) 56 | case _ => throw new UnsupportedOperationException() 57 | } 58 | .getOrElse { 59 | throw new RuntimeException("No meta location specified") 60 | } 61 | } 62 | 63 | def loadMetaVault(spec: InputSpec): RDD[FileMetaData] = { 64 | spec 65 | .str(InputSpec.MetaLocationKey) 66 | .map { 67 | case location if location.endsWith(".parquet") => 68 | val in = VaultFileRecordFactory(spec).accessFile(location) 69 | val tmpFile = HdfsIO.createTmpPath() 70 | val out = HdfsIO.out(tmpFile) 71 | try { 72 | IOUtil.copy(in, out) 73 | } finally { 74 | out.close() 75 | } 76 | loadParquet(tmpFile) 77 | case _ => throw new UnsupportedOperationException() 78 | } 79 | .getOrElse { 80 | throw new RuntimeException("No meta location specified") 81 | } 82 | } 83 | 84 | def loadParquet(path: String): RDD[FileMetaData] = { 85 | val dataFrame = SparkSession.builder.getOrCreate.read.parquet(path) 86 | val schema = Sparkling.sc.broadcast(dataFrame.schema) 87 | dataFrame.rdd.map { row => 88 | FileMetaData.fromParquet(schema.value, row) 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/MultiSpecLoader.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.archive.webservices.sparkling.util.RddUtil 5 | 6 | object MultiSpecLoader extends InputSpecLoader { 7 | override def specType: String = "multi-specs" 8 | 9 | def multiSpecs(spec: InputSpec): Iterator[InputSpec] = { 10 | spec.cursor 11 | .downField("specs") 12 | .values 13 | .toIterator 14 | .flatten 15 | .map(json => InputSpec(json.hcursor)) 16 | } 17 | 18 | override def size(spec: InputSpec): Long = Some(super.size(spec)).filter(_ != -1).getOrElse { 19 | val sizes = multiSpecs(spec).map(_.size).filter(_ != -1) 20 | if (sizes.isEmpty) -1 else sizes.sum 21 | } 22 | 23 | override def inputType(spec: InputSpec): Option[String] = { 24 | val types = multiSpecs(spec).map(_.inputType).toSet 25 | Some(if (types.size == 1) types.head else InputSpec.InputType.Files) 26 | } 27 | 28 | private def unionSpark[R](spec: InputSpec, load: InputSpec => (RDD[FileRecord] => R) => R)( 29 | action: RDD[FileRecord] => R): R = { 30 | val specs = multiSpecs(spec) 31 | var union = RddUtil.emptyRDD[FileRecord] 32 | def next: R = { 33 | if (specs.hasNext) { 34 | load(specs.next) { rdd => 35 | union = union.union(rdd) 36 | next 37 | } 38 | } else action(union) 39 | } 40 | next 41 | } 42 | 43 | override def loadFilesSpark[R](spec: InputSpec)(action: RDD[FileRecord] => R): R = { 44 | unionSpark[R](spec, InputSpecLoader.loadFilesSpark)(action) 45 | } 46 | 47 | override def loadSpark[R](spec: InputSpec)(action: RDD[FileRecord] => R): R = { 48 | unionSpark[R](spec, InputSpecLoader.loadSpark)(action) 
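// Same union strategy as loadFilesSpark above; since each nested InputSpecLoader.loadSpark
// call applies its own WARC/CDX mime filtering before the union, mixed sub-specs still yield
// a consistently filtered RDD.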
49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/OneTimeAccess.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs 2 | 3 | import java.io.InputStream 4 | 5 | trait OneTimeAccess { this: FileRecord => 6 | private var accessed = false 7 | 8 | def in: InputStream 9 | 10 | override def access: InputStream = { 11 | if (!accessed) { 12 | accessed = true 13 | in 14 | } else throw new UnsupportedOperationException("InputStream can only be accessed once.") 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/S3HttpFileRecordFactory.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs 2 | 3 | import scala.io.Source 4 | 5 | class S3HttpFileRecordFactory(location: String, longestPrefixMapping: Boolean) 6 | extends HttpFileRecordFactory(location) 7 | with LongestPrefixProbing { 8 | override def companion: FileFactoryCompanion = S3HttpFileRecordFactory 9 | 10 | override def locateFile(file: String): String = { 11 | if (longestPrefixMapping) FileRecordFactory.filePath(locateLongestPrefixPath(file), file) 12 | else super.locateFile(file) 13 | } 14 | 15 | private val prefixes = collection.mutable.Map.empty[String, Set[String]] 16 | override protected def nextPrefixes(prefix: String): Set[String] = { 17 | prefixes.getOrElseUpdate( 18 | prefix, { 19 | val url = location + "?delimiter=/&prefix=" + prefix 20 | val source = Source.fromURL(url) 21 | try { 22 | source.mkString 23 | .split('<') 24 | .filter(keyValue => keyValue.startsWith("Prefix>") || keyValue.startsWith("Key>")) 25 | .map { keyValue => 26 | keyValue.split('>').last 27 | } 28 | .toSet 29 | } finally { 30 | source.close() 31 | } 32 | }) 33 | } 34 | } 35 | 36 | object S3HttpFileRecordFactory extends FileFactoryCompanion { 37 | val dataSourceType: String = "s3-http" 38 | 39 | def apply(spec: InputSpec): S3HttpFileRecordFactory = { 40 | spec 41 | .str(InputSpec.DataLocationKey) 42 | .map { location => 43 | val longestPrefixMapping = spec.str("dataPathMapping").contains("longest-prefix") 44 | new S3HttpFileRecordFactory(location, longestPrefixMapping) 45 | } 46 | .getOrElse { 47 | throw new RuntimeException("No location URL specified.") 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/meta/FileMetaField.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs.meta 2 | 3 | import io.circe.Json 4 | 5 | import scala.reflect.ClassTag 6 | 7 | class FileMetaField private (val key: String, val value: Any, val fieldType: FileMetaFieldType) 8 | extends Serializable { 9 | def get[A: ClassTag]: Option[A] = implicitly[ClassTag[A]].unapply(value) 10 | def gets[A: ClassTag]: Seq[A] = 11 | get[Seq[_]].toSeq.flatMap(_.flatMap(implicitly[ClassTag[A]].unapply)) 12 | def toJson: Json = fieldType.toJson(value) 13 | } 14 | 15 | object FileMetaField { 16 | def apply(key: String, value: Any, fieldType: FileMetaFieldType): FileMetaField = { 17 | new FileMetaField(key, value, fieldType) 18 | } 19 | 20 | 
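// Convenience constructors that infer the FileMetaFieldType from the Scala value type: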
def apply(key: String, value: String): FileMetaField = 21 | FileMetaField(key, value, FileMetaFieldType.String) 22 | def apply(key: String, value: Int): FileMetaField = 23 | FileMetaField(key, value, FileMetaFieldType.Number) 24 | def apply(key: String, value: Long): FileMetaField = 25 | FileMetaField(key, value, FileMetaFieldType.Number) 26 | def apply(key: String, value: Double): FileMetaField = 27 | FileMetaField(key, value, FileMetaFieldType.Number) 28 | def apply(key: String, value: Boolean): FileMetaField = 29 | FileMetaField(key, value, FileMetaFieldType.Boolean) 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/meta/FileMetaFieldSummary.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs.meta 2 | 3 | import io.circe._ 4 | import io.circe.syntax._ 5 | 6 | import scala.collection.immutable.ListMap 7 | 8 | class FileMetaFieldSummary extends Serializable { 9 | var optional: Boolean = false 10 | var types: Map[FileMetaFieldType, FileMetaFieldTypeSummary] = Map.empty 11 | 12 | def add(field: FileMetaField): Unit = { 13 | val summary = types.getOrElse( 14 | field.fieldType, { 15 | val summary = field.fieldType.primitive match { 16 | case FileMetaFieldType.String => new FileMetaFieldStringTypeSummary 17 | case FileMetaFieldType.Number => new FileMetaFieldNumberTypeSummary 18 | case FileMetaFieldType.Boolean => FileMetaFieldBooleanTypeSummary 19 | } 20 | types += field.fieldType.primitive -> summary 21 | summary 22 | }) 23 | if (field.fieldType.multi) { 24 | val values = field.value.asInstanceOf[Seq[_]] 25 | summary.adds(values) 26 | } else summary.add(field.value) 27 | } 28 | 29 | def ++(that: FileMetaFieldSummary): FileMetaFieldSummary = { 30 | val summary = new FileMetaFieldSummary 31 | summary.optional = optional || that.optional 32 | summary.types = (types.keySet ++ that.types.keySet).toSeq.map { t => 33 | val thisType = types.get(t) 34 | val thatType = that.types.get(t) 35 | t -> (if (thisType.isEmpty || thatType.isEmpty) thisType.orElse(thatType).get 36 | else thisType.get ++ thatType.get) 37 | }.toMap 38 | summary 39 | } 40 | 41 | def toJson: Json = { 42 | Map("optional" -> optional.asJson, "types" -> types.values.map(_.toJson).toMap.asJson).asJson 43 | } 44 | 45 | def toJsonSchemaProperties: Seq[(String, Json)] = { 46 | if (types.size == 1) { 47 | types.head._2.toJsonSchemaProperties 48 | } else { 49 | Seq("oneOf" -> types.toSeq.map { case (_, t) => 50 | ListMap(t.toJsonSchemaProperties: _*).asJson 51 | }.asJson) 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/meta/FileMetaFieldType.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs.meta 2 | 3 | import _root_.io.circe.syntax._ 4 | import io.circe.Json 5 | 6 | trait FileMetaFieldType extends Serializable { 7 | def primitive: FileMetaFieldType = this 8 | def multi: Boolean = false 9 | def toJson(value: Any): Json 10 | } 11 | 12 | trait FileMetaFieldMultiType extends FileMetaFieldType { 13 | def primitive: FileMetaFieldType 14 | override def multi: Boolean = true 15 | def toJson(value: Any): Json = value.asInstanceOf[Seq[_]].map(primitive.toJson).asJson 16 | } 17 | 18 | object 
FileMetaFieldType { 19 | case object String extends FileMetaFieldType { 20 | override def toJson(value: Any): Json = { 21 | if (value == null) Json.Null else value.asInstanceOf[String].asJson 22 | } 23 | } 24 | 25 | case object Number extends FileMetaFieldType { 26 | override def toJson(value: Any): Json = { 27 | value match { 28 | case i: Int => i.asJson 29 | case l: Long => l.asJson 30 | case d: Double => d.asJson 31 | } 32 | } 33 | } 34 | 35 | case object Boolean extends FileMetaFieldType { 36 | override def toJson(value: Any): Json = value.asInstanceOf[Boolean].asJson 37 | } 38 | 39 | case object Strings extends FileMetaFieldMultiType { 40 | override def primitive: FileMetaFieldType = String 41 | } 42 | 43 | case object Numbers extends FileMetaFieldMultiType { 44 | override def primitive: FileMetaFieldType = Number 45 | } 46 | 47 | case object Booleans extends FileMetaFieldMultiType { 48 | override def primitive: FileMetaFieldType = Boolean 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/collections/inputspecs/meta/FileMetaSummary.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.collections.inputspecs.meta 2 | 3 | import io.circe._ 4 | import io.circe.syntax._ 5 | 6 | import scala.collection.immutable.ListMap 7 | 8 | object FileMetaSummary { 9 | val MaxOptions = 10 10 | val MaxStringOptionLength = 100 11 | 12 | lazy val empty = new FileMetaSummary() 13 | } 14 | 15 | class FileMetaSummary extends Serializable { 16 | private var fields: ListMap[String, FileMetaFieldSummary] = ListMap.empty 17 | 18 | def add(meta: FileMetaData): Unit = { 19 | for (missing <- (fields -- meta.keys).values) missing.optional = true 20 | val first = fields.isEmpty 21 | for (field <- meta.fields) { 22 | fields 23 | .getOrElse( 24 | field.key, { 25 | val summary = new FileMetaFieldSummary 26 | summary.optional = !first 27 | fields += field.key -> summary 28 | summary 29 | }) 30 | .add(field) 31 | } 32 | } 33 | 34 | def ++(that: FileMetaSummary): FileMetaSummary = { 35 | if (fields.isEmpty) that 36 | else if (that.fields.isEmpty) this 37 | else { 38 | val newFields = { 39 | fields.toSeq.map(_._1).zipWithIndex ++ that.fields.toSeq.map(_._1).zipWithIndex 40 | }.groupBy(_._1) 41 | .toSeq 42 | .map { case (key, group) => 43 | (key, group.map(_._2).min) 44 | } 45 | .sortBy(_._2) 46 | .map(_._1) 47 | .map { field => 48 | val thisField = fields.get(field) 49 | val thatField = that.fields.get(field) 50 | field -> { 51 | if (thisField.isEmpty || thatField.isEmpty) { 52 | val field = thisField.orElse(thatField).get 53 | field.optional = true 54 | field 55 | } else thisField.get ++ thatField.get 56 | } 57 | } 58 | val summary = new FileMetaSummary 59 | summary.fields ++= newFields 60 | summary 61 | } 62 | } 63 | 64 | def toJson: Json = { 65 | fields.toSeq 66 | .map { case (key, field) => 67 | key -> field.toJson 68 | } 69 | .toMap 70 | .asJson 71 | } 72 | 73 | def toJsonSchema: Json = { 74 | ListMap( 75 | "$schema" -> "https://json-schema.org/draft/2020-12/schema".asJson, 76 | "type" -> "object".asJson, 77 | "required" -> fields.toSeq.filter(!_._2.optional).map(_._1).asJson, 78 | "properties" -> ListMap(fields.toSeq.map { case (key, field) => 79 | key -> (ListMap("title" -> key.asJson) ++ field.toJsonSchemaProperties) 80 | }: _*).asJson).asJson 81 | } 82 | } 83 | 
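// Illustrative example (not a file in this repository): aggregating per-file metadata into a
// FileMetaSummary and rendering it as a JSON Schema. Field names and values are made up, and
// the varargs FileMetaData(...) constructor is assumed to behave as used elsewhere in this
// package (e.g. ArchCollectionSpecLoader).
import org.archive.webservices.ars.model.collections.inputspecs.meta.{FileMetaData, FileMetaField, FileMetaSummary}

object FileMetaSummaryExample {
  def main(args: Array[String]): Unit = {
    val summary = new FileMetaSummary()
    Seq(
      FileMetaData(FileMetaField("filename", "a.warc.gz"), FileMetaField("size", 1024L)),
      FileMetaData(FileMetaField("filename", "b.warc.gz")) // no "size" field -> marked optional
    ).foreach(summary.add)
    println(summary.toJsonSchema.spaces4) // circe Json, pretty-printed with 4-space indent
  }
}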
-------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/users/DefaultArchUser.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.users 2 | 3 | case class DefaultArchUser( 4 | id: String, 5 | userName: String, 6 | fullName: String, 7 | email: Option[String], 8 | isAdmin: Boolean, 9 | isUser: Boolean = true) 10 | extends ArchUser 11 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/model/users/KeystoneUser.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.model.users 2 | 3 | import io.circe.parser 4 | import io.circe.syntax._ 5 | import org.archive.webservices.ars.model.ArchConf 6 | import requests._ 7 | 8 | object KeystoneUser { 9 | val prefix = "ks" 10 | 11 | private def parseKeystoneUserResponse(r: Response): Option[DefaultArchUser] = 12 | if (r.statusCode != 200) None 13 | else { 14 | parser.parse(r.text) match { 15 | case Left(error) => 16 | None 17 | case Right(json) => 18 | val cursor = json.hcursor 19 | Some( 20 | DefaultArchUser( 21 | id = prefix + ":" + cursor.get[String]("username").toOption.get, 22 | userName = cursor.get[String]("username").toOption.get, 23 | fullName = cursor.get[String]("fullname").toOption.get, 24 | email = cursor.get[String]("email").toOption, 25 | isAdmin = cursor.get[Boolean]("is_staff").toOption.get)) 26 | } 27 | } 28 | 29 | def get(username: String): Option[DefaultArchUser] = 30 | if (ArchConf.keystoneBaseUrl.isEmpty || ArchConf.keystonePrivateApiKey.isEmpty) { 31 | None 32 | } else { 33 | parseKeystoneUserResponse( 34 | requests.get( 35 | s"${ArchConf.keystoneBaseUrl.get}/private/api/user?username=${username}", 36 | headers = Map("X-API-Key" -> ArchConf.keystonePrivateApiKey.get), 37 | check = false)) 38 | } 39 | 40 | def login(username: String, password: String): Option[DefaultArchUser] = 41 | if (ArchConf.keystoneBaseUrl.isEmpty || ArchConf.keystonePrivateApiKey.isEmpty) { 42 | None 43 | } else { 44 | parseKeystoneUserResponse( 45 | requests.post( 46 | s"${ArchConf.keystoneBaseUrl.get}/private/api/proxy_login", 47 | data = Map("username" -> username, "password" -> password).asJson.noSpaces, 48 | headers = Map("X-API-Key" -> ArchConf.keystonePrivateApiKey.get), 49 | check = false)) 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/DerivationJobParameters.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing 2 | 3 | import io.circe.parser._ 4 | import io.circe.syntax._ 5 | import io.circe.{Decoder, Encoder, HCursor, Json} 6 | 7 | case class DerivationJobParameters(values: Map[String, Json]) extends Serializable { 8 | def size: Int = values.size 9 | def isEmpty: Boolean = size == 0 10 | def nonEmpty: Boolean = !isEmpty 11 | 12 | def set[A: Encoder](key: String, value: A): DerivationJobParameters = { 13 | DerivationJobParameters(values.updated(key, value.asJson)) 14 | } 15 | 16 | def set[A: Encoder](keyValues: (String, A)*): DerivationJobParameters = { 17 | DerivationJobParameters(values ++ keyValues.map { case (k, v) => k -> v.asJson }) 18 | } 19 | 20 | def set(keyValues: (String, Json)*): DerivationJobParameters = { 21 | DerivationJobParameters(values ++ 
keyValues) 22 | } 23 | 24 | def get[A: Decoder](key: String): Option[A] = values.get(key).flatMap(_.as[A].toOption) 25 | 26 | def toJson: Json = values.asJson 27 | } 28 | 29 | object DerivationJobParameters { 30 | val Empty: DerivationJobParameters = DerivationJobParameters(Map.empty) 31 | 32 | def fromJson(cursor: HCursor): Option[DerivationJobParameters] = cursor.keys.map { keys => 33 | val params = keys.flatMap { key => 34 | cursor.downField(key).focus.map(key -> _) 35 | }.toMap 36 | DerivationJobParameters(params) 37 | } 38 | 39 | def fromJson(json: Json): Option[DerivationJobParameters] = fromJson(json.hcursor) 40 | 41 | def fromJson(json: String): Option[DerivationJobParameters] = 42 | parse(json).right.toOption.flatMap(fromJson) 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/GenericJob.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing 2 | 3 | trait GenericJob extends DerivationJob { 4 | override def enqueue( 5 | conf: DerivationJobConf, 6 | get: DerivationJobInstance => Unit = _ => {}): Option[DerivationJobInstance] = { 7 | super.enqueue(conf, get).flatMap(GenericJobManager.enqueue) 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/GenericJobManager.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing 2 | 3 | object GenericJobManager extends JobManagerBase("Generic", 3) 4 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/JobQueue.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing 2 | 3 | import org.archive.webservices.ars.model.collections.inputspecs.InputSpec 4 | 5 | class JobQueue(val name: String) { 6 | private var _pos = 0 7 | private val queue = collection.mutable.Queue.empty[DerivationJobInstance] 8 | 9 | def items: Iterator[DerivationJobInstance] = queue.toIterator 10 | 11 | def isEmpty: Boolean = queue.isEmpty 12 | def nonEmpty: Boolean = queue.nonEmpty 13 | def size: Int = queue.size 14 | 15 | def enqueue(instance: DerivationJobInstance): Int = synchronized { 16 | val remainder = Int.MaxValue - _pos 17 | val thisPos = if (remainder < queue.size) queue.size - remainder else _pos + queue.size 18 | queue.enqueue(instance) 19 | instance.updateState(ProcessingState.Queued) 20 | thisPos 21 | } 22 | 23 | def dequeue: DerivationJobInstance = synchronized { 24 | if (_pos == Int.MaxValue) _pos = 1 else _pos += 1 25 | queue.dequeue 26 | } 27 | 28 | def dequeue( 29 | freeSlots: Int, 30 | excludeSources: Set[InputSpec.Identifier] = Set.empty, 31 | recentUsers: Seq[String]): Option[DerivationJobInstance] = synchronized { 32 | val idxs = recentUsers.zipWithIndex.map { case (user, idx) => user -> (idx + 1) }.toMap 33 | var minIdx = 0 34 | var minUserInstance: Option[DerivationJobInstance] = None 35 | queue 36 | .dequeueFirst { instance => 37 | instance.slots <= freeSlots && !excludeSources.contains( 38 | instance.conf.inputSpec) && instance.user.map(_.id).forall { id => 39 | idxs.get(id) match { 40 | case Some(idx) => 41 | if (minIdx == 0 || idx < minIdx) { 42 | minIdx = idx 43 | minUserInstance = Some(instance) 44 | } 45 | false 46 | case None 
=> true 47 | } 48 | } 49 | } 50 | .orElse(minUserInstance.flatMap(instance => queue.dequeueFirst(_ == instance))) 51 | } 52 | 53 | def pos: Int = _pos 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/PartialDerivationJob.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing 2 | import org.archive.webservices.ars.model.{ArchJobCategories, ArchJobCategory} 3 | 4 | abstract class PartialDerivationJob(parent: ChainedJob) extends DerivationJob { 5 | override val partialOf: Option[DerivationJob] = Some(parent) 6 | override lazy val id: String = parent.id + "_" + super.id 7 | val name: String = id 8 | override lazy val uuid: String = parent.uuid 9 | override def relativeOutPath: String = parent.relativeOutPath 10 | val category: ArchJobCategory = ArchJobCategories.None 11 | val description: String = id 12 | override val templateName: Option[String] = None 13 | override def enqueue( 14 | conf: DerivationJobConf, 15 | get: DerivationJobInstance => Unit = _ => {}): Option[DerivationJobInstance] = 16 | super.enqueue(conf, get) 17 | } 18 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/ProcessingState.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing 2 | 3 | object ProcessingState { 4 | val Strings: Seq[String] = Seq("Not started", "Queued", "Running", "Finished", "Failed") 5 | 6 | val NotStarted = 0 7 | val Queued = 1 8 | val Running = 2 9 | val Finished = 3 10 | val Failed = 4 11 | } 12 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/SampleVizData.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing 2 | 3 | import io.circe._ 4 | import io.circe.generic.semiauto._ 5 | 6 | case class SampleVizData( 7 | nodes: Seq[(String, String)], 8 | edges: Option[Seq[(String, String)]] = None) 9 | 10 | object SampleVizData { 11 | implicit val sampleVizDataEncoder: Encoder[SampleVizData] = deriveEncoder 12 | } 13 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/SparkJob.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing 2 | 3 | trait SparkJob extends DerivationJob { 4 | override def enqueue( 5 | conf: DerivationJobConf, 6 | get: DerivationJobInstance => Unit = _ => {}): Option[DerivationJobInstance] = { 7 | super.enqueue(conf, get).flatMap(SparkJobManager.enqueue) 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/SparkJobListener.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing 2 | 3 | import org.apache.spark.scheduler._ 4 | import org.archive.webservices.sparkling.io.StageSyncManager 5 | 6 | import java.time.Instant 7 | 8 | object SparkJobListener extends SparkListener { 9 | private val _taskStartTimes = collection.mutable.Map.empty[String, Long] 10 | 11 | def taskStartTimes: Map[String, Long] = _taskStartTimes.toMap 12 | 13 
| def id(info: TaskInfo): String = info.id + "#" + info.taskId 14 | 15 | override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = synchronized { 16 | _taskStartTimes(id(taskStart.taskInfo)) = Instant.now.getEpochSecond 17 | } 18 | 19 | override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = synchronized { 20 | _taskStartTimes.remove(id(taskEnd.taskInfo)) 21 | } 22 | 23 | override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = { 24 | StageSyncManager.cleanup(StageSyncManager.stageId(stageCompleted.stageInfo.stageId)) 25 | } 26 | 27 | def reset(): Unit = synchronized(_taskStartTimes.clear()) 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/SparkJobManager.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql.SparkSession 5 | import org.archive.webservices.ars.Arch 6 | import org.archive.webservices.ars.model.ArchConf 7 | import org.archive.webservices.sparkling.Sparkling.executionContext 8 | import org.archive.webservices.sparkling.util.SparkUtil 9 | import org.archive.webservices.sparkling.{Sparkling, _} 10 | 11 | import java.io.File 12 | import java.time.Instant 13 | import scala.concurrent.Future 14 | 15 | object SparkJobManager 16 | extends JobManagerBase("Spark", 3, timeoutSecondsMinMax = Some((60 * 60, 60 * 60 * 3))) { 17 | val taskTimeoutSeconds = 60 * 60 * 12 // 12 hours 18 | val SharedSparkContext = true 19 | val SparkAllocationFile = "fairscheduler.xml" 20 | val MaxPriorityWeight = 128 21 | val PoolPrefix = "weight-" 22 | 23 | private var _context: Option[SparkContext] = None 24 | 25 | def context: Future[SparkContext] = { 26 | Future { 27 | synchronized(_context.filter(!_.isStopped).getOrElse { 28 | val context = SparkUtil.config( 29 | SparkSession.builder, 30 | appName = s"ARCH ${ArchConf.deploymentEnvironment}", 31 | executors = 15, 32 | executorCores = 4, 33 | executorMemory = "16g", 34 | queue = ArchConf.hadoopQueue, 35 | additionalConfigs = Map( 36 | "spark.master" -> ArchConf.sparkMaster, 37 | "spark.scheduler.mode" -> "FAIR", 38 | "spark.yarn.executor.memoryOverhead" -> (4.gb / 1.mb).toString, // off-heap memory in MiB 39 | "spark.scheduler.allocation.file" -> new File(SparkAllocationFile).getAbsolutePath, 40 | "spark.yarn.am.memory" -> "4096m"), 41 | verbose = true) 42 | context.setLogLevel("INFO") 43 | _context = Some(context) 44 | Sparkling.resetSparkContext(Some(context)) 45 | context.addSparkListener(SparkJobListener) 46 | println("New Spark context initialized: " + context.applicationId) 47 | context 48 | }) 49 | } 50 | } 51 | 52 | private def priorityWeight: Int = if (currentPriority == 0) 1 else currentPriority 53 | 54 | def initThread(sc: SparkContext, job: DerivationJob, conf: DerivationJobConf): Unit = { 55 | sc.setJobGroup(job.uuid, job.name + " " + conf.serialize) 56 | sc.setLocalProperty("spark.scheduler.pool", PoolPrefix + priorityWeight) 57 | } 58 | 59 | def stopContext(): Unit = synchronized { 60 | for (context <- _context) { 61 | context.stop() 62 | while (!context.isStopped) Thread.`yield`() 63 | _context = None 64 | Sparkling.resetSparkContext() 65 | } 66 | } 67 | 68 | override protected def onAllJobsFinished(): Unit = synchronized { 69 | super.onAllJobsFinished() 70 | if (!Arch.debugging) stopContext() 71 | } 72 | 73 | override protected def 
onTimeout(instances: Seq[DerivationJobInstance]): Unit = synchronized { 74 | val threshold = Instant.now.getEpochSecond - taskTimeoutSeconds 75 | val startTimes = SparkJobListener.taskStartTimes.values 76 | if (startTimes.forall(_ < threshold)) { 77 | SparkJobListener.synchronized { 78 | SparkJobListener.reset() 79 | stopContext() 80 | return 81 | } 82 | } 83 | if (numQueued > 0 && freeSlots == 0) bypassJobs() 84 | } 85 | 86 | def bypassJobs(): Boolean = synchronized { 87 | if (priorityWeight < MaxPriorityWeight && priorityRunningCount > 0) { 88 | newPriority(priorityWeight * 2) 89 | true 90 | } else false 91 | } 92 | 93 | def run(job: DerivationJob, conf: DerivationJobConf): Future[Boolean] = { 94 | if (SharedSparkContext) job.run(conf) else SparkRunner.run(job, conf) 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/SparkRunner.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing 2 | 3 | import org.apache.tools.ant.taskdefs.Java 4 | import org.apache.tools.ant.{DefaultLogger, Project} 5 | import org.archive.webservices.sparkling.Sparkling.executionContext 6 | 7 | import java.io.File 8 | import scala.concurrent.duration.Duration 9 | import scala.concurrent.{Await, Future} 10 | 11 | object SparkRunner { 12 | def run(job: DerivationJob, conf: DerivationJobConf): Future[Boolean] = Future { 13 | val mainClass = getClass.getName.stripSuffix("$") 14 | val args = Seq(job.getClass.getName, conf.serialize) 15 | 16 | val command = System.getProperty("sun.java.command") 17 | val jar = command.split(' ').head 18 | val isSbt = jar.endsWith("/sbt-launch.jar") 19 | 20 | val project = new Project 21 | 22 | val logger = new DefaultLogger 23 | project.addBuildListener(logger) 24 | logger.setOutputPrintStream(System.out) 25 | logger.setErrorPrintStream(System.err) 26 | logger.setMessageOutputLevel(Project.MSG_DEBUG) 27 | 28 | val jvm = new Java 29 | jvm.setTaskName(job.getClass.getSimpleName.stripSuffix("$")) 30 | jvm.setProject(project) 31 | jvm.setFork(true) 32 | jvm.setCloneVm(true) 33 | jvm.setJar(new File(jar)) 34 | 35 | if (isSbt) jvm.createArg.setValue("runMain " + mainClass + " " + args.mkString(" ")) 36 | else { 37 | jvm.setClassname(mainClass) 38 | for (arg <- args) jvm.createArg.setValue(arg) 39 | } 40 | 41 | jvm.executeJava == 0 42 | } 43 | 44 | def main(args: Array[String]): Unit = { 45 | val Array(className, confStr) = args 46 | DerivationJobConf.deserialize(confStr) match { 47 | case Some(conf) => 48 | val job = 49 | Class.forName(className).getField("MODULE$").get(null).asInstanceOf[DerivationJob] 50 | val success = Await.result(job.run(conf), Duration.Inf) 51 | Await.ready( 52 | SparkJobManager.context.map { sc => 53 | sc.stop() 54 | while (!sc.isStopped) Thread.`yield`() 55 | }, 56 | Duration.Inf) 57 | System.exit(if (success) 0 else 1) 58 | case None => 59 | System.exit(2) 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/AudioInformationExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.Row 5 | import org.archive.webservices.ars.processing.jobs.shared.BinaryInformationAutJob 6 | import 
org.archive.webservices.sparkling.warc.WarcRecord 7 | 8 | object AudioInformationExtraction extends BinaryInformationAutJob { 9 | val name = "Audio file information" 10 | val uuid = "01895066-7db2-794b-b91b-e3f5a340e859" 11 | 12 | override val infoUrl = 13 | "https://arch-webservices.zendesk.com/hc/en-us/articles/14410815476500-ARCH-File-format-datasets#audio" 14 | 15 | val description = 16 | "Locations and metadata for MP3, WAV, AAC, and other audio formatted files in the collection. Output: one CSV with columns for crawl date, last modified date, URL, file name, file format extension, MIME type as reported by the web server and as detected by Apache TIKA, and MD5 and SHA1 hash values." 17 | 18 | val targetFile: String = "audio-information.csv.gz" 19 | 20 | def checkMime(url: String, server: String, tika: String): Boolean = 21 | tika.startsWith("audio/") 22 | 23 | override def prepareRecords(rdd: RDD[WarcRecord]): RDD[Row] = rdd.flatMap(prepareRecord) 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/DomainFrequencyExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.functions.desc 5 | import org.apache.spark.sql.{Dataset, Row} 6 | import org.archive.webservices.ars.aut.{AutLoader, AutUtil} 7 | import org.archive.webservices.ars.model.{ArchJobCategories, ArchJobCategory} 8 | import org.archive.webservices.ars.processing.jobs.shared.AutJob 9 | import org.archive.webservices.ars.processing.{DerivationJobConf, ProcessingState, SampleVizData} 10 | import org.archive.webservices.ars.util.{Common, PublicSuffixUtil} 11 | import org.archive.webservices.sparkling.io.HdfsIO 12 | import org.archive.webservices.sparkling.warc.WarcRecord 13 | 14 | import java.io.PrintStream 15 | 16 | object DomainFrequencyExtraction extends AutJob[(String, Long)] { 17 | val name = "Domain frequency" 18 | val uuid = "01894bc7-ff6a-7e25-a5b5-4570425a8ab7" 19 | val category: ArchJobCategory = ArchJobCategories.Collection 20 | 21 | override val infoUrl = 22 | "https://arch-webservices.zendesk.com/hc/en-us/articles/14410734896148-ARCH-Collection-datasets#domain-frequency" 23 | 24 | val description = 25 | "The number of unique documents collected from each domain in the collection. Output: one CSV file with columns for domain and count." 
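// The `prepareRecords` override further below emits one (domain, 1L) pair per valid page, with the
// domain normalized via the broadcast public suffix list and the "www." prefix removed; `df` sums
// those pairs with reduceByKey and orders the result by descending count, so the CSV described
// above is written sorted by frequency. `sampleVizData` later reads the first lines of that CSV
// back in to provide the "topDomains" preview used by the template.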
26 | 27 | val targetFile: String = "domain-frequency.csv.gz" 28 | 29 | override def printToOutputStream(out: PrintStream): Unit = out.println("domain, count") 30 | 31 | override def df(rdd: RDD[(String, Long)]): Dataset[Row] = { 32 | val rows = rdd 33 | .reduceByKey(_ + _) 34 | .map { case (domain, count) => 35 | Row(domain, count) 36 | } 37 | AutLoader.domainFrequency(rows).orderBy(desc("count")) 38 | } 39 | 40 | override def prepareRecords(rdd: RDD[WarcRecord]): RDD[(String, Long)] = { 41 | val publicSuffixes = PublicSuffixUtil.broadcast(rdd.context) 42 | rdd 43 | .flatMap { r => 44 | Common.tryOrElse[Option[(String, Long)]](None) { 45 | r.http.filter(AutUtil.validPage(r, _)).map { _ => 46 | val url = AutUtil.url(r) 47 | (AutUtil.extractDomainRemovePrefixWWW(url, publicSuffixes.value), 1L) 48 | } 49 | } 50 | } 51 | } 52 | 53 | override val templateName: Option[String] = Some("jobs/DomainFrequencyExtraction") 54 | 55 | override def sampleVizData(conf: DerivationJobConf): Option[SampleVizData] = 56 | checkFinishedState(conf.outputPath + relativeOutPath) match { 57 | case Some(ProcessingState.Finished) => 58 | Some( 59 | SampleVizData( 60 | HdfsIO 61 | .lines(conf.outputPath + relativeOutPath + "/" + targetFile, 11) 62 | .drop(1) 63 | .flatMap { line => 64 | val comma = line.lastIndexOf(',') 65 | if (comma < 0) None 66 | else 67 | Some { 68 | val (domain, freq) = 69 | (line.take(comma).stripPrefix("\"").stripSuffix("\""), line.drop(comma + 1)) 70 | (domain, freq) 71 | } 72 | })) 73 | case _ => None 74 | } 75 | 76 | override def templateVariables(conf: DerivationJobConf): Seq[(String, Any)] = { 77 | super.templateVariables(conf) ++ Seq( 78 | "topDomains" -> 79 | sampleVizData(conf).map(_.nodes).getOrElse(Seq.empty)) 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/DomainGraphExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs 2 | 3 | import io.archivesunleashed.matchbox.ExtractLinks 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.functions.desc 6 | import org.apache.spark.sql.{Dataset, Row} 7 | import org.archive.webservices.ars.aut.{AutLoader, AutUtil} 8 | import org.archive.webservices.ars.processing.jobs.shared.NetworkAutJob 9 | import org.archive.webservices.ars.util.{Common, HttpUtil, PublicSuffixUtil} 10 | import org.archive.webservices.sparkling.warc.WarcRecord 11 | 12 | import java.io.PrintStream 13 | 14 | object DomainGraphExtraction extends NetworkAutJob[((String, String, String), Long)] { 15 | val name = "Domain graph" 16 | val uuid = "01895067-417d-7665-ba60-a9bb9ca0aa3e" 17 | 18 | override val infoUrl = 19 | "https://arch-webservices.zendesk.com/hc/en-us/articles/14410738717588-ARCH-Network-datasets#domain-graph" 20 | 21 | val description = 22 | "Links between domains in the collection over time. Output: one CSV file with columns for crawl date, source, target, and count." 
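// In `prepareRecords` below, each outgoing link of a valid page becomes a
// (crawl date, source domain, target domain) triple: `AutUtil.timestamp(r).take(8)` keeps the
// YYYYMMDD day and both endpoints are reduced to their domain with the "www." prefix removed.
// `df` then sums identical triples, drops edges seen five times or fewer, and orders the rows by
// descending count.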
23 | 24 | val targetFile: String = "domain-graph.csv.gz" 25 | 26 | val srcDstFields: (String, String) = ("src_domain", "dest_domain") 27 | 28 | override def printToOutputStream(out: PrintStream): Unit = 29 | out.println("crawl_date, source, target, count") 30 | 31 | override def df(rdd: RDD[((String, String, String), Long)]): Dataset[Row] = { 32 | val rows = 33 | rdd 34 | .reduceByKey(_ + _) 35 | .filter(_._2 > 5) 36 | .map { case ((date, source, target), count) => 37 | Row(date, source, target, count) 38 | } 39 | AutLoader.domainGraph(rows).orderBy(desc("count")) 40 | } 41 | 42 | override def prepareRecords(rdd: RDD[WarcRecord]): RDD[((String, String, String), Long)] = { 43 | val publicSuffixes = PublicSuffixUtil.broadcast(rdd.context) 44 | rdd 45 | .flatMap { r => 46 | r.http.filter(AutUtil.validPage(r, _)).toIterator.flatMap { http => 47 | Common 48 | .tryOrElse(Seq.empty[((String, String, String), Long)]) { 49 | val url = AutUtil.url(r) 50 | AutUtil 51 | .extractLinks(ExtractLinks.apply, url, HttpUtil.bodyString(http.body, http)) 52 | .map { case (source, target, _) => 53 | ( 54 | AutUtil.extractDomainRemovePrefixWWW(source, publicSuffixes.value), 55 | AutUtil.extractDomainRemovePrefixWWW(target, publicSuffixes.value)) 56 | } 57 | .distinct 58 | .filter { case (s, t) => s != "" && t != "" } 59 | .map { case (source, target) => 60 | ((AutUtil.timestamp(r).take(8), source, target), 1L) 61 | } 62 | } 63 | .toIterator 64 | } 65 | } 66 | } 67 | 68 | override def edgeCounts(df: Dataset[Row]): RDD[((String, String), Long)] = { 69 | val (srcField, dstField) = srcDstFields 70 | df.rdd 71 | .flatMap { row => 72 | Common.tryOrElse[Option[((String, String), Long)]](None) { 73 | Some( 74 | ( 75 | (row.getAs[String](srcField), row.getAs[String](dstField)), 76 | row.getAs[Long]("count"))) 77 | } 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/ImageGraphExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs 2 | 3 | import io.archivesunleashed.matchbox.ExtractImageLinks 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.{Dataset, Row} 6 | import org.archive.webservices.ars.aut.{AutLoader, AutUtil} 7 | import org.archive.webservices.ars.processing.jobs.shared.NetworkAutJob 8 | import org.archive.webservices.ars.util.{Common, HttpUtil} 9 | import org.archive.webservices.sparkling.warc.WarcRecord 10 | 11 | import java.io.PrintStream 12 | 13 | object ImageGraphExtraction extends NetworkAutJob[Row] { 14 | val name = "Image graph" 15 | val uuid = "01895067-92fb-739c-a99d-037fde1798a4" 16 | 17 | override val infoUrl = 18 | "https://arch-webservices.zendesk.com/hc/en-us/articles/14410738717588-ARCH-Network-datasets#image-graph" 19 | 20 | val description = 21 | "Timestamp, location, and any original description for each image file in the collection. Output: one CSV with columns for crawl date, source page, image file url, and alt text." 
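// Unlike the domain graph, nothing is aggregated here: `prepareRecords` below emits one row per
// image link found on a valid page, carrying the full crawl timestamp, the embedding page URL,
// the image URL, and its alt text, and `df` simply wraps those rows via `AutLoader.imageGraph`.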
22 | 23 | val targetFile: String = "image-graph.csv.gz" 24 | 25 | val srcDstFields: (String, String) = ("src", "image_url") 26 | 27 | override def printToOutputStream(out: PrintStream): Unit = 28 | out.println("crawl_date, source, url, alt_text") 29 | 30 | override def df(rdd: RDD[Row]): Dataset[Row] = AutLoader.imageGraph(rdd) 31 | 32 | override def prepareRecords(rdd: RDD[WarcRecord]): RDD[Row] = { 33 | rdd.flatMap { r => 34 | r.http.filter(AutUtil.validPage(r, _)).toIterator.flatMap { http => 35 | Common 36 | .tryOrElse(Seq.empty[Row]) { 37 | val url = AutUtil.url(r) 38 | AutUtil 39 | .extractLinks(ExtractImageLinks.apply, url, HttpUtil.bodyString(http.body, http)) 40 | .map { case (source, target, alt) => 41 | Row(AutUtil.timestamp(r), source, target, alt) 42 | } 43 | } 44 | .toIterator 45 | } 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/ImageInformationExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs 2 | 3 | import io.archivesunleashed.matchbox.GetExtensionMIME 4 | import org.apache.commons.io.FilenameUtils 5 | import org.apache.spark.rdd.RDD 6 | import org.apache.spark.sql.{Dataset, Row} 7 | import org.archive.webservices.ars.aut.{AutLoader, AutUtil} 8 | import org.archive.webservices.ars.processing.jobs.shared.BinaryInformationAutJob 9 | import org.archive.webservices.sparkling.Sparkling.executionContext 10 | import org.archive.webservices.sparkling.http.HttpMessage 11 | import org.archive.webservices.sparkling.io.InputStreamForker 12 | import org.archive.webservices.sparkling.util.{Common, DigestUtil} 13 | import org.archive.webservices.sparkling.warc.WarcRecord 14 | 15 | import java.io.{InputStream, PrintStream} 16 | import java.net.URL 17 | import scala.concurrent.duration._ 18 | import scala.concurrent.{Await, Future} 19 | import scala.util.Try 20 | 21 | object ImageInformationExtraction extends BinaryInformationAutJob { 22 | val name = "Image file information" 23 | val uuid = "01895067-d598-7db8-88ad-46fed66e27f5" 24 | 25 | override val infoUrl = 26 | "https://arch-webservices.zendesk.com/hc/en-us/articles/14410815476500-ARCH-File-format-datasets#image" 27 | 28 | val description = 29 | "Locations and metadata for JPEG, PNG, GIF, and other image formatted files in the collection. Output: one CSV with columns for crawl date, last modified date, URL, file name, file format extension, MIME type as reported by the web server and as detected by Apache TIKA, and MD5 and SHA1 hash values." 
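// `prepareRecord` below reads each image payload in a single pass: `InputStreamForker` splits the
// body into three streams that are consumed concurrently as Futures to obtain the image
// dimensions, the MD5 digest, and the SHA-1 digest, while the remaining columns (filename,
// extension, MIME types, last-modified date) are derived from the URL and the HTTP headers.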
30 | 31 | val targetFile: String = "image-information.csv.gz" 32 | 33 | override def printToOutputStream(out: PrintStream): Unit = 34 | out.println( 35 | "crawl_date, last_modified_date, url, filename, extension, mime_type_web_server, mime_type_tika, width, height, md5, sha1") 36 | 37 | override def checkMime(url: String, server: String, tika: String): Boolean = 38 | tika.startsWith("image/") 39 | 40 | override def df(rdd: RDD[Row]): Dataset[Row] = AutLoader.images(rdd) 41 | 42 | override def prepareRecord(r: WarcRecord): Option[Row] = 43 | prepareBinaryRow( 44 | r, 45 | ( 46 | url: String, 47 | http: HttpMessage, 48 | body: InputStream, 49 | tikaMime: String, 50 | crawlDate: String, 51 | lastModifiedDate: String) => { 52 | val forker = InputStreamForker(body) 53 | val Array(imageIn, md5In, sha1In) = forker.fork(3).map(Future(_)) 54 | val Seq((width: Int, height: Int), md5: String, sha1: String) = 55 | try { 56 | Await.result( 57 | Future.sequence( 58 | Seq( 59 | imageIn.map(in => Common.cleanup(AutUtil.computeImageSize(in))(in.close)), 60 | md5In.map(DigestUtil.md5Hex), 61 | sha1In.map(DigestUtil.sha1Hex))), 62 | Duration.Inf) 63 | } finally { 64 | for (s <- imageIn) Try(s.close()) 65 | for (s <- md5In) Try(s.close()) 66 | for (s <- sha1In) Try(s.close()) 67 | Try(body.close()) 68 | } 69 | 70 | val jUrl = new URL(url) 71 | val filename = FilenameUtils.getName(jUrl.getPath) 72 | val extension = GetExtensionMIME(jUrl.getPath, tikaMime) 73 | val lastModifiedDate = 74 | AutUtil.rfc1123toTime14(http.headerMap.get("last-modified").getOrElse("")) 75 | 76 | Row( 77 | crawlDate, 78 | lastModifiedDate, 79 | url, 80 | filename, 81 | extension, 82 | AutUtil.mime(http), 83 | tikaMime, 84 | width, 85 | height, 86 | md5, 87 | sha1) 88 | }) 89 | 90 | override def prepareRecords(rdd: RDD[WarcRecord]): RDD[Row] = rdd.flatMap(prepareRecord) 91 | } 92 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/PdfInformationExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.Row 5 | import org.archive.webservices.ars.processing.jobs.shared.BinaryInformationAutJob 6 | import org.archive.webservices.sparkling.warc.WarcRecord 7 | 8 | object PdfInformationExtraction extends BinaryInformationAutJob { 9 | val name = "PDF file information" 10 | val uuid = "01895068-3e02-72cb-b0d9-4e1bacc42c37" 11 | 12 | override val infoUrl = 13 | "https://arch-webservices.zendesk.com/hc/en-us/articles/14410815476500-ARCH-File-format-datasets#pdf" 14 | 15 | val description = 16 | "Locations and metadata for Portable Document Format (PDF) files in the collection. Output: one CSV with columns for crawl date, last modified date, URL, file name, file format extension, MIME type as reported by the web server and as detected by Apache TIKA, and MD5 and SHA1 hash values." 
17 | 18 | val targetFile: String = "pdf-information.csv.gz" 19 | 20 | override def checkMime(url: String, server: String, tika: String): Boolean = 21 | server == "application/pdf" // not `tika == `, which we had before, but also matches Adobe Illustrator and PostScript 22 | 23 | override def prepareRecords(rdd: RDD[WarcRecord]): RDD[Row] = rdd.flatMap(prepareRecord) 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/PresentationProgramInformationExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.Row 5 | import org.archive.webservices.ars.processing.jobs.shared.BinaryInformationAutJob 6 | import org.archive.webservices.sparkling.warc.WarcRecord 7 | 8 | object PresentationProgramInformationExtraction extends BinaryInformationAutJob { 9 | val name = "Presentation file information" 10 | val uuid = "01895068-a576-7a00-b4dd-2d5650bc69ab" 11 | 12 | override val infoUrl = 13 | "https://arch-webservices.zendesk.com/hc/en-us/articles/14410815476500-ARCH-File-format-datasets#presentation" 14 | 15 | val description = 16 | "Locations and metadata for PowerPoint, Keynote, and other presentation formatted files in the collection. Output: one CSV with columns for crawl date, last modified date, URL, file name, file format extension, MIME type as reported by the web server and as detected by Apache TIKA, and MD5 and SHA1 hash values." 17 | 18 | val targetFile: String = "powerpoint-information.csv.gz" 19 | 20 | val PresentationMimeTypes: Set[String] = Set( 21 | "application/vnd.apple.keynote", 22 | "application/vnd.ms-powerpoint", 23 | "application/vnd.openxmlformats-officedocument.presentationml.presentation", 24 | "application/vnd.oasis.opendocument.presentation", 25 | "application/vnd.oasis.opendocument.presentation-template", 26 | "application/vnd.sun.xml.impress", 27 | "application/vnd.sun.xml.impress.template", 28 | "application/vnd.stardivision.impress", 29 | "application/x-starimpress", 30 | "application/vnd.ms-powerpoint.addin.macroEnabled.12", 31 | "application/vnd.ms-powerpoint.presentation.macroEnabled.12", 32 | "application/vnd.ms-powerpoint.slide.macroEnabled.12", 33 | "application/vnd.ms-powerpoint.slideshow.macroEnabled.12", 34 | "application/vnd.ms-powerpoint.template.macroEnabled.12") 35 | 36 | override def checkMime(url: String, server: String, tika: String): Boolean = 37 | PresentationMimeTypes.contains(tika) 38 | 39 | override def prepareRecords(rdd: RDD[WarcRecord]): RDD[Row] = rdd.flatMap(prepareRecord) 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/SpreadsheetInformationExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.Row 5 | import org.archive.webservices.ars.processing.jobs.shared.BinaryInformationAutJob 6 | import org.archive.webservices.sparkling.warc.WarcRecord 7 | 8 | object SpreadsheetInformationExtraction extends BinaryInformationAutJob { 9 | val name = "Spreadsheet file information" 10 | val uuid = "01895069-192a-74f8-84a9-b14f20c20f89" 11 | 12 | override val infoUrl = 13 | 
"https://arch-webservices.zendesk.com/hc/en-us/articles/14410815476500-ARCH-File-format-datasets#spreadsheet" 14 | 15 | val description = 16 | "Locations and metadata for CSV, XLS, ODS, and other spreadsheet formatted files in the collection. Output: one CSV with columns for crawl date, last modified date, URL, file name, file format extension, MIME type as reported by the web server and as detected by Apache TIKA, and MD5 and SHA1 hash values." 17 | 18 | val targetFile: String = "spreadsheet-information.csv.gz" 19 | 20 | val SpreadsheetMimeTypes: Set[String] = Set( 21 | " application/vnd.apple.numbers", 22 | "application/vnd.ms-excel", 23 | "application/vnd.ms-excel.workspace.3", 24 | "application/vnd.ms-excel.workspace.4", 25 | "application/vnd.ms-excel.sheet.2", 26 | "application/vnd.ms-excel.sheet.3", 27 | "application/vnd.ms-excel.sheet.3", 28 | "application/vnd.ms-excel.addin.macroenabled.12", 29 | "application/vnd.ms-excel.sheet.binary.macroenabled.12", 30 | "application/vnd.ms-excel.sheet.macroenabled.12", 31 | "application/vnd.ms-excel.template.macroenabled.12", 32 | "application/vnd.ms-spreadsheetml", 33 | "application/vnd.openxmlformats-officedocument.spreadsheetml.template", 34 | "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", 35 | "application/x-vnd.oasis.opendocument.spreadsheet-template", 36 | "application/vnd.oasis.opendocument.spreadsheet-template", 37 | "application/vnd.oasis.opendocument.spreadsheet", 38 | "application/x-vnd.oasis.opendocument.spreadsheet", 39 | "application/x-tika-msworks-spreadsheet", 40 | "application/vnd.lotus-1-2-3", 41 | "text/csv", 42 | "text/tab-separated-values") 43 | 44 | override def checkMime(url: String, server: String, tika: String): Boolean = 45 | SpreadsheetMimeTypes.contains( 46 | tika) || server == "text/csv" || server == "text/tab-separated-values" || ((url.toLowerCase 47 | .endsWith(".csv") || url.toLowerCase.endsWith(".tsv")) && tika == "text/plain") 48 | 49 | override def prepareRecords(rdd: RDD[WarcRecord]): RDD[Row] = rdd.flatMap(prepareRecord) 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/VideoInformationExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.Row 5 | import org.archive.webservices.ars.processing.jobs.shared.BinaryInformationAutJob 6 | import org.archive.webservices.sparkling.warc.WarcRecord 7 | 8 | object VideoInformationExtraction extends BinaryInformationAutJob { 9 | val name = "Video file information" 10 | val uuid = "01895069-a9fa-734c-b669-fcf528f85c1e" 11 | 12 | override val infoUrl = 13 | "https://arch-webservices.zendesk.com/hc/en-us/articles/14410815476500-ARCH-File-format-datasets#video" 14 | 15 | val description = 16 | "Locations and metadata for MP4, MOV, AVI, and other video formatted files in the collection. Output: one CSV with columns for crawl date, last modified date, URL, file name, file format extension, MIME type as reported by the web server and as detected by Apache TIKA, and MD5 and SHA1 hash values." 
17 | 18 | val targetFile: String = "video-information.csv.gz" 19 | 20 | override def checkMime(url: String, server: String, tika: String): Boolean = 21 | tika.startsWith("video/") 22 | 23 | override def prepareRecords(rdd: RDD[WarcRecord]): RDD[Row] = rdd.flatMap(prepareRecord) 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/WebGraphExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs 2 | 3 | import io.archivesunleashed.matchbox.ExtractLinks 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.{Dataset, Row} 6 | import org.archive.webservices.ars.aut.{AutLoader, AutUtil} 7 | import org.archive.webservices.ars.processing.jobs.shared.NetworkAutJob 8 | import org.archive.webservices.ars.util.{Common, HttpUtil} 9 | import org.archive.webservices.sparkling.warc.WarcRecord 10 | 11 | import java.io.PrintStream 12 | 13 | object WebGraphExtraction extends NetworkAutJob[Row] { 14 | val name = "Web graph" 15 | val uuid = "01895069-e74c-79de-8292-effb45265179" 16 | 17 | override val infoUrl = 18 | "https://arch-webservices.zendesk.com/hc/en-us/articles/14410738717588-ARCH-Network-datasets#web-graph" 19 | 20 | val description = 21 | "Links between all documents in the collection over time and any descriptive anchor text associated with them. Output: one CSV file with columns for crawl date, source, target, and anchor text." 22 | 23 | val targetFile: String = "web-graph.csv.gz" 24 | 25 | val srcDstFields: (String, String) = ("src", "dest") 26 | 27 | override def printToOutputStream(out: PrintStream): Unit = 28 | out.println("crawl_date, source, target, anchor_text") 29 | 30 | override def df(rdd: RDD[Row]): Dataset[Row] = AutLoader.webGraph(rdd) 31 | 32 | override def prepareRecords(rdd: RDD[WarcRecord]): RDD[Row] = { 33 | rdd.flatMap { r => 34 | r.http.filter(AutUtil.validPage(r, _)).toIterator.flatMap { http => 35 | Common 36 | .tryOrElse(Seq.empty[Row]) { 37 | val url = AutUtil.url(r) 38 | AutUtil 39 | .extractLinks(ExtractLinks.apply, url, HttpUtil.bodyString(http.body, http)) 40 | .map { case (source, target, alt) => 41 | Row(AutUtil.timestamp(r), source, target, alt) 42 | } 43 | } 44 | .toIterator 45 | } 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/WebPagesExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs 2 | 3 | import io.archivesunleashed.matchbox.{DetectLanguage, RemoveHTML} 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.{Dataset, Row} 6 | import org.archive.webservices.ars.aut.{AutLoader, AutUtil} 7 | import org.archive.webservices.ars.model.{ArchJobCategories, ArchJobCategory} 8 | import org.archive.webservices.ars.processing.jobs.shared.BinaryInformationAutJob 9 | import org.archive.webservices.ars.util.{HttpUtil, PublicSuffixUtil} 10 | import org.archive.webservices.sparkling.http.HttpMessage 11 | import org.archive.webservices.sparkling.warc.WarcRecord 12 | 13 | import java.io.{InputStream, PrintStream} 14 | 15 | object WebPagesExtraction extends BinaryInformationAutJob { 16 | val name = "Plain text of webpages" 17 | val uuid = "0189506a-46f3-7d73-9dcf-a8fce59c50cc" 18 | 19 | override val category: ArchJobCategory = 
ArchJobCategories.Text 20 | 21 | override val infoUrl = 22 | "https://arch-webservices.zendesk.com/hc/en-us/articles/14410760790164-ARCH-Text-datasets#webpages" 23 | 24 | val description = 25 | "Location, technical metadata, and extracted full text contents of each text-bearing document in the collection. Output: one CSV file with columns for crawl date, last modified date, domain, URL, MIME type as reported by the web server and as detected by Apache TIKA, and content." 26 | 27 | val targetFile: String = "web-pages.csv.gz" 28 | 29 | override def printToOutputStream(out: PrintStream): Unit = 30 | out.println( 31 | "crawl_date, last_modified_date, domain,url, mime_type_web_server, mime_type_tika, language, content") 32 | 33 | override def checkMime(url: String, server: String, tika: String): Boolean = 34 | AutUtil.checkPageMime(url, server) 35 | 36 | override def df(rdd: RDD[Row]): Dataset[Row] = AutLoader.webpages(rdd) 37 | 38 | override def prepareRecord(r: WarcRecord): Option[Row] = 39 | throw new RuntimeException( 40 | "This method should not be called in WebPagesExtraction, see #prepareRecords") 41 | 42 | override def prepareRecords(rdd: RDD[WarcRecord]): RDD[Row] = { 43 | val publicSuffixes = PublicSuffixUtil.broadcast(rdd.context) 44 | rdd.flatMap { r => 45 | prepareBinaryRow( 46 | r, 47 | ( 48 | url: String, 49 | http: HttpMessage, 50 | body: InputStream, 51 | tikaMime: String, 52 | crawlDate: String, 53 | lastModifiedDate: String) => { 54 | val bodyString = HttpUtil.bodyString(body, http) 55 | val content = RemoveHTML(bodyString) 56 | Row( 57 | crawlDate, 58 | lastModifiedDate, 59 | AutUtil.extractDomainRemovePrefixWWW(url, publicSuffixes.value), 60 | url, 61 | AutUtil.mime(http), 62 | tikaMime, 63 | DetectLanguage(content), 64 | content) 65 | }) 66 | } 67 | } 68 | 69 | override val templateName: Option[String] = Some("jobs/DefaultAutJob") 70 | } 71 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/WordProcessorInformationExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.Row 5 | import org.archive.webservices.ars.processing.jobs.shared.BinaryInformationAutJob 6 | import org.archive.webservices.sparkling.warc.WarcRecord 7 | 8 | object WordProcessorInformationExtraction extends BinaryInformationAutJob { 9 | val name = "Word processing file information" 10 | val uuid = "0189506a-d09d-7571-9d3c-a44698d58d39" 11 | 12 | override val infoUrl = 13 | "https://arch-webservices.zendesk.com/hc/en-us/articles/14410815476500-ARCH-File-format-datasets#word" 14 | 15 | val description = 16 | "Locations and metadata for DOC, RTF, ODT, and other word processing files in the collection. Output: one CSV with columns for crawl date, last modified date, URL, file name, file format extension, MIME type as reported by the web server and as detected by Apache TIKA, and MD5 and SHA1 hash values." 
17 | 18 | val targetFile: String = "word-document-information.csv.gz" 19 | 20 | val WordProcessorMimeTypes: Set[String] = Set( 21 | "application/vnd.lotus-wordpro", 22 | "application/vnd.kde.kword", 23 | "application/vnd.ms-word.document.macroEnabled.12", 24 | "application/vnd.ms-word.template.macroEnabled.12", 25 | "application/vnd.oasis.opendocument.text", 26 | "application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml", 27 | "application/vnd.openxmlformats-officedocument.wordprocessingml.document", 28 | "application/vnd.openxmlformats-officedocument.wordprocessingml.document.glossary+xml", 29 | "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml", 30 | "application/vnd.wordperfect", 31 | "application/wordperfect5.1", 32 | "application/msword", 33 | "application/vnd.ms-word.document.macroEnabled.12", 34 | "application/vnd.ms-word.template.macroEnabled.12", 35 | "application/vnd.apple.pages", 36 | "application/macwriteii", 37 | "application/vnd.ms-works", 38 | "application/rtf") 39 | 40 | override def checkMime(url: String, server: String, tika: String): Boolean = 41 | WordProcessorMimeTypes.contains(tika) 42 | 43 | override def prepareRecords(rdd: RDD[WarcRecord]): RDD[Row] = rdd.flatMap(prepareRecord) 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/AiJob.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.archive.webservices.ars.processing.DerivationJobConf 5 | import org.archive.webservices.ars.processing.jobs.archivespark.base.{ArchEnrichRoot, ArchiveSparkEnrichJob} 6 | import org.archive.webservices.sparkling.io.StageSyncManager 7 | 8 | abstract class AiJob extends ArchiveSparkEnrichJob { 9 | // override def maxInputSize: Int = 5000 // limit the input size to avoid extraordinarily long jobs 10 | 11 | override def enrich( 12 | rdd: RDD[ArchEnrichRoot[_]], 13 | conf: DerivationJobConf): RDD[ArchEnrichRoot[_]] = { 14 | StageSyncManager.sync(super.enrich(rdd, conf)) 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/ArchiveSparkFlexJob.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark 2 | 3 | import io.circe.{HCursor, Json} 4 | import org.archive.webservices.archivespark.model.EnrichFunc 5 | import org.archive.webservices.ars.model.{ArchJobCategories, ArchJobCategory} 6 | import org.archive.webservices.ars.processing.jobs.archivespark.base.{ArchEnrichRoot, ArchWarcRecord} 7 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.adapters.{ArchArchiveSparkFunctionAdapter, EntitiesAdapter} 8 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.{Whisper, WhisperText} 9 | import org.archive.webservices.ars.processing.{DerivationJobConf, DerivationJobParameters} 10 | import org.archive.webservices.sparkling.util.StringUtil 11 | 12 | object ArchiveSparkFlexJob extends AiJob { 13 | val uuid: String = "018f52cc-d917-71ac-9e64-19fb219114a4" 14 | 15 | val name: String = id 16 | val description: String = "ArchiveSpark flex job " 17 | val category: ArchJobCategory = ArchJobCategories.None 18 | 19 | override def 
genericPredicate(conf: DerivationJobConf): ArchEnrichRoot[_] => Boolean = { 20 | val mime = conf.params.values 21 | .get("mime") 22 | .toSeq 23 | .flatMap { mime => 24 | if (mime.isString) mime.asString.toSeq 25 | else if (mime.isArray) mime.asArray.toSeq.flatMap(_.flatMap(_.asString)) 26 | else Seq.empty 27 | } 28 | .toSet 29 | if (mime.isEmpty) { 30 | super.genericPredicate(conf) 31 | } else { record => 32 | mime.contains(record.mime) || mime.contains(StringUtil.prefixToSeparator(record.mime, "/")) 33 | } 34 | } 35 | 36 | override def warcPredicate(conf: DerivationJobConf): ArchWarcRecord => Boolean = { 37 | val superFilter = super.warcPredicate(conf) 38 | val status = conf.params.values.get("status").toSeq.flatMap { status => 39 | if (status.isNumber) status.asNumber.flatMap(_.toInt).toSeq 40 | else if (status.isArray) 41 | status.asArray.toSeq.flatMap(_.flatMap(_.asNumber).flatMap(_.toInt)) 42 | else Seq.empty 43 | } 44 | if (status.isEmpty) { 45 | superFilter 46 | } else { warc => 47 | superFilter(warc) && { 48 | status.exists { s => 49 | warc.status == s || (s < 100 && (warc.status / 10 == s || (s < 10 && warc.status / 100 == s))) 50 | } 51 | } 52 | } 53 | } 54 | 55 | def functions(conf: DerivationJobConf): Seq[EnrichFunc[ArchEnrichRoot[_], _, _]] = { 56 | conf.params.values.get("functions").toSeq.flatMap(_.asArray.toSeq.flatten).map { function => 57 | ArchiveSparkFlexJob.initFunction(function) 58 | } 59 | } 60 | 61 | val adapters: Map[String, ArchArchiveSparkFunctionAdapter[_]] = 62 | Seq(EntitiesAdapter, Whisper, WhisperText).flatMap { adapter => 63 | Iterator(adapter.name -> adapter, adapter.name.toLowerCase -> adapter) 64 | }.toMap 65 | 66 | private def initFunction[A]( 67 | func: ArchArchiveSparkFunctionAdapter[A], 68 | cursor: HCursor): EnrichFunc[ArchEnrichRoot[_], A, _] = { 69 | val dependency = 70 | cursor.downField("on").focus.map(initFunction).flatMap(func.toDependencyPointer) 71 | cursor.downField("params").focus.flatMap(DerivationJobParameters.fromJson) match { 72 | case Some(params) => func.withParams(params, on = dependency) 73 | case None => func.noParams(on = dependency) 74 | } 75 | } 76 | 77 | def initFunction(definition: Json): EnrichFunc[ArchEnrichRoot[_], _, _] = { 78 | if (definition.isString) { 79 | adapters.get(definition.asString.get).map(_.noParams) 80 | } else if (definition.isObject) { 81 | val cursor = definition.hcursor 82 | cursor.get[String]("name").toOption.flatMap { name => 83 | adapters.get(name).map(initFunction(_, cursor)) 84 | } 85 | } else None 86 | }.getOrElse { 87 | throw new UnsupportedOperationException() 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/ArchiveSparkNoop.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.archive.webservices.ars.model.{ArchJobCategories, ArchJobCategory} 5 | import org.archive.webservices.ars.processing.DerivationJobConf 6 | import org.archive.webservices.ars.processing.jobs.archivespark.base.{ArchEnrichRoot, ArchiveSparkBaseJob} 7 | 8 | object ArchiveSparkNoop extends ArchiveSparkBaseJob { 9 | val name: String = id 10 | val uuid: String = "018d1cef-c91d-7d51-9cf4-05fe51900321" 11 | val description: String = 12 | "An ArchiveSpark job that does nothing. 
Output: records turned into ArchiveSpark JSON format without any enrichment function applied." 13 | val category: ArchJobCategory = ArchJobCategories.None 14 | 15 | override def enrich( 16 | rdd: RDD[ArchEnrichRoot[_]], 17 | conf: DerivationJobConf): RDD[ArchEnrichRoot[_]] = rdd 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/base/ArchEnrichRoot.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.base 2 | 3 | import org.archive.webservices.archivespark.model.TypedEnrichRoot 4 | import org.archive.webservices.archivespark.model.dataloads.{ByteLoad, TextLoad} 5 | import org.archive.webservices.ars.model.collections.inputspecs.meta.FileMetaData 6 | 7 | trait ArchEnrichRoot[+Meta] 8 | extends TypedEnrichRoot[Meta] 9 | with FileLoad.Root 10 | with ByteLoad.Root 11 | with TextLoad.Root 12 | with PlainTextLoad.Root 13 | with LocalFileCache { 14 | def mime: String 15 | def meta: FileMetaData 16 | } 17 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/base/ArchFileRecord.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.base 2 | 3 | import io.circe.Json 4 | import org.archive.webservices.archivespark.functions.StringContent 5 | import org.archive.webservices.archivespark.model.EnrichRootCompanion 6 | import org.archive.webservices.archivespark.model.dataloads.{ByteLoad, DataLoad, TextLoad} 7 | import org.archive.webservices.archivespark.model.pointers.FieldPointer 8 | import org.archive.webservices.archivespark.util.Json.json 9 | import org.archive.webservices.ars.model.collections.inputspecs.FileRecord 10 | import org.archive.webservices.ars.model.collections.inputspecs.meta.FileMetaData 11 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.{ArchFileBytes, ArchFileCache} 12 | import org.archive.webservices.sparkling.io.IOUtil 13 | import org.archive.webservices.sparkling.logging.{Log, LogContext} 14 | 15 | import java.io.{File, InputStream} 16 | import scala.collection.immutable.ListMap 17 | import scala.util.Try 18 | 19 | class ArchFileRecord(record: FileRecord) extends ArchEnrichRoot[FileRecord] { 20 | implicit private val logContext: LogContext = LogContext(this) 21 | 22 | override def companion: EnrichRootCompanion[ArchFileRecord] = ArchFileRecord 23 | override def get: FileRecord = record 24 | 25 | override def metaToJson: Json = { 26 | json( 27 | ListMap[String, Any]( 28 | "filename" -> record.filename, 29 | "mime" -> Try(record.mime).fold("Error: " + _.getMessage, identity), 30 | "path" -> Try(record.path).fold("Error: " + _.getMessage, identity))) 31 | } 32 | 33 | def mime: String = record.mime 34 | 35 | def meta: FileMetaData = record.meta 36 | 37 | override def payloadAccess: InputStream = { 38 | Log.info(s"Accessing ${record.filename}...") 39 | IOUtil.supportMark(record.access) 40 | } 41 | 42 | override def cacheLocal(): File = { 43 | Log.info(s"Caching ${record.filename}...") 44 | super.cacheLocal() 45 | } 46 | } 47 | 48 | object ArchFileRecord extends EnrichRootCompanion[ArchFileRecord] { 49 | override def dataLoad[T](load: DataLoad[T]): Option[FieldPointer[ArchFileRecord, T]] = 50 | (load match { 51 | case FileLoad => 
Some(ArchFileCache) 52 | case ByteLoad => Some(ArchFileBytes) 53 | case TextLoad | PlainTextLoad => Some(StringContent) 54 | case _ => None 55 | }).map(_.asInstanceOf[FieldPointer[ArchFileRecord, T]]) 56 | } 57 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/base/ArchFileSpec.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.base 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.rdd.RDD 5 | import org.archive.webservices.archivespark.dataspecs.DataSpec 6 | import org.archive.webservices.ars.model.collections.inputspecs.FileRecord 7 | 8 | class ArchFileSpec(@transient val rdd: RDD[FileRecord]) 9 | extends DataSpec[FileRecord, ArchFileRecord] { 10 | override def load(sc: SparkContext, minPartitions: Int): RDD[FileRecord] = rdd 11 | override def parse(file: FileRecord): Option[ArchFileRecord] = Some(new ArchFileRecord(file)) 12 | } 13 | 14 | object ArchFileSpec { 15 | def apply(rdd: RDD[FileRecord]) = new ArchFileSpec(rdd) 16 | } 17 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/base/ArchWarcRecord.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.base 2 | 3 | import io.circe.Json 4 | import org.archive.webservices.archivespark.functions.{HtmlText, StringContent} 5 | import org.archive.webservices.archivespark.model.EnrichRootCompanion 6 | import org.archive.webservices.archivespark.model.dataloads.{ByteLoad, DataLoad, TextLoad} 7 | import org.archive.webservices.archivespark.model.pointers.FieldPointer 8 | import org.archive.webservices.archivespark.specific.warc.WarcLikeRecord 9 | import org.archive.webservices.ars.model.collections.inputspecs.meta.FileMetaData 10 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.{ArchFileCache, ArchWarcPayload} 11 | import org.archive.webservices.sparkling.cdx.CdxRecord 12 | import org.archive.webservices.sparkling.io.IOUtil 13 | import org.archive.webservices.sparkling.logging.{Log, LogContext} 14 | import org.archive.webservices.sparkling.warc.WarcRecord 15 | 16 | import java.io.{File, InputStream} 17 | 18 | class ArchWarcRecord(val warc: WarcRecord) extends ArchEnrichRoot[CdxRecord] with WarcLikeRecord { 19 | implicit private val logContext: LogContext = LogContext(this) 20 | 21 | override def companion: EnrichRootCompanion[ArchWarcRecord] = ArchWarcRecord 22 | 23 | override lazy val get: CdxRecord = { 24 | warc.toCdx(0L, handleRevisits = true, handleOthers = true).get 25 | } 26 | 27 | def mime: String = warc.http.flatMap(_.mime).getOrElse("/") 28 | 29 | override lazy val meta: FileMetaData = FileMetaData.fromCdx(get) 30 | 31 | override def metaToJson: Json = meta.toJson 32 | 33 | override def payloadAccess: InputStream = IOUtil.supportMark(warc.http.map(_.body).getOrElse(warc.payload)) 34 | 35 | override def cacheLocal(): File = { 36 | Log.info(s"Caching ${warc.url.getOrElse("N/A")}...") 37 | super.cacheLocal() 38 | } 39 | } 40 | 41 | object ArchWarcRecord extends EnrichRootCompanion[ArchWarcRecord] { 42 | override def dataLoad[T](load: DataLoad[T]): Option[FieldPointer[ArchWarcRecord, T]] = { 43 | (load match { 44 | case FileLoad => Some(ArchFileCache) 45 | case ByteLoad => 
Some(ArchWarcPayload) 46 | case TextLoad => Some(StringContent) 47 | case PlainTextLoad => Some(HtmlText) 48 | case _ => None 49 | }).map(_.asInstanceOf[FieldPointer[ArchWarcRecord, T]]) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/base/ArchWarcSpec.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.base 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.rdd.RDD 5 | import org.archive.webservices.archivespark.dataspecs.DataSpec 6 | import org.archive.webservices.sparkling.warc.WarcRecord 7 | 8 | class ArchWarcSpec(@transient val rdd: RDD[WarcRecord]) 9 | extends DataSpec[WarcRecord, ArchWarcRecord] { 10 | override def load(sc: SparkContext, minPartitions: Int): RDD[WarcRecord] = rdd 11 | override def parse(warc: WarcRecord): Option[ArchWarcRecord] = Some(new ArchWarcRecord(warc)) 12 | } 13 | 14 | object ArchWarcSpec { 15 | def apply(rdd: RDD[WarcRecord]) = new ArchWarcSpec(rdd) 16 | } 17 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/base/ArchiveSparkEnrichJob.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.base 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.archive.webservices.archivespark.model.EnrichFunc 5 | import org.archive.webservices.archivespark.model.dataloads.ByteLoad 6 | import org.archive.webservices.archivespark.model.pointers.DataLoadPointer 7 | import org.archive.webservices.archivespark.util.Bytes 8 | import org.archive.webservices.ars.processing.DerivationJobConf 9 | 10 | abstract class ArchiveSparkEnrichJob extends ArchiveSparkBaseJob { 11 | def byteLoad: DataLoadPointer[ArchEnrichRoot[_], Bytes] = 12 | ArchiveSparkEnrichJob.byteLoad 13 | 14 | def fileLoad: DataLoadPointer[ArchEnrichRoot[_], String] = 15 | ArchiveSparkEnrichJob.fileLoad 16 | 17 | def plainTextLoad: DataLoadPointer[ArchEnrichRoot[_], String] = 18 | ArchiveSparkEnrichJob.plainTextLoad 19 | 20 | def functions(conf: DerivationJobConf): Seq[EnrichFunc[ArchEnrichRoot[_], _, _]] 21 | 22 | override def enrich( 23 | rdd: RDD[ArchEnrichRoot[_]], 24 | conf: DerivationJobConf): RDD[ArchEnrichRoot[_]] = { 25 | val funcs = functions(conf) 26 | var enriched = if (funcs.length <= 1) rdd else { 27 | val (longest, longestPath) = funcs.map(f => (f, f.dependencyPath.toSet)).maxBy(_._2.size) 28 | if (funcs.exists(f => f != longest && !longestPath.contains(f))) rdd.map { r => 29 | r.cacheEnabled = true 30 | r 31 | } else rdd 32 | } 33 | for (func <- funcs) enriched = enriched.enrich(func) 34 | enriched.map { r => 35 | r.clearCache() 36 | r 37 | } 38 | } 39 | } 40 | 41 | object ArchiveSparkEnrichJob { 42 | val byteLoad: DataLoadPointer[ArchEnrichRoot[_], Bytes] = 43 | DataLoadPointer(ByteLoad) 44 | 45 | val fileLoad: DataLoadPointer[ArchEnrichRoot[_], String] = 46 | DataLoadPointer(FileLoad) 47 | 48 | val plainTextLoad: DataLoadPointer[ArchEnrichRoot[_], String] = 49 | DataLoadPointer(PlainTextLoad) 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/base/FileLoad.scala: 
-------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.base 2 | 3 | import org.archive.webservices.archivespark.model.dataloads.DataLoad 4 | 5 | object FileLoad extends DataLoad[String] { 6 | trait Root extends DataLoadRoot 7 | } 8 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/base/LocalFileCache.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.base 2 | 3 | import org.apache.commons.io.input.BoundedInputStream 4 | import org.archive.webservices.archivespark.util.Bytes 5 | import org.archive.webservices.sparkling._ 6 | import org.archive.webservices.sparkling.io.IOUtil 7 | import org.archive.webservices.sparkling.logging.{Log, LogContext} 8 | 9 | import java.io.{BufferedInputStream, File, FileInputStream, InputStream} 10 | import scala.util.Try 11 | 12 | object LocalFileCache { 13 | val MaxMemoryCacheSize: Long = 1.mb 14 | } 15 | 16 | trait LocalFileCache { 17 | implicit private val logContext: LogContext = LogContext(this) 18 | 19 | @transient private var _memoryCache: Option[Array[Byte]] = None 20 | @transient private var _localCacheFile: Option[File] = None 21 | 22 | @transient var cacheEnabled = false 23 | 24 | def isLocalCached: Boolean = _localCacheFile.isDefined 25 | 26 | def localCacheFile: Option[File] = _localCacheFile 27 | 28 | private def cacheLocal(in: => Option[InputStream]): File = _localCacheFile.getOrElse { 29 | synchronized(_localCacheFile.getOrElse { 30 | val file = IOUtil.tmpFile 31 | Log.info(s"Caching to ${file.getPath}...") 32 | val out = IOUtil.fileOut(file) 33 | try { 34 | for (bytes <- _memoryCache) { 35 | out.write(bytes) 36 | _memoryCache = None 37 | } 38 | for (s <- in) { 39 | try { 40 | IOUtil.copy(s, out) 41 | } finally { 42 | s.close() 43 | } 44 | } 45 | } finally out.close() 46 | _localCacheFile = Some(file) 47 | Log.info(s"Cached ${file.getPath}.") 48 | file 49 | }) 50 | } 51 | 52 | def cacheLocal(): File = cacheLocal(if (_memoryCache.isDefined) None else Some(payloadAccess)) 53 | 54 | def clearCache(): Unit = if (_localCacheFile.isDefined || _memoryCache.isDefined) synchronized { 55 | for (file <- _localCacheFile) file.delete() 56 | _localCacheFile = None 57 | _memoryCache = None 58 | } 59 | 60 | def localFileCache: Option[InputStream] = _localCacheFile.map { file => 61 | new BufferedInputStream(new FileInputStream(file)) 62 | } 63 | 64 | def cachePayload(): Unit = if (_memoryCache.isEmpty && _localCacheFile.isEmpty) { 65 | synchronized { 66 | if (_memoryCache.isEmpty && _localCacheFile.isEmpty) { 67 | val in = payloadAccess 68 | try { 69 | val bounded = new BoundedInputStream(in, LocalFileCache.MaxMemoryCacheSize + 1) 70 | val array = IOUtil.bytes(bounded) 71 | _memoryCache = Some(array) 72 | if (array.length > LocalFileCache.MaxMemoryCacheSize) cacheLocal(Some(in)) 73 | } catch { 74 | case e: Exception => 75 | // skip if payload can't be read, e.g. 
malformed HTTP stream / decoding error 76 | Log.error(e.getMessage) 77 | } finally { 78 | Try(in.close()) 79 | } 80 | } 81 | } 82 | } 83 | 84 | def cachedPayload: Bytes = Bytes.either({ 85 | if (cacheEnabled) cachePayload() 86 | _memoryCache.map(Left(_)).getOrElse { 87 | _localCacheFile.map(file => Right(new FileInputStream(file))).getOrElse { 88 | Right(payloadAccess) 89 | } 90 | } 91 | }) 92 | 93 | def payloadAccess: InputStream 94 | } 95 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/base/PlainTextLoad.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.base 2 | 3 | import org.archive.webservices.archivespark.model.dataloads.DataLoad 4 | 5 | object PlainTextLoad extends DataLoad[String] { 6 | trait Root extends DataLoadRoot 7 | } 8 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/functions/ArchFileBytes.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.functions 2 | 3 | import org.archive.webservices.archivespark.model.pointers.FieldPointer 4 | import org.archive.webservices.archivespark.model.{Derivatives, EnrichFunc, TypedEnrichRoot, TypedEnrichable} 5 | import org.archive.webservices.archivespark.util.Bytes 6 | import org.archive.webservices.ars.model.collections.inputspecs.FileRecord 7 | import org.archive.webservices.ars.processing.jobs.archivespark.base.LocalFileCache 8 | import org.archive.webservices.sparkling._ 9 | 10 | import scala.util.Try 11 | 12 | object ArchFileBytes 13 | extends EnrichFunc[TypedEnrichRoot[FileRecord] with LocalFileCache, FileRecord, Bytes] { 14 | val MaxContentLength: Long = 1.mb 15 | 16 | val source: FieldPointer[TypedEnrichRoot[FileRecord] with LocalFileCache, FileRecord] = 17 | FieldPointer.root[TypedEnrichRoot[FileRecord] with LocalFileCache, FileRecord] 18 | 19 | val fields: Seq[String] = Seq("bytes") 20 | 21 | override def derive(source: TypedEnrichable[FileRecord], derivatives: Derivatives): Unit = { 22 | derivatives << Try(source.asInstanceOf[LocalFileCache].cachedPayload).toOption 23 | .getOrElse(Bytes(source.get.access)) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/functions/ArchFileCache.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.functions 2 | 3 | import org.archive.webservices.archivespark.model.pointers.FieldPointer 4 | import org.archive.webservices.archivespark.model.{Derivatives, EnrichFunc, EnrichRoot, TypedEnrichable} 5 | import org.archive.webservices.ars.processing.jobs.archivespark.base.LocalFileCache 6 | 7 | object ArchFileCache extends EnrichFunc[EnrichRoot with LocalFileCache, Any, String] { 8 | val source: FieldPointer[EnrichRoot with LocalFileCache, Any] = FieldPointer(Seq.empty) 9 | val fields: Seq[String] = Seq("filePath") 10 | override val isTransparent: Boolean = true 11 | override def derive(source: TypedEnrichable[Any], derivatives: Derivatives): Unit = { 12 | derivatives << source.asInstanceOf[LocalFileCache].cacheLocal().getPath 13 | } 14 | } 15 |
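A minimal usage sketch (not part of the repository) of the caching pieces above: LocalFileCache keeps payloads up to MaxMemoryCacheSize (1 MB) in memory and spills anything larger to a temp file, ArchFileCache derives that temp-file path as the "filePath" field, and ArchFileBytes derives the cached payload as the "bytes" field. The ToyRecord class and the 2 MB dummy payload below are assumptions for illustration only.

// Illustrative sketch, not repository code: a toy record mixing in LocalFileCache
// to show the memory-then-temp-file caching behaviour the enrich functions rely on.
import java.io.{ByteArrayInputStream, InputStream}
import org.archive.webservices.ars.processing.jobs.archivespark.base.LocalFileCache

object LocalFileCacheSketch extends App {
  class ToyRecord(bytes: Array[Byte]) extends LocalFileCache {
    override def payloadAccess: InputStream = new ByteArrayInputStream(bytes)
  }

  val record = new ToyRecord(Array.fill(2 * 1024 * 1024)(0: Byte)) // ~2 MB, above the 1 MB in-memory limit
  record.cacheEnabled = true
  record.cachePayload()                // too large for the memory cache, so it spills to a temp file
  println(record.isLocalCached)        // true: the payload now lives in a local temp file
  println(record.cacheLocal().getPath) // the path ArchFileCache would expose as "filePath"
  record.clearCache()                  // deletes the temp file and drops any in-memory copy
}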
-------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/functions/ArchWarcPayload.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.functions 2 | 3 | import org.archive.webservices.archivespark.model.pointers.FieldPointer 4 | import org.archive.webservices.archivespark.model.{Derivatives, EnrichFunc, TypedEnrichable} 5 | import org.archive.webservices.archivespark.specific.warc.functions._ 6 | import org.archive.webservices.archivespark.util.Bytes 7 | import org.archive.webservices.ars.processing.jobs.archivespark.base.ArchWarcRecord 8 | import org.archive.webservices.sparkling.cdx.CdxRecord 9 | 10 | class ArchWarcPayload private (http: Boolean = true) 11 | extends EnrichFunc[ArchWarcRecord, CdxRecord, Bytes] { 12 | import WarcPayloadFields._ 13 | 14 | val source: FieldPointer[ArchWarcRecord, CdxRecord] = 15 | FieldPointer.root[ArchWarcRecord, CdxRecord] 16 | 17 | val fields: Seq[String] = { 18 | if (http) Seq(RecordHeader, HttpStatusLine, HttpHeader, Payload) 19 | else Seq(RecordHeader, Payload) 20 | } 21 | 22 | override val defaultField: String = Payload 23 | 24 | override def derive(source: TypedEnrichable[CdxRecord], derivatives: Derivatives): Unit = { 25 | val record = source.asInstanceOf[ArchWarcRecord] 26 | val warc = record.warc 27 | derivatives << warc.headers.toMap 28 | if (http) { 29 | for (msg <- warc.http) { 30 | derivatives << msg.statusLine 31 | derivatives << msg.headers 32 | derivatives << record.cachedPayload 33 | } 34 | } else { 35 | derivatives << record.cachedPayload 36 | } 37 | } 38 | } 39 | 40 | object ArchWarcPayload extends ArchWarcPayload(http = true) { 41 | def apply(http: Boolean = true) = new ArchWarcPayload(http) 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/functions/CoreNlpEntities.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.functions 2 | 3 | import org.archive.webservices.archivespark.functions.{Entities, EntitiesConstants} 4 | import org.archive.webservices.archivespark.model.{EnrichFunc, EnrichRoot} 5 | import org.archive.webservices.ars.processing.jobs.archivespark.base.LocalFileCache 6 | import org.archive.webservices.sparkling.io.StageSyncManager 7 | import org.archive.webservices.sparkling.util.IteratorUtil 8 | 9 | import java.util.Properties 10 | 11 | class CoreNlpEntities( 12 | properties: Properties = EntitiesConstants.DefaultProps, 13 | filterLatin: Boolean = false) extends Entities(properties, filterLatin = filterLatin) { 14 | override def initPartition(partition: Iterator[EnrichRoot]): Iterator[EnrichRoot] = { 15 | StageSyncManager.lockMutex() 16 | super.initPartition(partition) 17 | } 18 | 19 | override def cleanup(): Unit = StageSyncManager.unlockMutex() 20 | 21 | override def enrichPartition[R <: EnrichRoot](partition: Iterator[R], func: EnrichFunc[R, _, _]): Iterator[R] = { 22 | IteratorUtil.preload(partition.map { r => 23 | r.asInstanceOf[LocalFileCache].cachePayload() 24 | r 25 | }, numPreload = 50, parallelism = 2)(func.enrich) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- 
/src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/functions/TrOCR.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.functions 2 | 3 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.adapters.CondaBasedArchiveSparkFunctionAdapter 4 | 5 | object TrOCR extends CondaBasedArchiveSparkFunctionAdapter[String] { 6 | override def func: CondaBasedFunction[String] = new TrOCR 7 | } 8 | 9 | class TrOCR extends CondaBasedFunction[String] { 10 | override val label: String = "trocr" 11 | override val dataDir: String = s"$label/20240807195100" 12 | override val condaEnv: String = s"conda-$label-env" 13 | override val pythonFile: String = s"$label-run.py" 14 | override val additionalPackages: Seq[String] = 15 | Seq(s"$label-models.tar.gz", "craft-pytorch.tar.gz") 16 | override val pythonArgumentFiles: Seq[String] = Seq( 17 | s"$label-base-handwritten", 18 | "weights/craft_mlt_25k.pth", 19 | "weights/craft_refiner_CTW1500.pth") 20 | 21 | override def processOutput(output: String): Option[String] = Some(output) 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/functions/Whisper.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.functions 2 | 3 | import _root_.io.circe.parser._ 4 | import io.circe.Json 5 | import org.archive.webservices.ars.Arch 6 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.adapters.CondaBasedArchiveSparkFunctionAdapter 7 | 8 | object Whisper extends CondaBasedArchiveSparkFunctionAdapter[Json] { 9 | override def func: CondaBasedFunction[Json] = new Whisper 10 | } 11 | 12 | class Whisper extends CondaBasedFunction[Json] { 13 | override val label: String = "whisper" 14 | override val dataDir: String = s"$label/20240807195100" 15 | override val condaEnv: String = s"conda-$label-env" 16 | override val pythonFile: String = s"$label-run.py" 17 | override val pythonArgumentFiles: Seq[String] = Seq("base.en.pt") 18 | 19 | override def processOutput(output: String): Option[Json] = { 20 | val trim = output.trim 21 | if (trim.isEmpty) None 22 | else 23 | parse(trim) match { 24 | case Left(failure) => 25 | Arch.reportError( 26 | s"ArchiveSpark Whisper Output JSON Parsing Error", 27 | failure.getMessage(), 28 | Map("output" -> output)) 29 | None 30 | case Right(json) => Some(json) 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/functions/WhisperText.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.functions 2 | 3 | import io.circe.Json 4 | import org.archive.webservices.archivespark.model.{EnrichFunc, EnrichRoot, GlobalEnrichFunc} 5 | import org.archive.webservices.ars.processing.DerivationJobParameters 6 | import org.archive.webservices.ars.processing.jobs.archivespark.base.LocalFileCache 7 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.adapters.ArchArchiveSparkFunctionAdapter 8 | 9 | object WhisperText extends ArchArchiveSparkFunctionAdapter[Json] { 10 | override def initFunc(params: DerivationJobParameters): EnrichFunc[_, Json, _] = { 
11 | params.get[Double]("maxNoSpeechProb") match { 12 | case Some(maxNoSpeechProb) => new WhisperText(maxNoSpeechProb) 13 | case None => super.initFunc(params) 14 | } 15 | } 16 | 17 | override def baseFunc: EnrichFunc[_, Json, _] = new WhisperText(0.5) 18 | } 19 | 20 | class WhisperText(maxNoSpeechProb: Double) 21 | extends GlobalEnrichFunc[EnrichRoot with LocalFileCache, Json, String] { 22 | val func: EnrichFunc[EnrichRoot with LocalFileCache, Json, String] = Whisper.func.map("text") { 23 | json => 24 | json.asArray.toSeq.flatten 25 | .map(_.hcursor) 26 | .filter { cursor => 27 | cursor.get[Double]("no_speech_prob").exists(_ <= maxNoSpeechProb) 28 | } 29 | .flatMap { cursor => 30 | cursor.get[String]("text").toOption 31 | } 32 | .mkString 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/functions/adapters/ArchArchiveSparkFunctionAdapter.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.functions.adapters 2 | 3 | import org.archive.webservices.archivespark.model.EnrichFunc 4 | import org.archive.webservices.archivespark.model.pointers.FieldPointer 5 | import org.archive.webservices.ars.processing.DerivationJobParameters 6 | import org.archive.webservices.ars.processing.jobs.archivespark.base.ArchEnrichRoot 7 | 8 | import scala.util.Try 9 | 10 | trait ArchArchiveSparkFunctionAdapter[Source] { 11 | def name: String = baseFunc.getClass.getSimpleName.stripSuffix("$") 12 | def baseFunc: EnrichFunc[_, Source, _] 13 | def defaultDependency: Option[FieldPointer[ArchEnrichRoot[_], Source]] = None 14 | def noParams(on: Option[FieldPointer[ArchEnrichRoot[_], Source]]) 15 | : EnrichFunc[ArchEnrichRoot[_], Source, _] = { 16 | val dependency = on.orElse(defaultDependency) 17 | dependency 18 | .map(baseFunc.on(_)) 19 | .getOrElse(baseFunc) 20 | .asInstanceOf[EnrichFunc[ArchEnrichRoot[_], Source, _]] 21 | } 22 | def noParams: EnrichFunc[ArchEnrichRoot[_], Source, _] = noParams(None) 23 | def withParams( 24 | params: DerivationJobParameters, 25 | on: Option[FieldPointer[ArchEnrichRoot[_], Source]] = None) 26 | : EnrichFunc[ArchEnrichRoot[_], Source, _] = { 27 | if (params.isEmpty) noParams(on) 28 | else { 29 | val dependency = on.orElse(defaultDependency) 30 | val func = initFunc(params) 31 | dependency 32 | .map(func.on) 33 | .getOrElse(func) 34 | .asInstanceOf[EnrichFunc[ArchEnrichRoot[_], Source, _]] 35 | } 36 | } 37 | def initFunc(params: DerivationJobParameters): EnrichFunc[_, Source, _] = baseFunc 38 | def toDependencyPointer(func: EnrichFunc[ArchEnrichRoot[_], _, _]) 39 | : Option[FieldPointer[ArchEnrichRoot[_], Source]] = Try { 40 | func.asInstanceOf[FieldPointer[ArchEnrichRoot[_], Source]] 41 | }.toOption 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/functions/adapters/CondaBasedArchiveSparkFunctionAdapter.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.functions.adapters 2 | import org.archive.webservices.archivespark.model.EnrichFunc 3 | import org.archive.webservices.ars.processing.DerivationJobParameters 4 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.CondaBasedFunction 5 | 6 | trait 
CondaBasedArchiveSparkFunctionAdapter[Source] 7 | extends ArchArchiveSparkFunctionAdapter[Source] { 8 | def func: CondaBasedFunction[Source] 9 | 10 | override def baseFunc: EnrichFunc[_, Source, _] = func.asInstanceOf[EnrichFunc[_, Source, _]] 11 | 12 | override def initFunc(params: DerivationJobParameters): EnrichFunc[_, Source, _] = { 13 | val f = func 14 | f.initFunc(params) 15 | f.asInstanceOf[EnrichFunc[_, Source, _]] 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/functions/adapters/EntitiesAdapter.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.functions.adapters 2 | import edu.stanford.nlp.pipeline.StanfordCoreNLP 3 | import org.archive.webservices.archivespark.functions.{Entities, EntitiesConstants} 4 | import org.archive.webservices.archivespark.model.EnrichFunc 5 | import org.archive.webservices.archivespark.model.pointers.DataLoadPointer 6 | import org.archive.webservices.ars.processing.DerivationJobParameters 7 | import org.archive.webservices.ars.processing.jobs.archivespark.base.{ArchEnrichRoot, ArchiveSparkEnrichJob} 8 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.CoreNlpEntities 9 | 10 | import java.util.Properties 11 | import scala.collection.JavaConverters.asScalaSetConverter 12 | 13 | object EntitiesAdapter extends ArchArchiveSparkFunctionAdapter[String] { 14 | override lazy val baseFunc: Entities = new CoreNlpEntities() 15 | 16 | override def defaultDependency: Option[DataLoadPointer[ArchEnrichRoot[_], String]] = Some( 17 | ArchiveSparkEnrichJob.plainTextLoad) 18 | 19 | override def initFunc(params: DerivationJobParameters): EnrichFunc[_, String, _] = { 20 | val langParam = params.get[String]("lang").map(_.toLowerCase) 21 | langParam match { 22 | case Some("chinese") => new CoreNlpEntities(properties(langParam)) 23 | case _ => new CoreNlpEntities(properties(langParam), filterLatin = true) 24 | } 25 | } 26 | 27 | def properties(lang: Option[String] = None): Properties = { 28 | val default = EntitiesConstants.DefaultProps 29 | lang match { 30 | case Some(l) => 31 | val props = new StanfordCoreNLP(l).getProperties 32 | for (p <- default.stringPropertyNames.asScala) 33 | props.setProperty(p, default.getProperty(p)) 34 | props 35 | case None => default 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/preset/EntityExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.preset 2 | 3 | import org.archive.webservices.archivespark.model.EnrichFunc 4 | import org.archive.webservices.ars.model.{ArchJobCategories, ArchJobCategory} 5 | import org.archive.webservices.ars.processing.jobs.archivespark.base.{ArchEnrichRoot, ArchWarcRecord, ArchiveSparkEnrichJob} 6 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.adapters.EntitiesAdapter 7 | import org.archive.webservices.ars.processing.{DerivationJobConf, DerivationJobParameters} 8 | 9 | object EntityExtraction extends ArchiveSparkEnrichJob { 10 | val name: String = "Named entities" 11 | val description: String = 12 | "Names of persons, organizations, and geographic locations detected in each text-bearing 
document in the collection. Output: one or more JSONL files comprising a JSON object for each input record." 13 | 14 | val uuid: String = "018d114d-3426-730e-94a1-b56ca73fc1ad" 15 | 16 | override val infoUrl = 17 | "https://arch-webservices.zendesk.com/hc/en-us/articles/15810489328276-ARCH-named-entities-datasets" 18 | 19 | val category: ArchJobCategory = ArchJobCategories.Text 20 | 21 | override def warcPredicate(conf: DerivationJobConf): ArchWarcRecord => Boolean = { 22 | val superFilter = super.warcPredicate(conf) 23 | warc => superFilter(warc) && warc.status == 200 24 | } 25 | 26 | override def genericPredicate(conf: DerivationJobConf): ArchEnrichRoot[_] => Boolean = { 27 | record => record.mime.startsWith("text/") 28 | } 29 | 30 | def entitiesFunc(params: DerivationJobParameters): EnrichFunc[ArchEnrichRoot[_], _, _] = { 31 | EntitiesAdapter.withParams(params) 32 | } 33 | 34 | def functions(conf: DerivationJobConf): Seq[EnrichFunc[ArchEnrichRoot[_], _, _]] = { 35 | Seq(entitiesFunc(conf.params)) 36 | } 37 | } -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/preset/TrOcrEntityExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.preset 2 | 3 | import org.archive.webservices.archivespark.model.EnrichFunc 4 | import org.archive.webservices.ars.model.{ArchJobCategories, ArchJobCategory} 5 | import org.archive.webservices.ars.processing.DerivationJobConf 6 | import org.archive.webservices.ars.processing.jobs.archivespark.AiJob 7 | import org.archive.webservices.ars.processing.jobs.archivespark.base.{ArchEnrichRoot, ArchWarcRecord} 8 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.TrOCR 9 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.adapters.EntitiesAdapter 10 | 11 | object TrOcrEntityExtraction extends AiJob { 12 | val uuid: String = "019078a8-7b16-7a87-8b50-a30166e547dd" 13 | 14 | val name: String = "Named entities from text recognition" 15 | val description: String = 16 | "Names of persons, organizations, geographic locations, and dates from text recognized in collection images. Output: one or more JSONL files comprising a JSON object for each input record." 
17 | 18 | override def infoUrl: String = "https://arch-webservices.zendesk.com/hc/en-us/articles/15810489328276-ARCH-named-entities-datasets" 19 | 20 | override val category: ArchJobCategory = ArchJobCategories.Text 21 | 22 | override def warcPredicate(conf: DerivationJobConf): ArchWarcRecord => Boolean = { 23 | val superFilter = super.warcPredicate(conf) 24 | warc => superFilter(warc) && warc.status == 200 25 | } 26 | 27 | override def genericPredicate(conf: DerivationJobConf): ArchEnrichRoot[_] => Boolean = { 28 | record => record.mime.startsWith("image/") 29 | } 30 | 31 | def functions(conf: DerivationJobConf): Seq[EnrichFunc[ArchEnrichRoot[_], _, _]] = { 32 | val text = TrOCR.noParams 33 | val entities = EntitiesAdapter.noParams(on = EntitiesAdapter.toDependencyPointer(text)) 34 | Seq(text, entities) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/preset/TrOcrProcessing.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.preset 2 | 3 | import org.archive.webservices.archivespark.model.EnrichFunc 4 | import org.archive.webservices.ars.model.{ArchJobCategories, ArchJobCategory} 5 | import org.archive.webservices.ars.processing.DerivationJobConf 6 | import org.archive.webservices.ars.processing.jobs.archivespark.AiJob 7 | import org.archive.webservices.ars.processing.jobs.archivespark.base.{ArchEnrichRoot, ArchWarcRecord} 8 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.TrOCR 9 | 10 | object TrOcrProcessing extends AiJob { 11 | val uuid: String = "019078a5-c6f3-7051-bb71-5b1f135307df" 12 | 13 | val name: String = "Text recognition" 14 | val description: String = 15 | "Text recognized and transcribed from images in a collection, including handwriting. Output: one or more JSONL files comprising a JSON object for each input record." 
16 | 17 | override def infoUrl: String = "https://arch-webservices.zendesk.com/hc/en-us/articles/14410760790164-ARCH-Text-datasets#ocr" 18 | 19 | override val category: ArchJobCategory = ArchJobCategories.Text 20 | 21 | override def warcPredicate(conf: DerivationJobConf): ArchWarcRecord => Boolean = { 22 | val superFilter = super.warcPredicate(conf) 23 | warc => superFilter(warc) && warc.status == 200 24 | } 25 | 26 | override def genericPredicate(conf: DerivationJobConf): ArchEnrichRoot[_] => Boolean = { 27 | record => record.mime.startsWith("image/") 28 | } 29 | 30 | def functions(conf: DerivationJobConf): Seq[EnrichFunc[ArchEnrichRoot[_], _, _]] = { 31 | Seq(TrOCR.noParams) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/preset/WhisperEntityExtraction.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.preset 2 | 3 | import org.archive.webservices.archivespark.model.EnrichFunc 4 | import org.archive.webservices.ars.model.{ArchJobCategories, ArchJobCategory} 5 | import org.archive.webservices.ars.processing.DerivationJobConf 6 | import org.archive.webservices.ars.processing.jobs.archivespark 7 | import org.archive.webservices.ars.processing.jobs.archivespark.AiJob 8 | import org.archive.webservices.ars.processing.jobs.archivespark.base.{ArchEnrichRoot, ArchWarcRecord} 9 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.adapters.EntitiesAdapter 10 | 11 | object WhisperEntityExtraction extends AiJob { 12 | val uuid: String = "018f7b09-f7ca-756d-a4ca-69cea914185d" 13 | 14 | val name: String = "Named entities from speech recognition" 15 | val description: String = 16 | "Names of persons, organizations, geographic locations, and dates in text transcribed from collection audio and video documents. Output: one or more JSONL files comprising a JSON object for each input record." 
17 | 18 | override def infoUrl: String = "https://arch-webservices.zendesk.com/hc/en-us/articles/15810489328276-ARCH-named-entities-datasets" 19 | 20 | override val category: ArchJobCategory = ArchJobCategories.Text 21 | 22 | override def warcPredicate(conf: DerivationJobConf): ArchWarcRecord => Boolean = { 23 | val superFilter = super.warcPredicate(conf) 24 | warc => superFilter(warc) && warc.status == 200 25 | } 26 | 27 | override def genericPredicate(conf: DerivationJobConf): ArchEnrichRoot[_] => Boolean = { 28 | record => record.mime.startsWith("audio/") || record.mime.startsWith("video/") 29 | } 30 | 31 | def functions(conf: DerivationJobConf): Seq[EnrichFunc[ArchEnrichRoot[_], _, _]] = { 32 | val whisperText = archivespark.functions.WhisperText.withParams(conf.params) 33 | val entities = EntitiesAdapter.noParams(on = EntitiesAdapter.toDependencyPointer(whisperText)) 34 | Seq(whisperText, entities) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/preset/WhisperText.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.preset 2 | 3 | import org.archive.webservices.archivespark.model.EnrichFunc 4 | import org.archive.webservices.ars.model.{ArchJobCategories, ArchJobCategory} 5 | import org.archive.webservices.ars.processing.DerivationJobConf 6 | import org.archive.webservices.ars.processing.jobs.archivespark 7 | import org.archive.webservices.ars.processing.jobs.archivespark.AiJob 8 | import org.archive.webservices.ars.processing.jobs.archivespark.base.{ArchEnrichRoot, ArchWarcRecord} 9 | 10 | object WhisperText extends AiJob { 11 | val uuid: String = "0191e26a-056c-77e2-8fe0-dfba9928b3e2" 12 | 13 | val name: String = "Speech recognition" 14 | val description: String = 15 | "Text transcribed from speech recognized in collection audio and video documents. Output: one or more JSONL files comprising a JSON object for each input record." 
16 | 17 | override def infoUrl: String = "https://arch-webservices.zendesk.com/hc/en-us/articles/14410760790164-ARCH-Text-datasets#speech" 18 | 19 | override val category: ArchJobCategory = ArchJobCategories.Text 20 | 21 | override def warcPredicate(conf: DerivationJobConf): ArchWarcRecord => Boolean = { 22 | val superFilter = super.warcPredicate(conf) 23 | warc => superFilter(warc) && warc.status == 200 24 | } 25 | 26 | override def genericPredicate(conf: DerivationJobConf): ArchEnrichRoot[_] => Boolean = { 27 | record => record.mime.startsWith("audio/") || record.mime.startsWith("video/") 28 | } 29 | 30 | def functions(conf: DerivationJobConf): Seq[EnrichFunc[ArchEnrichRoot[_], _, _]] = { 31 | Seq(archivespark.functions.WhisperText.withParams(conf.params)) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/archivespark/preset/WhisperTranscription.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.archivespark.preset 2 | 3 | import org.archive.webservices.archivespark.model.EnrichFunc 4 | import org.archive.webservices.ars.model.{ArchJobCategories, ArchJobCategory} 5 | import org.archive.webservices.ars.processing.DerivationJobConf 6 | import org.archive.webservices.ars.processing.jobs.archivespark.AiJob 7 | import org.archive.webservices.ars.processing.jobs.archivespark.base.{ArchEnrichRoot, ArchWarcRecord} 8 | import org.archive.webservices.ars.processing.jobs.archivespark.functions.Whisper 9 | 10 | object WhisperTranscription extends AiJob { 11 | val uuid: String = "018f7b0a-4f3c-7846-862a-ff1ae26ce139" 12 | 13 | val name: String = "Speech recognition (raw)" 14 | val description: String = 15 | "Raw transcription output and technical metadata from speech recognized in collection audio and video documents. Output: one or more JSONL files comprising a JSON object for each input record." 
16 | 17 | override def infoUrl: String = "https://arch-webservices.zendesk.com/hc/en-us/articles/14410760790164-ARCH-Text-datasets#speech" 18 | 19 | override val category: ArchJobCategory = ArchJobCategories.Text 20 | 21 | override def warcPredicate(conf: DerivationJobConf): ArchWarcRecord => Boolean = { 22 | val superFilter = super.warcPredicate(conf) 23 | warc => superFilter(warc) && warc.status == 200 24 | } 25 | 26 | override def genericPredicate(conf: DerivationJobConf): ArchEnrichRoot[_] => Boolean = { 27 | record => record.mime.startsWith("audio/") || record.mime.startsWith("video/") 28 | } 29 | 30 | def functions(conf: DerivationJobConf): Seq[EnrichFunc[ArchEnrichRoot[_], _, _]] = { 31 | Seq(Whisper.noParams) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/shared/ArsJob.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.shared 2 | 3 | import org.archive.webservices.ars.WasapiController 4 | import org.archive.webservices.ars.model.ArchConf 5 | import org.archive.webservices.ars.processing.{DerivationJob, DerivationJobConf} 6 | 7 | trait ArsJob extends DerivationJob { 8 | override def templateVariables(conf: DerivationJobConf): Seq[(String, Any)] = { 9 | val wasapiUrl = ArchConf.baseUrl + { 10 | "/wasapi/v1/jobs/" + id + "/result?collection=" + conf.inputSpec.collectionId + { 11 | if (conf.isSample) "&sample=true" else "" 12 | } 13 | } 14 | super.templateVariables(conf) ++ Seq( 15 | "wasapiUrl" -> wasapiUrl, 16 | "wasapiPages" -> (outFiles(conf).size.toDouble / WasapiController.FixedPageSize).ceil.toInt) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/processing/jobs/system/MetadataSummary.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.processing.jobs.system 2 | 3 | import org.archive.webservices.ars.model.collections.inputspecs.InputSpecLoader 4 | import org.archive.webservices.ars.model.collections.inputspecs.meta.FileMetaSummary 5 | import org.archive.webservices.ars.model.{ArchJobCategories, ArchJobCategory, ArchJobInstanceInfo, DerivativeOutput} 6 | import org.archive.webservices.ars.processing._ 7 | import org.archive.webservices.sparkling.Sparkling.executionContext 8 | import org.archive.webservices.sparkling.io._ 9 | 10 | import scala.concurrent.Future 11 | 12 | object MetadataSummary extends SparkJob { 13 | val name = "Metadata Summary" 14 | val uuid = "4a3fae37-99de-4a64-843d-bce3a44807b1" 15 | val category: ArchJobCategory = ArchJobCategories.System 16 | def description = "Summarizes metadata of a given input spec" 17 | 18 | val relativeOutPath: String = s"/$id" 19 | 20 | val SummaryFile = "summary.json" 21 | 22 | def run(conf: DerivationJobConf): Future[Boolean] = { 23 | SparkJobManager.context.map { sc => 24 | SparkJobManager.initThread(sc, MetadataSummary, conf) 25 | InputSpecLoader.loadSpark(conf.inputSpec) { rdd => 26 | val summary = rdd 27 | .mapPartitions { partition => 28 | val summary = new FileMetaSummary() 29 | for (f <- partition) summary.add(f.meta) 30 | Iterator(summary) 31 | } 32 | .fold(FileMetaSummary.empty)(_ ++ _) 33 | HdfsIO.writeLines( 34 | conf.outputPath + relativeOutPath + "/" + SummaryFile, 35 | lines = Seq(summary.toJsonSchema.spaces4)) 36 | true 37 | } 38 | } 39 | } 40 | 
41 | override def history(conf: DerivationJobConf): DerivationJobInstance = { 42 | val instance = super.history(conf) 43 | val started = 44 | HdfsIO.exists(conf.outputPath + relativeOutPath + "/" + ArchJobInstanceInfo.InfoFile) 45 | if (started) { 46 | val completed = HdfsIO.exists(conf.outputPath + relativeOutPath + "/" + SummaryFile) 47 | instance.state = if (completed) ProcessingState.Finished else ProcessingState.Failed 48 | } 49 | instance 50 | } 51 | 52 | override def outFiles(conf: DerivationJobConf): Iterator[DerivativeOutput] = Iterator( 53 | DerivativeOutput(SummaryFile, conf.outputPath + relativeOutPath, "JSON", "application/json")) 54 | 55 | override val templateName: Option[String] = Some("jobs/DefaultArsJob") 56 | 57 | override def reset(conf: DerivationJobConf): Unit = 58 | HdfsIO.delete(conf.outputPath + relativeOutPath) 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/util/CacheUtil.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.util 2 | 3 | import org.scalatra.ActionResult 4 | import org.scalatra.guavaCache.GuavaCache 5 | 6 | import javax.servlet.http.HttpServletRequest 7 | import scala.concurrent.duration._ 8 | 9 | object CacheUtil { 10 | val Charset: String = "UTF-8" 11 | 12 | val RequestCacheDuration: Duration = 10.minutes 13 | 14 | def cache[R](key: String, enabled: Boolean = true, ttl: Option[Duration] = None)( 15 | value: => R): R = 16 | if (enabled) { 17 | GuavaCache.get[R](key) match { 18 | case Some(cached) => cached 19 | case None => 20 | val v = value 21 | GuavaCache.put(key, v, ttl) 22 | v 23 | } 24 | } else value 25 | 26 | def put[R](key: String, value: R, enabled: Boolean = true, ttl: Option[Duration] = None): R = { 27 | GuavaCache.put(key, value, ttl) 28 | value 29 | } 30 | 31 | def get[R](key: String): Option[R] = GuavaCache.get[R](key) 32 | 33 | def cacheRequest( 34 | request: HttpServletRequest, 35 | enabled: Boolean = true, 36 | subjects: Set[Any] = Set.empty)(value: => ActionResult): ActionResult = 37 | if (enabled) { 38 | val key = "request#" + request.getRequestURI + "?" 
+ request.getQueryString 39 | Iterator 40 | .continually { 41 | GuavaCache.get[Option[ActionResult]](key) match { 42 | case Some(cached) => 43 | if (cached.isEmpty) Thread.sleep(1000) 44 | cached 45 | case None => 46 | GuavaCache.put(key, None, None) 47 | try { 48 | val result = value 49 | if (result.status.code == 200) 50 | GuavaCache.put(key, Some(result), Some(RequestCacheDuration)) 51 | else GuavaCache.remove(key) 52 | Some(result) 53 | } catch { 54 | case e: Exception => 55 | GuavaCache.remove(key) 56 | throw e 57 | } 58 | } 59 | } 60 | .flatten 61 | .next() 62 | } else value 63 | } 64 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/util/Common.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.util 2 | 3 | object Common { 4 | def tryOrElse[R](orElse: => R)(action: => R): R = { 5 | try { 6 | action 7 | } catch { 8 | case e: Exception => 9 | e.printStackTrace() 10 | orElse 11 | } 12 | } 13 | 14 | def retryWhile(cond: => Boolean, sleepMs: Int, maxTimes: Int, sleepInc: Int => Int): Boolean = { 15 | var sleep = sleepMs 16 | var times = 1 17 | var result = cond 18 | while (result && times < maxTimes) { 19 | Thread.sleep(sleep) 20 | sleep = sleepInc(sleep) 21 | times += 1 22 | result = cond 23 | } 24 | !result 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/util/DatafileUtil.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.util 2 | 3 | import io.circe.parser.parse 4 | import io.circe.{Json, JsonObject} 5 | 6 | import java.io.{File, PrintWriter} 7 | import scala.io.Source 8 | import scala.util.Try 9 | 10 | object DatafileUtil { 11 | private def getPath(filename: String) = s"data/$filename" 12 | 13 | def load(filename: String): Json = { 14 | val source = Source.fromFile(getPath(filename), "utf-8") 15 | Try { 16 | try { 17 | parse(source.getLines.mkString).right.get 18 | } finally { 19 | source.close() 20 | } 21 | }.getOrElse(Json.fromJsonObject(JsonObject.empty)) 22 | } 23 | 24 | def store(filename: String, json: Json): Unit = { 25 | val path = getPath(filename) 26 | val source = Source.fromFile(path, "utf-8") 27 | try { 28 | val pw = new PrintWriter(new File(path)) 29 | pw.write(json.toString) 30 | pw.close() 31 | } finally { 32 | source.close() 33 | } 34 | } 35 | 36 | def loadArchUsers() = load("arch-users.json") 37 | def storeArchUsers(json: Json) = store("arch-users.json", json) 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/util/DatasetUtil.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.util 2 | 3 | import org.archive.webservices.ars.model.ArchCollection 4 | import org.archive.webservices.ars.model.users.ArchUser 5 | import org.archive.webservices.ars.processing.{DerivationJobConf, DerivationJobInstance, JobManager} 6 | 7 | object DatasetUtil { 8 | def formatId(collectionId: String, job: DerivationJobInstance): String = { 9 | s"${collectionId}:${if (job.conf.isSample) "1" else "0"}:${job.job.id}" 10 | } 11 | 12 | def parseId( 13 | datasetId: String, 14 | user: ArchUser): Option[(ArchCollection, DerivationJobInstance)] = { 15 | val Array(collectionId, isSample, jobId) = 16 | 
datasetId.reverse.split(":", 3).map(_.reverse).reverse 17 | val sample = if (isSample == "1") true else false 18 | for { 19 | collection <- ArchCollection.get(ArchCollection.userCollectionId(collectionId, user)) 20 | job <- ( 21 | JobManager 22 | .getInstanceOrGlobal( 23 | jobId, 24 | DerivationJobConf.collection(collection, sample = sample, global = false), 25 | Some(DerivationJobConf.collection(collection, sample = sample, global = true))) 26 | ) 27 | } yield (collection, job) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/util/FormatUtil.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.util 2 | 3 | import org.archive.webservices.sparkling.util.StringUtil 4 | 5 | import java.time.Instant 6 | 7 | object FormatUtil { 8 | def formatBytes(bytes: Long): String = { 9 | val units = Seq("B", "KB", "MB", "GB", "TB", "PB") 10 | if (bytes < 0) "0 " + units.head 11 | else { 12 | var unitIdx = 0 13 | var b = bytes.toDouble 14 | while (b > 1024 && unitIdx < units.length - 1) { 15 | unitIdx += 1 16 | b = b / 1024 17 | } 18 | StringUtil.formatNumber(b, 1) + " " + units(unitIdx) 19 | } 20 | } 21 | 22 | def instantTimeString(instant: Instant): String = 23 | instant.toString 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/util/HttpUtil.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.util 2 | 3 | import org.apache.commons.io.input.BoundedInputStream 4 | import org.archive.webservices.ars.model.ArchConf 5 | import org.archive.webservices.sparkling._ 6 | import org.archive.webservices.sparkling.html.HtmlProcessor 7 | import org.archive.webservices.sparkling.http.HttpMessage 8 | import org.archive.webservices.sparkling.io.CatchingInputStream 9 | 10 | import java.io.InputStream 11 | import java.net.{HttpURLConnection, InetSocketAddress, Proxy, URL} 12 | import javax.net.ssl.HttpsURLConnection 13 | 14 | object HttpUtil { 15 | val MaxContentLength: Long = 1.mb 16 | 17 | def bodyString(body: InputStream, http: HttpMessage): String = { 18 | val boundedBody = new BoundedInputStream(body, MaxContentLength) 19 | HtmlProcessor.readStream( 20 | new CatchingInputStream(boundedBody), 21 | http.charset.toSeq ++ HttpMessage.BodyCharsets) 22 | } 23 | 24 | lazy val proxy: Proxy = { 25 | val split = ArchConf.httpProxy.split(':') 26 | if (split.length > 1) { 27 | new Proxy(Proxy.Type.HTTP, new InetSocketAddress(split.head, split(1).toInt)) 28 | } else Proxy.NO_PROXY 29 | } 30 | 31 | def openConnection(url: String): HttpURLConnection = { 32 | val u = new URL(url) 33 | if (ArchConf.httpProxyHosts.contains(u.getHost)) { 34 | u.openConnection(proxy) 35 | } else u.openConnection() 36 | }.asInstanceOf[HttpURLConnection] 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/util/LazyCache.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.util 2 | 3 | import io.circe.Json 4 | import io.circe.syntax.EncoderOps 5 | import org.archive.webservices.sparkling.io.HdfsIO 6 | import org.scalatra.{ActionResult, Ok} 7 | 8 | import scala.concurrent.ExecutionContext.Implicits.global 9 | import scala.concurrent.Future 10 | 11 | object LazyCache { 12 | 
val writingSuffix = "_writing" 13 | 14 | private var lazyFuture = Future(true) 15 | private var processing = Map.empty[String, Future[Boolean]] 16 | 17 | def getOrCache[A]( 18 | cacheFile: String)(parse: String => Option[A], write: String => Unit): Future[A] = 19 | synchronized { 20 | val parsed = getIfCached(cacheFile)(parse) 21 | if (parsed.isDefined) return Future(parsed.get) 22 | processing 23 | .getOrElse( 24 | cacheFile, { 25 | lazyFuture = lazyFuture.map(_ => { 26 | val tmpFilePath = cacheFile + writingSuffix 27 | write(tmpFilePath) 28 | HdfsIO.rename(tmpFilePath, cacheFile) 29 | true 30 | }) 31 | processing += cacheFile -> lazyFuture 32 | lazyFuture 33 | }) 34 | .map { _ => 35 | processing -= cacheFile 36 | parse(cacheFile).get 37 | } 38 | } 39 | 40 | def getIfCached[A](cacheFile: String)(parse: String => Option[A]): Option[A] = { 41 | if (HdfsIO.exists(cacheFile)) parse(cacheFile) else None 42 | } 43 | 44 | def lazyJsonResponse[A]( 45 | cached: Option[Future[A]])(orElse: => A, json: A => Json): ActionResult = { 46 | lazyJsonResponse[A, A](cached, identity, orElse, json) 47 | } 48 | 49 | def lazyJsonResponse[A, B]( 50 | cached: Option[Future[A]], 51 | map: A => B, 52 | orElse: => B, 53 | json: B => Json): ActionResult = { 54 | cached match { 55 | case Some(future) => 56 | if (future.isCompleted) { 57 | Ok( 58 | json(map(future.value.flatMap(_.toOption).get)).spaces4, 59 | Map("Content-Type" -> "application/json")) 60 | } else { 61 | Ok(Map("lazy" -> true).asJson.spaces4, Map("Content-Type" -> "application/json")) 62 | } 63 | case None => 64 | Ok(json(orElse).spaces4, Map("Content-Type" -> "application/json")) 65 | } 66 | } 67 | 68 | def lazyProcess[A](cached: Option[Future[A]], orElse: => A)(process: A => Unit): Unit = { 69 | lazyProcess[A, A](cached, identity, orElse)(process) 70 | } 71 | 72 | def lazyProcess[A, B](cached: Option[Future[A]], map: A => B, orElse: => B)( 73 | process: B => Unit): Unit = { 74 | cached match { 75 | case Some(future) => 76 | future.onComplete(v => process(map(v.get))) 77 | case None => 78 | process(orElse) 79 | } 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/util/PublicSuffixUtil.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.util 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.broadcast.Broadcast 5 | import org.archive.webservices.ars.model.ArchConf 6 | 7 | import scala.io.Source 8 | 9 | import org.archive.webservices.ars.model.ArchConf 10 | 11 | object PublicSuffixUtil { 12 | private var _broadcast: Option[(String, Broadcast[Set[String]])] = None 13 | 14 | def broadcast(sc: SparkContext): Broadcast[Set[String]] = { 15 | if (_broadcast.isDefined && _broadcast.get._1 == sc.applicationId) _broadcast.get._2 16 | else { 17 | for ((_, bc) <- _broadcast) bc.destroy() 18 | val bc = sc.broadcast(Suffixes) 19 | _broadcast = Some((sc.applicationId, bc)) 20 | bc 21 | } 22 | } 23 | 24 | lazy val Suffixes: Set[String] = { 25 | val source = Source 26 | .fromURL(ArchConf.publicSuffixListUrl, "utf-8") 27 | try { 28 | source.getLines 29 | .map(_.trim) 30 | .filter(_.nonEmpty) 31 | .filter(!_.startsWith("//")) 32 | .toSet 33 | } catch { 34 | case _: Exception => 35 | Set.empty 36 | } finally { 37 | source.close() 38 | } 39 | } 40 | 41 | def resolve(host: String): String = resolve(host, Suffixes) 42 | 43 | def resolve(host: String, suffixes: Set[String]): String = { 44 
| val hostSplit = host.split('.') 45 | hostSplit.tails 46 | .filter(_.length > 1) 47 | .find { domain => 48 | val suffix = domain.tail 49 | suffixes.contains(suffix.mkString(".")) || (suffix.length > 1 && { 50 | suffixes.contains("*." + suffix.tail.mkString(".")) 51 | }) 52 | } 53 | .getOrElse(hostSplit) 54 | .mkString(".") 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/scala/org/archive/webservices/ars/util/UUID.scala: -------------------------------------------------------------------------------- 1 | package org.archive.webservices.ars.util 2 | 3 | import com.fasterxml.uuid.Generators 4 | 5 | object UUID { 6 | def uuid7 = { 7 | // see https://github.com/cowtowncoder/java-uuid-generator 8 | Generators.timeBasedEpochGenerator().generate() 9 | } 10 | 11 | def uuid7str: String = uuid7.toString 12 | } 13 | -------------------------------------------------------------------------------- /src/test/scala/org/archive/webservices/ars/ApiController.scala: -------------------------------------------------------------------------------- 1 | package test.org.archive.webservices.ars 2 | 3 | import io.circe.parser.parse 4 | import org.scalatra.test.scalatest._ 5 | 6 | import org.archive.webservices.ars.{ApiController, DefaultController} 7 | 8 | import Fixtures._ 9 | 10 | class ApiControllerSpec extends UnitSpec { 11 | addServlet(classOf[DefaultController], "/*") 12 | addServlet(classOf[ApiController], "/api/*") 13 | 14 | test("/api/collections returns status 403 when not authenticated") { 15 | get("/api/collections") { 16 | status should equal (403) 17 | } 18 | } 19 | 20 | test("/api/collections returns status 200 and count=0 when no collections exist") { 21 | loggedInAs(makeArchUser()) { 22 | get("/api/collections") { 23 | status should equal (200) 24 | val cur = parse(body).right.get.hcursor 25 | cur.get[Int]("count").right.get should equal (0) 26 | } 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/test/scala/org/archive/webservices/ars/Fixtures.scala: -------------------------------------------------------------------------------- 1 | package test.org.archive.webservices.ars 2 | 3 | import java.io.{File, PrintWriter} 4 | 5 | import scala.io.Source 6 | 7 | import io.circe.Json 8 | import io.circe.syntax._ 9 | import io.circe.parser.parse 10 | 11 | import org.archive.webservices.sparkling.util.DigestUtil 12 | 13 | import org.archive.webservices.ars.model.users.ArchUser 14 | 15 | object Fixtures { 16 | private def load(path: String): Json = { 17 | val source = Source.fromFile(path) 18 | val json = parse(source.getLines.mkString).right.get 19 | source.close() 20 | json 21 | } 22 | 23 | private def store(path: String, json: Json): Unit = { 24 | val source = Source.fromFile(path) 25 | val pw = new PrintWriter(new File(path)) 26 | pw.write(json.toString) 27 | pw.close() 28 | } 29 | 30 | def makeArchUser(admin: Boolean = false): ArchUser = { 31 | // Insert a randomly-generated user into the arch-users.json file and 32 | // return the corresponding ArchUser instance. 33 | val path = "data/arch-users.json" 34 | val json = load(path) 35 | var userId = java.time.Instant.now.toEpochMilli.toString 36 | // In the event of a collision, append a "0". 
37 | val existingUserIds = json.hcursor.keys.get.toSet 38 | while (existingUserIds.contains(userId)) { 39 | userId += "0" 40 | } 41 | store(path, json.deepMerge(Map( 42 | userId -> Map( 43 | "name" -> userId.asJson, 44 | "password" -> s"sha1:${DigestUtil.sha1Base32(userId)}".asJson, 45 | "admin" -> admin.asJson 46 | )).asJson 47 | )) 48 | ArchUser.invalidateData() 49 | ArchUser.get(s"arch:$userId").get 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/test/scala/org/archive/webservices/ars/JobUuidApiController.scala: -------------------------------------------------------------------------------- 1 | package test.org.archive.webservices.ars 2 | 3 | import io.circe.parser.parse 4 | import io.circe.syntax._ 5 | import org.scalatra.test.scalatest._ 6 | 7 | import org.archive.webservices.ars.{ApiController, DefaultController, JobUuidApiController} 8 | import org.archive.webservices.ars.processing.jobs.DomainFrequencyExtraction 9 | 10 | import Fixtures._ 11 | 12 | class JobUuidApiControllerSpec extends UnitSpec { 13 | addServlet(classOf[DefaultController], "/*") 14 | addServlet(classOf[JobUuidApiController], "/api/job/*") 15 | addServlet(classOf[ApiController], "/api/*") 16 | 17 | test("Generating a DomainFrequencyExtraction on the test collection works") { 18 | val user = makeArchUser() 19 | loggedInAs(user) { 20 | post(s"/api/runjob/${DomainFrequencyExtraction.id}?sample=false", 21 | parse(s""" 22 | { 23 | "user": "${user.userName}", 24 | "inputSpec": { 25 | "type": "collection", 26 | "collectionId": "SPECIAL-test-collection" 27 | }, 28 | "params": { 29 | "dataset": "${DomainFrequencyExtraction.id}" 30 | } 31 | } 32 | """).toOption.get.toString.getBytes, 33 | Map("content-type" -> "application/json") 34 | ) { 35 | status should equal (200) 36 | val cur = parse(body).right.get.hcursor 37 | cur.get[String]("id").right.get should equal (DomainFrequencyExtraction.id) 38 | cur.get[String]("name").right.get should equal (DomainFrequencyExtraction.name) 39 | cur.get[Int]("sample").right.get should equal (-1) 40 | cur.get[String]("state").right.get should equal ("Running") 41 | cur.get[Boolean]("started").right.get should equal (true) 42 | cur.get[Boolean]("finished").right.get should equal (false) 43 | cur.get[Boolean]("failed").right.get should equal (false) 44 | cur.get[String]("activeStage").right.get should equal ("Processing") 45 | cur.get[String]("activeState").right.get should equal ("Running") 46 | 47 | val uuid = cur.get[String]("uuid").right.get 48 | 49 | Thread.sleep(15000) 50 | } 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/test/scala/org/archive/webservices/ars/UnitSpec.scala: -------------------------------------------------------------------------------- 1 | package test.org.archive.webservices.ars 2 | 3 | import java.io.File 4 | 5 | import org.apache.commons.io.FileUtils 6 | import org.eclipse.jetty.server.Server 7 | import org.scalamock.scalatest.MockFactory 8 | import org.scalatest.{BeforeAndAfterAll, FunSuiteLike} 9 | import org.scalatra.test.scalatest._ 10 | 11 | import org.archive.webservices.ars.model.ArchConf 12 | import org.archive.webservices.ars.model.users.ArchUser 13 | 14 | /* Common Base Test Class */ 15 | abstract class UnitSpec extends ScalatraSuite with FunSuiteLike with MockFactory with BeforeAndAfterAll { 16 | private val dataDir = "data" 17 | private val backupDataDir = ".data-test-bak" 18 | 19 | // Configure tests to use our non-standard webapp 
path. 20 | servletContextHandler.setResourceBase("webapp") 21 | 22 | override def beforeAll { 23 | super.beforeAll() 24 | // Assert that the configured environment is valid for testing. 25 | assert(ArchConf.deploymentEnvironment == "DEV") 26 | 27 | // Create a backup of the existing data directory. 28 | FileUtils.copyDirectory(new File(dataDir), new File(backupDataDir)) 29 | } 30 | 31 | override def afterAll { 32 | super.afterAll() 33 | // Restore the pre-existing data directory. 34 | FileUtils.copyDirectory(new File(backupDataDir), new File(dataDir)) 35 | } 36 | 37 | // https://stackoverflow.com/a/34030731 38 | def setEnv(key: String, value: String) = { 39 | val field = System.getenv().getClass.getDeclaredField("m") 40 | field.setAccessible(true) 41 | val map = 42 | field.get(System.getenv()).asInstanceOf[java.util.Map[java.lang.String, java.lang.String]] 43 | map.put(key, value) 44 | } 45 | 46 | def loggedInAs[A](user: ArchUser)(test: => A): A = { 47 | session { 48 | post("/login", params = Seq(("username", user.fullName), ("password", user.fullName))) { 49 | status should equal (302) 50 | } 51 | test 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/test/scala/org/archive/webservices/ars/model/LocalArchConfSpec.scala: -------------------------------------------------------------------------------- 1 | package test.org.archive.webservices.ars 2 | 3 | import org.archive.webservices.ars.model.LocalArchConf 4 | 5 | class LocalArchConfSpec extends UnitSpec { 6 | setEnv("ARCH_BASE_PATH", "/") 7 | setEnv("ARCH_PROTO", "http") 8 | setEnv("ARCH_HOST", "arch.archive-it.org") 9 | setEnv("ARCH_EXTERNAL_PORT", "80") 10 | 11 | def conf = { 12 | mock[LocalArchConf] 13 | } 14 | 15 | test("baseUrl excludes port when proto=http and port=80") { 16 | setEnv("ARCH_PROTO", "http") 17 | setEnv("ARCH_EXTERNAL_PORT", "80") 18 | conf.baseUrl should be("http://arch.archive-it.org") 19 | } 20 | 21 | test("baseUrl excludes port when proto=https and port=443") { 22 | setEnv("ARCH_PROTO", "https") 23 | setEnv("ARCH_EXTERNAL_PORT", "443") 24 | conf.baseUrl should be("https://arch.archive-it.org") 25 | } 26 | 27 | test("baseUrl includes port when proto=http and port!=80") { 28 | setEnv("ARCH_PROTO", "http") 29 | setEnv("ARCH_EXTERNAL_PORT", "81") 30 | conf.baseUrl should be("http://arch.archive-it.org:81") 31 | } 32 | 33 | test("baseUrl includes port when proto=https and port!=443") { 34 | setEnv("ARCH_PROTO", "https") 35 | setEnv("ARCH_EXTERNAL_PORT", "444") 36 | conf.baseUrl should be("https://arch.archive-it.org:444") 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /templates/sendmail_failed.txt: -------------------------------------------------------------------------------- 1 | From: arch-no-reply@archive-it.org 2 | To: helge@archive.org, archiveit-alerts@archive.org, kody@archive.org, karlb@archive.org, tpadilla@archive.org 3 | Subject: ARCH: $jobName on $collectionName has FAILED! 4 | MIME-Version: 1.0 5 | Content-Type: multipart/alternative; boundary=ARCHARCHARCH 6 | 7 | --ARCHARCHARCH 8 | Content-Type: text/plain; charset=utf-8 9 | Hi Helge, 10 | =============================================== 11 | 12 | $jobName job on $collectionName has FAILED for $userName ($accountId). 13 | 14 | --ARCHARCHARCH 15 | Content-Type: text/html; charset=utf-8 16 | 17 | 18 | 19 | 20 | 21 | 22 |

!!! JOB FAILED !!! 23 | $jobName job on $collectionName has FAILED for $userName ($accountId).
24 | 25 | 26 | --ARCHARCHARCH-- 27 | -------------------------------------------------------------------------------- /templates/sendmail_finished.txt: -------------------------------------------------------------------------------- 1 | From: arch-no-reply@archive-it.org 2 | To: $to 3 | Subject: ARCH: Your $jobName dataset from $collectionName is ready to use 4 | MIME-Version: 1.0 5 | Content-Type: multipart/alternative; boundary=ARCHARCHARCH 6 | 7 | --ARCHARCHARCH 8 | Content-Type: text/plain; charset=utf-8 9 | Hello $userName, 10 | 11 | ARCH has created your $jobName dataset from $collectionName. You may find it here: $datasetUrl 12 | 13 | Best, 14 | The ARCH team 15 | --ARCHARCHARCH 16 | Content-Type: text/html; charset=utf-8 17 | 18 | 19 | 20 | 21 | 22 | 23 |

Hello $userName, 24 | 25 | ARCH has created your $jobName dataset from $collectionName. You may find it here: $datasetUrl 26 | 27 | 28 | Best, 29 | 30 | The ARCH team 31 | 32 | 33 | Having trouble? Let us know! 34 |
35 | 36 | 37 | --ARCHARCHARCH-- 38 | -------------------------------------------------------------------------------- /templates/sendmail_udq-finished.txt: -------------------------------------------------------------------------------- 1 | From: arch-no-reply@archive-it.org 2 | To: $to 3 | Subject: ARCH: Your custom collection “$udqCollectionName” is ready to use 4 | MIME-Version: 1.0 5 | Content-Type: multipart/alternative; boundary=ARCHARCHARCH 6 | 7 | --ARCHARCHARCH 8 | Content-Type: text/plain; charset=utf-8 9 | Hello $userName, 10 | 11 | ARCH has created your custom collection, “$udqCollectionName.” You may find and use it here: $collectionsUrl 12 | 13 | Best, 14 | The ARCH team 15 | --ARCHARCHARCH 16 | Content-Type: text/html; charset=utf-8 17 | 18 | 19 | 20 | 21 | 22 | 23 |

Hello $userName, 24 | 25 | ARCH has created your custom collection, “$udqCollectionName.” You may find and use it here: $collectionsUrl 26 | 27 | 28 | Best, 29 | 30 | The ARCH team 31 | 32 | 33 | Having trouble? Let us know! 34 |
35 | 36 | 37 | --ARCHARCHARCH-- 38 | -------------------------------------------------------------------------------- /webapp/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/internetarchive/arch/7cd8a6f4d2d557275b9d19381c8ad234ca134a60/webapp/.gitkeep --------------------------------------------------------------------------------